diff --git a/helix-core/src/indent.rs b/helix-core/src/indent.rs
index 5ae66769..d272dd68 100644
--- a/helix-core/src/indent.rs
+++ b/helix-core/src/indent.rs
@@ -1,10 +1,180 @@
 use crate::{
+    chars::{char_is_line_ending, char_is_whitespace},
     find_first_non_whitespace_char,
     syntax::{IndentQuery, LanguageConfiguration, Syntax},
     tree_sitter::Node,
-    RopeSlice,
+    Rope, RopeSlice,
 };
 
+/// Enum representing indentation style.
+///
+/// Only values 1-8 are valid for the `Spaces` variant.
+#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
+pub enum IndentStyle {
+    Tabs,
+    Spaces(u8),
+}
+
+impl IndentStyle {
+    /// Creates an `IndentStyle` from an indentation string.
+    ///
+    /// For example, passing `"    "` (four spaces) will create `IndentStyle::Spaces(4)`.
+    #[allow(clippy::should_implement_trait)]
+    #[inline]
+    pub fn from_str(indent: &str) -> Self {
+        // XXX: do we care about validating the input more than this? Probably not...?
+        debug_assert!(!indent.is_empty() && indent.len() <= 8);
+
+        if indent.starts_with(' ') {
+            IndentStyle::Spaces(indent.len() as u8)
+        } else {
+            IndentStyle::Tabs
+        }
+    }
+
+    /// Returns the indentation unit for this style: a tab for `Tabs`, or `n`
+    /// spaces for `Spaces(n)` with `n` in 1-8. Any other `n` trips a debug
+    /// assertion and falls back to two spaces.
+    #[inline]
+    pub fn as_str(&self) -> &'static str {
+        match *self {
+            IndentStyle::Tabs => "\t",
+            IndentStyle::Spaces(1) => " ",
+            IndentStyle::Spaces(2) => "  ",
+            IndentStyle::Spaces(3) => "   ",
+            IndentStyle::Spaces(4) => "    ",
+            IndentStyle::Spaces(5) => "     ",
+            IndentStyle::Spaces(6) => "      ",
+            IndentStyle::Spaces(7) => "       ",
+            IndentStyle::Spaces(8) => "        ",
+
+            // Unsupported indentation style. This should never happen,
+            // but just in case fall back to two spaces.
+            IndentStyle::Spaces(n) => {
+                debug_assert!(n > 0 && n <= 8); // Always triggers. `debug_panic!()` wanted.
+                "  "
+            }
+        }
+    }
+}
+
+/// Attempts to detect the indentation style used in a document.
+///
+/// Returns the indentation style if the auto-detect confidence is
+/// reasonably high, otherwise returns `None`.
+pub fn auto_detect_indent_style(document_text: &Rope) -> Option<IndentStyle> {
+    // Build a histogram of the indentation *increases* between
+    // subsequent lines, ignoring lines that are all whitespace.
+    //
+    // Index 0 is for tabs, the rest are 1-8 spaces.
+    let histogram: [usize; 9] = {
+        let mut histogram = [0; 9];
+        let mut prev_line_is_tabs = false;
+        let mut prev_line_leading_count = 0usize;
+
+        // Loop through the lines, checking for and recording indentation
+        // increases as we go.
+        'outer: for line in document_text.lines().take(1000) {
+            let mut c_iter = line.chars();
+
+            // Is first character a tab or space?
+            let is_tabs = match c_iter.next() {
+                Some('\t') => true,
+                Some(' ') => false,
+
+                // Ignore blank lines.
+                Some(c) if char_is_line_ending(c) => continue,
+
+                _ => {
+                    prev_line_is_tabs = false;
+                    prev_line_leading_count = 0;
+                    continue;
+                }
+            };
+
+            // Count the line's total leading tab/space characters.
+            let mut leading_count = 1;
+            let mut count_is_done = false;
+            for c in c_iter {
+                match c {
+                    '\t' if is_tabs && !count_is_done => leading_count += 1,
+                    ' ' if !is_tabs && !count_is_done => leading_count += 1,
+
+                    // We stop counting if we hit whitespace that doesn't
+                    // qualify as indent or doesn't match the leading
+                    // whitespace, but we don't exit the loop yet because
+                    // we still want to determine if the line is blank.
+                    c if char_is_whitespace(c) => count_is_done = true,
+
+                    // Ignore blank lines.
+                    c if char_is_line_ending(c) => continue 'outer,
+
+                    _ => break,
+                }
+
+                // Bound the worst-case execution time for weird text files.
+                if leading_count > 256 {
+                    continue 'outer;
+                }
+            }
+
+            // If there was an increase in indentation over the previous
+            // line, update the histogram with that increase.
+            if (prev_line_is_tabs == is_tabs || prev_line_leading_count == 0)
+                && prev_line_leading_count < leading_count
+            {
+                if is_tabs {
+                    histogram[0] += 1;
+                } else {
+                    let amount = leading_count - prev_line_leading_count;
+                    if amount <= 8 {
+                        histogram[amount] += 1;
+                    }
+                }
+            }
+
+            // Store this line's leading whitespace info for use with
+            // the next line.
+            prev_line_is_tabs = is_tabs;
+            prev_line_leading_count = leading_count;
+        }
+
+        // Give more weight to tabs, because their presence is a very
+        // strong indicator.
+        histogram[0] *= 2;
+
+        histogram
+    };
+
+    // Find the most frequent indent, its frequency, and the frequency of
+    // the next-most frequent indent.
+    let indent = histogram
+        .iter()
+        .enumerate()
+        .max_by_key(|kv| kv.1)
+        .unwrap()
+        .0;
+    let indent_freq = histogram[indent];
+    let indent_freq_2 = *histogram
+        .iter()
+        .enumerate()
+        .filter(|kv| kv.0 != indent)
+        .map(|kv| kv.1)
+        .max()
+        .unwrap();
+
+    // Return the auto-detected result if we're confident enough in its
+    // accuracy, based on some heuristics.
+    if indent_freq >= 1 && (indent_freq_2 as f64 / indent_freq as f64) < 0.66 {
+        Some(match indent {
+            0 => IndentStyle::Tabs,
+            _ => IndentStyle::Spaces(indent as u8),
+        })
+    } else {
+        None
+    }
+}
+
 /// To determine indentation of a newly inserted line, figure out the indentation at the last col
 /// of the previous line.
#[allow(dead_code)] diff --git a/helix-term/src/commands.rs b/helix-term/src/commands.rs index 2f071306..7403f5b2 100644 --- a/helix-term/src/commands.rs +++ b/helix-term/src/commands.rs @@ -1,5 +1,6 @@ use helix_core::{ comment, coords_at_pos, find_first_non_whitespace_char, find_root, graphemes, indent, + indent::IndentStyle, line_ending::{get_line_ending_of_str, line_end_char_index, str_is_line_ending}, match_brackets, movement::{self, Direction}, @@ -11,7 +12,7 @@ use helix_core::{ }; use helix_view::{ - document::{IndentStyle, Mode}, + document::Mode, editor::Action, input::KeyEvent, keyboard::KeyCode, diff --git a/helix-view/src/document.rs b/helix-view/src/document.rs index 9eabda46..c02d6656 100644 --- a/helix-view/src/document.rs +++ b/helix-view/src/document.rs @@ -9,8 +9,8 @@ use std::str::FromStr; use std::sync::Arc; use helix_core::{ - chars::{char_is_line_ending, char_is_whitespace}, history::History, + indent::{auto_detect_indent_style, IndentStyle}, line_ending::auto_detect_line_ending, syntax::{self, LanguageConfiguration}, ChangeSet, Diagnostic, LineEnding, Rope, RopeBuilder, Selection, State, Syntax, Transaction, @@ -63,12 +63,6 @@ impl<'de> Deserialize<'de> for Mode { } } -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub enum IndentStyle { - Tabs, - Spaces(u8), -} - pub struct Document { pub(crate) id: DocumentId, text: Rope, @@ -460,9 +454,7 @@ impl Document { doc.detect_language(theme, loader); } - // Detect indentation style and line ending. 
-        doc.detect_indent_style();
-        doc.line_ending = auto_detect_line_ending(&doc.text).unwrap_or(DEFAULT_LINE_ENDING);
+        doc.detect_indent_and_line_ending();
 
         Ok(doc)
     }
@@ -580,6 +572,21 @@ impl Document {
         }
     }
 
+    /// Sets `indent_style` and `line_ending` from auto-detection on the text.
+    /// When detection returns `None`, the indent style falls back to the language's
+    /// configured indent unit (or two spaces) and the line ending to `DEFAULT_LINE_ENDING`.
+    pub fn detect_indent_and_line_ending(&mut self) {
+        self.indent_style = auto_detect_indent_style(&self.text).unwrap_or_else(|| {
+            IndentStyle::from_str(
+                self.language
+                    .as_ref()
+                    .and_then(|config| config.indent.as_ref())
+                    .map_or("  ", |config| config.unit.as_str()), // Fallback to 2 spaces.
+            )
+        });
+        self.line_ending = auto_detect_line_ending(&self.text).unwrap_or(DEFAULT_LINE_ENDING);
+    }
+
     /// Reload the document from its path.
     pub fn reload(&mut self, view_id: ViewId) -> Result<(), Error> {
         let encoding = &self.encoding;
@@ -598,9 +605,7 @@ impl Document {
         self.append_changes_to_history(view_id);
         self.reset_modified();
 
-        // Detect indentation style and line ending.
-        self.detect_indent_style();
-        self.line_ending = auto_detect_line_ending(&self.text).unwrap_or(DEFAULT_LINE_ENDING);
+        self.detect_indent_and_line_ending();
 
         Ok(())
     }
@@ -619,132 +624,6 @@ impl Document {
         self.encoding
     }
 
-    fn detect_indent_style(&mut self) {
-        // Build a histogram of the indentation *increases* between
-        // subsequent lines, ignoring lines that are all whitespace.
-        //
-        // Index 0 is for tabs, the rest are 1-8 spaces.
-        let histogram: [usize; 9] = {
-            let mut histogram = [0; 9];
-            let mut prev_line_is_tabs = false;
-            let mut prev_line_leading_count = 0usize;
-
-            // Loop through the lines, checking for and recording indentation
-            // increases as we go.
-            'outer: for line in self.text.lines().take(1000) {
-                let mut c_iter = line.chars();
-
-                // Is first character a tab or space?
-                let is_tabs = match c_iter.next() {
-                    Some('\t') => true,
-                    Some(' ') => false,
-
-                    // Ignore blank lines.
-                    Some(c) if char_is_line_ending(c) => continue,
-
-                    _ => {
-                        prev_line_is_tabs = false;
-                        prev_line_leading_count = 0;
-                        continue;
-                    }
-                };
-
-                // Count the line's total leading tab/space characters.
-                let mut leading_count = 1;
-                let mut count_is_done = false;
-                for c in c_iter {
-                    match c {
-                        '\t' if is_tabs && !count_is_done => leading_count += 1,
-                        ' ' if !is_tabs && !count_is_done => leading_count += 1,
-
-                        // We stop counting if we hit whitespace that doesn't
-                        // qualify as indent or doesn't match the leading
-                        // whitespace, but we don't exit the loop yet because
-                        // we still want to determine if the line is blank.
-                        c if char_is_whitespace(c) => count_is_done = true,
-
-                        // Ignore blank lines.
-                        c if char_is_line_ending(c) => continue 'outer,
-
-                        _ => break,
-                    }
-
-                    // Bound the worst-case execution time for weird text files.
-                    if leading_count > 256 {
-                        continue 'outer;
-                    }
-                }
-
-                // If there was an increase in indentation over the previous
-                // line, update the histogram with that increase.
-                if (prev_line_is_tabs == is_tabs || prev_line_leading_count == 0)
-                    && prev_line_leading_count < leading_count
-                {
-                    if is_tabs {
-                        histogram[0] += 1;
-                    } else {
-                        let amount = leading_count - prev_line_leading_count;
-                        if amount <= 8 {
-                            histogram[amount] += 1;
-                        }
-                    }
-                }
-
-                // Store this line's leading whitespace info for use with
-                // the next line.
-                prev_line_is_tabs = is_tabs;
-                prev_line_leading_count = leading_count;
-            }
-
-            // Give more weight to tabs, because their presence is a very
-            // strong indicator.
-            histogram[0] *= 2;
-
-            histogram
-        };
-
-        // Find the most frequent indent, its frequency, and the frequency of
-        // the next-most frequent indent.
-        let indent = histogram
-            .iter()
-            .enumerate()
-            .max_by_key(|kv| kv.1)
-            .unwrap()
-            .0;
-        let indent_freq = histogram[indent];
-        let indent_freq_2 = *histogram
-            .iter()
-            .enumerate()
-            .filter(|kv| kv.0 != indent)
-            .map(|kv| kv.1)
-            .max()
-            .unwrap();
-
-        // Use the auto-detected result if we're confident enough in its
-        // accuracy, based on some heuristics. Otherwise fall back to
-        // the language-based setting.
-        if indent_freq >= 1 && (indent_freq_2 as f64 / indent_freq as f64) < 0.66 {
-            // Use the auto-detected setting.
-            self.indent_style = match indent {
-                0 => IndentStyle::Tabs,
-                _ => IndentStyle::Spaces(indent as u8),
-            };
-        } else {
-            // Fall back to language-based setting.
-            let indent = self
-                .language
-                .as_ref()
-                .and_then(|config| config.indent.as_ref())
-                .map_or("  ", |config| config.unit.as_str()); // fallback to 2 spaces
-
-            self.indent_style = if indent.starts_with(' ') {
-                IndentStyle::Spaces(indent.len() as u8)
-            } else {
-                IndentStyle::Tabs
-            };
-        }
-    }
-
     pub fn set_path(&mut self, path: &Path) -> Result<(), std::io::Error> {
         let path = canonicalize_path(path)?;
 
@@ -1002,21 +881,7 @@ impl Document {
     /// TODO: we might not need this function anymore, since the information
     /// is conveniently available in `Document::indent_style` now.
     pub fn indent_unit(&self) -> &'static str {
-        match self.indent_style {
-            IndentStyle::Tabs => "\t",
-            IndentStyle::Spaces(1) => " ",
-            IndentStyle::Spaces(2) => "  ",
-            IndentStyle::Spaces(3) => "   ",
-            IndentStyle::Spaces(4) => "    ",
-            IndentStyle::Spaces(5) => "     ",
-            IndentStyle::Spaces(6) => "      ",
-            IndentStyle::Spaces(7) => "       ",
-            IndentStyle::Spaces(8) => "        ",
-
-            // Unsupported indentation style. This should never happen,
-            // but just in case fall back to two spaces.
-            _ => "  ",
-        }
+        self.indent_style.as_str()
     }
 
     #[inline]