Merge pull request #224 from helix-editor/line_ending_detection

Line ending detection
3 years ago · a70de6e980
parent c704970fd7 f2954fa153
commit a70de6e980
17 changed files with 562 additions and 205 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -344,6 +344,7 @@ dependencies = [
 "bitflags",
 "cassowary",
 "crossterm",
+ "helix-core",
 "serde",
 "unicode-segmentation",
 "unicode-width",
--- a/helix-core/src/auto_pairs.rs
+++ b/helix-core/src/auto_pairs.rs
@@ -12,7 +12,7 @@ pub const PAIRS: &[(char, char)] = &[
    ('`', '`'),
 ];

-const CLOSE_BEFORE: &str = ")]}'\":;> \n"; // includes space and newline
+const CLOSE_BEFORE: &str = ")]}'\":;> \n\r\u{000B}\u{000C}\u{0085}\u{2028}\u{2029}"; // includes space and newlines

 // insert hook:
 // Fn(doc, selection, char) => Option<Transaction>
--- a/helix-core/src/chars.rs
+++ b/helix-core/src/chars.rs
@@ -1,25 +1,44 @@
-/// Determine whether a character is a line break.
-pub fn char_is_linebreak(c: char) -> bool {
-    matches!(
-        c,
-        '\u{000A}' | // LineFeed
-        '\u{000B}' | // VerticalTab
-        '\u{000C}' | // FormFeed
-        '\u{000D}' | // CarriageReturn
-        '\u{0085}' | // NextLine
-        '\u{2028}' | // Line Separator
-        '\u{2029}' // ParagraphSeparator
-    )
+use crate::LineEnding;
+
+#[derive(Debug, Eq, PartialEq)]
+pub enum CharCategory {
+    Whitespace,
+    Eol,
+    Word,
+    Punctuation,
+    Unknown,
+}
+
+#[inline]
+pub fn categorize_char(ch: char) -> CharCategory {
+    if char_is_line_ending(ch) {
+        CharCategory::Eol
+    } else if ch.is_whitespace() {
+        CharCategory::Whitespace
+    } else if char_is_word(ch) {
+        CharCategory::Word
+    } else if char_is_punctuation(ch) {
+        CharCategory::Punctuation
+    } else {
+        CharCategory::Unknown
+    }
+}
+
+/// Determine whether a character is a line ending.
+#[inline]
+pub fn char_is_line_ending(ch: char) -> bool {
+    LineEnding::from_char(ch).is_some()
 }

 /// Determine whether a character qualifies as (non-line-break)
 /// whitespace.
-pub fn char_is_whitespace(c: char) -> bool {
+#[inline]
+pub fn char_is_whitespace(ch: char) -> bool {
    // TODO: this is a naive binary categorization of whitespace
    // characters.  For display, word wrapping, etc. we'll need a better
    // categorization based on e.g. breaking vs non-breaking spaces
    // and whether they're zero-width or not.
-    match c {
+    match ch {
        //'\u{1680}' | // Ogham Space Mark (here for completeness, but usually displayed as a dash, not as whitespace)
        '\u{0009}' | // Character Tabulation
        '\u{0020}' | // Space
@@ -34,8 +53,81 @@ pub fn char_is_whitespace(c: char) -> bool {
        // En Quad, Em Quad, En Space, Em Space, Three-per-em Space,
        // Four-per-em Space, Six-per-em Space, Figure Space,
        // Punctuation Space, Thin Space, Hair Space, Zero Width Space.
-        c if ('\u{2000}' ..= '\u{200B}').contains(&c) => true,
+        ch if ('\u{2000}' ..= '\u{200B}').contains(&ch) => true,

        _ => false,
    }
 }
+
+#[inline]
+pub fn char_is_punctuation(ch: char) -> bool {
+    use unicode_general_category::{get_general_category, GeneralCategory};
+
+    matches!(
+        get_general_category(ch),
+        GeneralCategory::OtherPunctuation
+            | GeneralCategory::OpenPunctuation
+            | GeneralCategory::ClosePunctuation
+            | GeneralCategory::InitialPunctuation
+            | GeneralCategory::FinalPunctuation
+            | GeneralCategory::ConnectorPunctuation
+            | GeneralCategory::DashPunctuation
+            | GeneralCategory::MathSymbol
+            | GeneralCategory::CurrencySymbol
+            | GeneralCategory::ModifierSymbol
+    )
+}
+
+#[inline]
+pub fn char_is_word(ch: char) -> bool {
+    ch.is_alphanumeric() || ch == '_'
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    #[test]
+    fn test_categorize() {
+        const EOL_TEST_CASE: &'static str = "\n\r\u{000B}\u{000C}\u{0085}\u{2028}\u{2029}";
+        const WORD_TEST_CASE: &'static str =
+            "_hello_world_あいうえおー1234567890１２３４５６７８９０";
+        const PUNCTUATION_TEST_CASE: &'static str =
+            "!\"#$%&\'()*+,-./:;<=>?@[\\]^`{|}~！”＃＄％＆’（）＊＋、。：；＜＝＞？＠「」＾｀｛｜｝～";
+        const WHITESPACE_TEST_CASE: &'static str = "  　   ";
+
+        for ch in EOL_TEST_CASE.chars() {
+            assert_eq!(CharCategory::Eol, categorize_char(ch));
+        }
+
+        for ch in WHITESPACE_TEST_CASE.chars() {
+            assert_eq!(
+                CharCategory::Whitespace,
+                categorize_char(ch),
+                "Testing '{}', but got `{:?}` instead of `Category::Whitespace`",
+                ch,
+                categorize_char(ch)
+            );
+        }
+
+        for ch in WORD_TEST_CASE.chars() {
+            assert_eq!(
+                CharCategory::Word,
+                categorize_char(ch),
+                "Testing '{}', but got `{:?}` instead of `Category::Word`",
+                ch,
+                categorize_char(ch)
+            );
+        }
+
+        for ch in PUNCTUATION_TEST_CASE.chars() {
+            assert_eq!(
+                CharCategory::Punctuation,
+                categorize_char(ch),
+                "Testing '{}', but got `{:?}` instead of `Category::Punctuation`",
+                ch,
+                categorize_char(ch)
+            );
+        }
+    }
+}
--- a/helix-core/src/lib.rs
+++ b/helix-core/src/lib.rs
@@ -6,6 +6,7 @@ pub mod diagnostic;
 pub mod graphemes;
 pub mod history;
 pub mod indent;
+pub mod line_ending;
 pub mod macros;
 pub mod match_brackets;
 pub mod movement;
@@ -106,6 +107,7 @@ pub use tendril::StrTendril as Tendril;
 #[doc(inline)]
 pub use {regex, tree_sitter};

+pub use graphemes::RopeGraphemes;
 pub use position::{coords_at_pos, pos_at_coords, Position};
 pub use selection::{Range, Selection};
 pub use smallvec::SmallVec;
@@ -114,4 +116,5 @@ pub use syntax::Syntax;
 pub use diagnostic::Diagnostic;
 pub use state::State;

+pub use line_ending::{LineEnding, DEFAULT_LINE_ENDING};
 pub use transaction::{Assoc, Change, ChangeSet, Operation, Transaction};
--- a/helix-core/src/line_ending.rs
+++ b/helix-core/src/line_ending.rs
@@ -0,0 +1,252 @@
+use crate::{Rope, RopeGraphemes, RopeSlice};
+
+#[cfg(target_os = "windows")]
+pub const DEFAULT_LINE_ENDING: LineEnding = LineEnding::Crlf;
+#[cfg(not(target_os = "windows"))]
+pub const DEFAULT_LINE_ENDING: LineEnding = LineEnding::LF;
+
+/// Represents one of the valid Unicode line endings.
+#[derive(PartialEq, Copy, Clone, Debug)]
+pub enum LineEnding {
+    Crlf, // CarriageReturn followed by LineFeed
+    LF,   // U+000A -- LineFeed
+    VT,   // U+000B -- VerticalTab
+    FF,   // U+000C -- FormFeed
+    CR,   // U+000D -- CarriageReturn
+    Nel,  // U+0085 -- NextLine
+    LS,   // U+2028 -- Line Separator
+    PS,   // U+2029 -- ParagraphSeparator
+}
+
+impl LineEnding {
+    #[inline]
+    pub fn len_chars(&self) -> usize {
+        match self {
+            Self::Crlf => 2,
+            _ => 1,
+        }
+    }
+
+    #[inline]
+    pub fn as_str(&self) -> &'static str {
+        match self {
+            Self::Crlf => "\u{000D}\u{000A}",
+            Self::LF => "\u{000A}",
+            Self::VT => "\u{000B}",
+            Self::FF => "\u{000C}",
+            Self::CR => "\u{000D}",
+            Self::Nel => "\u{0085}",
+            Self::LS => "\u{2028}",
+            Self::PS => "\u{2029}",
+        }
+    }
+
+    #[inline]
+    pub fn from_char(ch: char) -> Option<LineEnding> {
+        match ch {
+            '\u{000A}' => Some(LineEnding::LF),
+            '\u{000B}' => Some(LineEnding::VT),
+            '\u{000C}' => Some(LineEnding::FF),
+            '\u{000D}' => Some(LineEnding::CR),
+            '\u{0085}' => Some(LineEnding::Nel),
+            '\u{2028}' => Some(LineEnding::LS),
+            '\u{2029}' => Some(LineEnding::PS),
+            // Not a line ending
+            _ => None,
+        }
+    }
+
+    // Normally we'd want to implement the FromStr trait, but in this case
+    // that would force us into a different return type than from_char
+    // or from_rope_slice, which would be weird.
+    #[allow(clippy::should_implement_trait)]
+    #[inline]
+    pub fn from_str(g: &str) -> Option<LineEnding> {
+        match g {
+            "\u{000D}\u{000A}" => Some(LineEnding::Crlf),
+            "\u{000A}" => Some(LineEnding::LF),
+            "\u{000B}" => Some(LineEnding::VT),
+            "\u{000C}" => Some(LineEnding::FF),
+            "\u{000D}" => Some(LineEnding::CR),
+            "\u{0085}" => Some(LineEnding::Nel),
+            "\u{2028}" => Some(LineEnding::LS),
+            "\u{2029}" => Some(LineEnding::PS),
+            // Not a line ending
+            _ => None,
+        }
+    }
+
+    #[inline]
+    pub fn from_rope_slice(g: &RopeSlice) -> Option<LineEnding> {
+        if let Some(text) = g.as_str() {
+            LineEnding::from_str(text)
+        } else {
+            // Non-contiguous, so it can't be a line ending.
+            // Specifically, Ropey guarantees that CRLF is always
+            // contiguous.  And the remaining line endings are all
+            // single `char`s, and therefore trivially contiguous.
+            None
+        }
+    }
+}
+
+#[inline]
+pub fn str_is_line_ending(s: &str) -> bool {
+    LineEnding::from_str(s).is_some()
+}
+
+/// Attempts to detect what line ending the passed document uses.
+pub fn auto_detect_line_ending(doc: &Rope) -> Option<LineEnding> {
+    // Return the first line ending found within the first 100 lines. VT, FF
+    // and PS are skipped, as they might be special-use only.
+    for line in doc.lines().take(100) {
+        match get_line_ending(&line) {
+            None | Some(LineEnding::VT) | Some(LineEnding::FF) | Some(LineEnding::PS) => {}
+            ending => return ending,
+        }
+    }
+    None
+}
+
+/// Returns the passed line's line ending, if any.
+pub fn get_line_ending(line: &RopeSlice) -> Option<LineEnding> {
+    // Last character as str.
+    let g1 = line
+        .slice(line.len_chars().saturating_sub(1)..)
+        .as_str()
+        .unwrap();
+
+    // Last two characters as str, or empty str if they're not contiguous.
+    // It's fine to punt on the non-contiguous case, because Ropey guarantees
+    // that CRLF is always contiguous.
+    let g2 = line
+        .slice(line.len_chars().saturating_sub(2)..)
+        .as_str()
+        .unwrap_or("");
+
+    // First check the two-character case for CRLF, then check the single-character case.
+    LineEnding::from_str(g2).or_else(|| LineEnding::from_str(g1))
+}
+
+/// Returns the passed line's line ending, if any.
+pub fn get_line_ending_of_str(line: &str) -> Option<LineEnding> {
+    if line.ends_with("\u{000D}\u{000A}") {
+        Some(LineEnding::Crlf)
+    } else if line.ends_with('\u{000A}') {
+        Some(LineEnding::LF)
+    } else if line.ends_with('\u{000B}') {
+        Some(LineEnding::VT)
+    } else if line.ends_with('\u{000C}') {
+        Some(LineEnding::FF)
+    } else if line.ends_with('\u{000D}') {
+        Some(LineEnding::CR)
+    } else if line.ends_with('\u{0085}') {
+        Some(LineEnding::Nel)
+    } else if line.ends_with('\u{2028}') {
+        Some(LineEnding::LS)
+    } else if line.ends_with('\u{2029}') {
+        Some(LineEnding::PS)
+    } else {
+        None
+    }
+}
+
+/// Returns the char index of the end of the given line, not including its line ending.
+pub fn line_end_char_index(slice: &RopeSlice, line: usize) -> usize {
+    slice.line_to_char(line + 1)
+        - get_line_ending(&slice.line(line))
+            .map(|le| le.len_chars())
+            .unwrap_or(0)
+}
+
+#[cfg(test)]
+mod line_ending_tests {
+    use super::*;
+
+    #[test]
+    fn line_ending_autodetect() {
+        assert_eq!(
+            auto_detect_line_ending(&Rope::from_str("\n")),
+            Some(LineEnding::LF)
+        );
+        assert_eq!(
+            auto_detect_line_ending(&Rope::from_str("\r\n")),
+            Some(LineEnding::Crlf)
+        );
+        assert_eq!(auto_detect_line_ending(&Rope::from_str("hello")), None);
+        assert_eq!(auto_detect_line_ending(&Rope::from_str("")), None);
+        assert_eq!(
+            auto_detect_line_ending(&Rope::from_str("hello\nhelix\r\n")),
+            Some(LineEnding::LF)
+        );
+        assert_eq!(
+            auto_detect_line_ending(&Rope::from_str("a formfeed\u{000C}")),
+            None
+        );
+        assert_eq!(
+            auto_detect_line_ending(&Rope::from_str("\n\u{000A}\n \u{000A}")),
+            Some(LineEnding::LF)
+        );
+        assert_eq!(
+            auto_detect_line_ending(&Rope::from_str(
+                "a formfeed\u{000C} with a\u{000C} linefeed\u{000A}"
+            )),
+            Some(LineEnding::LF)
+        );
+        assert_eq!(auto_detect_line_ending(&Rope::from_str("a formfeed\u{000C} with a\u{000C} carriage return linefeed\u{000D}\u{000A} and a linefeed\u{000A}")), Some(LineEnding::Crlf));
+    }
+
+    #[test]
+    fn str_to_line_ending() {
+        assert_eq!(LineEnding::from_str("\r"), Some(LineEnding::CR));
+        assert_eq!(LineEnding::from_str("\n"), Some(LineEnding::LF));
+        assert_eq!(LineEnding::from_str("\r\n"), Some(LineEnding::Crlf));
+        assert_eq!(LineEnding::from_str("hello\n"), None);
+    }
+
+    #[test]
+    fn rope_slice_to_line_ending() {
+        let r = Rope::from_str("hello\r\n");
+        assert_eq!(
+            LineEnding::from_rope_slice(&r.slice(5..6)),
+            Some(LineEnding::CR)
+        );
+        assert_eq!(
+            LineEnding::from_rope_slice(&r.slice(6..7)),
+            Some(LineEnding::LF)
+        );
+        assert_eq!(
+            LineEnding::from_rope_slice(&r.slice(5..7)),
+            Some(LineEnding::Crlf)
+        );
+        assert_eq!(LineEnding::from_rope_slice(&r.slice(..)), None);
+    }
+
+    #[test]
+    fn get_line_ending_rope_slice() {
+        let r = Rope::from_str("Hello\rworld\nhow\r\nare you?");
+        assert_eq!(get_line_ending(&r.slice(..6)), Some(LineEnding::CR));
+        assert_eq!(get_line_ending(&r.slice(..12)), Some(LineEnding::LF));
+        assert_eq!(get_line_ending(&r.slice(..17)), Some(LineEnding::Crlf));
+        assert_eq!(get_line_ending(&r.slice(..)), None);
+    }
+
+    #[test]
+    fn get_line_ending_str() {
+        let text = "Hello\rworld\nhow\r\nare you?";
+        assert_eq!(get_line_ending_of_str(&text[..6]), Some(LineEnding::CR));
+        assert_eq!(get_line_ending_of_str(&text[..12]), Some(LineEnding::LF));
+        assert_eq!(get_line_ending_of_str(&text[..17]), Some(LineEnding::Crlf));
+        assert_eq!(get_line_ending_of_str(&text[..]), None);
+    }
+
+    #[test]
+    fn line_end_char_index_rope_slice() {
+        let r = Rope::from_str("Hello\rworld\nhow\r\nare you?");
+        let s = &r.slice(..);
+        assert_eq!(line_end_char_index(s, 0), 5);
+        assert_eq!(line_end_char_index(s, 1), 11);
+        assert_eq!(line_end_char_index(s, 2), 15);
+        assert_eq!(line_end_char_index(s, 3), 25);
+    }
+}
--- a/helix-core/src/movement.rs
+++ b/helix-core/src/movement.rs
@@ -3,8 +3,13 @@ use std::iter::{self, from_fn, Peekable, SkipWhile};
 use ropey::iter::Chars;

 use crate::{
+    chars::{
+        categorize_char, char_is_line_ending, char_is_punctuation, char_is_whitespace,
+        char_is_word, CharCategory,
+    },
    coords_at_pos,
    graphemes::{nth_next_grapheme_boundary, nth_prev_grapheme_boundary},
+    line_ending::{get_line_ending, line_end_char_index},
    pos_at_coords, Position, Range, RopeSlice,
 };

@@ -37,9 +42,8 @@ pub fn move_horizontally(
            nth_prev_grapheme_boundary(slice, pos, count).max(start)
        }
        Direction::Forward => {
-            // Line end is pos at the start of next line - 1
-            let end = slice.line_to_char(line + 1).saturating_sub(1);
-            nth_next_grapheme_boundary(slice, pos, count).min(end)
+            let end_char_idx = line_end_char_index(&slice, line);
+            nth_next_grapheme_boundary(slice, pos, count).min(end_char_idx)
        }
    };
    let anchor = match behaviour {
@@ -68,8 +72,11 @@ pub fn move_vertically(
        ),
    };

-    // convert to 0-indexed, subtract another 1 because len_chars() counts \n
-    let new_line_len = slice.line(new_line).len_chars().saturating_sub(2);
+    // Length of the line sans line-ending.
+    let new_line_len = {
+        let line = slice.line(new_line);
+        line.len_chars() - get_line_ending(&line).map(|le| le.len_chars()).unwrap_or(0)
+    };

    let new_col = std::cmp::min(horiz as usize, new_line_len);

@@ -104,64 +111,6 @@ fn word_move(slice: RopeSlice, mut range: Range, count: usize, target: WordMotio
 }

 // ---- util ------------
-#[inline]
-pub(crate) fn is_word(ch: char) -> bool {
-    ch.is_alphanumeric() || ch == '_'
-}
-
-#[inline]
-pub(crate) fn is_end_of_line(ch: char) -> bool {
-    ch == '\n'
-}
-
-#[inline]
-// Whitespace, but not end of line
-pub(crate) fn is_strict_whitespace(ch: char) -> bool {
-    ch.is_whitespace() && !is_end_of_line(ch)
-}
-
-#[inline]
-pub(crate) fn is_punctuation(ch: char) -> bool {
-    use unicode_general_category::{get_general_category, GeneralCategory};
-
-    matches!(
-        get_general_category(ch),
-        GeneralCategory::OtherPunctuation
-            | GeneralCategory::OpenPunctuation
-            | GeneralCategory::ClosePunctuation
-            | GeneralCategory::InitialPunctuation
-            | GeneralCategory::FinalPunctuation
-            | GeneralCategory::ConnectorPunctuation
-            | GeneralCategory::DashPunctuation
-            | GeneralCategory::MathSymbol
-            | GeneralCategory::CurrencySymbol
-            | GeneralCategory::ModifierSymbol
-    )
-}
-
-#[derive(Debug, Eq, PartialEq)]
-pub enum Category {
-    Whitespace,
-    Eol,
-    Word,
-    Punctuation,
-    Unknown,
-}
-
-#[inline]
-pub(crate) fn categorize(ch: char) -> Category {
-    if is_end_of_line(ch) {
-        Category::Eol
-    } else if ch.is_whitespace() {
-        Category::Whitespace
-    } else if is_word(ch) {
-        Category::Word
-    } else if is_punctuation(ch) {
-        Category::Punctuation
-    } else {
-        Category::Unknown
-    }
-}

 #[inline]
 /// Returns first index that doesn't satisfy a given predicate when
@@ -235,7 +184,8 @@ impl CharHelpers for Chars<'_> {
        let mut phase = WordMotionPhase::Start;
        let mut head = origin.head;
        let mut anchor: Option<usize> = None;
-        let is_boundary = |a: char, b: Option<char>| categorize(a) != categorize(b.unwrap_or(a));
+        let is_boundary =
+            |a: char, b: Option<char>| categorize_char(a) != categorize_char(b.unwrap_or(a));
        while let Some(peek) = characters.peek().copied() {
            phase = match phase {
                WordMotionPhase::Start => {
@@ -244,7 +194,8 @@ impl CharHelpers for Chars<'_> {
                        break; // We're at the end, so there's nothing to do.
                    }
                    // Anchor may remain here if the head wasn't at a boundary
-                    if !is_boundary(peek, characters.peek().copied()) && !is_end_of_line(peek) {
+                    if !is_boundary(peek, characters.peek().copied()) && !char_is_line_ending(peek)
+                    {
                        anchor = Some(head);
                    }
                    // First character is always skipped by the head
@@ -252,7 +203,7 @@ impl CharHelpers for Chars<'_> {
                    WordMotionPhase::SkipNewlines
                }
                WordMotionPhase::SkipNewlines => {
-                    if is_end_of_line(peek) {
+                    if char_is_line_ending(peek) {
                        characters.next();
                        if characters.peek().is_some() {
                            advance(&mut head);
@@ -286,12 +237,12 @@ fn reached_target(target: WordMotionTarget, peek: char, next_peek: Option<&char>

    match target {
        WordMotionTarget::NextWordStart => {
-            ((categorize(peek) != categorize(*next_peek))
-                && (is_end_of_line(*next_peek) || !next_peek.is_whitespace()))
+            ((categorize_char(peek) != categorize_char(*next_peek))
+                && (char_is_line_ending(*next_peek) || !next_peek.is_whitespace()))
        }
        WordMotionTarget::NextWordEnd | WordMotionTarget::PrevWordStart => {
-            ((categorize(peek) != categorize(*next_peek))
-                && (!peek.is_whitespace() || is_end_of_line(*next_peek)))
+            ((categorize_char(peek) != categorize_char(*next_peek))
+                && (!peek.is_whitespace() || char_is_line_ending(*next_peek)))
        }
    }
 }
@@ -330,7 +281,7 @@ mod test {
                slice,
                move_vertically(slice, range, Direction::Forward, 1, Movement::Move).head
            ),
-            (1, 2).into()
+            (1, 3).into()
        );
    }

@@ -343,12 +294,12 @@ mod test {
        let mut range = Range::point(position);

        let moves_and_expected_coordinates = [
-            ((Direction::Forward, 1usize), (0, 1)),
-            ((Direction::Forward, 2usize), (0, 3)),
-            ((Direction::Forward, 0usize), (0, 3)),
-            ((Direction::Forward, 999usize), (0, 31)),
-            ((Direction::Forward, 999usize), (0, 31)),
-            ((Direction::Backward, 999usize), (0, 0)),
+            ((Direction::Forward, 1usize), (0, 1)), // T|his is a simple alphabetic line
+            ((Direction::Forward, 2usize), (0, 3)), // Thi|s is a simple alphabetic line
+            ((Direction::Forward, 0usize), (0, 3)), // Thi|s is a simple alphabetic line
+            ((Direction::Forward, 999usize), (0, 32)), // This is a simple alphabetic line|
+            ((Direction::Forward, 999usize), (0, 32)), // This is a simple alphabetic line|
+            ((Direction::Backward, 999usize), (0, 0)), // |This is a simple alphabetic line
        ];

        for ((direction, amount), coordinates) in IntoIter::new(moves_and_expected_coordinates) {
@@ -366,15 +317,15 @@ mod test {
        let mut range = Range::point(position);

        let moves_and_expected_coordinates = IntoIter::new([
-            ((Direction::Forward, 1usize), (0, 1)),    // M_ltiline
-            ((Direction::Forward, 2usize), (0, 3)),    // Mul_iline
-            ((Direction::Backward, 6usize), (0, 0)),   // _ultiline
-            ((Direction::Backward, 999usize), (0, 0)), // _ultiline
-            ((Direction::Forward, 3usize), (0, 3)),    // Mul_iline
-            ((Direction::Forward, 0usize), (0, 3)),    // Mul_iline
-            ((Direction::Backward, 0usize), (0, 3)),   // Mul_iline
-            ((Direction::Forward, 999usize), (0, 9)),  // Multilin_
-            ((Direction::Forward, 999usize), (0, 9)),  // Multilin_
+            ((Direction::Forward, 1usize), (0, 1)),    // M|ultiline\n
+            ((Direction::Forward, 2usize), (0, 3)),    // Mul|tiline\n
+            ((Direction::Backward, 6usize), (0, 0)),   // |Multiline\n
+            ((Direction::Backward, 999usize), (0, 0)), // |Multiline\n
+            ((Direction::Forward, 3usize), (0, 3)),    // Mul|tiline\n
+            ((Direction::Forward, 0usize), (0, 3)),    // Mul|tiline\n
+            ((Direction::Backward, 0usize), (0, 3)),   // Mul|tiline\n
+            ((Direction::Forward, 999usize), (0, 9)),  // Multiline|\n
+            ((Direction::Forward, 999usize), (0, 9)),  // Multiline|\n
        ]);

        for ((direction, amount), coordinates) in moves_and_expected_coordinates {
@@ -446,7 +397,7 @@ mod test {
            // First descent preserves column as the target line is wider
            ((Axis::V, Direction::Forward, 1usize), (1, 8)),
            // Second descent clamps column as the target line is shorter
-            ((Axis::V, Direction::Forward, 1usize), (2, 4)),
+            ((Axis::V, Direction::Forward, 1usize), (2, 5)),
            // Third descent restores the original column
            ((Axis::V, Direction::Forward, 1usize), (3, 8)),
            // Behaviour is preserved even through long jumps
@@ -760,45 +711,4 @@ mod test {
            }
        }
    }
-
-    #[test]
-    fn test_categorize() {
-        const WORD_TEST_CASE: &'static str =
-            "_hello_world_あいうえおー1234567890１２３４５６７８９０";
-        const PUNCTUATION_TEST_CASE: &'static str =
-            "!\"#$%&\'()*+,-./:;<=>?@[\\]^`{|}~！”＃＄％＆’（）＊＋、。：；＜＝＞？＠「」＾｀｛｜｝～";
-        const WHITESPACE_TEST_CASE: &'static str = "  　   ";
-
-        assert_eq!(Category::Eol, categorize('\n'));
-
-        for ch in WHITESPACE_TEST_CASE.chars() {
-            assert_eq!(
-                Category::Whitespace,
-                categorize(ch),
-                "Testing '{}', but got `{:?}` instead of `Category::Whitespace`",
-                ch,
-                categorize(ch)
-            );
-        }
-
-        for ch in WORD_TEST_CASE.chars() {
-            assert_eq!(
-                Category::Word,
-                categorize(ch),
-                "Testing '{}', but got `{:?}` instead of `Category::Word`",
-                ch,
-                categorize(ch)
-            );
-        }
-
-        for ch in PUNCTUATION_TEST_CASE.chars() {
-            assert_eq!(
-                Category::Punctuation,
-                categorize(ch),
-                "Testing '{}', but got `{:?}` instead of `Category::Punctuation`",
-                ch,
-                categorize(ch)
-            );
-        }
-    }
 }
--- a/helix-core/src/position.rs
+++ b/helix-core/src/position.rs
@@ -1,4 +1,5 @@
 use crate::{
+    chars::char_is_line_ending,
    graphemes::{nth_next_grapheme_boundary, RopeGraphemes},
    Rope, RopeSlice,
 };
@@ -23,8 +24,9 @@ impl Position {
    pub fn traverse(self, text: &crate::Tendril) -> Self {
        let Self { mut row, mut col } = self;
        // TODO: there should be a better way here
-        for ch in text.chars() {
-            if ch == '\n' {
+        let mut chars = text.chars().peekable();
+        while let Some(ch) = chars.next() {
+            if char_is_line_ending(ch) && !(ch == '\r' && chars.peek() == Some(&'\n')) {
                row += 1;
                col = 0;
            } else {
--- a/helix-core/src/syntax.rs
+++ b/helix-core/src/syntax.rs
@@ -1,4 +1,4 @@
-use crate::{regex::Regex, Change, Rope, RopeSlice, Transaction};
+use crate::{chars::char_is_line_ending, regex::Regex, Change, Rope, RopeSlice, Transaction};
 pub use helix_syntax::{get_language, get_language_name, Lang};

 use arc_swap::ArcSwap;
@@ -589,9 +589,10 @@ impl LanguageLayer {
                mut column,
            } = point;

-            // TODO: there should be a better way here
-            for ch in text.bytes() {
-                if ch == b'\n' {
+            // TODO: there should be a better way here.
+            let mut chars = text.chars().peekable();
+            while let Some(ch) = chars.next() {
+                if char_is_line_ending(ch) && !(ch == '\r' && chars.peek() == Some(&'\n')) {
                    row += 1;
                    column = 0;
                } else {
--- a/helix-lsp/src/client.rs
+++ b/helix-lsp/src/client.rs
@@ -3,7 +3,7 @@ use crate::{
    Call, Error, OffsetEncoding, Result,
 };

-use helix_core::{find_root, ChangeSet, Rope};
+use helix_core::{chars::char_is_line_ending, find_root, ChangeSet, Rope};
 use jsonrpc_core as jsonrpc;
 use lsp_types as lsp;
 use serde_json::Value;
@@ -337,8 +337,9 @@ impl Client {
                mut character,
            } = pos;

-            for ch in text.chars() {
-                if ch == '\n' {
+            let mut chars = text.chars().peekable();
+            while let Some(ch) = chars.next() {
+                if char_is_line_ending(ch) && !(ch == '\r' && chars.peek() == Some(&'\n')) {
                    line += 1;
                    character = 0;
                } else {
--- a/helix-term/src/commands.rs
+++ b/helix-term/src/commands.rs
@@ -1,12 +1,15 @@
 use helix_core::{
    comment, coords_at_pos, find_first_non_whitespace_char, find_root, graphemes, indent,
+    line_ending::{
+        get_line_ending, get_line_ending_of_str, line_end_char_index, str_is_line_ending,
+    },
    match_brackets,
    movement::{self, Direction},
    object, pos_at_coords,
    regex::{self, Regex},
    register::{self, Register, Registers},
-    search, selection, Change, ChangeSet, Position, Range, Rope, RopeSlice, Selection, SmallVec,
-    Tendril, Transaction,
+    search, selection, Change, ChangeSet, LineEnding, Position, Range, Rope, RopeGraphemes,
+    RopeSlice, Selection, SmallVec, Tendril, Transaction, DEFAULT_LINE_ENDING,
 };

 use helix_view::{
@@ -303,9 +306,8 @@ fn move_line_end(cx: &mut Context) {
        let text = doc.text();
        let line = text.char_to_line(range.head);

-        // Line end is pos at the start of next line - 1
-        // subtract another 1 because the line ends with \n
-        let pos = text.line_to_char(line + 1).saturating_sub(2);
+        let pos = line_end_char_index(&text.slice(..), line);
+
        Range::new(pos, pos)
    });

@@ -452,12 +454,28 @@ where
    let count = cx.count();

    // need to wait for next key
+    // TODO: should this be done by grapheme rather than char?  For example,
+    // we can't properly handle the line-ending CRLF case here in terms of char.
    cx.on_next_key(move |cx, event| {
        let ch = match event {
            KeyEvent {
                code: KeyCode::Enter,
                ..
-            } => '\n',
+            } =>
+            // TODO: this isn't quite correct when CRLF is involved.
+            // This hack will work in most cases, since documents don't
+            // usually mix line endings.  But we should fix it eventually
+            // anyway.
+            {
+                current!(cx.editor)
+                    .1
+                    .line_ending
+                    .as_str()
+                    .chars()
+                    .next()
+                    .unwrap()
+            }
+
            KeyEvent {
                code: KeyCode::Char(ch),
                ..
@@ -575,32 +593,37 @@ fn extend_first_nonwhitespace(cx: &mut Context) {
 }

 fn replace(cx: &mut Context) {
+    let mut buf = [0u8; 4]; // To hold utf8 encoded char.
+
    // need to wait for next key
    cx.on_next_key(move |cx, event| {
+        let (view, doc) = current!(cx.editor);
        let ch = match event {
            KeyEvent {
                code: KeyCode::Char(ch),
                ..
-            } => Some(ch),
+            } => Some(&ch.encode_utf8(&mut buf[..])[..]),
            KeyEvent {
                code: KeyCode::Enter,
                ..
-            } => Some('\n'),
+            } => Some(doc.line_ending.as_str()),
            _ => None,
        };

        if let Some(ch) = ch {
-            let (view, doc) = current!(cx.editor);
-
            let transaction =
                Transaction::change_by_selection(doc.text(), doc.selection(view.id), |range| {
                    let max_to = doc.text().len_chars().saturating_sub(1);
                    let to = std::cmp::min(max_to, range.to() + 1);
-                    let text: String = doc
-                        .text()
-                        .slice(range.from()..to)
-                        .chars()
-                        .map(|c| if c == '\n' { '\n' } else { ch })
+                    let text: String = RopeGraphemes::new(doc.text().slice(range.from()..to))
+                        .map(|g| {
+                            let cow: Cow<str> = g.into();
+                            if str_is_line_ending(&cow) {
+                                cow
+                            } else {
+                                ch.into()
+                            }
+                        })
                        .collect();

                    (range.from(), to, Some(text.into()))
@ -725,9 +748,8 @@ fn extend_line_end(cx: &mut Context) {
        let text = doc.text();
        let line = text.char_to_line(range.head);

-        // Line end is pos at the start of next line - 1
-        // subtract another 1 because the line ends with \n
-        let pos = text.line_to_char(line + 1).saturating_sub(2);
+        let pos = line_end_char_index(&text.slice(..), line);
+
        Range::new(range.anchor, pos)
    });

@ -783,7 +805,8 @@ fn split_selection_on_newline(cx: &mut Context) {
    let text = doc.text().slice(..);
    // only compile the regex once
    #[allow(clippy::trivial_regex)]
-    static REGEX: Lazy<Regex> = Lazy::new(|| Regex::new(r"\n").unwrap());
+    static REGEX: Lazy<Regex> =
+        Lazy::new(|| Regex::new(r"\r\n|[\n\r\u{000B}\u{000C}\u{0085}\u{2028}\u{2029}]").unwrap());
    let selection = selection::split_on_matches(text, doc.selection(view.id), &REGEX);
    doc.set_selection(view.id, selection);
 }
@ -922,7 +945,13 @@ fn delete_selection_impl(reg: &mut Register, doc: &mut Document, view_id: ViewId
    // then delete
    let transaction =
        Transaction::change_by_selection(doc.text(), doc.selection(view_id), |range| {
-            let max_to = doc.text().len_chars().saturating_sub(1);
+            let alltext = doc.text();
+            let line = alltext.char_to_line(range.head);
+            let max_to = doc.text().len_chars().saturating_sub(
+                get_line_ending(&alltext.line(line))
+                    .map(|le| le.len_chars())
+                    .unwrap_or(0),
+            );
            let to = std::cmp::min(max_to, range.to() + 1);
            (range.from(), to, None)
        });
@ -1003,7 +1032,7 @@ fn append_mode(cx: &mut Context) {
    if selection.iter().any(|range| range.head == end) {
        let transaction = Transaction::change(
            doc.text(),
-            std::array::IntoIter::new([(end, end, Some(Tendril::from_char('\n')))]),
+            std::array::IntoIter::new([(end, end, Some(doc.line_ending.as_str().into()))]),
        );
        doc.apply(&transaction, view.id);
    }
@ -1131,6 +1160,45 @@ mod cmd {
        }
    }

+    /// Typable command: with no argument, shows the current document's line
+    /// ending in the status line; with one, makes it the document's default.
+    ///
+    /// The argument is matched case-insensitively and may be any prefix of
+    /// `cr`, `crlf`, `lf`, `ff` or `nel`; anything else is reported as an error.
+    fn set_line_ending(editor: &mut Editor, args: &[&str], event: PromptEvent) {
+        use LineEnding::*;
+
+        let requested = match args.first() {
+            Some(arg) => arg.to_lowercase(),
+            None => {
+                // Nothing to parse: just report what the document uses now.
+                let current = current!(editor).1.line_ending;
+                let label = match current {
+                    Crlf => "crlf",
+                    LF => "line feed",
+                    FF => "form feed",
+                    CR => "carriage return",
+                    Nel => "next line",
+
+                    // A document's default line ending should never be one of these.
+                    VT | LS | PS => "error",
+                };
+                editor.set_status(label.into());
+                return;
+            }
+        };
+
+        // Candidates are tried in order; `cr` is listed ahead of `crlf` because
+        // the two share a prefix and the first hit wins.
+        let candidates = [("cr", CR), ("crlf", Crlf), ("lf", LF), ("ff", FF), ("nel", Nel)];
+        let parsed = candidates
+            .iter()
+            .find(|(name, _)| name.starts_with(requested.as_str()))
+            .map(|&(_, le)| le);
+
+        match parsed {
+            Some(le) => {
+                doc_mut!(editor).line_ending = le;
+            }
+            None => {
+                // Invalid argument: echo it back exactly as typed.
+                editor.set_error(format!("invalid line ending '{}'", args[0]));
+            }
+        }
+    }
+
    fn earlier(editor: &mut Editor, args: &[&str], event: PromptEvent) {
        let uk = match args.join(" ").parse::<helix_core::history::UndoKind>() {
            Ok(uk) => uk,
@ -1274,7 +1342,11 @@ mod cmd {
    }

    fn yank_joined_to_clipboard(editor: &mut Editor, args: &[&str], _: PromptEvent) {
-        let separator = args.first().copied().unwrap_or("\n");
+        let (_, doc) = current!(editor); // only the document half of the pair is used here
+        let separator = args
+            .first() // an explicit first argument, when given, is the separator
+            .copied()
+            .unwrap_or_else(|| doc.line_ending.as_str()); // otherwise fall back to the document's line ending
        yank_joined_to_clipboard_impl(editor, separator);
    }

@ -1359,6 +1431,13 @@ mod cmd {
            fun: set_indent_style,
            completer: None,
        },
+        TypableCommand {
+            name: "line-ending",
+            alias: None,
+            doc: "Set the document's default line ending. Options: crlf, lf, cr, ff, nel.",
+            fun: set_line_ending,
+            completer: None,
+        },
        TypableCommand {
            name: "earlier",
            alias: Some("ear"),
@ -1683,8 +1762,7 @@ fn append_to_line(cx: &mut Context) {
    let selection = doc.selection(view.id).transform(|range| {
        let text = doc.text();
        let line = text.char_to_line(range.head);
-        // we can't use line_to_char(line + 1) - 2 because the last line might not contain \n
-        let pos = (text.line_to_char(line) + text.line(line).len_chars()).saturating_sub(1);
+        let pos = line_end_char_index(&text.slice(..), line);
        Range::new(pos, pos)
    });
    doc.set_selection(view.id, selection);
@ -1731,7 +1809,7 @@ fn open(cx: &mut Context, open: Open) {
        let indent = doc.indent_unit().repeat(indent_level);
        let indent_len = indent.len();
        let mut text = String::with_capacity(1 + indent_len);
-        text.push('\n');
+        text.push_str(doc.line_ending.as_str());
        text.push_str(&indent);
        let text = text.repeat(count);

@ -2344,7 +2422,7 @@ pub mod insert {
            );
            let indent = doc.indent_unit().repeat(indent_level);
            let mut text = String::with_capacity(1 + indent.len());
-            text.push('\n');
+            text.push_str(doc.line_ending.as_str());
            text.push_str(&indent);

            let head = pos + offs + text.chars().count();
@ -2365,7 +2443,7 @@ pub mod insert {
            if helix_core::auto_pairs::PAIRS.contains(&(prev, curr)) {
                // another newline, indent the end bracket one level less
                let indent = doc.indent_unit().repeat(indent_level.saturating_sub(1));
-                text.push('\n');
+                text.push_str(doc.line_ending.as_str());
                text.push_str(&indent);
            }

@ -2488,7 +2566,8 @@ fn yank_joined_to_clipboard_impl(editor: &mut Editor, separator: &str) {
 }

 fn yank_joined_to_clipboard(cx: &mut Context) {
-    yank_joined_to_clipboard_impl(&mut cx.editor, "\n");
+    let line_ending = current!(cx.editor).1.line_ending; // separator is now the current document's line ending, not a fixed "\n"
+    yank_joined_to_clipboard_impl(&mut cx.editor, line_ending.as_str());
 }

 fn yank_main_selection_to_clipboard_impl(editor: &mut Editor) {
@ -2529,8 +2608,10 @@ fn paste_impl(
            .unwrap(),
    );

-    // if any of values ends \n it's linewise paste
-    let linewise = values.iter().any(|value| value.ends_with('\n'));
+    // if any of values ends with a line ending, it's linewise paste
+    let linewise = values
+        .iter()
+        .any(|value| get_line_ending_of_str(value).is_some());

    let mut values = values.iter().cloned().map(Tendril::from).chain(repeat);

--- a/helix-term/src/ui/editor.rs
+++ b/helix-term/src/ui/editor.rs
@ -9,7 +9,7 @@ use crate::{
 use helix_core::{
    coords_at_pos,
    syntax::{self, HighlightEvent},
-    Position, Range,
+    LineEnding, Position, Range,
 };
 use helix_lsp::LspProgressMap;
 use helix_view::{document::Mode, Document, Editor, Theme, View};
@ -184,7 +184,7 @@ impl EditorView {

                    // iterate over range char by char
                    for grapheme in RopeGraphemes::new(text) {
-                        if grapheme == "\n" {
+                        if LineEnding::from_rope_slice(&grapheme).is_some() {
                            visual_x = 0;
                            line += 1;

--- a/helix-term/src/ui/markdown.rs
+++ b/helix-term/src/ui/markdown.rs
@ -115,6 +115,8 @@ fn parse<'a>(
                                        // TODO: replace tabs with indentation

                                        let mut slice = &text[start..end];
+                                        // TODO: do we need to handle all unicode line endings
+                                        // here, or is just '\n' okay?
                                        while let Some(end) = slice.find('\n') {
                                            // emit span up to newline
                                            let text = &slice[..end];
--- a/helix-tui/Cargo.toml
+++ b/helix-tui/Cargo.toml
@ -22,3 +22,4 @@ unicode-segmentation = "1.2"
 unicode-width = "0.1"
 crossterm = { version = "0.20", optional = true }
 serde = { version = "1", "optional" = true, features = ["derive"]}
+helix-core = { version = "0.2", path = "../helix-core" }
--- a/helix-tui/src/text.rs
+++ b/helix-tui/src/text.rs
@ -47,6 +47,7 @@
 //! ]);
 //! ```
 use crate::style::Style;
+use helix_core::line_ending::str_is_line_ending;
 use std::borrow::Cow;
 use unicode_segmentation::UnicodeSegmentation;
 use unicode_width::UnicodeWidthStr;
@ -177,7 +178,7 @@ impl<'a> Span<'a> {
                symbol: g,
                style: base_style.patch(self.style),
            })
-            .filter(|s| s.symbol != "\n")
+            .filter(|s| !str_is_line_ending(s.symbol))
    }
 }

--- a/helix-tui/src/widgets/reflow.rs
+++ b/helix-tui/src/widgets/reflow.rs
@ -1,4 +1,5 @@
 use crate::text::StyledGrapheme;
+use helix_core::line_ending::str_is_line_ending;
 use unicode_segmentation::UnicodeSegmentation;
 use unicode_width::UnicodeWidthStr;

@ -62,13 +63,13 @@ impl<'a, 'b> LineComposer<'a> for WordWrapper<'a, 'b> {
            // Ignore characters wider that the total max width.
            if symbol.width() as u16 > self.max_line_width
                // Skip leading whitespace when trim is enabled.
-                || self.trim && symbol_whitespace && symbol != "\n" && current_line_width == 0
+                || self.trim && symbol_whitespace && !str_is_line_ending(symbol) && current_line_width == 0
            {
                continue;
            }

            // Break on newline and discard it.
-            if symbol == "\n" {
+            if str_is_line_ending(symbol) {
                if prev_whitespace {
                    current_line_width = width_to_last_word_end;
                    self.current_line.truncate(symbols_to_last_word_end);
@ -170,7 +171,7 @@ impl<'a, 'b> LineComposer<'a> for LineTruncator<'a, 'b> {
            }

            // Break on newline and discard it.
-            if symbol == "\n" {
+            if str_is_line_ending(symbol) {
                break;
            }

@ -199,7 +200,7 @@ impl<'a, 'b> LineComposer<'a> for LineTruncator<'a, 'b> {

        if skip_rest {
            for StyledGrapheme { symbol, .. } in &mut self.symbols {
-                if symbol == "\n" {
+                if str_is_line_ending(symbol) {
                    break;
                }
            }
--- a/helix-view/src/document.rs
+++ b/helix-view/src/document.rs
@ -7,10 +7,12 @@ use std::str::FromStr;
 use std::sync::Arc;

 use helix_core::{
-    chars::{char_is_linebreak, char_is_whitespace},
+    chars::{char_is_line_ending, char_is_whitespace},
    history::History,
+    line_ending::auto_detect_line_ending,
    syntax::{self, LanguageConfiguration},
-    ChangeSet, Diagnostic, Rope, Selection, State, Syntax, Transaction,
+    ChangeSet, Diagnostic, LineEnding, Rope, Selection, State, Syntax, Transaction,
+    DEFAULT_LINE_ENDING,
 };

 use crate::{DocumentId, Theme, ViewId};
@ -45,6 +47,9 @@ pub struct Document {
    /// Current indent style.
    pub indent_style: IndentStyle,

+    /// The document's default line ending.
+    pub line_ending: LineEnding,
+
    syntax: Option<Syntax>,
    // /// Corresponding language scope name. Usually `source.<lang>`.
    pub(crate) language: Option<Arc<LanguageConfiguration>>,
@ -232,6 +237,7 @@ impl Document {
            history: Cell::new(History::default()),
            last_saved_revision: 0,
            language_server: None,
+            line_ending: DEFAULT_LINE_ENDING,
        }
    }

@ -243,22 +249,26 @@ impl Document {
    ) -> Result<Self, Error> {
        use std::{fs::File, io::BufReader};

-        let doc = if !path.exists() {
-            Rope::from("\n")
+        let mut doc = if !path.exists() {
+            Rope::from(DEFAULT_LINE_ENDING.as_str())
        } else {
            let file = File::open(&path).context(format!("unable to open {:?}", path))?;
-            let mut doc = Rope::from_reader(BufReader::new(file))?;
+            Rope::from_reader(BufReader::new(file))?
+        };
+
+        // search for line endings
+        let line_ending = auto_detect_line_ending(&doc).unwrap_or(DEFAULT_LINE_ENDING);
+
        // add missing newline at the end of file
+        // Append only when the buffer is empty or its last char is NOT already a
+        // line ending; the `len_bytes() == 0` check also guards the `len_chars() - 1`.
-            if doc.len_bytes() == 0 || doc.byte(doc.len_bytes() - 1) != b'\n' {
-                doc.insert_char(doc.len_chars(), '\n');
+        if doc.len_bytes() == 0 || !char_is_line_ending(doc.char(doc.len_chars() - 1)) {
+            doc.insert(doc.len_chars(), line_ending.as_str());
        }
-            doc
-        };

        let mut doc = Self::new(doc);
        // set the path and try detecting the language
        doc.set_path(&path)?;
        doc.detect_indent_style();
+        doc.line_ending = line_ending;

        if let Some(loader) = config_loader {
            doc.detect_language(theme, loader);
@ -366,7 +376,7 @@ impl Document {
                    Some(' ') => false,

                    // Ignore blank lines.
-                    Some(c) if char_is_linebreak(c) => continue,
+                    Some(c) if char_is_line_ending(c) => continue,

                    _ => {
                        prev_line_is_tabs = false;
@ -390,7 +400,7 @@ impl Document {
                        c if char_is_whitespace(c) => count_is_done = true,

                        // Ignore blank lines.
-                        c if char_is_linebreak(c) => continue 'outer,
+                        c if char_is_line_ending(c) => continue 'outer,

                        _ => break,
                    }
--- a/helix-view/src/editor.rs
+++ b/helix-view/src/editor.rs
@ -15,10 +15,9 @@ use slotmap::SlotMap;

 use anyhow::Error;

-use helix_core::Position;
-
 pub use helix_core::diagnostic::Severity;
 pub use helix_core::register::Registers;
+use helix_core::{Position, DEFAULT_LINE_ENDING};

 #[derive(Debug)]
 pub struct Editor {
@ -173,7 +172,7 @@ impl Editor {

    pub fn new_file(&mut self, action: Action) -> DocumentId {
        use helix_core::Rope;
-        let doc = Document::new(Rope::from("\n"));
+        let doc = Document::new(Rope::from(DEFAULT_LINE_ENDING.as_str()));
        let id = self.documents.insert(doc);
        self.documents[id].id = id;
        self.switch(id, action);