From 7ebcf4e919a27b60610ba8f3a24a213282eb9ee6 Mon Sep 17 00:00:00 2001 From: Pascal Kuthe Date: Thu, 9 Feb 2023 08:19:29 +0100 Subject: [PATCH] properly handle LSP position encoding (#5711) * properly handle LSP position encoding * add debug assertion to Transaction::change * Apply suggestions from code review Co-authored-by: Michael Davis --------- Co-authored-by: Michael Davis --- helix-core/src/line_ending.rs | 7 +++ helix-core/src/transaction.rs | 5 ++ helix-lsp/src/client.rs | 2 +- helix-lsp/src/lib.rs | 112 ++++++++++++++++++++++++++++------ 4 files changed, 107 insertions(+), 19 deletions(-) diff --git a/helix-core/src/line_ending.rs b/helix-core/src/line_ending.rs index 09e925230..953d567d5 100644 --- a/helix-core/src/line_ending.rs +++ b/helix-core/src/line_ending.rs @@ -203,6 +203,13 @@ pub fn line_end_char_index(slice: &RopeSlice, line: usize) -> usize { .unwrap_or(0) } +pub fn line_end_byte_index(slice: &RopeSlice, line: usize) -> usize { + slice.line_to_byte(line + 1) + - get_line_ending(&slice.line(line)) + .map(|le| le.as_str().len()) + .unwrap_or(0) +} + /// Fetches line `line_idx` from the passed rope slice, sans any line ending. pub fn line_without_line_ending<'a>(slice: &'a RopeSlice, line_idx: usize) -> RopeSlice<'a> { let start = slice.line_to_char(line_idx); diff --git a/helix-core/src/transaction.rs b/helix-core/src/transaction.rs index 482fd6d97..d2f4de07d 100644 --- a/helix-core/src/transaction.rs +++ b/helix-core/src/transaction.rs @@ -481,6 +481,11 @@ impl Transaction { for (from, to, tendril) in changes { // Verify ranges are ordered and not overlapping debug_assert!(last <= from); + // Verify ranges are correct + debug_assert!( + from <= to, + "Edit end must end before it starts (should {from} <= {to})" + ); // Retain from last "to" to current "from" changeset.retain(from - last); diff --git a/helix-lsp/src/client.rs b/helix-lsp/src/client.rs index 6827f568d..365c69901 100644 --- a/helix-lsp/src/client.rs +++ b/helix-lsp/src/client.rs @@ -104,7 +104,7 @@ impl Client { server_tx, request_counter: AtomicU64::new(0), capabilities: OnceCell::new(), - offset_encoding: OffsetEncoding::Utf8, + offset_encoding: OffsetEncoding::Utf16, config, req_timeout, diff --git a/helix-lsp/src/lib.rs b/helix-lsp/src/lib.rs index 8418896cb..bcaff10a9 100644 --- a/helix-lsp/src/lib.rs +++ b/helix-lsp/src/lib.rs @@ -57,6 +57,7 @@ pub enum OffsetEncoding { pub mod util { use super::*; + use helix_core::line_ending::{line_end_byte_index, line_end_char_index}; use helix_core::{diagnostic::NumberOrString, Range, Rope, Selection, Tendril, Transaction}; /// Converts a diagnostic in the document to [`lsp::Diagnostic`]. @@ -117,7 +118,7 @@ pub mod util { /// Converts [`lsp::Position`] to a position in the document. /// - /// Returns `None` if position exceeds document length or an operation overflows. + /// Returns `None` if position.line is out of bounds or an overflow occurs pub fn lsp_pos_to_pos( doc: &Rope, pos: lsp::Position, @@ -128,22 +129,58 @@ pub mod util { return None; } - match offset_encoding { + // We need to be careful here to fully comply ith the LSP spec. + // Two relevant quotes from the spec: + // + // https://microsoft.github.io/language-server-protocol/specifications/lsp/3.17/specification/#position + // > If the character value is greater than the line length it defaults back + // > to the line length. + // + // https://microsoft.github.io/language-server-protocol/specifications/lsp/3.17/specification/#textDocuments + // > To ensure that both client and server split the string into the same + // > line representation the protocol specifies the following end-of-line sequences: + // > ‘\n’, ‘\r\n’ and ‘\r’. Positions are line end character agnostic. + // > So you can not specify a position that denotes \r|\n or \n| where | represents the character offset. + // + // This means that while the line must be in bounds the `charater` + // must be capped to the end of the line. + // Note that the end of the line here is **before** the line terminator + // so we must use `line_end_char_index` istead of `doc.line_to_char(pos_line + 1)` + // + // FIXME: Helix does not fully comply with the LSP spec for line terminators. + // The LSP standard requires that line terminators are ['\n', '\r\n', '\r']. + // Without the unicode-linebreak feature disabled, the `\r` terminator is not handled by helix. + // With the unicode-linebreak feature, helix recognizes multiple extra line break chars + // which means that positions will be decoded/encoded incorrectly in their presence + + let line = match offset_encoding { OffsetEncoding::Utf8 => { - let line = doc.line_to_char(pos_line); - let pos = line.checked_add(pos.character as usize)?; - if pos <= doc.len_chars() { - Some(pos) - } else { - None - } + let line_start = doc.line_to_byte(pos_line); + let line_end = line_end_byte_index(&doc.slice(..), pos_line); + line_start..line_end } OffsetEncoding::Utf16 => { - let line = doc.line_to_char(pos_line); - let line_start = doc.char_to_utf16_cu(line); - let pos = line_start.checked_add(pos.character as usize)?; - doc.try_utf16_cu_to_char(pos).ok() + // TODO directly translate line index to char-idx + // ropey can do this just as easily as utf-8 byte translation + // but the functions are just missing. + // Translate to char first and then utf-16 as a workaround + let line_start = doc.line_to_char(pos_line); + let line_end = line_end_char_index(&doc.slice(..), pos_line); + doc.char_to_utf16_cu(line_start)..doc.char_to_utf16_cu(line_end) } + }; + + // The LSP spec demands that the offset is capped to the end of the line + let pos = line + .start + .checked_add(pos.character as usize) + .unwrap_or(line.end) + .min(line.end); + + // TODO prefer UTF32/char indices to avoid this step + match offset_encoding { + OffsetEncoding::Utf8 => doc.try_byte_to_char(pos).ok(), + OffsetEncoding::Utf16 => doc.try_utf16_cu_to_char(pos).ok(), } } @@ -158,8 +195,8 @@ pub mod util { match offset_encoding { OffsetEncoding::Utf8 => { let line = doc.char_to_line(pos); - let line_start = doc.line_to_char(line); - let col = pos - line_start; + let line_start = doc.line_to_byte(line); + let col = doc.char_to_byte(pos) - line_start; lsp::Position::new(line as u32, col as u32) } @@ -606,16 +643,55 @@ mod tests { } test_case!("", (0, 0) => Some(0)); - test_case!("", (0, 1) => None); + test_case!("", (0, 1) => Some(0)); test_case!("", (1, 0) => None); test_case!("\n\n", (0, 0) => Some(0)); test_case!("\n\n", (1, 0) => Some(1)); - test_case!("\n\n", (1, 1) => Some(2)); + test_case!("\n\n", (1, 1) => Some(1)); test_case!("\n\n", (2, 0) => Some(2)); test_case!("\n\n", (3, 0) => None); test_case!("test\n\n\n\ncase", (4, 3) => Some(11)); test_case!("test\n\n\n\ncase", (4, 4) => Some(12)); - test_case!("test\n\n\n\ncase", (4, 5) => None); + test_case!("test\n\n\n\ncase", (4, 5) => Some(12)); test_case!("", (u32::MAX, u32::MAX) => None); } + + #[test] + fn emoji_format_gh_4791() { + use lsp_types::{Position, Range, TextEdit}; + + let edits = vec![ + TextEdit { + range: Range { + start: Position { + line: 0, + character: 1, + }, + end: Position { + line: 1, + character: 0, + }, + }, + new_text: "\n ".to_string(), + }, + TextEdit { + range: Range { + start: Position { + line: 1, + character: 7, + }, + end: Position { + line: 2, + character: 0, + }, + }, + new_text: "\n ".to_string(), + }, + ]; + + let mut source = Rope::from_str("[\n\"🇺🇸\",\n\"🎄\",\n]"); + + let transaction = generate_transaction_from_edits(&source, edits, OffsetEncoding::Utf8); + assert!(transaction.apply(&mut source)); + } }