From ba026aaab43eb6e5ea6a6933943f167666afd0ee Mon Sep 17 00:00:00 2001 From: Rolo Date: Fri, 12 Jul 2024 01:40:18 -0700 Subject: [PATCH] refactor(shellwords): change arg handling strategy refactor: no longer special case for bracket lists refactor: no longer special case end space This was a holdover from before the `raw` function was added to `Args`. perf: remove `bytes` field to save 16 bytes From 56 bytes to 40, saving 16 bytes. perf: move `in_quotes` field to local variable perf: move `quote` field to local variable refactor: remove `is_finished` state from `Args` test: change example command to `read` `yank-join` now uses `raw` and thus would not be parsed with the `next` function so it is no longer applicable. refactor: remove unneeded range end for index refactor: remove backtracking escape check Instead, it can be tracked as the parser scans through the first time. refactor: clean up code and add more comments --- helix-core/src/shellwords.rs | 905 ++++++++++++++++++++++--------- helix-term/src/commands/typed.rs | 3 +- 2 files changed, 654 insertions(+), 254 deletions(-) diff --git a/helix-core/src/shellwords.rs b/helix-core/src/shellwords.rs index 9d873c366..55eedfd68 100644 --- a/helix-core/src/shellwords.rs +++ b/helix-core/src/shellwords.rs @@ -1,6 +1,329 @@ +use smartstring::{LazyCompact, SmartString}; use std::borrow::Cow; +/// A utility for parsing shell-like command lines. +/// +/// The `Shellwords` struct takes an input string and allows extracting the command and its arguments. +/// +/// # Features +/// +/// - Parses command and arguments from input strings. +/// - Supports single, double, and backtick quoted arguments. +/// - Respects backslash escaping in arguments. 
+/// +/// # Examples +/// +/// Basic usage: +/// +/// ``` +/// # use helix_core::shellwords::Shellwords; +/// let shellwords = Shellwords::from(":o helix-core/src/shellwords.rs"); +/// assert_eq!(":o", shellwords.command()); +/// assert_eq!("helix-core/src/shellwords.rs", shellwords.args().next().unwrap()); +/// ``` +/// +/// Empty command: +/// +/// ``` +/// # use helix_core::shellwords::Shellwords; +/// let shellwords = Shellwords::from(" "); +/// assert!(shellwords.command().is_empty()); +/// ``` +/// +/// # Iterator +/// +/// The `args` method returns a non-allocating iterator, `Args`, over the arguments of the input. +/// +/// ``` +/// # use helix_core::shellwords::Shellwords; +/// let shellwords = Shellwords::from(":o a b c"); +/// let mut args = shellwords.args(); +/// assert_eq!(Some("a"), args.next()); +/// assert_eq!(Some("b"), args.next()); +/// assert_eq!(Some("c"), args.next()); +/// assert_eq!(None, args.next()); +/// ``` +#[derive(Clone, Copy)] +pub struct Shellwords<'a> { + input: &'a str, +} + +impl<'a> From<&'a str> for Shellwords<'a> { + #[inline] + fn from(input: &'a str) -> Self { + Self { input } + } +} + +impl<'a> From<&'a String> for Shellwords<'a> { + #[inline] + fn from(input: &'a String) -> Self { + Self { input } + } +} + +impl<'a> From<&'a Cow<'a, str>> for Shellwords<'a> { + #[inline] + fn from(input: &'a Cow<str>) -> Self { + Self { input } + } +} + +impl<'a> Shellwords<'a> { + #[inline] + #[must_use] + pub fn command(&self) -> &str { + self.input + .split_once(' ') + .map_or(self.input, |(command, _)| command) + } + + #[inline] + #[must_use] + pub fn args(&self) -> Args<'a> { + let args = self.input.split_once(' ').map_or("", |(_, args)| args); + Args::parse(args) + } + + #[inline] + pub fn input(&self) -> &str { + self.input + } + + /// Checks whether the input ends with a space character, whether or not that space is escaped. 
+ /// + /// # Examples + /// + /// ```rust + /// # use helix_core::shellwords::Shellwords; + /// assert_eq!(Shellwords::from(" ").ends_with_whitespace(), true); + /// assert_eq!(Shellwords::from(":open ").ends_with_whitespace(), true); + /// assert_eq!(Shellwords::from(":open foo.txt ").ends_with_whitespace(), true); + /// assert_eq!(Shellwords::from(":open").ends_with_whitespace(), false); + /// assert_eq!(Shellwords::from(":open a\\ ").ends_with_whitespace(), true); + /// assert_eq!(Shellwords::from(":open a\\ b.txt").ends_with_whitespace(), false); + /// ``` + #[inline] + pub fn ends_with_whitespace(&self) -> bool { + self.input.ends_with(' ') + } +} + +/// An iterator over an input string which yields arguments. +/// +/// Splits on whitespace, but respects quoted substrings (using double quotes, single quotes, or backticks). +#[derive(Debug, Clone, Copy)] +pub struct Args<'a> { + input: &'a str, + idx: usize, + start: usize, +} + +impl<'a> Args<'a> { + #[inline] + fn parse(input: &'a str) -> Self { + Self { + input, + idx: 0, + start: 0, + } + } + + #[inline] + pub fn is_empty(&self) -> bool { + self.input.is_empty() + } + + /// Returns the args exactly as input. + /// + /// # Examples + /// ``` + /// # use helix_core::shellwords::Args; + /// let args = Args::from(r#"sed -n "s/test t/not /p""#); + /// assert_eq!(r#"sed -n "s/test t/not /p""#, args.raw()); + /// + /// let args = Args::from(r#"cat "file name with space.txt""#); + /// assert_eq!(r#"cat "file name with space.txt""#, args.raw()); + /// ``` + #[inline] + pub fn raw(&self) -> &str { + self.input + } + + /// Returns the remainder of the args exactly as input. + /// + /// # Examples + /// ``` + /// # use helix_core::shellwords::Args; + /// let mut args = Args::from(r#"sed -n "s/test t/not /p""#); + /// assert_eq!("sed", args.next().unwrap()); + /// assert_eq!(r#"-n "s/test t/not /p""#, args.rest()); + /// ``` + /// + /// Never calling `next` and using `rest` is functionally equivalent to calling `raw`. 
+ #[inline] + pub fn rest(&self) -> &str { + &self.input[self.idx..] + } + + /// Convenience function to return an empty `Args`. + /// + /// When used in any iteration, it will always return `None`. + #[inline(always)] + pub const fn empty() -> Self { + Self { + input: "", + idx: 0, + start: 0, + } + } +} + +#[allow(clippy::copy_iterator)] +impl<'a> Iterator for Args<'a> { + type Item = &'a str; + + #[inline] + #[allow(clippy::too_many_lines)] + fn next(&mut self) -> Option<Self::Item> { + // The parser loop is split into three main blocks to handle different types of input processing: + // + // 1. Quote block: + // - Detects an unescaped quote character, either starting an in-quote scan or, if already in-quote, + // locating the closing quote to return the quoted argument. + // - Handles cases where mismatched quotes are ignored and when quotes appear as the last character. + // + // 2. Whitespace block: + // - Handles arguments separated by whitespace (space or tab), respecting quotes so quoted phrases + // remain grouped together. + // - Splits arguments by whitespace when outside of a quoted context and updates boundaries accordingly. + // + // 3. Catch-all block: + // - Handles any other character, updating the `is_escaped` status if a backslash is encountered, + // advancing the loop to the next character. + + let bytes = self.input.as_bytes(); + let mut in_quotes = false; + let mut quote = b'\0'; + let mut is_escaped = false; + + while self.idx < bytes.len() { + match bytes[self.idx] { + b'"' | b'\'' | b'`' if !is_escaped => { + if in_quotes { + // Found the proper closing quote, so can return the arg and advance the state along. + if bytes[self.idx] == quote { + let arg = Some(&self.input[self.start..self.idx]); + self.idx += 1; + self.start = self.idx; + return arg; + } + // If quote does not match the type of the opening quote, then do nothing and advance. 
+ self.idx += 1; + } else if self.idx == bytes.len() - 1 { + // Special case for when a quote is the last input in args. + // e.g: :read "file with space.txt"" + // This preserves the quote as an arg: + // - `file with space.txt` + // - `"` + let arg = Some(&self.input[self.idx..]); + self.idx = bytes.len(); + self.start = bytes.len(); + return arg; + } else { + // Found opening quote. + in_quotes = true; + // Kind of quote that was found. + quote = bytes[self.idx]; + + if self.start < self.idx { + // When part of the input ends in a quote, `one two" three`, this properly returns the `two` + // before advancing to the quoted arg for the next iteration: + // - `one` <- previous arg + // - `two` <- this step + // - ` three` <- next arg + let arg = Some(&self.input[self.start..self.idx]); + self.idx += 1; + self.start = self.idx; + return arg; + } + + // Advance after quote. + self.idx += 1; + // Exclude quote from arg output. + self.start = self.idx; + } + } + b' ' | b'\t' if !in_quotes => { + // Found a true whitespace separator that wasn't inside quotes. + + // Check if there is anything to return or if it's just advancing over whitespace. + // `start` will only be less than `idx` when there is something to return. + if self.start < self.idx { + let arg = Some(&self.input[self.start..self.idx]); + self.idx += 1; + self.start = self.idx; + return arg; + } + + // Advance beyond the whitespace. + self.idx += 1; + + // This is where `start` will be set to the start of an arg boundary, either encountering a word + // boundary or a quote boundary. If it finds a quote, then it will be advanced again in that part + // of the code. Either way, all that remains for the check above will be to return a full arg. + self.start = self.idx; + } + _ => { + // If previous loop didn't find any backslash and was already escaped it will change to false + // as the backslash chain was broken. 
+ // + // If the previous loop had no backslash escape, and found one this iteration, then it's the start + // of an escape chain. + is_escaped = match (is_escaped, bytes[self.idx]) { + (false, b'\\') => true, // Set `is_escaped` if the current byte is a backslash + _ => false, // Reset `is_escaped` if it was true, otherwise keep `is_escaped` as false + }; + + // Advance to the next byte. + self.idx += 1; + } + } + } + + // Fallback that catches when the loop would have exited but failed to return the arg between start and the end. + if self.start < bytes.len() { + let arg = Some(&self.input[self.start..]); + self.start = bytes.len(); + return arg; + } + + // All args have been parsed. + None + } +} + +impl<'a> From<&'a String> for Args<'a> { + fn from(args: &'a String) -> Self { + Args::parse(args) + } +} + +impl<'a> From<&'a str> for Args<'a> { + fn from(args: &'a str) -> Self { + Args::parse(args) + } +} + +impl<'a> From<&'a Cow<'_, str>> for Args<'a> { + fn from(args: &'a Cow<str>) -> Self { + Args::parse(args) + } +} + /// Auto escape for shellwords usage. +#[inline] +#[must_use] pub fn escape(input: Cow<str>) -> Cow<str> { if !input.chars().any(|x| x.is_ascii_whitespace()) { input @@ -13,186 +336,141 @@ pub fn escape(input: Cow<str>) -> Cow<str> { buf })) } else { - Cow::Owned(format!("\"{}\"", input)) + Cow::Owned(format!("\"{input}\"")) } } -enum State { - OnWhitespace, - Unquoted, - UnquotedEscaped, - Quoted, - QuoteEscaped, - Dquoted, - DquoteEscaped, -} +/// Unescapes a string, converting escape sequences into their literal characters. +/// +/// This function handles the following escape sequences: +/// - `\\n` is converted to `\n` (newline) +/// - `\\t` is converted to `\t` (tab) +/// - `\\u{...}` is converted to the corresponding Unicode character +/// +/// Other escape sequences, such as `\\` followed by any character not listed above, will remain unchanged. +/// +/// If input is invalid, for example if there is invalid unicode, \u{999999999}, it will return the input as is. 
+/// +/// # Examples +/// +/// Basic usage: +/// +/// ``` +/// # use helix_core::shellwords::unescape; +/// let unescaped = unescape("hello\\nworld"); +/// assert_eq!("hello\nworld", unescaped); +/// ``` +/// +/// Unescaping tabs: +/// +/// ``` +/// # use helix_core::shellwords::unescape; +/// let unescaped = unescape("hello\\tworld"); +/// assert_eq!("hello\tworld", unescaped); +/// ``` +/// +/// Unescaping Unicode characters: +/// +/// ``` +/// # use helix_core::shellwords::unescape; +/// let unescaped = unescape("hello\\u{1f929}world"); +/// assert_eq!("hello\u{1f929}world", unescaped); +/// assert_eq!("hello🤩world", unescaped); +/// ``` +/// +/// Handling backslashes: +/// +/// ``` +/// # use helix_core::shellwords::unescape; +/// let unescaped = unescape(r"hello\\world"); +/// assert_eq!(r"hello\\world", unescaped); +/// +/// let unescaped = unescape(r"hello\\\\world"); +/// assert_eq!(r"hello\\\\world", unescaped); +/// ``` +/// +/// # Note +/// +/// This function is opinionated, with a clear purpose of handling user input, not a general or generic unescaping utility, and does not unescape sequences like `\\'` or `\\\"`, leaving them as is. +#[inline] +#[must_use] +pub fn unescape(input: &str) -> Cow<'_, str> { + enum State { + Normal, + Escaped, + Unicode, + } -pub struct Shellwords<'a> { - state: State, - /// Shellwords where whitespace and escapes has been resolved. - words: Vec>, - /// The parts of the input that are divided into shellwords. This can be - /// used to retrieve the original text for a given word by looking up the - /// same index in the Vec as the word in `words`. 
- parts: Vec<&'a str>, -} + let mut unescaped = String::new(); + let mut state = State::Normal; + let mut is_escaped = false; + // NOTE: Max unicode code point is U+10FFFF for a maximum of 6 chars + let mut unicode = SmartString::<LazyCompact>::new_const(); -impl<'a> From<&'a str> for Shellwords<'a> { - fn from(input: &'a str) -> Self { - use State::*; - - let mut state = Unquoted; - let mut words = Vec::new(); - let mut parts = Vec::new(); - let mut escaped = String::with_capacity(input.len()); - - let mut part_start = 0; - let mut unescaped_start = 0; - let mut end = 0; - - for (i, c) in input.char_indices() { - state = match state { - OnWhitespace => match c { - '"' => { - end = i; - Dquoted - } - '\'' => { - end = i; - Quoted - } - '\\' => { - if cfg!(unix) { - escaped.push_str(&input[unescaped_start..i]); - unescaped_start = i + 1; - UnquotedEscaped - } else { - OnWhitespace - } - } - c if c.is_ascii_whitespace() => { - end = i; - OnWhitespace - } - _ => Unquoted, - }, - Unquoted => match c { - '\\' => { - if cfg!(unix) { - escaped.push_str(&input[unescaped_start..i]); - unescaped_start = i + 1; - UnquotedEscaped - } else { - Unquoted - } - } - c if c.is_ascii_whitespace() => { - end = i; - OnWhitespace - } - _ => Unquoted, - }, - UnquotedEscaped => Unquoted, - Quoted => match c { - '\\' => { - if cfg!(unix) { - escaped.push_str(&input[unescaped_start..i]); - unescaped_start = i + 1; - QuoteEscaped - } else { - Quoted + for (idx, ch) in input.char_indices() { + match state { + State::Normal => match ch { + '\\' => { + if !is_escaped { + // PERF: As not every input will contain an escape, we use `String::new` as that has no initial + // allocation. If an escape is found, then we reserve capacity that's the len of the input, + // as the new unescaped string will be at most that long. + unescaped.reserve(input.len()); + if idx > 0 { + // First time finding an escape, so all prior chars can be added to the new unescaped + // version if it's not the very first char found. 
+ unescaped.push_str(&input[0..idx]); } } - '\'' => { - end = i; - OnWhitespace - } - _ => Quoted, - }, - QuoteEscaped => Quoted, - Dquoted => match c { - '\\' => { - if cfg!(unix) { - escaped.push_str(&input[unescaped_start..i]); - unescaped_start = i + 1; - DquoteEscaped - } else { - Dquoted - } + state = State::Escaped; + is_escaped = true; + } + _ => { + if is_escaped { + unescaped.push(ch); } - '"' => { - end = i; - OnWhitespace + } + }, + State::Escaped => { + match ch { + 'n' => unescaped.push('\n'), + 't' => unescaped.push('\t'), + 'u' => { + state = State::Unicode; + continue; } - _ => Dquoted, - }, - DquoteEscaped => Dquoted, - }; - - let c_len = c.len_utf8(); - if i == input.len() - c_len && end == 0 { - end = i + c_len; - } - - if end > 0 { - let esc_trim = escaped.trim(); - let inp = &input[unescaped_start..end]; - - if !(esc_trim.is_empty() && inp.trim().is_empty()) { - if esc_trim.is_empty() { - words.push(inp.into()); - parts.push(inp); - } else { - words.push([escaped, inp.into()].concat().into()); - parts.push(&input[part_start..end]); - escaped = "".to_string(); + // Uncomment if you want to handle '\\' to '\' + // '\\' => unescaped.push('\\'), + _ => { + unescaped.push('\\'); + unescaped.push(ch); } } - unescaped_start = i + 1; - part_start = i + 1; - end = 0; + state = State::Normal; } - } - - debug_assert!(words.len() == parts.len()); - - Self { - state, - words, - parts, + State::Unicode => match ch { + '{' => continue, + '}' => { + let Ok(digit) = u32::from_str_radix(&unicode, 16) else { + return input.into(); + }; + let Some(point) = char::from_u32(digit) else { + return input.into(); + }; + unescaped.push(point); + // Might be more unicode to unescape so clear for reuse. + unicode.clear(); + state = State::Normal; + } + _ => unicode.push(ch), + }, } } -} -impl<'a> Shellwords<'a> { - /// Checks that the input ends with a whitespace character which is not escaped. 
- /// - /// # Examples - /// - /// ```rust - /// use helix_core::shellwords::Shellwords; - /// assert_eq!(Shellwords::from(" ").ends_with_whitespace(), true); - /// assert_eq!(Shellwords::from(":open ").ends_with_whitespace(), true); - /// assert_eq!(Shellwords::from(":open foo.txt ").ends_with_whitespace(), true); - /// assert_eq!(Shellwords::from(":open").ends_with_whitespace(), false); - /// #[cfg(unix)] - /// assert_eq!(Shellwords::from(":open a\\ ").ends_with_whitespace(), false); - /// #[cfg(unix)] - /// assert_eq!(Shellwords::from(":open a\\ b.txt").ends_with_whitespace(), false); - /// ``` - pub fn ends_with_whitespace(&self) -> bool { - matches!(self.state, State::OnWhitespace) - } - - /// Returns the list of shellwords calculated from the input string. - pub fn words(&self) -> &[Cow<'a, str>] { - &self.words - } - - /// Returns a list of strings which correspond to [`Self::words`] but represent the original - /// text in the input string - including escape characters - without separating whitespace. - pub fn parts(&self) -> &[&'a str] { - &self.parts + if is_escaped { + unescaped.into() + } else { + input.into() } } @@ -201,114 +479,191 @@ mod test { use super::*; #[test] - #[cfg(windows)] - fn test_normal() { + fn base() { let input = r#":o single_word twó wörds \three\ \"with\ escaping\\"#; let shellwords = Shellwords::from(input); - let result = shellwords.words().to_vec(); - let expected = vec![ - Cow::from(":o"), - Cow::from("single_word"), - Cow::from("twó"), - Cow::from("wörds"), - Cow::from("\\three\\"), - Cow::from("\\"), - Cow::from("with\\ escaping\\\\"), + let args = vec![ + "single_word", + "twó", + "wörds", + r"\three\", + r#"\"with\"#, + r"escaping\\", ]; - // TODO test is_owned and is_borrowed, once they get stabilized. 
- assert_eq!(expected, result); + + assert_eq!(":o", shellwords.command()); + assert_eq!(args, shellwords.args().collect::>()); } #[test] - #[cfg(unix)] - fn test_normal() { - let input = r#":o single_word twó wörds \three\ \"with\ escaping\\"#; - let shellwords = Shellwords::from(input); - let result = shellwords.words().to_vec(); - let expected = vec![ - Cow::from(":o"), - Cow::from("single_word"), - Cow::from("twó"), - Cow::from("wörds"), - Cow::from(r#"three "with escaping\"#), - ]; - // TODO test is_owned and is_borrowed, once they get stabilized. - assert_eq!(expected, result); + fn should_have_empty_args() { + let shellwords = Shellwords::from(":quit"); + assert!( + shellwords.args().is_empty(), + "args: `{}`", + shellwords.args().next().unwrap() + ); + assert!(shellwords.args().next().is_none()); } #[test] - #[cfg(unix)] - fn test_quoted() { + fn should_return_empty_command() { + let shellwords = Shellwords::from(" "); + assert!(shellwords.command().is_empty()); + } + + #[test] + fn should_support_unicode_args() { + assert_eq!( + Shellwords::from(":sh echo 𒀀").args().collect::>(), + &["echo", "𒀀"] + ); + assert_eq!( + Shellwords::from(":sh echo 𒀀 hello world𒀀") + .args() + .collect::>(), + &["echo", "𒀀", "hello", "world𒀀"] + ); + } + + #[test] + fn should_preserve_quote_if_last_argument() { + let sh = Shellwords::from(r#":read "file with space.txt"""#); + let mut args = sh.args(); + assert_eq!("file with space.txt", args.next().unwrap()); + assert_eq!(r#"""#, args.next().unwrap()); + } + + #[test] + fn should_return_rest_of_non_closed_quote_as_one_argument() { + let sh = Shellwords::from(r":rename 'should be one \'argument"); + assert_eq!(r"should be one \'argument", sh.args().next().unwrap()); + } + + #[test] + fn should_respect_escaped_quote_in_what_looks_like_non_closed_arg() { + let sh = Shellwords::from(r":rename 'should be one \\'argument"); + let mut args = sh.args(); + assert_eq!(r"should be one \\", args.next().unwrap()); + assert_eq!(r"argument", 
args.next().unwrap()); + } + + #[test] + fn should_split_args() { + assert_eq!(Shellwords::from(":o a").args().collect::>(), &["a"]); + assert_eq!( + Shellwords::from(":o a\\ ").args().collect::>(), + &["a\\"] + ); + } + + #[test] + fn should_parse_args_even_with_leading_whitespace() { + // Three spaces + assert_eq!( + Shellwords::from(":o a").args().collect::>(), + &["a"] + ); + } + + #[test] + fn should_parse_single_quotes_while_respecting_escapes() { let quoted = r#":o 'single_word' 'twó wörds' '' ' ''\three\' \"with\ escaping\\' 'quote incomplete"#; let shellwords = Shellwords::from(quoted); - let result = shellwords.words().to_vec(); + let result = shellwords.args().collect::>(); let expected = vec![ - Cow::from(":o"), - Cow::from("single_word"), - Cow::from("twó wörds"), - Cow::from(r#"three' "with escaping\"#), - Cow::from("quote incomplete"), + "single_word", + "twó wörds", + "", + " ", + r#"\three\' \"with\ escaping\\"#, + "quote incomplete", ]; assert_eq!(expected, result); } #[test] - #[cfg(unix)] - fn test_dquoted() { + fn should_parse_double_quotes_while_respecting_escapes() { let dquoted = r#":o "single_word" "twó wörds" "" " ""\three\' \"with\ escaping\\" "dquote incomplete"#; let shellwords = Shellwords::from(dquoted); - let result = shellwords.words().to_vec(); + let result = shellwords.args().collect::>(); let expected = vec![ - Cow::from(":o"), - Cow::from("single_word"), - Cow::from("twó wörds"), - Cow::from(r#"three' "with escaping\"#), - Cow::from("dquote incomplete"), + "single_word", + "twó wörds", + "", + " ", + r#"\three\' \"with\ escaping\\"#, + "dquote incomplete", ]; assert_eq!(expected, result); } #[test] - #[cfg(unix)] - fn test_mixed() { + fn should_respect_escapes_with_mixed_quotes() { let dquoted = r#":o single_word 'twó wörds' "\three\' \"with\ escaping\\""no space before"'and after' $#%^@ "%^&(%^" ')(*&^%''a\\\\\b' '"#; let shellwords = Shellwords::from(dquoted); - let result = shellwords.words().to_vec(); + let result = 
shellwords.args().collect::>(); let expected = vec![ - Cow::from(":o"), - Cow::from("single_word"), - Cow::from("twó wörds"), - Cow::from("three' \"with escaping\\"), - Cow::from("no space before"), - Cow::from("and after"), - Cow::from("$#%^@"), - Cow::from("%^&(%^"), - Cow::from(")(*&^%"), - Cow::from(r#"a\\b"#), - //last ' just changes to quoted but since we dont have anything after it, it should be ignored + "single_word", + "twó wörds", + r#"\three\' \"with\ escaping\\"#, + "no space before", + "and after", + "$#%^@", + "%^&(%^", + r")(*&^%", + r"a\\\\\b", + // Last ' is important, as if the user input an accidental quote at the end, this should be checked in + // commands where there should only be one input and return an error rather than silently succeed. + "'", ]; assert_eq!(expected, result); } #[test] - fn test_lists() { - let input = - r#":set statusline.center ["file-type","file-encoding"] '["list", "in", "quotes"]'"#; + fn should_return_rest() { + let input = r#":set statusline.center ["file-type","file-encoding"]"#; let shellwords = Shellwords::from(input); - let result = shellwords.words().to_vec(); - let expected = vec![ - Cow::from(":set"), - Cow::from("statusline.center"), - Cow::from(r#"["file-type","file-encoding"]"#), - Cow::from(r#"["list", "in", "quotes"]"#), - ]; - assert_eq!(expected, result); + let mut args = shellwords.args(); + assert_eq!(":set", shellwords.command()); + assert_eq!(Some("statusline.center"), args.next()); + assert_eq!(r#"["file-type","file-encoding"]"#, args.rest()); + } + + #[test] + fn should_return_no_args() { + let mut args = Args::parse(""); + assert!(args.next().is_none()); + } + + #[test] + fn should_leave_escaped_quotes() { + let input = r#"\" \` \' \"with \'with \`with"#; + let result = Args::parse(input).collect::>(); + assert_eq!(r#"\""#, result[0]); + assert_eq!(r"\`", result[1]); + assert_eq!(r"\'", result[2]); + assert_eq!(r#"\"with"#, result[3]); + assert_eq!(r"\'with", result[4]); + assert_eq!(r"\`with", 
result[5]); + } + + #[test] + fn should_leave_literal_newline_alone() { + let result = Args::parse(r"\n").collect::>(); + assert_eq!(r"\n", result[0]); + } + + #[test] + fn should_leave_literal_unicode_alone() { + let result = Args::parse(r"\u{C}").collect::>(); + assert_eq!(r"\u{C}", result[0]); } #[test] #[cfg(unix)] - fn test_escaping_unix() { + fn should_escape_unix() { assert_eq!(escape("foobar".into()), Cow::Borrowed("foobar")); assert_eq!(escape("foo bar".into()), Cow::Borrowed("foo\\ bar")); assert_eq!(escape("foo\tbar".into()), Cow::Borrowed("foo\\\tbar")); @@ -316,35 +671,79 @@ mod test { #[test] #[cfg(windows)] - fn test_escaping_windows() { + fn should_escape_windows() { assert_eq!(escape("foobar".into()), Cow::Borrowed("foobar")); assert_eq!(escape("foo bar".into()), Cow::Borrowed("\"foo bar\"")); } #[test] - #[cfg(unix)] - fn test_parts() { - assert_eq!(Shellwords::from(":o a").parts(), &[":o", "a"]); - assert_eq!(Shellwords::from(":o a\\ ").parts(), &[":o", "a\\ "]); + fn should_unescape_newline() { + let unescaped = unescape("hello\\nworld"); + assert_eq!("hello\nworld", unescaped); } #[test] - #[cfg(windows)] - fn test_parts() { - assert_eq!(Shellwords::from(":o a").parts(), &[":o", "a"]); - assert_eq!(Shellwords::from(":o a\\ ").parts(), &[":o", "a\\"]); + fn should_unescape_tab() { + let unescaped = unescape("hello\\tworld"); + assert_eq!("hello\tworld", unescaped); } #[test] - fn test_multibyte_at_end() { - assert_eq!(Shellwords::from("𒀀").parts(), &["𒀀"]); - assert_eq!( - Shellwords::from(":sh echo 𒀀").parts(), - &[":sh", "echo", "𒀀"] - ); - assert_eq!( - Shellwords::from(":sh echo 𒀀 hello world𒀀").parts(), - &[":sh", "echo", "𒀀", "hello", "world𒀀"] - ); + fn should_unescape_unicode() { + let unescaped = unescape("hello\\u{1f929}world"); + assert_eq!("hello\u{1f929}world", unescaped, "char: 🤩 "); + assert_eq!("hello🤩world", unescaped); + } + + #[test] + fn should_return_original_input_due_to_bad_unicode() { + let unescaped = 
unescape("hello\\u{999999999}world"); + assert_eq!("hello\\u{999999999}world", unescaped); + } + + #[test] + fn should_not_unescape_slash() { + let unescaped = unescape(r"hello\\world"); + assert_eq!(r"hello\\world", unescaped); + + let unescaped = unescape(r"hello\\\\world"); + assert_eq!(r"hello\\\\world", unescaped); + } + + #[test] + fn should_not_unescape_slash_single_quote() { + let unescaped = unescape("\\'"); + assert_eq!(r"\'", unescaped); + } + + #[test] + fn should_not_unescape_slash_double_quote() { + let unescaped = unescape("\\\""); + assert_eq!(r#"\""#, unescaped); + } + + #[test] + fn should_not_change_anything() { + let unescaped = unescape("'"); + assert_eq!("'", unescaped); + let unescaped = unescape(r#"""#); + assert_eq!(r#"""#, unescaped); + } + + #[test] + fn should_only_unescape_newline_not_slash_single_quote() { + let unescaped = unescape("\\n\'"); + assert_eq!("\n'", unescaped); + let unescaped = unescape("\\n\\'"); + assert_eq!("\n\\'", unescaped); + } + + #[test] + fn should_unescape_args() { + // 1f929: 🤩 + let args = Args::parse(r#"'hello\u{1f929} world' '["hello", "\u{1f929}", "world"]'"#) + .collect::>(); + assert_eq!("hello\u{1f929} world", unescape(args[0])); + assert_eq!(r#"["hello", "🤩", "world"]"#, unescape(args[1])); } } diff --git a/helix-term/src/commands/typed.rs b/helix-term/src/commands/typed.rs index 68ba9bab5..fadbe2688 100644 --- a/helix-term/src/commands/typed.rs +++ b/helix-term/src/commands/typed.rs @@ -3190,7 +3190,8 @@ pub(super) fn command_mode(cx: &mut Context) { completer(editor, word) .into_iter() .map(|(range, file)| { - let file = shellwords::escape(file); + // TEST: Might not need to escape with new changes? + // let file = shellwords::escape(&file); // offset ranges to input let offset = input.len() - word_len;