From 8ac2d50feccc27c351968ed9a886c0a35ea86a29 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bla=C5=BE=20Hrastnik?=
Date: Wed, 9 Jun 2021 13:22:55 +0900
Subject: [PATCH] Rebuild search by using regex-automata

---
 Cargo.lock               |  10 +++
 helix-core/Cargo.toml    |   1 +
 helix-core/src/search.rs | 177 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 188 insertions(+)

diff --git a/Cargo.lock b/Cargo.lock
index 6aa9830b9..df5f50939 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -316,6 +316,7 @@ dependencies = [
  "once_cell",
  "quickcheck",
  "regex",
+ "regex-automata",
  "ropey",
  "serde",
  "similar",
@@ -757,6 +758,15 @@ dependencies = [
  "regex-syntax",
 ]
 
+[[package]]
+name = "regex-automata"
+version = "0.1.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132"
+dependencies = [
+ "regex-syntax",
+]
+
 [[package]]
 name = "regex-syntax"
 version = "0.6.25"
diff --git a/helix-core/Cargo.toml b/helix-core/Cargo.toml
index 4316dc2c4..d117582bb 100644
--- a/helix-core/Cargo.toml
+++ b/helix-core/Cargo.toml
@@ -26,6 +26,7 @@ tree-sitter = "0.19"
 once_cell = "1.8"
 arc-swap = "1"
 regex = "1"
+regex-automata = "0.1"
 
 serde = { version = "1.0", features = ["derive"] }
 toml = "0.5"
diff --git a/helix-core/src/search.rs b/helix-core/src/search.rs
index 243ac227a..3a0ed88f9 100644
--- a/helix-core/src/search.rs
+++ b/helix-core/src/search.rs
@@ -43,3 +43,180 @@ pub fn find_nth_prev(text: RopeSlice, ch: char, mut pos: usize, n: usize) -> Opt
 
     Some(pos)
 }
+
+use crate::movement::Direction;
+use regex_automata::{dense, DenseDFA, Error as RegexError, DFA};
+use std::ops::Range;
+
+/// Regex searcher over rope text; offsets and ranges of the public API are char indices.
+pub struct Searcher {
+    /// Locate end of match searching right.
+    right_fdfa: DenseDFA<Vec<usize>, usize>,
+    /// Locate start of match searching right.
+    right_rdfa: DenseDFA<Vec<usize>, usize>,
+    /// Locate start of match searching left.
+    left_fdfa: DenseDFA<Vec<usize>, usize>,
+    /// Locate end of match searching left.
+    left_rdfa: DenseDFA<Vec<usize>, usize>,
+}
+
+impl Searcher {
+    /// Compiles `pattern` with smart case: case-insensitive unless it has an uppercase char.
+    pub fn new(pattern: &str) -> Result<Searcher, RegexError> {
+        let has_uppercase = pattern.chars().any(|c| c.is_uppercase());
+        // Create Regex DFAs for all search directions.
+        let mut builder = dense::Builder::new();
+        let builder = builder.case_insensitive(!has_uppercase);
+
+        let left_fdfa = builder.clone().reverse(true).build(pattern)?;
+        let left_rdfa = builder
+            .clone()
+            .anchored(true)
+            .longest_match(true)
+            .build(pattern)?;
+
+        let right_fdfa = builder.clone().build(pattern)?;
+        let right_rdfa = builder
+            .anchored(true)
+            .longest_match(true)
+            .reverse(true)
+            .build(pattern)?;
+        Ok(Searcher {
+            right_fdfa,
+            right_rdfa,
+            left_fdfa,
+            left_rdfa,
+        })
+    }
+    /// Searches `text[..offset]` backwards from its end; returns the match as a char range.
+    pub fn search_prev(&self, text: RopeSlice, offset: usize) -> Option<Range<usize>> {
+        let text = text.slice(..offset);
+        // The DFAs report byte offsets, but ropey slices by char index: convert before reuse.
+        let start = text.byte_to_char(self.rfind(text, &self.left_fdfa)?);
+        let rest = text.slice(start..);
+        let end = rest.byte_to_char(self.find(rest, &self.left_rdfa)?);
+        Some(start..start + end)
+    }
+
+    /// Searches `text[offset..]` forwards; returns the match as a char range of `text`.
+    pub fn search_next(&self, text: RopeSlice, offset: usize) -> Option<Range<usize>> {
+        let text = text.slice(offset..);
+        let end = text.byte_to_char(self.find(text, &self.right_fdfa)?);
+        let start = text.byte_to_char(self.rfind(text.slice(..end), &self.right_rdfa)?);
+        Some(offset + start..offset + end)
+    }
+
+    /// Returns the end offset of the longest match. If no match exists, then None is returned.
+    /// NOTE: based on DFA::find_at. The offset is in bytes, relative to the start of `text`.
+    fn find(&self, text: RopeSlice, dfa: &impl DFA) -> Option<usize> {
+        // TODO: port DFA::find_at's check (`dfa.is_anchored() && start > 0` => None); check this
+        // inside main search. Needs to change to the rfind condition if searching reverse.
+        let mut state = dfa.start_state();
+        let mut last_match = if dfa.is_dead_state(state) {
+            return None;
+        } else if dfa.is_match_state(state) {
+            Some(0)
+        } else {
+            None
+        };
+
+        // Byte offset in `text` of the chunk being scanned; `i` below is chunk-relative.
+        let mut chunk_start = 0;
+        for chunk in text.chunks() {
+            for (i, &b) in chunk.as_bytes().iter().enumerate() {
+                // SAFETY: `state` is always a state id produced by `dfa` itself.
+                state = unsafe { dfa.next_state_unchecked(state, b) };
+                if dfa.is_match_or_dead_state(state) {
+                    if dfa.is_dead_state(state) {
+                        return last_match;
+                    }
+                    last_match = Some(chunk_start + i + 1);
+                }
+            }
+            chunk_start += chunk.len();
+        }
+
+        last_match
+    }
+
+    /// Returns the start offset of the longest match in reverse, by searching from the end of the
+    /// input towards the start of the input. If no match exists, then None is returned. In other
+    /// words, this has the same match semantics as find, but in reverse.
+    ///
+    /// NOTE: based on DFA::rfind_at. The offset is in bytes, relative to the start of `text`.
+    fn rfind(&self, text: RopeSlice, dfa: &impl DFA) -> Option<usize> {
+        // TODO: port DFA::rfind_at's check: `dfa.is_anchored() && start < bytes.len()` => None.
+        let mut state = dfa.start_state();
+        let mut last_match = if dfa.is_dead_state(state) {
+            return None;
+        } else if dfa.is_match_state(state) {
+            Some(text.len_bytes())
+        } else {
+            None
+        };
+
+        // This is basically chunks().rev()
+        let (mut chunks, _, _, _) = text.chunks_at_byte(text.len_bytes());
+        // Byte offset in `text` of the chunk being scanned; `i` below is chunk-relative.
+        let mut chunk_start = text.len_bytes();
+        while let Some(chunk) = chunks.prev() {
+            chunk_start -= chunk.len();
+            for (i, &b) in chunk.as_bytes().iter().enumerate().rev() {
+                // SAFETY: `state` is always a state id produced by `dfa` itself.
+                state = unsafe { dfa.next_state_unchecked(state, b) };
+                if dfa.is_match_or_dead_state(state) {
+                    if dfa.is_dead_state(state) {
+                        return last_match;
+                    }
+                    last_match = Some(chunk_start + i);
+                }
+            }
+        }
+        last_match
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    #[test]
+    fn test_search_next() {
+        use crate::Rope;
+        let text = Rope::from("hello world!");
+
+        let searcher = Searcher::new(r"\w+").unwrap();
+
+        let result = searcher.search_next(text.slice(..), 0).unwrap();
+        let fragment = text.slice(result.start..result.end);
+        assert_eq!("hello", fragment);
+
+        let result = searcher.search_next(text.slice(..), result.end).unwrap();
+        let fragment = text.slice(result.start..result.end);
+        assert_eq!("world", fragment);
+
+        let result = searcher.search_next(text.slice(..), result.end);
+        assert!(result.is_none());
+    }
+
+    #[test]
+    fn test_search_prev() {
+        use crate::Rope;
+        let text = Rope::from("hello world!");
+
+        let searcher = Searcher::new(r"\w+").unwrap();
+
+        let result = searcher
+            .search_prev(text.slice(..), text.len_chars())
+            .unwrap();
+        let fragment = text.slice(result.start..result.end);
+        assert_eq!("world", fragment);
+
+        let result = searcher.search_prev(text.slice(..), result.start).unwrap();
+        let fragment = text.slice(result.start..result.end);
+        assert_eq!("hello", fragment);
+
+        let result = searcher.search_prev(text.slice(..), result.start);
+        assert!(result.is_none());
+    }
+}