From f538b697597dccdd2492606858f0976c7cf7e4d2 Mon Sep 17 00:00:00 2001 From: Pascal Kuthe Date: Tue, 22 Nov 2022 03:54:22 +0100 Subject: [PATCH] significantly improve treesitter performance while editing large files (#4716) * significantly improve treesitter performance while editing large files * Apply stylistic suggestions from code review Co-authored-by: Michael Davis * use PartialEq and Hash instead of a freestanding function Co-authored-by: Michael Davis --- Cargo.lock | 27 ++++++++- helix-core/Cargo.toml | 2 + helix-core/src/syntax.rs | 117 ++++++++++++++++++++++++++------------- 3 files changed, 106 insertions(+), 40 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5833714d..a2b2ffd8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -13,6 +13,18 @@ dependencies = [ "version_check", ] +[[package]] +name = "ahash" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf6ccdb167abbf410dcb915cabd428929d7f6a04980b54a11f26a39f1c7f7107" +dependencies = [ + "cfg-if", + "getrandom", + "once_cell", + "version_check", +] + [[package]] name = "aho-corasick" version = "0.7.18" @@ -400,18 +412,29 @@ version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" dependencies = [ - "ahash", + "ahash 0.7.6", +] + +[[package]] +name = "hashbrown" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33ff8ae62cd3a9102e5637afc8452c55acf3844001bd5374e0b0bd7b6616c038" +dependencies = [ + "ahash 0.8.2", ] [[package]] name = "helix-core" version = "0.6.0" dependencies = [ + "ahash 0.8.2", "arc-swap", "bitflags", "chrono", "encoding_rs", "etcetera", + "hashbrown 0.13.1", "helix-loader", "log", "once_cell", @@ -1288,7 +1311,7 @@ version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c5faade31a542b8b35855fff6e8def199853b2da8da256da52f52f1316ee3137" dependencies = [ - "hashbrown", + "hashbrown 0.12.3", "regex", ] diff --git a/helix-core/Cargo.toml b/helix-core/Cargo.toml index 45272f98..eb886c90 100644 --- a/helix-core/Cargo.toml +++ b/helix-core/Cargo.toml @@ -30,6 +30,8 @@ once_cell = "1.16" arc-swap = "1" regex = "1" bitflags = "1.3" +ahash = "0.8.2" +hashbrown = { version = "0.13.1", features = ["raw"] } log = "0.4" serde = { version = "1.0", features = ["derive"] } diff --git a/helix-core/src/syntax.rs b/helix-core/src/syntax.rs index b567a8ab..8dc34a3e 100644 --- a/helix-core/src/syntax.rs +++ b/helix-core/src/syntax.rs @@ -7,8 +7,10 @@ use crate::{ Rope, RopeSlice, Tendril, }; +use ahash::RandomState; use arc_swap::{ArcSwap, Guard}; use bitflags::bitflags; +use hashbrown::raw::RawTable; use slotmap::{DefaultKey as LayerId, HopSlotMap}; use std::{ @@ -16,7 +18,8 @@ use std::{ cell::RefCell, collections::{HashMap, VecDeque}, fmt, - mem::replace, + hash::{Hash, Hasher}, + mem::{replace, transmute}, path::Path, str::FromStr, sync::Arc, @@ -770,30 +773,38 @@ impl Syntax { // Convert the changeset into tree sitter edits. let edits = generate_edits(old_source, changeset); + // This table allows inverse indexing of `layers`. + // That is by hashing a `Layer` you can find + // the `LayerId` of an existing equivalent `Layer` in `layers`. + // + // It is used to determine if a new layer exists for an injection + // or if an existing layer needs to be updated. + let mut layers_table = RawTable::with_capacity(self.layers.len()); + let layers_hasher = RandomState::new(); // Use the edits to update all layers markers - if !edits.is_empty() { - fn point_add(a: Point, b: Point) -> Point { - if b.row > 0 { - Point::new(a.row.saturating_add(b.row), b.column) - } else { - Point::new(0, a.column.saturating_add(b.column)) - } + fn point_add(a: Point, b: Point) -> Point { + if b.row > 0 { + Point::new(a.row.saturating_add(b.row), b.column) + } else { + Point::new(0, a.column.saturating_add(b.column)) } - fn point_sub(a: Point, b: Point) -> Point { - if a.row > b.row { - Point::new(a.row.saturating_sub(b.row), a.column) - } else { - Point::new(0, a.column.saturating_sub(b.column)) - } + } + fn point_sub(a: Point, b: Point) -> Point { + if a.row > b.row { + Point::new(a.row.saturating_sub(b.row), a.column) + } else { + Point::new(0, a.column.saturating_sub(b.column)) } + } - for layer in self.layers.values_mut() { - // The root layer always covers the whole range (0..usize::MAX) - if layer.depth == 0 { - layer.flags = LayerUpdateFlags::MODIFIED; - continue; - } + for (layer_id, layer) in self.layers.iter_mut() { + // The root layer always covers the whole range (0..usize::MAX) + if layer.depth == 0 { + layer.flags = LayerUpdateFlags::MODIFIED; + continue; + } + if !edits.is_empty() { for range in &mut layer.ranges { // Roughly based on https://github.com/tree-sitter/tree-sitter/blob/ddeaa0c7f534268b35b4f6cb39b52df082754413/lib/src/subtree.c#L691-L720 for edit in edits.iter().rev() { @@ -858,6 +869,12 @@ impl Syntax { } } } + + let hash = layers_hasher.hash_one(layer); + // Safety: insert_no_grow is unsafe because it assumes that the table + // has enough capacity to hold additional elements. + // This is always the case as we reserved enough capacity above. + unsafe { layers_table.insert_no_grow(hash, layer_id) }; } PARSER.with(|ts_parser| { @@ -982,27 +999,23 @@ impl Syntax { let depth = layer.depth + 1; // TODO: can't inline this since matches borrows self.layers for (config, ranges) in injections { - // Find an existing layer - let layer = self - .layers - .iter_mut() - .find(|(_, layer)| { - layer.depth == depth && // TODO: track parent id instead - layer.config.language == config.language && layer.ranges == ranges + let new_layer = LanguageLayer { + tree: None, + config, + depth, + ranges, + flags: LayerUpdateFlags::empty(), + }; + + // Find an identical existing layer + let layer = layers_table + .get(layers_hasher.hash_one(&new_layer), |&it| { + self.layers[it] == new_layer }) - .map(|(id, _layer)| id); + .copied(); // ...or insert a new one. - let layer_id = layer.unwrap_or_else(|| { - self.layers.insert(LanguageLayer { - tree: None, - config, - depth, - ranges, - // set the modified flag to ensure the layer is parsed - flags: LayerUpdateFlags::empty(), - }) - }); + let layer_id = layer.unwrap_or_else(|| self.layers.insert(new_layer)); queue.push_back(layer_id); } @@ -1139,6 +1152,34 @@ pub struct LanguageLayer { flags: LayerUpdateFlags, } +/// This PartialEq implementation only checks if that +/// two layers are theoretically identical (meaning they highlight the same text range with the same language). +/// It does not check whether the layers have the same internal treesitter +/// state. +impl PartialEq for LanguageLayer { + fn eq(&self, other: &Self) -> bool { + self.depth == other.depth + && self.config.language == other.config.language + && self.ranges == other.ranges + } +} + +/// Hash implementation belongs to PartialEq implementation above. +/// See its documentation for details. +impl Hash for LanguageLayer { + fn hash(&self, state: &mut H) { + self.depth.hash(state); + // The transmute is necessary here because tree_sitter::Language does not derive Hash at the moment. + // However it does use #[repr] transparent so the transmute here is safe + // as `Language` (which `Grammar` is an alias for) is just a newtype wrapper around a (thin) pointer. + // This is also compatible with the PartialEq implementation of language + // as that is just a pointer comparison. + let language: *const () = unsafe { transmute(self.config.language) }; + language.hash(state); + self.ranges.hash(state); + } +} + impl LanguageLayer { pub fn tree(&self) -> &Tree { // TODO: no unwrap