From 77dbbc73f9c9b6599bc39b18625285685fe2e4b1 Mon Sep 17 00:00:00 2001 From: ath3 <45574139+ath3@users.noreply.github.com> Date: Mon, 8 Nov 2021 16:19:44 +0100 Subject: [PATCH] Detect filetype from shebang line (#1001) --- book/src/guides/adding_languages.md | 3 ++- helix-core/src/indent.rs | 1 + helix-core/src/syntax.rs | 24 ++++++++++++++++++++ helix-view/src/document.rs | 4 +++- languages.toml | 35 +++++++++++++++++++++++++++++ 5 files changed, 65 insertions(+), 2 deletions(-) diff --git a/book/src/guides/adding_languages.md b/book/src/guides/adding_languages.md index c606f8fc6..446eb479d 100644 --- a/book/src/guides/adding_languages.md +++ b/book/src/guides/adding_languages.md @@ -33,10 +33,11 @@ These are the available keys and descriptions for the file. | scope | A string like `source.js` that identifies the language. Currently, we strive to match the scope names used by popular TextMate grammars and by the Linguist library. Usually `source.<name>` or `text.<name>` in case of markup languages | | injection-regex | regex pattern that will be tested against a language name in order to determine whether this language should be used for a potential [language injection][treesitter-language-injection] site. | | file-types | The filetypes of the language, for example `["yml", "yaml"]` | +| shebangs | The interpreters from the shebang line, for example `["sh", "bash"]` | | roots | A set of marker files to look for when trying to find the workspace root. For example `Cargo.lock`, `yarn.lock` | | auto-format | Whether to autoformat this language when saving | | comment-token | The token to use as a comment-token | -| indent | The indent to use. Has sub keys `tab-width` and `unit` | +| indent | The indent to use. 
Has sub keys `tab-width` and `unit` | | config | Language server configuration | ## Queries diff --git a/helix-core/src/indent.rs b/helix-core/src/indent.rs index 20f034ea7..b6f5081ac 100644 --- a/helix-core/src/indent.rs +++ b/helix-core/src/indent.rs @@ -450,6 +450,7 @@ where language: vec![LanguageConfiguration { scope: "source.rust".to_string(), file_types: vec!["rs".to_string()], + shebangs: vec![], language_id: "Rust".to_string(), highlight_config: OnceCell::new(), config: None, diff --git a/helix-core/src/syntax.rs b/helix-core/src/syntax.rs index f3e3f238b..84952248a 100644 --- a/helix-core/src/syntax.rs +++ b/helix-core/src/syntax.rs @@ -14,6 +14,8 @@ use std::{ cell::RefCell, collections::{HashMap, HashSet}, fmt, + fs::File, + io::Read, path::Path, sync::Arc, }; @@ -52,6 +54,7 @@ pub struct LanguageConfiguration { pub language_id: String, pub scope: String, // source.rust pub file_types: Vec<String>, // filename ends_with? + pub shebangs: Vec<String>, // interpreter(s) associated with language pub roots: Vec<String>, // these indicate project roots <.git, Cargo.toml> pub comment_token: Option<String>, @@ -254,6 +257,7 @@ pub struct Loader { // highlight_names ? 
language_configs: Vec<Arc<LanguageConfiguration>>, language_config_ids_by_file_type: HashMap<String, usize>, // Vec<usize> + language_config_ids_by_shebang: HashMap<String, usize>, } impl Loader { @@ -261,6 +265,7 @@ impl Loader { let mut loader = Self { language_configs: Vec::new(), language_config_ids_by_file_type: HashMap::new(), + language_config_ids_by_shebang: HashMap::new(), }; for config in config.language { @@ -273,6 +278,11 @@ impl Loader { .language_config_ids_by_file_type .insert(file_type.clone(), language_id); } + for shebang in &config.shebangs { + loader + .language_config_ids_by_shebang + .insert(shebang.clone(), language_id); + } loader.language_configs.push(Arc::new(config)); } @@ -298,6 +308,20 @@ impl Loader { // TODO: content_regex handling conflict resolution } + pub fn language_config_for_shebang(&self, path: &Path) -> Option<Arc<LanguageConfiguration>> { + // Read the first 128 bytes of the file. If it's a shebang line, try to find the language + let file = File::open(path).ok()?; + let mut buf = String::with_capacity(128); + file.take(128).read_to_string(&mut buf).ok()?; + static SHEBANG_REGEX: Lazy<Regex> = + Lazy::new(|| Regex::new(r"^#!\s*(?:\S*[/\\](?:env\s+)?)?([^\s\.\d]+)").unwrap()); + let configuration_id = SHEBANG_REGEX + .captures(&buf) + .and_then(|cap| self.language_config_ids_by_shebang.get(&cap[1])); + + configuration_id.and_then(|&id| self.language_configs.get(id).cloned()) + } + pub fn language_config_for_scope(&self, scope: &str) -> Option<Arc<LanguageConfiguration>> { self.language_configs .iter() diff --git a/helix-view/src/document.rs b/helix-view/src/document.rs index ce5df8ee8..a68ab7595 100644 --- a/helix-view/src/document.rs +++ b/helix-view/src/document.rs @@ -494,7 +494,9 @@ impl Document { /// Detect the programming language based on the file type. 
pub fn detect_language(&mut self, theme: Option<&Theme>, config_loader: &syntax::Loader) { if let Some(path) = &self.path { - let language_config = config_loader.language_config_for_file_name(path); + let language_config = config_loader + .language_config_for_file_name(path) + .or_else(|| config_loader.language_config_for_shebang(path)); self.set_language(theme, language_config); } } diff --git a/languages.toml b/languages.toml index 988921718..067138e4c 100644 --- a/languages.toml +++ b/languages.toml @@ -3,6 +3,7 @@ name = "rust" scope = "source.rust" injection-regex = "rust" file-types = ["rs"] +shebangs = [] roots = [] auto-format = true comment-token = "//" @@ -17,6 +18,7 @@ name = "toml" scope = "source.toml" injection-regex = "toml" file-types = ["toml"] +shebangs = [] roots = [] comment-token = "#" @@ -27,6 +29,7 @@ name = "protobuf" scope = "source.proto" injection-regex = "protobuf" file-types = ["proto"] +shebangs = [] roots = [] comment-token = "//" @@ -37,6 +40,7 @@ name = "elixir" scope = "source.elixir" injection-regex = "elixir" file-types = ["ex", "exs"] +shebangs = [] roots = [] comment-token = "#" @@ -48,6 +52,7 @@ name = "mint" scope = "source.mint" injection-regex = "mint" file-types = ["mint"] +shebangs = [] roots = [] comment-token = "//" @@ -59,6 +64,7 @@ name = "json" scope = "source.json" injection-regex = "json" file-types = ["json"] +shebangs = [] roots = [] indent = { tab-width = 2, unit = " " } @@ -68,6 +74,7 @@ name = "c" scope = "source.c" injection-regex = "c" file-types = ["c"] # TODO: ["h"] +shebangs = [] roots = [] comment-token = "//" @@ -79,6 +86,7 @@ name = "cpp" scope = "source.cpp" injection-regex = "cpp" file-types = ["cc", "hh", "cpp", "hpp", "h", "ipp", "tpp", "cxx", "hxx", "ixx", "txx", "ino"] +shebangs = [] roots = [] comment-token = "//" @@ -90,6 +98,7 @@ name = "c-sharp" scope = "source.csharp" injection-regex = "c-?sharp" file-types = ["cs"] +shebangs = [] roots = [] comment-token = "//" @@ -100,6 +109,7 @@ name = 
"go" scope = "source.go" injection-regex = "go" file-types = ["go"] +shebangs = [] roots = ["Gopkg.toml", "go.mod"] auto-format = true comment-token = "//" @@ -113,6 +123,7 @@ name = "javascript" scope = "source.js" injection-regex = "^(js|javascript)$" file-types = ["js", "mjs"] +shebangs = [] roots = [] comment-token = "//" # TODO: highlights-jsx, highlights-params @@ -124,6 +135,7 @@ name = "typescript" scope = "source.ts" injection-regex = "^(ts|typescript)$" file-types = ["ts"] +shebangs = [] roots = [] # TODO: highlights-jsx, highlights-params @@ -135,6 +147,7 @@ name = "tsx" scope = "source.tsx" injection-regex = "^(tsx)$" # |typescript file-types = ["tsx"] +shebangs = [] roots = [] # TODO: highlights-jsx, highlights-params @@ -146,6 +159,7 @@ name = "css" scope = "source.css" injection-regex = "css" file-types = ["css"] +shebangs = [] roots = [] indent = { tab-width = 2, unit = " " } @@ -155,6 +169,7 @@ name = "html" scope = "text.html.basic" injection-regex = "html" file-types = ["html"] +shebangs = [] roots = [] indent = { tab-width = 2, unit = " " } @@ -164,6 +179,7 @@ name = "python" scope = "source.python" injection-regex = "python" file-types = ["py"] +shebangs = ["python"] roots = [] comment-token = "#" @@ -176,6 +192,7 @@ name = "nix" scope = "source.nix" injection-regex = "nix" file-types = ["nix"] +shebangs = [] roots = [] comment-token = "#" @@ -187,6 +204,7 @@ name = "ruby" scope = "source.ruby" injection-regex = "ruby" file-types = ["rb"] +shebangs = ["ruby"] roots = [] comment-token = "#" @@ -198,6 +216,7 @@ name = "bash" scope = "source.bash" injection-regex = "bash" file-types = ["sh", "bash"] +shebangs = ["sh", "bash", "dash"] roots = [] comment-token = "#" @@ -209,6 +228,7 @@ name = "php" scope = "source.php" injection-regex = "php" file-types = ["php"] +shebangs = ["php"] roots = [] indent = { tab-width = 4, unit = " " } @@ -218,6 +238,7 @@ name = "latex" scope = "source.tex" injection-regex = "tex" file-types = ["tex"] +shebangs = [] 
roots = [] comment-token = "%" @@ -228,6 +249,7 @@ name = "julia" scope = "source.julia" injection-regex = "julia" file-types = ["jl"] +shebangs = [] roots = [] comment-token = "#" language-server = { command = "julia", args = [ @@ -253,6 +275,7 @@ name = "java" scope = "source.java" injection-regex = "java" file-types = ["java"] +shebangs = [] roots = [] indent = { tab-width = 4, unit = " " } @@ -261,6 +284,7 @@ name = "ledger" scope = "source.ledger" injection-regex = "ledger" file-types = ["ldg", "ledger", "journal"] +shebangs = [] roots = [] comment-token = ";" indent = { tab-width = 4, unit = " " } @@ -270,6 +294,7 @@ name = "ocaml" scope = "source.ocaml" injection-regex = "ocaml" file-types = ["ml"] +shebangs = [] roots = [] comment-token = "(**)" indent = { tab-width = 2, unit = " " } @@ -278,6 +303,7 @@ indent = { tab-width = 2, unit = " " } name = "ocaml-interface" scope = "source.ocaml.interface" file-types = ["mli"] +shebangs = [] roots = [] comment-token = "(**)" indent = { tab-width = 2, unit = " "} @@ -286,6 +312,7 @@ indent = { tab-width = 2, unit = " "} name = "lua" scope = "source.lua" file-types = ["lua"] +shebangs = [] roots = [] comment-token = "--" indent = { tab-width = 2, unit = " " } @@ -295,6 +322,7 @@ name = "svelte" scope = "source.svelte" injection-regex = "svelte" file-types = ["svelte"] +shebangs = [] roots = [] indent = { tab-width = 2, unit = " " } language-server = { command = "svelteserver", args = ["--stdio"] } @@ -305,6 +333,7 @@ name = "vue" scope = "source.vue" injection-regex = "vue" file-types = ["vue"] +shebangs = [] roots = [] indent = { tab-width = 2, unit = " " } @@ -312,6 +341,7 @@ indent = { tab-width = 2, unit = " " } name = "yaml" scope = "source.yaml" file-types = ["yml", "yaml"] +shebangs = [] roots = [] comment-token = "#" indent = { tab-width = 2, unit = " " } @@ -331,6 +361,7 @@ name = "zig" scope = "source.zig" injection-regex = "zig" file-types = ["zig"] +shebangs = [] roots = ["build.zig"] auto-format = true 
comment-token = "//" @@ -343,6 +374,7 @@ name = "prolog" scope = "source.prolog" roots = [] file-types = ["pl", "prolog"] +shebangs = ["swipl"] comment-token = "%" language-server = { command = "swipl", args = [ @@ -354,6 +386,7 @@ language-server = { command = "swipl", args = [ name = "tsq" scope = "source.tsq" file-types = ["scm"] +shebangs = [] roots = [] comment-token = ";" indent = { tab-width = 2, unit = " " } @@ -362,6 +395,7 @@ indent = { tab-width = 2, unit = " " } name = "cmake" scope = "source.cmake" file-types = ["cmake", "CMakeLists.txt"] +shebangs = [] roots = [] comment-token = "#" indent = { tab-width = 2, unit = " " } @@ -371,6 +405,7 @@ language-server = { command = "cmake-language-server" } name = "perl" scope = "source.perl" file-types = ["pl", "pm"] +shebangs = ["perl"] roots = [] comment-token = "#" indent = { tab-width = 2, unit = " " }