Change syntax for suffix file-types configurations (#4414)

The change in d801a6693c to search for
suffixes in `file-types` is too permissive: files like the tutor or
`*.txt` files are now mistakenly interpreted as R or perl,
respectively.

This commit changes the syntax for specifying a `file-types` entry that
matches by suffix:

```toml
file-types = [{ suffix = ".git/config" }]
```

And changes the file-type detection to first search for any non-suffix
patterns and then search for suffixes only with the file-types entries
marked explicitly as suffixes.
pull/1/head
Michael Davis 2 years ago committed by GitHub
parent 131d8392bb
commit 17daf6ac0a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -50,7 +50,7 @@ These configuration keys are available:
| `name` | The name of the language |
| `scope` | A string like `source.js` that identifies the language. Currently, we strive to match the scope names used by popular TextMate grammars and by the Linguist library. Usually `source.<name>` or `text.<name>` in case of markup languages |
| `injection-regex` | regex pattern that will be tested against a language name in order to determine whether this language should be used for a potential [language injection][treesitter-language-injection] site. |
| `file-types` | The filetypes of the language, for example `["yml", "yaml"]`. This attempts to match by exact file name (`.zshrc`), then by file extension (`toml`), then by path suffix (`.git/config`). |
| `file-types` | The filetypes of the language, for example `["yml", "yaml"]`. See the file-type detection section below. |
| `shebangs` | The interpreters from the shebang line, for example `["sh", "bash"]` |
| `roots` | A set of marker files to look for when trying to find the workspace root. For example `Cargo.lock`, `yarn.lock` |
| `auto-format` | Whether to autoformat this language when saving |
@ -63,6 +63,32 @@ These configuration keys are available:
| `formatter` | The formatter for the language, it will take precedence over the lsp when defined. The formatter must be able to take the original file as input from stdin and write the formatted file to stdout |
| `max-line-length` | Maximum line length. Used for the `:reflow` command |
### File-type detection and the `file-types` key
Helix determines which language configuration to use with the `file-types` key
from the above section. `file-types` is a list of strings or tables, for
example:
```toml
file-types = ["Makefile", "toml", { suffix = ".git/config" }]
```
When determining a language configuration to use, Helix searches the file-types
with the following priorities:
1. Exact match: if the filename of a file is an exact match of a string in a
`file-types` list, that language wins. In the example above, `"Makefile"`
will match against `Makefile` files.
2. Extension: if there are no exact matches, any `file-types` string that
matches the file extension of a given file wins. In the example above, the
`"toml"` matches files like `Cargo.toml` or `languages.toml`.
3. Suffix: if there are still no matches, any values in `suffix` tables
are checked against the full path of the given file. In the example above,
the `{ suffix = ".git/config" }` would match against any `config` files
in `.git` directories. Note: `/` is used as the directory separator but is
replaced at runtime with the appropriate path separator for the operating
system, so this rule would match against `.git\config` files on Windows.
### Language Server configuration
The `language-server` field takes the following keys:

@ -73,11 +73,11 @@ impl Default for Configuration {
pub struct LanguageConfiguration {
#[serde(rename = "name")]
pub language_id: String, // c-sharp, rust
pub scope: String, // source.rust
pub file_types: Vec<String>, // filename ends_with? <Gemfile, rb, etc>
pub scope: String, // source.rust
pub file_types: Vec<FileType>, // filename extension or ends_with? <Gemfile, rb, etc>
#[serde(default)]
pub shebangs: Vec<String>, // interpreter(s) associated with language
pub roots: Vec<String>, // these indicate project roots <.git, Cargo.toml>
pub roots: Vec<String>, // these indicate project roots <.git, Cargo.toml>
pub comment_token: Option<String>,
pub max_line_length: Option<usize>,
@ -125,6 +125,78 @@ pub struct LanguageConfiguration {
pub rulers: Option<Vec<u16>>, // if set, override editor's rulers
}
/// One entry of a language's `file-types` list.
///
/// A plain string in the configuration becomes [`FileType::Extension`]; a
/// `{ suffix = "..." }` table becomes [`FileType::Suffix`].
#[derive(Debug, Eq, Hash, PartialEq)]
pub enum FileType {
    /// Matched against the complete file name first and, failing that,
    /// against the file's `Path::extension`. This lets one variant cover
    /// both names such as `Makefile` and extensions such as `toml`.
    Extension(String),
    /// Matched against the end of the file's path rendered as a string, so
    /// it may include directory components (for example `.git/config`).
    /// Suffix entries are only consulted after the file-name and extension
    /// lookups found nothing.
    Suffix(String),
}
impl Serialize for FileType {
    /// Writes the entry back in the same shape the configuration accepts:
    /// an `Extension` becomes a bare string, a `Suffix` becomes a one-entry
    /// map keyed by `suffix`.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        use serde::ser::SerializeMap;

        // The string form needs no further processing, so handle it up front.
        let suffix = match self {
            FileType::Extension(name) => return serializer.serialize_str(name),
            FileType::Suffix(suffix) => suffix,
        };

        let mut table = serializer.serialize_map(Some(1))?;
        // Emit the suffix with `/` separators regardless of the platform's
        // own path separator.
        let portable = suffix.replace(std::path::MAIN_SEPARATOR, "/");
        table.serialize_entry("suffix", &portable)?;
        table.end()
    }
}
impl<'de> Deserialize<'de> for FileType {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::de::Deserializer<'de>,
{
struct FileTypeVisitor;
impl<'de> serde::de::Visitor<'de> for FileTypeVisitor {
type Value = FileType;
fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
formatter.write_str("string or table")
}
fn visit_str<E>(self, value: &str) -> Result<Self::Value, E>
where
E: serde::de::Error,
{
Ok(FileType::Extension(value.to_string()))
}
fn visit_map<M>(self, mut map: M) -> Result<Self::Value, M::Error>
where
M: serde::de::MapAccess<'de>,
{
match map.next_entry::<String, String>()? {
Some((key, suffix)) if key == "suffix" => Ok(FileType::Suffix(
suffix.replace('/', &std::path::MAIN_SEPARATOR.to_string()),
)),
Some((key, _value)) => Err(serde::de::Error::custom(format!(
"unknown key in `file-types` list: {}",
key
))),
None => Err(serde::de::Error::custom(
"expected a `suffix` key in the `file-types` entry",
)),
}
}
}
deserializer.deserialize_any(FileTypeVisitor)
}
}
#[derive(Debug, Serialize, Deserialize)]
#[serde(rename_all = "kebab-case")]
pub struct LanguageServerConfiguration {
@ -454,7 +526,8 @@ impl LanguageConfiguration {
pub struct Loader {
// highlight_names ?
language_configs: Vec<Arc<LanguageConfiguration>>,
language_config_ids_by_file_type: HashMap<String, usize>, // Vec<usize>
language_config_ids_by_extension: HashMap<String, usize>, // Vec<usize>
language_config_ids_by_suffix: HashMap<String, usize>,
language_config_ids_by_shebang: HashMap<String, usize>,
scopes: ArcSwap<Vec<String>>,
@ -464,7 +537,8 @@ impl Loader {
pub fn new(config: Configuration) -> Self {
let mut loader = Self {
language_configs: Vec::new(),
language_config_ids_by_file_type: HashMap::new(),
language_config_ids_by_extension: HashMap::new(),
language_config_ids_by_suffix: HashMap::new(),
language_config_ids_by_shebang: HashMap::new(),
scopes: ArcSwap::from_pointee(Vec::new()),
};
@ -475,10 +549,14 @@ impl Loader {
for file_type in &config.file_types {
// entry().or_insert(Vec::new).push(language_id);
let file_type = file_type.replace('/', &std::path::MAIN_SEPARATOR.to_string());
loader
.language_config_ids_by_file_type
.insert(file_type, language_id);
match file_type {
FileType::Extension(extension) => loader
.language_config_ids_by_extension
.insert(extension.clone(), language_id),
FileType::Suffix(suffix) => loader
.language_config_ids_by_suffix
.insert(suffix.clone(), language_id),
};
}
for shebang in &config.shebangs {
loader
@ -498,14 +576,14 @@ impl Loader {
let configuration_id = path
.file_name()
.and_then(|n| n.to_str())
.and_then(|file_name| self.language_config_ids_by_file_type.get(file_name))
.and_then(|file_name| self.language_config_ids_by_extension.get(file_name))
.or_else(|| {
path.extension()
.and_then(|extension| extension.to_str())
.and_then(|extension| self.language_config_ids_by_file_type.get(extension))
.and_then(|extension| self.language_config_ids_by_extension.get(extension))
})
.or_else(|| {
self.language_config_ids_by_file_type
self.language_config_ids_by_suffix
.iter()
.find_map(|(file_type, id)| {
if path.to_str()?.ends_with(file_type) {

@ -1053,8 +1053,7 @@ source = { git = "https://github.com/tree-sitter/tree-sitter-regex", rev = "e1cf
name = "git-config"
scope = "source.gitconfig"
roots = []
# TODO: allow specifying file-types as a regex so we can read directory names (e.g. `.git/config`)
file-types = [".gitmodules", ".gitconfig", ".git/config", ".config/git/config"]
file-types = [".gitmodules", ".gitconfig", { suffix = ".git/config" }, { suffix = ".config/git/config" }]
injection-regex = "git-config"
comment-token = "#"
indent = { tab-width = 4, unit = "\t" }
@ -1491,7 +1490,7 @@ source = { git = "https://github.com/bearcove/tree-sitter-meson", rev = "feea83b
[[language]]
name = "sshclientconfig"
scope = "source.sshclientconfig"
file-types = [".ssh/config", "/etc/ssh/ssh_config"]
file-types = [{ suffix = ".ssh/config" }, { suffix = "/etc/ssh/ssh_config" }]
roots = []
[[grammar]]

Loading…
Cancel
Save