You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
helix/helix-core/src/chars.rs

138 lines
4.3 KiB
Rust

This file contains invisible Unicode characters!

This file contains invisible Unicode characters that may be processed differently from what appears below. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to reveal hidden characters.

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

//! Utility functions to categorize a `char`.
use crate::LineEnding;
#[derive(Debug, Eq, PartialEq)]
pub enum CharCategory {
Whitespace,
Eol,
Word,
Punctuation,
Unknown,
}
#[inline]
pub fn categorize_char(ch: char) -> CharCategory {
if char_is_line_ending(ch) {
CharCategory::Eol
} else if ch.is_whitespace() {
CharCategory::Whitespace
} else if char_is_word(ch) {
CharCategory::Word
} else if char_is_punctuation(ch) {
CharCategory::Punctuation
} else {
CharCategory::Unknown
}
}
/// Determine whether a character is a line ending.
#[inline]
pub fn char_is_line_ending(ch: char) -> bool {
LineEnding::from_char(ch).is_some()
}
/// Determine whether a character qualifies as (non-line-break)
/// whitespace.
#[inline]
pub fn char_is_whitespace(ch: char) -> bool {
// TODO: this is a naive binary categorization of whitespace
// characters. For display, word wrapping, etc. we'll need a better
// categorization based on e.g. breaking vs non-breaking spaces
// and whether they're zero-width or not.
match ch {
//'\u{1680}' | // Ogham Space Mark (here for completeness, but usually displayed as a dash, not as whitespace)
'\u{0009}' | // Character Tabulation
'\u{0020}' | // Space
'\u{00A0}' | // No-break Space
'\u{180E}' | // Mongolian Vowel Separator
'\u{202F}' | // Narrow No-break Space
'\u{205F}' | // Medium Mathematical Space
'\u{3000}' | // Ideographic Space
'\u{FEFF}' // Zero Width No-break Space
=> true,
// En Quad, Em Quad, En Space, Em Space, Three-per-em Space,
// Four-per-em Space, Six-per-em Space, Figure Space,
// Punctuation Space, Thin Space, Hair Space, Zero Width Space.
ch if ('\u{2000}' ..= '\u{200B}').contains(&ch) => true,
_ => false,
}
}
#[inline]
pub fn char_is_punctuation(ch: char) -> bool {
use unicode_general_category::{get_general_category, GeneralCategory};
matches!(
get_general_category(ch),
GeneralCategory::OtherPunctuation
| GeneralCategory::OpenPunctuation
| GeneralCategory::ClosePunctuation
| GeneralCategory::InitialPunctuation
| GeneralCategory::FinalPunctuation
| GeneralCategory::ConnectorPunctuation
| GeneralCategory::DashPunctuation
| GeneralCategory::MathSymbol
| GeneralCategory::CurrencySymbol
| GeneralCategory::ModifierSymbol
)
}
#[inline]
pub fn char_is_word(ch: char) -> bool {
ch.is_alphanumeric() || ch == '_'
}
#[cfg(test)]
mod test {
use super::*;
#[test]
fn test_categorize() {
#[cfg(not(feature = "unicode-lines"))]
const EOL_TEST_CASE: &str = "\n";
#[cfg(feature = "unicode-lines")]
const EOL_TEST_CASE: &str = "\n\u{000B}\u{000C}\u{0085}\u{2028}\u{2029}";
const WORD_TEST_CASE: &str = "_hello_world_あいうえおー1234567890";
const PUNCTUATION_TEST_CASE: &str =
"!\"#$%&\'()*+,-./:;<=>?@[\\]^`{|}~!”#$%&’()*+、。:;<=>?@「」^`{|}~";
const WHITESPACE_TEST_CASE: &str = "  ";
for ch in EOL_TEST_CASE.chars() {
assert_eq!(CharCategory::Eol, categorize_char(ch));
}
for ch in WHITESPACE_TEST_CASE.chars() {
assert_eq!(
CharCategory::Whitespace,
categorize_char(ch),
"Testing '{}', but got `{:?}` instead of `Category::Whitespace`",
ch,
categorize_char(ch)
);
}
for ch in WORD_TEST_CASE.chars() {
assert_eq!(
CharCategory::Word,
categorize_char(ch),
"Testing '{}', but got `{:?}` instead of `Category::Word`",
ch,
categorize_char(ch)
);
}
for ch in PUNCTUATION_TEST_CASE.chars() {
assert_eq!(
CharCategory::Punctuation,
categorize_char(ch),
"Testing '{}', but got `{:?}` instead of `Category::Punctuation`",
ch,
categorize_char(ch)
);
}
}
}