Fix Unicode (#135)

* init

* wip

* wip

* fix unicode break

* fix unicode break

* Update helix-core/src/transaction.rs

Co-authored-by: Benoît Cortier <benoit.cortier@fried-world.eu>

* clippy

* fix

* add changes

* added test

* wip

* wip

* wip

* wip

* fix

* fix view

* fix #88

Co-authored-by: Benoît Cortier <benoit.cortier@fried-world.eu>
imgbot
Kirawi 3 years ago committed by GitHub
parent 8f1eb7b2b0
commit b873fb9897
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

7
Cargo.lock generated

@ -265,6 +265,7 @@ dependencies = [
"tendril", "tendril",
"toml", "toml",
"tree-sitter", "tree-sitter",
"unicode-general-category",
"unicode-segmentation", "unicode-segmentation",
"unicode-width", "unicode-width",
] ]
@ -969,6 +970,12 @@ dependencies = [
"matches", "matches",
] ]
[[package]]
name = "unicode-general-category"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "07547e3ee45e28326cc23faac56d44f58f16ab23e413db526debce3b0bfd2742"
[[package]] [[package]]
name = "unicode-normalization" name = "unicode-normalization"
version = "0.1.19" version = "0.1.19"

@ -19,6 +19,7 @@ smallvec = "1.4"
tendril = "0.4.2" tendril = "0.4.2"
unicode-segmentation = "1.6" unicode-segmentation = "1.6"
unicode-width = "0.1" unicode-width = "0.1"
unicode-general-category = "0.4.0"
# slab = "0.4.2" # slab = "0.4.2"
tree-sitter = "0.19" tree-sitter = "0.19"
once_cell = "1.4" once_cell = "1.4"

@ -88,11 +88,11 @@ pub fn move_next_word_start(slice: RopeSlice, mut begin: usize, count: usize) ->
if is_word(ch) { if is_word(ch) {
skip_over_next(slice, &mut end, is_word); skip_over_next(slice, &mut end, is_word);
} else if ch.is_ascii_punctuation() { } else if is_punctuation(ch) {
skip_over_next(slice, &mut end, |ch| ch.is_ascii_punctuation()); skip_over_next(slice, &mut end, is_punctuation);
} }
skip_over_next(slice, &mut end, is_horiz_blank); skip_over_next(slice, &mut end, char::is_whitespace);
} }
Some(Range::new(begin, end - 1)) Some(Range::new(begin, end - 1))
@ -119,15 +119,15 @@ pub fn move_prev_word_start(slice: RopeSlice, mut begin: usize, count: usize) ->
end = begin; end = begin;
with_end = skip_over_prev(slice, &mut end, is_horiz_blank); with_end = skip_over_prev(slice, &mut end, char::is_whitespace);
// refetch // refetch
let ch = slice.char(end); let ch = slice.char(end);
if is_word(ch) { if is_word(ch) {
with_end = skip_over_prev(slice, &mut end, is_word); with_end = skip_over_prev(slice, &mut end, is_word);
} else if ch.is_ascii_punctuation() { } else if is_punctuation(ch) {
with_end = skip_over_prev(slice, &mut end, |ch| ch.is_ascii_punctuation()); with_end = skip_over_prev(slice, &mut end, is_punctuation);
} }
} }
@ -155,15 +155,15 @@ pub fn move_next_word_end(slice: RopeSlice, mut begin: usize, count: usize) -> O
end = begin; end = begin;
skip_over_next(slice, &mut end, is_horiz_blank); skip_over_next(slice, &mut end, char::is_whitespace);
// refetch // refetch
let ch = slice.char(end); let ch = slice.char(end);
if is_word(ch) { if is_word(ch) {
skip_over_next(slice, &mut end, is_word); skip_over_next(slice, &mut end, is_word);
} else if ch.is_ascii_punctuation() { } else if is_punctuation(ch) {
skip_over_next(slice, &mut end, |ch| ch.is_ascii_punctuation()); skip_over_next(slice, &mut end, is_punctuation);
} }
} }
@ -174,12 +174,28 @@ pub fn move_next_word_end(slice: RopeSlice, mut begin: usize, count: usize) -> O
// used for by-word movement // used for by-word movement
#[inline]
pub(crate) fn is_word(ch: char) -> bool { pub(crate) fn is_word(ch: char) -> bool {
ch.is_alphanumeric() || ch == '_' ch.is_alphanumeric() || ch == '_'
} }
pub(crate) fn is_horiz_blank(ch: char) -> bool { #[inline]
matches!(ch, ' ' | '\t') pub(crate) fn is_punctuation(ch: char) -> bool {
use unicode_general_category::{get_general_category, GeneralCategory};
matches!(
get_general_category(ch),
GeneralCategory::OtherPunctuation
| GeneralCategory::OpenPunctuation
| GeneralCategory::ClosePunctuation
| GeneralCategory::InitialPunctuation
| GeneralCategory::FinalPunctuation
| GeneralCategory::ConnectorPunctuation
| GeneralCategory::DashPunctuation
| GeneralCategory::MathSymbol
| GeneralCategory::CurrencySymbol
| GeneralCategory::ModifierSymbol
)
} }
#[derive(Debug, Eq, PartialEq)] #[derive(Debug, Eq, PartialEq)]
@ -191,14 +207,15 @@ pub(crate) enum Category {
Unknown, Unknown,
} }
#[inline]
pub(crate) fn categorize(ch: char) -> Category { pub(crate) fn categorize(ch: char) -> Category {
if ch == '\n' { if ch == '\n' {
Category::Eol Category::Eol
} else if ch.is_ascii_whitespace() { } else if ch.is_whitespace() {
Category::Whitespace Category::Whitespace
} else if is_word(ch) { } else if is_word(ch) {
Category::Word Category::Word
} else if ch.is_ascii_punctuation() { } else if is_punctuation(ch) {
Category::Punctuation Category::Punctuation
} else { } else {
Category::Unknown Category::Unknown
@ -213,6 +230,7 @@ where
{ {
let mut chars = slice.chars_at(*pos); let mut chars = slice.chars_at(*pos);
#[allow(clippy::while_let_on_iterator)]
while let Some(ch) = chars.next() { while let Some(ch) = chars.next() {
if !fun(ch) { if !fun(ch) {
break; break;
@ -231,6 +249,7 @@ where
// need to +1 so that prev() includes current char // need to +1 so that prev() includes current char
let mut chars = slice.chars_at(*pos + 1); let mut chars = slice.chars_at(*pos + 1);
#[allow(clippy::while_let_on_iterator)]
while let Some(ch) = chars.prev() { while let Some(ch) = chars.prev() {
if !fun(ch) { if !fun(ch) {
break; break;
@ -259,4 +278,44 @@ mod test {
(1, 2).into() (1, 2).into()
); );
} }
#[test]
fn test_categorize() {
const WORD_TEST_CASE: &'static str =
"_hello_world_あいうえおー1234567890";
const PUNCTUATION_TEST_CASE: &'static str = "!\"#$%&\'()*+,-./:;<=>?@[\\]^`{|}~!”#$%&’()*+、。:;<=>?@「」^`{|}~";
const WHITESPACE_TEST_CASE: &'static str = "  ";
assert_eq!(Category::Eol, categorize('\n'));
for ch in WHITESPACE_TEST_CASE.chars() {
assert_eq!(
Category::Whitespace,
categorize(ch),
"Testing '{}', but got `{:?}` instead of `Category::Whitespace`",
ch,
categorize(ch)
);
}
for ch in WORD_TEST_CASE.chars() {
assert_eq!(
Category::Word,
categorize(ch),
"Testing '{}', but got `{:?}` instead of `Category::Word`",
ch,
categorize(ch)
);
}
for ch in PUNCTUATION_TEST_CASE.chars() {
assert_eq!(
Category::Punctuation,
categorize(ch),
"Testing '{}', but got `{:?}` instead of `Category::Punctuation`",
ch,
categorize(ch)
);
}
}
} }

@ -758,7 +758,7 @@ mod test {
#[test] #[test]
fn combine_with_utf8() { fn combine_with_utf8() {
const TEST_CASE: &'static str = "Hello, これはヒレクスエディターです!"; const TEST_CASE: &'static str = "Hello, これはヘリックスエディターです!";
let empty = Rope::from(""); let empty = Rope::from("");
let mut a = ChangeSet::new(&empty); let mut a = ChangeSet::new(&empty);

@ -1,4 +1,4 @@
use crate::movement::{categorize, is_horiz_blank, is_word, skip_over_prev}; use crate::movement::{categorize, is_punctuation, is_word, skip_over_prev};
use ropey::RopeSlice; use ropey::RopeSlice;
#[must_use] #[must_use]
@ -13,15 +13,15 @@ pub fn nth_prev_word_boundary(slice: RopeSlice, mut char_idx: usize, count: usiz
// return if not skip while? // return if not skip while?
skip_over_prev(slice, &mut char_idx, |ch| ch == '\n'); skip_over_prev(slice, &mut char_idx, |ch| ch == '\n');
with_end = skip_over_prev(slice, &mut char_idx, is_horiz_blank); with_end = skip_over_prev(slice, &mut char_idx, char::is_whitespace);
// refetch // refetch
let ch = slice.char(char_idx); let ch = slice.char(char_idx);
if is_word(ch) { if is_word(ch) {
with_end = skip_over_prev(slice, &mut char_idx, is_word); with_end = skip_over_prev(slice, &mut char_idx, is_word);
} else if ch.is_ascii_punctuation() { } else if is_punctuation(ch) {
with_end = skip_over_prev(slice, &mut char_idx, |ch| ch.is_ascii_punctuation()); with_end = skip_over_prev(slice, &mut char_idx, is_punctuation);
} }
} }
@ -47,11 +47,11 @@ fn different_prev_word_boundary() {
t("hello, world", "hello, "); t("hello, world", "hello, ");
t("hello, ", "hello"); t("hello, ", "hello");
t("hello", ""); t("hello", "");
t("こんにちは、世界!", "こんにちは、世界"); // TODO: punctuation t("こんにちは、世界!", "こんにちは、世界");
t("こんにちは、世界", "こんにちは、"); t("こんにちは、世界", "こんにちは、");
t("こんにちは、", "こんにちは"); // what? t("こんにちは、", "こんにちは");
t("こんにちは", ""); t("こんにちは", "");
t("この世界。", "この世界"); // what? t("この世界。", "この世界");
t("この世界", ""); t("この世界", "");
t("お前はもう死んでいる", ""); t("お前はもう死んでいる", "");
t("その300円です", ""); // TODO: should stop at 300 t("その300円です", ""); // TODO: should stop at 300

@ -654,9 +654,10 @@ pub fn split_selection_on_newline(cx: &mut Context) {
fn _search(doc: &mut Document, view: &mut View, contents: &str, regex: &Regex, extend: bool) { fn _search(doc: &mut Document, view: &mut View, contents: &str, regex: &Regex, extend: bool) {
let text = doc.text(); let text = doc.text();
let selection = doc.selection(view.id); let selection = doc.selection(view.id);
let start = selection.cursor(); let start = text.char_to_byte(selection.cursor());
// use find_at to find the next match after the cursor, loop around the end // use find_at to find the next match after the cursor, loop around the end
// Careful, `Regex` uses `bytes` as offsets, not character indices!
let mat = regex let mat = regex
.find_at(contents, start) .find_at(contents, start)
.or_else(|| regex.find(contents)); .or_else(|| regex.find(contents));
@ -670,7 +671,7 @@ fn _search(doc: &mut Document, view: &mut View, contents: &str, regex: &Regex, e
return; return;
} }
let head = end; let head = end - 1;
let selection = if extend { let selection = if extend {
selection.clone().push(Range::new(start, head)) selection.clone().push(Range::new(start, head))
@ -1027,7 +1028,7 @@ pub fn command_mode(cx: &mut Context) {
let mut prompt = Prompt::new( let mut prompt = Prompt::new(
":".to_owned(), ":".to_owned(),
|input: &str| { |input: &str| {
// we use split(' ') over split_ascii_whitespace() because we care about empty segments // we use split(' ') over split_whitespace() because we care about empty segments
let parts = input.split(' ').collect::<Vec<&str>>(); let parts = input.split(' ').collect::<Vec<&str>>();
// simple heuristic: if there's just one part, complete command name. // simple heuristic: if there's just one part, complete command name.
@ -1069,7 +1070,7 @@ pub fn command_mode(cx: &mut Context) {
return; return;
} }
let parts = input.split_ascii_whitespace().collect::<Vec<&str>>(); let parts = input.split_whitespace().collect::<Vec<&str>>();
if parts.is_empty() { if parts.is_empty() {
return; return;
} }

@ -106,7 +106,7 @@ impl View {
/// Calculates the last visible line on screen /// Calculates the last visible line on screen
#[inline] #[inline]
pub fn last_line(&self, doc: &Document) -> usize { pub fn last_line(&self, doc: &Document) -> usize {
let height = self.area.height.saturating_sub(2); // - 2 for statusline let height = self.area.height.saturating_sub(1); // - 1 for statusline
std::cmp::min( std::cmp::min(
self.first_line + height as usize, self.first_line + height as usize,
doc.text().len_lines() - 1, doc.text().len_lines() - 1,

Loading…
Cancel
Save