Change text handling when tokenizing

pull/1/head
trivernis 4 years ago
parent ef7bbae025
commit 01c9203bac
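
The commit carries no description, so here is a minimal sketch of the behavioral change as I read the diff below: bare letters now tokenize as Text::Symbol, digit runs (including decimal and scientific forms such as 5.16e6) as Text::Number, and only quoted input as Text::Plain. Illustration only, written against the Tokenizer API used in the tests; it is not part of the commit.

// Sketch of the new classification (mirrors the updated tests in src/lib.rs);
// assumes Tokenizer, Token and Text are in scope as they are there.
fn show_classification() {
    for expr in &["i=1", "5.16e6", "\"just plain text\""] {
        let mut tokenizer = Tokenizer::new(expr.to_string());
        // e.g. "i=1" -> [Text::Symbol("i"), Relation::Eq, Text::Number("1")]
        println!("{} -> {:?}", expr, tokenizer.parse());
    }
}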

@@ -1,7 +1,12 @@
#![feature(test)]
extern crate test;
#[macro_use]
extern crate maplit;
#[macro_use]
extern crate lazy_static;
pub mod elements;
pub mod tokenizer;
mod tokens;
@@ -10,9 +15,10 @@ mod tokens;
mod tests {
    use crate::tokenizer::Tokenizer;
    use crate::tokens::{Grouping, Misc, Operation, Relation, Text, Token};
    use test::Bencher;
    #[test]
    fn it_works() {
    fn it_tokenizes_expressions1() {
        let expression = "sum_(i=1)^n";
        let mut tokenizer = Tokenizer::new(expression.to_string());
        let tokens = tokenizer.parse();
@@ -22,13 +28,91 @@ mod tests {
                Token::Operation(Operation::Sum),
                Token::Misc(Misc::Sub),
                Token::Grouping(Grouping::RParen),
                Token::Text(Text::Plain("i".to_string())),
                Token::Text(Text::Symbol("i".to_string())),
                Token::Relation(Relation::Eq),
                Token::Text(Text::Plain("1".to_string())),
                Token::Text(Text::Number("1".to_string())),
                Token::Grouping(Grouping::LParen),
                Token::Misc(Misc::Pow),
                Token::Text(Text::Plain("n".to_string()))
                Token::Text(Text::Symbol("n".to_string()))
            ]
        );
    }
    #[test]
    fn it_tokenizes_expressions2() {
        let expression = "G_(11) = 5.16e6 € * (215)/(170) = 6.53e6";
        let mut tokenizer = Tokenizer::new(expression.to_string());
        let tokens = tokenizer.parse();
        assert_eq!(
            tokens,
            vec![
                Token::Text(Text::Symbol("G".to_string())),
                Token::Misc(Misc::Sub),
                Token::Grouping(Grouping::RParen),
                Token::Text(Text::Number("11".to_string())),
                Token::Grouping(Grouping::LParen),
                Token::Text(Text::Whitespace),
                Token::Relation(Relation::Eq),
                Token::Text(Text::Whitespace),
                Token::Text(Text::Number("5.16e6".to_string())),
                Token::Text(Text::Whitespace),
                Token::Text(Text::Symbol("€".to_string())),
                Token::Text(Text::Whitespace),
                Token::Operation(Operation::CDot),
                Token::Text(Text::Whitespace),
                Token::Grouping(Grouping::RParen),
                Token::Text(Text::Number("215".to_string())),
                Token::Grouping(Grouping::LParen),
                Token::Misc(Misc::AsciiFrac),
                Token::Grouping(Grouping::RParen),
                Token::Text(Text::Number("170".to_string())),
                Token::Grouping(Grouping::LParen),
                Token::Text(Text::Whitespace),
                Token::Relation(Relation::Eq),
                Token::Text(Text::Whitespace),
                Token::Text(Text::Number("6.53e6".to_string()))
            ]
        );
    }
    #[test]
    fn it_tokenizes_text1() {
        let expression = "\"just plain text\"";
        let mut tokenizer = Tokenizer::new(expression.to_string());
        let tokens = tokenizer.parse();
        assert_eq!(
            tokens,
            vec![Token::Text(Text::Plain("just plain text".to_string()))]
        )
    }
    #[test]
    fn it_tokenizes_text2() {
        let expression = "\"plain text\" * \"plain text 2\" + a";
        let mut tokenizer = Tokenizer::new(expression.to_string());
        let tokens = tokenizer.parse();
        assert_eq!(
            tokens,
            vec![
                Token::Text(Text::Plain("plain text".to_string())),
                Token::Text(Text::Whitespace),
                Token::Operation(Operation::CDot),
                Token::Text(Text::Whitespace),
                Token::Text(Text::Plain("plain text 2".to_string())),
                Token::Text(Text::Whitespace),
                Token::Operation(Operation::Plus),
                Token::Text(Text::Whitespace),
                Token::Text(Text::Symbol("a".to_string()))
            ]
        )
    }
    #[bench]
    fn bench_tokenizer(b: &mut Bencher) {
        let expression = "sum_(iiiiiiiii=1)^n i^3=((n(n+1))/2)^2";
        b.iter(|| {
            let mut tokenizer = Tokenizer::new(expression.to_string());
            let _ = tokenizer.parse();
        });
    }
}
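
Note that the #[bench] above uses the unstable test crate (hence the #![feature(test)] and extern crate test lines in the first hunk), so it only runs on a nightly toolchain, e.g. via cargo +nightly bench. A rough stable-channel timing loop, purely as an illustration and not part of the commit, could look like this:

// Rough stable-toolchain timing sketch (illustration only);
// assumes Tokenizer is in scope as in the tests above.
use std::time::Instant;

fn time_tokenizer() {
    let expression = "sum_(iiiiiiiii=1)^n i^3=((n(n+1))/2)^2";
    let start = Instant::now();
    for _ in 0..10_000 {
        let mut tokenizer = Tokenizer::new(expression.to_string());
        let _ = tokenizer.parse();
    }
    println!("avg per parse: {:?}", start.elapsed() / 10_000);
}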

@@ -1,3 +1,4 @@
use crate::tokens::constants::misc::{A_TEXT, G_NUMALLOWED};
use crate::tokens::constants::TokenPattern;
use crate::tokens::mappings::{
    get_accent_mappings, get_arrow_mapping, get_font_mappings, get_greek_mappings,
@@ -25,7 +26,6 @@ impl Tokenizer {
    pub fn parse(&mut self) -> Vec<Token> {
        let mut tokens = Vec::<Token>::new();
        let mut tmp_string = String::new();
        self.ctm.seek_whitespace();
        while !self.ctm.check_eof() {
@@ -49,22 +49,17 @@ impl Tokenizer {
                tokens.push(Token::Font(font))
            } else if let Some(whitespace) = self.parse_whitespace() {
                tokens.push(Token::Text(whitespace))
            } else if let Some(text) = self.parse_text() {
                tokens.push(Token::Text(text))
            } else if let Some(number) = self.parse_number() {
                tokens.push(Token::Text(number))
            } else {
                tmp_string.push(self.ctm.get_current());
                let _ = self.ctm.seek_one();
                continue;
            }
            if !tmp_string.is_empty() {
                let last = tokens.pop().unwrap();
                tokens.push(Token::Text(Text::Plain(tmp_string.clone())));
                tmp_string.clear();
                tokens.push(last);
                tokens.push(Token::Text(Text::Symbol(
                    self.ctm.get_current().to_string(),
                )))
            }
            let _ = self.ctm.seek_one();
        }
        if !tmp_string.is_empty() {
            tokens.push(Token::Text(Text::Plain(tmp_string)));
        }
        // stripping the whitespace at the end
        if let Some(Token::Text(Text::Whitespace)) = tokens.last() {
            tokens.pop().unwrap();
@@ -212,4 +207,38 @@ impl Tokenizer {
            None
        }
    }
    fn parse_text(&mut self) -> Option<Text> {
        if self.ctm.check_char(&A_TEXT) {
            let mut string = String::new();
            while let Some(ch) = self.ctm.next_char() {
                if ch == A_TEXT {
                    break;
                }
                string.push(ch);
            }
            Some(Text::Plain(string))
        } else {
            None
        }
    }
    fn parse_number(&mut self) -> Option<Text> {
        if self.ctm.get_current().is_numeric() {
            let mut string = self.ctm.get_current().to_string();
            while let Some(ch) = self.ctm.next_char() {
                if ch.is_numeric() || self.ctm.check_any(&G_NUMALLOWED) {
                    string.push(ch);
                } else {
                    break;
                }
            }
            self.ctm.rewind(self.ctm.get_index() - 1);
            Some(Text::Number(string))
        } else {
            None
        }
    }
}
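
As I read parse_number, it accepts a leading digit and then keeps consuming characters that are either numeric or listed in G_NUMALLOWED ('.' and 'e', defined below), rewinding one position at the end so the parse loop's trailing seek_one does not skip the character that terminated the number. A simplified, self-contained sketch of the same scanning idea; scan_number is a hypothetical helper, not the CharTapeMachine API used above:

// Hypothetical, index-based version of the number scan (illustration only).
fn scan_number(chars: &[char], start: usize) -> Option<(String, usize)> {
    if !chars.get(start)?.is_numeric() {
        return None;
    }
    let mut end = start + 1;
    // '.' and 'e' mirror G_NUMALLOWED, so decimals and 5.16e6-style input stay one token
    while end < chars.len() && (chars[end].is_numeric() || chars[end] == '.' || chars[end] == 'e') {
        end += 1;
    }
    Some((chars[start..end].iter().collect(), end))
}

For the input from it_tokenizes_expressions2, scan_number on the characters of "5.16e6 €" starting at index 0 returns ("5.16e6", 6).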

@@ -37,5 +37,9 @@ pub const G_RATIONAL: &'static [&str] = &["QQ"];
pub const G_REAL: &'static [&str] = &["RR"];
pub const G_INTEGER: &'static [&str] = &["ZZ"];
pub const G_A_TEXT: &'static [&str] = &["\""];
pub const A_TEXT: char = '"';
pub const G_T_TEX: &'static [&str] = &["text"];
pub const A_NUMCOMMA: char = '.';
pub const A_SCIEXP: char = 'e';
pub const G_NUMALLOWED: &'static [char] = &[A_NUMCOMMA, A_SCIEXP];

@@ -94,7 +94,6 @@ pub fn get_misc_mappings() -> Vec<HashMap<TokenPattern, Misc>> {
            G_RATIONAL => Misc::Rational,
            G_REAL => Misc::Real,
            G_INTEGER => Misc::Integer,
            G_A_TEXT => Misc::AsciiText,
            G_T_TEX => Misc::LatexText,
        },
    ]

@@ -17,6 +17,8 @@ pub enum Token {
#[derive(Debug, Clone, PartialOrd, PartialEq)]
pub enum Text {
    Number(String),
    Symbol(String),
    Plain(String),
    Whitespace,
}
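
With the new Symbol variant next to Number, Plain and Whitespace, downstream code can tell single identifiers, numbers and quoted text apart; a hypothetical consumer (not part of this commit) might match on it like this:

// Hypothetical match on the extended Text enum (illustration only).
fn describe(text: &Text) -> String {
    match text {
        Text::Number(n) => format!("number {}", n),
        Text::Symbol(s) => format!("symbol {}", s),
        Text::Plain(p) => format!("plain text {:?}", p),
        Text::Whitespace => String::from("whitespace"),
    }
}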
@@ -89,7 +91,6 @@ pub enum Misc {
    Rational,
    Real,
    Integer,
    AsciiText,
    LatexText,
}
