diff --git a/src/lib.rs b/src/lib.rs index c03b696..562a07b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,7 +1,12 @@ +#![feature(test)] +extern crate test; + #[macro_use] extern crate maplit; + #[macro_use] extern crate lazy_static; + pub mod elements; pub mod tokenizer; mod tokens; @@ -10,9 +15,10 @@ mod tokens; mod tests { use crate::tokenizer::Tokenizer; use crate::tokens::{Grouping, Misc, Operation, Relation, Text, Token}; + use test::Bencher; #[test] - fn it_works() { + fn it_tokenizes_expressions1() { let expression = "sum_(i=1)^n"; let mut tokenizer = Tokenizer::new(expression.to_string()); let tokens = tokenizer.parse(); @@ -22,13 +28,91 @@ mod tests { Token::Operation(Operation::Sum), Token::Misc(Misc::Sub), Token::Grouping(Grouping::RParen), - Token::Text(Text::Plain("i".to_string())), + Token::Text(Text::Symbol("i".to_string())), Token::Relation(Relation::Eq), - Token::Text(Text::Plain("1".to_string())), + Token::Text(Text::Number("1".to_string())), Token::Grouping(Grouping::LParen), Token::Misc(Misc::Pow), - Token::Text(Text::Plain("n".to_string())) + Token::Text(Text::Symbol("n".to_string())) + ] + ); + } + + #[test] + fn it_tokenizes_expressions2() { + let expression = "G_(11) = 5.16e6 € * (215)/(170) = 6.53e6"; + let mut tokenizer = Tokenizer::new(expression.to_string()); + let tokens = tokenizer.parse(); + assert_eq!( + tokens, + vec![ + Token::Text(Text::Symbol("G".to_string())), + Token::Misc(Misc::Sub), + Token::Grouping(Grouping::RParen), + Token::Text(Text::Number("11".to_string())), + Token::Grouping(Grouping::LParen), + Token::Text(Text::Whitespace), + Token::Relation(Relation::Eq), + Token::Text(Text::Whitespace), + Token::Text(Text::Number("5.16e6".to_string())), + Token::Text(Text::Whitespace), + Token::Text(Text::Symbol("€".to_string())), + Token::Text(Text::Whitespace), + Token::Operation(Operation::CDot), + Token::Text(Text::Whitespace), + Token::Grouping(Grouping::RParen), + Token::Text(Text::Number("215".to_string())), + Token::Grouping(Grouping::LParen), + Token::Misc(Misc::AsciiFrac), + Token::Grouping(Grouping::RParen), + Token::Text(Text::Number("170".to_string())), + Token::Grouping(Grouping::LParen), + Token::Text(Text::Whitespace), + Token::Relation(Relation::Eq), + Token::Text(Text::Whitespace), + Token::Text(Text::Number("6.53e6".to_string())) ] ); } + + #[test] + fn it_tokenizes_text1() { + let expression = "\"just plain text\""; + let mut tokenizer = Tokenizer::new(expression.to_string()); + let tokens = tokenizer.parse(); + assert_eq!( + tokens, + vec![Token::Text(Text::Plain("just plain text".to_string()))] + ) + } + + #[test] + fn it_tokenizes_text2() { + let expression = "\"plain text\" * \"plain text 2\" + a"; + let mut tokenizer = Tokenizer::new(expression.to_string()); + let tokens = tokenizer.parse(); + assert_eq!( + tokens, + vec![ + Token::Text(Text::Plain("plain text".to_string())), + Token::Text(Text::Whitespace), + Token::Operation(Operation::CDot), + Token::Text(Text::Whitespace), + Token::Text(Text::Plain("plain text 2".to_string())), + Token::Text(Text::Whitespace), + Token::Operation(Operation::Plus), + Token::Text(Text::Whitespace), + Token::Text(Text::Symbol("a".to_string())) + ] + ) + } + + #[bench] + fn bench_tokenizer(b: &mut Bencher) { + let expression = "sum_(iiiiiiiii=1)^n i^3=((n(n+1))/2)^2"; + b.iter(|| { + let mut tokenizer = Tokenizer::new(expression.to_string()); + let _ = tokenizer.parse(); + }); + } } diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs index 725a35c..7d4c478 100644 --- a/src/tokenizer/mod.rs +++ b/src/tokenizer/mod.rs @@ -1,3 +1,4 @@ +use crate::tokens::constants::misc::{A_TEXT, G_NUMALLOWED}; use crate::tokens::constants::TokenPattern; use crate::tokens::mappings::{ get_accent_mappings, get_arrow_mapping, get_font_mappings, get_greek_mappings, @@ -25,7 +26,6 @@ impl Tokenizer { pub fn parse(&mut self) -> Vec { let mut tokens = Vec::::new(); - let mut tmp_string = String::new(); self.ctm.seek_whitespace(); while !self.ctm.check_eof() { @@ -49,22 +49,17 @@ impl Tokenizer { tokens.push(Token::Font(font)) } else if let Some(whitespace) = self.parse_whitespace() { tokens.push(Token::Text(whitespace)) + } else if let Some(text) = self.parse_text() { + tokens.push(Token::Text(text)) + } else if let Some(number) = self.parse_number() { + tokens.push(Token::Text(number)) } else { - tmp_string.push(self.ctm.get_current()); - let _ = self.ctm.seek_one(); - continue; - } - if !tmp_string.is_empty() { - let last = tokens.pop().unwrap(); - tokens.push(Token::Text(Text::Plain(tmp_string.clone()))); - tmp_string.clear(); - tokens.push(last); + tokens.push(Token::Text(Text::Symbol( + self.ctm.get_current().to_string(), + ))) } let _ = self.ctm.seek_one(); } - if !tmp_string.is_empty() { - tokens.push(Token::Text(Text::Plain(tmp_string))); - } // stripping the whitespace at the end if let Some(Token::Text(Text::Whitespace)) = tokens.last() { tokens.pop().unwrap(); @@ -212,4 +207,38 @@ impl Tokenizer { None } } + + fn parse_text(&mut self) -> Option { + if self.ctm.check_char(&A_TEXT) { + let mut string = String::new(); + + while let Some(ch) = self.ctm.next_char() { + if ch == A_TEXT { + break; + } + string.push(ch); + } + Some(Text::Plain(string)) + } else { + None + } + } + + fn parse_number(&mut self) -> Option { + if self.ctm.get_current().is_numeric() { + let mut string = self.ctm.get_current().to_string(); + + while let Some(ch) = self.ctm.next_char() { + if ch.is_numeric() || self.ctm.check_any(&G_NUMALLOWED) { + string.push(ch); + } else { + break; + } + } + self.ctm.rewind(self.ctm.get_index() - 1); + Some(Text::Number(string)) + } else { + None + } + } } diff --git a/src/tokens/constants/misc.rs b/src/tokens/constants/misc.rs index d10b66b..e579183 100644 --- a/src/tokens/constants/misc.rs +++ b/src/tokens/constants/misc.rs @@ -37,5 +37,9 @@ pub const G_RATIONAL: &'static [&str] = &["QQ"]; pub const G_REAL: &'static [&str] = &["RR"]; pub const G_INTEGER: &'static [&str] = &["ZZ"]; -pub const G_A_TEXT: &'static [&str] = &["\""]; +pub const A_TEXT: char = '"'; pub const G_T_TEX: &'static [&str] = &["text"]; +pub const A_NUMCOMMA: char = '.'; +pub const A_SCIEXP: char = 'e'; + +pub const G_NUMALLOWED: &'static [char] = &[A_NUMCOMMA, A_SCIEXP]; diff --git a/src/tokens/mappings.rs b/src/tokens/mappings.rs index 60f3318..ccd643e 100644 --- a/src/tokens/mappings.rs +++ b/src/tokens/mappings.rs @@ -94,7 +94,6 @@ pub fn get_misc_mappings() -> Vec> { G_RATIONAL => Misc::Rational, G_REAL => Misc::Real, G_INTEGER => Misc::Integer, - G_A_TEXT => Misc::AsciiText, G_T_TEX => Misc::LatexText, }, ] diff --git a/src/tokens/mod.rs b/src/tokens/mod.rs index 53f072c..27a1e8f 100644 --- a/src/tokens/mod.rs +++ b/src/tokens/mod.rs @@ -17,6 +17,8 @@ pub enum Token { #[derive(Debug, Clone, PartialOrd, PartialEq)] pub enum Text { + Number(String), + Symbol(String), Plain(String), Whitespace, } @@ -89,7 +91,6 @@ pub enum Misc { Rational, Real, Integer, - AsciiText, LatexText, }