commit b8e2b070c44ef6670cbb246d631cca3420402cd3
Author: trivernis
Date:   Fri Apr 23 17:16:33 2021 +0200

    Add tokenizing of some tokens

    Signed-off-by: trivernis

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ea8c4bf
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+/target
diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..73f69e0
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
+# Editor-based HTTP Client requests
+/httpRequests/
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..4782a3e
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/snekdown-v2.iml" filepath="$PROJECT_DIR$/.idea/snekdown-v2.iml" />
+    </modules>
+  </component>
+</project>
\ No newline at end of file
diff --git a/.idea/snekdown-v2.iml b/.idea/snekdown-v2.iml
new file mode 100644
index 0000000..c254557
--- /dev/null
+++ b/.idea/snekdown-v2.iml
@@ -0,0 +1,11 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="EMPTY_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$">
+      <sourceFolder url="file://$MODULE_DIR$/src" isTestSource="false" />
+      <excludeFolder url="file://$MODULE_DIR$/target" />
+    </content>
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+</module>
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..94a25f7
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="$PROJECT_DIR$" vcs="Git" />
+  </component>
+</project>
\ No newline at end of file
diff --git a/Cargo.lock b/Cargo.lock
new file mode 100644
index 0000000..243d0c9
--- /dev/null
+++ b/Cargo.lock
@@ -0,0 +1,165 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+[[package]]
+name = "autocfg"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a"
+
+[[package]]
+name = "bytes"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b700ce4376041dcd0a327fd0097c41095743c4c8af8887265942faf1100bd040"
+
+[[package]]
+name = "cfg-if"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
+
+[[package]]
+name = "charred"
+version = "2.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3a7c637fc7dcf9e573a46ccd25bb981ba9b02d3f48ed669931fcaa4e529299d0"
+dependencies = [
+ "log",
+ "thiserror",
+ "tokio",
+]
+
+[[package]]
+name = "hermit-abi"
+version = "0.1.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "322f4de77956e22ed0e5032c359a0f1273f1f7f0d79bfa3b8ffbc730d7fbcc5c"
+dependencies = [
+ "libc",
+]
+
+[[package]]
+name = "libc"
+version = "0.2.93"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9385f66bf6105b241aa65a61cb923ef20efc665cb9f9bb50ac2f0c4b7f378d41"
+
+[[package]]
+name = "log"
+version = "0.4.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "51b9bbe6c47d51fc3e1a9b945965946b4c44142ab8792c50835a980d362c2710"
+dependencies = [
+ "cfg-if",
+]
+
+[[package]]
+name = "memchr"
+version = "2.3.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0ee1c47aaa256ecabcaea351eae4a9b01ef39ed810004e298d2511ed284b1525"
+
+[[package]]
+name = "num_cpus"
+version = "1.13.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "05499f3756671c15885fee9034446956fff3f243d6077b91e5767df161f766b3"
+dependencies = [
+ "hermit-abi",
+ "libc",
+]
+
+[[package]]
+name = "pin-project-lite"
+version = "0.2.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dc0e1f259c92177c30a4c9d177246edd0a3568b25756a977d0632cf8fa37e905"
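
The `#[error(transparent)]`/`#[from]` pair on `TapeError` in src/error.rs (further below) is what lets the lexer code use `?` on charred results directly. A minimal sketch of that conversion (the function is hypothetical, not part of the commit):

    use charred::error::TapeError;
    use snekdown_v2::error::SnekdownResult;

    // Hypothetical example: `?` converts a TapeError into a SnekdownError
    // through the From impl that thiserror's `#[from]` attribute derives.
    fn into_snekdown_result(res: Result<(), TapeError>) -> SnekdownResult<()> {
        res?;
        Ok(())
    }
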
"dc0e1f259c92177c30a4c9d177246edd0a3568b25756a977d0632cf8fa37e905" + +[[package]] +name = "proc-macro2" +version = "1.0.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a152013215dca273577e18d2bf00fa862b89b24169fb78c4c95aeb07992c9cec" +dependencies = [ + "unicode-xid", +] + +[[package]] +name = "quote" +version = "1.0.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3d0b9745dc2debf507c8422de05d7226cc1f0644216dfdfead988f9b1ab32a7" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "snekdown-v2" +version = "0.1.0" +dependencies = [ + "charred", + "thiserror", + "tokio", +] + +[[package]] +name = "syn" +version = "1.0.70" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9505f307c872bab8eb46f77ae357c8eba1fdacead58ee5a850116b1d7f82883" +dependencies = [ + "proc-macro2", + "quote", + "unicode-xid", +] + +[[package]] +name = "thiserror" +version = "1.0.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e0f4a65597094d4483ddaed134f409b2cb7c1beccf25201a9f73c719254fa98e" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7765189610d8241a44529806d6fd1f2e0a08734313a35d5b3a556f92b381f3c0" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tokio" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83f0c8e7c0addab50b663055baf787d0af7f413a46e6e7fb9559a4e4db7137a5" +dependencies = [ + "autocfg", + "bytes", + "memchr", + "num_cpus", + "pin-project-lite", + "tokio-macros", +] + +[[package]] +name = "tokio-macros" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "caf7b11a536f46a809a8a9f0bb4237020f70ecbf115b842360afb127ea2fda57" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "unicode-xid" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7fe0bb3479651439c9112f72b6c505038574c9fbb575ed1bf3b797fa39dd564" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..9fd0acf --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,15 @@ +[package] +name = "snekdown-v2" +version = "0.1.0" +authors = ["trivernis "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +charred = "2.2.0" +thiserror = "1.0.24" + +[dependencies.tokio] +version = "1.5.0" +features = ["rt-multi-thread", "macros", "fs"] \ No newline at end of file diff --git a/src/bin/snekdown.rs b/src/bin/snekdown.rs new file mode 100644 index 0000000..f328e4d --- /dev/null +++ b/src/bin/snekdown.rs @@ -0,0 +1 @@ +fn main() {} diff --git a/src/error.rs b/src/error.rs new file mode 100644 index 0000000..52dfd46 --- /dev/null +++ b/src/error.rs @@ -0,0 +1,10 @@ +use charred::error::TapeError; +use thiserror::Error; + +pub type SnekdownResult = Result; + +#[derive(Debug, Error)] +pub enum SnekdownError { + #[error(transparent)] + TapeError(#[from] TapeError), +} diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs new file mode 100644 index 0000000..ee18bf0 --- /dev/null +++ b/src/lexer/mod.rs @@ -0,0 +1,29 @@ +use crate::error::SnekdownResult; +use crate::lexer::token_parsers::{ + parse_header_start, parse_linebreak, parse_whitespace, parse_word, +}; +use charred::input_reader::InputReader; +use 
diff --git a/src/lexer/token_parsers.rs b/src/lexer/token_parsers.rs
new file mode 100644
index 0000000..20de78c
--- /dev/null
+++ b/src/lexer/token_parsers.rs
@@ -0,0 +1,69 @@
+use crate::lexer::tokens::{HeaderStartToken, LinebreakToken, WhitespaceToken, WordToken};
+use charred::error::TapeResult;
+use charred::input_reader::InputReader;
+use charred::token::Token;
+
+/// Parses a whitespace token
+pub async fn parse_whitespace(reader: &mut InputReader) -> TapeResult<Option<Token>> {
+    let check_whitespace = |c: char| c.is_whitespace() && c != '\n';
+    let mut count = 0;
+
+    while !reader.check_eof().await && check_whitespace(reader.peek().await?) {
+        reader.consume().await?;
+        count += 1;
+    }
+
+    if count > 0 {
+        Ok(Some(Token::new(WhitespaceToken)))
+    } else {
+        Ok(None)
+    }
+}
+
+/// Parses a linebreak token
+pub async fn parse_linebreak(reader: &mut InputReader) -> TapeResult<Option<Token>> {
+    if !reader.check_eof().await && reader.peek().await? == '\r' {
+        reader.consume().await?;
+    }
+    if reader.check_eof().await || reader.peek().await? != '\n' {
+        return Ok(None);
+    }
+    reader.consume().await?;
+
+    Ok(Some(Token::new(LinebreakToken)))
+}
+
+/// Parses a word token
+pub async fn parse_word(reader: &mut InputReader) -> TapeResult<Option<Token>> {
+    let mut text = String::new();
+    let check_word = |c: char| !c.is_whitespace();
+
+    while !reader.check_eof().await && check_word(reader.peek().await?) {
+        text.push(reader.consume().await?)
+    }
+
+    if text.len() > 0 {
+        Ok(Some(Token::new(WordToken(text))))
+    } else {
+        Ok(None)
+    }
+}
+
+/// Parses a markdown header start
+pub async fn parse_header_start(reader: &mut InputReader) -> TapeResult<Option<Token>> {
+    let mut size = 0u8;
+    let previous = reader.previous().await;
+    if previous.is_some() && previous.unwrap() != '\n' {
+        return Ok(None);
+    }
+    while !reader.check_eof().await && reader.peek().await? == '#' {
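
All four parsers share one shape: bail out with `Ok(None)` if the input does not match, otherwise consume the matched characters and return `Ok(Some(token))`. A hypothetical fifth parser for the `<[` import prefix that appears in the test input below would follow the same pattern (sketch only; neither the token nor the parser exists in this commit, and it glosses over rewinding the already-consumed `<`):

    use charred::error::TapeResult;
    use charred::input_reader::InputReader;
    use charred::token::Token;

    /// Hypothetical token for snekdown's `<[` import prefix.
    pub struct ImportStartToken;

    /// Hypothetical checker in the same style as the parsers above.
    pub async fn parse_import_start(reader: &mut InputReader) -> TapeResult<Option<Token>> {
        if reader.check_eof().await || reader.peek().await? != '<' {
            return Ok(None);
        }
        reader.consume().await?;

        if reader.check_eof().await || reader.peek().await? != '[' {
            // A real implementation would have to rewind the consumed '<'
            // here before handing control back to the lexer.
            return Ok(None);
        }
        reader.consume().await?;

        Ok(Some(Token::new(ImportStartToken)))
    }
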
+        reader.consume().await?;
+        size += 1;
+    }
+
+    if size > 0 {
+        Ok(Some(Token::new(HeaderStartToken(size))))
+    } else {
+        Ok(None)
+    }
+}
diff --git a/src/lexer/tokens.rs b/src/lexer/tokens.rs
new file mode 100644
index 0000000..1db26b4
--- /dev/null
+++ b/src/lexer/tokens.rs
@@ -0,0 +1,5 @@
+pub struct HeaderStartToken(pub u8);
+
+pub struct WhitespaceToken;
+pub struct LinebreakToken;
+pub struct WordToken(pub String);
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..9af08e9
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,5 @@
+#[cfg(test)]
+mod tests;
+
+pub mod error;
+pub mod lexer;
diff --git a/src/tests/lexer_tests.rs b/src/tests/lexer_tests.rs
new file mode 100644
index 0000000..b33374c
--- /dev/null
+++ b/src/tests/lexer_tests.rs
@@ -0,0 +1,24 @@
+use crate::lexer::tokenize;
+use crate::lexer::tokens::{HeaderStartToken, LinebreakToken, WhitespaceToken, WordToken};
+use charred::token::UnknownToken;
+use std::io::Cursor;
+
+#[tokio::test]
+async fn it_tokenizes_everything() {
+    let input = r#"
+# A Snekdown Document
+With multiple lines
+<[import.md]
+And some whitespaces
+
+| tables | exist |
+|--------|-------|
+    "#;
+    let tokens = tokenize(Cursor::new(input)).await.unwrap();
+    let mut tokens = tokens.into_iter();
+    assert!(tokens.next().unwrap().is::<LinebreakToken>());
+    assert!(tokens.next().unwrap().is::<HeaderStartToken>());
+    assert!(tokens.next().unwrap().is::<WhitespaceToken>());
+    assert!(tokens.next().unwrap().is::<WordToken>());
+    assert!(tokens.all(|t| !t.is::<UnknownToken>()));
+}
diff --git a/src/tests/mod.rs b/src/tests/mod.rs
new file mode 100644
index 0000000..953071a
--- /dev/null
+++ b/src/tests/mod.rs
@@ -0,0 +1 @@
+mod lexer_tests;
\ No newline at end of file
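
With the library in place, the still-empty `snekdown` binary could drive the lexer like this (a sketch, assuming a `test.md` input file exists; not part of the commit). The tokio "fs", "macros", and "rt-multi-thread" features it relies on are already enabled in Cargo.toml above.

    use snekdown_v2::lexer::tokenize;
    use tokio::fs::File;
    use tokio::io::BufReader;

    #[tokio::main]
    async fn main() {
        // Read a document and report how many tokens the lexer produced.
        let file = File::open("test.md").await.unwrap();
        let tokens = tokenize(BufReader::new(file)).await.unwrap();
        println!("parsed {} tokens", tokens.len());
    }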