Add tokenizing of basic token types

Signed-off-by: trivernis <trivernis@protonmail.com>
version-2
trivernis 3 years ago
commit b8e2b070c4
Signed by: Trivernis
GPG Key ID: DFFFCC2C7A02DB45

1
.gitignore vendored

@@ -0,0 +1 @@
/target

8
.idea/.gitignore vendored

@@ -0,0 +1,8 @@
# Default ignored files
/shelf/
/workspace.xml
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
# Editor-based HTTP Client requests
/httpRequests/

8
.idea/modules.xml

@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/snekdown-v2.iml" filepath="$PROJECT_DIR$/.idea/snekdown-v2.iml" />
    </modules>
  </component>
</project>

11
.idea/snekdown-v2.iml

@@ -0,0 +1,11 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="CPP_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$">
      <sourceFolder url="file://$MODULE_DIR$/src" isTestSource="false" />
      <excludeFolder url="file://$MODULE_DIR$/target" />
    </content>
    <orderEntry type="inheritedJdk" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
</module>

6
.idea/vcs.xml

@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="VcsDirectoryMappings">
    <mapping directory="$PROJECT_DIR$" vcs="Git" />
  </component>
</project>

165
Cargo.lock generated

@@ -0,0 +1,165 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
[[package]]
name = "autocfg"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a"

[[package]]
name = "bytes"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b700ce4376041dcd0a327fd0097c41095743c4c8af8887265942faf1100bd040"

[[package]]
name = "cfg-if"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"

[[package]]
name = "charred"
version = "2.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3a7c637fc7dcf9e573a46ccd25bb981ba9b02d3f48ed669931fcaa4e529299d0"
dependencies = [
 "log",
 "thiserror",
 "tokio",
]

[[package]]
name = "hermit-abi"
version = "0.1.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "322f4de77956e22ed0e5032c359a0f1273f1f7f0d79bfa3b8ffbc730d7fbcc5c"
dependencies = [
 "libc",
]

[[package]]
name = "libc"
version = "0.2.93"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9385f66bf6105b241aa65a61cb923ef20efc665cb9f9bb50ac2f0c4b7f378d41"

[[package]]
name = "log"
version = "0.4.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "51b9bbe6c47d51fc3e1a9b945965946b4c44142ab8792c50835a980d362c2710"
dependencies = [
 "cfg-if",
]

[[package]]
name = "memchr"
version = "2.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0ee1c47aaa256ecabcaea351eae4a9b01ef39ed810004e298d2511ed284b1525"

[[package]]
name = "num_cpus"
version = "1.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "05499f3756671c15885fee9034446956fff3f243d6077b91e5767df161f766b3"
dependencies = [
 "hermit-abi",
 "libc",
]

[[package]]
name = "pin-project-lite"
version = "0.2.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc0e1f259c92177c30a4c9d177246edd0a3568b25756a977d0632cf8fa37e905"

[[package]]
name = "proc-macro2"
version = "1.0.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a152013215dca273577e18d2bf00fa862b89b24169fb78c4c95aeb07992c9cec"
dependencies = [
 "unicode-xid",
]

[[package]]
name = "quote"
version = "1.0.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c3d0b9745dc2debf507c8422de05d7226cc1f0644216dfdfead988f9b1ab32a7"
dependencies = [
 "proc-macro2",
]

[[package]]
name = "snekdown-v2"
version = "0.1.0"
dependencies = [
 "charred",
 "thiserror",
 "tokio",
]

[[package]]
name = "syn"
version = "1.0.70"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b9505f307c872bab8eb46f77ae357c8eba1fdacead58ee5a850116b1d7f82883"
dependencies = [
 "proc-macro2",
 "quote",
 "unicode-xid",
]

[[package]]
name = "thiserror"
version = "1.0.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e0f4a65597094d4483ddaed134f409b2cb7c1beccf25201a9f73c719254fa98e"
dependencies = [
 "thiserror-impl",
]

[[package]]
name = "thiserror-impl"
version = "1.0.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7765189610d8241a44529806d6fd1f2e0a08734313a35d5b3a556f92b381f3c0"
dependencies = [
 "proc-macro2",
 "quote",
 "syn",
]

[[package]]
name = "tokio"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "83f0c8e7c0addab50b663055baf787d0af7f413a46e6e7fb9559a4e4db7137a5"
dependencies = [
 "autocfg",
 "bytes",
 "memchr",
 "num_cpus",
 "pin-project-lite",
 "tokio-macros",
]

[[package]]
name = "tokio-macros"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "caf7b11a536f46a809a8a9f0bb4237020f70ecbf115b842360afb127ea2fda57"
dependencies = [
 "proc-macro2",
 "quote",
 "syn",
]

[[package]]
name = "unicode-xid"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f7fe0bb3479651439c9112f72b6c505038574c9fbb575ed1bf3b797fa39dd564"

15
Cargo.toml

@@ -0,0 +1,15 @@
[package]
name = "snekdown-v2"
version = "0.1.0"
authors = ["trivernis <trivernis@protonmail.com>"]
edition = "2018"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
charred = "2.2.0"
thiserror = "1.0.24"

[dependencies.tokio]
version = "1.5.0"
features = ["rt-multi-thread", "macros", "fs"]
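
The tokio features map onto what the crate uses: `macros` provides `#[tokio::main]` and `#[tokio::test]`, `rt-multi-thread` backs the runtime those macros start, and `fs` enables async file IO. A minimal sketch of how this could be wired up once `main` drives the lexer (the file name `doc.sd` and the async entry point are illustrative assumptions, not part of this commit):

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // hypothetical: read a document via the `fs` feature and tokenize it
    let file = tokio::fs::File::open("doc.sd").await?;
    let reader = tokio::io::BufReader::new(file);
    let tokens = snekdown_v2::lexer::tokenize(reader).await?;
    println!("parsed {} tokens", tokens.len());
    Ok(())
}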

1
src/main.rs

@@ -0,0 +1 @@
fn main() {}

10
src/error.rs

@@ -0,0 +1,10 @@
use charred::error::TapeError;
use thiserror::Error;

pub type SnekdownResult<T> = Result<T, SnekdownError>;

#[derive(Debug, Error)]
pub enum SnekdownError {
    #[error(transparent)]
    TapeError(#[from] TapeError),
}
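
For context: `#[from]` derives `From<TapeError> for SnekdownError`, which is what lets the lexer module apply `?` directly to charred results, and `#[error(transparent)]` forwards the source error's message unchanged. A minimal sketch of the conversion (the `demo` function is illustrative only):

fn demo(res: Result<(), TapeError>) -> SnekdownResult<()> {
    // `?` converts TapeError into SnekdownError via the derived From impl
    res?;
    Ok(())
}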

29
src/lexer/mod.rs

@@ -0,0 +1,29 @@
use crate::error::SnekdownResult;
use crate::lexer::token_parsers::{
    parse_header_start, parse_linebreak, parse_whitespace, parse_word,
};
use charred::input_reader::InputReader;
use charred::lexer::Lexer;
use charred::token::{Token, TokenCheckerFn};
use std::sync::Arc;
use tokio::io::AsyncBufRead;

mod token_parsers;
pub mod tokens;

/// Tokenizes a string
pub async fn tokenize<R: AsyncBufRead + Unpin + Send + 'static>(
    reader: R,
) -> SnekdownResult<Vec<Token>> {
    let input_reader = InputReader::new(reader);
    let checkers: Vec<TokenCheckerFn> = vec![
        Arc::new(|r| Box::pin(parse_header_start(r))),
        Arc::new(|r| Box::pin(parse_whitespace(r))),
        Arc::new(|r| Box::pin(parse_linebreak(r))),
        Arc::new(|r| Box::pin(parse_word(r))),
    ];
    let mut lexer = Lexer::new(input_reader, checkers);
    let tokens = lexer.scan().await?;

    Ok(tokens)
}
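
Note that the specific `parse_header_start` checker is listed ahead of the catch-all `parse_word`, so headers are presumably matched first. Any `AsyncBufRead` source works as input; a minimal usage sketch mirroring the test further down (the literal input is illustrative):

use std::io::Cursor;

#[tokio::main]
async fn main() {
    let tokens = snekdown_v2::lexer::tokenize(Cursor::new("# Heading\nsome words"))
        .await
        .unwrap();
    println!("{} tokens", tokens.len());
}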

69
src/lexer/token_parsers.rs

@@ -0,0 +1,69 @@
use crate::lexer::tokens::{HeaderStartToken, LinebreakToken, WhitespaceToken, WordToken};
use charred::error::TapeResult;
use charred::input_reader::InputReader;
use charred::token::Token;

/// Parses a whitespace token
pub async fn parse_whitespace(reader: &mut InputReader) -> TapeResult<Option<Token>> {
    let check_whitespace = |c: char| c.is_whitespace() && c != '\n';
    let mut count = 0;

    while !reader.check_eof().await && check_whitespace(reader.peek().await?) {
        reader.consume().await?;
        count += 1;
    }
    if count > 0 {
        Ok(Some(Token::new(WhitespaceToken)))
    } else {
        Ok(None)
    }
}

/// Parses a linebreak token
pub async fn parse_linebreak(reader: &mut InputReader) -> TapeResult<Option<Token>> {
    // a windows-style linebreak is preceded by a carriage return
    if !reader.check_eof().await && reader.peek().await? == '\r' {
        reader.consume().await?;
    }
    if reader.check_eof().await || reader.peek().await? != '\n' {
        return Ok(None);
    }
    reader.consume().await?;

    Ok(Some(Token::new(LinebreakToken)))
}

/// Parses a word token
pub async fn parse_word(reader: &mut InputReader) -> TapeResult<Option<Token>> {
    let mut text = String::new();
    let check_word = |c: char| !c.is_whitespace();

    while !reader.check_eof().await && check_word(reader.peek().await?) {
        text.push(reader.consume().await?);
    }
    if !text.is_empty() {
        Ok(Some(Token::new(WordToken(text))))
    } else {
        Ok(None)
    }
}

/// Parses a markdown header start
pub async fn parse_header_start(reader: &mut InputReader) -> TapeResult<Option<Token>> {
    let mut size = 0u8;

    // a header may only start at the beginning of a line
    if let Some(previous) = reader.previous().await {
        if previous != '\n' {
            return Ok(None);
        }
    }
    while !reader.check_eof().await && reader.peek().await? == '#' {
        reader.consume().await?;
        size += 1;
    }
    if size > 0 {
        Ok(Some(Token::new(HeaderStartToken(size))))
    } else {
        Ok(None)
    }
}
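
All four parsers share one shape: return `Ok(None)` when the input doesn't match, so the lexer can try the next checker, otherwise consume characters and wrap a token. A new token kind would slot in the same way; a hypothetical sketch (`parse_pipe` and `PipeToken` are illustrative and not part of this commit):

pub struct PipeToken;

/// Hypothetical: parses a single table pipe character
pub async fn parse_pipe(reader: &mut InputReader) -> TapeResult<Option<Token>> {
    if !reader.check_eof().await && reader.peek().await? == '|' {
        reader.consume().await?;
        return Ok(Some(Token::new(PipeToken)));
    }
    Ok(None)
}

It would then be registered in `tokenize` with another `Arc::new(|r| Box::pin(parse_pipe(r)))` entry ahead of `parse_word`.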

5
src/lexer/tokens.rs

@@ -0,0 +1,5 @@
pub struct HeaderStartToken(pub u8);
pub struct WhitespaceToken;
pub struct LinebreakToken;
pub struct WordToken(pub String);
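
These are plain marker structs; charred's `Token` stores the value type-erased, and consumers check the concrete type with `is::<T>()`, as the test below does. A small sketch using only the calls that appear elsewhere in this commit:

use charred::token::Token;

fn demo() {
    let token = Token::new(WordToken(String::from("snek")));
    assert!(token.is::<WordToken>());
    assert!(!token.is::<HeaderStartToken>());
}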

5
src/lib.rs

@@ -0,0 +1,5 @@
#[cfg(test)]
mod tests;

pub mod error;
pub mod lexer;

24
src/tests/lexer_tests.rs

@@ -0,0 +1,24 @@
use crate::lexer::tokenize;
use crate::lexer::tokens::{HeaderStartToken, LinebreakToken, WhitespaceToken, WordToken};
use charred::token::UnknownToken;
use std::io::Cursor;

#[tokio::test]
async fn it_tokenizes_everything() {
    let input = r#"
# A Snekdown Document
With multiple lines
<[import.md]
And some whitespaces
| tables | exist |
|--------|-------|
"#;
    let tokens = tokenize(Cursor::new(input)).await.unwrap();
    let mut tokens = tokens.into_iter();

    assert!(tokens.next().unwrap().is::<LinebreakToken>());
    assert!(tokens.next().unwrap().is::<HeaderStartToken>());
    assert!(tokens.next().unwrap().is::<WhitespaceToken>());
    assert!(tokens.next().unwrap().is::<WordToken>());
    assert!(tokens.all(|t| !t.is::<UnknownToken>()));
}

1
src/tests/mod.rs

@@ -0,0 +1 @@
mod lexer_tests;