Add tokenizing of some tokens
Signed-off-by: trivernis <trivernis@protonmail.com>
version-2
commit
b8e2b070c4
@ -0,0 +1 @@
|
|||||||
|
/target
|
@ -0,0 +1,8 @@
|
|||||||
|
# Default ignored files
|
||||||
|
/shelf/
|
||||||
|
/workspace.xml
|
||||||
|
# Datasource local storage ignored files
|
||||||
|
/dataSources/
|
||||||
|
/dataSources.local.xml
|
||||||
|
# Editor-based HTTP Client requests
|
||||||
|
/httpRequests/
|
@ -0,0 +1,8 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="ProjectModuleManager">
|
||||||
|
<modules>
|
||||||
|
<module fileurl="file://$PROJECT_DIR$/.idea/snekdown-v2.iml" filepath="$PROJECT_DIR$/.idea/snekdown-v2.iml" />
|
||||||
|
</modules>
|
||||||
|
</component>
|
||||||
|
</project>
|
@ -0,0 +1,11 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<module type="CPP_MODULE" version="4">
|
||||||
|
<component name="NewModuleRootManager">
|
||||||
|
<content url="file://$MODULE_DIR$">
|
||||||
|
<sourceFolder url="file://$MODULE_DIR$/src" isTestSource="false" />
|
||||||
|
<excludeFolder url="file://$MODULE_DIR$/target" />
|
||||||
|
</content>
|
||||||
|
<orderEntry type="inheritedJdk" />
|
||||||
|
<orderEntry type="sourceFolder" forTests="false" />
|
||||||
|
</component>
|
||||||
|
</module>
|
@ -0,0 +1,6 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="VcsDirectoryMappings">
|
||||||
|
<mapping directory="$PROJECT_DIR$" vcs="Git" />
|
||||||
|
</component>
|
||||||
|
</project>
|
@ -0,0 +1,165 @@
|
|||||||
|
# This file is automatically @generated by Cargo.
|
||||||
|
# It is not intended for manual editing.
|
||||||
|
[[package]]
|
||||||
|
name = "autocfg"
|
||||||
|
version = "1.0.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "bytes"
|
||||||
|
version = "1.0.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "b700ce4376041dcd0a327fd0097c41095743c4c8af8887265942faf1100bd040"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "cfg-if"
|
||||||
|
version = "1.0.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "charred"
|
||||||
|
version = "2.2.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "3a7c637fc7dcf9e573a46ccd25bb981ba9b02d3f48ed669931fcaa4e529299d0"
|
||||||
|
dependencies = [
|
||||||
|
"log",
|
||||||
|
"thiserror",
|
||||||
|
"tokio",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "hermit-abi"
|
||||||
|
version = "0.1.18"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "322f4de77956e22ed0e5032c359a0f1273f1f7f0d79bfa3b8ffbc730d7fbcc5c"
|
||||||
|
dependencies = [
|
||||||
|
"libc",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "libc"
|
||||||
|
version = "0.2.93"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "9385f66bf6105b241aa65a61cb923ef20efc665cb9f9bb50ac2f0c4b7f378d41"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "log"
|
||||||
|
version = "0.4.14"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "51b9bbe6c47d51fc3e1a9b945965946b4c44142ab8792c50835a980d362c2710"
|
||||||
|
dependencies = [
|
||||||
|
"cfg-if",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "memchr"
|
||||||
|
version = "2.3.4"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "0ee1c47aaa256ecabcaea351eae4a9b01ef39ed810004e298d2511ed284b1525"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "num_cpus"
|
||||||
|
version = "1.13.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "05499f3756671c15885fee9034446956fff3f243d6077b91e5767df161f766b3"
|
||||||
|
dependencies = [
|
||||||
|
"hermit-abi",
|
||||||
|
"libc",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "pin-project-lite"
|
||||||
|
version = "0.2.6"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "dc0e1f259c92177c30a4c9d177246edd0a3568b25756a977d0632cf8fa37e905"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "proc-macro2"
|
||||||
|
version = "1.0.26"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "a152013215dca273577e18d2bf00fa862b89b24169fb78c4c95aeb07992c9cec"
|
||||||
|
dependencies = [
|
||||||
|
"unicode-xid",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "quote"
|
||||||
|
version = "1.0.9"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "c3d0b9745dc2debf507c8422de05d7226cc1f0644216dfdfead988f9b1ab32a7"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro2",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "snekdown-v2"
|
||||||
|
version = "0.1.0"
|
||||||
|
dependencies = [
|
||||||
|
"charred",
|
||||||
|
"thiserror",
|
||||||
|
"tokio",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "syn"
|
||||||
|
version = "1.0.70"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "b9505f307c872bab8eb46f77ae357c8eba1fdacead58ee5a850116b1d7f82883"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"unicode-xid",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "thiserror"
|
||||||
|
version = "1.0.24"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "e0f4a65597094d4483ddaed134f409b2cb7c1beccf25201a9f73c719254fa98e"
|
||||||
|
dependencies = [
|
||||||
|
"thiserror-impl",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "thiserror-impl"
|
||||||
|
version = "1.0.24"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "7765189610d8241a44529806d6fd1f2e0a08734313a35d5b3a556f92b381f3c0"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"syn",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "tokio"
|
||||||
|
version = "1.5.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "83f0c8e7c0addab50b663055baf787d0af7f413a46e6e7fb9559a4e4db7137a5"
|
||||||
|
dependencies = [
|
||||||
|
"autocfg",
|
||||||
|
"bytes",
|
||||||
|
"memchr",
|
||||||
|
"num_cpus",
|
||||||
|
"pin-project-lite",
|
||||||
|
"tokio-macros",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "tokio-macros"
|
||||||
|
version = "1.1.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "caf7b11a536f46a809a8a9f0bb4237020f70ecbf115b842360afb127ea2fda57"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"syn",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "unicode-xid"
|
||||||
|
version = "0.2.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "f7fe0bb3479651439c9112f72b6c505038574c9fbb575ed1bf3b797fa39dd564"
|
@ -0,0 +1,15 @@
|
|||||||
|
[package]
|
||||||
|
name = "snekdown-v2"
|
||||||
|
version = "0.1.0"
|
||||||
|
authors = ["trivernis <trivernis@protonmail.com>"]
|
||||||
|
edition = "2018"
|
||||||
|
|
||||||
|
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
charred = "2.2.0"
|
||||||
|
thiserror = "1.0.24"
|
||||||
|
|
||||||
|
[dependencies.tokio]
|
||||||
|
version = "1.5.0"
|
||||||
|
features = ["rt-multi-thread", "macros", "fs"]
|
@ -0,0 +1 @@
|
|||||||
|
// Entry point; the body is empty, so running it currently does nothing.
fn main() {}
|
@ -0,0 +1,10 @@
|
|||||||
|
use charred::error::TapeError;
|
||||||
|
use thiserror::Error;
|
||||||
|
|
||||||
|
/// Shorthand for a `Result` whose error type is [`SnekdownError`].
pub type SnekdownResult<T> = Result<T, SnekdownError>;
|
||||||
|
|
||||||
|
/// Error type carried by [`SnekdownResult`].
#[derive(Debug, Error)]
|
||||||
|
pub enum SnekdownError {
|
||||||
|
/// An error reported by the `charred` crate. `#[error(transparent)]` forwards the
/// message and source of the wrapped error, and `#[from]` lets `?` convert a
/// `TapeError` into this variant.
#[error(transparent)]
|
||||||
|
TapeError(#[from] TapeError),
|
||||||
|
}
|
@ -0,0 +1,29 @@
|
|||||||
|
use crate::error::SnekdownResult;
|
||||||
|
use crate::lexer::token_parsers::{
|
||||||
|
parse_header_start, parse_linebreak, parse_whitespace, parse_word,
|
||||||
|
};
|
||||||
|
use charred::input_reader::InputReader;
|
||||||
|
use charred::lexer::Lexer;
|
||||||
|
use charred::token::{Token, TokenCheckerFn};
|
||||||
|
use std::sync::Arc;
|
||||||
|
use tokio::io::AsyncBufRead;
|
||||||
|
|
||||||
|
mod token_parsers;
|
||||||
|
pub mod tokens;
|
||||||
|
|
||||||
|
/// Tokenizes a string
|
||||||
|
pub async fn tokenize<R: AsyncBufRead + Unpin + Send + 'static>(
|
||||||
|
reader: R,
|
||||||
|
) -> SnekdownResult<Vec<Token>> {
|
||||||
|
let input_reader = InputReader::new(reader);
|
||||||
|
let checkers: Vec<TokenCheckerFn> = vec![
|
||||||
|
Arc::new(|r| Box::pin(parse_header_start(r))),
|
||||||
|
Arc::new(|r| Box::pin(parse_whitespace(r))),
|
||||||
|
Arc::new(|r| Box::pin(parse_linebreak(r))),
|
||||||
|
Arc::new(|r| Box::pin(parse_word(r))),
|
||||||
|
];
|
||||||
|
let mut lexer = Lexer::new(input_reader, checkers);
|
||||||
|
let tokens = lexer.scan().await?;
|
||||||
|
|
||||||
|
Ok(tokens)
|
||||||
|
}
|
@ -0,0 +1,69 @@
|
|||||||
|
use crate::lexer::tokens::{HeaderStartToken, LinebreakToken, WhitespaceToken, WordToken};
|
||||||
|
use charred::error::TapeResult;
|
||||||
|
use charred::input_reader::InputReader;
|
||||||
|
use charred::token::Token;
|
||||||
|
|
||||||
|
/// Parses a whitespace token
|
||||||
|
pub async fn parse_whitespace(reader: &mut InputReader) -> TapeResult<Option<Token>> {
|
||||||
|
let check_whitespace = |c: char| c.is_whitespace() && c != '\n';
|
||||||
|
let mut count = 0;
|
||||||
|
|
||||||
|
while !reader.check_eof().await && check_whitespace(reader.peek().await?) {
|
||||||
|
reader.consume().await?;
|
||||||
|
count += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if count > 0 {
|
||||||
|
Ok(Some(Token::new(WhitespaceToken)))
|
||||||
|
} else {
|
||||||
|
Ok(None)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Parses a linebreak token
|
||||||
|
pub async fn parse_linebreak(reader: &mut InputReader) -> TapeResult<Option<Token>> {
|
||||||
|
if !reader.check_eof().await && reader.peek().await? == '\r' {
|
||||||
|
reader.consume().await?;
|
||||||
|
}
|
||||||
|
if reader.check_eof().await || reader.peek().await? != '\n' {
|
||||||
|
return Ok(None);
|
||||||
|
}
|
||||||
|
reader.consume().await?;
|
||||||
|
|
||||||
|
Ok(Some(Token::new(LinebreakToken)))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Parses a word token
|
||||||
|
pub async fn parse_word(reader: &mut InputReader) -> TapeResult<Option<Token>> {
|
||||||
|
let mut text = String::new();
|
||||||
|
let check_word = |c: char| !c.is_whitespace();
|
||||||
|
|
||||||
|
while !reader.check_eof().await && check_word(reader.peek().await?) {
|
||||||
|
text.push(reader.consume().await?)
|
||||||
|
}
|
||||||
|
|
||||||
|
if text.len() > 0 {
|
||||||
|
Ok(Some(Token::new(WordToken(text))))
|
||||||
|
} else {
|
||||||
|
Ok(None)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Parses a markdown header start
|
||||||
|
pub async fn parse_header_start(reader: &mut InputReader) -> TapeResult<Option<Token>> {
|
||||||
|
let mut size = 0u8;
|
||||||
|
let previous = reader.previous().await;
|
||||||
|
if previous.is_some() && previous.unwrap() != '\n' {
|
||||||
|
return Ok(None);
|
||||||
|
}
|
||||||
|
while !reader.check_eof().await && reader.peek().await? == '#' {
|
||||||
|
reader.consume().await?;
|
||||||
|
size += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if size > 0 {
|
||||||
|
Ok(Some(Token::new(HeaderStartToken(size))))
|
||||||
|
} else {
|
||||||
|
Ok(None)
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,5 @@
|
|||||||
|
/// Token for the start of a markdown header; the field is the number of `#` characters
/// counted by the header-start parser.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct HeaderStartToken(pub u8);
|
||||||
|
|
||||||
|
/// Token for a run of whitespace characters that does not include `'\n'`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct WhitespaceToken;
|
||||||
|
/// Token for a line break (`'\n'`, optionally preceded by `'\r'`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct LinebreakToken;
|
||||||
|
/// Token for a run of non-whitespace characters; the field holds the text of the word.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct WordToken(pub String);
|
@ -0,0 +1,5 @@
|
|||||||
|
#[cfg(test)]
|
||||||
|
mod tests;
|
||||||
|
|
||||||
|
pub mod error;
|
||||||
|
pub mod lexer;
|
@ -0,0 +1,24 @@
|
|||||||
|
use crate::lexer::tokenize;
|
||||||
|
use crate::lexer::tokens::{HeaderStartToken, LinebreakToken, WhitespaceToken, WordToken};
|
||||||
|
use charred::token::UnknownToken;
|
||||||
|
use std::io::Cursor;
|
||||||
|
|
||||||
|
// Smoke test for `tokenize`: feeds a small multi-line document through the lexer,
// checks the types of the first four tokens, and checks that none of the remaining
// tokens is an `UnknownToken`.
#[tokio::test]
|
||||||
|
async fn it_tokenizes_everything() {
|
||||||
|
let input = r#"
|
||||||
|
# A Snekdown Document
|
||||||
|
With multiple lines
|
||||||
|
<[import.md]
|
||||||
|
And some whitespaces
|
||||||
|
|
||||||
|
| tables | exist |
|
||||||
|
|--------|-------|
|
||||||
|
"#;
|
||||||
|
// `Cursor` wraps the in-memory string so it can be passed to `tokenize` as a reader.
let tokens = tokenize(Cursor::new(input)).await.unwrap();
|
||||||
|
let mut tokens = tokens.into_iter();
|
||||||
|
// The raw string opens with a newline, so a linebreak token is expected first,
// followed by the header start, the whitespace after it and the first word.
assert!(tokens.next().unwrap().is::<LinebreakToken>());
|
||||||
|
assert!(tokens.next().unwrap().is::<HeaderStartToken>());
|
||||||
|
assert!(tokens.next().unwrap().is::<WhitespaceToken>());
|
||||||
|
assert!(tokens.next().unwrap().is::<WordToken>());
|
||||||
|
// Every remaining token must have been recognised by one of the checkers.
assert!(tokens.all(|t| !t.is::<UnknownToken>()));
|
||||||
|
}
|
@ -0,0 +1 @@
|
|||||||
|
mod lexer_tests;
|
Loading…
Reference in New Issue