From 7829f739fc8b0ec1325f3ad8139b905125b9fa6d Mon Sep 17 00:00:00 2001
From: Scott Richmond
Date: Sun, 27 Oct 2024 17:45:17 -0400
Subject: [PATCH] lexing?

---
 Cargo.toml     |  11 +++
 src/lib.rs     | 192 +++++++++++++++++++++++++++++++++++++++++++++++
 src/lib.rs.old | 198 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 401 insertions(+)
 create mode 100644 Cargo.toml
 create mode 100644 src/lib.rs
 create mode 100644 src/lib.rs.old

diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..9240eaa
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,11 @@
+[package]
+name = "rudus"
+version = "0.0.1"
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+ariadne = { git = "https://github.com/zesterer/ariadne" }
+chumsky = { git = "https://github.com/zesterer/chumsky" }
+imbl = "3.0.0"
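A note on the [dependencies] block above: ariadne and chumsky track their
repositories' default branches, so builds can drift as upstream moves. Cargo's
`rev` key would pin them; a sketch, with placeholder SHAs rather than real
commits:

    [dependencies]
    ariadne = { git = "https://github.com/zesterer/ariadne", rev = "<commit-sha>" }
    chumsky = { git = "https://github.com/zesterer/chumsky", rev = "<commit-sha>" }
    imbl = "3.0.0"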
| "pkg" | "recur" | "repeat" | "test" | "then" | "use" | "when" | "with" => { + Token::Reserved(word) + } + _ => Token::Word(word), + }); + + let keyword = just(':').ignore_then(word.clone()).map(Token::Keyword); + + let string = just('"') + .ignore_then(none_of("\"").repeated().to_slice()) + .then_ignore(just('"')) + .map(Token::String); + + let punctuation = one_of(",=[]{}()>;\n") + .to_slice() + .or(just("->")) + .or(just("...")) + .or(just("#{")) + .or(just("${")) + .map(Token::Punctuation); + + let token = number + .or(reserved_or_word) + .or(keyword) + .or(string) + .or(punctuation); + + let comment = just('&') + .ignore_then(any().and_is(just('\n').not()).repeated()) + .repeated(); + + let ludus_ws = just(' ').or(just('\t')).repeated(); + + token + .map_with(|tok, e| (tok, e.span())) + .padded_by(ludus_ws) + .padded_by(comment) + .recover_with(skip_then_retry_until(any().ignored(), end())) + .repeated() + .collect() +} + +#[cfg(test)] +mod tests { + use crate::lexer; + use crate::Token; + use chumsky::prelude::*; + + #[test] + fn it_lexes_positive_ints() { + let (mytoken, _) = lexer().parse("42").unwrap()[0].clone(); + assert_eq!(mytoken, Token::Number(42.0)) + } + + #[test] + fn it_lexes_negative_ints() { + let (mytoken, _) = lexer().parse("-42").unwrap()[0].clone(); + assert_eq!(mytoken, Token::Number(-42.0)) + } + + #[test] + fn it_lexes_positive_floats() { + let (mytoken, _) = lexer().parse("42.032").unwrap()[0].clone(); + assert_eq!(mytoken, Token::Number(42.032)) + } + + #[test] + fn it_lexes_positive_decimals() { + let (mytoken, _) = lexer().parse("0.123").unwrap()[0].clone(); + assert_eq!(mytoken, Token::Number(0.123)) + } + + #[test] + fn it_lexes_negative_floats() { + let mytoken = lexer().parse("-42.123").unwrap()[0].clone().0; + assert_eq!(mytoken, Token::Number(-42.123)) + } + + #[test] + fn it_lexes_negative_decimals() { + let mytoken = lexer().parse("-0.123").unwrap()[0].clone().0; + assert_eq!(mytoken, Token::Number(-0.123)) + } + + #[test] + fn it_lexes_bools() { + let tt = lexer().parse("true").unwrap()[0].clone().0; + assert_eq!(tt, Token::Boolean(true)); + let ff = lexer().parse("false").unwrap()[0].clone().0; + assert_eq!(ff, Token::Boolean(false)) + } + + #[test] + fn it_lexes_words() { + let mytoken = lexer().parse("foo").unwrap()[0].clone().0; + assert_eq!(mytoken, Token::Word("foo")) + } + + #[test] + fn it_lexes_keywords() { + let kw = lexer().parse(":foo").unwrap()[0].clone().0; + assert_eq!(kw, Token::Keyword("foo")) + } + + #[test] + fn it_lexes_strings() { + let s = lexer().parse("\"foo bar baz\"").unwrap()[0].clone().0; + assert_eq!(s, Token::String("foo bar baz")) + } + + #[test] + fn it_ignores_comments() { + let e = lexer().parse("foo &bar\nbaz").unwrap(); + assert_eq!(e[0].0, Token::Word("foo")); + assert_eq!(e[2].0, Token::Word("baz")) + } + + #[test] + fn it_lexes_multiple_tokens() { + let toks = lexer().parse("foo;bar\nbaz").unwrap(); + assert_eq!(toks[0].0, Token::Word("foo")); + assert_eq!(toks[2].0, Token::Word("bar")); + assert_eq!(toks[4].0, Token::Word("baz")) + } + + #[test] + fn it_lexes_collections() { + let toks = lexer().parse("(1, 2)").unwrap(); + assert_eq!(toks[0].0, Token::Punctuation("(")); + assert_eq!(toks[1].0, Token::Number(1.0)); + assert_eq!(toks[2].0, Token::Punctuation(",")); + assert_eq!(toks[3].0, Token::Number(2.0)); + assert_eq!(toks[4].0, Token::Punctuation(")")) + } +} diff --git a/src/lib.rs.old b/src/lib.rs.old new file mode 100644 index 0000000..d4b7f92 --- /dev/null +++ b/src/lib.rs.old @@ -0,0 +1,198 @@ +// use 
diff --git a/src/lib.rs.old b/src/lib.rs.old
new file mode 100644
index 0000000..d4b7f92
--- /dev/null
+++ b/src/lib.rs.old
@@ -0,0 +1,198 @@
+// use ariadne::{sources, Color, Label, Report, ReportKind};
+use chumsky::prelude::*;
+use std::fmt;
+
+pub type Span = SimpleSpan;
+
+#[derive(Clone, Debug, PartialEq)]
+pub enum Token<'src> {
+    // atomic types
+    Boolean(bool),
+    Number(f64),
+    String(&'src str),
+    Word(&'src str),
+    Keyword(&'src str),
+    Pkgkeyword(&'src str),
+    Ignored(&'src str),
+
+    // reserved words
+    As,
+    Box,
+    Do,
+    Else,
+    Fn,
+    If,
+    Import,
+    Let,
+    Loop,
+    Match,
+    Nil,
+    Ns,
+    Panic,
+    Pkg,
+    Recur,
+    Repeat,
+    Test,
+    Then,
+    Use,
+    When,
+    With,
+
+    // punctuation
+    Arrow,
+    Comma,
+    Equals,
+    Lbrace,
+    Lbracket,
+    Lparen,
+    Newline,
+    Pipeline,
+    Placeholder,
+    Rbrace,
+    Rbracket,
+    Rparen,
+    Semi,
+    Splat,
+    Startdict,
+    Startset,
+}
+
+impl<'src> fmt::Display for Token<'src> {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        match self {
+            Token::Boolean(b) => write!(f, "{}", b),
+            Token::Number(n) => write!(f, "{}", n),
+            Token::String(s) => write!(f, "{}", s),
+            Token::Word(w) => write!(f, "{}", w),
+            Token::Keyword(k) => write!(f, ":{}", k),
+            Token::Ignored(i) => write!(f, "_{}", i),
+            Token::Pkgkeyword(k) => write!(f, ":{}", k),
+
+            Token::As => write!(f, "as"),
+            Token::Box => write!(f, "box"),
+            Token::Do => write!(f, "do"),
+            Token::Else => write!(f, "else"),
+            Token::Fn => write!(f, "fn"),
+            Token::If => write!(f, "if"),
+            Token::Import => write!(f, "import"),
+            Token::Let => write!(f, "let"),
+            Token::Loop => write!(f, "loop"),
+            Token::Match => write!(f, "match"),
+            Token::Nil => write!(f, "nil"),
+            Token::Ns => write!(f, "ns"),
+            Token::Panic => write!(f, "panic!"),
+            Token::Pkg => write!(f, "pkg"),
+            Token::Recur => write!(f, "recur"),
+            Token::Repeat => write!(f, "repeat"),
+            Token::Test => write!(f, "test"),
+            Token::Then => write!(f, "then"),
+            Token::Use => write!(f, "use"),
+            Token::When => write!(f, "when"),
+            Token::With => write!(f, "with"),
+
+            Token::Arrow => write!(f, "->"),
+            Token::Comma => write!(f, ","),
+            Token::Equals => write!(f, "="),
+            Token::Lbrace => write!(f, "{{"),
+            Token::Lbracket => write!(f, "["),
+            Token::Lparen => write!(f, "("),
+            Token::Newline => write!(f, "\\n"),
+            Token::Pipeline => write!(f, ">"),
+            Token::Placeholder => write!(f, "_"),
+            Token::Rbrace => write!(f, "}}"),
+            Token::Rbracket => write!(f, "]"),
+            Token::Rparen => write!(f, ")"),
+            Token::Semi => write!(f, ";"),
+            Token::Splat => write!(f, "..."),
+            Token::Startdict => write!(f, "#{{"),
+            Token::Startset => write!(f, "${{"),
+        }
+    }
+}
+
+pub fn lexer<'src>(
+) -> impl Parser<'src, &'src str, Vec<(Token<'src>, Span)>, extra::Err<Rich<'src, char, Span>>> {
+    let string = just('"')
+        .ignore_then(none_of('"').repeated().to_slice())
+        .then_ignore(just('"'))
+        .map(Token::String);
+
+    let word = any()
+        .filter(char::is_ascii_lowercase)
+        .then(
+            any()
+                .filter(char::is_ascii_alphanumeric)
+                .or(one_of("*_/!?")),
+        )
+        .repeated()
+        .to_slice();
+
+    let keyword = just(':').ignore_then(word.clone()).map(Token::Keyword);
+
+    let number = just('-')
+        .or_not()
+        .then(text::int(10).then(just('.').then(text::digits(10)).or_not()))
+        .to_slice()
+        .from_str()
+        .unwrapped()
+        .map(Token::Number);
+
+    let reserved_or_word = word.map(|word: &str| match word {
+        "as" => Token::As,
+        "box" => Token::Box,
+        "do" => Token::Do,
+        "else" => Token::Else,
+        "false" => Token::Boolean(false),
+        "fn" => Token::Fn,
+        "if" => Token::If,
+        "import" => Token::Import,
+        "let" => Token::Let,
+        "loop" => Token::Loop,
+        "match" => Token::Match,
+        "nil" => Token::Nil,
+        "ns" => Token::Ns,
+        "panic!" => Token::Panic, // won't match until C-style ident -> Ludus word
+        "pkg" => Token::Pkg,
+        "recur" => Token::Recur,
+        "repeat" => Token::Repeat,
+        "test" => Token::Test,
+        "then" => Token::Then,
+        "true" => Token::Boolean(true),
+        "use" => Token::Use,
+        "when" => Token::When,
+        "with" => Token::With,
+        _ => Token::Word(word),
+    });
+
+    let arrow = just("->").to(Token::Arrow);
+    let comma = just(',').to(Token::Comma);
+    let semicolon = just(';').to(Token::Semi);
+    let placeholder = just('_').to(Token::Placeholder);
+
+    let control = arrow.or(comma).or(semicolon).or(placeholder);
+
+    let comment = just('&')
+        .then(any().and_is(just('\n').not()).repeated())
+        .padded();
+
+    let atom = number.or(string).or(keyword).or(reserved_or_word);
+
+    atom.or(control)
+        .map_with(|tok, e| (tok, e.span()))
+        .padded_by(comment.repeated())
+        .padded()
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::lexer;
+    use crate::Token;
+    use chumsky::{container::Seq, prelude::*};
+
+    #[test]
+    fn it_works() {
+        let toks = lexer().parse("42").unwrap();
+        let (tok, _) = toks[0].clone();
+        assert_eq!(tok, Token::Number(42.0));
+    }
+}
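A closing observation on the difference between src/lib.rs.old and the new
src/lib.rs: the old `word` rule hung `.repeated()` off the whole
head-then-tail pair, so it matched repeated two-character pairs (and could
even match the empty string), while the rewrite repeats only the tail class,
giving roughly [a-z][a-zA-Z0-9*/?!_]*. A regression test along these lines
would pin the new behaviour down (a sketch for the existing tests module in
src/lib.rs, not part of the patch):

    #[test]
    fn it_lexes_words_with_marks() {
        // One lowercase head, then any mix of alphanumerics and */?!_ :
        // the whole of "foo/bar!" should come back as a single word.
        let mytoken = lexer().parse("foo/bar!").unwrap()[0].clone().0;
        assert_eq!(mytoken, Token::Word("foo/bar!"))
    }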