From 7c564d18a293c9044d161eb479860736a7e74120 Mon Sep 17 00:00:00 2001
From: Myrddin Dundragon
Date: Wed, 16 Apr 2025 01:54:22 -0400
Subject: [PATCH] [#3] TokenStream now holds generic variants.

This makes the TokenStream and all of its associated Token types
generic over the variant type of the Token.

Span was also given the ability to merge with another span. This will
make it easier to track a span as users group TokenTypes together to
build their domain-specific types.

All tests and examples were updated for this change.

The version was incremented to 0.2.0.
---
 Cargo.toml                           |   2 +-
 README.md                            |   2 +-
 examples/basic.rs                    | 141 ++++++++++++++++++++++++---
 examples/{example.txt => example.md} |   4 +
 src/lexer.rs                         |  25 ++---
 src/position.rs                      |  13 +++
 src/token.rs                         |  54 +++++-----
 tests/lexer_tests.rs                 |  24 +++--
 8 files changed, 201 insertions(+), 64 deletions(-)
 rename examples/{example.txt => example.md} (65%)

diff --git a/Cargo.toml b/Cargo.toml
index 36cceb8..92b6226 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "rune"
-version = "0.1.0"
+version = "0.2.0"
 edition = "2021"
 description = "A lexical analysis library."
 repository = "/myrddin/rune"

diff --git a/README.md b/README.md
index 1f782fe..a9a4b1b 100644
--- a/README.md
+++ b/README.md
@@ -31,5 +31,5 @@ Then add this to your Cargo.toml file.
 
 ```toml
 [dependencies]
-rune = { registry = "cybermages" }
+rune = { version = "0.2.0", registry = "cybermages" }
 ```
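The headline change in one small sketch: any user-defined variant type can now live in a `TokenStream`. (`Direction` is a hypothetical stand-in for a domain-specific type; `new`, `push`, `len`, and `Span::default` come from the diffs below.)

```rust
use rune::{Span, TokenStream};

// Hypothetical domain-specific variant type; any `T` now works.
#[derive(Debug, Clone, PartialEq, Eq)]
enum Direction
{
    Up,
    Down
}

fn main()
{
    let mut stream: TokenStream<Direction> = TokenStream::new();
    stream.push("^".to_string(), Direction::Up, Span::default());
    stream.push("v".to_string(), Direction::Down, Span::default());

    assert_eq!(stream.len(), 2);
}
```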
diff --git a/examples/basic.rs b/examples/basic.rs
index 181a170..dd2ac83 100644
--- a/examples/basic.rs
+++ b/examples/basic.rs
@@ -1,34 +1,151 @@
 use std::path::PathBuf;
 
-use rune::{Lexer, TokenStream, TokenType};
+use rune::{Lexer, Span, TokenStream, TokenType};
+
+
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum MarkdownTokenType
+{
+    Heading(u8),
+    EmphasisStart,
+    EmphasisEnd,
+    StrongStart,
+    StrongEnd,
+    CodeSpan,
+    Text,
+    Symbol,
+    Whitespace,
+    Newline,
+    Unknown
+}
+
+
+impl std::fmt::Display for MarkdownTokenType
+{
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result
+    {
+        match self
+        {
+            MarkdownTokenType::Heading(level) => write!(f, "Heading({})", level),
+            MarkdownTokenType::EmphasisStart => write!(f, "EmphasisStart"),
+            MarkdownTokenType::EmphasisEnd => write!(f, "EmphasisEnd"),
+            MarkdownTokenType::StrongStart => write!(f, "StrongStart"),
+            MarkdownTokenType::StrongEnd => write!(f, "StrongEnd"),
+            MarkdownTokenType::CodeSpan => write!(f, "CodeSpan"),
+            MarkdownTokenType::Text => write!(f, "Text"),
+            MarkdownTokenType::Symbol => write!(f, "Symbol"),
+            MarkdownTokenType::Whitespace => write!(f, "Whitespace"),
+            MarkdownTokenType::Newline => write!(f, "Newline"),
+            MarkdownTokenType::Unknown => write!(f, "Unknown")
+        }
+    }
+}
 
 
 // Define how you want to interpret base tokens.
-fn transform(tokens: &TokenStream) -> Vec<(TokenType, String)>
-{
-    let mut new_tokens = Vec::new();
-
-    for token in tokens
-    {
-        new_tokens.push((*token.variant, token.lexeme.to_string()));
-    }
-
-    new_tokens
+pub fn transform(input: &TokenStream<TokenType>) -> TokenStream<MarkdownTokenType>
+{
+    let mut output = TokenStream::new();
+
+    let mut i = 0;
+    while i < input.len()
+    {
+        let token = input.get(i).unwrap(); // safe due to bounds check above
+
+        match token.variant
+        {
+            TokenType::Symbol if token.lexeme == "#" =>
+            {
+                // Count consecutive #s for heading level.
+                let mut level = 1;
+                let mut span = token.span.clone();
+
+                while i + 1 < input.len()
+                {
+                    let next = input.get(i + 1).unwrap();
+                    if *next.variant == TokenType::Symbol && next.lexeme == "#"
+                    {
+                        level += 1;
+                        span.end = next.span.end;
+                        i += 1;
+                    }
+                    else
+                    {
+                        break;
+                    }
+                }
+
+                output.push(token.lexeme.repeat(level),
+                            MarkdownTokenType::Heading(level as u8),
+                            span);
+            }
+
+            TokenType::Symbol if token.lexeme == "*" =>
+            {
+                // Look ahead to see if it's strong (**) or emphasis (*).
+                if i + 1 < input.len()
+                {
+                    let next = input.get(i + 1).unwrap();
+                    if *next.variant == TokenType::Symbol && next.lexeme == "*"
+                    {
+                        output.push("**".to_string(),
+                                    MarkdownTokenType::StrongStart,
+                                    Span::merge(*token.span, *next.span));
+                        i += 1; // skip the second '*'
+                    }
+                    else
+                    {
+                        output.push("*".to_string(),
+                                    MarkdownTokenType::EmphasisStart,
+                                    token.span.clone());
+                    }
+                }
+                else
+                {
+                    output.push("*".to_string(),
+                                MarkdownTokenType::EmphasisStart,
+                                token.span.clone());
+                }
+            }
+
+            TokenType::Symbol if token.lexeme == "`" =>
+            {
+                output.push(token.lexeme.to_string(),
+                            MarkdownTokenType::CodeSpan,
+                            token.span.clone());
+            }
+
+            TokenType::Text =>
+            {
+                output.push(token.lexeme.to_string(),
+                            MarkdownTokenType::Text,
+                            token.span.clone());
+            }
+
+            TokenType::Symbol =>
+            {
+                output.push(token.lexeme.to_string(),
+                            MarkdownTokenType::Symbol,
+                            token.span.clone());
+            }
+
+            TokenType::Whitespace =>
+            {
+                output.push(token.lexeme.to_string(),
+                            MarkdownTokenType::Whitespace,
+                            token.span.clone());
+            }
+
+            TokenType::Newline =>
+            {
+                output.push(token.lexeme.to_string(),
+                            MarkdownTokenType::Newline,
+                            token.span.clone());
+            }
+
+            _ =>
+            {
+                output.push(token.lexeme.to_string(),
+                            MarkdownTokenType::Unknown,
+                            token.span.clone());
+            }
+        }
+
+        i += 1;
+    }
+
+    output
 }
 
 
 fn main() -> Result<(), Box<dyn std::error::Error>>
 {
     let mut path = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
 
-    path.push("examples/example.txt");
+    path.push("examples/example.md");
 
     let tokens = Lexer::scan_file(path, transform)?;
 
-    // The tuple here is from the transform function's return type.
-    for (ty, lexeme) in tokens
+    for token in &tokens
     {
-        println!("{:?}: {:?}", ty, lexeme);
+        println!("{}", token);
     }
 
     Ok(())

diff --git a/examples/example.txt b/examples/example.md
similarity index 65%
rename from examples/example.txt
rename to examples/example.md
index 3428015..f8b711b 100644
--- a/examples/example.txt
+++ b/examples/example.md
@@ -1,2 +1,6 @@
+# Example File
+This is an example file for Rune.
+
+## Rune
 The name Rune is inspired by arcane glyphs — ancient symbols holding deep
 meaning. Just like your tokens!
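The same transform style also works without a file on disk; a sketch against the updated `scan_text` below, using an identity closure (swap in `transform` from examples/basic.rs above for the markdown variants):

```rust
use rune::{Lexer, TokenStream, TokenType};

fn main() -> Result<(), Box<dyn std::error::Error>>
{
    // Identity transform: keep the base TokenType variants as-is.
    let tokens = Lexer::scan_text("# Title", |s: &TokenStream<TokenType>| s.clone())?;

    for token in &tokens
    {
        // With the identity transform and the Display impl from src/token.rs,
        // this should print along the lines of `[Symbol: #]`, `[Whitespace:  ]`,
        // `[Text: Title]`.
        println!("{}", token);
    }

    Ok(())
}
```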
diff --git a/src/lexer.rs b/src/lexer.rs
index ee63104..b3bdc8b 100644
--- a/src/lexer.rs
+++ b/src/lexer.rs
@@ -31,24 +31,17 @@ const BUFFER_SIZE: usize = 1024 * 1024;
 /// ```rust
 /// use rune::{Lexer, TokenStream, TokenType};
 ///
-/// fn transform(tokens: &TokenStream) -> Vec<(TokenType, String)>
+/// fn transform(tokens: &TokenStream<TokenType>) -> TokenStream<TokenType>
 /// {
-///     let mut new_tokens = Vec::new();
-///
-///     for token in tokens
-///     {
-///         new_tokens.push((*token.variant, token.lexeme.to_string()));
-///     }
-///
-///     new_tokens
+///     tokens.clone()
 /// }
 ///
 /// let tokens = Lexer::scan_text("Runes += 42", transform).unwrap();
 ///
-/// // The tuple here is from the transform function's return type.
-/// for (ty, lexeme) in tokens
+/// for token in &tokens
 /// {
-///     println!("{:?}: {:?}", ty, lexeme);
+///     println!("{}", token);
 /// }
 /// ```
@@ -72,9 +65,9 @@ impl Lexer
 {
-    /// Scans a file and produces a vector of transformed tokens.
+    /// Scans a file and produces a stream of transformed tokens.
     pub fn scan_file<P, F, T>(path: P, transform: F)
-        -> Result<Vec<T>, LexerError>
+        -> Result<TokenStream<T>, LexerError>
         where P: AsRef<Path>,
-              F: FnOnce(&TokenStream) -> Vec<T>
+              F: FnOnce(&TokenStream<TokenType>) -> TokenStream<T>
     {
         let mut cursor = Position::default();
         let mut stream = TokenStream::new();
@@ -123,8 +116,8 @@ impl Lexer
     /// Scans a full in-memory string and returns transformed tokens.
     pub fn scan_text<F, T>(text: &str, transform: F)
-        -> Result<Vec<T>, LexerError>
-        where F: FnOnce(&TokenStream) -> Vec<T>
+        -> Result<TokenStream<T>, LexerError>
+        where F: FnOnce(&TokenStream<TokenType>) -> TokenStream<T>
     {
         let mut cursor = Position::default();
         let mut stream = TokenStream::new();
@@ -145,7 +138,7 @@ impl Lexer
     }
 
     /// Internal: scans a single line of text into tokens.
-    fn scan(line: &str, stream: &mut TokenStream, cursor: &mut Position)
+    fn scan(line: &str, stream: &mut TokenStream<TokenType>, cursor: &mut Position)
     {
         for c in line.chars()
         {

diff --git a/src/position.rs b/src/position.rs
index c69a917..30f32ae 100644
--- a/src/position.rs
+++ b/src/position.rs
@@ -61,6 +61,19 @@ impl Span
         Span { start: val, end: val }
     }
+
+    pub fn merge(a: Span, b: Span) -> Self
+    {
+        Span { start: a.start,
+               end: b.end }
+    }
+
+    pub fn merge_with(&self, other: Span) -> Span
+    {
+        Span { start: self.start,
+               end: other.end }
+    }
 }
 
 impl Default for Span
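The two merge helpers above differ only in calling convention. A minimal sketch, assuming `Span` is `Copy` (the `Span::merge(*token.span, *next.span)` call in examples/basic.rs implies it) and using `Span::default()` for brevity:

```rust
use rune::Span;

fn main()
{
    let a = Span::default(); // e.g. the span of the first `*` in `**`
    let b = Span::default(); // e.g. the span of the second `*`

    // Both produce a span running from `a.start` to `b.end`.
    let merged = Span::merge(a, b);
    let merged_with = a.merge_with(b);

    println!("{:?} / {:?}", merged, merged_with);
}
```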
diff --git a/src/token.rs b/src/token.rs
index cb9f2a7..49bccae 100644
--- a/src/token.rs
+++ b/src/token.rs
@@ -51,13 +51,13 @@ pub enum TokenType
 /// Everything is in flat arrays for fast access
 /// and minimal cache misses.
 #[derive(Debug, Clone, Default)]
-pub struct TokenStream
+pub struct TokenStream<T>
 {
     /// The text of the `Token`.
     pub lexemes: Vec<String>,
 
     /// The type of `Token`.
-    pub variants: Vec<TokenType>,
+    pub variants: Vec<T>,
 
     /// The location of the `Token` in the file.
     pub locations: Vec<Span>
@@ -66,13 +66,13 @@ pub struct TokenStream
 
 /// A `Token` found during the lexical scan.
 #[derive(Debug)]
-pub struct Token<'a>
+pub struct Token<'a, T>
 {
     /// The characters of the `Token`.
     pub lexeme: &'a str,
 
     /// The `Token`'s type.
-    pub variant: &'a TokenType,
+    pub variant: &'a T,
 
     /// The location in the file of this `Token`.
     pub span: &'a Span
@@ -82,35 +82,35 @@ pub struct Token
 ///
 /// This is the mutable reference.
 #[derive(Debug)]
-pub struct TokenMut<'a>
+pub struct TokenMut<'a, T>
 {
     /// The characters of the `Token`.
     pub lexeme: &'a mut str,
 
     /// The `Token`'s type.
-    pub variant: &'a mut TokenType,
+    pub variant: &'a mut T,
 
     /// The location for this `Token` in the file.
     pub span: &'a mut Span
 }
 
 
 /// An iterator over the Tokens in a `TokenStream`.
-pub struct TokenStreamIter<'a>
+pub struct TokenStreamIter<'a, T>
 {
     /// The stream to iterate over.
-    stream: &'a TokenStream,
+    stream: &'a TokenStream<T>,
 
     /// The position in the stream.
     index: usize
 }
 
 
 /// A mutable iterator over the Tokens in a `TokenStream`.
-pub struct TokenStreamIterMut<'a>
+pub struct TokenStreamIterMut<'a, T>
 {
     /// The characters of the `Token`.
     lexemes: std::slice::IterMut<'a, String>,
 
     /// The `Token`'s type.
-    variants: std::slice::IterMut<'a, TokenType>,
+    variants: std::slice::IterMut<'a, T>,
 
     /// The location for this `Token` in the file.
     locations: std::slice::IterMut<'a, Span>
@@ -118,7 +118,7 @@ pub struct TokenStreamIterMut
 
 
 
-impl TokenStream
+impl<T> TokenStream<T>
 {
     pub fn new() -> Self
     {
@@ -137,7 +137,7 @@ impl TokenStream
         self.lexemes.is_empty()
     }
 
-    pub fn get(&self, index: usize) -> Option<Token<'_>>
+    pub fn get(&self, index: usize) -> Option<Token<'_, T>>
     {
         if index < self.lexemes.len()
         {
@@ -151,20 +151,20 @@ impl TokenStream
         }
     }
 
-    pub fn iter(&self) -> TokenStreamIter<'_>
+    pub fn iter(&self) -> TokenStreamIter<'_, T>
     {
         TokenStreamIter { stream: self,
                           index: 0 }
     }
 
-    pub fn get_mut(&mut self, index: usize) -> Option<TokenMut<'_>>
+    pub fn get_mut(&mut self, index: usize) -> Option<TokenMut<'_, T>>
     {
         if index < self.lexemes.len()
         {
             // SAFETY: We manually split the borrows to avoid
             // double mutable borrow.
             let lexeme = &mut self.lexemes[index] as *mut String;
-            let variant = &mut self.variants[index] as *mut TokenType;
+            let variant = &mut self.variants[index] as *mut T;
             let span = &mut self.locations[index] as *mut Span;
 
             // Convert &mut String to &mut str safely.
@@ -189,14 +189,14 @@ impl TokenStream
         self.locations.clear();
     }
 
-    pub fn push(&mut self, lexeme: String, variant: TokenType, span: Span)
+    pub fn push(&mut self, lexeme: String, variant: T, span: Span)
     {
         self.lexemes.push(lexeme);
         self.variants.push(variant);
         self.locations.push(span);
     }
 
-    pub fn iter_mut(&mut self) -> TokenStreamIterMut<'_>
+    pub fn iter_mut(&mut self) -> TokenStreamIterMut<'_, T>
     {
         TokenStreamIterMut { lexemes: self.lexemes.iter_mut(),
                              variants: self.variants.iter_mut(),
@@ -205,10 +205,10 @@ impl TokenStream
 }
 
 
-impl<'a> IntoIterator for &'a TokenStream
+impl<'a, T> IntoIterator for &'a TokenStream<T>
 {
-    type IntoIter = TokenStreamIter<'a>;
-    type Item = Token<'a>;
+    type IntoIter = TokenStreamIter<'a, T>;
+    type Item = Token<'a, T>;
 
     fn into_iter(self) -> Self::IntoIter
     {
@@ -217,9 +217,9 @@ impl<'a> IntoIterator for &'a TokenStream
     }
 }
 
-impl<'a> Iterator for TokenStreamIter<'a>
+impl<'a, T> Iterator for TokenStreamIter<'a, T>
 {
-    type Item = Token<'a>;
+    type Item = Token<'a, T>;
 
     fn next(&mut self) -> Option<Self::Item>
     {
@@ -240,9 +240,9 @@ impl<'a> Iterator for TokenStreamIter<'a>
 }
 
 
-impl<'a> Iterator for TokenStreamIterMut<'a>
+impl<'a, T> Iterator for TokenStreamIterMut<'a, T>
 {
-    type Item = TokenMut<'a>;
+    type Item = TokenMut<'a, T>;
 
     fn next(&mut self) -> Option<Self::Item>
     {
@@ -257,13 +257,13 @@ impl<'a> Iterator for TokenStreamIterMut<'a>
 }
 
 
-impl<'a> ::std::fmt::Display for Token<'a>
+impl<'a, T: std::fmt::Display> ::std::fmt::Display for Token<'a, T>
 {
     fn fmt(&self, f: &mut ::std::fmt::Formatter) -> ::std::fmt::Result
     {
-        match *self.variant
+        match self.lexeme
         {
-            TokenType::Newline => write!(f, "[{}, {}]", self.variant, "\\n"),
+            "\n" => write!(f, "[{}, {}]", self.variant, "\\n"),
             _ => write!(f, "[{}: {}]", self.variant, self.lexeme)
         }
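Since `Token<'a, T>`'s Display now only needs `T: Display`, a stream over the built-in `TokenType` still prints as before. A sketch (the exact variant names assume `TokenType`'s own `Display` impl, which this patch leaves unchanged):

```rust
use rune::{Span, TokenStream, TokenType};

fn main()
{
    let mut stream: TokenStream<TokenType> = TokenStream::new();
    stream.push("runes".to_string(), TokenType::Text, Span::default());
    stream.push("\n".to_string(), TokenType::Newline, Span::default());

    for token in &stream
    {
        // The "\n" lexeme takes the escaped branch above, so this should
        // print e.g. `[Text: runes]` then `[Newline, \n]`.
        println!("{}", token);
    }
}
```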
diff --git a/tests/lexer_tests.rs b/tests/lexer_tests.rs
index 8cb249a..63a5a30 100644
--- a/tests/lexer_tests.rs
+++ b/tests/lexer_tests.rs
@@ -15,16 +15,22 @@ struct TestCase<'a>
 
 
 
-fn dummy_transform(tokens: &TokenStream) -> Vec<(TokenType, String)>
+fn dummy_transform(tokens: &TokenStream<TokenType>) -> TokenStream<TokenType>
 {
-    let mut new_tokens = Vec::new();
-
-    for token in tokens
-    {
-        new_tokens.push((*token.variant, token.lexeme.to_string()));
-    }
-
-    new_tokens
+    // Identity transform: the tests inspect the raw token stream.
+    tokens.clone()
 }
 
 fn write_temp_file(name: &str, content: &str) -> PathBuf
@@ -53,6 +59,8 @@ fn test_basic_lexing()
         Lexer::scan_text("magic runes", dummy_transform).expect("Lexer should \
                                                                  succeed");
 
+    let tokens = tokens.into_iter().map(|t| (*t.variant, String::from(t.lexeme))).collect::<Vec<_>>();
+
     let expected = vec![(TokenType::Text, "magic".to_string()),
                         (TokenType::Whitespace, " ".to_string()),
                         (TokenType::Text, "runes".to_string()),
@@ -69,6 +77,8 @@ fn test_symbols_and_numbers()
         Lexer::scan_text("13 + 37", dummy_transform).expect("Lexer should \
                                                              succeed");
 
+    let tokens = tokens.into_iter().map(|t| (*t.variant, String::from(t.lexeme))).collect::<Vec<_>>();
+
     let expected = vec![(TokenType::Numeric, "13".into()),
                         (TokenType::Whitespace, " ".into()),
                         (TokenType::Symbol, "+".into()),
@@ -119,14 +129,14 @@ fn test_lexer_with_cases()
                                              on case '{}'",
                                             case.name));
 
-    let result_stripped: Vec<(TokenType, String)> = result;
+    let result = result.into_iter().map(|t| (*t.variant, String::from(t.lexeme))).collect::<Vec<_>>();
 
     let expected = case.expected
                        .iter()
                        .map(|(ty, s)| (*ty, s.to_string()))
                        .collect::<Vec<_>>();
 
-    assert_eq!(result_stripped, expected,
+    assert_eq!(result, expected,
                "Mismatch in test case '{}'",
                case.name);