diff --git a/Cargo.toml b/Cargo.toml
index 36cceb8..92b6226 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "rune"
-version = "0.1.0"
+version = "0.2.0"
 edition = "2021"
 description = "A lexical analysis library."
 repository = "/myrddin/rune"
diff --git a/README.md b/README.md
index 1f782fe..a9a4b1b 100644
--- a/README.md
+++ b/README.md
@@ -31,5 +31,5 @@
 Then add this to your Cargo.toml file.
 ```toml
 [dependencies]
-rune = { registry = "cybermages" }
+rune = { version = "0.2.0", registry = "cybermages" }
 ```
diff --git a/examples/basic.rs b/examples/basic.rs
index 181a170..dd2ac83 100644
--- a/examples/basic.rs
+++ b/examples/basic.rs
@@ -1,34 +1,150 @@
 use std::path::PathBuf;
 
-use rune::{Lexer, TokenStream, TokenType};
+use rune::{Lexer, Span, TokenStream, TokenType};
+
+
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum MarkdownTokenType {
+    Heading(u8),
+    EmphasisStart,
+    EmphasisEnd,
+    StrongStart,
+    StrongEnd,
+    CodeSpan,
+    Text,
+    Symbol,
+    Whitespace,
+    Newline,
+    Unknown,
+}
+
+
+impl std::fmt::Display for MarkdownTokenType {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            MarkdownTokenType::Heading(level) => write!(f, "Heading({})", level),
+            MarkdownTokenType::EmphasisStart => write!(f, "EmphasisStart"),
+            MarkdownTokenType::EmphasisEnd => write!(f, "EmphasisEnd"),
+            MarkdownTokenType::StrongStart => write!(f, "StrongStart"),
+            MarkdownTokenType::StrongEnd => write!(f, "StrongEnd"),
+            MarkdownTokenType::CodeSpan => write!(f, "CodeSpan"),
+            MarkdownTokenType::Text => write!(f, "Text"),
+            MarkdownTokenType::Symbol => write!(f, "Symbol"),
+            MarkdownTokenType::Whitespace => write!(f, "Whitespace"),
+            MarkdownTokenType::Newline => write!(f, "Newline"),
+            MarkdownTokenType::Unknown => write!(f, "Unknown"),
+        }
+    }
+}
 
 
 // Define how you want to interpret base tokens
-fn transform(tokens: &TokenStream) -> Vec<(TokenType, String)>
-{
-   let mut new_tokens = Vec::new();
+pub fn transform(input: &TokenStream<TokenType>) -> TokenStream<MarkdownTokenType> {
+    let mut output = TokenStream::new();
 
-   for token in tokens
-   {
-      new_tokens.push((*token.variant, token.lexeme.to_string()));
-   }
+    let mut i = 0;
+    while i < input.len() {
+        let token = input.get(i).unwrap(); // safe due to bounds check above
 
-   new_tokens
+        match token.variant {
+            TokenType::Symbol if token.lexeme == "#" => {
+                // Count consecutive #s for heading level
+                let mut level = 1;
+                let mut span = token.span.clone();
+
+                while i + 1 < input.len() {
+                    let next = input.get(i + 1).unwrap();
+                    if *next.variant == TokenType::Symbol && next.lexeme == "#" {
+                        level += 1;
+                        span.end = next.span.end;
+                        i += 1;
+                    } else {
+                        break;
+                    }
+                }
+
+                output.push(token.lexeme.repeat(level),
+                            MarkdownTokenType::Heading(level as u8),
+                            span);
+            }
+
+            TokenType::Symbol if token.lexeme == "*" => {
+                // Look ahead to see if it's strong (**) or emphasis (*)
+                if i + 1 < input.len() {
+                    let next = input.get(i + 1).unwrap();
+                    if *next.variant == TokenType::Symbol && next.lexeme == "*" {
+                        output.push("**".to_string(),
+                                    MarkdownTokenType::StrongStart,
+                                    Span::merge(*token.span, *next.span));
+                        i += 1; // skip the second '*'
+                    } else {
+                        output.push("*".to_string(),
+                                    MarkdownTokenType::EmphasisStart,
+                                    token.span.clone());
+                    }
+                } else {
+                    output.push("*".to_string(),
+                                MarkdownTokenType::EmphasisStart,
+                                token.span.clone());
+                }
+            }
+
+            TokenType::Symbol if token.lexeme == "`" => {
+                output.push(token.lexeme.to_string(),
+                            MarkdownTokenType::CodeSpan,
+                            token.span.clone());
+            }
+
+            TokenType::Text => {
+                output.push(token.lexeme.to_string(),
+                            MarkdownTokenType::Text,
+                            token.span.clone());
+            }
+
+            TokenType::Symbol => {
+                output.push(token.lexeme.to_string(),
+                            MarkdownTokenType::Symbol,
+                            token.span.clone());
+            }
+
+            TokenType::Whitespace => {
+                output.push(token.lexeme.to_string(),
+                            MarkdownTokenType::Whitespace,
+                            token.span.clone());
+            }
+
+            TokenType::Newline => {
+                output.push(token.lexeme.to_string(),
+                            MarkdownTokenType::Newline,
+                            token.span.clone());
+            }
+
+            _ => {
+                output.push(token.lexeme.to_string(),
+                            MarkdownTokenType::Unknown,
+                            token.span.clone());
+            }
+        }
+
+        i += 1;
+    }
+
+    output
 }
 
 
 fn main() -> Result<(), Box<dyn std::error::Error>>
 {
    let mut path = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
-   path.push("examples/example.txt");
+   path.push("examples/example.md");
 
    let tokens = Lexer::scan_file(path, transform)?;
 
-   // The tuple here is from the transform functions return type.
-   for (ty, lexeme) in tokens
+   for token in &tokens
    {
-      println!("{:?}: {:?}", ty, lexeme);
+      println!("{}", token);
    }
 
    Ok(())
diff --git a/examples/example.txt b/examples/example.md
similarity index 65%
rename from examples/example.txt
rename to examples/example.md
index 3428015..f8b711b 100644
--- a/examples/example.txt
+++ b/examples/example.md
@@ -1,2 +1,6 @@
+# Example File
+This is an example file for Rune.
+
+## Rune
 The name Rune is inspired by arcane glyphs — ancient symbols holding deep meaning.
 Just like your tokens!
diff --git a/src/lexer.rs b/src/lexer.rs
index ee63104..b3bdc8b 100644
--- a/src/lexer.rs
+++ b/src/lexer.rs
@@ -31,24 +31,16 @@ const BUFFER_SIZE: usize = 1024 * 1024;
 /// ```rust
 /// use rune::{Lexer, TokenStream, TokenType};
 ///
-/// fn transform(tokens: &TokenStream) -> Vec<(TokenType, String)>
+/// fn transform(tokens: &TokenStream<TokenType>) -> TokenStream<TokenType>
 /// {
-///    let mut new_tokens = Vec::new();
-///
-///    for token in tokens
-///    {
-///       new_tokens.push((*token.variant, token.lexeme.to_string()));
-///    }
-///
-///    new_tokens
+///    tokens.clone()
 /// }
 ///
 /// let tokens = Lexer::scan_text("Runes += 42", transform).unwrap();
 ///
-/// // The tuple here is from the transform functions return type.
-/// for (ty, lexeme) in tokens
+/// for token in &tokens
 /// {
-///    println!("{:?}: {:?}", ty, lexeme);
+///    println!("{}", token);
 /// }
 /// ```
 ///
@@ -72,9 +64,9 @@ impl Lexer
 
-   /// Scans a file and produces a vector of transformed tokens.
+   /// Scans a file and produces a stream of transformed tokens.
    pub fn scan_file<P, F, T>(path: P, transform: F)
-      -> Result<Vec<T>, LexerError>
+      -> Result<TokenStream<T>, LexerError>
       where P: AsRef<Path>,
-            F: FnOnce(&TokenStream) -> Vec<T>
+            F: FnOnce(&TokenStream<TokenType>) -> TokenStream<T>
    {
       let mut cursor = Position::default();
       let mut stream = TokenStream::new();
@@ -123,8 +115,8 @@ impl Lexer
 
    /// Scans a full in-memory string and returns transformed tokens.
    pub fn scan_text<F, T>(text: &str, transform: F)
-      -> Result<Vec<T>, LexerError>
-      where F: FnOnce(&TokenStream) -> Vec<T>
+      -> Result<TokenStream<T>, LexerError>
+      where F: FnOnce(&TokenStream<TokenType>) -> TokenStream<T>
    {
       let mut cursor = Position::default();
       let mut stream = TokenStream::new();
@@ -145,7 +137,7 @@ impl Lexer
    }
 
    /// Internal: scans a single line of text into tokens.
-   fn scan(line: &str, stream: &mut TokenStream, cursor: &mut Position)
+   fn scan(line: &str, stream: &mut TokenStream<TokenType>, cursor: &mut Position)
    {
       for c in line.chars()
       {
diff --git a/src/position.rs b/src/position.rs
index c69a917..30f32ae 100644
--- a/src/position.rs
+++ b/src/position.rs
@@ -61,6 +61,21 @@ impl Span
       Span { start: val, end: val }
    }
+
+   /// Merges two spans into one that covers both, assuming `a`
+   /// begins no later than `b` ends.
+   pub fn merge(a: Span, b: Span) -> Self
+   {
+      Span { start: a.start,
+             end: b.end }
+   }
+
+   /// Returns a copy of `self` extended through the end of `other`.
+   pub fn merge_with(&self, other: Span) -> Span
+   {
+      Span { start: self.start,
+             end: other.end }
+   }
 }
 
 
 impl Default for Span
diff --git a/src/token.rs b/src/token.rs
index cb9f2a7..49bccae 100644
--- a/src/token.rs
+++ b/src/token.rs
@@ -51,13 +51,13 @@ pub enum TokenType
 /// Everything is in flat arrays for fast access
 /// and minimal cache misses.
 #[derive(Debug, Clone, Default)]
-pub struct TokenStream
+pub struct TokenStream<T>
 {
    /// The text of the `Token`.
    pub lexemes: Vec<String>,
 
    /// The type of `Token`.
-   pub variants: Vec<TokenType>,
+   pub variants: Vec<T>,
 
    /// The location of the `Token` in the file.
    pub locations: Vec<Span>
@@ -66,13 +66,13 @@ pub struct TokenStream
 
 /// A `Token` found during the lexigraphical scan.
 #[derive(Debug)]
-pub struct Token<'a>
+pub struct Token<'a, T>
 {
    /// The characters of the `Token`.
    pub lexeme: &'a str,
 
    /// The `Token`'s type.
-   pub variant: &'a TokenType,
+   pub variant: &'a T,
 
    /// The location in the file of this `Token`.
    pub span: &'a Span
@@ -82,35 +82,35 @@ pub struct Token<'a>
 ///
 /// This is the mutable reference.
 #[derive(Debug)]
-pub struct TokenMut<'a>
+pub struct TokenMut<'a, T>
 {
    /// The characters of the `Token`.
    pub lexeme: &'a mut str,
 
    /// The `Token`'s type.
-   pub variant: &'a mut TokenType,
+   pub variant: &'a mut T,
 
    /// The location for this `Token` in the file.
    pub span: &'a mut Span
 }
 
 
 /// An iterator over the Tokens in a `TokenStream`.
-pub struct TokenStreamIter<'a>
+pub struct TokenStreamIter<'a, T>
 {
    /// The stream to iterate over.
-   stream: &'a TokenStream,
+   stream: &'a TokenStream<T>,
 
    /// The position in the stream.
    index: usize
 }
 
 
 /// A mutable iterator over the Tokens in a `TokenStream`.
-pub struct TokenStreamIterMut<'a>
+pub struct TokenStreamIterMut<'a, T>
 {
    /// The characters of the `Token`.
    lexemes: std::slice::IterMut<'a, String>,
 
    /// The `Token`'s type.
-   variants: std::slice::IterMut<'a, TokenType>,
+   variants: std::slice::IterMut<'a, T>,
 
    /// The location for this `Token` in the file.
    locations: std::slice::IterMut<'a, Span>
@@ -118,7 +118,7 @@ pub struct TokenStreamIterMut<'a>
 
 
 
-impl TokenStream
+impl<T> TokenStream<T>
 {
    pub fn new() -> Self
    {
@@ -137,7 +137,7 @@
       self.lexemes.is_empty()
    }
 
-   pub fn get(&self, index: usize) -> Option<Token<'_>>
+   pub fn get(&self, index: usize) -> Option<Token<'_, T>>
    {
       if index < self.lexemes.len()
       {
@@ -151,20 +151,20 @@
       }
    }
 
-   pub fn iter(&self) -> TokenStreamIter<'_>
+   pub fn iter(&self) -> TokenStreamIter<'_, T>
    {
      TokenStreamIter { stream: self,
                        index: 0 }
    }
 
-   pub fn get_mut(&mut self, index: usize) -> Option<TokenMut<'_>>
+   pub fn get_mut(&mut self, index: usize) -> Option<TokenMut<'_, T>>
    {
      if index < self.lexemes.len()
      {
        // SAFETY: We manually split the borrows to avoid
        // double mutable borrow.
        let lexeme = &mut self.lexemes[index] as *mut String;
-       let variant = &mut self.variants[index] as *mut TokenType;
+       let variant = &mut self.variants[index] as *mut T;
        let span = &mut self.locations[index] as *mut Span;
 
        // Convert &mut String to &mut str safely.
@@ -189,14 +189,14 @@ impl TokenStream
       self.locations.clear();
    }
 
-   pub fn push(&mut self, lexeme: String, variant: TokenType, span: Span)
+   pub fn push(&mut self, lexeme: String, variant: T, span: Span)
    {
       self.lexemes.push(lexeme);
       self.variants.push(variant);
       self.locations.push(span);
    }
 
-   pub fn iter_mut(&mut self) -> TokenStreamIterMut<'_>
+   pub fn iter_mut(&mut self) -> TokenStreamIterMut<'_, T>
    {
      TokenStreamIterMut { lexemes: self.lexemes.iter_mut(),
                           variants: self.variants.iter_mut(),
@@ -205,10 +205,10 @@
 }
 
 
-impl<'a> IntoIterator for &'a TokenStream
+impl<'a, T> IntoIterator for &'a TokenStream<T>
 {
-   type IntoIter = TokenStreamIter<'a>;
-   type Item = Token<'a>;
+   type IntoIter = TokenStreamIter<'a, T>;
+   type Item = Token<'a, T>;
 
    fn into_iter(self) -> Self::IntoIter
    {
@@ -217,9 +217,9 @@
    }
 }
 
-impl<'a> Iterator for TokenStreamIter<'a>
+impl<'a, T> Iterator for TokenStreamIter<'a, T>
 {
-   type Item = Token<'a>;
+   type Item = Token<'a, T>;
 
    fn next(&mut self) -> Option<Self::Item>
    {
@@ -240,9 +240,9 @@
 }
 
 
-impl<'a> Iterator for TokenStreamIterMut<'a>
+impl<'a, T> Iterator for TokenStreamIterMut<'a, T>
 {
-   type Item = TokenMut<'a>;
+   type Item = TokenMut<'a, T>;
 
    fn next(&mut self) -> Option<Self::Item>
    {
@@ -257,13 +257,13 @@
 }
 
 
-impl<'a> ::std::fmt::Display for Token<'a>
+impl<'a, T: std::fmt::Display> ::std::fmt::Display for Token<'a, T>
 {
    fn fmt(&self, f: &mut ::std::fmt::Formatter) -> ::std::fmt::Result
    {
-      match *self.variant
+      match self.lexeme
       {
-         TokenType::Newline => write!(f, "[{}, {}]", self.variant, "\\n"),
+         "\n" => write!(f, "[{}: {}]", self.variant, "\\n"),
          _ => write!(f, "[{}: {}]", self.variant, self.lexeme)
       }
    }
diff --git a/tests/lexer_tests.rs b/tests/lexer_tests.rs
index 8cb249a..63a5a30 100644
--- a/tests/lexer_tests.rs
+++ b/tests/lexer_tests.rs
@@ -15,16 +15,10 @@ struct TestCase<'a>
 
 
 
-fn dummy_transform(tokens: &TokenStream) -> Vec<(TokenType, String)>
+fn dummy_transform(tokens: &TokenStream<TokenType>) -> TokenStream<TokenType>
 {
-   let mut new_tokens = Vec::new();
-
-   for token in tokens
-   {
-      new_tokens.push((*token.variant, token.lexeme.to_string()));
-   }
-
-   new_tokens
+   tokens.clone()
 }
 
 fn write_temp_file(name: &str, content: &str) -> PathBuf
@@ -53,6 +47,8 @@ fn test_basic_lexing()
      Lexer::scan_text("magic runes", dummy_transform).expect("Lexer should \
                                                               succeed");
 
+   let tokens = tokens.into_iter().map(|t| (*t.variant, String::from(t.lexeme))).collect::<Vec<_>>();
+
    let expected = vec![(TokenType::Text, "magic".to_string()),
                        (TokenType::Whitespace, " ".to_string()),
                        (TokenType::Text, "runes".to_string()),
@@ -69,6 +65,8 @@ fn test_symbols_and_numbers()
      Lexer::scan_text("13 + 37", dummy_transform).expect("Lexer should \
                                                           succeed");
 
+   let tokens = tokens.into_iter().map(|t| (*t.variant, String::from(t.lexeme))).collect::<Vec<_>>();
+
    let expected = vec![(TokenType::Numeric, "13".into()),
                        (TokenType::Whitespace, " ".into()),
                        (TokenType::Symbol, "+".into()),
@@ -119,14 +117,14 @@
                             on case '{}'",
                            case.name));
 
-      let result_stripped: Vec<(TokenType, String)> = result;
+      let result = result.into_iter().map(|t| (*t.variant, String::from(t.lexeme))).collect::<Vec<_>>();
 
       let expected = case.expected
                         .iter()
                         .map(|(ty, s)| (*ty, s.to_string()))
                        .collect::<Vec<_>>();
 
-      assert_eq!(result_stripped, expected,
+      assert_eq!(result, expected,
                 "Mismatch in test case '{}'",
                 case.name);