[#3] TokenStream now holds generic variants.

The TokenStream and all of its associated Token types are now
generic over the Token's variant type.
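
As a rough sketch of what this enables (the `IniToken` enum is purely
hypothetical; `TokenStream::new`, `push`, and iteration are as shown in the
diff below), a caller can now plug a domain-specific variant type straight
into the stream:

```rust
use rune::{Span, TokenStream};

// Hypothetical domain-specific variant type; not part of the crate.
#[derive(Debug, Clone, PartialEq)]
enum IniToken {
    Section,
    Key,
    Value,
}

fn main() {
    // The stream is generic over the variant type.
    let mut stream: TokenStream<IniToken> = TokenStream::new();
    stream.push("[core]".to_string(), IniToken::Section, Span::default());

    for token in &stream {
        println!("{:?}: {:?}", token.variant, token.lexeme);
    }
}
```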

Span was also given the ability to merge with another Span. This
makes it easier to track spans as users group TokenTypes together
into their own domain-specific types.
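
A minimal sketch of the two merge forms added in this commit, assuming
`Span` implements `Default`, `Clone`, and `Debug` (the diff below suggests
it does; default spans are used here purely for illustration):

```rust
use rune::Span;

fn main() {
    let a = Span::default();
    let b = Span::default();

    // Associated-function form: a new span from a's start to b's end.
    let merged = Span::merge(a.clone(), b.clone());

    // Method form on an existing span.
    let chained = a.merge_with(b);

    println!("{:?} {:?}", merged, chained);
}
```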

All tests and examples were updated for this change.

The version was incremented to 0.2.0.
Myrddin Dundragon 2025-04-16 01:54:22 -04:00
parent f924811c47
commit 7c564d18a2
8 changed files with 201 additions and 64 deletions

View File

@ -1,6 +1,6 @@
[package]
name = "rune"
version = "0.1.0"
version = "0.2.0"
edition = "2021"
description = "A lexical analysis library."
repository = "/myrddin/rune"

View File

@ -31,5 +31,5 @@ Then add this to your Cargo.toml file.
```toml
[dependencies]
rune = { registry = "cybermages" }
rune = { version = "0.2.0", registry = "cybermages" }
```

View File

@ -1,34 +1,151 @@
use std::path::PathBuf;
use rune::{Lexer, TokenStream, TokenType};
use rune::{Lexer, Span, TokenStream, TokenType};
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum MarkdownTokenType {
Heading(u8),
EmphasisStart,
EmphasisEnd,
StrongStart,
StrongEnd,
CodeSpan,
Text,
Symbol,
Whitespace,
Newline,
Unknown,
}
impl std::fmt::Display for MarkdownTokenType {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
MarkdownTokenType::Heading(level) => write!(f, "Heading({})", level),
MarkdownTokenType::EmphasisStart => write!(f, "EmphasisStart"),
MarkdownTokenType::EmphasisEnd => write!(f, "EmphasisEnd"),
MarkdownTokenType::StrongStart => write!(f, "StrongStart"),
MarkdownTokenType::StrongEnd => write!(f, "StrongEnd"),
MarkdownTokenType::CodeSpan => write!(f, "CodeSpan"),
MarkdownTokenType::Text => write!(f, "Text"),
MarkdownTokenType::Symbol => write!(f, "Symbol"),
MarkdownTokenType::Whitespace => write!(f, "Whitespace"),
MarkdownTokenType::Newline => write!(f, "Newline"),
MarkdownTokenType::Unknown => write!(f, "Unknown"),
}
}
}
// Define how you want to interpret base tokens
fn transform(tokens: &TokenStream) -> Vec<(TokenType, String)>
{
let mut new_tokens = Vec::new();
pub fn transform(input: &TokenStream<TokenType>) -> TokenStream<MarkdownTokenType> {
let mut output = TokenStream::new();
for token in tokens
{
new_tokens.push((*token.variant, token.lexeme.to_string()));
}
let mut i = 0;
while i < input.len() {
let token = input.get(i).unwrap(); // safe due to bounds check above
new_tokens
match token.variant {
TokenType::Symbol if token.lexeme == "#" => {
// Count consecutive #s for heading level
let mut level = 1;
let mut span = token.span.clone();
while i + 1 < input.len() {
let next = input.get(i + 1).unwrap();
if *next.variant == TokenType::Symbol && next.lexeme == "#" {
level += 1;
span.end = next.span.end;
i += 1;
} else {
break;
}
}
output.push(token.lexeme.repeat(level),
MarkdownTokenType::Heading(level as u8),
span);
}
TokenType::Symbol if token.lexeme == "*" => {
// Look ahead to see if it's strong (**) or emphasis (*)
if i + 1 < input.len() {
let next = input.get(i + 1).unwrap();
if *next.variant == TokenType::Symbol && next.lexeme == "*" {
output.push("**".to_string(),
MarkdownTokenType::StrongStart,
Span::merge(*token.span, *next.span));
i += 1; // skip the second '*'
} else {
output.push("*".to_string(),
MarkdownTokenType::EmphasisStart,
token.span.clone());
}
} else {
output.push("*".to_string(),
MarkdownTokenType::EmphasisStart,
token.span.clone());
}
}
TokenType::Symbol if token.lexeme == "`" => {
output.push(token.lexeme.to_string(),
MarkdownTokenType::CodeSpan,
token.span.clone());
}
TokenType::Text => {
output.push(token.lexeme.to_string(),
MarkdownTokenType::Text,
token.span.clone());
}
TokenType::Symbol => {
output.push(token.lexeme.to_string(),
MarkdownTokenType::Symbol,
token.span.clone());
}
TokenType::Whitespace => {
output.push(token.lexeme.to_string(),
MarkdownTokenType::Whitespace,
token.span.clone());
}
TokenType::Newline => {
output.push(token.lexeme.to_string(),
MarkdownTokenType::Newline,
token.span.clone());
}
_ => {
output.push(token.lexeme.to_string(),
MarkdownTokenType::Unknown,
token.span.clone());
}
}
i += 1;
}
output
}
fn main() -> Result<(), Box<dyn std::error::Error>>
{
let mut path = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
path.push("examples/example.txt");
path.push("examples/example.md");
let tokens = Lexer::scan_file(path, transform)?;
// The tuple here is from the transform function's return type.
for (ty, lexeme) in tokens
for token in &tokens
{
println!("{:?}: {:?}", ty, lexeme);
println!("{}", token);
}
Ok(())
}

View File

@ -1,2 +1,6 @@
# Example File
This is an example file for Rune.
## Rune
The name Rune is inspired by arcane glyphs — ancient symbols holding deep meaning.
Just like your tokens!

View File

@ -31,24 +31,17 @@ const BUFFER_SIZE: usize = 1024 * 1024;
/// ```rust
/// use rune::{Lexer, TokenStream, TokenType};
///
/// fn transform(tokens: &TokenStream) -> Vec<(TokenType, String)>
/// fn transform(tokens: &TokenStream<TokenType>) -> TokenStream<TokenType>
/// {
/// let mut new_tokens = Vec::new();
///
/// for token in tokens
/// {
/// new_tokens.push((*token.variant, token.lexeme.to_string()));
/// }
///
/// new_tokens
/// tokens.clone()
/// }
///
/// let tokens = Lexer::scan_text("Runes += 42", transform).unwrap();
///
/// // The tuple here is from the transform function's return type.
/// for (ty, lexeme) in tokens
/// for token in &tokens
/// {
/// println!("{:?}: {:?}", ty, lexeme);
/// println!("{}", token);
/// }
/// ```
///
@ -72,9 +65,9 @@ impl Lexer
{
/// Scans a file and produces a vector of transformed tokens.
pub fn scan_file<P, F, T>(path: P, transform: F)
-> Result<Vec<T>, LexerError>
-> Result<TokenStream<T>, LexerError>
where P: AsRef<std::path::Path>,
F: FnOnce(&TokenStream) -> Vec<T>
F: FnOnce(&TokenStream<TokenType>) -> TokenStream<T>
{
let mut cursor = Position::default();
let mut stream = TokenStream::new();
@ -123,8 +116,8 @@ impl Lexer
/// Scans a full in-memory string and returns transformed tokens.
pub fn scan_text<F, T>(text: &str, transform: F)
-> Result<Vec<T>, LexerError>
where F: FnOnce(&TokenStream) -> Vec<T>
-> Result<TokenStream<T>, LexerError>
where F: FnOnce(&TokenStream<TokenType>) -> TokenStream<T>
{
let mut cursor = Position::default();
let mut stream = TokenStream::new();
@ -145,7 +138,7 @@ impl Lexer
}
/// Internal: scans a single line of text into tokens.
fn scan(line: &str, stream: &mut TokenStream, cursor: &mut Position)
fn scan(line: &str, stream: &mut TokenStream<TokenType>, cursor: &mut Position)
{
for c in line.chars()
{

View File

@ -61,6 +61,19 @@ impl Span
Span { start: val,
end: val }
}
pub fn merge(a: Span, b: Span) -> Self
{
Span { start: a.start,
end: b.end }
}
pub fn merge_with(&self, other: Span) -> Span {
Span {
start: self.start,
end: other.end,
}
}
}
impl Default for Span

View File

@ -51,13 +51,13 @@ pub enum TokenType
/// Everything is in flat arrays for fast access
/// and minimal cache misses.
#[derive(Debug, Clone, Default)]
pub struct TokenStream
pub struct TokenStream<T>
{
/// The text of the `Token`.
pub lexemes: Vec<String>,
/// The type of `Token`.
pub variants: Vec<TokenType>,
pub variants: Vec<T>,
/// The location of the `Token` in the file.
pub locations: Vec<Span>
@ -66,13 +66,13 @@ pub struct TokenStream
/// A `Token` found during the lexical scan.
#[derive(Debug)]
pub struct Token<'a>
pub struct Token<'a, T>
{
/// The characters of the `Token`.
pub lexeme: &'a str,
/// The `Token`'s type.
pub variant: &'a TokenType,
pub variant: &'a T,
/// The location in the file of this `Token`.
pub span: &'a Span
@ -82,35 +82,35 @@ pub struct Token<'a>
///
/// This is the mutable reference.
#[derive(Debug)]
pub struct TokenMut<'a>
pub struct TokenMut<'a, T>
{
/// The characters of the `Token`.
pub lexeme: &'a mut str,
/// The `Token`'s type.
pub variant: &'a mut TokenType,
pub variant: &'a mut T,
/// The location for this `Token` in the file.
pub span: &'a mut Span
}
/// An iterator over the Tokens in a `TokenStream`.
pub struct TokenStreamIter<'a>
pub struct TokenStreamIter<'a, T>
{
/// The stream to iterate over.
stream: &'a TokenStream,
stream: &'a TokenStream<T>,
/// The position in the stream.
index: usize
}
/// A mutable iterator over the Tokens in a `TokenStream`.
pub struct TokenStreamIterMut<'a>
pub struct TokenStreamIterMut<'a, T>
{
/// The characters of the `Token`.
lexemes: std::slice::IterMut<'a, String>,
/// The `Token`'s type.
variants: std::slice::IterMut<'a, TokenType>,
variants: std::slice::IterMut<'a, T>,
/// The location for this `Token` in the file.
locations: std::slice::IterMut<'a, Span>
@ -118,7 +118,7 @@ pub struct TokenStreamIterMut<'a>
impl TokenStream
impl<T> TokenStream<T>
{
pub fn new() -> Self
{
@ -137,7 +137,7 @@ impl TokenStream
self.lexemes.is_empty()
}
pub fn get(&self, index: usize) -> Option<Token<'_>>
pub fn get(&self, index: usize) -> Option<Token<'_, T>>
{
if index < self.lexemes.len()
{
@ -151,20 +151,20 @@ impl TokenStream
}
}
pub fn iter(&self) -> TokenStreamIter<'_>
pub fn iter(&self) -> TokenStreamIter<'_, T>
{
TokenStreamIter { stream: self,
index: 0 }
}
pub fn get_mut(&mut self, index: usize) -> Option<TokenMut<'_>>
pub fn get_mut(&mut self, index: usize) -> Option<TokenMut<'_, T>>
{
if index < self.lexemes.len()
{
// SAFETY: We manually split the borrows to avoid
// double mutable borrow.
let lexeme = &mut self.lexemes[index] as *mut String;
let variant = &mut self.variants[index] as *mut TokenType;
let variant = &mut self.variants[index] as *mut T;
let span = &mut self.locations[index] as *mut Span;
// Convert &mut String to &mut str safely.
@ -189,14 +189,14 @@ impl TokenStream
self.locations.clear();
}
pub fn push(&mut self, lexeme: String, variant: TokenType, span: Span)
pub fn push(&mut self, lexeme: String, variant: T, span: Span)
{
self.lexemes.push(lexeme);
self.variants.push(variant);
self.locations.push(span);
}
pub fn iter_mut(&mut self) -> TokenStreamIterMut<'_>
pub fn iter_mut(&mut self) -> TokenStreamIterMut<'_, T>
{
TokenStreamIterMut { lexemes: self.lexemes.iter_mut(),
variants: self.variants.iter_mut(),
@ -205,10 +205,10 @@ impl TokenStream
}
impl<'a> IntoIterator for &'a TokenStream
impl<'a, T> IntoIterator for &'a TokenStream<T>
{
type IntoIter = TokenStreamIter<'a>;
type Item = Token<'a>;
type IntoIter = TokenStreamIter<'a, T>;
type Item = Token<'a, T>;
fn into_iter(self) -> Self::IntoIter
{
@ -217,9 +217,9 @@ impl<'a> IntoIterator for &'a TokenStream
}
}
impl<'a> Iterator for TokenStreamIter<'a>
impl<'a, T> Iterator for TokenStreamIter<'a, T>
{
type Item = Token<'a>;
type Item = Token<'a, T>;
fn next(&mut self) -> Option<Self::Item>
{
@ -240,9 +240,9 @@ impl<'a> Iterator for TokenStreamIter<'a>
}
impl<'a> Iterator for TokenStreamIterMut<'a>
impl<'a, T> Iterator for TokenStreamIterMut<'a, T>
{
type Item = TokenMut<'a>;
type Item = TokenMut<'a, T>;
fn next(&mut self) -> Option<Self::Item>
{
@ -257,13 +257,13 @@ impl<'a> Iterator for TokenStreamIterMut<'a>
}
impl<'a> ::std::fmt::Display for Token<'a>
impl<'a, T: std::fmt::Display> ::std::fmt::Display for Token<'a, T>
{
fn fmt(&self, f: &mut ::std::fmt::Formatter) -> ::std::fmt::Result
{
match *self.variant
match self.lexeme
{
TokenType::Newline => write!(f, "[{}, {}]", self.variant, "\\n"),
"\n" => write!(f, "[{}, {}]", self.variant, "\\n"),
_ => write!(f, "[{}: {}]", self.variant, self.lexeme)
}

View File

@ -15,16 +15,22 @@ struct TestCase<'a>
fn dummy_transform(tokens: &TokenStream) -> Vec<(TokenType, String)>
fn dummy_transform(tokens: &TokenStream<TokenType>) -> TokenStream<TokenType>
{
let mut new_tokens = Vec::new();
/*
let mut stream: TokenStream<(TokenType, String)> = TokenStream::default();
for token in tokens
stream.lexemes = tokens.lexemes.clone();
stream.locations = tokens.locations.clone();
for 0..tokens.lexemes.len()
{
new_tokens.push((*token.variant, token.lexeme.to_string()));
stream.variants
}
new_tokens
stream
*/
tokens.clone()
}
fn write_temp_file(name: &str, content: &str) -> PathBuf
@ -53,6 +59,8 @@ fn test_basic_lexing()
Lexer::scan_text("magic runes", dummy_transform).expect("Lexer should \
succeed");
let tokens = tokens.into_iter().map(|t| { (*t.variant, String::from(t.lexeme))}).collect::<Vec<_>>();
let expected = vec![(TokenType::Text, "magic".to_string()),
(TokenType::Whitespace, " ".to_string()),
(TokenType::Text, "runes".to_string()),
@ -69,6 +77,8 @@ fn test_symbols_and_numbers()
Lexer::scan_text("13 + 37", dummy_transform).expect("Lexer should \
succeed");
let tokens = tokens.into_iter().map(|t| { (*t.variant, String::from(t.lexeme))}).collect::<Vec<_>>();
let expected = vec![(TokenType::Numeric, "13".into()),
(TokenType::Whitespace, " ".into()),
(TokenType::Symbol, "+".into()),
@ -119,14 +129,14 @@ fn test_lexer_with_cases()
on case '{}'",
case.name));
let result_stripped: Vec<(TokenType, String)> = result;
let result = result.into_iter().map(|t| { (*t.variant, String::from(t.lexeme))}).collect::<Vec<_>>();
let expected = case.expected
.iter()
.map(|(ty, s)| (*ty, s.to_string()))
.collect::<Vec<_>>();
assert_eq!(result_stripped, expected,
assert_eq!(result, expected,
"Mismatch in test case '{}'",
case.name);