[#3] TokenStream now holds generic variants.

This makes it so that the TokenStream and all its associated Token types use a generic parameter for the variant of the Token. Span was also given the ability to merge with another span. This will make it easier to track the span as users group TokenTypes together to make their domain-specific types. All tests and examples were updated for this change. The version was incremented to 0.2.0.
2025-04-16 01:54:22 -04:00
parent f924811c47
commit 7c564d18a2
8 changed files with 201 additions and 64 deletions
--- a/examples/basic.rs
+++ b/examples/basic.rs
@ -1,34 +1,151 @@
 use std::path::PathBuf;

-use rune::{Lexer, TokenStream, TokenType};
+use rune::{Lexer, Span, TokenStream, TokenType};
+
+
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum MarkdownTokenType { // Markdown-level token kinds that `transform` assigns to base lexer tokens.
+    Heading(u8), // Run of `#` symbols; the payload is how many were seen in a row.
+    EmphasisStart, // A lone `*` symbol.
+    EmphasisEnd, // NOTE(review): never emitted by `transform` in this example; confirm intended use.
+    StrongStart, // Two adjacent `*` symbols (`**`).
+    StrongEnd, // NOTE(review): never emitted by `transform` in this example; confirm intended use.
+    CodeSpan, // A single backtick symbol.
+    Text, // Carried over from `TokenType::Text`.
+    Symbol, // Any `TokenType::Symbol` other than `#`, `*`, or a backtick.
+    Whitespace, // Carried over from `TokenType::Whitespace`.
+    Newline, // Carried over from `TokenType::Newline`.
+    Unknown, // Fallback for base variants that `transform` has no dedicated arm for.
+}
+
+
+impl std::fmt::Display for MarkdownTokenType { // Renders each variant as its bare name.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            MarkdownTokenType::Heading(level) => write!(f, "Heading({})", level), // Only variant with a payload: the level is printed in parentheses.
+            MarkdownTokenType::EmphasisStart => write!(f, "EmphasisStart"),
+            MarkdownTokenType::EmphasisEnd => write!(f, "EmphasisEnd"),
+            MarkdownTokenType::StrongStart => write!(f, "StrongStart"),
+            MarkdownTokenType::StrongEnd => write!(f, "StrongEnd"),
+            MarkdownTokenType::CodeSpan => write!(f, "CodeSpan"),
+            MarkdownTokenType::Text => write!(f, "Text"),
+            MarkdownTokenType::Symbol => write!(f, "Symbol"),
+            MarkdownTokenType::Whitespace => write!(f, "Whitespace"),
+            MarkdownTokenType::Newline => write!(f, "Newline"),
+            MarkdownTokenType::Unknown => write!(f, "Unknown"),
+        } // No wildcard arm, so adding a variant to the enum forces an update here.
+    }
+}



 // Define how you want to interpret base tokens
-fn transform(tokens: &TokenStream) -> Vec<(TokenType, String)>
-{
-   let mut new_tokens = Vec::new();
+pub fn transform(input: &TokenStream<TokenType>) -> TokenStream<MarkdownTokenType> { // Re-tags base lexer tokens as Markdown tokens, folding `#` runs and `**` pairs into single tokens.
+    let mut output = TokenStream::new();

-   for token in tokens
-   {
-      new_tokens.push((*token.variant, token.lexeme.to_string()));
-   }
+    let mut i = 0; // Manual index rather than a `for` loop, because some arms consume look-ahead tokens.
+    while i < input.len() {
+        let token = input.get(i).unwrap(); // cannot fail: `i < input.len()` was just checked by the loop condition

-   new_tokens
+        match token.variant {
+            TokenType::Symbol if token.lexeme == "#" => {
+                // Count consecutive `#` symbols to get the heading level.
+                let mut level = 1;
+                let mut span = token.span.clone(); // Starts as the first `#`'s span and grows as more are consumed.
+
+                while i + 1 < input.len() {
+                    let next = input.get(i + 1).unwrap();
+                    if *next.variant == TokenType::Symbol && next.lexeme == "#" {
+                        level += 1;
+                        span.end = next.span.end; // Stretch the span so it ends at the last `#` seen.
+                        i += 1; // Consume the look-ahead `#` so the outer loop does not revisit it.
+                    } else {
+                        break;
+                    }
+                }
+
+                output.push(token.lexeme.repeat(level), // Lexeme is rebuilt as `level` copies of "#".
+                            MarkdownTokenType::Heading(level as u8), // NOTE(review): `as u8` silently truncates if more than 255 `#` symbols appear in a row.
+                            span);
+            }
+
+            TokenType::Symbol if token.lexeme == "*" => {
+                // Look ahead one token to tell strong (`**`) apart from emphasis (`*`).
+                // NOTE(review): only the `Start` variants are emitted here; nothing in this
+                // function tracks open/close state, so `EmphasisEnd`/`StrongEnd` never appear.
+                if i + 1 < input.len() {
+                    let next = input.get(i + 1).unwrap();
+                    if *next.variant == TokenType::Symbol && next.lexeme == "*" {
+                        output.push("**".to_string(),
+                                    MarkdownTokenType::StrongStart,
+                                    Span::merge(*token.span, *next.span)); // One span built from both `*` tokens.
+                        i += 1; // skip the second '*'
+                    } else {
+                        output.push("*".to_string(),
+                                    MarkdownTokenType::EmphasisStart,
+                                    token.span.clone());
+                    }
+                } else { // `*` is the last token in the stream, so there is nothing to look ahead at.
+                    output.push("*".to_string(),
+                                MarkdownTokenType::EmphasisStart,
+                                token.span.clone());
+                }
+            }
+
+            TokenType::Symbol if token.lexeme == "`" => {
+                output.push(token.lexeme.to_string(),
+                            MarkdownTokenType::CodeSpan,
+                            token.span.clone());
+            }
+
+            TokenType::Text => {
+                output.push(token.lexeme.to_string(),
+                            MarkdownTokenType::Text,
+                            token.span.clone());
+            }
+
+            TokenType::Symbol => { // Reached only by symbols that failed the guarded arms above.
+                output.push(token.lexeme.to_string(),
+                            MarkdownTokenType::Symbol,
+                            token.span.clone());
+            }
+
+            TokenType::Whitespace => {
+                output.push(token.lexeme.to_string(),
+                            MarkdownTokenType::Whitespace,
+                            token.span.clone());
+            }
+
+            TokenType::Newline => {
+                output.push(token.lexeme.to_string(),
+                            MarkdownTokenType::Newline,
+                            token.span.clone());
+            }
+
+            _ => { // Any base variant without a dedicated arm keeps its lexeme and span but is tagged `Unknown`.
+                output.push(token.lexeme.to_string(),
+                            MarkdownTokenType::Unknown,
+                            token.span.clone());
+            }
+        }
+
+        i += 1; // Step past the current token; look-ahead tokens were already skipped inside the arms.
+    }
+
+    output
 }


 fn main() -> Result<(), Box<dyn std::error::Error>>
 {
   let mut path = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
-   path.push("examples/example.txt");
+   path.push("examples/example.md");

   let tokens = Lexer::scan_file(path, transform)?;

   // Each token here comes from the stream returned by the transform function.
-   for (ty, lexeme) in tokens
+   for token in &tokens
   {
-      println!("{:?}: {:?}", ty, lexeme);
+      println!("{}", token);
   }

   Ok(())
--- a/examples/example.txt
+++ b/examples/example.txt
@ -1,2 +1,6 @@
+# Example File
+This is an example file for Rune.
+
+## Rune
 The name Rune is inspired by arcane glyphs — ancient symbols holding deep meaning.
 Just like your tokens!