[#2] A data-oriented Lexer.

I took the Token module from the Arcanum project and brought it over
here. It was a nice, data-oriented way of handling tokens.
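
To make the "data-oriented" part concrete: the stream stores tokens as
parallel vectors (a struct-of-arrays layout) rather than a vector of
token structs. The sketch below only illustrates that layout;
`FlatTokens` is a hypothetical name, and the real `TokenStream` used in
src/lexer.rs below also carries source spans.

    use rune::TokenType;

    // Illustrative only: each token is an index into parallel vectors
    // instead of its own heap-allocated struct.
    struct FlatTokens
    {
        lexemes: Vec<String>,     // token text, e.g. "Runes", "+", "42"
        variants: Vec<TokenType>, // token kind, parallel to `lexemes`
    }

    impl FlatTokens
    {
        fn push(&mut self, lexeme: String, variant: TokenType)
        {
            self.lexemes.push(lexeme);
            self.variants.push(variant);
        }
    }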

I then created a Lexer that can scan a file or a string of text and
lets the user transform the scanned tokens before the final token
array is returned. This should allow more complex, domain-specific
tokens to be created for whatever domain is being targeted.
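
As a concrete illustration of such a transform (hypothetical names, not
part of the library): the sketch below maps the base tokens into a small
calculator-style enum. It assumes the same `TokenStream` iteration used
in the doc-comment example in src/lexer.rs below, with string-like
`lexeme` fields.

    use rune::{Lexer, TokenStream, TokenType};

    // Hypothetical domain-specific token type, purely for illustration.
    #[derive(Debug)]
    enum CalcToken
    {
        Number(i64),
        Operator(String),
        Word(String),
    }

    fn to_calc_tokens(tokens: &TokenStream) -> Vec<CalcToken>
    {
        let mut out = Vec::new();

        for token in tokens
        {
            match *token.variant
            {
                TokenType::Numeric =>
                    out.push(CalcToken::Number(token.lexeme.parse().unwrap_or(0))),
                TokenType::Symbol =>
                    out.push(CalcToken::Operator(token.lexeme.to_string())),
                TokenType::Text =>
                    out.push(CalcToken::Word(token.lexeme.to_string())),
                _ => {} // Skip whitespace and newline tokens.
            }
        }

        out
    }

    fn main()
    {
        // "+=" arrives as two Symbol tokens; only Numeric and Text runs merge.
        let tokens = Lexer::scan_text("Runes += 42", to_calc_tokens).unwrap();
        println!("{:?}", tokens);
        // [Word("Runes"), Operator("+"), Operator("="), Number(42)]
    }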

I also added basic library examples and tests.

Finally, I made sure the documentation generated nicely.

This is now marked as version 0.1.0.
2025-04-15 21:17:28 -04:00
parent 0350a151a9
commit acf869efbb
13 changed files with 823 additions and 25 deletions

src/lexer.rs (new file, 185 lines)

@@ -0,0 +1,185 @@
use std::fs::File;
use std::io::{BufRead, BufReader};
use super::error::LexerError;
use super::position::{Position, Span};
use super::token::{TokenStream, TokenType};
/// The size of data chunks to read from a file. This was arbitrarily chosen
/// to be 1 MiB.
const BUFFER_SIZE: usize = 1024 * 1024;
/// The `Lexer` is the core component responsible for performing lexical
/// analysis on a text file or an in-memory string.
///
/// It reads the input line by line, classifying each character to produce a
/// stream of base tokens such as text, numbers, whitespace, symbols, and
/// newlines.
/// These tokens are accumulated into a `TokenStream`, which is a flat,
/// cache-friendly data structure.
///
/// After tokenization, the lexer passes the accumulated stream to a
/// user-provided `transform` function, allowing consumers of the library to
/// convert the base tokens into richer, domain-specific token types
/// (e.g. Markdown elements, syntax trees, or custom DSL tokens).
///
/// # Example
///
/// ```rust
/// use rune::{Lexer, TokenStream, TokenType};
///
/// fn transform(tokens: &TokenStream) -> Vec<(TokenType, String)>
/// {
/// let mut new_tokens = Vec::new();
///
/// for token in tokens
/// {
/// new_tokens.push((*token.variant, token.lexeme.to_string()));
/// }
///
/// new_tokens
/// }
///
/// let tokens = Lexer::scan_text("Runes += 42", transform).unwrap();
///
/// // The tuple here comes from the transform function's return type.
/// for (ty, lexeme) in tokens
/// {
/// println!("{:?}: {:?}", ty, lexeme);
/// }
/// ```
///
/// # Design Notes
///
/// - Uses a flat `TokenStream` to improve iteration performance and reduce heap
/// overhead.
/// - Consolidates contiguous characters into compound tokens (e.g. multi-digit
/// numbers).
/// - Easily extensible via the `transform` function to support higher-level
/// parsing tasks.
///
/// # Errors
///
/// Returns a `LexerError` if the file cannot be opened or read.
pub enum Lexer {}
impl Lexer
{
/// Scans a file and produces a vector of transformed tokens.
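///
/// The `transform` closure receives the complete `TokenStream` once scanning
/// finishes and returns the final token vector; see the type-level example
/// above for its shape.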
pub fn scan_file<P, F, T>(path: P, transform: F)
-> Result<Vec<T>, LexerError>
where P: AsRef<std::path::Path>,
F: FnOnce(&TokenStream) -> Vec<T>
{
let mut cursor = Position::default();
let mut stream = TokenStream::new();
let input_file = File::open(&path).map_err(|_error| {
LexerError::new(
"Unable to open file for Lexigraphical Analysis.",
Span::default(),
Some(path.as_ref().to_string_lossy().to_string()),
None,
)
})?;
let reader = BufReader::with_capacity(BUFFER_SIZE, input_file);
for line in reader.lines()
{
match line
{
Ok(text) =>
{
Self::scan(&text, &mut stream, &mut cursor);
}
Err(_) =>
{
return Err(LexerError::new("Unable to read line during \
lexical analysis.",
Span::default(),
Some(path.as_ref()
.to_string_lossy()
.to_string()),
None));
}
}
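// `reader.lines()` strips the line terminator, so push an explicit
// Newline token and move the cursor to the start of the next line.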
stream.push("\n".to_string(),
TokenType::Newline,
Span::with_single(cursor));
cursor.line += 1;
cursor.column = 0;
}
Ok(transform(&stream))
}
/// Scans a full in-memory string and returns transformed tokens.
pub fn scan_text<F, T>(text: &str, transform: F)
-> Result<Vec<T>, LexerError>
where F: FnOnce(&TokenStream) -> Vec<T>
{
let mut cursor = Position::default();
let mut stream = TokenStream::new();
for line in text.lines()
{
Self::scan(line, &mut stream, &mut cursor);
stream.push("\n".to_string(),
TokenType::Newline,
Span::with_single(cursor));
cursor.line += 1;
cursor.column = 0;
}
Ok(transform(&stream))
}
/// Internal: scans a single line of text into tokens.
fn scan(line: &str, stream: &mut TokenStream, cursor: &mut Position)
{
for c in line.chars()
{
let variant = get_token_type(c);
let last = stream.len().saturating_sub(1);
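// A numeric or text character that follows a token of the same kind
// extends that token, producing compound lexemes like "42" or "Runes".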
if !stream.is_empty() &&
variant == stream.variants[last] &&
(variant == TokenType::Numeric || variant == TokenType::Text)
{
stream.lexemes[last].push(c);
stream.locations[last].end = *cursor;
}
else
{
stream.push(c.to_string(), variant, Span::with_single(*cursor));
}
cursor.column += 1;
}
}
}
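/// Classifies a single character into its base `TokenType`.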
fn get_token_type(curr_char: char) -> TokenType
{
match curr_char
{
'\n' => TokenType::Newline,
c if c.is_whitespace() => TokenType::Whitespace,
c if c.is_numeric() => TokenType::Numeric,
c if c.is_alphabetic() => TokenType::Text,
c if !c.is_whitespace() && !c.is_alphanumeric() => TokenType::Symbol,
_ => TokenType::Unknown
}
}