Just some basic updating and cleaning up.

- Added comments.
- Ran cargo fmt.
- Updated the versioning.
Myrddin Dundragon 2025-04-22 02:18:12 -04:00
parent cd50b53be5
commit f5780f50c2
7 changed files with 330 additions and 237 deletions

Cargo.lock (generated)

@@ -4,4 +4,4 @@ version = 4
 
 [[package]]
 name = "rune"
-version = "0.2.0"
+version = "0.3.0"

Cargo.toml

@@ -1,6 +1,6 @@
 [package]
 name = "rune"
-version = "0.2.0"
+version = "0.3.0"
 edition = "2021"
 description = "A lexical analysis library."
 repository = "/myrddin/rune"


@@ -31,5 +31,5 @@ Then add this to your Cargo.toml file.
 ```toml
 [dependencies]
-rune = { version = "0.2.0", registry = "cybermages" }
+rune = { version = "0.3.0", registry = "cybermages" }
 ```
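For orientation, the 0.3.0 crate bumped above is driven by handing `Lexer::scan_text` (or `Lexer::scan_file`) a transform callback, exactly as the doc example and the Markdown example further down this page do. Here is a minimal, self-contained sketch of that pattern; the `MyToken` enum and `my_transform` function are illustrative names of my own, not part of this commit, and the token-view fields (`variant`, `lexeme`, `span`) are assumed to behave as they do in the example code shown in this diff.

```rust
use rune::{Lexer, TokenStream, TokenType};

// A toy domain-specific token kind, standing in for something richer
// like the MarkdownTokenType defined in the example file below.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum MyToken
{
    Word,
    Number,
    Other
}

impl std::fmt::Display for MyToken
{
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result
    {
        write!(f, "{:?}", self)
    }
}

// Maps each base token to a MyToken while keeping its lexeme and span.
fn my_transform(input: &TokenStream<TokenType>) -> TokenStream<MyToken>
{
    let mut output = TokenStream::new();
    let mut i = 0;

    while i < input.len()
    {
        let token = input.get(i).unwrap(); // safe due to bounds check above
        let kind = match token.variant
        {
            TokenType::Text => MyToken::Word,
            TokenType::Numeric => MyToken::Number,
            _ => MyToken::Other
        };

        output.push(token.lexeme.to_string(), kind, token.span.clone());
        i += 1;
    }

    output
}

fn main()
{
    let tokens = Lexer::scan_text("Runes += 42", my_transform)
        .expect("Lexer should succeed");

    for token in &tokens
    {
        println!("{}", token);
    }
}
```

The Markdown example further down follows the same shape, just with a larger match that recognizes headings, emphasis, and code spans.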


@@ -5,7 +5,8 @@ use rune::{Lexer, Span, TokenStream, TokenType};
 #[derive(Debug, Clone, PartialEq, Eq)]
-pub enum MarkdownTokenType {
+pub enum MarkdownTokenType
+{
     Heading(u8),
     EmphasisStart,
     EmphasisEnd,
@@ -16,13 +17,16 @@ pub enum MarkdownTokenType {
     Symbol,
     Whitespace,
     Newline,
-    Unknown,
+    Unknown
 }
 
-impl std::fmt::Display for MarkdownTokenType {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        match self {
+impl std::fmt::Display for MarkdownTokenType
+{
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result
+    {
+        match self
+        {
             MarkdownTokenType::Heading(level) => write!(f, "Heading({})", level),
             MarkdownTokenType::EmphasisStart => write!(f, "EmphasisStart"),
             MarkdownTokenType::EmphasisEnd => write!(f, "EmphasisEnd"),
@@ -33,7 +37,7 @@ impl std::fmt::Display for MarkdownTokenType {
             MarkdownTokenType::Symbol => write!(f, "Symbol"),
             MarkdownTokenType::Whitespace => write!(f, "Whitespace"),
             MarkdownTokenType::Newline => write!(f, "Newline"),
-            MarkdownTokenType::Unknown => write!(f, "Unknown"),
+            MarkdownTokenType::Unknown => write!(f, "Unknown")
         }
     }
 }
@@ -41,26 +45,35 @@ impl std::fmt::Display for MarkdownTokenType {
 // Define how you want to interpret base tokens
-pub fn transform(input: &TokenStream<TokenType>) -> TokenStream<MarkdownTokenType> {
+pub fn transform(input: &TokenStream<TokenType>)
+    -> TokenStream<MarkdownTokenType>
+{
     let mut output = TokenStream::new();
     let mut i = 0;
-    while i < input.len() {
+    while i < input.len()
+    {
         let token = input.get(i).unwrap(); // safe due to bounds check above
-        match token.variant {
-            TokenType::Symbol if token.lexeme == "#" => {
+        match token.variant
+        {
+            TokenType::Symbol if token.lexeme == "#" =>
+            {
                 // Count consecutive #s for heading level
                 let mut level = 1;
                 let mut span = token.span.clone();
-                while i + 1 < input.len() {
+                while i + 1 < input.len()
+                {
                     let next = input.get(i + 1).unwrap();
-                    if *next.variant == TokenType::Symbol && next.lexeme == "#" {
+                    if *next.variant == TokenType::Symbol && next.lexeme == "#"
+                    {
                         level += 1;
                         span.end = next.span.end;
                         i += 1;
-                    } else {
+                    }
+                    else
+                    {
                         break;
                     }
                 }
@@ -70,58 +83,71 @@ pub fn transform(input: &TokenStream<TokenType>) -> TokenStream<MarkdownTokenType> {
                             span);
             }
-            TokenType::Symbol if token.lexeme == "*" => {
+            TokenType::Symbol if token.lexeme == "*" =>
+            {
                 // Look ahead to see if it's strong (**) or emphasis (*)
-                if i + 1 < input.len() {
+                if i + 1 < input.len()
+                {
                     let next = input.get(i + 1).unwrap();
-                    if *next.variant == TokenType::Symbol && next.lexeme == "*" {
+                    if *next.variant == TokenType::Symbol && next.lexeme == "*"
+                    {
                         output.push("**".to_string(),
                                     MarkdownTokenType::StrongStart,
                                     Span::merge(*token.span, *next.span));
                         i += 1; // skip the second '*'
-                    } else {
+                    }
+                    else
+                    {
                         output.push("*".to_string(),
                                     MarkdownTokenType::EmphasisStart,
                                     token.span.clone());
                     }
-                } else {
+                }
+                else
+                {
                     output.push("*".to_string(),
                                 MarkdownTokenType::EmphasisStart,
                                 token.span.clone());
                 }
             }
-            TokenType::Symbol if token.lexeme == "`" => {
+            TokenType::Symbol if token.lexeme == "`" =>
+            {
                 output.push(token.lexeme.to_string(),
                             MarkdownTokenType::CodeSpan,
                             token.span.clone());
             }
-            TokenType::Text => {
+            TokenType::Text =>
+            {
                 output.push(token.lexeme.to_string(),
                             MarkdownTokenType::Text,
                             token.span.clone());
             }
-            TokenType::Symbol => {
+            TokenType::Symbol =>
+            {
                 output.push(token.lexeme.to_string(),
                             MarkdownTokenType::Symbol,
                             token.span.clone());
             }
-            TokenType::Whitespace => {
+            TokenType::Whitespace =>
+            {
                 output.push(token.lexeme.to_string(),
                             MarkdownTokenType::Whitespace,
                             token.span.clone());
             }
-            TokenType::Newline => {
+            TokenType::Newline =>
+            {
                 output.push(token.lexeme.to_string(),
                             MarkdownTokenType::Newline,
                             token.span.clone());
             }
-            _ => {
+            _ =>
+            {
                 output.push(token.lexeme.to_string(),
                             MarkdownTokenType::Unknown,
                             token.span.clone());


@@ -1,4 +1,6 @@
-use std::{error::Error, path::PathBuf};
+use std::error::Error;
+use std::path::PathBuf;
 
 use super::position::Span;
@@ -12,7 +14,8 @@ use super::position::Span;
 /// It is designed to provide detailed diagnostics for file-based or
 /// in-memory parsing and is compatible with error reporting ecosystems.
 #[derive(Debug)]
-pub struct LexerError {
+pub struct LexerError
+{
     /// A human-readable error message.
     pub message: String,
@@ -26,10 +29,11 @@ pub struct LexerError {
     pub snippet: Option<String>,
 
     /// An optional underlying error that caused this one.
-    pub source: Option<Box<dyn Error>>,
+    pub source: Option<Box<dyn Error>>
 }
 
-impl LexerError {
+impl LexerError
+{
     /// Creates a new `LexerError` with a message, span, and optional context.
     ///
     /// # Parameters
@@ -40,31 +44,24 @@ impl LexerError {
     ///
     /// # Returns
     /// A new instance of `LexerError`.
-    pub fn new<S, T>(
-        message: S,
-        span: Span,
-        file: Option<T>,
-        snippet: Option<S>,
-    ) -> Self
-    where
-        S: Into<String>,
-        T: Into<PathBuf>,
+    pub fn new<S, T>(message: S, span: Span, file: Option<T>,
+                     snippet: Option<S>)
+        -> Self
+        where S: Into<String>,
+              T: Into<PathBuf>
     {
-        LexerError {
-            message: message.into(),
+        LexerError { message: message.into(),
                      span,
                      file: file.map(Into::into),
                      snippet: snippet.map(Into::into),
-            source: None,
-        }
+                     source: None }
     }
 
     /// Creates a `LexerError` from only a message and span.
     ///
     /// This is useful when file or snippet context is not available.
     pub fn from_message<S>(message: S, span: Span) -> Self
-    where
-        S: Into<String>,
+        where S: Into<String>
     {
         Self::new(message, span, None::<PathBuf>, None::<S>)
     }
@@ -73,8 +70,7 @@ impl LexerError {
     ///
     /// This is helpful for diagnostics and tooling output.
     pub fn with_snippet<S>(mut self, snippet: S) -> Self
-    where
-        S: Into<String>,
+        where S: Into<String>
    {
         self.snippet = Some(snippet.into());
         self
@@ -82,8 +78,7 @@ impl LexerError {
     /// Attaches the path of the file in which the error occurred.
     pub fn with_file<T>(mut self, file: T) -> Self
-    where
-        T: Into<PathBuf>,
+        where T: Into<PathBuf>
     {
         self.file = Some(file.into());
         self
@@ -93,25 +88,28 @@ impl LexerError {
     ///
     /// This allows you to chain errors for more detailed diagnostics.
     pub fn with_source<E>(mut self, err: E) -> Self
-    where
-        E: Error + 'static,
+        where E: Error + 'static
     {
         self.source = Some(Box::new(err));
         self
     }
 }
 
-impl std::fmt::Display for LexerError {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+impl std::fmt::Display for LexerError
+{
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result
+    {
         write!(f, "Lexer error at {}", self.span)?;
 
-        if let Some(file) = &self.file {
+        if let Some(file) = &self.file
+        {
            write!(f, " in file `{}`", file.display())?;
         }
 
         write!(f, ": {}", self.message)?;
 
-        if let Some(snippet) = &self.snippet {
+        if let Some(snippet) = &self.snippet
+        {
            write!(f, "\n --> Snippet: `{}`", snippet)?;
         }
@@ -119,9 +117,11 @@ impl std::fmt::Display for LexerError {
     }
 }
 
-impl Error for LexerError {
+impl Error for LexerError
+{
     /// Returns the underlying cause of this error, if any.
-    fn source(&self) -> Option<&(dyn Error + 'static)> {
+    fn source(&self) -> Option<&(dyn Error + 'static)>
+    {
         self.source.as_ref().map(|e| e.as_ref())
     }
 }


@@ -7,24 +7,25 @@ use super::token::{TokenStream, TokenType};
-/// The size of data chunks to read from a file. This was arbitrarily chosen to
-/// be 1mb.
+/// The size of data chunks to read from a file. This is an arbitrary choice,
+/// set to 1MB.
 const BUFFER_SIZE: usize = 1024 * 1024;
 
-/// The `Lexer` is the core component responsible for performing
-/// lexicographical analysis on a text file.
+/// The `Lexer` struct is responsible for performing lexical analysis
+/// (tokenization) on text.
 ///
-/// It reads input from a file character-by-character, generating a stream
-/// of base tokens such as text, numbers, whitespace, symbols, and newlines.
-/// These tokens are accumulated into a `TokenStream`, which is a flat,
-/// cache-friendly data structure.
+/// It processes input from a file or string character-by-character and
+/// generates a stream of tokens, such as text, numbers, whitespace, symbols,
+/// and newlines. These tokens are accumulated into a `TokenStream`, which is a
+/// flat, cache-friendly data structure designed for efficient iteration.
 ///
-/// After tokenization, the lexer applies a user-provided `transform` function
-/// to each token in the stream, allowing consumers of the library to convert
-/// base tokens into richer, domain-specific token types (e.g. Markdown
-/// elements, syntax trees, or custom DSL tokens).
+/// After the base tokens are generated, the `Lexer` allows for transformation
+/// of these tokens into richer, domain-specific types via a user-provided
+/// `transform` function. This transformation can be used to convert base tokens
+/// into specific elements of a Markdown syntax tree, custom DSL tokens, or any
+/// other custom format you need.
 ///
 /// # Example
 ///
@@ -38,32 +39,51 @@ const BUFFER_SIZE: usize = 1024 * 1024;
 ///
 /// let tokens = Lexer::scan_text("Runes += 42", transform).unwrap();
 ///
-/// // The tuple here is from the transform functions return type.
 /// for token in &tokens
 /// {
 ///     println!("{}", token);
 /// }
 /// ```
 ///
-/// # Design Notes
-///
-/// - Uses a flat `TokenStream` to improve iteration performance and reduce heap
+/// # Design Considerations
+/// - Utilizes a flat `TokenStream` to improve performance and reduce heap
 ///   overhead.
-/// - Consolidates contiguous characters into compound tokens (e.g. multi-digit
+/// - Consolidates contiguous characters into compound tokens (e.g., multi-digit
 ///   numbers).
-/// - Easily extensible via the `transform` function to support higher-level
-///   parsing tasks.
+/// - Extensible via the `transform` function, enabling the creation of
+///   higher-level constructs, like Markdown elements or syntax trees for a
+///   custom DSL.
 ///
-/// # Errors
-///
-/// Returns a `LexerError` if the file cannot be opened or read.
+/// # Error Handling
+/// The lexer will return a `LexerError` if the input file cannot be opened or
+/// read. Errors include issues such as missing files, read failures, or invalid
+/// input formats.
 pub enum Lexer {}
 
 impl Lexer
 {
-    /// Scans a file and produces a vector of transformed tokens.
+    /// Scans a file and generates a vector of transformed tokens based on the
+    /// provided `transform` function.
+    ///
+    /// This method opens a file from the given `path`, reads the file line by
+    /// line, and converts the input into a stream of tokens. The tokens are
+    /// then passed to the `transform` function, which allows users to map
+    /// base tokens into domain-specific types.
+    ///
+    /// # Parameters
+    /// - `path`: A path to the file to be lexically analyzed.
+    /// - `transform`: A function that takes a `TokenStream<TokenType>` and
+    ///   transforms it into a `TokenStream<T>` where `T` is a domain-specific
+    ///   type.
+    ///
+    /// # Returns
+    /// A `Result<TokenStream<T>, LexerError>` where `T` is the transformed token
+    /// type, or an error.
+    ///
+    /// # Errors
+    /// Returns a `LexerError` if the file cannot be opened or read.
     pub fn scan_file<P, F, T>(path: P, transform: F)
         -> Result<TokenStream<T>, LexerError>
         where P: AsRef<std::path::Path>,
@@ -82,6 +102,7 @@ impl Lexer
         let reader = BufReader::with_capacity(BUFFER_SIZE, input_file);
 
+        // Read the file line by line.
         for line in reader.lines()
         {
             match line
@@ -93,7 +114,7 @@ impl Lexer
                 Err(_) =>
                 {
                     return Err(LexerError::new("Unable to read line during \
-                                                Lexigraphical Analysis.",
+                                                Lexical Analysis.",
                                                Span::default(),
                                                Some(path.as_ref()
                                                         .to_string_lossy()
@@ -102,10 +123,10 @@ impl Lexer
                 }
             }
 
+            // Add the newline token after each line.
             stream.push("\n".to_string(),
                         TokenType::Newline,
                         Span::with_single(cursor));
 
             cursor.line += 1;
             cursor.column = 0;
         }
@@ -113,7 +134,22 @@ impl Lexer
         Ok(transform(&stream))
     }
 
-    /// Scans a full in-memory string and returns transformed tokens.
+    /// Scans a full in-memory string and produces transformed tokens.
+    ///
+    /// This method tokenizes the input string `text` and returns the transformed
+    /// tokens using the provided `transform` function. It's a convenient way
+    /// to perform lexical analysis on in-memory strings without needing to
+    /// read from a file.
+    ///
+    /// # Parameters
+    /// - `text`: A string slice representing the in-memory input text to
+    ///   analyze.
+    /// - `transform`: A function that transforms the base tokens into
+    ///   domain-specific types.
+    ///
+    /// # Returns
+    /// A `Result<TokenStream<T>, LexerError>` where `T` is the transformed token
+    /// type, or an error.
     pub fn scan_text<F, T>(text: &str, transform: F)
         -> Result<TokenStream<T>, LexerError>
         where F: FnOnce(&TokenStream<TokenType>) -> TokenStream<T>
@@ -121,14 +157,15 @@ impl Lexer
         let mut cursor = Position::default();
         let mut stream = TokenStream::new();
 
+        // Process each line in the input string.
         for line in text.lines()
         {
             Self::scan(line, &mut stream, &mut cursor);
 
+            // Add the newline token after each line.
             stream.push("\n".to_string(),
                         TokenType::Newline,
                         Span::with_single(cursor));
 
             cursor.line += 1;
             cursor.column = 0;
         }
@@ -136,15 +173,28 @@ impl Lexer
         Ok(transform(&stream))
     }
 
-    /// Internal: scans a single line of text into tokens.
+    /// Internal method that scans a single line of text into tokens.
+    ///
+    /// This method processes each character of a line and generates the
+    /// corresponding token. It handles cases like numeric tokens, text
+    /// tokens, symbols, and whitespace.
+    ///
+    /// # Parameters
+    /// - `line`: A line of text to be lexically analyzed.
+    /// - `stream`: A mutable reference to the token stream where the generated
+    ///   tokens will be pushed.
+    /// - `cursor`: A mutable reference to the cursor position, which tracks the
+    ///   current position in the input.
     fn scan(line: &str, stream: &mut TokenStream<TokenType>,
             cursor: &mut Position)
     {
         for c in line.chars()
         {
+            // Get the token type based on the character.
             let variant = get_token_type(c);
             let last = stream.len().saturating_sub(1);
 
+            // Handle token merging for contiguous tokens like numbers or text.
             if !stream.is_empty() &&
                variant == stream.variants[last] &&
                (variant == TokenType::Numeric || variant == TokenType::Text)
@@ -154,6 +204,7 @@ impl Lexer
             }
             else
             {
+                // Add a new token to the stream.
                 stream.push(c.to_string(), variant, Span::with_single(*cursor));
             }
@@ -164,6 +215,18 @@ impl Lexer
 
+/// Determines the type of a token based on the current character.
+///
+/// This helper function is responsible for identifying whether the current
+/// character is part of a known token type such as numeric, text, whitespace,
+/// or symbol.
+///
+/// # Parameters
+/// - `curr_char`: The current character to analyze.
+///
+/// # Returns
+/// A `TokenType` corresponding to the character type (e.g., `Numeric`, `Text`,
+/// `Whitespace`, etc.).
 fn get_token_type(curr_char: char) -> TokenType
 {
     match curr_char


@@ -59,7 +59,9 @@ fn test_basic_lexing()
         Lexer::scan_text("magic runes", dummy_transform).expect("Lexer should \
                                                                  succeed");
-    let tokens = tokens.into_iter().map(|t| { (*t.variant, String::from(t.lexeme))}).collect::<Vec<_>>();
+    let tokens = tokens.into_iter()
+                       .map(|t| (*t.variant, String::from(t.lexeme)))
+                       .collect::<Vec<_>>();
 
     let expected = vec![(TokenType::Text, "magic".to_string()),
                         (TokenType::Whitespace, " ".to_string()),
@@ -77,7 +79,9 @@ fn test_symbols_and_numbers()
         Lexer::scan_text("13 + 37", dummy_transform).expect("Lexer should \
                                                              succeed");
-    let tokens = tokens.into_iter().map(|t| { (*t.variant, String::from(t.lexeme))}).collect::<Vec<_>>();
+    let tokens = tokens.into_iter()
+                       .map(|t| (*t.variant, String::from(t.lexeme)))
+                       .collect::<Vec<_>>();
 
     let expected = vec![(TokenType::Numeric, "13".into()),
                         (TokenType::Whitespace, " ".into()),
@@ -129,16 +133,16 @@ fn test_lexer_with_cases()
                                                     on case '{}'",
                                                    case.name));
-        let result = result.into_iter().map(|t| { (*t.variant, String::from(t.lexeme))}).collect::<Vec<_>>();
+        let result = result.into_iter()
+                           .map(|t| (*t.variant, String::from(t.lexeme)))
+                           .collect::<Vec<_>>();
 
         let expected = case.expected
                            .iter()
                            .map(|(ty, s)| (*ty, s.to_string()))
                            .collect::<Vec<_>>();
 
-        assert_eq!(result, expected,
-                   "Mismatch in test case '{}'",
-                   case.name);
+        assert_eq!(result, expected, "Mismatch in test case '{}'", case.name);
 
         cleanup_temp_file(&path);
     }
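The tests above rely on a `dummy_transform` helper that is not part of this diff. Presumably it simply forwards the base tokens so the assertions can compare `(variant, lexeme)` pairs; the sketch below is a reconstruction under that assumption, using only the `TokenStream` API that appears elsewhere in this commit (the body, and the implied `Copy` on `TokenType`, are guesses rather than the project's actual code).

```rust
use rune::{TokenStream, TokenType};

// Assumed shape of the test helper: a pass-through transform that keeps
// the base TokenType tokens, lexemes, and spans unchanged.
fn dummy_transform(input: &TokenStream<TokenType>) -> TokenStream<TokenType>
{
    let mut output = TokenStream::new();
    let mut i = 0;

    while i < input.len()
    {
        let token = input.get(i).unwrap(); // safe due to bounds check above
        output.push(token.lexeme.to_string(), *token.variant, token.span.clone());
        i += 1;
    }

    output
}
```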