Fixing how the Lexer handles text.

When scanning a file, every line is terminated by a '\n' newline character. When text is passed directly to the lexer via the scan_text function, however, the lexer should not append a trailing newline if the original input did not end with one.
Added a Lookahead iterator.
2025-05-05 18:17:32 -04:00 · 2025-05-05 18:17:32 -04:00 · 2025-05-05 18:17:32 -04:00 · 2025-05-05 18:17:32 -04:00 · 2025-05-05 18:17:32 -04:00
11 changed files with 925 additions and 244 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -4,4 +4,4 @@ version = 4

 [[package]]
 name = "rune"
-version = "0.2.0"
+version = "0.3.0"
--- a/Cargo.toml
+++ b/Cargo.toml
@ -1,6 +1,6 @@
 [package]
 name = "rune"
-version = "0.2.0"
+version = "0.3.0"
 edition = "2021"
 description = "A lexical analysis library."
 repository = "/myrddin/rune"
--- a/README.md
+++ b/README.md
@ -31,5 +31,5 @@ Then add this to your Cargo.toml file.

 ```toml
 [dependencies]
-rune = { version = "0.2.0", registry = "cybermages" }
+rune = { version = "0.3.0", registry = "cybermages" }
 ```
--- a/examples/basic.rs
+++ b/examples/basic.rs
@ -5,133 +5,159 @@ use rune::{Lexer, Span, TokenStream, TokenType};


 #[derive(Debug, Clone, PartialEq, Eq)]
-pub enum MarkdownTokenType {
-    Heading(u8),
-    EmphasisStart,
-    EmphasisEnd,
-    StrongStart,
-    StrongEnd,
-    CodeSpan,
-    Text,
-    Symbol,
-    Whitespace,
-    Newline,
-    Unknown,
+pub enum MarkdownTokenType
+{
+   Heading(u8),
+   EmphasisStart,
+   EmphasisEnd,
+   StrongStart,
+   StrongEnd,
+   CodeSpan,
+   Text,
+   Symbol,
+   Whitespace,
+   Newline,
+   Unknown
 }


-impl std::fmt::Display for MarkdownTokenType {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        match self {
-            MarkdownTokenType::Heading(level) => write!(f, "Heading({})", level),
-            MarkdownTokenType::EmphasisStart => write!(f, "EmphasisStart"),
-            MarkdownTokenType::EmphasisEnd => write!(f, "EmphasisEnd"),
-            MarkdownTokenType::StrongStart => write!(f, "StrongStart"),
-            MarkdownTokenType::StrongEnd => write!(f, "StrongEnd"),
-            MarkdownTokenType::CodeSpan => write!(f, "CodeSpan"),
-            MarkdownTokenType::Text => write!(f, "Text"),
-            MarkdownTokenType::Symbol => write!(f, "Symbol"),
-            MarkdownTokenType::Whitespace => write!(f, "Whitespace"),
-            MarkdownTokenType::Newline => write!(f, "Newline"),
-            MarkdownTokenType::Unknown => write!(f, "Unknown"),
-        }
-    }
+impl std::fmt::Display for MarkdownTokenType
+{
+   fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result
+   {
+      match self
+      {
+         MarkdownTokenType::Heading(level) => write!(f, "Heading({})", level),
+         MarkdownTokenType::EmphasisStart => write!(f, "EmphasisStart"),
+         MarkdownTokenType::EmphasisEnd => write!(f, "EmphasisEnd"),
+         MarkdownTokenType::StrongStart => write!(f, "StrongStart"),
+         MarkdownTokenType::StrongEnd => write!(f, "StrongEnd"),
+         MarkdownTokenType::CodeSpan => write!(f, "CodeSpan"),
+         MarkdownTokenType::Text => write!(f, "Text"),
+         MarkdownTokenType::Symbol => write!(f, "Symbol"),
+         MarkdownTokenType::Whitespace => write!(f, "Whitespace"),
+         MarkdownTokenType::Newline => write!(f, "Newline"),
+         MarkdownTokenType::Unknown => write!(f, "Unknown")
+      }
+   }
 }



 // Define how you want to interpret base tokens
-pub fn transform(input: &TokenStream<TokenType>) -> TokenStream<MarkdownTokenType> {
-    let mut output = TokenStream::new();
+pub fn transform(input: &TokenStream<TokenType>)
+                 -> TokenStream<MarkdownTokenType>
+{
+   let mut output = TokenStream::new();

-    let mut i = 0;
-    while i < input.len() {
-        let token = input.get(i).unwrap(); // safe due to bounds check above
+   let mut i = 0;
+   while i < input.len()
+   {
+      let token = input.get(i).unwrap(); // safe due to bounds check above

-        match token.variant {
-            TokenType::Symbol if token.lexeme == "#" => {
-                // Count consecutive #s for heading level
-                let mut level = 1;
-                let mut span = token.span.clone();
+      match token.variant
+      {
+         TokenType::Symbol if token.lexeme == "#" =>
+         {
+            // Count consecutive #s for heading level
+            let mut level = 1;
+            let mut span = token.span.clone();

-                while i + 1 < input.len() {
-                    let next = input.get(i + 1).unwrap();
-                    if *next.variant == TokenType::Symbol && next.lexeme == "#" {
-                        level += 1;
-                        span.end = next.span.end;
-                        i += 1;
-                    } else {
-                        break;
-                    }
-                }
-
-                output.push(token.lexeme.repeat(level),
-                            MarkdownTokenType::Heading(level as u8),
-                            span);
+            while i + 1 < input.len()
+            {
+               let next = input.get(i + 1).unwrap();
+               if *next.variant == TokenType::Symbol && next.lexeme == "#"
+               {
+                  level += 1;
+                  span.end = next.span.end;
+                  i += 1;
+               }
+               else
+               {
+                  break;
+               }
            }

-            TokenType::Symbol if token.lexeme == "*" => {
-                // Look ahead to see if it's strong (**) or emphasis (*)
-                if i + 1 < input.len() {
-                    let next = input.get(i + 1).unwrap();
-                    if *next.variant == TokenType::Symbol && next.lexeme == "*" {
-                        output.push("**".to_string(),
-                                    MarkdownTokenType::StrongStart,
-                                    Span::merge(*token.span, *next.span));
-                        i += 1; // skip the second '*'
-                    } else {
-                        output.push("*".to_string(),
-                                    MarkdownTokenType::EmphasisStart,
-                                    token.span.clone());
-                    }
-                } else {
-                    output.push("*".to_string(),
-                                MarkdownTokenType::EmphasisStart,
-                                token.span.clone());
-                }
+            output.push(token.lexeme.repeat(level),
+                        MarkdownTokenType::Heading(level as u8),
+                        span);
+         }
+
+         TokenType::Symbol if token.lexeme == "*" =>
+         {
+            // Look ahead to see if it's strong (**) or emphasis (*)
+            if i + 1 < input.len()
+            {
+               let next = input.get(i + 1).unwrap();
+               if *next.variant == TokenType::Symbol && next.lexeme == "*"
+               {
+                  output.push("**".to_string(),
+                              MarkdownTokenType::StrongStart,
+                              Span::merge(*token.span, *next.span));
+                  i += 1; // skip the second '*'
+               }
+               else
+               {
+                  output.push("*".to_string(),
+                              MarkdownTokenType::EmphasisStart,
+                              token.span.clone());
+               }
            }
-
-            TokenType::Symbol if token.lexeme == "`" => {
-                output.push(token.lexeme.to_string(),
-                            MarkdownTokenType::CodeSpan,
-                            token.span.clone());
+            else
+            {
+               output.push("*".to_string(),
+                           MarkdownTokenType::EmphasisStart,
+                           token.span.clone());
            }
+         }

-            TokenType::Text => {
-                output.push(token.lexeme.to_string(),
-                            MarkdownTokenType::Text,
-                            token.span.clone());
-            }
+         TokenType::Symbol if token.lexeme == "`" =>
+         {
+            output.push(token.lexeme.to_string(),
+                        MarkdownTokenType::CodeSpan,
+                        token.span.clone());
+         }

-            TokenType::Symbol => {
-                output.push(token.lexeme.to_string(),
-                            MarkdownTokenType::Symbol,
-                            token.span.clone());
-            }
+         TokenType::Text =>
+         {
+            output.push(token.lexeme.to_string(),
+                        MarkdownTokenType::Text,
+                        token.span.clone());
+         }

-            TokenType::Whitespace => {
-                output.push(token.lexeme.to_string(),
-                            MarkdownTokenType::Whitespace,
-                            token.span.clone());
-            }
+         TokenType::Symbol =>
+         {
+            output.push(token.lexeme.to_string(),
+                        MarkdownTokenType::Symbol,
+                        token.span.clone());
+         }

-            TokenType::Newline => {
-                output.push(token.lexeme.to_string(),
-                            MarkdownTokenType::Newline,
-                            token.span.clone());
-            }
+         TokenType::Whitespace =>
+         {
+            output.push(token.lexeme.to_string(),
+                        MarkdownTokenType::Whitespace,
+                        token.span.clone());
+         }

-            _ => {
-                output.push(token.lexeme.to_string(),
-                            MarkdownTokenType::Unknown,
-                            token.span.clone());
-            }
-        }
+         TokenType::Newline =>
+         {
+            output.push(token.lexeme.to_string(),
+                        MarkdownTokenType::Newline,
+                        token.span.clone());
+         }

-        i += 1;
-    }
+         _ =>
+         {
+            output.push(token.lexeme.to_string(),
+                        MarkdownTokenType::Unknown,
+                        token.span.clone());
+         }
+      }

-    output
+      i += 1;
+   }
+
+   output
 }


--- a/src/ast.rs
+++ b/src/ast.rs
@ -0,0 +1,327 @@
+use std::collections::VecDeque;
+
+use super::position::Span;
+
+
+
+/// A unique identifier for a node in the AST. Internally, this is just an index
+/// into the node arrays.
+pub type NodeId = usize;
+
+
+
+/// The possible orders in which an AST may be stored for traversal.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum TraversalOrder
+{
+   /// Depth-first search (pre-order) layout.
+   DFS,
+   /// Breadth-first search layout.
+   BFS
+}
+
+
+
+/// The data associated with a single node in the AST.
+#[derive(Debug, Clone)]
+pub struct NodeData<T>
+{
+   pub span: Span,
+   pub data: T
+}
+
+/// The mutable AST structure used during parsing. Nodes are created
+/// incrementally and linked via parent relationships. Traversal order is not
+/// guaranteed until `optimize()` is called.
+pub struct Ast<T>
+{
+   nodes: Vec<NodeData<T>>,
+   parents: Vec<Option<NodeId>>
+}
+
+/// An optimized, immutable AST layout produced from `Ast<T>::optimize`.
+/// This structure is ideal for traversal, analysis, and code generation.
+pub struct OptimizedAst<T>
+{
+   /// Node data in a linear layout (DFS or BFS order).
+   pub nodes: Vec<NodeData<T>>,
+   /// Each node’s parent, if any.
+   pub parents: Vec<Option<NodeId>>,
+   /// The traversal order the nodes are stored in.
+   pub order: TraversalOrder
+}
+
+
+
+impl<T> Ast<T>
+{
+   /// Creates a new, empty AST.
+   pub fn new() -> Self
+   {
+      Ast { nodes: Vec::new(),
+            parents: Vec::new() }
+   }
+
+   /// Returns the parent of a node, if any.
+   pub fn get_parent(&self, id: NodeId) -> Option<NodeId>
+   {
+      self.parents.get(id).copied().flatten()
+   }
+
+   /// Returns a reference to the node data at the given ID, if it exists.
+   pub fn get(&self, id: NodeId) -> Option<&NodeData<T>>
+   {
+      self.nodes.get(id)
+   }
+
+   /// Returns a mutable reference to the node data at the given ID, if it
+   /// exists.
+   pub fn get_mut(&mut self, id: NodeId) -> Option<&mut NodeData<T>>
+   {
+      self.nodes.get_mut(id)
+   }
+
+   /// Adds a new node to the AST.
+   ///
+   /// - `data`: The custom payload of the node (usually an enum or struct).
+   /// - `span`: The source span the node represents.
+   /// - `parent`: Optional parent NodeId to attach this node to.
+   ///
+   /// Returns the NodeId of the newly added node.
+   pub fn add_node(&mut self, data: T, span: Span, parent: Option<NodeId>)
+                   -> NodeId
+   {
+      let id = self.nodes.len();
+      self.nodes.push(NodeData { data, span });
+      self.parents.push(parent);
+      id
+   }
+
+   /// Joins another AST into this one, returning a mapping from old node IDs
+   /// in `other` to new node IDs in `self`.
+   ///
+   /// Optionally attaches all root nodes of the other AST to a parent node
+   /// in the current AST.
+   pub fn join(&mut self, other: Ast<T>, attach_to: Option<NodeId>)
+               -> Vec<NodeId>
+   {
+      let base_id = self.nodes.len();
+      let mut id_map = Vec::with_capacity(other.nodes.len());
+
+      for (i, node) in other.nodes.into_iter().enumerate()
+      {
+         self.nodes.push(node);
+         let new_parent = match other.parents[i]
+         {
+            Some(pid) => Some(base_id + pid),
+            None => attach_to // attach root nodes to given parent if provided
+         };
+         self.parents.push(new_parent);
+         id_map.push(base_id + i);
+      }
+
+      id_map
+   }
+
+   /// Prunes the subtree rooted at `root`, compacting the AST in place.
+   /// Node IDs will change after this operation.
+   pub fn prune(&mut self, root: NodeId)
+   {
+      let mut to_remove = Vec::new();
+      collect_descendants(root, &self.parents, &mut to_remove);
+      to_remove.push(root);
+
+      let mut is_removed = vec![false; self.nodes.len()];
+      for &id in &to_remove
+      {
+         is_removed[id] = true;
+      }
+
+      let mut remap = vec![None; self.nodes.len()];
+      let mut next_insert = 0;
+
+      for i in 0..self.nodes.len()
+      {
+         if !is_removed[i]
+         {
+            if i != next_insert
+            {
+               self.nodes.swap(i, next_insert);
+               self.parents.swap(i, next_insert);
+            }
+            remap[i] = Some(next_insert);
+            next_insert += 1;
+         }
+      }
+
+      self.nodes.truncate(next_insert);
+      self.parents.truncate(next_insert);
+
+      for parent in self.parents.iter_mut()
+      {
+         if let Some(pid) = *parent
+         {
+            *parent = remap[pid];
+         }
+      }
+   }
+
+   /// Optimizes the AST layout for a specific traversal order (DFS or BFS).
+   ///
+   /// This consumes the `Ast`, rearranges the internal storage so that
+   /// iterating over the nodes reflects the chosen traversal order, and
+   /// returns a new, immutable `OptimizedAst<T>`.
+   ///
+   /// `T` does not need to implement `Clone`: node data is moved into the new
+   /// layout rather than cloned.
+   pub fn optimize(self, order: TraversalOrder) -> OptimizedAst<T>
+   {
+      let ordering = match order
+      {
+         TraversalOrder::DFS => dfs_order(&self.parents),
+         TraversalOrder::BFS => bfs_order(&self.parents)
+      };
+
+      let mut remap = vec![0; self.nodes.len()];
+      for (new_id, &old_id) in ordering.iter().enumerate()
+      {
+         remap[old_id] = new_id;
+      }
+
+      // Wrap nodes in Option to allow taking them by value without cloning
+      let mut nodes_opt: Vec<Option<NodeData<T>>> =
+         self.nodes.into_iter().map(Some).collect();
+
+      let mut new_nodes = Vec::with_capacity(nodes_opt.len());
+      let mut new_parents = vec![None; self.parents.len()];
+
+      for &old_id in &ordering
+      {
+         let new_id = remap[old_id];
+         let node = nodes_opt[old_id].take()
+                                     .expect("Node was already moved out");
+
+         let parent = self.parents[old_id].map(|pid| remap[pid]);
+
+         new_nodes.push(node);
+         new_parents[new_id] = parent;
+      }
+
+      OptimizedAst { nodes: new_nodes,
+                     parents: new_parents,
+                     order }
+   }
+}
+
+
+
+/// Helper to recursively collect all descendants of a node.
+fn collect_descendants(root: NodeId, parents: &[Option<NodeId>],
+                       acc: &mut Vec<NodeId>)
+{
+   for (i, &parent) in parents.iter().enumerate()
+   {
+      if parent == Some(root)
+      {
+         collect_descendants(i, parents, acc);
+         acc.push(i);
+      }
+   }
+}
+
+/// Recursively visits nodes in a depth-first (pre-order) manner starting from
+/// `current`, building up the DFS traversal order.
+///
+/// - `current`: The current node ID being visited.
+/// - `parents`: A slice representing the parent relationship for each node
+///   (index = child, value = optional parent).
+/// - `order`: A mutable vector that will accumulate the DFS traversal order.
+/// - `visited`: A mutable slice used to track which nodes have already been
+///   visited.
+fn visit(current: NodeId, parents: &[Option<NodeId>], order: &mut Vec<NodeId>,
+         visited: &mut [bool])
+{
+   // Skip this node if it's already been visited
+   if visited[current]
+   {
+      return;
+   }
+
+   // Mark the node as visited to avoid reprocessing it
+   visited[current] = true;
+
+   // Add this node to the traversal order (pre-order)
+   order.push(current);
+
+   // Recursively visit all child nodes (i.e., nodes that list `current` as
+   // their parent)
+   for (i, &parent) in parents.iter().enumerate()
+   {
+      if parent == Some(current)
+      {
+         visit(i, parents, order, visited);
+      }
+   }
+}
+
+/// Computes the DFS (depth-first, pre-order) ordering of nodes based on the
+/// parent table.
+///
+/// Returns a Vec<NodeId> containing the node IDs in DFS order.
+fn dfs_order(parents: &[Option<NodeId>]) -> Vec<NodeId>
+{
+   let mut order = Vec::new();
+   let mut visited = vec![false; parents.len()];
+
+   // Start DFS from all root nodes (nodes with no parent).
+   for (i, &parent) in parents.iter().enumerate()
+   {
+      if parent.is_none()
+      {
+         visit(i, parents, &mut order, &mut visited);
+      }
+   }
+
+   order
+}
+
+/// Computes the BFS (breadth-first) ordering of nodes based on the parent
+/// table.
+///
+/// Returns a Vec<NodeId> containing the node IDs in BFS order.
+fn bfs_order(parents: &[Option<NodeId>]) -> Vec<NodeId>
+{
+   let mut order = Vec::new();
+   let mut visited = vec![false; parents.len()];
+   let mut queue = VecDeque::new();
+
+   // Start BFS from all root nodes.
+   for (i, &parent) in parents.iter().enumerate()
+   {
+      if parent.is_none()
+      {
+         queue.push_back(i);
+      }
+   }
+
+   while let Some(current) = queue.pop_front()
+   {
+      if visited[current]
+      {
+         continue;
+      }
+
+      visited[current] = true;
+      order.push(current);
+
+      for (i, &parent) in parents.iter().enumerate()
+      {
+         if parent == Some(current)
+         {
+            queue.push_back(i);
+         }
+      }
+   }
+
+   order
+}
--- a/src/error.rs
+++ b/src/error.rs
@ -1,4 +1,6 @@
-use std::{error::Error, path::PathBuf};
+use std::error::Error;
+use std::path::PathBuf;
+
 use super::position::Span;


@ -12,116 +14,114 @@ use super::position::Span;
 /// It is designed to provide detailed diagnostics for file-based or
 /// in-memory parsing and is compatible with error reporting ecosystems.
 #[derive(Debug)]
-pub struct LexerError {
-    /// A human-readable error message.
-    pub message: String,
+pub struct LexerError
+{
+   /// A human-readable error message.
+   pub message: String,

-    /// The span where the error occurred.
-    pub span: Span,
+   /// The span where the error occurred.
+   pub span: Span,

-    /// The file that the error occurred in, if known.
-    pub file: Option<PathBuf>,
+   /// The file that the error occurred in, if known.
+   pub file: Option<PathBuf>,

-    /// The source snippet related to the error, if known.
-    pub snippet: Option<String>,
+   /// The source snippet related to the error, if known.
+   pub snippet: Option<String>,

-    /// An optional underlying error that caused this one.
-    pub source: Option<Box<dyn Error>>,
+   /// An optional underlying error that caused this one.
+   pub source: Option<Box<dyn Error>>
 }

-impl LexerError {
-    /// Creates a new `LexerError` with a message, span, and optional context.
-    ///
-    /// # Parameters
-    /// - `message`: A human-readable explanation of the error.
-    /// - `span`: The region in the source where the error occurred.
-    /// - `file`: An optional path to the file in which the error occurred.
-    /// - `snippet`: An optional problematic input string.
-    ///
-    /// # Returns
-    /// A new instance of `LexerError`.
-    pub fn new<S, T>(
-        message: S,
-        span: Span,
-        file: Option<T>,
-        snippet: Option<S>,
-    ) -> Self
-    where
-        S: Into<String>,
-        T: Into<PathBuf>,
-    {
-        LexerError {
-            message: message.into(),
-            span,
-            file: file.map(Into::into),
-            snippet: snippet.map(Into::into),
-            source: None,
-        }
-    }
+impl LexerError
+{
+   /// Creates a new `LexerError` with a message, span, and optional context.
+   ///
+   /// # Parameters
+   /// - `message`: A human-readable explanation of the error.
+   /// - `span`: The region in the source where the error occurred.
+   /// - `file`: An optional path to the file in which the error occurred.
+   /// - `snippet`: An optional problematic input string.
+   ///
+   /// # Returns
+   /// A new instance of `LexerError`.
+   pub fn new<S, T>(message: S, span: Span, file: Option<T>,
+                    snippet: Option<S>)
+                    -> Self
+      where S: Into<String>,
+            T: Into<PathBuf>
+   {
+      LexerError { message: message.into(),
+                   span,
+                   file: file.map(Into::into),
+                   snippet: snippet.map(Into::into),
+                   source: None }
+   }

-    /// Creates a `LexerError` from only a message and span.
-    ///
-    /// This is useful when file or snippet context is not available.
-    pub fn from_message<S>(message: S, span: Span) -> Self
-    where
-        S: Into<String>,
-    {
-        Self::new(message, span, None::<PathBuf>, None::<S>)
-    }
+   /// Creates a `LexerError` from only a message and span.
+   ///
+   /// This is useful when file or snippet context is not available.
+   pub fn from_message<S>(message: S, span: Span) -> Self
+      where S: Into<String>
+   {
+      Self::new(message, span, None::<PathBuf>, None::<S>)
+   }

-    /// Attaches a snippet of the offending source code.
-    ///
-    /// This is helpful for diagnostics and tooling output.
-    pub fn with_snippet<S>(mut self, snippet: S) -> Self
-    where
-        S: Into<String>,
-    {
-        self.snippet = Some(snippet.into());
-        self
-    }
+   /// Attaches a snippet of the offending source code.
+   ///
+   /// This is helpful for diagnostics and tooling output.
+   pub fn with_snippet<S>(mut self, snippet: S) -> Self
+      where S: Into<String>
+   {
+      self.snippet = Some(snippet.into());
+      self
+   }

-    /// Attaches the path of the file in which the error occurred.
-    pub fn with_file<T>(mut self, file: T) -> Self
-    where
-        T: Into<PathBuf>,
-    {
-        self.file = Some(file.into());
-        self
-    }
+   /// Attaches the path of the file in which the error occurred.
+   pub fn with_file<T>(mut self, file: T) -> Self
+      where T: Into<PathBuf>
+   {
+      self.file = Some(file.into());
+      self
+   }

-    /// Wraps a source error that caused this `LexerError`.
-    ///
-    /// This allows you to chain errors for more detailed diagnostics.
-    pub fn with_source<E>(mut self, err: E) -> Self
-    where
-        E: Error + 'static,
-    {
-        self.source = Some(Box::new(err));
-        self
-    }
+   /// Wraps a source error that caused this `LexerError`.
+   ///
+   /// This allows you to chain errors for more detailed diagnostics.
+   pub fn with_source<E>(mut self, err: E) -> Self
+      where E: Error + 'static
+   {
+      self.source = Some(Box::new(err));
+      self
+   }
 }

-impl std::fmt::Display for LexerError {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "Lexer error at {}", self.span)?;
+impl std::fmt::Display for LexerError
+{
+   fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result
+   {
+      write!(f, "Lexer error at {}", self.span)?;

-        if let Some(file) = &self.file {
-            write!(f, " in file `{}`", file.display())?;
-        }
+      if let Some(file) = &self.file
+      {
+         write!(f, " in file `{}`", file.display())?;
+      }

-        write!(f, ": {}", self.message)?;
+      write!(f, ": {}", self.message)?;

-        if let Some(snippet) = &self.snippet {
-            write!(f, "\n  --> Snippet: `{}`", snippet)?;
-        }
+      if let Some(snippet) = &self.snippet
+      {
+         write!(f, "\n  --> Snippet: `{}`", snippet)?;
+      }

-        Ok(())
-    }
+      Ok(())
+   }
 }

-impl Error for LexerError {
-    /// Returns the underlying cause of this error, if any.
-    fn source(&self) -> Option<&(dyn Error + 'static)> {
-        self.source.as_ref().map(|e| e.as_ref())
-    }
+impl Error for LexerError
+{
+   /// Returns the underlying cause of this error, if any.
+   fn source(&self) -> Option<&(dyn Error + 'static)>
+   {
+      self.source.as_ref().map(|e| e.as_ref())
+   }
 }
--- a/src/iter.rs
+++ b/src/iter.rs
@ -0,0 +1,209 @@
+//! An iterator adapter for arbitrary lookahead functionality.
+//!
+//! This module provides [`Lookahead`], an adapter for any iterator that allows
+//! you to peek ahead by any number of elements, without consuming them.
+//!
+//! ## Example
+//! ```
+//! use rune::LookaheadExt;
+//!
+//! let mut it = vec![10, 20, 30].into_iter().lookahead();
+//!
+//! assert_eq!(it.peek(0), Some(&10));
+//! assert_eq!(it.peek(1), Some(&20));
+//! assert_eq!(it.next(), Some(10));
+//! assert_eq!(it.peek(0), Some(&20));
+//! ```
+
+use std::collections::VecDeque;
+use std::fmt;
+use std::iter::{Fuse, FusedIterator};
+
+
+
+/// An iterator adapter that allows arbitrary lookahead peeking.
+///
+/// This struct wraps an iterator and buffers items so that any future
+/// item can be accessed by index without consuming them. Similar to
+/// [`std::iter::Peekable`], but supports peeking any number of steps ahead.
+pub struct Lookahead<I>
+   where I: Iterator
+{
+   iter: Fuse<I>,
+   buffer: VecDeque<I::Item>
+}
+
+
+
+impl<I> Lookahead<I> where I: Iterator
+{
+   /// Creates a new [`Lookahead`] from the given iterator.
+   ///
+   /// This constructor is typically used indirectly via the
+   /// [`LookaheadExt::lookahead()`] method or [`lookahead()`] free function.
+   #[must_use]
+   pub fn new(iter: I) -> Self
+   {
+      Lookahead { iter: iter.fuse(),
+                  buffer: VecDeque::new() }
+   }
+
+   /// Returns a reference to the `n`th upcoming item, if it exists.
+   ///
+   /// `peek(0)` is the same as peeking at the next item.
+   ///
+   /// This does **not consume** any items from the iterator.
+   ///
+   /// # Examples
+   /// ```
+   /// use rune::LookaheadExt;
+   ///
+   /// let mut it = vec![1, 2, 3].into_iter().lookahead();
+   ///
+   /// assert_eq!(it.peek(1), Some(&2));
+   /// assert_eq!(it.next(), Some(1));
+   /// ```
+   pub fn peek(&mut self, n: usize) -> Option<&I::Item>
+   {
+      while self.buffer.len() <= n
+      {
+         if let Some(item) = self.iter.next()
+         {
+            self.buffer.push_back(item);
+         }
+         else
+         {
+            break;
+         }
+      }
+      self.buffer.get(n)
+   }
+
+   /// Returns a mutable reference to the `n`th upcoming item, if it exists.
+   ///
+   /// This allows in-place modification of peeked items before consumption.
+   ///
+   /// # Examples
+   /// ```
+   /// use rune::LookaheadExt;
+   ///
+   /// let mut it = vec![1, 2, 3].into_iter().lookahead();
+   /// if let Some(x) = it.peek_mut(1)
+   /// {
+   ///    *x *= 10;
+   /// }
+   /// assert_eq!(it.next(), Some(1));
+   /// assert_eq!(it.next(), Some(20));
+   /// ```
+   pub fn peek_mut(&mut self, n: usize) -> Option<&mut I::Item>
+   {
+      while self.buffer.len() <= n
+      {
+         if let Some(item) = self.iter.next()
+         {
+            self.buffer.push_back(item);
+         }
+         else
+         {
+            break;
+         }
+      }
+      self.buffer.get_mut(n)
+   }
+}
+
+impl<I> Iterator for Lookahead<I> where I: Iterator
+{
+   type Item = I::Item;
+
+   /// Retrieves the next item, consuming it.
+   ///
+   /// If any items were previously peeked and buffered, they are returned
+   /// first before accessing the underlying iterator.
+   fn next(&mut self) -> Option<Self::Item>
+   {
+      if let Some(front) = self.buffer.pop_front()
+      {
+         Some(front)
+      }
+      else
+      {
+         self.iter.next()
+      }
+   }
+
+   /// Provides a size hint accounting for both buffered and remaining elements.
+   fn size_hint(&self) -> (usize, Option<usize>)
+   {
+      let (low, high) = self.iter.size_hint();
+      let buffered = self.buffer.len();
+      (low.saturating_add(buffered), high.and_then(|h| h.checked_add(buffered)))
+   }
+}
+
+impl<I> Clone for Lookahead<I>
+   where I: Iterator + Clone,
+         I::Item: Clone
+{
+   fn clone(&self) -> Self
+   {
+      Lookahead { iter: self.iter.clone(),
+                  buffer: self.buffer.clone() }
+   }
+}
+
+impl<I> fmt::Debug for Lookahead<I>
+   where I: Iterator + fmt::Debug,
+         I::Item: fmt::Debug
+{
+   fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result
+   {
+      f.debug_struct("Lookahead")
+       .field("iter", &self.iter)
+       .field("buffer", &self.buffer)
+       .finish()
+   }
+}
+
+impl<I> FusedIterator for Lookahead<I> where I: Iterator + FusedIterator {}
+
+
+
+/// Extension trait to provide `.lookahead()` on all iterators.
+///
+/// This lets you easily call `.lookahead()` on any iterator to
+/// create a [`Lookahead`] instance.
+pub trait LookaheadExt: Iterator + Sized
+{
+   /// Wraps the iterator in a [`Lookahead`] adapter.
+   fn lookahead(self) -> Lookahead<Self>;
+}
+
+impl<I: Iterator> LookaheadExt for I
+{
+   fn lookahead(self) -> Lookahead<Self>
+   {
+      Lookahead::new(self)
+   }
+}
+
+
+
+/// Creates a [`Lookahead`] from any iterable.
+///
+/// This is a convenience function for use in functional-style code or
+/// when not using the extension trait.
+///
+/// # Example
+/// ```
+/// use rune::lookahead;
+///
+/// let mut it = lookahead(vec![1, 2, 3]);
+///
+/// assert_eq!(it.peek(2), Some(&3));
+/// ```
+pub fn lookahead<I>(iterable: I) -> Lookahead<I::IntoIter>
+   where I: IntoIterator
+{
+   Lookahead::new(iterable.into_iter())
+}
--- a/src/lexer.rs
+++ b/src/lexer.rs
@ -7,24 +7,25 @@ use super::token::{TokenStream, TokenType};



-/// The size of data chunks to read from a file. This was arbitrarily chosen to
-/// be 1mb.
+/// The size of data chunks to read from a file. This is an arbitrary choice,
+/// set to 1 MiB (1024 * 1024 bytes).
 const BUFFER_SIZE: usize = 1024 * 1024;



-/// The `Lexer` is the core component responsible for performing
-/// lexicographical analysis on a text file.
+/// The `Lexer` struct is responsible for performing lexical analysis
+/// (tokenization) on text.
 ///
-/// It reads input from a file character-by-character, generating a stream
-/// of base tokens such as text, numbers, whitespace, symbols, and newlines.
-/// These tokens are accumulated into a `TokenStream`, which is a flat,
-/// cache-friendly data structure.
+/// It processes input from a file or string character-by-character and
+/// generates a stream of tokens, such as text, numbers, whitespace, symbols,
+/// and newlines. These tokens are accumulated into a `TokenStream`, which is a
+/// flat, cache-friendly data structure designed for efficient iteration.
 ///
-/// After tokenization, the lexer applies a user-provided `transform` function
-/// to each token in the stream, allowing consumers of the library to convert
-/// base tokens into richer, domain-specific token types (e.g. Markdown
-/// elements, syntax trees, or custom DSL tokens).
+/// After the base tokens are generated, the `Lexer` allows for transformation
+/// of these tokens into richer, domain-specific types via a user-provided
+/// `transform` function. This transformation can be used to convert base tokens
+/// into specific elements of a Markdown syntax tree, custom DSL tokens, or any
+/// other custom format you need.
 ///
 /// # Example
 ///
@ -38,32 +39,51 @@ const BUFFER_SIZE: usize = 1024 * 1024;
 ///
 /// let tokens = Lexer::scan_text("Runes += 42", transform).unwrap();
 ///
-/// // The tuple here is from the transform functions return type.
 /// for token in &tokens
 /// {
 ///    println!("{}", token);
 /// }
 /// ```
 ///
-/// # Design Notes
-///
-/// - Uses a flat `TokenStream` to improve iteration performance and reduce heap
+/// # Design Considerations
+/// - Utilizes a flat `TokenStream` to improve performance and reduce heap
 ///   overhead.
-/// - Consolidates contiguous characters into compound tokens (e.g. multi-digit
+/// - Consolidates contiguous characters into compound tokens (e.g., multi-digit
 ///   numbers).
-/// - Easily extensible via the `transform` function to support higher-level
-///   parsing tasks.
+/// - Extensible via the `transform` function, enabling the creation of
+///   higher-level constructs, like Markdown elements or syntax trees for a
+///   custom DSL.
 ///
-/// # Errors
-///
-/// Returns a `LexerError` if the file cannot be opened or read.
+/// # Error Handling
+/// The lexer will return a `LexerError` if the input file cannot be opened or
+/// read. Errors include issues such as missing files, read failures, or invalid
+/// input formats.
 pub enum Lexer {}



 impl Lexer
 {
-   /// Scans a file and produces a vector of transformed tokens.
+   /// Scans a file and generates a stream of transformed tokens based on the
+   /// provided `transform` function.
+   ///
+   /// This method opens a file from the given `path`, reads the file line by
+   /// line, and converts the input into a stream of tokens. The tokens are
+   /// then passed to the `transform` function, which allows users to map
+   /// base tokens into domain-specific types.
+   ///
+   /// # Parameters
+   /// - `path`: A path to the file to be lexically analyzed.
+   /// - `transform`: A function that takes a `&TokenStream<TokenType>` and
+   ///   transforms it into a `TokenStream<T>` where `T` is a domain-specific
+   ///   type.
+   ///
+   /// # Returns
+   /// A `Result<TokenStream<T>, LexerError>` where `T` is the transformed token
+   /// type, or an error.
+   ///
+   /// # Errors
+   /// Returns a `LexerError` if the file cannot be opened or read.
   pub fn scan_file<P, F, T>(path: P, transform: F)
                             -> Result<TokenStream<T>, LexerError>
      where P: AsRef<std::path::Path>,
@ -82,6 +102,7 @@ impl Lexer

      let reader = BufReader::with_capacity(BUFFER_SIZE, input_file);

+      // Read the file line by line.
      for line in reader.lines()
      {
         match line
@ -93,7 +114,7 @@ impl Lexer
            Err(_) =>
            {
               return Err(LexerError::new("Unable to read line during \
-                                           Lexigraphical Analysis.",
+                                           Lexical Analysis.",
                                          Span::default(),
                                          Some(path.as_ref()
                                                   .to_string_lossy()
@ -102,10 +123,10 @@ impl Lexer
            }
         }

+         // Add the newline token after each line.
         stream.push("\n".to_string(),
                     TokenType::Newline,
                     Span::with_single(cursor));
-
         cursor.line += 1;
         cursor.column = 0;
      }
@ -113,7 +134,22 @@ impl Lexer
      Ok(transform(&stream))
   }

-   /// Scans a full in-memory string and returns transformed tokens.
+   /// Scans a full in-memory string and produces transformed tokens.
+   ///
+   /// This method tokenizes the input string `text` and returns the transformed
+   /// tokens using the provided `transform` function. It's a convenient way
+   /// to perform lexical analysis on in-memory strings without needing to
+   /// read from a file.
+   ///
+   /// # Parameters
+   /// - `text`: A string slice representing the in-memory input text to
+   ///   analyze.
+   /// - `transform`: A function that transforms the base tokens into
+   ///   domain-specific types.
+   ///
+   /// # Returns
+   /// A `Result<TokenStream<T>, LexerError>` where `T` is the transformed token
+   /// type, or an error.
   pub fn scan_text<F, T>(text: &str, transform: F)
                          -> Result<TokenStream<T>, LexerError>
      where F: FnOnce(&TokenStream<TokenType>) -> TokenStream<T>
@ -121,30 +157,51 @@ impl Lexer
      let mut cursor = Position::default();
      let mut stream = TokenStream::new();

+      // Process each line in the input string.
      for line in text.lines()
      {
         Self::scan(line, &mut stream, &mut cursor);

+         // Add the newline token after each line.
         stream.push("\n".to_string(),
                     TokenType::Newline,
                     Span::with_single(cursor));
-
         cursor.line += 1;
         cursor.column = 0;
      }

+      // Drop the newline token pushed after the final line if the text did
+      // not itself end with a newline.
+      if !text.ends_with('\n')
+      {
+         stream.pop();
+      }
+
      Ok(transform(&stream))
   }

-   /// Internal: scans a single line of text into tokens.
+   /// Internal method that scans a single line of text into tokens.
+   ///
+   /// This method processes each character of a line and generates the
+   /// corresponding token. It handles cases like numeric tokens, text
+   /// tokens, symbols, and whitespace.
+   ///
+   /// # Parameters
+   /// - `line`: A line of text to be lexically analyzed.
+   /// - `stream`: A mutable reference to the token stream where the generated
+   ///   tokens will be pushed.
+   /// - `cursor`: A mutable reference to the cursor position, which tracks the
+   ///   current position in the input.
   fn scan(line: &str, stream: &mut TokenStream<TokenType>,
           cursor: &mut Position)
   {
      for c in line.chars()
      {
+         // Get the token type based on the character.
         let variant = get_token_type(c);
         let last = stream.len().saturating_sub(1);

+         // Handle token merging for contiguous tokens like numbers or text.
         if !stream.is_empty() &&
            variant == stream.variants[last] &&
            (variant == TokenType::Numeric || variant == TokenType::Text)
@ -154,6 +211,7 @@ impl Lexer
         }
         else
         {
+            // Add a new token to the stream.
            stream.push(c.to_string(), variant, Span::with_single(*cursor));
         }

@ -164,6 +222,18 @@ impl Lexer



+/// Determines the type of a token based on the current character.
+///
+/// This helper function is responsible for identifying whether the current
+/// character is part of a known token type such as numeric, text, whitespace,
+/// or symbol.
+///
+/// # Parameters
+/// - `curr_char`: The current character to analyze.
+///
+/// # Returns
+/// A `TokenType` corresponding to the character type (e.g., `Numeric`, `Text`,
+/// `Whitespace`, etc.).
 fn get_token_type(curr_char: char) -> TokenType
 {
   match curr_char
--- a/src/lib.rs
+++ b/src/lib.rs
@ -7,14 +7,18 @@

 pub mod library;

+mod ast;
 mod error;
+mod iter;
 mod lexer;
 mod position;
 mod token;



+pub use crate::ast::*;
 pub use crate::error::*;
+pub use crate::iter::*;
 pub use crate::lexer::*;
 pub use crate::position::*;
 pub use crate::token::*;
--- a/tests/iter.rs
+++ b/tests/iter.rs
@ -0,0 +1,43 @@
+use rune::*;
+
+#[test]
+fn peek_works()
+{
+   let mut it = vec![1, 2, 3].into_iter().lookahead();
+   for (n, want) in [(0, Some(&1)), (1, Some(&2)), (2, Some(&3)), (3, None)]
+   {
+      assert_eq!(it.peek(n), want);
+   }
+}
+
+#[test]
+fn peek_mut_modifies_item()
+{
+   let mut it = vec![10, 20, 30].into_iter().lookahead();
+   if let Some(second) = it.peek_mut(1)
+   {
+      *second += 100;
+   }
+   let drained = [it.next(), it.next()];
+   assert_eq!(drained, [Some(10), Some(120)]);
+}
+
+#[test]
+fn iterates_correctly()
+{
+   let mut it = vec![1, 2, 3].into_iter().lookahead();
+   for expected in [Some(1), Some(2), Some(3), None]
+   {
+      assert_eq!(it.next(), expected);
+   }
+}
+
+#[test]
+fn size_hint_accounts_for_buffer()
+{
+   let mut it = vec![1, 2, 3, 4].into_iter().lookahead();
+   it.peek(2);
+   // Peeked items wait in the buffer instead of being consumed, so the hint
+   // must still report all four, exactly: `vec::IntoIter` is exact-sized.
+   assert_eq!(it.size_hint(), (4, Some(4)));
+}
--- a/tests/lexer_tests.rs
+++ b/tests/lexer_tests.rs
@ -53,45 +53,47 @@ fn cleanup_temp_file(path: &PathBuf)


 #[test]
-fn test_basic_lexing()
+fn basic_lexing()
 {
   let tokens =
      Lexer::scan_text("magic runes", dummy_transform).expect("Lexer should \
                                                               succeed");

-   let tokens = tokens.into_iter().map(|t| { (*t.variant, String::from(t.lexeme))}).collect::<Vec<_>>();
+   let tokens = tokens.into_iter()
+                      .map(|t| (*t.variant, String::from(t.lexeme)))
+                      .collect::<Vec<_>>();

   let expected = vec![(TokenType::Text, "magic".to_string()),
                       (TokenType::Whitespace, " ".to_string()),
-                       (TokenType::Text, "runes".to_string()),
-                       (TokenType::Newline, "\n".to_string()),];
+                       (TokenType::Text, "runes".to_string()),];

   assert_eq!(tokens, expected);
 }


 #[test]
-fn test_symbols_and_numbers()
+fn symbols_and_numbers()
 {
   let tokens =
      Lexer::scan_text("13 + 37", dummy_transform).expect("Lexer should \
                                                           succeed");

-   let tokens = tokens.into_iter().map(|t| { (*t.variant, String::from(t.lexeme))}).collect::<Vec<_>>();
+   let tokens = tokens.into_iter()
+                      .map(|t| (*t.variant, String::from(t.lexeme)))
+                      .collect::<Vec<_>>();

   let expected = vec![(TokenType::Numeric, "13".into()),
                       (TokenType::Whitespace, " ".into()),
                       (TokenType::Symbol, "+".into()),
                       (TokenType::Whitespace, " ".into()),
-                       (TokenType::Numeric, "37".into()),
-                       (TokenType::Newline, "\n".into()),];
+                       (TokenType::Numeric, "37".into()),];

   assert_eq!(tokens, expected);
 }


 #[test]
-fn test_lexer_with_cases()
+fn lexer_with_cases()
 {
   let cases = vec![TestCase { name: "simple_words",
                               input: "magic rune",
@ -129,16 +131,16 @@ fn test_lexer_with_cases()
                                                              on case '{}'",
                                                             case.name));

-      let result = result.into_iter().map(|t| { (*t.variant, String::from(t.lexeme))}).collect::<Vec<_>>();
+      let result = result.into_iter()
+                         .map(|t| (*t.variant, String::from(t.lexeme)))
+                         .collect::<Vec<_>>();

      let expected = case.expected
                         .iter()
                         .map(|(ty, s)| (*ty, s.to_string()))
                         .collect::<Vec<_>>();

-      assert_eq!(result, expected,
-                 "Mismatch in test case '{}'",
-                 case.name);
+      assert_eq!(result, expected, "Mismatch in test case '{}'", case.name);

      cleanup_temp_file(&path);
   }
Author	SHA1	Message	Date
Myrddin Dundragon	693ff20224	Fixing how the Lexer handles text. Scanning a file has all the lines terminating with a '\n' newline character, but when giving the text directly to the lexer via the scan_text function it should not append a newline at the end if there was no newline in the original input.	2025-05-05 18:17:32 -04:00
Myrddin Dundragon	34a579332d	Added a Lookahead iterator. This adds a Lookahead iterator so that while parsing it is easier to peek ahead however much the parser needs. Basic parsers may not need any, but a lot of parsers have two token lookahead. I've even seen some with three.	2025-05-05 18:17:32 -04:00
Myrddin Dundragon	cb882ceb84	Changing test names.	2025-05-05 18:17:32 -04:00
Myrddin Dundragon	e604bf172b	[#4 ] The initial AST. This is the initial design of the AST. It is built in a data oriented style. It also needs iterators over the AST and the optimized AST as well as some more transformation functions.	2025-05-05 18:17:32 -04:00
Myrddin Dundragon	f5780f50c2	Just some basic updating and cleaning up. - Added comments. - Ran cargo fmt. - Updated the versioning.	2025-05-05 18:17:32 -04:00