Compare commits

...

5 Commits

SHA1  Message  Date
693ff20224 Fixing how the Lexer handles text.
When scanning a file, every line ends with a '\n' newline
character, but when the text is given directly to the lexer via the
scan_text function, no newline should be appended at the end if there
was none in the original input.
2025-05-05 18:17:32 -04:00
34a579332d Added a Lookahead iterator.
This adds a Lookahead iterator so that, while parsing, it is easy
to peek ahead as far as the parser needs. Basic parsers may not
need any lookahead, but many parsers use two-token lookahead. I've even
seen some with three.
2025-05-05 18:17:32 -04:00
cb882ceb84 Changing test names. 2025-05-05 18:17:32 -04:00
e604bf172b [#4] The initial AST.
This is the initial design of the AST. It is built in a data-oriented
style. It still needs iterators over the AST and the optimized AST,
as well as some more transformation functions.
2025-05-05 18:17:32 -04:00
f5780f50c2 Just some basic updating and cleaning up.
- Added comments.
- Ran cargo fmt.
- Updated the versioning.
2025-05-05 18:17:32 -04:00
11 changed files with 925 additions and 244 deletions

Cargo.lock (generated)

@@ -4,4 +4,4 @@ version = 4
 [[package]]
 name = "rune"
-version = "0.2.0"
+version = "0.3.0"

@@ -1,6 +1,6 @@
 [package]
 name = "rune"
-version = "0.2.0"
+version = "0.3.0"
 edition = "2021"
 description = "A lexical analysis library."
 repository = "/myrddin/rune"

@@ -31,5 +31,5 @@ Then add this to your Cargo.toml file.
 ```toml
 [dependencies]
-rune = { version = "0.2.0", registry = "cybermages" }
+rune = { version = "0.3.0", registry = "cybermages" }
 ```

@@ -5,133 +5,159 @@ use rune::{Lexer, Span, TokenStream, TokenType};
(This hunk only reformats the example from K&R to Allman brace style; the logic is unchanged. It is shown below as it reads after the change.)

#[derive(Debug, Clone, PartialEq, Eq)]
pub enum MarkdownTokenType
{
   Heading(u8),
   EmphasisStart,
   EmphasisEnd,
   StrongStart,
   StrongEnd,
   CodeSpan,
   Text,
   Symbol,
   Whitespace,
   Newline,
   Unknown
}

impl std::fmt::Display for MarkdownTokenType
{
   fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result
   {
      match self
      {
         MarkdownTokenType::Heading(level) => write!(f, "Heading({})", level),
         MarkdownTokenType::EmphasisStart => write!(f, "EmphasisStart"),
         MarkdownTokenType::EmphasisEnd => write!(f, "EmphasisEnd"),
         MarkdownTokenType::StrongStart => write!(f, "StrongStart"),
         MarkdownTokenType::StrongEnd => write!(f, "StrongEnd"),
         MarkdownTokenType::CodeSpan => write!(f, "CodeSpan"),
         MarkdownTokenType::Text => write!(f, "Text"),
         MarkdownTokenType::Symbol => write!(f, "Symbol"),
         MarkdownTokenType::Whitespace => write!(f, "Whitespace"),
         MarkdownTokenType::Newline => write!(f, "Newline"),
         MarkdownTokenType::Unknown => write!(f, "Unknown")
      }
   }
}

// Define how you want to interpret base tokens
pub fn transform(input: &TokenStream<TokenType>)
   -> TokenStream<MarkdownTokenType>
{
   let mut output = TokenStream::new();
   let mut i = 0;

   while i < input.len()
   {
      let token = input.get(i).unwrap(); // safe due to bounds check above

      match token.variant
      {
         TokenType::Symbol if token.lexeme == "#" =>
         {
            // Count consecutive #s for heading level
            let mut level = 1;
            let mut span = token.span.clone();

            while i + 1 < input.len()
            {
               let next = input.get(i + 1).unwrap();
               if *next.variant == TokenType::Symbol && next.lexeme == "#"
               {
                  level += 1;
                  span.end = next.span.end;
                  i += 1;
               }
               else
               {
                  break;
               }
            }

            output.push(token.lexeme.repeat(level),
                        MarkdownTokenType::Heading(level as u8),
                        span);
         }

         TokenType::Symbol if token.lexeme == "*" =>
         {
            // Look ahead to see if it's strong (**) or emphasis (*)
            if i + 1 < input.len()
            {
               let next = input.get(i + 1).unwrap();
               if *next.variant == TokenType::Symbol && next.lexeme == "*"
               {
                  output.push("**".to_string(),
                              MarkdownTokenType::StrongStart,
                              Span::merge(*token.span, *next.span));
                  i += 1; // skip the second '*'
               }
               else
               {
                  output.push("*".to_string(),
                              MarkdownTokenType::EmphasisStart,
                              token.span.clone());
               }
            }
            else
            {
               output.push("*".to_string(),
                           MarkdownTokenType::EmphasisStart,
                           token.span.clone());
            }
         }

         TokenType::Symbol if token.lexeme == "`" =>
         {
            output.push(token.lexeme.to_string(),
                        MarkdownTokenType::CodeSpan,
                        token.span.clone());
         }

         TokenType::Text =>
         {
            output.push(token.lexeme.to_string(),
                        MarkdownTokenType::Text,
                        token.span.clone());
         }

         TokenType::Symbol =>
         {
            output.push(token.lexeme.to_string(),
                        MarkdownTokenType::Symbol,
                        token.span.clone());
         }

         TokenType::Whitespace =>
         {
            output.push(token.lexeme.to_string(),
                        MarkdownTokenType::Whitespace,
                        token.span.clone());
         }

         TokenType::Newline =>
         {
            output.push(token.lexeme.to_string(),
                        MarkdownTokenType::Newline,
                        token.span.clone());
         }

         _ =>
         {
            output.push(token.lexeme.to_string(),
                        MarkdownTokenType::Unknown,
                        token.span.clone());
         }
      }

      i += 1;
   }

   output
}
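Usage note (not part of the diff): a minimal sketch of how this example's `transform` might be driven end to end through `Lexer::scan_text`; the `main` function and the input string below are illustrative only.

```rust
use rune::Lexer;

fn main()
{
   // Tokenize an in-memory Markdown snippet and lift the base tokens into
   // MarkdownTokenType via the transform function defined above.
   let tokens = Lexer::scan_text("# Title with *emphasis*", transform)
      .expect("lexing should succeed");

   for token in &tokens
   {
      println!("{}", token);
   }
}
```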

src/ast.rs (new file)

@@ -0,0 +1,327 @@
use std::collections::VecDeque;
use super::position::Span;
/// A unique identifier for a node in the AST. Internally, this is just an index
/// into the node arrays.
pub type NodeId = usize;
/// The possible orders in which an AST may be stored for traversal.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TraversalOrder
{
/// Depth-first search (pre-order) layout.
DFS,
/// Breadth-first search layout.
BFS
}
/// The data associated with a single node in the AST.
#[derive(Debug, Clone)]
pub struct NodeData<T>
{
pub span: Span,
pub data: T
}
/// The mutable AST structure used during parsing. Nodes are created
/// incrementally and linked via parent relationships. Traversal order is not
/// guaranteed until `optimize()` is called.
pub struct Ast<T>
{
nodes: Vec<NodeData<T>>,
parents: Vec<Option<NodeId>>
}
/// An optimized, immutable AST layout produced from `Ast<T>::optimize`.
/// This structure is ideal for traversal, analysis, and code generation.
pub struct OptimizedAst<T>
{
/// Node data in a linear layout (DFS or BFS order).
pub nodes: Vec<NodeData<T>>,
/// Each node's parent, if any.
pub parents: Vec<Option<NodeId>>,
/// The traversal order the nodes are stored in.
pub order: TraversalOrder
}
impl<T> Ast<T>
{
/// Creates a new, empty AST.
pub fn new() -> Self
{
Ast { nodes: Vec::new(),
parents: Vec::new() }
}
/// Returns the parent of a node, if any.
pub fn get_parent(&self, id: NodeId) -> Option<NodeId>
{
self.parents.get(id).copied().flatten()
}
/// Returns a reference to the node data at the given ID, if it exists.
pub fn get(&self, id: NodeId) -> Option<&NodeData<T>>
{
self.nodes.get(id)
}
/// Returns a mutable reference to the node data at the given ID, if it
/// exists.
pub fn get_mut(&mut self, id: NodeId) -> Option<&mut NodeData<T>>
{
self.nodes.get_mut(id)
}
/// Adds a new node to the AST.
///
/// - `data`: The custom payload of the node (usually an enum or struct).
/// - `span`: The source span the node represents.
/// - `parent`: Optional parent NodeId to attach this node to.
///
/// Returns the NodeId of the newly added node.
pub fn add_node(&mut self, data: T, span: Span, parent: Option<NodeId>)
-> NodeId
{
let id = self.nodes.len();
self.nodes.push(NodeData { data, span });
self.parents.push(parent);
id
}
/// Joins another AST into this one, returning a mapping from old node IDs
/// in `other` to new node IDs in `self`.
///
/// Optionally attaches all root nodes of the other AST to a parent node
/// in the current AST.
pub fn join(&mut self, other: Ast<T>, attach_to: Option<NodeId>)
-> Vec<NodeId>
{
let base_id = self.nodes.len();
let mut id_map = Vec::with_capacity(other.nodes.len());
for (i, node) in other.nodes.into_iter().enumerate()
{
self.nodes.push(node);
let new_parent = match other.parents[i]
{
Some(pid) => Some(base_id + pid),
None => attach_to // attach root nodes to given parent if provided
};
self.parents.push(new_parent);
id_map.push(base_id + i);
}
id_map
}
/// Prunes the subtree rooted at `root`, compacting the AST in place.
/// Node IDs will change after this operation.
pub fn prune(&mut self, root: NodeId)
{
let mut to_remove = Vec::new();
collect_descendants(root, &self.parents, &mut to_remove);
to_remove.push(root);
let mut is_removed = vec![false; self.nodes.len()];
for &id in &to_remove
{
is_removed[id] = true;
}
let mut remap = vec![None; self.nodes.len()];
let mut next_insert = 0;
for i in 0..self.nodes.len()
{
if !is_removed[i]
{
if i != next_insert
{
self.nodes.swap(i, next_insert);
self.parents.swap(i, next_insert);
}
remap[i] = Some(next_insert);
next_insert += 1;
}
}
self.nodes.truncate(next_insert);
self.parents.truncate(next_insert);
for parent in self.parents.iter_mut()
{
if let Some(pid) = *parent
{
*parent = remap[pid];
}
}
}
/// Optimizes the AST layout for a specific traversal order (DFS or BFS).
///
/// This consumes the `Ast`, rearranges the internal storage so that
/// iterating over the nodes reflects the chosen traversal order, and
/// returns a new, immutable `OptimizedAst<T>`.
///
/// No need for `T: Clone` anymore, since we will move data instead of
/// cloning it.
pub fn optimize(self, order: TraversalOrder) -> OptimizedAst<T>
{
let ordering = match order
{
TraversalOrder::DFS => dfs_order(&self.parents),
TraversalOrder::BFS => bfs_order(&self.parents)
};
let mut remap = vec![0; self.nodes.len()];
for (new_id, &old_id) in ordering.iter().enumerate()
{
remap[old_id] = new_id;
}
// Wrap nodes in Option to allow taking them by value without cloning
let mut nodes_opt: Vec<Option<NodeData<T>>> =
self.nodes.into_iter().map(Some).collect();
let mut new_nodes = Vec::with_capacity(nodes_opt.len());
let mut new_parents = vec![None; self.parents.len()];
for &old_id in &ordering
{
let new_id = remap[old_id];
let node = nodes_opt[old_id].take()
.expect("Node was already moved out");
let parent = self.parents[old_id].map(|pid| remap[pid]);
new_nodes.push(node);
new_parents[new_id] = parent;
}
OptimizedAst { nodes: new_nodes,
parents: new_parents,
order }
}
}
/// Helper to recursively collect all descendants of a node.
fn collect_descendants(root: NodeId, parents: &[Option<NodeId>],
acc: &mut Vec<NodeId>)
{
for (i, &parent) in parents.iter().enumerate()
{
if parent == Some(root)
{
collect_descendants(i, parents, acc);
acc.push(i);
}
}
}
/// Recursively visits nodes in a depth-first (pre-order) manner starting from
/// `current`, building up the DFS traversal order.
///
/// - `current`: The current node ID being visited.
/// - `parents`: A slice representing the parent relationship for each node
/// (index = child, value = optional parent).
/// - `order`: A mutable vector that will accumulate the DFS traversal order.
/// - `visited`: A mutable slice used to track which nodes have already been
/// visited.
fn visit(current: NodeId, parents: &[Option<NodeId>], order: &mut Vec<NodeId>,
visited: &mut [bool])
{
// Skip this node if it's already been visited
if visited[current]
{
return;
}
// Mark the node as visited to avoid reprocessing it
visited[current] = true;
// Add this node to the traversal order (pre-order)
order.push(current);
// Recursively visit all child nodes (i.e., nodes that list `current` as
// their parent)
for (i, &parent) in parents.iter().enumerate()
{
if parent == Some(current)
{
visit(i, parents, order, visited);
}
}
}
/// Computes the DFS (depth-first, pre-order) ordering of nodes based on the
/// parent table.
///
/// Returns a Vec<NodeId> containing the node IDs in DFS order.
fn dfs_order(parents: &[Option<NodeId>]) -> Vec<NodeId>
{
let mut order = Vec::new();
let mut visited = vec![false; parents.len()];
// Start DFS from all root nodes (nodes with no parent).
for (i, &parent) in parents.iter().enumerate()
{
if parent.is_none()
{
visit(i, parents, &mut order, &mut visited);
}
}
order
}
/// Computes the BFS (breadth-first) ordering of nodes based on the parent
/// table.
///
/// Returns a Vec<NodeId> containing the node IDs in BFS order.
fn bfs_order(parents: &[Option<NodeId>]) -> Vec<NodeId>
{
let mut order = Vec::new();
let mut visited = vec![false; parents.len()];
let mut queue = VecDeque::new();
// Start BFS from all root nodes.
for (i, &parent) in parents.iter().enumerate()
{
if parent.is_none()
{
queue.push_back(i);
}
}
while let Some(current) = queue.pop_front()
{
if visited[current]
{
continue;
}
visited[current] = true;
order.push(current);
for (i, &parent) in parents.iter().enumerate()
{
if parent == Some(current)
{
queue.push_back(i);
}
}
}
order
}
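Usage note (not part of the diff): a small sketch of the `Ast` API above, building a few nodes, pruning a subtree, and flattening the result into DFS order. The `MyNode` payload type is invented for the example.

```rust
use rune::{Ast, Span, TraversalOrder};

// A hypothetical payload type for the example.
enum MyNode
{
   Document,
   Heading,
   Text
}

fn example()
{
   let mut ast = Ast::new();

   // Nodes are appended in creation order and linked through parent IDs.
   let root = ast.add_node(MyNode::Document, Span::default(), None);
   let heading = ast.add_node(MyNode::Heading, Span::default(), Some(root));
   let _text = ast.add_node(MyNode::Text, Span::default(), Some(heading));

   // Drop the heading subtree; remaining node IDs are compacted and remapped.
   ast.prune(heading);

   // Freeze the tree into a cache-friendly, DFS-ordered layout.
   let optimized = ast.optimize(TraversalOrder::DFS);
   assert_eq!(optimized.order, TraversalOrder::DFS);
   assert_eq!(optimized.nodes.len(), 1); // only the Document root remains
}
```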

@@ -1,4 +1,6 @@
-use std::{error::Error, path::PathBuf};
+use std::error::Error;
+use std::path::PathBuf;

 use super::position::Span;

@@ -12,116 +14,114 @@ use super::position::Span;
(This hunk reformats the `LexerError` implementation from K&R to Allman brace style; the content is otherwise unchanged. It is shown below as it reads after the change.)

/// It is designed to provide detailed diagnostics for file-based or
/// in-memory parsing and is compatible with error reporting ecosystems.
#[derive(Debug)]
pub struct LexerError
{
   /// A human-readable error message.
   pub message: String,

   /// The span where the error occurred.
   pub span: Span,

   /// The file that the error occurred in, if known.
   pub file: Option<PathBuf>,

   /// The source snippet related to the error, if known.
   pub snippet: Option<String>,

   /// An optional underlying error that caused this one.
   pub source: Option<Box<dyn Error>>
}

impl LexerError
{
   /// Creates a new `LexerError` with a message, span, and optional context.
   ///
   /// # Parameters
   /// - `message`: A human-readable explanation of the error.
   /// - `span`: The region in the source where the error occurred.
   /// - `file`: An optional path to the file in which the error occurred.
   /// - `snippet`: An optional problematic input string.
   ///
   /// # Returns
   /// A new instance of `LexerError`.
   pub fn new<S, T>(message: S, span: Span, file: Option<T>,
                    snippet: Option<S>)
      -> Self
      where S: Into<String>,
            T: Into<PathBuf>
   {
      LexerError { message: message.into(),
                   span,
                   file: file.map(Into::into),
                   snippet: snippet.map(Into::into),
                   source: None }
   }

   /// Creates a `LexerError` from only a message and span.
   ///
   /// This is useful when file or snippet context is not available.
   pub fn from_message<S>(message: S, span: Span) -> Self
      where S: Into<String>
   {
      Self::new(message, span, None::<PathBuf>, None::<S>)
   }

   /// Attaches a snippet of the offending source code.
   ///
   /// This is helpful for diagnostics and tooling output.
   pub fn with_snippet<S>(mut self, snippet: S) -> Self
      where S: Into<String>
   {
      self.snippet = Some(snippet.into());
      self
   }

   /// Attaches the path of the file in which the error occurred.
   pub fn with_file<T>(mut self, file: T) -> Self
      where T: Into<PathBuf>
   {
      self.file = Some(file.into());
      self
   }

   /// Wraps a source error that caused this `LexerError`.
   ///
   /// This allows you to chain errors for more detailed diagnostics.
   pub fn with_source<E>(mut self, err: E) -> Self
      where E: Error + 'static
   {
      self.source = Some(Box::new(err));
      self
   }
}

impl std::fmt::Display for LexerError
{
   fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result
   {
      write!(f, "Lexer error at {}", self.span)?;

      if let Some(file) = &self.file
      {
         write!(f, " in file `{}`", file.display())?;
      }

      write!(f, ": {}", self.message)?;

      if let Some(snippet) = &self.snippet
      {
         write!(f, "\n --> Snippet: `{}`", snippet)?;
      }

      Ok(())
   }
}

impl Error for LexerError
{
   /// Returns the underlying cause of this error, if any.
   fn source(&self) -> Option<&(dyn Error + 'static)>
   {
      self.source.as_ref().map(|e| e.as_ref())
   }
}
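Usage note (not part of the diff): a sketch of how the builder-style constructors compose; the message, file, and snippet values are made up.

```rust
use rune::{LexerError, Span};

fn report() -> Result<(), LexerError>
{
   // Attach optional context fluently with the builder-style methods.
   let err = LexerError::from_message("unexpected character", Span::default())
      .with_file("notes.md")
      .with_snippet("let x = @;");

   // Display prints the span, file, and snippet attached above.
   println!("{}", err);

   Err(err)
}
```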

src/iter.rs (new file)

@@ -0,0 +1,209 @@
//! An iterator adapter for arbitrary lookahead functionality.
//!
//! This module provides [`Lookahead`], an adapter for any iterator that allows
//! you to peek ahead by any number of elements, without consuming them.
//!
//! ## Example
//! ```
//! use rune::LookaheadExt;
//!
//! let mut it = vec![10, 20, 30].into_iter().lookahead();
//!
//! assert_eq!(it.peek(0), Some(&10));
//! assert_eq!(it.peek(1), Some(&20));
//! assert_eq!(it.next(), Some(10));
//! assert_eq!(it.peek(0), Some(&20));
//! ```
use std::collections::VecDeque;
use std::fmt;
use std::iter::{Fuse, FusedIterator};
/// An iterator adapter that allows arbitrary lookahead peeking.
///
/// This struct wraps an iterator and buffers items so that any future
/// item can be accessed by index without consuming them. Similar to
/// [`std::iter::Peekable`], but supports peeking any number of steps ahead.
pub struct Lookahead<I>
where I: Iterator
{
iter: Fuse<I>,
buffer: VecDeque<I::Item>
}
impl<I> Lookahead<I> where I: Iterator
{
/// Creates a new [`Lookahead`] from the given iterator.
///
/// This constructor is typically used indirectly via the
/// [`LookaheadExt::lookahead()`] method or [`lookahead()`] free function.
#[must_use]
pub fn new(iter: I) -> Self
{
Lookahead { iter: iter.fuse(),
buffer: VecDeque::new() }
}
/// Returns a reference to the `n`th upcoming item, if it exists.
///
/// `peek(0)` is the same as peeking at the next item.
///
/// This does **not consume** any items from the iterator.
///
/// # Examples
/// ```
/// use rune::LookaheadExt;
///
/// let mut it = vec![1, 2, 3].into_iter().lookahead();
///
/// assert_eq!(it.peek(1), Some(&2));
/// assert_eq!(it.next(), Some(1));
/// ```
pub fn peek(&mut self, n: usize) -> Option<&I::Item>
{
while self.buffer.len() <= n
{
if let Some(item) = self.iter.next()
{
self.buffer.push_back(item);
}
else
{
break;
}
}
self.buffer.get(n)
}
/// Returns a mutable reference to the `n`th upcoming item, if it exists.
///
/// This allows in-place modification of peeked items before consumption.
///
/// # Examples
/// ```
/// use rune::LookaheadExt;
///
/// let mut it = vec![1, 2, 3].into_iter().lookahead();
/// if let Some(x) = it.peek_mut(1)
/// {
/// *x *= 10;
/// }
/// assert_eq!(it.next(), Some(1));
/// assert_eq!(it.next(), Some(20));
/// ```
pub fn peek_mut(&mut self, n: usize) -> Option<&mut I::Item>
{
while self.buffer.len() <= n
{
if let Some(item) = self.iter.next()
{
self.buffer.push_back(item);
}
else
{
break;
}
}
self.buffer.get_mut(n)
}
}
impl<I> Iterator for Lookahead<I> where I: Iterator
{
type Item = I::Item;
/// Retrieves the next item, consuming it.
///
/// If any items were previously peeked and buffered, they are returned
/// first before accessing the underlying iterator.
fn next(&mut self) -> Option<Self::Item>
{
if let Some(front) = self.buffer.pop_front()
{
Some(front)
}
else
{
self.iter.next()
}
}
/// Provides a size hint accounting for both buffered and remaining elements.
fn size_hint(&self) -> (usize, Option<usize>)
{
let (low, high) = self.iter.size_hint();
let buffered = self.buffer.len();
(low.saturating_add(buffered), high.and_then(|h| h.checked_add(buffered)))
}
}
impl<I> Clone for Lookahead<I>
where I: Iterator + Clone,
I::Item: Clone
{
fn clone(&self) -> Self
{
Lookahead { iter: self.iter.clone(),
buffer: self.buffer.clone() }
}
}
impl<I> fmt::Debug for Lookahead<I>
where I: Iterator + fmt::Debug,
I::Item: fmt::Debug
{
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result
{
f.debug_struct("Lookahead")
.field("iter", &self.iter)
.field("buffer", &self.buffer)
.finish()
}
}
impl<I> FusedIterator for Lookahead<I> where I: Iterator + FusedIterator {}
/// Extension trait to provide `.lookahead()` on all iterators.
///
/// This lets you easily call `.lookahead()` on any iterator to
/// create a [`Lookahead`] instance.
pub trait LookaheadExt: Iterator + Sized
{
/// Wraps the iterator in a [`Lookahead`] adapter.
fn lookahead(self) -> Lookahead<Self>;
}
impl<I: Iterator> LookaheadExt for I
{
fn lookahead(self) -> Lookahead<Self>
{
Lookahead::new(self)
}
}
/// Creates a [`Lookahead`] from any iterable.
///
/// This is a convenience function for use in functional-style code or
/// when not using the extension trait.
///
/// # Example
/// ```
/// use rune::lookahead;
///
/// let mut it = lookahead(vec![1, 2, 3]);
///
/// assert_eq!(it.peek(2), Some(&3));
/// ```
pub fn lookahead<I>(iterable: I) -> Lookahead<I::IntoIter>
where I: IntoIterator
{
Lookahead::new(iterable.into_iter())
}
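Usage note (not part of the diff): the two-token lookahead mentioned in the commit message might look like the following sketch; the token values are invented for illustration.

```rust
use rune::LookaheadExt;

fn main()
{
   // A parser deciding between "**" (strong) and "*" (emphasis) only needs to
   // peek one or two items ahead without consuming anything.
   let mut tokens = vec!["*", "*", "bold"].into_iter().lookahead();

   let first_is_star = tokens.peek(0).map(|t| *t == "*").unwrap_or(false);
   let second_is_star = tokens.peek(1).map(|t| *t == "*").unwrap_or(false);
   assert!(first_is_star && second_is_star);

   // Peeking consumed nothing; the parser still advances explicitly.
   assert_eq!(tokens.next(), Some("*"));
   assert_eq!(tokens.next(), Some("*"));
   assert_eq!(tokens.next(), Some("bold"));
}
```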

(In this file, the doc comments were rewritten and expanded, explanatory comments were added, the "Lexigraphical Analysis" typo in the read-error message became "Lexical Analysis", and scan_text gained the trailing-newline fix. The hunks below are shown as they read after the change.)

@@ -7,24 +7,25 @@ use super::token::{TokenStream, TokenType};
/// The size of data chunks to read from a file. This is an arbitrary choice,
/// set to 1MB.
const BUFFER_SIZE: usize = 1024 * 1024;

/// The `Lexer` struct is responsible for performing lexical analysis
/// (tokenization) on text.
///
/// It processes input from a file or string character-by-character and
/// generates a stream of tokens, such as text, numbers, whitespace, symbols,
/// and newlines. These tokens are accumulated into a `TokenStream`, which is a
/// flat, cache-friendly data structure designed for efficient iteration.
///
/// After the base tokens are generated, the `Lexer` allows for transformation
/// of these tokens into richer, domain-specific types via a user-provided
/// `transform` function. This transformation can be used to convert base tokens
/// into specific elements of a Markdown syntax tree, custom DSL tokens, or any
/// other custom format you need.
///
/// # Example
///

@@ -38,32 +39,51 @@ const BUFFER_SIZE: usize = 1024 * 1024;
///
/// let tokens = Lexer::scan_text("Runes += 42", transform).unwrap();
///
/// for token in &tokens
/// {
///    println!("{}", token);
/// }
/// ```
///
/// # Design Considerations
/// - Utilizes a flat `TokenStream` to improve performance and reduce heap
///   overhead.
/// - Consolidates contiguous characters into compound tokens (e.g., multi-digit
///   numbers).
/// - Extensible via the `transform` function, enabling the creation of
///   higher-level constructs, like Markdown elements or syntax trees for a
///   custom DSL.
///
/// # Error Handling
/// The lexer will return a `LexerError` if the input file cannot be opened or
/// read. Errors include issues such as missing files, read failures, or invalid
/// input formats.
pub enum Lexer {}

impl Lexer
{
   /// Scans a file and generates a vector of transformed tokens based on the
   /// provided `transform` function.
   ///
   /// This method opens a file from the given `path`, reads the file line by
   /// line, and converts the input into a stream of tokens. The tokens are
   /// then passed to the `transform` function, which allows users to map
   /// base tokens into domain-specific types.
   ///
   /// # Parameters
   /// - `path`: A path to the file to be lexically analyzed.
   /// - `transform`: A function that takes a `TokenStream<TokenType>` and
   ///   transforms it into a `TokenStream<T>` where `T` is a domain-specific
   ///   type.
   ///
   /// # Returns
   /// A `Result<TokenStream<T>, LexerError>` where `T` is the transformed token
   /// type, or an error.
   ///
   /// # Errors
   /// Returns a `LexerError` if the file cannot be opened or read.
   pub fn scan_file<P, F, T>(path: P, transform: F)
      -> Result<TokenStream<T>, LexerError>
      where P: AsRef<std::path::Path>,

@@ -82,6 +102,7 @@ impl Lexer
      let reader = BufReader::with_capacity(BUFFER_SIZE, input_file);

      // Read the file line by line.
      for line in reader.lines()
      {
         match line

@@ -93,7 +114,7 @@ impl Lexer
            Err(_) =>
            {
               return Err(LexerError::new("Unable to read line during \
                                           Lexical Analysis.",
                                          Span::default(),
                                          Some(path.as_ref()
                                                   .to_string_lossy()

@@ -102,10 +123,10 @@ impl Lexer
            }
         }

         // Add the newline token after each line.
         stream.push("\n".to_string(),
                     TokenType::Newline,
                     Span::with_single(cursor));

         cursor.line += 1;
         cursor.column = 0;
      }

@@ -113,7 +134,22 @@ impl Lexer
      Ok(transform(&stream))
   }

   /// Scans a full in-memory string and produces transformed tokens.
   ///
   /// This method tokenizes the input string `text` and returns the transformed
   /// tokens using the provided `transform` function. It's a convenient way
   /// to perform lexical analysis on in-memory strings without needing to
   /// read from a file.
   ///
   /// # Parameters
   /// - `text`: A string slice representing the in-memory input text to
   ///   analyze.
   /// - `transform`: A function that transforms the base tokens into
   ///   domain-specific types.
   ///
   /// # Returns
   /// A `Result<TokenStream<T>, LexerError>` where `T` is the transformed token
   /// type, or an error.
   pub fn scan_text<F, T>(text: &str, transform: F)
      -> Result<TokenStream<T>, LexerError>
      where F: FnOnce(&TokenStream<TokenType>) -> TokenStream<T>

@@ -121,30 +157,51 @@ impl Lexer
      let mut cursor = Position::default();
      let mut stream = TokenStream::new();

      // Process each line in the input string.
      for line in text.lines()
      {
         Self::scan(line, &mut stream, &mut cursor);

         // Add the newline token after each line.
         stream.push("\n".to_string(),
                     TokenType::Newline,
                     Span::with_single(cursor));

         cursor.line += 1;
         cursor.column = 0;
      }

      // Remove the last newline character if the text did not end with a
      // newline.
      if !text.ends_with('\n')
      {
         stream.pop();
      }

      Ok(transform(&stream))
   }

   /// Internal method that scans a single line of text into tokens.
   ///
   /// This method processes each character of a line and generates the
   /// corresponding token. It handles cases like numeric tokens, text
   /// tokens, symbols, and whitespace.
   ///
   /// # Parameters
   /// - `line`: A line of text to be lexically analyzed.
   /// - `stream`: A mutable reference to the token stream where the generated
   ///   tokens will be pushed.
   /// - `cursor`: A mutable reference to the cursor position, which tracks the
   ///   current position in the input.
   fn scan(line: &str, stream: &mut TokenStream<TokenType>,
           cursor: &mut Position)
   {
      for c in line.chars()
      {
         // Get the token type based on the character.
         let variant = get_token_type(c);
         let last = stream.len().saturating_sub(1);

         // Handle token merging for contiguous tokens like numbers or text.
         if !stream.is_empty() &&
            variant == stream.variants[last] &&
            (variant == TokenType::Numeric || variant == TokenType::Text)

@@ -154,6 +211,7 @@ impl Lexer
         }
         else
         {
            // Add a new token to the stream.
            stream.push(c.to_string(), variant, Span::with_single(*cursor));
         }

@@ -164,6 +222,18 @@ impl Lexer

/// Determines the type of a token based on the current character.
///
/// This helper function is responsible for identifying whether the current
/// character is part of a known token type such as numeric, text, whitespace,
/// or symbol.
///
/// # Parameters
/// - `curr_char`: The current character to analyze.
///
/// # Returns
/// A `TokenType` corresponding to the character type (e.g., `Numeric`, `Text`,
/// `Whitespace`, etc.).
fn get_token_type(curr_char: char) -> TokenType
{
   match curr_char
@@ -7,14 +7,18 @@
 pub mod library;

+mod ast;
 mod error;
+mod iter;
 mod lexer;
 mod position;
 mod token;

+pub use crate::ast::*;
 pub use crate::error::*;
+pub use crate::iter::*;
 pub use crate::lexer::*;
 pub use crate::position::*;
 pub use crate::token::*;

tests/iter.rs (new file)

@@ -0,0 +1,43 @@
use rune::*;
#[test]
fn peek_works()
{
let mut it = vec![1, 2, 3].into_iter().lookahead();
assert_eq!(it.peek(0), Some(&1));
assert_eq!(it.peek(1), Some(&2));
assert_eq!(it.peek(2), Some(&3));
assert_eq!(it.peek(3), None);
}
#[test]
fn peek_mut_modifies_item()
{
let mut it = vec![10, 20, 30].into_iter().lookahead();
if let Some(x) = it.peek_mut(1)
{
*x += 100;
}
assert_eq!(it.next(), Some(10));
assert_eq!(it.next(), Some(120));
}
#[test]
fn iterates_correctly()
{
let mut it = vec![1, 2, 3].into_iter().lookahead();
assert_eq!(it.next(), Some(1));
assert_eq!(it.next(), Some(2));
assert_eq!(it.next(), Some(3));
assert_eq!(it.next(), None);
}
#[test]
fn size_hint_accounts_for_buffer()
{
let mut it = vec![1, 2, 3, 4].into_iter().lookahead();
it.peek(2);
let (low, high) = it.size_hint();
assert!(low >= 4);
assert_eq!(high, Some(4));
}

@@ -53,45 +53,47 @@ fn cleanup_temp_file(path: &PathBuf)
 #[test]
-fn test_basic_lexing()
+fn basic_lexing()
 {
    let tokens =
       Lexer::scan_text("magic runes", dummy_transform).expect("Lexer should \
                                                                succeed");
-   let tokens = tokens.into_iter().map(|t| { (*t.variant, String::from(t.lexeme))}).collect::<Vec<_>>();
+   let tokens = tokens.into_iter()
+                      .map(|t| (*t.variant, String::from(t.lexeme)))
+                      .collect::<Vec<_>>();

    let expected = vec![(TokenType::Text, "magic".to_string()),
                        (TokenType::Whitespace, " ".to_string()),
-                       (TokenType::Text, "runes".to_string()),
-                       (TokenType::Newline, "\n".to_string()),];
+                       (TokenType::Text, "runes".to_string()),];

    assert_eq!(tokens, expected);
 }

 #[test]
-fn test_symbols_and_numbers()
+fn symbols_and_numbers()
 {
    let tokens =
       Lexer::scan_text("13 + 37", dummy_transform).expect("Lexer should \
                                                            succeed");
-   let tokens = tokens.into_iter().map(|t| { (*t.variant, String::from(t.lexeme))}).collect::<Vec<_>>();
+   let tokens = tokens.into_iter()
+                      .map(|t| (*t.variant, String::from(t.lexeme)))
+                      .collect::<Vec<_>>();

    let expected = vec![(TokenType::Numeric, "13".into()),
                        (TokenType::Whitespace, " ".into()),
                        (TokenType::Symbol, "+".into()),
                        (TokenType::Whitespace, " ".into()),
-                       (TokenType::Numeric, "37".into()),
-                       (TokenType::Newline, "\n".into()),];
+                       (TokenType::Numeric, "37".into()),];

    assert_eq!(tokens, expected);
 }

 #[test]
-fn test_lexer_with_cases()
+fn lexer_with_cases()
 {
    let cases = vec![TestCase { name: "simple_words",
                                input: "magic rune",

@@ -129,16 +131,16 @@ fn test_lexer_with_cases()
                                                 on case '{}'",
                                                case.name));

-      let result = result.into_iter().map(|t| { (*t.variant, String::from(t.lexeme))}).collect::<Vec<_>>();
+      let result = result.into_iter()
+                         .map(|t| (*t.variant, String::from(t.lexeme)))
+                         .collect::<Vec<_>>();

       let expected = case.expected
                          .iter()
                          .map(|(ty, s)| (*ty, s.to_string()))
                          .collect::<Vec<_>>();

-      assert_eq!(result, expected,
-                 "Mismatch in test case '{}'",
-                 case.name);
+      assert_eq!(result, expected, "Mismatch in test case '{}'", case.name);

       cleanup_temp_file(&path);
    }