Just some basic updating and cleaning up.

- Added comments.
- Ran cargo fmt.
- Updated the versioning.
Myrddin Dundragon 2025-04-22 02:18:12 -04:00
parent cd50b53be5
commit f5780f50c2
7 changed files with 330 additions and 237 deletions

Cargo.lock (generated)

@@ -4,4 +4,4 @@ version = 4
 
 [[package]]
 name = "rune"
-version = "0.2.0"
+version = "0.3.0"

Cargo.toml

@@ -1,6 +1,6 @@
 [package]
 name = "rune"
-version = "0.2.0"
+version = "0.3.0"
 edition = "2021"
 description = "A lexical analysis library."
 repository = "/myrddin/rune"


@@ -31,5 +31,5 @@ Then add this to your Cargo.toml file.
 ```toml
 [dependencies]
-rune = { version = "0.2.0", registry = "cybermages" }
+rune = { version = "0.3.0", registry = "cybermages" }
 ```
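For orientation, the 0.3.0 crate bumped above is driven by handing `Lexer::scan_text` (or `Lexer::scan_file`) a transform callback, exactly as the doc example and the Markdown example further down this page do. Here is a minimal, self-contained sketch of that pattern; the `MyToken` enum and `my_transform` function are illustrative names of my own, not part of this commit, and the token-view fields (`variant`, `lexeme`, `span`) are assumed to behave as they do in the example code shown in this diff.

```rust
use rune::{Lexer, TokenStream, TokenType};

// A toy domain-specific token kind, standing in for something richer
// like the MarkdownTokenType defined in the example file below.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum MyToken
{
    Word,
    Number,
    Other
}

impl std::fmt::Display for MyToken
{
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result
    {
        write!(f, "{:?}", self)
    }
}

// Maps each base token to a MyToken while keeping its lexeme and span.
fn my_transform(input: &TokenStream<TokenType>) -> TokenStream<MyToken>
{
    let mut output = TokenStream::new();
    let mut i = 0;

    while i < input.len()
    {
        let token = input.get(i).unwrap(); // safe due to bounds check above
        let kind = match token.variant
        {
            TokenType::Text => MyToken::Word,
            TokenType::Numeric => MyToken::Number,
            _ => MyToken::Other
        };

        output.push(token.lexeme.to_string(), kind, token.span.clone());
        i += 1;
    }

    output
}

fn main()
{
    let tokens = Lexer::scan_text("Runes += 42", my_transform)
        .expect("Lexer should succeed");

    for token in &tokens
    {
        println!("{}", token);
    }
}
```

The Markdown example further down follows the same shape, just with a larger match that recognizes headings, emphasis, and code spans.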


@@ -5,7 +5,8 @@ use rune::{Lexer, Span, TokenStream, TokenType};
 #[derive(Debug, Clone, PartialEq, Eq)]
-pub enum MarkdownTokenType {
+pub enum MarkdownTokenType
+{
     Heading(u8),
     EmphasisStart,
     EmphasisEnd,
@@ -16,13 +17,16 @@ pub enum MarkdownTokenType {
     Symbol,
     Whitespace,
     Newline,
-    Unknown,
+    Unknown
 }
 
-impl std::fmt::Display for MarkdownTokenType {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        match self {
+impl std::fmt::Display for MarkdownTokenType
+{
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result
+    {
+        match self
+        {
             MarkdownTokenType::Heading(level) => write!(f, "Heading({})", level),
             MarkdownTokenType::EmphasisStart => write!(f, "EmphasisStart"),
             MarkdownTokenType::EmphasisEnd => write!(f, "EmphasisEnd"),
@@ -33,7 +37,7 @@ impl std::fmt::Display for MarkdownTokenType {
             MarkdownTokenType::Symbol => write!(f, "Symbol"),
             MarkdownTokenType::Whitespace => write!(f, "Whitespace"),
             MarkdownTokenType::Newline => write!(f, "Newline"),
-            MarkdownTokenType::Unknown => write!(f, "Unknown"),
+            MarkdownTokenType::Unknown => write!(f, "Unknown")
         }
     }
 }
@@ -41,26 +45,35 @@ impl std::fmt::Display for MarkdownTokenType {
 // Define how you want to interpret base tokens
-pub fn transform(input: &TokenStream<TokenType>) -> TokenStream<MarkdownTokenType> {
+pub fn transform(input: &TokenStream<TokenType>)
+    -> TokenStream<MarkdownTokenType>
+{
     let mut output = TokenStream::new();
     let mut i = 0;
-    while i < input.len() {
+    while i < input.len()
+    {
         let token = input.get(i).unwrap(); // safe due to bounds check above
-        match token.variant {
-            TokenType::Symbol if token.lexeme == "#" => {
+        match token.variant
+        {
+            TokenType::Symbol if token.lexeme == "#" =>
+            {
                 // Count consecutive #s for heading level
                 let mut level = 1;
                 let mut span = token.span.clone();
-                while i + 1 < input.len() {
+                while i + 1 < input.len()
+                {
                     let next = input.get(i + 1).unwrap();
-                    if *next.variant == TokenType::Symbol && next.lexeme == "#" {
+                    if *next.variant == TokenType::Symbol && next.lexeme == "#"
+                    {
                         level += 1;
                         span.end = next.span.end;
                         i += 1;
-                    } else {
+                    }
+                    else
+                    {
                         break;
                     }
                 }
@@ -70,58 +83,71 @@ pub fn transform(input: &TokenStream<TokenType>) -> TokenStream<MarkdownTokenType> {
                             span);
             }
-            TokenType::Symbol if token.lexeme == "*" => {
+            TokenType::Symbol if token.lexeme == "*" =>
+            {
                 // Look ahead to see if it's strong (**) or emphasis (*)
-                if i + 1 < input.len() {
+                if i + 1 < input.len()
+                {
                     let next = input.get(i + 1).unwrap();
-                    if *next.variant == TokenType::Symbol && next.lexeme == "*" {
+                    if *next.variant == TokenType::Symbol && next.lexeme == "*"
+                    {
                         output.push("**".to_string(),
                                     MarkdownTokenType::StrongStart,
                                     Span::merge(*token.span, *next.span));
                         i += 1; // skip the second '*'
-                    } else {
+                    }
+                    else
+                    {
                         output.push("*".to_string(),
                                     MarkdownTokenType::EmphasisStart,
                                     token.span.clone());
                     }
-                } else {
+                }
+                else
+                {
                     output.push("*".to_string(),
                                 MarkdownTokenType::EmphasisStart,
                                 token.span.clone());
                 }
             }
-            TokenType::Symbol if token.lexeme == "`" => {
+            TokenType::Symbol if token.lexeme == "`" =>
+            {
                 output.push(token.lexeme.to_string(),
                             MarkdownTokenType::CodeSpan,
                             token.span.clone());
             }
-            TokenType::Text => {
+            TokenType::Text =>
+            {
                 output.push(token.lexeme.to_string(),
                             MarkdownTokenType::Text,
                             token.span.clone());
             }
-            TokenType::Symbol => {
+            TokenType::Symbol =>
+            {
                 output.push(token.lexeme.to_string(),
                             MarkdownTokenType::Symbol,
                             token.span.clone());
             }
-            TokenType::Whitespace => {
+            TokenType::Whitespace =>
+            {
                 output.push(token.lexeme.to_string(),
                             MarkdownTokenType::Whitespace,
                             token.span.clone());
             }
-            TokenType::Newline => {
+            TokenType::Newline =>
+            {
                 output.push(token.lexeme.to_string(),
                             MarkdownTokenType::Newline,
                             token.span.clone());
             }
-            _ => {
+            _ =>
+            {
                 output.push(token.lexeme.to_string(),
                             MarkdownTokenType::Unknown,
                             token.span.clone());


@@ -1,4 +1,6 @@
-use std::{error::Error, path::PathBuf};
+use std::error::Error;
+use std::path::PathBuf;
 
 use super::position::Span;
@@ -12,7 +14,8 @@ use super::position::Span;
 /// It is designed to provide detailed diagnostics for file-based or
 /// in-memory parsing and is compatible with error reporting ecosystems.
 #[derive(Debug)]
-pub struct LexerError {
+pub struct LexerError
+{
     /// A human-readable error message.
     pub message: String,
@@ -26,10 +29,11 @@ pub struct LexerError {
     pub snippet: Option<String>,
 
     /// An optional underlying error that caused this one.
-    pub source: Option<Box<dyn Error>>,
+    pub source: Option<Box<dyn Error>>
 }
 
-impl LexerError {
+impl LexerError
+{
     /// Creates a new `LexerError` with a message, span, and optional context.
     ///
     /// # Parameters
@@ -40,31 +44,24 @@ impl LexerError {
     ///
     /// # Returns
     /// A new instance of `LexerError`.
-    pub fn new<S, T>(
-        message: S,
-        span: Span,
-        file: Option<T>,
-        snippet: Option<S>,
-    ) -> Self
-    where
-        S: Into<String>,
-        T: Into<PathBuf>,
+    pub fn new<S, T>(message: S, span: Span, file: Option<T>,
+                     snippet: Option<S>)
+        -> Self
+        where S: Into<String>,
+              T: Into<PathBuf>
     {
-        LexerError {
-            message: message.into(),
+        LexerError { message: message.into(),
                      span,
                      file: file.map(Into::into),
                      snippet: snippet.map(Into::into),
-            source: None,
-        }
+                     source: None }
     }
 
     /// Creates a `LexerError` from only a message and span.
     ///
     /// This is useful when file or snippet context is not available.
     pub fn from_message<S>(message: S, span: Span) -> Self
-    where
-        S: Into<String>,
+        where S: Into<String>
     {
         Self::new(message, span, None::<PathBuf>, None::<S>)
     }
@@ -73,8 +70,7 @@ impl LexerError {
     ///
     /// This is helpful for diagnostics and tooling output.
     pub fn with_snippet<S>(mut self, snippet: S) -> Self
-    where
-        S: Into<String>,
+        where S: Into<String>
    {
         self.snippet = Some(snippet.into());
         self
@@ -82,8 +78,7 @@ impl LexerError {
     /// Attaches the path of the file in which the error occurred.
     pub fn with_file<T>(mut self, file: T) -> Self
-    where
-        T: Into<PathBuf>,
+        where T: Into<PathBuf>
     {
         self.file = Some(file.into());
         self
@@ -93,25 +88,28 @@ impl LexerError {
     ///
     /// This allows you to chain errors for more detailed diagnostics.
     pub fn with_source<E>(mut self, err: E) -> Self
-    where
-        E: Error + 'static,
+        where E: Error + 'static
     {
         self.source = Some(Box::new(err));
         self
     }
 }
 
-impl std::fmt::Display for LexerError {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+impl std::fmt::Display for LexerError
+{
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result
+    {
         write!(f, "Lexer error at {}", self.span)?;
 
-        if let Some(file) = &self.file {
+        if let Some(file) = &self.file
+        {
            write!(f, " in file `{}`", file.display())?;
         }
 
         write!(f, ": {}", self.message)?;
 
-        if let Some(snippet) = &self.snippet {
+        if let Some(snippet) = &self.snippet
+        {
            write!(f, "\n --> Snippet: `{}`", snippet)?;
         }
@@ -119,9 +117,11 @@ impl std::fmt::Display for LexerError {
     }
 }
 
-impl Error for LexerError {
+impl Error for LexerError
+{
     /// Returns the underlying cause of this error, if any.
-    fn source(&self) -> Option<&(dyn Error + 'static)> {
+    fn source(&self) -> Option<&(dyn Error + 'static)>
+    {
         self.source.as_ref().map(|e| e.as_ref())
     }
 }


@@ -7,24 +7,25 @@ use super::token::{TokenStream, TokenType};
-/// The size of data chunks to read from a file. This was arbitrarily chosen to
-/// be 1mb.
+/// The size of data chunks to read from a file. This is an arbitrary choice,
+/// set to 1MB.
 const BUFFER_SIZE: usize = 1024 * 1024;
 
-/// The `Lexer` is the core component responsible for performing
-/// lexicographical analysis on a text file.
+/// The `Lexer` struct is responsible for performing lexical analysis
+/// (tokenization) on text.
 ///
-/// It reads input from a file character-by-character, generating a stream
-/// of base tokens such as text, numbers, whitespace, symbols, and newlines.
-/// These tokens are accumulated into a `TokenStream`, which is a flat,
-/// cache-friendly data structure.
+/// It processes input from a file or string character-by-character and
+/// generates a stream of tokens, such as text, numbers, whitespace, symbols,
+/// and newlines. These tokens are accumulated into a `TokenStream`, which is a
+/// flat, cache-friendly data structure designed for efficient iteration.
 ///
-/// After tokenization, the lexer applies a user-provided `transform` function
-/// to each token in the stream, allowing consumers of the library to convert
-/// base tokens into richer, domain-specific token types (e.g. Markdown
-/// elements, syntax trees, or custom DSL tokens).
+/// After the base tokens are generated, the `Lexer` allows for transformation
+/// of these tokens into richer, domain-specific types via a user-provided
+/// `transform` function. This transformation can be used to convert base tokens
+/// into specific elements of a Markdown syntax tree, custom DSL tokens, or any
+/// other custom format you need.
 ///
 /// # Example
 ///
@@ -38,32 +39,51 @@ const BUFFER_SIZE: usize = 1024 * 1024;
 ///
 /// let tokens = Lexer::scan_text("Runes += 42", transform).unwrap();
 ///
-/// // The tuple here is from the transform functions return type.
 /// for token in &tokens
 /// {
 ///     println!("{}", token);
 /// }
 /// ```
 ///
-/// # Design Notes
-///
-/// - Uses a flat `TokenStream` to improve iteration performance and reduce heap
+/// # Design Considerations
+/// - Utilizes a flat `TokenStream` to improve performance and reduce heap
 ///   overhead.
-/// - Consolidates contiguous characters into compound tokens (e.g. multi-digit
+/// - Consolidates contiguous characters into compound tokens (e.g., multi-digit
 ///   numbers).
-/// - Easily extensible via the `transform` function to support higher-level
-///   parsing tasks.
+/// - Extensible via the `transform` function, enabling the creation of
+///   higher-level constructs, like Markdown elements or syntax trees for a
+///   custom DSL.
 ///
-/// # Errors
-///
-/// Returns a `LexerError` if the file cannot be opened or read.
+/// # Error Handling
+/// The lexer will return a `LexerError` if the input file cannot be opened or
+/// read. Errors include issues such as missing files, read failures, or invalid
+/// input formats.
 pub enum Lexer {}
 
 impl Lexer
 {
-    /// Scans a file and produces a vector of transformed tokens.
+    /// Scans a file and generates a vector of transformed tokens based on the
+    /// provided `transform` function.
+    ///
+    /// This method opens a file from the given `path`, reads the file line by
+    /// line, and converts the input into a stream of tokens. The tokens are
+    /// then passed to the `transform` function, which allows users to map
+    /// base tokens into domain-specific types.
+    ///
+    /// # Parameters
+    /// - `path`: A path to the file to be lexically analyzed.
+    /// - `transform`: A function that takes a `TokenStream<TokenType>` and
+    ///   transforms it into a `TokenStream<T>` where `T` is a domain-specific
+    ///   type.
+    ///
+    /// # Returns
+    /// A `Result<TokenStream<T>, LexerError>` where `T` is the transformed token
+    /// type, or an error.
+    ///
+    /// # Errors
+    /// Returns a `LexerError` if the file cannot be opened or read.
     pub fn scan_file<P, F, T>(path: P, transform: F)
         -> Result<TokenStream<T>, LexerError>
         where P: AsRef<std::path::Path>,
@@ -82,6 +102,7 @@ impl Lexer
         let reader = BufReader::with_capacity(BUFFER_SIZE, input_file);
 
+        // Read the file line by line.
         for line in reader.lines()
         {
             match line
@@ -93,7 +114,7 @@ impl Lexer
                 Err(_) =>
                 {
                     return Err(LexerError::new("Unable to read line during \
-                                                Lexigraphical Analysis.",
+                                                Lexical Analysis.",
                                                Span::default(),
                                                Some(path.as_ref()
                                                         .to_string_lossy()
@@ -102,10 +123,10 @@ impl Lexer
                 }
             }
 
+            // Add the newline token after each line.
             stream.push("\n".to_string(),
                         TokenType::Newline,
                         Span::with_single(cursor));
 
             cursor.line += 1;
             cursor.column = 0;
         }
@@ -113,7 +134,22 @@ impl Lexer
         Ok(transform(&stream))
     }
 
-    /// Scans a full in-memory string and returns transformed tokens.
+    /// Scans a full in-memory string and produces transformed tokens.
+    ///
+    /// This method tokenizes the input string `text` and returns the transformed
+    /// tokens using the provided `transform` function. It's a convenient way
+    /// to perform lexical analysis on in-memory strings without needing to
+    /// read from a file.
+    ///
+    /// # Parameters
+    /// - `text`: A string slice representing the in-memory input text to
+    ///   analyze.
+    /// - `transform`: A function that transforms the base tokens into
+    ///   domain-specific types.
+    ///
+    /// # Returns
+    /// A `Result<TokenStream<T>, LexerError>` where `T` is the transformed token
+    /// type, or an error.
     pub fn scan_text<F, T>(text: &str, transform: F)
         -> Result<TokenStream<T>, LexerError>
         where F: FnOnce(&TokenStream<TokenType>) -> TokenStream<T>
@@ -121,14 +157,15 @@ impl Lexer
         let mut cursor = Position::default();
         let mut stream = TokenStream::new();
 
+        // Process each line in the input string.
         for line in text.lines()
         {
             Self::scan(line, &mut stream, &mut cursor);
 
+            // Add the newline token after each line.
             stream.push("\n".to_string(),
                         TokenType::Newline,
                         Span::with_single(cursor));
 
             cursor.line += 1;
             cursor.column = 0;
         }
@@ -136,15 +173,28 @@ impl Lexer
         Ok(transform(&stream))
     }
 
-    /// Internal: scans a single line of text into tokens.
+    /// Internal method that scans a single line of text into tokens.
+    ///
+    /// This method processes each character of a line and generates the
+    /// corresponding token. It handles cases like numeric tokens, text
+    /// tokens, symbols, and whitespace.
+    ///
+    /// # Parameters
+    /// - `line`: A line of text to be lexically analyzed.
+    /// - `stream`: A mutable reference to the token stream where the generated
+    ///   tokens will be pushed.
+    /// - `cursor`: A mutable reference to the cursor position, which tracks the
+    ///   current position in the input.
     fn scan(line: &str, stream: &mut TokenStream<TokenType>,
             cursor: &mut Position)
     {
         for c in line.chars()
         {
+            // Get the token type based on the character.
             let variant = get_token_type(c);
             let last = stream.len().saturating_sub(1);
 
+            // Handle token merging for contiguous tokens like numbers or text.
             if !stream.is_empty() &&
                variant == stream.variants[last] &&
                (variant == TokenType::Numeric || variant == TokenType::Text)
@@ -154,6 +204,7 @@ impl Lexer
             }
             else
             {
+                // Add a new token to the stream.
                 stream.push(c.to_string(), variant, Span::with_single(*cursor));
             }
@@ -164,6 +215,18 @@ impl Lexer
 
+/// Determines the type of a token based on the current character.
+///
+/// This helper function is responsible for identifying whether the current
+/// character is part of a known token type such as numeric, text, whitespace,
+/// or symbol.
+///
+/// # Parameters
+/// - `curr_char`: The current character to analyze.
+///
+/// # Returns
+/// A `TokenType` corresponding to the character type (e.g., `Numeric`, `Text`,
+/// `Whitespace`, etc.).
 fn get_token_type(curr_char: char) -> TokenType
 {
     match curr_char


@@ -59,7 +59,9 @@ fn test_basic_lexing()
         Lexer::scan_text("magic runes", dummy_transform).expect("Lexer should \
                                                                  succeed");
-    let tokens = tokens.into_iter().map(|t| { (*t.variant, String::from(t.lexeme))}).collect::<Vec<_>>();
+    let tokens = tokens.into_iter()
+                       .map(|t| (*t.variant, String::from(t.lexeme)))
+                       .collect::<Vec<_>>();
 
     let expected = vec![(TokenType::Text, "magic".to_string()),
                         (TokenType::Whitespace, " ".to_string()),
@@ -77,7 +79,9 @@ fn test_symbols_and_numbers()
         Lexer::scan_text("13 + 37", dummy_transform).expect("Lexer should \
                                                              succeed");
-    let tokens = tokens.into_iter().map(|t| { (*t.variant, String::from(t.lexeme))}).collect::<Vec<_>>();
+    let tokens = tokens.into_iter()
+                       .map(|t| (*t.variant, String::from(t.lexeme)))
+                       .collect::<Vec<_>>();
 
     let expected = vec![(TokenType::Numeric, "13".into()),
                         (TokenType::Whitespace, " ".into()),
@@ -129,16 +133,16 @@ fn test_lexer_with_cases()
                                                     on case '{}'",
                                                    case.name));
-        let result = result.into_iter().map(|t| { (*t.variant, String::from(t.lexeme))}).collect::<Vec<_>>();
+        let result = result.into_iter()
+                           .map(|t| (*t.variant, String::from(t.lexeme)))
+                           .collect::<Vec<_>>();
 
         let expected = case.expected
                            .iter()
                            .map(|(ty, s)| (*ty, s.to_string()))
                            .collect::<Vec<_>>();
 
-        assert_eq!(result, expected,
-                   "Mismatch in test case '{}'",
-                   case.name);
+        assert_eq!(result, expected, "Mismatch in test case '{}'", case.name);
 
         cleanup_temp_file(&path);
     }
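The tests above rely on a `dummy_transform` helper that is not part of this diff. Presumably it simply forwards the base tokens so the assertions can compare `(variant, lexeme)` pairs; the sketch below is a reconstruction under that assumption, using only the `TokenStream` API that appears elsewhere in this commit (the body, and the implied `Copy` on `TokenType`, are guesses rather than the project's actual code).

```rust
use rune::{TokenStream, TokenType};

// Assumed shape of the test helper: a pass-through transform that keeps
// the base TokenType tokens, lexemes, and spans unchanged.
fn dummy_transform(input: &TokenStream<TokenType>) -> TokenStream<TokenType>
{
    let mut output = TokenStream::new();
    let mut i = 0;

    while i < input.len()
    {
        let token = input.get(i).unwrap(); // safe due to bounds check above
        output.push(token.lexeme.to_string(), *token.variant, token.span.clone());
        i += 1;
    }

    output
}
```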