From f5780f50c290fcf1edf622c5230fbcde962e320e Mon Sep 17 00:00:00 2001 From: Myrddin Dundragon Date: Tue, 22 Apr 2025 02:18:12 -0400 Subject: [PATCH] Just some basic updating and cleaning up. - Added comments. - Ran cargo fmt. - Updated the versioning. --- Cargo.lock | 2 +- Cargo.toml | 2 +- README.md | 2 +- examples/basic.rs | 236 ++++++++++++++++++++++++------------------- src/error.rs | 190 +++++++++++++++++----------------- src/lexer.rs | 119 +++++++++++++++++----- tests/lexer_tests.rs | 16 +-- 7 files changed, 330 insertions(+), 237 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2415206..7b3a30f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4,4 +4,4 @@ version = 4 [[package]] name = "rune" -version = "0.2.0" +version = "0.3.0" diff --git a/Cargo.toml b/Cargo.toml index 92b6226..793fb83 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "rune" -version = "0.2.0" +version = "0.3.0" edition = "2021" description = "A lexical analysis library." repository = "/myrddin/rune" diff --git a/README.md b/README.md index a9a4b1b..9ead52d 100644 --- a/README.md +++ b/README.md @@ -31,5 +31,5 @@ Then add this to your Cargo.toml file. ```toml [dependencies] -rune = { version = "0.2.0", registry = "cybermages" } +rune = { version = "0.3.0", registry = "cybermages" } ``` diff --git a/examples/basic.rs b/examples/basic.rs index dd2ac83..8e5f138 100644 --- a/examples/basic.rs +++ b/examples/basic.rs @@ -5,133 +5,159 @@ use rune::{Lexer, Span, TokenStream, TokenType}; #[derive(Debug, Clone, PartialEq, Eq)] -pub enum MarkdownTokenType { - Heading(u8), - EmphasisStart, - EmphasisEnd, - StrongStart, - StrongEnd, - CodeSpan, - Text, - Symbol, - Whitespace, - Newline, - Unknown, +pub enum MarkdownTokenType +{ + Heading(u8), + EmphasisStart, + EmphasisEnd, + StrongStart, + StrongEnd, + CodeSpan, + Text, + Symbol, + Whitespace, + Newline, + Unknown } -impl std::fmt::Display for MarkdownTokenType { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - MarkdownTokenType::Heading(level) => write!(f, "Heading({})", level), - MarkdownTokenType::EmphasisStart => write!(f, "EmphasisStart"), - MarkdownTokenType::EmphasisEnd => write!(f, "EmphasisEnd"), - MarkdownTokenType::StrongStart => write!(f, "StrongStart"), - MarkdownTokenType::StrongEnd => write!(f, "StrongEnd"), - MarkdownTokenType::CodeSpan => write!(f, "CodeSpan"), - MarkdownTokenType::Text => write!(f, "Text"), - MarkdownTokenType::Symbol => write!(f, "Symbol"), - MarkdownTokenType::Whitespace => write!(f, "Whitespace"), - MarkdownTokenType::Newline => write!(f, "Newline"), - MarkdownTokenType::Unknown => write!(f, "Unknown"), - } - } +impl std::fmt::Display for MarkdownTokenType +{ + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result + { + match self + { + MarkdownTokenType::Heading(level) => write!(f, "Heading({})", level), + MarkdownTokenType::EmphasisStart => write!(f, "EmphasisStart"), + MarkdownTokenType::EmphasisEnd => write!(f, "EmphasisEnd"), + MarkdownTokenType::StrongStart => write!(f, "StrongStart"), + MarkdownTokenType::StrongEnd => write!(f, "StrongEnd"), + MarkdownTokenType::CodeSpan => write!(f, "CodeSpan"), + MarkdownTokenType::Text => write!(f, "Text"), + MarkdownTokenType::Symbol => write!(f, "Symbol"), + MarkdownTokenType::Whitespace => write!(f, "Whitespace"), + MarkdownTokenType::Newline => write!(f, "Newline"), + MarkdownTokenType::Unknown => write!(f, "Unknown") + } + } } // Define how you want to interpret base tokens -pub fn transform(input: 
&TokenStream) -> TokenStream<MarkdownTokenType> {
-    let mut output = TokenStream::new();
+pub fn transform(input: &TokenStream)
+                 -> TokenStream<MarkdownTokenType>
+{
+  let mut output = TokenStream::new();
 
-    let mut i = 0;
-    while i < input.len() {
-        let token = input.get(i).unwrap(); // safe due to bounds check above
+  let mut i = 0;
+  while i < input.len()
+  {
+    let token = input.get(i).unwrap(); // safe due to bounds check above
 
-        match token.variant {
-            TokenType::Symbol if token.lexeme == "#" => {
-                // Count consecutive #s for heading level
-                let mut level = 1;
-                let mut span = token.span.clone();
+    match token.variant
+    {
+      TokenType::Symbol if token.lexeme == "#" =>
+      {
+        // Count consecutive #s for heading level
+        let mut level = 1;
+        let mut span = token.span.clone();
 
-                while i + 1 < input.len() {
-                    let next = input.get(i + 1).unwrap();
-                    if *next.variant == TokenType::Symbol && next.lexeme == "#" {
-                        level += 1;
-                        span.end = next.span.end;
-                        i += 1;
-                    } else {
-                        break;
-                    }
-                }
-
-                output.push(token.lexeme.repeat(level),
-                            MarkdownTokenType::Heading(level as u8),
-                            span);
+        while i + 1 < input.len()
+        {
+          let next = input.get(i + 1).unwrap();
+          if *next.variant == TokenType::Symbol && next.lexeme == "#"
+          {
+            level += 1;
+            span.end = next.span.end;
+            i += 1;
+          }
+          else
+          {
+            break;
+          }
+        }
            }
 
-            TokenType::Symbol if token.lexeme == "*" => {
-                // Look ahead to see if it's strong (**) or emphasis (*)
-                if i + 1 < input.len() {
-                    let next = input.get(i + 1).unwrap();
-                    if *next.variant == TokenType::Symbol && next.lexeme == "*" {
-                        output.push("**".to_string(),
-                                    MarkdownTokenType::StrongStart,
-                                    Span::merge(*token.span, *next.span));
-                        i += 1; // skip the second '*'
-                    } else {
-                        output.push("*".to_string(),
-                                    MarkdownTokenType::EmphasisStart,
-                                    token.span.clone());
-                    }
-                } else {
-                    output.push("*".to_string(),
-                                MarkdownTokenType::EmphasisStart,
-                                token.span.clone());
-                }
+        output.push(token.lexeme.repeat(level),
+                    MarkdownTokenType::Heading(level as u8),
+                    span);
+      }
+
+      TokenType::Symbol if token.lexeme == "*" =>
+      {
+        // Look ahead to see if it's strong (**) or emphasis (*)
+        if i + 1 < input.len()
+        {
+          let next = input.get(i + 1).unwrap();
+          if *next.variant == TokenType::Symbol && next.lexeme == "*"
+          {
+            output.push("**".to_string(),
+                        MarkdownTokenType::StrongStart,
+                        Span::merge(*token.span, *next.span));
+            i += 1; // skip the second '*'
+          }
+          else
+          {
+            output.push("*".to_string(),
+                        MarkdownTokenType::EmphasisStart,
+                        token.span.clone());
+          }
         }
-
-            TokenType::Symbol if token.lexeme == "`" => {
-                output.push(token.lexeme.to_string(),
-                            MarkdownTokenType::CodeSpan,
-                            token.span.clone());
+        else
+        {
+          output.push("*".to_string(),
+                      MarkdownTokenType::EmphasisStart,
+                      token.span.clone());
         }
+      }
 
-            TokenType::Text => {
-                output.push(token.lexeme.to_string(),
-                            MarkdownTokenType::Text,
-                            token.span.clone());
-            }
+      TokenType::Symbol if token.lexeme == "`" =>
+      {
+        output.push(token.lexeme.to_string(),
+                    MarkdownTokenType::CodeSpan,
+                    token.span.clone());
+      }
 
-            TokenType::Symbol => {
-                output.push(token.lexeme.to_string(),
-                            MarkdownTokenType::Symbol,
-                            token.span.clone());
-            }
+      TokenType::Text =>
+      {
+        output.push(token.lexeme.to_string(),
+                    MarkdownTokenType::Text,
+                    token.span.clone());
+      }
 
-            TokenType::Whitespace => {
-                output.push(token.lexeme.to_string(),
-                            MarkdownTokenType::Whitespace,
-                            token.span.clone());
-            }
+      TokenType::Symbol =>
+      {
+        output.push(token.lexeme.to_string(),
+                    MarkdownTokenType::Symbol,
+                    token.span.clone());
+      }
 
-            TokenType::Newline => {
-                output.push(token.lexeme.to_string(),
-                            MarkdownTokenType::Newline,
-                            token.span.clone());
-            }
+      TokenType::Whitespace =>
+      {
+        output.push(token.lexeme.to_string(),
+                    MarkdownTokenType::Whitespace,
+                    token.span.clone());
+      }
 
-            _ => {
-                output.push(token.lexeme.to_string(),
-                            MarkdownTokenType::Unknown,
-                            token.span.clone());
-            }
-        }
+      TokenType::Newline =>
+      {
+        output.push(token.lexeme.to_string(),
+                    MarkdownTokenType::Newline,
+                    token.span.clone());
+      }
 
-        i += 1;
-    }
+      _ =>
+      {
+        output.push(token.lexeme.to_string(),
+                    MarkdownTokenType::Unknown,
+                    token.span.clone());
+      }
+    }
 
-    output
+    i += 1;
+  }
+
+  output
 }
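+
+// Illustrative usage sketch (editorial addition, not part of this patch's
+// changes; it mirrors the example in the `Lexer` docs). The `transform`
+// above plugs straight into the lexer's entry points:
+//
+//     let tokens = Lexer::scan_text("# Title with *emphasis*", transform).unwrap();
+//     for token in &tokens
+//     {
+//         println!("{}", token);
+//     }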
diff --git a/src/error.rs b/src/error.rs
index 6d0615f..96b69cb 100644
--- a/src/error.rs
+++ b/src/error.rs
@@ -1,4 +1,6 @@
-use std::{error::Error, path::PathBuf};
+use std::error::Error;
+use std::path::PathBuf;
+
 
 
 use super::position::Span;
@@ -12,116 +14,114 @@ use super::position::Span;
 /// It is designed to provide detailed diagnostics for file-based or
 /// in-memory parsing and is compatible with error reporting ecosystems.
 #[derive(Debug)]
-pub struct LexerError {
-    /// A human-readable error message.
-    pub message: String,
+pub struct LexerError
+{
+  /// A human-readable error message.
+  pub message: String,
 
-    /// The span where the error occurred.
-    pub span: Span,
+  /// The span where the error occurred.
+  pub span: Span,
 
-    /// The file that the error occurred in, if known.
-    pub file: Option<PathBuf>,
+  /// The file that the error occurred in, if known.
+  pub file: Option<PathBuf>,
 
-    /// The source snippet related to the error, if known.
-    pub snippet: Option<String>,
+  /// The source snippet related to the error, if known.
+  pub snippet: Option<String>,
 
-    /// An optional underlying error that caused this one.
-    pub source: Option<Box<dyn Error>>,
+  /// An optional underlying error that caused this one.
+  pub source: Option<Box<dyn Error>>
 }
 
-impl LexerError {
-    /// Creates a new `LexerError` with a message, span, and optional context.
-    ///
-    /// # Parameters
-    /// - `message`: A human-readable explanation of the error.
-    /// - `span`: The region in the source where the error occurred.
-    /// - `file`: An optional path to the file in which the error occurred.
-    /// - `snippet`: An optional problematic input string.
-    ///
-    /// # Returns
-    /// A new instance of `LexerError`.
-    pub fn new<S, T>(
-        message: S,
-        span: Span,
-        file: Option<T>,
-        snippet: Option<String>,
-    ) -> Self
-    where
-        S: Into<String>,
-        T: Into<PathBuf>,
-    {
-        LexerError {
-            message: message.into(),
-            span,
-            file: file.map(Into::into),
-            snippet: snippet.map(Into::into),
-            source: None,
-        }
-    }
+impl LexerError
+{
+  /// Creates a new `LexerError` with a message, span, and optional context.
+  ///
+  /// # Parameters
+  /// - `message`: A human-readable explanation of the error.
+  /// - `span`: The region in the source where the error occurred.
+  /// - `file`: An optional path to the file in which the error occurred.
+  /// - `snippet`: An optional problematic input string.
+  ///
+  /// # Returns
+  /// A new instance of `LexerError`.
+  pub fn new<S, T>(message: S, span: Span, file: Option<T>,
+                   snippet: Option<String>)
+                   -> Self
+    where S: Into<String>,
+          T: Into<PathBuf>
+  {
+    LexerError { message: message.into(),
+                 span,
+                 file: file.map(Into::into),
+                 snippet: snippet.map(Into::into),
+                 source: None }
+  }
 
-    /// Creates a `LexerError` from only a message and span.
-    ///
-    /// This is useful when file or snippet context is not available.
-    pub fn from_message<S>(message: S, span: Span) -> Self
-    where
-        S: Into<String>,
-    {
-        Self::new(message, span, None::<PathBuf>, None::<String>)
-    }
+  /// Creates a `LexerError` from only a message and span.
+  ///
+  /// This is useful when file or snippet context is not available.
+  pub fn from_message<S>(message: S, span: Span) -> Self
+    where S: Into<String>
+  {
+    Self::new(message, span, None::<PathBuf>, None::<String>)
+  }
 
-    /// Attaches a snippet of the offending source code.
-    ///
-    /// This is helpful for diagnostics and tooling output.
-    pub fn with_snippet<S>(mut self, snippet: S) -> Self
-    where
-        S: Into<String>,
-    {
-        self.snippet = Some(snippet.into());
-        self
-    }
+  /// Attaches a snippet of the offending source code.
+  ///
+  /// This is helpful for diagnostics and tooling output.
+  pub fn with_snippet<S>(mut self, snippet: S) -> Self
+    where S: Into<String>
+  {
+    self.snippet = Some(snippet.into());
+    self
+  }
 
-    /// Attaches the path of the file in which the error occurred.
-    pub fn with_file<T>(mut self, file: T) -> Self
-    where
-        T: Into<PathBuf>,
-    {
-        self.file = Some(file.into());
-        self
-    }
+  /// Attaches the path of the file in which the error occurred.
+  pub fn with_file<T>(mut self, file: T) -> Self
+    where T: Into<PathBuf>
+  {
+    self.file = Some(file.into());
+    self
+  }
 
-    /// Wraps a source error that caused this `LexerError`.
-    ///
-    /// This allows you to chain errors for more detailed diagnostics.
-    pub fn with_source<E>(mut self, err: E) -> Self
-    where
-        E: Error + 'static,
-    {
-        self.source = Some(Box::new(err));
-        self
-    }
+  /// Wraps a source error that caused this `LexerError`.
+  ///
+  /// This allows you to chain errors for more detailed diagnostics.
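+  ///
+  /// # Example
+  ///
+  /// Illustrative sketch only (editorial addition): the `rune::LexerError`
+  /// import path and the `std::io::Error` cause below are assumptions, not
+  /// part of this patch.
+  ///
+  /// ```no_run
+  /// use rune::{LexerError, Span};
+  /// use std::io;
+  ///
+  /// // An underlying I/O failure we want to surface in diagnostics.
+  /// let cause = io::Error::new(io::ErrorKind::UnexpectedEof, "read failed");
+  ///
+  /// // Builder-style chaining of the context methods above.
+  /// let err = LexerError::from_message("Unable to read line.", Span::default())
+  ///             .with_file("notes.md")
+  ///             .with_source(cause);
+  /// ```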
+  pub fn with_source<E>(mut self, err: E) -> Self
+    where E: Error + 'static
+  {
+    self.source = Some(Box::new(err));
+    self
+  }
 }
 
-impl std::fmt::Display for LexerError {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "Lexer error at {}", self.span)?;
+impl std::fmt::Display for LexerError
+{
+  fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result
+  {
+    write!(f, "Lexer error at {}", self.span)?;
 
-        if let Some(file) = &self.file {
-            write!(f, " in file `{}`", file.display())?;
-        }
+    if let Some(file) = &self.file
+    {
+      write!(f, " in file `{}`", file.display())?;
+    }
 
-        write!(f, ": {}", self.message)?;
+    write!(f, ": {}", self.message)?;
 
-        if let Some(snippet) = &self.snippet {
-            write!(f, "\n --> Snippet: `{}`", snippet)?;
-        }
+    if let Some(snippet) = &self.snippet
+    {
+      write!(f, "\n --> Snippet: `{}`", snippet)?;
+    }
 
-        Ok(())
-    }
+    Ok(())
+  }
 }
 
-impl Error for LexerError {
-    /// Returns the underlying cause of this error, if any.
-    fn source(&self) -> Option<&(dyn Error + 'static)> {
-        self.source.as_ref().map(|e| e.as_ref())
-    }
+impl Error for LexerError
+{
+  /// Returns the underlying cause of this error, if any.
+  fn source(&self) -> Option<&(dyn Error + 'static)>
+  {
+    self.source.as_ref().map(|e| e.as_ref())
+  }
 }
diff --git a/src/lexer.rs b/src/lexer.rs
index ab0a583..1a2451e 100644
--- a/src/lexer.rs
+++ b/src/lexer.rs
@@ -7,24 +7,25 @@
 use super::token::{TokenStream, TokenType};
 
 
-/// The size of data chunks to read from a file. This was arbitrarily chosen to
-/// be 1mb.
+/// The size of data chunks to read from a file. This is an arbitrary choice,
+/// set to 1MB.
 const BUFFER_SIZE: usize = 1024 * 1024;
 
 
-/// The `Lexer` is the core component responsible for performing
-/// lexicographical analysis on a text file.
+/// The `Lexer` struct is responsible for performing lexical analysis
+/// (tokenization) on text.
 ///
-/// It reads input from a file character-by-character, generating a stream
-/// of base tokens such as text, numbers, whitespace, symbols, and newlines.
-/// These tokens are accumulated into a `TokenStream`, which is a flat,
-/// cache-friendly data structure.
+/// It processes input from a file or string character-by-character and
+/// generates a stream of tokens, such as text, numbers, whitespace, symbols,
+/// and newlines. These tokens are accumulated into a `TokenStream`, which is a
+/// flat, cache-friendly data structure designed for efficient iteration.
 ///
-/// After tokenization, the lexer applies a user-provided `transform` function
-/// to each token in the stream, allowing consumers of the library to convert
-/// base tokens into richer, domain-specific token types (e.g. Markdown
-/// elements, syntax trees, or custom DSL tokens).
+/// After the base tokens are generated, the `Lexer` allows for transformation
+/// of these tokens into richer, domain-specific types via a user-provided
+/// `transform` function. This transformation can be used to convert base tokens
+/// into specific elements of a Markdown syntax tree, custom DSL tokens, or any
+/// other custom format you need.
 ///
 /// # Example
 ///
 /// ```
@@ -38,32 +39,51 @@ const BUFFER_SIZE: usize = 1024 * 1024;
 ///
 /// let tokens = Lexer::scan_text("Runes += 42", transform).unwrap();
 ///
-/// // The tuple here is from the transform functions return type.
 /// for token in &tokens
 /// {
 ///     println!("{}", token);
 /// }
 /// ```
 ///
-/// # Design Notes
-///
-/// - Uses a flat `TokenStream` to improve iteration performance and reduce heap
+/// # Design Considerations
+/// - Utilizes a flat `TokenStream` to improve performance and reduce heap
 ///   overhead.
-/// - Consolidates contiguous characters into compound tokens (e.g. multi-digit
+/// - Consolidates contiguous characters into compound tokens (e.g., multi-digit
 ///   numbers).
-/// - Easily extensible via the `transform` function to support higher-level
-///   parsing tasks.
+/// - Extensible via the `transform` function, enabling the creation of
+///   higher-level constructs, like Markdown elements or syntax trees for a
+///   custom DSL.
 ///
-/// # Errors
-///
-/// Returns a `LexerError` if the file cannot be opened or read.
+/// # Error Handling
+/// The lexer will return a `LexerError` if the input file cannot be opened or
+/// read. Errors include issues such as missing files, read failures, or invalid
+/// input formats.
 pub enum Lexer {}
 
 
 impl Lexer
 {
-  /// Scans a file and produces a vector of transformed tokens.
+  /// Scans a file and generates a vector of transformed tokens based on the
+  /// provided `transform` function.
+  ///
+  /// This method opens a file from the given `path`, reads the file line by
+  /// line, and converts the input into a stream of tokens. The tokens are
+  /// then passed to the `transform` function, which allows users to map
+  /// base tokens into domain-specific types.
+  ///
+  /// # Parameters
+  /// - `path`: A path to the file to be lexically analyzed.
+  /// - `transform`: A function that takes a `TokenStream` and
+  ///   transforms it into a `TokenStream<T>` where `T` is a domain-specific
+  ///   type.
+  ///
+  /// # Returns
+  /// A `Result<TokenStream<T>, LexerError>` where `T` is the transformed token
+  /// type, or an error.
+  ///
+  /// # Errors
+  /// Returns a `LexerError` if the file cannot be opened or read.
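+  ///
+  /// # Example
+  ///
+  /// Illustrative sketch only (editorial addition): `input.md` is a
+  /// placeholder path, and `transform` stands in for any function satisfying
+  /// the bounds below.
+  ///
+  /// ```no_run
+  /// let tokens = Lexer::scan_file("input.md", transform).unwrap();
+  /// for token in &tokens
+  /// {
+  ///     println!("{}", token);
+  /// }
+  /// ```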
   pub fn scan_file<T, P, F>(path: P, transform: F)
                             -> Result<TokenStream<T>, LexerError>
     where P: AsRef<Path>,
           F: FnOnce(&TokenStream) -> TokenStream<T>
@@ -82,6 +102,7 @@ impl Lexer
 
     let reader = BufReader::with_capacity(BUFFER_SIZE, input_file);
 
+    // Read the file line by line.
     for line in reader.lines()
     {
       match line
@@ -93,7 +114,7 @@ impl Lexer
         Err(_) =>
         {
           return Err(LexerError::new("Unable to read line during \
-                                      Lexigraphical Analysis.",
+                                      Lexical Analysis.",
                                      Span::default(),
                                      Some(path.as_ref()
                                               .to_string_lossy()
@@ -102,10 +123,10 @@ impl Lexer
       }
     }
 
+    // Add the newline token after each line.
     stream.push("\n".to_string(),
                 TokenType::Newline,
                 Span::with_single(cursor));
-
     cursor.line += 1;
     cursor.column = 0;
   }
@@ -113,7 +134,22 @@ impl Lexer
   Ok(transform(&stream))
 }
 
-  /// Scans a full in-memory string and returns transformed tokens.
+  /// Scans a full in-memory string and produces transformed tokens.
+  ///
+  /// This method tokenizes the input string `text` and returns the transformed
+  /// tokens using the provided `transform` function. It's a convenient way
+  /// to perform lexical analysis on in-memory strings without needing to
+  /// read from a file.
+  ///
+  /// # Parameters
+  /// - `text`: A string slice representing the in-memory input text to
+  ///   analyze.
+  /// - `transform`: A function that transforms the base tokens into
+  ///   domain-specific types.
+  ///
+  /// # Returns
+  /// A `Result<TokenStream<T>, LexerError>` where `T` is the transformed token
+  /// type, or an error.
   pub fn scan_text<T, F>(text: &str, transform: F) -> Result<TokenStream<T>, LexerError>
     where F: FnOnce(&TokenStream) -> TokenStream<T>
   {
     let mut cursor = Position::default();
     let mut stream = TokenStream::new();
 
+    // Process each line in the input string.
     for line in text.lines()
     {
       Self::scan(line, &mut stream, &mut cursor);
 
+      // Add the newline token after each line.
       stream.push("\n".to_string(),
                   TokenType::Newline,
                   Span::with_single(cursor));
-
       cursor.line += 1;
       cursor.column = 0;
     }
@@ -136,15 +173,28 @@ impl Lexer
     Ok(transform(&stream))
   }
 
-  /// Internal: scans a single line of text into tokens.
+  /// Internal method that scans a single line of text into tokens.
+  ///
+  /// This method processes each character of a line and generates the
+  /// corresponding token. It handles cases like numeric tokens, text
+  /// tokens, symbols, and whitespace.
+  ///
+  /// # Parameters
+  /// - `line`: A line of text to be lexically analyzed.
+  /// - `stream`: A mutable reference to the token stream where the generated
+  ///   tokens will be pushed.
+  /// - `cursor`: A mutable reference to the cursor position, which tracks the
+  ///   current position in the input.
   fn scan(line: &str, stream: &mut TokenStream, cursor: &mut Position)
   {
     for c in line.chars()
     {
+      // Get the token type based on the character.
       let variant = get_token_type(c);
       let last = stream.len().saturating_sub(1);
 
+      // Handle token merging for contiguous tokens like numbers or text.
       if !stream.is_empty()
          && variant == stream.variants[last]
          && (variant == TokenType::Numeric || variant == TokenType::Text)
@@ -154,6 +204,7 @@ impl Lexer
       }
       else
       {
+        // Add a new token to the stream.
         stream.push(c.to_string(), variant, Span::with_single(*cursor));
       }
 
@@ -164,6 +215,18 @@ impl Lexer
 
 
+/// Determines the type of a token based on the current character.
+///
+/// This helper function is responsible for identifying whether the current
+/// character is part of a known token type such as numeric, text, whitespace,
+/// or symbol.
+///
+/// # Parameters
+/// - `curr_char`: The current character to analyze.
+///
+/// # Returns
+/// A `TokenType` corresponding to the character type (e.g., `Numeric`, `Text`,
+/// `Whitespace`, etc.).
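+///
+/// # Example
+///
+/// Illustrative only (editorial addition); `get_token_type` is private, so
+/// the block is marked `ignore` rather than compiled as a doc-test.
+///
+/// ```ignore
+/// assert_eq!(get_token_type('7'), TokenType::Numeric);
+/// assert_eq!(get_token_type('x'), TokenType::Text);
+/// assert_eq!(get_token_type('+'), TokenType::Symbol);
+/// ```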
 fn get_token_type(curr_char: char) -> TokenType
 {
   match curr_char
diff --git a/tests/lexer_tests.rs b/tests/lexer_tests.rs
index 63a5a30..16b8452 100644
--- a/tests/lexer_tests.rs
+++ b/tests/lexer_tests.rs
@@ -59,7 +59,9 @@ fn test_basic_lexing()
     Lexer::scan_text("magic runes", dummy_transform).expect("Lexer should \
                                                              succeed");
 
-    let tokens = tokens.into_iter().map(|t| { (*t.variant, String::from(t.lexeme))}).collect::<Vec<_>>();
+  let tokens = tokens.into_iter()
+                     .map(|t| (*t.variant, String::from(t.lexeme)))
+                     .collect::<Vec<_>>();
 
   let expected = vec![(TokenType::Text, "magic".to_string()),
                       (TokenType::Whitespace, " ".to_string()),
@@ -77,7 +79,9 @@ fn test_symbols_and_numbers()
     Lexer::scan_text("13 + 37", dummy_transform).expect("Lexer should \
                                                          succeed");
 
-    let tokens = tokens.into_iter().map(|t| { (*t.variant, String::from(t.lexeme))}).collect::<Vec<_>>();
+  let tokens = tokens.into_iter()
+                     .map(|t| (*t.variant, String::from(t.lexeme)))
+                     .collect::<Vec<_>>();
 
   let expected = vec![(TokenType::Numeric, "13".into()),
                       (TokenType::Whitespace, " ".into()),
@@ -129,16 +133,16 @@ fn test_lexer_with_cases()
                                            on case '{}'",
                                           case.name));
 
-    let result = result.into_iter().map(|t| { (*t.variant, String::from(t.lexeme))}).collect::<Vec<_>>();
+    let result = result.into_iter()
+                       .map(|t| (*t.variant, String::from(t.lexeme)))
+                       .collect::<Vec<_>>();
 
     let expected = case.expected
                        .iter()
                        .map(|(ty, s)| (*ty, s.to_string()))
                        .collect::<Vec<_>>();
 
-    assert_eq!(result, expected,
-               "Mismatch in test case '{}'",
-               case.name);
+    assert_eq!(result, expected, "Mismatch in test case '{}'", case.name);
 
     cleanup_temp_file(&path);
 }