Just some basic updating and cleaning up.

- Added comments.
- Ran cargo fmt.
- Updated the versioning.

This commit is contained in:
parent cd50b53be5
commit f5780f50c2
Cargo.lock (generated): 2 changed lines

@@ -4,4 +4,4 @@ version = 4
 [[package]]
 name = "rune"
-version = "0.2.0"
+version = "0.3.0"
Cargo.toml

@@ -1,6 +1,6 @@
 [package]
 name = "rune"
-version = "0.2.0"
+version = "0.3.0"
 edition = "2021"
 description = "A lexical analysis library."
 repository = "/myrddin/rune"
@@ -31,5 +31,5 @@ Then add this to your Cargo.toml file.

 ```toml
 [dependencies]
-rune = { version = "0.2.0", registry = "cybermages" }
+rune = { version = "0.3.0", registry = "cybermages" }
 ```
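For context on how the bumped dependency is driven, the crate's own doc example (updated later in this commit) scans an in-memory string and prints each token. Below is a minimal sketch along those lines; the `passthrough` transform is a hypothetical stand-in (not part of this commit), and it assumes the `TokenStream`/`TokenType` API as it appears in this commit's docs and tests.

```rust
use rune::{Lexer, TokenStream, TokenType};

// Hypothetical transform: copy each base token into a new stream unchanged.
fn passthrough(input: &TokenStream<TokenType>) -> TokenStream<TokenType> {
    let mut output = TokenStream::new();
    let mut i = 0;
    while i < input.len() {
        let token = input.get(i).unwrap();
        output.push(token.lexeme.to_string(), *token.variant, token.span.clone());
        i += 1;
    }
    output
}

fn main() {
    // Mirrors the doc example: lex a short string and print every token.
    let tokens = Lexer::scan_text("Runes += 42", passthrough).unwrap();
    for token in &tokens {
        println!("{}", token);
    }
}
```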
@@ -5,133 +5,159 @@ use rune::{Lexer, Span, TokenStream, TokenType};


 #[derive(Debug, Clone, PartialEq, Eq)]
-pub enum MarkdownTokenType {
-    Heading(u8),
-    EmphasisStart,
-    EmphasisEnd,
-    StrongStart,
-    StrongEnd,
-    CodeSpan,
-    Text,
-    Symbol,
-    Whitespace,
-    Newline,
-    Unknown,
+pub enum MarkdownTokenType
+{
+    Heading(u8),
+    EmphasisStart,
+    EmphasisEnd,
+    StrongStart,
+    StrongEnd,
+    CodeSpan,
+    Text,
+    Symbol,
+    Whitespace,
+    Newline,
+    Unknown
 }


-impl std::fmt::Display for MarkdownTokenType {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        match self {
-            MarkdownTokenType::Heading(level) => write!(f, "Heading({})", level),
-            MarkdownTokenType::EmphasisStart => write!(f, "EmphasisStart"),
-            MarkdownTokenType::EmphasisEnd => write!(f, "EmphasisEnd"),
-            MarkdownTokenType::StrongStart => write!(f, "StrongStart"),
-            MarkdownTokenType::StrongEnd => write!(f, "StrongEnd"),
-            MarkdownTokenType::CodeSpan => write!(f, "CodeSpan"),
-            MarkdownTokenType::Text => write!(f, "Text"),
-            MarkdownTokenType::Symbol => write!(f, "Symbol"),
-            MarkdownTokenType::Whitespace => write!(f, "Whitespace"),
-            MarkdownTokenType::Newline => write!(f, "Newline"),
-            MarkdownTokenType::Unknown => write!(f, "Unknown"),
-        }
-    }
+impl std::fmt::Display for MarkdownTokenType
+{
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result
+    {
+        match self
+        {
+            MarkdownTokenType::Heading(level) => write!(f, "Heading({})", level),
+            MarkdownTokenType::EmphasisStart => write!(f, "EmphasisStart"),
+            MarkdownTokenType::EmphasisEnd => write!(f, "EmphasisEnd"),
+            MarkdownTokenType::StrongStart => write!(f, "StrongStart"),
+            MarkdownTokenType::StrongEnd => write!(f, "StrongEnd"),
+            MarkdownTokenType::CodeSpan => write!(f, "CodeSpan"),
+            MarkdownTokenType::Text => write!(f, "Text"),
+            MarkdownTokenType::Symbol => write!(f, "Symbol"),
+            MarkdownTokenType::Whitespace => write!(f, "Whitespace"),
+            MarkdownTokenType::Newline => write!(f, "Newline"),
+            MarkdownTokenType::Unknown => write!(f, "Unknown")
+        }
+    }
 }


 // Define how you want to interpret base tokens
-pub fn transform(input: &TokenStream<TokenType>) -> TokenStream<MarkdownTokenType> {
-    let mut output = TokenStream::new();
+pub fn transform(input: &TokenStream<TokenType>)
+                 -> TokenStream<MarkdownTokenType>
+{
+    let mut output = TokenStream::new();

-    let mut i = 0;
-    while i < input.len() {
-        let token = input.get(i).unwrap(); // safe due to bounds check above
+    let mut i = 0;
+    while i < input.len()
+    {
+        let token = input.get(i).unwrap(); // safe due to bounds check above

-        match token.variant {
-            TokenType::Symbol if token.lexeme == "#" => {
-                // Count consecutive #s for heading level
-                let mut level = 1;
-                let mut span = token.span.clone();
+        match token.variant
+        {
+            TokenType::Symbol if token.lexeme == "#" =>
+            {
+                // Count consecutive #s for heading level
+                let mut level = 1;
+                let mut span = token.span.clone();

-                while i + 1 < input.len() {
-                    let next = input.get(i + 1).unwrap();
-                    if *next.variant == TokenType::Symbol && next.lexeme == "#" {
-                        level += 1;
-                        span.end = next.span.end;
-                        i += 1;
-                    } else {
-                        break;
-                    }
-                }
-
-                output.push(token.lexeme.repeat(level),
-                            MarkdownTokenType::Heading(level as u8),
-                            span);
-            }
+                while i + 1 < input.len()
+                {
+                    let next = input.get(i + 1).unwrap();
+                    if *next.variant == TokenType::Symbol && next.lexeme == "#"
+                    {
+                        level += 1;
+                        span.end = next.span.end;
+                        i += 1;
+                    }
+                    else
+                    {
+                        break;
+                    }
+                }

-            TokenType::Symbol if token.lexeme == "*" => {
-                // Look ahead to see if it's strong (**) or emphasis (*)
-                if i + 1 < input.len() {
-                    let next = input.get(i + 1).unwrap();
-                    if *next.variant == TokenType::Symbol && next.lexeme == "*" {
-                        output.push("**".to_string(),
-                                    MarkdownTokenType::StrongStart,
-                                    Span::merge(*token.span, *next.span));
-                        i += 1; // skip the second '*'
-                    } else {
-                        output.push("*".to_string(),
-                                    MarkdownTokenType::EmphasisStart,
-                                    token.span.clone());
-                    }
-                } else {
-                    output.push("*".to_string(),
-                                MarkdownTokenType::EmphasisStart,
-                                token.span.clone());
-                }
-            }
+                output.push(token.lexeme.repeat(level),
+                            MarkdownTokenType::Heading(level as u8),
+                            span);
+            }
+
+            TokenType::Symbol if token.lexeme == "*" =>
+            {
+                // Look ahead to see if it's strong (**) or emphasis (*)
+                if i + 1 < input.len()
+                {
+                    let next = input.get(i + 1).unwrap();
+                    if *next.variant == TokenType::Symbol && next.lexeme == "*"
+                    {
+                        output.push("**".to_string(),
+                                    MarkdownTokenType::StrongStart,
+                                    Span::merge(*token.span, *next.span));
+                        i += 1; // skip the second '*'
+                    }
+                    else
+                    {
+                        output.push("*".to_string(),
+                                    MarkdownTokenType::EmphasisStart,
+                                    token.span.clone());
+                    }
+                }

-            TokenType::Symbol if token.lexeme == "`" => {
-                output.push(token.lexeme.to_string(),
-                            MarkdownTokenType::CodeSpan,
-                            token.span.clone());
-            }
+                else
+                {
+                    output.push("*".to_string(),
+                                MarkdownTokenType::EmphasisStart,
+                                token.span.clone());
+                }
+            }

-            TokenType::Text => {
-                output.push(token.lexeme.to_string(),
-                            MarkdownTokenType::Text,
-                            token.span.clone());
-            }
+            TokenType::Symbol if token.lexeme == "`" =>
+            {
+                output.push(token.lexeme.to_string(),
+                            MarkdownTokenType::CodeSpan,
+                            token.span.clone());
+            }

-            TokenType::Symbol => {
-                output.push(token.lexeme.to_string(),
-                            MarkdownTokenType::Symbol,
-                            token.span.clone());
-            }
+            TokenType::Text =>
+            {
+                output.push(token.lexeme.to_string(),
+                            MarkdownTokenType::Text,
+                            token.span.clone());
+            }

-            TokenType::Whitespace => {
-                output.push(token.lexeme.to_string(),
-                            MarkdownTokenType::Whitespace,
-                            token.span.clone());
-            }
+            TokenType::Symbol =>
+            {
+                output.push(token.lexeme.to_string(),
+                            MarkdownTokenType::Symbol,
+                            token.span.clone());
+            }

-            TokenType::Newline => {
-                output.push(token.lexeme.to_string(),
-                            MarkdownTokenType::Newline,
-                            token.span.clone());
-            }
+            TokenType::Whitespace =>
+            {
+                output.push(token.lexeme.to_string(),
+                            MarkdownTokenType::Whitespace,
+                            token.span.clone());
+            }

-            _ => {
-                output.push(token.lexeme.to_string(),
-                            MarkdownTokenType::Unknown,
-                            token.span.clone());
-            }
-        }
+            TokenType::Newline =>
+            {
+                output.push(token.lexeme.to_string(),
+                            MarkdownTokenType::Newline,
+                            token.span.clone());
+            }

-        i += 1;
-    }
+            _ =>
+            {
+                output.push(token.lexeme.to_string(),
+                            MarkdownTokenType::Unknown,
+                            token.span.clone());
+            }
+        }

-    output
+        i += 1;
+    }
+
+    output
 }
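Presumably this example's `transform` is fed to the lexer the same way the crate docs show. A minimal driver sketch (the surrounding example file and its `main` are not part of this diff, and the input string is made up) might look like:

```rust
use rune::Lexer;

fn main() {
    // Hypothetical driver: lex a small Markdown string through the transform above.
    let tokens = Lexer::scan_text("# Title with *emphasis* and `code`", transform)
        .expect("lexing should succeed");

    // Each token now carries a MarkdownTokenType variant with its own Display impl.
    for token in &tokens {
        println!("{}", token);
    }
}
```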
src/error.rs: 190 changed lines

@@ -1,4 +1,6 @@
-use std::{error::Error, path::PathBuf};
+use std::error::Error;
+use std::path::PathBuf;

 use super::position::Span;


@@ -12,116 +14,114 @@ use super::position::Span;
 /// It is designed to provide detailed diagnostics for file-based or
 /// in-memory parsing and is compatible with error reporting ecosystems.
 #[derive(Debug)]
-pub struct LexerError {
-    /// A human-readable error message.
-    pub message: String,
+pub struct LexerError
+{
+    /// A human-readable error message.
+    pub message: String,

-    /// The span where the error occurred.
-    pub span: Span,
+    /// The span where the error occurred.
+    pub span: Span,

-    /// The file that the error occurred in, if known.
-    pub file: Option<PathBuf>,
+    /// The file that the error occurred in, if known.
+    pub file: Option<PathBuf>,

-    /// The source snippet related to the error, if known.
-    pub snippet: Option<String>,
+    /// The source snippet related to the error, if known.
+    pub snippet: Option<String>,

-    /// An optional underlying error that caused this one.
-    pub source: Option<Box<dyn Error>>,
+    /// An optional underlying error that caused this one.
+    pub source: Option<Box<dyn Error>>
 }

-impl LexerError {
-    /// Creates a new `LexerError` with a message, span, and optional context.
-    ///
-    /// # Parameters
-    /// - `message`: A human-readable explanation of the error.
-    /// - `span`: The region in the source where the error occurred.
-    /// - `file`: An optional path to the file in which the error occurred.
-    /// - `snippet`: An optional problematic input string.
-    ///
-    /// # Returns
-    /// A new instance of `LexerError`.
-    pub fn new<S, T>(
-        message: S,
-        span: Span,
-        file: Option<T>,
-        snippet: Option<S>,
-    ) -> Self
-    where
-        S: Into<String>,
-        T: Into<PathBuf>,
-    {
-        LexerError {
-            message: message.into(),
-            span,
-            file: file.map(Into::into),
-            snippet: snippet.map(Into::into),
-            source: None,
-        }
-    }
+impl LexerError
+{
+    /// Creates a new `LexerError` with a message, span, and optional context.
+    ///
+    /// # Parameters
+    /// - `message`: A human-readable explanation of the error.
+    /// - `span`: The region in the source where the error occurred.
+    /// - `file`: An optional path to the file in which the error occurred.
+    /// - `snippet`: An optional problematic input string.
+    ///
+    /// # Returns
+    /// A new instance of `LexerError`.
+    pub fn new<S, T>(message: S, span: Span, file: Option<T>,
+                     snippet: Option<S>)
+                     -> Self
+        where S: Into<String>,
+              T: Into<PathBuf>
+    {
+        LexerError { message: message.into(),
+                     span,
+                     file: file.map(Into::into),
+                     snippet: snippet.map(Into::into),
+                     source: None }
+    }

-    /// Creates a `LexerError` from only a message and span.
-    ///
-    /// This is useful when file or snippet context is not available.
-    pub fn from_message<S>(message: S, span: Span) -> Self
-    where
-        S: Into<String>,
-    {
-        Self::new(message, span, None::<PathBuf>, None::<S>)
-    }
+    /// Creates a `LexerError` from only a message and span.
+    ///
+    /// This is useful when file or snippet context is not available.
+    pub fn from_message<S>(message: S, span: Span) -> Self
+        where S: Into<String>
+    {
+        Self::new(message, span, None::<PathBuf>, None::<S>)
+    }

-    /// Attaches a snippet of the offending source code.
-    ///
-    /// This is helpful for diagnostics and tooling output.
-    pub fn with_snippet<S>(mut self, snippet: S) -> Self
-    where
-        S: Into<String>,
-    {
-        self.snippet = Some(snippet.into());
-        self
-    }
+    /// Attaches a snippet of the offending source code.
+    ///
+    /// This is helpful for diagnostics and tooling output.
+    pub fn with_snippet<S>(mut self, snippet: S) -> Self
+        where S: Into<String>
+    {
+        self.snippet = Some(snippet.into());
+        self
+    }

-    /// Attaches the path of the file in which the error occurred.
-    pub fn with_file<T>(mut self, file: T) -> Self
-    where
-        T: Into<PathBuf>,
-    {
-        self.file = Some(file.into());
-        self
-    }
+    /// Attaches the path of the file in which the error occurred.
+    pub fn with_file<T>(mut self, file: T) -> Self
+        where T: Into<PathBuf>
+    {
+        self.file = Some(file.into());
+        self
+    }

-    /// Wraps a source error that caused this `LexerError`.
-    ///
-    /// This allows you to chain errors for more detailed diagnostics.
-    pub fn with_source<E>(mut self, err: E) -> Self
-    where
-        E: Error + 'static,
-    {
-        self.source = Some(Box::new(err));
-        self
-    }
+    /// Wraps a source error that caused this `LexerError`.
+    ///
+    /// This allows you to chain errors for more detailed diagnostics.
+    pub fn with_source<E>(mut self, err: E) -> Self
+        where E: Error + 'static
+    {
+        self.source = Some(Box::new(err));
+        self
+    }
 }

-impl std::fmt::Display for LexerError {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "Lexer error at {}", self.span)?;
+impl std::fmt::Display for LexerError
+{
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result
+    {
+        write!(f, "Lexer error at {}", self.span)?;

-        if let Some(file) = &self.file {
-            write!(f, " in file `{}`", file.display())?;
-        }
+        if let Some(file) = &self.file
+        {
+            write!(f, " in file `{}`", file.display())?;
+        }

-        write!(f, ": {}", self.message)?;
+        write!(f, ": {}", self.message)?;

-        if let Some(snippet) = &self.snippet {
-            write!(f, "\n --> Snippet: `{}`", snippet)?;
-        }
+        if let Some(snippet) = &self.snippet
+        {
+            write!(f, "\n --> Snippet: `{}`", snippet)?;
+        }

-        Ok(())
-    }
+        Ok(())
+    }
 }

-impl Error for LexerError {
-    /// Returns the underlying cause of this error, if any.
-    fn source(&self) -> Option<&(dyn Error + 'static)> {
-        self.source.as_ref().map(|e| e.as_ref())
-    }
+impl Error for LexerError
+{
+    /// Returns the underlying cause of this error, if any.
+    fn source(&self) -> Option<&(dyn Error + 'static)>
+    {
+        self.source.as_ref().map(|e| e.as_ref())
+    }
 }
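As a quick illustration of the builder-style helpers above, here is a hedged sketch; it assumes `LexerError` and `Span` are re-exported at the crate root like the other types, and the file name and snippet strings are made up.

```rust
use rune::{LexerError, Span};

// Build an error from a message and span, then attach optional context.
fn unexpected_symbol() -> LexerError {
    LexerError::from_message("unexpected symbol", Span::default())
        .with_file("notes.md")
        .with_snippet("@@?")
}
```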
src/lexer.rs: 119 changed lines

@@ -7,24 +7,25 @@ use super::token::{TokenStream, TokenType};


-/// The size of data chunks to read from a file. This was arbitrarily chosen to
-/// be 1mb.
+/// The size of data chunks to read from a file. This is an arbitrary choice,
+/// set to 1MB.
 const BUFFER_SIZE: usize = 1024 * 1024;


-/// The `Lexer` is the core component responsible for performing
-/// lexicographical analysis on a text file.
+/// The `Lexer` struct is responsible for performing lexical analysis
+/// (tokenization) on text.
 ///
-/// It reads input from a file character-by-character, generating a stream
-/// of base tokens such as text, numbers, whitespace, symbols, and newlines.
-/// These tokens are accumulated into a `TokenStream`, which is a flat,
-/// cache-friendly data structure.
+/// It processes input from a file or string character-by-character and
+/// generates a stream of tokens, such as text, numbers, whitespace, symbols,
+/// and newlines. These tokens are accumulated into a `TokenStream`, which is a
+/// flat, cache-friendly data structure designed for efficient iteration.
 ///
-/// After tokenization, the lexer applies a user-provided `transform` function
-/// to each token in the stream, allowing consumers of the library to convert
-/// base tokens into richer, domain-specific token types (e.g. Markdown
-/// elements, syntax trees, or custom DSL tokens).
+/// After the base tokens are generated, the `Lexer` allows for transformation
+/// of these tokens into richer, domain-specific types via a user-provided
+/// `transform` function. This transformation can be used to convert base tokens
+/// into specific elements of a Markdown syntax tree, custom DSL tokens, or any
+/// other custom format you need.
 ///
 /// # Example
 ///
@@ -38,32 +39,51 @@ const BUFFER_SIZE: usize = 1024 * 1024;
 ///
 /// let tokens = Lexer::scan_text("Runes += 42", transform).unwrap();
 ///
-/// // The tuple here is from the transform functions return type.
 /// for token in &tokens
 /// {
 ///     println!("{}", token);
 /// }
 /// ```
 ///
-/// # Design Notes
-///
-/// - Uses a flat `TokenStream` to improve iteration performance and reduce heap
+/// # Design Considerations
+/// - Utilizes a flat `TokenStream` to improve performance and reduce heap
 ///   overhead.
-/// - Consolidates contiguous characters into compound tokens (e.g. multi-digit
+/// - Consolidates contiguous characters into compound tokens (e.g., multi-digit
 ///   numbers).
-/// - Easily extensible via the `transform` function to support higher-level
-///   parsing tasks.
+/// - Extensible via the `transform` function, enabling the creation of
+///   higher-level constructs, like Markdown elements or syntax trees for a
+///   custom DSL.
 ///
-/// # Errors
-///
-/// Returns a `LexerError` if the file cannot be opened or read.
+/// # Error Handling
+/// The lexer will return a `LexerError` if the input file cannot be opened or
+/// read. Errors include issues such as missing files, read failures, or invalid
+/// input formats.
 pub enum Lexer {}


 impl Lexer
 {
-    /// Scans a file and produces a vector of transformed tokens.
+    /// Scans a file and generates a vector of transformed tokens based on the
+    /// provided `transform` function.
+    ///
+    /// This method opens a file from the given `path`, reads the file line by
+    /// line, and converts the input into a stream of tokens. The tokens are
+    /// then passed to the `transform` function, which allows users to map
+    /// base tokens into domain-specific types.
+    ///
+    /// # Parameters
+    /// - `path`: A path to the file to be lexically analyzed.
+    /// - `transform`: A function that takes a `TokenStream<TokenType>` and
+    ///   transforms it into a `TokenStream<T>` where `T` is a domain-specific
+    ///   type.
+    ///
+    /// # Returns
+    /// A `Result<TokenStream<T>, LexerError>` where `T` is the transformed token
+    /// type, or an error.
+    ///
+    /// # Errors
+    /// Returns a `LexerError` if the file cannot be opened or read.
     pub fn scan_file<P, F, T>(path: P, transform: F)
                               -> Result<TokenStream<T>, LexerError>
         where P: AsRef<std::path::Path>,
@@ -82,6 +102,7 @@ impl Lexer

         let reader = BufReader::with_capacity(BUFFER_SIZE, input_file);

+        // Read the file line by line.
         for line in reader.lines()
         {
             match line
@@ -93,7 +114,7 @@ impl Lexer
                 Err(_) =>
                 {
                     return Err(LexerError::new("Unable to read line during \
-                                                Lexigraphical Analysis.",
+                                                Lexical Analysis.",
                                                Span::default(),
                                                Some(path.as_ref()
                                                         .to_string_lossy()
@@ -102,10 +123,10 @@ impl Lexer
                 }
             }

             // Add the newline token after each line.
             stream.push("\n".to_string(),
                         TokenType::Newline,
                         Span::with_single(cursor));

             cursor.line += 1;
             cursor.column = 0;
         }
@@ -113,7 +134,22 @@ impl Lexer
         Ok(transform(&stream))
     }

-    /// Scans a full in-memory string and returns transformed tokens.
+    /// Scans a full in-memory string and produces transformed tokens.
+    ///
+    /// This method tokenizes the input string `text` and returns the transformed
+    /// tokens using the provided `transform` function. It's a convenient way
+    /// to perform lexical analysis on in-memory strings without needing to
+    /// read from a file.
+    ///
+    /// # Parameters
+    /// - `text`: A string slice representing the in-memory input text to
+    ///   analyze.
+    /// - `transform`: A function that transforms the base tokens into
+    ///   domain-specific types.
+    ///
+    /// # Returns
+    /// A `Result<TokenStream<T>, LexerError>` where `T` is the transformed token
+    /// type, or an error.
     pub fn scan_text<F, T>(text: &str, transform: F)
                            -> Result<TokenStream<T>, LexerError>
         where F: FnOnce(&TokenStream<TokenType>) -> TokenStream<T>
@@ -121,14 +157,15 @@ impl Lexer
         let mut cursor = Position::default();
         let mut stream = TokenStream::new();

+        // Process each line in the input string.
         for line in text.lines()
         {
             Self::scan(line, &mut stream, &mut cursor);

             // Add the newline token after each line.
             stream.push("\n".to_string(),
                         TokenType::Newline,
                         Span::with_single(cursor));

             cursor.line += 1;
             cursor.column = 0;
         }
@@ -136,15 +173,28 @@ impl Lexer
         Ok(transform(&stream))
     }

-    /// Internal: scans a single line of text into tokens.
+    /// Internal method that scans a single line of text into tokens.
+    ///
+    /// This method processes each character of a line and generates the
+    /// corresponding token. It handles cases like numeric tokens, text
+    /// tokens, symbols, and whitespace.
+    ///
+    /// # Parameters
+    /// - `line`: A line of text to be lexically analyzed.
+    /// - `stream`: A mutable reference to the token stream where the generated
+    ///   tokens will be pushed.
+    /// - `cursor`: A mutable reference to the cursor position, which tracks the
+    ///   current position in the input.
     fn scan(line: &str, stream: &mut TokenStream<TokenType>,
             cursor: &mut Position)
     {
         for c in line.chars()
         {
+            // Get the token type based on the character.
             let variant = get_token_type(c);
             let last = stream.len().saturating_sub(1);

+            // Handle token merging for contiguous tokens like numbers or text.
             if !stream.is_empty() &&
                variant == stream.variants[last] &&
                (variant == TokenType::Numeric || variant == TokenType::Text)
@@ -154,6 +204,7 @@ impl Lexer
             }
             else
             {
+                // Add a new token to the stream.
                 stream.push(c.to_string(), variant, Span::with_single(*cursor));
             }

@@ -164,6 +215,18 @@ impl Lexer


+/// Determines the type of a token based on the current character.
+///
+/// This helper function is responsible for identifying whether the current
+/// character is part of a known token type such as numeric, text, whitespace,
+/// or symbol.
+///
+/// # Parameters
+/// - `curr_char`: The current character to analyze.
+///
+/// # Returns
+/// A `TokenType` corresponding to the character type (e.g., `Numeric`, `Text`,
+/// `Whitespace`, etc.).
 fn get_token_type(curr_char: char) -> TokenType
 {
     match curr_char
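For completeness, `scan_file` documented above is called the same way as `scan_text`, just with a path. A hedged sketch under the same API assumptions as the earlier example (the file name is illustrative, and the inline closure is a trivial stand-in transform):

```rust
use rune::{Lexer, TokenStream, TokenType};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Same shape as scan_text, but reading the input from a file path.
    let tokens: TokenStream<TokenType> =
        Lexer::scan_file("README.md", |s: &TokenStream<TokenType>| {
            // Trivial transform: rebuild the stream with the same base tokens.
            let mut out = TokenStream::new();
            let mut i = 0;
            while i < s.len() {
                let t = s.get(i).unwrap();
                out.push(t.lexeme.to_string(), *t.variant, t.span.clone());
                i += 1;
            }
            out
        })?;

    println!("lexed {} tokens", tokens.len());
    Ok(())
}
```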
@@ -59,7 +59,9 @@ fn test_basic_lexing()
         Lexer::scan_text("magic runes", dummy_transform).expect("Lexer should \
                                                                  succeed");

-    let tokens = tokens.into_iter().map(|t| { (*t.variant, String::from(t.lexeme))}).collect::<Vec<_>>();
+    let tokens = tokens.into_iter()
+                       .map(|t| (*t.variant, String::from(t.lexeme)))
+                       .collect::<Vec<_>>();

     let expected = vec![(TokenType::Text, "magic".to_string()),
                         (TokenType::Whitespace, " ".to_string()),
@@ -77,7 +79,9 @@ fn test_symbols_and_numbers()
         Lexer::scan_text("13 + 37", dummy_transform).expect("Lexer should \
                                                              succeed");

-    let tokens = tokens.into_iter().map(|t| { (*t.variant, String::from(t.lexeme))}).collect::<Vec<_>>();
+    let tokens = tokens.into_iter()
+                       .map(|t| (*t.variant, String::from(t.lexeme)))
+                       .collect::<Vec<_>>();

     let expected = vec![(TokenType::Numeric, "13".into()),
                         (TokenType::Whitespace, " ".into()),
@@ -129,16 +133,16 @@ fn test_lexer_with_cases()
                                                      on case '{}'",
                                                     case.name));

-    let result = result.into_iter().map(|t| { (*t.variant, String::from(t.lexeme))}).collect::<Vec<_>>();
+    let result = result.into_iter()
+                       .map(|t| (*t.variant, String::from(t.lexeme)))
+                       .collect::<Vec<_>>();

     let expected = case.expected
                        .iter()
                        .map(|(ty, s)| (*ty, s.to_string()))
                        .collect::<Vec<_>>();

-    assert_eq!(result, expected,
-               "Mismatch in test case '{}'",
-               case.name);
+    assert_eq!(result, expected, "Mismatch in test case '{}'", case.name);

     cleanup_temp_file(&path);
 }