use std::fs::File;
use std::io::{BufRead, BufReader};

use super::error::LexerError;
use super::position::{Position, Span};
use super::token::{TokenStream, TokenType};


/// The size of data chunks to read from a file. This is an arbitrary choice,
/// set to 1MB.
const BUFFER_SIZE: usize = 1024 * 1024;


/// The `Lexer` type is responsible for performing lexical analysis
/// (tokenization) on text.
///
/// It processes input from a file or string character-by-character and
/// generates a stream of tokens, such as text, numbers, whitespace, symbols,
/// and newlines. These tokens are accumulated into a `TokenStream`, which is a
/// flat, cache-friendly data structure designed for efficient iteration.
///
/// After the base tokens are generated, the `Lexer` allows for transformation
/// of these tokens into richer, domain-specific types via a user-provided
/// `transform` function. This transformation can be used to convert base tokens
/// into specific elements of a Markdown syntax tree, custom DSL tokens, or any
/// other custom format you need.
///
/// # Example
///
/// ```rust
/// use rune::{Lexer, TokenStream, TokenType};
///
/// fn transform(tokens: &TokenStream<TokenType>) -> TokenStream<TokenType>
/// {
///     tokens.clone()
/// }
///
/// let tokens = Lexer::scan_text("Runes += 42", transform).unwrap();
///
/// for token in &tokens
/// {
///     println!("{}", token);
/// }
/// ```
///
/// # Design Considerations
/// - Utilizes a flat `TokenStream` to improve performance and reduce heap
///   overhead.
/// - Consolidates contiguous characters into compound tokens (e.g., multi-digit
///   numbers).
/// - Extensible via the `transform` function, enabling the creation of
///   higher-level constructs, like Markdown elements or syntax trees for a
///   custom DSL.
///
/// # Error Handling
/// The lexer will return a `LexerError` if the input file cannot be opened or
/// read. Errors include issues such as missing files, read failures, or invalid
/// input formats.
pub enum Lexer {}


impl Lexer
{
    /// Scans a file and generates a stream of transformed tokens based on the
    /// provided `transform` function.
    ///
    /// This method opens a file from the given `path`, reads the file line by
    /// line, and converts the input into a stream of tokens. The tokens are
    /// then passed to the `transform` function, which allows users to map
    /// base tokens into domain-specific types.
    ///
    /// # Parameters
    /// - `path`: A path to the file to be lexically analyzed.
    /// - `transform`: A function that takes a `TokenStream<TokenType>` and
    ///   transforms it into a `TokenStream<T>`, where `T` is a domain-specific
    ///   type.
    ///
    /// # Returns
    /// The transformed `TokenStream<T>` on success, or a `LexerError` on
    /// failure.
    ///
    /// # Errors
    /// Returns a `LexerError` if the file cannot be opened or read.
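    ///
    /// # Example
    /// A minimal sketch; the `rune` paths mirror the type-level example above,
    /// and `notes.md` is an illustrative file name, not a file this crate
    /// provides:
    ///
    /// ```rust,no_run
    /// use rune::{Lexer, TokenStream, TokenType};
    ///
    /// let tokens = Lexer::scan_file("notes.md", |t: &TokenStream<TokenType>| t.clone())
    ///     .expect("the file should open and be readable");
    /// ```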
    pub fn scan_file<P, F, T>(path: P, transform: F)
        -> Result<TokenStream<T>, LexerError>
        where P: AsRef<std::path::Path>,
              F: FnOnce(&TokenStream<TokenType>) -> TokenStream<T>
    {
        let mut cursor = Position::default();
        let mut stream = TokenStream::new();

        let input_file = File::open(&path).map_err(|err| {
            LexerError::new(
                "Unable to open file for Lexical Analysis.",
                Span::default(),
                Some(path.as_ref().to_path_buf()),
                None).with_source(err)
        })?;

        let reader = BufReader::with_capacity(BUFFER_SIZE, input_file);

        // Read the file line by line.
        for line in reader.lines()
        {
            match line
            {
                Ok(text) =>
                {
                    Self::scan(&text, &mut stream, &mut cursor);
                }
                Err(_) =>
                {
                    return Err(LexerError::new(
                        "Unable to read line during Lexical Analysis.",
                        Span::default(),
                        Some(path.as_ref().to_string_lossy().to_string()),
                        None));
                }
            }

            // Add the newline token after each line.
            stream.push("\n".to_string(),
                        TokenType::Newline,
                        Span::with_single(cursor));
            cursor.line += 1;
            cursor.column = 0;
        }

        Ok(transform(&stream))
    }

    /// Scans a full in-memory string and produces transformed tokens.
    ///
    /// This method tokenizes the input string `text` and returns the transformed
    /// tokens using the provided `transform` function. It's a convenient way
    /// to perform lexical analysis on in-memory strings without needing to
    /// read from a file.
    ///
    /// # Parameters
    /// - `text`: A string slice representing the in-memory input text to
    ///   analyze.
    /// - `transform`: A function that transforms the base tokens into
    ///   domain-specific types.
    ///
    /// # Returns
    /// The transformed `TokenStream<T>` on success, or a `LexerError` on
    /// failure.
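    ///
    /// # Example
    /// A minimal sketch mirroring the type-level example above (the `rune`
    /// paths are assumed from that example); contiguous letters and digits
    /// come back as single `Text` and `Numeric` tokens:
    ///
    /// ```rust
    /// use rune::{Lexer, TokenStream, TokenType};
    ///
    /// let tokens = Lexer::scan_text("x = 10", |t: &TokenStream<TokenType>| t.clone()).unwrap();
    /// assert_eq!(tokens.len(), 5); // "x", " ", "=", " ", "10"
    /// ```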
    pub fn scan_text<F, T>(text: &str, transform: F)
        -> Result<TokenStream<T>, LexerError>
        where F: FnOnce(&TokenStream<TokenType>) -> TokenStream<T>
    {
        let mut cursor = Position::default();
        let mut stream = TokenStream::new();

        // Process each line in the input string.
        for line in text.lines()
        {
            Self::scan(line, &mut stream, &mut cursor);

            // Add the newline token after each line.
            stream.push("\n".to_string(),
                        TokenType::Newline,
                        Span::with_single(cursor));
            cursor.line += 1;
            cursor.column = 0;
        }

        // Remove the trailing newline token if the text did not end with a
        // newline.
        if !text.ends_with('\n')
        {
            stream.pop();
        }

        Ok(transform(&stream))
    }

    /// Internal method that scans a single line of text into tokens.
    ///
    /// This method processes each character of a line and generates the
    /// corresponding token. It handles cases like numeric tokens, text
    /// tokens, symbols, and whitespace.
    ///
    /// # Parameters
    /// - `line`: A line of text to be lexically analyzed.
    /// - `stream`: A mutable reference to the token stream where the generated
    ///   tokens will be pushed.
    /// - `cursor`: A mutable reference to the cursor position, which tracks the
    ///   current position in the input.
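    ///
    /// For example, scanning the line `"ab 12"` appends the lexemes
    /// `["ab", " ", "12"]` to `stream`, merging the contiguous letters and
    /// digits into single `Text` and `Numeric` tokens.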
    fn scan(line: &str, stream: &mut TokenStream<TokenType>,
            cursor: &mut Position)
    {
        for c in line.chars()
        {
            // Get the token type based on the character.
            let variant = get_token_type(c);
            let last = stream.len().saturating_sub(1);

            // Handle token merging for contiguous tokens like numbers or text.
            if !stream.is_empty() &&
               variant == stream.variants[last] &&
               (variant == TokenType::Numeric || variant == TokenType::Text)
            {
                stream.lexemes[last].push(c);
                stream.locations[last].end = *cursor;
            }
            else
            {
                // Add a new token to the stream.
                stream.push(c.to_string(), variant, Span::with_single(*cursor));
            }

            cursor.column += 1;
        }
    }
}


/// Determines the type of a token based on the current character.
///
/// This helper function is responsible for identifying whether the current
/// character is part of a known token type such as numeric, text, whitespace,
/// or symbol.
///
/// # Parameters
/// - `curr_char`: The current character to analyze.
///
/// # Returns
/// A `TokenType` corresponding to the character type (e.g., `Numeric`, `Text`,
/// `Whitespace`, etc.).
fn get_token_type(curr_char: char) -> TokenType
{
    match curr_char
    {
        '\n' => TokenType::Newline,
        c if c.is_whitespace() => TokenType::Whitespace,
        c if c.is_numeric() => TokenType::Numeric,
        c if c.is_alphabetic() => TokenType::Text,
        c if !c.is_whitespace() && !c.is_alphanumeric() => TokenType::Symbol,
        _ => TokenType::Unknown
    }
}
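

// A minimal test sketch for the behaviour above. It assumes `TokenStream`'s
// `lexemes` and `variants` fields (already used by `Lexer::scan`) are plain
// indexable vectors of `String`s and `TokenType`s visible to this child
// module, and that `LexerError` implements `Debug`, as the doc example's
// `unwrap()` implies.
#[cfg(test)]
mod tests
{
    use super::*;

    #[test]
    fn classifies_single_characters()
    {
        assert!(get_token_type('7') == TokenType::Numeric);
        assert!(get_token_type('a') == TokenType::Text);
        assert!(get_token_type(' ') == TokenType::Whitespace);
        assert!(get_token_type('+') == TokenType::Symbol);
    }

    #[test]
    fn merges_contiguous_text_and_digits()
    {
        let tokens = Lexer::scan_text("Runes += 42",
                                      |t: &TokenStream<TokenType>| t.clone())
            .expect("in-memory scanning should not fail");

        // "Runes" and "42" are merged into single tokens; the two symbols stay
        // separate, and the synthetic trailing newline token is popped because
        // the input does not end with '\n'.
        assert_eq!(tokens.lexemes, vec!["Runes", " ", "+", "=", " ", "42"]);
        assert!(tokens.variants[0] == TokenType::Text);
        assert!(tokens.variants[5] == TokenType::Numeric);
    }
}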