rune/src/lexer.rs

use std::fs::File;
use std::io::{BufRead, BufReader};
use super::error::LexerError;
use super::position::{Position, Span};
use super::token::{TokenStream, TokenType};

/// The size of data chunks to read from a file. This was arbitrarily chosen
/// to be 1 MiB.
const BUFFER_SIZE: usize = 1024 * 1024;

/// The `Lexer` is the core component responsible for performing lexical
/// analysis on text input.
///
/// It reads input from a file or string character by character, generating a
/// stream of base tokens such as text, numbers, whitespace, symbols, and
/// newlines. These tokens are accumulated into a `TokenStream`, which is a
/// flat, cache-friendly data structure.
///
/// After tokenization, the lexer applies a user-provided `transform` function
/// to each token in the stream, allowing consumers of the library to convert
/// base tokens into richer, domain-specific token types (e.g. Markdown
/// elements, syntax trees, or custom DSL tokens).
///
/// # Example
///
/// ```rust
/// use rune::{Lexer, TokenStream, TokenType};
///
/// fn transform(tokens: &TokenStream) -> Vec<(TokenType, String)>
/// {
///     let mut new_tokens = Vec::new();
///
///     for token in tokens
///     {
///         new_tokens.push((*token.variant, token.lexeme.to_string()));
///     }
///
///     new_tokens
/// }
///
/// let tokens = Lexer::scan_text("Runes += 42", transform).unwrap();
///
/// // The tuple here comes from the transform function's return type.
/// for (ty, lexeme) in tokens
/// {
///     println!("{:?}: {:?}", ty, lexeme);
/// }
/// ```
///
/// # Design Notes
///
/// - Uses a flat `TokenStream` to improve iteration performance and reduce
///   heap overhead.
/// - Consolidates contiguous characters into compound tokens (e.g.
///   multi-digit numbers).
/// - Easily extensible via the `transform` function to support higher-level
///   parsing tasks.
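///
/// For example, scanning the text `"abc 123"` yields a `Text` token `"abc"`,
/// a `Whitespace` token, a `Numeric` token `"123"`, and a trailing `Newline`,
/// rather than one token per character.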
///
/// # Errors
///
/// Returns a `LexerError` if the file cannot be opened or read.
pub enum Lexer {}

impl Lexer
{
    /// Scans a file and produces a vector of transformed tokens.
    pub fn scan_file<P, F, T>(path: P, transform: F)
        -> Result<Vec<T>, LexerError>
        where P: AsRef<std::path::Path>,
              F: FnOnce(&TokenStream) -> Vec<T>
    {
        let mut cursor = Position::default();
        let mut stream = TokenStream::new();

        let input_file = File::open(&path).map_err(|_error| {
            LexerError::new(
                "Unable to open file for lexical analysis.",
                Span::default(),
                Some(path.as_ref().to_string_lossy().to_string()),
                None,
            )
        })?;

        let reader = BufReader::with_capacity(BUFFER_SIZE, input_file);

        for line in reader.lines()
        {
            match line
            {
                Ok(text) =>
                {
                    Self::scan(&text, &mut stream, &mut cursor);
                }
                Err(_) =>
                {
                    return Err(LexerError::new(
                        "Unable to read line during lexical analysis.",
                        Span::default(),
                        Some(path.as_ref().to_string_lossy().to_string()),
                        None,
                    ));
                }
            }

            // `lines()` strips the line terminator, so emit an explicit
            // newline token and move the cursor to the start of the next line.
            stream.push("\n".to_string(),
                        TokenType::Newline,
                        Span::with_single(cursor));
            cursor.line += 1;
            cursor.column = 0;
        }

        Ok(transform(&stream))
    }

    /// Scans a full in-memory string and returns transformed tokens.
    pub fn scan_text<F, T>(text: &str, transform: F)
        -> Result<Vec<T>, LexerError>
        where F: FnOnce(&TokenStream) -> Vec<T>
    {
        let mut cursor = Position::default();
        let mut stream = TokenStream::new();

        for line in text.lines()
        {
            Self::scan(line, &mut stream, &mut cursor);

            stream.push("\n".to_string(),
                        TokenType::Newline,
                        Span::with_single(cursor));
            cursor.line += 1;
            cursor.column = 0;
        }

        Ok(transform(&stream))
    }

    /// Internal: scans a single line of text into tokens.
    fn scan(line: &str, stream: &mut TokenStream, cursor: &mut Position)
    {
        for c in line.chars()
        {
            let variant = get_token_type(c);
            let last = stream.len().saturating_sub(1);

            // Contiguous alphabetic or numeric characters are appended to the
            // previous token so that words and multi-digit numbers become a
            // single compound token.
            if !stream.is_empty() &&
               variant == stream.variants[last] &&
               (variant == TokenType::Numeric || variant == TokenType::Text)
            {
                stream.lexemes[last].push(c);
                stream.locations[last].end = *cursor;
            }
            else
            {
                stream.push(c.to_string(), variant, Span::with_single(*cursor));
            }

            cursor.column += 1;
        }
    }
}

/// Maps a single character to its base `TokenType`.
fn get_token_type(curr_char: char) -> TokenType
{
    match curr_char
    {
        '\n' => TokenType::Newline,
        c if c.is_whitespace() => TokenType::Whitespace,
        c if c.is_numeric() => TokenType::Numeric,
        c if c.is_alphabetic() => TokenType::Text,
        c if !c.is_whitespace() && !c.is_alphanumeric() => TokenType::Symbol,
        _ => TokenType::Unknown,
    }
}
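
#[cfg(test)]
mod tests
{
    use super::*;

    // A minimal sketch of the compound-token behaviour documented on `Lexer`.
    // It assumes `TokenType` implements `Clone`, `PartialEq`, and `Debug`,
    // which the comparison in `scan` and the doc example above already rely
    // on.
    #[test]
    fn contiguous_text_and_digits_are_consolidated()
    {
        let variants =
            Lexer::scan_text("Runes += 42", |stream| stream.variants.clone())
                .unwrap();

        // Expected stream: "Runes", " ", "+", "=", " ", "42", "\n".
        assert_eq!(variants,
                   vec![TokenType::Text,
                        TokenType::Whitespace,
                        TokenType::Symbol,
                        TokenType::Symbol,
                        TokenType::Whitespace,
                        TokenType::Numeric,
                        TokenType::Newline]);
    }
}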