use std::fs::File;
use std::io::{BufRead, BufReader};

use super::error::LexerError;
use super::position::{Position, Span};
use super::token::{TokenStream, TokenType};

/// The size of the data chunks read from a file. This was arbitrarily chosen
/// to be 1 MiB.
const BUFFER_SIZE: usize = 1024 * 1024;

/// The `Lexer` is the core component responsible for performing
/// lexical analysis on a text file.
///
/// It reads input from a file character by character, generating a stream
/// of base tokens such as text, numbers, whitespace, symbols, and newlines.
/// These tokens are accumulated into a `TokenStream`, which is a flat,
/// cache-friendly data structure.
///
/// After tokenization, the lexer applies a user-provided `transform` function
/// to each token in the stream, allowing consumers of the library to convert
/// base tokens into richer, domain-specific token types (e.g. Markdown
/// elements, syntax trees, or custom DSL tokens).
///
/// # Example
///
/// ```rust
/// use rune::{Lexer, TokenStream, TokenType};
///
/// fn transform(tokens: &TokenStream) -> Vec<(TokenType, String)>
/// {
///     let mut new_tokens = Vec::new();
///
///     for token in tokens
///     {
///         new_tokens.push((*token.variant, token.lexeme.to_string()));
///     }
///
///     new_tokens
/// }
///
/// let tokens = Lexer::scan_text("Runes += 42", transform).unwrap();
///
/// // The tuple here comes from the transform function's return type.
/// for (ty, lexeme) in tokens
/// {
///     println!("{:?}: {:?}", ty, lexeme);
/// }
/// ```
///
/// # Design Notes
///
/// - Uses a flat `TokenStream` to improve iteration performance and reduce heap
///   overhead.
/// - Consolidates contiguous characters into compound tokens (e.g. multi-digit
///   numbers).
/// - Easily extensible via the `transform` function to support higher-level
///   parsing tasks.
///
/// # Errors
///
/// Returns a `LexerError` if the file cannot be opened or read.
pub enum Lexer {}

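// NOTE: `Lexer` is an empty enum, so it can never be instantiated; all of its
// functionality is exposed through the associated scanning functions below.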
impl Lexer
{
    /// Scans a file and produces a vector of transformed tokens.
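    ///
    /// # Example
    ///
    /// A minimal sketch: `notes.txt` is a placeholder path, and the closure
    /// simply collects every lexeme in the file.
    ///
    /// ```no_run
    /// use rune::Lexer;
    ///
    /// let lexemes = Lexer::scan_file("notes.txt", |tokens| {
    ///     let mut lexemes = Vec::new();
    ///
    ///     for token in tokens
    ///     {
    ///         lexemes.push(token.lexeme.to_string());
    ///     }
    ///
    ///     lexemes
    /// }).unwrap();
    ///
    /// println!("{} lexemes scanned", lexemes.len());
    /// ```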
    pub fn scan_file<P, F, T>(path: P, transform: F)
        -> Result<Vec<T>, LexerError>
        where P: AsRef<std::path::Path>,
              F: FnOnce(&TokenStream) -> Vec<T>
    {
        let mut cursor = Position::default();
        let mut stream = TokenStream::new();

        let input_file = File::open(&path).map_err(|_error| {
            LexerError::new(
                "Unable to open file for Lexical Analysis.",
                Span::default(),
                Some(path.as_ref().to_string_lossy().to_string()),
                None,
            )
        })?;

        let reader = BufReader::with_capacity(BUFFER_SIZE, input_file);

        for line in reader.lines()
        {
            match line
            {
                Ok(text) =>
                {
                    Self::scan(&text, &mut stream, &mut cursor);
                }
                Err(_) =>
                {
                    return Err(LexerError::new("Unable to read line during \
                                                Lexical Analysis.",
                                               Span::default(),
                                               Some(path.as_ref()
                                                        .to_string_lossy()
                                                        .to_string()),
                                               None));
                }
            }

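            // `BufRead::lines()` strips the line terminator, so re-insert an
            // explicit `Newline` token and advance the cursor to the next line.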
            stream.push("\n".to_string(),
                        TokenType::Newline,
                        Span::with_single(cursor));

            cursor.line += 1;
            cursor.column = 0;
        }

        Ok(transform(&stream))
    }

    /// Scans a full in-memory string and returns transformed tokens.
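    ///
    /// # Example
    ///
    /// A minimal sketch of a transform that keeps only the lexemes of
    /// non-whitespace tokens; the `words` helper is illustrative, not part of
    /// the library.
    ///
    /// ```rust
    /// use rune::{Lexer, TokenStream, TokenType};
    ///
    /// fn words(tokens: &TokenStream) -> Vec<String>
    /// {
    ///     let mut words = Vec::new();
    ///
    ///     for token in tokens
    ///     {
    ///         if *token.variant != TokenType::Whitespace &&
    ///            *token.variant != TokenType::Newline
    ///         {
    ///             words.push(token.lexeme.to_string());
    ///         }
    ///     }
    ///
    ///     words
    /// }
    ///
    /// let words = Lexer::scan_text("Runes += 42", words).unwrap();
    /// println!("{:?}", words); // e.g. ["Runes", "+", "=", "42"]
    /// ```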
    pub fn scan_text<F, T>(text: &str, transform: F)
        -> Result<Vec<T>, LexerError>
        where F: FnOnce(&TokenStream) -> Vec<T>
    {
        let mut cursor = Position::default();
        let mut stream = TokenStream::new();

        for line in text.lines()
        {
            Self::scan(line, &mut stream, &mut cursor);

            stream.push("\n".to_string(),
                        TokenType::Newline,
                        Span::with_single(cursor));

            cursor.line += 1;
            cursor.column = 0;
        }

        Ok(transform(&stream))
    }

    /// Internal: scans a single line of text into tokens.
    fn scan(line: &str, stream: &mut TokenStream, cursor: &mut Position)
    {
        for c in line.chars()
        {
            let variant = get_token_type(c);
            let last = stream.len().saturating_sub(1);

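            // Contiguous `Text` or `Numeric` characters are folded into the
            // previous token, so e.g. "42" becomes one compound `Numeric`
            // token instead of two single-digit tokens.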
            if !stream.is_empty() &&
               variant == stream.variants[last] &&
               (variant == TokenType::Numeric || variant == TokenType::Text)
            {
                stream.lexemes[last].push(c);
                stream.locations[last].end = *cursor;
            }
            else
            {
                stream.push(c.to_string(), variant, Span::with_single(*cursor));
            }

            cursor.column += 1;
        }
    }
}

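/// Classifies a single character into its base `TokenType`.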
fn get_token_type(curr_char: char) -> TokenType
{
    match curr_char
    {
        '\n' => TokenType::Newline,
        c if c.is_whitespace() => TokenType::Whitespace,
        c if c.is_numeric() => TokenType::Numeric,
        c if c.is_alphabetic() => TokenType::Text,
        c if !c.is_whitespace() && !c.is_alphanumeric() => TokenType::Symbol,
        _ => TokenType::Unknown
    }
}