use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::Path;

use super::error::LexerError;
use super::position::{Position, Span};
use super::token::{TokenStream, TokenType};

/// The size of data chunks to read from a file. This was arbitrarily chosen
/// to be 1 MiB.
const BUFFER_SIZE: usize = 1024 * 1024;

/// The `Lexer` is the core component responsible for performing lexical
/// analysis on a text file.
///
/// It reads input from a file character-by-character, generating a stream
/// of base tokens such as text, numbers, whitespace, symbols, and newlines.
/// These tokens are accumulated into a `TokenStream`, which is a flat,
/// cache-friendly data structure.
///
/// After tokenization, the lexer applies a user-provided `transform` function
/// to each token in the stream, allowing consumers of the library to convert
/// base tokens into richer, domain-specific token types (e.g. Markdown
/// elements, syntax trees, or custom DSL tokens).
///
/// # Example
///
/// ```rust
/// use rune::{Lexer, TokenStream, TokenType};
///
/// fn transform(tokens: &TokenStream) -> TokenStream
/// {
///     tokens.clone()
/// }
///
/// let tokens = Lexer::scan_text("Runes += 42", transform).unwrap();
///
/// // The items yielded here are determined by the transform function's
/// // return type.
/// for token in &tokens
/// {
///     println!("{}", token);
/// }
/// ```
///
/// # Design Notes
///
/// - Uses a flat `TokenStream` to improve iteration performance and reduce
///   heap overhead.
/// - Consolidates contiguous characters into compound tokens (e.g.
///   multi-digit numbers).
/// - Easily extensible via the `transform` function to support higher-level
///   parsing tasks.
///
/// # Errors
///
/// Returns a `LexerError` if the file cannot be opened or read.
pub enum Lexer {}

impl Lexer
{
    /// Scans a file and produces a transformed `TokenStream`.
    pub fn scan_file<P, F>(path: P, transform: F) -> Result<TokenStream, LexerError>
    where
        P: AsRef<Path>,
        F: FnOnce(&TokenStream) -> TokenStream
    {
        let mut cursor = Position::default();
        let mut stream = TokenStream::new();

        let input_file = File::open(&path).map_err(|_error| {
            LexerError::new(
                "Unable to open file for lexical analysis.",
                Span::default(),
                Some(path.as_ref().to_string_lossy().to_string()),
                None,
            )
        })?;

        let reader = BufReader::with_capacity(BUFFER_SIZE, input_file);

        for line in reader.lines()
        {
            match line
            {
                Ok(text) =>
                {
                    Self::scan(&text, &mut stream, &mut cursor);
                }
                Err(_) =>
                {
                    return Err(LexerError::new(
                        "Unable to read line during lexical analysis.",
                        Span::default(),
                        Some(path.as_ref().to_string_lossy().to_string()),
                        None,
                    ));
                }
            }

            stream.push("\n".to_string(), TokenType::Newline, Span::with_single(cursor));
            cursor.line += 1;
            cursor.column = 0;
        }

        Ok(transform(&stream))
    }

    /// Scans a full in-memory string and returns the transformed `TokenStream`.
    pub fn scan_text<F>(text: &str, transform: F) -> Result<TokenStream, LexerError>
    where
        F: FnOnce(&TokenStream) -> TokenStream
    {
        let mut cursor = Position::default();
        let mut stream = TokenStream::new();

        for line in text.lines()
        {
            Self::scan(line, &mut stream, &mut cursor);

            stream.push("\n".to_string(), TokenType::Newline, Span::with_single(cursor));
            cursor.line += 1;
            cursor.column = 0;
        }

        Ok(transform(&stream))
    }

    /// Internal: scans a single line of text into tokens.
    fn scan(line: &str, stream: &mut TokenStream, cursor: &mut Position)
    {
        for c in line.chars()
        {
            let variant = get_token_type(c);
            let last = stream.len().saturating_sub(1);

            // Merge contiguous numeric or text characters into the previous
            // token, so e.g. "42" becomes a single Numeric token.
            if !stream.is_empty()
                && variant == stream.variants[last]
                && (variant == TokenType::Numeric || variant == TokenType::Text)
            {
                stream.lexemes[last].push(c);
                stream.locations[last].end = *cursor;
            }
            else
            {
                stream.push(c.to_string(), variant, Span::with_single(*cursor));
            }

            cursor.column += 1;
        }
    }
}

/// Maps a single character to its base `TokenType`.
fn get_token_type(curr_char: char) -> TokenType
{
    match curr_char
    {
        '\n' => TokenType::Newline,
        c if c.is_whitespace() => TokenType::Whitespace,
        c if c.is_numeric() => TokenType::Numeric,
        c if c.is_alphabetic() => TokenType::Text,
        c if !c.is_whitespace() && !c.is_alphanumeric() => TokenType::Symbol,
        _ => TokenType::Unknown
    }
}
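
// A minimal test sketch illustrating the consolidation behaviour described in
// the design notes: contiguous alphabetic and numeric characters are merged
// into compound `Text` and `Numeric` tokens, while symbols and whitespace stay
// as single-character tokens. It assumes the `TokenStream` fields (`lexemes`,
// `variants`) are visible here the same way they are to `scan` above; the
// module and test names are illustrative only.
#[cfg(test)]
mod tests
{
    use super::*;

    #[test]
    fn scan_text_consolidates_contiguous_characters()
    {
        let stream = Lexer::scan_text("Runes += 42", |tokens| tokens.clone()).unwrap();

        // Expected tokens: "Runes", " ", "+", "=", " ", "42", "\n".
        assert!(stream.len() == 7);
        assert!(stream.lexemes[0] == "Runes");
        assert!(stream.variants[0] == TokenType::Text);
        assert!(stream.lexemes[5] == "42");
        assert!(stream.variants[5] == TokenType::Numeric);
        assert!(stream.variants[6] == TokenType::Newline);
    }
}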