use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::Path;

use super::error::LexerError;
use super::position::{Position, Span};
use super::token::{TokenStream, TokenType};

/// The size of data chunks to read from a file. This is an arbitrary choice,
/// set to 1MB.
const BUFFER_SIZE: usize = 1024 * 1024;

/// The `Lexer` struct is responsible for performing lexical analysis
/// (tokenization) on text.
///
/// It processes input from a file or string character-by-character and
/// generates a stream of tokens, such as text, numbers, whitespace, symbols,
/// and newlines. These tokens are accumulated into a `TokenStream`, which is a
/// flat, cache-friendly data structure designed for efficient iteration.
///
/// After the base tokens are generated, the `Lexer` allows for transformation
/// of these tokens into richer, domain-specific types via a user-provided
/// `transform` function. This transformation can be used to convert base tokens
/// into specific elements of a Markdown syntax tree, custom DSL tokens, or any
/// other custom format you need.
///
/// # Example
///
/// ```rust
/// use rune::{Lexer, TokenStream, TokenType};
///
/// fn transform(tokens: &TokenStream) -> TokenStream
/// {
///     tokens.clone()
/// }
///
/// let tokens = Lexer::scan_text("Runes += 42", transform).unwrap();
///
/// for token in &tokens
/// {
///     println!("{}", token);
/// }
/// ```
///
/// # Design Considerations
/// - Utilizes a flat `TokenStream` to improve performance and reduce heap
///   overhead.
/// - Consolidates contiguous characters into compound tokens (e.g., multi-digit
///   numbers).
/// - Extensible via the `transform` function, enabling the creation of
///   higher-level constructs, like Markdown elements or syntax trees for a
///   custom DSL.
///
/// # Error Handling
/// The lexer will return a `LexerError` if the input file cannot be opened or
/// read. Errors include issues such as missing files, read failures, or invalid
/// input formats.
pub enum Lexer {}

impl Lexer
{
    /// Scans a file and generates a stream of transformed tokens based on the
    /// provided `transform` function.
    ///
    /// This method opens a file from the given `path`, reads the file line by
    /// line, and converts the input into a stream of tokens. The tokens are
    /// then passed to the `transform` function, which allows users to map
    /// base tokens into domain-specific types.
    ///
    /// # Parameters
    /// - `path`: A path to the file to be lexically analyzed.
    /// - `transform`: A function that takes a `TokenStream` and transforms it
    ///   into a `TokenStream<T>` where `T` is a domain-specific type.
    ///
    /// # Returns
    /// A `Result<TokenStream<T>, LexerError>` where `T` is the transformed
    /// token type, or an error.
    ///
    /// # Errors
    /// Returns a `LexerError` if the file cannot be opened or read.
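    ///
    /// # Example
    ///
    /// A minimal sketch mirroring the `scan_text` example above, using an
    /// identity transform; the file name is purely illustrative and the
    /// snippet is compiled but not executed as a doctest:
    ///
    /// ```no_run
    /// use rune::{Lexer, TokenStream};
    ///
    /// fn transform(tokens: &TokenStream) -> TokenStream
    /// {
    ///     tokens.clone()
    /// }
    ///
    /// let tokens = Lexer::scan_file("notes.txt", transform).unwrap();
    ///
    /// for token in &tokens
    /// {
    ///     println!("{}", token);
    /// }
    /// ```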
    pub fn scan_file<P, F, T>(path: P, transform: F) -> Result<TokenStream<T>, LexerError>
        where P: AsRef<Path>,
              F: FnOnce(&TokenStream) -> TokenStream<T>
    {
        let mut cursor = Position::default();
        let mut stream = TokenStream::new();

        let input_file = File::open(&path).map_err(|err| {
            LexerError::new("Unable to open file for Lexical Analysis.",
                            Span::default(),
                            Some(path.as_ref().to_path_buf()),
                            None)
                .with_source(err)
        })?;

        let reader = BufReader::with_capacity(BUFFER_SIZE, input_file);

        // Read the file line by line.
        for line in reader.lines()
        {
            match line
            {
                Ok(text) =>
                {
                    Self::scan(&text, &mut stream, &mut cursor);
                }
                Err(_) =>
                {
                    return Err(LexerError::new(
                        "Unable to read line during Lexical Analysis.",
                        Span::default(),
                        Some(path.as_ref().to_path_buf()),
                        None));
                }
            }

            // Add the newline token after each line.
            stream.push("\n".to_string(),
                        TokenType::Newline,
                        Span::with_single(cursor));

            cursor.line += 1;
            cursor.column = 0;
        }

        Ok(transform(&stream))
    }

    /// Scans a full in-memory string and produces transformed tokens.
    ///
    /// This method tokenizes the input string `text` and returns the
    /// transformed tokens using the provided `transform` function. It's a
    /// convenient way to perform lexical analysis on in-memory strings without
    /// needing to read from a file.
    ///
    /// # Parameters
    /// - `text`: A string slice representing the in-memory input text to
    ///   analyze.
    /// - `transform`: A function that transforms the base tokens into
    ///   domain-specific types.
    ///
    /// # Returns
    /// A `Result<TokenStream<T>, LexerError>` where `T` is the transformed
    /// token type, or an error.
    pub fn scan_text<F, T>(text: &str, transform: F) -> Result<TokenStream<T>, LexerError>
        where F: FnOnce(&TokenStream) -> TokenStream<T>
    {
        let mut cursor = Position::default();
        let mut stream = TokenStream::new();

        // Process each line in the input string.
        for line in text.lines()
        {
            Self::scan(line, &mut stream, &mut cursor);

            // Add the newline token after each line.
            stream.push("\n".to_string(),
                        TokenType::Newline,
                        Span::with_single(cursor));

            cursor.line += 1;
            cursor.column = 0;
        }

        // Remove the last newline token if the text did not end with a
        // newline.
        if !text.ends_with('\n')
        {
            stream.pop();
        }

        Ok(transform(&stream))
    }

    /// Internal method that scans a single line of text into tokens.
    ///
    /// This method processes each character of a line and generates the
    /// corresponding token. It handles cases like numeric tokens, text
    /// tokens, symbols, and whitespace.
    ///
    /// # Parameters
    /// - `line`: A line of text to be lexically analyzed.
    /// - `stream`: A mutable reference to the token stream where the generated
    ///   tokens will be pushed.
    /// - `cursor`: A mutable reference to the cursor position, which tracks
    ///   the current position in the input.
    fn scan(line: &str, stream: &mut TokenStream, cursor: &mut Position)
    {
        for c in line.chars()
        {
            // Get the token type based on the character.
            let variant = get_token_type(c);
            let last = stream.len().saturating_sub(1);

            // Handle token merging for contiguous tokens like numbers or text.
            if !stream.is_empty() &&
               variant == stream.variants[last] &&
               (variant == TokenType::Numeric || variant == TokenType::Text)
            {
                stream.lexemes[last].push(c);
                stream.locations[last].end = *cursor;
            }
            else
            {
                // Add a new token to the stream.
                stream.push(c.to_string(), variant, Span::with_single(*cursor));
            }

            cursor.column += 1;
        }
    }
}

/// Determines the type of a token based on the current character.
///
/// This helper function is responsible for identifying whether the current
/// character is part of a known token type such as numeric, text, whitespace,
/// or symbol.
///
/// # Parameters
/// - `curr_char`: The current character to analyze.
///
/// # Returns
/// A `TokenType` corresponding to the character type (e.g., `Numeric`, `Text`,
/// `Whitespace`, etc.).
fn get_token_type(curr_char: char) -> TokenType
{
    match curr_char
    {
        '\n' => TokenType::Newline,
        c if c.is_whitespace() => TokenType::Whitespace,
        c if c.is_numeric() => TokenType::Numeric,
        c if c.is_alphabetic() => TokenType::Text,
        c if !c.is_whitespace() && !c.is_alphanumeric() => TokenType::Symbol,
        _ => TokenType::Unknown
    }
}
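
// The tests below are a minimal sketch of the behaviour implemented above.
// They assume `TokenStream` implements `Clone` (as in the doc example) and
// that its `len()`, `lexemes`, and `variants` members remain visible to this
// module; the expected values follow directly from `scan` and
// `get_token_type`.
#[cfg(test)]
mod tests
{
    use super::*;

    /// Identity transform used to inspect the base token stream.
    fn identity(tokens: &TokenStream) -> TokenStream
    {
        tokens.clone()
    }

    #[test]
    fn scan_text_merges_contiguous_text_and_numbers()
    {
        let tokens = Lexer::scan_text("Runes += 42", identity).unwrap();

        // Expected tokens: "Runes", " ", "+", "=", " ", "42". The trailing
        // newline token is popped because the input does not end with '\n'.
        assert_eq!(tokens.len(), 6);
        assert_eq!(tokens.lexemes[0], "Runes");
        assert_eq!(tokens.lexemes[5], "42");
        assert!(tokens.variants[0] == TokenType::Text);
        assert!(tokens.variants[2] == TokenType::Symbol);
        assert!(tokens.variants[5] == TokenType::Numeric);
    }

    #[test]
    fn classifies_characters()
    {
        assert!(get_token_type('7') == TokenType::Numeric);
        assert!(get_token_type('a') == TokenType::Text);
        assert!(get_token_type('+') == TokenType::Symbol);
        assert!(get_token_type(' ') == TokenType::Whitespace);
        assert!(get_token_type('\n') == TokenType::Newline);
    }
}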