| 
									
										
										
										
											2025-04-15 21:17:28 -04:00
										 |  |  | use std::fs::File;
 | 
					
						
							|  |  |  | use std::io::{BufRead, BufReader};
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | use super::error::LexerError;
 | 
					
						
							|  |  |  | use super::position::{Position, Span};
 | 
					
						
							|  |  |  | use super::token::{TokenStream, TokenType};
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-04-22 02:18:12 -04:00
										 |  |  | /// The size of data chunks to read from a file. This is an arbitrary choice,
 | 
					
						
							|  |  |  | /// set to 1MB.
 | 
					
						
							| 
									
										
										
										
											2025-04-15 21:17:28 -04:00
										 |  |  | const BUFFER_SIZE: usize = 1024 * 1024;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-04-22 02:18:12 -04:00
/// The `Lexer` type is responsible for performing lexical analysis
/// (tokenization) on text.
///
/// `Lexer` is an enum with no variants, so it can never be instantiated; it
/// only serves as a namespace for the associated scanning functions.
///
/// It processes input from a file or string character-by-character and
/// generates a stream of tokens, such as text, numbers, whitespace, symbols,
/// and newlines. These tokens are accumulated into a `TokenStream`, which is a
/// flat data structure intended for efficient iteration.
///
/// After the base tokens are generated, the `Lexer` hands them to a
/// user-provided `transform` function, which can turn them into richer,
/// domain-specific types. This transformation can be used to convert base
/// tokens into specific elements of a Markdown syntax tree, custom DSL tokens,
/// or any other custom format you need.
///
/// # Example
///
/// ```rust
/// use rune::{Lexer, TokenStream, TokenType};
///
/// fn transform(tokens: &TokenStream<TokenType>) -> TokenStream<TokenType>
/// {
///    tokens.clone()
/// }
///
/// let tokens = Lexer::scan_text("Runes += 42", transform).unwrap();
///
/// for token in &tokens
/// {
///    println!("{}", token);
/// }
/// ```
///
/// # Design Considerations
/// - Utilizes a flat `TokenStream` to improve performance and reduce heap
///   overhead.
/// - Consolidates contiguous numeric characters and contiguous text
///   characters into compound tokens (e.g., multi-digit numbers and words).
///   Whitespace and symbol characters each produce their own token.
/// - Extensible via the `transform` function, enabling the creation of
///   higher-level constructs, like Markdown elements or syntax trees for a
///   custom DSL.
///
/// # Error Handling
/// The lexer will return a `LexerError` if the input file cannot be opened or
/// read. Errors include issues such as missing files and read failures
/// (the standard library's line readers also reject input that is not valid
/// UTF-8).
pub enum Lexer {}
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | impl Lexer
 | 
					
						
							|  |  |  | {
 | 
					
						
							| 
									
										
										
										
											2025-04-22 02:18:12 -04:00
										 |  |  |    /// Scans a file and generates a vector of transformed tokens based on the
 | 
					
						
							|  |  |  |    /// provided `transform` function.
 | 
					
						
							|  |  |  |    ///
 | 
					
						
							|  |  |  |    /// This method opens a file from the given `path`, reads the file line by
 | 
					
						
							|  |  |  |    /// line, and converts the input into a stream of tokens. The tokens are
 | 
					
						
							|  |  |  |    /// then passed to the `transform` function, which allows users to map
 | 
					
						
							|  |  |  |    /// base tokens into domain-specific types.
 | 
					
						
							|  |  |  |    ///
 | 
					
						
							|  |  |  |    /// # Parameters
 | 
					
						
							|  |  |  |    /// - `path`: A path to the file to be lexically analyzed.
 | 
					
						
							|  |  |  |    /// - `transform`: A function that takes a `TokenStream<TokenType>` and
 | 
					
						
							|  |  |  |    ///   transforms it into a `TokenStream<T>` where `T` is a domain-specific
 | 
					
						
							|  |  |  |    ///   type.
 | 
					
						
							|  |  |  |    ///
 | 
					
						
							|  |  |  |    /// # Returns
 | 
					
						
							|  |  |  |    /// A `Result<TokenStream<T>, LexerError>` where `T` is the transformed token
 | 
					
						
							|  |  |  |    /// type, or an error.
 | 
					
						
							|  |  |  |    ///
 | 
					
						
							|  |  |  |    /// # Errors
 | 
					
						
							|  |  |  |    /// Returns a `LexerError` if the file cannot be opened or read.
 | 
					
						
							| 
									
										
										
										
											2025-04-15 21:17:28 -04:00
										 |  |  |    pub fn scan_file<P, F, T>(path: P, transform: F)
 | 
					
						
							| 
									
										
										
										
											2025-04-16 01:54:22 -04:00
										 |  |  |                              -> Result<TokenStream<T>, LexerError>
 | 
					
						
							| 
									
										
										
										
											2025-04-15 21:17:28 -04:00
										 |  |  |       where P: AsRef<std::path::Path>,
 | 
					
						
							| 
									
										
										
										
											2025-04-16 01:54:22 -04:00
										 |  |  |             F: FnOnce(&TokenStream<TokenType>) -> TokenStream<T>
 | 
					
						
							| 
									
										
										
										
											2025-04-15 21:17:28 -04:00
										 |  |  |    {
 | 
					
						
							|  |  |  |       let mut cursor = Position::default();
 | 
					
						
							|  |  |  |       let mut stream = TokenStream::new();
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-04-16 20:03:15 -04:00
										 |  |  |       let input_file = File::open(&path).map_err(|err| {
 | 
					
						
							| 
									
										
										
										
											2025-04-15 21:17:28 -04:00
										 |  |  |                                            LexerError::new(
 | 
					
						
							| 
									
										
										
										
											2025-04-16 20:03:15 -04:00
										 |  |  |                 "Unable to open file for Lexical Analysis.",
 | 
					
						
							| 
									
										
										
										
											2025-04-15 21:17:28 -04:00
										 |  |  |                 Span::default(),
 | 
					
						
							| 
									
										
										
										
											2025-04-16 20:03:15 -04:00
										 |  |  |                 Some(path.as_ref().to_path_buf()),
 | 
					
						
							|  |  |  |                 None).with_source(err)
 | 
					
						
							| 
									
										
										
										
											2025-04-15 21:17:28 -04:00
										 |  |  |                                         })?;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |       let reader = BufReader::with_capacity(BUFFER_SIZE, input_file);
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-04-22 02:18:12 -04:00
										 |  |  |       // Read the file line by line.
 | 
					
						
							| 
									
										
										
										
											2025-04-15 21:17:28 -04:00
										 |  |  |       for line in reader.lines()
 | 
					
						
							|  |  |  |       {
 | 
					
						
							|  |  |  |          match line
 | 
					
						
							|  |  |  |          {
 | 
					
						
							|  |  |  |             Ok(text) =>
 | 
					
						
							|  |  |  |             {
 | 
					
						
							|  |  |  |                Self::scan(&text, &mut stream, &mut cursor);
 | 
					
						
							|  |  |  |             }
 | 
					
						
							|  |  |  |             Err(_) =>
 | 
					
						
							|  |  |  |             {
 | 
					
						
							|  |  |  |                return Err(LexerError::new("Unable to read line during \
 | 
					
						
							| 
									
										
										
										
											2025-04-22 02:18:12 -04:00
										 |  |  |                                            Lexical Analysis.",
 | 
					
						
							| 
									
										
										
										
											2025-04-15 21:17:28 -04:00
										 |  |  |                                           Span::default(),
 | 
					
						
							|  |  |  |                                           Some(path.as_ref()
 | 
					
						
							|  |  |  |                                                    .to_string_lossy()
 | 
					
						
							|  |  |  |                                                    .to_string()),
 | 
					
						
							|  |  |  |                                           None));
 | 
					
						
							|  |  |  |             }
 | 
					
						
							|  |  |  |          }
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-04-22 02:18:12 -04:00
										 |  |  |          // Add the newline token after each line.
 | 
					
						
							| 
									
										
										
										
											2025-04-15 21:17:28 -04:00
										 |  |  |          stream.push("\n".to_string(),
 | 
					
						
							|  |  |  |                      TokenType::Newline,
 | 
					
						
							|  |  |  |                      Span::with_single(cursor));
 | 
					
						
							|  |  |  |          cursor.line += 1;
 | 
					
						
							|  |  |  |          cursor.column = 0;
 | 
					
						
							|  |  |  |       }
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |       Ok(transform(&stream))
 | 
					
						
							|  |  |  |    }
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-04-22 02:18:12 -04:00
										 |  |  |    /// Scans a full in-memory string and produces transformed tokens.
 | 
					
						
							|  |  |  |    ///
 | 
					
						
							|  |  |  |    /// This method tokenizes the input string `text` and returns the transformed
 | 
					
						
							|  |  |  |    /// tokens using the provided `transform` function. It's a convenient way
 | 
					
						
							|  |  |  |    /// to perform lexical analysis on in-memory strings without needing to
 | 
					
						
							|  |  |  |    /// read from a file.
 | 
					
						
							|  |  |  |    ///
 | 
					
						
							|  |  |  |    /// # Parameters
 | 
					
						
							|  |  |  |    /// - `text`: A string slice representing the in-memory input text to
 | 
					
						
							|  |  |  |    ///   analyze.
 | 
					
						
							|  |  |  |    /// - `transform`: A function that transforms the base tokens into
 | 
					
						
							|  |  |  |    ///   domain-specific types.
 | 
					
						
							|  |  |  |    ///
 | 
					
						
							|  |  |  |    /// # Returns
 | 
					
						
							|  |  |  |    /// A `Result<TokenStream<T>, LexerError>` where `T` is the transformed token
 | 
					
						
							|  |  |  |    /// type, or an error.
 | 
					
						
							| 
									
										
										
										
											2025-04-15 21:17:28 -04:00
										 |  |  |    pub fn scan_text<F, T>(text: &str, transform: F)
 | 
					
						
							| 
									
										
										
										
											2025-04-16 01:54:22 -04:00
										 |  |  |                           -> Result<TokenStream<T>, LexerError>
 | 
					
						
							|  |  |  |       where F: FnOnce(&TokenStream<TokenType>) -> TokenStream<T>
 | 
					
						
							| 
									
										
										
										
											2025-04-15 21:17:28 -04:00
										 |  |  |    {
 | 
					
						
							|  |  |  |       let mut cursor = Position::default();
 | 
					
						
							|  |  |  |       let mut stream = TokenStream::new();
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-04-22 02:18:12 -04:00
										 |  |  |       // Process each line in the input string.
 | 
					
						
							| 
									
										
										
										
											2025-04-15 21:17:28 -04:00
										 |  |  |       for line in text.lines()
 | 
					
						
							|  |  |  |       {
 | 
					
						
							|  |  |  |          Self::scan(line, &mut stream, &mut cursor);
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-04-22 02:18:12 -04:00
										 |  |  |          // Add the newline token after each line.
 | 
					
						
							| 
									
										
										
										
											2025-04-15 21:17:28 -04:00
										 |  |  |          stream.push("\n".to_string(),
 | 
					
						
							|  |  |  |                      TokenType::Newline,
 | 
					
						
							|  |  |  |                      Span::with_single(cursor));
 | 
					
						
							|  |  |  |          cursor.line += 1;
 | 
					
						
							|  |  |  |          cursor.column = 0;
 | 
					
						
							|  |  |  |       }
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-05-05 18:11:47 -04:00
										 |  |  |       // Remove the last newline character if the text did not end with a
 | 
					
						
							|  |  |  |       // newline.
 | 
					
						
							|  |  |  |       if !text.ends_with('\n')
 | 
					
						
							|  |  |  |       {
 | 
					
						
							|  |  |  |          stream.pop();
 | 
					
						
							|  |  |  |       }
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-04-15 21:17:28 -04:00
										 |  |  |       Ok(transform(&stream))
 | 
					
						
							|  |  |  |    }
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-04-22 02:18:12 -04:00
										 |  |  |    /// Internal method that scans a single line of text into tokens.
 | 
					
						
							|  |  |  |    ///
 | 
					
						
							|  |  |  |    /// This method processes each character of a line and generates the
 | 
					
						
							|  |  |  |    /// corresponding token. It handles cases like numeric tokens, text
 | 
					
						
							|  |  |  |    /// tokens, symbols, and whitespace.
 | 
					
						
							|  |  |  |    ///
 | 
					
						
							|  |  |  |    /// # Parameters
 | 
					
						
							|  |  |  |    /// - `line`: A line of text to be lexically analyzed.
 | 
					
						
							|  |  |  |    /// - `stream`: A mutable reference to the token stream where the generated
 | 
					
						
							|  |  |  |    ///   tokens will be pushed.
 | 
					
						
							|  |  |  |    /// - `cursor`: A mutable reference to the cursor position, which tracks the
 | 
					
						
							|  |  |  |    ///   current position in the input.
 | 
					
						
							| 
									
										
										
										
											2025-04-16 20:03:15 -04:00
										 |  |  |    fn scan(line: &str, stream: &mut TokenStream<TokenType>,
 | 
					
						
							|  |  |  |            cursor: &mut Position)
 | 
					
						
							| 
									
										
										
										
											2025-04-15 21:17:28 -04:00
										 |  |  |    {
 | 
					
						
							|  |  |  |       for c in line.chars()
 | 
					
						
							|  |  |  |       {
 | 
					
						
							| 
									
										
										
										
											2025-04-22 02:18:12 -04:00
										 |  |  |          // Get the token type based on the character.
 | 
					
						
							| 
									
										
										
										
											2025-04-15 21:17:28 -04:00
										 |  |  |          let variant = get_token_type(c);
 | 
					
						
							|  |  |  |          let last = stream.len().saturating_sub(1);
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-04-22 02:18:12 -04:00
										 |  |  |          // Handle token merging for contiguous tokens like numbers or text.
 | 
					
						
							| 
									
										
										
										
											2025-04-15 21:17:28 -04:00
										 |  |  |          if !stream.is_empty() &&
 | 
					
						
							|  |  |  |             variant == stream.variants[last] &&
 | 
					
						
							|  |  |  |             (variant == TokenType::Numeric || variant == TokenType::Text)
 | 
					
						
							|  |  |  |          {
 | 
					
						
							|  |  |  |             stream.lexemes[last].push(c);
 | 
					
						
							|  |  |  |             stream.locations[last].end = *cursor;
 | 
					
						
							|  |  |  |          }
 | 
					
						
							|  |  |  |          else
 | 
					
						
							|  |  |  |          {
 | 
					
						
							| 
									
										
										
										
											2025-04-22 02:18:12 -04:00
										 |  |  |             // Add a new token to the stream.
 | 
					
						
							| 
									
										
										
										
											2025-04-15 21:17:28 -04:00
										 |  |  |             stream.push(c.to_string(), variant, Span::with_single(*cursor));
 | 
					
						
							|  |  |  |          }
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |          cursor.column += 1;
 | 
					
						
							|  |  |  |       }
 | 
					
						
							|  |  |  |    }
 | 
					
						
							|  |  |  | }
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-04-22 02:18:12 -04:00
										 |  |  | /// Determines the type of a token based on the current character.
 | 
					
						
							|  |  |  | ///
 | 
					
						
							|  |  |  | /// This helper function is responsible for identifying whether the current
 | 
					
						
							|  |  |  | /// character is part of a known token type such as numeric, text, whitespace,
 | 
					
						
							|  |  |  | /// or symbol.
 | 
					
						
							|  |  |  | ///
 | 
					
						
							|  |  |  | /// # Parameters
 | 
					
						
							|  |  |  | /// - `curr_char`: The current character to analyze.
 | 
					
						
							|  |  |  | ///
 | 
					
						
							|  |  |  | /// # Returns
 | 
					
						
							|  |  |  | /// A `TokenType` corresponding to the character type (e.g., `Numeric`, `Text`,
 | 
					
						
							|  |  |  | /// `Whitespace`, etc.).
 | 
					
						
							| 
									
										
										
										
											2025-04-15 21:17:28 -04:00
										 |  |  | fn get_token_type(curr_char: char) -> TokenType
 | 
					
						
							|  |  |  | {
 | 
					
						
							|  |  |  |    match curr_char
 | 
					
						
							|  |  |  |    {
 | 
					
						
							|  |  |  |       '\n' => TokenType::Newline,
 | 
					
						
							|  |  |  |       c if c.is_whitespace() => TokenType::Whitespace,
 | 
					
						
							|  |  |  |       c if c.is_numeric() => TokenType::Numeric,
 | 
					
						
							|  |  |  |       c if c.is_alphabetic() => TokenType::Text,
 | 
					
						
							|  |  |  |       c if !c.is_whitespace() && !c.is_alphanumeric() => TokenType::Symbol,
 | 
					
						
							|  |  |  |       _ => TokenType::Unknown
 | 
					
						
							|  |  |  |    }
 | 
					
						
							|  |  |  | }
 |