186 lines
		
	
	
		
			5.3 KiB
		
	
	
	
		
			Rust
		
	
	
	
	
	
		
		
			
		
	
	
			186 lines
		
	
	
		
			5.3 KiB
		
	
	
	
		
			Rust
		
	
	
	
	
	
|  | use std::fs::File;
 | ||
|  | use std::io::{BufRead, BufReader};
 | ||
|  | 
 | ||
|  | use super::error::LexerError;
 | ||
|  | use super::position::{Position, Span};
 | ||
|  | use super::token::{TokenStream, TokenType};
 | ||
|  | 
 | ||
|  | 
 | ||
|  | 
 | ||
/// Capacity, in bytes, of the buffered reader used when scanning a file.
///
/// The value is 1 MiB (2^20 bytes) and was chosen arbitrarily.
const BUFFER_SIZE: usize = 1 << 20;
 | ||
|  | 
 | ||
|  | 
 | ||
|  | 
 | ||
/// The `Lexer` is the core component responsible for performing lexical
/// analysis on text.
///
/// `Lexer` is an enum without variants, so it can never be instantiated; it
/// acts purely as a namespace for the associated functions `scan_file` and
/// `scan_text`.
///
/// It reads input from a file or an in-memory string line by line and walks
/// each line character-by-character, generating a stream of base tokens such
/// as text, numbers, whitespace, symbols, and newlines. These tokens are
/// accumulated into a `TokenStream`, which is a flat, cache-friendly data
/// structure.
///
/// After tokenization, the lexer hands the complete stream to a user-provided
/// `transform` function exactly once, allowing consumers of the library to
/// convert base tokens into richer, domain-specific token types (e.g.
/// Markdown elements, syntax trees, or custom DSL tokens).
///
/// # Example
///
/// ```rust
/// use rune::{Lexer, TokenStream, TokenType};
///
/// fn transform(tokens: &TokenStream) -> Vec<(TokenType, String)>
/// {
///    let mut new_tokens = Vec::new();
///
///    for token in tokens
///    {
///       new_tokens.push((*token.variant, token.lexeme.to_string()));
///    }
///
///    new_tokens
/// }
///
/// let tokens = Lexer::scan_text("Runes += 42", transform).unwrap();
///
/// // The tuple here comes from the transform function's return type.
/// for (ty, lexeme) in tokens
/// {
///    println!("{:?}: {:?}", ty, lexeme);
/// }
/// ```
///
/// # Design Notes
///
/// - Uses a flat `TokenStream` to improve iteration performance and reduce heap
///   overhead.
/// - Consolidates contiguous numeric or alphabetic characters into compound
///   tokens (e.g. multi-digit numbers); every other character becomes its own
///   token.
/// - Appends a newline token after every line of input.
/// - Easily extensible via the `transform` function to support higher-level
///   parsing tasks.
///
/// # Errors
///
/// `scan_file` returns a `LexerError` if the file cannot be opened or a line
/// cannot be read.
pub enum Lexer {}
 | ||
|  | 
 | ||
|  | 
 | ||
|  | 
 | ||
|  | impl Lexer
 | ||
|  | {
 | ||
|  |    /// Scans a file and produces a vector of transformed tokens.
 | ||
|  |    pub fn scan_file<P, F, T>(path: P, transform: F)
 | ||
|  |                              -> Result<Vec<T>, LexerError>
 | ||
|  |       where P: AsRef<std::path::Path>,
 | ||
|  |             F: FnOnce(&TokenStream) -> Vec<T>
 | ||
|  |    {
 | ||
|  |       let mut cursor = Position::default();
 | ||
|  |       let mut stream = TokenStream::new();
 | ||
|  | 
 | ||
|  |       let input_file = File::open(&path).map_err(|_error| {
 | ||
|  |                                            LexerError::new(
 | ||
|  |                 "Unable to open file for Lexigraphical Analysis.",
 | ||
|  |                 Span::default(),
 | ||
|  |                 Some(path.as_ref().to_string_lossy().to_string()),
 | ||
|  |                 None,
 | ||
|  |             )
 | ||
|  |                                         })?;
 | ||
|  | 
 | ||
|  |       let reader = BufReader::with_capacity(BUFFER_SIZE, input_file);
 | ||
|  | 
 | ||
|  |       for line in reader.lines()
 | ||
|  |       {
 | ||
|  |          match line
 | ||
|  |          {
 | ||
|  |             Ok(text) =>
 | ||
|  |             {
 | ||
|  |                Self::scan(&text, &mut stream, &mut cursor);
 | ||
|  |             }
 | ||
|  |             Err(_) =>
 | ||
|  |             {
 | ||
|  |                return Err(LexerError::new("Unable to read line during \
 | ||
|  |                                            Lexigraphical Analysis.",
 | ||
|  |                                           Span::default(),
 | ||
|  |                                           Some(path.as_ref()
 | ||
|  |                                                    .to_string_lossy()
 | ||
|  |                                                    .to_string()),
 | ||
|  |                                           None));
 | ||
|  |             }
 | ||
|  |          }
 | ||
|  | 
 | ||
|  |          stream.push("\n".to_string(),
 | ||
|  |                      TokenType::Newline,
 | ||
|  |                      Span::with_single(cursor));
 | ||
|  | 
 | ||
|  |          cursor.line += 1;
 | ||
|  |          cursor.column = 0;
 | ||
|  |       }
 | ||
|  | 
 | ||
|  |       Ok(transform(&stream))
 | ||
|  |    }
 | ||
|  | 
 | ||
|  |    /// Scans a full in-memory string and returns transformed tokens.
 | ||
|  |    pub fn scan_text<F, T>(text: &str, transform: F)
 | ||
|  |                           -> Result<Vec<T>, LexerError>
 | ||
|  |       where F: FnOnce(&TokenStream) -> Vec<T>
 | ||
|  |    {
 | ||
|  |       let mut cursor = Position::default();
 | ||
|  |       let mut stream = TokenStream::new();
 | ||
|  | 
 | ||
|  |       for line in text.lines()
 | ||
|  |       {
 | ||
|  |          Self::scan(line, &mut stream, &mut cursor);
 | ||
|  | 
 | ||
|  |          stream.push("\n".to_string(),
 | ||
|  |                      TokenType::Newline,
 | ||
|  |                      Span::with_single(cursor));
 | ||
|  | 
 | ||
|  |          cursor.line += 1;
 | ||
|  |          cursor.column = 0;
 | ||
|  |       }
 | ||
|  | 
 | ||
|  |       Ok(transform(&stream))
 | ||
|  |    }
 | ||
|  | 
 | ||
|  |    /// Internal: scans a single line of text into tokens.
 | ||
|  |    fn scan(line: &str, stream: &mut TokenStream, cursor: &mut Position)
 | ||
|  |    {
 | ||
|  |       for c in line.chars()
 | ||
|  |       {
 | ||
|  |          let variant = get_token_type(c);
 | ||
|  |          let last = stream.len().saturating_sub(1);
 | ||
|  | 
 | ||
|  |          if !stream.is_empty() &&
 | ||
|  |             variant == stream.variants[last] &&
 | ||
|  |             (variant == TokenType::Numeric || variant == TokenType::Text)
 | ||
|  |          {
 | ||
|  |             stream.lexemes[last].push(c);
 | ||
|  |             stream.locations[last].end = *cursor;
 | ||
|  |          }
 | ||
|  |          else
 | ||
|  |          {
 | ||
|  |             stream.push(c.to_string(), variant, Span::with_single(*cursor));
 | ||
|  |          }
 | ||
|  | 
 | ||
|  |          cursor.column += 1;
 | ||
|  |       }
 | ||
|  |    }
 | ||
|  | }
 | ||
|  | 
 | ||
|  | 
 | ||
|  | 
 | ||
|  | fn get_token_type(curr_char: char) -> TokenType
 | ||
|  | {
 | ||
|  |    match curr_char
 | ||
|  |    {
 | ||
|  |       '\n' => TokenType::Newline,
 | ||
|  |       c if c.is_whitespace() => TokenType::Whitespace,
 | ||
|  |       c if c.is_numeric() => TokenType::Numeric,
 | ||
|  |       c if c.is_alphabetic() => TokenType::Text,
 | ||
|  |       c if !c.is_whitespace() && !c.is_alphanumeric() => TokenType::Symbol,
 | ||
|  |       _ => TokenType::Unknown
 | ||
|  |    }
 | ||
|  | }
 |