rune/src/lexer.rs

use std::fs::File;
use std::io::{BufRead, BufReader};

use super::error::LexerError;
use super::position::{Position, Span};
use super::token::{TokenStream, TokenType};

/// The size of data chunks to read from a file. This is an arbitrary
/// choice, set to 1 MiB.
const BUFFER_SIZE: usize = 1024 * 1024;

/// The `Lexer` struct is responsible for performing lexical analysis
/// (tokenization) on text.
///
/// It processes input from a file or string character-by-character and
/// generates a stream of tokens, such as text, numbers, whitespace, symbols,
/// and newlines. These tokens are accumulated into a `TokenStream`, which is a
/// flat, cache-friendly data structure designed for efficient iteration.
///
/// After the base tokens are generated, the `Lexer` allows for transformation
/// of these tokens into richer, domain-specific types via a user-provided
/// `transform` function. This transformation can be used to convert base tokens
/// into specific elements of a Markdown syntax tree, custom DSL tokens, or any
/// other custom format you need.
///
/// # Example
///
/// ```rust
/// use rune::{Lexer, TokenStream, TokenType};
///
/// fn transform(tokens: &TokenStream<TokenType>) -> TokenStream<TokenType>
/// {
///     tokens.clone()
/// }
///
/// let tokens = Lexer::scan_text("Runes += 42", transform).unwrap();
///
/// for token in &tokens
/// {
///     println!("{}", token);
/// }
/// ```
///
/// # Design Considerations
/// - Utilizes a flat `TokenStream` to improve performance and reduce heap
///   overhead.
/// - Consolidates contiguous characters into compound tokens (e.g.,
///   multi-digit numbers).
/// - Extensible via the `transform` function, enabling the creation of
///   higher-level constructs, like Markdown elements or syntax trees for a
///   custom DSL.
///
/// # Error Handling
/// The lexer will return a `LexerError` if the input file cannot be opened or
/// read. Errors include issues such as missing files, read failures, or invalid
/// input formats.
pub enum Lexer {}

impl Lexer
{
    /// Scans a file and generates a stream of transformed tokens based on
    /// the provided `transform` function.
    ///
    /// This method opens the file at the given `path`, reads it line by
    /// line, and converts the input into a stream of tokens. The tokens are
    /// then passed to the `transform` function, which allows users to map
    /// base tokens into domain-specific types.
    ///
    /// # Parameters
    /// - `path`: A path to the file to be lexically analyzed.
    /// - `transform`: A function that takes a `TokenStream<TokenType>` and
    ///   transforms it into a `TokenStream<T>`, where `T` is a
    ///   domain-specific type.
    ///
    /// # Returns
    /// A `Result<TokenStream<T>, LexerError>`: the transformed token stream
    /// on success, or a `LexerError` on failure.
    ///
    /// # Errors
    /// Returns a `LexerError` if the file cannot be opened or read.
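    ///
    /// # Example
    ///
    /// A minimal sketch; the file path is illustrative and the transform is
    /// an identity clone:
    ///
    /// ```rust,no_run
    /// use rune::{Lexer, TokenStream, TokenType};
    ///
    /// let tokens = Lexer::scan_file("notes.txt",
    ///                               |t: &TokenStream<TokenType>| t.clone())
    ///     .unwrap();
    ///
    /// for token in &tokens
    /// {
    ///     println!("{}", token);
    /// }
    /// ```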
    pub fn scan_file<P, F, T>(path: P, transform: F)
        -> Result<TokenStream<T>, LexerError>
        where P: AsRef<std::path::Path>,
              F: FnOnce(&TokenStream<TokenType>) -> TokenStream<T>
    {
        let mut cursor = Position::default();
        let mut stream = TokenStream::new();

        let input_file = File::open(&path).map_err(|err| {
            LexerError::new("Unable to open file for Lexical Analysis.",
                            Span::default(),
                            Some(path.as_ref().to_path_buf()),
                            None).with_source(err)
        })?;

        let reader = BufReader::with_capacity(BUFFER_SIZE, input_file);

        // Read the file line by line.
        for line in reader.lines()
        {
            match line
            {
                Ok(text) =>
                {
                    Self::scan(&text, &mut stream, &mut cursor);
                }

                Err(_) =>
                {
                    // Pass the path as a `PathBuf`, matching the call above.
                    return Err(LexerError::new("Unable to read line during \
                                                Lexical Analysis.",
                                               Span::default(),
                                               Some(path.as_ref().to_path_buf()),
                                               None));
                }
            }

            // Add the newline token after each line.
            stream.push("\n".to_string(),
                        TokenType::Newline,
                        Span::with_single(cursor));
            cursor.line += 1;
            cursor.column = 0;
        }

        Ok(transform(&stream))
    }

    /// Scans a full in-memory string and produces transformed tokens.
    ///
    /// This method tokenizes the input string `text` and returns the
    /// transformed tokens using the provided `transform` function. It is a
    /// convenient way to perform lexical analysis on in-memory strings
    /// without needing to read from a file. Unlike a file scan, a trailing
    /// newline token is only emitted if the input itself ended with a
    /// newline.
    ///
    /// # Parameters
    /// - `text`: A string slice representing the in-memory input text to
    ///   analyze.
    /// - `transform`: A function that transforms the base tokens into
    ///   domain-specific types.
    ///
    /// # Returns
    /// A `Result<TokenStream<T>, LexerError>`: the transformed token stream
    /// on success, or a `LexerError` on failure.
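    ///
    /// # Example
    ///
    /// A minimal sketch using an identity transform:
    ///
    /// ```rust
    /// use rune::{Lexer, TokenStream, TokenType};
    ///
    /// let tokens =
    ///     Lexer::scan_text("a + 1",
    ///                      |t: &TokenStream<TokenType>| t.clone()).unwrap();
    ///
    /// // Yields "a", " ", "+", " ", "1" with no trailing newline token,
    /// // because the input did not end with one.
    /// for token in &tokens
    /// {
    ///     println!("{}", token);
    /// }
    /// ```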
    pub fn scan_text<F, T>(text: &str, transform: F)
        -> Result<TokenStream<T>, LexerError>
        where F: FnOnce(&TokenStream<TokenType>) -> TokenStream<T>
    {
        let mut cursor = Position::default();
        let mut stream = TokenStream::new();

        // Process each line in the input string.
        for line in text.lines()
        {
            Self::scan(line, &mut stream, &mut cursor);

            // Add the newline token after each line.
            stream.push("\n".to_string(),
                        TokenType::Newline,
                        Span::with_single(cursor));
            cursor.line += 1;
            cursor.column = 0;
        }

        // Remove the trailing newline token if the original text did not
        // end with a newline. The emptiness check guards against popping
        // from an empty stream when the input itself is empty.
        if !text.ends_with('\n') && !stream.is_empty()
        {
            stream.pop();
        }

        Ok(transform(&stream))
    }

    /// Internal method that scans a single line of text into tokens.
    ///
    /// This method processes each character of a line and generates the
    /// corresponding token. It handles cases like numeric tokens, text
    /// tokens, symbols, and whitespace.
    ///
    /// # Parameters
    /// - `line`: A line of text to be lexically analyzed.
    /// - `stream`: A mutable reference to the token stream where the
    ///   generated tokens will be pushed.
    /// - `cursor`: A mutable reference to the cursor position, which tracks
    ///   the current position in the input.
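    ///
    /// For example, scanning the line `abc 42` yields the tokens `"abc"`,
    /// `" "`, and `"42"`: contiguous alphabetic or numeric characters are
    /// merged into a single compound token rather than emitted one
    /// character at a time.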
    fn scan(line: &str, stream: &mut TokenStream<TokenType>,
            cursor: &mut Position)
    {
        for c in line.chars()
        {
            // Get the token type based on the character.
            let variant = get_token_type(c);
            let last = stream.len().saturating_sub(1);

            // Handle token merging for contiguous tokens like numbers or
            // text.
            if !stream.is_empty() &&
               variant == stream.variants[last] &&
               (variant == TokenType::Numeric || variant == TokenType::Text)
            {
                stream.lexemes[last].push(c);
                stream.locations[last].end = *cursor;
            }
            else
            {
                // Add a new token to the stream.
                stream.push(c.to_string(), variant, Span::with_single(*cursor));
            }

            cursor.column += 1;
        }
    }
}

/// Determines the type of a token based on the current character.
///
/// This helper function is responsible for identifying whether the current
/// character is part of a known token type such as numeric, text, whitespace,
/// or symbol.
///
/// # Parameters
/// - `curr_char`: The current character to analyze.
///
/// # Returns
/// A `TokenType` corresponding to the character type (e.g., `Numeric`, `Text`,
/// `Whitespace`, etc.).
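///
/// # Examples
/// `'7'` maps to `Numeric`, `'x'` to `Text`, `' '` to `Whitespace`, and
/// `'+'` to `Symbol`.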
fn get_token_type(curr_char: char) -> TokenType
{
    match curr_char
    {
        '\n' => TokenType::Newline,
        c if c.is_whitespace() => TokenType::Whitespace,
        c if c.is_numeric() => TokenType::Numeric,
        c if c.is_alphabetic() => TokenType::Text,
        c if !c.is_whitespace() && !c.is_alphanumeric() => TokenType::Symbol,
        _ => TokenType::Unknown
    }
}