diff --git a/Cargo.lock b/Cargo.lock
new file mode 100644
index 0000000..a69b3e8
--- /dev/null
+++ b/Cargo.lock
@@ -0,0 +1,7 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 4
+
+[[package]]
+name = "rune"
+version = "0.1.0"
diff --git a/Cargo.toml b/Cargo.toml
index 19f1130..36cceb8 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,8 +1,8 @@
 [package]
 name = "rune"
-version = "0.0.0"
+version = "0.1.0"
 edition = "2021"
-description = "A simple lexical analysis library."
+description = "A lexical analysis library."
 repository = "/myrddin/rune"
 authors = ["CyberMages LLC ", "Jason Travis Smith "]
 readme = "README.md"
diff --git a/examples/basic.rs b/examples/basic.rs
new file mode 100644
index 0000000..181a170
--- /dev/null
+++ b/examples/basic.rs
@@ -0,0 +1,35 @@
+use std::path::PathBuf;
+
+use rune::{Lexer, TokenStream, TokenType};
+
+
+
+// Define how you want to interpret the base tokens.
+fn transform(tokens: &TokenStream) -> Vec<(TokenType, String)>
+{
+    let mut new_tokens = Vec::new();
+
+    for token in tokens
+    {
+        new_tokens.push((*token.variant, token.lexeme.to_string()));
+    }
+
+    new_tokens
+}
+
+
+fn main() -> Result<(), Box<dyn std::error::Error>>
+{
+    let mut path = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
+    path.push("examples/example.txt");
+
+    let tokens = Lexer::scan_file(path, transform)?;
+
+    // The tuple here comes from the transform function's return type.
+    for (ty, lexeme) in tokens
+    {
+        println!("{:?}: {:?}", ty, lexeme);
+    }
+
+    Ok(())
+}
diff --git a/examples/example.txt b/examples/example.txt
new file mode 100644
index 0000000..3428015
--- /dev/null
+++ b/examples/example.txt
@@ -0,0 +1,2 @@
+The name Rune is inspired by arcane glyphs — ancient symbols holding deep meaning.
+Just like your tokens!
diff --git a/src/.lexer.rs.swp b/src/.lexer.rs.swp
new file mode 100644
index 0000000..1503c74
Binary files /dev/null and b/src/.lexer.rs.swp differ
diff --git a/src/error.rs b/src/error.rs
new file mode 100644
index 0000000..62bd5f0
--- /dev/null
+++ b/src/error.rs
@@ -0,0 +1,59 @@
+use super::position::Span;
+
+
+
+/// An error that has occurred during lexical analysis.
+#[derive(Debug, Clone, PartialEq)]
+pub struct LexerError
+{
+    /// A human-readable error message.
+    pub message: String,
+
+    /// The start and end of where the error is located in the file.
+    pub span: Span,
+
+    /// The file that the error occurred within.
+    pub file: Option<String>,
+
+    /// The problematic string (optional).
+    pub snippet: Option<String>
+}
+
+
+
+impl LexerError
+{
+    pub fn new<S, T>(message: S, span: Span, file: Option<T>,
+                     snippet: Option<T>)
+                     -> Self
+        where S: Into<String>,
+              T: Into<String>
+    {
+        LexerError { message: message.into(),
+                     span,
+                     file: file.map(|t| t.into()),
+                     snippet: snippet.map(|s| s.into()) }
+    }
+}
+
+impl std::fmt::Display for LexerError
+{
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result
+    {
+        match &self.snippet
+        {
+            Some(snippet) =>
+            {
+                write!(f,
+                       "LexerError at {}: {} (snippet: '{}')",
+                       self.span, self.message, snippet)
+            }
+            None =>
+            {
+                write!(f, "LexerError at {}: {}", self.span, self.message)
+            }
+        }
+    }
+}
+
+impl std::error::Error for LexerError {}
diff --git a/src/lexer.rs b/src/lexer.rs
new file mode 100644
index 0000000..ee63104
--- /dev/null
+++ b/src/lexer.rs
@@ -0,0 +1,185 @@
+use std::fs::File;
+use std::io::{BufRead, BufReader};
+
+use super::error::LexerError;
+use super::position::{Position, Span};
+use super::token::{TokenStream, TokenType};
+
+
+
+/// The size of data chunks to read from a file. This was arbitrarily chosen
+/// to be 1 MiB.
+const BUFFER_SIZE: usize = 1024 * 1024;
+
+
+
+/// The `Lexer` is the core component responsible for performing lexical
+/// analysis on a text file.
+///
+/// It reads its input character by character, generating a stream of base
+/// tokens such as text, numbers, whitespace, symbols, and newlines.
+/// These tokens are accumulated into a `TokenStream`, which is a flat,
+/// cache-friendly data structure.
+///
+/// After tokenization, the lexer hands the whole stream to a user-provided
+/// `transform` function, allowing consumers of the library to convert base
+/// tokens into richer, domain-specific token types (e.g. Markdown elements,
+/// syntax trees, or custom DSL tokens).
+///
+/// # Example
+///
+/// ```rust
+/// use rune::{Lexer, TokenStream, TokenType};
+///
+/// fn transform(tokens: &TokenStream) -> Vec<(TokenType, String)>
+/// {
+///     let mut new_tokens = Vec::new();
+///
+///     for token in tokens
+///     {
+///         new_tokens.push((*token.variant, token.lexeme.to_string()));
+///     }
+///
+///     new_tokens
+/// }
+///
+/// let tokens = Lexer::scan_text("Runes += 42", transform).unwrap();
+///
+/// // The tuple here comes from the transform function's return type.
+/// for (ty, lexeme) in tokens
+/// {
+///     println!("{:?}: {:?}", ty, lexeme);
+/// }
+/// ```
+///
+/// # Design Notes
+///
+/// - Uses a flat `TokenStream` to improve iteration performance and reduce
+///   heap overhead.
+/// - Consolidates contiguous characters into compound tokens (e.g. multi-digit
+///   numbers).
+/// - Easily extensible via the `transform` function to support higher-level
+///   parsing tasks.
+///
+/// # Errors
+///
+/// Returns a `LexerError` if the file cannot be opened or read.
+pub enum Lexer {}
+
+
+
+impl Lexer
+{
+    /// Scans a file and produces a vector of transformed tokens.
+    pub fn scan_file<P, F, T>(path: P, transform: F)
+        -> Result<Vec<T>, LexerError>
+        where P: AsRef<std::path::Path>,
+              F: FnOnce(&TokenStream) -> Vec<T>
+    {
+        let mut cursor = Position::default();
+        let mut stream = TokenStream::new();
+
+        let input_file = File::open(&path).map_err(|_error| {
+            LexerError::new(
+                "Unable to open file for lexical analysis.",
+                Span::default(),
+                Some(path.as_ref().to_string_lossy().to_string()),
+                None,
+            )
+        })?;
+
+        let reader = BufReader::with_capacity(BUFFER_SIZE, input_file);
+
+        for line in reader.lines()
+        {
+            match line
+            {
+                Ok(text) =>
+                {
+                    Self::scan(&text, &mut stream, &mut cursor);
+                }
+                Err(_) =>
+                {
+                    return Err(LexerError::new(
+                        "Unable to read line during lexical analysis.",
+                        Span::default(),
+                        Some(path.as_ref().to_string_lossy().to_string()),
+                        None,
+                    ));
+                }
+            }
+
+            stream.push("\n".to_string(),
+                        TokenType::Newline,
+                        Span::with_single(cursor));
+
+            cursor.line += 1;
+            cursor.column = 0;
+        }
+
+        Ok(transform(&stream))
+    }
+
+    /// Scans a full in-memory string and returns transformed tokens.
+    pub fn scan_text<F, T>(text: &str, transform: F)
+        -> Result<Vec<T>, LexerError>
+        where F: FnOnce(&TokenStream) -> Vec<T>
+    {
+        let mut cursor = Position::default();
+        let mut stream = TokenStream::new();
+
+        for line in text.lines()
+        {
+            Self::scan(line, &mut stream, &mut cursor);
+
+            stream.push("\n".to_string(),
+                        TokenType::Newline,
+                        Span::with_single(cursor));
+
+            cursor.line += 1;
+            cursor.column = 0;
+        }
+
+        Ok(transform(&stream))
+    }
+
+    /// Internal: scans a single line of text into tokens.
+    fn scan(line: &str, stream: &mut TokenStream, cursor: &mut Position)
+    {
+        for c in line.chars()
+        {
+            let variant = get_token_type(c);
+            let last = stream.len().saturating_sub(1);
+
+            if !stream.is_empty() &&
+               variant == stream.variants[last] &&
+               (variant == TokenType::Numeric || variant == TokenType::Text)
+            {
+                stream.lexemes[last].push(c);
+                stream.locations[last].end = *cursor;
+            }
+            else
+            {
+                stream.push(c.to_string(), variant, Span::with_single(*cursor));
+            }
+
+            cursor.column += 1;
+        }
+    }
+}
+
+
+
+fn get_token_type(curr_char: char) -> TokenType
+{
+    match curr_char
+    {
+        '\n' => TokenType::Newline,
+        c if c.is_whitespace() => TokenType::Whitespace,
+        c if c.is_numeric() => TokenType::Numeric,
+        c if c.is_alphabetic() => TokenType::Text,
+        c if !c.is_whitespace() && !c.is_alphanumeric() => TokenType::Symbol,
+        _ => TokenType::Unknown
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..9313f7a
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,20 @@
+//! # Rune
+//! **Rune** is a high-performance, customizable **lexical analysis library**
+//! written in Rust. It transforms source files into tokens using a fast,
+//! cache-friendly design.
+//! > _“Turn raw text into structured meaning — like spellcraft for source
+//! > code.”_
+
+pub mod library;
+
+mod error;
+mod lexer;
+mod position;
+mod token;
+
+
+
+pub use crate::error::*;
+pub use crate::lexer::*;
+pub use crate::position::*;
+pub use crate::token::*;
diff --git a/src/project.rs b/src/library.rs
similarity index 80%
rename from src/project.rs
rename to src/library.rs
index a977e1c..88a7d9b 100644
--- a/src/project.rs
+++ b/src/library.rs
@@ -1,4 +1,4 @@
-//! This is where the Projects build information can be retreived from.
+//! This is where information about the library can be retrieved.
 
 
 
@@ -13,7 +13,7 @@ const NOT_DEFINED: &'static str = "UNDEFINED";
 
 
 
-/// Returns the name of the program as defined by the CARGO_PKG_NAME. This is
+/// Returns the name of the library as defined by the CARGO_PKG_NAME. This is
 /// set at compile time and comes from the Cargo.toml file.
 ///
 /// If a value is not found, then it will return the not defined value.
@@ -23,7 +23,7 @@ pub fn get_name() -> &'static str
 }
 
 
-/// Returns the name of the program as defined by the CARGO_PKG_VERSION. This is
+/// Returns the version of the library as defined by the CARGO_PKG_VERSION. This is
 /// set at compile time and comes from the Cargo.toml file.
 ///
 /// If a value is not found, then it will return the not defined value.
diff --git a/src/main.rs b/src/main.rs
deleted file mode 100644
index a9544db..0000000
--- a/src/main.rs
+++ /dev/null
@@ -1,20 +0,0 @@
-//! A simple lexical analysis library.
-
-
-
-mod project;
-
-
-
-/// Print the version of the project.
-fn print_version()
-{
-    println!("{} v{}", project::get_name(), project::get_version());
-}
-
-
-/// The usual starting point of your project.
-fn main()
-{
-    print_version();
-}
diff --git a/src/position.rs b/src/position.rs
new file mode 100644
index 0000000..c69a917
--- /dev/null
+++ b/src/position.rs
@@ -0,0 +1,81 @@
+/// Represents a location in a file by line and column.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub struct Position
+{
+    /// The referenced line of the file.
+    pub line: usize,
+
+    /// The referenced column of the file.
+    pub column: usize
+}
+
+
+/// Represents a range within a file.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub struct Span
+{
+    /// The start `Position` in the file.
+    pub start: Position,
+
+    /// The end `Position` in the file.
+    pub end: Position
+}
+
+
+
+impl Position
+{
+    pub fn new(line: usize, column: usize) -> Self
+    {
+        Position { line, column }
+    }
+}
+
+impl Default for Position
+{
+    fn default() -> Self
+    {
+        Position { line: 0, column: 0 }
+    }
+}
+
+impl std::fmt::Display for Position
+{
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result
+    {
+        write!(f, "Line {}, Column {}", self.line, self.column)
+    }
+}
+
+
+
+impl Span
+{
+    pub fn new(start: Position, end: Position) -> Self
+    {
+        Span { start, end }
+    }
+
+    pub fn with_single(val: Position) -> Self
+    {
+        Span { start: val,
+               end: val }
+    }
+}
+
+impl Default for Span
+{
+    fn default() -> Self
+    {
+        Span { start: Position::default(),
+               end: Position::default() }
+    }
+}
+
+impl std::fmt::Display for Span
+{
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result
+    {
+        write!(f, "Span[{} -> {}]", self.start, self.end)
+    }
+}
diff --git a/src/token.rs b/src/token.rs
new file mode 100644
index 0000000..cb9f2a7
--- /dev/null
+++ b/src/token.rs
@@ -0,0 +1,294 @@
+use super::position::Span;
+
+
+
+/// The different types of tokens that the `Lexer` will break a
+/// file down into. These are all base token types; they are kept
+/// deliberately basic so that they can be combined and altered for
+/// each domain's specific needs.
+///
+/// Base tokens are the most basic token type.
+/// Complex tokens are built from multiple base tokens.
+/// Swap tokens are base tokens reinterpreted as another token type.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub enum TokenType
+{
+    /// This represents a single newline character.
+    ///
+    /// This is a base token type.
+    Newline,
+
+    /// A numeric character followed by zero or more numeric characters.
+    ///
+    /// This is a base token type.
+    Numeric,
+
+    /// A character that is neither alphanumeric nor whitespace.
+    ///
+    /// This is a base token type.
+    Symbol,
+
+    /// An alphabetic character followed by zero or more alphabetic characters.
+    ///
+    /// This is a base token type.
+    Text,
+
+    /// A single whitespace character. Unlike `Numeric` and `Text`,
+    /// whitespace is not merged into runs.
+    ///
+    /// This is a base token type.
+    Whitespace,
+
+    /// A token whose contents could not be classified.
+    /// This should not occur.
+    Unknown
+}
+
+
+
+/// A data-oriented store of the information gathered while
+/// tokenizing a file.
+///
+/// Everything is kept in flat, parallel arrays for fast access
+/// and minimal cache misses.
+#[derive(Debug, Clone, Default)]
+pub struct TokenStream
+{
+    /// The text of the `Token`.
+    pub lexemes: Vec<String>,
+
+    /// The type of `Token`.
+    pub variants: Vec<TokenType>,
+
+    /// The location of the `Token` in the file.
+    pub locations: Vec<Span>
+}
+
+
+/// A `Token` found during the lexical scan.
+#[derive(Debug)]
+pub struct Token<'a>
+{
+    /// The characters of the `Token`.
+    pub lexeme: &'a str,
+
+    /// The `Token`'s type.
+    pub variant: &'a TokenType,
+
+    /// The location in the file of this `Token`.
+    pub span: &'a Span
+}
+
+/// A `Token` found during the lexical scan.
+///
+/// This is the mutable reference.
+#[derive(Debug)]
+pub struct TokenMut<'a>
+{
+    /// The characters of the `Token`.
+    pub lexeme: &'a mut str,
+    /// The `Token`'s type.
+    pub variant: &'a mut TokenType,
+    /// The location for this `Token` in the file.
+    pub span: &'a mut Span
+}
+
+
+/// An iterator over the Tokens in a `TokenStream`.
+pub struct TokenStreamIter<'a>
+{
+    /// The stream to iterate over.
+    stream: &'a TokenStream,
+
+    /// The position in the stream.
+    index: usize
+}
+
+/// A mutable iterator over the Tokens in a `TokenStream`.
+pub struct TokenStreamIterMut<'a>
+{
+    /// The characters of the `Token`.
+    lexemes: std::slice::IterMut<'a, String>,
+
+    /// The `Token`'s type.
+    variants: std::slice::IterMut<'a, TokenType>,
+
+    /// The location for this `Token` in the file.
+    locations: std::slice::IterMut<'a, Span>
+}
+
+
+
+impl TokenStream
+{
+    pub fn new() -> Self
+    {
+        TokenStream { lexemes: Vec::new(),
+                      variants: Vec::new(),
+                      locations: Vec::new() }
+    }
+
+    pub fn len(&self) -> usize
+    {
+        self.lexemes.len()
+    }
+
+    pub fn is_empty(&self) -> bool
+    {
+        self.lexemes.is_empty()
+    }
+
+    pub fn get(&self, index: usize) -> Option<Token<'_>>
+    {
+        if index < self.lexemes.len()
+        {
+            Some(Token { lexeme: &self.lexemes[index],
+                         variant: &self.variants[index],
+                         span: &self.locations[index] })
+        }
+        else
+        {
+            None
+        }
+    }
+
+    pub fn iter(&self) -> TokenStreamIter<'_>
+    {
+        TokenStreamIter { stream: self,
+                          index: 0 }
+    }
+
+    pub fn get_mut(&mut self, index: usize) -> Option<TokenMut<'_>>
+    {
+        if index < self.lexemes.len()
+        {
+            // Each component lives in its own Vec, so the borrow checker can
+            // see that these are disjoint mutable borrows; no unsafe needed.
+            Some(TokenMut { lexeme: self.lexemes[index].as_mut_str(),
+                            variant: &mut self.variants[index],
+                            span: &mut self.locations[index] })
+        }
+        else
+        {
+            None
+        }
+    }
+
+    pub fn clear(&mut self)
+    {
+        self.lexemes.clear();
+        self.variants.clear();
+        self.locations.clear();
+    }
+
+    pub fn push(&mut self, lexeme: String, variant: TokenType, span: Span)
+    {
+        self.lexemes.push(lexeme);
+        self.variants.push(variant);
+        self.locations.push(span);
+    }
+
+    pub fn iter_mut(&mut self) -> TokenStreamIterMut<'_>
+    {
+        TokenStreamIterMut { lexemes: self.lexemes.iter_mut(),
+                             variants: self.variants.iter_mut(),
+                             locations: self.locations.iter_mut() }
+    }
+}
+
+
+impl<'a> IntoIterator for &'a TokenStream
+{
+    type IntoIter = TokenStreamIter<'a>;
+    type Item = Token<'a>;
+
+    fn into_iter(self) -> Self::IntoIter
+    {
+        TokenStreamIter { stream: self,
+                          index: 0 }
+    }
+}
+
+impl<'a> Iterator for TokenStreamIter<'a>
+{
+    type Item = Token<'a>;
+
+    fn next(&mut self) -> Option<Self::Item>
+    {
+        if self.index < self.stream.lexemes.len()
+        {
+            let i = self.index;
+            self.index += 1;
+
+            Some(Token { lexeme: &self.stream.lexemes[i],
+                         variant: &self.stream.variants[i],
+                         span: &self.stream.locations[i] })
+        }
+        else
+        {
+            None
+        }
+    }
+}
+
+
+impl<'a> Iterator for TokenStreamIterMut<'a>
+{
+    type Item = TokenMut<'a>;
+
+    fn next(&mut self) -> Option<Self::Item>
+    {
+        let lexeme = self.lexemes.next()?;
+        let variant = self.variants.next()?;
+        let span = self.locations.next()?;
+
+        Some(TokenMut { lexeme: &mut *lexeme,
+                        variant,
+                        span })
+    }
+}
+
+
+impl<'a> ::std::fmt::Display for Token<'a>
+{
+    fn fmt(&self, f: &mut ::std::fmt::Formatter) -> ::std::fmt::Result
+    {
+        match *self.variant
+        {
+            TokenType::Newline => write!(f, "[{}: {}]", self.variant, "\\n"),
+
+            _ => write!(f, "[{}: {}]", self.variant, self.lexeme)
+        }
+    }
+}
+
+impl ::std::fmt::Display for TokenType
+{
+    fn fmt(&self, f: &mut ::std::fmt::Formatter) -> ::std::fmt::Result
+    {
+        let name = match *self
+        {
+            TokenType::Newline => "Newline",
+
+            TokenType::Numeric => "Numeric",
+
+            TokenType::Symbol => "Symbol",
+
+            TokenType::Text => "Text",
+
+            TokenType::Whitespace => "Whitespace",
+
+            TokenType::Unknown => "Unknown"
+        };
+
+        write!(f, "{}", name)
+    }
+}
diff --git a/tests/lexer_tests.rs b/tests/lexer_tests.rs
new file mode 100644
index 0000000..8cb249a
--- /dev/null
+++ b/tests/lexer_tests.rs
@@ -0,0 +1,135 @@
+use std::fs::{self, File};
+use std::io::Write;
+use std::path::PathBuf;
+
+use rune::*;
+
+
+
+struct TestCase<'a>
+{
+    name: &'a str,
+    input: &'a str,
+    expected: Vec<(TokenType, &'a str)>
+}
+
+
+
+fn dummy_transform(tokens: &TokenStream) -> Vec<(TokenType, String)>
+{
+    let mut new_tokens = Vec::new();
+
+    for token in tokens
+    {
+        new_tokens.push((*token.variant, token.lexeme.to_string()));
+    }
+
+    new_tokens
+}
+
+fn write_temp_file(name: &str, content: &str) -> PathBuf
+{
+    let mut path = std::env::temp_dir();
+    path.push(format!("rune_test_{}.txt", name));
+    let mut file = File::create(&path).expect("Failed to create temp file");
+    write!(file, "{}", content).expect("Failed to write test content");
+    path
+}
+
+fn cleanup_temp_file(path: &PathBuf)
+{
+    if path.exists()
+    {
+        let _ = fs::remove_file(path);
+    }
+}
+
+
+
+#[test]
+fn test_basic_lexing()
+{
+    let tokens = Lexer::scan_text("magic runes", dummy_transform)
+        .expect("Lexer should succeed");
+
+    let expected = vec![(TokenType::Text, "magic".to_string()),
+                        (TokenType::Whitespace, " ".to_string()),
+                        (TokenType::Text, "runes".to_string()),
+                        (TokenType::Newline, "\n".to_string())];
+
+    assert_eq!(tokens, expected);
+}
+
+
+#[test]
+fn test_symbols_and_numbers()
+{
+    let tokens = Lexer::scan_text("13 + 37", dummy_transform)
+        .expect("Lexer should succeed");
+
+    let expected = vec![(TokenType::Numeric, "13".into()),
+                        (TokenType::Whitespace, " ".into()),
+                        (TokenType::Symbol, "+".into()),
+                        (TokenType::Whitespace, " ".into()),
+                        (TokenType::Numeric, "37".into()),
+                        (TokenType::Newline, "\n".into())];
+
+    assert_eq!(tokens, expected);
+}
+
+
+#[test]
+fn test_lexer_with_cases()
+{
+    let cases = vec![TestCase { name: "simple_words",
+                                input: "magic rune",
+                                expected: vec![(TokenType::Text, "magic"),
+                                               (TokenType::Whitespace, " "),
+                                               (TokenType::Text, "rune"),
+                                               (TokenType::Newline, "\n")] },
+                     TestCase { name: "symbols_and_digits",
+                                input: "12 + 7",
+                                expected: vec![(TokenType::Numeric, "12"),
+                                               (TokenType::Whitespace, " "),
+                                               (TokenType::Symbol, "+"),
+                                               (TokenType::Whitespace, " "),
+                                               (TokenType::Numeric, "7"),
+                                               (TokenType::Newline, "\n")] },
+                     TestCase { name: "only_symbols",
+                                input: "###",
+                                expected: vec![(TokenType::Symbol, "#"),
+                                               (TokenType::Symbol, "#"),
+                                               (TokenType::Symbol, "#"),
+                                               (TokenType::Newline, "\n")] },
+                     TestCase { name: "whitespace_and_text",
+                                input: "   spell",
+                                expected: vec![(TokenType::Whitespace, " "),
+                                               (TokenType::Whitespace, " "),
+                                               (TokenType::Whitespace, " "),
+                                               (TokenType::Text, "spell"),
+                                               (TokenType::Newline, "\n")] }];
+
+    for case in cases
+    {
+        let path = write_temp_file(case.name, case.input);
+        let result = Lexer::scan_file(&path, dummy_transform)
+            .expect(&format!("Lexer failed on case '{}'", case.name));
+
+        let result_stripped: Vec<(TokenType, String)> = result;
+
+        let expected = case.expected
+                           .iter()
+                           .map(|(ty, s)| (*ty, s.to_string()))
+                           .collect::<Vec<_>>();
+
+        assert_eq!(result_stripped, expected,
+                   "Mismatch in test case '{}'",
+                   case.name);
+
+        cleanup_temp_file(&path);
+    }
+}
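Beyond the `dummy_transform` used in the tests, the `transform` hook is where this patch expects consumers to build their own token model. The sketch below is not part of the diff; it uses the hypothetical names `MyToken` and `to_domain_tokens` to show one way a consumer might map the base tokens onto a domain-specific enum, assuming the API exactly as committed above.

```rust
use rune::{Lexer, TokenStream, TokenType};

// A hypothetical domain-specific token kind; not part of this patch.
#[derive(Debug)]
enum MyToken
{
    Number(i64),
    Word(String),
    Punctuation(char)
}

// Collapse the base token stream into domain tokens, dropping whitespace.
fn to_domain_tokens(tokens: &TokenStream) -> Vec<MyToken>
{
    let mut out = Vec::new();

    for token in tokens
    {
        match *token.variant
        {
            TokenType::Numeric =>
            {
                out.push(MyToken::Number(token.lexeme.parse().unwrap_or(0)));
            }
            TokenType::Text =>
            {
                out.push(MyToken::Word(token.lexeme.to_string()));
            }
            TokenType::Symbol =>
            {
                out.push(MyToken::Punctuation(token.lexeme.chars().next().unwrap()));
            }
            // Whitespace and newlines carry no meaning for this consumer.
            _ => {}
        }
    }

    out
}

fn main()
{
    let tokens = Lexer::scan_text("13 + 37 runes", to_domain_tokens).unwrap();

    for token in tokens
    {
        // Number(13), Punctuation('+'), Number(37), Word("runes")
        println!("{:?}", token);
    }
}
```

Because `scan_text` and `scan_file` are generic over the transform's return type, the same pattern scales from the simple tuples in the tests to richer domain types without any changes to the lexer itself.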