[#2] A data-oriented Lexer.
I took the Token module from the Arcanum project and brought it over here; it was a nice data-oriented way of handling tokens. I then created a Lexer that can scan a file or a string of text and lets the user transform the scanned tokens before the final token array is returned. This should allow more complex, domain-specific tokens to be created for whatever domain is being targeted. I also added a basic library example and tests, and made sure the documentation generates nicely. This is now marked as version 0.1.0.
This commit is contained in:
parent 0350a151a9
commit acf869efbb
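
To illustrate the transform hook described in the commit message, here is a minimal sketch of a domain-specific transform. The MathToken enum and to_domain function are invented for the example and are not part of this commit:

use rune::{Lexer, TokenStream, TokenType};

// Hypothetical domain tokens for the example: numbers, words, and operators,
// with whitespace dropped entirely.
#[derive(Debug)]
enum MathToken
{
    Number(String),
    Operator(char),
    Word(String)
}

// A transform that turns Rune's base tokens into MathToken values.
fn to_domain(tokens: &TokenStream) -> Vec<MathToken>
{
    let mut out = Vec::new();

    for token in tokens
    {
        match *token.variant
        {
            TokenType::Numeric => out.push(MathToken::Number(token.lexeme.to_string())),
            TokenType::Text => out.push(MathToken::Word(token.lexeme.to_string())),
            TokenType::Symbol =>
            {
                if let Some(c) = token.lexeme.chars().next()
                {
                    out.push(MathToken::Operator(c));
                }
            }
            // Whitespace and newlines carry no meaning in this toy domain.
            _ => {}
        }
    }

    out
}

fn main()
{
    // "Runes += 42" becomes [Word("Runes"), Operator('+'), Operator('='), Number("42")].
    let tokens = Lexer::scan_text("Runes += 42", to_domain).unwrap();
    println!("{:?}", tokens);
}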
Cargo.lock  (generated, new file, 7 lines)
@@ -0,0 +1,7 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 4

[[package]]
name = "rune"
version = "0.1.0"
Cargo.toml  (modified)
@@ -1,8 +1,8 @@
 [package]
 name = "rune"
-version = "0.0.0"
+version = "0.1.0"
 edition = "2021"
-description = "A simple lexical analysis library."
+description = "A lexical analysis library."
 repository = "/myrddin/rune"
 authors = ["CyberMages LLC <Software@CyberMagesLLC.com>", "Jason Travis Smith <Myrddin@CyberMages.tech>"]
 readme = "README.md"
examples/basic.rs  (new file, 35 lines)
@@ -0,0 +1,35 @@
use std::path::PathBuf;

use rune::{Lexer, TokenStream, TokenType};



// Define how you want to interpret base tokens
fn transform(tokens: &TokenStream) -> Vec<(TokenType, String)>
{
    let mut new_tokens = Vec::new();

    for token in tokens
    {
        new_tokens.push((*token.variant, token.lexeme.to_string()));
    }

    new_tokens
}


fn main() -> Result<(), Box<dyn std::error::Error>>
{
    let mut path = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    path.push("examples/example.txt");

    let tokens = Lexer::scan_file(path, transform)?;

    // The tuple here is from the transform function's return type.
    for (ty, lexeme) in tokens
    {
        println!("{:?}: {:?}", ty, lexeme);
    }

    Ok(())
}
examples/example.txt  (new file, 2 lines)
@@ -0,0 +1,2 @@
The name Rune is inspired by arcane glyphs — ancient symbols holding deep meaning.
Just like your tokens!
src/.lexer.rs.swp  (binary, new file)
Binary file not shown.
src/error.rs  (new file, 59 lines)
@@ -0,0 +1,59 @@
use super::position::Span;



/// An error that has occurred during lexical analysis.
#[derive(Debug, Clone, PartialEq)]
pub struct LexerError
{
    /// A human-readable error message.
    pub message: String,

    /// The start and end of where the error is located in the file.
    pub span: Span,

    /// The file that the error occurred within.
    pub file: Option<std::path::PathBuf>,

    /// The problematic string (optional).
    pub snippet: Option<String>
}



impl LexerError
{
    pub fn new<S, T>(message: S, span: Span, file: Option<T>,
                     snippet: Option<S>)
                     -> Self
        where S: Into<String>,
              T: Into<std::path::PathBuf>
    {
        LexerError { message: message.into(),
                     span,
                     file: file.map(|t| t.into()),
                     snippet: snippet.map(|s| s.into()) }
    }
}

impl std::fmt::Display for LexerError
{
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result
    {
        match &self.snippet
        {
            Some(snippet) =>
            {
                write!(f,
                       "LexerError at {}: {} (snippet: '{}')",
                       self.span, self.message, snippet)
            }
            None =>
            {
                write!(f, "LexerError at {}: {}", self.span, self.message)
            }
        }
    }
}

impl std::error::Error for LexerError {}
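
A small sketch of how the constructor and Display implementation above combine in practice; the message, path, and snippet values here are made up for the example:

use rune::{LexerError, Position, Span};

fn main()
{
    // Both values sharing the `S: Into<String>` parameter are &str here;
    // the file path is anything convertible into a PathBuf.
    let err = LexerError::new("Unexpected character.",
                              Span::with_single(Position::new(2, 5)),
                              Some("notes.txt"),
                              Some("@"));

    // Prints:
    // LexerError at Span[Line 2, Column 5 -> Line 2, Column 5]: Unexpected character. (snippet: '@')
    println!("{}", err);
}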
src/lexer.rs  (new file, 185 lines)
@@ -0,0 +1,185 @@
use std::fs::File;
use std::io::{BufRead, BufReader};

use super::error::LexerError;
use super::position::{Position, Span};
use super::token::{TokenStream, TokenType};



/// The size of data chunks to read from a file. This was arbitrarily chosen to
/// be 1 MB.
const BUFFER_SIZE: usize = 1024 * 1024;



/// The `Lexer` is the core component responsible for performing
/// lexical analysis on a text file.
///
/// It reads input from a file character-by-character, generating a stream
/// of base tokens such as text, numbers, whitespace, symbols, and newlines.
/// These tokens are accumulated into a `TokenStream`, which is a flat,
/// cache-friendly data structure.
///
/// After tokenization, the lexer applies a user-provided `transform` function
/// to each token in the stream, allowing consumers of the library to convert
/// base tokens into richer, domain-specific token types (e.g. Markdown
/// elements, syntax trees, or custom DSL tokens).
///
/// # Example
///
/// ```rust
/// use rune::{Lexer, TokenStream, TokenType};
///
/// fn transform(tokens: &TokenStream) -> Vec<(TokenType, String)>
/// {
///     let mut new_tokens = Vec::new();
///
///     for token in tokens
///     {
///         new_tokens.push((*token.variant, token.lexeme.to_string()));
///     }
///
///     new_tokens
/// }
///
/// let tokens = Lexer::scan_text("Runes += 42", transform).unwrap();
///
/// // The tuple here is from the transform function's return type.
/// for (ty, lexeme) in tokens
/// {
///     println!("{:?}: {:?}", ty, lexeme);
/// }
/// ```
///
/// # Design Notes
///
/// - Uses a flat `TokenStream` to improve iteration performance and reduce heap
///   overhead.
/// - Consolidates contiguous characters into compound tokens (e.g. multi-digit
///   numbers).
/// - Easily extensible via the `transform` function to support higher-level
///   parsing tasks.
///
/// # Errors
///
/// Returns a `LexerError` if the file cannot be opened or read.
pub enum Lexer {}



impl Lexer
{
    /// Scans a file and produces a vector of transformed tokens.
    pub fn scan_file<P, F, T>(path: P, transform: F)
                              -> Result<Vec<T>, LexerError>
        where P: AsRef<std::path::Path>,
              F: FnOnce(&TokenStream) -> Vec<T>
    {
        let mut cursor = Position::default();
        let mut stream = TokenStream::new();

        let input_file = File::open(&path).map_err(|_error| {
            LexerError::new(
                "Unable to open file for Lexigraphical Analysis.",
                Span::default(),
                Some(path.as_ref().to_string_lossy().to_string()),
                None,
            )
        })?;

        let reader = BufReader::with_capacity(BUFFER_SIZE, input_file);

        for line in reader.lines()
        {
            match line
            {
                Ok(text) =>
                {
                    Self::scan(&text, &mut stream, &mut cursor);
                }
                Err(_) =>
                {
                    return Err(LexerError::new("Unable to read line during \
                                                Lexigraphical Analysis.",
                                               Span::default(),
                                               Some(path.as_ref()
                                                        .to_string_lossy()
                                                        .to_string()),
                                               None));
                }
            }

            stream.push("\n".to_string(),
                        TokenType::Newline,
                        Span::with_single(cursor));

            cursor.line += 1;
            cursor.column = 0;
        }

        Ok(transform(&stream))
    }

    /// Scans a full in-memory string and returns transformed tokens.
    pub fn scan_text<F, T>(text: &str, transform: F)
                           -> Result<Vec<T>, LexerError>
        where F: FnOnce(&TokenStream) -> Vec<T>
    {
        let mut cursor = Position::default();
        let mut stream = TokenStream::new();

        for line in text.lines()
        {
            Self::scan(line, &mut stream, &mut cursor);

            stream.push("\n".to_string(),
                        TokenType::Newline,
                        Span::with_single(cursor));

            cursor.line += 1;
            cursor.column = 0;
        }

        Ok(transform(&stream))
    }

    /// Internal: scans a single line of text into tokens.
    fn scan(line: &str, stream: &mut TokenStream, cursor: &mut Position)
    {
        for c in line.chars()
        {
            let variant = get_token_type(c);
            let last = stream.len().saturating_sub(1);

            if !stream.is_empty() &&
               variant == stream.variants[last] &&
               (variant == TokenType::Numeric || variant == TokenType::Text)
            {
                stream.lexemes[last].push(c);
                stream.locations[last].end = *cursor;
            }
            else
            {
                stream.push(c.to_string(), variant, Span::with_single(*cursor));
            }

            cursor.column += 1;
        }
    }
}



fn get_token_type(curr_char: char) -> TokenType
{
    match curr_char
    {
        '\n' => TokenType::Newline,
        c if c.is_whitespace() => TokenType::Whitespace,
        c if c.is_numeric() => TokenType::Numeric,
        c if c.is_alphabetic() => TokenType::Text,
        c if !c.is_whitespace() && !c.is_alphanumeric() => TokenType::Symbol,
        _ => TokenType::Unknown
    }
}
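
A short sketch of scan_text paired with a transform that leans on the Display implementation for Token (defined in src/token.rs below); the input string is arbitrary:

use rune::{Lexer, TokenStream};

// Pretty-print every base token using Token's Display impl and return the
// formatted strings as the transformed output.
fn pretty(stream: &TokenStream) -> Vec<String>
{
    stream.iter().map(|token| token.to_string()).collect()
}

fn main()
{
    let lines = Lexer::scan_text("rune 7!", pretty).unwrap();

    for line in lines
    {
        // Roughly: "[Text: rune]", "[Whitespace:  ]", "[Numeric: 7]",
        // "[Symbol: !]", "[Newline, \n]"
        println!("{}", line);
    }
}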
src/lib.rs  (new file, 20 lines)
@@ -0,0 +1,20 @@
//! # Rune
//! **Rune** is a high-performance, customizable **lexical analysis library**
//! written in Rust. It transforms source files into tokens using a fast,
//! cache-friendly design.
//! > _“Turn raw text into structured meaning — like spellcraft for source
//! > code.”_

pub mod library;

mod error;
mod lexer;
mod position;
mod token;



pub use crate::error::*;
pub use crate::lexer::*;
pub use crate::position::*;
pub use crate::token::*;
src/library.rs  (modified)
@@ -1,4 +1,4 @@
-//! This is where the Projects build information can be retreived from.
+//! This is where information about the library.
 
 
 
@@ -13,7 +13,7 @@ const NOT_DEFINED: &'static str = "UNDEFINED";
 
 
 
-/// Returns the name of the program as defined by the CARGO_PKG_NAME. This is
+/// Returns the name of the library as defined by the CARGO_PKG_NAME. This is
 /// set at compile time and comes from the Cargo.toml file.
 ///
 /// If a value is not found, then it will return the not defined value.
@@ -23,7 +23,7 @@ pub fn get_name() -> &'static str
 }
 
 
-/// Returns the name of the program as defined by the CARGO_PKG_VERSION. This is
+/// Returns the name of the library as defined by the CARGO_PKG_VERSION. This is
 /// set at compile time and comes from the Cargo.toml file.
 ///
 /// If a value is not found, then it will return the not defined value.
src/main.rs  (deleted, 20 lines)
@@ -1,20 +0,0 @@
//! A simple lexical analysis library.



mod project;



/// Print the version of the project.
fn print_version()
{
    println!("{} v{}", project::get_name(), project::get_version());
}


/// The usual starting point of your project.
fn main()
{
    print_version();
}
src/position.rs  (new file, 81 lines)
@@ -0,0 +1,81 @@
/// Represents a location in a file by line and column.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Position
{
    /// The referenced line of the file.
    pub line: usize,

    /// The referenced column of the file.
    pub column: usize
}


/// Represents a range within a file.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Span
{
    /// The start `Position` in the file.
    pub start: Position,

    /// The end `Position` in the file.
    pub end: Position
}



impl Position
{
    pub fn new(line: usize, column: usize) -> Self
    {
        Position { line, column }
    }
}

impl Default for Position
{
    fn default() -> Self
    {
        Position { line: 0, column: 0 }
    }
}

impl std::fmt::Display for Position
{
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result
    {
        write!(f, "Line {}, Column {}", self.line, self.column)
    }
}



impl Span
{
    pub fn new(start: Position, end: Position) -> Self
    {
        Span { start, end }
    }

    pub fn with_single(val: Position) -> Self
    {
        Span { start: val,
               end: val }
    }
}

impl Default for Span
{
    fn default() -> Self
    {
        Span { start: Position::default(),
               end: Position::default() }
    }
}

impl std::fmt::Display for Span
{
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result
    {
        write!(f, "Span[{} -> {}]", self.start, self.end)
    }
}
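
A quick sketch of the Position and Span helpers above; the coordinates are arbitrary:

use rune::{Position, Span};

fn main()
{
    // A span covering a single position; start and end are the same.
    let here = Span::with_single(Position::new(3, 14));

    // Prints: Span[Line 3, Column 14 -> Line 3, Column 14]
    println!("{}", here);

    // Default spans start at line 0, column 0.
    assert_eq!(Span::default().start, Position::default());
}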
src/token.rs  (new file, 294 lines)
@@ -0,0 +1,294 @@
use super::position::Span;



/// The different types of tokens that the `Lexer` will break a
/// file down into. These are all base token types. It is left as
/// basic as it is so that they can be combined and altered for
/// each domain's specific needs.
///
/// Base tokens are the most basic token type.
/// Complex tokens are a token type that uses multiple base tokens.
/// Swap tokens are when you swap one token type into another.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TokenType
{
    /// This represents a single newline character.
    ///
    /// This is a base token type.
    Newline,

    /// A numeric character followed by zero or more numeric characters.
    ///
    /// This is a base token type.
    Numeric,

    /// A non-alphanumeric, non-whitespace character.
    ///
    /// This is a base token type.
    Symbol,

    /// An alpha character followed by zero or more alpha characters.
    ///
    /// This is a base token type.
    Text,

    /// One or more whitespace characters.
    ///
    /// This is a base token type.
    Whitespace,

    /// This token has something, but we don't know what.
    /// This should not occur.
    Unknown
}



/// Data-oriented design of the information desired while
/// tokenizing a file.
///
/// Everything is in flat arrays for fast access
/// and minimal cache misses.
#[derive(Debug, Clone, Default)]
pub struct TokenStream
{
    /// The text of the `Token`.
    pub lexemes: Vec<String>,

    /// The type of `Token`.
    pub variants: Vec<TokenType>,

    /// The location of the `Token` in the file.
    pub locations: Vec<Span>
}


/// A `Token` found during the lexical scan.
#[derive(Debug)]
pub struct Token<'a>
{
    /// The characters of the `Token`.
    pub lexeme: &'a str,

    /// The `Token`'s type.
    pub variant: &'a TokenType,

    /// The location in the file of this `Token`.
    pub span: &'a Span
}

/// A `Token` found during the lexical scan.
///
/// This is the mutable reference.
#[derive(Debug)]
pub struct TokenMut<'a>
{
    /// The characters of the `Token`.
    pub lexeme: &'a mut str,
    /// The `Token`'s type.
    pub variant: &'a mut TokenType,
    /// The location for this `Token` in the file.
    pub span: &'a mut Span
}


/// An iterator over the Tokens in a `TokenStream`.
pub struct TokenStreamIter<'a>
{
    /// The stream to iterate over.
    stream: &'a TokenStream,

    /// The position in the stream.
    index: usize
}

/// A mutable iterator over the Tokens in a `TokenStream`.
pub struct TokenStreamIterMut<'a>
{
    /// The characters of the `Token`.
    lexemes: std::slice::IterMut<'a, String>,

    /// The `Token`'s type.
    variants: std::slice::IterMut<'a, TokenType>,

    /// The location for this `Token` in the file.
    locations: std::slice::IterMut<'a, Span>
}



impl TokenStream
{
    pub fn new() -> Self
    {
        TokenStream { lexemes: Vec::new(),
                      variants: Vec::new(),
                      locations: Vec::new() }
    }

    pub fn len(&self) -> usize
    {
        self.lexemes.len()
    }

    pub fn is_empty(&self) -> bool
    {
        self.lexemes.is_empty()
    }

    pub fn get(&self, index: usize) -> Option<Token<'_>>
    {
        if index < self.lexemes.len()
        {
            Some(Token { lexeme: &self.lexemes[index],
                         variant: &self.variants[index],
                         span: &self.locations[index] })
        }
        else
        {
            None
        }
    }

    pub fn iter(&self) -> TokenStreamIter<'_>
    {
        TokenStreamIter { stream: self,
                          index: 0 }
    }

    pub fn get_mut(&mut self, index: usize) -> Option<TokenMut<'_>>
    {
        if index < self.lexemes.len()
        {
            // SAFETY: We manually split the borrows to avoid
            // double mutable borrow.
            let lexeme = &mut self.lexemes[index] as *mut String;
            let variant = &mut self.variants[index] as *mut TokenType;
            let span = &mut self.locations[index] as *mut Span;

            // Convert &mut String to &mut str safely.
            unsafe {
                Some(TokenMut { lexeme: &mut *lexeme.as_mut()
                                             .unwrap()
                                             .as_mut_str(),
                                variant: &mut *variant,
                                span: &mut *span })
            }
        }
        else
        {
            None
        }
    }

    pub fn clear(&mut self)
    {
        self.lexemes.clear();
        self.variants.clear();
        self.locations.clear();
    }

    pub fn push(&mut self, lexeme: String, variant: TokenType, span: Span)
    {
        self.lexemes.push(lexeme);
        self.variants.push(variant);
        self.locations.push(span);
    }

    pub fn iter_mut(&mut self) -> TokenStreamIterMut<'_>
    {
        TokenStreamIterMut { lexemes: self.lexemes.iter_mut(),
                             variants: self.variants.iter_mut(),
                             locations: self.locations.iter_mut() }
    }
}


impl<'a> IntoIterator for &'a TokenStream
{
    type IntoIter = TokenStreamIter<'a>;
    type Item = Token<'a>;

    fn into_iter(self) -> Self::IntoIter
    {
        TokenStreamIter { stream: self,
                          index: 0 }
    }
}

impl<'a> Iterator for TokenStreamIter<'a>
{
    type Item = Token<'a>;

    fn next(&mut self) -> Option<Self::Item>
    {
        if self.index < self.stream.lexemes.len()
        {
            let i = self.index;
            self.index += 1;

            Some(Token { lexeme: &self.stream.lexemes[i],
                         variant: &self.stream.variants[i],
                         span: &self.stream.locations[i] })
        }
        else
        {
            None
        }
    }
}


impl<'a> Iterator for TokenStreamIterMut<'a>
{
    type Item = TokenMut<'a>;

    fn next(&mut self) -> Option<Self::Item>
    {
        let lexeme = self.lexemes.next()?;
        let variant = self.variants.next()?;
        let span = self.locations.next()?;

        Some(TokenMut { lexeme: &mut *lexeme,
                        variant,
                        span })
    }
}


impl<'a> ::std::fmt::Display for Token<'a>
{
    fn fmt(&self, f: &mut ::std::fmt::Formatter) -> ::std::fmt::Result
    {
        match *self.variant
        {
            TokenType::Newline => write!(f, "[{}, {}]", self.variant, "\\n"),

            _ => write!(f, "[{}: {}]", self.variant, self.lexeme)
        }
    }
}

impl ::std::fmt::Display for TokenType
{
    fn fmt(&self, f: &mut ::std::fmt::Formatter) -> ::std::fmt::Result
    {
        let name = match *self
        {
            TokenType::Newline => "Newline",

            TokenType::Numeric => "Numeric",

            TokenType::Symbol => "Symbol",

            TokenType::Text => "Text",

            TokenType::Whitespace => "Whitespace",

            TokenType::Unknown => "Unknown"
        };

        write!(f, "{}", name)
    }
}
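
A minimal sketch of the "swap token" idea mentioned in the TokenType docs, using only TokenStream::push and iter_mut as defined above; the pushed lexemes are made up for the example:

use rune::{Position, Span, TokenStream, TokenType};

fn main()
{
    let mut stream = TokenStream::new();
    stream.push("#".to_string(), TokenType::Unknown, Span::with_single(Position::new(0, 0)));
    stream.push("42".to_string(), TokenType::Numeric, Span::with_single(Position::new(0, 1)));

    // "Swap" pass: reclassify any Unknown token as a Symbol, in place.
    for token in stream.iter_mut()
    {
        if *token.variant == TokenType::Unknown
        {
            *token.variant = TokenType::Symbol;
        }
    }

    assert_eq!(stream.variants, vec![TokenType::Symbol, TokenType::Numeric]);
}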
tests/lexer_tests.rs  (new file, 135 lines)
@@ -0,0 +1,135 @@
use std::fs::{self, File};
use std::io::Write;
use std::path::PathBuf;

use rune::*;



struct TestCase<'a>
{
    name: &'a str,
    input: &'a str,
    expected: Vec<(TokenType, &'a str)>
}



fn dummy_transform(tokens: &TokenStream) -> Vec<(TokenType, String)>
{
    let mut new_tokens = Vec::new();

    for token in tokens
    {
        new_tokens.push((*token.variant, token.lexeme.to_string()));
    }

    new_tokens
}

fn write_temp_file(name: &str, content: &str) -> PathBuf
{
    let mut path = std::env::temp_dir();
    path.push(format!("rune_test_{}.txt", name));
    let mut file = File::create(&path).expect("Failed to create temp file");
    write!(file, "{}", content).expect("Failed to write test content");
    path
}

fn cleanup_temp_file(path: &PathBuf)
{
    if path.exists()
    {
        let _ = fs::remove_file(path);
    }
}



#[test]
fn test_basic_lexing()
{
    let tokens =
        Lexer::scan_text("magic runes", dummy_transform).expect("Lexer should \
                                                                 succeed");

    let expected = vec![(TokenType::Text, "magic".to_string()),
                        (TokenType::Whitespace, " ".to_string()),
                        (TokenType::Text, "runes".to_string()),
                        (TokenType::Newline, "\n".to_string()),];

    assert_eq!(tokens, expected);
}


#[test]
fn test_symbols_and_numbers()
{
    let tokens =
        Lexer::scan_text("13 + 37", dummy_transform).expect("Lexer should \
                                                             succeed");

    let expected = vec![(TokenType::Numeric, "13".into()),
                        (TokenType::Whitespace, " ".into()),
                        (TokenType::Symbol, "+".into()),
                        (TokenType::Whitespace, " ".into()),
                        (TokenType::Numeric, "37".into()),
                        (TokenType::Newline, "\n".into()),];

    assert_eq!(tokens, expected);
}


#[test]
fn test_lexer_with_cases()
{
    let cases = vec![TestCase { name: "simple_words",
                                input: "magic rune",
                                expected: vec![(TokenType::Text, "magic"),
                                               (TokenType::Whitespace, " "),
                                               (TokenType::Text, "rune"),
                                               (TokenType::Newline, "\n"),] },
                     TestCase { name: "symbols_and_digits",
                                input: "12 + 7",
                                expected: vec![(TokenType::Numeric, "12"),
                                               (TokenType::Whitespace, " "),
                                               (TokenType::Symbol, "+"),
                                               (TokenType::Whitespace, " "),
                                               (TokenType::Numeric, "7"),
                                               (TokenType::Newline, "\n"),] },
                     TestCase { name: "only_symbols",
                                input: "###",
                                expected: vec![(TokenType::Symbol, "#"),
                                               (TokenType::Symbol, "#"),
                                               (TokenType::Symbol, "#"),
                                               (TokenType::Newline, "\n"),] },
                     TestCase { name: "whitespace_and_text",
                                input: "   spell",
                                expected: vec![(TokenType::Whitespace, " "),
                                               (TokenType::Whitespace, " "),
                                               (TokenType::Whitespace, " "),
                                               (TokenType::Text, "spell"),
                                               (TokenType::Newline, "\n"),] },];

    for case in cases
    {
        let path = write_temp_file(case.name, case.input);
        let result =
            Lexer::scan_file(&path, dummy_transform).expect(&format!("Lexer failed \
                                                                      on case '{}'",
                                                                     case.name));

        let result_stripped: Vec<(TokenType, String)> = result;

        let expected = case.expected
                           .iter()
                           .map(|(ty, s)| (*ty, s.to_string()))
                           .collect::<Vec<_>>();

        assert_eq!(result_stripped, expected,
                   "Mismatch in test case '{}'",
                   case.name);

        cleanup_temp_file(&path);
    }
}