[#3] TokenStream now holds generic variants.

The TokenStream and all of its associated Token types are now
generic over the Token's variant type.
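
As a rough sketch of what this enables (the `IniToken` enum is purely
hypothetical; `TokenStream::new`, `push`, and iteration are as shown in the
diff below), a caller can now plug a domain-specific variant type straight
into the stream:

```rust
use rune::{Span, TokenStream};

// Hypothetical domain-specific variant type; not part of the crate.
#[derive(Debug, Clone, PartialEq)]
enum IniToken {
    Section,
    Key,
    Value,
}

fn main() {
    // The stream is generic over the variant type.
    let mut stream: TokenStream<IniToken> = TokenStream::new();
    stream.push("[core]".to_string(), IniToken::Section, Span::default());

    for token in &stream {
        println!("{:?}: {:?}", token.variant, token.lexeme);
    }
}
```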

Span was also given the ability to merge with another Span. This
makes it easier to track spans as users group TokenTypes together
into their own domain-specific types.
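
A minimal sketch of the two merge forms added in this commit, assuming
`Span` implements `Default`, `Clone`, and `Debug` (the diff below suggests
it does; default spans are used here purely for illustration):

```rust
use rune::Span;

fn main() {
    let a = Span::default();
    let b = Span::default();

    // Associated-function form: a new span from a's start to b's end.
    let merged = Span::merge(a.clone(), b.clone());

    // Method form on an existing span.
    let chained = a.merge_with(b);

    println!("{:?} {:?}", merged, chained);
}
```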

All tests and examples were updated for this change.

The version was incremented to 0.2.0.
Myrddin Dundragon 2025-04-16 01:54:22 -04:00
parent f924811c47
commit 7c564d18a2
8 changed files with 201 additions and 64 deletions

View File

@ -1,6 +1,6 @@
[package]
name = "rune"
version = "0.1.0"
version = "0.2.0"
edition = "2021"
description = "A lexical analysis library."
repository = "/myrddin/rune"

View File

@ -31,5 +31,5 @@ Then add this to your Cargo.toml file.
```toml
[dependencies]
rune = { registry = "cybermages" }
rune = { version = "0.2.0", registry = "cybermages" }
```

View File

@ -1,34 +1,151 @@
use std::path::PathBuf;
use rune::{Lexer, TokenStream, TokenType};
use rune::{Lexer, Span, TokenStream, TokenType};
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum MarkdownTokenType {
Heading(u8),
EmphasisStart,
EmphasisEnd,
StrongStart,
StrongEnd,
CodeSpan,
Text,
Symbol,
Whitespace,
Newline,
Unknown,
}
impl std::fmt::Display for MarkdownTokenType {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
MarkdownTokenType::Heading(level) => write!(f, "Heading({})", level),
MarkdownTokenType::EmphasisStart => write!(f, "EmphasisStart"),
MarkdownTokenType::EmphasisEnd => write!(f, "EmphasisEnd"),
MarkdownTokenType::StrongStart => write!(f, "StrongStart"),
MarkdownTokenType::StrongEnd => write!(f, "StrongEnd"),
MarkdownTokenType::CodeSpan => write!(f, "CodeSpan"),
MarkdownTokenType::Text => write!(f, "Text"),
MarkdownTokenType::Symbol => write!(f, "Symbol"),
MarkdownTokenType::Whitespace => write!(f, "Whitespace"),
MarkdownTokenType::Newline => write!(f, "Newline"),
MarkdownTokenType::Unknown => write!(f, "Unknown"),
}
}
}
// Define how you want to interpret base tokens
fn transform(tokens: &TokenStream) -> Vec<(TokenType, String)>
{
let mut new_tokens = Vec::new();
pub fn transform(input: &TokenStream<TokenType>) -> TokenStream<MarkdownTokenType> {
let mut output = TokenStream::new();
for token in tokens
{
new_tokens.push((*token.variant, token.lexeme.to_string()));
}
let mut i = 0;
while i < input.len() {
let token = input.get(i).unwrap(); // safe due to bounds check above
new_tokens
match token.variant {
TokenType::Symbol if token.lexeme == "#" => {
// Count consecutive #s for heading level
let mut level = 1;
let mut span = token.span.clone();
while i + 1 < input.len() {
let next = input.get(i + 1).unwrap();
if *next.variant == TokenType::Symbol && next.lexeme == "#" {
level += 1;
span.end = next.span.end;
i += 1;
} else {
break;
}
}
output.push(token.lexeme.repeat(level),
MarkdownTokenType::Heading(level as u8),
span);
}
TokenType::Symbol if token.lexeme == "*" => {
// Look ahead to see if it's strong (**) or emphasis (*)
if i + 1 < input.len() {
let next = input.get(i + 1).unwrap();
if *next.variant == TokenType::Symbol && next.lexeme == "*" {
output.push("**".to_string(),
MarkdownTokenType::StrongStart,
Span::merge(*token.span, *next.span));
i += 1; // skip the second '*'
} else {
output.push("*".to_string(),
MarkdownTokenType::EmphasisStart,
token.span.clone());
}
} else {
output.push("*".to_string(),
MarkdownTokenType::EmphasisStart,
token.span.clone());
}
}
TokenType::Symbol if token.lexeme == "`" => {
output.push(token.lexeme.to_string(),
MarkdownTokenType::CodeSpan,
token.span.clone());
}
TokenType::Text => {
output.push(token.lexeme.to_string(),
MarkdownTokenType::Text,
token.span.clone());
}
TokenType::Symbol => {
output.push(token.lexeme.to_string(),
MarkdownTokenType::Symbol,
token.span.clone());
}
TokenType::Whitespace => {
output.push(token.lexeme.to_string(),
MarkdownTokenType::Whitespace,
token.span.clone());
}
TokenType::Newline => {
output.push(token.lexeme.to_string(),
MarkdownTokenType::Newline,
token.span.clone());
}
_ => {
output.push(token.lexeme.to_string(),
MarkdownTokenType::Unknown,
token.span.clone());
}
}
i += 1;
}
output
}
fn main() -> Result<(), Box<dyn std::error::Error>>
{
let mut path = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
path.push("examples/example.txt");
path.push("examples/example.md");
let tokens = Lexer::scan_file(path, transform)?;
// The tuple here is from the transform function's return type.
for (ty, lexeme) in tokens
for token in &tokens
{
println!("{:?}: {:?}", ty, lexeme);
println!("{}", token);
}
Ok(())
}

View File

@ -1,2 +1,6 @@
# Example File
This is an example file for Rune.
## Rune
The name Rune is inspired by arcane glyphs — ancient symbols holding deep meaning.
Just like your tokens!

View File

@ -31,24 +31,17 @@ const BUFFER_SIZE: usize = 1024 * 1024;
/// ```rust
/// use rune::{Lexer, TokenStream, TokenType};
///
/// fn transform(tokens: &TokenStream) -> Vec<(TokenType, String)>
/// fn transform(tokens: &TokenStream<TokenType>) -> TokenStream<TokenType>
/// {
/// let mut new_tokens = Vec::new();
///
/// for token in tokens
/// {
/// new_tokens.push((*token.variant, token.lexeme.to_string()));
/// }
///
/// new_tokens
/// tokens.clone()
/// }
///
/// let tokens = Lexer::scan_text("Runes += 42", transform).unwrap();
///
/// // The tuple here is from the transform function's return type.
/// for (ty, lexeme) in tokens
/// for token in &tokens
/// {
/// println!("{:?}: {:?}", ty, lexeme);
/// println!("{}", token);
/// }
/// ```
///
@ -72,9 +65,9 @@ impl Lexer
{
/// Scans a file and produces a vector of transformed tokens.
pub fn scan_file<P, F, T>(path: P, transform: F)
-> Result<Vec<T>, LexerError>
-> Result<TokenStream<T>, LexerError>
where P: AsRef<std::path::Path>,
F: FnOnce(&TokenStream) -> Vec<T>
F: FnOnce(&TokenStream<TokenType>) -> TokenStream<T>
{
let mut cursor = Position::default();
let mut stream = TokenStream::new();
@ -123,8 +116,8 @@ impl Lexer
/// Scans a full in-memory string and returns transformed tokens.
pub fn scan_text<F, T>(text: &str, transform: F)
-> Result<Vec<T>, LexerError>
where F: FnOnce(&TokenStream) -> Vec<T>
-> Result<TokenStream<T>, LexerError>
where F: FnOnce(&TokenStream<TokenType>) -> TokenStream<T>
{
let mut cursor = Position::default();
let mut stream = TokenStream::new();
@ -145,7 +138,7 @@ impl Lexer
}
/// Internal: scans a single line of text into tokens.
fn scan(line: &str, stream: &mut TokenStream, cursor: &mut Position)
fn scan(line: &str, stream: &mut TokenStream<TokenType>, cursor: &mut Position)
{
for c in line.chars()
{

View File

@ -61,6 +61,19 @@ impl Span
Span { start: val,
end: val }
}
pub fn merge(a: Span, b: Span) -> Self
{
Span { start: a.start,
end: b.end }
}
pub fn merge_with(&self, other: Span) -> Span {
Span {
start: self.start,
end: other.end,
}
}
}
impl Default for Span

View File

@ -51,13 +51,13 @@ pub enum TokenType
/// Everything is in flat arrays for fast access
/// and minimal cache misses.
#[derive(Debug, Clone, Default)]
pub struct TokenStream
pub struct TokenStream<T>
{
/// The text of the `Token`.
pub lexemes: Vec<String>,
/// The type of `Token`.
pub variants: Vec<TokenType>,
pub variants: Vec<T>,
/// The location of the `Token` in the file.
pub locations: Vec<Span>
@ -66,13 +66,13 @@ pub struct TokenStream
/// A `Token` found during the lexical scan.
#[derive(Debug)]
pub struct Token<'a>
pub struct Token<'a, T>
{
/// The characters of the `Token`.
pub lexeme: &'a str,
/// The `Token`'s type.
pub variant: &'a TokenType,
pub variant: &'a T,
/// The location in the file of this `Token`.
pub span: &'a Span
@ -82,35 +82,35 @@ pub struct Token<'a>
///
/// This is the mutable reference.
#[derive(Debug)]
pub struct TokenMut<'a>
pub struct TokenMut<'a, T>
{
/// The characters of the `Token`.
pub lexeme: &'a mut str,
/// The `Token`'s type.
pub variant: &'a mut TokenType,
pub variant: &'a mut T,
/// The location for this `Token` in the file.
pub span: &'a mut Span
}
/// An iterator over the Tokens in a `TokenStream`.
pub struct TokenStreamIter<'a>
pub struct TokenStreamIter<'a, T>
{
/// The stream to iterate over.
stream: &'a TokenStream,
stream: &'a TokenStream<T>,
/// The position in the stream.
index: usize
}
/// A mutable iterator over the Tokens in a `TokenStream`.
pub struct TokenStreamIterMut<'a>
pub struct TokenStreamIterMut<'a, T>
{
/// The characters of the `Token`.
lexemes: std::slice::IterMut<'a, String>,
/// The `Token`'s type.
variants: std::slice::IterMut<'a, TokenType>,
variants: std::slice::IterMut<'a, T>,
/// The location for this `Token` in the file.
locations: std::slice::IterMut<'a, Span>
@ -118,7 +118,7 @@ pub struct TokenStreamIterMut<'a>
impl TokenStream
impl<T> TokenStream<T>
{
pub fn new() -> Self
{
@ -137,7 +137,7 @@ impl TokenStream
self.lexemes.is_empty()
}
pub fn get(&self, index: usize) -> Option<Token<'_>>
pub fn get(&self, index: usize) -> Option<Token<'_, T>>
{
if index < self.lexemes.len()
{
@ -151,20 +151,20 @@ impl TokenStream
}
}
pub fn iter(&self) -> TokenStreamIter<'_>
pub fn iter(&self) -> TokenStreamIter<'_, T>
{
TokenStreamIter { stream: self,
index: 0 }
}
pub fn get_mut(&mut self, index: usize) -> Option<TokenMut<'_>>
pub fn get_mut(&mut self, index: usize) -> Option<TokenMut<'_, T>>
{
if index < self.lexemes.len()
{
// SAFETY: We manually split the borrows to avoid
// double mutable borrow.
let lexeme = &mut self.lexemes[index] as *mut String;
let variant = &mut self.variants[index] as *mut TokenType;
let variant = &mut self.variants[index] as *mut T;
let span = &mut self.locations[index] as *mut Span;
// Convert &mut String to &mut str safely.
@ -189,14 +189,14 @@ impl TokenStream
self.locations.clear();
}
pub fn push(&mut self, lexeme: String, variant: TokenType, span: Span)
pub fn push(&mut self, lexeme: String, variant: T, span: Span)
{
self.lexemes.push(lexeme);
self.variants.push(variant);
self.locations.push(span);
}
pub fn iter_mut(&mut self) -> TokenStreamIterMut<'_>
pub fn iter_mut(&mut self) -> TokenStreamIterMut<'_, T>
{
TokenStreamIterMut { lexemes: self.lexemes.iter_mut(),
variants: self.variants.iter_mut(),
@ -205,10 +205,10 @@ impl TokenStream
}
impl<'a> IntoIterator for &'a TokenStream
impl<'a, T> IntoIterator for &'a TokenStream<T>
{
type IntoIter = TokenStreamIter<'a>;
type Item = Token<'a>;
type IntoIter = TokenStreamIter<'a, T>;
type Item = Token<'a, T>;
fn into_iter(self) -> Self::IntoIter
{
@ -217,9 +217,9 @@ impl<'a> IntoIterator for &'a TokenStream
}
}
impl<'a> Iterator for TokenStreamIter<'a>
impl<'a, T> Iterator for TokenStreamIter<'a, T>
{
type Item = Token<'a>;
type Item = Token<'a, T>;
fn next(&mut self) -> Option<Self::Item>
{
@ -240,9 +240,9 @@ impl<'a> Iterator for TokenStreamIter<'a>
}
impl<'a> Iterator for TokenStreamIterMut<'a>
impl<'a, T> Iterator for TokenStreamIterMut<'a, T>
{
type Item = TokenMut<'a>;
type Item = TokenMut<'a, T>;
fn next(&mut self) -> Option<Self::Item>
{
@ -257,13 +257,13 @@ impl<'a> Iterator for TokenStreamIterMut<'a>
}
impl<'a> ::std::fmt::Display for Token<'a>
impl<'a, T: std::fmt::Display> ::std::fmt::Display for Token<'a, T>
{
fn fmt(&self, f: &mut ::std::fmt::Formatter) -> ::std::fmt::Result
{
match *self.variant
match self.lexeme
{
TokenType::Newline => write!(f, "[{}, {}]", self.variant, "\\n"),
"\n" => write!(f, "[{}, {}]", self.variant, "\\n"),
_ => write!(f, "[{}: {}]", self.variant, self.lexeme)
}

View File

@ -15,16 +15,22 @@ struct TestCase<'a>
fn dummy_transform(tokens: &TokenStream) -> Vec<(TokenType, String)>
fn dummy_transform(tokens: &TokenStream<TokenType>) -> TokenStream<TokenType>
{
let mut new_tokens = Vec::new();
/*
let mut stream: TokenStream<(TokenType, String)> = TokenStream::default();
for token in tokens
stream.lexemes = tokens.lexemes.clone();
stream.locations = tokens.locations.clone();
for 0..tokens.lexemes.len()
{
new_tokens.push((*token.variant, token.lexeme.to_string()));
stream.variants
}
new_tokens
stream
*/
tokens.clone()
}
fn write_temp_file(name: &str, content: &str) -> PathBuf
@ -53,6 +59,8 @@ fn test_basic_lexing()
Lexer::scan_text("magic runes", dummy_transform).expect("Lexer should \
succeed");
let tokens = tokens.into_iter().map(|t| { (*t.variant, String::from(t.lexeme))}).collect::<Vec<_>>();
let expected = vec![(TokenType::Text, "magic".to_string()),
(TokenType::Whitespace, " ".to_string()),
(TokenType::Text, "runes".to_string()),
@ -69,6 +77,8 @@ fn test_symbols_and_numbers()
Lexer::scan_text("13 + 37", dummy_transform).expect("Lexer should \
succeed");
let tokens = tokens.into_iter().map(|t| { (*t.variant, String::from(t.lexeme))}).collect::<Vec<_>>();
let expected = vec![(TokenType::Numeric, "13".into()),
(TokenType::Whitespace, " ".into()),
(TokenType::Symbol, "+".into()),
@ -119,14 +129,14 @@ fn test_lexer_with_cases()
on case '{}'",
case.name));
let result_stripped: Vec<(TokenType, String)> = result;
let result = result.into_iter().map(|t| { (*t.variant, String::from(t.lexeme))}).collect::<Vec<_>>();
let expected = case.expected
.iter()
.map(|(ty, s)| (*ty, s.to_string()))
.collect::<Vec<_>>();
assert_eq!(result_stripped, expected,
assert_eq!(result, expected,
"Mismatch in test case '{}'",
case.name);