-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
4 changed files
with
186 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,158 @@ | ||
use crate::types::token::Token; | ||
|
||
/// Errors reported by the lexer while turning source text into tokens.
///
/// Implements [`std::fmt::Display`] and [`std::error::Error`] so it can be
/// propagated with `?` or boxed as `Box<dyn std::error::Error>`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LexerError {
    /// A character was encountered that does not start any known token.
    /// The payload is the offending character.
    UnexpectedCharacter(char),
    /// Intended for a numeric literal whose text cannot be converted into a
    /// number; the payload is presumably the offending text.
    InvalidNumberFormat(String),
}

impl std::fmt::Display for LexerError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            // `{:?}` keeps whitespace and control characters readable.
            LexerError::UnexpectedCharacter(ch) => write!(f, "unexpected character {:?}", ch),
            LexerError::InvalidNumberFormat(text) => {
                write!(f, "invalid number format: {:?}", text)
            }
        }
    }
}

impl std::error::Error for LexerError {}
|
||
/// A hand-written lexer that walks over a borrowed source string one
/// character at a time.
#[derive(Debug, Clone)]
pub struct Lexer<'a> {
    /// The source text being tokenized.
    input: &'a str,
    /// Index into `input` of the character currently held in `ch`.
    position: usize,
    /// Index into `input` of the character that will be read next.
    next_position: usize,
    /// The character under examination; `'\0'` once the input is exhausted.
    ch: char,
}
|
||
impl<'a> Lexer<'a> { | ||
pub fn new(input: &'a str) -> Lexer { | ||
let mut lexer = Lexer { | ||
input, | ||
position: 0, | ||
next_position: 0, | ||
ch: '\0', | ||
}; | ||
lexer.read_char(); | ||
lexer | ||
} | ||
|
||
fn read_char(&mut self) { | ||
if self.next_position >= self.input.len() { | ||
self.ch = '\0'; | ||
} else { | ||
self.ch = self.input.chars().nth(self.next_position).unwrap(); | ||
} | ||
self.position = self.next_position; | ||
self.next_position += 1; | ||
} | ||
|
||
pub fn next_token(&mut self) -> Result<Token, LexerError> { | ||
self.skip_whitespace(); | ||
|
||
let token = match self.ch { | ||
'+' => Ok(Token::Operator("+".to_string())), | ||
'-' => Ok(Token::Operator("-".to_string())), | ||
'*' => Ok(Token::Operator("*".to_string())), | ||
'/' => Ok(Token::Operator("/".to_string())), | ||
'<' => Ok(Token::Operator("<".to_string())), | ||
'>' => Ok(Token::Operator(">".to_string())), | ||
'=' => Ok(Token::Operator("=".to_string())), | ||
'!' => Ok(Token::Cut), | ||
',' => Ok(Token::Comma), | ||
'.' => Ok(Token::Period), | ||
';' => Ok(Token::Semicolon), | ||
':' => Ok(Token::Colon), | ||
'?' => Ok(Token::QuestionMark), | ||
'(' => Ok(Token::LeftParenthesis), | ||
')' => Ok(Token::RightParenthesis), | ||
'[' => Ok(Token::LeftBracket), | ||
']' => Ok(Token::RightBracket), | ||
'{' => Ok(Token::LeftCurlyBracket), | ||
'}' => Ok(Token::RightCurlyBracket), | ||
'\0' => Ok(Token::EndOfFile), | ||
_ => { | ||
if self.ch.is_alphabetic() || self.ch == '_' { | ||
Ok(self.read_identifier_or_variable()) | ||
} else if self.ch.is_digit(10) { | ||
Ok(self.read_number()) | ||
} else { | ||
Err(LexerError::UnexpectedCharacter(self.ch)) | ||
} | ||
} | ||
}; | ||
|
||
self.read_char(); | ||
token | ||
} | ||
|
||
fn read_identifier_or_variable(&mut self) -> Token { | ||
let start_position = self.position; | ||
while self.ch.is_alphanumeric() || self.ch == '_' { | ||
self.read_char(); | ||
} | ||
let text = &self.input[start_position..self.position]; | ||
|
||
// Variables in Prolog start with an uppercase letter or an underscore. | ||
if text.starts_with(char::is_uppercase) || text.starts_with('_') { | ||
Token::Variable(text.to_string()) | ||
} else { | ||
Token::Atom(text.to_string()) | ||
} | ||
} | ||
|
||
fn read_number(&mut self) -> Token { | ||
let start_position = self.position; | ||
while self.ch.is_digit(10) { | ||
self.read_char(); | ||
} | ||
|
||
// Check if it's a float. | ||
if self.ch == '.' { | ||
self.read_char(); | ||
|
||
if !self.ch.is_digit(10) { | ||
// Handle error: the character after '.' must be a digit. | ||
// For simplicity, we return an integer token here and ignore the '.'. | ||
// In a real lexer, you would return an error. | ||
return Token::Integer( | ||
self.input[start_position..self.position - 1] | ||
.parse::<i64>() | ||
.unwrap(), | ||
); | ||
} | ||
|
||
while self.ch.is_digit(10) { | ||
self.read_char(); | ||
} | ||
|
||
Token::Float( | ||
self.input[start_position..self.position] | ||
.parse::<f64>() | ||
.unwrap(), | ||
) | ||
} else { | ||
Token::Integer( | ||
self.input[start_position..self.position] | ||
.parse::<i64>() | ||
.unwrap(), | ||
) | ||
} | ||
} | ||
|
||
fn skip_whitespace(&mut self) { | ||
while self.ch.is_whitespace() { | ||
self.read_char(); | ||
} | ||
} | ||
} | ||
#[cfg(test)]
mod tests {
    use crate::types::token::Token;

    use super::*;

    /// Whitespace-separated atom, integer, float and variable are produced in
    /// order, followed by the end-of-file marker.
    #[test]
    fn test_next_token() {
        let expected = vec![
            Token::Atom("atom_variable".to_string()),
            Token::Integer(123),
            Token::Float(4.56),
            Token::Variable("_Variable".to_string()),
            Token::EndOfFile,
        ];

        let mut lexer = Lexer::new("atom_variable 123 4.56 _Variable");
        for want in expected.iter() {
            assert_eq!(lexer.next_token().unwrap(), *want);
        }
    }

    /// Single-character punctuation maps to its dedicated token, and a
    /// character that starts no token is reported and then skipped.
    #[test]
    fn test_punctuation_and_unexpected_character() {
        let mut lexer = Lexer::new("( ) , ! #");
        assert_eq!(lexer.next_token(), Ok(Token::LeftParenthesis));
        assert_eq!(lexer.next_token(), Ok(Token::RightParenthesis));
        assert_eq!(lexer.next_token(), Ok(Token::Comma));
        assert_eq!(lexer.next_token(), Ok(Token::Cut));
        assert_eq!(
            lexer.next_token(),
            Err(LexerError::UnexpectedCharacter('#'))
        );
        assert_eq!(lexer.next_token(), Ok(Token::EndOfFile));
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,6 @@ | ||
pub mod lexer; | ||
pub mod types; | ||
|
||
/// Returns the sum of `left` and `right`.
pub fn add(left: usize, right: usize) -> usize {
    left + right
}
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
pub mod token; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
/// Lexical tokens of the Prolog-style source language.
#[derive(PartialEq, Debug, Clone)]
pub enum Token {
    /// Atoms, such as `foo`, `'Bar'`, etc.
    Atom(String),
    /// Variables, such as `X`, `_Y`, `_`, etc.
    Variable(String),
    /// Integers, such as `123`, `-456`, etc.
    Integer(i64),
    /// Floating-point numbers, such as `1.23`, `-4.56e+7`, etc.
    Float(f64),
    /// Strings, such as `"hello world"`.
    String(String),
    /// Operators, such as `+`, `-`, `*`, `/`, `<`, `=:=`, etc.
    Operator(String),
    /// Cut operator (`!`).
    Cut,
    /// Comma (`,`), used to separate arguments and the goals of a clause body.
    Comma,
    /// Period (`.`), used to end a clause.
    Period,
    /// Semicolon (`;`), used to represent choice.
    Semicolon,
    /// Colon (`:`), used for module qualifiers.
    Colon,
    /// Colon-dash (`:-`), used to separate rule heads and bodies.
    ColonDash,
    /// Question mark (`?`), used for queries.
    QuestionMark,
    /// Left parenthesis (`(`).
    LeftParenthesis,
    /// Right parenthesis (`)`).
    RightParenthesis,
    /// Left bracket (`[`).
    LeftBracket,
    /// Right bracket (`]`).
    RightBracket,
    /// Left curly bracket (`{`).
    LeftCurlyBracket,
    /// Right curly bracket (`}`).
    RightCurlyBracket,
    /// End-of-file marker.
    EndOfFile,
    /// Comment.
    ///
    /// NOTE(review): the original example here was `// this is a comment`,
    /// but standard Prolog comments are `% ...` and `/* ... */` — confirm the
    /// intended comment syntax.
    Comment,
}