Skip to content

Commit

Permalink
feat(core): basic lexer
Browse files Browse the repository at this point in the history
  • Loading branch information
pluveto committed Dec 13, 2023
1 parent 5c9d364 commit a566fe4
Show file tree
Hide file tree
Showing 4 changed files with 186 additions and 0 deletions.
158 changes: 158 additions & 0 deletions rulog-core/src/lexer.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
use crate::types::token::Token;

/// Errors that can occur while turning source text into tokens.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LexerError {
    /// A character was encountered that cannot start any token.
    UnexpectedCharacter(char),
    /// A numeric literal could not be converted to a number; carries the
    /// offending literal text.
    InvalidNumberFormat(String),
}

impl std::fmt::Display for LexerError {
    /// Writes a short, human-readable description of the error.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            LexerError::UnexpectedCharacter(ch) => write!(f, "unexpected character {:?}", ch),
            LexerError::InvalidNumberFormat(text) => {
                write!(f, "invalid number format: {:?}", text)
            }
        }
    }
}

// Implementing `Error` lets callers propagate lexer failures with `?` into
// `Box<dyn std::error::Error>` and similar error containers.
impl std::error::Error for LexerError {}

/// A character-at-a-time lexer over a borrowed input string.
pub struct Lexer<'a> {
    /// The full source text being tokenized.
    input: &'a str,
    /// Offset into `input` of the character currently held in `ch`.
    position: usize,
    /// Offset into `input` from which the next character will be read.
    next_position: usize,
    /// The character under examination; `'\0'` once the end of `input` is reached.
    ch: char,
}

impl<'a> Lexer<'a> {
    /// Creates a lexer over `input` and loads its first character, so the
    /// lexer is immediately ready for [`Lexer::next_token`].
    pub fn new(input: &'a str) -> Self {
        let mut lexer = Self {
            input,
            position: 0,
            next_position: 0,
            ch: '\0',
        };
        lexer.read_char();
        lexer
    }

    /// Advances the cursor by one character.
    ///
    /// `position` and `next_position` are kept as *byte* offsets so they can
    /// be used directly for slicing `input`, and so multi-byte UTF-8
    /// characters are stepped over correctly. At end of input `ch` becomes
    /// `'\0'` and both offsets stay pinned at `input.len()`.
    fn read_char(&mut self) {
        self.position = self.next_position.min(self.input.len());
        match self.input[self.position..].chars().next() {
            Some(c) => {
                self.ch = c;
                self.next_position = self.position + c.len_utf8();
            }
            None => {
                self.ch = '\0';
                self.next_position = self.position;
            }
        }
    }

    /// Returns the character after the current one without consuming it,
    /// or `None` at end of input.
    fn peek_char(&self) -> Option<char> {
        self.input
            .get(self.next_position..)
            .and_then(|rest| rest.chars().next())
    }

    /// Skips leading whitespace and returns the next token.
    ///
    /// Returns `Ok(Token::EndOfFile)` once the input is exhausted (repeated
    /// calls keep returning it). Note that a literal NUL character in the
    /// input is indistinguishable from end of input for this match.
    ///
    /// # Errors
    ///
    /// * [`LexerError::UnexpectedCharacter`] if the current character cannot
    ///   start any token; the character is skipped so lexing can continue.
    /// * [`LexerError::InvalidNumberFormat`] if a numeric literal cannot be
    ///   represented (for example an integer that does not fit in `i64`).
    pub fn next_token(&mut self) -> Result<Token, LexerError> {
        self.skip_whitespace();

        // Multi-character tokens: their readers stop on the first character
        // that is NOT part of the token, so the cursor must not be advanced
        // again here (doing so would silently drop that character, e.g. the
        // `(` in `foo(` or the `+` in `1+2`).
        if self.ch.is_alphabetic() || self.ch == '_' {
            return Ok(self.read_identifier_or_variable());
        }
        if self.ch.is_ascii_digit() {
            return self.read_number();
        }

        let token = match self.ch {
            '+' | '-' | '*' | '/' | '<' | '>' | '=' => Ok(Token::Operator(self.ch.to_string())),
            '!' => Ok(Token::Cut),
            ',' => Ok(Token::Comma),
            '.' => Ok(Token::Period),
            ';' => Ok(Token::Semicolon),
            ':' => Ok(Token::Colon),
            '?' => Ok(Token::QuestionMark),
            '(' => Ok(Token::LeftParenthesis),
            ')' => Ok(Token::RightParenthesis),
            '[' => Ok(Token::LeftBracket),
            ']' => Ok(Token::RightBracket),
            '{' => Ok(Token::LeftCurlyBracket),
            '}' => Ok(Token::RightCurlyBracket),
            '\0' => Ok(Token::EndOfFile),
            other => Err(LexerError::UnexpectedCharacter(other)),
        };

        // Single-character tokens (and unexpected characters) are consumed here.
        self.read_char();
        token
    }

    /// Reads a run of alphanumeric characters and underscores starting at the
    /// current character, leaving the cursor on the first character after it.
    fn read_identifier_or_variable(&mut self) -> Token {
        let start = self.position;
        while self.ch.is_alphanumeric() || self.ch == '_' {
            self.read_char();
        }
        let text = &self.input[start..self.position];

        // Variables in Prolog start with an uppercase letter or an underscore.
        if text.starts_with(|c: char| c.is_uppercase() || c == '_') {
            Token::Variable(text.to_string())
        } else {
            Token::Atom(text.to_string())
        }
    }

    /// Reads an integer or float literal starting at the current digit,
    /// leaving the cursor on the first character after the literal.
    ///
    /// A `.` is treated as a decimal point only when a digit follows it;
    /// otherwise it is left in the input so it is lexed as `Token::Period`
    /// (the clause terminator in input such as `X = 1.`).
    ///
    /// # Errors
    ///
    /// Returns [`LexerError::InvalidNumberFormat`] with the literal text if
    /// it cannot be parsed (e.g. an integer outside the `i64` range).
    fn read_number(&mut self) -> Result<Token, LexerError> {
        let start = self.position;
        while self.ch.is_ascii_digit() {
            self.read_char();
        }

        let is_float =
            self.ch == '.' && matches!(self.peek_char(), Some(c) if c.is_ascii_digit());
        if is_float {
            self.read_char(); // consume the decimal point
            while self.ch.is_ascii_digit() {
                self.read_char();
            }
        }

        let text = &self.input[start..self.position];
        let parsed = if is_float {
            text.parse::<f64>().ok().map(Token::Float)
        } else {
            text.parse::<i64>().ok().map(Token::Integer)
        };
        parsed.ok_or_else(|| LexerError::InvalidNumberFormat(text.to_string()))
    }

    /// Advances the cursor past any whitespace characters.
    fn skip_whitespace(&mut self) {
        while self.ch.is_whitespace() {
            self.read_char();
        }
    }
}
#[cfg(test)]
mod tests {
    use crate::types::token::Token;

    use super::*;

    /// Lexes a whitespace-separated sample and checks the full token
    /// sequence, including the trailing end-of-file marker.
    #[test]
    fn test_next_token() {
        let expected = vec![
            Token::Atom("atom_variable".to_string()),
            Token::Integer(123),
            Token::Float(4.56),
            Token::Variable("_Variable".to_string()),
            Token::EndOfFile,
        ];

        let mut lexer = Lexer::new("atom_variable 123 4.56 _Variable");
        for want in expected {
            assert_eq!(lexer.next_token().unwrap(), want);
        }
    }
}
3 changes: 3 additions & 0 deletions rulog-core/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
pub mod lexer;
pub mod types;

/// Returns the sum of `left` and `right`.
pub fn add(left: usize, right: usize) -> usize {
    left + right
}
Expand Down
1 change: 1 addition & 0 deletions rulog-core/src/types/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
pub mod token;
24 changes: 24 additions & 0 deletions rulog-core/src/types/token.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
/// The kinds of lexical tokens in the language, together with the payload
/// carried by the value-bearing kinds.
#[derive(PartialEq, Debug, Clone)]
pub enum Token {
    Atom(String), // Atoms, such as foo, 'Bar', etc.
    Variable(String), // Variables, such as X, _Y, _, etc.
    Integer(i64), // Integers, such as 123, -456, etc.
    Float(f64), // Floating-point numbers, such as 1.23, -4.56e+7, etc.
    String(String), // Strings, such as "hello world".
    Operator(String), // Operators, such as +, -, *, /, <, =:=, etc.
    Cut, // Cut operator (!).
    Comma, // Comma (,), used to separate arguments and to conjoin goals.
    Period, // Period (.), used to end a clause.
    Semicolon, // Semicolon (;), used to represent choice.
    Colon, // Colon (:), used for module qualifiers.
    ColonDash, // Colon-dash (:-), used to separate rule heads and bodies.
    QuestionMark, // Question mark (?), used for queries.
    LeftParenthesis, // Left parenthesis (().
    RightParenthesis, // Right parenthesis ()).
    LeftBracket, // Left bracket ([).
    RightBracket, // Right bracket (]).
    LeftCurlyBracket, // Left curly bracket ({).
    RightCurlyBracket, // Right curly bracket (}).
    EndOfFile, // End-of-file marker.
    // Comment, such as // this is a comment.
    // NOTE(review): Prolog line comments conventionally start with `%` —
    // confirm which comment syntax this token is meant to represent.
    Comment,
}

0 comments on commit a566fe4

Please sign in to comment.