//! Basic lexer for a Prolog-like language.
//!
//! Reconstructed from patch a566fe41 ("feat(core): basic lexer"), whose
//! text was garbled in transit: generic parameters such as
//! `Result<Token, LexerError>` and `parse::<i64>()` were stripped and all
//! newlines collapsed. The `Token` type (originally `types/token.rs`) is
//! included here so the module is self-contained.

/// Tokens produced by the lexer.
#[derive(PartialEq, Debug, Clone)]
pub enum Token {
    Atom(String),      // Atoms, such as foo, 'Bar', etc.
    Variable(String),  // Variables, such as X, _Y, _, etc.
    Integer(i64),      // Integers, such as 123, -456, etc.
    Float(f64),        // Floating-point numbers, such as 1.23, -4.56e+7, etc.
    String(String),    // Strings, such as "hello world".
    Operator(String),  // Operators, such as +, -, *, /, <, =:=, etc.
    Cut,               // Cut operator (!).
    Comma,             // Comma (,), usually used to separate clauses.
    Period,            // Period (.), used to end a clause.
    Semicolon,         // Semicolon (;), used to represent choice.
    Colon,             // Colon (:), used for module qualifiers.
    ColonDash,         // Colon-dash (:-), separates rule heads and bodies.
    QuestionMark,      // Question mark (?), used for queries.
    LeftParenthesis,   // Left parenthesis (.
    RightParenthesis,  // Right parenthesis ).
    LeftBracket,       // Left bracket [.
    RightBracket,      // Right bracket ].
    LeftCurlyBracket,  // Left curly bracket {.
    RightCurlyBracket, // Right curly bracket }.
    EndOfFile,         // End-of-file marker.
    Comment,           // Comment token (not yet produced by this lexer).
}

/// Errors the lexer can report.
#[derive(Debug, PartialEq)]
pub enum LexerError {
    /// A character that starts no known token.
    UnexpectedCharacter(char),
    /// A digit sequence that failed to parse as i64/f64 (e.g. overflow).
    InvalidNumberFormat(String),
}

pub struct Lexer<'a> {
    input: &'a str,
    /// Byte offset of `ch` within `input`.
    position: usize,
    /// Byte offset of the character after `ch`.
    next_position: usize,
    /// Current character; '\0' once the input is exhausted.
    ch: char,
}

impl<'a> Lexer<'a> {
    /// Create a lexer positioned on the first character of `input`.
    pub fn new(input: &'a str) -> Lexer<'a> {
        let mut lexer = Lexer {
            input,
            position: 0,
            next_position: 0,
            ch: '\0',
        };
        lexer.read_char();
        lexer
    }

    /// Advance to the next character.
    ///
    /// Positions are byte offsets, so slicing `input` stays correct for
    /// multi-byte UTF-8 characters, and each step is O(1). (The original
    /// `chars().nth(next_position)` was O(n) per character — O(n^2) for the
    /// whole input — and mixed char counts with byte-offset slicing.)
    fn read_char(&mut self) {
        self.position = self.next_position;
        match self.input[self.next_position..].chars().next() {
            Some(c) => {
                self.ch = c;
                self.next_position += c.len_utf8();
            }
            None => self.ch = '\0',
        }
    }

    /// Look at the character after `ch` without consuming anything.
    fn peek_char(&self) -> char {
        self.input[self.next_position..].chars().next().unwrap_or('\0')
    }

    /// Return the next token, or an error for an unrecognized character.
    pub fn next_token(&mut self) -> Result<Token, LexerError> {
        self.skip_whitespace();

        // Multi-character tokens consume their own characters and must NOT
        // be followed by an extra `read_char`. (The original advanced once
        // more after *every* token, silently dropping the character that
        // follows an identifier or number — e.g. the '(' in "foo(".)
        if self.ch.is_alphabetic() || self.ch == '_' {
            return Ok(self.read_identifier_or_variable());
        }
        if self.ch.is_ascii_digit() {
            return self.read_number();
        }

        let token = match self.ch {
            '+' | '-' | '*' | '/' | '<' | '>' | '=' => {
                Ok(Token::Operator(self.ch.to_string()))
            }
            '!' => Ok(Token::Cut),
            ',' => Ok(Token::Comma),
            '.' => Ok(Token::Period),
            ';' => Ok(Token::Semicolon),
            ':' => Ok(Token::Colon),
            '?' => Ok(Token::QuestionMark),
            '(' => Ok(Token::LeftParenthesis),
            ')' => Ok(Token::RightParenthesis),
            '[' => Ok(Token::LeftBracket),
            ']' => Ok(Token::RightBracket),
            '{' => Ok(Token::LeftCurlyBracket),
            '}' => Ok(Token::RightCurlyBracket),
            // Do not advance past end of input; repeated calls keep
            // returning EndOfFile.
            '\0' => return Ok(Token::EndOfFile),
            c => Err(LexerError::UnexpectedCharacter(c)),
        };
        // Consume the single-character token (or the offending character,
        // so the caller can continue after an error).
        self.read_char();
        token
    }

    /// Read an identifier starting at `ch` and classify it.
    ///
    /// Variables in Prolog start with an uppercase letter or an underscore;
    /// everything else is an atom.
    fn read_identifier_or_variable(&mut self) -> Token {
        let start = self.position;
        while self.ch.is_alphanumeric() || self.ch == '_' {
            self.read_char();
        }
        let text = &self.input[start..self.position];
        if text.starts_with(char::is_uppercase) || text.starts_with('_') {
            Token::Variable(text.to_string())
        } else {
            Token::Atom(text.to_string())
        }
    }

    /// Read an integer or float literal starting at `ch`.
    fn read_number(&mut self) -> Result<Token, LexerError> {
        let start = self.position;
        while self.ch.is_ascii_digit() {
            self.read_char();
        }

        // A '.' belongs to the number only when a digit follows; otherwise
        // it is the clause-terminating Period, left for the next token.
        // (The original consumed the '.' first and then patched the slice
        // with `position - 1`, losing the Period entirely.)
        if self.ch == '.' && self.peek_char().is_ascii_digit() {
            self.read_char(); // consume '.'
            while self.ch.is_ascii_digit() {
                self.read_char();
            }
            let text = &self.input[start..self.position];
            text.parse::<f64>()
                .map(Token::Float)
                .map_err(|_| LexerError::InvalidNumberFormat(text.to_string()))
        } else {
            let text = &self.input[start..self.position];
            // Overflowing literals (e.g. 40 digits) report an error instead
            // of panicking on unwrap as the original did.
            text.parse::<i64>()
                .map(Token::Integer)
                .map_err(|_| LexerError::InvalidNumberFormat(text.to_string()))
        }
    }

    /// Skip over whitespace between tokens.
    fn skip_whitespace(&mut self) {
        while self.ch.is_whitespace() {
            self.read_char();
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_next_token() {
        let mut lexer = Lexer::new("atom_variable 123 4.56 _Variable");
        assert_eq!(
            lexer.next_token().unwrap(),
            Token::Atom("atom_variable".to_string())
        );
        assert_eq!(lexer.next_token().unwrap(), Token::Integer(123));
        assert_eq!(lexer.next_token().unwrap(), Token::Float(4.56));
        assert_eq!(
            lexer.next_token().unwrap(),
            Token::Variable("_Variable".to_string())
        );
        assert_eq!(lexer.next_token().unwrap(), Token::EndOfFile);
    }

    #[test]
    fn test_adjacent_punctuation() {
        // Regression: tokens directly followed by punctuation must not
        // swallow the punctuation character.
        let mut lexer = Lexer::new("foo(X,1).");
        assert_eq!(lexer.next_token().unwrap(), Token::Atom("foo".to_string()));
        assert_eq!(lexer.next_token().unwrap(), Token::LeftParenthesis);
        assert_eq!(lexer.next_token().unwrap(), Token::Variable("X".to_string()));
        assert_eq!(lexer.next_token().unwrap(), Token::Comma);
        assert_eq!(lexer.next_token().unwrap(), Token::Integer(1));
        assert_eq!(lexer.next_token().unwrap(), Token::RightParenthesis);
        assert_eq!(lexer.next_token().unwrap(), Token::Period);
        assert_eq!(lexer.next_token().unwrap(), Token::EndOfFile);
    }
}