Commit

refactor: improve lexer
Aloso committed Dec 18, 2024
1 parent 2e53858 commit c863456
Showing 1 changed file with 75 additions and 72 deletions.
147 changes: 75 additions & 72 deletions pomsky-syntax/src/lexer/tokenize.rs
@@ -82,82 +82,85 @@ pub(crate) fn tokenize(mut input: &str) -> Vec<(Token, Span)> {
     let mut result = vec![];
     let mut offset = 0;

-    loop {
-        let input_len = input.len();
-        input = input.trim_start();
-        while input.starts_with('#') {
-            input = input.trim_start_matches(|c| c != '\n').trim_start();
-        }
-        offset += input_len - input.len();
-
-        match input.chars().next() {
-            None => break,
-            Some(c) => {
-                let (len, token) = consume_chain! {
-                    input, c;
-
-                    if input.starts_with(">>") => (2, Token::LookAhead);
-                    if input.starts_with("<<") => (2, Token::LookBehind);
-                    if input.starts_with("::") => (2, Token::DoubleColon);
-
-                    if let Some(token) = lookup_single(c) => (1, token);
-
-                    if c == '\'' => match input[1..].find('\'') {
-                        Some(len_inner) => (len_inner + 2, Token::String),
-                        None => (input.len(), Token::ErrorMsg(LexErrorMsg::UnclosedString)),
-                    };
-
-                    if c == '"' => match find_unescaped_quote(&input[1..]) {
-                        Some(len_inner) => (len_inner + 2, Token::String),
-                        None => (input.len(), Token::ErrorMsg(LexErrorMsg::UnclosedString)),
-                    };
-
-                    if let Some((len, _)) = (
-                        'U',
-                        Many0(CharIs(char::is_whitespace)),
-                        '+',
-                        Many0(CharIs(char::is_whitespace)),
-                        Many1(CharIs(|c| c.is_alphanumeric() || c == '_')),
-                    ).is_start(input) => {
-                        if input[1..len].trim_start_matches(|c: char| c == '+' || c.is_whitespace())
-                            .contains(|c: char| !c.is_ascii_hexdigit()) {
-                            (len, Token::ErrorMsg(LexErrorMsg::InvalidCodePoint))
-                        } else {
-                            (len, Token::CodePoint)
-                        }
-                    };
-
-                    if let Some((len, _)) = (
-                        Many1(CharIs(|c| c.is_ascii_digit()))
-                    ).is_start(input) => match (input.as_bytes(), len) {
-                        ([b'0', ..], 2..) => (len, Token::ErrorMsg(LexErrorMsg::LeadingZero)),
-                        _ => (len, Token::Number),
-                    };
-
-                    if let Some((len, _)) = (
-                        CharIs(|c| c.is_alphabetic() || c == '_'),
-                        Many0(CharIs(|c| c.is_alphanumeric() || c == '_'))
-                    ).is_start(input) => match &input[..len] {
-                        reserved_word_pattern!() => (len, Token::ReservedName),
-                        _ => (len, Token::Identifier),
-                    };
-
-                    if let Some((len, err)) = parse_special_group(input) => (len, Token::ErrorMsg(err));
-
-                    if c == '(' => (1, Token::OpenParen);
-
-                    if let Some((len, err)) = parse_backslash(input) => (len, Token::ErrorMsg(err));
-                };
-
-                let start = offset;
-                offset += len;
-                input = &input[len..];
-                result.push((token, Span::new(start, offset)));
-            }
-        }
-    }
+    while let Some((token, start, end)) = next_token(input) {
+        result.push((token, Span::new(offset + start, offset + end)));
+        input = &input[end..];
+        offset += end;
+    }

     result
 }

+pub fn next_token(mut input: &str) -> Option<(Token, usize, usize)> {
+    let input_len = input.len();
+    input = input.trim_start();
+    while input.starts_with('#') {
+        input = input.trim_start_matches(|c| c != '\n').trim_start();
+    }
+    let start = input_len - input.len();
+
+    match input.chars().next() {
+        None => None,
+        Some(c) => {
+            let (len, token) = consume_chain! {
+                input, c;
+
+                if input.starts_with(">>") => (2, Token::LookAhead);
+                if input.starts_with("<<") => (2, Token::LookBehind);
+                if input.starts_with("::") => (2, Token::DoubleColon);
+
+                if let Some(token) = lookup_single(c) => (1, token);
+
+                if c == '\'' => match input[1..].find('\'') {
+                    Some(len_inner) => (len_inner + 2, Token::String),
+                    None => (input.len(), Token::ErrorMsg(LexErrorMsg::UnclosedString)),
+                };
+
+                if c == '"' => match find_unescaped_quote(&input[1..]) {
+                    Some(len_inner) => (len_inner + 2, Token::String),
+                    None => (input.len(), Token::ErrorMsg(LexErrorMsg::UnclosedString)),
+                };
+
+                if let Some((len, _)) = (
+                    'U',
+                    Many0(CharIs(char::is_whitespace)),
+                    '+',
+                    Many0(CharIs(char::is_whitespace)),
+                    Many1(CharIs(|c| c.is_alphanumeric() || c == '_')),
+                ).is_start(input) => {
+                    if input[1..len].trim_start_matches(|c: char| c == '+' || c.is_whitespace())
+                        .contains(|c: char| !c.is_ascii_hexdigit()) {
+                        (len, Token::ErrorMsg(LexErrorMsg::InvalidCodePoint))
+                    } else {
+                        (len, Token::CodePoint)
+                    }
+                };
+
+                if let Some((len, _)) = (
+                    Many1(CharIs(|c| c.is_ascii_digit()))
+                ).is_start(input) => match (input.as_bytes(), len) {
+                    ([b'0', ..], 2..) => (len, Token::ErrorMsg(LexErrorMsg::LeadingZero)),
+                    _ => (len, Token::Number),
+                };
+
+                if let Some((len, _)) = (
+                    CharIs(|c| c.is_alphabetic() || c == '_'),
+                    Many0(CharIs(|c| c.is_alphanumeric() || c == '_'))
+                ).is_start(input) => match &input[..len] {
+                    reserved_word_pattern!() => (len, Token::ReservedName),
+                    _ => (len, Token::Identifier),
+                };
+
+                if let Some((len, err)) = parse_special_group(input) => (len, Token::ErrorMsg(err));
+
+                if c == '(' => (1, Token::OpenParen);
+
+                if let Some((len, err)) = parse_backslash(input) => (len, Token::ErrorMsg(err));
+            };
+
+            Some((token, start, start + len))
+        }
+    }
+}

 fn find_unescaped_quote(input: &str) -> Option<usize> {
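Editorial note (not part of the commit): the refactor moves token recognition out of the `tokenize` loop into a standalone, now-public `next_token`, which returns one token plus its start/end offsets relative to the remaining input, so the lexer can also be driven one token at a time. A minimal sketch of such a caller, assuming only `Token`, `Span`, and `next_token` from the file above; the `Tokens` adapter is a made-up name, not part of the commit:

// Editorial sketch: a lazy token iterator built on `next_token`.
struct Tokens<'a> {
    input: &'a str,
    offset: usize,
}

impl Iterator for Tokens<'_> {
    type Item = (Token, Span);

    fn next(&mut self) -> Option<Self::Item> {
        // `start` and `end` are relative to the remaining input, so shift them
        // by the running `offset` to get absolute positions, exactly as the
        // new `tokenize` loop does, then advance past the consumed token.
        let (token, start, end) = next_token(self.input)?;
        let span = Span::new(self.offset + start, self.offset + end);
        self.input = &self.input[end..];
        self.offset += end;
        Some((token, span))
    }
}

// Usage: Tokens { input: source, offset: 0 } yields the same (Token, Span)
// pairs that `tokenize(source)` collects into a Vec.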
