Skip to content

Commit

Permalink
fix(parser): recognize the NBSP character as space.
Browse files (browse the repository at this point in the history)
  • Loading branch information
plusvic committed Oct 17, 2024
1 parent 4667ca8 commit cb0494f
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 3 deletions.
9 changes: 6 additions & 3 deletions parser/src/tokenizer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -549,8 +549,9 @@ enum NormalToken<'src> {
Comment,

// Space, tab, and many other Unicode characters that are considered spaces.
// https://www.compart.com/en/unicode/U+00A0
// https://www.compart.com/en/unicode/bidiclass/WS
#[regex("[ \t\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{202f}\u{205f}]+")]
#[regex("[ \t\u{a0}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{202f}\u{205f}]+")]
Whitespace,

#[token("\n")]
Expand Down Expand Up @@ -595,8 +596,9 @@ enum HexPatternToken {
RBracket,

// Space, tab, and many other Unicode characters that are considered spaces.
// https://www.compart.com/en/unicode/U+00A0
// https://www.compart.com/en/unicode/bidiclass/WS
#[regex("[ \t\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{202f}\u{205f}]+")]
#[regex("[ \t\u{a0}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{202f}\u{205f}]+")]
Whitespace,

#[token("\n")]
Expand Down Expand Up @@ -651,8 +653,9 @@ enum HexJumpToken<'src> {
IntegerLit(&'src [u8]),

// Space, tab, and many other Unicode characters that are considered spaces.
// https://www.compart.com/en/unicode/U+00A0
// https://www.compart.com/en/unicode/bidiclass/WS
#[regex("[ \t\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{202f}\u{205f}]+")]
#[regex("[ \t\u{a0}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{202f}\u{205f}]+")]
Whitespace,

#[token("\n")]
Expand Down
5 changes: 5 additions & 0 deletions parser/src/tokenizer/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,11 @@ fn whitespaces() {
assert_eq!(lexer.next_token(), Some(Token::WHITESPACE(Span(0..2))));
assert_eq!(lexer.next_token(), None);

// No-Break Space (NBSP) (U+00A0).
let mut lexer = super::Tokenizer::new(b"\xC2\xA0");
assert_eq!(lexer.next_token(), Some(Token::WHITESPACE(Span(0..2))));
assert_eq!(lexer.next_token(), None);

// "En Quad" character (U+2000).
let mut lexer = super::Tokenizer::new(b"\xE2\x80\x80");
assert_eq!(lexer.next_token(), Some(Token::WHITESPACE(Span(0..3))));
Expand Down

0 comments on commit cb0494f

Please sign in to comment.