From cb0494fdb8e332a33d0b4f4e4add28ea3a605bc5 Mon Sep 17 00:00:00 2001 From: "Victor M. Alvarez" Date: Thu, 17 Oct 2024 10:03:25 +0200 Subject: [PATCH] fix(parser): recognize the NBSP character as space. --- parser/src/tokenizer/mod.rs | 9 ++++++--- parser/src/tokenizer/tests.rs | 5 +++++ 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/parser/src/tokenizer/mod.rs b/parser/src/tokenizer/mod.rs index 0b15367f..dc298003 100644 --- a/parser/src/tokenizer/mod.rs +++ b/parser/src/tokenizer/mod.rs @@ -549,8 +549,9 @@ enum NormalToken<'src> { Comment, // Space, tab, and many other Unicode characters that are considered spaces. + // https://www.compart.com/en/unicode/U+00A0 // https://www.compart.com/en/unicode/bidiclass/WS - #[regex("[ \t\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{202f}\u{205f}]+")] + #[regex("[ \t\u{a0}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{202f}\u{205f}]+")] Whitespace, #[token("\n")] @@ -595,8 +596,9 @@ enum HexPatternToken { RBracket, // Space, tab, and many other Unicode characters that are considered spaces. + // https://www.compart.com/en/unicode/U+00A0 // https://www.compart.com/en/unicode/bidiclass/WS - #[regex("[ \t\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{202f}\u{205f}]+")] + #[regex("[ \t\u{a0}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{202f}\u{205f}]+")] Whitespace, #[token("\n")] @@ -651,8 +653,9 @@ enum HexJumpToken<'src> { IntegerLit(&'src [u8]), // Space, tab, and many other Unicode characters that are considered spaces. + // https://www.compart.com/en/unicode/U+00A0 // https://www.compart.com/en/unicode/bidiclass/WS - #[regex("[ \t\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{202f}\u{205f}]+")] + #[regex("[ \t\u{a0}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{202f}\u{205f}]+")] Whitespace, #[token("\n")] diff --git a/parser/src/tokenizer/tests.rs b/parser/src/tokenizer/tests.rs index 3ef194ce..a75aa15a 100644 --- a/parser/src/tokenizer/tests.rs +++ b/parser/src/tokenizer/tests.rs @@ -270,6 +270,11 @@ fn whitespaces() { assert_eq!(lexer.next_token(), Some(Token::WHITESPACE(Span(0..2)))); assert_eq!(lexer.next_token(), None); + // No-Break Space (NBSP) (U+00A0). + let mut lexer = super::Tokenizer::new(b"\xC2\xA0"); + assert_eq!(lexer.next_token(), Some(Token::WHITESPACE(Span(0..2)))); + assert_eq!(lexer.next_token(), None); + // "En Quad" character (U+2000). let mut lexer = super::Tokenizer::new(b"\xE2\x80\x80"); assert_eq!(lexer.next_token(), Some(Token::WHITESPACE(Span(0..3))));