From b86f89a4766bd7b0aa164ec0008b684e989b1c61 Mon Sep 17 00:00:00 2001 From: jgm0 Date: Sat, 1 Feb 2025 17:08:02 -0500 Subject: [PATCH 1/5] Changed DELIMITER (u8) into an array DELIMITERS (u8; 4) that holds various types of possible delimiters. --- src/de/simple_type.rs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/de/simple_type.rs b/src/de/simple_type.rs index fefa9e97..43fd6404 100644 --- a/src/de/simple_type.rs +++ b/src/de/simple_type.rs @@ -8,7 +8,6 @@ use crate::encoding::Decoder; use crate::errors::serialize::DeError; use crate::escape::unescape; use crate::utils::CowRef; -use memchr::memchr; use serde::de::value::UnitDeserializer; use serde::de::{ DeserializeSeed, Deserializer, EnumAccess, IntoDeserializer, SeqAccess, VariantAccess, Visitor, @@ -361,14 +360,17 @@ impl<'de, 'a> SeqAccess<'de> for ListIter<'de, 'a> { T: DeserializeSeed<'de>, { if let Some(mut content) = self.content.take() { - const DELIMITER: u8 = b' '; + const DELIMETERS: [u8; 4] = [b' ', b'\t', b'\r', b'\n']; loop { let string = content.as_str(); if string.is_empty() { return Ok(None); } - return match memchr(DELIMITER, string.as_bytes()) { + + let first_delimiter = string.as_bytes().iter().position(|c| DELIMETERS.contains(c)); + + return match first_delimiter { // No delimiters in the `content`, deserialize it as a whole atomic None => match content { Content::Input(s) => seed.deserialize(AtomicDeserializer { @@ -391,7 +393,7 @@ impl<'de, 'a> SeqAccess<'de> for ListIter<'de, 'a> { // `content` started with a space, skip them all Some(0) => { // Skip all spaces - let start = string.as_bytes().iter().position(|ch| *ch != DELIMITER); + let start = string.as_bytes().iter().position(|c| !DELIMETERS.contains(c)); content = match (start, content) { // We cannot find any non-space character, so string contains only spaces (None, _) => return Ok(None), From 70bc0e9894b1d7f68117980a3bc052cb7181145f Mon Sep 17 00:00:00 2001 From: Wubbzee 
<41394708+JGM01@users.noreply.github.com> Date: Sun, 2 Feb 2025 10:01:57 -0500 Subject: [PATCH 2/5] Update src/de/simple_type.rs Co-authored-by: Mingun --- src/de/simple_type.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/de/simple_type.rs b/src/de/simple_type.rs index 43fd6404..7ac2f72f 100644 --- a/src/de/simple_type.rs +++ b/src/de/simple_type.rs @@ -360,6 +360,8 @@ impl<'de, 'a> SeqAccess<'de> for ListIter<'de, 'a> { T: DeserializeSeed<'de>, { if let Some(mut content) = self.content.take() { + // NOTE: once normalization is implemented, it may be enough + // to check only b' ', because all whitespace will be normalized const DELIMETERS: [u8; 4] = [b' ', b'\t', b'\r', b'\n']; loop { From c19b6c5839f3556a7cb94793bed5f8cdc19e98be Mon Sep 17 00:00:00 2001 From: jgm0 Date: Sun, 2 Feb 2025 10:30:35 -0500 Subject: [PATCH 3/5] Delimiters is a string now, which is smaller and lets me use find. --- src/de/simple_type.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/de/simple_type.rs b/src/de/simple_type.rs index 7ac2f72f..269f4e98 100644 --- a/src/de/simple_type.rs +++ b/src/de/simple_type.rs @@ -362,7 +362,7 @@ impl<'de, 'a> SeqAccess<'de> for ListIter<'de, 'a> { if let Some(mut content) = self.content.take() { // NOTE: once normalization is implemented, it may be enough // to check only b' ', because all whitespace will be normalized - const DELIMETERS: [u8; 4] = [b' ', b'\t', b'\r', b'\n']; + const DELIMETERS: &str = " \t\r\n"; loop { let string = content.as_str(); @@ -370,7 +370,7 @@ impl<'de, 'a> SeqAccess<'de> for ListIter<'de, 'a> { return Ok(None); } - let first_delimiter = string.as_bytes().iter().position(|c| DELIMETERS.contains(c)); + let first_delimiter = string.find(|c| DELIMETERS.contains(c)); return match first_delimiter { // No delimiters in the `content`, deserialize it as a whole atomic @@ -395,7 +395,7 @@ impl<'de, 'a> SeqAccess<'de> for ListIter<'de, 'a> { // `content` started with a space, skip 
them all Some(0) => { // Skip all spaces - let start = string.as_bytes().iter().position(|c| !DELIMETERS.contains(c)); + let start = string.find(|c| !DELIMETERS.contains(c)); content = match (start, content) { // We cannot find any non-space character, so string contains only spaces (None, _) => return Ok(None), From d1c277baaff76f4ac56bcebb19028336b4507645 Mon Sep 17 00:00:00 2001 From: jgm0 Date: Sun, 2 Feb 2025 10:50:37 -0500 Subject: [PATCH 4/5] added test --- src/de/simple_type.rs | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/de/simple_type.rs b/src/de/simple_type.rs index 269f4e98..6d4fe2ec 100644 --- a/src/de/simple_type.rs +++ b/src/de/simple_type.rs @@ -1172,6 +1172,23 @@ mod tests { assert_eq!(seq.next_element::<()>().unwrap(), None); assert_eq!(seq.next_element::<()>().unwrap(), None); } + + #[test] + fn mixed_whitespace_delimiters() { + let mut seq = ListIter { + content: Some(Content::Input("one two\nthree\rfour\tfive six")), + escaped: true, + }; + + assert_eq!(seq.next_element::<&str>().unwrap(), Some("one")); + assert_eq!(seq.next_element::<&str>().unwrap(), Some("two")); + assert_eq!(seq.next_element::<&str>().unwrap(), Some("three")); + assert_eq!(seq.next_element::<&str>().unwrap(), Some("four")); + assert_eq!(seq.next_element::<&str>().unwrap(), Some("five")); + assert_eq!(seq.next_element::<&str>().unwrap(), Some("six")); + assert_eq!(seq.next_element::<&str>().unwrap(), None); + assert_eq!(seq.next_element::<&str>().unwrap(), None); + } } mod utf8 { From e585669136599a11356a12e3b9e466b81a871bad Mon Sep 17 00:00:00 2001 From: Wubbzee <41394708+JGM01@users.noreply.github.com> Date: Sun, 2 Feb 2025 10:57:43 -0500 Subject: [PATCH 5/5] Update Changelog.md --- Changelog.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Changelog.md b/Changelog.md index 7636b5fd..ef263ff8 100644 --- a/Changelog.md +++ b/Changelog.md @@ -17,6 +17,10 @@ ### Bug Fixes +- [#843]: `xs:list` deserialization now splits list items on any whitespace character
(` `, `\t`, `\r` or `\n`) instead of on spaces only. + +[#843]: https://github.com/tafia/quick-xml/pull/843 + ### Misc Changes