feat(error): Provide ParseError::char_span

This simplifies adapting parse errors to application errors.
winnow-rs · Feb 19, 2025 · 8021856 · 8021856
1 parent 2e94c45
commit 8021856
Show file tree

Hide file tree

Showing 2 changed files with 92 additions and 3 deletions.
diff --git a/src/_tutorial/chapter_7.rs b/src/_tutorial/chapter_7.rs
@@ -625,13 +625,12 @@
 //!         // customized as well to better fit your needs.
 //!         let message = error.inner().to_string();
 //!         let input = input.to_owned();
-//!         let start = error.offset();
 //!         // Assume the error span is only for the first `char`.
 //!         // Semantic errors are free to choose the entire span returned by `Parser::with_span`.
-//!         let end = (start + 1..).find(|e| input.is_char_boundary(*e)).unwrap_or(start);
+//!         let span = error.char_span();
 //!         Self {
 //!             message,
-//!             span: start..end,
+//!             span,
 //!             input,
 //!         }
 //!     }

diff --git a/src/error.rs b/src/error.rs
@@ -1263,6 +1263,8 @@ impl<I, E> ParseError<I, E> {
 
     /// The location in [`ParseError::input`] where parsing failed
     ///
+    /// To get the span for the `char` this points to, see [`ParseError::char_span`].
+    ///
     /// <div class="warning">
     ///
     /// **Note:** This is an offset, not an index, and may point to the end of input
@@ -1287,6 +1289,48 @@ impl<I, E> ParseError<I, E> {
     }
 }
 
+impl<I: AsBStr, E> ParseError<I, E> {
+    /// The byte indices for the `char` at [`ParseError::offset`]
+    ///
+    /// The offset may land in the middle of a multi-byte UTF-8 sequence; the returned range is
+    /// widened to cover the whole `char`.  When the offset is at the end of the input the
+    /// range is empty (`offset..offset`).
+    // Spelled with `core::` (same type as `std::ops::Range`) to match the `core::` paths used
+    // elsewhere in this file and to avoid depending on the `std` feature.
+    #[inline]
+    pub fn char_span(&self) -> core::ops::Range<usize> {
+        char_boundary(self.input.as_bstr(), self.offset())
+    }
+}
+
+/// Byte range of the UTF-8 `char` that contains `offset` within `input`.
+///
+/// - `offset == input.len()` yields the empty range `offset..offset`.
+/// - Otherwise the range starts at the closest non-continuation byte at or before `offset`
+///   (or `0` if there is none) and ends at the next non-continuation byte after `offset`
+///   (or `input.len()` if there is none).
+/// - An `offset` past the end of `input` resolves to the last `char` of `input`.
+fn char_boundary(input: &[u8], offset: usize) -> core::ops::Range<usize> {
+    let len = input.len();
+    if offset == len {
+        return offset..offset;
+    }
+
+    // First index strictly after `offset`, clamped to the input.  `saturating_add` keeps
+    // `offset == usize::MAX` from overflowing.
+    let after = offset.saturating_add(1).min(len);
+    let (head, tail) = input.split_at(after);
+
+    // Walk backwards from `offset` to the byte that starts the `char`.
+    let start = head
+        .iter()
+        .rposition(|b| is_utf8_char_boundary(*b))
+        .unwrap_or(0);
+    // Walk forwards to the byte that starts the following `char`.
+    let end = tail
+        .iter()
+        .position(|b| is_utf8_char_boundary(*b))
+        .map_or(len, |i| after + i);
+    start..end
+}
+
+/// Whether `b` can begin a UTF-8 encoded `char`
+///
+/// Equivalent to the check used by `core::num`: every byte except a continuation byte
+/// (`0b10xx_xxxx`, i.e. `0x80..=0xBF`) is a boundary.
+const fn is_utf8_char_boundary(b: u8) -> bool {
+    !matches!(b, 0x80..=0xBF)
+}
+
 impl<I, E> core::fmt::Display for ParseError<I, E>
 where
     I: AsBStr,
@@ -1384,6 +1428,52 @@ fn translate_position(input: &[u8], index: usize) -> (usize, usize) {
     (line, column)
 }
 
+#[cfg(test)]
+mod test_char_boundary {
+    use super::*;
+
+    /// Check that `char_boundary` returns the expected span for every `(offset, span)` pair.
+    fn assert_spans(input: &str, cases: &[(usize, core::ops::Range<usize>)]) {
+        let bytes = input.as_bytes();
+        for (offset, expected) in cases {
+            let actual = char_boundary(bytes, *offset);
+            assert_eq!(&actual, expected, "input={input:?}, offset={offset:?}");
+        }
+    }
+
+    /// Every byte of ASCII input is its own `char`; the end of input is an empty span.
+    #[test]
+    fn ascii() {
+        assert_spans("hi", &[(0, 0..1), (1, 1..2), (2, 2..2)]);
+    }
+
+    /// Offsets on either byte of a two-byte `char` resolve to that `char`'s full span.
+    #[test]
+    fn utf8() {
+        let input = "βèƒôřè";
+        assert_eq!(input.len(), 12);
+        assert_spans(
+            input,
+            &[
+                (0, 0..2),
+                (1, 0..2),
+                (2, 2..4),
+                (3, 2..4),
+                (4, 4..6),
+                (5, 4..6),
+                (6, 6..8),
+                (7, 6..8),
+                (8, 8..10),
+                (9, 8..10),
+                (10, 10..12),
+                (11, 10..12),
+                (12, 12..12),
+            ],
+        );
+    }
+}
+
 #[cfg(test)]
 #[cfg(feature = "std")]
 mod test_parse_error {