diff --git a/alloc/src/string.rs b/alloc/src/string.rs
index d58a016b502ca..8397735083211 100644
--- a/alloc/src/string.rs
+++ b/alloc/src/string.rs
@@ -2087,7 +2087,32 @@ impl FromUtf8Error {
     #[cfg(not(no_global_oom_handling))]
     #[unstable(feature = "string_from_utf8_lossy_owned", issue = "129436")]
     pub fn into_utf8_lossy(self) -> String {
-        String::from_utf8_lossy_owned(self.bytes)
+        const REPLACEMENT: &str = "\u{FFFD}";
+
+        // `Utf8Error::valid_up_to` returns the maximum index of validated
+        // UTF-8 bytes, so splitting there separates the prefix that can be
+        // copied verbatim from the tail that still has to be scanned.
+        let (valid, rest) = self.bytes.split_at(self.error.valid_up_to());
+
+        // Reserve room for the whole input up front, then seed the output
+        // with the already-validated prefix.
+        let mut buf = Vec::with_capacity(self.bytes.len());
+        buf.extend_from_slice(valid);
+
+        // SAFETY: `buf` holds only the prefix that was validated as UTF-8 by
+        // the call to `String::from_utf8` which produced this
+        // `FromUtf8Error`.
+        let mut lossy = unsafe { String::from_utf8_unchecked(buf) };
+
+        for chunk in rest.utf8_chunks() {
+            lossy.push_str(chunk.valid());
+            // Each non-empty invalid sequence collapses to one U+FFFD.
+            if !chunk.invalid().is_empty() {
+                lossy.push_str(REPLACEMENT);
+            }
+        }
+
+        lossy
     }
 
     /// Returns the bytes that were attempted to convert to a `String`.
diff --git a/alloc/tests/lib.rs b/alloc/tests/lib.rs
index ffc9a233b665d..1d07a7690da43 100644
--- a/alloc/tests/lib.rs
+++ b/alloc/tests/lib.rs
@@ -28,6 +28,7 @@
 #![feature(iter_next_chunk)]
 #![feature(round_char_boundary)]
 #![feature(slice_partition_dedup)]
+#![feature(string_from_utf8_lossy_owned)]
 #![feature(string_remove_matches)]
 #![feature(const_btree_len)]
 #![feature(const_trait_impl)]
diff --git a/alloc/tests/string.rs b/alloc/tests/string.rs
index dc03c4860e84b..d996c55f94660 100644
--- a/alloc/tests/string.rs
+++ b/alloc/tests/string.rs
@@ -114,6 +114,43 @@ fn test_from_utf8_lossy() {
     );
 }
 
+#[test]
+fn test_fromutf8error_into_lossy() {
+    // Fall back to the lossy conversion only when strict validation fails.
+    fn lossy(bytes: &[u8]) -> String {
+        match String::from_utf8(bytes.to_owned()) {
+            Ok(s) => s,
+            Err(e) => e.into_utf8_lossy(),
+        }
+    }
+
+    // Each case pairs raw input bytes with the expected result.
+    let cases: &[(&[u8], &str)] = &[
+        // Well-formed input comes back unchanged.
+        (b"hello", "hello"),
+        ("ศไทย中华Việt Nam".as_bytes(), "ศไทย中华Việt Nam"),
+        // Malformed sequences are replaced with U+FFFD.
+        (b"Hello\xC2 There\xFF Goodbye", "Hello\u{FFFD} There\u{FFFD} Goodbye"),
+        (b"Hello\xC0\x80 There\xE6\x83 Goodbye", "Hello\u{FFFD}\u{FFFD} There\u{FFFD} Goodbye"),
+        (b"\xF5foo\xF5\x80bar", "\u{FFFD}foo\u{FFFD}\u{FFFD}bar"),
+        (b"\xF1foo\xF1\x80bar\xF1\x80\x80baz", "\u{FFFD}foo\u{FFFD}bar\u{FFFD}baz"),
+        (b"\xF4foo\xF4\x80bar\xF4\xBFbaz", "\u{FFFD}foo\u{FFFD}bar\u{FFFD}\u{FFFD}baz"),
+        (
+            b"\xF0\x80\x80\x80foo\xF0\x90\x80\x80bar",
+            "\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}foo\u{10000}bar",
+        ),
+        // surrogates
+        (
+            b"\xED\xA0\x80foo\xED\xBF\xBFbar",
+            "\u{FFFD}\u{FFFD}\u{FFFD}foo\u{FFFD}\u{FFFD}\u{FFFD}bar",
+        ),
+    ];
+
+    for &(input, expected) in cases {
+        assert_eq!(lossy(input), expected);
+    }
+}
+
 #[test]
 fn test_from_utf16() {
     let pairs = [