Skip to content

Commit 289f3a4

Browse files
authored
Auto merge of #36377 - tormol:encode_utf, r=alexcrichton
Change encode_utf{8,16}() to write to a buffer and panic if it's too small. cc #27784. Should the "A buffer that's too small" examples be removed and replaced by tests?
2 parents ff67da6 + 13a2dd9 commit 289f3a4

File tree

9 files changed

+163
-188
lines changed

9 files changed

+163
-188
lines changed

src/libcollections/string.rs

+4-3
Original file line numberDiff line numberDiff line change
@@ -975,7 +975,7 @@ impl String {
975975
pub fn push(&mut self, ch: char) {
976976
match ch.len_utf8() {
977977
1 => self.vec.push(ch as u8),
978-
_ => self.vec.extend_from_slice(ch.encode_utf8().as_slice()),
978+
_ => self.vec.extend_from_slice(ch.encode_utf8(&mut [0;4]).as_bytes()),
979979
}
980980
}
981981

@@ -1131,10 +1131,11 @@ impl String {
11311131
let len = self.len();
11321132
assert!(idx <= len);
11331133
assert!(self.is_char_boundary(idx));
1134-
let bits = ch.encode_utf8();
1134+
let mut bits = [0; 4];
1135+
let bits = ch.encode_utf8(&mut bits).as_bytes();
11351136

11361137
unsafe {
1137-
self.insert_bytes(idx, bits.as_slice());
1138+
self.insert_bytes(idx, bits);
11381139
}
11391140
}
11401141

src/libcollectionstest/str.rs

+4-4
Original file line numberDiff line numberDiff line change
@@ -786,9 +786,9 @@ fn test_rev_iterator() {
786786

787787
#[test]
788788
fn test_chars_decoding() {
789+
let mut bytes = [0; 4];
789790
for c in (0..0x110000).filter_map(::std::char::from_u32) {
790-
let bytes = c.encode_utf8();
791-
let s = ::std::str::from_utf8(bytes.as_slice()).unwrap();
791+
let s = c.encode_utf8(&mut bytes);
792792
if Some(c) != s.chars().next() {
793793
panic!("character {:x}={} does not decode correctly", c as u32, c);
794794
}
@@ -797,9 +797,9 @@ fn test_chars_decoding() {
797797

798798
#[test]
799799
fn test_chars_rev_decoding() {
800+
let mut bytes = [0; 4];
800801
for c in (0..0x110000).filter_map(::std::char::from_u32) {
801-
let bytes = c.encode_utf8();
802-
let s = ::std::str::from_utf8(bytes.as_slice()).unwrap();
802+
let s = c.encode_utf8(&mut bytes);
803803
if Some(c) != s.chars().rev().next() {
804804
panic!("character {:x}={} does not decode correctly", c as u32, c);
805805
}

src/libcore/char.rs

+51-119
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
use char_private::is_printable;
1919
use convert::TryFrom;
2020
use fmt;
21+
use slice;
2122
use iter::FusedIterator;
2223
use mem::transmute;
2324

@@ -327,9 +328,9 @@ pub trait CharExt {
327328
#[stable(feature = "core", since = "1.6.0")]
328329
fn len_utf16(self) -> usize;
329330
#[unstable(feature = "unicode", issue = "27784")]
330-
fn encode_utf8(self) -> EncodeUtf8;
331+
fn encode_utf8(self, dst: &mut [u8]) -> &mut str;
331332
#[unstable(feature = "unicode", issue = "27784")]
332-
fn encode_utf16(self) -> EncodeUtf16;
333+
fn encode_utf16(self, dst: &mut [u16]) -> &mut [u16];
333334
}
334335

335336
#[stable(feature = "core", since = "1.6.0")]
@@ -419,47 +420,59 @@ impl CharExt for char {
419420
}
420421

421422
#[inline]
422-
fn encode_utf8(self) -> EncodeUtf8 {
423+
fn encode_utf8(self, dst: &mut [u8]) -> &mut str {
423424
let code = self as u32;
424-
let mut buf = [0; 4];
425-
let pos = if code < MAX_ONE_B {
426-
buf[3] = code as u8;
427-
3
428-
} else if code < MAX_TWO_B {
429-
buf[2] = (code >> 6 & 0x1F) as u8 | TAG_TWO_B;
430-
buf[3] = (code & 0x3F) as u8 | TAG_CONT;
431-
2
432-
} else if code < MAX_THREE_B {
433-
buf[1] = (code >> 12 & 0x0F) as u8 | TAG_THREE_B;
434-
buf[2] = (code >> 6 & 0x3F) as u8 | TAG_CONT;
435-
buf[3] = (code & 0x3F) as u8 | TAG_CONT;
436-
1
437-
} else {
438-
buf[0] = (code >> 18 & 0x07) as u8 | TAG_FOUR_B;
439-
buf[1] = (code >> 12 & 0x3F) as u8 | TAG_CONT;
440-
buf[2] = (code >> 6 & 0x3F) as u8 | TAG_CONT;
441-
buf[3] = (code & 0x3F) as u8 | TAG_CONT;
442-
0
443-
};
444-
EncodeUtf8 { buf: buf, pos: pos }
425+
unsafe {
426+
let len =
427+
if code < MAX_ONE_B && !dst.is_empty() {
428+
*dst.get_unchecked_mut(0) = code as u8;
429+
1
430+
} else if code < MAX_TWO_B && dst.len() >= 2 {
431+
*dst.get_unchecked_mut(0) = (code >> 6 & 0x1F) as u8 | TAG_TWO_B;
432+
*dst.get_unchecked_mut(1) = (code & 0x3F) as u8 | TAG_CONT;
433+
2
434+
} else if code < MAX_THREE_B && dst.len() >= 3 {
435+
*dst.get_unchecked_mut(0) = (code >> 12 & 0x0F) as u8 | TAG_THREE_B;
436+
*dst.get_unchecked_mut(1) = (code >> 6 & 0x3F) as u8 | TAG_CONT;
437+
*dst.get_unchecked_mut(2) = (code & 0x3F) as u8 | TAG_CONT;
438+
3
439+
} else if dst.len() >= 4 {
440+
*dst.get_unchecked_mut(0) = (code >> 18 & 0x07) as u8 | TAG_FOUR_B;
441+
*dst.get_unchecked_mut(1) = (code >> 12 & 0x3F) as u8 | TAG_CONT;
442+
*dst.get_unchecked_mut(2) = (code >> 6 & 0x3F) as u8 | TAG_CONT;
443+
*dst.get_unchecked_mut(3) = (code & 0x3F) as u8 | TAG_CONT;
444+
4
445+
} else {
446+
panic!("encode_utf8: need {} bytes to encode U+{:X}, but the buffer has {}",
447+
from_u32_unchecked(code).len_utf8(),
448+
code,
449+
dst.len())
450+
};
451+
transmute(slice::from_raw_parts_mut(dst.as_mut_ptr(), len))
452+
}
445453
}
446454

447455
#[inline]
448-
fn encode_utf16(self) -> EncodeUtf16 {
449-
let mut buf = [0; 2];
456+
fn encode_utf16(self, dst: &mut [u16]) -> &mut [u16] {
450457
let mut code = self as u32;
451-
let pos = if (code & 0xFFFF) == code {
452-
// The BMP falls through (assuming non-surrogate, as it should)
453-
buf[1] = code as u16;
454-
1
455-
} else {
456-
// Supplementary planes break into surrogates.
457-
code -= 0x1_0000;
458-
buf[0] = 0xD800 | ((code >> 10) as u16);
459-
buf[1] = 0xDC00 | ((code as u16) & 0x3FF);
460-
0
461-
};
462-
EncodeUtf16 { buf: buf, pos: pos }
458+
unsafe {
459+
if (code & 0xFFFF) == code && !dst.is_empty() {
460+
// The BMP falls through (assuming non-surrogate, as it should)
461+
*dst.get_unchecked_mut(0) = code as u16;
462+
slice::from_raw_parts_mut(dst.as_mut_ptr(), 1)
463+
} else if dst.len() >= 2 {
464+
// Supplementary planes break into surrogates.
465+
code -= 0x1_0000;
466+
*dst.get_unchecked_mut(0) = 0xD800 | ((code >> 10) as u16);
467+
*dst.get_unchecked_mut(1) = 0xDC00 | ((code as u16) & 0x3FF);
468+
slice::from_raw_parts_mut(dst.as_mut_ptr(), 2)
469+
} else {
470+
panic!("encode_utf16: need {} units to encode U+{:X}, but the buffer has {}",
471+
from_u32_unchecked(code).len_utf16(),
472+
code,
473+
dst.len())
474+
}
475+
}
463476
}
464477
}
465478

@@ -702,88 +715,7 @@ impl ExactSizeIterator for EscapeDebug { }
702715
#[unstable(feature = "fused", issue = "35602")]
703716
impl FusedIterator for EscapeDebug {}
704717

705-
/// An iterator over `u8` entries representing the UTF-8 encoding of a `char`
706-
/// value.
707-
///
708-
/// Constructed via the `.encode_utf8()` method on `char`.
709-
#[unstable(feature = "unicode", issue = "27784")]
710-
#[derive(Debug)]
711-
pub struct EncodeUtf8 {
712-
buf: [u8; 4],
713-
pos: usize,
714-
}
715-
716-
impl EncodeUtf8 {
717-
/// Returns the remaining bytes of this iterator as a slice.
718-
#[unstable(feature = "unicode", issue = "27784")]
719-
pub fn as_slice(&self) -> &[u8] {
720-
&self.buf[self.pos..]
721-
}
722-
}
723-
724-
#[unstable(feature = "unicode", issue = "27784")]
725-
impl Iterator for EncodeUtf8 {
726-
type Item = u8;
727-
728-
fn next(&mut self) -> Option<u8> {
729-
if self.pos == self.buf.len() {
730-
None
731-
} else {
732-
let ret = Some(self.buf[self.pos]);
733-
self.pos += 1;
734-
ret
735-
}
736-
}
737-
738-
fn size_hint(&self) -> (usize, Option<usize>) {
739-
self.as_slice().iter().size_hint()
740-
}
741-
}
742-
743-
#[unstable(feature = "fused", issue = "35602")]
744-
impl FusedIterator for EncodeUtf8 {}
745-
746-
/// An iterator over `u16` entries representing the UTF-16 encoding of a `char`
747-
/// value.
748-
///
749-
/// Constructed via the `.encode_utf16()` method on `char`.
750-
#[unstable(feature = "unicode", issue = "27784")]
751-
#[derive(Debug)]
752-
pub struct EncodeUtf16 {
753-
buf: [u16; 2],
754-
pos: usize,
755-
}
756-
757-
impl EncodeUtf16 {
758-
/// Returns the remaining bytes of this iterator as a slice.
759-
#[unstable(feature = "unicode", issue = "27784")]
760-
pub fn as_slice(&self) -> &[u16] {
761-
&self.buf[self.pos..]
762-
}
763-
}
764-
765-
766-
#[unstable(feature = "unicode", issue = "27784")]
767-
impl Iterator for EncodeUtf16 {
768-
type Item = u16;
769-
770-
fn next(&mut self) -> Option<u16> {
771-
if self.pos == self.buf.len() {
772-
None
773-
} else {
774-
let ret = Some(self.buf[self.pos]);
775-
self.pos += 1;
776-
ret
777-
}
778-
}
779-
780-
fn size_hint(&self) -> (usize, Option<usize>) {
781-
self.as_slice().iter().size_hint()
782-
}
783-
}
784718

785-
#[unstable(feature = "fused", issue = "35602")]
786-
impl FusedIterator for EncodeUtf16 {}
787719

788720
/// An iterator over an iterator of bytes of the characters the bytes represent
789721
/// as UTF-8

src/libcore/fmt/mod.rs

+5-13
Original file line numberDiff line numberDiff line change
@@ -97,9 +97,7 @@ pub trait Write {
9797
/// This function will return an instance of `Error` on error.
9898
#[stable(feature = "fmt_write_char", since = "1.1.0")]
9999
fn write_char(&mut self, c: char) -> Result {
100-
self.write_str(unsafe {
101-
str::from_utf8_unchecked(c.encode_utf8().as_slice())
102-
})
100+
self.write_str(c.encode_utf8(&mut [0; 4]))
103101
}
104102

105103
/// Glue for usage of the `write!` macro with implementors of this trait.
@@ -924,9 +922,7 @@ impl<'a> Formatter<'a> {
924922
// Writes the sign if it exists, and then the prefix if it was requested
925923
let write_prefix = |f: &mut Formatter| {
926924
if let Some(c) = sign {
927-
f.buf.write_str(unsafe {
928-
str::from_utf8_unchecked(c.encode_utf8().as_slice())
929-
})?;
925+
f.buf.write_str(c.encode_utf8(&mut [0; 4]))?;
930926
}
931927
if prefixed { f.buf.write_str(prefix) }
932928
else { Ok(()) }
@@ -1032,10 +1028,8 @@ impl<'a> Formatter<'a> {
10321028
rt::v1::Alignment::Center => (padding / 2, (padding + 1) / 2),
10331029
};
10341030

1035-
let fill = self.fill.encode_utf8();
1036-
let fill = unsafe {
1037-
str::from_utf8_unchecked(fill.as_slice())
1038-
};
1031+
let mut fill = [0; 4];
1032+
let fill = self.fill.encode_utf8(&mut fill);
10391033

10401034
for _ in 0..pre_pad {
10411035
self.buf.write_str(fill)?;
@@ -1435,9 +1429,7 @@ impl Display for char {
14351429
if f.width.is_none() && f.precision.is_none() {
14361430
f.write_char(*self)
14371431
} else {
1438-
f.pad(unsafe {
1439-
str::from_utf8_unchecked(self.encode_utf8().as_slice())
1440-
})
1432+
f.pad(self.encode_utf8(&mut [0; 4]))
14411433
}
14421434
}
14431435
}

src/libcoretest/char.rs

+12-9
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
// option. This file may not be copied, modified, or distributed
99
// except according to those terms.
1010

11-
use std::char;
11+
use std::{char,str};
1212
use std::convert::TryFrom;
1313

1414
#[test]
@@ -248,10 +248,12 @@ fn test_escape_unicode() {
248248
#[test]
249249
fn test_encode_utf8() {
250250
fn check(input: char, expect: &[u8]) {
251-
assert_eq!(input.encode_utf8().as_slice(), expect);
252-
for (a, b) in input.encode_utf8().zip(expect) {
253-
assert_eq!(a, *b);
254-
}
251+
let mut buf = [0; 4];
252+
let ptr = buf.as_ptr();
253+
let s = input.encode_utf8(&mut buf);
254+
assert_eq!(s.as_ptr() as usize, ptr as usize);
255+
assert!(str::from_utf8(s.as_bytes()).is_ok());
256+
assert_eq!(s.as_bytes(), expect);
255257
}
256258

257259
check('x', &[0x78]);
@@ -263,10 +265,11 @@ fn test_encode_utf8() {
263265
#[test]
264266
fn test_encode_utf16() {
265267
fn check(input: char, expect: &[u16]) {
266-
assert_eq!(input.encode_utf16().as_slice(), expect);
267-
for (a, b) in input.encode_utf16().zip(expect) {
268-
assert_eq!(a, *b);
269-
}
268+
let mut buf = [0; 2];
269+
let ptr = buf.as_mut_ptr();
270+
let b = input.encode_utf16(&mut buf);
271+
assert_eq!(b.as_mut_ptr() as usize, ptr as usize);
272+
assert_eq!(b, expect);
270273
}
271274

272275
check('x', &[0x0078]);

0 commit comments

Comments
 (0)