diff --git a/src/libcollections/string.rs b/src/libcollections/string.rs index 06952253ef3b0..51fb4f515b287 100644 --- a/src/libcollections/string.rs +++ b/src/libcollections/string.rs @@ -1895,9 +1895,24 @@ impl<'a> From<&'a [char]> for String { #[stable(feature = "stringfromchars", since = "1.12.0")] impl From<Vec<char>> for String { - #[inline] - fn from(v: Vec<char>) -> String { - String::from(v.as_slice()) + fn from(mut v: Vec<char>) -> String { + unsafe { + let cap = v.capacity(); + let ptr = v.as_mut_ptr() as *mut u8; + let mut bytes = 0usize; + + for chr in v.iter() { + // Regular loop instead of copy_nonoverlapping, because LLVM insists on having a + // memcpy for slices shorter than 4 bytes. + for b in chr.encode_utf8().as_slice() { + // Neither bytes nor ptr can overflow. + *ptr.offset(bytes as isize) = *b; + bytes = bytes.wrapping_add(1); + } + } + mem::forget(v); + String::from_raw_parts(ptr, bytes, cap * mem::size_of::<char>()) + } } } diff --git a/src/libcollectionstest/string.rs b/src/libcollectionstest/string.rs index 1652fb5a88d80..aacdc2d1e0668 100644 --- a/src/libcollectionstest/string.rs +++ b/src/libcollectionstest/string.rs @@ -392,6 +392,22 @@ fn test_into_boxed_str() { assert_eq!(&*ys, "hello my name is bob"); } +#[test] +fn test_string_from_vec_char() { + let str1 = String::from(vec!['a', 'b', '😃', 'a', 'b']); + let str2 = String::from(vec!['a', 'ą', 'あ', '🞎']); + let str3 = String::from(vec!['🞎', 'あ', 'ą', 'a']); + assert_eq!("ab😃ab", str1); + assert_eq!(str1.len(), 8); + assert!(str1.capacity() >= 20); + assert_eq!("aąあ🞎", str2); + assert_eq!(str2.len(), 10); + assert!(str2.capacity() >= 16); + assert_eq!("🞎あąa", str3); + assert_eq!(str3.len(), 10); + assert!(str3.capacity() >= 16); +} + #[bench] fn bench_with_capacity(b: &mut Bencher) { b.iter(|| String::with_capacity(100)); diff --git a/src/libcore/char.rs b/src/libcore/char.rs index a3440fe8aa644..24aa7b1fc34ee 100644 --- a/src/libcore/char.rs +++ b/src/libcore/char.rs @@ -28,6 +28,7 @@ const 
TAG_FOUR_B: u8 = 0b1111_0000; const MAX_ONE_B: u32 = 0x80; const MAX_TWO_B: u32 = 0x800; const MAX_THREE_B: u32 = 0x10000; +const UTF8_BUF_SIZE: usize = 4; /* Lu Uppercase_Letter an uppercase letter @@ -644,15 +645,27 @@ impl ExactSizeIterator for EscapeDebug { } #[unstable(feature = "unicode", issue = "27784")] #[derive(Debug)] pub struct EncodeUtf8 { - buf: [u8; 4], + buf: [u8; UTF8_BUF_SIZE], pos: usize, } impl EncodeUtf8 { /// Returns the remaining bytes of this iterator as a slice. #[unstable(feature = "unicode", issue = "27784")] + #[inline(always)] pub fn as_slice(&self) -> &[u8] { - &self.buf[self.pos..] + // We know for sure this method cannot slice out-of-bounds because: + // * 0 ≤ self.pos ≤ 3 + // * self.buf.len() = 4 + // + // This way the slicing will always succeed, but LLVM is incapable of figuring out both + // these conditions hold, resulting in suboptimal code, especially after inlining. + // Ideally there would be a `slice_unchecked` method for slices, but there isn’t any, + // therefore we construct the slice manually. + unsafe { + ::slice::from_raw_parts(self.buf.as_ptr().offset(self.pos as isize), + UTF8_BUF_SIZE.wrapping_sub(self.pos)) + } } }