From 79cda0135f2a23d575f110ad73432e4f3b705b41 Mon Sep 17 00:00:00 2001 From: M Farkas-Dyck Date: Fri, 20 Jul 2018 15:15:26 -0800 Subject: [PATCH 1/5] Define non-panicking UTF encoding methods on `char` --- src/libcore/char/methods.rs | 62 +++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/src/libcore/char/methods.rs b/src/libcore/char/methods.rs index eee78de90362..a39d8283abde 100644 --- a/src/libcore/char/methods.rs +++ b/src/libcore/char/methods.rs @@ -467,6 +467,37 @@ impl char { } } + /// Encodes this character as UTF-8 into the provided byte buffer, + /// and then returns the subslice of the buffer that contains the encoded character. + /// Returns `None` if buffer too short. + /// + /// # Examples + /// + /// In both of these examples, 'ß' takes two bytes to encode. + /// + /// ``` + /// let mut b = [0; 2]; + /// + /// let result = 'ß'.encode_utf8(&mut b).unwrap(); + /// + /// assert_eq!(result, "ß"); + /// + /// assert_eq!(result.len(), 2); + /// ``` + /// + /// A buffer that's too small: + /// + /// ``` + /// let mut b = [0; 1]; + /// + /// assert_eq!(None, 'ß'.encode_utf8(&mut b)); + /// ``` + #[unstable(feature = "try_unicode_encode_char", issue = "52579")] + #[inline] + pub fn try_encode_utf8(self, dst: &mut [u8]) -> Option<&mut str> { + if dst.len() < self.len_utf8() { None } else { Some(self.encode_utf8(dst)) } + } + /// Encodes this character as UTF-16 into the provided `u16` buffer, /// and then returns the subslice of the buffer that contains the encoded character. /// @@ -525,6 +556,37 @@ impl char { } } + /// Encodes this character as UTF-16 into the provided `u16` buffer, + /// and then returns the subslice of the buffer that contains the encoded character. + /// Returns `None` if buffer too short. + /// + /// # Examples + /// + /// In both of these examples, '𝕊' takes two `u16`s to encode. + /// + /// ``` + /// let mut b = [0; 2]; + /// + /// let result = '𝕊'.encode_utf16(&mut b).unwrap(); + /// + /// assert_eq!(result, "𝕊"); + /// + /// assert_eq!(result.len(), 2); + /// ``` + /// + /// A buffer that's too small: + /// + /// ``` + /// let mut b = [0; 1]; + /// + /// assert_eq!(None, '𝕊'.encode_utf16(&mut b)); + /// ``` + #[unstable(feature = "try_unicode_encode_char", issue = "52579")] + #[inline] + pub fn try_encode_utf16(self, dst: &mut [u16]) -> Option<&mut [u16]> { + if dst.len() < self.len_utf16() { None } else { Some(self.encode_utf16(dst)) } + } + /// Returns true if this `char` is an alphabetic code point, and false if not. /// /// # Examples From fc9e1900137cf7bf169afab2615518fc148d97f2 Mon Sep 17 00:00:00 2001 From: M Farkas-Dyck Date: Sun, 22 Jul 2018 12:12:06 -0800 Subject: [PATCH 2/5] not check UTF length twice --- src/libcore/char/methods.rs | 100 +++++++++++++++++++----------------- 1 file changed, 52 insertions(+), 48 deletions(-) diff --git a/src/libcore/char/methods.rs b/src/libcore/char/methods.rs index a39d8283abde..bf0b42784456 100644 --- a/src/libcore/char/methods.rs +++ b/src/libcore/char/methods.rs @@ -436,34 +436,11 @@ impl char { #[stable(feature = "unicode_encode_char", since = "1.15.0")] #[inline] pub fn encode_utf8(self, dst: &mut [u8]) -> &mut str { - let code = self as u32; - unsafe { - let len = - if code < MAX_ONE_B && !dst.is_empty() { - *dst.get_unchecked_mut(0) = code as u8; - 1 - } else if code < MAX_TWO_B && dst.len() >= 2 { - *dst.get_unchecked_mut(0) = (code >> 6 & 0x1F) as u8 | TAG_TWO_B; - *dst.get_unchecked_mut(1) = (code & 0x3F) as u8 | TAG_CONT; - 2 - } else if code < MAX_THREE_B && dst.len() >= 3 { - *dst.get_unchecked_mut(0) = (code >> 12 & 0x0F) as u8 | TAG_THREE_B; - *dst.get_unchecked_mut(1) = (code >> 6 & 0x3F) as u8 | TAG_CONT; - *dst.get_unchecked_mut(2) = (code & 0x3F) as u8 | TAG_CONT; - 3 - } else if dst.len() >= 4 { - *dst.get_unchecked_mut(0) = (code >> 18 & 0x07) as u8 | TAG_FOUR_B; - *dst.get_unchecked_mut(1) = (code >> 12 & 0x3F) as u8 | TAG_CONT; - *dst.get_unchecked_mut(2) = (code >> 6 & 0x3F) as u8 | TAG_CONT; - *dst.get_unchecked_mut(3) = (code & 0x3F) as u8 | TAG_CONT; - 4 - } else { - panic!("encode_utf8: need {} bytes to encode U+{:X}, but the buffer has {}", - from_u32_unchecked(code).len_utf8(), - code, - dst.len()) - }; - from_utf8_unchecked_mut(dst.get_unchecked_mut(..len)) + let l = dst.len(); + match self.try_encode_utf8(dst) { + Some(s) => s, + None => panic!("encode_utf8: need {} bytes to encode U+{:X}, but the buffer has {}", + self.len_utf8(), self as u32, l), } } @@ -495,7 +472,32 @@ impl char { #[unstable(feature = "try_unicode_encode_char", issue = "52579")] #[inline] pub fn try_encode_utf8(self, dst: &mut [u8]) -> Option<&mut str> { - if dst.len() < self.len_utf8() { None } else { Some(self.encode_utf8(dst)) } + let code = self as u32; + unsafe { + let len = + if code < MAX_ONE_B && !dst.is_empty() { + *dst.get_unchecked_mut(0) = code as u8; + 1 + } else if code < MAX_TWO_B && dst.len() >= 2 { + *dst.get_unchecked_mut(0) = (code >> 6 & 0x1F) as u8 | TAG_TWO_B; + *dst.get_unchecked_mut(1) = (code & 0x3F) as u8 | TAG_CONT; + 2 + } else if code < MAX_THREE_B && dst.len() >= 3 { + *dst.get_unchecked_mut(0) = (code >> 12 & 0x0F) as u8 | TAG_THREE_B; + *dst.get_unchecked_mut(1) = (code >> 6 & 0x3F) as u8 | TAG_CONT; + *dst.get_unchecked_mut(2) = (code & 0x3F) as u8 | TAG_CONT; + 3 + } else if dst.len() >= 4 { + *dst.get_unchecked_mut(0) = (code >> 18 & 0x07) as u8 | TAG_FOUR_B; + *dst.get_unchecked_mut(1) = (code >> 12 & 0x3F) as u8 | TAG_CONT; + *dst.get_unchecked_mut(2) = (code >> 6 & 0x3F) as u8 | TAG_CONT; + *dst.get_unchecked_mut(3) = (code & 0x3F) as u8 | TAG_CONT; + 4 + } else { + return None; + }; + Some(from_utf8_unchecked_mut(dst.get_unchecked_mut(..len))) + } } /// Encodes this character as UTF-16 into the provided `u16` buffer, @@ -535,24 +537,11 @@ impl char { #[stable(feature = "unicode_encode_char", since = "1.15.0")] #[inline] pub fn encode_utf16(self, dst: &mut [u16]) -> &mut [u16] { - let mut code = self as u32; - unsafe { - if (code & 0xFFFF) == code && !dst.is_empty() { - // The BMP falls through (assuming non-surrogate, as it should) - *dst.get_unchecked_mut(0) = code as u16; - slice::from_raw_parts_mut(dst.as_mut_ptr(), 1) - } else if dst.len() >= 2 { - // Supplementary planes break into surrogates. - code -= 0x1_0000; - *dst.get_unchecked_mut(0) = 0xD800 | ((code >> 10) as u16); - *dst.get_unchecked_mut(1) = 0xDC00 | ((code as u16) & 0x3FF); - slice::from_raw_parts_mut(dst.as_mut_ptr(), 2) - } else { - panic!("encode_utf16: need {} units to encode U+{:X}, but the buffer has {}", - from_u32_unchecked(code).len_utf16(), - code, - dst.len()) - } + let l = dst.len(); + match self.try_encode_utf16(dst) { + Some(s) => s, + None => panic!("encode_utf16: need {} units to encode U+{:X}, but the buffer has {}", + self.len_utf16(), self as u32, l), } } @@ -584,7 +573,22 @@ impl char { #[unstable(feature = "try_unicode_encode_char", issue = "52579")] #[inline] pub fn try_encode_utf16(self, dst: &mut [u16]) -> Option<&mut [u16]> { - if dst.len() < self.len_utf16() { None } else { Some(self.encode_utf16(dst)) } + let mut code = self as u32; + unsafe { + if (code & 0xFFFF) == code && !dst.is_empty() { + // The BMP falls through (assuming non-surrogate, as it should) + *dst.get_unchecked_mut(0) = code as u16; + Some(slice::from_raw_parts_mut(dst.as_mut_ptr(), 1)) + } else if dst.len() >= 2 { + // Supplementary planes break into surrogates. + code -= 0x1_0000; + *dst.get_unchecked_mut(0) = 0xD800 | ((code >> 10) as u16); + *dst.get_unchecked_mut(1) = 0xDC00 | ((code as u16) & 0x3FF); + Some(slice::from_raw_parts_mut(dst.as_mut_ptr(), 2)) + } else { + None + } + } } /// Returns true if this `char` is an alphabetic code point, and false if not. From 9ba4046aa6369e0e74ff681c05cf24ff3aee06c1 Mon Sep 17 00:00:00 2001 From: M Farkas-Dyck Date: Sun, 22 Jul 2018 12:14:04 -0800 Subject: [PATCH 3/5] unbreak doctests --- src/libcore/char/methods.rs | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/src/libcore/char/methods.rs b/src/libcore/char/methods.rs index bf0b42784456..ac5c4104049d 100644 --- a/src/libcore/char/methods.rs +++ b/src/libcore/char/methods.rs @@ -455,7 +455,7 @@ impl char { /// ``` /// let mut b = [0; 2]; /// - /// let result = 'ß'.encode_utf8(&mut b).unwrap(); + /// let result = 'ß'.try_encode_utf8(&mut b).unwrap(); /// /// assert_eq!(result, "ß"); /// @@ -467,7 +467,7 @@ impl char { /// ``` /// let mut b = [0; 1]; /// - /// assert_eq!(None, 'ß'.encode_utf8(&mut b)); + /// assert_eq!(None, 'ß'.try_encode_utf8(&mut b)); /// ``` #[unstable(feature = "try_unicode_encode_char", issue = "52579")] #[inline] @@ -517,22 +517,17 @@ impl char { /// /// let result = '𝕊'.encode_utf16(&mut b); /// + /// assert_eq!(result, "𝕊"); + /// /// assert_eq!(result.len(), 2); /// ``` /// /// A buffer that's too small: /// /// ``` - /// use std::thread; - /// - /// let result = thread::spawn(|| { - /// let mut b = [0; 1]; - /// - /// // this panics - /// '𝕊'.encode_utf16(&mut b); - /// }).join(); + /// let mut b = [0; 1]; /// - /// assert!(result.is_err()); + /// assert_eq!(None, '𝕊'.encode_utf16(&mut b)); /// ``` #[stable(feature = "unicode_encode_char", since = "1.15.0")] #[inline] @@ -556,9 +551,7 @@ impl char { /// ``` /// let mut b = [0; 2]; /// - /// let result = '𝕊'.encode_utf16(&mut b).unwrap(); - /// - /// assert_eq!(result, "𝕊"); + /// let result = '𝕊'.try_encode_utf16(&mut b).unwrap(); /// /// assert_eq!(result.len(), 2); /// ``` @@ -568,7 +561,7 @@ impl char { /// ``` /// let mut b = [0; 1]; /// - /// assert_eq!(None, '𝕊'.encode_utf16(&mut b)); + /// assert_eq!(None, '𝕊'.try_encode_utf16(&mut b)); /// ``` #[unstable(feature = "try_unicode_encode_char", issue = "52579")] #[inline] From a4b17ae3dfc80b07c0a9af4e7ca2758bace5c404 Mon Sep 17 00:00:00 2001 From: M Farkas-Dyck Date: Sun, 29 Jul 2018 20:27:06 -0800 Subject: [PATCH 4/5] feature(try_unicode_encode_char) --- src/libcore/lib.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/libcore/lib.rs b/src/libcore/lib.rs index bbe6ae8619fe..cd21250c8753 100644 --- a/src/libcore/lib.rs +++ b/src/libcore/lib.rs @@ -122,6 +122,7 @@ #![feature(const_slice_len)] #![feature(const_str_as_bytes)] #![feature(const_str_len)] +#![feature(try_unicode_encode_char)] #[prelude_import] #[allow(unused)] From 5ceeccdfd59ed11e7a6febeba5bb1a355eb92ff4 Mon Sep 17 00:00:00 2001 From: M Farkas-Dyck Date: Wed, 8 Aug 2018 03:56:14 -0800 Subject: [PATCH 5/5] unbreak docs --- src/libcore/char/methods.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/libcore/char/methods.rs b/src/libcore/char/methods.rs index ac5c4104049d..919793aff879 100644 --- a/src/libcore/char/methods.rs +++ b/src/libcore/char/methods.rs @@ -453,6 +453,8 @@ impl char { /// In both of these examples, 'ß' takes two bytes to encode. /// /// ``` + /// #![feature(try_unicode_encode_char)] + /// /// let mut b = [0; 2]; /// /// let result = 'ß'.try_encode_utf8(&mut b).unwrap(); @@ -465,6 +467,8 @@ impl char { /// A buffer that's too small: /// /// ``` + /// #![feature(try_unicode_encode_char)] + /// /// let mut b = [0; 1]; /// /// assert_eq!(None, 'ß'.try_encode_utf8(&mut b)); @@ -549,6 +553,8 @@ impl char { /// In both of these examples, '𝕊' takes two `u16`s to encode. /// /// ``` + /// #![feature(try_unicode_encode_char)] + /// /// let mut b = [0; 2]; /// /// let result = '𝕊'.try_encode_utf16(&mut b).unwrap(); @@ -559,6 +565,8 @@ impl char { /// A buffer that's too small: /// /// ``` + /// #![feature(try_unicode_encode_char)] + /// /// let mut b = [0; 1]; /// /// assert_eq!(None, '𝕊'.try_encode_utf16(&mut b));