Merge pull request #828 from lucacasonato/lone_surrogate

dtolnay · web-flow · commit 691466c02e9a · 2021-11-24T21:18:39.000-08:00
Deserialize lone surrogates into byte bufs
diff --git a/src/de.rs b/src/de.rs
@@ -1580,20 +1580,21 @@ impl<'de, 'a, R: Read<'de>> de::Deserializer<'de> for &'a mut Deserializer<R> {
     /// ```
     ///
     /// Backslash escape sequences like `\n` are still interpreted and required
-    /// to be valid, and `\u` escape sequences are required to represent valid
-    /// Unicode code points.
+    /// to be valid. `\u` escape sequences are required to represent valid
+    /// Unicode code points, except in the case of lone surrogates.
     ///
     /// ```
     /// use serde_bytes::ByteBuf;
     ///
     /// fn look_at_bytes() {
-    ///     let json_data = b"\"invalid unicode surrogate: \\uD801\"";
+    ///     let json_data = b"\"lone surrogate: \\uD801\"";
     ///     let parsed: Result<ByteBuf, _> = serde_json::from_slice(json_data);
     ///
-    ///     assert!(parsed.is_err());
+    ///     assert!(parsed.is_ok());
     ///
-    ///     let expected_msg = "unexpected end of hex escape at line 1 column 35";
-    ///     assert_eq!(expected_msg, parsed.unwrap_err().to_string());
+    ///     let expected = b"lone surrogate: \xED\xA0\x81";
+    ///     let bytes: ByteBuf = parsed.unwrap();
+    ///     assert_eq!(expected, &bytes[..]);
     /// }
     /// #
     /// # look_at_bytes();
diff --git a/src/read.rs b/src/read.rs
@@ -225,7 +225,7 @@ where
                     return result(self, scratch);
                 }
                 b'\\' => {
-                    tri!(parse_escape(self, scratch));
+                    tri!(parse_escape(self, validate, scratch));
                 }
                 _ => {
                     if validate {
@@ -465,7 +465,7 @@ impl<'a> SliceRead<'a> {
                 b'\\' => {
                     scratch.extend_from_slice(&self.slice[start..self.index]);
                     self.index += 1;
-                    tri!(parse_escape(self, scratch));
+                    tri!(parse_escape(self, validate, scratch));
                     start = self.index;
                 }
                 _ => {
@@ -817,6 +817,16 @@ where
     }
 }
 
+fn peek_or_eof<'de, R>(read: &mut R) -> Result<u8>
+where
+    R: ?Sized + Read<'de>,
+{
+    match tri!(read.peek()) {
+        Some(b) => Ok(b),
+        None => error(read, ErrorCode::EofWhileParsingString),
+    }
+}
+
 fn error<'de, R, T>(read: &R, reason: ErrorCode) -> Result<T>
 where
     R: ?Sized + Read<'de>,
@@ -831,7 +841,11 @@ fn as_str<'de, 's, R: Read<'de>>(read: &R, slice: &'s [u8]) -> Result<&'s str> {
 
 /// Parses a JSON escape sequence and appends it into the scratch space. Assumes
 /// the previous byte read was a backslash.
-fn parse_escape<'de, R: Read<'de>>(read: &mut R, scratch: &mut Vec<u8>) -> Result<()> {
+fn parse_escape<'de, R: Read<'de>>(
+    read: &mut R,
+    validate: bool,
+    scratch: &mut Vec<u8>,
+) -> Result<()> {
     let ch = tri!(next_or_eof(read));
 
     match ch {
@@ -845,19 +859,67 @@ fn parse_escape<'de, R: Read<'de>>(read: &mut R, scratch: &mut Vec<u8>) -> Resul
         b't' => scratch.push(b'\t'),
         b'u' => {
             let c = match tri!(read.decode_hex_escape()) {
-                0xDC00..=0xDFFF => {
-                    return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape);
+                n @ 0xDC00..=0xDFFF => {
+                    if validate {
+                        return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape);
+                    }
+
+                    let utf8_bytes = [
+                        (n >> 12 & 0x0F) as u8 | 0b1110_0000,
+                        (n >> 6 & 0x3F) as u8 | 0b1000_0000,
+                        (n & 0x3F) as u8 | 0b1000_0000,
+                    ];
+
+                    scratch.extend_from_slice(&utf8_bytes);
+
+                    return Ok(());
                 }
 
                 // Non-BMP characters are encoded as a sequence of
                 // two hex escapes, representing UTF-16 surrogates.
+                // If `validate` is false and a leading surrogate is not
+                // followed by another `\u` hex escape, we accept it and
+                // emit its 3-byte encoding instead of erroring.
                 n1 @ 0xD800..=0xDBFF => {
-                    if tri!(next_or_eof(read)) != b'\\' {
-                        return error(read, ErrorCode::UnexpectedEndOfHexEscape);
+                    if tri!(peek_or_eof(read)) != b'\\' {
+                        if validate {
+                            read.discard();
+                            return error(read, ErrorCode::UnexpectedEndOfHexEscape);
+                        }
+
+                        let utf8_bytes = [
+                            (n1 >> 12 & 0x0F) as u8 | 0b1110_0000,
+                            (n1 >> 6 & 0x3F) as u8 | 0b1000_0000,
+                            (n1 & 0x3F) as u8 | 0b1000_0000,
+                        ];
+
+                        scratch.extend_from_slice(&utf8_bytes);
+
+                        return Ok(());
                     }
-                    if tri!(next_or_eof(read)) != b'u' {
-                        return error(read, ErrorCode::UnexpectedEndOfHexEscape);
+                    read.discard();
+
+                    if tri!(peek_or_eof(read)) != b'u' {
+                        if validate {
+                            read.discard();
+                            return error(read, ErrorCode::UnexpectedEndOfHexEscape);
+                        }
+
+                        let utf8_bytes = [
+                            (n1 >> 12 & 0x0F) as u8 | 0b1110_0000,
+                            (n1 >> 6 & 0x3F) as u8 | 0b1000_0000,
+                            (n1 & 0x3F) as u8 | 0b1000_0000,
+                        ];
+
+                        scratch.extend_from_slice(&utf8_bytes);
+
+                        // The \ prior to this byte started an escape sequence that is
+                        // not \u, so parse it now; the call cannot re-enter this branch.
+                        parse_escape(read, validate, scratch)?;
+
+                        return Ok(());
                     }
+                    read.discard();
 
                     let n2 = tri!(read.decode_hex_escape());
 
diff --git a/tests/test.rs b/tests/test.rs
@@ -1714,6 +1714,34 @@ fn test_byte_buf_de() {
     assert_eq!(v, bytes);
 }
 
+#[test]
+fn test_byte_buf_de_lone_surrogate() {
+    let bytes = ByteBuf::from(vec![237, 160, 188]);
+    let v: ByteBuf = from_str(r#""\ud83c""#).unwrap();
+    assert_eq!(v, bytes);
+
+    let bytes = ByteBuf::from(vec![237, 160, 188, 10]);
+    let v: ByteBuf = from_str(r#""\ud83c\n""#).unwrap();
+    assert_eq!(v, bytes);
+
+    let bytes = ByteBuf::from(vec![237, 160, 188, 32]);
+    let v: ByteBuf = from_str(r#""\ud83c ""#).unwrap();
+    assert_eq!(v, bytes);
+
+    let bytes = ByteBuf::from(vec![237, 176, 129]);
+    let v: ByteBuf = from_str(r#""\udc01""#).unwrap();
+    assert_eq!(v, bytes);
+
+    let res = from_str::<ByteBuf>(r#""\ud83c\!""#);
+    assert!(res.is_err());
+
+    let res = from_str::<ByteBuf>(r#""\ud83c\u""#);
+    assert!(res.is_err());
+
+    let res = from_str::<ByteBuf>(r#""\ud83c\ud83c""#);
+    assert!(res.is_err());
+}
+
 #[test]
 fn test_byte_buf_de_multiple() {
     let s: Vec<ByteBuf> = from_str(r#"["ab\nc", "cd\ne"]"#).unwrap();