Skip to content

Commit 691466c

Browse files
authored
Merge pull request #828 from lucacasonato/lone_surrogate
Deserialize lone surrogates into byte bufs
2 parents 33c3134 + 07c740c commit 691466c

File tree

3 files changed

+106
-15
lines changed

3 files changed

+106
-15
lines changed

src/de.rs

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1580,20 +1580,21 @@ impl<'de, 'a, R: Read<'de>> de::Deserializer<'de> for &'a mut Deserializer<R> {
15801580
/// ```
15811581
///
15821582
/// Backslash escape sequences like `\n` are still interpreted and required
1583-
/// to be valid, and `\u` escape sequences are required to represent valid
1584-
/// Unicode code points.
1583+
/// to be valid. `\u` escape sequences are required to represent valid
1584+
/// Unicode code points, except in the case of lone surrogates.
15851585
///
15861586
/// ```
15871587
/// use serde_bytes::ByteBuf;
15881588
///
15891589
/// fn look_at_bytes() {
1590-
/// let json_data = b"\"invalid unicode surrogate: \\uD801\"";
1590+
/// let json_data = b"\"lone surrogate: \\uD801\"";
15911591
/// let parsed: Result<ByteBuf, _> = serde_json::from_slice(json_data);
15921592
///
1593-
/// assert!(parsed.is_err());
1593+
/// assert!(parsed.is_ok());
15941594
///
1595-
/// let expected_msg = "unexpected end of hex escape at line 1 column 35";
1596-
/// assert_eq!(expected_msg, parsed.unwrap_err().to_string());
1595+
/// let expected = b"lone surrogate: \xED\xA0\x81";
1596+
/// let bytes: ByteBuf = parsed.unwrap();
1597+
/// assert_eq!(expected, &bytes[..]);
15971598
/// }
15981599
/// #
15991600
/// # look_at_bytes();

src/read.rs

Lines changed: 71 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -225,7 +225,7 @@ where
225225
return result(self, scratch);
226226
}
227227
b'\\' => {
228-
tri!(parse_escape(self, scratch));
228+
tri!(parse_escape(self, validate, scratch));
229229
}
230230
_ => {
231231
if validate {
@@ -465,7 +465,7 @@ impl<'a> SliceRead<'a> {
465465
b'\\' => {
466466
scratch.extend_from_slice(&self.slice[start..self.index]);
467467
self.index += 1;
468-
tri!(parse_escape(self, scratch));
468+
tri!(parse_escape(self, validate, scratch));
469469
start = self.index;
470470
}
471471
_ => {
@@ -817,6 +817,16 @@ where
817817
}
818818
}
819819

820+
/// Peeks at the next input byte, treating end of input as an error.
///
/// Returns the byte yielded by `read.peek()` when there is one. If `peek`
/// reports `None`, this fails with `ErrorCode::EofWhileParsingString`; an
/// error from `peek` itself is propagated by `tri!`.
///
/// Callers in `parse_escape` follow a successful peek with `read.discard()`
/// when they want to consume the byte, so this presumably leaves the byte in
/// the input (confirm against the `Read::peek` contract).
fn peek_or_eof<'de, R>(read: &mut R) -> Result<u8>
821+
where
822+
R: ?Sized + Read<'de>,
823+
{
824+
match tri!(read.peek()) {
825+
Some(b) => Ok(b),
826+
None => error(read, ErrorCode::EofWhileParsingString),
827+
}
828+
}
829+
820830
fn error<'de, R, T>(read: &R, reason: ErrorCode) -> Result<T>
821831
where
822832
R: ?Sized + Read<'de>,
@@ -831,7 +841,11 @@ fn as_str<'de, 's, R: Read<'de>>(read: &R, slice: &'s [u8]) -> Result<&'s str> {
831841

832842
/// Parses a JSON escape sequence and appends it into the scratch space. Assumes
833843
/// the previous byte read was a backslash.
834-
fn parse_escape<'de, R: Read<'de>>(read: &mut R, scratch: &mut Vec<u8>) -> Result<()> {
844+
fn parse_escape<'de, R: Read<'de>>(
845+
read: &mut R,
846+
validate: bool,
847+
scratch: &mut Vec<u8>,
848+
) -> Result<()> {
835849
let ch = tri!(next_or_eof(read));
836850

837851
match ch {
@@ -845,19 +859,67 @@ fn parse_escape<'de, R: Read<'de>>(read: &mut R, scratch: &mut Vec<u8>) -> Resul
845859
b't' => scratch.push(b'\t'),
846860
b'u' => {
847861
let c = match tri!(read.decode_hex_escape()) {
848-
0xDC00..=0xDFFF => {
849-
return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape);
862+
n @ 0xDC00..=0xDFFF => {
863+
if validate {
864+
return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape);
865+
}
866+
867+
let utf8_bytes = [
868+
(n >> 12 & 0x0F) as u8 | 0b1110_0000,
869+
(n >> 6 & 0x3F) as u8 | 0b1000_0000,
870+
(n & 0x3F) as u8 | 0b1000_0000,
871+
];
872+
873+
scratch.extend_from_slice(&utf8_bytes);
874+
875+
return Ok(());
850876
}
851877

852878
// Non-BMP characters are encoded as a sequence of
853879
// two hex escapes, representing UTF-16 surrogates.
880+
// If `validate` is false and we only find a single
881+
// hex escape that is a surrogate, then we'll accept
882+
// it instead of erroring.
854883
n1 @ 0xD800..=0xDBFF => {
855-
if tri!(next_or_eof(read)) != b'\\' {
856-
return error(read, ErrorCode::UnexpectedEndOfHexEscape);
884+
if tri!(peek_or_eof(read)) != b'\\' {
885+
if validate {
886+
read.discard();
887+
return error(read, ErrorCode::UnexpectedEndOfHexEscape);
888+
}
889+
890+
let utf8_bytes = [
891+
(n1 >> 12 & 0x0F) as u8 | 0b1110_0000,
892+
(n1 >> 6 & 0x3F) as u8 | 0b1000_0000,
893+
(n1 & 0x3F) as u8 | 0b1000_0000,
894+
];
895+
896+
scratch.extend_from_slice(&utf8_bytes);
897+
898+
return Ok(());
857899
}
858-
if tri!(next_or_eof(read)) != b'u' {
859-
return error(read, ErrorCode::UnexpectedEndOfHexEscape);
900+
read.discard();
901+
902+
if tri!(peek_or_eof(read)) != b'u' {
903+
if validate {
904+
read.discard();
905+
return error(read, ErrorCode::UnexpectedEndOfHexEscape);
906+
}
907+
908+
let utf8_bytes = [
909+
(n1 >> 12 & 0x0F) as u8 | 0b1110_0000,
910+
(n1 >> 6 & 0x3F) as u8 | 0b1000_0000,
911+
(n1 & 0x3F) as u8 | 0b1000_0000,
912+
];
913+
914+
scratch.extend_from_slice(&utf8_bytes);
915+
916+
// The \ prior to this byte started an escape sequence,
917+
// so we need to parse that now.
918+
parse_escape(read, validate, scratch)?;
919+
920+
return Ok(());
860921
}
922+
read.discard();
861923

862924
let n2 = tri!(read.decode_hex_escape());
863925

tests/test.rs

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1714,6 +1714,34 @@ fn test_byte_buf_de() {
17141714
assert_eq!(v, bytes);
17151715
}
17161716

1717+
// Checks that a `\u` escape holding a lone UTF-16 surrogate deserializes into
// a `ByteBuf` as the three-byte encoding of that code unit, and that malformed
// follow-up escapes are still rejected.
#[test]
1718+
fn test_byte_buf_de_lone_surrogate() {
1719+
// Lone surrogate 0xD83C at the end of the string: 237, 160, 188 is
// 0xED 0xA0 0xBC, the three-byte encoding of 0xD83C.
let bytes = ByteBuf::from(vec![237, 160, 188]);
1720+
let v: ByteBuf = from_str(r#""\ud83c""#).unwrap();
1721+
assert_eq!(v, bytes);
1722+
1723+
// Lone surrogate followed by a different escape: the `\n` is still decoded
// (byte 10) after the surrogate bytes.
let bytes = ByteBuf::from(vec![237, 160, 188, 10]);
1724+
let v: ByteBuf = from_str(r#""\ud83c\n""#).unwrap();
1725+
assert_eq!(v, bytes);
1726+
1727+
// Lone surrogate followed by an ordinary character (space, byte 32).
let bytes = ByteBuf::from(vec![237, 160, 188, 32]);
1728+
let v: ByteBuf = from_str(r#""\ud83c ""#).unwrap();
1729+
assert_eq!(v, bytes);
1730+
1731+
// Lone surrogate from the 0xDC00..=0xDFFF range: 237, 176, 129 is
// 0xED 0xB0 0x81, the three-byte encoding of 0xDC01.
let bytes = ByteBuf::from(vec![237, 176, 129]);
1732+
let v: ByteBuf = from_str(r#""\udc01""#).unwrap();
1733+
assert_eq!(v, bytes);
1734+
1735+
// `\!` after the surrogate must still be an error.
let res = from_str::<ByteBuf>(r#""\ud83c\!""#);
1736+
assert!(res.is_err());
1737+
1738+
// `\u` with no hex digits after the surrogate must still be an error.
let res = from_str::<ByteBuf>(r#""\ud83c\u""#);
1739+
assert!(res.is_err());
1740+
1741+
// A second `\u` escape repeating 0xD83C (again in the 0xD800..=0xDBFF
// range) is expected to be rejected; the code that handles the second
// escape is outside the visible hunk.
let res = from_str::<ByteBuf>(r#""\ud83c\ud83c""#);
1742+
assert!(res.is_err());
1743+
}
1744+
17171745
#[test]
17181746
fn test_byte_buf_de_multiple() {
17191747
let s: Vec<ByteBuf> = from_str(r#"["ab\nc", "cd\ne"]"#).unwrap();

0 commit comments

Comments
 (0)