diff --git a/Cargo.toml b/Cargo.toml index df75c4001..bd61fd869 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,6 +16,7 @@ edition = "2018" serde = { version = "1.0.100", default-features = false } indexmap = { version = "1.2", optional = true } itoa = { version = "0.4.3", default-features = false } +minimal-lexical = { git = "https://github.com/Alexhuszagh/minimal-lexical" } ryu = "1.0" [dev-dependencies] diff --git a/src/de.rs b/src/de.rs index 3aa8f298c..749c95597 100644 --- a/src/de.rs +++ b/src/de.rs @@ -23,6 +23,7 @@ pub struct Deserializer { read: R, scratch: Vec, remaining_depth: u8, + requested_f32: bool, #[cfg(feature = "unbounded_depth")] disable_recursion_limit: bool, } @@ -43,6 +44,7 @@ where #[cfg(not(feature = "unbounded_depth"))] { Deserializer { + requested_f32: false, read: read, scratch: Vec::new(), remaining_depth: 128, @@ -52,6 +54,7 @@ where #[cfg(feature = "unbounded_depth")] { Deserializer { + requested_f32: false, read: read, scratch: Vec::new(), remaining_depth: 128, @@ -316,6 +319,18 @@ impl<'de, R: Read<'de>> Deserializer { } }; + // to deserialize floats we'll first push the 'integer' and 'fraction' parts, both as byte + // strings, into the scratch buffer and then feed both slices to `minimal-lexical`. For + // example, if the input is "12.34e5" we'll push b"1234" into the scratch buffer and then + // we'll pass b"12" and b"34" as iterators to `minimal-lexical`. `integer_end` will be used + // to track where to split the `scratch` buffer + // + // Note that `minimal-lexical` expects the first string to contain *no* leading zeroes and + // the second string to contain *no* trailing zeroes. The first requirement is already + // handled by the integer parsing logic. The second requirement will be enforced just before + // passing the slices to `minimal-lexical`, in `f64_from_parts` + self.scratch.clear(); + let value = match peek { b'-' => { self.eat_char(); @@ -391,6 +406,8 @@ impl<'de, R: Read<'de>> Deserializer { match next { b'0' => { + self.scratch.push(next); + // There can be only one leading '0'. match tri!(self.peek_or_null()) { b'0'..=b'9' => Err(self.peek_error(ErrorCode::InvalidNumber)), @@ -398,11 +415,13 @@ impl<'de, R: Read<'de>> Deserializer { } } c @ b'1'..=b'9' => { + self.scratch.push(c); let mut res = (c - b'0') as u64; loop { match tri!(self.peek_or_null()) { c @ b'0'..=b'9' => { + self.scratch.push(c); self.eat_char(); let digit = (c - b'0') as u64; @@ -410,11 +429,9 @@ impl<'de, R: Read<'de>> Deserializer { // number as a `u64` until we grow too large. At that point, switch to // parsing the value as a `f64`. if overflow!(res * 10 + digit, u64::max_value()) { - return Ok(ParserNumber::F64(tri!(self.parse_long_integer( - positive, - res, - 1, // res * 10^1 - )))); + return Ok(ParserNumber::F64(tri!( + self.parse_long_integer(positive,) + ))); } res = res * 10 + digit; @@ -429,37 +446,31 @@ impl<'de, R: Read<'de>> Deserializer { } } - fn parse_long_integer( - &mut self, - positive: bool, - significand: u64, - mut exponent: i32, - ) -> Result { + fn parse_long_integer(&mut self, positive: bool) -> Result { loop { match tri!(self.peek_or_null()) { - b'0'..=b'9' => { + c @ b'0'..=b'9' => { + self.scratch.push(c); self.eat_char(); - // This could overflow... if your integer is gigabytes long. - // Ignore that possibility. - exponent += 1; } b'.' => { - return self.parse_decimal(positive, significand, exponent); + return self.parse_decimal(positive, self.scratch.len()); } b'e' | b'E' => { - return self.parse_exponent(positive, significand, exponent); + return self.parse_exponent(positive, self.scratch.len()); } _ => { - return self.f64_from_parts(positive, significand, exponent); + return self.f64_from_parts(positive, self.scratch.len(), 0); } } } } fn parse_number(&mut self, positive: bool, significand: u64) -> Result { + let integer_end = self.scratch.len(); Ok(match tri!(self.peek_or_null()) { - b'.' => ParserNumber::F64(tri!(self.parse_decimal(positive, significand, 0))), - b'e' | b'E' => ParserNumber::F64(tri!(self.parse_exponent(positive, significand, 0))), + b'.' => ParserNumber::F64(tri!(self.parse_decimal(positive, integer_end))), + b'e' | b'E' => ParserNumber::F64(tri!(self.parse_exponent(positive, integer_end))), _ => { if positive { ParserNumber::U64(significand) @@ -477,31 +488,14 @@ impl<'de, R: Read<'de>> Deserializer { }) } - fn parse_decimal( - &mut self, - positive: bool, - mut significand: u64, - mut exponent: i32, - ) -> Result { + fn parse_decimal(&mut self, positive: bool, integer_end: usize) -> Result { self.eat_char(); let mut at_least_one_digit = false; while let c @ b'0'..=b'9' = tri!(self.peek_or_null()) { + self.scratch.push(c); self.eat_char(); - let digit = (c - b'0') as u64; at_least_one_digit = true; - - if overflow!(significand * 10 + digit, u64::max_value()) { - // The next multiply/add would overflow, so just ignore all - // further digits. - while let b'0'..=b'9' = tri!(self.peek_or_null()) { - self.eat_char(); - } - break; - } - - significand = significand * 10 + digit; - exponent -= 1; } if !at_least_one_digit { @@ -512,17 +506,12 @@ impl<'de, R: Read<'de>> Deserializer { } match tri!(self.peek_or_null()) { - b'e' | b'E' => self.parse_exponent(positive, significand, exponent), - _ => self.f64_from_parts(positive, significand, exponent), + b'e' | b'E' => self.parse_exponent(positive, integer_end), + _ => self.f64_from_parts(positive, integer_end, 0), } } - fn parse_exponent( - &mut self, - positive: bool, - significand: u64, - starting_exp: i32, - ) -> Result { + fn parse_exponent(&mut self, positive: bool, integer_end: usize) -> Result { self.eat_char(); let positive_exp = match tri!(self.peek_or_null()) { @@ -557,19 +546,15 @@ impl<'de, R: Read<'de>> Deserializer { let digit = (c - b'0') as i32; if overflow!(exp * 10 + digit, i32::max_value()) { - return self.parse_exponent_overflow(positive, significand, positive_exp); + return self.parse_exponent_overflow(positive, integer_end, positive_exp); } exp = exp * 10 + digit; } - let final_exp = if positive_exp { - starting_exp.saturating_add(exp) - } else { - starting_exp.saturating_sub(exp) - }; + let final_exp = if positive_exp { exp } else { -exp }; - self.f64_from_parts(positive, significand, final_exp) + self.f64_from_parts(positive, integer_end, final_exp) } // This cold code should not be inlined into the middle of the hot @@ -579,11 +564,11 @@ impl<'de, R: Read<'de>> Deserializer { fn parse_exponent_overflow( &mut self, positive: bool, - significand: u64, + integer_end: usize, positive_exp: bool, ) -> Result { // Error instead of +/- infinity. - if significand != 0 && positive_exp { + if self.scratch[..integer_end] != [b'0'] && positive_exp { return Err(self.error(ErrorCode::NumberOutOfRange)); } @@ -743,39 +728,36 @@ impl<'de, R: Read<'de>> Deserializer { Ok(()) } - fn f64_from_parts( - &mut self, - positive: bool, - significand: u64, - mut exponent: i32, - ) -> Result { - let mut f = significand as f64; - loop { - match POW10.get(exponent.wrapping_abs() as usize) { - Some(&pow) => { - if exponent >= 0 { - f *= pow; - if f.is_infinite() { - return Err(self.error(ErrorCode::NumberOutOfRange)); - } - } else { - f /= pow; - } - break; - } - None => { - if f == 0.0 { - break; - } - if exponent >= 0 { - return Err(self.error(ErrorCode::NumberOutOfRange)); - } - f /= 1e308; - exponent += 308; - } + fn f64_from_parts(&mut self, positive: bool, integer_end: usize, exponent: i32) -> Result { + // remove trailing zeroes from the 'fraction' part + let mut nzeroes = 0; + for c in self.scratch[integer_end..].iter().rev() { + if *c == b'0' { + nzeroes += 1; + } else { + break; } } - Ok(if positive { f } else { -f }) + + for _ in 0..nzeroes { + self.scratch.pop(); + } + + let integer = &self.scratch[..integer_end]; + let fraction = &self.scratch[integer_end..]; + + let f = if self.requested_f32 { + minimal_lexical::parse_float::(integer.iter(), fraction.iter(), exponent) as f64 + } else { + minimal_lexical::parse_float::(integer.iter(), fraction.iter(), exponent) + }; + self.scratch.clear(); + + if f.is_infinite() { + Err(self.error(ErrorCode::NumberOutOfRange)) + } else { + Ok(if positive { f } else { -f }) + } } fn parse_object_colon(&mut self) -> Result<()> { @@ -1023,42 +1005,6 @@ impl FromStr for Number { } } -// Clippy bug: https://github.com/rust-lang/rust-clippy/issues/5201 -#[allow(clippy::excessive_precision)] -static POW10: [f64; 309] = [ - 1e000, 1e001, 1e002, 1e003, 1e004, 1e005, 1e006, 1e007, 1e008, 1e009, // - 1e010, 1e011, 1e012, 1e013, 1e014, 1e015, 1e016, 1e017, 1e018, 1e019, // - 1e020, 1e021, 1e022, 1e023, 1e024, 1e025, 1e026, 1e027, 1e028, 1e029, // - 1e030, 1e031, 1e032, 1e033, 1e034, 1e035, 1e036, 1e037, 1e038, 1e039, // - 1e040, 1e041, 1e042, 1e043, 1e044, 1e045, 1e046, 1e047, 1e048, 1e049, // - 1e050, 1e051, 1e052, 1e053, 1e054, 1e055, 1e056, 1e057, 1e058, 1e059, // - 1e060, 1e061, 1e062, 1e063, 1e064, 1e065, 1e066, 1e067, 1e068, 1e069, // - 1e070, 1e071, 1e072, 1e073, 1e074, 1e075, 1e076, 1e077, 1e078, 1e079, // - 1e080, 1e081, 1e082, 1e083, 1e084, 1e085, 1e086, 1e087, 1e088, 1e089, // - 1e090, 1e091, 1e092, 1e093, 1e094, 1e095, 1e096, 1e097, 1e098, 1e099, // - 1e100, 1e101, 1e102, 1e103, 1e104, 1e105, 1e106, 1e107, 1e108, 1e109, // - 1e110, 1e111, 1e112, 1e113, 1e114, 1e115, 1e116, 1e117, 1e118, 1e119, // - 1e120, 1e121, 1e122, 1e123, 1e124, 1e125, 1e126, 1e127, 1e128, 1e129, // - 1e130, 1e131, 1e132, 1e133, 1e134, 1e135, 1e136, 1e137, 1e138, 1e139, // - 1e140, 1e141, 1e142, 1e143, 1e144, 1e145, 1e146, 1e147, 1e148, 1e149, // - 1e150, 1e151, 1e152, 1e153, 1e154, 1e155, 1e156, 1e157, 1e158, 1e159, // - 1e160, 1e161, 1e162, 1e163, 1e164, 1e165, 1e166, 1e167, 1e168, 1e169, // - 1e170, 1e171, 1e172, 1e173, 1e174, 1e175, 1e176, 1e177, 1e178, 1e179, // - 1e180, 1e181, 1e182, 1e183, 1e184, 1e185, 1e186, 1e187, 1e188, 1e189, // - 1e190, 1e191, 1e192, 1e193, 1e194, 1e195, 1e196, 1e197, 1e198, 1e199, // - 1e200, 1e201, 1e202, 1e203, 1e204, 1e205, 1e206, 1e207, 1e208, 1e209, // - 1e210, 1e211, 1e212, 1e213, 1e214, 1e215, 1e216, 1e217, 1e218, 1e219, // - 1e220, 1e221, 1e222, 1e223, 1e224, 1e225, 1e226, 1e227, 1e228, 1e229, // - 1e230, 1e231, 1e232, 1e233, 1e234, 1e235, 1e236, 1e237, 1e238, 1e239, // - 1e240, 1e241, 1e242, 1e243, 1e244, 1e245, 1e246, 1e247, 1e248, 1e249, // - 1e250, 1e251, 1e252, 1e253, 1e254, 1e255, 1e256, 1e257, 1e258, 1e259, // - 1e260, 1e261, 1e262, 1e263, 1e264, 1e265, 1e266, 1e267, 1e268, 1e269, // - 1e270, 1e271, 1e272, 1e273, 1e274, 1e275, 1e276, 1e277, 1e278, 1e279, // - 1e280, 1e281, 1e282, 1e283, 1e284, 1e285, 1e286, 1e287, 1e288, 1e289, // - 1e290, 1e291, 1e292, 1e293, 1e294, 1e295, 1e296, 1e297, 1e298, 1e299, // - 1e300, 1e301, 1e302, 1e303, 1e304, 1e305, 1e306, 1e307, 1e308, -]; - macro_rules! deserialize_prim_number { ($method:ident) => { fn $method(self, visitor: V) -> Result @@ -1222,9 +1168,18 @@ impl<'de, 'a, R: Read<'de>> de::Deserializer<'de> for &'a mut Deserializer { deserialize_prim_number!(deserialize_u16); deserialize_prim_number!(deserialize_u32); deserialize_prim_number!(deserialize_u64); - deserialize_prim_number!(deserialize_f32); deserialize_prim_number!(deserialize_f64); + fn deserialize_f32(self, visitor: V) -> Result + where + V: de::Visitor<'de>, + { + self.requested_f32 = true; + let val = self.deserialize_any(visitor); + self.requested_f32 = false; + val + } + serde_if_integer128! { fn deserialize_i128(self, visitor: V) -> Result where diff --git a/tests/test.rs b/tests/test.rs index 38c9f5497..db217bcbb 100644 --- a/tests/test.rs +++ b/tests/test.rs @@ -902,6 +902,36 @@ fn test_parse_f64() { ]); } +// test roundtrip with some values that failed with the old version of the f64 deserializer +#[test] +fn test_roundtrip_f64() { + let roundtrip = |input: &f64| { + let json = serde_json::to_string(input).unwrap(); + let output: f64 = serde_json::from_str(&json).unwrap(); + assert_eq!(input, &output); + }; + + [ + // samples from quickcheck-ing `roundtrip` with `input: f64` + // comments on the right showed the value returned by the old deserializer + 51.24817837550540_4, // 51.2481783755054_1 + -93.3113703768803_3, // -93.3113703768803_2 + -36.5739948427534_36, // -36.5739948427534_4 + 52.31400820410624_4, // 52.31400820410624_ + 97.45365320034685, // 97.4536532003468_4 + // samples from `rng.next_u64` + `f64::from_bits` + `is_finite` filter + 2.0030397744267762e-253, + 7.101215824554616e260, + 1.769268377902049e74, + -1.6727517818542075e58, + 3.9287532173373315e299, + // edge case from https://github.com/serde-rs/json/issues/536#issuecomment-583714900 + 2.638344616030823e-256, + ] + .iter() + .for_each(roundtrip); +} + #[test] fn test_serialize_char() { let value = json!(