From 11cd91b2f1b3d8a7cd96a1ec50e13356beba1777 Mon Sep 17 00:00:00 2001 From: Isabel Atkinson Date: Thu, 23 Jan 2025 10:37:23 -0700 Subject: [PATCH 1/4] RUST-2003 Binary vector subtype support --- Cargo.toml | 1 + src/binary.rs | 12 +++++++- src/spec.rs | 4 +++ src/tests/spec/json/bson-corpus/binary.json | 30 +++++++++++++++++++ src/tests/spec/json/bson-corpus/datetime.json | 1 + .../spec/json/bson-corpus/decimal128-1.json | 24 +++++++++++++++ src/tests/spec/mod.rs | 20 ++++++++----- 7 files changed, 83 insertions(+), 9 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 1fb1626b..5fd8df36 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -82,6 +82,7 @@ criterion = "0.3.0" pretty_assertions = "0.6.1" proptest = "1.0.0" serde_bytes = "0.11" +serde_path_to_error = "0.1.16" chrono = { version = "0.4", features = ["serde", "clock", "std"], default-features = false } [target.'cfg(all(target_arch = "wasm32", target_os = "unknown"))'.dev-dependencies] getrandom = { version = "0.2", features = ["js"] } diff --git a/src/binary.rs b/src/binary.rs index 18cf8a3d..18e83095 100644 --- a/src/binary.rs +++ b/src/binary.rs @@ -1,3 +1,7 @@ +//! Module containing functionality related to BSON binary values. + +mod vector; + use crate::{spec::BinarySubtype, Document, RawBinaryRef}; use std::{ convert::TryFrom, @@ -5,6 +9,8 @@ use std::{ fmt::{self, Display}, }; +pub use vector::{PackedBitVector, Vector}; + /// Represents a BSON binary value. #[derive(Debug, Clone, Eq, PartialEq, Hash)] pub struct Binary { @@ -98,6 +104,9 @@ impl Binary { pub enum Error { /// While trying to decode from base64, an error was returned. DecodingError { message: String }, + + /// Invalid bytes were provided to construct a [`Binary`] of subtype [`BinarySubtype::Vector`]. 
+ Vector { message: String }, } impl error::Error for Error {} @@ -105,7 +114,8 @@ impl error::Error for Error {} impl std::fmt::Display for Error { fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { match self { - Error::DecodingError { message: m } => fmt.write_str(m), + Error::DecodingError { message } => fmt.write_str(message), + Error::Vector { message } => fmt.write_str(message), } } } diff --git a/src/spec.rs b/src/spec.rs index e853c150..dc082ff7 100644 --- a/src/spec.rs +++ b/src/spec.rs @@ -62,6 +62,7 @@ const BINARY_SUBTYPE_MD5: u8 = 0x05; const BINARY_SUBTYPE_ENCRYPTED: u8 = 0x06; const BINARY_SUBTYPE_COLUMN: u8 = 0x07; const BINARY_SUBTYPE_SENSITIVE: u8 = 0x08; +const BINARY_SUBTYPE_VECTOR: u8 = 0x09; const BINARY_SUBTYPE_USER_DEFINED: u8 = 0x80; /// All available BSON element types. @@ -162,6 +163,7 @@ pub enum BinarySubtype { Encrypted, Column, Sensitive, + Vector, UserDefined(u8), Reserved(u8), } @@ -179,6 +181,7 @@ impl From for u8 { BinarySubtype::Encrypted => BINARY_SUBTYPE_ENCRYPTED, BinarySubtype::Column => BINARY_SUBTYPE_COLUMN, BinarySubtype::Sensitive => BINARY_SUBTYPE_SENSITIVE, + BinarySubtype::Vector => BINARY_SUBTYPE_VECTOR, BinarySubtype::UserDefined(x) => x, BinarySubtype::Reserved(x) => x, } @@ -198,6 +201,7 @@ impl From for BinarySubtype { BINARY_SUBTYPE_ENCRYPTED => BinarySubtype::Encrypted, BINARY_SUBTYPE_COLUMN => BinarySubtype::Column, BINARY_SUBTYPE_SENSITIVE => BinarySubtype::Sensitive, + BINARY_SUBTYPE_VECTOR => BinarySubtype::Vector, _ if t < BINARY_SUBTYPE_USER_DEFINED => BinarySubtype::Reserved(t), _ => BinarySubtype::UserDefined(t), } diff --git a/src/tests/spec/json/bson-corpus/binary.json b/src/tests/spec/json/bson-corpus/binary.json index 20aaef74..0e0056f3 100644 --- a/src/tests/spec/json/bson-corpus/binary.json +++ b/src/tests/spec/json/bson-corpus/binary.json @@ -74,6 +74,36 @@ "description": "$type query operator (conflicts with legacy $binary form with $type field)", "canonical_bson": 
"180000000378001000000010247479706500020000000000", "canonical_extjson": "{\"x\" : { \"$type\" : {\"$numberInt\": \"2\"}}}" + }, + { + "description": "subtype 0x09 Vector FLOAT32", + "canonical_bson": "170000000578000A0000000927000000FE420000E04000", + "canonical_extjson": "{\"x\": {\"$binary\": {\"base64\": \"JwAAAP5CAADgQA==\", \"subType\": \"09\"}}}" + }, + { + "description": "subtype 0x09 Vector INT8", + "canonical_bson": "11000000057800040000000903007F0700", + "canonical_extjson": "{\"x\": {\"$binary\": {\"base64\": \"AwB/Bw==\", \"subType\": \"09\"}}}" + }, + { + "description": "subtype 0x09 Vector PACKED_BIT", + "canonical_bson": "11000000057800040000000910007F0700", + "canonical_extjson": "{\"x\": {\"$binary\": {\"base64\": \"EAB/Bw==\", \"subType\": \"09\"}}}" + }, + { + "description": "subtype 0x09 Vector (Zero-length) FLOAT32", + "canonical_bson": "0F0000000578000200000009270000", + "canonical_extjson": "{\"x\": {\"$binary\": {\"base64\": \"JwA=\", \"subType\": \"09\"}}}" + }, + { + "description": "subtype 0x09 Vector (Zero-length) INT8", + "canonical_bson": "0F0000000578000200000009030000", + "canonical_extjson": "{\"x\": {\"$binary\": {\"base64\": \"AwA=\", \"subType\": \"09\"}}}" + }, + { + "description": "subtype 0x09 Vector (Zero-length) PACKED_BIT", + "canonical_bson": "0F0000000578000200000009100000", + "canonical_extjson": "{\"x\": {\"$binary\": {\"base64\": \"EAA=\", \"subType\": \"09\"}}}" } ], "decodeErrors": [ diff --git a/src/tests/spec/json/bson-corpus/datetime.json b/src/tests/spec/json/bson-corpus/datetime.json index f857afdc..1554341d 100644 --- a/src/tests/spec/json/bson-corpus/datetime.json +++ b/src/tests/spec/json/bson-corpus/datetime.json @@ -24,6 +24,7 @@ { "description" : "Y10K", "canonical_bson" : "1000000009610000DC1FD277E6000000", + "relaxed_extjson" : "{\"a\":{\"$date\":{\"$numberLong\":\"253402300800000\"}}}", "canonical_extjson" : "{\"a\":{\"$date\":{\"$numberLong\":\"253402300800000\"}}}" }, { diff --git 
a/src/tests/spec/json/bson-corpus/decimal128-1.json b/src/tests/spec/json/bson-corpus/decimal128-1.json index 7eefec6b..8e7fbc93 100644 --- a/src/tests/spec/json/bson-corpus/decimal128-1.json +++ b/src/tests/spec/json/bson-corpus/decimal128-1.json @@ -312,6 +312,30 @@ "canonical_bson": "18000000136400000000000a5bc138938d44c64d31cc3700", "degenerate_extjson": "{\"d\" : {\"$numberDecimal\" : \"1000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000\"}}", "canonical_extjson": "{\"d\" : {\"$numberDecimal\" : \"1.000000000000000000000000000000000E+999\"}}" + }, + { + "description": "Clamped zeros with a large positive exponent", + "canonical_bson": "180000001364000000000000000000000000000000FE5F00", + "degenerate_extjson": "{\"d\" : {\"$numberDecimal\" : \"0E+2147483647\"}}", + "canonical_extjson": "{\"d\" : {\"$numberDecimal\" : \"0E+6111\"}}" + }, + { + "description": "Clamped zeros with a large negative exponent", + "canonical_bson": "180000001364000000000000000000000000000000000000", + "degenerate_extjson": "{\"d\" : {\"$numberDecimal\" : 
\"0E-2147483647\"}}", + "canonical_extjson": "{\"d\" : {\"$numberDecimal\" : \"0E-6176\"}}" + }, + { + "description": "Clamped negative zeros with a large positive exponent", + "canonical_bson": "180000001364000000000000000000000000000000FEDF00", + "degenerate_extjson": "{\"d\" : {\"$numberDecimal\" : \"-0E+2147483647\"}}", + "canonical_extjson": "{\"d\" : {\"$numberDecimal\" : \"-0E+6111\"}}" + }, + { + "description": "Clamped negative zeros with a large negative exponent", + "canonical_bson": "180000001364000000000000000000000000000000008000", + "degenerate_extjson": "{\"d\" : {\"$numberDecimal\" : \"-0E-2147483647\"}}", + "canonical_extjson": "{\"d\" : {\"$numberDecimal\" : \"-0E-6176\"}}" } ] } diff --git a/src/tests/spec/mod.rs b/src/tests/spec/mod.rs index 36ff825e..6174aea0 100644 --- a/src/tests/spec/mod.rs +++ b/src/tests/spec/mod.rs @@ -1,4 +1,5 @@ mod corpus; +mod vector; use std::{ any::type_name, @@ -7,7 +8,7 @@ use std::{ path::PathBuf, }; -use crate::RawDocumentBuf; +use crate::Bson; use serde::de::DeserializeOwned; pub(crate) fn run_spec_test(spec: &[&str], run_test_file: F) @@ -31,13 +32,16 @@ where let file = File::open(&path) .unwrap_or_else(|e| panic!("Failed to open file at {:?}: {}", path, e)); - let test_bson: RawDocumentBuf = serde_json::from_reader(file).unwrap_or_else(|e| { - panic!( - "Failed to deserialize test JSON to BSON in {:?}: {}", - path, e - ) - }); - let test: T = crate::from_slice(test_bson.as_bytes()).unwrap_or_else(|e| { + let mut json_deserializer = serde_json::Deserializer::from_reader(file); + let test_bson: Bson = serde_path_to_error::deserialize(&mut json_deserializer) + .unwrap_or_else(|e| { + panic!( + "Failed to deserialize test JSON to BSON in {:?}: {}", + path, e + ) + }); + let bson_deserializer = crate::Deserializer::new(test_bson); + let test: T = serde_path_to_error::deserialize(bson_deserializer).unwrap_or_else(|e| { panic!( "Failed to deserialize test BSON to {} in {:?}: {}", type_name::(), From 
b82d03c20a5fccf1fd02b209322f70aada36d0ba Mon Sep 17 00:00:00 2001 From: Isabel Atkinson Date: Thu, 23 Jan 2025 10:59:51 -0700 Subject: [PATCH 2/4] add files --- src/binary.rs | 2 +- src/binary/vector.rs | 281 ++++++++++++++++++ .../spec/json/bson-binary-vector/README.md | 58 ++++ .../spec/json/bson-binary-vector/float32.json | 51 ++++ .../spec/json/bson-binary-vector/int8.json | 57 ++++ .../json/bson-binary-vector/packed_bit.json | 98 ++++++ src/tests/spec/vector.rs | 219 ++++++++++++++ 7 files changed, 765 insertions(+), 1 deletion(-) create mode 100644 src/binary/vector.rs create mode 100644 src/tests/spec/json/bson-binary-vector/README.md create mode 100644 src/tests/spec/json/bson-binary-vector/float32.json create mode 100644 src/tests/spec/json/bson-binary-vector/int8.json create mode 100644 src/tests/spec/json/bson-binary-vector/packed_bit.json create mode 100644 src/tests/spec/vector.rs diff --git a/src/binary.rs b/src/binary.rs index 18e83095..ae5240b9 100644 --- a/src/binary.rs +++ b/src/binary.rs @@ -105,7 +105,7 @@ pub enum Error { /// While trying to decode from base64, an error was returned. DecodingError { message: String }, - /// Invalid bytes were provided to construct a [`Binary`] of subtype [`BinarySubtype::Vector`]. + /// A [`Vector`]-related error occurred. Vector { message: String }, } diff --git a/src/binary/vector.rs b/src/binary/vector.rs new file mode 100644 index 00000000..2ee23ab9 --- /dev/null +++ b/src/binary/vector.rs @@ -0,0 +1,281 @@ +use std::{ + convert::{TryFrom, TryInto}, + mem::size_of, +}; + +use serde::{Deserialize, Serialize}; + +use super::{Binary, Error, Result}; +use crate::{spec::BinarySubtype, Bson, RawBson}; + +const INT8: u8 = 0x03; +const FLOAT32: u8 = 0x27; +const PACKED_BIT: u8 = 0x10; + +/// A vector of numeric values. This type can be converted into a [`Binary`] of subtype +/// [`BinarySubtype::Vector`]. 
+/// +/// ```rust +/// let vector = Vector::Int8(vec![0, 1, 2]); +/// let binary = Binary::from(vector); +/// ``` +/// +/// The `Serialize` and `Deserialize` implementations for `Vector` treat it as a `Binary`. +/// +/// ```rust +/// #[derive(Serialize, Deserialize)] +/// struct Data { +/// vector: Vector, +/// } +/// +/// let data = Data { vector: Vector::Int8(vec![0, 1, 2]) }; +/// let document = bson::to_document(&data); +/// assert_eq!(document.get("vector").unwrap().element_type(), ElementType::Binary); +/// +/// let data = bson::from_document(document); +/// assert_eq!(data.vector, Vector::Int8(vec![0, 1, 2])); +/// ``` +/// +/// See the +/// [specification](https://github.com/mongodb/specifications/blob/master/source/bson-binary-vector/bson-binary-vector.md) +/// for more details. +#[derive(Clone, Debug, PartialEq)] +pub enum Vector { + /// A vector of `i8` values. + Int8(Vec), + + /// A vector of `f32` values. + Float32(Vec), + + /// A vector of packed bits. See [`PackedBitVector::new`] for more details. + PackedBit(PackedBitVector), +} + +/// A vector of packed bits. This type can be constructed by calling [`PackedBitVector::new`]. +#[derive(Clone, Debug, PartialEq)] +pub struct PackedBitVector { + vector: Vec, + padding: u8, +} + +impl PackedBitVector { + /// Construct a new `PackedBitVector`. Each `u8` value in the provided `vector` represents 8 + /// single-bit elements in little-endian format. For example, the following vector: + /// + /// ```rust + /// let packed_bits = vec![238, 224]; + /// let vector = PackedBitVector::new(packed_bits, 0); + /// ``` + /// + /// represents a 16-bit vector containing the following values: + /// + /// ``` + /// [1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0] + /// ``` + /// + /// Padding can optionally be specified to ignore a number of least-significant bits in the + /// final byte. 
For example, the vector in the previous example with a padding of 4 would + /// represent a 12-bit vector containing the following values: + /// + /// ``` + /// [1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0] + /// ``` + /// + /// Padding must be within 0-7 inclusive. Padding must be 0 or unspecified if the provided + /// vector is empty. + pub fn new(vector: Vec, padding: impl Into>) -> Result { + let padding = padding.into().unwrap_or(0); + if !(0..8).contains(&padding) { + return Err(Error::Vector { + message: format!("padding must be within 0-7 inclusive, got {}", padding), + }); + } + if padding != 0 && vector.is_empty() { + return Err(Error::Vector { + message: format!( + "cannot specify non-zero padding if the provided vector is empty, got {}", + padding + ), + }); + } + Ok(Self { vector, padding }) + } +} + +impl Vector { + /// Construct a [`Vector`] from the given bytes. See the + /// [specification](https://github.com/mongodb/specifications/blob/master/source/bson-binary-vector/bson-binary-vector.md#specification) + /// for details on the expected byte format. 
+ pub fn from_bytes(bytes: impl AsRef<[u8]>) -> Result { + let bytes = bytes.as_ref(); + + if bytes.len() < 2 { + return Err(Error::Vector { + message: format!( + "the provided bytes must have a length of at least 2, got {}", + bytes.len() + ), + }); + } + + let d_type = bytes[0]; + let padding = bytes[1]; + if d_type != PACKED_BIT && padding != 0 { + return Err(Error::Vector { + message: format!( + "padding can only be specified for a packed bit vector (data type {}), got \ + type {}", + PACKED_BIT, d_type + ), + }); + } + let number_bytes = &bytes[2..]; + + match d_type { + INT8 => { + let vector = number_bytes + .iter() + .map(|n| i8::from_le_bytes([*n])) + .collect(); + Ok(Self::Int8(vector)) + } + FLOAT32 => { + const F32_BYTES: usize = size_of::(); + + let mut vector = Vec::new(); + for chunk in number_bytes.chunks(F32_BYTES) { + let bytes: [u8; F32_BYTES] = chunk.try_into().map_err(|_| Error::Vector { + message: format!( + "f32 vector values must be {} bytes, got {:?}", + F32_BYTES, chunk, + ), + })?; + vector.push(f32::from_le_bytes(bytes)); + } + Ok(Self::Float32(vector)) + } + PACKED_BIT => { + let packed_bit_vector = PackedBitVector::new(number_bytes.to_vec(), padding)?; + Ok(Self::PackedBit(packed_bit_vector)) + } + other => Err(Error::Vector { + message: format!("unsupported vector data type: {}", other), + }), + } + } + + fn d_type(&self) -> u8 { + match self { + Self::Int8(_) => INT8, + Self::Float32(_) => FLOAT32, + Self::PackedBit(_) => PACKED_BIT, + } + } + + fn padding(&self) -> u8 { + match self { + Self::Int8(_) => 0, + Self::Float32(_) => 0, + Self::PackedBit(PackedBitVector { padding, .. 
}) => *padding, + } + } +} + +impl From<&Vector> for Binary { + fn from(vector: &Vector) -> Self { + let d_type = vector.d_type(); + let padding = vector.padding(); + let mut bytes = vec![d_type, padding]; + + match vector { + Vector::Int8(vector) => { + for n in vector { + bytes.extend_from_slice(&n.to_le_bytes()); + } + } + Vector::Float32(vector) => { + for n in vector { + bytes.extend_from_slice(&n.to_le_bytes()); + } + } + Vector::PackedBit(PackedBitVector { vector, .. }) => { + for n in vector { + bytes.extend_from_slice(&n.to_le_bytes()); + } + } + } + + Self { + subtype: BinarySubtype::Vector, + bytes, + } + } +} + +impl From for Binary { + fn from(vector: Vector) -> Binary { + Self::from(&vector) + } +} + +impl TryFrom<&Binary> for Vector { + type Error = Error; + + fn try_from(binary: &Binary) -> Result { + if binary.subtype != BinarySubtype::Vector { + return Err(Error::Vector { + message: format!("expected vector binary subtype, got {:?}", binary.subtype), + }); + } + Self::from_bytes(&binary.bytes) + } +} + +impl TryFrom for Vector { + type Error = Error; + + fn try_from(binary: Binary) -> std::result::Result { + Self::try_from(&binary) + } +} + +// Convenience impl to allow passing a Vector directly into the doc! macro. From is already +// implemented by a blanket impl in src/bson.rs. +impl From<&Vector> for Bson { + fn from(vector: &Vector) -> Self { + Self::Binary(Binary::from(vector)) + } +} + +// Convenience impls to allow passing a Vector directly into the rawdoc! 
macro +impl From<&Vector> for RawBson { + fn from(vector: &Vector) -> Self { + Self::Binary(Binary::from(vector)) + } +} + +impl From for RawBson { + fn from(vector: Vector) -> Self { + Self::from(&vector) + } +} + +impl Serialize for Vector { + fn serialize(&self, serializer: S) -> std::result::Result + where + S: serde::Serializer, + { + let binary = Binary::from(self); + binary.serialize(serializer) + } +} + +impl<'de> Deserialize<'de> for Vector { + fn deserialize(deserializer: D) -> std::result::Result + where + D: serde::Deserializer<'de>, + { + let binary = Binary::deserialize(deserializer)?; + Self::try_from(binary).map_err(serde::de::Error::custom) + } +} diff --git a/src/tests/spec/json/bson-binary-vector/README.md b/src/tests/spec/json/bson-binary-vector/README.md new file mode 100644 index 00000000..3357ebb2 --- /dev/null +++ b/src/tests/spec/json/bson-binary-vector/README.md @@ -0,0 +1,58 @@ +# Testing Binary subtype 9: Vector + +The JSON files in this directory tree are platform-independent tests that drivers can use to prove their conformance to +the specification. + +These tests focus on the roundtrip of the list of numbers as input/output, along with their data type and byte padding. + +Additional tests exist in `bson_corpus/tests/binary.json` but do not sufficiently test the end-to-end process of Vector +to BSON. For this reason, drivers must create a bespoke test runner for the vector subtype. + +## Format + +The test data corpus consists of a JSON file for each data type (dtype). Each file contains a number of test cases, +under the top-level key "tests". Each test case pertains to a single vector. The keys provide the specification of the +vector. Valid cases also include the Canonical BSON format of a document {test_key: binary}. The "test_key" is common, +and specified at the top level. + +#### Top level keys + +Each JSON file contains three top-level keys. 
+ +- `description`: human-readable description of what is in the file +- `test_key`: name used for key when encoding/decoding a BSON document containing the single BSON Binary for the test + case. Applies to *every* case. +- `tests`: array of test case objects, each of which has the following keys. Valid cases will also contain additional + binary and json encoding values. + +#### Keys of individual test cases + +- `description`: string describing the test. +- `valid`: boolean indicating if the vector, dtype, and padding should be considered a valid input. +- `vector`: list of numbers +- `dtype_hex`: string defining the data type in hex (e.g. "0x10", "0x27") +- `dtype_alias`: (optional) string defining the data type, perhaps as Enum. +- `padding`: (optional) integer for byte padding. Defaults to 0. +- `canonical_bson`: (required if valid is true) an (uppercase) big-endian hex representation of a BSON byte string. + +## Required tests + +#### To prove correct in a valid case (`valid: true`), one MUST + +- encode a document from the numeric values, dtype, and padding, along with the "test_key", and assert this matches the + canonical_bson string. +- decode the canonical_bson into its binary form, and then assert that the numeric values, dtype, and padding all match + those provided in the JSON. + +Note: For floating point number types, exact numerical matches may not be possible. Drivers that natively support the +floating-point type being tested (e.g., when testing float32 vector values in a driver that natively supports float32), +MUST assert that the input float array is the same after encoding and decoding. + +#### To prove correct in an invalid case (`valid:false`), one MUST + +- raise an exception when attempting to encode a document from the numeric values, dtype, and padding. + +## FAQ + +- What MongoDB Server version does this apply to? + - Files in the "specifications" repository have no version scheme. They are not tied to a MongoDB server version. 
diff --git a/src/tests/spec/json/bson-binary-vector/float32.json b/src/tests/spec/json/bson-binary-vector/float32.json new file mode 100644 index 00000000..872c4353 --- /dev/null +++ b/src/tests/spec/json/bson-binary-vector/float32.json @@ -0,0 +1,51 @@ +{ + "description": "Tests of Binary subtype 9, Vectors, with dtype FLOAT32", + "test_key": "vector", + "tests": [ + { + "description": "Simple Vector FLOAT32", + "valid": true, + "vector": [127.0, 7.0], + "dtype_hex": "0x27", + "dtype_alias": "FLOAT32", + "padding": 0, + "canonical_bson": "1C00000005766563746F72000A0000000927000000FE420000E04000" + }, + { + "description": "Vector with decimals and negative value FLOAT32", + "valid": true, + "vector": [127.7, -7.7], + "dtype_hex": "0x27", + "dtype_alias": "FLOAT32", + "padding": 0, + "canonical_bson": "1C00000005766563746F72000A0000000927006666FF426666F6C000" + }, + { + "description": "Empty Vector FLOAT32", + "valid": true, + "vector": [], + "dtype_hex": "0x27", + "dtype_alias": "FLOAT32", + "padding": 0, + "canonical_bson": "1400000005766563746F72000200000009270000" + }, + { + "description": "Infinity Vector FLOAT32", + "valid": true, + "vector": ["-inf", 0.0, "inf"], + "dtype_hex": "0x27", + "dtype_alias": "FLOAT32", + "padding": 0, + "canonical_bson": "2000000005766563746F72000E000000092700000080FF000000000000807F00" + }, + { + "description": "FLOAT32 with padding", + "valid": false, + "vector": [127.0, 7.0], + "dtype_hex": "0x27", + "dtype_alias": "FLOAT32", + "padding": 3 + } + ] +} + diff --git a/src/tests/spec/json/bson-binary-vector/int8.json b/src/tests/spec/json/bson-binary-vector/int8.json new file mode 100644 index 00000000..7529721e --- /dev/null +++ b/src/tests/spec/json/bson-binary-vector/int8.json @@ -0,0 +1,57 @@ +{ + "description": "Tests of Binary subtype 9, Vectors, with dtype INT8", + "test_key": "vector", + "tests": [ + { + "description": "Simple Vector INT8", + "valid": true, + "vector": [127, 7], + "dtype_hex": "0x03", + "dtype_alias": 
"INT8", + "padding": 0, + "canonical_bson": "1600000005766563746F7200040000000903007F0700" + }, + { + "description": "Empty Vector INT8", + "valid": true, + "vector": [], + "dtype_hex": "0x03", + "dtype_alias": "INT8", + "padding": 0, + "canonical_bson": "1400000005766563746F72000200000009030000" + }, + { + "description": "Overflow Vector INT8", + "valid": false, + "vector": [128], + "dtype_hex": "0x03", + "dtype_alias": "INT8", + "padding": 0 + }, + { + "description": "Underflow Vector INT8", + "valid": false, + "vector": [-129], + "dtype_hex": "0x03", + "dtype_alias": "INT8", + "padding": 0 + }, + { + "description": "INT8 with padding", + "valid": false, + "vector": [127, 7], + "dtype_hex": "0x03", + "dtype_alias": "INT8", + "padding": 3 + }, + { + "description": "INT8 with float inputs", + "valid": false, + "vector": [127.77, 7.77], + "dtype_hex": "0x03", + "dtype_alias": "INT8", + "padding": 0 + } + ] +} + diff --git a/src/tests/spec/json/bson-binary-vector/packed_bit.json b/src/tests/spec/json/bson-binary-vector/packed_bit.json new file mode 100644 index 00000000..035776e8 --- /dev/null +++ b/src/tests/spec/json/bson-binary-vector/packed_bit.json @@ -0,0 +1,98 @@ +{ + "description": "Tests of Binary subtype 9, Vectors, with dtype PACKED_BIT", + "test_key": "vector", + "tests": [ + { + "description": "Padding specified with no vector data PACKED_BIT", + "valid": false, + "vector": [], + "dtype_hex": "0x10", + "dtype_alias": "PACKED_BIT", + "padding": 1 + }, + { + "description": "Simple Vector PACKED_BIT", + "valid": true, + "vector": [127, 7], + "dtype_hex": "0x10", + "dtype_alias": "PACKED_BIT", + "padding": 0, + "canonical_bson": "1600000005766563746F7200040000000910007F0700" + }, + { + "description": "Empty Vector PACKED_BIT", + "valid": true, + "vector": [], + "dtype_hex": "0x10", + "dtype_alias": "PACKED_BIT", + "padding": 0, + "canonical_bson": "1400000005766563746F72000200000009100000" + }, + { + "description": "PACKED_BIT with padding", + "valid": true, 
+ "vector": [127, 7], + "dtype_hex": "0x10", + "dtype_alias": "PACKED_BIT", + "padding": 3, + "canonical_bson": "1600000005766563746F7200040000000910037F0700" + }, + { + "description": "Overflow Vector PACKED_BIT", + "valid": false, + "vector": [256], + "dtype_hex": "0x10", + "dtype_alias": "PACKED_BIT", + "padding": 0 + }, + { + "description": "Underflow Vector PACKED_BIT", + "valid": false, + "vector": [-1], + "dtype_hex": "0x10", + "dtype_alias": "PACKED_BIT", + "padding": 0 + }, + { + "description": "Vector with float values PACKED_BIT", + "valid": false, + "vector": [127.5], + "dtype_hex": "0x10", + "dtype_alias": "PACKED_BIT", + "padding": 0 + }, + { + "description": "Padding specified with no vector data PACKED_BIT", + "valid": false, + "vector": [], + "dtype_hex": "0x10", + "dtype_alias": "PACKED_BIT", + "padding": 1 + }, + { + "description": "Exceeding maximum padding PACKED_BIT", + "valid": false, + "vector": [1], + "dtype_hex": "0x10", + "dtype_alias": "PACKED_BIT", + "padding": 8 + }, + { + "description": "Negative padding PACKED_BIT", + "valid": false, + "vector": [1], + "dtype_hex": "0x10", + "dtype_alias": "PACKED_BIT", + "padding": -1 + }, + { + "description": "Vector with float values PACKED_BIT", + "valid": false, + "vector": [127.5], + "dtype_hex": "0x10", + "dtype_alias": "PACKED_BIT", + "padding": 0 + } + ] +} + diff --git a/src/tests/spec/vector.rs b/src/tests/spec/vector.rs new file mode 100644 index 00000000..c297e285 --- /dev/null +++ b/src/tests/spec/vector.rs @@ -0,0 +1,219 @@ +use std::convert::TryFrom; + +use serde::{Deserialize, Deserializer, Serialize}; + +use crate::{ + binary::{Binary, PackedBitVector, Vector}, + from_document, + from_slice, + spec::BinarySubtype, + to_document, + to_raw_document_buf, + Bson, + Document, + RawDocumentBuf, +}; + +use super::run_spec_test; + +const INT8: u8 = 0x03; +const FLOAT32: u8 = 0x27; +const PACKED_BIT: u8 = 0x10; + +#[derive(Deserialize)] +struct TestFile { + description: String, + test_key: 
String, + tests: Vec, +} + +#[derive(Deserialize)] +struct Test { + description: String, + valid: bool, + vector: Vec, + #[serde( + rename = "dtype_hex", + deserialize_with = "deserialize_u8_from_hex_string" + )] + d_type: u8, + padding: Option, + canonical_bson: Option, +} + +fn deserialize_u8_from_hex_string<'de, D>(deserializer: D) -> Result +where + D: Deserializer<'de>, +{ + let s = String::deserialize(deserializer)?; + u8::from_str_radix(s.trim_start_matches("0x"), 16).map_err(serde::de::Error::custom) +} + +enum Number { + Int(i16), + Float(f32), +} + +impl<'de> Deserialize<'de> for Number { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + #[derive(Deserialize)] + #[serde(untagged)] + enum NumberHelper { + Int(i16), + Float(f32), + String(String), + } + + let helper = NumberHelper::deserialize(deserializer)?; + match helper { + NumberHelper::Int(n) => Ok(Self::Int(n)), + NumberHelper::Float(n) => Ok(Self::Float(n)), + NumberHelper::String(s) => match s.as_str() { + "inf" => Ok(Self::Float(f32::INFINITY)), + "-inf" => Ok(Self::Float(f32::NEG_INFINITY)), + other => Err(serde::de::Error::custom(format!( + "unsupported number value {}", + other + ))), + }, + } + } +} + +// Some of the invalid cases (e.g. mixed number types, padding for non-packed-bit vectors) are +// impossible to construct, so we return an error from this method. 
+fn vector_from_numbers( + numbers: Vec, + d_type: u8, + padding: Option, +) -> Result { + let padding = u8::try_from(padding.unwrap_or(0)).map_err(|e| e.to_string())?; + if padding != 0 && d_type != PACKED_BIT { + return Err(format!("got nonzero padding for data type {}", d_type)); + } + match d_type { + INT8 => { + let vector = numbers + .into_iter() + .map(|n| match n { + Number::Int(n) => i8::try_from(n).map_err(|e| e.to_string()), + Number::Float(n) => Err(format!("expected i8, got float {}", n)), + }) + .collect::, String>>()?; + Ok(Vector::Int8(vector)) + } + FLOAT32 => { + let vector = numbers + .into_iter() + .map(|n| match n { + Number::Int(n) => Err(format!("expected f32, got int {}", n)), + Number::Float(n) => Ok(n), + }) + .collect::, String>>()?; + Ok(Vector::Float32(vector)) + } + PACKED_BIT => { + let vector = numbers + .into_iter() + .map(|n| match n { + Number::Int(n) => u8::try_from(n).map_err(|e| e.to_string()), + Number::Float(n) => Err(format!("expected u8, got float {}", n)), + }) + .collect::, String>>()?; + Ok(Vector::PackedBit( + PackedBitVector::new(vector, padding).map_err(|e| e.to_string())?, + )) + } + other => Err(format!("invalid data type: {}", other)), + } +} + +fn run_test_file(test_file: TestFile) { + for test in test_file.tests { + let description = format!("{} ({})", test.description, test_file.description); + + let test_vector = match ( + vector_from_numbers(test.vector, test.d_type, test.padding), + test.valid, + ) { + (Ok(vector), true) => vector, + (Err(_), false) => return, + (Ok(vector), false) => panic!( + "{}: valid was false but successfully constructed vector {:?}", + description, vector + ), + (Err(error), true) => panic!( + "{}: valid was true but vector construction failed with error {}", + description, error + ), + }; + + let Some(canonical_bson) = test.canonical_bson else { + return; + }; + + let bytes = hex::decode(canonical_bson).expect(&description); + let mut test_document = 
Document::from_reader(bytes.as_slice()).expect(&description); + // Rename the field to match the name used in the struct below. + let vector = test_document + .remove(&test_file.test_key) + .expect(&description); + test_document.insert("vector", vector); + let bson = test_document.get("vector").expect(&description); + let test_binary = match bson { + Bson::Binary(binary) => binary, + other => panic!("{}: expected binary, got {}", description, other), + }; + + // TryFrom for Vector + let parsed_vector = Vector::try_from(test_binary).expect(&description); + assert_eq!(parsed_vector, test_vector); + + // From for Binary + let binary = Binary::from(&test_vector); + assert_eq!(binary.subtype, BinarySubtype::Vector); + assert_eq!(&binary, test_binary); + + // From for Bson + let document = doc! { "vector": &test_vector }; + assert_eq!(document, test_document); + + // From for RawBson + let raw_document = rawdoc! { "vector": &test_vector }; + let test_raw_document = RawDocumentBuf::from_document(&test_document).expect(&description); + assert_eq!(raw_document, test_raw_document); + + #[derive(Debug, Deserialize, PartialEq, Serialize)] + struct Data { + vector: Vector, + } + let data = Data { + vector: test_vector, + }; + + // Serialize for Vector (Document) + let serialized_document = to_document(&data).expect(&description); + assert_eq!(serialized_document, test_document); + + // Deserialize for Vector (Document) + let deserialized_data: Data = from_document(serialized_document).expect(&description); + assert_eq!(deserialized_data, data); + + // Serialize for Vector (RawDocumentBuf) + let serialized_raw_document = to_raw_document_buf(&data).expect(&description); + assert_eq!(serialized_raw_document, test_raw_document); + + // Deserialize for Vector (RawDocumentBuf) + let deserialized_data: Data = + from_slice(serialized_raw_document.as_bytes()).expect(&description); + assert_eq!(deserialized_data, data); + } +} + +#[test] +fn run_vector_tests() { + 
run_spec_test(&["bson-binary-vector"], run_test_file); +} From a8b5aa419b415015bedbcba2c593ad579651ccaa Mon Sep 17 00:00:00 2001 From: Isabel Atkinson Date: Thu, 23 Jan 2025 13:06:50 -0700 Subject: [PATCH 3/4] fix docs tests --- src/binary/vector.rs | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/src/binary/vector.rs b/src/binary/vector.rs index 2ee23ab9..4f74e8f8 100644 --- a/src/binary/vector.rs +++ b/src/binary/vector.rs @@ -16,23 +16,26 @@ const PACKED_BIT: u8 = 0x10; /// [`BinarySubtype::Vector`]. /// /// ```rust +/// # use bson::binary::{Binary, Vector}; /// let vector = Vector::Int8(vec![0, 1, 2]); /// let binary = Binary::from(vector); /// ``` /// -/// The `Serialize` and `Deserialize` implementations for `Vector` treat it as a `Binary`. +/// `Vector` serializes to and deserializes from a `Binary`. /// /// ```rust +/// # use serde::{Serialize, Deserialize}; +/// # use bson::{binary::{Result, Vector}, spec::ElementType}; /// #[derive(Serialize, Deserialize)] /// struct Data { /// vector: Vector, /// } /// /// let data = Data { vector: Vector::Int8(vec![0, 1, 2]) }; -/// let document = bson::to_document(&data); +/// let document = bson::to_document(&data).unwrap(); /// assert_eq!(document.get("vector").unwrap().element_type(), ElementType::Binary); /// -/// let data = bson::from_document(document); +/// let data: Data = bson::from_document(document).unwrap(); /// assert_eq!(data.vector, Vector::Int8(vec![0, 1, 2])); /// ``` /// @@ -63,13 +66,17 @@ impl PackedBitVector { /// single-bit elements in little-endian format. 
For example, the following vector: /// /// ```rust + /// # use bson::binary::{Result, PackedBitVector}; + /// # fn main() -> Result<()> { /// let packed_bits = vec![238, 224]; - /// let vector = PackedBitVector::new(packed_bits, 0); + /// let vector = PackedBitVector::new(packed_bits, 0)?; + /// # Ok(()) + /// # } /// ``` /// /// represents a 16-bit vector containing the following values: /// - /// ``` + /// ```text /// [1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0] /// ``` /// @@ -77,7 +84,7 @@ impl PackedBitVector { /// final byte. For example, the vector in the previous example with a padding of 4 would /// represent a 12-bit vector containing the following values: /// - /// ``` + /// ```text /// [1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0] /// ``` /// From 8173d3fe0fef2749f84fe4bbf98b417d7500794c Mon Sep 17 00:00:00 2001 From: Isabel Atkinson Date: Thu, 23 Jan 2025 13:30:42 -0700 Subject: [PATCH 4/4] revert corpus tests --- src/tests/spec/json/bson-corpus/binary.json | 30 ------------------- src/tests/spec/json/bson-corpus/datetime.json | 1 - .../spec/json/bson-corpus/decimal128-1.json | 24 --------------- 3 files changed, 55 deletions(-) diff --git a/src/tests/spec/json/bson-corpus/binary.json b/src/tests/spec/json/bson-corpus/binary.json index 0e0056f3..20aaef74 100644 --- a/src/tests/spec/json/bson-corpus/binary.json +++ b/src/tests/spec/json/bson-corpus/binary.json @@ -74,36 +74,6 @@ "description": "$type query operator (conflicts with legacy $binary form with $type field)", "canonical_bson": "180000000378001000000010247479706500020000000000", "canonical_extjson": "{\"x\" : { \"$type\" : {\"$numberInt\": \"2\"}}}" - }, - { - "description": "subtype 0x09 Vector FLOAT32", - "canonical_bson": "170000000578000A0000000927000000FE420000E04000", - "canonical_extjson": "{\"x\": {\"$binary\": {\"base64\": \"JwAAAP5CAADgQA==\", \"subType\": \"09\"}}}" - }, - { - "description": "subtype 0x09 Vector INT8", - "canonical_bson": "11000000057800040000000903007F0700", - 
"canonical_extjson": "{\"x\": {\"$binary\": {\"base64\": \"AwB/Bw==\", \"subType\": \"09\"}}}" - }, - { - "description": "subtype 0x09 Vector PACKED_BIT", - "canonical_bson": "11000000057800040000000910007F0700", - "canonical_extjson": "{\"x\": {\"$binary\": {\"base64\": \"EAB/Bw==\", \"subType\": \"09\"}}}" - }, - { - "description": "subtype 0x09 Vector (Zero-length) FLOAT32", - "canonical_bson": "0F0000000578000200000009270000", - "canonical_extjson": "{\"x\": {\"$binary\": {\"base64\": \"JwA=\", \"subType\": \"09\"}}}" - }, - { - "description": "subtype 0x09 Vector (Zero-length) INT8", - "canonical_bson": "0F0000000578000200000009030000", - "canonical_extjson": "{\"x\": {\"$binary\": {\"base64\": \"AwA=\", \"subType\": \"09\"}}}" - }, - { - "description": "subtype 0x09 Vector (Zero-length) PACKED_BIT", - "canonical_bson": "0F0000000578000200000009100000", - "canonical_extjson": "{\"x\": {\"$binary\": {\"base64\": \"EAA=\", \"subType\": \"09\"}}}" } ], "decodeErrors": [ diff --git a/src/tests/spec/json/bson-corpus/datetime.json b/src/tests/spec/json/bson-corpus/datetime.json index 1554341d..f857afdc 100644 --- a/src/tests/spec/json/bson-corpus/datetime.json +++ b/src/tests/spec/json/bson-corpus/datetime.json @@ -24,7 +24,6 @@ { "description" : "Y10K", "canonical_bson" : "1000000009610000DC1FD277E6000000", - "relaxed_extjson" : "{\"a\":{\"$date\":{\"$numberLong\":\"253402300800000\"}}}", "canonical_extjson" : "{\"a\":{\"$date\":{\"$numberLong\":\"253402300800000\"}}}" }, { diff --git a/src/tests/spec/json/bson-corpus/decimal128-1.json b/src/tests/spec/json/bson-corpus/decimal128-1.json index 8e7fbc93..7eefec6b 100644 --- a/src/tests/spec/json/bson-corpus/decimal128-1.json +++ b/src/tests/spec/json/bson-corpus/decimal128-1.json @@ -312,30 +312,6 @@ "canonical_bson": "18000000136400000000000a5bc138938d44c64d31cc3700", "degenerate_extjson": "{\"d\" : {\"$numberDecimal\" : 
\"1000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000\"}}", "canonical_extjson": "{\"d\" : {\"$numberDecimal\" : \"1.000000000000000000000000000000000E+999\"}}" - }, - { - "description": "Clamped zeros with a large positive exponent", - "canonical_bson": "180000001364000000000000000000000000000000FE5F00", - "degenerate_extjson": "{\"d\" : {\"$numberDecimal\" : \"0E+2147483647\"}}", - "canonical_extjson": "{\"d\" : {\"$numberDecimal\" : \"0E+6111\"}}" - }, - { - "description": "Clamped zeros with a large negative exponent", - "canonical_bson": "180000001364000000000000000000000000000000000000", - "degenerate_extjson": "{\"d\" : {\"$numberDecimal\" : \"0E-2147483647\"}}", - "canonical_extjson": "{\"d\" : {\"$numberDecimal\" : \"0E-6176\"}}" - }, - { - "description": "Clamped negative zeros with a large positive exponent", - "canonical_bson": "180000001364000000000000000000000000000000FEDF00", - "degenerate_extjson": "{\"d\" : {\"$numberDecimal\" : \"-0E+2147483647\"}}", - "canonical_extjson": "{\"d\" : {\"$numberDecimal\" : 
\"-0E+6111\"}}" - }, - { - "description": "Clamped negative zeros with a large negative exponent", - "canonical_bson": "180000001364000000000000000000000000000000008000", - "degenerate_extjson": "{\"d\" : {\"$numberDecimal\" : \"-0E-2147483647\"}}", - "canonical_extjson": "{\"d\" : {\"$numberDecimal\" : \"-0E-6176\"}}" } ] }