Round-trip serialization for array metadata v2/v3

moradology · moradology · commit 6301b15d3935 · 2025-02-19T11:41:13.000-06:00
diff --git a/changes/2802.fix.rst b/changes/2802.fix.rst
@@ -0,0 +1 @@
+Fix ``fill_value`` serialization for ``NaN`` in ``ArrayV2Metadata`` and add property-based testing of round-trip serialization.
diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py
@@ -170,7 +170,7 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata:
         if dtype.kind in "SV":
             fill_value_encoded = _data.get("fill_value")
             if fill_value_encoded is not None:
-                fill_value = base64.standard_b64decode(fill_value_encoded)
+                fill_value: Any = base64.standard_b64decode(fill_value_encoded)
                 _data["fill_value"] = fill_value
         else:
             fill_value = _data.get("fill_value")
@@ -180,13 +180,11 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata:
                         _data["fill_value"] = np.array("NaT", dtype=dtype)[()]
                     else:
                         _data["fill_value"] = np.array(fill_value, dtype=dtype)[()]
-                elif dtype.kind == "c" and isinstance(fill_value, list):
-                    if len(fill_value) == 2:
-                        val = complex(float(fill_value[0]), float(fill_value[1]))
-                        _data["fill_value"] = np.array(val, dtype=dtype)[()]
-                elif dtype.kind in "f" and isinstance(fill_value, str):
-                    if fill_value in {"NaN", "Infinity", "-Infinity"}:
-                        _data["fill_value"] = np.array(fill_value, dtype=dtype)[()]
+                elif dtype.kind == "c" and isinstance(fill_value, list) and len(fill_value) == 2:
+                    val = complex(float(fill_value[0]), float(fill_value[1]))
+                    _data["fill_value"] = np.array(val, dtype=dtype)[()]
+                elif dtype.kind in "f" and fill_value in {"NaN", "Infinity", "-Infinity"}:
+                    _data["fill_value"] = np.array(fill_value, dtype=dtype)[()]
         # zarr v2 allowed arbitrary keys in the metadata.
         # Filter the keys to only those expected by the constructor.
         expected = {x.name for x in fields(cls)}
@@ -196,21 +194,22 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata:
         return cls(**_data)
 
     def to_dict(self) -> dict[str, JSON]:
-        def _sanitize_fill_value(fv: Any):
+        def _sanitize_fill_value(fv: Any) -> JSON:
             if fv is None:
                 return fv
             elif isinstance(fv, np.datetime64):
                 if np.isnat(fv):
                     return "NaT"
                 return np.datetime_as_string(fv)
             elif isinstance(fv, numbers.Real):
-                if np.isnan(fv):
+                float_fv = float(fv)
+                if np.isnan(float_fv):
                     fv = "NaN"
-                elif np.isinf(fv):
-                    fv = "Infinity" if fv > 0 else "-Infinity"
+                elif np.isinf(float_fv):
+                    fv = "Infinity" if float_fv > 0 else "-Infinity"
             elif isinstance(fv, numbers.Complex):
                 fv = [_sanitize_fill_value(fv.real), _sanitize_fill_value(fv.imag)]
-            return fv
+            return cast(JSON, fv)
 
         zarray_dict = super().to_dict()
 
diff --git a/src/zarr/testing/stateful.py b/src/zarr/testing/stateful.py
@@ -85,7 +85,7 @@ def add_group(self, name: str, data: DataObject) -> None:
     @rule(
         data=st.data(),
         name=node_names,
-        array_and_chunks=np_array_and_chunks(nparrays=numpy_arrays(zarr_formats=st.just(3))),
+        array_and_chunks=np_array_and_chunks(arrays=numpy_arrays(zarr_formats=st.just(3))),
     )
     def add_array(
         self,
diff --git a/src/zarr/testing/strategies.py b/src/zarr/testing/strategies.py
@@ -3,7 +3,6 @@
 
 import hypothesis.extra.numpy as npst
 import hypothesis.strategies as st
-import numcodecs
 import numpy as np
 from hypothesis import assume, given, settings  # noqa: F401
 from hypothesis.strategies import SearchStrategy
@@ -345,136 +344,3 @@ def make_request(start: int, length: int) -> RangeByteRequest:
     )
     key_tuple = st.tuples(keys, byte_ranges)
     return st.lists(key_tuple, min_size=1, max_size=10)
-
-
-def simple_text():
-    """A strategy for generating simple text strings."""
-    return st.text(st.characters(min_codepoint=32, max_codepoint=126), min_size=1, max_size=10)
-
-
-def simple_attrs():
-    """A strategy for generating simple attribute dictionaries."""
-    return st.dictionaries(
-        simple_text(),
-        st.one_of(
-            st.integers(),
-            st.floats(allow_nan=False, allow_infinity=False),
-            st.booleans(),
-            simple_text(),
-        ),
-    )
-
-
-def array_shapes(min_dims=1, max_dims=3, max_len=100):
-    """A strategy for generating array shapes."""
-    return st.lists(
-        st.integers(min_value=1, max_value=max_len), min_size=min_dims, max_size=max_dims
-    )
-
-
-# def zarr_compressors():
-#     """A strategy for generating Zarr compressors."""
-#     return st.sampled_from([None, Blosc(), GZip(), Zstd(), LZ4()])
-
-
-# def zarr_codecs():
-#     """A strategy for generating Zarr codecs."""
-#     return st.sampled_from([BytesCodec(), Blosc(), GZip(), Zstd(), LZ4()])
-
-
-def zarr_filters():
-    """A strategy for generating Zarr filters."""
-    return st.lists(
-        st.just(numcodecs.Delta(dtype="i4")), min_size=0, max_size=2
-    )  # Example filter, expand as needed
-
-
-def zarr_storage_transformers():
-    """A strategy for generating Zarr storage transformers."""
-    return st.lists(
-        st.dictionaries(
-            simple_text(), st.one_of(st.integers(), st.floats(), st.booleans(), simple_text())
-        ),
-        min_size=0,
-        max_size=2,
-    )
-
-
-@st.composite
-def array_metadata_v2(draw: st.DrawFn) -> ArrayV2Metadata:
-    """Generates valid ArrayV2Metadata objects for property-based testing."""
-    dims = draw(st.integers(min_value=1, max_value=3))  # Limit dimensions for complexity
-    shape = tuple(draw(array_shapes(min_dims=dims, max_dims=dims, max_len=100)))
-    max_chunk_len = max(shape) if shape else 100
-    chunks = tuple(
-        draw(
-            st.lists(
-                st.integers(min_value=1, max_value=max_chunk_len), min_size=dims, max_size=dims
-            )
-        )
-    )
-
-    # Validate shape and chunks relationship
-    assume(all(c <= s for s, c in zip(shape, chunks, strict=False)))  # Chunk size must be <= shape
-
-    dtype = draw(v2_dtypes())
-    fill_value = draw(st.one_of([st.none(), npst.from_dtype(dtype)]))
-    order = draw(st.sampled_from(["C", "F"]))
-    dimension_separator = draw(st.sampled_from([".", "/"]))
-    # compressor = draw(zarr_compressors())
-    filters = tuple(draw(zarr_filters())) if draw(st.booleans()) else None
-    attributes = draw(simple_attrs())
-
-    # Construct the metadata object.  Type hints are crucial here for correctness.
-    return ArrayV2Metadata(
-        shape=shape,
-        dtype=dtype,
-        chunks=chunks,
-        fill_value=fill_value,
-        order=order,
-        dimension_separator=dimension_separator,
-        #    compressor=compressor,
-        filters=filters,
-        attributes=attributes,
-    )
-
-
-@st.composite
-def array_metadata_v3(draw: st.DrawFn) -> ArrayV3Metadata:
-    """Generates valid ArrayV3Metadata objects for property-based testing."""
-    dims = draw(st.integers(min_value=1, max_value=3))
-    shape = tuple(draw(array_shapes(min_dims=dims, max_dims=dims, max_len=100)))
-    max_chunk_len = max(shape) if shape else 100
-    chunks = tuple(
-        draw(
-            st.lists(
-                st.integers(min_value=1, max_value=max_chunk_len), min_size=dims, max_size=dims
-            )
-        )
-    )
-    assume(all(c <= s for s, c in zip(shape, chunks, strict=False)))
-
-    dtype = draw(v3_dtypes())
-    fill_value = draw(npst.from_dtype(dtype))
-    chunk_grid = RegularChunkGrid(chunks)  # Ensure chunks is passed as tuple.
-    chunk_key_encoding = DefaultChunkKeyEncoding(separator="/")  # Or st.sampled_from(["/", "."])
-    # codecs = tuple(draw(st.lists(zarr_codecs(), min_size=0, max_size=3)))
-    attributes = draw(simple_attrs())
-    dimension_names = (
-        tuple(draw(st.lists(st.one_of(st.none(), simple_text()), min_size=dims, max_size=dims)))
-        if draw(st.booleans())
-        else None
-    )
-    storage_transformers = tuple(draw(zarr_storage_transformers()))
-
-    return ArrayV3Metadata(
-        shape=shape,
-        data_type=dtype,
-        chunk_grid=chunk_grid,
-        chunk_key_encoding=chunk_key_encoding,
-        fill_value=fill_value,
-        #    codecs=codecs,
-        attributes=attributes,
-        dimension_names=dimension_names,
-        storage_transformers=storage_transformers,
-    )
diff --git a/tests/test_properties.py b/tests/test_properties.py

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	+Fix `fill_value` serialization for `NaN` in `ArrayV2Metadata` and add property-based testing of round-trip serialization
Original file line number	Diff line number	Diff line change
`@@ -85,7 +85,7 @@ def add_group(self, name: str, data: DataObject) -> None:`
`85`	`85`	`@rule(`
`86`	`86`	`data=st.data(),`
`87`	`87`	`name=node_names,`
`88`		`- array_and_chunks=np_array_and_chunks(nparrays=numpy_arrays(zarr_formats=st.just(3))),`
	`88`	`+ array_and_chunks=np_array_and_chunks(arrays=numpy_arrays(zarr_formats=st.just(3))),`
`89`	`89`	`)`
`90`	`90`	`def add_array(`
`91`	`91`	`self,`