From a5c13b745c4759706065ef50e1613ce16801ed2f Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 18 Feb 2025 20:07:50 +0100 Subject: [PATCH 1/5] don't serialize empty tuples for v2 filters, and warn when reading such metadata --- src/zarr/core/metadata/v2.py | 17 +++++++++++- tests/test_array.py | 47 +++++++++++++++------------------- tests/test_metadata/test_v2.py | 20 ++++++++++++++- 3 files changed, 56 insertions(+), 28 deletions(-) diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 3d292c81b4..27e8bc64f0 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -1,6 +1,7 @@ from __future__ import annotations import base64 +import warnings from collections.abc import Iterable from enum import Enum from functools import cached_property @@ -178,6 +179,16 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata: # handle the renames expected |= {"dtype", "chunks"} + # check if `filters` is an empty tuple; if so use None instead and raise a warning + if _data["filters"] is not None and len(_data["filters"]) == 0: + msg = ( + "Found an empty list of filters in the array metadata document. " + "This is contrary to the Zarr V2 specification, and will cause an error in the future. " + "Use None (or Null in a JSON document) instead of an empty list of filters." + ) + warnings.warn(msg, UserWarning, stacklevel=1) + _data["filters"] = None + _data = {k: v for k, v in _data.items() if k in expected} return cls(**_data) @@ -255,7 +266,11 @@ def parse_filters(data: object) -> tuple[numcodecs.abc.Codec, ...] | None: else: msg = f"Invalid filter at index {idx}. Expected a numcodecs.abc.Codec or a dict representation of numcodecs.abc.Codec. Got {type(val)} instead." raise TypeError(msg) - return tuple(out) + if len(out) == 0: + # Per the v2 spec, an empty tuple is not allowed -- use None to express "no filters" + return None + else: + return tuple(out) # take a single codec instance and wrap it in a tuple if isinstance(data, numcodecs.abc.Codec): return (data,) diff --git a/tests/test_array.py b/tests/test_array.py index b81f966e20..efcf8a6bf9 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -972,45 +972,40 @@ def test_default_fill_value(dtype: str, fill_value_expected: object, store: Stor @staticmethod @pytest.mark.parametrize("dtype", ["uint8", "float32", "str"]) @pytest.mark.parametrize("empty_value", [None, ()]) - async def test_no_filters_compressors(store: MemoryStore, dtype: str, empty_value: Any) -> None: + async def test_no_filters_compressors( + store: MemoryStore, dtype: str, empty_value: object, zarr_format: ZarrFormat + ) -> None: """ Test that the default ``filters`` and ``compressors`` are removed when ``create_array`` is invoked. """ - # v2 arr = await create_array( store=store, dtype=dtype, shape=(10,), - zarr_format=2, + zarr_format=zarr_format, compressors=empty_value, filters=empty_value, ) # Test metadata explicitly - assert arr.metadata.zarr_format == 2 # guard for mypy - # The v2 metadata stores None and () separately - assert arr.metadata.filters == empty_value - # The v2 metadata does not allow tuple for compressor, therefore it is turned into None - assert arr.metadata.compressor is None - - assert arr.filters == () - assert arr.compressors == () - - # v3 - arr = await create_array( - store=store, - dtype=dtype, - shape=(10,), - compressors=empty_value, - filters=empty_value, - ) - assert arr.metadata.zarr_format == 3 # guard for mypy - if dtype == "str": - assert arr.metadata.codecs == (VLenUTF8Codec(),) - assert arr.serializer == VLenUTF8Codec() + if zarr_format == 2: + assert arr.metadata.zarr_format == 2 # guard for mypy + # v2 spec requires that filters be either a collection with at least one filter, or None + assert arr.metadata.filters is None + # Compressor is a single element in v2 metadata; the absence of a compressor is encoded + # as None + assert arr.metadata.compressor is None + + assert arr.filters == () + assert arr.compressors == () else: - assert arr.metadata.codecs == (BytesCodec(),) - assert arr.serializer == BytesCodec() + assert arr.metadata.zarr_format == 3 # guard for mypy + if dtype == "str": + assert arr.metadata.codecs == (VLenUTF8Codec(),) + assert arr.serializer == VLenUTF8Codec() + else: + assert arr.metadata.codecs == (BytesCodec(),) + assert arr.serializer == BytesCodec() @staticmethod @pytest.mark.parametrize("dtype", ["uint8", "float32", "str"]) diff --git a/tests/test_metadata/test_v2.py b/tests/test_metadata/test_v2.py index 5a5bf5f73a..4600a977d4 100644 --- a/tests/test_metadata/test_v2.py +++ b/tests/test_metadata/test_v2.py @@ -33,7 +33,7 @@ def test_parse_zarr_format_invalid(data: Any) -> None: @pytest.mark.parametrize("attributes", [None, {"foo": "bar"}]) -@pytest.mark.parametrize("filters", [None, (), (numcodecs.GZip(),)]) +@pytest.mark.parametrize("filters", [None, (numcodecs.GZip(),)]) @pytest.mark.parametrize("compressor", [None, numcodecs.GZip()]) @pytest.mark.parametrize("fill_value", [None, 0, 1]) @pytest.mark.parametrize("order", ["C", "F"]) @@ -81,6 +81,24 @@ def test_metadata_to_dict( assert observed == expected +def test_filters_empty_tuple_warns() -> None: + metadata_dict = { + "zarr_format": 2, + "shape": (1,), + "chunks": (1,), + "dtype": "uint8", + "order": "C", + "compressor": None, + "filters": (), + "fill_value": 0, + } + with pytest.warns( + UserWarning, match="Found an empty list of filters in the array metadata document." + ): + meta = ArrayV2Metadata.from_dict(metadata_dict) + assert meta.filters is None + + class TestConsolidated: @pytest.fixture async def v2_consolidated_metadata( From 76407f5f19f2167dbd49dd2c757e3502549d039a Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 18 Feb 2025 20:21:15 +0100 Subject: [PATCH 2/5] release notes --- changes/2847.fix.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 changes/2847.fix.rst diff --git a/changes/2847.fix.rst b/changes/2847.fix.rst new file mode 100644 index 0000000000..148e191b98 --- /dev/null +++ b/changes/2847.fix.rst @@ -0,0 +1 @@ +Fixed a bug where ``ArrayV2Metadata`` could save ``filters`` as an empty array. \ No newline at end of file From d8060ecc4e46b346e547a4d57c6153179b25f874 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 18 Feb 2025 20:26:12 +0100 Subject: [PATCH 3/5] alter text in comment --- src/zarr/core/metadata/v2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 27e8bc64f0..823944e067 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -179,7 +179,7 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata: # handle the renames expected |= {"dtype", "chunks"} - # check if `filters` is an empty tuple; if so use None instead and raise a warning + # check if `filters` is an empty sequence; if so use None instead and raise a warning if _data["filters"] is not None and len(_data["filters"]) == 0: msg = ( "Found an empty list of filters in the array metadata document. " From 79a26d2580e58fa48b667a87467e7bdb315e5ed8 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Tue, 18 Feb 2025 15:54:53 -0700 Subject: [PATCH 4/5] add skeleton of metadata spec compliance property test --- tests/test_properties.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/test_properties.py b/tests/test_properties.py index acecd44810..e5f7d14034 100644 --- a/tests/test_properties.py +++ b/tests/test_properties.py @@ -113,6 +113,13 @@ async def test_roundtrip_array_metadata( assert actual == expected +@given(store=stores, meta=array_metadata()) # type: ignore[misc] +def test_array_metadata_meets_spec(store: Store, meta: ArrayV2Metadata | ArrayV3Metadata) -> None: + asdict = meta.to_dict() + if isinstance(meta, ArrayV2Metadata): + assert asdict["filters"] != () + + # @st.composite # def advanced_indices(draw, *, shape): # basic_idxr = draw( From 8169e3efcd26fd745b4c5ed681ab64b28913aae0 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Tue, 18 Feb 2025 16:04:08 -0700 Subject: [PATCH 5/5] do something for V3 --- tests/test_properties.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/test_properties.py b/tests/test_properties.py index e5f7d14034..5643cf3853 100644 --- a/tests/test_properties.py +++ b/tests/test_properties.py @@ -115,9 +115,16 @@ async def test_roundtrip_array_metadata( @given(store=stores, meta=array_metadata()) # type: ignore[misc] def test_array_metadata_meets_spec(store: Store, meta: ArrayV2Metadata | ArrayV3Metadata) -> None: + # TODO: fill this out asdict = meta.to_dict() if isinstance(meta, ArrayV2Metadata): assert asdict["filters"] != () + assert asdict["filters"] is None or isinstance(asdict["filters"], tuple) + assert asdict["zarr_format"] == 2 + elif isinstance(meta, ArrayV3Metadata): + assert asdict["zarr_format"] == 3 + else: + raise NotImplementedError # @st.composite