diff --git a/changes/2847.fix.rst b/changes/2847.fix.rst new file mode 100644 index 0000000000..148e191b98 --- /dev/null +++ b/changes/2847.fix.rst @@ -0,0 +1 @@ +Fixed a bug where ``ArrayV2Metadata`` could save ``filters`` as an empty array. \ No newline at end of file diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 3d292c81b4..823944e067 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -1,6 +1,7 @@ from __future__ import annotations import base64 +import warnings from collections.abc import Iterable from enum import Enum from functools import cached_property @@ -178,6 +179,16 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata: # handle the renames expected |= {"dtype", "chunks"} + # check if `filters` is an empty sequence; if so use None instead and raise a warning + if _data["filters"] is not None and len(_data["filters"]) == 0: + msg = ( + "Found an empty list of filters in the array metadata document. " + "This is contrary to the Zarr V2 specification, and will cause an error in the future. " + "Use None (or Null in a JSON document) instead of an empty list of filters." + ) + warnings.warn(msg, UserWarning, stacklevel=1) + _data["filters"] = None + _data = {k: v for k, v in _data.items() if k in expected} return cls(**_data) @@ -255,7 +266,11 @@ def parse_filters(data: object) -> tuple[numcodecs.abc.Codec, ...] | None: else: msg = f"Invalid filter at index {idx}. Expected a numcodecs.abc.Codec or a dict representation of numcodecs.abc.Codec. Got {type(val)} instead." raise TypeError(msg) - return tuple(out) + if len(out) == 0: + # Per the v2 spec, an empty tuple is not allowed -- use None to express "no filters" + return None + else: + return tuple(out) # take a single codec instance and wrap it in a tuple if isinstance(data, numcodecs.abc.Codec): return (data,) diff --git a/tests/test_array.py b/tests/test_array.py index b81f966e20..efcf8a6bf9 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -972,45 +972,40 @@ def test_default_fill_value(dtype: str, fill_value_expected: object, store: Stor @staticmethod @pytest.mark.parametrize("dtype", ["uint8", "float32", "str"]) @pytest.mark.parametrize("empty_value", [None, ()]) - async def test_no_filters_compressors(store: MemoryStore, dtype: str, empty_value: Any) -> None: + async def test_no_filters_compressors( + store: MemoryStore, dtype: str, empty_value: object, zarr_format: ZarrFormat + ) -> None: """ Test that the default ``filters`` and ``compressors`` are removed when ``create_array`` is invoked. """ - # v2 arr = await create_array( store=store, dtype=dtype, shape=(10,), - zarr_format=2, + zarr_format=zarr_format, compressors=empty_value, filters=empty_value, ) # Test metadata explicitly - assert arr.metadata.zarr_format == 2 # guard for mypy - # The v2 metadata stores None and () separately - assert arr.metadata.filters == empty_value - # The v2 metadata does not allow tuple for compressor, therefore it is turned into None - assert arr.metadata.compressor is None - - assert arr.filters == () - assert arr.compressors == () - - # v3 - arr = await create_array( - store=store, - dtype=dtype, - shape=(10,), - compressors=empty_value, - filters=empty_value, - ) - assert arr.metadata.zarr_format == 3 # guard for mypy - if dtype == "str": - assert arr.metadata.codecs == (VLenUTF8Codec(),) - assert arr.serializer == VLenUTF8Codec() + if zarr_format == 2: + assert arr.metadata.zarr_format == 2 # guard for mypy + # v2 spec requires that filters be either a collection with at least one filter, or None + assert arr.metadata.filters is None + # Compressor is a single element in v2 metadata; the absence of a compressor is encoded + # as None + assert arr.metadata.compressor is None + + assert arr.filters == () + assert arr.compressors == () else: - assert arr.metadata.codecs == (BytesCodec(),) - assert arr.serializer == BytesCodec() + assert arr.metadata.zarr_format == 3 # guard for mypy + if dtype == "str": + assert arr.metadata.codecs == (VLenUTF8Codec(),) + assert arr.serializer == VLenUTF8Codec() + else: + assert arr.metadata.codecs == (BytesCodec(),) + assert arr.serializer == BytesCodec() @staticmethod @pytest.mark.parametrize("dtype", ["uint8", "float32", "str"]) diff --git a/tests/test_metadata/test_v2.py b/tests/test_metadata/test_v2.py index 5a5bf5f73a..4600a977d4 100644 --- a/tests/test_metadata/test_v2.py +++ b/tests/test_metadata/test_v2.py @@ -33,7 +33,7 @@ def test_parse_zarr_format_invalid(data: Any) -> None: @pytest.mark.parametrize("attributes", [None, {"foo": "bar"}]) -@pytest.mark.parametrize("filters", [None, (), (numcodecs.GZip(),)]) +@pytest.mark.parametrize("filters", [None, (numcodecs.GZip(),)]) @pytest.mark.parametrize("compressor", [None, numcodecs.GZip()]) @pytest.mark.parametrize("fill_value", [None, 0, 1]) @pytest.mark.parametrize("order", ["C", "F"]) @@ -81,6 +81,24 @@ def test_metadata_to_dict( assert observed == expected +def test_filters_empty_tuple_warns() -> None: + metadata_dict = { + "zarr_format": 2, + "shape": (1,), + "chunks": (1,), + "dtype": "uint8", + "order": "C", + "compressor": None, + "filters": (), + "fill_value": 0, + } + with pytest.warns( + UserWarning, match="Found an empty list of filters in the array metadata document." + ): + meta = ArrayV2Metadata.from_dict(metadata_dict) + assert meta.filters is None + + class TestConsolidated: @pytest.fixture async def v2_consolidated_metadata( diff --git a/tests/test_properties.py b/tests/test_properties.py index acecd44810..5643cf3853 100644 --- a/tests/test_properties.py +++ b/tests/test_properties.py @@ -113,6 +113,20 @@ async def test_roundtrip_array_metadata( assert actual == expected +@given(store=stores, meta=array_metadata()) # type: ignore[misc] +def test_array_metadata_meets_spec(store: Store, meta: ArrayV2Metadata | ArrayV3Metadata) -> None: + # TODO: fill this out + asdict = meta.to_dict() + if isinstance(meta, ArrayV2Metadata): + assert asdict["filters"] != () + assert asdict["filters"] is None or isinstance(asdict["filters"], tuple) + assert asdict["zarr_format"] == 2 + elif isinstance(meta, ArrayV3Metadata): + assert asdict["zarr_format"] == 3 + else: + raise NotImplementedError + + # @st.composite # def advanced_indices(draw, *, shape): # basic_idxr = draw(