Skip to content

don't serialize empty tuples for v2 filters, and warn when reading such metadata #2847

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Feb 20, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changes/2847.fix.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fixed a bug where ``ArrayV2Metadata`` could save ``filters`` as an empty array.
17 changes: 16 additions & 1 deletion src/zarr/core/metadata/v2.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations

import base64
import warnings
from collections.abc import Iterable
from enum import Enum
from functools import cached_property
Expand Down Expand Up @@ -178,6 +179,16 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata:
# handle the renames
expected |= {"dtype", "chunks"}

# check if `filters` is an empty sequence; if so use None instead and raise a warning
if _data["filters"] is not None and len(_data["filters"]) == 0:
msg = (
"Found an empty list of filters in the array metadata document. "
"This is contrary to the Zarr V2 specification, and will cause an error in the future. "
"Use None (or Null in a JSON document) instead of an empty list of filters."
)
warnings.warn(msg, UserWarning, stacklevel=1)
_data["filters"] = None

_data = {k: v for k, v in _data.items() if k in expected}

return cls(**_data)
Expand Down Expand Up @@ -255,7 +266,11 @@ def parse_filters(data: object) -> tuple[numcodecs.abc.Codec, ...] | None:
else:
msg = f"Invalid filter at index {idx}. Expected a numcodecs.abc.Codec or a dict representation of numcodecs.abc.Codec. Got {type(val)} instead."
raise TypeError(msg)
return tuple(out)
if len(out) == 0:
# Per the v2 spec, an empty tuple is not allowed -- use None to express "no filters"
return None
else:
return tuple(out)
# take a single codec instance and wrap it in a tuple
if isinstance(data, numcodecs.abc.Codec):
return (data,)
Expand Down
47 changes: 21 additions & 26 deletions tests/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -972,45 +972,40 @@ def test_default_fill_value(dtype: str, fill_value_expected: object, store: Stor
@staticmethod
@pytest.mark.parametrize("dtype", ["uint8", "float32", "str"])
@pytest.mark.parametrize("empty_value", [None, ()])
async def test_no_filters_compressors(store: MemoryStore, dtype: str, empty_value: Any) -> None:
async def test_no_filters_compressors(
store: MemoryStore, dtype: str, empty_value: object, zarr_format: ZarrFormat
) -> None:
"""
Test that the default ``filters`` and ``compressors`` are removed when ``create_array`` is invoked.
"""

# v2
arr = await create_array(
store=store,
dtype=dtype,
shape=(10,),
zarr_format=2,
zarr_format=zarr_format,
compressors=empty_value,
filters=empty_value,
)
# Test metadata explicitly
assert arr.metadata.zarr_format == 2 # guard for mypy
# The v2 metadata stores None and () separately
assert arr.metadata.filters == empty_value
# The v2 metadata does not allow tuple for compressor, therefore it is turned into None
assert arr.metadata.compressor is None

assert arr.filters == ()
assert arr.compressors == ()

# v3
arr = await create_array(
store=store,
dtype=dtype,
shape=(10,),
compressors=empty_value,
filters=empty_value,
)
assert arr.metadata.zarr_format == 3 # guard for mypy
if dtype == "str":
assert arr.metadata.codecs == (VLenUTF8Codec(),)
assert arr.serializer == VLenUTF8Codec()
if zarr_format == 2:
assert arr.metadata.zarr_format == 2 # guard for mypy
# v2 spec requires that filters be either a collection with at least one filter, or None
assert arr.metadata.filters is None
# Compressor is a single element in v2 metadata; the absence of a compressor is encoded
# as None
assert arr.metadata.compressor is None

assert arr.filters == ()
assert arr.compressors == ()
else:
assert arr.metadata.codecs == (BytesCodec(),)
assert arr.serializer == BytesCodec()
assert arr.metadata.zarr_format == 3 # guard for mypy
if dtype == "str":
assert arr.metadata.codecs == (VLenUTF8Codec(),)
assert arr.serializer == VLenUTF8Codec()
else:
assert arr.metadata.codecs == (BytesCodec(),)
assert arr.serializer == BytesCodec()

@staticmethod
@pytest.mark.parametrize("dtype", ["uint8", "float32", "str"])
Expand Down
20 changes: 19 additions & 1 deletion tests/test_metadata/test_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def test_parse_zarr_format_invalid(data: Any) -> None:


@pytest.mark.parametrize("attributes", [None, {"foo": "bar"}])
@pytest.mark.parametrize("filters", [None, (), (numcodecs.GZip(),)])
@pytest.mark.parametrize("filters", [None, (numcodecs.GZip(),)])
@pytest.mark.parametrize("compressor", [None, numcodecs.GZip()])
@pytest.mark.parametrize("fill_value", [None, 0, 1])
@pytest.mark.parametrize("order", ["C", "F"])
Expand Down Expand Up @@ -81,6 +81,24 @@ def test_metadata_to_dict(
assert observed == expected


def test_filters_empty_tuple_warns() -> None:
metadata_dict = {
"zarr_format": 2,
"shape": (1,),
"chunks": (1,),
"dtype": "uint8",
"order": "C",
"compressor": None,
"filters": (),
"fill_value": 0,
}
with pytest.warns(
UserWarning, match="Found an empty list of filters in the array metadata document."
):
meta = ArrayV2Metadata.from_dict(metadata_dict)
assert meta.filters is None


class TestConsolidated:
@pytest.fixture
async def v2_consolidated_metadata(
Expand Down
14 changes: 14 additions & 0 deletions tests/test_properties.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,20 @@ async def test_roundtrip_array_metadata(
assert actual == expected


@given(store=stores, meta=array_metadata()) # type: ignore[misc]
def test_array_metadata_meets_spec(store: Store, meta: ArrayV2Metadata | ArrayV3Metadata) -> None:
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

since this test does nothing for v3 metadata, would it be possible to only run it on ArrayV2Metadata?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

here I made it do something ;) . really we should fill it out.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i was more thinking that it would make sense to test the v2 and v3 specs in separate tests, since they are logically independent, but we can do that refactor later

# TODO: fill this out
asdict = meta.to_dict()
if isinstance(meta, ArrayV2Metadata):
assert asdict["filters"] != ()
assert asdict["filters"] is None or isinstance(asdict["filters"], tuple)
assert asdict["zarr_format"] == 2
elif isinstance(meta, ArrayV3Metadata):
assert asdict["zarr_format"] == 3
else:
raise NotImplementedError


# @st.composite
# def advanced_indices(draw, *, shape):
# basic_idxr = draw(
Expand Down