Skip to content

Fix fill_value serialization issues #2802

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changes/2802.fix.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fix `fill_value` serialization for `NaN` in `ArrayV2Metadata` and add property-based testing of round-trip serialization
2 changes: 1 addition & 1 deletion src/zarr/core/group.py
Original file line number Diff line number Diff line change
Expand Up @@ -3456,7 +3456,7 @@ def _build_metadata_v3(zarr_json: dict[str, JSON]) -> ArrayV3Metadata | GroupMet


def _build_metadata_v2(
zarr_json: dict[str, object], attrs_json: dict[str, JSON]
zarr_json: dict[str, JSON], attrs_json: dict[str, JSON]
) -> ArrayV2Metadata | GroupMetadata:
"""
Convert a dict representation of Zarr V2 metadata into the corresponding metadata class.
Expand Down
111 changes: 79 additions & 32 deletions src/zarr/core/metadata/v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,24 +2,25 @@

import base64
import warnings
from collections.abc import Iterable
from collections.abc import Iterable, Mapping, Sequence
from enum import Enum
from functools import cached_property
from typing import TYPE_CHECKING, TypedDict, cast
from typing import TYPE_CHECKING, Any, TypedDict, cast

import numcodecs.abc

from zarr.abc.metadata import Metadata

if TYPE_CHECKING:
from typing import Any, Literal, Self
from typing import Literal, Self

import numpy.typing as npt

from zarr.core.buffer import Buffer, BufferPrototype
from zarr.core.common import ChunkCoords

import json
import numbers
from dataclasses import dataclass, field, fields, replace

import numcodecs
Expand Down Expand Up @@ -108,6 +109,29 @@
return None

def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]:
def _serialize_fill_value(fv: Any) -> JSON:
if self.fill_value is None:
pass
elif self.dtype.kind in "SV":

Check warning on line 115 in src/zarr/core/metadata/v2.py

View check run for this annotation

Codecov / codecov/patch

src/zarr/core/metadata/v2.py#L115

Added line #L115 was not covered by tests
# There's a relationship between self.dtype and self.fill_value
# that mypy isn't aware of. The fact that we have S or V dtype here
# means we should have a bytes-type fill_value.
fv = base64.standard_b64encode(cast(bytes, self.fill_value)).decode("ascii")
elif isinstance(fv, np.datetime64):
if np.isnat(fv):
fv = "NaT"

Check warning on line 122 in src/zarr/core/metadata/v2.py

View check run for this annotation

Codecov / codecov/patch

src/zarr/core/metadata/v2.py#L119-L122

Added lines #L119 - L122 were not covered by tests
else:
fv = np.datetime_as_string(fv)
elif isinstance(fv, numbers.Real):
float_fv = float(fv)
if np.isnan(float_fv):
fv = "NaN"
elif np.isinf(float_fv):
fv = "Infinity" if float_fv > 0 else "-Infinity"
elif isinstance(fv, numbers.Complex):
fv = [_serialize_fill_value(fv.real), _serialize_fill_value(fv.imag)]

Check warning on line 132 in src/zarr/core/metadata/v2.py

View check run for this annotation

Codecov / codecov/patch

src/zarr/core/metadata/v2.py#L124-L132

Added lines #L124 - L132 were not covered by tests
return cast(JSON, fv)

def _json_convert(
o: Any,
) -> Any:
Expand Down Expand Up @@ -146,41 +170,39 @@
raise TypeError

zarray_dict = self.to_dict()
zarray_dict["fill_value"] = _serialize_fill_value(zarray_dict["fill_value"])
zattrs_dict = zarray_dict.pop("attributes", {})
json_indent = config.get("json_indent")
return {
ZARRAY_JSON: prototype.buffer.from_bytes(
json.dumps(zarray_dict, default=_json_convert, indent=json_indent).encode()
json.dumps(
zarray_dict, default=_json_convert, indent=json_indent, allow_nan=False
).encode()
),
ZATTRS_JSON: prototype.buffer.from_bytes(
json.dumps(zattrs_dict, indent=json_indent).encode()
json.dumps(zattrs_dict, indent=json_indent, allow_nan=False).encode()
),
}

@classmethod
def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata:
# make a copy to protect the original from modification
def from_dict(cls, data: dict[str, JSON]) -> ArrayV2Metadata:
# Make a copy to protect the original from modification.
_data = data.copy()
# check that the zarr_format attribute is correct
# Check that the zarr_format attribute is correct.
_ = parse_zarr_format(_data.pop("zarr_format"))
dtype = parse_dtype(_data["dtype"])

if dtype.kind in "SV":
fill_value_encoded = _data.get("fill_value")
if fill_value_encoded is not None:
fill_value = base64.standard_b64decode(fill_value_encoded)
_data["fill_value"] = fill_value

# zarr v2 allowed arbitrary keys here.
# We don't want the ArrayV2Metadata constructor to fail just because someone put an
# extra key in the metadata.
# zarr v2 allowed arbitrary keys in the metadata.
# Filter the keys to only those expected by the constructor.
expected = {x.name for x in fields(cls)}
# https://github.com/zarr-developers/zarr-python/issues/2269
# handle the renames
expected |= {"dtype", "chunks"}

# check if `filters` is an empty sequence; if so use None instead and raise a warning
if _data["filters"] is not None and len(_data["filters"]) == 0:
filters = _data.get("filters")
if (

Check warning on line 201 in src/zarr/core/metadata/v2.py

View check run for this annotation

Codecov / codecov/patch

src/zarr/core/metadata/v2.py#L200-L201

Added lines #L200 - L201 were not covered by tests
isinstance(filters, Sequence)
and not isinstance(filters, (str, bytes))
and len(filters) == 0
):
msg = (
"Found an empty list of filters in the array metadata document. "
"This is contrary to the Zarr V2 specification, and will cause an error in the future. "
Expand All @@ -191,18 +213,11 @@

_data = {k: v for k, v in _data.items() if k in expected}

return cls(**_data)
return cls(**cast(Mapping[str, Any], _data))

Check warning on line 216 in src/zarr/core/metadata/v2.py

View check run for this annotation

Codecov / codecov/patch

src/zarr/core/metadata/v2.py#L216

Added line #L216 was not covered by tests

def to_dict(self) -> dict[str, JSON]:
zarray_dict = super().to_dict()

if self.dtype.kind in "SV" and self.fill_value is not None:
# There's a relationship between self.dtype and self.fill_value
# that mypy isn't aware of. The fact that we have S or V dtype here
# means we should have a bytes-type fill_value.
fill_value = base64.standard_b64encode(cast(bytes, self.fill_value)).decode("ascii")
zarray_dict["fill_value"] = fill_value

_ = zarray_dict.pop("dtype")
dtype_json: JSON
# In the case of zarr v2, the simplest i.e., '|VXX' dtype is represented as a string
Expand Down Expand Up @@ -300,7 +315,27 @@
return data


def parse_fill_value(fill_value: object, dtype: np.dtype[Any]) -> Any:
def parse_structured_fill_value(fill_value: Any, dtype: np.dtype[Any]) -> Any:
"""Handle structured dtype/fill value pairs"""
try:
if isinstance(fill_value, list):
fill_value = tuple(fill_value)
if isinstance(fill_value, tuple):
fill_value = np.array([fill_value], dtype=dtype)[0]
elif isinstance(fill_value, bytes):
fill_value = np.frombuffer(fill_value, dtype=dtype)[0]
elif isinstance(fill_value, str):
decoded = base64.standard_b64decode(fill_value)
fill_value = np.frombuffer(decoded, dtype=dtype)[0]

Check warning on line 329 in src/zarr/core/metadata/v2.py

View check run for this annotation

Codecov / codecov/patch

src/zarr/core/metadata/v2.py#L320-L329

Added lines #L320 - L329 were not covered by tests
else:
fill_value = np.array(fill_value, dtype=dtype)[()]
except Exception as e:
msg = f"Fill_value {fill_value} is not valid for dtype {dtype}."
raise ValueError(msg) from e
return fill_value

Check warning on line 335 in src/zarr/core/metadata/v2.py

View check run for this annotation

Codecov / codecov/patch

src/zarr/core/metadata/v2.py#L331-L335

Added lines #L331 - L335 were not covered by tests


def parse_fill_value(fill_value: Any, dtype: np.dtype[Any]) -> Any:
"""
Parse a potential fill value into a value that is compatible with the provided dtype.

Expand All @@ -317,13 +352,13 @@
"""

if fill_value is None or dtype.hasobject:
# no fill value
pass
elif dtype.fields is not None:
fill_value = parse_structured_fill_value(fill_value, dtype)

Check warning on line 357 in src/zarr/core/metadata/v2.py

View check run for this annotation

Codecov / codecov/patch

src/zarr/core/metadata/v2.py#L356-L357

Added lines #L356 - L357 were not covered by tests
elif not isinstance(fill_value, np.void) and fill_value == 0:
# this should be compatible across numpy versions for any array type, including
# structured arrays
fill_value = np.zeros((), dtype=dtype)[()]

elif dtype.kind == "U":
# special case unicode because of encoding issues on Windows if passed through numpy
# https://github.com/alimanfoo/zarr/pull/172#issuecomment-343782713
Expand All @@ -332,6 +367,18 @@
raise ValueError(
f"fill_value {fill_value!r} is not valid for dtype {dtype}; must be a unicode string"
)
elif dtype.kind in "SV" and isinstance(fill_value, str):
fill_value = base64.standard_b64decode(fill_value)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

out of scope but does anyone know why we base64 encode scalars with dtype S ? S is ascii-encoded strings, which shouldn't need any encoding / decoding.

elif np.issubdtype(dtype, np.datetime64):
if fill_value == "NaT":
fill_value = np.array("NaT", dtype=dtype)[()]

Check warning on line 374 in src/zarr/core/metadata/v2.py

View check run for this annotation

Codecov / codecov/patch

src/zarr/core/metadata/v2.py#L370-L374

Added lines #L370 - L374 were not covered by tests
else:
fill_value = np.array(fill_value, dtype=dtype)[()]
elif dtype.kind == "c" and isinstance(fill_value, list) and len(fill_value) == 2:
complex_val = complex(float(fill_value[0]), float(fill_value[1]))
fill_value = np.array(complex_val, dtype=dtype)[()]
elif dtype.kind in "f" and fill_value in {"NaN", "Infinity", "-Infinity"}:
fill_value = np.array(fill_value, dtype=dtype)[()]

Check warning on line 381 in src/zarr/core/metadata/v2.py

View check run for this annotation

Codecov / codecov/patch

src/zarr/core/metadata/v2.py#L376-L381

Added lines #L376 - L381 were not covered by tests
else:
try:
if isinstance(fill_value, bytes) and dtype.kind == "V":
Expand Down
2 changes: 1 addition & 1 deletion src/zarr/testing/strategies.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import hypothesis.extra.numpy as npst
import hypothesis.strategies as st
import numpy as np
from hypothesis import event, given, settings # noqa: F401
from hypothesis import event
from hypothesis.strategies import SearchStrategy

import zarr
Expand Down
Loading