Narrow JSON type, ensure that to_dict always returns a dict, and v2 filter / compressor parsing #2179

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged · 6 commits · Sep 17, 2024
Changes from 2 commits
2 changes: 1 addition & 1 deletion src/zarr/abc/metadata.py
@@ -15,7 +15,7 @@

@dataclass(frozen=True)
class Metadata:
-    def to_dict(self) -> JSON:
+    def to_dict(self) -> dict[str, JSON]:
        """
        Recursively serialize this model to a dictionary.
        This method inspects the fields of self and calls `x.to_dict()` for any fields that
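Narrowing the return annotation from `JSON` to `dict[str, JSON]` lets callers pop keys from the result without the defensive isinstance checks that are removed later in this PR. A minimal sketch of the recursive pattern the docstring describes, assuming plain dataclass fields (not the actual zarr implementation):

```python
from dataclasses import dataclass, fields

@dataclass(frozen=True)
class Metadata:
    def to_dict(self) -> dict[str, object]:
        # Serialize each dataclass field, recursing into nested Metadata
        # models so the result is a JSON-ready dict.
        out: dict[str, object] = {}
        for f in fields(self):
            value = getattr(self, f.name)
            if isinstance(value, Metadata):
                out[f.name] = value.to_dict()
            else:
                out[f.name] = value
        return out
```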
11 changes: 5 additions & 6 deletions src/zarr/codecs/_v2.py
@@ -67,7 +67,7 @@ def compute_encoded_size(self, _input_byte_length: int, _chunk_spec: ArraySpec)

@dataclass(frozen=True)
class V2Filters(ArrayArrayCodec):
-    filters: list[dict[str, JSON]]
+    filters: tuple[numcodecs.abc.Codec, ...] | None

    is_fixed_size = False

@@ -79,8 +79,7 @@ async def _decode_single(
        chunk_ndarray = chunk_array.as_ndarray_like()
        # apply filters in reverse order
        if self.filters is not None:
-            for filter_metadata in self.filters[::-1]:
-                filter = numcodecs.get_codec(filter_metadata)
+            for filter in self.filters[::-1]:
                chunk_ndarray = await to_thread(filter.decode, chunk_ndarray)

        # ensure correct chunk shape
@@ -99,9 +98,9 @@ async def _encode_single(
    ) -> NDBuffer | None:
        chunk_ndarray = chunk_array.as_ndarray_like().ravel(order=chunk_spec.order)

-        for filter_metadata in self.filters:
-            filter = numcodecs.get_codec(filter_metadata)
-            chunk_ndarray = await to_thread(filter.encode, chunk_ndarray)
+        if self.filters is not None:
+            for filter in self.filters:
+                chunk_ndarray = await to_thread(filter.encode, chunk_ndarray)

        return get_ndbuffer_class().from_ndarray_like(chunk_ndarray)

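With `V2Filters` holding `numcodecs` codec instances directly, `numcodecs.get_codec` is no longer called once per chunk; codecs are parsed when the metadata is built. A round trip mirroring the encode/decode loops above, using plain numcodecs calls (Delta is an arbitrary illustrative filter, not one the PR prescribes):

```python
import numcodecs
import numpy as np

filters = (numcodecs.Delta(dtype="<i4"),)

data = np.arange(16, dtype="<i4")
encoded = data
for f in filters:  # _encode_single applies filters in order
    encoded = f.encode(encoded)

decoded = encoded
for f in filters[::-1]:  # _decode_single applies them in reverse
    decoded = f.decode(decoded)

assert np.array_equal(np.asarray(decoded, dtype="<i4"), data)
```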
4 changes: 2 additions & 2 deletions src/zarr/codecs/blosc.py
@@ -127,9 +127,9 @@ def to_dict(self) -> dict[str, JSON]:
"name": "blosc",
"configuration": {
"typesize": self.typesize,
"cname": self.cname,
"cname": self.cname.value,
"clevel": self.clevel,
"shuffle": self.shuffle,
"shuffle": self.shuffle.value,
"blocksize": self.blocksize,
},
}
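Serializing `self.cname.value` rather than the enum member keeps the output inside the narrowed `JSON` type and makes it directly `json.dumps`-able. A hedged illustration (`BloscCname` below is a stand-in for the real enum in zarr.codecs.blosc):

```python
import json
from enum import Enum

class BloscCname(Enum):
    zstd = "zstd"
    lz4 = "lz4"

# The enum member itself is not JSON serializable:
# json.dumps({"cname": BloscCname.zstd})  -> TypeError
# ...but its .value is:
print(json.dumps({"cname": BloscCname.zstd.value}))  # {"cname": "zstd"}
```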
2 changes: 1 addition & 1 deletion src/zarr/codecs/bytes.py
@@ -53,7 +53,7 @@ def to_dict(self) -> dict[str, JSON]:
        if self.endian is None:
            return {"name": "bytes"}
        else:
-            return {"name": "bytes", "configuration": {"endian": self.endian}}
+            return {"name": "bytes", "configuration": {"endian": self.endian.value}}

    def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
        if array_spec.dtype.itemsize == 0:
9 changes: 7 additions & 2 deletions src/zarr/codecs/pipeline.py
@@ -84,8 +84,13 @@ def from_dict(cls, data: Iterable[JSON | Codec], *, batch_size: int | None = None
            out.append(get_codec_class(name_parsed).from_dict(c))  # type: ignore[arg-type]
        return cls.from_list(out, batch_size=batch_size)

-    def to_dict(self) -> JSON:
-        return [c.to_dict() for c in self]
+    def to_dict(self) -> dict[str, JSON]:
+        return {
+            "array_array_codecs": tuple(c.to_dict() for c in self.array_array_codecs),
+            "array_bytes_codec": self.array_bytes_codec.to_dict(),
+            "bytes_bytes_codec": tuple(c.to_dict() for c in self.bytes_bytes_codecs),
+            "batch_size": self.batch_size,
+        }

    def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
        return type(self).from_list([c.evolve_from_array_spec(array_spec=array_spec) for c in self])
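For reference, a sketch of the shape the new `to_dict` returns, assuming a pipeline of transpose, bytes, and gzip codecs with the default batch size (values are illustrative, not captured from a real run; key names exactly as in the diff):

```python
{
    "array_array_codecs": ({"name": "transpose", "configuration": {"order": (1, 0)}},),
    "array_bytes_codec": {"name": "bytes", "configuration": {"endian": "little"}},
    "bytes_bytes_codec": ({"name": "gzip", "configuration": {"level": 5}},),
    "batch_size": 1,
}
```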
12 changes: 6 additions & 6 deletions src/zarr/codecs/sharding.py
@@ -68,7 +68,7 @@ class ShardingCodecIndexLocation(Enum):
    end = "end"


-def parse_index_location(data: JSON) -> ShardingCodecIndexLocation:
+def parse_index_location(data: object) -> ShardingCodecIndexLocation:
    return parse_enum(data, ShardingCodecIndexLocation)


@@ -333,7 +333,7 @@ def __init__(
        chunk_shape: ChunkCoordsLike,
        codecs: Iterable[Codec | dict[str, JSON]] = (BytesCodec(),),
        index_codecs: Iterable[Codec | dict[str, JSON]] = (BytesCodec(), Crc32cCodec()),
-        index_location: ShardingCodecIndexLocation = ShardingCodecIndexLocation.end,
+        index_location: ShardingCodecIndexLocation | str = ShardingCodecIndexLocation.end,
    ) -> None:
        chunk_shape_parsed = parse_shapelike(chunk_shape)
        codecs_parsed = parse_codecs(codecs)
@@ -379,10 +379,10 @@ def to_dict(self) -> dict[str, JSON]:
        return {
            "name": "sharding_indexed",
            "configuration": {
-                "chunk_shape": list(self.chunk_shape),
-                "codecs": [s.to_dict() for s in self.codecs],
-                "index_codecs": [s.to_dict() for s in self.index_codecs],
-                "index_location": self.index_location,
+                "chunk_shape": self.chunk_shape,
+                "codecs": tuple([s.to_dict() for s in self.codecs]),
+                "index_codecs": tuple([s.to_dict() for s in self.index_codecs]),
+                "index_location": self.index_location.value,
            },
        }

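Widening `index_location` to `ShardingCodecIndexLocation | str` means a plain string is normalized through `parse_index_location` / `parse_enum`. A sketch under the assumption that both import paths below exist as in this repo (arguments illustrative):

```python
from zarr.codecs import BytesCodec, ShardingCodec
from zarr.codecs.sharding import ShardingCodecIndexLocation

# Both spellings should construct equivalent codecs.
codec_a = ShardingCodec(chunk_shape=(32, 32), codecs=[BytesCodec()], index_location="end")
codec_b = ShardingCodec(chunk_shape=(32, 32), codecs=[BytesCodec()], index_location=ShardingCodecIndexLocation.end)
assert codec_a.index_location == codec_b.index_location
```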
2 changes: 1 addition & 1 deletion src/zarr/codecs/transpose.py
@@ -45,7 +45,7 @@ def from_dict(cls, data: dict[str, JSON]) -> Self:
        return cls(**configuration_parsed)  # type: ignore[arg-type]

    def to_dict(self) -> dict[str, JSON]:
-        return {"name": "transpose", "configuration": {"order": list(self.order)}}
+        return {"name": "transpose", "configuration": {"order": tuple(self.order)}}

    def validate(self, shape: tuple[int, ...], dtype: np.dtype[Any], chunk_grid: ChunkGrid) -> None:
        if len(self.order) != len(shape):
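Tuples satisfy the narrowed `JSON` alias (which admits `tuple` but not `list`) and keep the metadata hashable for frozen dataclasses, while `json.dumps` still renders them as JSON arrays:

```python
import json

cfg = {"name": "transpose", "configuration": {"order": (2, 0, 1)}}
# Tuples serialize as JSON arrays, so the stored metadata is unchanged.
print(json.dumps(cfg))  # {"name": "transpose", "configuration": {"order": [2, 0, 1]}}
```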
16 changes: 4 additions & 12 deletions src/zarr/core/array.py
@@ -90,7 +90,7 @@ def create_codec_pipeline(metadata: ArrayV2Metadata | ArrayV3Metadata) -> CodecPipeline:
        return get_pipeline_class().from_list(metadata.codecs)
    elif isinstance(metadata, ArrayV2Metadata):
        return get_pipeline_class().from_list(
-            [V2Filters(metadata.filters or []), V2Compressor(metadata.compressor)]
+            [V2Filters(metadata.filters), V2Compressor(metadata.compressor)]
        )
    else:
        raise TypeError
@@ -299,8 +299,6 @@ async def _create_v2(
        attributes: dict[str, JSON] | None = None,
        exists_ok: bool = False,
    ) -> AsyncArray:
-        import numcodecs
-
        if not exists_ok:
            await ensure_no_existing_node(store_path, zarr_format=2)
        if order is None:
@@ -315,15 +313,9 @@
            chunks=chunks,
            order=order,
            dimension_separator=dimension_separator,
-            fill_value=0 if fill_value is None else fill_value,
[Review comment · Contributor]
Restoring the `0 if fill_value is None else fill_value` does fix the failure in tests/v3/test_array.py::test_serializable_sync_array.

But that said, I like the change here... Setting a default fill value here seems wrong.

[Review comment · Contributor]
Oh, but this is just for v2 metadata, where the fill value is allowed to be None. Maybe this isn't the wrong spot to do that.
-            compressor=(
-                numcodecs.get_codec(compressor).get_config() if compressor is not None else None
-            ),
-            filters=(
-                [numcodecs.get_codec(filter).get_config() for filter in filters]
-                if filters is not None
-                else None
-            ),
+            fill_value=fill_value,
+            compressor=compressor,
+            filters=filters,
            attributes=attributes,
        )
        array = cls(metadata=metadata, store_path=store_path)
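`_create_v2` can now forward `compressor` and `filters` untouched because `ArrayV2Metadata.__init__` normalizes both dict configs and codec instances. A sketch of the equivalence this relies on (Zlib is an arbitrary illustrative choice):

```python
import numcodecs

from_dict = numcodecs.get_codec({"id": "zlib", "level": 1})
direct = numcodecs.Zlib(level=1)
# numcodecs codecs compare by configuration, so both forms normalize
# to the same codec.
assert from_dict == direct
```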
6 changes: 3 additions & 3 deletions src/zarr/core/common.py
@@ -4,7 +4,7 @@
import contextvars
import functools
import operator
-from collections.abc import Iterable
+from collections.abc import Iterable, Mapping
from enum import Enum
from typing import (
    TYPE_CHECKING,
@@ -32,7 +32,7 @@
ChunkCoords = tuple[int, ...]
ChunkCoordsLike = Iterable[int]
ZarrFormat = Literal[2, 3]
-JSON = None | str | int | float | Enum | dict[str, "JSON"] | list["JSON"] | tuple["JSON", ...]
+JSON = None | str | int | float | Mapping[str, "JSON"] | tuple["JSON", ...]
MemoryOrder = Literal["C", "F"]
AccessModeLiteral = Literal["r", "r+", "a", "w", "w-"]

@@ -80,7 +80,7 @@ def enum_names(enum: type[E]) -> Iterator[str]:
        yield item.name


-def parse_enum(data: JSON, cls: type[E]) -> E:
+def parse_enum(data: object, cls: type[E]) -> E:
    if isinstance(data, cls):
        return data
    if not isinstance(data, str):
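Restated for illustration, the narrowed alias drops `Enum` and bare `list` from the set of JSON values; mappings and tuples remain (the rejected lines are what a type checker would flag):

```python
from collections.abc import Mapping

JSON = None | str | int | float | Mapping[str, "JSON"] | tuple["JSON", ...]

ok_config: JSON = {"name": "blosc", "configuration": {"clevel": 5}}
ok_tuple: JSON = ("end", 2, None)
# not_ok: JSON = [1, 2, 3]         # list is no longer JSON
# not_ok: JSON = SomeEnum.member   # Enum members must be serialized via .value
```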
62 changes: 40 additions & 22 deletions src/zarr/core/metadata/v2.py
@@ -1,5 +1,6 @@
from __future__ import annotations

+from collections.abc import Iterable
from typing import TYPE_CHECKING

if TYPE_CHECKING:
@@ -14,6 +15,7 @@
import json
from dataclasses import dataclass, field, replace

+import numcodecs
import numpy as np

from zarr.core.array_spec import ArraySpec
@@ -31,9 +33,9 @@ class ArrayV2Metadata(ArrayMetadata):
    data_type: np.dtype[Any]
    fill_value: None | int | float = 0
    order: Literal["C", "F"] = "C"
-    filters: list[dict[str, JSON]] | None = None
+    filters: tuple[numcodecs.abc.Codec, ...] | None = None
    dimension_separator: Literal[".", "/"] = "."
-    compressor: dict[str, JSON] | None = None
+    compressor: numcodecs.abc.Codec | None = None
    attributes: dict[str, JSON] = field(default_factory=dict)
    zarr_format: Literal[2] = field(init=False, default=2)

@@ -46,8 +48,8 @@ def __init__(
        fill_value: Any,
        order: Literal["C", "F"],
        dimension_separator: Literal[".", "/"] = ".",
-        compressor: dict[str, JSON] | None = None,
-        filters: list[dict[str, JSON]] | None = None,
+        compressor: numcodecs.abc.Codec | dict[str, JSON] | None = None,
+        filters: Iterable[numcodecs.abc.Codec | dict[str, JSON]] | None = None,
        attributes: dict[str, JSON] | None = None,
    ):
        """
@@ -104,11 +106,6 @@ def _json_convert(
            raise TypeError

        zarray_dict = self.to_dict()
-
-        # todo: remove this check when we can ensure that to_dict always returns dicts.
-        if not isinstance(zarray_dict, dict):
-            raise TypeError(f"Invalid type: got {type(zarray_dict)}, expected dict.")
-
        zattrs_dict = zarray_dict.pop("attributes", {})
        json_indent = config.get("json_indent")
        return {
@@ -128,13 +125,8 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata:
        _ = parse_zarr_format(_data.pop("zarr_format"))
        return cls(**_data)

-    def to_dict(self) -> JSON:
+    def to_dict(self) -> dict[str, JSON]:
        zarray_dict = super().to_dict()
-
-        # todo: remove this check when we can ensure that to_dict always returns dicts.
-        if not isinstance(zarray_dict, dict):
-            raise TypeError(f"Invalid type: got {type(zarray_dict)}, expected dict.")
-
        _ = zarray_dict.pop("chunk_grid")
        zarray_dict["chunks"] = self.chunk_grid.chunk_shape

@@ -165,18 +157,44 @@ def update_attributes(self, attributes: dict[str, JSON]) -> Self:
        return replace(self, attributes=attributes)


-def parse_zarr_format(data: Literal[2]) -> Literal[2]:
+def parse_zarr_format(data: object) -> Literal[2]:
    if data == 2:
-        return data
+        return 2
    raise ValueError(f"Invalid value. Expected 2. Got {data}.")


-def parse_filters(data: list[dict[str, JSON]] | None) -> list[dict[str, JSON]] | None:
-    return data
+def parse_filters(data: object) -> tuple[numcodecs.abc.Codec, ...] | None:
+    """
+    Parse a potential tuple of filters
+    """
+    out: list[numcodecs.abc.Codec] = []
+
+    if data is None:
+        return data
+    if isinstance(data, Iterable):
+        for idx, val in enumerate(data):
+            if isinstance(val, numcodecs.abc.Codec):
+                out.append(val)
+            elif isinstance(val, dict):
+                out.append(numcodecs.get_codec(val))
+            else:
+                msg = f"Invalid filter at index {idx}. Expected a numcodecs.abc.Codec or a dict representation of numcodecs.abc.Codec. Got {type(val)} instead."
+                raise TypeError(msg)
+        return tuple(out)
+    msg = f"Invalid filters. Expected None, an iterable of numcodecs.abc.Codec or dict representations of numcodecs.abc.Codec. Got {type(data)} instead."
+    raise TypeError(msg)

-def parse_compressor(data: dict[str, JSON] | None) -> dict[str, JSON] | None:
-    return data
+def parse_compressor(data: object) -> numcodecs.abc.Codec | None:
+    """
+    Parse a potential compressor.
+    """
+    if data is None or isinstance(data, numcodecs.abc.Codec):
+        return data
+    if isinstance(data, dict):
+        return numcodecs.get_codec(data)
+    msg = f"Invalid compressor. Expected None, a numcodecs.abc.Codec, or a dict representation of a numcodecs.abc.Codec. Got {type(data)} instead."
+    raise ValueError(msg)
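A hedged usage sketch of the two parsers above, assuming they are importable from this module:

```python
import numcodecs

from zarr.core.metadata.v2 import parse_compressor, parse_filters

# Mixed Codec / dict input normalizes to a tuple of Codec instances.
filters = parse_filters([numcodecs.Delta(dtype="<i4"), {"id": "zlib", "level": 1}])
assert filters is not None and all(isinstance(f, numcodecs.abc.Codec) for f in filters)

compressor = parse_compressor({"id": "blosc"})
assert isinstance(compressor, numcodecs.abc.Codec)

assert parse_filters(None) is None and parse_compressor(None) is None

# Caveat worth noting: a bare string is Iterable, so parse_filters("zstd")
# falls into the per-element branch and raises TypeError on the first character.
```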


def parse_metadata(data: ArrayV2Metadata) -> ArrayV2Metadata:
@@ -189,7 +207,7 @@ def parse_metadata(data: ArrayV2Metadata) -> ArrayV2Metadata:
    return data


-def parse_fill_value(fill_value: Any, dtype: np.dtype[Any]) -> Any:
+def parse_fill_value(fill_value: object, dtype: np.dtype[Any]) -> Any:
    """
    Parse a potential fill value into a value that is compatible with the provided dtype.
