Narrow JSON type, ensure that to_dict always returns a dict, and v2 filter / compressor parsing #2179

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged · 6 commits · Sep 17, 2024
Changes from 2 commits
2 changes: 1 addition & 1 deletion src/zarr/abc/metadata.py
@@ -15,7 +15,7 @@

@dataclass(frozen=True)
class Metadata:
-    def to_dict(self) -> JSON:
+    def to_dict(self) -> dict[str, JSON]:
        """
        Recursively serialize this model to a dictionary.
        This method inspects the fields of self and calls `x.to_dict()` for any fields that
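Narrowing the return annotation from `JSON` to `dict[str, JSON]` lets callers pop keys from the result without the defensive isinstance checks that are removed later in this PR. A minimal sketch of the recursive pattern the docstring describes, assuming plain dataclass fields (not the actual zarr implementation):

```python
from dataclasses import dataclass, fields

@dataclass(frozen=True)
class Metadata:
    def to_dict(self) -> dict[str, object]:
        # Serialize each dataclass field, recursing into nested Metadata
        # models so the result is a JSON-ready dict.
        out: dict[str, object] = {}
        for f in fields(self):
            value = getattr(self, f.name)
            if isinstance(value, Metadata):
                out[f.name] = value.to_dict()
            else:
                out[f.name] = value
        return out
```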
11 changes: 5 additions & 6 deletions src/zarr/codecs/_v2.py
@@ -67,7 +67,7 @@ def compute_encoded_size(self, _input_byte_length: int, _chunk_spec: ArraySpec)

@dataclass(frozen=True)
class V2Filters(ArrayArrayCodec):
-    filters: list[dict[str, JSON]]
+    filters: tuple[numcodecs.abc.Codec, ...] | None

    is_fixed_size = False

@@ -79,8 +79,7 @@ async def _decode_single(
        chunk_ndarray = chunk_array.as_ndarray_like()
        # apply filters in reverse order
        if self.filters is not None:
-            for filter_metadata in self.filters[::-1]:
-                filter = numcodecs.get_codec(filter_metadata)
+            for filter in self.filters[::-1]:
                chunk_ndarray = await to_thread(filter.decode, chunk_ndarray)

        # ensure correct chunk shape
@@ -99,9 +98,9 @@ async def _encode_single(
    ) -> NDBuffer | None:
        chunk_ndarray = chunk_array.as_ndarray_like().ravel(order=chunk_spec.order)

-        for filter_metadata in self.filters:
-            filter = numcodecs.get_codec(filter_metadata)
-            chunk_ndarray = await to_thread(filter.encode, chunk_ndarray)
+        if self.filters is not None:
+            for filter in self.filters:
+                chunk_ndarray = await to_thread(filter.encode, chunk_ndarray)

        return get_ndbuffer_class().from_ndarray_like(chunk_ndarray)

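With `V2Filters` holding `numcodecs` codec instances directly, `numcodecs.get_codec` is no longer called once per chunk; codecs are parsed when the metadata is built. A round trip mirroring the encode/decode loops above, using plain numcodecs calls (Delta is an arbitrary illustrative filter, not one the PR prescribes):

```python
import numcodecs
import numpy as np

filters = (numcodecs.Delta(dtype="<i4"),)

data = np.arange(16, dtype="<i4")
encoded = data
for f in filters:  # _encode_single applies filters in order
    encoded = f.encode(encoded)

decoded = encoded
for f in filters[::-1]:  # _decode_single applies them in reverse
    decoded = f.decode(decoded)

assert np.array_equal(np.asarray(decoded, dtype="<i4"), data)
```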
4 changes: 2 additions & 2 deletions src/zarr/codecs/blosc.py
@@ -127,9 +127,9 @@ def to_dict(self) -> dict[str, JSON]:
"name": "blosc",
"configuration": {
"typesize": self.typesize,
"cname": self.cname,
"cname": self.cname.value,
"clevel": self.clevel,
"shuffle": self.shuffle,
"shuffle": self.shuffle.value,
"blocksize": self.blocksize,
},
}
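Serializing `self.cname.value` rather than the enum member keeps the output inside the narrowed `JSON` type and makes it directly `json.dumps`-able. A hedged illustration (`BloscCname` below is a stand-in for the real enum in zarr.codecs.blosc):

```python
import json
from enum import Enum

class BloscCname(Enum):
    zstd = "zstd"
    lz4 = "lz4"

# The enum member itself is not JSON serializable:
# json.dumps({"cname": BloscCname.zstd})  -> TypeError
# ...but its .value is:
print(json.dumps({"cname": BloscCname.zstd.value}))  # {"cname": "zstd"}
```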
2 changes: 1 addition & 1 deletion src/zarr/codecs/bytes.py
@@ -53,7 +53,7 @@ def to_dict(self) -> dict[str, JSON]:
        if self.endian is None:
            return {"name": "bytes"}
        else:
-            return {"name": "bytes", "configuration": {"endian": self.endian}}
+            return {"name": "bytes", "configuration": {"endian": self.endian.value}}

    def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
        if array_spec.dtype.itemsize == 0:
9 changes: 7 additions & 2 deletions src/zarr/codecs/pipeline.py
@@ -84,8 +84,13 @@ def from_dict(cls, data: Iterable[JSON | Codec], *, batch_size: int | None = None
            out.append(get_codec_class(name_parsed).from_dict(c))  # type: ignore[arg-type]
        return cls.from_list(out, batch_size=batch_size)

-    def to_dict(self) -> JSON:
-        return [c.to_dict() for c in self]
+    def to_dict(self) -> dict[str, JSON]:
+        return {
+            "array_array_codecs": tuple(c.to_dict() for c in self.array_array_codecs),
+            "array_bytes_codec": self.array_bytes_codec.to_dict(),
+            "bytes_bytes_codec": tuple(c.to_dict() for c in self.bytes_bytes_codecs),
+            "batch_size": self.batch_size,
+        }

    def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
        return type(self).from_list([c.evolve_from_array_spec(array_spec=array_spec) for c in self])
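For reference, a sketch of the shape the new `to_dict` returns, assuming a pipeline of transpose, bytes, and gzip codecs with the default batch size (values are illustrative, not captured from a real run; key names exactly as in the diff):

```python
{
    "array_array_codecs": ({"name": "transpose", "configuration": {"order": (1, 0)}},),
    "array_bytes_codec": {"name": "bytes", "configuration": {"endian": "little"}},
    "bytes_bytes_codec": ({"name": "gzip", "configuration": {"level": 5}},),
    "batch_size": 1,
}
```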
12 changes: 6 additions & 6 deletions src/zarr/codecs/sharding.py
@@ -68,7 +68,7 @@ class ShardingCodecIndexLocation(Enum):
    end = "end"


-def parse_index_location(data: JSON) -> ShardingCodecIndexLocation:
+def parse_index_location(data: object) -> ShardingCodecIndexLocation:
    return parse_enum(data, ShardingCodecIndexLocation)


@@ -333,7 +333,7 @@ def __init__(
        chunk_shape: ChunkCoordsLike,
        codecs: Iterable[Codec | dict[str, JSON]] = (BytesCodec(),),
        index_codecs: Iterable[Codec | dict[str, JSON]] = (BytesCodec(), Crc32cCodec()),
-        index_location: ShardingCodecIndexLocation = ShardingCodecIndexLocation.end,
+        index_location: ShardingCodecIndexLocation | str = ShardingCodecIndexLocation.end,
    ) -> None:
        chunk_shape_parsed = parse_shapelike(chunk_shape)
        codecs_parsed = parse_codecs(codecs)
@@ -379,10 +379,10 @@ def to_dict(self) -> dict[str, JSON]:
        return {
            "name": "sharding_indexed",
            "configuration": {
-                "chunk_shape": list(self.chunk_shape),
-                "codecs": [s.to_dict() for s in self.codecs],
-                "index_codecs": [s.to_dict() for s in self.index_codecs],
-                "index_location": self.index_location,
+                "chunk_shape": self.chunk_shape,
+                "codecs": tuple([s.to_dict() for s in self.codecs]),
+                "index_codecs": tuple([s.to_dict() for s in self.index_codecs]),
+                "index_location": self.index_location.value,
            },
        }

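Widening `index_location` to `ShardingCodecIndexLocation | str` means a plain string is normalized through `parse_index_location` / `parse_enum`. A sketch under the assumption that both import paths below exist as in this repo (arguments illustrative):

```python
from zarr.codecs import BytesCodec, ShardingCodec
from zarr.codecs.sharding import ShardingCodecIndexLocation

# Both spellings should construct equivalent codecs.
codec_a = ShardingCodec(chunk_shape=(32, 32), codecs=[BytesCodec()], index_location="end")
codec_b = ShardingCodec(chunk_shape=(32, 32), codecs=[BytesCodec()], index_location=ShardingCodecIndexLocation.end)
assert codec_a.index_location == codec_b.index_location
```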
2 changes: 1 addition & 1 deletion src/zarr/codecs/transpose.py
@@ -45,7 +45,7 @@ def from_dict(cls, data: dict[str, JSON]) -> Self:
        return cls(**configuration_parsed)  # type: ignore[arg-type]

    def to_dict(self) -> dict[str, JSON]:
-        return {"name": "transpose", "configuration": {"order": list(self.order)}}
+        return {"name": "transpose", "configuration": {"order": tuple(self.order)}}

    def validate(self, shape: tuple[int, ...], dtype: np.dtype[Any], chunk_grid: ChunkGrid) -> None:
        if len(self.order) != len(shape):
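Tuples satisfy the narrowed `JSON` alias (which admits `tuple` but not `list`) and keep the metadata hashable for frozen dataclasses, while `json.dumps` still renders them as JSON arrays:

```python
import json

cfg = {"name": "transpose", "configuration": {"order": (2, 0, 1)}}
# Tuples serialize as JSON arrays, so the stored metadata is unchanged.
print(json.dumps(cfg))  # {"name": "transpose", "configuration": {"order": [2, 0, 1]}}
```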
16 changes: 4 additions & 12 deletions src/zarr/core/array.py
@@ -90,7 +90,7 @@ def create_codec_pipeline(metadata: ArrayV2Metadata | ArrayV3Metadata) -> CodecPipeline:
        return get_pipeline_class().from_list(metadata.codecs)
    elif isinstance(metadata, ArrayV2Metadata):
        return get_pipeline_class().from_list(
-            [V2Filters(metadata.filters or []), V2Compressor(metadata.compressor)]
+            [V2Filters(metadata.filters), V2Compressor(metadata.compressor)]
        )
    else:
        raise TypeError
@@ -299,8 +299,6 @@ async def _create_v2(
        attributes: dict[str, JSON] | None = None,
        exists_ok: bool = False,
    ) -> AsyncArray:
-        import numcodecs
-
        if not exists_ok:
            await ensure_no_existing_node(store_path, zarr_format=2)
        if order is None:
@@ -315,15 +313,9 @@
            chunks=chunks,
            order=order,
            dimension_separator=dimension_separator,
-            fill_value=0 if fill_value is None else fill_value,
[Review comment · Contributor]
Restoring the `0 if fill_value is None else fill_value` does fix the failure in tests/v3/test_array.py::test_serializable_sync_array.

But that said, I like the change here... Setting a default fill value here seems wrong.

[Review comment · Contributor]
Oh, but this is just for v2 metadata, where the fill value is allowed to be None. Maybe this isn't the wrong spot to do that.
-            compressor=(
-                numcodecs.get_codec(compressor).get_config() if compressor is not None else None
-            ),
-            filters=(
-                [numcodecs.get_codec(filter).get_config() for filter in filters]
-                if filters is not None
-                else None
-            ),
+            fill_value=fill_value,
+            compressor=compressor,
+            filters=filters,
            attributes=attributes,
        )
        array = cls(metadata=metadata, store_path=store_path)
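`_create_v2` can now forward `compressor` and `filters` untouched because `ArrayV2Metadata.__init__` normalizes both dict configs and codec instances. A sketch of the equivalence this relies on (Zlib is an arbitrary illustrative choice):

```python
import numcodecs

from_dict = numcodecs.get_codec({"id": "zlib", "level": 1})
direct = numcodecs.Zlib(level=1)
# numcodecs codecs compare by configuration, so both forms normalize
# to the same codec.
assert from_dict == direct
```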
6 changes: 3 additions & 3 deletions src/zarr/core/common.py
@@ -4,7 +4,7 @@
import contextvars
import functools
import operator
-from collections.abc import Iterable
+from collections.abc import Iterable, Mapping
from enum import Enum
from typing import (
    TYPE_CHECKING,
@@ -32,7 +32,7 @@
ChunkCoords = tuple[int, ...]
ChunkCoordsLike = Iterable[int]
ZarrFormat = Literal[2, 3]
-JSON = None | str | int | float | Enum | dict[str, "JSON"] | list["JSON"] | tuple["JSON", ...]
+JSON = None | str | int | float | Mapping[str, "JSON"] | tuple["JSON", ...]
MemoryOrder = Literal["C", "F"]
AccessModeLiteral = Literal["r", "r+", "a", "w", "w-"]

@@ -80,7 +80,7 @@ def enum_names(enum: type[E]) -> Iterator[str]:
        yield item.name


-def parse_enum(data: JSON, cls: type[E]) -> E:
+def parse_enum(data: object, cls: type[E]) -> E:
    if isinstance(data, cls):
        return data
    if not isinstance(data, str):
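Restated for illustration, the narrowed alias drops `Enum` and bare `list` from the set of JSON values; mappings and tuples remain (the rejected lines are what a type checker would flag):

```python
from collections.abc import Mapping

JSON = None | str | int | float | Mapping[str, "JSON"] | tuple["JSON", ...]

ok_config: JSON = {"name": "blosc", "configuration": {"clevel": 5}}
ok_tuple: JSON = ("end", 2, None)
# not_ok: JSON = [1, 2, 3]         # list is no longer JSON
# not_ok: JSON = SomeEnum.member   # Enum members must be serialized via .value
```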
62 changes: 40 additions & 22 deletions src/zarr/core/metadata/v2.py
@@ -1,5 +1,6 @@
from __future__ import annotations

+from collections.abc import Iterable
from typing import TYPE_CHECKING

if TYPE_CHECKING:
@@ -14,6 +15,7 @@
import json
from dataclasses import dataclass, field, replace

+import numcodecs
import numpy as np

from zarr.core.array_spec import ArraySpec
@@ -31,9 +33,9 @@ class ArrayV2Metadata(ArrayMetadata):
    data_type: np.dtype[Any]
    fill_value: None | int | float = 0
    order: Literal["C", "F"] = "C"
-    filters: list[dict[str, JSON]] | None = None
+    filters: tuple[numcodecs.abc.Codec, ...] | None = None
    dimension_separator: Literal[".", "/"] = "."
-    compressor: dict[str, JSON] | None = None
+    compressor: numcodecs.abc.Codec | None = None
    attributes: dict[str, JSON] = field(default_factory=dict)
    zarr_format: Literal[2] = field(init=False, default=2)

@@ -46,8 +48,8 @@ def __init__(
        fill_value: Any,
        order: Literal["C", "F"],
        dimension_separator: Literal[".", "/"] = ".",
-        compressor: dict[str, JSON] | None = None,
-        filters: list[dict[str, JSON]] | None = None,
+        compressor: numcodecs.abc.Codec | dict[str, JSON] | None = None,
+        filters: Iterable[numcodecs.abc.Codec | dict[str, JSON]] | None = None,
        attributes: dict[str, JSON] | None = None,
    ):
        """
@@ -104,11 +106,6 @@ def _json_convert(
            raise TypeError

        zarray_dict = self.to_dict()
-
-        # todo: remove this check when we can ensure that to_dict always returns dicts.
-        if not isinstance(zarray_dict, dict):
-            raise TypeError(f"Invalid type: got {type(zarray_dict)}, expected dict.")
-
        zattrs_dict = zarray_dict.pop("attributes", {})
        json_indent = config.get("json_indent")
        return {
@@ -128,13 +125,8 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata:
        _ = parse_zarr_format(_data.pop("zarr_format"))
        return cls(**_data)

-    def to_dict(self) -> JSON:
+    def to_dict(self) -> dict[str, JSON]:
        zarray_dict = super().to_dict()
-
-        # todo: remove this check when we can ensure that to_dict always returns dicts.
-        if not isinstance(zarray_dict, dict):
-            raise TypeError(f"Invalid type: got {type(zarray_dict)}, expected dict.")
-
        _ = zarray_dict.pop("chunk_grid")
        zarray_dict["chunks"] = self.chunk_grid.chunk_shape

@@ -165,18 +157,44 @@ def update_attributes(self, attributes: dict[str, JSON]) -> Self:
        return replace(self, attributes=attributes)


-def parse_zarr_format(data: Literal[2]) -> Literal[2]:
+def parse_zarr_format(data: object) -> Literal[2]:
    if data == 2:
-        return data
+        return 2
    raise ValueError(f"Invalid value. Expected 2. Got {data}.")


-def parse_filters(data: list[dict[str, JSON]] | None) -> list[dict[str, JSON]] | None:
-    return data
+def parse_filters(data: object) -> tuple[numcodecs.abc.Codec, ...] | None:
+    """
+    Parse a potential tuple of filters
+    """
+    out: list[numcodecs.abc.Codec] = []
+
+    if data is None:
+        return data
+    if isinstance(data, Iterable):
+        for idx, val in enumerate(data):
+            if isinstance(val, numcodecs.abc.Codec):
+                out.append(val)
+            elif isinstance(val, dict):
+                out.append(numcodecs.get_codec(val))
+            else:
+                msg = f"Invalid filter at index {idx}. Expected a numcodecs.abc.Codec or a dict representation of numcodecs.abc.Codec. Got {type(val)} instead."
+                raise TypeError(msg)
+        return tuple(out)
+    msg = f"Invalid filters. Expected None, an iterable of numcodecs.abc.Codec or dict representations of numcodecs.abc.Codec. Got {type(data)} instead."
+    raise TypeError(msg)

-def parse_compressor(data: dict[str, JSON] | None) -> dict[str, JSON] | None:
-    return data
+def parse_compressor(data: object) -> numcodecs.abc.Codec | None:
+    """
+    Parse a potential compressor.
+    """
+    if data is None or isinstance(data, numcodecs.abc.Codec):
+        return data
+    if isinstance(data, dict):
+        return numcodecs.get_codec(data)
+    msg = f"Invalid compressor. Expected None, a numcodecs.abc.Codec, or a dict representation of a numcodecs.abc.Codec. Got {type(data)} instead."
+    raise ValueError(msg)
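A hedged usage sketch of the two parsers above, assuming they are importable from this module:

```python
import numcodecs

from zarr.core.metadata.v2 import parse_compressor, parse_filters

# Mixed Codec / dict input normalizes to a tuple of Codec instances.
filters = parse_filters([numcodecs.Delta(dtype="<i4"), {"id": "zlib", "level": 1}])
assert filters is not None and all(isinstance(f, numcodecs.abc.Codec) for f in filters)

compressor = parse_compressor({"id": "blosc"})
assert isinstance(compressor, numcodecs.abc.Codec)

assert parse_filters(None) is None and parse_compressor(None) is None

# Caveat worth noting: a bare string is Iterable, so parse_filters("zstd")
# falls into the per-element branch and raises TypeError on the first character.
```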


def parse_metadata(data: ArrayV2Metadata) -> ArrayV2Metadata:
@@ -189,7 +207,7 @@ def parse_metadata(data: ArrayV2Metadata) -> ArrayV2Metadata:
    return data


-def parse_fill_value(fill_value: Any, dtype: np.dtype[Any]) -> Any:
+def parse_fill_value(fill_value: object, dtype: np.dtype[Any]) -> Any:
    """
    Parse a potential fill value into a value that is compatible with the provided dtype.
