Skip to content

Commit 8c5038a

Browse files
authored
Narrow JSON type, ensure that to_dict always returns a dict, and v2 filter / compressor parsing (#2179)
* fix: narrow the JSON type, ensure compliance with it, and fix a variety of v2 metadata issues
* remove an unneeded conditional
* CodecPipeline no longer inherits from Metadata; its to_dict and from_dict methods are removed
* rename from_list to from_codecs
1 parent dd03ff0 commit 8c5038a

File tree

13 files changed

+106
-130
lines changed

13 files changed

+106
-130
lines changed

src/zarr/abc/codec.py

Lines changed: 4 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
from zarr.abc.store import ByteGetter, ByteSetter
1818
from zarr.core.array_spec import ArraySpec
1919
from zarr.core.chunk_grids import ChunkGrid
20-
from zarr.core.common import JSON
2120
from zarr.core.indexing import SelectorTuple
2221

2322
__all__ = [
@@ -242,7 +241,7 @@ async def encode_partial(
242241
)
243242

244243

245-
class CodecPipeline(Metadata):
244+
class CodecPipeline:
246245
"""Base class for implementing CodecPipeline.
247246
A CodecPipeline implements the read and write paths for chunk data.
248247
On the read path, it is responsible for fetching chunks from a store (via ByteGetter),
@@ -266,12 +265,12 @@ def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
266265

267266
@classmethod
268267
@abstractmethod
269-
def from_list(cls, codecs: Iterable[Codec]) -> Self:
270-
"""Creates a codec pipeline from a list of codecs.
268+
def from_codecs(cls, codecs: Iterable[Codec]) -> Self:
269+
"""Creates a codec pipeline from an iterable of codecs.
271270
272271
Parameters
273272
----------
274-
codecs : list[Codec]
273+
codecs : Iterable[Codec]
275274
276275
Returns
277276
-------
@@ -402,15 +401,6 @@ async def write(
402401
"""
403402
...
404403

405-
@classmethod
406-
def from_dict(cls, data: Iterable[JSON | Codec]) -> Self:
407-
"""
408-
Create an instance of the model from a dictionary
409-
"""
410-
...
411-
412-
return cls(**data)
413-
414404

415405
async def _batching_helper(
416406
func: Callable[[CodecInput, ArraySpec], Awaitable[CodecOutput | None]],

src/zarr/abc/metadata.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515

1616
@dataclass(frozen=True)
1717
class Metadata:
18-
def to_dict(self) -> JSON:
18+
def to_dict(self) -> dict[str, JSON]:
1919
"""
2020
Recursively serialize this model to a dictionary.
2121
This method inspects the fields of self and calls `x.to_dict()` for any fields that

src/zarr/codecs/_v2.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ def compute_encoded_size(self, _input_byte_length: int, _chunk_spec: ArraySpec)
6767

6868
@dataclass(frozen=True)
6969
class V2Filters(ArrayArrayCodec):
70-
filters: list[dict[str, JSON]]
70+
filters: tuple[numcodecs.abc.Codec, ...] | None
7171

7272
is_fixed_size = False
7373

@@ -79,8 +79,7 @@ async def _decode_single(
7979
chunk_ndarray = chunk_array.as_ndarray_like()
8080
# apply filters in reverse order
8181
if self.filters is not None:
82-
for filter_metadata in self.filters[::-1]:
83-
filter = numcodecs.get_codec(filter_metadata)
82+
for filter in self.filters[::-1]:
8483
chunk_ndarray = await to_thread(filter.decode, chunk_ndarray)
8584

8685
# ensure correct chunk shape
@@ -99,9 +98,9 @@ async def _encode_single(
9998
) -> NDBuffer | None:
10099
chunk_ndarray = chunk_array.as_ndarray_like().ravel(order=chunk_spec.order)
101100

102-
for filter_metadata in self.filters:
103-
filter = numcodecs.get_codec(filter_metadata)
104-
chunk_ndarray = await to_thread(filter.encode, chunk_ndarray)
101+
if self.filters is not None:
102+
for filter in self.filters:
103+
chunk_ndarray = await to_thread(filter.encode, chunk_ndarray)
105104

106105
return get_ndbuffer_class().from_ndarray_like(chunk_ndarray)
107106

src/zarr/codecs/blosc.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -127,9 +127,9 @@ def to_dict(self) -> dict[str, JSON]:
127127
"name": "blosc",
128128
"configuration": {
129129
"typesize": self.typesize,
130-
"cname": self.cname,
130+
"cname": self.cname.value,
131131
"clevel": self.clevel,
132-
"shuffle": self.shuffle,
132+
"shuffle": self.shuffle.value,
133133
"blocksize": self.blocksize,
134134
},
135135
}

src/zarr/codecs/bytes.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ def to_dict(self) -> dict[str, JSON]:
5353
if self.endian is None:
5454
return {"name": "bytes"}
5555
else:
56-
return {"name": "bytes", "configuration": {"endian": self.endian}}
56+
return {"name": "bytes", "configuration": {"endian": self.endian.value}}
5757

5858
def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
5959
if array_spec.dtype.itemsize == 0:

src/zarr/codecs/pipeline.py

Lines changed: 6 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
from __future__ import annotations
22

3-
from collections.abc import Iterable, Iterator
43
from dataclasses import dataclass
54
from itertools import islice, pairwise
65
from typing import TYPE_CHECKING, Any, TypeVar
@@ -15,12 +14,14 @@
1514
Codec,
1615
CodecPipeline,
1716
)
18-
from zarr.core.common import JSON, ChunkCoords, concurrent_map, parse_named_configuration
17+
from zarr.core.common import ChunkCoords, concurrent_map
1918
from zarr.core.config import config
2019
from zarr.core.indexing import SelectorTuple, is_scalar, is_total_slice
21-
from zarr.registry import get_codec_class, register_pipeline
20+
from zarr.registry import register_pipeline
2221

2322
if TYPE_CHECKING:
23+
from collections.abc import Iterable, Iterator
24+
2425
import numpy as np
2526
from typing_extensions import Self
2627

@@ -68,30 +69,11 @@ class BatchedCodecPipeline(CodecPipeline):
6869
bytes_bytes_codecs: tuple[BytesBytesCodec, ...]
6970
batch_size: int
7071

71-
@classmethod
72-
def from_dict(cls, data: Iterable[JSON | Codec], *, batch_size: int | None = None) -> Self:
73-
out: list[Codec] = []
74-
if not isinstance(data, Iterable):
75-
raise TypeError(f"Expected iterable, got {type(data)}")
76-
77-
for c in data:
78-
if isinstance(
79-
c, ArrayArrayCodec | ArrayBytesCodec | BytesBytesCodec
80-
): # Can't use Codec here because of mypy limitation
81-
out.append(c)
82-
else:
83-
name_parsed, _ = parse_named_configuration(c, require_configuration=False)
84-
out.append(get_codec_class(name_parsed).from_dict(c)) # type: ignore[arg-type]
85-
return cls.from_list(out, batch_size=batch_size)
86-
87-
def to_dict(self) -> JSON:
88-
return [c.to_dict() for c in self]
89-
9072
def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
91-
return type(self).from_list([c.evolve_from_array_spec(array_spec=array_spec) for c in self])
73+
return type(self).from_codecs(c.evolve_from_array_spec(array_spec=array_spec) for c in self)
9274

9375
@classmethod
94-
def from_list(cls, codecs: Iterable[Codec], *, batch_size: int | None = None) -> Self:
76+
def from_codecs(cls, codecs: Iterable[Codec], *, batch_size: int | None = None) -> Self:
9577
array_array_codecs, array_bytes_codec, bytes_bytes_codecs = codecs_from_list(codecs)
9678

9779
return cls(

src/zarr/codecs/sharding.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ class ShardingCodecIndexLocation(Enum):
6868
end = "end"
6969

7070

71-
def parse_index_location(data: JSON) -> ShardingCodecIndexLocation:
71+
def parse_index_location(data: object) -> ShardingCodecIndexLocation:
7272
return parse_enum(data, ShardingCodecIndexLocation)
7373

7474

@@ -333,7 +333,7 @@ def __init__(
333333
chunk_shape: ChunkCoordsLike,
334334
codecs: Iterable[Codec | dict[str, JSON]] = (BytesCodec(),),
335335
index_codecs: Iterable[Codec | dict[str, JSON]] = (BytesCodec(), Crc32cCodec()),
336-
index_location: ShardingCodecIndexLocation = ShardingCodecIndexLocation.end,
336+
index_location: ShardingCodecIndexLocation | str = ShardingCodecIndexLocation.end,
337337
) -> None:
338338
chunk_shape_parsed = parse_shapelike(chunk_shape)
339339
codecs_parsed = parse_codecs(codecs)
@@ -373,16 +373,16 @@ def from_dict(cls, data: dict[str, JSON]) -> Self:
373373

374374
@property
375375
def codec_pipeline(self) -> CodecPipeline:
376-
return get_pipeline_class().from_list(self.codecs)
376+
return get_pipeline_class().from_codecs(self.codecs)
377377

378378
def to_dict(self) -> dict[str, JSON]:
379379
return {
380380
"name": "sharding_indexed",
381381
"configuration": {
382-
"chunk_shape": list(self.chunk_shape),
383-
"codecs": [s.to_dict() for s in self.codecs],
384-
"index_codecs": [s.to_dict() for s in self.index_codecs],
385-
"index_location": self.index_location,
382+
"chunk_shape": self.chunk_shape,
383+
"codecs": tuple([s.to_dict() for s in self.codecs]),
384+
"index_codecs": tuple([s.to_dict() for s in self.index_codecs]),
385+
"index_location": self.index_location.value,
386386
},
387387
}
388388

@@ -620,7 +620,7 @@ async def _decode_shard_index(
620620
index_array = next(
621621
iter(
622622
await get_pipeline_class()
623-
.from_list(self.index_codecs)
623+
.from_codecs(self.index_codecs)
624624
.decode(
625625
[(index_bytes, self._get_index_chunk_spec(chunks_per_shard))],
626626
)
@@ -633,7 +633,7 @@ async def _encode_shard_index(self, index: _ShardIndex) -> Buffer:
633633
index_bytes = next(
634634
iter(
635635
await get_pipeline_class()
636-
.from_list(self.index_codecs)
636+
.from_codecs(self.index_codecs)
637637
.encode(
638638
[
639639
(
@@ -651,7 +651,7 @@ async def _encode_shard_index(self, index: _ShardIndex) -> Buffer:
651651
def _shard_index_size(self, chunks_per_shard: ChunkCoords) -> int:
652652
return (
653653
get_pipeline_class()
654-
.from_list(self.index_codecs)
654+
.from_codecs(self.index_codecs)
655655
.compute_encoded_size(
656656
16 * product(chunks_per_shard), self._get_index_chunk_spec(chunks_per_shard)
657657
)

src/zarr/codecs/transpose.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ def from_dict(cls, data: dict[str, JSON]) -> Self:
4545
return cls(**configuration_parsed) # type: ignore[arg-type]
4646

4747
def to_dict(self) -> dict[str, JSON]:
48-
return {"name": "transpose", "configuration": {"order": list(self.order)}}
48+
return {"name": "transpose", "configuration": {"order": tuple(self.order)}}
4949

5050
def validate(self, shape: tuple[int, ...], dtype: np.dtype[Any], chunk_grid: ChunkGrid) -> None:
5151
if len(self.order) != len(shape):

src/zarr/core/array.py

Lines changed: 6 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -87,10 +87,10 @@ def parse_array_metadata(data: Any) -> ArrayV2Metadata | ArrayV3Metadata:
8787

8888
def create_codec_pipeline(metadata: ArrayV2Metadata | ArrayV3Metadata) -> CodecPipeline:
8989
if isinstance(metadata, ArrayV3Metadata):
90-
return get_pipeline_class().from_list(metadata.codecs)
90+
return get_pipeline_class().from_codecs(metadata.codecs)
9191
elif isinstance(metadata, ArrayV2Metadata):
92-
return get_pipeline_class().from_list(
93-
[V2Filters(metadata.filters or []), V2Compressor(metadata.compressor)]
92+
return get_pipeline_class().from_codecs(
93+
[V2Filters(metadata.filters), V2Compressor(metadata.compressor)]
9494
)
9595
else:
9696
raise TypeError
@@ -299,8 +299,6 @@ async def _create_v2(
299299
attributes: dict[str, JSON] | None = None,
300300
exists_ok: bool = False,
301301
) -> AsyncArray:
302-
import numcodecs
303-
304302
if not exists_ok:
305303
await ensure_no_existing_node(store_path, zarr_format=2)
306304
if order is None:
@@ -315,15 +313,9 @@ async def _create_v2(
315313
chunks=chunks,
316314
order=order,
317315
dimension_separator=dimension_separator,
318-
fill_value=0 if fill_value is None else fill_value,
319-
compressor=(
320-
numcodecs.get_codec(compressor).get_config() if compressor is not None else None
321-
),
322-
filters=(
323-
[numcodecs.get_codec(filter).get_config() for filter in filters]
324-
if filters is not None
325-
else None
326-
),
316+
fill_value=fill_value,
317+
compressor=compressor,
318+
filters=filters,
327319
attributes=attributes,
328320
)
329321
array = cls(metadata=metadata, store_path=store_path)

src/zarr/core/common.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import contextvars
55
import functools
66
import operator
7-
from collections.abc import Iterable
7+
from collections.abc import Iterable, Mapping
88
from enum import Enum
99
from typing import (
1010
TYPE_CHECKING,
@@ -32,7 +32,7 @@
3232
ChunkCoords = tuple[int, ...]
3333
ChunkCoordsLike = Iterable[int]
3434
ZarrFormat = Literal[2, 3]
35-
JSON = None | str | int | float | Enum | dict[str, "JSON"] | list["JSON"] | tuple["JSON", ...]
35+
JSON = None | str | int | float | Mapping[str, "JSON"] | tuple["JSON", ...]
3636
MemoryOrder = Literal["C", "F"]
3737
AccessModeLiteral = Literal["r", "r+", "a", "w", "w-"]
3838

@@ -80,7 +80,7 @@ def enum_names(enum: type[E]) -> Iterator[str]:
8080
yield item.name
8181

8282

83-
def parse_enum(data: JSON, cls: type[E]) -> E:
83+
def parse_enum(data: object, cls: type[E]) -> E:
8484
if isinstance(data, cls):
8585
return data
8686
if not isinstance(data, str):

0 commit comments

Comments (0)