Skip to content

Commit 8c5038a

Browse files
authored
Narrow JSON type, ensure that to_dict always returns a dict, and v2 filter / compressor parsing (#2179)
* fix: narrow the JSON type, ensure compliance with it, and fix a variety of v2 metadata issues
* remove an unneeded conditional
* CodecPipeline no longer inherits from Metadata; its to_dict and from_dict methods are removed
* rename from_list to from_codecs
1 parent dd03ff0 commit 8c5038a

File tree

13 files changed

+106
-130
lines changed

13 files changed

+106
-130
lines changed

src/zarr/abc/codec.py

Lines changed: 4 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
from zarr.abc.store import ByteGetter, ByteSetter
1818
from zarr.core.array_spec import ArraySpec
1919
from zarr.core.chunk_grids import ChunkGrid
20-
from zarr.core.common import JSON
2120
from zarr.core.indexing import SelectorTuple
2221

2322
__all__ = [
@@ -242,7 +241,7 @@ async def encode_partial(
242241
)
243242

244243

245-
class CodecPipeline(Metadata):
244+
class CodecPipeline:
246245
"""Base class for implementing CodecPipeline.
247246
A CodecPipeline implements the read and write paths for chunk data.
248247
On the read path, it is responsible for fetching chunks from a store (via ByteGetter),
@@ -266,12 +265,12 @@ def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
266265

267266
@classmethod
268267
@abstractmethod
269-
def from_list(cls, codecs: Iterable[Codec]) -> Self:
270-
"""Creates a codec pipeline from a list of codecs.
268+
def from_codecs(cls, codecs: Iterable[Codec]) -> Self:
269+
"""Creates a codec pipeline from an iterable of codecs.
271270
272271
Parameters
273272
----------
274-
codecs : list[Codec]
273+
codecs : Iterable[Codec]
275274
276275
Returns
277276
-------
@@ -402,15 +401,6 @@ async def write(
402401
"""
403402
...
404403

405-
@classmethod
406-
def from_dict(cls, data: Iterable[JSON | Codec]) -> Self:
407-
"""
408-
Create an instance of the model from a dictionary
409-
"""
410-
...
411-
412-
return cls(**data)
413-
414404

415405
async def _batching_helper(
416406
func: Callable[[CodecInput, ArraySpec], Awaitable[CodecOutput | None]],

src/zarr/abc/metadata.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515

1616
@dataclass(frozen=True)
1717
class Metadata:
18-
def to_dict(self) -> JSON:
18+
def to_dict(self) -> dict[str, JSON]:
1919
"""
2020
Recursively serialize this model to a dictionary.
2121
This method inspects the fields of self and calls `x.to_dict()` for any fields that

src/zarr/codecs/_v2.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ def compute_encoded_size(self, _input_byte_length: int, _chunk_spec: ArraySpec)
6767

6868
@dataclass(frozen=True)
6969
class V2Filters(ArrayArrayCodec):
70-
filters: list[dict[str, JSON]]
70+
filters: tuple[numcodecs.abc.Codec, ...] | None
7171

7272
is_fixed_size = False
7373

@@ -79,8 +79,7 @@ async def _decode_single(
7979
chunk_ndarray = chunk_array.as_ndarray_like()
8080
# apply filters in reverse order
8181
if self.filters is not None:
82-
for filter_metadata in self.filters[::-1]:
83-
filter = numcodecs.get_codec(filter_metadata)
82+
for filter in self.filters[::-1]:
8483
chunk_ndarray = await to_thread(filter.decode, chunk_ndarray)
8584

8685
# ensure correct chunk shape
@@ -99,9 +98,9 @@ async def _encode_single(
9998
) -> NDBuffer | None:
10099
chunk_ndarray = chunk_array.as_ndarray_like().ravel(order=chunk_spec.order)
101100

102-
for filter_metadata in self.filters:
103-
filter = numcodecs.get_codec(filter_metadata)
104-
chunk_ndarray = await to_thread(filter.encode, chunk_ndarray)
101+
if self.filters is not None:
102+
for filter in self.filters:
103+
chunk_ndarray = await to_thread(filter.encode, chunk_ndarray)
105104

106105
return get_ndbuffer_class().from_ndarray_like(chunk_ndarray)
107106

src/zarr/codecs/blosc.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -127,9 +127,9 @@ def to_dict(self) -> dict[str, JSON]:
127127
"name": "blosc",
128128
"configuration": {
129129
"typesize": self.typesize,
130-
"cname": self.cname,
130+
"cname": self.cname.value,
131131
"clevel": self.clevel,
132-
"shuffle": self.shuffle,
132+
"shuffle": self.shuffle.value,
133133
"blocksize": self.blocksize,
134134
},
135135
}

src/zarr/codecs/bytes.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ def to_dict(self) -> dict[str, JSON]:
5353
if self.endian is None:
5454
return {"name": "bytes"}
5555
else:
56-
return {"name": "bytes", "configuration": {"endian": self.endian}}
56+
return {"name": "bytes", "configuration": {"endian": self.endian.value}}
5757

5858
def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
5959
if array_spec.dtype.itemsize == 0:

src/zarr/codecs/pipeline.py

Lines changed: 6 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
from __future__ import annotations
22

3-
from collections.abc import Iterable, Iterator
43
from dataclasses import dataclass
54
from itertools import islice, pairwise
65
from typing import TYPE_CHECKING, Any, TypeVar
@@ -15,12 +14,14 @@
1514
Codec,
1615
CodecPipeline,
1716
)
18-
from zarr.core.common import JSON, ChunkCoords, concurrent_map, parse_named_configuration
17+
from zarr.core.common import ChunkCoords, concurrent_map
1918
from zarr.core.config import config
2019
from zarr.core.indexing import SelectorTuple, is_scalar, is_total_slice
21-
from zarr.registry import get_codec_class, register_pipeline
20+
from zarr.registry import register_pipeline
2221

2322
if TYPE_CHECKING:
23+
from collections.abc import Iterable, Iterator
24+
2425
import numpy as np
2526
from typing_extensions import Self
2627

@@ -68,30 +69,11 @@ class BatchedCodecPipeline(CodecPipeline):
6869
bytes_bytes_codecs: tuple[BytesBytesCodec, ...]
6970
batch_size: int
7071

71-
@classmethod
72-
def from_dict(cls, data: Iterable[JSON | Codec], *, batch_size: int | None = None) -> Self:
73-
out: list[Codec] = []
74-
if not isinstance(data, Iterable):
75-
raise TypeError(f"Expected iterable, got {type(data)}")
76-
77-
for c in data:
78-
if isinstance(
79-
c, ArrayArrayCodec | ArrayBytesCodec | BytesBytesCodec
80-
): # Can't use Codec here because of mypy limitation
81-
out.append(c)
82-
else:
83-
name_parsed, _ = parse_named_configuration(c, require_configuration=False)
84-
out.append(get_codec_class(name_parsed).from_dict(c)) # type: ignore[arg-type]
85-
return cls.from_list(out, batch_size=batch_size)
86-
87-
def to_dict(self) -> JSON:
88-
return [c.to_dict() for c in self]
89-
9072
def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
91-
return type(self).from_list([c.evolve_from_array_spec(array_spec=array_spec) for c in self])
73+
return type(self).from_codecs(c.evolve_from_array_spec(array_spec=array_spec) for c in self)
9274

9375
@classmethod
94-
def from_list(cls, codecs: Iterable[Codec], *, batch_size: int | None = None) -> Self:
76+
def from_codecs(cls, codecs: Iterable[Codec], *, batch_size: int | None = None) -> Self:
9577
array_array_codecs, array_bytes_codec, bytes_bytes_codecs = codecs_from_list(codecs)
9678

9779
return cls(

src/zarr/codecs/sharding.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ class ShardingCodecIndexLocation(Enum):
6868
end = "end"
6969

7070

71-
def parse_index_location(data: JSON) -> ShardingCodecIndexLocation:
71+
def parse_index_location(data: object) -> ShardingCodecIndexLocation:
7272
return parse_enum(data, ShardingCodecIndexLocation)
7373

7474

@@ -333,7 +333,7 @@ def __init__(
333333
chunk_shape: ChunkCoordsLike,
334334
codecs: Iterable[Codec | dict[str, JSON]] = (BytesCodec(),),
335335
index_codecs: Iterable[Codec | dict[str, JSON]] = (BytesCodec(), Crc32cCodec()),
336-
index_location: ShardingCodecIndexLocation = ShardingCodecIndexLocation.end,
336+
index_location: ShardingCodecIndexLocation | str = ShardingCodecIndexLocation.end,
337337
) -> None:
338338
chunk_shape_parsed = parse_shapelike(chunk_shape)
339339
codecs_parsed = parse_codecs(codecs)
@@ -373,16 +373,16 @@ def from_dict(cls, data: dict[str, JSON]) -> Self:
373373

374374
@property
375375
def codec_pipeline(self) -> CodecPipeline:
376-
return get_pipeline_class().from_list(self.codecs)
376+
return get_pipeline_class().from_codecs(self.codecs)
377377

378378
def to_dict(self) -> dict[str, JSON]:
379379
return {
380380
"name": "sharding_indexed",
381381
"configuration": {
382-
"chunk_shape": list(self.chunk_shape),
383-
"codecs": [s.to_dict() for s in self.codecs],
384-
"index_codecs": [s.to_dict() for s in self.index_codecs],
385-
"index_location": self.index_location,
382+
"chunk_shape": self.chunk_shape,
383+
"codecs": tuple([s.to_dict() for s in self.codecs]),
384+
"index_codecs": tuple([s.to_dict() for s in self.index_codecs]),
385+
"index_location": self.index_location.value,
386386
},
387387
}
388388

@@ -620,7 +620,7 @@ async def _decode_shard_index(
620620
index_array = next(
621621
iter(
622622
await get_pipeline_class()
623-
.from_list(self.index_codecs)
623+
.from_codecs(self.index_codecs)
624624
.decode(
625625
[(index_bytes, self._get_index_chunk_spec(chunks_per_shard))],
626626
)
@@ -633,7 +633,7 @@ async def _encode_shard_index(self, index: _ShardIndex) -> Buffer:
633633
index_bytes = next(
634634
iter(
635635
await get_pipeline_class()
636-
.from_list(self.index_codecs)
636+
.from_codecs(self.index_codecs)
637637
.encode(
638638
[
639639
(
@@ -651,7 +651,7 @@ async def _encode_shard_index(self, index: _ShardIndex) -> Buffer:
651651
def _shard_index_size(self, chunks_per_shard: ChunkCoords) -> int:
652652
return (
653653
get_pipeline_class()
654-
.from_list(self.index_codecs)
654+
.from_codecs(self.index_codecs)
655655
.compute_encoded_size(
656656
16 * product(chunks_per_shard), self._get_index_chunk_spec(chunks_per_shard)
657657
)

src/zarr/codecs/transpose.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ def from_dict(cls, data: dict[str, JSON]) -> Self:
4545
return cls(**configuration_parsed) # type: ignore[arg-type]
4646

4747
def to_dict(self) -> dict[str, JSON]:
48-
return {"name": "transpose", "configuration": {"order": list(self.order)}}
48+
return {"name": "transpose", "configuration": {"order": tuple(self.order)}}
4949

5050
def validate(self, shape: tuple[int, ...], dtype: np.dtype[Any], chunk_grid: ChunkGrid) -> None:
5151
if len(self.order) != len(shape):

src/zarr/core/array.py

Lines changed: 6 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -87,10 +87,10 @@ def parse_array_metadata(data: Any) -> ArrayV2Metadata | ArrayV3Metadata:
8787

8888
def create_codec_pipeline(metadata: ArrayV2Metadata | ArrayV3Metadata) -> CodecPipeline:
8989
if isinstance(metadata, ArrayV3Metadata):
90-
return get_pipeline_class().from_list(metadata.codecs)
90+
return get_pipeline_class().from_codecs(metadata.codecs)
9191
elif isinstance(metadata, ArrayV2Metadata):
92-
return get_pipeline_class().from_list(
93-
[V2Filters(metadata.filters or []), V2Compressor(metadata.compressor)]
92+
return get_pipeline_class().from_codecs(
93+
[V2Filters(metadata.filters), V2Compressor(metadata.compressor)]
9494
)
9595
else:
9696
raise TypeError
@@ -299,8 +299,6 @@ async def _create_v2(
299299
attributes: dict[str, JSON] | None = None,
300300
exists_ok: bool = False,
301301
) -> AsyncArray:
302-
import numcodecs
303-
304302
if not exists_ok:
305303
await ensure_no_existing_node(store_path, zarr_format=2)
306304
if order is None:
@@ -315,15 +313,9 @@ async def _create_v2(
315313
chunks=chunks,
316314
order=order,
317315
dimension_separator=dimension_separator,
318-
fill_value=0 if fill_value is None else fill_value,
319-
compressor=(
320-
numcodecs.get_codec(compressor).get_config() if compressor is not None else None
321-
),
322-
filters=(
323-
[numcodecs.get_codec(filter).get_config() for filter in filters]
324-
if filters is not None
325-
else None
326-
),
316+
fill_value=fill_value,
317+
compressor=compressor,
318+
filters=filters,
327319
attributes=attributes,
328320
)
329321
array = cls(metadata=metadata, store_path=store_path)

src/zarr/core/common.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import contextvars
55
import functools
66
import operator
7-
from collections.abc import Iterable
7+
from collections.abc import Iterable, Mapping
88
from enum import Enum
99
from typing import (
1010
TYPE_CHECKING,
@@ -32,7 +32,7 @@
3232
ChunkCoords = tuple[int, ...]
3333
ChunkCoordsLike = Iterable[int]
3434
ZarrFormat = Literal[2, 3]
35-
JSON = None | str | int | float | Enum | dict[str, "JSON"] | list["JSON"] | tuple["JSON", ...]
35+
JSON = None | str | int | float | Mapping[str, "JSON"] | tuple["JSON", ...]
3636
MemoryOrder = Literal["C", "F"]
3737
AccessModeLiteral = Literal["r", "r+", "a", "w", "w-"]
3838

@@ -80,7 +80,7 @@ def enum_names(enum: type[E]) -> Iterator[str]:
8080
yield item.name
8181

8282

83-
def parse_enum(data: JSON, cls: type[E]) -> E:
83+
def parse_enum(data: object, cls: type[E]) -> E:
8484
if isinstance(data, cls):
8585
return data
8686
if not isinstance(data, str):

0 commit comments

Comments (0)