From 3f14d2ab8933d6657d51a75f9ee56f5427afe40c Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Sun, 5 Jan 2025 13:06:03 +0100 Subject: [PATCH 1/9] adds filters, serializer, compressors properties to Array --- src/zarr/api/synchronous.py | 2 +- src/zarr/core/_info.py | 2 +- src/zarr/core/array.py | 89 +++++++++++++++++++++++++++++++++++- src/zarr/core/group.py | 6 +-- src/zarr/core/metadata/v3.py | 2 +- tests/test_array.py | 74 +++++++++++++++++++----------- tests/test_info.py | 8 ++-- 7 files changed, 146 insertions(+), 37 deletions(-) diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index 200db9ec26..d0b91b373f 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -802,7 +802,7 @@ def create_array( Use ``None`` to omit default filters. compressors : Iterable[Codec], optional List of compressors to apply to the array. Compressors are applied in order, and after any - filters are applied (if any are specified). + filters are applied (if any are specified) and the data is serialized into bytes. For Zarr v3, a "compressor" is a codec that takes a bytestrea, and returns another bytestream. Multiple compressors my be provided for Zarr v3. diff --git a/src/zarr/core/_info.py b/src/zarr/core/_info.py index 807e940508..9115baefb4 100644 --- a/src/zarr/core/_info.py +++ b/src/zarr/core/_info.py @@ -87,7 +87,7 @@ class ArrayInfo: _store_type: str _compressor: numcodecs.abc.Codec | None = None _filters: tuple[numcodecs.abc.Codec, ...] | None = None - _codecs: list[Codec] | None = None + _codecs: tuple[Codec, ...] | None = None _count_bytes: int | None = None _count_bytes_stored: int | None = None _count_chunks_initialized: int | None = None diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index e5c4e4538c..021c1acc5d 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -20,6 +20,7 @@ from warnings import warn import numcodecs +import numcodecs.abc import numpy as np import numpy.typing as npt from typing_extensions import deprecated @@ -911,6 +912,57 @@ def size(self) -> int: """ return np.prod(self.metadata.shape).item() + @property + def filters(self) -> tuple[numcodecs.abc.Codec, ...] | tuple[ArrayArrayCodec, ...]: + """ + Filters that are applied to each chunk of the array, in order, before serializing that + chunk to bytes. + """ + if self.metadata.zarr_format == 2: + filters = self.metadata.filters + if filters is None: + return () + return filters + + return tuple(codec for codec in self.metadata.codecs if isinstance(codec, ArrayArrayCodec)) + + @property + def serializer(self) -> ArrayBytesCodec | None: + """ + Array-to-bytes codec to use for serializing the chunks into bytes. + """ + if self.metadata.zarr_format == 2: + return None + + return next(codec for codec in self.metadata.codecs if isinstance(codec, ArrayBytesCodec)) + + @property + @deprecated("Use AsyncArray.compressors instead.") + def compressor(self) -> numcodecs.abc.Codec | None: + """ + Compressor that is applied to each chunk of the array. + + .. deprecated:: 3.0.0 + `array.compressor` is deprecated and will be removed in a future release. + Use `array.compressors` instead. + """ + if self.metadata.zarr_format == 2: + return self.metadata.compressor + raise TypeError("`compressor` is not available for Zarr format 3 arrays.") + + @property + def compressors(self) -> tuple[numcodecs.abc.Codec, ...] | tuple[BytesBytesCodec, ...]: + """ + Compressors that are applied to each chunk of the array. Compressors are applied in order, and after any + filters are applied (if any are specified) and the data is serialized into bytes. + """ + if self.metadata.zarr_format == 2: + if self.metadata.compressor is not None: + return (self.metadata.compressor,) + return () + + return tuple(codec for codec in self.metadata.codecs if isinstance(codec, BytesBytesCodec)) + @property def dtype(self) -> np.dtype[Any]: """Returns the data type of the array. @@ -1967,6 +2019,41 @@ def read_only(self) -> bool: def fill_value(self) -> Any: return self.metadata.fill_value + @property + def filters(self) -> tuple[numcodecs.abc.Codec, ...] | tuple[ArrayArrayCodec, ...]: + """ + Filters that are applied to each chunk of the array, in order, before serializing that + chunk to bytes. + """ + return self._async_array.filters + + @property + def serializer(self) -> None | ArrayBytesCodec: + """ + Array-to-bytes codec to use for serializing the chunks into bytes. + """ + return self._async_array.serializer + + @property + @deprecated("Use Array.compressors instead.") + def compressor(self) -> numcodecs.abc.Codec | None: + """ + Compressor that is applied to each chunk of the array. + + .. deprecated:: 3.0.0 + `array.compressor` is deprecated and will be removed in a future release. + Use `array.compressors` instead. + """ + return self._async_array.compressor + + @property + def compressors(self) -> tuple[numcodecs.abc.Codec, ...] | tuple[BytesBytesCodec, ...]: + """ + Compressors that are applied to each chunk of the array. Compressors are applied in order, and after any + filters are applied (if any are specified) and the data is serialized into bytes. + """ + return self._async_array.compressors + @property def cdata_shape(self) -> ChunkCoords: """ @@ -3710,7 +3797,7 @@ async def create_array( Use ``None`` to omit default filters. compressors : Iterable[Codec], optional List of compressors to apply to the array. Compressors are applied in order, and after any - filters are applied (if any are specified). + filters are applied (if any are specified) and the data is serialized into bytes. For Zarr v3, a "compressor" is a codec that takes a bytestrea, and returns another bytestream. Multiple compressors my be provided for Zarr v3. diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index a4503ce64e..4ee230cb53 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -1065,7 +1065,7 @@ async def create_array( Use ``None`` to omit default filters. compressors : Iterable[Codec], optional List of compressors to apply to the array. Compressors are applied in order, and after any - filters are applied (if any are specified). + filters are applied (if any are specified) and the data is serialized into bytes. For Zarr v3, a "compressor" is a codec that takes a bytestrea, and returns another bytestream. Multiple compressors my be provided for Zarr v3. @@ -2321,7 +2321,7 @@ def create_array( Use ``None`` to omit default filters. compressors : Iterable[Codec], optional List of compressors to apply to the array. Compressors are applied in order, and after any - filters are applied (if any are specified). + filters are applied (if any are specified) and the data is serialized into bytes. For Zarr v3, a "compressor" is a codec that takes a bytestrea, and returns another bytestream. Multiple compressors my be provided for Zarr v3. @@ -2710,7 +2710,7 @@ def array( Use ``None`` to omit default filters. compressors : Iterable[Codec], optional List of compressors to apply to the array. Compressors are applied in order, and after any - filters are applied (if any are specified). + filters are applied (if any are specified) and the data is serialized into bytes. For Zarr v3, a "compressor" is a codec that takes a bytestrea, and returns another bytestream. Multiple compressors my be provided for Zarr v3. diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 0821dd9bc9..9760301f82 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -254,7 +254,7 @@ def __init__( config=ArrayConfig.from_dict({}), # TODO: config is not needed here. prototype=default_buffer_prototype(), # TODO: prototype is not needed here. ) - codecs_parsed = [c.evolve_from_array_spec(array_spec) for c in codecs_parsed_partial] + codecs_parsed = tuple(c.evolve_from_array_spec(array_spec) for c in codecs_parsed_partial) validate_codecs(codecs_parsed_partial, data_type_parsed) object.__setattr__(self, "shape", shape_parsed) diff --git a/tests/test_array.py b/tests/test_array.py index 628b873e72..8f1c427f15 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -510,9 +510,9 @@ def test_info_v3(self, chunks: tuple[int, int], shards: tuple[int, int] | None) _order="C", _read_only=False, _store_type="MemoryStore", - _codecs=[BytesCodec(), ZstdCodec()] + _codecs=(BytesCodec(), ZstdCodec()) if shards is None - else [ShardingCodec(chunk_shape=chunks, codecs=[BytesCodec(), ZstdCodec()])], + else (ShardingCodec(chunk_shape=chunks, codecs=[BytesCodec(), ZstdCodec()]),), _count_bytes=512, ) assert result == expected @@ -536,7 +536,7 @@ def test_info_complete(self, chunks: tuple[int, int], shards: tuple[int, int] | _order="C", _read_only=False, _store_type="MemoryStore", - _codecs=[BytesCodec()] if shards is None else [ShardingCodec(chunk_shape=chunks)], + _codecs=(BytesCodec(),) if shards is None else (ShardingCodec(chunk_shape=chunks),), _count_bytes=512, _count_chunks_initialized=0, _count_bytes_stored=373 if shards is None else 578, # the metadata? @@ -596,9 +596,9 @@ async def test_info_v3_async( _order="C", _read_only=False, _store_type="MemoryStore", - _codecs=[BytesCodec(), ZstdCodec()] + _codecs=(BytesCodec(), ZstdCodec()) if shards is None - else [ShardingCodec(chunk_shape=chunks, codecs=[BytesCodec(), ZstdCodec()])], + else (ShardingCodec(chunk_shape=chunks, codecs=[BytesCodec(), ZstdCodec()]),), _count_bytes=512, ) assert result == expected @@ -624,7 +624,7 @@ async def test_info_complete_async( _order="C", _read_only=False, _store_type="MemoryStore", - _codecs=[BytesCodec()] if shards is None else [ShardingCodec(chunk_shape=chunks)], + _codecs=(BytesCodec(),) if shards is None else (ShardingCodec(chunk_shape=chunks),), _count_bytes=512, _count_chunks_initialized=0, _count_bytes_stored=373 if shards is None else 578, # the metadata? @@ -839,7 +839,8 @@ def test_array_create_metadata_order_v2( arr = zarr.create_array(store=store, shape=(2, 2), order=order, zarr_format=2, dtype="i4") expected = order or zarr.config.get("array.order") - assert arr.metadata.order == expected # type: ignore[union-attr] + assert arr.metadata.zarr_format == 2 # guard for mypy + assert arr.metadata.order == expected @pytest.mark.parametrize("order_config", ["C", "F", None]) @@ -1048,10 +1049,15 @@ async def test_create_array_no_filters_compressors( compressors=empty_value, filters=empty_value, ) + # Test metadata explicitly + assert arr.metadata.zarr_format == 2 # guard for mypy # The v2 metadata stores None and () separately - assert arr.metadata.filters == empty_value # type: ignore[union-attr] + assert arr.metadata.filters == empty_value # The v2 metadata does not allow tuple for compressor, therefore it is turned into None - assert arr.metadata.compressor is None # type: ignore[union-attr] + assert arr.metadata.compressor is None + + assert arr.filters == () + assert arr.compressors == () # v3 arr = await create_array( @@ -1061,10 +1067,13 @@ async def test_create_array_no_filters_compressors( compressors=empty_value, filters=empty_value, ) + assert arr.metadata.zarr_format == 3 # guard for mypy if dtype == "str": - assert arr.metadata.codecs == [VLenUTF8Codec()] # type: ignore[union-attr] + assert arr.metadata.codecs == (VLenUTF8Codec(),) + assert arr.serializer == VLenUTF8Codec() else: - assert arr.metadata.codecs == [BytesCodec()] # type: ignore[union-attr] + assert arr.metadata.codecs == (BytesCodec(),) + assert arr.serializer == BytesCodec() @pytest.mark.parametrize("store", ["memory"], indirect=True) @@ -1130,12 +1139,11 @@ async def test_create_array_v3_chunk_encoding( filters=filters, compressors=compressors, ) - aa_codecs_expected, _, bb_codecs_expected = _parse_chunk_encoding_v3( + filters_expected, _, compressors_expected = _parse_chunk_encoding_v3( filters=filters, compressors=compressors, serializer="auto", dtype=np.dtype(dtype) ) - # TODO: find a better way to get the filters / compressors from the array. - assert arr.codec_pipeline.array_array_codecs == aa_codecs_expected # type: ignore[attr-defined] - assert arr.codec_pipeline.bytes_bytes_codecs == bb_codecs_expected # type: ignore[attr-defined] + assert arr.filters == filters_expected + assert arr.compressors == compressors_expected @pytest.mark.parametrize("store", ["memory"], indirect=True) @@ -1167,9 +1175,16 @@ async def test_create_array_v2_chunk_encoding( filters_expected, compressor_expected = _parse_chunk_encoding_v2( filters=filters, compressor=compressors, dtype=np.dtype(dtype) ) - # TODO: find a better way to get the filters/compressor from the array. - assert arr.metadata.compressor == compressor_expected # type: ignore[union-attr] - assert arr.metadata.filters == filters_expected # type: ignore[union-attr] + assert arr.metadata.zarr_format == 2 # guard for mypy + assert arr.metadata.compressor == compressor_expected + assert arr.metadata.filters == filters_expected + + # Normalize for property getters + compressor_expected = () if compressor_expected is None else (compressor_expected,) + filters_expected = () if filters_expected is None else filters_expected + + assert arr.compressors == compressor_expected + assert arr.filters == filters_expected @pytest.mark.parametrize("store", ["memory"], indirect=True) @@ -1185,12 +1200,12 @@ async def test_create_array_v3_default_filters_compressors(store: MemoryStore, d shape=(10,), zarr_format=3, ) - expected_aa, expected_ab, expected_bb = _get_default_chunk_encoding_v3(np_dtype=np.dtype(dtype)) - # TODO: define the codec pipeline class such that these fields are required, which will obviate the - # type ignore statements - assert arr.codec_pipeline.array_array_codecs == expected_aa # type: ignore[attr-defined] - assert arr.codec_pipeline.bytes_bytes_codecs == expected_bb # type: ignore[attr-defined] - assert arr.codec_pipeline.array_bytes_codec == expected_ab # type: ignore[attr-defined] + expected_filters, expected_serializer, expected_compressors = _get_default_chunk_encoding_v3( + np_dtype=np.dtype(dtype) + ) + assert arr.filters == expected_filters + assert arr.serializer == expected_serializer + assert arr.compressors == expected_compressors @pytest.mark.parametrize("store", ["memory"], indirect=True) @@ -1209,8 +1224,15 @@ async def test_create_array_v2_default_filters_compressors(store: MemoryStore, d expected_filters, expected_compressors = _get_default_chunk_encoding_v2( np_dtype=np.dtype(dtype) ) - assert arr.metadata.filters == expected_filters # type: ignore[union-attr] - assert arr.metadata.compressor == expected_compressors # type: ignore[union-attr] + assert arr.metadata.zarr_format == 2 # guard for mypy + assert arr.metadata.filters == expected_filters + assert arr.metadata.compressor == expected_compressors + + # Normalize for property getters + expected_filters = () if expected_filters is None else expected_filters + expected_compressors = () if expected_compressors is None else (expected_compressors,) + assert arr.filters == expected_filters + assert arr.compressors == expected_compressors @pytest.mark.parametrize("store", ["memory"], indirect=True) diff --git a/tests/test_info.py b/tests/test_info.py index 5d9264aa13..c567e9b042 100644 --- a/tests/test_info.py +++ b/tests/test_info.py @@ -59,7 +59,7 @@ def test_array_info(zarr_format: ZarrFormat) -> None: _order="C", _read_only=True, _store_type="MemoryStore", - _codecs=[BytesCodec()], + _codecs=(BytesCodec(),), ) result = repr(info) assert result == textwrap.dedent(f"""\ @@ -71,7 +71,7 @@ def test_array_info(zarr_format: ZarrFormat) -> None: Order : C Read-only : True Store type : MemoryStore - Codecs : [{{'endian': }}]""") + Codecs : ({{'endian': }},)""") @pytest.mark.parametrize("zarr_format", ZARR_FORMATS) @@ -95,7 +95,7 @@ def test_array_info_complete( _order="C", _read_only=True, _store_type="MemoryStore", - _codecs=[BytesCodec()], + _codecs=(BytesCodec(),), _count_bytes=count_bytes, _count_bytes_stored=count_bytes_stored, _count_chunks_initialized=count_chunks_initialized, @@ -110,7 +110,7 @@ def test_array_info_complete( Order : C Read-only : True Store type : MemoryStore - Codecs : [{{'endian': }}] + Codecs : ({{'endian': }},) No. bytes : {count_bytes} ({count_bytes_formatted}) No. bytes stored : {count_bytes_stored_formatted} Storage ratio : {storage_ratio_formatted} From 3bf61c504584aa72336e6799c6455f32efc7c02f Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Sun, 5 Jan 2025 13:35:36 +0100 Subject: [PATCH 2/9] adapt Array.info --- src/zarr/core/_info.py | 24 ++++++++++++----------- src/zarr/core/array.py | 37 ++++++++++++++++++------------------ src/zarr/core/metadata/v3.py | 26 +++++++++++++++++++++---- tests/test_array.py | 31 +++++++++++++++++------------- tests/test_config.py | 8 ++++---- tests/test_info.py | 8 ++++---- 6 files changed, 80 insertions(+), 54 deletions(-) diff --git a/src/zarr/core/_info.py b/src/zarr/core/_info.py index 9115baefb4..6939e0a40b 100644 --- a/src/zarr/core/_info.py +++ b/src/zarr/core/_info.py @@ -5,7 +5,7 @@ import numcodecs.abc import numpy as np -from zarr.abc.codec import Codec +from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec from zarr.core.common import ZarrFormat from zarr.core.metadata.v3 import DataType @@ -85,9 +85,9 @@ class ArrayInfo: _order: Literal["C", "F"] _read_only: bool _store_type: str - _compressor: numcodecs.abc.Codec | None = None - _filters: tuple[numcodecs.abc.Codec, ...] | None = None - _codecs: tuple[Codec, ...] | None = None + _filters: tuple[numcodecs.abc.Codec, ...] | tuple[ArrayArrayCodec, ...] = () + _serializer: ArrayBytesCodec | None = None + _compressors: tuple[numcodecs.abc.Codec, ...] | tuple[BytesBytesCodec, ...] = () _count_bytes: int | None = None _count_bytes_stored: int | None = None _count_chunks_initialized: int | None = None @@ -113,14 +113,13 @@ def __repr__(self) -> str: if self._chunk_shape is None: # for non-regular chunk grids kwargs["chunk_shape"] = "" - if self._compressor is not None: - template += "\nCompressor : {_compressor}" - if self._filters is not None: + if len(self._filters) > 0: template += "\nFilters : {_filters}" - - if self._codecs is not None: - template += "\nCodecs : {_codecs}" + if self._serializer is not None: + template += "\nSerializer : {_serializer}" + if len(self._compressors) > 0: + template += "\nCompressors : {_compressors}" if self._count_bytes is not None: template += "\nNo. bytes : {_count_bytes}" @@ -139,5 +138,8 @@ def __repr__(self) -> str: kwargs["_storage_ratio"] = f"{self._count_bytes / self._count_bytes_stored:.1f}" if self._count_chunks_initialized is not None: - template += "\nChunks Initialized : {_count_chunks_initialized}" + if self._shard_shape is not None: + template += "\nShards Initialized : {_count_chunks_initialized}" + else: + template += "\nChunks Initialized : {_count_chunks_initialized}" return template.format(**kwargs) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 021c1acc5d..90c568045a 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -924,7 +924,9 @@ def filters(self) -> tuple[numcodecs.abc.Codec, ...] | tuple[ArrayArrayCodec, .. return () return filters - return tuple(codec for codec in self.metadata.codecs if isinstance(codec, ArrayArrayCodec)) + return tuple( + codec for codec in self.metadata.inner_codecs if isinstance(codec, ArrayArrayCodec) + ) @property def serializer(self) -> ArrayBytesCodec | None: @@ -934,7 +936,9 @@ def serializer(self) -> ArrayBytesCodec | None: if self.metadata.zarr_format == 2: return None - return next(codec for codec in self.metadata.codecs if isinstance(codec, ArrayBytesCodec)) + return next( + codec for codec in self.metadata.inner_codecs if isinstance(codec, ArrayBytesCodec) + ) @property @deprecated("Use AsyncArray.compressors instead.") @@ -961,7 +965,9 @@ def compressors(self) -> tuple[numcodecs.abc.Codec, ...] | tuple[BytesBytesCodec return (self.metadata.compressor,) return () - return tuple(codec for codec in self.metadata.codecs if isinstance(codec, BytesBytesCodec)) + return tuple( + codec for codec in self.metadata.inner_codecs if isinstance(codec, BytesBytesCodec) + ) @property def dtype(self) -> np.dtype[Any]: @@ -1613,31 +1619,26 @@ async def info_complete(self) -> Any: def _info( self, count_chunks_initialized: int | None = None, count_bytes_stored: int | None = None ) -> Any: - kwargs: dict[str, Any] = {} - if self.metadata.zarr_format == 2: - assert isinstance(self.metadata, ArrayV2Metadata) - if self.metadata.compressor is not None: - kwargs["_compressor"] = self.metadata.compressor - if self.metadata.filters is not None: - kwargs["_filters"] = self.metadata.filters - kwargs["_data_type"] = self.metadata.dtype - kwargs["_chunk_shape"] = self.metadata.chunks + _data_type: np.dtype[Any] | DataType + if isinstance(self.metadata, ArrayV2Metadata): + _data_type = self.metadata.dtype else: - kwargs["_codecs"] = self.metadata.codecs - kwargs["_data_type"] = self.metadata.data_type - kwargs["_chunk_shape"] = self.chunks - kwargs["_shard_shape"] = self.shards - + _data_type = self.metadata.data_type return ArrayInfo( _zarr_format=self.metadata.zarr_format, + _data_type=_data_type, _shape=self.shape, _order=self.order, + _shard_shape=self.shards, + _chunk_shape=self.chunks, _read_only=self.read_only, + _compressors=self.compressors, + _filters=self.filters, + _serializer=self.serializer, _store_type=type(self.store_path.store).__name__, _count_bytes=self.nbytes, _count_bytes_stored=count_bytes_stored, _count_chunks_initialized=count_chunks_initialized, - **kwargs, ) diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 9760301f82..4fc3441581 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -81,9 +81,7 @@ def parse_codecs(data: object) -> tuple[Codec, ...]: return out -def validate_codecs(codecs: tuple[Codec, ...], dtype: DataType) -> None: - """Check that the codecs are valid for the given dtype""" - +def validate_array_bytes_codec(codecs: tuple[Codec, ...]) -> ArrayBytesCodec: # ensure that we have at least one ArrayBytesCodec abcs: list[ArrayBytesCodec] = [codec for codec in codecs if isinstance(codec, ArrayBytesCodec)] if len(abcs) == 0: @@ -91,7 +89,18 @@ def validate_codecs(codecs: tuple[Codec, ...], dtype: DataType) -> None: elif len(abcs) > 1: raise ValueError("Only one ArrayBytesCodec is allowed.") - abc = abcs[0] + return abcs[0] + + +def validate_codecs(codecs: tuple[Codec, ...], dtype: DataType) -> None: + """Check that the codecs are valid for the given dtype""" + from zarr.codecs.sharding import ShardingCodec + + abc = validate_array_bytes_codec(codecs) + + # Recursively resolve array-bytes codecs within sharding codecs + while isinstance(abc, ShardingCodec): + abc = validate_array_bytes_codec(abc.codecs) # we need to have special codecs if we are decoding vlen strings or bytestrings # TODO: use codec ID instead of class name @@ -330,6 +339,15 @@ def shards(self) -> ChunkCoords | None: ) raise NotImplementedError(msg) + @property + def inner_codecs(self) -> tuple[Codec, ...]: + if isinstance(self.chunk_grid, RegularChunkGrid): + from zarr.codecs.sharding import ShardingCodec + + if len(self.codecs) == 1 and isinstance(self.codecs[0], ShardingCodec): + return self.codecs[0].codecs + return self.codecs + def get_chunk_spec( self, _chunk_coords: ChunkCoords, array_config: ArrayConfig, prototype: BufferPrototype ) -> ArraySpec: diff --git a/tests/test_array.py b/tests/test_array.py index 8f1c427f15..e6fdaa5c14 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -20,7 +20,6 @@ VLenUTF8Codec, ZstdCodec, ) -from zarr.codecs.sharding import ShardingCodec from zarr.core._info import ArrayInfo from zarr.core.array import ( CompressorsLike, @@ -494,7 +493,7 @@ def test_info_v2(self, chunks: tuple[int, int], shards: tuple[int, int] | None) _read_only=False, _store_type="MemoryStore", _count_bytes=512, - _compressor=numcodecs.Zstd(), + _compressors=(numcodecs.Zstd(),), ) assert result == expected @@ -510,9 +509,8 @@ def test_info_v3(self, chunks: tuple[int, int], shards: tuple[int, int] | None) _order="C", _read_only=False, _store_type="MemoryStore", - _codecs=(BytesCodec(), ZstdCodec()) - if shards is None - else (ShardingCodec(chunk_shape=chunks, codecs=[BytesCodec(), ZstdCodec()]),), + _compressors=(ZstdCodec(),), + _serializer=BytesCodec(), _count_bytes=512, ) assert result == expected @@ -536,7 +534,7 @@ def test_info_complete(self, chunks: tuple[int, int], shards: tuple[int, int] | _order="C", _read_only=False, _store_type="MemoryStore", - _codecs=(BytesCodec(),) if shards is None else (ShardingCodec(chunk_shape=chunks),), + _serializer=BytesCodec(), _count_bytes=512, _count_chunks_initialized=0, _count_bytes_stored=373 if shards is None else 578, # the metadata? @@ -572,7 +570,7 @@ async def test_info_v2_async( _read_only=False, _store_type="MemoryStore", _count_bytes=512, - _compressor=numcodecs.Zstd(), + _compressors=(numcodecs.Zstd(),), ) assert result == expected @@ -596,9 +594,8 @@ async def test_info_v3_async( _order="C", _read_only=False, _store_type="MemoryStore", - _codecs=(BytesCodec(), ZstdCodec()) - if shards is None - else (ShardingCodec(chunk_shape=chunks, codecs=[BytesCodec(), ZstdCodec()]),), + _compressors=(ZstdCodec(),), + _serializer=BytesCodec(), _count_bytes=512, ) assert result == expected @@ -624,7 +621,7 @@ async def test_info_complete_async( _order="C", _read_only=False, _store_type="MemoryStore", - _codecs=(BytesCodec(),) if shards is None else (ShardingCodec(chunk_shape=chunks),), + _serializer=BytesCodec(), _count_bytes=512, _count_chunks_initialized=0, _count_bytes_stored=373 if shards is None else 578, # the metadata? @@ -1125,8 +1122,14 @@ async def test_create_array_no_filters_compressors( ({"name": "transpose", "configuration": {"order": [0]}},), ], ) +@pytest.mark.parametrize(("chunks", "shards"), [((6,), None), ((3,), (6,))]) async def test_create_array_v3_chunk_encoding( - store: MemoryStore, compressors: CompressorsLike, filters: FiltersLike, dtype: str + store: MemoryStore, + compressors: CompressorsLike, + filters: FiltersLike, + dtype: str, + chunks: tuple[int, ...], + shards: tuple[int, ...] | None, ) -> None: """ Test various possibilities for the compressors and filters parameter to create_array @@ -1134,7 +1137,9 @@ async def test_create_array_v3_chunk_encoding( arr = await create_array( store=store, dtype=dtype, - shape=(10,), + shape=(12,), + chunks=chunks, + shards=shards, zarr_format=3, filters=filters, compressors=compressors, diff --git a/tests/test_config.py b/tests/test_config.py index 20e3c6044f..ca65c62166 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -305,12 +305,12 @@ class NewCodec2(BytesCodec): @pytest.mark.parametrize( ("dtype", "expected_codecs"), [ - ("int", [BytesCodec(), GzipCodec()]), - ("bytes", [VLenBytesCodec(), GzipCodec()]), - ("str", [VLenUTF8Codec(), GzipCodec()]), + ("int", (BytesCodec(), GzipCodec())), + ("bytes", (VLenBytesCodec(), GzipCodec())), + ("str", (VLenUTF8Codec(), GzipCodec())), ], ) -async def test_default_codecs(dtype: str, expected_codecs: list[Codec]) -> None: +async def test_default_codecs(dtype: str, expected_codecs: tuple[Codec, ...]) -> None: with config.set( { "array.v3_default_codecs": { # test setting non-standard codecs diff --git a/tests/test_info.py b/tests/test_info.py index c567e9b042..a0b28451a1 100644 --- a/tests/test_info.py +++ b/tests/test_info.py @@ -59,7 +59,7 @@ def test_array_info(zarr_format: ZarrFormat) -> None: _order="C", _read_only=True, _store_type="MemoryStore", - _codecs=(BytesCodec(),), + _serializer=BytesCodec(), ) result = repr(info) assert result == textwrap.dedent(f"""\ @@ -71,7 +71,7 @@ def test_array_info(zarr_format: ZarrFormat) -> None: Order : C Read-only : True Store type : MemoryStore - Codecs : ({{'endian': }},)""") + Serializer : {{'endian': }}""") @pytest.mark.parametrize("zarr_format", ZARR_FORMATS) @@ -95,7 +95,7 @@ def test_array_info_complete( _order="C", _read_only=True, _store_type="MemoryStore", - _codecs=(BytesCodec(),), + _serializer=BytesCodec(), _count_bytes=count_bytes, _count_bytes_stored=count_bytes_stored, _count_chunks_initialized=count_chunks_initialized, @@ -110,7 +110,7 @@ def test_array_info_complete( Order : C Read-only : True Store type : MemoryStore - Codecs : ({{'endian': }},) + Serializer : {{'endian': }} No. bytes : {count_bytes} ({count_bytes_formatted}) No. bytes stored : {count_bytes_stored_formatted} Storage ratio : {storage_ratio_formatted} From 635a35fbca0e0ea635d5ffe6fa5def07d2307b94 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Sun, 5 Jan 2025 14:36:59 +0100 Subject: [PATCH 3/9] fixes doctests --- docs/user-guide/arrays.rst | 38 +++++++++++++---------- docs/user-guide/consolidated_metadata.rst | 12 +++---- docs/user-guide/groups.rst | 6 ++-- docs/user-guide/performance.rst | 9 ++++-- src/zarr/core/_info.py | 4 ++- src/zarr/core/array.py | 1 + tests/test_info.py | 4 +-- 7 files changed, 44 insertions(+), 30 deletions(-) diff --git a/docs/user-guide/arrays.rst b/docs/user-guide/arrays.rst index 110e12c3be..c33b9e2edf 100644 --- a/docs/user-guide/arrays.rst +++ b/docs/user-guide/arrays.rst @@ -168,8 +168,8 @@ argument accepted by all array creation functions. For example:: >>> data = np.arange(100000000, dtype='int32').reshape(10000, 10000) >>> z = zarr.create_array(store='data/example-5.zarr', shape=data.shape, dtype=data.dtype, chunks=(1000, 1000), compressors=compressors) >>> z[:] = data - >>> z.metadata.codecs - [BytesCodec(endian=), BloscCodec(typesize=4, cname=, clevel=3, shuffle=, blocksize=0)] + >>> z.compressors + (BloscCodec(typesize=4, cname=, clevel=3, shuffle=, blocksize=0),) This array above will use Blosc as the primary compressor, using the Zstandard algorithm (compression level 3) internally within Blosc, and with the @@ -188,7 +188,8 @@ which can be used to print useful diagnostics, e.g.:: Order : C Read-only : False Store type : LocalStore - Codecs : [{'endian': }, {'typesize': 4, 'cname': , 'clevel': 3, 'shuffle': , 'blocksize': 0}] + Serializer : BytesCodec(endian=) + Compressors : (BloscCodec(typesize=4, cname=, clevel=3, shuffle=, blocksize=0),) No. bytes : 400000000 (381.5M) The :func:`zarr.Array.info_complete` method inspects the underlying store and @@ -203,7 +204,8 @@ prints additional diagnostics, e.g.:: Order : C Read-only : False Store type : LocalStore - Codecs : [{'endian': }, {'typesize': 4, 'cname': , 'clevel': 3, 'shuffle': , 'blocksize': 0}] + Serializer : BytesCodec(endian=) + Compressors : (BloscCodec(typesize=4, cname=, clevel=3, shuffle=, blocksize=0),) No. bytes : 400000000 (381.5M) No. bytes stored : 9696302 Storage ratio : 41.3 @@ -223,8 +225,8 @@ here is an array using Gzip compression, level 1:: >>> data = np.arange(100000000, dtype='int32').reshape(10000, 10000) >>> z = zarr.create_array(store='data/example-6.zarr', shape=data.shape, dtype=data.dtype, chunks=(1000, 1000), compressors=zarr.codecs.GzipCodec(level=1)) >>> z[:] = data - >>> z.metadata.codecs - [BytesCodec(endian=), GzipCodec(level=1)] + >>> z.compressors + (GzipCodec(level=1),) Here is an example using LZMA from NumCodecs_ with a custom filter pipeline including LZMA's built-in delta filter:: @@ -236,23 +238,24 @@ built-in delta filter:: >>> compressors = LZMA(filters=lzma_filters) >>> data = np.arange(100000000, dtype='int32').reshape(10000, 10000) >>> z = zarr.create_array(store='data/example-7.zarr', shape=data.shape, dtype=data.dtype, chunks=(1000, 1000), compressors=compressors) - >>> z.metadata.codecs - [BytesCodec(endian=), _make_bytes_bytes_codec.._Codec(codec_name='numcodecs.lzma', codec_config={'id': 'lzma', 'filters': [{'id': 3, 'dist': 4}, {'id': 33, 'preset': 1}]})] + >>> z.compressors + (LZMA(codec_name='numcodecs.lzma', codec_config={'id': 'lzma', 'filters': [{'id': 3, 'dist': 4}, {'id': 33, 'preset': 1}]}),) The default compressor can be changed by setting the value of the using Zarr's :ref:`user-guide-config`, e.g.:: >>> with zarr.config.set({'array.v2_default_compressor.numeric': {'id': 'blosc'}}): ... z = zarr.create_array(store={}, shape=(100000000,), chunks=(1000000,), dtype='int32', zarr_format=2) - >>> z.metadata.filters - >>> z.metadata.compressor - Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0) + >>> z.filters + () + >>> z.compressors + (Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0),) To disable compression, set ``compressors=None`` when creating an array, e.g.:: >>> z = zarr.create_array(store='data/example-8.zarr', shape=(100000000,), chunks=(1000000,), dtype='int32', compressors=None) - >>> z.metadata.codecs - [BytesCodec(endian=)] + >>> z.compressors + () .. _user-guide-filters: @@ -287,7 +290,9 @@ Here is an example using a delta filter with the Blosc compressor:: Order : C Read-only : False Store type : LocalStore - Codecs : [{'codec_name': 'numcodecs.delta', 'codec_config': {'id': 'delta', 'dtype': 'int32'}}, {'endian': }, {'typesize': 4, 'cname': , 'clevel': 1, 'shuffle': , 'blocksize': 0}] + Filters : (Delta(codec_name='numcodecs.delta', codec_config={'id': 'delta', 'dtype': 'int32'}),) + Serializer : BytesCodec(endian=) + Compressors : (BloscCodec(typesize=4, cname=, clevel=1, shuffle=, blocksize=0),) No. bytes : 400000000 (381.5M) For more information about available filter codecs, see the `Numcodecs @@ -600,11 +605,12 @@ Sharded arrays can be created by providing the ``shards`` parameter to :func:`za Order : C Read-only : False Store type : LocalStore - Codecs : [{'chunk_shape': (100, 100), 'codecs': ({'endian': }, {'level': 0, 'checksum': False}), 'index_codecs': ({'endian': }, {}), 'index_location': }] + Serializer : BytesCodec(endian=) + Compressors : (ZstdCodec(level=0, checksum=False),) No. bytes : 100000000 (95.4M) No. bytes stored : 3981060 Storage ratio : 25.1 - Chunks Initialized : 100 + Shards Initialized : 100 In this example a shard shape of (1000, 1000) and a chunk shape of (100, 100) is used. This means that 10*10 chunks are stored in each shard, and there are 10*10 shards in total. diff --git a/docs/user-guide/consolidated_metadata.rst b/docs/user-guide/consolidated_metadata.rst index 511761d34e..3c015dcfca 100644 --- a/docs/user-guide/consolidated_metadata.rst +++ b/docs/user-guide/consolidated_metadata.rst @@ -52,8 +52,8 @@ that can be used.: chunk_key_encoding=DefaultChunkKeyEncoding(name='default', separator='/'), fill_value=np.float64(0.0), - codecs=[BytesCodec(endian=), - ZstdCodec(level=0, checksum=False)], + codecs=(BytesCodec(endian=), + ZstdCodec(level=0, checksum=False)), attributes={}, dimension_names=None, zarr_format=3, @@ -65,8 +65,8 @@ that can be used.: chunk_key_encoding=DefaultChunkKeyEncoding(name='default', separator='/'), fill_value=np.float64(0.0), - codecs=[BytesCodec(endian=), - ZstdCodec(level=0, checksum=False)], + codecs=(BytesCodec(endian=), + ZstdCodec(level=0, checksum=False)), attributes={}, dimension_names=None, zarr_format=3, @@ -78,8 +78,8 @@ that can be used.: chunk_key_encoding=DefaultChunkKeyEncoding(name='default', separator='/'), fill_value=np.float64(0.0), - codecs=[BytesCodec(endian=), - ZstdCodec(level=0, checksum=False)], + codecs=(BytesCodec(endian=), + ZstdCodec(level=0, checksum=False)), attributes={}, dimension_names=None, zarr_format=3, diff --git a/docs/user-guide/groups.rst b/docs/user-guide/groups.rst index 62160ffde5..7882407319 100644 --- a/docs/user-guide/groups.rst +++ b/docs/user-guide/groups.rst @@ -109,7 +109,8 @@ property. E.g.:: Order : C Read-only : False Store type : MemoryStore - Codecs : [{'endian': }, {'level': 0, 'checksum': False}] + Serializer : BytesCodec(endian=) + Compressors : (ZstdCodec(level=0, checksum=False),) No. bytes : 8000000 (7.6M) No. bytes stored : 1432 Storage ratio : 5586.6 @@ -123,7 +124,8 @@ property. E.g.:: Order : C Read-only : False Store type : MemoryStore - Codecs : [{'endian': }, {'level': 0, 'checksum': False}] + Serializer : BytesCodec(endian=) + Compressors : (ZstdCodec(level=0, checksum=False),) No. bytes : 4000000 (3.8M) Groups also have the :func:`zarr.Group.tree` method, e.g.:: diff --git a/docs/user-guide/performance.rst b/docs/user-guide/performance.rst index f56b642fb1..b8dc3e247f 100644 --- a/docs/user-guide/performance.rst +++ b/docs/user-guide/performance.rst @@ -98,7 +98,8 @@ To use sharding, you need to specify the ``shards`` parameter when creating the Order : C Read-only : False Store type : MemoryStore - Codecs : [{'chunk_shape': (100, 100, 100), 'codecs': ({'endian': }, {'level': 0, 'checksum': False}), 'index_codecs': ({'endian': }, {}), 'index_location': }] + Serializer : BytesCodec(endian=) + Compressors : (ZstdCodec(level=0, checksum=False),) No. bytes : 100000000000 (93.1G) .. _user-guide-chunks-order: @@ -125,7 +126,8 @@ ratios, depending on the correlation structure within the data. E.g.:: Order : C Read-only : False Store type : MemoryStore - Codecs : [{'endian': }, {'level': 0, 'checksum': False}] + Serializer : BytesCodec(endian=) + Compressors : (ZstdCodec(level=0, checksum=False),) No. bytes : 400000000 (381.5M) No. bytes stored : 342588717 Storage ratio : 1.2 @@ -142,7 +144,8 @@ ratios, depending on the correlation structure within the data. E.g.:: Order : F Read-only : False Store type : MemoryStore - Codecs : [{'endian': }, {'level': 0, 'checksum': False}] + Serializer : BytesCodec(endian=) + Compressors : (ZstdCodec(level=0, checksum=False),) No. bytes : 400000000 (381.5M) No. bytes stored : 342588717 Storage ratio : 1.2 diff --git a/src/zarr/core/_info.py b/src/zarr/core/_info.py index 6939e0a40b..18c67b6ae5 100644 --- a/src/zarr/core/_info.py +++ b/src/zarr/core/_info.py @@ -109,7 +109,9 @@ def __repr__(self) -> str: Read-only : {_read_only} Store type : {_store_type}""") - kwargs = dataclasses.asdict(self) + # We can't use dataclasses.asdict, because we only want a shallow dict + kwargs = {field.name: getattr(self, field.name) for field in dataclasses.fields(self)} + if self._chunk_shape is None: # for non-regular chunk grids kwargs["chunk_shape"] = "" diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 90c568045a..764a4e5418 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -1624,6 +1624,7 @@ def _info( _data_type = self.metadata.dtype else: _data_type = self.metadata.data_type + return ArrayInfo( _zarr_format=self.metadata.zarr_format, _data_type=_data_type, diff --git a/tests/test_info.py b/tests/test_info.py index a0b28451a1..d620015098 100644 --- a/tests/test_info.py +++ b/tests/test_info.py @@ -71,7 +71,7 @@ def test_array_info(zarr_format: ZarrFormat) -> None: Order : C Read-only : True Store type : MemoryStore - Serializer : {{'endian': }}""") + Serializer : BytesCodec(endian=)""") @pytest.mark.parametrize("zarr_format", ZARR_FORMATS) @@ -110,7 +110,7 @@ def test_array_info_complete( Order : C Read-only : True Store type : MemoryStore - Serializer : {{'endian': }} + Serializer : BytesCodec(endian=) No. bytes : {count_bytes} ({count_bytes_formatted}) No. bytes stored : {count_bytes_stored_formatted} Storage ratio : {storage_ratio_formatted} From c8b96a5623bad1291d1c238451ef947b71fedfb4 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Sun, 5 Jan 2025 14:46:20 +0100 Subject: [PATCH 4/9] ugly numcodecs class names --- docs/user-guide/arrays.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/user-guide/arrays.rst b/docs/user-guide/arrays.rst index c33b9e2edf..47112decb4 100644 --- a/docs/user-guide/arrays.rst +++ b/docs/user-guide/arrays.rst @@ -239,7 +239,7 @@ built-in delta filter:: >>> data = np.arange(100000000, dtype='int32').reshape(10000, 10000) >>> z = zarr.create_array(store='data/example-7.zarr', shape=data.shape, dtype=data.dtype, chunks=(1000, 1000), compressors=compressors) >>> z.compressors - (LZMA(codec_name='numcodecs.lzma', codec_config={'id': 'lzma', 'filters': [{'id': 3, 'dist': 4}, {'id': 33, 'preset': 1}]}),) + (_make_bytes_bytes_codec.._Codec(codec_name='numcodecs.lzma', codec_config={'id': 'lzma', 'filters': [{'id': 3, 'dist': 4}, {'id': 33, 'preset': 1}]}),) The default compressor can be changed by setting the value of the using Zarr's :ref:`user-guide-config`, e.g.:: @@ -290,7 +290,7 @@ Here is an example using a delta filter with the Blosc compressor:: Order : C Read-only : False Store type : LocalStore - Filters : (Delta(codec_name='numcodecs.delta', codec_config={'id': 'delta', 'dtype': 'int32'}),) + Filters : (_make_array_array_codec.._Codec(codec_name='numcodecs.delta', codec_config={'id': 'delta', 'dtype': 'int32'}),) Serializer : BytesCodec(endian=) Compressors : (BloscCodec(typesize=4, cname=, clevel=1, shuffle=, blocksize=0),) No. bytes : 400000000 (381.5M) From 5c63de3cea039980fda87dd07ae15797306bd86a Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Sun, 5 Jan 2025 16:22:04 +0100 Subject: [PATCH 5/9] merged #2652 in --- docs/user-guide/config.rst | 29 ++++----- src/zarr/api/asynchronous.py | 3 +- src/zarr/api/synchronous.py | 17 +++--- src/zarr/core/array.py | 113 +++++++++++++++++------------------ src/zarr/core/config.py | 11 ++-- src/zarr/core/group.py | 51 ++++++++-------- tests/test_config.py | 36 +++++------ 7 files changed, 126 insertions(+), 134 deletions(-) diff --git a/docs/user-guide/config.rst b/docs/user-guide/config.rst index e38715b67e..a17bce9d99 100644 --- a/docs/user-guide/config.rst +++ b/docs/user-guide/config.rst @@ -28,7 +28,7 @@ Configuration options include the following: - Default Zarr format ``default_zarr_version`` - Default array order in memory ``array.order`` -- Default codecs ``array.v3_default_codecs`` and ``array.v2_default_compressor`` +- Default filters, serializers and compressors, e.g. ``array.v3_default_filters``, ``array.v3_default_serializer``, ``array.v3_default_compressors``, ``array.v2_default_filters`` and ``array.v2_default_compressor`` - Whether empty chunks are written to storage ``array.write_empty_chunks`` - Async and threading options, e.g. ``async.concurrency`` and ``threading.max_workers`` - Selections of implementations of codecs, codec pipelines and buffers @@ -54,19 +54,20 @@ This is the current default configuration:: 'v2_default_filters': {'bytes': [{'id': 'vlen-bytes'}], 'numeric': None, 'string': [{'id': 'vlen-utf8'}]}, - 'v3_default_codecs': {'bytes': [{'name': 'vlen-bytes'}, - {'configuration': {'checksum': False, - 'level': 0}, - 'name': 'zstd'}], - 'numeric': [{'configuration': {'endian': 'little'}, - 'name': 'bytes'}, - {'configuration': {'checksum': False, - 'level': 0}, - 'name': 'zstd'}], - 'string': [{'name': 'vlen-utf8'}, - {'configuration': {'checksum': False, - 'level': 0}, - 'name': 'zstd'}]}, + 'v3_default_compressors': {'bytes': [{'configuration': {'checksum': False, + 'level': 0}, + 'name': 'zstd'}], + 'numeric': [{'configuration': {'checksum': False, + 'level': 0}, + 'name': 'zstd'}], + 'string': [{'configuration': {'checksum': False, + 'level': 0}, + 'name': 'zstd'}]}, + 'v3_default_filters': {'bytes': [], 'numeric': [], 'string': []}, + 'v3_default_serializer': {'bytes': {'name': 'vlen-bytes'}, + 'numeric': {'configuration': {'endian': 'little'}, + 'name': 'bytes'}, + 'string': {'name': 'vlen-utf8'}}, 'write_empty_chunks': False}, 'async': {'concurrency': 10, 'timeout': None}, 'buffer': 'zarr.core.buffer.cpu.Buffer', diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index f42b6d3f51..1b8e463b26 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -892,7 +892,8 @@ async def create( - For Unicode strings, the default is ``VLenUTF8Codec`` and ``ZstdCodec``. - For bytes or objects, the default is ``VLenBytesCodec`` and ``ZstdCodec``. - These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. + These defaults can be changed by modifying the value of ``array.v3_default_filters``, + ``array.v3_default_serializer`` and ``array.v3_default_compressors`` in :mod:`zarr.core.config`. compressor : Codec, optional Primary compressor to compress chunk data. V2 only. V3 arrays should use ``codecs`` instead. diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index d0b91b373f..a8a29733d9 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -788,9 +788,8 @@ def create_array( For Zarr v3, a "filter" is a codec that takes an array and returns an array, and these values must be instances of ``ArrayArrayCodec``, or dict representations of ``ArrayArrayCodec``. - If ``filters`` and ``compressors`` are not specified, then the default codecs for - Zarr v3 will be used. - These defaults can be changed by modifying the value of ``array.v3_default_codecs`` + If no ``filters`` are provided, a default set of filters will be used. + These defaults can be changed by modifying the value of ``array.v3_default_filters`` in :mod:`zarr.core.config`. Use ``None`` to omit default filters. @@ -806,22 +805,22 @@ def create_array( For Zarr v3, a "compressor" is a codec that takes a bytestrea, and returns another bytestream. Multiple compressors my be provided for Zarr v3. - If ``filters`` and ``compressors`` are not specified, then the default codecs for - Zarr v3 will be used. - These defaults can be changed by modifying the value of ``array.v3_default_codecs`` + If no ``compressors`` are provided, a default set of compressors will be used. + These defaults can be changed by modifying the value of ``array.v3_default_compressors`` in :mod:`zarr.core.config`. Use ``None`` to omit default compressors. For Zarr v2, a "compressor" can be any numcodecs codec. Only a single compressor may be provided for Zarr v2. - If no ``compressors`` are provided, a default compressor will be used. - These defaults can be changed by modifying the value of ``array.v2_default_compressor`` + If no ``compressor`` is provided, a default compressor will be used. in :mod:`zarr.core.config`. Use ``None`` to omit the default compressor. serializer : dict[str, JSON] | ArrayBytesCodec, optional Array-to-bytes codec to use for encoding the array data. Zarr v3 only. Zarr v2 arrays use implicit array-to-bytes conversion. - If no ``serializer`` is provided, the `zarr.codecs.BytesCodec` codec will be used. + If no ``serializer`` is provided, a default serializer will be used. + These defaults can be changed by modifying the value of ``array.v3_default_serializer`` + in :mod:`zarr.core.config`. fill_value : Any, optional Fill value for the array. order : {"C", "F"}, optional diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 764a4e5418..c0dd61f6cb 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -469,7 +469,8 @@ async def create( - For Unicode strings, the default is ``VLenUTF8Codec`` and ``ZstdCodec``. - For bytes or objects, the default is ``VLenBytesCodec`` and ``ZstdCodec``. - These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. + These defaults can be changed by modifying the value of ``array.v3_default_filters``, + ``array.v3_default_serializer`` and ``array.v3_default_compressors`` in :mod:`zarr.core.config`. dimension_names : Iterable[str], optional The names of the dimensions (default is None). V3 only. V2 arrays should not use this parameter. @@ -1715,7 +1716,8 @@ def create( - For Unicode strings, the default is ``VLenUTF8Codec`` and ``ZstdCodec``. - For bytes or objects, the default is ``VLenBytesCodec`` and ``ZstdCodec``. - These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. + These defaults can be changed by modifying the value of ``array.v3_default_filters``, + ``array.v3_default_serializer`` and ``array.v3_default_compressors`` in :mod:`zarr.core.config`. dimension_names : Iterable[str], optional The names of the dimensions (default is None). V3 only. V2 arrays should not use this parameter. @@ -3698,17 +3700,9 @@ def _build_parents( def _get_default_codecs( np_dtype: np.dtype[Any], -) -> list[dict[str, JSON]]: - default_codecs = zarr_config.get("array.v3_default_codecs") - dtype = DataType.from_numpy(np_dtype) - if dtype == DataType.string: - dtype_key = "string" - elif dtype == DataType.bytes: - dtype_key = "bytes" - else: - dtype_key = "numeric" - - return cast(list[dict[str, JSON]], default_codecs[dtype_key]) +) -> tuple[Codec, ...]: + filters, serializer, compressors = _get_default_chunk_encoding_v3(np_dtype) + return filters + (serializer,) + compressors FiltersLike: TypeAlias = ( @@ -3785,9 +3779,8 @@ async def create_array( For Zarr v3, a "filter" is a codec that takes an array and returns an array, and these values must be instances of ``ArrayArrayCodec``, or dict representations of ``ArrayArrayCodec``. - If ``filters`` and ``compressors`` are not specified, then the default codecs for - Zarr v3 will be used. - These defaults can be changed by modifying the value of ``array.v3_default_codecs`` + If no ``filters`` are provided, a default set of filters will be used. + These defaults can be changed by modifying the value of ``array.v3_default_filters`` in :mod:`zarr.core.config`. Use ``None`` to omit default filters. @@ -3803,22 +3796,22 @@ async def create_array( For Zarr v3, a "compressor" is a codec that takes a bytestrea, and returns another bytestream. Multiple compressors my be provided for Zarr v3. - If ``filters`` and ``compressors`` are not specified, then the default codecs for - Zarr v3 will be used. - These defaults can be changed by modifying the value of ``array.v3_default_codecs`` + If no ``compressors`` are provided, a default set of compressors will be used. + These defaults can be changed by modifying the value of ``array.v3_default_compressors`` in :mod:`zarr.core.config`. Use ``None`` to omit default compressors. For Zarr v2, a "compressor" can be any numcodecs codec. Only a single compressor may be provided for Zarr v2. - If no ``compressors`` are provided, a default compressor will be used. - These defaults can be changed by modifying the value of ``array.v2_default_compressor`` + If no ``compressor`` is provided, a default compressor will be used. in :mod:`zarr.core.config`. Use ``None`` to omit the default compressor. serializer : dict[str, JSON] | ArrayBytesCodec, optional Array-to-bytes codec to use for encoding the array data. Zarr v3 only. Zarr v2 arrays use implicit array-to-bytes conversion. - If no ``serializer`` is provided, the `zarr.codecs.BytesCodec` codec will be used. + If no ``serializer`` is provided, a default serializer will be used. + These defaults can be changed by modifying the value of ``array.v3_default_serializer`` + in :mod:`zarr.core.config`. fill_value : Any, optional Fill value for the array. order : {"C", "F"}, optional @@ -3997,7 +3990,6 @@ def _get_default_chunk_encoding_v3( """ Get the default ArrayArrayCodecs, ArrayBytesCodec, and BytesBytesCodec for a given dtype. """ - default_codecs = zarr_config.get("array.v3_default_codecs") dtype = DataType.from_numpy(np_dtype) if dtype == DataType.string: dtype_key = "string" @@ -4006,31 +3998,34 @@ def _get_default_chunk_encoding_v3( else: dtype_key = "numeric" - codec_dicts = default_codecs[dtype_key] - codecs = tuple(_resolve_codec(c) for c in codec_dicts) - array_bytes_maybe = None - array_array: list[ArrayArrayCodec] = [] - bytes_bytes: list[BytesBytesCodec] = [] + default_filters = zarr_config.get("array.v3_default_filters").get(dtype_key) + default_serializer = zarr_config.get("array.v3_default_serializer").get(dtype_key) + default_compressors = zarr_config.get("array.v3_default_compressors").get(dtype_key) - for codec in codecs: - if isinstance(codec, ArrayBytesCodec): - if array_bytes_maybe is not None: - raise ValueError( - f"Got two instances of ArrayBytesCodec: {array_bytes_maybe} and {codec}. " - "Only one array-to-bytes codec is allowed." - ) - array_bytes_maybe = codec - elif isinstance(codec, ArrayArrayCodec): - array_array.append(codec) - elif isinstance(codec, BytesBytesCodec): - bytes_bytes.append(codec) - else: - raise TypeError(f"Unexpected codec type: {type(codec)}") + filters_list: list[ArrayArrayCodec] = [] + compressors_list: list[BytesBytesCodec] = [] - if array_bytes_maybe is None: + serializer = _resolve_codec(default_serializer) + if serializer is None: raise ValueError("Required ArrayBytesCodec was not found.") + if not isinstance(serializer, ArrayBytesCodec): + raise TypeError(f"Expected ArrayBytesCodec, got: {type(serializer)}") + + for codec_dict in default_filters: + codec = _resolve_codec(codec_dict) + if isinstance(codec, ArrayArrayCodec): + filters_list.append(codec) + else: + raise TypeError(f"Expected ArrayArrayCodec, got: {type(codec)}") - return tuple(array_array), array_bytes_maybe, tuple(bytes_bytes) + for codec_dict in default_compressors: + codec = _resolve_codec(codec_dict) + if isinstance(codec, BytesBytesCodec): + compressors_list.append(codec) + else: + raise TypeError(f"Expected BytesBytesCodec, got: {type(codec)}") + + return tuple(filters_list), serializer, tuple(compressors_list) def _get_default_chunk_encoding_v2( @@ -4114,21 +4109,7 @@ def _parse_chunk_encoding_v3( maybe_bytes_bytes: Iterable[Codec | dict[str, JSON]] maybe_array_array: Iterable[Codec | dict[str, JSON]] out_bytes_bytes: tuple[BytesBytesCodec, ...] - if compressors is None: - out_bytes_bytes = () - - elif compressors == "auto": - out_bytes_bytes = default_bytes_bytes - else: - if isinstance(compressors, dict | Codec): - maybe_bytes_bytes = (compressors,) - elif compressors is None: - maybe_bytes_bytes = () - else: - maybe_bytes_bytes = cast(Iterable[Codec | dict[str, JSON]], compressors) - - out_bytes_bytes = tuple(_parse_bytes_bytes_codec(c) for c in maybe_bytes_bytes) out_array_array: tuple[ArrayArrayCodec, ...] if filters is None: out_array_array = () @@ -4148,6 +4129,22 @@ def _parse_chunk_encoding_v3( else: out_array_bytes = _parse_array_bytes_codec(serializer) + if compressors is None: + out_bytes_bytes = () + + elif compressors == "auto": + out_bytes_bytes = default_bytes_bytes + + else: + if isinstance(compressors, dict | Codec): + maybe_bytes_bytes = (compressors,) + elif compressors is None: + maybe_bytes_bytes = () + else: + maybe_bytes_bytes = cast(Iterable[Codec | dict[str, JSON]], compressors) + + out_bytes_bytes = tuple(_parse_bytes_bytes_codec(c) for c in maybe_bytes_bytes) + return out_array_array, out_array_bytes, out_bytes_bytes diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index 421a100f1b..7920d220a4 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -76,17 +76,20 @@ def reset(self) -> None: "string": [{"id": "vlen-utf8"}], "bytes": [{"id": "vlen-bytes"}], }, - "v3_default_codecs": { + "v3_default_filters": {"numeric": [], "string": [], "bytes": []}, + "v3_default_serializer": { + "numeric": {"name": "bytes", "configuration": {"endian": "little"}}, + "string": {"name": "vlen-utf8"}, + "bytes": {"name": "vlen-bytes"}, + }, + "v3_default_compressors": { "numeric": [ - {"name": "bytes", "configuration": {"endian": "little"}}, {"name": "zstd", "configuration": {"level": 0, "checksum": False}}, ], "string": [ - {"name": "vlen-utf8"}, {"name": "zstd", "configuration": {"level": 0, "checksum": False}}, ], "bytes": [ - {"name": "vlen-bytes"}, {"name": "zstd", "configuration": {"level": 0, "checksum": False}}, ], }, diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 4ee230cb53..bc3eb0ea74 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -1051,9 +1051,8 @@ async def create_array( For Zarr v3, a "filter" is a codec that takes an array and returns an array, and these values must be instances of ``ArrayArrayCodec``, or dict representations of ``ArrayArrayCodec``. - If ``filters`` and ``compressors`` are not specified, then the default codecs for - Zarr v3 will be used. - These defaults can be changed by modifying the value of ``array.v3_default_codecs`` + If no ``filters`` are provided, a default set of filters will be used. + These defaults can be changed by modifying the value of ``array.v3_default_filters`` in :mod:`zarr.core.config`. Use ``None`` to omit default filters. @@ -1069,16 +1068,14 @@ async def create_array( For Zarr v3, a "compressor" is a codec that takes a bytestrea, and returns another bytestream. Multiple compressors my be provided for Zarr v3. - If ``filters`` and ``compressors`` are not specified, then the default codecs for - Zarr v3 will be used. - These defaults can be changed by modifying the value of ``array.v3_default_codecs`` + If no ``compressors`` are provided, a default set of compressors will be used. + These defaults can be changed by modifying the value of ``array.v3_default_compressors`` in :mod:`zarr.core.config`. Use ``None`` to omit default compressors. For Zarr v2, a "compressor" can be any numcodecs codec. Only a single compressor may be provided for Zarr v2. - If no ``compressors`` are provided, a default compressor will be used. - These defaults can be changed by modifying the value of ``array.v2_default_compressor`` + If no ``compressor`` is provided, a default compressor will be used. in :mod:`zarr.core.config`. Use ``None`` to omit the default compressor. compressor : Codec, optional @@ -1086,7 +1083,9 @@ async def create_array( serializer : dict[str, JSON] | ArrayBytesCodec, optional Array-to-bytes codec to use for encoding the array data. Zarr v3 only. Zarr v2 arrays use implicit array-to-bytes conversion. - If no ``serializer`` is provided, the `zarr.codecs.BytesCodec` codec will be used. + If no ``serializer`` is provided, a default serializer will be used. + These defaults can be changed by modifying the value of ``array.v3_default_serializer`` + in :mod:`zarr.core.config`. fill_value : Any, optional Fill value for the array. order : {"C", "F"}, optional @@ -2307,9 +2306,8 @@ def create_array( For Zarr v3, a "filter" is a codec that takes an array and returns an array, and these values must be instances of ``ArrayArrayCodec``, or dict representations of ``ArrayArrayCodec``. - If ``filters`` and ``compressors`` are not specified, then the default codecs for - Zarr v3 will be used. - These defaults can be changed by modifying the value of ``array.v3_default_codecs`` + If no ``filters`` are provided, a default set of filters will be used. + These defaults can be changed by modifying the value of ``array.v3_default_filters`` in :mod:`zarr.core.config`. Use ``None`` to omit default filters. @@ -2325,16 +2323,14 @@ def create_array( For Zarr v3, a "compressor" is a codec that takes a bytestrea, and returns another bytestream. Multiple compressors my be provided for Zarr v3. - If ``filters`` and ``compressors`` are not specified, then the default codecs for - Zarr v3 will be used. - These defaults can be changed by modifying the value of ``array.v3_default_codecs`` + If no ``compressors`` are provided, a default set of compressors will be used. + These defaults can be changed by modifying the value of ``array.v3_default_compressors`` in :mod:`zarr.core.config`. Use ``None`` to omit default compressors. For Zarr v2, a "compressor" can be any numcodecs codec. Only a single compressor may be provided for Zarr v2. - If no ``compressors`` are provided, a default compressor will be used. - These defaults can be changed by modifying the value of ``array.v2_default_compressor`` + If no ``compressor`` is provided, a default compressor will be used. in :mod:`zarr.core.config`. Use ``None`` to omit the default compressor. compressor : Codec, optional @@ -2342,7 +2338,9 @@ def create_array( serializer : dict[str, JSON] | ArrayBytesCodec, optional Array-to-bytes codec to use for encoding the array data. Zarr v3 only. Zarr v2 arrays use implicit array-to-bytes conversion. - If no ``serializer`` is provided, the `zarr.codecs.BytesCodec` codec will be used. + If no ``serializer`` is provided, a default serializer will be used. + These defaults can be changed by modifying the value of ``array.v3_default_serializer`` + in :mod:`zarr.core.config`. fill_value : Any, optional Fill value for the array. order : {"C", "F"}, optional @@ -2696,9 +2694,8 @@ def array( For Zarr v3, a "filter" is a codec that takes an array and returns an array, and these values must be instances of ``ArrayArrayCodec``, or dict representations of ``ArrayArrayCodec``. - If ``filters`` and ``compressors`` are not specified, then the default codecs for - Zarr v3 will be used. - These defaults can be changed by modifying the value of ``array.v3_default_codecs`` + If no ``filters`` are provided, a default set of filters will be used. + These defaults can be changed by modifying the value of ``array.v3_default_filters`` in :mod:`zarr.core.config`. Use ``None`` to omit default filters. @@ -2714,16 +2711,14 @@ def array( For Zarr v3, a "compressor" is a codec that takes a bytestrea, and returns another bytestream. Multiple compressors my be provided for Zarr v3. - If ``filters`` and ``compressors`` are not specified, then the default codecs for - Zarr v3 will be used. - These defaults can be changed by modifying the value of ``array.v3_default_codecs`` + If no ``compressors`` are provided, a default set of compressors will be used. + These defaults can be changed by modifying the value of ``array.v3_default_compressors`` in :mod:`zarr.core.config`. Use ``None`` to omit default compressors. For Zarr v2, a "compressor" can be any numcodecs codec. Only a single compressor may be provided for Zarr v2. - If no ``compressors`` are provided, a default compressor will be used. - These defaults can be changed by modifying the value of ``array.v2_default_compressor`` + If no ``compressor`` is provided, a default compressor will be used. in :mod:`zarr.core.config`. Use ``None`` to omit the default compressor. compressor : Codec, optional @@ -2731,7 +2726,9 @@ def array( serializer : dict[str, JSON] | ArrayBytesCodec, optional Array-to-bytes codec to use for encoding the array data. Zarr v3 only. Zarr v2 arrays use implicit array-to-bytes conversion. - If no ``serializer`` is provided, the `zarr.codecs.BytesCodec` codec will be used. + If no ``serializer`` is provided, a default serializer will be used. + These defaults can be changed by modifying the value of ``array.v3_default_serializer`` + in :mod:`zarr.core.config`. fill_value : Any, optional Fill value for the array. order : {"C", "F"}, optional diff --git a/tests/test_config.py b/tests/test_config.py index ca65c62166..57a9624dd9 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -10,7 +10,7 @@ import zarr import zarr.api from zarr import zeros -from zarr.abc.codec import Codec, CodecInput, CodecOutput, CodecPipeline +from zarr.abc.codec import CodecInput, CodecOutput, CodecPipeline from zarr.abc.store import ByteSetter, Store from zarr.codecs import ( BloscCodec, @@ -18,8 +18,6 @@ Crc32cCodec, GzipCodec, ShardingCodec, - VLenBytesCodec, - VLenUTF8Codec, ) from zarr.core.array_spec import ArraySpec from zarr.core.buffer import NDBuffer @@ -64,17 +62,20 @@ def test_config_defaults_set() -> None: "string": [{"id": "vlen-utf8"}], "bytes": [{"id": "vlen-bytes"}], }, - "v3_default_codecs": { - "bytes": [ - {"name": "vlen-bytes"}, - {"name": "zstd", "configuration": {"level": 0, "checksum": False}}, - ], + "v3_default_filters": {"numeric": [], "string": [], "bytes": []}, + "v3_default_serializer": { + "numeric": {"name": "bytes", "configuration": {"endian": "little"}}, + "string": {"name": "vlen-utf8"}, + "bytes": {"name": "vlen-bytes"}, + }, + "v3_default_compressors": { "numeric": [ - {"name": "bytes", "configuration": {"endian": "little"}}, {"name": "zstd", "configuration": {"level": 0, "checksum": False}}, ], "string": [ - {"name": "vlen-utf8"}, + {"name": "zstd", "configuration": {"level": 0, "checksum": False}}, + ], + "bytes": [ {"name": "zstd", "configuration": {"level": 0, "checksum": False}}, ], }, @@ -304,26 +305,19 @@ class NewCodec2(BytesCodec): @pytest.mark.parametrize( ("dtype", "expected_codecs"), - [ - ("int", (BytesCodec(), GzipCodec())), - ("bytes", (VLenBytesCodec(), GzipCodec())), - ("str", (VLenUTF8Codec(), GzipCodec())), - ], + ["int", "bytes", "str"], ) -async def test_default_codecs(dtype: str, expected_codecs: tuple[Codec, ...]) -> None: +async def test_default_codecs(dtype: str) -> None: with config.set( { - "array.v3_default_codecs": { # test setting non-standard codecs + "array.v3_default_compressors": { # test setting non-standard codecs "numeric": [ - {"name": "bytes", "configuration": {"endian": "little"}}, {"name": "gzip", "configuration": {"level": 5}}, ], "string": [ - {"name": "vlen-utf8"}, {"name": "gzip", "configuration": {"level": 5}}, ], "bytes": [ - {"name": "vlen-bytes"}, {"name": "gzip", "configuration": {"level": 5}}, ], } @@ -336,4 +330,4 @@ async def test_default_codecs(dtype: str, expected_codecs: tuple[Codec, ...]) -> zarr_format=3, store=MemoryStore(), ) - assert arr.metadata.codecs == expected_codecs + assert arr.compressors == (GzipCodec(),) From 67325893d8f7c8722c23987df97276473434ed2f Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Sun, 5 Jan 2025 17:22:41 +0100 Subject: [PATCH 6/9] use _parse methods --- src/zarr/core/array.py | 28 ++++------------------------ tests/test_config.py | 5 +---- 2 files changed, 5 insertions(+), 28 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index c0dd61f6cb..071c587b6d 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -110,7 +110,6 @@ _parse_array_array_codec, _parse_array_bytes_codec, _parse_bytes_bytes_codec, - _resolve_codec, get_pipeline_class, ) from zarr.storage import StoreLike, make_store_path @@ -4002,30 +4001,11 @@ def _get_default_chunk_encoding_v3( default_serializer = zarr_config.get("array.v3_default_serializer").get(dtype_key) default_compressors = zarr_config.get("array.v3_default_compressors").get(dtype_key) - filters_list: list[ArrayArrayCodec] = [] - compressors_list: list[BytesBytesCodec] = [] + filters = tuple(_parse_array_array_codec(codec_dict) for codec_dict in default_filters) + serializer = _parse_array_bytes_codec(default_serializer) + compressors = tuple(_parse_bytes_bytes_codec(codec_dict) for codec_dict in default_compressors) - serializer = _resolve_codec(default_serializer) - if serializer is None: - raise ValueError("Required ArrayBytesCodec was not found.") - if not isinstance(serializer, ArrayBytesCodec): - raise TypeError(f"Expected ArrayBytesCodec, got: {type(serializer)}") - - for codec_dict in default_filters: - codec = _resolve_codec(codec_dict) - if isinstance(codec, ArrayArrayCodec): - filters_list.append(codec) - else: - raise TypeError(f"Expected ArrayArrayCodec, got: {type(codec)}") - - for codec_dict in default_compressors: - codec = _resolve_codec(codec_dict) - if isinstance(codec, BytesBytesCodec): - compressors_list.append(codec) - else: - raise TypeError(f"Expected BytesBytesCodec, got: {type(codec)}") - - return tuple(filters_list), serializer, tuple(compressors_list) + return filters, serializer, compressors def _get_default_chunk_encoding_v2( diff --git a/tests/test_config.py b/tests/test_config.py index 57a9624dd9..c552ace840 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -303,10 +303,7 @@ class NewCodec2(BytesCodec): get_codec_class("new_codec") -@pytest.mark.parametrize( - ("dtype", "expected_codecs"), - ["int", "bytes", "str"], -) +@pytest.mark.parametrize("dtype", ["int", "bytes", "str"]) async def test_default_codecs(dtype: str) -> None: with config.set( { From d347eecb0600e886e25ddb54f24616003381a191 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Sun, 5 Jan 2025 17:30:11 +0100 Subject: [PATCH 7/9] remove redundant if --- src/zarr/core/array.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 071c587b6d..f17f62e05f 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -4098,8 +4098,6 @@ def _parse_chunk_encoding_v3( else: if isinstance(filters, dict | Codec): maybe_array_array = (filters,) - elif filters is None: - maybe_array_array = () else: maybe_array_array = cast(Iterable[Codec | dict[str, JSON]], filters) out_array_array = tuple(_parse_array_array_codec(c) for c in maybe_array_array) @@ -4111,15 +4109,11 @@ def _parse_chunk_encoding_v3( if compressors is None: out_bytes_bytes = () - elif compressors == "auto": out_bytes_bytes = default_bytes_bytes - else: if isinstance(compressors, dict | Codec): maybe_bytes_bytes = (compressors,) - elif compressors is None: - maybe_bytes_bytes = () else: maybe_bytes_bytes = cast(Iterable[Codec | dict[str, JSON]], compressors) From 5a9eafef7e6c7dfeb418dcd5be07eb596be99aff Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Sun, 5 Jan 2025 17:35:58 +0100 Subject: [PATCH 8/9] code cleanup --- src/zarr/core/array.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index f17f62e05f..ce68a6a0e3 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -4086,16 +4086,13 @@ def _parse_chunk_encoding_v3( default_array_array, default_array_bytes, default_bytes_bytes = _get_default_chunk_encoding_v3( dtype ) - maybe_bytes_bytes: Iterable[Codec | dict[str, JSON]] - maybe_array_array: Iterable[Codec | dict[str, JSON]] - out_bytes_bytes: tuple[BytesBytesCodec, ...] - out_array_array: tuple[ArrayArrayCodec, ...] if filters is None: - out_array_array = () + out_array_array: tuple[ArrayArrayCodec, ...] = () elif filters == "auto": out_array_array = default_array_array else: + maybe_array_array: Iterable[Codec | dict[str, JSON]] if isinstance(filters, dict | Codec): maybe_array_array = (filters,) else: @@ -4108,10 +4105,11 @@ def _parse_chunk_encoding_v3( out_array_bytes = _parse_array_bytes_codec(serializer) if compressors is None: - out_bytes_bytes = () + out_bytes_bytes: tuple[BytesBytesCodec, ...] = () elif compressors == "auto": out_bytes_bytes = default_bytes_bytes else: + maybe_bytes_bytes: Iterable[Codec | dict[str, JSON]] if isinstance(compressors, dict | Codec): maybe_bytes_bytes = (compressors,) else: From 5ea0f3797297c632ae3ef98e679ffc6659d0ef64 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Mon, 6 Jan 2025 18:03:56 +0100 Subject: [PATCH 9/9] typo --- src/zarr/api/synchronous.py | 2 +- src/zarr/core/array.py | 2 +- src/zarr/core/group.py | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index cc4c5c319c..f8bee9fcef 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -803,7 +803,7 @@ def create_array( List of compressors to apply to the array. Compressors are applied in order, and after any filters are applied (if any are specified) and the data is serialized into bytes. - For Zarr format 3, a "compressor" is a codec that takes a bytestrea, and + For Zarr format 3, a "compressor" is a codec that takes a bytestream, and returns another bytestream. Multiple compressors my be provided for Zarr format 3. If no ``compressors`` are provided, a default set of compressors will be used. These defaults can be changed by modifying the value of ``array.v3_default_compressors`` diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index e766366349..915158cb5a 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -3793,7 +3793,7 @@ async def create_array( List of compressors to apply to the array. Compressors are applied in order, and after any filters are applied (if any are specified) and the data is serialized into bytes. - For Zarr format 3, a "compressor" is a codec that takes a bytestrea, and + For Zarr format 3, a "compressor" is a codec that takes a bytestream, and returns another bytestream. Multiple compressors my be provided for Zarr format 3. If no ``compressors`` are provided, a default set of compressors will be used. These defaults can be changed by modifying the value of ``array.v3_default_compressors`` diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 983d38bcea..ebdc63364e 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -1066,7 +1066,7 @@ async def create_array( List of compressors to apply to the array. Compressors are applied in order, and after any filters are applied (if any are specified) and the data is serialized into bytes. - For Zarr format 3, a "compressor" is a codec that takes a bytestrea, and + For Zarr format 3, a "compressor" is a codec that takes a bytestream, and returns another bytestream. Multiple compressors my be provided for Zarr format 3. If no ``compressors`` are provided, a default set of compressors will be used. These defaults can be changed by modifying the value of ``array.v3_default_compressors`` @@ -2321,7 +2321,7 @@ def create_array( List of compressors to apply to the array. Compressors are applied in order, and after any filters are applied (if any are specified) and the data is serialized into bytes. - For Zarr format 3, a "compressor" is a codec that takes a bytestrea, and + For Zarr format 3, a "compressor" is a codec that takes a bytestream, and returns another bytestream. Multiple compressors my be provided for Zarr format 3. If no ``compressors`` are provided, a default set of compressors will be used. These defaults can be changed by modifying the value of ``array.v3_default_compressors`` @@ -2709,7 +2709,7 @@ def array( List of compressors to apply to the array. Compressors are applied in order, and after any filters are applied (if any are specified) and the data is serialized into bytes. - For Zarr format 3, a "compressor" is a codec that takes a bytestrea, and + For Zarr format 3, a "compressor" is a codec that takes a bytestream, and returns another bytestream. Multiple compressors my be provided for Zarr format 3. If no ``compressors`` are provided, a default set of compressors will be used. These defaults can be changed by modifying the value of ``array.v3_default_compressors``