diff --git a/docs/user-guide/arrays.rst b/docs/user-guide/arrays.rst
index 110e12c3be..ba85ce1cda 100644
--- a/docs/user-guide/arrays.rst
+++ b/docs/user-guide/arrays.rst
@@ -168,8 +168,8 @@ argument accepted by all array creation functions. For example::
     >>> data = np.arange(100000000, dtype='int32').reshape(10000, 10000)
     >>> z = zarr.create_array(store='data/example-5.zarr', shape=data.shape, dtype=data.dtype, chunks=(1000, 1000), compressors=compressors)
     >>> z[:] = data
-    >>> z.metadata.codecs
-    [BytesCodec(endian=<Endian.little: 'little'>), BloscCodec(typesize=4, cname=<BloscCname.zstd: 'zstd'>, clevel=3, shuffle=<BloscShuffle.bitshuffle: 'bitshuffle'>, blocksize=0)]
+    >>> z.compressors
+    (BloscCodec(typesize=4, cname=<BloscCname.zstd: 'zstd'>, clevel=3, shuffle=<BloscShuffle.bitshuffle: 'bitshuffle'>, blocksize=0),)
 
 The array above will use Blosc as the primary compressor, using the
 Zstandard algorithm (compression level 3) internally within Blosc, and with the
@@ -188,7 +188,9 @@ which can be used to print useful diagnostics, e.g.::
     Order : C
     Read-only : False
     Store type : LocalStore
-    Codecs : [{'endian': <Endian.little: 'little'>}, {'typesize': 4, 'cname': <BloscCname.zstd: 'zstd'>, 'clevel': 3, 'shuffle': <BloscShuffle.bitshuffle: 'bitshuffle'>, 'blocksize': 0}]
+    Filters : ()
+    Serializer : BytesCodec(endian=<Endian.little: 'little'>)
+    Compressors : (BloscCodec(typesize=4, cname=<BloscCname.zstd: 'zstd'>, clevel=3, shuffle=<BloscShuffle.bitshuffle: 'bitshuffle'>, blocksize=0),)
     No. bytes : 400000000 (381.5M)
 
 The :func:`zarr.Array.info_complete` method inspects the underlying store and
@@ -203,7 +205,9 @@ prints additional diagnostics, e.g.::
     Order : C
     Read-only : False
     Store type : LocalStore
-    Codecs : [{'endian': <Endian.little: 'little'>}, {'typesize': 4, 'cname': <BloscCname.zstd: 'zstd'>, 'clevel': 3, 'shuffle': <BloscShuffle.bitshuffle: 'bitshuffle'>, 'blocksize': 0}]
+    Filters : ()
+    Serializer : BytesCodec(endian=<Endian.little: 'little'>)
+    Compressors : (BloscCodec(typesize=4, cname=<BloscCname.zstd: 'zstd'>, clevel=3, shuffle=<BloscShuffle.bitshuffle: 'bitshuffle'>, blocksize=0),)
     No. bytes : 400000000 (381.5M)
     No. bytes stored : 9696302
     Storage ratio : 41.3
@@ -223,8 +227,8 @@ here is an array using Gzip compression, level 1::
     >>> data = np.arange(100000000, dtype='int32').reshape(10000, 10000)
     >>> z = zarr.create_array(store='data/example-6.zarr', shape=data.shape, dtype=data.dtype, chunks=(1000, 1000), compressors=zarr.codecs.GzipCodec(level=1))
     >>> z[:] = data
-    >>> z.metadata.codecs
-    [BytesCodec(endian=<Endian.little: 'little'>), GzipCodec(level=1)]
+    >>> z.compressors
+    (GzipCodec(level=1),)
 
 Here is an example using LZMA from NumCodecs_ with a custom filter pipeline including LZMA's
 built-in delta filter::
@@ -236,23 +240,24 @@ built-in delta filter::
     >>> compressors = LZMA(filters=lzma_filters)
     >>> data = np.arange(100000000, dtype='int32').reshape(10000, 10000)
     >>> z = zarr.create_array(store='data/example-7.zarr', shape=data.shape, dtype=data.dtype, chunks=(1000, 1000), compressors=compressors)
-    >>> z.metadata.codecs
-    [BytesCodec(endian=<Endian.little: 'little'>), _make_bytes_bytes_codec.<locals>._Codec(codec_name='numcodecs.lzma', codec_config={'id': 'lzma', 'filters': [{'id': 3, 'dist': 4}, {'id': 33, 'preset': 1}]})]
+    >>> z.compressors
+    (_make_bytes_bytes_codec.<locals>._Codec(codec_name='numcodecs.lzma', codec_config={'id': 'lzma', 'filters': [{'id': 3, 'dist': 4}, {'id': 33, 'preset': 1}]}),)
 
 The default compressor can be changed using Zarr's :ref:`user-guide-config`, e.g.::
 
     >>> with zarr.config.set({'array.v2_default_compressor.numeric': {'id': 'blosc'}}):
     ...     z = zarr.create_array(store={}, shape=(100000000,), chunks=(1000000,), dtype='int32', zarr_format=2)
-    >>> z.metadata.filters
-    >>> z.metadata.compressor
-    Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)
+    >>> z.filters
+    ()
+    >>> z.compressors
+    (Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0),)
 
 To disable compression, set ``compressors=None`` when creating an array, e.g.::
 
     >>> z = zarr.create_array(store='data/example-8.zarr', shape=(100000000,), chunks=(1000000,), dtype='int32', compressors=None)
-    >>> z.metadata.codecs
-    [BytesCodec(endian=<Endian.little: 'little'>)]
+    >>> z.compressors
+    ()
 
 .. _user-guide-filters:
 
@@ -287,7 +292,9 @@ Here is an example using a delta filter with the Blosc compressor::
     Order : C
     Read-only : False
     Store type : LocalStore
-    Codecs : [{'codec_name': 'numcodecs.delta', 'codec_config': {'id': 'delta', 'dtype': 'int32'}}, {'endian': <Endian.little: 'little'>}, {'typesize': 4, 'cname': <BloscCname.zstd: 'zstd'>, 'clevel': 1, 'shuffle': <BloscShuffle.shuffle: 'shuffle'>, 'blocksize': 0}]
+    Filters : (_make_array_array_codec.<locals>._Codec(codec_name='numcodecs.delta', codec_config={'id': 'delta', 'dtype': 'int32'}),)
+    Serializer : BytesCodec(endian=<Endian.little: 'little'>)
+    Compressors : (BloscCodec(typesize=4, cname=<BloscCname.zstd: 'zstd'>, clevel=1, shuffle=<BloscShuffle.shuffle: 'shuffle'>, blocksize=0),)
     No. bytes : 400000000 (381.5M)
 
 For more information about available filter codecs, see the `Numcodecs
@@ -600,11 +607,13 @@ Sharded arrays can be created by providing the ``shards`` parameter to :func:`za
     Order : C
     Read-only : False
     Store type : LocalStore
-    Codecs : [{'chunk_shape': (100, 100), 'codecs': ({'endian': <Endian.little: 'little'>}, {'level': 0, 'checksum': False}), 'index_codecs': ({'endian': <Endian.little: 'little'>}, {}), 'index_location': <ShardingCodecIndexLocation.end: 'end'>}]
+    Filters : ()
+    Serializer : BytesCodec(endian=<Endian.little: 'little'>)
+    Compressors : (ZstdCodec(level=0, checksum=False),)
     No. bytes : 100000000 (95.4M)
     No. bytes stored : 3981060
     Storage ratio : 25.1
-    Chunks Initialized : 100
+    Shards Initialized : 100
 
 In this example a shard shape of (1000, 1000) and a chunk shape of (100, 100) is used. This means
 that 10*10 chunks are stored in each shard, and there are 10*10 shards in total.
diff --git a/docs/user-guide/consolidated_metadata.rst b/docs/user-guide/consolidated_metadata.rst
index 511761d34e..3c015dcfca 100644
--- a/docs/user-guide/consolidated_metadata.rst
+++ b/docs/user-guide/consolidated_metadata.rst
@@ -52,8 +52,8 @@ that can be used.:
             chunk_key_encoding=DefaultChunkKeyEncoding(name='default',
                                                        separator='/'),
             fill_value=np.float64(0.0),
-            codecs=[BytesCodec(endian=<Endian.little: 'little'>),
-                    ZstdCodec(level=0, checksum=False)],
+            codecs=(BytesCodec(endian=<Endian.little: 'little'>),
+                    ZstdCodec(level=0, checksum=False)),
             attributes={},
             dimension_names=None,
             zarr_format=3,
@@ -65,8 +65,8 @@ that can be used.:
             chunk_key_encoding=DefaultChunkKeyEncoding(name='default',
                                                        separator='/'),
             fill_value=np.float64(0.0),
-            codecs=[BytesCodec(endian=<Endian.little: 'little'>),
-                    ZstdCodec(level=0, checksum=False)],
+            codecs=(BytesCodec(endian=<Endian.little: 'little'>),
+                    ZstdCodec(level=0, checksum=False)),
             attributes={},
             dimension_names=None,
             zarr_format=3,
@@ -78,8 +78,8 @@ that can be used.:
             chunk_key_encoding=DefaultChunkKeyEncoding(name='default',
                                                        separator='/'),
             fill_value=np.float64(0.0),
-            codecs=[BytesCodec(endian=<Endian.little: 'little'>),
-                    ZstdCodec(level=0, checksum=False)],
+            codecs=(BytesCodec(endian=<Endian.little: 'little'>),
+                    ZstdCodec(level=0, checksum=False)),
             attributes={},
             dimension_names=None,
             zarr_format=3,
diff --git a/docs/user-guide/groups.rst b/docs/user-guide/groups.rst
index 62160ffde5..da5f393246 100644
--- a/docs/user-guide/groups.rst
+++ b/docs/user-guide/groups.rst
@@ -109,7 +109,9 @@ property. E.g.::
     Order : C
     Read-only : False
     Store type : MemoryStore
-    Codecs : [{'endian': <Endian.little: 'little'>}, {'level': 0, 'checksum': False}]
+    Filters : ()
+    Serializer : BytesCodec(endian=<Endian.little: 'little'>)
+    Compressors : (ZstdCodec(level=0, checksum=False),)
     No. bytes : 8000000 (7.6M)
     No. bytes stored : 1432
     Storage ratio : 5586.6
@@ -123,7 +125,9 @@ property. E.g.::
     Order : C
     Read-only : False
     Store type : MemoryStore
-    Codecs : [{'endian': <Endian.little: 'little'>}, {'level': 0, 'checksum': False}]
+    Filters : ()
+    Serializer : BytesCodec(endian=<Endian.little: 'little'>)
+    Compressors : (ZstdCodec(level=0, checksum=False),)
     No. bytes : 4000000 (3.8M)
 
 Groups also have the :func:`zarr.Group.tree` method, e.g.::
diff --git a/docs/user-guide/performance.rst b/docs/user-guide/performance.rst
index f56b642fb1..265bef8efe 100644
--- a/docs/user-guide/performance.rst
+++ b/docs/user-guide/performance.rst
@@ -98,7 +98,9 @@ To use sharding, you need to specify the ``shards`` parameter when creating the
     Order : C
     Read-only : False
     Store type : MemoryStore
-    Codecs : [{'chunk_shape': (100, 100, 100), 'codecs': ({'endian': <Endian.little: 'little'>}, {'level': 0, 'checksum': False}), 'index_codecs': ({'endian': <Endian.little: 'little'>}, {}), 'index_location': <ShardingCodecIndexLocation.end: 'end'>}]
+    Filters : ()
+    Serializer : BytesCodec(endian=<Endian.little: 'little'>)
+    Compressors : (ZstdCodec(level=0, checksum=False),)
     No. bytes : 100000000000 (93.1G)
 
 .. _user-guide-chunks-order:
@@ -125,7 +127,9 @@ ratios, depending on the correlation structure within the data. E.g.::
     Order : C
     Read-only : False
     Store type : MemoryStore
-    Codecs : [{'endian': <Endian.little: 'little'>}, {'level': 0, 'checksum': False}]
+    Filters : ()
+    Serializer : BytesCodec(endian=<Endian.little: 'little'>)
+    Compressors : (ZstdCodec(level=0, checksum=False),)
     No. bytes : 400000000 (381.5M)
     No. bytes stored : 342588717
     Storage ratio : 1.2
@@ -142,7 +146,9 @@ ratios, depending on the correlation structure within the data. E.g.::
     Order : F
     Read-only : False
     Store type : MemoryStore
-    Codecs : [{'endian': <Endian.little: 'little'>}, {'level': 0, 'checksum': False}]
+    Filters : ()
+    Serializer : BytesCodec(endian=<Endian.little: 'little'>)
+    Compressors : (ZstdCodec(level=0, checksum=False),)
     No. bytes : 400000000 (381.5M)
     No. bytes stored : 342588717
     Storage ratio : 1.2
diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py
index 7b3d842832..1a8e6df649 100644
--- a/src/zarr/api/synchronous.py
+++ b/src/zarr/api/synchronous.py
@@ -802,7 +802,7 @@ def create_array(
         Use ``None`` to omit default filters.
     compressors : Iterable[Codec], optional
         List of compressors to apply to the array. Compressors are applied in order, and after any
-        filters are applied (if any are specified).
+        filters are applied (if any are specified) and the data is serialized into bytes.
 
         For Zarr format 3, a "compressor" is a codec that takes a bytestream, and
         returns another bytestream. Multiple compressors may be provided for Zarr format 3.
diff --git a/src/zarr/core/_info.py b/src/zarr/core/_info.py
index 807e940508..845552c8be 100644
--- a/src/zarr/core/_info.py
+++ b/src/zarr/core/_info.py
@@ -5,7 +5,7 @@
 import numcodecs.abc
 import numpy as np
 
-from zarr.abc.codec import Codec
+from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec
 from zarr.core.common import ZarrFormat
 from zarr.core.metadata.v3 import DataType
 
@@ -85,9 +85,9 @@ class ArrayInfo:
     _order: Literal["C", "F"]
     _read_only: bool
     _store_type: str
-    _compressor: numcodecs.abc.Codec | None = None
-    _filters: tuple[numcodecs.abc.Codec, ...] | None = None
-    _codecs: list[Codec] | None = None
+    _filters: tuple[numcodecs.abc.Codec, ...] | tuple[ArrayArrayCodec, ...] = ()
+    _serializer: ArrayBytesCodec | None = None
+    _compressors: tuple[numcodecs.abc.Codec, ...] | tuple[BytesBytesCodec, ...] = ()
     _count_bytes: int | None = None
     _count_bytes_stored: int | None = None
     _count_chunks_initialized: int | None = None
@@ -109,18 +109,19 @@ def __repr__(self) -> str:
         Read-only : {_read_only}
         Store type : {_store_type}""")
 
-        kwargs = dataclasses.asdict(self)
+        # We can't use dataclasses.asdict, because we only want a shallow dict
+        kwargs = {field.name: getattr(self, field.name) for field in dataclasses.fields(self)}
+
         if self._chunk_shape is None:
             # for non-regular chunk grids
             kwargs["chunk_shape"] = ""
 
-        if self._compressor is not None:
-            template += "\nCompressor : {_compressor}"
-        if self._filters is not None:
-            template += "\nFilters : {_filters}"
+        template += "\nFilters : {_filters}"
+
+        if self._serializer is not None:
+            template += "\nSerializer : {_serializer}"
 
-        if self._codecs is not None:
-            template += "\nCodecs : {_codecs}"
+        template += "\nCompressors : {_compressors}"
 
         if self._count_bytes is not None:
             template += "\nNo. bytes : {_count_bytes}"
@@ -139,5 +140,8 @@ def __repr__(self) -> str:
             kwargs["_storage_ratio"] = f"{self._count_bytes / self._count_bytes_stored:.1f}"
 
         if self._count_chunks_initialized is not None:
-            template += "\nChunks Initialized : {_count_chunks_initialized}"
+            if self._shard_shape is not None:
+                template += "\nShards Initialized : {_count_chunks_initialized}"
+            else:
+                template += "\nChunks Initialized : {_count_chunks_initialized}"
         return template.format(**kwargs)
diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py
index 87ec4e48bc..2fa342ce16 100644
--- a/src/zarr/core/array.py
+++ b/src/zarr/core/array.py
@@ -20,6 +20,7 @@
 from warnings import warn
 
 import numcodecs
+import numcodecs.abc
 import numpy as np
 import numpy.typing as npt
 from typing_extensions import deprecated
@@ -911,6 +912,63 @@ def size(self) -> int:
         """
         return np.prod(self.metadata.shape).item()
 
+    @property
+    def filters(self) -> tuple[numcodecs.abc.Codec, ...] | tuple[ArrayArrayCodec, ...]:
+        """
+        Filters that are applied to each chunk of the array, in order, before serializing that
+        chunk to bytes.
+        """
+        if self.metadata.zarr_format == 2:
+            filters = self.metadata.filters
+            if filters is None:
+                return ()
+            return filters
+
+        return tuple(
+            codec for codec in self.metadata.inner_codecs if isinstance(codec, ArrayArrayCodec)
+        )
+
+    @property
+    def serializer(self) -> ArrayBytesCodec | None:
+        """
+        Array-to-bytes codec to use for serializing the chunks into bytes.
+        """
+        if self.metadata.zarr_format == 2:
+            return None
+
+        return next(
+            codec for codec in self.metadata.inner_codecs if isinstance(codec, ArrayBytesCodec)
+        )
+
+    @property
+    @deprecated("Use AsyncArray.compressors instead.")
+    def compressor(self) -> numcodecs.abc.Codec | None:
+        """
+        Compressor that is applied to each chunk of the array.
+
+        .. deprecated:: 3.0.0
+            `array.compressor` is deprecated and will be removed in a future release.
+            Use `array.compressors` instead.
+        """
+        if self.metadata.zarr_format == 2:
+            return self.metadata.compressor
+        raise TypeError("`compressor` is not available for Zarr format 3 arrays.")
+
+    @property
+    def compressors(self) -> tuple[numcodecs.abc.Codec, ...] | tuple[BytesBytesCodec, ...]:
+        """
+        Compressors that are applied to each chunk of the array. Compressors are applied in order, and after any
+        filters are applied (if any are specified) and the data is serialized into bytes.
+ """ + if self.metadata.zarr_format == 2: + if self.metadata.compressor is not None: + return (self.metadata.compressor,) + return () + + return tuple( + codec for codec in self.metadata.inner_codecs if isinstance(codec, BytesBytesCodec) + ) + @property def dtype(self) -> np.dtype[Any]: """Returns the data type of the array. @@ -1561,31 +1619,27 @@ async def info_complete(self) -> Any: def _info( self, count_chunks_initialized: int | None = None, count_bytes_stored: int | None = None ) -> Any: - kwargs: dict[str, Any] = {} - if self.metadata.zarr_format == 2: - assert isinstance(self.metadata, ArrayV2Metadata) - if self.metadata.compressor is not None: - kwargs["_compressor"] = self.metadata.compressor - if self.metadata.filters is not None: - kwargs["_filters"] = self.metadata.filters - kwargs["_data_type"] = self.metadata.dtype - kwargs["_chunk_shape"] = self.metadata.chunks + _data_type: np.dtype[Any] | DataType + if isinstance(self.metadata, ArrayV2Metadata): + _data_type = self.metadata.dtype else: - kwargs["_codecs"] = self.metadata.codecs - kwargs["_data_type"] = self.metadata.data_type - kwargs["_chunk_shape"] = self.chunks - kwargs["_shard_shape"] = self.shards + _data_type = self.metadata.data_type return ArrayInfo( _zarr_format=self.metadata.zarr_format, + _data_type=_data_type, _shape=self.shape, _order=self.order, + _shard_shape=self.shards, + _chunk_shape=self.chunks, _read_only=self.read_only, + _compressors=self.compressors, + _filters=self.filters, + _serializer=self.serializer, _store_type=type(self.store_path.store).__name__, _count_bytes=self.nbytes, _count_bytes_stored=count_bytes_stored, _count_chunks_initialized=count_chunks_initialized, - **kwargs, ) @@ -1967,6 +2021,41 @@ def read_only(self) -> bool: def fill_value(self) -> Any: return self.metadata.fill_value + @property + def filters(self) -> tuple[numcodecs.abc.Codec, ...] | tuple[ArrayArrayCodec, ...]: + """ + Filters that are applied to each chunk of the array, in order, before serializing that + chunk to bytes. + """ + return self._async_array.filters + + @property + def serializer(self) -> None | ArrayBytesCodec: + """ + Array-to-bytes codec to use for serializing the chunks into bytes. + """ + return self._async_array.serializer + + @property + @deprecated("Use Array.compressors instead.") + def compressor(self) -> numcodecs.abc.Codec | None: + """ + Compressor that is applied to each chunk of the array. + + .. deprecated:: 3.0.0 + `array.compressor` is deprecated and will be removed in a future release. + Use `array.compressors` instead. + """ + return self._async_array.compressor + + @property + def compressors(self) -> tuple[numcodecs.abc.Codec, ...] | tuple[BytesBytesCodec, ...]: + """ + Compressors that are applied to each chunk of the array. Compressors are applied in order, and after any + filters are applied (if any are specified) and the data is serialized into bytes. + """ + return self._async_array.compressors + @property def cdata_shape(self) -> ChunkCoords: """ @@ -3710,7 +3799,7 @@ async def create_array( Use ``None`` to omit default filters. compressors : Iterable[Codec], optional List of compressors to apply to the array. Compressors are applied in order, and after any - filters are applied (if any are specified). + filters are applied (if any are specified) and the data is serialized into bytes. For Zarr format 3, a "compressor" is a codec that takes a bytestream, and returns another bytestream. Multiple compressors my be provided for Zarr format 3. 
diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py
index dac2270a53..d100e30492 100644
--- a/src/zarr/core/group.py
+++ b/src/zarr/core/group.py
@@ -1065,7 +1065,7 @@ async def create_array(
         Use ``None`` to omit default filters.
     compressors : Iterable[Codec], optional
         List of compressors to apply to the array. Compressors are applied in order, and after any
-        filters are applied (if any are specified).
+        filters are applied (if any are specified) and the data is serialized into bytes.
 
         For Zarr format 3, a "compressor" is a codec that takes a bytestream, and
         returns another bytestream. Multiple compressors may be provided for Zarr format 3.
@@ -2321,7 +2321,7 @@ def create_array(
         Use ``None`` to omit default filters.
     compressors : Iterable[Codec], optional
         List of compressors to apply to the array. Compressors are applied in order, and after any
-        filters are applied (if any are specified).
+        filters are applied (if any are specified) and the data is serialized into bytes.
 
         For Zarr format 3, a "compressor" is a codec that takes a bytestream, and
         returns another bytestream. Multiple compressors may be provided for Zarr format 3.
@@ -2710,7 +2710,7 @@ def array(
         Use ``None`` to omit default filters.
     compressors : Iterable[Codec], optional
         List of compressors to apply to the array. Compressors are applied in order, and after any
-        filters are applied (if any are specified).
+        filters are applied (if any are specified) and the data is serialized into bytes.
 
         For Zarr format 3, a "compressor" is a codec that takes a bytestream, and
         returns another bytestream. Multiple compressors may be provided for Zarr format 3.
diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py
index 1265c832b2..13a275a6a1 100644
--- a/src/zarr/core/metadata/v3.py
+++ b/src/zarr/core/metadata/v3.py
@@ -81,9 +81,7 @@ def parse_codecs(data: object) -> tuple[Codec, ...]:
     return out
 
 
-def validate_codecs(codecs: tuple[Codec, ...], dtype: DataType) -> None:
-    """Check that the codecs are valid for the given dtype"""
-
+def validate_array_bytes_codec(codecs: tuple[Codec, ...]) -> ArrayBytesCodec:
     # ensure that we have at least one ArrayBytesCodec
     abcs: list[ArrayBytesCodec] = [codec for codec in codecs if isinstance(codec, ArrayBytesCodec)]
     if len(abcs) == 0:
@@ -91,7 +89,18 @@ def validate_codecs(codecs: tuple[Codec, ...], dtype: DataType) -> None:
     elif len(abcs) > 1:
         raise ValueError("Only one ArrayBytesCodec is allowed.")
 
-    abc = abcs[0]
+    return abcs[0]
+
+
+def validate_codecs(codecs: tuple[Codec, ...], dtype: DataType) -> None:
+    """Check that the codecs are valid for the given dtype"""
+    from zarr.codecs.sharding import ShardingCodec
+
+    abc = validate_array_bytes_codec(codecs)
+
+    # Recursively resolve array-bytes codecs within sharding codecs
+    while isinstance(abc, ShardingCodec):
+        abc = validate_array_bytes_codec(abc.codecs)
 
     # we need to have special codecs if we are decoding vlen strings or bytestrings
     # TODO: use codec ID instead of class name
@@ -254,7 +263,7 @@ def __init__(
             config=ArrayConfig.from_dict({}),  # TODO: config is not needed here.
             prototype=default_buffer_prototype(),  # TODO: prototype is not needed here.
         )
-        codecs_parsed = [c.evolve_from_array_spec(array_spec) for c in codecs_parsed_partial]
+        codecs_parsed = tuple(c.evolve_from_array_spec(array_spec) for c in codecs_parsed_partial)
 
         validate_codecs(codecs_parsed_partial, data_type_parsed)
 
         object.__setattr__(self, "shape", shape_parsed)
@@ -330,6 +339,15 @@ def shards(self) -> ChunkCoords | None:
             )
             raise NotImplementedError(msg)
 
+    @property
+    def inner_codecs(self) -> tuple[Codec, ...]:
+        if isinstance(self.chunk_grid, RegularChunkGrid):
+            from zarr.codecs.sharding import ShardingCodec
+
+            if len(self.codecs) == 1 and isinstance(self.codecs[0], ShardingCodec):
+                return self.codecs[0].codecs
+        return self.codecs
+
     def get_chunk_spec(
         self, _chunk_coords: ChunkCoords, array_config: ArrayConfig, prototype: BufferPrototype
     ) -> ArraySpec:
diff --git a/tests/test_array.py b/tests/test_array.py
index 86885514a3..410b2e58d0 100644
--- a/tests/test_array.py
+++ b/tests/test_array.py
@@ -20,7 +20,6 @@
     VLenUTF8Codec,
     ZstdCodec,
 )
-from zarr.codecs.sharding import ShardingCodec
 from zarr.core._info import ArrayInfo
 from zarr.core.array import (
     CompressorsLike,
@@ -494,7 +493,7 @@ def test_info_v2(self, chunks: tuple[int, int], shards: tuple[int, int] | None)
             _read_only=False,
             _store_type="MemoryStore",
             _count_bytes=512,
-            _compressor=numcodecs.Zstd(),
+            _compressors=(numcodecs.Zstd(),),
         )
         assert result == expected
 
@@ -510,9 +509,8 @@ def test_info_v3(self, chunks: tuple[int, int], shards: tuple[int, int] | None)
             _order="C",
             _read_only=False,
             _store_type="MemoryStore",
-            _codecs=[BytesCodec(), ZstdCodec()]
-            if shards is None
-            else [ShardingCodec(chunk_shape=chunks, codecs=[BytesCodec(), ZstdCodec()])],
+            _compressors=(ZstdCodec(),),
+            _serializer=BytesCodec(),
             _count_bytes=512,
         )
         assert result == expected
@@ -536,7 +534,7 @@ def test_info_complete(self, chunks: tuple[int, int], shards: tuple[int, int] |
             _order="C",
             _read_only=False,
             _store_type="MemoryStore",
-            _codecs=[BytesCodec()] if shards is None else [ShardingCodec(chunk_shape=chunks)],
+            _serializer=BytesCodec(),
             _count_bytes=512,
             _count_chunks_initialized=0,
             _count_bytes_stored=373 if shards is None else 578,  # the metadata?
@@ -572,7 +570,7 @@ async def test_info_v2_async(
             _read_only=False,
             _store_type="MemoryStore",
             _count_bytes=512,
-            _compressor=numcodecs.Zstd(),
+            _compressors=(numcodecs.Zstd(),),
         )
         assert result == expected
 
@@ -596,9 +594,8 @@ async def test_info_v3_async(
             _order="C",
             _read_only=False,
             _store_type="MemoryStore",
-            _codecs=[BytesCodec(), ZstdCodec()]
-            if shards is None
-            else [ShardingCodec(chunk_shape=chunks, codecs=[BytesCodec(), ZstdCodec()])],
+            _compressors=(ZstdCodec(),),
+            _serializer=BytesCodec(),
             _count_bytes=512,
         )
         assert result == expected
@@ -624,7 +621,7 @@ async def test_info_complete_async(
             _order="C",
             _read_only=False,
             _store_type="MemoryStore",
-            _codecs=[BytesCodec()] if shards is None else [ShardingCodec(chunk_shape=chunks)],
+            _serializer=BytesCodec(),
             _count_bytes=512,
             _count_chunks_initialized=0,
             _count_bytes_stored=373 if shards is None else 578,  # the metadata?
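The test expectations above lean on the ``inner_codecs`` property introduced in the ``metadata/v3.py`` hunk: for a sharded array the outer codec pipeline is a single ``ShardingCodec``, and ``filters``/``serializer``/``compressors`` report the per-chunk pipeline stored inside it. A sketch of that behaviour (the ``ShardingCodec`` repr is abbreviated, and the default compressor shown is an assumption based on the defaults elsewhere in this diff)::

    >>> z = zarr.create_array(store={}, shape=(100, 100), shards=(20, 20), chunks=(10, 10), dtype='int32')
    >>> z.metadata.codecs    # outer pipeline: one sharding codec
    (ShardingCodec(...),)
    >>> z.compressors        # taken from the codecs inside each shard
    (ZstdCodec(level=0, checksum=False),)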
@@ -839,7 +836,8 @@ def test_array_create_metadata_order_v2(
     arr = zarr.create_array(store=store, shape=(2, 2), order=order, zarr_format=2, dtype="i4")
 
     expected = order or zarr.config.get("array.order")
-    assert arr.metadata.order == expected  # type: ignore[union-attr]
+    assert arr.metadata.zarr_format == 2  # guard for mypy
+    assert arr.metadata.order == expected
 
 
 @pytest.mark.parametrize("order_config", ["C", "F", None])
@@ -1048,10 +1046,15 @@ async def test_create_array_no_filters_compressors(
         compressors=empty_value,
         filters=empty_value,
     )
+    # Test metadata explicitly
+    assert arr.metadata.zarr_format == 2  # guard for mypy
     # The v2 metadata stores None and () separately
-    assert arr.metadata.filters == empty_value  # type: ignore[union-attr]
+    assert arr.metadata.filters == empty_value
     # The v2 metadata does not allow tuple for compressor, therefore it is turned into None
-    assert arr.metadata.compressor is None  # type: ignore[union-attr]
+    assert arr.metadata.compressor is None
+
+    assert arr.filters == ()
+    assert arr.compressors == ()
 
     # v3
     arr = await create_array(
@@ -1061,10 +1064,13 @@ async def test_create_array_no_filters_compressors(
         compressors=empty_value,
         filters=empty_value,
     )
+    assert arr.metadata.zarr_format == 3  # guard for mypy
     if dtype == "str":
-        assert arr.metadata.codecs == [VLenUTF8Codec()]  # type: ignore[union-attr]
+        assert arr.metadata.codecs == (VLenUTF8Codec(),)
+        assert arr.serializer == VLenUTF8Codec()
     else:
-        assert arr.metadata.codecs == [BytesCodec()]  # type: ignore[union-attr]
+        assert arr.metadata.codecs == (BytesCodec(),)
+        assert arr.serializer == BytesCodec()
 
 
 @pytest.mark.parametrize("store", ["memory"], indirect=True)
@@ -1116,8 +1122,14 @@ async def test_create_array_no_filters_compressors(
         ({"name": "transpose", "configuration": {"order": [0]}},),
     ],
 )
+@pytest.mark.parametrize(("chunks", "shards"), [((6,), None), ((3,), (6,))])
 async def test_create_array_v3_chunk_encoding(
-    store: MemoryStore, compressors: CompressorsLike, filters: FiltersLike, dtype: str
+    store: MemoryStore,
+    compressors: CompressorsLike,
+    filters: FiltersLike,
+    dtype: str,
+    chunks: tuple[int, ...],
+    shards: tuple[int, ...] | None,
 ) -> None:
     """
     Test various possibilities for the compressors and filters parameter to create_array
     """
     arr = await create_array(
         store=store,
         dtype=dtype,
-        shape=(10,),
+        shape=(12,),
+        chunks=chunks,
+        shards=shards,
         zarr_format=3,
         filters=filters,
         compressors=compressors,
     )
-    aa_codecs_expected, _, bb_codecs_expected = _parse_chunk_encoding_v3(
+    filters_expected, _, compressors_expected = _parse_chunk_encoding_v3(
         filters=filters, compressors=compressors, serializer="auto", dtype=np.dtype(dtype)
     )
-    # TODO: find a better way to get the filters / compressors from the array.
-    assert arr.codec_pipeline.array_array_codecs == aa_codecs_expected  # type: ignore[attr-defined]
-    assert arr.codec_pipeline.bytes_bytes_codecs == bb_codecs_expected  # type: ignore[attr-defined]
+    assert arr.filters == filters_expected
+    assert arr.compressors == compressors_expected
 
 
 @pytest.mark.parametrize("store", ["memory"], indirect=True)
@@ -1167,9 +1180,16 @@ async def test_create_array_v2_chunk_encoding(
     filters_expected, compressor_expected = _parse_chunk_encoding_v2(
         filters=filters, compressor=compressors, dtype=np.dtype(dtype)
     )
-    # TODO: find a better way to get the filters/compressor from the array.
-    assert arr.metadata.compressor == compressor_expected  # type: ignore[union-attr]
-    assert arr.metadata.filters == filters_expected  # type: ignore[union-attr]
+    assert arr.metadata.zarr_format == 2  # guard for mypy
+    assert arr.metadata.compressor == compressor_expected
+    assert arr.metadata.filters == filters_expected
+
+    # Normalize for property getters
+    compressor_expected = () if compressor_expected is None else (compressor_expected,)
+    filters_expected = () if filters_expected is None else filters_expected
+
+    assert arr.compressors == compressor_expected
+    assert arr.filters == filters_expected
 
 
 @pytest.mark.parametrize("store", ["memory"], indirect=True)
@@ -1185,12 +1205,12 @@ async def test_create_array_v3_default_filters_compressors(store: MemoryStore, d
         shape=(10,),
         zarr_format=3,
     )
-    expected_aa, expected_ab, expected_bb = _get_default_chunk_encoding_v3(np_dtype=np.dtype(dtype))
-    # TODO: define the codec pipeline class such that these fields are required, which will obviate the
-    # type ignore statements
-    assert arr.codec_pipeline.array_array_codecs == expected_aa  # type: ignore[attr-defined]
-    assert arr.codec_pipeline.bytes_bytes_codecs == expected_bb  # type: ignore[attr-defined]
-    assert arr.codec_pipeline.array_bytes_codec == expected_ab  # type: ignore[attr-defined]
+    expected_filters, expected_serializer, expected_compressors = _get_default_chunk_encoding_v3(
+        np_dtype=np.dtype(dtype)
+    )
+    assert arr.filters == expected_filters
+    assert arr.serializer == expected_serializer
+    assert arr.compressors == expected_compressors
 
 
 @pytest.mark.parametrize("store", ["memory"], indirect=True)
@@ -1209,8 +1229,15 @@ async def test_create_array_v2_default_filters_compressors(store: MemoryStore, d
     expected_filters, expected_compressors = _get_default_chunk_encoding_v2(
         np_dtype=np.dtype(dtype)
     )
-    assert arr.metadata.filters == expected_filters  # type: ignore[union-attr]
-    assert arr.metadata.compressor == expected_compressors  # type: ignore[union-attr]
+    assert arr.metadata.zarr_format == 2  # guard for mypy
+    assert arr.metadata.filters == expected_filters
+    assert arr.metadata.compressor == expected_compressors
+
+    # Normalize for property getters
+    expected_filters = () if expected_filters is None else expected_filters
+    expected_compressors = () if expected_compressors is None else (expected_compressors,)
+    assert arr.filters == expected_filters
+    assert arr.compressors == expected_compressors
 
 
 @pytest.mark.parametrize("store", ["memory"], indirect=True)
diff --git a/tests/test_config.py b/tests/test_config.py
index 20e3c6044f..ca65c62166 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -305,12 +305,12 @@ class NewCodec2(BytesCodec):
 @pytest.mark.parametrize(
     ("dtype", "expected_codecs"),
     [
-        ("int", [BytesCodec(), GzipCodec()]),
-        ("bytes", [VLenBytesCodec(), GzipCodec()]),
-        ("str", [VLenUTF8Codec(), GzipCodec()]),
+        ("int", (BytesCodec(), GzipCodec())),
+        ("bytes", (VLenBytesCodec(), GzipCodec())),
+        ("str", (VLenUTF8Codec(), GzipCodec())),
     ],
 )
-async def test_default_codecs(dtype: str, expected_codecs: list[Codec]) -> None:
+async def test_default_codecs(dtype: str, expected_codecs: tuple[Codec, ...]) -> None:
     with config.set(
         {
             "array.v3_default_codecs": {  # test setting non-standard codecs
diff --git a/tests/test_info.py b/tests/test_info.py
index 5d9264aa13..db0fd0ef76 100644
--- a/tests/test_info.py
+++ b/tests/test_info.py
@@ -59,7 +59,7 @@ def test_array_info(zarr_format: ZarrFormat) -> None:
         _order="C",
         _read_only=True,
         _store_type="MemoryStore",
-        _codecs=[BytesCodec()],
+        _serializer=BytesCodec(),
     )
     result = repr(info)
     assert result == textwrap.dedent(f"""\
@@ -71,7 +71,9 @@ def test_array_info(zarr_format: ZarrFormat) -> None:
         Order : C
         Read-only : True
         Store type : MemoryStore
-        Codecs : [{{'endian': <Endian.little: 'little'>}}]""")
+        Filters : ()
+        Serializer : BytesCodec(endian=<Endian.little: 'little'>)
+        Compressors : ()""")
 
 
 @pytest.mark.parametrize("zarr_format", ZARR_FORMATS)
@@ -95,7 +97,7 @@ def test_array_info_complete(
         _order="C",
         _read_only=True,
         _store_type="MemoryStore",
-        _codecs=[BytesCodec()],
+        _serializer=BytesCodec(),
         _count_bytes=count_bytes,
         _count_bytes_stored=count_bytes_stored,
         _count_chunks_initialized=count_chunks_initialized,
@@ -110,7 +112,9 @@ def test_array_info_complete(
         Order : C
         Read-only : True
        Store type : MemoryStore
-        Codecs : [{{'endian': <Endian.little: 'little'>}}]
+        Filters : ()
+        Serializer : BytesCodec(endian=<Endian.little: 'little'>)
+        Compressors : ()
         No. bytes : {count_bytes} ({count_bytes_formatted})
         No. bytes stored : {count_bytes_stored_formatted}
         Storage ratio : {storage_ratio_formatted}
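Finally, the deprecation path for the old v2-style accessor can be exercised directly. A sketch (the ``Zstd(level=0)`` default for v2 arrays is inferred from the ``test_info_v2`` expectations above, and ``@deprecated`` from ``typing_extensions`` emits a ``DeprecationWarning`` when the property is read)::

    >>> z2 = zarr.create_array(store={}, shape=(10,), chunks=(10,), dtype='int32', zarr_format=2)
    >>> z2.compressor        # DeprecationWarning: Use Array.compressors instead.
    Zstd(level=0)
    >>> z2.compressors       # preferred: always a tuple, for both Zarr formats
    (Zstd(level=0),)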