diff --git a/src/zarr/__init__.py b/src/zarr/__init__.py index 51116a929e..bcbdaf7c19 100644 --- a/src/zarr/__init__.py +++ b/src/zarr/__init__.py @@ -6,6 +6,8 @@ copy_all, copy_store, create, + create_array, + create_group, empty, empty_like, full, @@ -46,6 +48,8 @@ "copy_all", "copy_store", "create", + "create_array", + "create_group", "empty", "empty_like", "full", diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index c4d1ec8627..75c043fc1a 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -9,8 +9,8 @@ import numpy.typing as npt from typing_extensions import deprecated -from zarr.core.array import Array, AsyncArray, get_array_metadata -from zarr.core.array_spec import ArrayConfig, ArrayConfigParams +from zarr.core.array import Array, AsyncArray, create_array, get_array_metadata +from zarr.core.array_spec import ArrayConfig, ArrayConfigLike from zarr.core.buffer import NDArrayLike from zarr.core.common import ( JSON, @@ -18,14 +18,14 @@ ChunkCoords, MemoryOrder, ZarrFormat, + _default_zarr_format, _warn_order_kwarg, _warn_write_empty_chunks_kwarg, parse_dtype, ) -from zarr.core.config import config from zarr.core.group import AsyncGroup, ConsolidatedMetadata, GroupMetadata from zarr.core.metadata import ArrayMetadataDict, ArrayV2Metadata, ArrayV3Metadata -from zarr.core.metadata.v2 import _default_filters_and_compressor +from zarr.core.metadata.v2 import _default_compressor, _default_filters from zarr.errors import NodeTypeValidationError from zarr.storage import ( StoreLike, @@ -49,6 +49,7 @@ "copy_all", "copy_store", "create", + "create_array", "empty", "empty_like", "full", @@ -150,11 +151,6 @@ def _handle_zarr_version_or_format( return zarr_format -def _default_zarr_version() -> ZarrFormat: - """Return the default zarr_version""" - return cast(ZarrFormat, int(config.get("default_zarr_version", 3))) - - async def consolidate_metadata( store: StoreLike, path: str | None = None, @@ -300,8 +296,8 @@ async def 
open( path : str or None, optional The path within the store to open. storage_options : dict - If using an fsspec URL to create the store, these will be passed to - the backend implementation. Ignored otherwise. + If the store is backed by an fsspec-based implementation, then this dict will be passed to + the Store constructor for that implementation. Ignored otherwise. **kwargs Additional parameters are passed through to :func:`zarr.creation.open_array` or :func:`zarr.hierarchy.open_group`. @@ -417,7 +413,7 @@ async def save_array( """ zarr_format = ( _handle_zarr_version_or_format(zarr_version=zarr_version, zarr_format=zarr_format) - or _default_zarr_version() + or _default_zarr_format() ) if not isinstance(arr, NDArrayLike): raise TypeError("arr argument must be numpy or other NDArrayLike array") @@ -429,7 +425,7 @@ async def save_array( shape = arr.shape chunks = getattr(arr, "chunks", None) # for array-likes with chunks attribute overwrite = kwargs.pop("overwrite", None) or _infer_overwrite(mode) - new = await AsyncArray.create( + new = await AsyncArray._create( store_path, zarr_format=zarr_format, shape=shape, @@ -477,7 +473,7 @@ async def save_group( zarr_version=zarr_version, zarr_format=zarr_format, ) - or _default_zarr_version() + or _default_zarr_format() ) for arg in args: @@ -657,7 +653,7 @@ async def group( try: return await AsyncGroup.open(store=store_path, zarr_format=zarr_format) except (KeyError, FileNotFoundError): - _zarr_format = zarr_format or _default_zarr_version() + _zarr_format = zarr_format or _default_zarr_format() return await AsyncGroup.from_store( store=store_path, zarr_format=_zarr_format, @@ -666,6 +662,56 @@ async def group( ) +async def create_group( + *, + store: StoreLike, + path: str | None = None, + overwrite: bool = False, + zarr_format: ZarrFormat | None = None, + attributes: dict[str, Any] | None = None, + storage_options: dict[str, Any] | None = None, +) -> AsyncGroup: + """Create a group. 
+ + Parameters + ---------- + store : Store or str + Store or path to directory in file system. + path : str, optional + Group path within store. + overwrite : bool, optional + If True, pre-existing data at ``path`` will be deleted before + creating the group. + zarr_format : {2, 3, None}, optional + The zarr format to use when saving. + If no ``zarr_format`` is provided, the default format will be used. + This default can be changed by modifying the value of ``default_zarr_format`` + in :mod:`zarr.core.config`. + storage_options : dict + If using an fsspec URL to create the store, these will be passed to + the backend implementation. Ignored otherwise. + + Returns + ------- + AsyncGroup + The new group. + """ + + if zarr_format is None: + zarr_format = _default_zarr_format() + + mode: Literal["a"] = "a" + + store_path = await make_store_path(store, path=path, mode=mode, storage_options=storage_options) + + return await AsyncGroup.from_store( + store=store_path, + zarr_format=zarr_format, + overwrite=overwrite, + attributes=attributes, + ) + + async def open_group( store: StoreLike | None = None, *, # Note: this is a change from v2 @@ -768,7 +814,7 @@ async def open_group( pass if mode in _CREATE_MODES: overwrite = _infer_overwrite(mode) - _zarr_format = zarr_format or _default_zarr_version() + _zarr_format = zarr_format or _default_zarr_format() return await AsyncGroup.from_store( store_path, zarr_format=_zarr_format, @@ -813,7 +859,7 @@ async def create( codecs: Iterable[Codec | dict[str, JSON]] | None = None, dimension_names: Iterable[str] | None = None, storage_options: dict[str, Any] | None = None, - config: ArrayConfig | ArrayConfigParams | None = None, + config: ArrayConfig | ArrayConfigLike | None = None, **kwargs: Any, ) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: """Create an array. 
@@ -843,8 +889,8 @@ async def create( If no codecs are provided, default codecs will be used: - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``. - - For Unicode strings, the default is ``VLenUTF8Codec``. - - For bytes or objects, the default is ``VLenBytesCodec``. + - For Unicode strings, the default is ``VLenUTF8Codec`` and ``ZstdCodec``. + - For bytes or objects, the default is ``VLenBytesCodec`` and ``ZstdCodec``. These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. compressor : Codec, optional @@ -857,7 +903,8 @@ async def create( - For Unicode strings, the default is ``VLenUTF8Codec``. - For bytes or objects, the default is ``VLenBytesCodec``. - These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`. fill_value : object + These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`. + fill_value : object Default value to use for uninitialized portions of the array. order : {'C', 'F'}, optional Deprecated in favor of the ``config`` keyword argument. @@ -878,8 +925,8 @@ async def create( for storage of both chunks and metadata. filters : sequence of Codecs, optional Sequence of filters to use to encode chunk data prior to compression. - V2 only. If neither ``compressor`` nor ``filters`` are provided, a default - compressor will be used. (see ``compressor`` for details). + V2 only. If no ``filters`` are provided, a default set of filters will be used. + These defaults can be changed by modifying the value of ``array.v2_default_filters`` in :mod:`zarr.core.config`. cache_metadata : bool, optional If True, array configuration metadata will be cached for the lifetime of the object. If False, array metadata will be reloaded @@ -914,7 +961,7 @@ async def create( storage_options : dict If using an fsspec URL to create the store, these will be passed to the backend implementation. 
Ignored otherwise. - config : ArrayConfig or ArrayConfigParams, optional + config : ArrayConfig or ArrayConfigLike, optional Runtime configuration of the array. If provided, will override the default values from `zarr.config.array`. @@ -925,15 +972,17 @@ async def create( """ zarr_format = ( _handle_zarr_version_or_format(zarr_version=zarr_version, zarr_format=zarr_format) - or _default_zarr_version() + or _default_zarr_format() ) if zarr_format == 2: if chunks is None: chunks = shape dtype = parse_dtype(dtype, zarr_format) - if not filters and not compressor: - filters, compressor = _default_filters_and_compressor(dtype) + if not filters: + filters = _default_filters(dtype) + if not compressor: + compressor = _default_compressor(dtype) elif zarr_format == 3 and chunk_shape is None: # type: ignore[redundant-expr] if chunks is not None: chunk_shape = chunks @@ -971,7 +1020,7 @@ async def create( mode = "a" store_path = await make_store_path(store, path=path, mode=mode, storage_options=storage_options) - config_dict: ArrayConfigParams = {} + config_dict: ArrayConfigLike = {} if write_empty_chunks is not None: if config is not None: @@ -994,7 +1043,7 @@ async def create( config_parsed = ArrayConfig.from_dict(config_dict) - return await AsyncArray.create( + return await AsyncArray._create( store_path, shape=shape, chunks=chunks, @@ -1173,7 +1222,7 @@ async def open_array( If using an fsspec URL to create the store, these will be passed to the backend implementation. Ignored otherwise. **kwargs - Any keyword arguments to pass to ``create``. + Any keyword arguments to pass to :func:`create`. 
Returns ------- @@ -1196,7 +1245,7 @@ async def open_array( except FileNotFoundError: if not store_path.read_only and mode in _CREATE_MODES: overwrite = _infer_overwrite(mode) - _zarr_format = zarr_format or _default_zarr_version() + _zarr_format = zarr_format or _default_zarr_format() return await create( store=store_path, zarr_format=_zarr_format, diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index cd1ef8b38d..52815748ad 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -5,6 +5,7 @@ from typing_extensions import deprecated import zarr.api.asynchronous as async_api +import zarr.core.array from zarr._compat import _deprecate_positional_args from zarr.core.array import Array, AsyncArray from zarr.core.group import Group @@ -17,10 +18,23 @@ from zarr.abc.codec import Codec from zarr.api.asynchronous import ArrayLike, PathLike - from zarr.core.array_spec import ArrayConfig, ArrayConfigParams + from zarr.core.array import ( + CompressorsLike, + FiltersLike, + SerializerLike, + ShardsLike, + ) + from zarr.core.array_spec import ArrayConfig, ArrayConfigLike from zarr.core.buffer import NDArrayLike - from zarr.core.chunk_key_encodings import ChunkKeyEncoding - from zarr.core.common import JSON, AccessModeLiteral, ChunkCoords, MemoryOrder, ZarrFormat + from zarr.core.chunk_key_encodings import ChunkKeyEncoding, ChunkKeyEncodingLike + from zarr.core.common import ( + JSON, + AccessModeLiteral, + ChunkCoords, + MemoryOrder, + ShapeLike, + ZarrFormat, + ) from zarr.storage import StoreLike __all__ = [ @@ -30,6 +44,7 @@ "copy_all", "copy_store", "create", + "create_array", "empty", "empty_like", "full", @@ -523,6 +538,54 @@ def open_group( ) +def create_group( + store: StoreLike, + *, + path: str | None = None, + zarr_format: ZarrFormat | None = None, + overwrite: bool = False, + attributes: dict[str, Any] | None = None, + storage_options: dict[str, Any] | None = None, +) -> Group: + """Create a group. 
+ + Parameters + ---------- + store : Store or str + Store or path to directory in file system. + path : str, optional + Group path within store. + overwrite : bool, optional + If True, pre-existing data at ``path`` will be deleted before + creating the group. + zarr_format : {2, 3, None}, optional + The zarr format to use when saving. + If no ``zarr_format`` is provided, the default format will be used. + This default can be changed by modifying the value of ``default_zarr_format`` + in :mod:`zarr.core.config`. + storage_options : dict + If using an fsspec URL to create the store, these will be passed to + the backend implementation. Ignored otherwise. + + Returns + ------- + Group + The new group. + """ + return Group( + sync( + async_api.create_group( + store=store, + path=path, + overwrite=overwrite, + storage_options=storage_options, + zarr_format=zarr_format, + attributes=attributes, + ) + ) + ) + + # TODO: add type annotations for kwargs def create( shape: ChunkCoords | int, @@ -559,7 +622,7 @@ def create( codecs: Iterable[Codec | dict[str, JSON]] | None = None, dimension_names: Iterable[str] | None = None, storage_options: dict[str, Any] | None = None, - config: ArrayConfig | ArrayConfigParams | None = None, + config: ArrayConfig | ArrayConfigLike | None = None, **kwargs: Any, ) -> Array: """Create an array. @@ -629,7 +692,7 @@ def create( storage_options : dict If using an fsspec URL to create the store, these will be passed to the backend implementation. Ignored otherwise. - config : ArrayConfig or ArrayConfigParams, optional + config : ArrayConfig or ArrayConfigLike, optional Runtime configuration of the array. If provided, will override the default values from `zarr.config.array`. 
@@ -675,6 +738,160 @@ def create( ) +def create_array( + store: str | StoreLike, + *, + name: str | None = None, + shape: ShapeLike, + dtype: npt.DTypeLike, + chunks: ChunkCoords | Literal["auto"] = "auto", + shards: ShardsLike | None = None, + filters: FiltersLike = "auto", + compressors: CompressorsLike = "auto", + serializer: SerializerLike = "auto", + fill_value: Any | None = None, + order: MemoryOrder | None = None, + zarr_format: ZarrFormat | None = 3, + attributes: dict[str, JSON] | None = None, + chunk_key_encoding: ChunkKeyEncoding | ChunkKeyEncodingLike | None = None, + dimension_names: Iterable[str] | None = None, + storage_options: dict[str, Any] | None = None, + overwrite: bool = False, + config: ArrayConfig | ArrayConfigLike | None = None, +) -> Array: + """Create an array. + + This function wraps :func:`zarr.core.array.create_array`. + + Parameters + ---------- + store : str or Store + Store or path to directory in file system or name of zip file. + name : str or None, optional + The name of the array within the store. If ``name`` is ``None``, the array will be located + at the root of the store. + shape : ChunkCoords + Shape of the array. + dtype : npt.DTypeLike + Data type of the array. + chunks : ChunkCoords, optional + Chunk shape of the array. + If not specified, default are guessed based on the shape and dtype. + shards : ChunkCoords, optional + Shard shape of the array. The default value of ``None`` results in no sharding at all. + filters : Iterable[Codec], optional + Iterable of filters to apply to each chunk of the array, in order, before serializing that + chunk to bytes. + + For Zarr v3, a "filter" is a codec that takes an array and returns an array, + and these values must be instances of ``ArrayArrayCodec``, or dict representations + of ``ArrayArrayCodec``. + If ``filters`` and ``compressors`` are not specified, then the default codecs for + Zarr v3 will be used. 
+ These defaults can be changed by modifying the value of ``array.v3_default_codecs`` + in :mod:`zarr.core.config`. + Use ``None`` to omit default filters. + + For Zarr v2, a "filter" can be any numcodecs codec; you should ensure that the + order of your filters is consistent with the behavior of each filter. + If no ``filters`` are provided, a default set of filters will be used. + These defaults can be changed by modifying the value of ``array.v2_default_filters`` + in :mod:`zarr.core.config`. + Use ``None`` to omit default filters. + compressors : Iterable[Codec], optional + List of compressors to apply to the array. Compressors are applied in order, and after any + filters are applied (if any are specified). + + For Zarr v3, a "compressor" is a codec that takes a bytestream, and + returns another bytestream. Multiple compressors may be provided for Zarr v3. + If ``filters`` and ``compressors`` are not specified, then the default codecs for + Zarr v3 will be used. + These defaults can be changed by modifying the value of ``array.v3_default_codecs`` + in :mod:`zarr.core.config`. + Use ``None`` to omit default compressors. + + For Zarr v2, a "compressor" can be any numcodecs codec. Only a single compressor may + be provided for Zarr v2. + If no ``compressors`` are provided, a default compressor will be used. + These defaults can be changed by modifying the value of ``array.v2_default_compressor`` + in :mod:`zarr.core.config`. + Use ``None`` to omit the default compressor. + serializer : dict[str, JSON] | ArrayBytesCodec, optional + Array-to-bytes codec to use for encoding the array data. + Zarr v3 only. Zarr v2 arrays use implicit array-to-bytes conversion. + If no ``serializer`` is provided, the `zarr.codecs.BytesCodec` codec will be used. + fill_value : Any, optional + Fill value for the array. + order : {"C", "F"}, optional + The memory order of the array (default is "C"). + For Zarr v2, this parameter sets the memory order of the array.
+ For Zarr v3, this parameter is deprecated, because memory order + is a runtime parameter for Zarr v3 arrays. The recommended way to specify the memory + order for Zarr v3 arrays is via the ``config`` parameter, e.g. ``{'order': 'C'}``. + If no ``order`` is provided, a default order will be used. + This default can be changed by modifying the value of ``array.order`` in :mod:`zarr.core.config`. + zarr_format : {2, 3}, optional + The zarr format to use when saving. + attributes : dict, optional + Attributes for the array. + chunk_key_encoding : ChunkKeyEncoding, optional + A specification of how the chunk keys are represented in storage. + For Zarr v3, the default is ``{"name": "default", "separator": "/"}``. + For Zarr v2, the default is ``{"name": "v2", "separator": "."}``. + dimension_names : Iterable[str], optional + The names of the dimensions (default is None). + Zarr v3 only. Zarr v2 arrays should not use this parameter. + storage_options : dict, optional + If using an fsspec URL to create the store, these will be passed to the backend implementation. + Ignored otherwise. + overwrite : bool, default False + Whether to overwrite an array with the same name in the store, if one exists. + config : ArrayConfig or ArrayConfigLike, optional + Runtime configuration for the array. + + Returns + ------- + Array + The array.
+ + Examples + -------- + >>> import zarr + >>> store = zarr.storage.MemoryStore(mode='w') + >>> arr = zarr.create_array( + >>> store=store, + >>> shape=(100,100), + >>> chunks=(10,10), + >>> dtype='i4', + >>> fill_value=0) + + """ + return Array( + sync( + zarr.core.array.create_array( + store, + name=name, + shape=shape, + dtype=dtype, + chunks=chunks, + shards=shards, + filters=filters, + compressors=compressors, + serializer=serializer, + fill_value=fill_value, + order=order, + zarr_format=zarr_format, + attributes=attributes, + chunk_key_encoding=chunk_key_encoding, + dimension_names=dimension_names, + storage_options=storage_options, + overwrite=overwrite, + config=config, + ) + ) + ) + + # TODO: add type annotations for kwargs def empty(shape: ChunkCoords, **kwargs: Any) -> Array: """Create an empty array. diff --git a/src/zarr/core/_info.py b/src/zarr/core/_info.py index 4708967390..12bcc02e96 100644 --- a/src/zarr/core/_info.py +++ b/src/zarr/core/_info.py @@ -6,6 +6,7 @@ import numpy as np from zarr.abc.codec import Codec +from zarr.core.common import ZarrFormat from zarr.core.metadata.v3 import DataType @@ -20,7 +21,7 @@ class GroupInfo: _name: str _type: Literal["Group"] = "Group" - _zarr_format: Literal[2, 3] + _zarr_format: ZarrFormat _read_only: bool _store_type: str _count_members: int | None = None @@ -76,7 +77,7 @@ class ArrayInfo: """ _type: Literal["Array"] = "Array" - _zarr_format: Literal[2, 3] + _zarr_format: ZarrFormat _data_type: np.dtype[Any] | DataType _shape: tuple[int, ...] _chunk_shape: tuple[int, ...]
| None = None diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 717eff36dc..0a5b5f085a 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -1,21 +1,35 @@ from __future__ import annotations import json +import warnings from asyncio import gather +from collections.abc import Iterable from dataclasses import dataclass, field from itertools import starmap from logging import getLogger -from typing import TYPE_CHECKING, Any, Generic, Literal, cast, overload +from typing import ( + TYPE_CHECKING, + Any, + Generic, + Literal, + TypeAlias, + TypedDict, + cast, + overload, +) from warnings import warn +import numcodecs import numpy as np import numpy.typing as npt +from typing_extensions import deprecated from zarr._compat import _deprecate_positional_args +from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec from zarr.abc.store import Store, set_or_delete from zarr.codecs._v2 import V2Codec from zarr.core._info import ArrayInfo -from zarr.core.array_spec import ArrayConfig, ArrayConfigParams, normalize_array_config +from zarr.core.array_spec import ArrayConfig, ArrayConfigLike, parse_array_config from zarr.core.attributes import Attributes from zarr.core.buffer import ( BufferPrototype, @@ -23,9 +37,10 @@ NDBuffer, default_buffer_prototype, ) -from zarr.core.chunk_grids import RegularChunkGrid, normalize_chunks +from zarr.core.chunk_grids import RegularChunkGrid, _auto_partition, normalize_chunks from zarr.core.chunk_key_encodings import ( ChunkKeyEncoding, + ChunkKeyEncodingLike, DefaultChunkKeyEncoding, V2ChunkKeyEncoding, ) @@ -38,6 +53,7 @@ MemoryOrder, ShapeLike, ZarrFormat, + _default_zarr_format, _warn_order_kwarg, concurrent_map, parse_dtype, @@ -80,21 +96,34 @@ ArrayV3MetadataDict, T_ArrayMetadata, ) -from zarr.core.metadata.v2 import _default_filters_and_compressor +from zarr.core.metadata.v2 import ( + _default_compressor, + _default_filters, + parse_compressor, + parse_filters, +) from 
zarr.core.metadata.v3 import DataType, parse_node_type_array from zarr.core.sync import sync from zarr.errors import MetadataValidationError -from zarr.registry import get_pipeline_class +from zarr.registry import ( + _parse_array_array_codec, + _parse_array_bytes_codec, + _parse_bytes_bytes_codec, + _resolve_codec, + get_pipeline_class, +) from zarr.storage import StoreLike, make_store_path from zarr.storage.common import StorePath, ensure_no_existing_node if TYPE_CHECKING: - from collections.abc import Iterable, Iterator, Sequence + from collections.abc import Iterator, Sequence from typing import Self - from zarr.abc.codec import Codec, CodecPipeline + from zarr.abc.codec import CodecPipeline + from zarr.codecs.sharding import ShardingCodecIndexLocation from zarr.core.group import AsyncGroup + # Array and AsyncArray are defined in the base ``zarr`` namespace __all__ = ["create_codec_pipeline", "parse_array_metadata"] @@ -149,9 +178,9 @@ async def get_array_metadata( (store_path / ZATTRS_JSON).get(), ) if zarr_json_bytes is not None and zarray_bytes is not None: - # TODO: revisit this exception type - # alternatively, we could warn and favor v3 - raise ValueError("Both zarr.json and .zarray objects exist") + # warn and favor v3 + msg = f"Both zarr.json (Zarr v3) and .zarray (Zarr v2) metadata objects exist at {store_path}. Zarr v3 will be used." + warnings.warn(msg, stacklevel=1) if zarr_json_bytes is None and zarray_bytes is None: raise FileNotFoundError(store_path) # set zarr_format based on which keys were found @@ -273,7 +302,7 @@ async def create( # runtime overwrite: bool = False, data: npt.ArrayLike | None = None, - config: ArrayConfig | ArrayConfigParams | None = None, + config: ArrayConfig | ArrayConfigLike | None = None, ) -> AsyncArray[ArrayV2Metadata]: ... 
# this overload defines the function signature when zarr_format is 3 @@ -302,7 +331,7 @@ async def create( # runtime overwrite: bool = False, data: npt.ArrayLike | None = None, - config: ArrayConfig | ArrayConfigParams | None = None, + config: ArrayConfig | ArrayConfigLike | None = None, ) -> AsyncArray[ArrayV3Metadata]: ... @overload @@ -330,8 +359,9 @@ async def create( # runtime overwrite: bool = False, data: npt.ArrayLike | None = None, - config: ArrayConfig | ArrayConfigParams | None = None, + config: ArrayConfig | ArrayConfigLike | None = None, ) -> AsyncArray[ArrayV3Metadata]: ... + @overload @classmethod async def create( @@ -363,10 +393,12 @@ async def create( # runtime overwrite: bool = False, data: npt.ArrayLike | None = None, - config: ArrayConfig | ArrayConfigParams | None = None, + config: ArrayConfig | ArrayConfigLike | None = None, ) -> AsyncArray[ArrayV3Metadata] | AsyncArray[ArrayV2Metadata]: ... @classmethod + @deprecated("Use zarr.api.asynchronous.create_array instead.") + @_deprecate_positional_args async def create( cls, store: StoreLike, @@ -396,10 +428,9 @@ async def create( # runtime overwrite: bool = False, data: npt.ArrayLike | None = None, - config: ArrayConfig | ArrayConfigParams | None = None, + config: ArrayConfig | ArrayConfigLike | None = None, ) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: - """ - Method to create a new asynchronous array instance. + """Method to create a new asynchronous array instance. Parameters ---------- @@ -431,8 +462,8 @@ async def create( If no codecs are provided, default codecs will be used: - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``. - - For Unicode strings, the default is ``VLenUTF8Codec``. - - For bytes or objects, the default is ``VLenBytesCodec``. + - For Unicode strings, the default is ``VLenUTF8Codec`` and ``ZstdCodec``. + - For bytes or objects, the default is ``VLenBytesCodec`` and ``ZstdCodec``. 
These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. dimension_names : Iterable[str], optional @@ -453,14 +484,14 @@ async def create( order for Zarr 3 arrays is via the ``config`` parameter, e.g. ``{'config': 'C'}``. filters : list[dict[str, JSON]], optional Sequence of filters to use to encode chunk data prior to compression. - V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor`` - nor ``filters`` are provided, a default compressor will be used. (see - ``compressor`` for details) + V2 only. V3 arrays should use ``codecs`` instead. If no ``filters`` + are provided, a default set of filters will be used. + These defaults can be changed by modifying the value of ``array.v2_default_filters`` in :mod:`zarr.core.config`. compressor : dict[str, JSON], optional The compressor used to compress the data (default is None). V2 only. V3 arrays should use ``codecs`` instead. - If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used: + If no ``compressor`` is provided, a default compressor will be used: - For numeric arrays, the default is ``ZstdCodec``. - For Unicode strings, the default is ``VLenUTF8Codec``. @@ -471,24 +502,77 @@ async def create( Whether to raise an error if the store already exists (default is False). data : npt.ArrayLike, optional The data to be inserted into the array (default is None). + config : ArrayConfig or ArrayConfigLike, optional + Runtime configuration for the array. Returns ------- AsyncArray The created asynchronous array instance. - Examples - -------- - >>> import zarr - >>> store = zarr.storage.MemoryStore(mode='w') - >>> async_arr = await zarr.core.array.AsyncArray.create( - >>> store=store, - >>> shape=(100,100), - >>> chunks=(10,10), - >>> dtype='i4', - >>> fill_value=0) - + .. deprecated:: 3.0.0 + Deprecated in favor of :func:`zarr.api.asynchronous.create_array`. 
+ """ + return await cls._create( + store, + # v2 and v3 + shape=shape, + dtype=dtype, + zarr_format=zarr_format, + fill_value=fill_value, + attributes=attributes, + # v3 only + chunk_shape=chunk_shape, + chunk_key_encoding=chunk_key_encoding, + codecs=codecs, + dimension_names=dimension_names, + # v2 only + chunks=chunks, + dimension_separator=dimension_separator, + order=order, + filters=filters, + compressor=compressor, + # runtime + overwrite=overwrite, + data=data, + config=config, + ) + @classmethod + async def _create( + cls, + store: StoreLike, + *, + # v2 and v3 + shape: ShapeLike, + dtype: npt.DTypeLike, + zarr_format: ZarrFormat = 3, + fill_value: Any | None = None, + attributes: dict[str, JSON] | None = None, + # v3 only + chunk_shape: ShapeLike | None = None, + chunk_key_encoding: ( + ChunkKeyEncoding + | tuple[Literal["default"], Literal[".", "/"]] + | tuple[Literal["v2"], Literal[".", "/"]] + | None + ) = None, + codecs: Iterable[Codec | dict[str, JSON]] | None = None, + dimension_names: Iterable[str] | None = None, + # v2 only + chunks: ShapeLike | None = None, + dimension_separator: Literal[".", "/"] | None = None, + order: MemoryOrder | None = None, + filters: list[dict[str, JSON]] | None = None, + compressor: dict[str, JSON] | None = None, + # runtime + overwrite: bool = False, + data: npt.ArrayLike | None = None, + config: ArrayConfig | ArrayConfigLike | None = None, + ) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: + """Method to create a new asynchronous array instance. + See :func:`AsyncArray.create` for more details. + Deprecated in favor of :func:`zarr.api.asynchronous.create_array`. 
""" store_path = await make_store_path(store) @@ -502,7 +586,7 @@ async def create( _chunks = normalize_chunks(chunks, shape, dtype_parsed.itemsize) else: _chunks = normalize_chunks(chunk_shape, shape, dtype_parsed.itemsize) - config_parsed = normalize_array_config(config) + config_parsed = parse_array_config(config) result: AsyncArray[ArrayV3Metadata] | AsyncArray[ArrayV2Metadata] if zarr_format == 3: @@ -653,8 +737,8 @@ async def _create_v2( config: ArrayConfig, dimension_separator: Literal[".", "/"] | None = None, fill_value: float | None = None, - filters: list[dict[str, JSON]] | None = None, - compressor: dict[str, JSON] | None = None, + filters: Iterable[dict[str, JSON] | numcodecs.abc.Codec] | None = None, + compressor: dict[str, JSON] | numcodecs.abc.Codec | None = None, attributes: dict[str, JSON] | None = None, overwrite: bool = False, ) -> AsyncArray[ArrayV2Metadata]: @@ -670,12 +754,14 @@ async def _create_v2( dimension_separator = "." dtype = parse_dtype(dtype, zarr_format=2) - if not filters and not compressor: - filters, compressor = _default_filters_and_compressor(dtype) + + # inject VLenUTF8 for str dtype if not already present if np.issubdtype(dtype, np.str_): filters = filters or [] - if not any(x["id"] == "vlen-utf8" for x in filters): - filters = list(filters) + [{"id": "vlen-utf8"}] + from numcodecs.vlen import VLenUTF8 + + if not any(isinstance(x, VLenUTF8) or x["id"] == "vlen-utf8" for x in filters): + filters = list(filters) + [VLenUTF8()] metadata = ArrayV2Metadata( shape=shape, @@ -787,6 +873,7 @@ def shape(self) -> ChunkCoords: @property def chunks(self) -> ChunkCoords: """Returns the chunk shape of the Array. + If sharding is used the inner chunk shape is returned. Only defined for arrays using using `RegularChunkGrid`. If array doesn't use `RegularChunkGrid`, `NotImplementedError` is raised. @@ -796,14 +883,22 @@ def chunks(self) -> ChunkCoords: ChunkCoords: The chunk shape of the Array. 
""" - if isinstance(self.metadata.chunk_grid, RegularChunkGrid): - return self.metadata.chunk_grid.chunk_shape + return self.metadata.chunks - msg = ( - f"The `chunks` attribute is only defined for arrays using `RegularChunkGrid`." - f"This array has a {self.metadata.chunk_grid} instead." - ) - raise NotImplementedError(msg) + @property + def shards(self) -> ChunkCoords | None: + """Returns the shard shape of the Array. + Returns None if sharding is not used. + + Only defined for arrays using using `RegularChunkGrid`. + If array doesn't use `RegularChunkGrid`, `NotImplementedError` is raised. + + Returns + ------- + ChunkCoords: + The shard shape of the Array. + """ + return self.metadata.shards @property def size(self) -> int: @@ -1115,7 +1210,7 @@ async def getitem( -------- >>> import zarr >>> store = zarr.storage.MemoryStore(mode='w') - >>> async_arr = await zarr.core.array.AsyncArray.create( + >>> async_arr = await zarr.api.asynchronous.create_array( ... store=store, ... shape=(100,100), ... chunks=(10,10), @@ -1508,6 +1603,7 @@ class Array: _async_array: AsyncArray[ArrayV3Metadata] | AsyncArray[ArrayV2Metadata] @classmethod + @deprecated("Use zarr.create_array instead.") @_deprecate_positional_args def create( cls, @@ -1537,7 +1633,7 @@ def create( compressor: dict[str, JSON] | None = None, # runtime overwrite: bool = False, - config: ArrayConfig | ArrayConfigParams | None = None, + config: ArrayConfig | ArrayConfigLike | None = None, ) -> Array: """Creates a new Array instance from an initialized store. @@ -1565,8 +1661,8 @@ def create( If no codecs are provided, default codecs will be used: - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``. - - For Unicode strings, the default is ``VLenUTF8Codec``. - - For bytes or objects, the default is ``VLenBytesCodec``. + - For Unicode strings, the default is ``VLenUTF8Codec`` and ``ZstdCodec``. + - For bytes or objects, the default is ``VLenBytesCodec`` and ``ZstdCodec``. 
These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. dimension_names : Iterable[str], optional @@ -1587,14 +1683,14 @@ def create( order for Zarr 3 arrays is via the ``config`` parameter, e.g. ``{'order': 'C'}``. filters : list[dict[str, JSON]], optional Sequence of filters to use to encode chunk data prior to compression. - V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor`` - nor ``filters`` are provided, a default compressor will be used. (see - ``compressor`` for details) + V2 only. V3 arrays should use ``codecs`` instead. If no ``filters`` + are provided, a default set of filters will be used. + These defaults can be changed by modifying the value of ``array.v2_default_filters`` in :mod:`zarr.core.config`. compressor : dict[str, JSON], optional Primary compressor to compress chunk data. V2 only. V3 arrays should use ``codecs`` instead. - If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used: + If no ``compressor`` is provided, a default compressor will be used: - For numeric arrays, the default is ``ZstdCodec``. - For Unicode strings, the default is ``VLenUTF8Codec``. @@ -1608,9 +1704,71 @@ def create( ------- Array Array created from the store. + + .. deprecated:: 3.0.0 + Deprecated in favor of :func:`zarr.create_array`. 
+ """ + return cls._create( + store, + # v2 and v3 + shape=shape, + dtype=dtype, + zarr_format=zarr_format, + attributes=attributes, + fill_value=fill_value, + # v3 only + chunk_shape=chunk_shape, + chunk_key_encoding=chunk_key_encoding, + codecs=codecs, + dimension_names=dimension_names, + # v2 only + chunks=chunks, + dimension_separator=dimension_separator, + order=order, + filters=filters, + compressor=compressor, + # runtime + overwrite=overwrite, + config=config, + ) + + @classmethod + def _create( + cls, + store: StoreLike, + *, + # v2 and v3 + shape: ChunkCoords, + dtype: npt.DTypeLike, + zarr_format: ZarrFormat = 3, + fill_value: Any | None = None, + attributes: dict[str, JSON] | None = None, + # v3 only + chunk_shape: ChunkCoords | None = None, + chunk_key_encoding: ( + ChunkKeyEncoding + | tuple[Literal["default"], Literal[".", "/"]] + | tuple[Literal["v2"], Literal[".", "/"]] + | None + ) = None, + codecs: Iterable[Codec | dict[str, JSON]] | None = None, + dimension_names: Iterable[str] | None = None, + # v2 only + chunks: ChunkCoords | None = None, + dimension_separator: Literal[".", "/"] | None = None, + order: MemoryOrder | None = None, + filters: list[dict[str, JSON]] | None = None, + compressor: dict[str, JSON] | None = None, + # runtime + overwrite: bool = False, + config: ArrayConfig | ArrayConfigLike | None = None, + ) -> Array: + """Creates a new Array instance from an initialized store. + See :func:`Array.create` for more details. + Deprecated in favor of :func:`zarr.create_array`. """ async_array = sync( - AsyncArray.create( + AsyncArray._create( store=store, shape=shape, dtype=dtype, @@ -1717,6 +1875,10 @@ def shape(self, value: ChunkCoords) -> None: @property def chunks(self) -> ChunkCoords: """Returns a tuple of integers describing the length of each dimension of a chunk of the array. + If sharding is used the inner chunk shape is returned. + + Only defined for arrays using using `RegularChunkGrid`. 
+ If array doesn't use `RegularChunkGrid`, `NotImplementedError` is raised. Returns ------- @@ -1725,6 +1887,21 @@ def chunks(self) -> ChunkCoords: """ return self._async_array.chunks + @property + def shards(self) -> ChunkCoords | None: + """Returns a tuple of integers describing the length of each dimension of a shard of the array. + Returns None if sharding is not used. + + Only defined for arrays using using `RegularChunkGrid`. + If array doesn't use `RegularChunkGrid`, `NotImplementedError` is raised. + + Returns + ------- + tuple | None + A tuple of integers representing the length of each dimension of a shard or None if sharding is not used. + """ + return self._async_array.shards + @property def size(self) -> int: """Returns the total number of elements in the array. @@ -1973,10 +2150,10 @@ def __getitem__(self, selection: Selection) -> NDArrayLike: >>> import zarr >>> import numpy as np >>> data = np.arange(100, dtype="uint16") - >>> z = Array.create( + >>> z = zarr.create_array( >>> StorePath(MemoryStore(mode="w")), >>> shape=data.shape, - >>> chunk_shape=(10,), + >>> chunks=(10,), >>> dtype=data.dtype, >>> ) >>> z[:] = data @@ -2007,10 +2184,10 @@ def __getitem__(self, selection: Selection) -> NDArrayLike: Setup a 2-dimensional array:: >>> data = np.arange(100, dtype="uint16").reshape(10, 10) - >>> z = Array.create( + >>> z = zarr.create_array( >>> StorePath(MemoryStore(mode="w")), >>> shape=data.shape, - >>> chunk_shape=(10, 10), + >>> chunks=(10, 10), >>> dtype=data.dtype, >>> ) >>> z[:] = data @@ -2238,10 +2415,10 @@ def get_basic_selection( >>> import zarr >>> import numpy as np >>> data = np.arange(100, dtype="uint16") - >>> z = Array.create( + >>> z = zarr.create_array( >>> StorePath(MemoryStore(mode="w")), >>> shape=data.shape, - >>> chunk_shape=(3,), + >>> chunks=(3,), >>> dtype=data.dtype, >>> ) >>> z[:] = data @@ -2267,10 +2444,10 @@ def get_basic_selection( Setup a 3-dimensional array:: >>> data = np.arange(1000).reshape(10, 10, 10) - >>> z = 
Array.create( + >>> z = zarr.create_array( >>> StorePath(MemoryStore(mode="w")), >>> shape=data.shape, - >>> chunk_shape=(5, 5, 5), + >>> chunks=(5, 5, 5), >>> dtype=data.dtype, >>> ) >>> z[:] = data @@ -2462,10 +2639,10 @@ def get_orthogonal_selection( >>> import zarr >>> import numpy as np >>> data = np.arange(100).reshape(10, 10) - >>> z = Array.create( + >>> z = zarr.create_array( >>> StorePath(MemoryStore(mode="w")), >>> shape=data.shape, - >>> chunk_shape=data.shape, + >>> chunks=data.shape, >>> dtype=data.dtype, >>> ) >>> z[:] = data @@ -2696,10 +2873,10 @@ def get_mask_selection( >>> import zarr >>> import numpy as np >>> data = np.arange(100).reshape(10, 10) - >>> z = Array.create( + >>> z = zarr.create_array( >>> StorePath(MemoryStore(mode="w")), >>> shape=data.shape, - >>> chunk_shape=data.shape, + >>> chunks=data.shape, >>> dtype=data.dtype, >>> ) >>> z[:] = data @@ -2856,10 +3033,10 @@ def get_coordinate_selection( >>> import zarr >>> import numpy as np >>> data = np.arange(0, 100, dtype="uint16").reshape((10, 10)) - >>> z = Array.create( + >>> z = zarr.create_array( >>> StorePath(MemoryStore(mode="w")), >>> shape=data.shape, - >>> chunk_shape=(3, 3), + >>> chunks=(3, 3), >>> dtype=data.dtype, >>> ) >>> z[:] = data @@ -3044,10 +3221,10 @@ def get_block_selection( >>> import zarr >>> import numpy as np >>> data = np.arange(0, 100, dtype="uint16").reshape((10, 10)) - >>> z = Array.create( + >>> z = zarr.create_array( >>> StorePath(MemoryStore(mode="w")), >>> shape=data.shape, - >>> chunk_shape=(3, 3), + >>> chunks=(3, 3), >>> dtype=data.dtype, >>> ) >>> z[:] = data @@ -3448,4 +3625,459 @@ def _get_default_codecs( else: dtype_key = "numeric" - return [{"name": codec_id, "configuration": {}} for codec_id in default_codecs[dtype_key]] + return cast(list[dict[str, JSON]], default_codecs[dtype_key]) + + +FiltersLike: TypeAlias = ( + Iterable[dict[str, JSON] | ArrayArrayCodec | numcodecs.abc.Codec] + | ArrayArrayCodec + | Iterable[numcodecs.abc.Codec] + | 
numcodecs.abc.Codec + | Literal["auto"] + | None +) +CompressorLike: TypeAlias = dict[str, JSON] | BytesBytesCodec | numcodecs.abc.Codec | None +CompressorsLike: TypeAlias = ( + Iterable[dict[str, JSON] | BytesBytesCodec | numcodecs.abc.Codec] + | dict[str, JSON] + | BytesBytesCodec + | numcodecs.abc.Codec + | Literal["auto"] + | None +) +SerializerLike: TypeAlias = dict[str, JSON] | ArrayBytesCodec | Literal["auto"] + + +class ShardsConfigParam(TypedDict): + shape: ChunkCoords + index_location: ShardingCodecIndexLocation | None + + +ShardsLike: TypeAlias = ChunkCoords | ShardsConfigParam | Literal["auto"] + + +async def create_array( + store: str | StoreLike, + *, + name: str | None = None, + shape: ShapeLike, + dtype: npt.DTypeLike, + chunks: ChunkCoords | Literal["auto"] = "auto", + shards: ShardsLike | None = None, + filters: FiltersLike = "auto", + compressors: CompressorsLike = "auto", + serializer: SerializerLike = "auto", + fill_value: Any | None = None, + order: MemoryOrder | None = None, + zarr_format: ZarrFormat | None = 3, + attributes: dict[str, JSON] | None = None, + chunk_key_encoding: ChunkKeyEncoding | ChunkKeyEncodingLike | None = None, + dimension_names: Iterable[str] | None = None, + storage_options: dict[str, Any] | None = None, + overwrite: bool = False, + config: ArrayConfig | ArrayConfigLike | None = None, +) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: + """Create an array. + + Parameters + ---------- + store : str or Store + Store or path to directory in file system or name of zip file. + name : str or None, optional + The name of the array within the store. If ``name`` is ``None``, the array will be located + at the root of the store. + shape : ChunkCoords + Shape of the array. + dtype : npt.DTypeLike + Data type of the array. + chunks : ChunkCoords, optional + Chunk shape of the array. + If not specified, default are guessed based on the shape and dtype. + shards : ChunkCoords, optional + Shard shape of the array. 
The default value of ``None`` results in no sharding at all. + filters : Iterable[Codec], optional + Iterable of filters to apply to each chunk of the array, in order, before serializing that + chunk to bytes. + + For Zarr v3, a "filter" is a codec that takes an array and returns an array, + and these values must be instances of ``ArrayArrayCodec``, or dict representations + of ``ArrayArrayCodec``. + If ``filters`` and ``compressors`` are not specified, then the default codecs for + Zarr v3 will be used. + These defaults can be changed by modifying the value of ``array.v3_default_codecs`` + in :mod:`zarr.core.config`. + Use ``None`` to omit default filters. + + For Zarr v2, a "filter" can be any numcodecs codec; you should ensure that the + the order if your filters is consistent with the behavior of each filter. + If no ``filters`` are provided, a default set of filters will be used. + These defaults can be changed by modifying the value of ``array.v2_default_filters`` + in :mod:`zarr.core.config`. + Use ``None`` to omit default filters. + compressors : Iterable[Codec], optional + List of compressors to apply to the array. Compressors are applied in order, and after any + filters are applied (if any are specified). + + For Zarr v3, a "compressor" is a codec that takes a bytestrea, and + returns another bytestream. Multiple compressors my be provided for Zarr v3. + If ``filters`` and ``compressors`` are not specified, then the default codecs for + Zarr v3 will be used. + These defaults can be changed by modifying the value of ``array.v3_default_codecs`` + in :mod:`zarr.core.config`. + Use ``None`` to omit default compressors. + + For Zarr v2, a "compressor" can be any numcodecs codec. Only a single compressor may + be provided for Zarr v2. + If no ``compressors`` are provided, a default compressor will be used. + These defaults can be changed by modifying the value of ``array.v2_default_compressor`` + in :mod:`zarr.core.config`. 
+ Use ``None`` to omit the default compressor. + serializer : dict[str, JSON] | ArrayBytesCodec, optional + Array-to-bytes codec to use for encoding the array data. + Zarr v3 only. Zarr v2 arrays use implicit array-to-bytes conversion. + If no ``serializer`` is provided, the `zarr.codecs.BytesCodec` codec will be used. + fill_value : Any, optional + Fill value for the array. + order : {"C", "F"}, optional + The memory of the array (default is "C"). + For Zarr v2, this parameter sets the memory order of the array. + For Zarr v3, this parameter is deprecated, because memory order + is a runtime parameter for Zarr v3 arrays. The recommended way to specify the memory + order for Zarr v3 arrays is via the ``config`` parameter, e.g. ``{'config': 'C'}``. + If no ``order`` is provided, a default order will be used. + This default can be changed by modifying the value of ``array.order`` in :mod:`zarr.core.config`. + zarr_format : {2, 3}, optional + The zarr format to use when saving. + attributes : dict, optional + Attributes for the array. + chunk_key_encoding : ChunkKeyEncoding, optional + A specification of how the chunk keys are represented in storage. + For Zarr v3, the default is ``{"name": "default", "separator": "/"}}``. + For Zarr v2, the default is ``{"name": "v2", "separator": "."}}``. + dimension_names : Iterable[str], optional + The names of the dimensions (default is None). + Zarr v3 only. Zarr v2 arrays should not use this parameter. + storage_options : dict, optional + If using an fsspec URL to create the store, these will be passed to the backend implementation. + Ignored otherwise. + overwrite : bool, default False + Whether to overwrite an array with the same name in the store, if one exists. + config : ArrayConfig or ArrayConfigLike, optional + Runtime configuration for the array. + + Returns + ------- + AsyncArray + The array. 
+ + Examples + -------- + >>> import zarr + >>> store = zarr.storage.MemoryStore(mode='w') + >>> async_arr = await zarr.api.asynchronous.create_array( + >>> store=store, + >>> shape=(100,100), + >>> chunks=(10,10), + >>> dtype='i4', + >>> fill_value=0) + + """ + + if zarr_format is None: + zarr_format = _default_zarr_format() + + from zarr.codecs.sharding import ShardingCodec, ShardingCodecIndexLocation + + mode: Literal["a"] = "a" + dtype_parsed = parse_dtype(dtype, zarr_format=zarr_format) + config_parsed = parse_array_config(config) + shape_parsed = parse_shapelike(shape) + chunk_key_encoding_parsed = _parse_chunk_key_encoding( + chunk_key_encoding, zarr_format=zarr_format + ) + store_path = await make_store_path(store, path=name, mode=mode, storage_options=storage_options) + shard_shape_parsed, chunk_shape_parsed = _auto_partition( + array_shape=shape_parsed, shard_shape=shards, chunk_shape=chunks, dtype=dtype_parsed + ) + chunks_out: tuple[int, ...] + result: AsyncArray[ArrayV3Metadata] | AsyncArray[ArrayV2Metadata] + + if zarr_format == 2: + if shard_shape_parsed is not None: + msg = ( + "Zarr v2 arrays can only be created with `shard_shape` set to `None`. " + f"Got `shard_shape={shards}` instead." 
+ ) + + raise ValueError(msg) + if serializer != "auto": + raise ValueError("Zarr v2 arrays do not support `serializer`.") + + filters_parsed, compressor_parsed = _parse_chunk_encoding_v2( + compressor=compressors, filters=filters, dtype=np.dtype(dtype) + ) + + if dimension_names is not None: + raise ValueError("Zarr v2 arrays do not support dimension names.") + if order is None: + order_parsed = zarr_config.get("array.order") + else: + order_parsed = order + + result = await AsyncArray._create_v2( + store_path=store_path, + shape=shape_parsed, + dtype=dtype_parsed, + chunks=chunk_shape_parsed, + dimension_separator=chunk_key_encoding_parsed.separator, + fill_value=fill_value, + order=order_parsed, + filters=filters_parsed, + compressor=compressor_parsed, + attributes=attributes, + overwrite=overwrite, + config=config_parsed, + ) + else: + array_array, array_bytes, bytes_bytes = _parse_chunk_encoding_v3( + compressors=compressors, + filters=filters, + serializer=serializer, + dtype=dtype_parsed, + ) + sub_codecs = cast(tuple[Codec, ...], (*array_array, array_bytes, *bytes_bytes)) + codecs_out: tuple[Codec, ...] 
+ if shard_shape_parsed is not None: + index_location = None + if isinstance(shards, dict): + index_location = ShardingCodecIndexLocation(shards.get("index_location", None)) + if index_location is None: + index_location = ShardingCodecIndexLocation.end + sharding_codec = ShardingCodec( + chunk_shape=chunk_shape_parsed, codecs=sub_codecs, index_location=index_location + ) + sharding_codec.validate( + shape=chunk_shape_parsed, + dtype=dtype_parsed, + chunk_grid=RegularChunkGrid(chunk_shape=shard_shape_parsed), + ) + codecs_out = (sharding_codec,) + chunks_out = shard_shape_parsed + else: + chunks_out = chunk_shape_parsed + codecs_out = sub_codecs + + result = await AsyncArray._create_v3( + store_path=store_path, + shape=shape_parsed, + dtype=dtype_parsed, + fill_value=fill_value, + attributes=attributes, + chunk_shape=chunks_out, + chunk_key_encoding=chunk_key_encoding_parsed, + codecs=codecs_out, + dimension_names=dimension_names, + overwrite=overwrite, + config=config_parsed, + ) + + return result + + +def _parse_chunk_key_encoding( + data: ChunkKeyEncoding | ChunkKeyEncodingLike | None, zarr_format: ZarrFormat +) -> ChunkKeyEncoding: + """ + Take an implicit specification of a chunk key encoding and parse it into a ChunkKeyEncoding object. + """ + if data is None: + if zarr_format == 2: + result = ChunkKeyEncoding.from_dict({"name": "v2", "separator": "."}) + else: + result = ChunkKeyEncoding.from_dict({"name": "default", "separator": "/"}) + elif isinstance(data, ChunkKeyEncoding): + result = data + else: + result = ChunkKeyEncoding.from_dict(data) + if zarr_format == 2 and result.name != "v2": + msg = ( + "Invalid chunk key encoding. For Zarr v2 arrays, the `name` field of the " + f"chunk key encoding must be 'v2'. Got `name` = {result.name} instead." 
+ ) + raise ValueError(msg) + return result + + +def _get_default_chunk_encoding_v3( + np_dtype: np.dtype[Any], +) -> tuple[tuple[ArrayArrayCodec, ...], ArrayBytesCodec, tuple[BytesBytesCodec, ...]]: + """ + Get the default ArrayArrayCodecs, ArrayBytesCodec, and BytesBytesCodec for a given dtype. + """ + default_codecs = zarr_config.get("array.v3_default_codecs") + dtype = DataType.from_numpy(np_dtype) + if dtype == DataType.string: + dtype_key = "string" + elif dtype == DataType.bytes: + dtype_key = "bytes" + else: + dtype_key = "numeric" + + codec_dicts = default_codecs[dtype_key] + codecs = tuple(_resolve_codec(c) for c in codec_dicts) + array_bytes_maybe = None + array_array: list[ArrayArrayCodec] = [] + bytes_bytes: list[BytesBytesCodec] = [] + + for codec in codecs: + if isinstance(codec, ArrayBytesCodec): + if array_bytes_maybe is not None: + raise ValueError( + f"Got two instances of ArrayBytesCodec: {array_bytes_maybe} and {codec}. " + "Only one array-to-bytes codec is allowed." + ) + array_bytes_maybe = codec + elif isinstance(codec, ArrayArrayCodec): + array_array.append(codec) + elif isinstance(codec, BytesBytesCodec): + bytes_bytes.append(codec) + else: + raise TypeError(f"Unexpected codec type: {type(codec)}") + + if array_bytes_maybe is None: + raise ValueError("Required ArrayBytesCodec was not found.") + + return tuple(array_array), array_bytes_maybe, tuple(bytes_bytes) + + +def _get_default_chunk_encoding_v2( + np_dtype: np.dtype[Any], +) -> tuple[tuple[numcodecs.abc.Codec, ...] 
| None, numcodecs.abc.Codec | None]: + """ + Get the default chunk encoding for zarr v2 arrays, given a dtype + """ + + compressor_dict = _default_compressor(np_dtype) + filter_dicts = _default_filters(np_dtype) + + compressor = None + if compressor_dict is not None: + compressor = numcodecs.get_codec(compressor_dict) + + filters = None + if filter_dicts is not None: + filters = tuple(numcodecs.get_codec(f) for f in filter_dicts) + + return filters, compressor + + +def _parse_chunk_encoding_v2( + *, + compressor: CompressorsLike, + filters: FiltersLike, + dtype: np.dtype[Any], +) -> tuple[tuple[numcodecs.abc.Codec, ...] | None, numcodecs.abc.Codec | None]: + """ + Generate chunk encoding classes for v2 arrays with optional defaults. + """ + default_filters, default_compressor = _get_default_chunk_encoding_v2(dtype) + + _filters: tuple[numcodecs.abc.Codec, ...] | None + _compressor: numcodecs.abc.Codec | None + + if compressor is None or compressor == (): + _compressor = None + elif compressor == "auto": + _compressor = default_compressor + elif isinstance(compressor, tuple | list) and len(compressor) == 1: + _compressor = parse_compressor(compressor[0]) + else: + if isinstance(compressor, Iterable) and not isinstance(compressor, dict): + msg = f"For Zarr v2 arrays, the `compressor` must be a single codec. Got an iterable with type {type(compressor)} instead." + raise TypeError(msg) + _compressor = parse_compressor(compressor) + + if filters is None: + _filters = None + elif filters == "auto": + _filters = default_filters + else: + if isinstance(filters, Iterable): + for idx, f in enumerate(filters): + if not isinstance(f, numcodecs.abc.Codec): + msg = ( + "For Zarr v2 arrays, all elements of `filters` must be numcodecs codecs. " + f"Element at index {idx} has type {type(f)}, which is not a numcodecs codec." 
+ ) + raise TypeError(msg) + _filters = parse_filters(filters) + + return _filters, _compressor + + +def _parse_chunk_encoding_v3( + *, + compressors: CompressorsLike, + filters: FiltersLike, + serializer: SerializerLike, + dtype: np.dtype[Any], +) -> tuple[tuple[ArrayArrayCodec, ...], ArrayBytesCodec, tuple[BytesBytesCodec, ...]]: + """ + Generate chunk encoding classes for v3 arrays with optional defaults. + """ + default_array_array, default_array_bytes, default_bytes_bytes = _get_default_chunk_encoding_v3( + dtype + ) + maybe_bytes_bytes: Iterable[Codec | dict[str, JSON]] + maybe_array_array: Iterable[Codec | dict[str, JSON]] + out_bytes_bytes: tuple[BytesBytesCodec, ...] + if compressors is None: + out_bytes_bytes = () + + elif compressors == "auto": + out_bytes_bytes = default_bytes_bytes + + else: + if isinstance(compressors, dict | Codec): + maybe_bytes_bytes = (compressors,) + elif compressors is None: + maybe_bytes_bytes = () + else: + maybe_bytes_bytes = cast(Iterable[Codec | dict[str, JSON]], compressors) + + out_bytes_bytes = tuple(_parse_bytes_bytes_codec(c) for c in maybe_bytes_bytes) + out_array_array: tuple[ArrayArrayCodec, ...] 
+ if filters is None: + out_array_array = () + elif filters == "auto": + out_array_array = default_array_array + else: + if isinstance(filters, dict | Codec): + maybe_array_array = (filters,) + elif filters is None: + maybe_array_array = () + else: + maybe_array_array = cast(Iterable[Codec | dict[str, JSON]], filters) + out_array_array = tuple(_parse_array_array_codec(c) for c in maybe_array_array) + + if serializer == "auto": + out_array_bytes = default_array_bytes + else: + out_array_bytes = _parse_array_bytes_codec(serializer) + + return out_array_array, out_array_bytes, out_bytes_bytes + + +def _parse_deprecated_compressor( + compressor: CompressorLike | None, compressors: CompressorsLike +) -> CompressorsLike | None: + if compressor: + if compressors != "auto": + raise ValueError("Cannot specify both `compressor` and `compressors`.") + warn( + "The `compressor` argument is deprecated. Use `compressors` instead.", + category=UserWarning, + stacklevel=2, + ) + compressors = (compressor,) + return compressors diff --git a/src/zarr/core/array_spec.py b/src/zarr/core/array_spec.py index ee6934d05f..b1a6a3cad0 100644 --- a/src/zarr/core/array_spec.py +++ b/src/zarr/core/array_spec.py @@ -21,7 +21,7 @@ from zarr.core.common import ChunkCoords -class ArrayConfigParams(TypedDict): +class ArrayConfigLike(TypedDict): """ A TypedDict model of the attributes of an ArrayConfig class, but with no required fields. This allows for partial construction of an ArrayConfig, with the assumption that the unset @@ -56,13 +56,13 @@ def __init__(self, order: MemoryOrder, write_empty_chunks: bool) -> None: object.__setattr__(self, "write_empty_chunks", write_empty_chunks_parsed) @classmethod - def from_dict(cls, data: ArrayConfigParams) -> Self: + def from_dict(cls, data: ArrayConfigLike) -> Self: """ Create an ArrayConfig from a dict. The keys of that dict are a subset of the attributes of the ArrayConfig class. 
Any keys missing from that dict will be set to the the values in the ``array`` namespace of ``zarr.config``. """ - kwargs_out: ArrayConfigParams = {} + kwargs_out: ArrayConfigLike = {} for f in fields(ArrayConfig): field_name = cast(Literal["order", "write_empty_chunks"], f.name) if field_name not in data: @@ -72,7 +72,7 @@ def from_dict(cls, data: ArrayConfigParams) -> Self: return cls(**kwargs_out) -def normalize_array_config(data: ArrayConfig | ArrayConfigParams | None) -> ArrayConfig: +def parse_array_config(data: ArrayConfig | ArrayConfigLike | None) -> ArrayConfig: """ Convert various types of data to an ArrayConfig. """ diff --git a/src/zarr/core/buffer/core.py b/src/zarr/core/buffer/core.py index 7ddedfe064..85a7351fc7 100644 --- a/src/zarr/core/buffer/core.py +++ b/src/zarr/core/buffer/core.py @@ -16,11 +16,6 @@ import numpy as np import numpy.typing as npt -from zarr.registry import ( - get_buffer_class, - get_ndbuffer_class, -) - if TYPE_CHECKING: from collections.abc import Iterable, Sequence from typing import Self @@ -507,4 +502,9 @@ class BufferPrototype(NamedTuple): # The default buffer prototype used throughout the Zarr codebase. 
def default_buffer_prototype() -> BufferPrototype: + from zarr.registry import ( + get_buffer_class, + get_ndbuffer_class, + ) + return BufferPrototype(buffer=get_buffer_class(), nd_buffer=get_ndbuffer_class()) diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index ea050e39ef..d3e40c26ed 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -4,10 +4,11 @@ import math import numbers import operator +import warnings from abc import abstractmethod from dataclasses import dataclass from functools import reduce -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Literal import numpy as np @@ -26,6 +27,8 @@ from collections.abc import Iterator from typing import Self + from zarr.core.array import ShardsLike + def _guess_chunks( shape: ShapeLike, @@ -194,3 +197,55 @@ def get_nchunks(self, array_shape: ChunkCoords) -> int: itertools.starmap(ceildiv, zip(array_shape, self.chunk_shape, strict=True)), 1, ) + + +def _auto_partition( + *, + array_shape: tuple[int, ...], + chunk_shape: tuple[int, ...] | Literal["auto"], + shard_shape: ShardsLike | None, + dtype: np.dtype[Any], +) -> tuple[tuple[int, ...] | None, tuple[int, ...]]: + """ + Automatically determine the shard shape and chunk shape for an array, given the shape and dtype of the array. + If `shard_shape` is `None` and the chunk_shape is "auto", the chunks will be set heuristically based + on the dtype and shape of the array. + If `shard_shape` is "auto", then the shard shape will be set heuristically from the dtype and shape + of the array; if the `chunk_shape` is also "auto", then the chunks will be set heuristically as well, + given the dtype and shard shape. Otherwise, the chunks will be returned as-is. + """ + item_size = dtype.itemsize + if shard_shape is None: + _shards_out: None | tuple[int, ...] 
= None + if chunk_shape == "auto": + _chunks_out = _guess_chunks(array_shape, item_size) + else: + _chunks_out = chunk_shape + else: + if chunk_shape == "auto": + # aim for a 1MiB chunk + _chunks_out = _guess_chunks(array_shape, item_size, max_bytes=1024) + else: + _chunks_out = chunk_shape + + if shard_shape == "auto": + warnings.warn( + "Automatic shard shape inference is experimental and may change without notice.", + UserWarning, + stacklevel=2, + ) + _shards_out = () + for a_shape, c_shape in zip(array_shape, _chunks_out, strict=True): + # TODO: make a better heuristic than this. + # for each axis, if there are more than 8 chunks along that axis, then put + # 2 chunks in each shard for that axis. + if a_shape // c_shape > 8: + _shards_out += (c_shape * 2,) + else: + _shards_out += (c_shape,) + elif isinstance(shard_shape, dict): + _shards_out = tuple(shard_shape["shape"]) + else: + _shards_out = shard_shape + + return _shards_out, _chunks_out diff --git a/src/zarr/core/chunk_key_encodings.py b/src/zarr/core/chunk_key_encodings.py index ed12ee3065..95ce9108f3 100644 --- a/src/zarr/core/chunk_key_encodings.py +++ b/src/zarr/core/chunk_key_encodings.py @@ -2,7 +2,7 @@ from abc import abstractmethod from dataclasses import dataclass -from typing import Literal, cast +from typing import Literal, TypedDict, cast from zarr.abc.metadata import Metadata from zarr.core.common import ( @@ -20,6 +20,11 @@ def parse_separator(data: JSON) -> SeparatorLiteral: return cast(SeparatorLiteral, data) +class ChunkKeyEncodingLike(TypedDict): + name: Literal["v2", "default"] + separator: SeparatorLiteral + + @dataclass(frozen=True) class ChunkKeyEncoding(Metadata): name: str @@ -31,10 +36,16 @@ def __init__(self, *, separator: SeparatorLiteral) -> None: object.__setattr__(self, "separator", separator_parsed) @classmethod - def from_dict(cls, data: dict[str, JSON] | ChunkKeyEncoding) -> ChunkKeyEncoding: + def from_dict( + cls, data: dict[str, JSON] | ChunkKeyEncoding | 
ChunkKeyEncodingLike + ) -> ChunkKeyEncoding: if isinstance(data, ChunkKeyEncoding): return data + # handle ChunkKeyEncodingParams + if "name" in data and "separator" in data: + data = {"name": data["name"], "configuration": {"separator": data["separator"]}} + # configuration is optional for chunk key encodings name_parsed, config_parsed = parse_named_configuration(data, require_configuration=False) if name_parsed == "default": diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index 3db00b1a06..d53f3847a5 100644 --- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -18,6 +18,7 @@ import numpy as np +from zarr.core.config import config as zarr_config from zarr.core.strings import _STRING_DTYPE if TYPE_CHECKING: @@ -197,3 +198,8 @@ def _warn_order_kwarg() -> None: "or change the global 'array.order' configuration variable." ) warnings.warn(msg, RuntimeWarning, stacklevel=2) + + +def _default_zarr_format() -> ZarrFormat: + """Return the default zarr_version""" + return cast(ZarrFormat, int(zarr_config.get("default_zarr_format", 3))) diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index a14305aef8..421a100f1b 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -62,19 +62,33 @@ def reset(self) -> None: "zarr", defaults=[ { - "default_zarr_version": 3, + "default_zarr_format": 3, "array": { "order": "C", "write_empty_chunks": False, "v2_default_compressor": { - "numeric": "zstd", - "string": "vlen-utf8", - "bytes": "vlen-bytes", + "numeric": {"id": "zstd", "level": 0, "checksum": False}, + "string": {"id": "zstd", "level": 0, "checksum": False}, + "bytes": {"id": "zstd", "level": 0, "checksum": False}, + }, + "v2_default_filters": { + "numeric": None, + "string": [{"id": "vlen-utf8"}], + "bytes": [{"id": "vlen-bytes"}], }, "v3_default_codecs": { - "numeric": ["bytes", "zstd"], - "string": ["vlen-utf8"], - "bytes": ["vlen-bytes"], + "numeric": [ + {"name": "bytes", "configuration": {"endian": "little"}}, + {"name": 
"zstd", "configuration": {"level": 0, "checksum": False}}, + ], + "string": [ + {"name": "vlen-utf8"}, + {"name": "zstd", "configuration": {"level": 0, "checksum": False}}, + ], + "bytes": [ + {"name": "vlen-bytes"}, + {"name": "zstd", "configuration": {"level": 0, "checksum": False}}, + ], }, }, "async": {"concurrency": 10, "timeout": None}, diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 2d7a21911a..29b25689c4 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -18,7 +18,18 @@ from zarr.abc.metadata import Metadata from zarr.abc.store import Store, set_or_delete from zarr.core._info import GroupInfo -from zarr.core.array import Array, AsyncArray, _build_parents +from zarr.core.array import ( + Array, + AsyncArray, + CompressorLike, + CompressorsLike, + FiltersLike, + SerializerLike, + ShardsLike, + _build_parents, + _parse_deprecated_compressor, + create_array, +) from zarr.core.attributes import Attributes from zarr.core.buffer import default_buffer_prototype from zarr.core.common import ( @@ -46,9 +57,10 @@ from collections.abc import AsyncGenerator, Generator, Iterable, Iterator from typing import Any - from zarr.abc.codec import Codec + from zarr.core.array_spec import ArrayConfig, ArrayConfigLike from zarr.core.buffer import Buffer, BufferPrototype - from zarr.core.chunk_key_encodings import ChunkKeyEncoding + from zarr.core.chunk_key_encodings import ChunkKeyEncoding, ChunkKeyEncodingLike + from zarr.core.common import MemoryOrder logger = logging.getLogger("zarr.group") @@ -58,7 +70,7 @@ def parse_zarr_format(data: Any) -> ZarrFormat: """Parse the zarr_format field from metadata.""" if data in (2, 3): - return cast(Literal[2, 3], data) + return cast(ZarrFormat, data) msg = f"Invalid zarr_format. Expected one of 2 or 3. Got {data}." 
raise ValueError(msg) @@ -434,7 +446,7 @@ async def from_store( async def open( cls, store: StoreLike, - zarr_format: Literal[2, 3] | None = 3, + zarr_format: ZarrFormat | None = 3, use_consolidated: bool | str | None = None, ) -> AsyncGroup: """Open a new AsyncGroup @@ -501,9 +513,9 @@ async def open( (store_path / str(consolidated_key)).get(), ) if zarr_json_bytes is not None and zgroup_bytes is not None: - # TODO: revisit this exception type - # alternatively, we could warn and favor v3 - raise ValueError("Both zarr.json and .zgroup objects exist") + # warn and favor v3 + msg = f"Both zarr.json (Zarr v3) and .zgroup (Zarr v2) metadata objects exist at {store_path}. Zarr v3 will be used." + warnings.warn(msg, stacklevel=1) if zarr_json_bytes is None and zgroup_bytes is None: raise FileNotFoundError( f"could not find zarr.json or .zgroup objects in {store_path}" @@ -998,116 +1010,136 @@ async def create_array( name: str, *, shape: ShapeLike, - dtype: npt.DTypeLike = "float64", - fill_value: Any | None = None, + dtype: npt.DTypeLike, + chunks: ChunkCoords | Literal["auto"] = "auto", + shards: ShardsLike | None = None, + filters: FiltersLike = "auto", + compressors: CompressorsLike = "auto", + compressor: CompressorLike = None, + serializer: SerializerLike = "auto", + fill_value: Any | None = 0, + order: MemoryOrder | None = None, attributes: dict[str, JSON] | None = None, - # v3 only - chunk_shape: ChunkCoords | None = None, - chunk_key_encoding: ( - ChunkKeyEncoding - | tuple[Literal["default"], Literal[".", "/"]] - | tuple[Literal["v2"], Literal[".", "/"]] - | None - ) = None, - codecs: Iterable[Codec | dict[str, JSON]] | None = None, + chunk_key_encoding: ChunkKeyEncoding | ChunkKeyEncodingLike | None = None, dimension_names: Iterable[str] | None = None, - # v2 only - chunks: ShapeLike | None = None, - dimension_separator: Literal[".", "/"] | None = None, - order: Literal["C", "F"] | None = None, - filters: list[dict[str, JSON]] | None = None, - compressor: 
dict[str, JSON] | None = None, - # runtime + storage_options: dict[str, Any] | None = None, overwrite: bool = False, - data: npt.ArrayLike | None = None, + config: ArrayConfig | ArrayConfigLike | None = None, ) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: - """ - Create a Zarr array within this AsyncGroup. - This method lightly wraps AsyncArray.create. + """Create an array within this group. + + This method lightly wraps :func:`zarr.core.array.create_array`. Parameters ---------- name : str - The name of the array. - shape : tuple[int, ...] - The shape of the array. - dtype : np.DtypeLike = float64 - The data type of the array. - chunk_shape : tuple[int, ...] | None = None - The shape of the chunks of the array. - V3 only. V2 arrays should use `chunks` instead. + The name of the array relative to the group. If ``path`` is ``None``, the array will be located + at the root of the store. + shape : ChunkCoords + Shape of the array. + dtype : npt.DTypeLike + Data type of the array. + chunks : ChunkCoords, optional + Chunk shape of the array. If not specified, default are guessed based on the shape and dtype. - chunk_key_encoding : ChunkKeyEncoding | tuple[Literal["default"], Literal[".", "/"]] | tuple[Literal["v2"], Literal[".", "/"]] | None = None + shards : ChunkCoords, optional + Shard shape of the array. The default value of ``None`` results in no sharding at all. + filters : Iterable[Codec], optional + Iterable of filters to apply to each chunk of the array, in order, before serializing that + chunk to bytes. + + For Zarr v3, a "filter" is a codec that takes an array and returns an array, + and these values must be instances of ``ArrayArrayCodec``, or dict representations + of ``ArrayArrayCodec``. + If ``filters`` and ``compressors`` are not specified, then the default codecs for + Zarr v3 will be used. + These defaults can be changed by modifying the value of ``array.v3_default_codecs`` + in :mod:`zarr.core.config`. 
+ Use ``None`` to omit default filters. + + For Zarr v2, a "filter" can be any numcodecs codec; you should ensure that + the order of your filters is consistent with the behavior of each filter. + If no ``filters`` are provided, a default set of filters will be used. + These defaults can be changed by modifying the value of ``array.v2_default_filters`` + in :mod:`zarr.core.config`. + Use ``None`` to omit default filters. + compressors : Iterable[Codec], optional + List of compressors to apply to the array. Compressors are applied in order, and after any + filters are applied (if any are specified). + + For Zarr v3, a "compressor" is a codec that takes a bytestream, and + returns another bytestream. Multiple compressors may be provided for Zarr v3. + If ``filters`` and ``compressors`` are not specified, then the default codecs for + Zarr v3 will be used. + These defaults can be changed by modifying the value of ``array.v3_default_codecs`` + in :mod:`zarr.core.config`. + Use ``None`` to omit default compressors. + + For Zarr v2, a "compressor" can be any numcodecs codec. Only a single compressor may + be provided for Zarr v2. + If no ``compressors`` are provided, a default compressor will be used. + These defaults can be changed by modifying the value of ``array.v2_default_compressor`` + in :mod:`zarr.core.config`. + Use ``None`` to omit the default compressor. + compressor : Codec, optional + Deprecated in favor of ``compressors``. + serializer : dict[str, JSON] | ArrayBytesCodec, optional + Array-to-bytes codec to use for encoding the array data. + Zarr v3 only. Zarr v2 arrays use implicit array-to-bytes conversion. + If no ``serializer`` is provided, the `zarr.codecs.BytesCodec` codec will be used. + fill_value : Any, optional + Fill value for the array. + order : {"C", "F"}, optional + The memory order of the array (default is "C"). + For Zarr v2, this parameter sets the memory order of the array.
+ For Zarr v3, this parameter is deprecated, because memory order + is a runtime parameter for Zarr v3 arrays. The recommended way to specify the memory + order for Zarr v3 arrays is via the ``config`` parameter, e.g. ``{'order': 'C'}``. + If no ``order`` is provided, a default order will be used. + This default can be changed by modifying the value of ``array.order`` in :mod:`zarr.core.config`. + attributes : dict, optional + Attributes for the array. + chunk_key_encoding : ChunkKeyEncoding, optional A specification of how the chunk keys are represented in storage. - V3 only. V2 arrays should use `dimension_separator` instead. - Default is ``("default", "/")``. - codecs : Iterable[Codec | dict[str, JSON]] | None = None - An iterable of Codec or dict serializations of Codecs. The elements of - this collection specify the transformation from array values to stored bytes. - V3 only. V2 arrays should use ``filters`` and ``compressor`` instead. - - If no codecs are provided, default codecs will be used: - - - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``. - - For Unicode strings, the default is ``VLenUTF8Codec``. - - For bytes or objects, the default is ``VLenBytesCodec``. - - These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. - dimension_names : Iterable[str] | None = None - The names of the dimensions of the array. V3 only. - chunks : ChunkCoords | None = None - The shape of the chunks of the array. - V2 only. V3 arrays should use ``chunk_shape`` instead. - If not specified, default are guessed based on the shape and dtype. - dimension_separator : Literal[".", "/"] | None = None - The delimiter used for the chunk keys. (default: ".") - V2 only. V3 arrays should use ``chunk_key_encoding`` instead. - order : Literal["C", "F"] | None = None - The memory order of the array (default is specified by ``array.order`` in :mod:`zarr.core.config`).
- filters : list[dict[str, JSON]] | None = None - Sequence of filters to use to encode chunk data prior to compression. - V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor`` - nor ``filters`` are provided, a default compressor will be used. (see - ``compressor`` for details) - compressor : dict[str, JSON] | None = None - The compressor used to compress the data (default is None). - V2 only. V3 arrays should use ``codecs`` instead. - - If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used: - - - For numeric arrays, the default is ``ZstdCodec``. - - For Unicode strings, the default is ``VLenUTF8Codec``. - - For bytes or objects, the default is ``VLenBytesCodec``. - - These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`. - overwrite : bool = False - If True, a pre-existing array or group at the path of this array will - be overwritten. If False, the presence of a pre-existing array or group is - an error. + For Zarr v3, the default is ``{"name": "default", "separator": "/"}}``. + For Zarr v2, the default is ``{"name": "v2", "separator": "."}}``. + dimension_names : Iterable[str], optional + The names of the dimensions (default is None). + Zarr v3 only. Zarr v2 arrays should not use this parameter. + storage_options : dict, optional + If using an fsspec URL to create the store, these will be passed to the backend implementation. + Ignored otherwise. + overwrite : bool, default False + Whether to overwrite an array with the same name in the store, if one exists. + config : ArrayConfig or ArrayConfigLike, optional + Runtime configuration for the array. 
Returns ------- AsyncArray """ - return await AsyncArray.create( - self.store_path / name, + + compressors = _parse_deprecated_compressor(compressor, compressors) + return await create_array( + store=self.store_path, + name=name, shape=shape, dtype=dtype, - chunk_shape=chunk_shape, + chunks=chunks, + shards=shards, + filters=filters, + compressors=compressors, + serializer=serializer, fill_value=fill_value, + order=order, + zarr_format=self.metadata.zarr_format, + attributes=attributes, chunk_key_encoding=chunk_key_encoding, - codecs=codecs, dimension_names=dimension_names, - attributes=attributes, - chunks=chunks, - dimension_separator=dimension_separator, - order=order, - filters=filters, - compressor=compressor, + storage_options=storage_options, overwrite=overwrite, - zarr_format=self.metadata.zarr_format, - data=data, + config=config, ) @deprecated("Use AsyncGroup.create_array instead.") @@ -1719,7 +1751,7 @@ def from_store( def open( cls, store: StoreLike, - zarr_format: Literal[2, 3] | None = 3, + zarr_format: ZarrFormat | None = 3, ) -> Group: """Open a group from an initialized store. 
@@ -1755,8 +1787,8 @@ def __getitem__(self, path: str) -> Array | Group: -------- >>> import zarr >>> group = Group.from_store(zarr.storage.MemoryStore() - >>> group.create_array(name="subarray", shape=(10,), chunk_shape=(10,)) - >>> group.create_group(name="subgroup").create_array(name="subarray", shape=(10,), chunk_shape=(10,)) + >>> group.create_array(name="subarray", shape=(10,), chunks=(10,)) + >>> group.create_group(name="subgroup").create_array(name="subarray", shape=(10,), chunks=(10,)) >>> group["subarray"] >>> group["subgroup"] @@ -1790,7 +1822,7 @@ def get(self, path: str, default: DefaultT | None = None) -> Array | Group | Def -------- >>> import zarr >>> group = Group.from_store(zarr.storage.MemoryStore() - >>> group.create_array(name="subarray", shape=(10,), chunk_shape=(10,)) + >>> group.create_array(name="subarray", shape=(10,), chunks=(10,)) >>> group.create_group(name="subgroup") >>> group.get("subarray") @@ -1816,7 +1848,7 @@ def __delitem__(self, key: str) -> None: -------- >>> import zarr >>> group = Group.from_store(zarr.storage.MemoryStore() - >>> group.create_array(name="subarray", shape=(10,), chunk_shape=(10,)) + >>> group.create_array(name="subarray", shape=(10,), chunks=(10,)) >>> del group["subarray"] >>> "subarray" in group False @@ -1831,8 +1863,8 @@ def __iter__(self) -> Iterator[str]: >>> g1 = zarr.group() >>> g2 = g1.create_group('foo') >>> g3 = g1.create_group('bar') - >>> d1 = g1.create_array('baz', shape=(10,), chunk_shape=(10,)) - >>> d2 = g1.create_array('quux', shape=(10,), chunk_shape=(10,)) + >>> d1 = g1.create_array('baz', shape=(10,), chunks=(10,)) + >>> d2 = g1.create_array('quux', shape=(10,), chunks=(10,)) >>> for name in g1: ... 
print(name) baz @@ -2023,8 +2055,8 @@ def keys(self) -> Generator[str, None]: >>> g1 = zarr.group() >>> g2 = g1.create_group('foo') >>> g3 = g1.create_group('bar') - >>> d1 = g1.create_array('baz', shape=(10,), chunk_shape=(10,)) - >>> d2 = g1.create_array('quux', shape=(10,), chunk_shape=(10,)) + >>> d1 = g1.create_array('baz', shape=(10,), chunks=(10,)) + >>> d2 = g1.create_array('quux', shape=(10,), chunks=(10,)) >>> for name in g1.keys(): ... print(name) baz @@ -2042,7 +2074,7 @@ def __contains__(self, member: str) -> bool: >>> import zarr >>> g1 = zarr.group() >>> g2 = g1.create_group('foo') - >>> d1 = g1.create_array('bar', shape=(10,), chunk_shape=(10,)) + >>> d1 = g1.create_array('bar', shape=(10,), chunks=(10,)) >>> 'foo' in g1 True >>> 'bar' in g1 @@ -2105,7 +2137,7 @@ def arrays(self) -> Generator[tuple[str, Array], None]: -------- >>> import zarr >>> group = zarr.group() - >>> group.create_array("subarray", shape=(10,), chunk_shape=(10,)) + >>> group.create_array("subarray", shape=(10,), chunks=(10,)) >>> for name, subarray in group.arrays(): ... print(name, subarray) subarray @@ -2120,7 +2152,7 @@ def array_keys(self) -> Generator[str, None]: -------- >>> import zarr >>> group = zarr.group() - >>> group.create_array("subarray", shape=(10,), chunk_shape=(10,)) + >>> group.create_array("subarray", shape=(10,), chunks=(10,)) >>> for name in group.array_keys(): ... print(name) subarray @@ -2136,7 +2168,7 @@ def array_values(self) -> Generator[Array, None]: -------- >>> import zarr >>> group = zarr.group() - >>> group.create_array("subarray", shape=(10,), chunk_shape=(10,)) + >>> group.create_array("subarray", shape=(10,), chunks=(10,)) >>> for subarray in group.array_values(): ... 
print(subarray) @@ -2225,120 +2257,134 @@ def create_array( name: str, *, shape: ShapeLike, - dtype: npt.DTypeLike = "float64", - fill_value: Any | None = None, + dtype: npt.DTypeLike, + chunks: ChunkCoords | Literal["auto"] = "auto", + shards: ShardsLike | None = None, + filters: FiltersLike = "auto", + compressors: CompressorsLike = "auto", + compressor: CompressorLike = None, + serializer: SerializerLike = "auto", + fill_value: Any | None = 0, + order: MemoryOrder | None = "C", attributes: dict[str, JSON] | None = None, - # v3 only - chunk_shape: ChunkCoords | None = None, - chunk_key_encoding: ( - ChunkKeyEncoding - | tuple[Literal["default"], Literal[".", "/"]] - | tuple[Literal["v2"], Literal[".", "/"]] - | None - ) = None, - codecs: Iterable[Codec | dict[str, JSON]] | None = None, + chunk_key_encoding: ChunkKeyEncoding | ChunkKeyEncodingLike | None = None, dimension_names: Iterable[str] | None = None, - # v2 only - chunks: ShapeLike | None = None, - dimension_separator: Literal[".", "/"] | None = None, - order: Literal["C", "F"] | None = None, - filters: list[dict[str, JSON]] | None = None, - compressor: dict[str, JSON] | None = None, - # runtime + storage_options: dict[str, Any] | None = None, overwrite: bool = False, - data: npt.ArrayLike | None = None, + config: ArrayConfig | ArrayConfigLike | None = None, ) -> Array: - """Create a zarr array within this AsyncGroup. + """Create an array within this group. - This method lightly wraps `AsyncArray.create`. + This method lightly wraps :func:`zarr.core.array.create_array`. Parameters ---------- name : str - The name of the array. - shape : tuple[int, ...] - The shape of the array. - dtype : np.DtypeLike = float64 - The data type of the array. - chunk_shape : tuple[int, ...] | None = None - The shape of the chunks of the array. - V3 only. V2 arrays should use `chunks` instead. + The name of the array relative to the group. If ``path`` is ``None``, the array will be located + at the root of the store. 
+ shape : ChunkCoords + Shape of the array. + dtype : npt.DTypeLike + Data type of the array. + chunks : ChunkCoords, optional + Chunk shape of the array. If not specified, defaults are guessed based on the shape and dtype. - chunk_key_encoding : ChunkKeyEncoding | tuple[Literal["default"], Literal[".", "/"]] | tuple[Literal["v2"], Literal[".", "/"]] | None = None + shards : ChunkCoords, optional + Shard shape of the array. The default value of ``None`` results in no sharding at all. + filters : Iterable[Codec], optional + Iterable of filters to apply to each chunk of the array, in order, before serializing that + chunk to bytes. + + For Zarr v3, a "filter" is a codec that takes an array and returns an array, + and these values must be instances of ``ArrayArrayCodec``, or dict representations + of ``ArrayArrayCodec``. + If ``filters`` and ``compressors`` are not specified, then the default codecs for + Zarr v3 will be used. + These defaults can be changed by modifying the value of ``array.v3_default_codecs`` + in :mod:`zarr.core.config`. + Use ``None`` to omit default filters. + + For Zarr v2, a "filter" can be any numcodecs codec; you should ensure that + the order of your filters is consistent with the behavior of each filter. + If no ``filters`` are provided, a default set of filters will be used. + These defaults can be changed by modifying the value of ``array.v2_default_filters`` + in :mod:`zarr.core.config`. + Use ``None`` to omit default filters. + compressors : Iterable[Codec], optional + List of compressors to apply to the array. Compressors are applied in order, and after any + filters are applied (if any are specified). + + For Zarr v3, a "compressor" is a codec that takes a bytestream, and + returns another bytestream. Multiple compressors may be provided for Zarr v3. + If ``filters`` and ``compressors`` are not specified, then the default codecs for + Zarr v3 will be used.
+ These defaults can be changed by modifying the value of ``array.v3_default_codecs`` + in :mod:`zarr.core.config`. + Use ``None`` to omit default compressors. + + For Zarr v2, a "compressor" can be any numcodecs codec. Only a single compressor may + be provided for Zarr v2. + If no ``compressors`` are provided, a default compressor will be used. + These defaults can be changed by modifying the value of ``array.v2_default_compressor`` + in :mod:`zarr.core.config`. + Use ``None`` to omit the default compressor. + compressor : Codec, optional + Deprecated in favor of ``compressors``. + serializer : dict[str, JSON] | ArrayBytesCodec, optional + Array-to-bytes codec to use for encoding the array data. + Zarr v3 only. Zarr v2 arrays use implicit array-to-bytes conversion. + If no ``serializer`` is provided, the `zarr.codecs.BytesCodec` codec will be used. + fill_value : Any, optional + Fill value for the array. + order : {"C", "F"}, optional + The memory order of the array (default is "C"). + For Zarr v2, this parameter sets the memory order of the array. + For Zarr v3, this parameter is deprecated, because memory order + is a runtime parameter for Zarr v3 arrays. The recommended way to specify the memory + order for Zarr v3 arrays is via the ``config`` parameter, e.g. ``{'order': 'C'}``. + If no ``order`` is provided, a default order will be used. + This default can be changed by modifying the value of ``array.order`` in :mod:`zarr.core.config`. + attributes : dict, optional + Attributes for the array. + chunk_key_encoding : ChunkKeyEncoding, optional A specification of how the chunk keys are represented in storage. - V3 only. V2 arrays should use `dimension_separator` instead. - Default is ``("default", "/")``. - codecs : Iterable[Codec | dict[str, JSON]] | None = None - An iterable of Codec or dict serializations of Codecs. The elements of - this collection specify the transformation from array values to stored bytes. - V3 only.
V2 arrays should use ``filters`` and ``compressor`` instead. - - If no codecs are provided, default codecs will be used: - - - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``. - - For Unicode strings, the default is ``VLenUTF8Codec``. - - For bytes or objects, the default is ``VLenBytesCodec``. - - These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. - dimension_names : Iterable[str] | None = None - The names of the dimensions of the array. V3 only. - chunks : ChunkCoords | None = None - The shape of the chunks of the array. - V2 only. V3 arrays should use ``chunk_shape`` instead. - If not specified, default are guessed based on the shape and dtype. - dimension_separator : Literal[".", "/"] | None = None - The delimiter used for the chunk keys. (default: ".") - V2 only. V3 arrays should use ``chunk_key_encoding`` instead. - order : Literal["C", "F"] | None = None - The memory order of the array (default is specified by ``array.order`` in :mod:`zarr.core.config`). - filters : list[dict[str, JSON]] | None = None - Sequence of filters to use to encode chunk data prior to compression. - V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor`` - nor ``filters`` are provided, a default compressor will be used. (see - ``compressor`` for details) - compressor : dict[str, JSON] | None = None - The compressor used to compress the data (default is None). - V2 only. V3 arrays should use ``codecs`` instead. - - If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used: - - - For numeric arrays, the default is ``ZstdCodec``. - - For Unicode strings, the default is ``VLenUTF8Codec``. - - For bytes or objects, the default is ``VLenBytesCodec``. - - These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`. 
- overwrite : bool = False - If True, a pre-existing array or group at the path of this array will - be overwritten. If False, the presence of a pre-existing array or group is - an error. - data : npt.ArrayLike | None = None - Array data to initialize the array with. + For Zarr v3, the default is ``{"name": "default", "separator": "/"}``. + For Zarr v2, the default is ``{"name": "v2", "separator": "."}``. + dimension_names : Iterable[str], optional + The names of the dimensions (default is None). + Zarr v3 only. Zarr v2 arrays should not use this parameter. + storage_options : dict, optional + If using an fsspec URL to create the store, these will be passed to the backend implementation. + Ignored otherwise. + overwrite : bool, default False + Whether to overwrite an array with the same name in the store, if one exists. + config : ArrayConfig or ArrayConfigLike, optional + Runtime configuration for the array. Returns ------- - - Array - + Array """ + compressors = _parse_deprecated_compressor(compressor, compressors) return Array( self._sync( self._async_group.create_array( name=name, shape=shape, dtype=dtype, + chunks=chunks, + shards=shards, fill_value=fill_value, attributes=attributes, - chunk_shape=chunk_shape, chunk_key_encoding=chunk_key_encoding, - codecs=codecs, + compressors=compressors, + serializer=serializer, dimension_names=dimension_names, - chunks=chunks, - dimension_separator=dimension_separator, order=order, filters=filters, - compressor=compressor, overwrite=overwrite, - data=data, + storage_options=storage_options, + config=config, ) ) ) @@ -2594,121 +2640,136 @@ def array( self, name: str, *, - shape: ChunkCoords, - dtype: npt.DTypeLike = "float64", - fill_value: Any | None = None, + shape: ShapeLike, + dtype: npt.DTypeLike, + chunks: ChunkCoords | Literal["auto"] = "auto", + shards: ChunkCoords | Literal["auto"] | None = None, + filters: FiltersLike = "auto", + compressors: CompressorsLike = "auto", + compressor: CompressorLike = None, +
serializer: SerializerLike = "auto", + fill_value: Any | None = 0, + order: MemoryOrder | None = "C", attributes: dict[str, JSON] | None = None, - # v3 only - chunk_shape: ChunkCoords | None = None, - chunk_key_encoding: ( - ChunkKeyEncoding - | tuple[Literal["default"], Literal[".", "/"]] - | tuple[Literal["v2"], Literal[".", "/"]] - | None - ) = None, - codecs: Iterable[Codec | dict[str, JSON]] | None = None, + chunk_key_encoding: ChunkKeyEncoding | ChunkKeyEncodingLike | None = None, dimension_names: Iterable[str] | None = None, - # v2 only - chunks: ChunkCoords | None = None, - dimension_separator: Literal[".", "/"] | None = None, - order: Literal["C", "F"] | None = None, - filters: list[dict[str, JSON]] | None = None, - compressor: dict[str, JSON] | None = None, - # runtime + storage_options: dict[str, Any] | None = None, overwrite: bool = False, + config: ArrayConfig | ArrayConfigLike | None = None, data: npt.ArrayLike | None = None, ) -> Array: - """Create a zarr array within this AsyncGroup. + """Create an array within this group. - This method lightly wraps `AsyncArray.create`. + This method lightly wraps :func:`zarr.core.array.create_array`. Parameters ---------- name : str - The name of the array. - shape : tuple[int, ...] - The shape of the array. - dtype : np.DtypeLike = float64 - The data type of the array. - chunk_shape : tuple[int, ...] | None = None - The shape of the chunks of the array. - V3 only. V2 arrays should use `chunks` instead. + The name of the array relative to the group. If ``path`` is ``None``, the array will be located + at the root of the store. + shape : ChunkCoords + Shape of the array. + dtype : npt.DTypeLike + Data type of the array. + chunks : ChunkCoords, optional + Chunk shape of the array. If not specified, default are guessed based on the shape and dtype. 
- chunk_key_encoding : ChunkKeyEncoding | tuple[Literal["default"], Literal[".", "/"]] | tuple[Literal["v2"], Literal[".", "/"]] | None = None + shards : ChunkCoords, optional + Shard shape of the array. The default value of ``None`` results in no sharding at all. + filters : Iterable[Codec], optional + Iterable of filters to apply to each chunk of the array, in order, before serializing that + chunk to bytes. + + For Zarr v3, a "filter" is a codec that takes an array and returns an array, + and these values must be instances of ``ArrayArrayCodec``, or dict representations + of ``ArrayArrayCodec``. + If ``filters`` and ``compressors`` are not specified, then the default codecs for + Zarr v3 will be used. + These defaults can be changed by modifying the value of ``array.v3_default_codecs`` + in :mod:`zarr.core.config`. + Use ``None`` to omit default filters. + + For Zarr v2, a "filter" can be any numcodecs codec; you should ensure that + the order of your filters is consistent with the behavior of each filter. + If no ``filters`` are provided, a default set of filters will be used. + These defaults can be changed by modifying the value of ``array.v2_default_filters`` + in :mod:`zarr.core.config`. + Use ``None`` to omit default filters. + compressors : Iterable[Codec], optional + List of compressors to apply to the array. Compressors are applied in order, and after any + filters are applied (if any are specified). + + For Zarr v3, a "compressor" is a codec that takes a bytestream, and + returns another bytestream. Multiple compressors may be provided for Zarr v3. + If ``filters`` and ``compressors`` are not specified, then the default codecs for + Zarr v3 will be used. + These defaults can be changed by modifying the value of ``array.v3_default_codecs`` + in :mod:`zarr.core.config`. + Use ``None`` to omit default compressors. + + For Zarr v2, a "compressor" can be any numcodecs codec. Only a single compressor may + be provided for Zarr v2.
+ If no ``compressors`` are provided, a default compressor will be used. + These defaults can be changed by modifying the value of ``array.v2_default_compressor`` + in :mod:`zarr.core.config`. + Use ``None`` to omit the default compressor. + compressor : Codec, optional + Deprecated in favor of ``compressors``. + serializer : dict[str, JSON] | ArrayBytesCodec, optional + Array-to-bytes codec to use for encoding the array data. + Zarr v3 only. Zarr v2 arrays use implicit array-to-bytes conversion. + If no ``serializer`` is provided, the `zarr.codecs.BytesCodec` codec will be used. + fill_value : Any, optional + Fill value for the array. + order : {"C", "F"}, optional + The memory order of the array (default is "C"). + For Zarr v2, this parameter sets the memory order of the array. + For Zarr v3, this parameter is deprecated, because memory order + is a runtime parameter for Zarr v3 arrays. The recommended way to specify the memory + order for Zarr v3 arrays is via the ``config`` parameter, e.g. ``{'order': 'C'}``. + If no ``order`` is provided, a default order will be used. + This default can be changed by modifying the value of ``array.order`` in :mod:`zarr.core.config`. + attributes : dict, optional + Attributes for the array. + chunk_key_encoding : ChunkKeyEncoding, optional A specification of how the chunk keys are represented in storage. - V3 only. V2 arrays should use `dimension_separator` instead. - Default is ``("default", "/")``. - codecs : Iterable[Codec | dict[str, JSON]] | None = None - An iterable of Codec or dict serializations of Codecs. The elements of - this collection specify the transformation from array values to stored bytes. - V3 only. V2 arrays should use ``filters`` and ``compressor`` instead. - - If no codecs are provided, default codecs will be used: - - - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``. - - For Unicode strings, the default is ``VLenUTF8Codec``.
- - These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. - dimension_names : Iterable[str] | None = None - The names of the dimensions of the array. V3 only. - chunks : ChunkCoords | None = None - The shape of the chunks of the array. - V2 only. V3 arrays should use ``chunk_shape`` instead. - If not specified, default are guessed based on the shape and dtype. - dimension_separator : Literal[".", "/"] | None = None - The delimiter used for the chunk keys. (default: ".") - V2 only. V3 arrays should use ``chunk_key_encoding`` instead. - order : Literal["C", "F"] | None = None - The memory order of the array (default is specified by ``array.order`` in :mod:`zarr.core.config`). - filters : list[dict[str, JSON]] | None = None - Sequence of filters to use to encode chunk data prior to compression. - V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor`` - nor ``filters`` are provided, a default compressor will be used. (see - ``compressor`` for details) - compressor : dict[str, JSON] | None = None - The compressor used to compress the data (default is None). - V2 only. V3 arrays should use ``codecs`` instead. - - If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used: - - - For numeric arrays, the default is ``ZstdCodec``. - - For Unicode strings, the default is ``VLenUTF8Codec``. - - For bytes or objects, the default is ``VLenBytesCodec``. - - These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`. - overwrite : bool = False - If True, a pre-existing array or group at the path of this array will - be overwritten. If False, the presence of a pre-existing array or group is - an error. - data : npt.ArrayLike | None = None - Array data to initialize the array with. + For Zarr v3, the default is ``{"name": "default", "separator": "/"}}``. 
+ For Zarr v2, the default is ``{"name": "v2", "separator": "."}``. + dimension_names : Iterable[str], optional + The names of the dimensions (default is None). + Zarr v3 only. Zarr v2 arrays should not use this parameter. + storage_options : dict, optional + If using an fsspec URL to create the store, these will be passed to the backend implementation. + Ignored otherwise. + overwrite : bool, default False + Whether to overwrite an array with the same name in the store, if one exists. + config : ArrayConfig or ArrayConfigLike, optional + Runtime configuration for the array. Returns ------- - - Array - + Array """ + compressors = _parse_deprecated_compressor(compressor, compressors) return Array( self._sync( self._async_group.create_array( name=name, shape=shape, dtype=dtype, + chunks=chunks, + shards=shards, fill_value=fill_value, attributes=attributes, - chunk_shape=chunk_shape, chunk_key_encoding=chunk_key_encoding, - codecs=codecs, + compressors=compressors, + serializer=serializer, dimension_names=dimension_names, - chunks=chunks, - dimension_separator=dimension_separator, order=order, filters=filters, - compressor=compressor, overwrite=overwrite, - data=data, + storage_options=storage_options, + config=config, ) ) ) diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index af26034b1d..bc7fd32cbf 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -6,6 +6,8 @@ from functools import cached_property from typing import TYPE_CHECKING, TypedDict, cast +import numcodecs.abc + from zarr.abc.metadata import Metadata if TYPE_CHECKING: @@ -14,7 +16,7 @@ import numpy.typing as npt from zarr.core.buffer import Buffer, BufferPrototype - from zarr.core.common import JSON, ChunkCoords + from zarr.core.common import ChunkCoords import json from dataclasses import dataclass, field, fields, replace @@ -25,7 +27,7 @@ from zarr.core.array_spec import ArrayConfig, ArraySpec from zarr.core.chunk_grids import RegularChunkGrid
from zarr.core.chunk_key_encodings import parse_separator -from zarr.core.common import ZARRAY_JSON, ZATTRS_JSON, MemoryOrder, parse_shapelike +from zarr.core.common import JSON, ZARRAY_JSON, ZATTRS_JSON, MemoryOrder, parse_shapelike from zarr.core.config import config, parse_indexing_order from zarr.core.metadata.common import parse_attributes @@ -42,7 +44,7 @@ class ArrayV2MetadataDict(TypedDict): @dataclass(frozen=True, kw_only=True) class ArrayV2Metadata(Metadata): shape: ChunkCoords - chunks: tuple[int, ...] + chunks: ChunkCoords dtype: np.dtype[Any] fill_value: int | float | str | bytes | None = 0 order: MemoryOrder = "C" @@ -100,6 +102,10 @@ def ndim(self) -> int: def chunk_grid(self) -> RegularChunkGrid: return RegularChunkGrid(chunk_shape=self.chunks) + @property + def shards(self) -> ChunkCoords | None: + return None + def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]: def _json_convert( o: Any, @@ -235,6 +241,9 @@ def parse_filters(data: object) -> tuple[numcodecs.abc.Codec, ...] | None: msg = f"Invalid filter at index {idx}. Expected a numcodecs.abc.Codec or a dict representation of numcodecs.abc.Codec. Got {type(val)} instead." raise TypeError(msg) return tuple(out) + # take a single codec instance and wrap it in a tuple + if isinstance(data, numcodecs.abc.Codec): + return (data,) msg = f"Invalid filters. Expected None, an iterable of numcodecs.abc.Codec or dict representations of numcodecs.abc.Codec. Got {type(data)} instead." raise TypeError(msg) @@ -329,9 +338,9 @@ def _default_fill_value(dtype: np.dtype[Any]) -> Any: return dtype.type(0) -def _default_filters_and_compressor( +def _default_compressor( dtype: np.dtype[Any], -) -> tuple[list[dict[str, JSON]], dict[str, JSON] | None]: +) -> dict[str, JSON] | None: """Get the default filters and compressor for a dtype. 
https://numpy.org/doc/2.1/reference/generated/numpy.dtype.kind.html @@ -346,4 +355,24 @@ def _default_filters_and_compressor( else: raise ValueError(f"Unsupported dtype kind {dtype.kind}") - return [{"id": default_compressor[dtype_key]}], None + return cast(dict[str, JSON] | None, default_compressor.get(dtype_key, None)) + + +def _default_filters( + dtype: np.dtype[Any], +) -> list[dict[str, JSON]] | None: + """Get the default filters and compressor for a dtype. + + https://numpy.org/doc/2.1/reference/generated/numpy.dtype.kind.html + """ + default_filters = config.get("array.v2_default_filters") + if dtype.kind in "biufcmM": + dtype_key = "numeric" + elif dtype.kind in "U": + dtype_key = "string" + elif dtype.kind in "OSV": + dtype_key = "bytes" + else: + raise ValueError(f"Unsupported dtype kind {dtype.kind}") + + return cast(list[dict[str, JSON]] | None, default_filters.get(dtype_key, None)) diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 4cf5860ffd..0821dd9bc9 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -296,6 +296,40 @@ def dtype(self) -> np.dtype[Any]: def ndim(self) -> int: return len(self.shape) + @property + def chunks(self) -> ChunkCoords: + if isinstance(self.chunk_grid, RegularChunkGrid): + from zarr.codecs.sharding import ShardingCodec + + if len(self.codecs) == 1 and isinstance(self.codecs[0], ShardingCodec): + sharding_codec = self.codecs[0] + assert isinstance(sharding_codec, ShardingCodec) # for mypy + return sharding_codec.chunk_shape + else: + return self.chunk_grid.chunk_shape + + msg = ( + f"The `chunks` attribute is only defined for arrays using `RegularChunkGrid`." + f"This array has a {self.chunk_grid} instead." 
+ ) + raise NotImplementedError(msg) + + @property + def shards(self) -> ChunkCoords | None: + if isinstance(self.chunk_grid, RegularChunkGrid): + from zarr.codecs.sharding import ShardingCodec + + if len(self.codecs) == 1 and isinstance(self.codecs[0], ShardingCodec): + return self.chunk_grid.chunk_shape + else: + return None + + msg = ( + f"The `shards` attribute is only defined for arrays using `RegularChunkGrid`." + f"This array has a {self.chunk_grid} instead." + ) + raise NotImplementedError(msg) + def get_chunk_spec( self, _chunk_coords: ChunkCoords, array_config: ArrayConfig, prototype: BufferPrototype ) -> ArraySpec: @@ -449,7 +483,7 @@ def parse_fill_value( return np.bytes_(fill_value) # the rest are numeric types - np_dtype = cast(np.dtype[np.generic], data_type.to_numpy()) + np_dtype = cast(np.dtype[Any], data_type.to_numpy()) if isinstance(fill_value, Sequence) and not isinstance(fill_value, str): if data_type in (DataType.complex64, DataType.complex128): @@ -513,8 +547,8 @@ def default_fill_value(dtype: DataType) -> str | bytes | np.generic: return b"" else: np_dtype = dtype.to_numpy() - np_dtype = cast(np.dtype[np.generic], np_dtype) - return np_dtype.type(0) + np_dtype = cast(np.dtype[Any], np_dtype) + return np_dtype.type(0) # type: ignore[misc] # For type checking @@ -586,7 +620,7 @@ def to_numpy_shortname(self) -> str: } return data_type_to_numpy[self] - def to_numpy(self) -> np.dtypes.StringDType | np.dtypes.ObjectDType | np.dtype[np.generic]: + def to_numpy(self) -> np.dtypes.StringDType | np.dtypes.ObjectDType | np.dtype[Any]: # note: it is not possible to round trip DataType <-> np.dtype # due to the fact that DataType.string and DataType.bytes both # generally return np.dtype("O") from this function, even though diff --git a/src/zarr/registry.py b/src/zarr/registry.py index 9055bb1447..704db3f704 100644 --- a/src/zarr/registry.py +++ b/src/zarr/registry.py @@ -10,8 +10,15 @@ if TYPE_CHECKING: from importlib.metadata import EntryPoint - from 
zarr.abc.codec import Codec, CodecPipeline + from zarr.abc.codec import ( + ArrayArrayCodec, + ArrayBytesCodec, + BytesBytesCodec, + Codec, + CodecPipeline, + ) from zarr.core.buffer import Buffer, NDBuffer + from zarr.core.common import JSON __all__ = [ "Registry", @@ -151,6 +158,74 @@ def get_codec_class(key: str, reload_config: bool = False) -> type[Codec]: raise KeyError(key) +def _resolve_codec(data: dict[str, JSON]) -> Codec: + """ + Get a codec instance from a dict representation of that codec. + """ + # TODO: narrow the type of the input to only those dicts that map on to codec class instances. + return get_codec_class(data["name"]).from_dict(data) # type: ignore[arg-type] + + +def _parse_bytes_bytes_codec(data: dict[str, JSON] | Codec) -> BytesBytesCodec: + """ + Normalize the input to a ``BytesBytesCodec`` instance. + If the input is already a ``BytesBytesCodec``, it is returned as is. If the input is a dict, it + is converted to a ``BytesBytesCodec`` instance via the ``_resolve_codec`` function. + """ + from zarr.abc.codec import BytesBytesCodec + + if isinstance(data, dict): + result = _resolve_codec(data) + if not isinstance(result, BytesBytesCodec): + msg = f"Expected a dict representation of a BytesBytesCodec; got a dict representation of a {type(result)} instead." + raise TypeError(msg) + else: + if not isinstance(data, BytesBytesCodec): + raise TypeError(f"Expected a BytesBytesCodec. Got {type(data)} instead.") + result = data + return result + + +def _parse_array_bytes_codec(data: dict[str, JSON] | Codec) -> ArrayBytesCodec: + """ + Normalize the input to a ``ArrayBytesCodec`` instance. + If the input is already a ``ArrayBytesCodec``, it is returned as is. If the input is a dict, it + is converted to a ``ArrayBytesCodec`` instance via the ``_resolve_codec`` function. 
+ """ + from zarr.abc.codec import ArrayBytesCodec + + if isinstance(data, dict): + result = _resolve_codec(data) + if not isinstance(result, ArrayBytesCodec): + msg = f"Expected a dict representation of a ArrayBytesCodec; got a dict representation of a {type(result)} instead." + raise TypeError(msg) + else: + if not isinstance(data, ArrayBytesCodec): + raise TypeError(f"Expected a ArrayBytesCodec. Got {type(data)} instead.") + result = data + return result + + +def _parse_array_array_codec(data: dict[str, JSON] | Codec) -> ArrayArrayCodec: + """ + Normalize the input to a ``ArrayArrayCodec`` instance. + If the input is already a ``ArrayArrayCodec``, it is returned as is. If the input is a dict, it + is converted to a ``ArrayArrayCodec`` instance via the ``_resolve_codec`` function. + """ + from zarr.abc.codec import ArrayArrayCodec + + if isinstance(data, dict): + result = _resolve_codec(data) + if not isinstance(result, ArrayArrayCodec): + msg = f"Expected a dict representation of a ArrayArrayCodec; got a dict representation of a {type(result)} instead." + raise TypeError(msg) + else: + if not isinstance(data, ArrayArrayCodec): + raise TypeError(f"Expected a ArrayArrayCodec. 
Got {type(data)} instead.") + result = data + return result + + def get_pipeline_class(reload_config: bool = False) -> type[CodecPipeline]: if reload_config: _reload_config() diff --git a/src/zarr/testing/strategies.py b/src/zarr/testing/strategies.py index 85a67e3e69..ae0487e447 100644 --- a/src/zarr/testing/strategies.py +++ b/src/zarr/testing/strategies.py @@ -1,4 +1,4 @@ -from typing import Any, Literal +from typing import Any import hypothesis.extra.numpy as npst import hypothesis.strategies as st @@ -8,8 +8,10 @@ import zarr from zarr.core.array import Array +from zarr.core.common import ZarrFormat from zarr.core.sync import sync from zarr.storage import MemoryStore, StoreLike +from zarr.storage.common import _dereference_path # Copied from Xarray _attr_keys = st.text(st.characters(), min_size=1) @@ -68,7 +70,7 @@ def v2_dtypes() -> st.SearchStrategy[np.dtype]: # So we map a clear to reset the store. stores = st.builds(MemoryStore, st.just({})).map(lambda x: sync(x.clear())) compressors = st.sampled_from([None, "default"]) -zarr_formats: st.SearchStrategy[Literal[2, 3]] = st.sampled_from([2, 3]) +zarr_formats: st.SearchStrategy[ZarrFormat] = st.sampled_from([2, 3]) array_shapes = npst.array_shapes(max_dims=4, min_side=0) @@ -77,7 +79,7 @@ def numpy_arrays( draw: st.DrawFn, *, shapes: st.SearchStrategy[tuple[int, ...]] = array_shapes, - zarr_formats: st.SearchStrategy[Literal[2, 3]] = zarr_formats, + zarr_formats: st.SearchStrategy[ZarrFormat] = zarr_formats, ) -> Any: """ Generate numpy arrays that can be saved in the provided Zarr format. 
@@ -137,7 +139,7 @@ def arrays( expected_attrs = {} if attributes is None else attributes - array_path = path + ("/" if not path.endswith("/") else "") + name + array_path = _dereference_path(path, name) root = zarr.open_group(store, mode="w", zarr_format=zarr_format) a = root.create_array( diff --git a/tests/test_api.py b/tests/test_api.py index d25ec54bfe..80e8555e11 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -13,6 +13,8 @@ from zarr.abc.store import Store from zarr.api.synchronous import ( create, + create_array, + create_group, group, load, open, @@ -21,13 +23,13 @@ save_array, save_group, ) -from zarr.core.common import MemoryOrder, ZarrFormat +from zarr.core.common import JSON, MemoryOrder, ZarrFormat from zarr.errors import MetadataValidationError from zarr.storage._utils import normalize_path from zarr.storage.memory import MemoryStore -def test_create_array(memory_store: Store) -> None: +def test_create(memory_store: Store) -> None: store = memory_store # create array @@ -56,6 +58,22 @@ def test_create_array(memory_store: Store) -> None: z = create(shape=(400, 100), chunks=(16, 16.5), store=store, overwrite=True) # type: ignore [arg-type] +# TODO: parametrize over everything this function takes +@pytest.mark.parametrize("store", ["memory"], indirect=True) +def test_create_array(store: Store) -> None: + attrs: dict[str, JSON] = {"foo": 100} # explicit type annotation to avoid mypy error + shape = (10, 10) + path = "foo" + data_val = 1 + array_w = create_array( + store, name=path, shape=shape, attributes=attrs, chunks=shape, dtype="uint8" + ) + array_w[:] = data_val + assert array_w.shape == shape + assert array_w.attrs == attrs + assert np.array_equal(array_w[:], np.zeros(shape, dtype=array_w.dtype) + data_val) + + @pytest.mark.parametrize("write_empty_chunks", [True, False]) def test_write_empty_chunks_warns(write_empty_chunks: bool) -> None: """ @@ -113,6 +131,16 @@ async def test_open_array(memory_store: MemoryStore) -> None: 
open(store="doesnotexist", mode="r") +@pytest.mark.parametrize("store", ["memory"], indirect=True) +async def test_create_group(store: Store, zarr_format: ZarrFormat) -> None: + attrs = {"foo": 100} + path = "node" + node = create_group(store, path=path, attributes=attrs, zarr_format=zarr_format) + assert isinstance(node, Group) + assert node.attrs == attrs + assert node.metadata.zarr_format == zarr_format + + async def test_open_group(memory_store: MemoryStore) -> None: store = memory_store diff --git a/tests/test_array.py b/tests/test_array.py index 891538bc43..72ff68d954 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -2,21 +2,38 @@ import json import math import pickle +import re from itertools import accumulate -from typing import Any, Literal +from typing import TYPE_CHECKING, Any, Literal import numcodecs import numpy as np import pytest -from numcodecs import Zstd import zarr.api.asynchronous from zarr import Array, AsyncArray, Group -from zarr.codecs import BytesCodec, VLenBytesCodec, ZstdCodec +from zarr.codecs import ( + BytesCodec, + GzipCodec, + TransposeCodec, + VLenBytesCodec, + VLenUTF8Codec, + ZstdCodec, +) from zarr.core._info import ArrayInfo -from zarr.core.array import chunks_initialized +from zarr.core.array import ( + CompressorsLike, + FiltersLike, + _get_default_chunk_encoding_v2, + _get_default_chunk_encoding_v3, + _parse_chunk_encoding_v2, + _parse_chunk_encoding_v3, + chunks_initialized, + create_array, +) from zarr.core.buffer import default_buffer_prototype from zarr.core.buffer.cpu import NDBuffer +from zarr.core.chunk_grids import _auto_partition from zarr.core.common import JSON, MemoryOrder, ZarrFormat from zarr.core.group import AsyncGroup from zarr.core.indexing import ceildiv @@ -26,6 +43,9 @@ from zarr.storage import LocalStore, MemoryStore from zarr.storage.common import StorePath +if TYPE_CHECKING: + from zarr.core.array_spec import ArrayConfigLike + @pytest.mark.parametrize("store", ["local", "memory", "zip"], 
indirect=["store"]) @pytest.mark.parametrize("zarr_format", [2, 3]) @@ -58,7 +78,7 @@ def test_array_creation_existing_node( if overwrite: if not store.supports_deletes: pytest.skip("store does not support deletes") - arr_new = Array.create( + arr_new = zarr.create_array( spath / "extant", shape=new_shape, dtype=new_dtype, @@ -69,7 +89,7 @@ def test_array_creation_existing_node( assert arr_new.dtype == new_dtype else: with pytest.raises(expected_exception): - arr_new = Array.create( + arr_new = zarr.create_array( spath / "extant", shape=new_shape, dtype=new_dtype, @@ -123,7 +143,9 @@ async def test_create_creates_parents( def test_array_name_properties_no_group( store: LocalStore | MemoryStore, zarr_format: ZarrFormat ) -> None: - arr = Array.create(store=store, shape=(100,), chunks=(10,), zarr_format=zarr_format, dtype="i4") + arr = zarr.create_array( + store=store, shape=(100,), chunks=(10,), zarr_format=zarr_format, dtype="i4" + ) assert arr.path == "" assert arr.name == "/" assert arr.basename == "" @@ -161,17 +183,17 @@ def test_array_v3_fill_value_default( shape = (10,) default_fill_value = 0 if specifiy_fill_value: - arr = Array.create( + arr = zarr.create_array( store=store, shape=shape, dtype=dtype_str, zarr_format=3, - chunk_shape=shape, + chunks=shape, fill_value=None, ) else: - arr = Array.create( - store=store, shape=shape, dtype=dtype_str, zarr_format=3, chunk_shape=shape + arr = zarr.create_array( + store=store, shape=shape, dtype=dtype_str, zarr_format=3, chunks=shape ) assert arr.fill_value == np.dtype(dtype_str).type(default_fill_value) @@ -185,12 +207,12 @@ def test_array_v3_fill_value_default( ) def test_array_v3_fill_value(store: MemoryStore, fill_value: int, dtype_str: str) -> None: shape = (10,) - arr = Array.create( + arr = zarr.create_array( store=store, shape=shape, dtype=dtype_str, zarr_format=3, - chunk_shape=shape, + chunks=shape, fill_value=fill_value, ) @@ -201,12 +223,12 @@ def test_array_v3_fill_value(store: MemoryStore, fill_value: 
int, dtype_str: str def test_create_positional_args_deprecated() -> None: store = MemoryStore() with pytest.warns(FutureWarning, match="Pass"): - Array.create(store, (2, 2), dtype="f8") + zarr.Array.create(store, (2, 2), dtype="f8") def test_selection_positional_args_deprecated() -> None: store = MemoryStore() - arr = Array.create(store, shape=(2, 2), dtype="f8") + arr = zarr.create_array(store, shape=(2, 2), dtype="f8") with pytest.warns(FutureWarning, match="Pass out"): arr.get_basic_selection(..., NDBuffer(array=np.empty((2, 2)))) @@ -242,12 +264,12 @@ def test_selection_positional_args_deprecated() -> None: @pytest.mark.parametrize("store", ["memory"], indirect=True) async def test_array_v3_nan_fill_value(store: MemoryStore) -> None: shape = (10,) - arr = Array.create( + arr = zarr.create_array( store=store, shape=shape, dtype=np.float64, zarr_format=3, - chunk_shape=shape, + chunks=shape, fill_value=np.nan, ) arr[:] = np.nan @@ -263,7 +285,7 @@ async def test_array_v3_nan_fill_value(store: MemoryStore) -> None: async def test_serializable_async_array( store: LocalStore | MemoryStore, zarr_format: ZarrFormat ) -> None: - expected = await AsyncArray.create( + expected = await zarr.api.asynchronous.create_array( store=store, shape=(100,), chunks=(10,), zarr_format=zarr_format, dtype="i4" ) # await expected.setitems(list(range(100))) @@ -279,7 +301,7 @@ async def test_serializable_async_array( @pytest.mark.parametrize("store", ["local"], indirect=["store"]) @pytest.mark.parametrize("zarr_format", [2, 3]) def test_serializable_sync_array(store: LocalStore, zarr_format: ZarrFormat) -> None: - expected = Array.create( + expected = zarr.create_array( store=store, shape=(100,), chunks=(10,), zarr_format=zarr_format, dtype="i4" ) expected[:] = list(range(100)) @@ -320,7 +342,7 @@ def test_nchunks(test_cls: type[Array] | type[AsyncArray[Any]], nchunks: int) -> """ store = MemoryStore() shape = 100 - arr = Array.create(store, shape=(shape,), chunks=(ceildiv(shape, 
nchunks),), dtype="i4") + arr = zarr.create_array(store, shape=(shape,), chunks=(ceildiv(shape, nchunks),), dtype="i4") expected = nchunks if test_cls == Array: observed = arr.nchunks @@ -335,7 +357,7 @@ async def test_nchunks_initialized(test_cls: type[Array] | type[AsyncArray[Any]] Test that nchunks_initialized accurately returns the number of stored chunks. """ store = MemoryStore() - arr = Array.create(store, shape=(100,), chunks=(10,), dtype="i4") + arr = zarr.create_array(store, shape=(100,), chunks=(10,), dtype="i4") # write chunks one at a time for idx, region in enumerate(arr._iter_chunk_regions()): @@ -363,7 +385,7 @@ async def test_chunks_initialized() -> None: Test that chunks_initialized accurately returns the keys of stored chunks. """ store = MemoryStore() - arr = Array.create(store, shape=(100,), chunks=(10,), dtype="i4") + arr = zarr.create_array(store, shape=(100,), chunks=(10,), dtype="i4") chunks_accumulated = tuple( accumulate(tuple(tuple(v.split(" ")) for v in arr._iter_chunk_keys())) @@ -402,44 +424,54 @@ async def test_nbytes_stored_async() -> None: def test_default_fill_values() -> None: - a = Array.create(MemoryStore(), shape=5, chunk_shape=5, dtype=" None: with pytest.raises(ValueError, match="At least one ArrayBytesCodec is required."): - Array.create(MemoryStore(), shape=5, chunk_shape=5, dtype=" None: +def test_update_attrs(zarr_format: ZarrFormat) -> None: # regression test for https://github.com/zarr-developers/zarr-python/issues/2328 store = MemoryStore() - arr = Array.create(store=store, shape=5, chunk_shape=5, dtype="f8", zarr_format=zarr_format) + arr = zarr.create_array( + store=store, shape=(5,), chunks=(5,), dtype="f8", zarr_format=zarr_format + ) arr.attrs["foo"] = "bar" assert arr.attrs["foo"] == "bar" @@ -460,7 +492,7 @@ def test_info_v2(self) -> None: _read_only=False, _store_type="MemoryStore", _count_bytes=128, - _filters=(numcodecs.Zstd(),), + _compressor=numcodecs.Zstd(), ) assert result == expected 
@@ -516,8 +548,8 @@ async def test_info_v2_async(self) -> None: _order="C", _read_only=False, _store_type="MemoryStore", - _filters=(Zstd(level=0),), _count_bytes=128, + _compressor=numcodecs.Zstd(), ) assert result == expected @@ -757,7 +789,7 @@ def test_array_create_metadata_order_v2( keyword argument to ``Array.create``. When ``order`` is ``None``, the value of the ``array.order`` config is used. """ - arr = Array.create(store=store, shape=(2, 2), order=order, zarr_format=2, dtype="i4") + arr = zarr.create_array(store=store, shape=(2, 2), order=order, zarr_format=2, dtype="i4") expected = order or zarr.config.get("array.order") assert arr.metadata.order == expected # type: ignore[union-attr] @@ -767,13 +799,14 @@ def test_array_create_metadata_order_v2( @pytest.mark.parametrize("store", ["memory"], indirect=True) def test_array_create_order( order_config: MemoryOrder | None, - zarr_format: int, + zarr_format: ZarrFormat, store: MemoryStore, ) -> None: """ Test that the arrays generated by array indexing have a memory order defined by the config order value """ + config: ArrayConfigLike = {} if order_config is None: config = {} expected = zarr.config.get("array.order") @@ -781,7 +814,7 @@ def test_array_create_order( config = {"order": order_config} expected = order_config - arr = Array.create( + arr = zarr.create_array( store=store, shape=(2, 2), zarr_format=zarr_format, dtype="i4", config=config ) @@ -801,7 +834,7 @@ def test_write_empty_chunks_config(write_empty_chunks: bool) -> None: explicitly """ with zarr.config.set({"array.write_empty_chunks": write_empty_chunks}): - arr = Array.create({}, shape=(2, 2), dtype="i4") + arr = zarr.create_array({}, shape=(2, 2), dtype="i4") assert arr._async_array._config.write_empty_chunks == write_empty_chunks @@ -821,13 +854,13 @@ def test_write_empty_chunks_behavior( already present. 
""" - arr = Array.create( + arr = zarr.create_array( store=store, shape=(2,), zarr_format=zarr_format, dtype="i4", fill_value=fill_value, - chunk_shape=(1,), + chunks=(1,), config={"write_empty_chunks": write_empty_chunks}, ) @@ -858,7 +891,7 @@ def test_write_empty_chunks_behavior( ) async def test_special_complex_fill_values_roundtrip(fill_value: Any, expected: list[Any]) -> None: store = MemoryStore() - Array.create(store=store, shape=(1,), dtype=np.complex64, fill_value=fill_value) + zarr.create_array(store=store, shape=(1,), dtype=np.complex64, fill_value=fill_value) content = await store.get("zarr.json", prototype=default_buffer_prototype()) assert content is not None actual = json.loads(content.to_bytes()) @@ -876,13 +909,281 @@ async def test_nbytes( the chunks of that array. """ store = MemoryStore() - arr = Array.create(store=store, shape=shape, dtype=dtype, fill_value=0) + arr = zarr.create_array(store=store, shape=shape, dtype=dtype, fill_value=0) if array_type == "async": assert arr._async_array.nbytes == np.prod(arr.shape) * arr.dtype.itemsize else: assert arr.nbytes == np.prod(arr.shape) * arr.dtype.itemsize +@pytest.mark.parametrize( + ("array_shape", "chunk_shape"), + [((256,), (2,))], +) +def test_auto_partition_auto_shards( + array_shape: tuple[int, ...], chunk_shape: tuple[int, ...] +) -> None: + """ + Test that automatically picking a shard size returns a tuple of 2 * the chunk shape for any axis + where there are 8 or more chunks. + """ + dtype = np.dtype("uint8") + expected_shards: tuple[int, ...] 
= () + for cs, a_len in zip(chunk_shape, array_shape, strict=False): + if a_len // cs >= 8: + expected_shards += (2 * cs,) + else: + expected_shards += (cs,) + + auto_shards, _ = _auto_partition( + array_shape=array_shape, chunk_shape=chunk_shape, shard_shape="auto", dtype=dtype + ) + assert auto_shards == expected_shards + + +def test_chunks_and_shards() -> None: + store = StorePath(MemoryStore()) + shape = (100, 100) + chunks = (5, 5) + shards = (10, 10) + + arr_v3 = zarr.create_array(store=store / "v3", shape=shape, chunks=chunks, dtype="i4") + assert arr_v3.chunks == chunks + assert arr_v3.shards is None + + arr_v3_sharding = zarr.create_array( + store=store / "v3_sharding", + shape=shape, + chunks=chunks, + shards=shards, + dtype="i4", + ) + assert arr_v3_sharding.chunks == chunks + assert arr_v3_sharding.shards == shards + + arr_v2 = zarr.create_array( + store=store / "v2", shape=shape, chunks=chunks, zarr_format=2, dtype="i4" + ) + assert arr_v2.chunks == chunks + assert arr_v2.shards is None + + +def test_create_array_default_fill_values() -> None: + a = zarr.create_array(MemoryStore(), shape=(5,), chunks=(5,), dtype=" None: + """ + Test that the default ``filters`` and ``compressors`` are removed when ``create_array`` is invoked. 
+ """ + + # v2 + arr = await create_array( + store=store, + dtype=dtype, + shape=(10,), + zarr_format=2, + compressors=empty_value, + filters=empty_value, + ) + # The v2 metadata stores None and () separately + assert arr.metadata.filters == empty_value # type: ignore[union-attr] + # The v2 metadata does not allow tuple for compressor, therefore it is turned into None + assert arr.metadata.compressor is None # type: ignore[union-attr] + + # v3 + arr = await create_array( + store=store, + dtype=dtype, + shape=(10,), + compressors=empty_value, + filters=empty_value, + ) + if dtype == "str": + assert arr.metadata.codecs == [VLenUTF8Codec()] # type: ignore[union-attr] + else: + assert arr.metadata.codecs == [BytesCodec()] # type: ignore[union-attr] + + +@pytest.mark.parametrize("store", ["memory"], indirect=True) +@pytest.mark.parametrize("dtype", ["uint8", "float32", "str"]) +@pytest.mark.parametrize( + "compressors", + [ + "auto", + None, + (), + (ZstdCodec(level=3),), + (ZstdCodec(level=3), GzipCodec(level=0)), + ZstdCodec(level=3), + {"name": "zstd", "configuration": {"level": 3}}, + ({"name": "zstd", "configuration": {"level": 3}},), + ], +) +@pytest.mark.parametrize( + "filters", + [ + "auto", + None, + (), + ( + TransposeCodec( + order=[ + 0, + ] + ), + ), + ( + TransposeCodec( + order=[ + 0, + ] + ), + TransposeCodec( + order=[ + 0, + ] + ), + ), + TransposeCodec( + order=[ + 0, + ] + ), + {"name": "transpose", "configuration": {"order": [0]}}, + ({"name": "transpose", "configuration": {"order": [0]}},), + ], +) +async def test_create_array_v3_chunk_encoding( + store: MemoryStore, compressors: CompressorsLike, filters: FiltersLike, dtype: str +) -> None: + """ + Test various possibilities for the compressors and filters parameter to create_array + """ + arr = await create_array( + store=store, + dtype=dtype, + shape=(10,), + zarr_format=3, + filters=filters, + compressors=compressors, + ) + aa_codecs_expected, _, bb_codecs_expected = _parse_chunk_encoding_v3( + 
filters=filters, compressors=compressors, serializer="auto", dtype=np.dtype(dtype) + ) + # TODO: find a better way to get the filters / compressors from the array. + assert arr.codec_pipeline.array_array_codecs == aa_codecs_expected # type: ignore[attr-defined] + assert arr.codec_pipeline.bytes_bytes_codecs == bb_codecs_expected # type: ignore[attr-defined] + + +@pytest.mark.parametrize("store", ["memory"], indirect=True) +@pytest.mark.parametrize("dtype", ["uint8", "float32", "str"]) +@pytest.mark.parametrize( + "compressors", + [ + "auto", + None, + numcodecs.Zstd(level=3), + (), + (numcodecs.Zstd(level=3),), + ], +) +@pytest.mark.parametrize( + "filters", ["auto", None, numcodecs.GZip(level=1), (numcodecs.GZip(level=1),)] +) +async def test_create_array_v2_chunk_encoding( + store: MemoryStore, compressors: CompressorsLike, filters: FiltersLike, dtype: str +) -> None: + arr = await create_array( + store=store, + dtype=dtype, + shape=(10,), + zarr_format=2, + compressors=compressors, + filters=filters, + ) + filters_expected, compressor_expected = _parse_chunk_encoding_v2( + filters=filters, compressor=compressors, dtype=np.dtype(dtype) + ) + # TODO: find a better way to get the filters/compressor from the array. + assert arr.metadata.compressor == compressor_expected # type: ignore[union-attr] + assert arr.metadata.filters == filters_expected # type: ignore[union-attr] + + +@pytest.mark.parametrize("store", ["memory"], indirect=True) +@pytest.mark.parametrize("dtype", ["uint8", "float32", "str"]) +async def test_create_array_v3_default_filters_compressors(store: MemoryStore, dtype: str) -> None: + """ + Test that the default ``filters`` and ``compressors`` are used when ``create_array`` is invoked with + ``zarr_format`` = 3 and ``filters`` and ``compressors`` are not specified. 
+ """ + arr = await create_array( + store=store, + dtype=dtype, + shape=(10,), + zarr_format=3, + ) + expected_aa, expected_ab, expected_bb = _get_default_chunk_encoding_v3(np_dtype=np.dtype(dtype)) + # TODO: define the codec pipeline class such that these fields are required, which will obviate the + # type ignore statements + assert arr.codec_pipeline.array_array_codecs == expected_aa # type: ignore[attr-defined] + assert arr.codec_pipeline.bytes_bytes_codecs == expected_bb # type: ignore[attr-defined] + assert arr.codec_pipeline.array_bytes_codec == expected_ab # type: ignore[attr-defined] + + +@pytest.mark.parametrize("store", ["memory"], indirect=True) +@pytest.mark.parametrize("dtype", ["uint8", "float32", "str"]) +async def test_create_array_v2_default_filters_compressors(store: MemoryStore, dtype: str) -> None: + """ + Test that the default ``filters`` and ``compressors`` are used when ``create_array`` is invoked with + ``zarr_format`` = 2 and ``filters`` and ``compressors`` are not specified. + """ + arr = await create_array( + store=store, + dtype=dtype, + shape=(10,), + zarr_format=2, + ) + expected_filters, expected_compressors = _get_default_chunk_encoding_v2( + np_dtype=np.dtype(dtype) + ) + assert arr.metadata.filters == expected_filters # type: ignore[union-attr] + assert arr.metadata.compressor == expected_compressors # type: ignore[union-attr] + + +@pytest.mark.parametrize("store", ["memory"], indirect=True) +async def test_create_array_v2_no_shards(store: MemoryStore) -> None: + """ + Test that creating a Zarr v2 array with ``shard_shape`` set to a non-None value raises an error. + """ + msg = re.escape( + "Zarr v2 arrays can only be created with `shard_shape` set to `None`. Got `shard_shape=(5,)` instead." + ) + with pytest.raises(ValueError, match=msg): + _ = await create_array( + store=store, + dtype="uint8", + shape=(10,), + shards=(5,), + zarr_format=2, + ) + + async def test_scalar_array() -> None: arr = zarr.array(1.5) assert arr[...] 
== 1.5 diff --git a/tests/test_buffer.py b/tests/test_buffer.py index 7a275516c6..e3cab0f214 100644 --- a/tests/test_buffer.py +++ b/tests/test_buffer.py @@ -5,9 +5,8 @@ import numpy as np import pytest -from zarr import AsyncArray +import zarr from zarr.codecs.blosc import BloscCodec -from zarr.codecs.bytes import BytesCodec from zarr.codecs.crc32c_ import Crc32cCodec from zarr.codecs.gzip import GzipCodec from zarr.codecs.transpose import TransposeCodec @@ -47,10 +46,10 @@ async def test_async_array_prototype() -> None: """Test the use of a custom buffer prototype""" expect = np.zeros((9, 9), dtype="uint16", order="F") - a = await AsyncArray.create( + a = await zarr.api.asynchronous.create_array( StorePath(StoreExpectingTestBuffer()) / "test_async_array_prototype", shape=expect.shape, - chunk_shape=(5, 5), + chunks=(5, 5), dtype=expect.dtype, fill_value=0, ) @@ -76,10 +75,10 @@ async def test_async_array_gpu_prototype() -> None: """Test the use of the GPU buffer prototype""" expect = cp.zeros((9, 9), dtype="uint16", order="F") - a = await AsyncArray.create( + a = await zarr.api.asynchronous.create_array( StorePath(MemoryStore()) / "test_async_array_gpu_prototype", shape=expect.shape, - chunk_shape=(5, 5), + chunks=(5, 5), dtype=expect.dtype, fill_value=0, ) @@ -98,20 +97,14 @@ async def test_async_array_gpu_prototype() -> None: @pytest.mark.asyncio async def test_codecs_use_of_prototype() -> None: expect = np.zeros((10, 10), dtype="uint16", order="F") - a = await AsyncArray.create( + a = await zarr.api.asynchronous.create_array( StorePath(StoreExpectingTestBuffer()) / "test_codecs_use_of_prototype", shape=expect.shape, - chunk_shape=(5, 5), + chunks=(5, 5), dtype=expect.dtype, fill_value=0, - codecs=[ - TransposeCodec(order=(1, 0)), - BytesCodec(), - BloscCodec(), - Crc32cCodec(), - GzipCodec(), - ZstdCodec(), - ], + compressors=[BloscCodec(), Crc32cCodec(), GzipCodec(), ZstdCodec()], + filters=[TransposeCodec(order=(1, 0))], ) expect[:] = 
np.arange(100).reshape(10, 10) @@ -133,20 +126,14 @@ async def test_codecs_use_of_prototype() -> None: @pytest.mark.asyncio async def test_codecs_use_of_gpu_prototype() -> None: expect = cp.zeros((10, 10), dtype="uint16", order="F") - a = await AsyncArray.create( + a = await zarr.api.asynchronous.create_array( StorePath(MemoryStore()) / "test_codecs_use_of_gpu_prototype", shape=expect.shape, - chunk_shape=(5, 5), + chunks=(5, 5), dtype=expect.dtype, fill_value=0, - codecs=[ - TransposeCodec(order=(1, 0)), - BytesCodec(), - BloscCodec(), - Crc32cCodec(), - GzipCodec(), - ZstdCodec(), - ], + compressors=[BloscCodec(), Crc32cCodec(), GzipCodec(), ZstdCodec()], + filters=[TransposeCodec(order=(1, 0))], ) expect[:] = cp.arange(100).reshape(10, 10) diff --git a/tests/test_codecs/test_blosc.py b/tests/test_codecs/test_blosc.py index 416a2f784e..34044d7d62 100644 --- a/tests/test_codecs/test_blosc.py +++ b/tests/test_codecs/test_blosc.py @@ -3,9 +3,9 @@ import numpy as np import pytest -from zarr import AsyncArray +import zarr from zarr.abc.store import Store -from zarr.codecs import BloscCodec, BytesCodec, ShardingCodec +from zarr.codecs import BloscCodec from zarr.core.buffer import default_buffer_prototype from zarr.storage.common import StorePath @@ -16,13 +16,13 @@ async def test_blosc_evolve(store: Store, dtype: str) -> None: typesize = np.dtype(dtype).itemsize path = "blosc_evolve" spath = StorePath(store, path) - await AsyncArray.create( + await zarr.api.asynchronous.create_array( spath, shape=(16, 16), - chunk_shape=(16, 16), + chunks=(16, 16), dtype=dtype, fill_value=0, - codecs=[BytesCodec(), BloscCodec()], + compressors=BloscCodec(), ) buf = await store.get(f"{path}/zarr.json", prototype=default_buffer_prototype()) assert buf is not None @@ -36,13 +36,14 @@ async def test_blosc_evolve(store: Store, dtype: str) -> None: path2 = "blosc_evolve_sharding" spath2 = StorePath(store, path2) - await AsyncArray.create( + await zarr.api.asynchronous.create_array( spath2, 
shape=(16, 16), - chunk_shape=(16, 16), + chunks=(16, 16), + shards=(16, 16), dtype=dtype, fill_value=0, - codecs=[ShardingCodec(chunk_shape=(16, 16), codecs=[BytesCodec(), BloscCodec()])], + compressors=BloscCodec(), ) buf = await store.get(f"{path2}/zarr.json", prototype=default_buffer_prototype()) assert buf is not None diff --git a/tests/test_codecs/test_codecs.py b/tests/test_codecs/test_codecs.py index 2025e72937..e36a332440 100644 --- a/tests/test_codecs/test_codecs.py +++ b/tests/test_codecs/test_codecs.py @@ -7,6 +7,9 @@ import numpy as np import pytest +import zarr +import zarr.api +import zarr.api.asynchronous from zarr import Array, AsyncArray, config from zarr.codecs import ( BytesCodec, @@ -19,7 +22,6 @@ from zarr.storage import StorePath if TYPE_CHECKING: - from zarr.abc.codec import Codec from zarr.abc.store import Store from zarr.core.buffer.core import NDArrayLike from zarr.core.common import MemoryOrder @@ -75,27 +77,18 @@ async def test_order( data = np.arange(0, 256, dtype="uint16").reshape((32, 8), order=input_order) path = "order" spath = StorePath(store, path=path) - codecs_: list[Codec] = ( - [ - ShardingCodec( - chunk_shape=(16, 8), - codecs=[TransposeCodec(order=order_from_dim(store_order, data.ndim)), BytesCodec()], - ) - ] - if with_sharding - else [TransposeCodec(order=order_from_dim(store_order, data.ndim)), BytesCodec()] - ) - with config.set({"array.order": runtime_write_order}): - a = await AsyncArray.create( - spath, - shape=data.shape, - chunk_shape=(32, 8), - dtype=data.dtype, - fill_value=0, - chunk_key_encoding=("v2", "."), - codecs=codecs_, - ) + a = await zarr.api.asynchronous.create_array( + spath, + shape=data.shape, + chunks=(16, 8) if with_sharding else (32, 8), + shards=(32, 8) if with_sharding else None, + dtype=data.dtype, + fill_value=0, + chunk_key_encoding={"name": "v2", "separator": "."}, + filters=[TransposeCodec(order=order_from_dim(store_order, data.ndim))], + config={"order": runtime_write_order}, + ) await 
_AsyncArrayProxy(a)[:, :].set(data) read_data = await _AsyncArrayProxy(a)[:, :].get() @@ -131,16 +124,15 @@ def test_order_implicit( data = np.arange(0, 256, dtype="uint16").reshape((16, 16), order=input_order) path = "order_implicit" spath = StorePath(store, path) - codecs_: list[Codec] | None = [ShardingCodec(chunk_shape=(8, 8))] if with_sharding else None with config.set({"array.order": runtime_write_order}): - a = Array.create( + a = zarr.create_array( spath, shape=data.shape, - chunk_shape=(16, 16), + chunks=(8, 8) if with_sharding else (16, 16), + shards=(16, 16) if with_sharding else None, dtype=data.dtype, fill_value=0, - codecs=codecs_, ) a[:, :] = data @@ -161,10 +153,10 @@ def test_order_implicit( @pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"]) def test_open(store: Store) -> None: spath = StorePath(store) - a = Array.create( + a = zarr.create_array( spath, shape=(16, 16), - chunk_shape=(16, 16), + chunks=(16, 16), dtype="int32", fill_value=0, ) @@ -228,10 +220,10 @@ def test_morton2(shape) -> None: def test_write_partial_chunks(store: Store) -> None: data = np.arange(0, 256, dtype="uint16").reshape((16, 16)) spath = StorePath(store) - a = Array.create( + a = zarr.create_array( spath, shape=data.shape, - chunk_shape=(20, 20), + chunks=(20, 20), dtype=data.dtype, fill_value=1, ) @@ -244,10 +236,10 @@ async def test_delete_empty_chunks(store: Store) -> None: data = np.ones((16, 16)) path = "delete_empty_chunks" spath = StorePath(store, path) - a = await AsyncArray.create( + a = await zarr.api.asynchronous.create_array( spath, shape=data.shape, - chunk_shape=(32, 32), + chunks=(32, 32), dtype=data.dtype, fill_value=1, ) @@ -262,25 +254,25 @@ async def test_dimension_names(store: Store) -> None: data = np.arange(0, 256, dtype="uint16").reshape((16, 16)) path = "dimension_names" spath = StorePath(store, path) - await AsyncArray.create( + await zarr.api.asynchronous.create_array( spath, shape=data.shape, - chunk_shape=(16, 16), + 
chunks=(16, 16), dtype=data.dtype, fill_value=0, dimension_names=("x", "y"), ) - assert (await AsyncArray.open(spath)).metadata.dimension_names == ( + assert (await zarr.api.asynchronous.open_array(store=spath)).metadata.dimension_names == ( "x", "y", ) path2 = "dimension_names2" spath2 = StorePath(store, path2) - await AsyncArray.create( + await zarr.api.asynchronous.create_array( spath2, shape=data.shape, - chunk_shape=(16, 16), + chunks=(16, 16), dtype=data.dtype, fill_value=0, ) @@ -293,7 +285,7 @@ async def test_dimension_names(store: Store) -> None: @pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"]) def test_invalid_metadata(store: Store) -> None: - spath2 = StorePath(store, "invalid_endian") + spath2 = StorePath(store, "invalid_codec_order") with pytest.raises(TypeError): Array.create( spath2, @@ -302,7 +294,7 @@ def test_invalid_metadata(store: Store) -> None: dtype=np.dtype("uint8"), fill_value=0, codecs=[ - BytesCodec(endian="big"), + BytesCodec(), TransposeCodec(order=order_from_dim("F", 2)), ], ) @@ -315,8 +307,8 @@ def test_invalid_metadata(store: Store) -> None: dtype=np.dtype("uint8"), fill_value=0, codecs=[ - BytesCodec(), TransposeCodec(order="F"), # type: ignore[arg-type] + BytesCodec(), ], ) spath4 = StorePath(store, "invalid_missing_bytes_codec") @@ -370,17 +362,34 @@ def test_invalid_metadata(store: Store) -> None: ) +@pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"]) +def test_invalid_metadata_create_array(store: Store) -> None: + spath = StorePath(store, "warning_inefficient_codecs") + with pytest.warns(UserWarning): + zarr.create_array( + spath, + shape=(16, 16), + chunks=(16, 16), + dtype=np.dtype("uint8"), + fill_value=0, + serializer=ShardingCodec(chunk_shape=(8, 8)), + compressors=[ + GzipCodec(), + ], + ) + + @pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"]) async def test_resize(store: Store) -> None: data = np.zeros((16, 18), dtype="uint16") path = "resize" spath 
= StorePath(store, path) - a = await AsyncArray.create( + a = await zarr.api.asynchronous.create_array( spath, shape=data.shape, - chunk_shape=(10, 10), + chunks=(10, 10), dtype=data.dtype, - chunk_key_encoding=("v2", "."), + chunk_key_encoding={"name": "v2", "separator": "."}, fill_value=1, ) diff --git a/tests/test_codecs/test_endian.py b/tests/test_codecs/test_endian.py index db4e77451c..ae9d1f6f1f 100644 --- a/tests/test_codecs/test_endian.py +++ b/tests/test_codecs/test_endian.py @@ -3,7 +3,7 @@ import numpy as np import pytest -from zarr import AsyncArray +import zarr from zarr.abc.store import Store from zarr.codecs import BytesCodec from zarr.storage.common import StorePath @@ -17,14 +17,14 @@ async def test_endian(store: Store, endian: Literal["big", "little"]) -> None: data = np.arange(0, 256, dtype="uint16").reshape((16, 16)) path = "endian" spath = StorePath(store, path) - a = await AsyncArray.create( + a = await zarr.api.asynchronous.create_array( spath, shape=data.shape, - chunk_shape=(16, 16), + chunks=(16, 16), dtype=data.dtype, fill_value=0, - chunk_key_encoding=("v2", "."), - codecs=[BytesCodec(endian=endian)], + chunk_key_encoding={"name": "v2", "separator": "."}, + serializer=BytesCodec(endian=endian), ) await _AsyncArrayProxy(a)[:, :].set(data) @@ -43,14 +43,14 @@ async def test_endian_write( data = np.arange(0, 256, dtype=dtype_input_endian).reshape((16, 16)) path = "endian" spath = StorePath(store, path) - a = await AsyncArray.create( + a = await zarr.api.asynchronous.create_array( spath, shape=data.shape, - chunk_shape=(16, 16), + chunks=(16, 16), dtype="uint16", fill_value=0, - chunk_key_encoding=("v2", "."), - codecs=[BytesCodec(endian=dtype_store_endian)], + chunk_key_encoding={"name": "v2", "separator": "."}, + serializer=BytesCodec(endian=dtype_store_endian), ) await _AsyncArrayProxy(a)[:, :].set(data) diff --git a/tests/test_codecs/test_gzip.py b/tests/test_codecs/test_gzip.py index 7b4d231813..f47f9710b1 100644 --- 
a/tests/test_codecs/test_gzip.py +++ b/tests/test_codecs/test_gzip.py @@ -1,9 +1,9 @@ import numpy as np import pytest -from zarr import Array +import zarr from zarr.abc.store import Store -from zarr.codecs import BytesCodec, GzipCodec +from zarr.codecs import GzipCodec from zarr.storage.common import StorePath @@ -11,13 +11,13 @@ def test_gzip(store: Store) -> None: data = np.arange(0, 256, dtype="uint16").reshape((16, 16)) - a = Array.create( + a = zarr.create_array( StorePath(store), shape=data.shape, - chunk_shape=(16, 16), + chunks=(16, 16), dtype=data.dtype, fill_value=0, - codecs=[BytesCodec(), GzipCodec()], + compressors=GzipCodec(), ) a[:, :] = data diff --git a/tests/test_codecs/test_sharding.py b/tests/test_codecs/test_sharding.py index 51c82067f3..3f14007351 100644 --- a/tests/test_codecs/test_sharding.py +++ b/tests/test_codecs/test_sharding.py @@ -5,11 +5,13 @@ import numpy.typing as npt import pytest -from zarr import Array, AsyncArray +import zarr +import zarr.api +import zarr.api.asynchronous +from zarr import Array from zarr.abc.store import Store from zarr.codecs import ( BloscCodec, - BytesCodec, ShardingCodec, ShardingCodecIndexLocation, TransposeCodec, @@ -45,23 +47,16 @@ def test_sharding( """ data = array_fixture spath = StorePath(store) - arr = Array.create( + + arr = zarr.create_array( spath, shape=tuple(s + offset for s in data.shape), - chunk_shape=(64,) * data.ndim, + chunks=(32,) * data.ndim, + shards={"shape": (64,) * data.ndim, "index_location": index_location}, dtype=data.dtype, fill_value=6, - codecs=[ - ShardingCodec( - chunk_shape=(32,) * data.ndim, - codecs=[ - TransposeCodec(order=order_from_dim("F", data.ndim)), - BytesCodec(), - BloscCodec(cname="lz4"), - ], - index_location=index_location, - ) - ], + filters=[TransposeCodec(order=order_from_dim("F", data.ndim))], + compressors=BloscCodec(cname="lz4"), ) write_region = tuple(slice(offset, None) for dim in range(data.ndim)) arr[write_region] = data @@ -89,23 +84,15 @@ def 
test_sharding_partial( ) -> None: data = array_fixture spath = StorePath(store) - a = Array.create( + a = zarr.create_array( spath, shape=tuple(a + 10 for a in data.shape), - chunk_shape=(64, 64, 64), + chunks=(32, 32, 32), + shards={"shape": (64, 64, 64), "index_location": index_location}, + compressors=BloscCodec(cname="lz4"), + filters=[TransposeCodec(order=order_from_dim("F", data.ndim))], dtype=data.dtype, fill_value=0, - codecs=[ - ShardingCodec( - chunk_shape=(32, 32, 32), - codecs=[ - TransposeCodec(order=order_from_dim("F", data.ndim)), - BytesCodec(), - BloscCodec(cname="lz4"), - ], - index_location=index_location, - ) - ], ) a[10:, 10:, 10:] = data @@ -132,19 +119,15 @@ def test_sharding_partial_readwrite( ) -> None: data = array_fixture spath = StorePath(store) - a = Array.create( + a = zarr.create_array( spath, shape=data.shape, - chunk_shape=data.shape, + chunks=(1, data.shape[1], data.shape[2]), + shards={"shape": data.shape, "index_location": index_location}, dtype=data.dtype, fill_value=0, - codecs=[ - ShardingCodec( - chunk_shape=(1, data.shape[1], data.shape[2]), - codecs=[BytesCodec()], - index_location=index_location, - ) - ], + filters=None, + compressors=None, ) a[:] = data @@ -168,23 +151,15 @@ def test_sharding_partial_read( ) -> None: data = array_fixture spath = StorePath(store) - a = Array.create( + a = zarr.create_array( spath, shape=tuple(a + 10 for a in data.shape), - chunk_shape=(64, 64, 64), + chunks=(32, 32, 32), + shards={"shape": (64, 64, 64), "index_location": index_location}, + compressors=BloscCodec(cname="lz4"), + filters=[TransposeCodec(order=order_from_dim("F", data.ndim))], dtype=data.dtype, fill_value=1, - codecs=[ - ShardingCodec( - chunk_shape=(32, 32, 32), - codecs=[ - TransposeCodec(order=order_from_dim("F", data.ndim)), - BytesCodec(), - BloscCodec(cname="lz4"), - ], - index_location=index_location, - ) - ], ) read_data = a[0:10, 0:10, 0:10] @@ -205,23 +180,15 @@ def test_sharding_partial_overwrite( ) -> None: data = 
array_fixture[:10, :10, :10] spath = StorePath(store) - a = Array.create( + a = zarr.create_array( spath, shape=tuple(a + 10 for a in data.shape), - chunk_shape=(64, 64, 64), + chunks=(32, 32, 32), + shards={"shape": (64, 64, 64), "index_location": index_location}, + compressors=BloscCodec(cname="lz4"), + filters=[TransposeCodec(order=order_from_dim("F", data.ndim))], dtype=data.dtype, fill_value=1, - codecs=[ - ShardingCodec( - chunk_shape=(32, 32, 32), - codecs=[ - TransposeCodec(order=order_from_dim("F", data.ndim)), - BytesCodec(), - BloscCodec(cname="lz4"), - ], - index_location=index_location, - ) - ], ) a[:10, :10, :10] = data @@ -283,26 +250,66 @@ def test_nested_sharding( assert np.array_equal(data, read_data) +@pytest.mark.parametrize( + "array_fixture", + [ + ArrayRequest(shape=(128,) * 3, dtype="uint16", order="F"), + ], + indirect=["array_fixture"], +) +@pytest.mark.parametrize( + "outer_index_location", + ["start", "end"], +) +@pytest.mark.parametrize( + "inner_index_location", + ["start", "end"], +) +@pytest.mark.parametrize("store", ["local", "memory", "zip"], indirect=["store"]) +def test_nested_sharding_create_array( + store: Store, + array_fixture: npt.NDArray[Any], + outer_index_location: ShardingCodecIndexLocation, + inner_index_location: ShardingCodecIndexLocation, +) -> None: + data = array_fixture + spath = StorePath(store) + a = zarr.create_array( + spath, + shape=data.shape, + chunks=(32, 32, 32), + dtype=data.dtype, + fill_value=0, + serializer=ShardingCodec( + chunk_shape=(32, 32, 32), + codecs=[ShardingCodec(chunk_shape=(16, 16, 16), index_location=inner_index_location)], + index_location=outer_index_location, + ), + filters=None, + compressors=None, + ) + print(a.metadata.to_dict()) + + a[:, :, :] = data + + read_data = a[0 : data.shape[0], 0 : data.shape[1], 0 : data.shape[2]] + assert data.shape == read_data.shape + assert np.array_equal(data, read_data) + + @pytest.mark.parametrize("store", ["local", "memory", "zip"], 
indirect=["store"]) def test_open_sharding(store: Store) -> None: path = "open_sharding" spath = StorePath(store, path) - a = Array.create( + a = zarr.create_array( spath, shape=(16, 16), - chunk_shape=(16, 16), + chunks=(8, 8), + shards=(16, 16), + filters=[TransposeCodec(order=order_from_dim("F", 2))], + compressors=BloscCodec(), dtype="int32", fill_value=0, - codecs=[ - ShardingCodec( - chunk_shape=(8, 8), - codecs=[ - TransposeCodec(order=order_from_dim("F", 2)), - BytesCodec(), - BloscCodec(), - ], - ) - ], ) b = Array.open(spath) assert a.metadata == b.metadata @@ -312,21 +319,14 @@ def test_open_sharding(store: Store) -> None: def test_write_partial_sharded_chunks(store: Store) -> None: data = np.arange(0, 16 * 16, dtype="uint16").reshape((16, 16)) spath = StorePath(store) - a = Array.create( + a = zarr.create_array( spath, shape=(40, 40), - chunk_shape=(20, 20), + chunks=(10, 10), + shards=(20, 20), dtype=data.dtype, + compressors=BloscCodec(), fill_value=1, - codecs=[ - ShardingCodec( - chunk_shape=(10, 10), - codecs=[ - BytesCodec(), - BloscCodec(), - ], - ) - ], ) a[0:16, 0:16] = data assert np.array_equal(a[0:16, 0:16], data) @@ -338,14 +338,16 @@ async def test_delete_empty_shards(store: Store) -> None: pytest.skip("store does not support deletes") path = "delete_empty_shards" spath = StorePath(store, path) - a = await AsyncArray.create( + a = await zarr.api.asynchronous.create_array( spath, shape=(16, 16), - chunk_shape=(8, 16), + chunks=(8, 8), + shards=(8, 16), dtype="uint16", + compressors=None, fill_value=1, - codecs=[ShardingCodec(chunk_shape=(8, 8))], ) + print(a.metadata.to_dict()) await _AsyncArrayProxy(a)[:, :].set(np.zeros((16, 16))) await _AsyncArrayProxy(a)[8:, :].set(np.ones((8, 16))) await _AsyncArrayProxy(a)[:, 8:].set(np.ones((16, 8))) @@ -380,13 +382,13 @@ async def test_sharding_with_empty_inner_chunk( path = f"sharding_with_empty_inner_chunk_{index_location}" spath = StorePath(store, path) - a = await AsyncArray.create( + a = await 
zarr.api.asynchronous.create_array( spath, shape=(16, 16), - chunk_shape=(8, 8), + chunks=(4, 4), + shards={"shape": (8, 8), "index_location": index_location}, dtype="uint32", fill_value=fill_value, - codecs=[ShardingCodec(chunk_shape=(4, 4), index_location=index_location)], ) data[:4, :4] = fill_value await a.setitem(..., data) @@ -405,20 +407,44 @@ async def test_sharding_with_chunks_per_shard( store: Store, index_location: ShardingCodecIndexLocation, chunks_per_shard: tuple[int] ) -> None: chunk_shape = (2, 1) - shape = [x * y for x, y in zip(chunks_per_shard, chunk_shape, strict=False)] + shape = tuple(x * y for x, y in zip(chunks_per_shard, chunk_shape, strict=False)) data = np.ones(np.prod(shape), dtype="int32").reshape(shape) fill_value = 42 path = f"test_sharding_with_chunks_per_shard_{index_location}" spath = StorePath(store, path) - a = Array.create( + a = zarr.create_array( spath, shape=shape, - chunk_shape=shape, + chunks=chunk_shape, + shards={"shape": shape, "index_location": index_location}, dtype="int32", fill_value=fill_value, - codecs=[ShardingCodec(chunk_shape=chunk_shape, index_location=index_location)], ) a[...] = data data_read = a[...] 
 assert np.array_equal(data_read, data) + + +@pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"]) +def test_invalid_metadata(store: Store) -> None: + spath1 = StorePath(store, "invalid_inner_chunk_shape") + with pytest.raises(ValueError): + zarr.create_array( + spath1, + shape=(16, 16), + shards=(16, 16), + chunks=(8,), + dtype=np.dtype("uint8"), + fill_value=0, + ) + spath2 = StorePath(store, "invalid_inner_chunk_shape2") + with pytest.raises(ValueError): + zarr.create_array( + spath2, + shape=(16, 16), + shards=(16, 16), + chunks=(8, 7), + dtype=np.dtype("uint8"), + fill_value=0, + ) diff --git a/tests/test_codecs/test_transpose.py b/tests/test_codecs/test_transpose.py index 2b3914150e..65159f174b 100644 --- a/tests/test_codecs/test_transpose.py +++ b/tests/test_codecs/test_transpose.py @@ -1,19 +1,15 @@ -from typing import TYPE_CHECKING - import numpy as np import pytest -from zarr import Array, AsyncArray, config +import zarr +from zarr import AsyncArray, config from zarr.abc.store import Store -from zarr.codecs import BytesCodec, ShardingCodec, TransposeCodec +from zarr.codecs import TransposeCodec from zarr.core.common import MemoryOrder from zarr.storage.common import StorePath from .test_codecs import _AsyncArrayProxy -if TYPE_CHECKING: - from zarr.abc.codec import Codec - @pytest.mark.parametrize("input_order", ["F", "C"]) @pytest.mark.parametrize("runtime_write_order", ["F", "C"]) @@ -29,25 +25,16 @@ async def test_transpose( ) -> None: data = np.arange(0, 256, dtype="uint16").reshape((1, 32, 8), order=input_order) spath = StorePath(store, path="transpose") - codecs_: list[Codec] = ( - [ - ShardingCodec( - chunk_shape=(1, 16, 8), - codecs=[TransposeCodec(order=(2, 1, 0)), BytesCodec()], - ) - ] - if with_sharding - else [TransposeCodec(order=(2, 1, 0)), BytesCodec()] - ) with config.set({"array.order": runtime_write_order}): - a = await AsyncArray.create( + a = await zarr.api.asynchronous.create_array( spath, shape=data.shape, - 
chunk_shape=(1, 32, 8), + chunks=(1, 16, 8) if with_sharding else (1, 32, 8), + shards=(1, 32, 8) if with_sharding else None, dtype=data.dtype, fill_value=0, - chunk_key_encoding=("v2", "."), - codecs=codecs_, + chunk_key_encoding={"name": "v2", "separator": "."}, + filters=[TransposeCodec(order=(2, 1, 0))], ) await _AsyncArrayProxy(a)[:, :].set(data) @@ -75,13 +62,13 @@ def test_transpose_non_self_inverse(store: Store, order: list[int]) -> None: shape = [i + 3 for i in range(len(order))] data = np.arange(0, np.prod(shape), dtype="uint16").reshape(shape) spath = StorePath(store, "transpose_non_self_inverse") - a = Array.create( + a = zarr.create_array( spath, shape=data.shape, - chunk_shape=data.shape, + chunks=data.shape, dtype=data.dtype, fill_value=0, - codecs=[TransposeCodec(order=order), BytesCodec()], + filters=[TransposeCodec(order=order)], ) a[:, :] = data read_data = a[:, :] @@ -94,14 +81,14 @@ def test_transpose_invalid( ) -> None: data = np.arange(0, 256, dtype="uint16").reshape((1, 32, 8)) spath = StorePath(store, "transpose_invalid") - for order in [(1, 0), (3, 2, 1), (3, 3, 1)]: - with pytest.raises(ValueError): - Array.create( + for order in [(1, 0), (3, 2, 1), (3, 3, 1), "F", "C"]: + with pytest.raises((ValueError, TypeError)): + zarr.create_array( spath, shape=data.shape, - chunk_shape=(1, 32, 8), + chunks=(1, 32, 8), dtype=data.dtype, fill_value=0, - chunk_key_encoding=("v2", "."), - codecs=[TransposeCodec(order=order), BytesCodec()], + chunk_key_encoding={"name": "v2", "separator": "."}, + filters=[TransposeCodec(order=order)], ) diff --git a/tests/test_codecs/test_vlen.py b/tests/test_codecs/test_vlen.py index 05b2e25267..f4ee135601 100644 --- a/tests/test_codecs/test_vlen.py +++ b/tests/test_codecs/test_vlen.py @@ -3,10 +3,11 @@ import numpy as np import pytest +import zarr from zarr import Array from zarr.abc.codec import Codec from zarr.abc.store import Store -from zarr.codecs import VLenBytesCodec, VLenUTF8Codec, ZstdCodec +from zarr.codecs 
import ZstdCodec from zarr.core.metadata.v3 import ArrayV3Metadata, DataType from zarr.core.strings import _NUMPY_SUPPORTS_VLEN_STRING from zarr.storage.common import StorePath @@ -23,21 +24,21 @@ @pytest.mark.parametrize("store", ["memory", "local"], indirect=["store"]) @pytest.mark.parametrize("dtype", numpy_str_dtypes) @pytest.mark.parametrize("as_object_array", [False, True]) -@pytest.mark.parametrize("codecs", [None, [VLenUTF8Codec()], [VLenUTF8Codec(), ZstdCodec()]]) +@pytest.mark.parametrize("compressor", [None, ZstdCodec()]) def test_vlen_string( - store: Store, dtype: np.dtype[Any] | None, as_object_array: bool, codecs: list[Codec] | None + store: Store, dtype: np.dtype[Any] | None, as_object_array: bool, compressor: Codec | None ) -> None: strings = ["hello", "world", "this", "is", "a", "test"] data = np.array(strings, dtype=dtype).reshape((2, 3)) sp = StorePath(store, path="string") - a = Array.create( + a = zarr.create_array( sp, shape=data.shape, - chunk_shape=data.shape, + chunks=data.shape, dtype=data.dtype, fill_value="", - codecs=codecs, + compressors=compressor, ) assert isinstance(a.metadata, ArrayV3Metadata) # needed for mypy @@ -61,20 +62,20 @@ def test_vlen_string( @pytest.mark.parametrize("store", ["memory", "local"], indirect=["store"]) @pytest.mark.parametrize("as_object_array", [False, True]) -@pytest.mark.parametrize("codecs", [None, [VLenBytesCodec()], [VLenBytesCodec(), ZstdCodec()]]) -def test_vlen_bytes(store: Store, as_object_array: bool, codecs: list[Codec] | None) -> None: +@pytest.mark.parametrize("compressor", [None, ZstdCodec()]) +def test_vlen_bytes(store: Store, as_object_array: bool, compressor: Codec | None) -> None: bstrings = [b"hello", b"world", b"this", b"is", b"a", b"test"] data = np.array(bstrings).reshape((2, 3)) assert data.dtype == "|S5" sp = StorePath(store, path="string") - a = Array.create( + a = zarr.create_array( sp, shape=data.shape, - chunk_shape=data.shape, + chunks=data.shape, dtype=data.dtype, 
fill_value=b"", - codecs=codecs, + compressors=compressor, ) assert isinstance(a.metadata, ArrayV3Metadata) # needed for mypy diff --git a/tests/test_codecs/test_zstd.py b/tests/test_codecs/test_zstd.py index 29efc29466..a57476fb61 100644 --- a/tests/test_codecs/test_zstd.py +++ b/tests/test_codecs/test_zstd.py @@ -1,9 +1,9 @@ import numpy as np import pytest -from zarr import Array +import zarr from zarr.abc.store import Store -from zarr.codecs import BytesCodec, ZstdCodec +from zarr.codecs import ZstdCodec from zarr.storage.common import StorePath @@ -12,13 +12,13 @@ def test_zstd(store: Store, checksum: bool) -> None: data = np.arange(0, 256, dtype="uint16").reshape((16, 16)) - a = Array.create( + a = zarr.create_array( StorePath(store, path="zstd"), shape=data.shape, - chunk_shape=(16, 16), + chunks=(16, 16), dtype=data.dtype, fill_value=0, - codecs=[BytesCodec(), ZstdCodec(level=0, checksum=checksum)], + compressors=ZstdCodec(level=0, checksum=checksum), ) a[:, :] = data diff --git a/tests/test_config.py b/tests/test_config.py index ea8e70a994..20e3c6044f 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -8,7 +8,8 @@ import pytest import zarr -from zarr import Array, AsyncArray, zeros +import zarr.api +from zarr import zeros from zarr.abc.codec import Codec, CodecInput, CodecOutput, CodecPipeline from zarr.abc.store import ByteSetter, Store from zarr.codecs import ( @@ -49,19 +50,33 @@ def test_config_defaults_set() -> None: # regression test for available defaults assert config.defaults == [ { - "default_zarr_version": 3, + "default_zarr_format": 3, "array": { "order": "C", "write_empty_chunks": False, "v2_default_compressor": { - "numeric": "zstd", - "string": "vlen-utf8", - "bytes": "vlen-bytes", + "numeric": {"id": "zstd", "level": 0, "checksum": False}, + "string": {"id": "zstd", "level": 0, "checksum": False}, + "bytes": {"id": "zstd", "level": 0, "checksum": False}, + }, + "v2_default_filters": { + "numeric": None, + "string": [{"id": 
"vlen-utf8"}], + "bytes": [{"id": "vlen-bytes"}], }, "v3_default_codecs": { - "bytes": ["vlen-bytes"], - "numeric": ["bytes", "zstd"], - "string": ["vlen-utf8"], + "bytes": [ + {"name": "vlen-bytes"}, + {"name": "zstd", "configuration": {"level": 0, "checksum": False}}, + ], + "numeric": [ + {"name": "bytes", "configuration": {"endian": "little"}}, + {"name": "zstd", "configuration": {"level": 0, "checksum": False}}, + ], + "string": [ + {"name": "vlen-utf8"}, + {"name": "zstd", "configuration": {"level": 0, "checksum": False}}, + ], }, }, "async": {"concurrency": 10, "timeout": None}, @@ -139,7 +154,7 @@ async def write( assert get_pipeline_class() == MockCodecPipeline # test if codec is used - arr = Array.create( + arr = zarr.create_array( store=store, shape=(100,), chunks=(10,), @@ -184,13 +199,13 @@ async def _encode_single( assert get_codec_class("blosc") == MockBloscCodec # test if codec is used - arr = Array.create( + arr = zarr.create_array( store=store, shape=(100,), chunks=(10,), zarr_format=3, dtype="i4", - codecs=[BytesCodec(), {"name": "blosc", "configuration": {}}], + compressors=[{"name": "blosc", "configuration": {}}], ) arr[:] = range(100) _mock.call.assert_called() @@ -213,7 +228,7 @@ def test_config_ndbuffer_implementation(store: Store) -> None: register_ndbuffer(NDBufferUsingTestNDArrayLike) with config.set({"ndbuffer": fully_qualified_name(NDBufferUsingTestNDArrayLike)}): assert get_ndbuffer_class() == NDBufferUsingTestNDArrayLike - arr = Array.create( + arr = zarr.create_array( store=store, shape=(100,), chunks=(10,), @@ -291,23 +306,32 @@ class NewCodec2(BytesCodec): ("dtype", "expected_codecs"), [ ("int", [BytesCodec(), GzipCodec()]), - ("bytes", [VLenBytesCodec()]), - ("str", [VLenUTF8Codec()]), + ("bytes", [VLenBytesCodec(), GzipCodec()]), + ("str", [VLenUTF8Codec(), GzipCodec()]), ], ) async def test_default_codecs(dtype: str, expected_codecs: list[Codec]) -> None: with config.set( { - "array.v3_default_codecs": { - "numeric": ["bytes", 
"gzip"], # test setting non-standard codecs - "string": ["vlen-utf8"], - "bytes": ["vlen-bytes"], + "array.v3_default_codecs": { # test setting non-standard codecs + "numeric": [ + {"name": "bytes", "configuration": {"endian": "little"}}, + {"name": "gzip", "configuration": {"level": 5}}, + ], + "string": [ + {"name": "vlen-utf8"}, + {"name": "gzip", "configuration": {"level": 5}}, + ], + "bytes": [ + {"name": "vlen-bytes"}, + {"name": "gzip", "configuration": {"level": 5}}, + ], } } ): - arr = await AsyncArray.create( + arr = await zarr.api.asynchronous.create_array( shape=(100,), - chunk_shape=(100,), + chunks=(100,), dtype=np.dtype(dtype), zarr_format=3, store=MemoryStore(), diff --git a/tests/test_group.py b/tests/test_group.py index e0bc304b9b..6b3c40412e 100644 --- a/tests/test_group.py +++ b/tests/test_group.py @@ -155,9 +155,8 @@ def test_group_members(store: Store, zarr_format: ZarrFormat, consolidated_metad subsubsubgroup = subsubgroup.create_group("subsubsubgroup") members_expected["subarray"] = group.create_array( - "subarray", shape=(100,), dtype="uint8", chunk_shape=(10,), overwrite=True + "subarray", shape=(100,), dtype="uint8", chunks=(10,), overwrite=True ) - # add an extra object to the domain of the group. # the list of children should ignore this object. 
sync( @@ -227,9 +226,7 @@ def test_group(store: Store, zarr_format: ZarrFormat) -> None: # create an array from the "bar" group data = np.arange(0, 4 * 4, dtype="uint16").reshape((4, 4)) - arr = bar.create_array( - "baz", shape=data.shape, dtype=data.dtype, chunk_shape=(2, 2), overwrite=True - ) + arr = bar.create_array("baz", shape=data.shape, dtype=data.dtype, chunks=(2, 2), overwrite=True) arr[:] = data # check the array @@ -313,8 +310,8 @@ def test_group_getitem(store: Store, zarr_format: ZarrFormat, consolidated: bool group = Group.from_store(store, zarr_format=zarr_format) subgroup = group.create_group(name="subgroup") - subarray = group.create_array(name="subarray", shape=(10,), chunk_shape=(10,)) - subsubarray = subgroup.create_array(name="subarray", shape=(10,), chunk_shape=(10,)) + subarray = group.create_array(name="subarray", shape=(10,), chunks=(10,), dtype="uint8") + subsubarray = subgroup.create_array(name="subarray", shape=(10,), chunks=(10,), dtype="uint8") if consolidated: group = zarr.api.synchronous.consolidate_metadata(store=store, zarr_format=zarr_format) @@ -391,7 +388,7 @@ def test_group_delitem(store: Store, zarr_format: ZarrFormat, consolidated: bool group = Group.from_store(store, zarr_format=zarr_format) subgroup = group.create_group(name="subgroup") - subarray = group.create_array(name="subarray", shape=(10,), chunk_shape=(10,)) + subarray = group.create_array(name="subarray", shape=(10,), chunks=(10,), dtype="uint8") if consolidated: group = zarr.api.synchronous.consolidate_metadata(store=store, zarr_format=zarr_format) @@ -472,19 +469,21 @@ def test_group_child_iterators(store: Store, zarr_format: ZarrFormat, consolidat expected_group_values = [group.create_group(name=name) for name in expected_group_keys] expected_groups = list(zip(expected_group_keys, expected_group_values, strict=False)) + fill_value = 3 + dtype = "uint8" + expected_group_values[0].create_group("subgroup") - expected_group_values[0].create_array("subarray", 
shape=(1,)) + expected_group_values[0].create_array( + "subarray", shape=(1,), dtype=dtype, fill_value=fill_value + ) expected_array_keys = ["a0", "a1"] + expected_array_values = [ - group.create_array(name=name, shape=(1,)) for name in expected_array_keys + group.create_array(name=name, shape=(1,), dtype=dtype, fill_value=fill_value) + for name in expected_array_keys ] expected_arrays = list(zip(expected_array_keys, expected_array_values, strict=False)) - fill_value: float | None - if zarr_format == 2: - fill_value = None - else: - fill_value = np.float64(0.0) if consolidate: group = zarr.consolidate_metadata(store) @@ -492,12 +491,13 @@ def test_group_child_iterators(store: Store, zarr_format: ZarrFormat, consolidat metadata = { "subarray": { "attributes": {}, - "dtype": "float64", + "dtype": dtype, "fill_value": fill_value, "shape": (1,), "chunks": (1,), "order": "C", - "filters": (Zstd(level=0),), + "filters": None, + "compressor": Zstd(level=0), "zarr_format": zarr_format, }, "subgroup": { @@ -527,7 +527,7 @@ def test_group_child_iterators(store: Store, zarr_format: ZarrFormat, consolidat {"configuration": {"endian": "little"}, "name": "bytes"}, {"configuration": {}, "name": "zstd"}, ), - "data_type": "float64", + "data_type": dtype, "fill_value": fill_value, "node_type": "array", "shape": (1,), @@ -614,20 +614,24 @@ def test_group_create_array( data = np.arange(np.prod(shape)).reshape(shape).astype(dtype) if method == "create_array": - array = group.create_array(name="array", shape=shape, dtype=dtype, data=data) + array = group.create_array(name="array", shape=shape, dtype=dtype) + array[:] = data elif method == "array": with pytest.warns(DeprecationWarning): - array = group.array(name="array", shape=shape, dtype=dtype, data=data) + array = group.array(name="array", shape=shape, dtype=dtype) + array[:] = data else: raise AssertionError if not overwrite: if method == "create_array": with pytest.raises(ContainsArrayError): - group.create_array(name="array", 
shape=shape, dtype=dtype, data=data) + a = group.create_array(name="array", shape=shape, dtype=dtype) + a[:] = data elif method == "array": with pytest.raises(ContainsArrayError), pytest.warns(DeprecationWarning): - group.array(name="array", shape=shape, dtype=dtype, data=data) + a = group.array(name="array", shape=shape, dtype=dtype) + a[:] = data assert array.shape == shape assert array.dtype == np.dtype(dtype) assert np.array_equal(array[:], data) @@ -780,7 +784,7 @@ async def test_asyncgroup_create( ) # create an array at our target path collision_name = "foo" - _ = await AsyncArray.create( + _ = await zarr.api.asynchronous.create_array( spath / collision_name, shape=(10,), dtype="uint8", zarr_format=zarr_format ) with pytest.raises(ContainsArrayError): @@ -870,9 +874,7 @@ async def test_asyncgroup_getitem(store: Store, zarr_format: ZarrFormat) -> None agroup = await AsyncGroup.from_store(store=store, zarr_format=zarr_format) array_name = "sub_array" - sub_array = await agroup.create_array( - name=array_name, shape=(10,), dtype="uint8", chunk_shape=(2,) - ) + sub_array = await agroup.create_array(name=array_name, shape=(10,), dtype="uint8", chunks=(2,)) assert await agroup.getitem(array_name) == sub_array sub_group_path = "sub_group" @@ -894,7 +896,7 @@ async def test_asyncgroup_delitem(store: Store, zarr_format: ZarrFormat) -> None name=array_name, shape=(10,), dtype="uint8", - chunk_shape=(2,), + chunks=(2,), attributes={"foo": 100}, ) await agroup.delitem(array_name) @@ -960,7 +962,7 @@ async def test_asyncgroup_create_array( name=sub_node_path, shape=shape, dtype=dtype, - chunk_shape=chunk_shape, + chunks=chunk_shape, attributes=attributes, ) assert isinstance(subnode, AsyncArray) @@ -1014,11 +1016,11 @@ async def test_group_members_async(store: Store, consolidated_metadata: bool) -> group = await AsyncGroup.from_store( store=store, ) - a0 = await group.create_array("a0", shape=(1,)) + a0 = await group.create_array("a0", shape=(1,), dtype="uint8") g0 = 
await group.create_group("g0") - a1 = await g0.create_array("a1", shape=(1,)) + a1 = await g0.create_array("a1", shape=(1,), dtype="uint8") g1 = await g0.create_group("g1") - a2 = await g1.create_array("a2", shape=(1,)) + a2 = await g1.create_array("a2", shape=(1,), dtype="uint8") g2 = await g1.create_group("g2") # immediate children @@ -1101,7 +1103,7 @@ async def test_require_group(store: LocalStore | MemoryStore, zarr_format: ZarrF assert foo_group.attrs == {} _ = await foo_group.create_array( - "bar", shape=(10,), dtype="uint8", chunk_shape=(2,), attributes={"foo": 100} + "bar", shape=(10,), dtype="uint8", chunks=(2,), attributes={"foo": 100} ) # test that overwriting a group w/ children fails @@ -1179,9 +1181,9 @@ async def test_require_array(store: Store, zarr_format: ZarrFormat) -> None: async def test_members_name(store: Store, consolidate: bool, zarr_format: ZarrFormat): group = Group.from_store(store=store, zarr_format=zarr_format) a = group.create_group(name="a") - a.create_array("array", shape=(1,)) + a.create_array("array", shape=(1,), dtype="uint8") b = a.create_group(name="b") - b.create_array("array", shape=(1,)) + b.create_array("array", shape=(1,), dtype="uint8") if consolidate: group = zarr.api.synchronous.consolidate_metadata(store) @@ -1284,12 +1286,12 @@ async def test_group_delitem_consolidated(self, store: Store) -> None: g0 = await root.create_group("g0") g1 = await g0.create_group("g1") g2 = await g1.create_group("g2") - await g2.create_array("data", shape=(1,)) + await g2.create_array("data", shape=(1,), dtype="uint8") x0 = await root.create_group("x0") x1 = await x0.create_group("x1") x2 = await x1.create_group("x2") - await x2.create_array("data", shape=(1,)) + await x2.create_array("data", shape=(1,), dtype="uint8") await zarr.api.asynchronous.consolidate_metadata(store) @@ -1360,8 +1362,8 @@ def test_info(self): A = zarr.group(store=store, path="A") B = A.create_group(name="B") - B.create_array(name="x", shape=(1,)) - 
B.create_array(name="y", shape=(2,)) + B.create_array(name="x", shape=(1,), dtype="uint8") + B.create_array(name="y", shape=(2,), dtype="uint8") result = A.info expected = GroupInfo( @@ -1420,8 +1422,18 @@ def test_delitem_removes_children(store: Store, zarr_format: ZarrFormat) -> None g1 = zarr.group(store=store, zarr_format=zarr_format) g1.create_group("0") g1.create_group("0/0") - arr = g1.create_array("0/0/0", shape=(1,)) + arr = g1.create_array("0/0/0", shape=(1,), dtype="uint8") arr[:] = 1 del g1["0"] with pytest.raises(KeyError): g1["0/0"] + + +@pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"]) +def test_deprecated_compressor(store: Store) -> None: + g = zarr.group(store=store, zarr_format=2) + with pytest.warns(UserWarning, match="The `compressor` argument is deprecated.*"): + a = g.create_array( + "foo", shape=(100,), chunks=(10,), dtype="i4", compressor={"id": "blosc"} + ) + assert a.metadata.compressor.codec_id == "blosc" diff --git a/tests/test_indexing.py b/tests/test_indexing.py index 04eb53e364..fc83af695b 100644 --- a/tests/test_indexing.py +++ b/tests/test_indexing.py @@ -47,12 +47,12 @@ def zarr_array_from_numpy_array( a: npt.NDArray[Any], chunk_shape: ChunkCoords | None = None, ) -> zarr.Array: - z = zarr.Array.create( + z = zarr.create_array( store=store / str(uuid4()), shape=a.shape, dtype=a.dtype, - chunk_shape=chunk_shape or a.shape, - chunk_key_encoding=("v2", "."), + chunks=chunk_shape or a.shape, + chunk_key_encoding={"name": "v2", "separator": "."}, ) z[()] = a return z @@ -1933,7 +1933,7 @@ def test_indexing_with_zarr_array(store: StorePath) -> None: @pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"]) @pytest.mark.parametrize("shape", [(0, 2, 3), (0), (3, 0)]) def test_zero_sized_chunks(store: StorePath, shape: list[int]) -> None: - z = Array.create(store=store, shape=shape, chunk_shape=shape, zarr_format=3, dtype="f8") + z = zarr.create_array(store=store, shape=shape, chunks=shape, 
zarr_format=3, dtype="f8") z[...] = 42 assert_array_equal(z[...], np.zeros(shape, dtype="f8")) diff --git a/tests/test_metadata/test_consolidated.py b/tests/test_metadata/test_consolidated.py index 7f0c49338e..aaace6f5cd 100644 --- a/tests/test_metadata/test_consolidated.py +++ b/tests/test_metadata/test_consolidated.py @@ -31,16 +31,19 @@ @pytest.fixture async def memory_store_with_hierarchy(memory_store: Store) -> None: g = await group(store=memory_store, attributes={"foo": "bar"}) - await g.create_array(name="air", shape=(1, 2, 3)) - await g.create_array(name="lat", shape=(1,)) - await g.create_array(name="lon", shape=(2,)) - await g.create_array(name="time", shape=(3,)) + dtype = "uint8" + await g.create_array(name="air", shape=(1, 2, 3), dtype=dtype) + await g.create_array(name="lat", shape=(1,), dtype=dtype) + await g.create_array(name="lon", shape=(2,), dtype=dtype) + await g.create_array(name="time", shape=(3,), dtype=dtype) child = await g.create_group("child", attributes={"key": "child"}) - await child.create_array("array", shape=(4, 4), attributes={"key": "child"}) + await child.create_array("array", shape=(4, 4), attributes={"key": "child"}, dtype=dtype) grandchild = await child.create_group("grandchild", attributes={"key": "grandchild"}) - await grandchild.create_array("array", shape=(4, 4), attributes={"key": "grandchild"}) + await grandchild.create_array( + "array", shape=(4, 4), attributes={"key": "grandchild"}, dtype=dtype + ) await grandchild.create_group("empty_group", attributes={"key": "empty"}) return memory_store @@ -74,10 +77,10 @@ async def test_consolidated(self, memory_store_with_hierarchy: Store) -> None: }, "codecs": ( {"configuration": {"endian": "little"}, "name": "bytes"}, - {"configuration": {}, "name": "zstd"}, + {"configuration": {"level": 0, "checksum": False}, "name": "zstd"}, ), - "data_type": "float64", - "fill_value": np.float64(0.0), + "data_type": "uint8", + "fill_value": 0, "node_type": "array", # "shape": (1, 2, 3), 
"zarr_format": 3, @@ -205,10 +208,11 @@ async def test_consolidated(self, memory_store_with_hierarchy: Store) -> None: def test_consolidated_sync(self, memory_store): g = zarr.api.synchronous.group(store=memory_store, attributes={"foo": "bar"}) - g.create_array(name="air", shape=(1, 2, 3)) - g.create_array(name="lat", shape=(1,)) - g.create_array(name="lon", shape=(2,)) - g.create_array(name="time", shape=(3,)) + dtype = "uint8" + g.create_array(name="air", shape=(1, 2, 3), dtype=dtype) + g.create_array(name="lat", shape=(1,), dtype=dtype) + g.create_array(name="lon", shape=(2,), dtype=dtype) + g.create_array(name="time", shape=(3,), dtype=dtype) zarr.api.synchronous.consolidate_metadata(memory_store) group2 = zarr.api.synchronous.Group.open(memory_store) @@ -221,10 +225,10 @@ def test_consolidated_sync(self, memory_store): }, "codecs": ( {"configuration": {"endian": "little"}, "name": "bytes"}, - {"configuration": {}, "name": "zstd"}, + {"configuration": {"level": 0, "checksum": False}, "name": "zstd"}, ), - "data_type": "float64", - "fill_value": np.float64(0.0), + "data_type": dtype, + "fill_value": 0, "node_type": "array", # "shape": (1, 2, 3), "zarr_format": 3, @@ -475,7 +479,8 @@ async def test_open_consolidated_raises_async(self, zarr_format: ZarrFormat): async def test_consolidated_metadata_v2(self): store = zarr.storage.MemoryStore() g = await AsyncGroup.from_store(store, attributes={"key": "root"}, zarr_format=2) - await g.create_array(name="a", shape=(1,), attributes={"key": "a"}) + dtype = "uint8" + await g.create_array(name="a", shape=(1,), attributes={"key": "a"}, dtype=dtype) g1 = await g.create_group(name="g1", attributes={"key": "g1"}) await g1.create_group(name="g2", attributes={"key": "g2"}) @@ -489,11 +494,11 @@ async def test_consolidated_metadata_v2(self): metadata={ "a": ArrayV2Metadata( shape=(1,), - dtype="float64", + dtype=dtype, attributes={"key": "a"}, chunks=(1,), - fill_value=None, - filters=(Zstd(level=0),), + fill_value=0, + 
compressor=Zstd(level=0), order="C", ), "g1": GroupMetadata( @@ -518,7 +523,7 @@ async def test_consolidated_metadata_v2(self): async def test_use_consolidated_false( self, memory_store: zarr.storage.MemoryStore, zarr_format: ZarrFormat ) -> None: - with zarr.config.set(default_zarr_version=zarr_format): + with zarr.config.set(default_zarr_format=zarr_format): g = await group(store=memory_store, attributes={"foo": "bar"}) await g.create_group(name="a") diff --git a/tests/test_v2.py b/tests/test_v2.py index 80897db8e5..72127f4ede 100644 --- a/tests/test_v2.py +++ b/tests/test_v2.py @@ -11,7 +11,7 @@ import zarr import zarr.core.buffer import zarr.storage -from zarr import Array, config +from zarr import config from zarr.storage import MemoryStore, StorePath @@ -23,7 +23,7 @@ async def store() -> Iterator[StorePath]: def test_simple(store: StorePath) -> None: data = np.arange(0, 256, dtype="uint16").reshape((16, 16)) - a = Array.create( + a = zarr.create_array( store / "simple_v2", zarr_format=2, shape=data.shape, @@ -82,7 +82,12 @@ def test_codec_pipeline() -> None: @pytest.mark.parametrize("dtype", ["|S", "|V"]) async def test_v2_encode_decode(dtype): - with config.set({"array.v2_default_compressor.bytes": "vlen-bytes"}): + with config.set( + { + "array.v2_default_filters.bytes": [{"id": "vlen-bytes"}], + "array.v2_default_compressor.bytes": None, + } + ): store = zarr.storage.MemoryStore() g = zarr.group(store=store, zarr_format=2) g.create_array( @@ -120,9 +125,9 @@ def test_v2_encode_decode_with_data(dtype_value): dtype, value = dtype_value with config.set( { - "array.v2_default_compressor": { - "string": "vlen-utf8", - "bytes": "vlen-bytes", + "array.v2_default_filters": { + "string": [{"id": "vlen-utf8"}], + "bytes": [{"id": "vlen-bytes"}], }, } ): @@ -162,7 +167,7 @@ def test_v2_filters_codecs(filters: Any, order: Literal["C", "F"]) -> None: @pytest.mark.parametrize("array_order", ["C", "F"]) @pytest.mark.parametrize("data_order", ["C", "F"]) def 
test_v2_non_contiguous(array_order: Literal["C", "F"], data_order: Literal["C", "F"]) -> None: - arr = zarr.Array.create( + arr = zarr.create_array( MemoryStore({}), shape=(10, 8), chunks=(3, 3), @@ -182,7 +187,7 @@ def test_v2_non_contiguous(array_order: Literal["C", "F"], data_order: Literal[" arr[slice(6, 9, None), slice(3, 6, None)], a[slice(6, 9, None), slice(3, 6, None)] ) - arr = zarr.Array.create( + arr = zarr.create_array( MemoryStore({}), shape=(10, 8), chunks=(3, 3), @@ -210,18 +215,31 @@ def test_default_compressor_deprecation_warning(): @pytest.mark.parametrize( "dtype_expected", - [["b", "zstd"], ["i", "zstd"], ["f", "zstd"], ["|S1", "vlen-bytes"], ["|U1", "vlen-utf8"]], + [ + ["b", "zstd", None], + ["i", "zstd", None], + ["f", "zstd", None], + ["|S1", "zstd", "vlen-bytes"], + ["|U1", "zstd", "vlen-utf8"], + ], ) def test_default_filters_and_compressor(dtype_expected: Any) -> None: with config.set( { "array.v2_default_compressor": { - "numeric": "zstd", - "string": "vlen-utf8", - "bytes": "vlen-bytes", + "numeric": {"id": "zstd", "level": "0"}, + "string": {"id": "zstd", "level": "0"}, + "bytes": {"id": "zstd", "level": "0"}, + }, + "array.v2_default_filters": { + "numeric": [], + "string": [{"id": "vlen-utf8"}], + "bytes": [{"id": "vlen-bytes"}], }, } ): - dtype, expected = dtype_expected + dtype, expected_compressor, expected_filter = dtype_expected arr = zarr.create(shape=(3,), path="foo", store={}, zarr_format=2, dtype=dtype) - assert arr.metadata.filters[0].codec_id == expected + assert arr.metadata.compressor.codec_id == expected_compressor + if expected_filter is not None: + assert arr.metadata.filters[0].codec_id == expected_filter diff --git a/tests/test_zarr.py b/tests/test_zarr.py new file mode 100644 index 0000000000..2aa62e4231 --- /dev/null +++ b/tests/test_zarr.py @@ -0,0 +1,11 @@ +import zarr + + +def test_exports() -> None: + """ + Ensure that everything in __all__ can be imported. 
+ """ + from zarr import __all__ + + for export in __all__: + getattr(zarr, export)