diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f22dc39832..a8ee599137 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -27,7 +27,7 @@ repos: hooks: - id: check-yaml - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.3.0 + rev: v1.7.1 hooks: - id: mypy files: zarr @@ -35,3 +35,4 @@ repos: additional_dependencies: - types-redis - types-setuptools + - attrs diff --git a/zarr/tests/test_array_v3.py b/zarr/tests/test_array_v3.py new file mode 100644 index 0000000000..0678606080 --- /dev/null +++ b/zarr/tests/test_array_v3.py @@ -0,0 +1,121 @@ +import zarr.v3.array.v3 as v3 +import zarr.v3.array.v2 as v2 +import pytest +from typing import Any, Dict, Literal, Tuple, Union +import numpy as np + +from zarr.v3.types import Attributes, ChunkCoords +from zarr.v3.metadata.v3 import DefaultChunkKeyEncoding, RegularChunkGrid, RegularChunkGridConfig + +# todo: parametrize by chunks +@pytest.mark.asyncio +@pytest.mark.parametrize("zarr_version", ("2", "3")) +@pytest.mark.parametrize( + "shape", + ( + (10,), + ( + 10, + 11, + ), + ( + 10, + 11, + 12, + ), + ), +) +@pytest.mark.parametrize( + "dtype", (np.dtype("uint8"), "uint8", np.dtype("float32"), "float32", "int64") +) +@pytest.mark.parametrize("attributes", ({}, dict(a=10, b=10))) +@pytest.mark.parametrize("fill_value", (0, 1, 2)) +@pytest.mark.parametrize("dimension_separator", (".", "/")) +async def test_array( + tmpdir, + zarr_version: Literal["2", "3"], + shape: Tuple[int, ...], + dtype: Union[str, np.dtype], + attributes: Attributes, + fill_value: float, + dimension_separator: Literal[".", "/"], +): + store_path = str(tmpdir) + arr: Union[v2.AsyncArray, v3.AsyncArray] + if zarr_version == "2": + arr = await v2.AsyncArray.create( + store=store_path, + shape=shape, + dtype=dtype, + chunks=shape, + dimension_separator=dimension_separator, + fill_value=fill_value, + attributes=attributes, + exists_ok=True, + ) + else: + arr = await v3.AsyncArray.create( + store=store_path, + shape=shape, + dtype=dtype, + chunk_shape=shape, + fill_value=fill_value, + attributes=attributes, + exists_ok=True, + ) + fill_array = np.zeros(shape, dtype=dtype) + fill_value + assert np.array_equal(arr[:], fill_array) + + data = np.arange(np.prod(shape)).reshape(shape).astype(dtype) + + # note: if we try to create a prefix called "0/0/0" but an object named "0" already + # exists in the store, then we will get an unhandled exception + arr[:] = data + assert np.array_equal(arr[:], data) + + # partial write + arr[slice(0, 1)] = data[slice(0, 1)] + + +@pytest.mark.parametrize("zarr_format", (2, 3)) +def test_init_format(zarr_format: Literal[2, 3]): + dtype = "uint8" + shape = (10,) + if zarr_format == 2: + with pytest.raises(ValueError): + arr1 = v2.ArrayMetadata(shape=shape, dtype=dtype, chunks=shape, zarr_format=3) + else: + with pytest.raises(ValueError): + arr2 = v3.ArrayMetadata( + shape=shape, + data_type=dtype, + codecs=[], + chunk_grid=RegularChunkGrid( + configuration=RegularChunkGridConfig(chunk_shape=shape) + ), + fill_value=0, + chunk_key_encoding=DefaultChunkKeyEncoding(), + zarr_format=2, + ) + + +@pytest.mark.parametrize("zarr_format", (2, 3)) +def test_init_node_type(zarr_format: Literal[2, 3]): + dtype = "uint8" + shape = (10,) + if zarr_format == 2: + with pytest.raises(ValueError): + arr = v2.ArrayMetadata(shape=shape, dtype=dtype, chunks=shape, node_type="group") + else: + with pytest.raises(ValueError): + arr = v3.ArrayMetadata( + shape=shape, + data_type=dtype, + codecs=[], +
chunk_grid=RegularChunkGrid( + configuration=RegularChunkGridConfig(chunk_shape=shape) + ), + fill_value=0, + chunk_key_encoding=DefaultChunkKeyEncoding(), + node_type="group", + ) diff --git a/zarr/tests/test_codecs_v3.py b/zarr/tests/test_codecs_v3.py new file mode 100644 index 0000000000..ae3cc2b80a --- /dev/null +++ b/zarr/tests/test_codecs_v3.py @@ -0,0 +1,980 @@ +from __future__ import annotations + +import json +from typing import Iterator, List, Literal, Optional +from attr import frozen + +import numpy as np +import pytest +import zarr +from zarr.v3 import codecs +from zarr.v3.common import runtime_configuration +from zarr.v3.array.v3 import Array, AsyncArray +from zarr.v3.types import Selection +from zarr.v3.array.indexing import morton_order_iter +from zarr.v3.metadata.v3 import CodecMetadata + +from zarr.v3.store import MemoryStore, Store + + +@frozen +class _AsyncArrayProxy: + array: AsyncArray + + def __getitem__(self, selection: Selection) -> _AsyncArraySelectionProxy: + return _AsyncArraySelectionProxy(self.array, selection) + + +@frozen +class _AsyncArraySelectionProxy: + array: AsyncArray + selection: Selection + + async def get(self) -> np.ndarray: + return await self.array.getitem(self.selection) + + async def set(self, value: np.ndarray): + return await self.array.setitem(self.selection, value) + + +@pytest.fixture +def store() -> Iterator[Store]: + yield MemoryStore() + + +@pytest.fixture +def sample_data() -> np.ndarray: + return np.arange(0, 128 * 128 * 128, dtype="uint16").reshape((128, 128, 128), order="F") + + +@pytest.mark.parametrize("index_location", ["start", "end"]) +def test_sharding(store: Store, sample_data: np.ndarray, index_location: Literal["start", "end"]): + a = Array.create( + store / "sample", + shape=sample_data.shape, + chunk_shape=(64, 64, 64), + dtype=sample_data.dtype, + fill_value=0, + codecs=[ + codecs.sharding_codec( + (32, 32, 32), + [ + codecs.transpose_codec("F", sample_data.ndim), + codecs.bytes_codec(), + codecs.blosc_codec(typesize=sample_data.dtype.itemsize, cname="lz4"), + ], + index_location=index_location, + ) + ], + ) + + a[:, :, :] = sample_data + + read_data = a[0 : sample_data.shape[0], 0 : sample_data.shape[1], 0 : sample_data.shape[2]] + assert sample_data.shape == read_data.shape + assert np.array_equal(sample_data, read_data) + + +@pytest.mark.parametrize("index_location", ["start", "end"]) +def test_sharding_partial( + store: Store, sample_data: np.ndarray, index_location: Literal["start", "end"] +): + a = Array.create( + store / "sample", + shape=tuple(a + 10 for a in sample_data.shape), + chunk_shape=(64, 64, 64), + dtype=sample_data.dtype, + fill_value=0, + codecs=[ + codecs.sharding_codec( + (32, 32, 32), + [ + codecs.transpose_codec("F", sample_data.ndim), + codecs.bytes_codec(), + codecs.blosc_codec(typesize=sample_data.dtype.itemsize, cname="lz4"), + ], + index_location=index_location, + ) + ], + ) + + a[10:, 10:, 10:] = sample_data + + read_data = a[0:10, 0:10, 0:10] + assert np.all(read_data == 0) + + read_data = a[10:, 10:, 10:] + assert sample_data.shape == read_data.shape + assert np.array_equal(sample_data, read_data) + + +@pytest.mark.parametrize("index_location", ["start", "end"]) +def test_sharding_partial_read( + store: Store, sample_data: np.ndarray, index_location: Literal["start", "end"] +): + a = Array.create( + store / "sample", + shape=tuple(a + 10 for a in sample_data.shape), + chunk_shape=(64, 64, 64), + dtype=sample_data.dtype, + fill_value=1, + codecs=[ + codecs.sharding_codec( + (32, 32, 32), + 
[ + codecs.transpose_codec("F", sample_data.ndim), + codecs.bytes_codec(), + codecs.blosc_codec(typesize=sample_data.dtype.itemsize, cname="lz4"), + ], + index_location=index_location, + ) + ], + ) + + read_data = a[0:10, 0:10, 0:10] + assert np.all(read_data == 1) + + +@pytest.mark.parametrize("index_location", ["start", "end"]) +def test_sharding_partial_overwrite( + store: Store, sample_data: np.ndarray, index_location: Literal["start", "end"] +): + data = sample_data[:10, :10, :10] + + a = Array.create( + store / "sample", + shape=tuple(a + 10 for a in data.shape), + chunk_shape=(64, 64, 64), + dtype=data.dtype, + fill_value=1, + codecs=[ + codecs.sharding_codec( + (32, 32, 32), + [ + codecs.transpose_codec("F", data.ndim), + codecs.bytes_codec(), + codecs.blosc_codec(typesize=data.dtype.itemsize, cname="lz4"), + ], + index_location=index_location, + ) + ], + ) + + a[:10, :10, :10] = data + + read_data = a[0:10, 0:10, 0:10] + assert np.array_equal(data, read_data) + + data = data + 10 + a[:10, :10, :10] = data + read_data = a[0:10, 0:10, 0:10] + assert np.array_equal(data, read_data) + + +@pytest.mark.parametrize( + "outer_index_location", + ["start", "end"], +) +@pytest.mark.parametrize( + "inner_index_location", + ["start", "end"], +) +def test_nested_sharding( + store: Store, + sample_data: np.ndarray, + outer_index_location: Literal["start", "end"], + inner_index_location: Literal["start", "end"], +): + a = Array.create( + store / "l4_sample" / "color" / "1", + shape=sample_data.shape, + chunk_shape=(64, 64, 64), + dtype=sample_data.dtype, + fill_value=0, + codecs=[ + codecs.sharding_codec( + (32, 32, 32), + [codecs.sharding_codec((16, 16, 16), index_location=inner_index_location)], + index_location=outer_index_location, + ) + ], + ) + + a[:, :, :] = sample_data + + read_data = a[0 : sample_data.shape[0], 0 : sample_data.shape[1], 0 : sample_data.shape[2]] + assert sample_data.shape == read_data.shape + assert np.array_equal(sample_data, read_data) + + +@pytest.mark.parametrize("input_order", ["F", "C"]) +@pytest.mark.parametrize("store_order", ["F", "C"]) +@pytest.mark.parametrize("runtime_write_order", ["F", "C"]) +@pytest.mark.parametrize("runtime_read_order", ["F", "C"]) +@pytest.mark.parametrize("with_sharding", [True, False]) +@pytest.mark.asyncio +async def test_order( + store: Store, + input_order: Literal["F", "C"], + store_order: Literal["F", "C"], + runtime_write_order: Literal["F", "C"], + runtime_read_order: Literal["F", "C"], + with_sharding: bool, +): + data = np.arange(0, 256, dtype="uint16").reshape((32, 8), order=input_order) + + codecs_: List[CodecMetadata] = ( + [ + codecs.sharding_codec( + (16, 8), + codecs=[codecs.transpose_codec(store_order, data.ndim), codecs.bytes_codec()], + ) + ] + if with_sharding + else [codecs.transpose_codec(store_order, data.ndim), codecs.bytes_codec()] + ) + + a_create = await AsyncArray.create( + store / "order", + shape=data.shape, + chunk_shape=(32, 8), + dtype=data.dtype, + fill_value=0, + chunk_key_encoding=("v2", "."), + codecs=codecs_, + runtime_configuration=runtime_configuration(runtime_write_order), + ) + + await _AsyncArrayProxy(a_create)[:, :].set(data) + read_data = await _AsyncArrayProxy(a_create)[:, :].get() + assert np.array_equal(data, read_data) + + a_open = await AsyncArray.open( + store / "order", + runtime_configuration=runtime_configuration(order=runtime_read_order), + ) + read_data = await _AsyncArrayProxy(a_open)[:, :].get() + assert np.array_equal(data, read_data) + + if runtime_read_order == "F": + assert 
read_data.flags["F_CONTIGUOUS"] + assert not read_data.flags["C_CONTIGUOUS"] + else: + assert not read_data.flags["F_CONTIGUOUS"] + assert read_data.flags["C_CONTIGUOUS"] + + if not with_sharding: + # Compare with zarr-python + z = zarr.create( + shape=data.shape, + chunks=(32, 8), + dtype="<u2", [...] -) -> Union[Array, ArrayV2, Group, GroupV2]: +) -> Union[ArrayV2, ArrayV3, Group, GroupV2]: store_path = make_store_path(store) try: return await Group.open_or_array(store_path, runtime_configuration=runtime_configuration_) @@ -33,7 +32,7 @@ async def open_auto_async( def open_auto( store: StoreLike, runtime_configuration_: RuntimeConfiguration = RuntimeConfiguration(), -) -> Union[Array, ArrayV2, Group, GroupV2]: +) -> Union[ArrayV2, ArrayV3, Group, GroupV2]: return _sync( open_auto_async(store, runtime_configuration_), runtime_configuration_.asyncio_loop, ) diff --git a/zarr/v3/abc/array.py b/zarr/v3/abc/array.py deleted file mode 100644 index 976aa48618..0000000000 --- a/zarr/v3/abc/array.py +++ /dev/null @@ -1,140 +0,0 @@ -from __future__ import annotations -from abc import abstractproperty, abstractmethod, ABC -from typing import Tuple, Any, Dict - -import numpy as np - -from zarr.v3.abc.store import ReadStore, WriteStore -from zarr.v3.common import Selection - - -class BaseArray(ABC): - @abstractproperty - def store_path(self) -> str: # TODO: rename to `path`? - """Path to this array in the underlying store.""" - ... - - @abstractproperty - def dtype(self) -> np.dtype: - """Data type of the array elements. - - Returns - ------- - dtype - array data type - """ - ... - - @abstractproperty - def ndim(self) -> int: - """Number of array dimensions (axes). - - Returns - ------- - int - number of array dimensions (axes) - """ - ... - - @abstractproperty - def shape(self) -> Tuple[int, ...]: - """Array dimensions. - - Returns - ------- - tuple of int - array dimensions - """ - ... - - @abstractproperty - def size(self) -> int: - """Number of elements in the array. - - Returns - ------- - int - number of elements in an array. - """ - - @abstractproperty - def attrs(self) -> Dict[str, Any]: - """Array attributes. - - Returns - ------- - dict - user defined attributes - """ - ... - - @abstractproperty - def info(self) -> Any: - """Report some diagnostic information about the array. - - Returns - ------- - out - """ - ... - - -class AsynchronousArray(BaseArray): - """This class can be implemented as a v2 or v3 array""" - - @classmethod - @abstractmethod - async def from_json(cls, zarr_json: Any, store: ReadStore) -> AsynchronousArray: - ... - - @classmethod - @abstractmethod - async def open(cls, store: ReadStore) -> AsynchronousArray: - ... - - @classmethod - @abstractmethod - async def create(cls, store: WriteStore, *, shape, **kwargs) -> AsynchronousArray: - ... - - @abstractmethod - async def getitem(self, selection: Selection): - ... - - @abstractmethod - async def setitem(self, selection: Selection, value: np.ndarray) -> None: - ... - - -class SynchronousArray(BaseArray): - """ - This class can be implemented as a v2 or v3 array - """ - - @classmethod - @abstractmethod - def from_json(cls, zarr_json: Any, store: ReadStore) -> SynchronousArray: - ... - - @classmethod - @abstractmethod - def open(cls, store: ReadStore) -> SynchronousArray: - ... - - @classmethod - @abstractmethod - def create(cls, store: WriteStore, *, shape, **kwargs) -> SynchronousArray: - ... - - @abstractmethod - def __getitem__(self, selection: Selection): # TODO: type as np.ndarray | scalar - ...
- - @abstractmethod - def __setitem__(self, selection: Selection, value: np.ndarray) -> None: - ... - - # some day ;) - # @property - # def __array_api_version__(self) -> str: - # return "2022.12" diff --git a/zarr/v3/abc/codec.py b/zarr/v3/abc/codec.py index f84fc74af9..139af9163f 100644 --- a/zarr/v3/abc/codec.py +++ b/zarr/v3/abc/codec.py @@ -11,30 +11,40 @@ from __future__ import annotations from abc import abstractmethod, ABC -from typing import TYPE_CHECKING, Optional +from typing import TYPE_CHECKING, Optional, Type import numpy as np +from zarr.v3.common import ChunkMetadata, RuntimeConfiguration -from zarr.v3.common import BytesLike +from zarr.v3.types import SliceSelection +from zarr.v3.store import StorePath +from zarr.v3.types import BytesLike if TYPE_CHECKING: - from zarr.v3.metadata import CoreArrayMetadata + from zarr.v3.array.base import CodecMetadata class Codec(ABC): - supports_partial_decode: bool - supports_partial_encode: bool is_fixed_size: bool - array_metadata: CoreArrayMetadata + array_metadata: ChunkMetadata @abstractmethod def compute_encoded_size(self, input_byte_length: int) -> int: pass - def resolve_metadata(self) -> CoreArrayMetadata: + def resolve_metadata(self) -> ChunkMetadata: return self.array_metadata + @classmethod + def from_metadata( + cls, codec_metadata: "CodecMetadata", array_metadata: ChunkMetadata + ) -> "Type[Codec]": + pass + class ArrayArrayCodec(Codec): @abstractmethod @@ -62,9 +72,50 @@ async def decode( @abstractmethod async def encode( + self, chunk_array: np.ndarray, config: RuntimeConfiguration + ) -> Optional[BytesLike]: + pass + + +class ArrayBytesCodecPartialDecodeMixin: + @abstractmethod + async def decode_partial( + self, + store_path: StorePath, + selection: SliceSelection, + ) -> Optional[np.ndarray]: + pass + + +class ArrayBytesCodecPartialEncodeMixin: + @abstractmethod + async def encode_partial( self, + store_path: StorePath, chunk_array: np.ndarray, - ) -> Optional[BytesLike]: + selection: SliceSelection, + ) -> None: + pass diff --git a/zarr/v3/abc/group.py b/zarr/v3/abc/group.py index 02de819894..0103daed45 100644 --- a/zarr/v3/abc/group.py +++ b/zarr/v3/abc/group.py @@ -4,10 +4,12 @@ from collections.abc import MutableMapping from typing import Dict, Any +from zarr.v3.types import Attributes + class BaseGroup(ABC): @abstractproperty - def attrs(self) -> Dict[str, Any]: + def attrs(self) -> Attributes: """User-defined attributes.""" ...
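Reviewer note: to make the revised codec ABCs above concrete, here is a minimal sketch (not part of this diff) of an array-to-bytes codec that opts into partial decoding via the new mixin. The class name and body are hypothetical; `encode`, `compute_encoded_size`, and `decode_partial` follow the abstract signatures in this hunk, and `decode` is assumed to mirror `encode`.

from typing import Optional
import numpy as np
from zarr.v3.abc.codec import ArrayBytesCodec, ArrayBytesCodecPartialDecodeMixin
from zarr.v3.common import RuntimeConfiguration
from zarr.v3.store import StorePath
from zarr.v3.types import BytesLike, SliceSelection

class RawBytesCodec(ArrayBytesCodec, ArrayBytesCodecPartialDecodeMixin):
    """Hypothetical identity codec, for illustration only."""

    is_fixed_size = True

    def compute_encoded_size(self, input_byte_length: int) -> int:
        # identity encoding: encoded and decoded byte lengths are equal
        return input_byte_length

    async def decode(self, chunk_bytes: BytesLike, config: RuntimeConfiguration) -> np.ndarray:
        # reinterpret the raw bytes using the chunk's dtype and shape
        return np.frombuffer(chunk_bytes, dtype=self.array_metadata.dtype).reshape(
            self.array_metadata.chunk_shape
        )

    async def encode(
        self, chunk_array: np.ndarray, config: RuntimeConfiguration
    ) -> Optional[BytesLike]:
        return chunk_array.tobytes()

    async def decode_partial(
        self, store_path: StorePath, selection: SliceSelection
    ) -> Optional[np.ndarray]:
        # returning None signals the caller to fall back to a full-chunk read
        return None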
diff --git a/zarr/v3/array/__init__.py b/zarr/v3/array/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/zarr/v3/array/base.py b/zarr/v3/array/base.py new file mode 100644 index 0000000000..507b88f86c --- /dev/null +++ b/zarr/v3/array/base.py @@ -0,0 +1,349 @@ +from __future__ import annotations +from abc import ABC, abstractmethod, abstractproperty + +import json +from asyncio import AbstractEventLoop +from enum import Enum +from typing import ( + Any, + Dict, + Iterable, + List, + Literal, + Optional, + Protocol, + Tuple, + Union, + runtime_checkable, +) + +import numpy as np +from zarr.util import is_total_slice +from zarr.v3.abc.store import ReadStore, WriteStore +from zarr.v3.codecs.sharding import CodecPipeline, ShardingCodec +from zarr.v3.common import RuntimeConfiguration, concurrent_map +from zarr.v3.store import Store, StorePath + +from zarr.v3.types import Attributes, BytesLike, ChunkCoords, Selection, SliceSelection + + +class BaseArray(ABC): + @abstractproperty + def store_path(self) -> str: # TODO: rename to `path`? + """Path to this array in the underlying store.""" + ... + + @abstractproperty + def dtype(self) -> np.dtype: + """Data type of the array elements. + + Returns + ------- + dtype + array data type + """ + ... + + @abstractproperty + def ndim(self) -> int: + """Number of array dimensions (axes). + + Returns + ------- + int + number of array dimensions (axes) + """ + ... + + @abstractproperty + def shape(self) -> Tuple[int, ...]: + """Array dimensions. + + Returns + ------- + tuple of int + array dimensions + """ + ... + + @abstractproperty + def size(self) -> int: + """Number of elements in the array. + + Returns + ------- + int + number of elements in an array. + """ + + @abstractproperty + def attrs(self) -> Attributes: + """Array attributes. + + Returns + ------- + dict + user defined attributes + """ + ... + + @abstractproperty + def info(self) -> Any: + """Report some diagnostic information about the array. + + Returns + ------- + out + """ + ... + + +class AsynchronousArray(BaseArray): + """This class can be implemented as a v2 or v3 array""" + + @classmethod + @abstractmethod + async def from_json(cls, zarr_json: Any, store: ReadStore) -> AsynchronousArray: + ... + + @classmethod + @abstractmethod + async def open(cls, store: ReadStore) -> AsynchronousArray: + ... + + @classmethod + @abstractmethod + async def create(cls, store: WriteStore, *, shape, **kwargs) -> AsynchronousArray: + ... + + @abstractmethod + async def getitem(self, selection: Selection): + ... + + @abstractmethod + async def setitem(self, selection: Selection, value: np.ndarray) -> None: + ... + + +class SynchronousArray(BaseArray): + """ + This class can be implemented as a v2 or v3 array + """ + + @classmethod + @abstractmethod + def from_json(cls, zarr_json: Any, store: ReadStore) -> SynchronousArray: + ... + + @classmethod + @abstractmethod + def open(cls, store: ReadStore) -> SynchronousArray: + ... + + @classmethod + @abstractmethod + def create(cls, store: WriteStore, *, shape, **kwargs) -> SynchronousArray: + ... + + @abstractmethod + def __getitem__(self, selection: Selection): # TODO: type as np.ndarray | scalar + ... + + @abstractmethod + def __setitem__(self, selection: Selection, value: np.ndarray) -> None: + ... 
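Reviewer note: the ABCs above formalize the async-first design of this PR; the synchronous classes defined later (e.g. `Array` in `zarr/v3/array/v2.py` and `zarr/v3/array/v3.py`) are thin wrappers that drive an `AsynchronousArray` with `sync()`. A condensed usage sketch of the async interface, with an arbitrary store path and dtype:

import numpy as np
from zarr.v3.array.v3 import AsyncArray

async def roundtrip(store_path: str) -> np.ndarray:
    # create a small v3 array, write to it, and read it back
    arr = await AsyncArray.create(
        store_path, shape=(8,), dtype="uint8", chunk_shape=(4,), fill_value=0, exists_ok=True
    )
    await arr.setitem(slice(None), np.arange(8, dtype="uint8"))
    return await arr.getitem(slice(None))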
+ + +async def write_chunk_to_store( + store_path: StorePath, + chunk_array: np.ndarray, + fill_value: Any, + codec_pipeline: List[Any], + config: RuntimeConfiguration, +): + if np.all(chunk_array == fill_value): + # chunks that only contain fill_value will be removed + await store_path.delete_async() + else: + chunk_bytes = await codec_pipeline.encode(chunk_array, config) + if chunk_bytes is None: + await store_path.delete_async() + else: + await store_path.set_async(chunk_bytes) + + +@runtime_checkable +class ChunkKeyEncoder(Protocol): + def encode_key(self, coords: ChunkCoords, **kwargs) -> str: + ... + + def decode_key(self, key: str, **kwargs) -> ChunkCoords: + ... + + +async def write_chunk( + chunk_key_encoding: ChunkKeyEncoder, + store_path: StorePath, + codec_pipeline, + value: np.ndarray, + chunk_shape: ChunkCoords, + chunk_coords: ChunkCoords, + chunk_selection: SliceSelection, + out_selection: SliceSelection, + fill_value: Any, + config: RuntimeConfiguration, +): + chunk_key = chunk_key_encoding.encode_key(chunk_coords) + store_path = store_path / chunk_key + + if is_total_slice(chunk_selection, chunk_shape): + # write entire chunks + if np.isscalar(value): + chunk_array = np.empty( + chunk_shape, + dtype=value.dtype, + ) + chunk_array.fill(value) + else: + chunk_array = value[out_selection] + await write_chunk_to_store( + store_path=store_path, + chunk_array=chunk_array, + codec_pipeline=codec_pipeline, + fill_value=fill_value, + config=config, + ) + + elif len(codec_pipeline.codecs) == 1 and isinstance(codec_pipeline.codecs[0], ShardingCodec): + sharding_codec = codec_pipeline.codecs[0] + # print("encode_partial", chunk_coords, chunk_selection, repr(self)) + await sharding_codec.encode_partial( + store_path, + value[out_selection], + chunk_selection, + config=config, + ) + else: + # writing partial chunks + # read chunk first + chunk_bytes = await store_path.get_async() + + # merge new value + if chunk_bytes is None: + chunk_array = np.empty( + chunk_shape, + dtype=value.dtype, + ) + chunk_array.fill(fill_value) + else: + chunk_array = ( + await codec_pipeline.decode(chunk_bytes, config=config) + ).copy() # make a writable copy + chunk_array[chunk_selection] = value[out_selection] + + await write_chunk_to_store( + store_path=store_path, + chunk_array=chunk_array, + fill_value=fill_value, + codec_pipeline=codec_pipeline, + config=config, + ) + + +async def read_chunk( + chunk_key: str, + store_path, + codec_pipeline, + chunk_selection: SliceSelection, + out_selection: SliceSelection, + out: np.ndarray, + config: RuntimeConfiguration, +) -> None: + store_path = store_path / chunk_key + + if len(codec_pipeline.codecs) == 1 and isinstance(codec_pipeline.codecs[0], ShardingCodec): + chunk_array = await codec_pipeline.codecs[0].decode_partial( + store_path, chunk_selection, config=config + ) + if chunk_array is not None: + out[out_selection] = chunk_array + else: + chunk_bytes = await store_path.get_async() + if chunk_bytes is not None: + chunk_array = await codec_pipeline.decode(chunk_bytes, config=config) + tmp = chunk_array[chunk_selection] + out[out_selection] = tmp + + +async def read_chunks( + chunk_keys: Iterable[str], + store_path: Store, + codec_pipeline: CodecPipeline, + chunk_selections: Iterable[SliceSelection], + out_selections: Iterable[SliceSelection], + out: np.ndarray, + config: RuntimeConfiguration, +): + + await concurrent_map( + [ + ( + chunk_key, + store_path, + codec_pipeline, + chunk_selection, + out_selection, + out, + config, + ) + for chunk_key, 
chunk_selection, out_selection in zip( + chunk_keys, chunk_selections, out_selections + ) + ], + read_chunk, + config.concurrency, + ) + + +class ChunkKeyEncodingV3(ChunkKeyEncoder): + separator: Literal[".", "/"] + prefix = "c" + + def __init__(self, separator: Literal[".", "/"]): + if separator not in (".", "/"): + raise ValueError(f'Separator must be "." or "/", got {separator}') + self.separator = separator + + def decode_key(self, key: str) -> ChunkCoords: + if key == self.prefix: + return () + # drop the "c" prefix element; slicing off the prefix alone would leave a leading separator + return tuple(map(int, key.split(self.separator)[1:])) + + def encode_key(self, chunk_coords: ChunkCoords) -> str: + return self.separator.join(map(str, (self.prefix,) + chunk_coords)) + + +class ChunkKeyEncodingV2(ChunkKeyEncoder): + separator: Literal["/", "."] + + def __init__(self, separator: Literal[".", "/"]): + if separator not in (".", "/"): + raise ValueError(f'Separator must be "." or "/", got {separator}') + self.separator = separator + + def decode_key(self, key: str) -> ChunkCoords: + return tuple(map(int, key.split(self.separator))) + + def encode_key(self, coords: ChunkCoords) -> str: + chunk_identifier = self.separator.join(map(str, coords)) + return "0" if chunk_identifier == "" else chunk_identifier + + +class ChunkKeyEncoderABC(ABC): + @abstractmethod + def encode_key(self, coords: ChunkCoords) -> str: + pass + + @abstractmethod + def decode_key(self, key: str) -> ChunkCoords: + pass diff --git a/zarr/v3/indexing.py b/zarr/v3/array/indexing.py similarity index 98% rename from zarr/v3/indexing.py rename to zarr/v3/array/indexing.py index 15adad111d..0b70868227 100644 --- a/zarr/v3/indexing.py +++ b/zarr/v3/array/indexing.py @@ -4,7 +4,8 @@ import math from typing import Iterator, List, NamedTuple, Optional, Tuple -from zarr.v3.common import ChunkCoords, Selection, SliceSelection, product +from zarr.v3.common import product +from zarr.v3.types import ChunkCoords, Selection, SliceSelection def _ensure_tuple(v: Selection) -> SliceSelection: diff --git a/zarr/v3/array_v2.py b/zarr/v3/array/v2.py similarity index 61% rename from zarr/v3/array_v2.py rename to zarr/v3/array/v2.py index a2f26f01b0..deba63b211 100644 --- a/zarr/v3/array_v2.py +++ b/zarr/v3/array/v2.py @@ -2,43 +2,55 @@ import asyncio import json -from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Union +from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Union import numcodecs import numpy as np -from attr import evolve, frozen -from numcodecs.compat import ensure_bytes, ensure_ndarray +import attr +from zarr.v3.array.base import ( + AsynchronousArray, + ChunkKeyEncoder, + ChunkKeyEncodingV2, + SynchronousArray, + read_chunk, + write_chunk, +) +from zarr.v3.codecs import CodecPipeline, bytes_codec from zarr.v3.common import ( ZARRAY_JSON, ZATTRS_JSON, - BytesLike, - ChunkCoords, - Selection, - SliceSelection, + RuntimeConfiguration, concurrent_map, + make_cattr, to_thread, ) -from zarr.v3.indexing import BasicIndexer, all_chunk_coords, is_total_slice +from zarr.v3.array.indexing import BasicIndexer, all_chunk_coords, is_total_slice +from zarr.v3.common import ChunkMetadata +from zarr.v3.metadata.v3 import DefaultChunkKeyConfig, DefaultChunkKeyEncoding +import zarr.v3.metadata.v2 as metaV2 + from zarr.v3.store import StoreLike, StorePath, make_store_path from zarr.v3.sync import sync +from zarr.v3.types import Attributes, BytesLike, ChunkCoords, Selection,
SliceSelection if TYPE_CHECKING: - from zarr.v3.array import Array + import zarr.v3.array.v3 as v3 -@frozen +@attr.frozen class _AsyncArrayProxy: - array: ArrayV2 + array: AsyncArray def __getitem__(self, selection: Selection) -> _AsyncArraySelectionProxy: return _AsyncArraySelectionProxy(self.array, selection) -@frozen +@attr.frozen class _AsyncArraySelectionProxy: - array: ArrayV2 + array: AsyncArray selection: Selection async def get(self) -> np.ndarray: @@ -48,35 +60,82 @@ async def set(self, value: np.ndarray): return await self.array.set_async(self.selection, value) -@frozen -class ArrayV2: - metadata: ArrayV2Metadata - attributes: Optional[Dict[str, Any]] +class ArrayMetadata(metaV2.ArrayMetadata): + @property + def ndim(self) -> int: + return len(self.shape) + + def to_bytes(self): + return self.to_json() + + """ + def to_bytes(self) -> bytes: + def _json_convert(o): + if isinstance(o, np.dtype): + if o.fields is None: + return o.str + else: + return o.descr + raise TypeError + + return json.dumps(attr.asdict(self), default=_json_convert).encode() """ + + @classmethod + def from_json(cls, zarr_json: Any) -> ArrayMetadata: + return make_cattr().structure(zarr_json, cls) + + +@attr.frozen +class AsyncArray(AsynchronousArray): + metadata: ArrayMetadata + attributes: Attributes store_path: StorePath runtime_configuration: RuntimeConfiguration + codec_pipeline: CodecPipeline + chunk_key_encoding: ChunkKeyEncoder + + @property + def ndim(self) -> int: + return len(self.metadata.shape) + + @property + def shape(self) -> ChunkCoords: + return self.metadata.shape + + @property + def size(self) -> int: + return np.prod(self.metadata.shape) + + @property + def dtype(self) -> np.dtype: + return self.metadata.dtype + + @property + def attrs(self) -> Attributes: + return self.attributes @classmethod - async def create_async( + async def create( cls, store: StoreLike, *, - shape: ChunkCoords, + shape: Tuple[int, ...], dtype: np.dtype, - chunks: ChunkCoords, + chunks: Tuple[int, ...], dimension_separator: Literal[".", "/"] = ".", fill_value: Optional[Union[None, int, float]] = None, order: Literal["C", "F"] = "C", filters: Optional[List[Dict[str, Any]]] = None, compressor: Optional[Dict[str, Any]] = None, - attributes: Optional[Dict[str, Any]] = None, + attributes: Attributes = {}, exists_ok: bool = False, runtime_configuration: RuntimeConfiguration = RuntimeConfiguration(), - ) -> ArrayV2: + ) -> AsyncArray: store_path = make_store_path(store) if not exists_ok: assert not await (store_path / ZARRAY_JSON).exists_async() - metadata = ArrayV2Metadata( + metadata = ArrayMetadata( shape=shape, dtype=np.dtype(dtype), chunks=chunks, @@ -90,56 +149,47 @@ async def create_async( if filters is not None else None, ) + + chunk_key_encoding = ChunkKeyEncodingV2(separator=dimension_separator) + + if metadata.filters is None: + filters = [] + else: + filters = metadata.filters + + if metadata.compressor is None: + codecs = [bytes_codec()] + else: + codecs = [metadata.compressor] + filters + + codec_pipeline = CodecPipeline.from_metadata( + codecs, + ChunkMetadata( + array_shape=metadata.shape, + chunk_shape=metadata.chunks, + dtype=metadata.dtype, + fill_value=metadata.fill_value, + ), + ) + array = cls( metadata=metadata, store_path=store_path, - attributes=attributes, runtime_configuration=runtime_configuration, + codec_pipeline=codec_pipeline, + chunk_key_encoding=chunk_key_encoding, + attributes=attributes, ) + await array._save_metadata() return array @classmethod - def create( - cls, - store: StoreLike, - 
*, - shape: ChunkCoords, - dtype: np.dtype, - chunks: ChunkCoords, - dimension_separator: Literal[".", "/"] = ".", - fill_value: Optional[Union[None, int, float]] = None, - order: Literal["C", "F"] = "C", - filters: Optional[List[Dict[str, Any]]] = None, - compressor: Optional[Dict[str, Any]] = None, - attributes: Optional[Dict[str, Any]] = None, - exists_ok: bool = False, - runtime_configuration: RuntimeConfiguration = RuntimeConfiguration(), - ) -> ArrayV2: - return sync( - cls.create_async( - store, - shape=shape, - dtype=dtype, - chunks=chunks, - order=order, - dimension_separator=dimension_separator, - fill_value=0 if fill_value is None else fill_value, - compressor=compressor, - filters=filters, - attributes=attributes, - exists_ok=exists_ok, - runtime_configuration=runtime_configuration, - ), - runtime_configuration.asyncio_loop, - ) - - @classmethod - async def open_async( + async def open( cls, store: StoreLike, runtime_configuration: RuntimeConfiguration = RuntimeConfiguration(), - ) -> ArrayV2: + ) -> AsyncArray: store_path = make_store_path(store) zarray_bytes, zattrs_bytes = await asyncio.gather( (store_path / ZARRAY_JSON).get_async(), @@ -153,17 +203,6 @@ async def open_async( runtime_configuration=runtime_configuration, ) - @classmethod - def open( - cls, - store: StoreLike, - runtime_configuration: RuntimeConfiguration = RuntimeConfiguration(), - ) -> ArrayV2: - return sync( - cls.open_async(store, runtime_configuration), - runtime_configuration.asyncio_loop, - ) - @classmethod def from_json( cls, @@ -171,12 +210,18 @@ def from_json( zarray_json: Any, zattrs_json: Optional[Any], runtime_configuration: RuntimeConfiguration = RuntimeConfiguration(), - ) -> ArrayV2: - metadata = ArrayV2Metadata.from_json(zarray_json) + ) -> AsyncArray: + metadata = ArrayMetadata.from_json(zarray_json) + + chunk_key_encoding = ChunkKeyEncodingV2(separator=metadata.dimension_separator) + + # mirror create(): rebuild the codec pipeline from the v2 compressor and filters + if metadata.compressor is None: + codecs = [bytes_codec()] + else: + codecs = [metadata.compressor] + (metadata.filters or []) + codec_pipeline = CodecPipeline.from_metadata( + codecs, + ChunkMetadata( + array_shape=metadata.shape, + chunk_shape=metadata.chunks, + dtype=metadata.dtype, + fill_value=metadata.fill_value, + ), + ) + out = cls( store_path=store_path, metadata=metadata, attributes=zattrs_json, + chunk_key_encoding=chunk_key_encoding, + codec_pipeline=codec_pipeline, runtime_configuration=runtime_configuration, ) out._validate_metadata() @@ -198,26 +243,10 @@ def _validate_metadata(self) -> None: self.metadata.chunks ), "`chunks` and `shape` need to have the same number of dimensions."
- @property - def ndim(self) -> int: - return len(self.metadata.shape) - - @property - def shape(self) -> ChunkCoords: - return self.metadata.shape - - @property - def dtype(self) -> np.dtype: - return self.metadata.dtype - - @property - def async_(self) -> _AsyncArrayProxy: - return _AsyncArrayProxy(self) - def __getitem__(self, selection: Selection): - return sync(self.get_async(selection), self.runtime_configuration.asyncio_loop) + return sync(self.getitem(selection), self.runtime_configuration.asyncio_loop) - async def get_async(self, selection: Selection): + async def getitem(self, selection: Selection): indexer = BasicIndexer( selection, shape=self.metadata.shape, @@ -230,6 +259,10 @@ async def get_async(self, selection: Selection): dtype=self.metadata.dtype, order=self.metadata.order, ) + out.fill(self.metadata.fill_value) + + chunk_coords, chunk_selections, out_selections = zip(*indexer) + chunk_keys = map(self.chunk_key_encoding.encode_key, chunk_coords) # reading chunks and decoding them await concurrent_map( @@ -252,43 +285,16 @@ async def _read_chunk( out_selection: SliceSelection, out: np.ndarray, ): - store_path = self.store_path / self._encode_chunk_key(chunk_coords) - chunk_array = await self._decode_chunk(await store_path.get_async()) - if chunk_array is not None: - tmp = chunk_array[chunk_selection] - out[out_selection] = tmp - else: - out[out_selection] = self.metadata.fill_value - - async def _decode_chunk(self, chunk_bytes: Optional[BytesLike]) -> Optional[np.ndarray]: - if chunk_bytes is None: - return None - - if self.metadata.compressor is not None: - compressor = numcodecs.get_codec(self.metadata.compressor) - chunk_array = ensure_ndarray(await to_thread(compressor.decode, chunk_bytes)) - else: - chunk_array = ensure_ndarray(chunk_bytes) - - # ensure correct dtype - if str(chunk_array.dtype) != self.metadata.dtype: - chunk_array = chunk_array.view(self.metadata.dtype) - - # apply filters in reverse order - if self.metadata.filters is not None: - for filter_metadata in self.metadata.filters[::-1]: - filter = numcodecs.get_codec(filter_metadata) - chunk_array = await to_thread(filter.decode, chunk_array) - - # ensure correct chunk shape - if chunk_array.shape != self.metadata.chunks: - chunk_array = chunk_array.reshape( - self.metadata.chunks, - order=self.metadata.order, - ) - - return chunk_array + await read_chunk( + chunk_key=self.chunk_key_encoding.encode_key(chunk_coords), + store_path=self.store_path, + chunk_selection=chunk_selection, + codec_pipeline=self.codec_pipeline, + out_selection=out_selection, + out=out, + config=self.runtime_configuration, + ) def __setitem__(self, selection: Selection, value: np.ndarray) -> None: sync(self.set_async(selection, value), self.runtime_configuration.asyncio_loop) @@ -300,7 +306,6 @@ async def set_async(self, selection: Selection, value: np.ndarray) -> None: shape=self.metadata.shape, chunk_shape=chunk_shape, ) - sel_shape = indexer.shape # check value shape @@ -318,98 +323,25 @@ async def set_async(self, selection: Selection, value: np.ndarray) -> None: await concurrent_map( [ ( + self.chunk_key_encoding, + self.store_path, + self.codec_pipeline, value, chunk_shape, chunk_coords, chunk_selection, out_selection, + self.metadata.fill_value, + self.runtime_configuration, ) for chunk_coords, chunk_selection, out_selection in indexer ], - self._write_chunk, + write_chunk, ) - async def _write_chunk( - self, - value: np.ndarray, - chunk_shape: ChunkCoords, - chunk_coords: ChunkCoords, - chunk_selection: SliceSelection, - 
out_selection: SliceSelection, - ): - store_path = self.store_path / self._encode_chunk_key(chunk_coords) - - if is_total_slice(chunk_selection, chunk_shape): - # write entire chunks - if np.isscalar(value): - chunk_array = np.empty( - chunk_shape, - dtype=self.metadata.dtype, - order=self.metadata.order, - ) - chunk_array.fill(value) - else: - chunk_array = value[out_selection] - await self._write_chunk_to_store(store_path, chunk_array) - - else: - # writing partial chunks - # read chunk first - tmp = await self._decode_chunk(await store_path.get_async()) - - # merge new value - if tmp is None: - chunk_array = np.empty( - chunk_shape, - dtype=self.metadata.dtype, - order=self.metadata.order, - ) - chunk_array.fill(self.metadata.fill_value) - else: - chunk_array = tmp.copy( - order=self.metadata.order, - ) # make a writable copy - chunk_array[chunk_selection] = value[out_selection] - - await self._write_chunk_to_store(store_path, chunk_array) - - async def _write_chunk_to_store(self, store_path: StorePath, chunk_array: np.ndarray): - chunk_bytes: Optional[BytesLike] - if np.all(chunk_array == self.metadata.fill_value): - # chunks that only contain fill_value will be removed - await store_path.delete_async() - else: - chunk_bytes = await self._encode_chunk(chunk_array) - if chunk_bytes is None: - await store_path.delete_async() - else: - await store_path.set_async(chunk_bytes) - - async def _encode_chunk(self, chunk_array: np.ndarray) -> Optional[BytesLike]: - chunk_array = chunk_array.ravel(order=self.metadata.order) - - if self.metadata.filters is not None: - for filter_metadata in self.metadata.filters: - filter = numcodecs.get_codec(filter_metadata) - chunk_array = await to_thread(filter.encode, chunk_array) - - if self.metadata.compressor is not None: - compressor = numcodecs.get_codec(self.metadata.compressor) - if not chunk_array.flags.c_contiguous and not chunk_array.flags.f_contiguous: - chunk_array = chunk_array.copy(order="A") - encoded_chunk_bytes = ensure_bytes(await to_thread(compressor.encode, chunk_array)) - else: - encoded_chunk_bytes = ensure_bytes(chunk_array) - - return encoded_chunk_bytes - - def _encode_chunk_key(self, chunk_coords: ChunkCoords) -> str: - chunk_identifier = self.metadata.dimension_separator.join(map(str, chunk_coords)) - return "0" if chunk_identifier == "" else chunk_identifier - - async def resize_async(self, new_shape: ChunkCoords) -> ArrayV2: + async def resize_async(self, new_shape: ChunkCoords) -> AsyncArray: assert len(new_shape) == len(self.metadata.shape) - new_metadata = evolve(self.metadata, shape=new_shape) + new_metadata = attr.evolve(self.metadata, shape=new_shape) # Remove all chunks outside of the new shape chunk_shape = self.metadata.chunks @@ -429,18 +361,17 @@ async def _delete_key(key: str) -> None: # Write new metadata await (self.store_path / ZARRAY_JSON).set_async(new_metadata.to_bytes()) - return evolve(self, metadata=new_metadata) + return attr.evolve(self, metadata=new_metadata) - def resize(self, new_shape: ChunkCoords) -> ArrayV2: + def resize(self, new_shape: ChunkCoords) -> AsyncArray: return sync(self.resize_async(new_shape), self.runtime_configuration.asyncio_loop) - async def convert_to_v3_async(self) -> Array: + """ async def convert_to_v3_async(self) -> v3.Array: from sys import byteorder as sys_byteorder - - from zarr.v3.array import Array + import zarr.v3.array.v3 as v3 from zarr.v3.common import ZARR_JSON - from zarr.v3.metadata import ( - ArrayMetadata, + + from zarr.v3.array.base import ( 
BloscCodecConfigurationMetadata, BloscCodecMetadata, BytesCodecConfigurationMetadata, @@ -508,7 +439,7 @@ async def convert_to_v3_async(self) -> Array: ) ) - new_metadata = ArrayMetadata( + new_metadata = v3.ArrayMetadata( shape=self.metadata.shape, chunk_grid=RegularChunkGridMetadata( configuration=RegularChunkGridConfigurationMetadata( @@ -529,24 +460,171 @@ async def convert_to_v3_async(self) -> Array: new_metadata_bytes = new_metadata.to_bytes() await (self.store_path / ZARR_JSON).set_async(new_metadata_bytes) - return Array.from_json( + return AsyncArray.from_json( store_path=self.store_path, zarr_json=json.loads(new_metadata_bytes), runtime_configuration=self.runtime_configuration, ) + """ - async def update_attributes_async(self, new_attributes: Dict[str, Any]) -> ArrayV2: + async def update_attributes_async(self, new_attributes: Attributes) -> AsyncArray: await (self.store_path / ZATTRS_JSON).set_async(json.dumps(new_attributes).encode()) - return evolve(self, attributes=new_attributes) + return attr.evolve(self, attributes=new_attributes) - def update_attributes(self, new_attributes: Dict[str, Any]) -> ArrayV2: + def update_attributes(self, new_attributes: Attributes) -> AsyncArray: return sync( self.update_attributes_async(new_attributes), self.runtime_configuration.asyncio_loop, ) - def convert_to_v3(self) -> Array: - return sync(self.convert_to_v3_async(), loop=self.runtime_configuration.asyncio_loop) + """ def convert_to_v3(self) -> AsyncArray: + return sync(self.convert_to_v3_async(), loop=self.runtime_configuration.asyncio_loop) """ def __repr__(self): return f"" + + async def info(self): + raise NotImplementedError + + async def setitem(self, key, value): + raise NotImplementedError + + +@attr.frozen +class Array(SynchronousArray): + _async_array: AsyncArray + + @classmethod + def create( + cls, + store: StoreLike, + *, + shape: Tuple[int, ...], + dtype: np.dtype, + chunks: Tuple[int, ...], + dimension_separator: Literal[".", "/"] = ".", + fill_value: Optional[Union[None, int, float]] = None, + order: Literal["C", "F"] = "C", + filters: Optional[List[Dict[str, Any]]] = None, + compressor: Optional[Dict[str, Any]] = None, + attributes: Attributes = {}, + exists_ok: bool = False, + runtime_configuration: RuntimeConfiguration = RuntimeConfiguration(), + ) -> Array: + async_array = sync( + AsyncArray.create( + store=store, + shape=shape, + dtype=dtype, + chunks=chunks, + fill_value=fill_value, + dimension_separator=dimension_separator, + order=order, + filters=filters, + compressor=compressor, + attributes=attributes, + runtime_configuration=runtime_configuration, + exists_ok=exists_ok, + ), + runtime_configuration.asyncio_loop, + ) + return cls(async_array) + + @classmethod + def from_json( + cls, + store_path: StorePath, + zarr_json: Any, + runtime_configuration: RuntimeConfiguration, + ) -> Array: + async_array = AsyncArray.from_json( + store_path=store_path, zarr_json=zarr_json, runtime_configuration=runtime_configuration + ) + return cls(async_array) + + @classmethod + def open( + cls, + store: StoreLike, + runtime_configuration: RuntimeConfiguration = RuntimeConfiguration(), + ) -> Array: + + async_array = sync( + AsyncArray.open(store, runtime_configuration=runtime_configuration), + runtime_configuration.asyncio_loop, + ) + async_array._validate_metadata() + return cls(async_array) + + @classmethod + def open_auto( + cls, + store: StoreLike, + runtime_configuration: RuntimeConfiguration = RuntimeConfiguration(), + ) -> Array: # TODO: Union[Array, ArrayV2]: + async_array 
= sync( + AsyncArray.open_auto(store, runtime_configuration), + runtime_configuration.asyncio_loop, + ) + return cls(async_array) + + @property + def ndim(self) -> int: + return self._async_array.ndim + + @property + def shape(self) -> ChunkCoords: + return self._async_array.shape + + @property + def size(self) -> int: + return self._async_array.size + + @property + def dtype(self) -> np.dtype: + return self._async_array.dtype + + @property + def attrs(self) -> dict: + return self._async_array.attrs + + @property + def metadata(self) -> ArrayMetadata: + return self._async_array.metadata + + @property + def store_path(self) -> str: + return self._async_array.store_path + + def __getitem__(self, selection: Selection): + return sync( + self._async_array.getitem(selection), + self._async_array.runtime_configuration.asyncio_loop, + ) + + def __setitem__(self, selection: Selection, value: np.ndarray) -> None: + sync( + self._async_array.setitem(selection, value), + self._async_array.runtime_configuration.asyncio_loop, + ) + + def resize(self, new_shape: ChunkCoords) -> Array: + return sync( + self._async_array.resize(new_shape), + self._async_array.runtime_configuration.asyncio_loop, + ) + + def update_attributes(self, new_attributes: Attributes) -> Array: + return sync( + self._async_array.update_attributes(new_attributes), + self._async_array.runtime_configuration.asyncio_loop, + ) + + def __repr__(self): + return f"" + + def info(self): + return sync( + self._async_array.info(), + self._async_array.runtime_configuration.asyncio_loop, + ) diff --git a/zarr/v3/array.py b/zarr/v3/array/v3.py similarity index 65% rename from zarr/v3/array.py rename to zarr/v3/array/v3.py index 3c0d7eba5c..752df4939e 100644 --- a/zarr/v3/array.py +++ b/zarr/v3/array/v3.py @@ -10,40 +10,89 @@ # 2. Do we really need runtime_configuration? 
Specifically, the asyncio_loop seems problematic from __future__ import annotations +from enum import Enum import json -from typing import Any, Dict, Iterable, Literal, Optional, Tuple, Union +from typing import Any, Dict, Iterable, List, Literal, Optional, Tuple, Union import numpy as np -from attr import evolve, frozen +from attr import asdict, evolve, frozen, field + +from zarr.v3.array.base import ( + ChunkKeyEncodingV2, + ChunkKeyEncodingV3, + SynchronousArray, + read_chunks, + write_chunk, +) -from zarr.v3.abc.array import SynchronousArray, AsynchronousArray +from zarr.v3.array.base import ( + ChunkKeyEncoder, +) -# from zarr.v3.array_v2 import ArrayV2 from zarr.v3.codecs import CodecMetadata, CodecPipeline, bytes_codec from zarr.v3.common import ( ZARR_JSON, - ChunkCoords, - Selection, - SliceSelection, + ChunkMetadata, + RuntimeConfiguration, concurrent_map, + make_cattr, ) -from zarr.v3.indexing import BasicIndexer, all_chunk_coords, is_total_slice -from zarr.v3.metadata import ( - ArrayMetadata, - DataType, - DefaultChunkKeyEncodingConfigurationMetadata, - DefaultChunkKeyEncodingMetadata, - RegularChunkGridConfigurationMetadata, - RegularChunkGridMetadata, - RuntimeConfiguration, - V2ChunkKeyEncodingConfigurationMetadata, - V2ChunkKeyEncodingMetadata, - dtype_to_data_type, +from zarr.v3.array.indexing import BasicIndexer, all_chunk_coords, is_total_slice +from zarr.v3.array.base import ( + AsynchronousArray, +) + +from zarr.v3.metadata.v3 import ( + DefaultChunkKeyConfig, + DefaultChunkKeyEncoding, + RegularChunkGrid, + RegularChunkGridConfig, + V2ChunkKeyEncoding, ) -from zarr.v3.sharding import ShardingCodec from zarr.v3.store import StoreLike, StorePath, make_store_path from zarr.v3.sync import sync +from zarr.v3.types import Attributes, ChunkCoords, Selection, SliceSelection +import zarr.v3.metadata.v3 as metaV3 + + +class ArrayMetadata(metaV3.ArrayMetadata): + @property + def ndim(self) -> int: + return len(self.shape) + + def get_core_metadata(self) -> ChunkMetadata: + return ChunkMetadata( + array_shape=self.shape, + chunk_shape=self.chunk_grid.configuration.chunk_shape, + dtype=self.data_type, + fill_value=self.fill_value, + ) + + def to_bytes(self) -> bytes: + return self.to_json() + + """ + def to_bytes(self) -> bytes: + def _json_convert(o): + if isinstance(o, np.dtype): + return str(o) + if isinstance(o, Enum): + return o.name + raise TypeError + + return json.dumps( + asdict( + self, + filter=lambda attr, value: attr.name != "dimension_names" or value is not None, + ), + default=_json_convert, + ).encode() + """ + + @classmethod + def from_json(cls, zarr_json: Any) -> ArrayMetadata: + return make_cattr().structure(zarr_json, cls) @frozen @@ -52,15 +101,37 @@ class AsyncArray(AsynchronousArray): store_path: StorePath runtime_configuration: RuntimeConfiguration codec_pipeline: CodecPipeline + chunk_key_encoding: ChunkKeyEncoder + attributes: Attributes + + @property + def ndim(self) -> int: + return len(self.metadata.shape) + + @property + def shape(self) -> ChunkCoords: + return self.metadata.shape + + @property + def size(self) -> int: + return np.prod(self.metadata.shape) + + @property + def dtype(self) -> np.dtype: + return self.metadata.data_type + + @property + def attrs(self) -> Attributes: + return self.metadata.attributes @classmethod async def create( cls, store: StoreLike, *, - shape: ChunkCoords, + shape: Tuple[int, ...], dtype: Union[str, np.dtype], - chunk_shape: ChunkCoords, + chunk_shape: Tuple[int, ...], fill_value: Optional[Any] = None, 
chunk_key_encoding: Union[ Tuple[Literal["default"], Literal[".", "/"]], @@ -68,59 +139,66 @@ async def create( ] = ("default", "/"), codecs: Optional[Iterable[CodecMetadata]] = None, dimension_names: Optional[Iterable[str]] = None, - attributes: Optional[Dict[str, Any]] = None, - runtime_configuration: RuntimeConfiguration = RuntimeConfiguration(), + attributes: Attributes = {}, exists_ok: bool = False, + runtime_configuration: RuntimeConfiguration = RuntimeConfiguration(), ) -> AsyncArray: store_path = make_store_path(store) if not exists_ok: assert not await (store_path / ZARR_JSON).exists_async() - - data_type = ( - DataType[dtype] if isinstance(dtype, str) else DataType[dtype_to_data_type[dtype.str]] - ) - + if isinstance(dtype, str): + data_type = np.dtype(dtype) + else: + data_type = dtype codecs = list(codecs) if codecs is not None else [bytes_codec()] if fill_value is None: - if data_type == DataType.bool: + if data_type == "bool": fill_value = False else: fill_value = 0 + if chunk_key_encoding[0] == "default": + _chunk_key_encoding = DefaultChunkKeyEncoding( + configuration=DefaultChunkKeyConfig(separator=chunk_key_encoding[1]) + ) + else: + _chunk_key_encoding = V2ChunkKeyEncoding( + configuration=DefaultChunkKeyConfig(separator=chunk_key_encoding[1]) + ) metadata = ArrayMetadata( shape=shape, data_type=data_type, - chunk_grid=RegularChunkGridMetadata( - configuration=RegularChunkGridConfigurationMetadata(chunk_shape=chunk_shape) - ), - chunk_key_encoding=( - V2ChunkKeyEncodingMetadata( - configuration=V2ChunkKeyEncodingConfigurationMetadata( - separator=chunk_key_encoding[1] - ) - ) - if chunk_key_encoding[0] == "v2" - else DefaultChunkKeyEncodingMetadata( - configuration=DefaultChunkKeyEncodingConfigurationMetadata( - separator=chunk_key_encoding[1] - ) - ) + chunk_grid=RegularChunkGrid( + configuration=RegularChunkGridConfig(chunk_shape=chunk_shape) ), + chunk_key_encoding=_chunk_key_encoding, fill_value=fill_value, codecs=codecs, dimension_names=tuple(dimension_names) if dimension_names else None, - attributes=attributes or {}, ) runtime_configuration = runtime_configuration or RuntimeConfiguration() + # this logic should live in a `normalize_chunk_key_encoding` function + if chunk_key_encoding[0] == "default": + cke = ChunkKeyEncodingV3(separator=chunk_key_encoding[1]) + elif chunk_key_encoding[0] == "v2": + cke = ChunkKeyEncodingV2(separator=chunk_key_encoding[1]) + else: + raise ValueError( + f"Chunk key encoding {chunk_key_encoding[0]} is not recognized. 
" + 'Must be one one of ("default", "v2")' + ) + array = cls( metadata=metadata, store_path=store_path, runtime_configuration=runtime_configuration, codec_pipeline=CodecPipeline.from_metadata( - metadata.codecs, metadata.get_core_metadata(runtime_configuration) + metadata.codecs, metadata.get_core_metadata() ), + chunk_key_encoding=cke, + attributes=attributes or {}, ) await array._save_metadata() @@ -134,13 +212,24 @@ def from_json( runtime_configuration: RuntimeConfiguration, ) -> AsyncArray: metadata = ArrayMetadata.from_json(zarr_json) + + if metadata.chunk_key_encoding.name == "V2": + chunk_key_encoding = ChunkKeyEncodingV2( + separator=metadata.chunk_key_encoding.configuration.separator + ) + else: + chunk_key_encoding = ChunkKeyEncodingV3( + separator=metadata.chunk_key_encoding.configuration.separator + ) + async_array = cls( metadata=metadata, store_path=store_path, runtime_configuration=runtime_configuration, codec_pipeline=CodecPipeline.from_metadata( - metadata.codecs, metadata.get_core_metadata(runtime_configuration) + metadata.codecs, metadata.get_core_metadata() ), + chunk_key_encoding=chunk_key_encoding, ) async_array._validate_metadata() return async_array @@ -178,48 +267,32 @@ async def open_auto( raise ValueError("no v2 support yet") # return await ArrayV2.open_async(store_path) - @property - def ndim(self) -> int: - return len(self.metadata.shape) - - @property - def shape(self) -> ChunkCoords: - return self.metadata.shape - - @property - def size(self) -> int: - return np.prod(self.metadata.shape) - - @property - def dtype(self) -> np.dtype: - return self.metadata.dtype - - @property - def attrs(self) -> dict: - return self.metadata.attributes - async def getitem(self, selection: Selection): + indexer = BasicIndexer( selection, - shape=self.metadata.shape, + shape=self.shape, chunk_shape=self.metadata.chunk_grid.configuration.chunk_shape, ) # setup output array out = np.zeros( indexer.shape, - dtype=self.metadata.dtype, + dtype=self.metadata.data_type, order=self.runtime_configuration.order, ) - - # reading chunks and decoding them - await concurrent_map( - [ - (chunk_coords, chunk_selection, out_selection, out) - for chunk_coords, chunk_selection, out_selection in indexer - ], - self._read_chunk, - self.runtime_configuration.concurrency, + out.fill(self.metadata.fill_value) + chunk_coords, chunk_selections, out_selections = zip(*indexer) + chunk_keys = map(self.chunk_key_encoding.encode_key, chunk_coords) + + await read_chunks( + chunk_keys=chunk_keys, + store_path=self.store_path, + codec_pipeline=self.codec_pipeline, + chunk_selections=chunk_selections, + out_selections=out_selections, + out=out, + config=self.runtime_configuration, ) if out.shape: @@ -241,36 +314,6 @@ def _validate_metadata(self) -> None: ), "`dimension_names` and `shape` need to have the same number of dimensions." assert self.metadata.fill_value is not None, "`fill_value` is required." 
- async def _read_chunk( - self, - chunk_coords: ChunkCoords, - chunk_selection: SliceSelection, - out_selection: SliceSelection, - out: np.ndarray, - ): - chunk_key_encoding = self.metadata.chunk_key_encoding - chunk_key = chunk_key_encoding.encode_chunk_key(chunk_coords) - store_path = self.store_path / chunk_key - - if len(self.codec_pipeline.codecs) == 1 and isinstance( - self.codec_pipeline.codecs[0], ShardingCodec - ): - chunk_array = await self.codec_pipeline.codecs[0].decode_partial( - store_path, chunk_selection - ) - if chunk_array is not None: - out[out_selection] = chunk_array - else: - out[out_selection] = self.metadata.fill_value - else: - chunk_bytes = await store_path.get_async() - if chunk_bytes is not None: - chunk_array = await self.codec_pipeline.decode(chunk_bytes) - tmp = chunk_array[chunk_selection] - out[out_selection] = tmp - else: - out[out_selection] = self.metadata.fill_value - async def setitem(self, selection: Selection, value: np.ndarray) -> None: chunk_shape = self.metadata.chunk_grid.configuration.chunk_shape indexer = BasicIndexer( @@ -289,97 +332,37 @@ async def setitem(self, selection: Selection, value: np.ndarray) -> None: if not hasattr(value, "shape"): value = np.asarray(value, self.metadata.dtype) assert value.shape == sel_shape - if value.dtype.name != self.metadata.dtype.name: - value = value.astype(self.metadata.dtype, order="A") + if value.dtype.name != self.dtype.name: + value = value.astype(self.dtype, order="A") # merging with existing data and encoding chunks await concurrent_map( [ ( + self.chunk_key_encoding, + self.store_path, + self.codec_pipeline, value, chunk_shape, chunk_coords, chunk_selection, out_selection, + self.metadata.fill_value, + self.runtime_configuration, ) for chunk_coords, chunk_selection, out_selection in indexer ], - self._write_chunk, + write_chunk, self.runtime_configuration.concurrency, ) - async def _write_chunk( - self, - value: np.ndarray, - chunk_shape: ChunkCoords, - chunk_coords: ChunkCoords, - chunk_selection: SliceSelection, - out_selection: SliceSelection, - ): - chunk_key_encoding = self.metadata.chunk_key_encoding - chunk_key = chunk_key_encoding.encode_chunk_key(chunk_coords) - store_path = self.store_path / chunk_key - - if is_total_slice(chunk_selection, chunk_shape): - # write entire chunks - if np.isscalar(value): - chunk_array = np.empty( - chunk_shape, - dtype=self.metadata.dtype, - ) - chunk_array.fill(value) - else: - chunk_array = value[out_selection] - await self._write_chunk_to_store(store_path, chunk_array) - - elif len(self.codec_pipeline.codecs) == 1 and isinstance( - self.codec_pipeline.codecs[0], ShardingCodec - ): - sharding_codec = self.codec_pipeline.codecs[0] - # print("encode_partial", chunk_coords, chunk_selection, repr(self)) - await sharding_codec.encode_partial( - store_path, - value[out_selection], - chunk_selection, - ) - else: - # writing partial chunks - # read chunk first - chunk_bytes = await store_path.get_async() - - # merge new value - if chunk_bytes is None: - chunk_array = np.empty( - chunk_shape, - dtype=self.metadata.dtype, - ) - chunk_array.fill(self.metadata.fill_value) - else: - chunk_array = ( - await self.codec_pipeline.decode(chunk_bytes) - ).copy() # make a writable copy - chunk_array[chunk_selection] = value[out_selection] - - await self._write_chunk_to_store(store_path, chunk_array) - - async def _write_chunk_to_store(self, store_path: StorePath, chunk_array: np.ndarray): - if np.all(chunk_array == self.metadata.fill_value): - # chunks that only contain 
fill_value will be removed - await store_path.delete_async() - else: - chunk_bytes = await self.codec_pipeline.encode(chunk_array) - if chunk_bytes is None: - await store_path.delete_async() - else: - await store_path.set_async(chunk_bytes) - async def resize(self, new_shape: ChunkCoords) -> Array: assert len(new_shape) == len(self.metadata.shape) new_metadata = evolve(self.metadata, shape=new_shape) # Remove all chunks outside of the new shape chunk_shape = self.metadata.chunk_grid.configuration.chunk_shape - chunk_key_encoding = self.metadata.chunk_key_encoding + chunk_key_encoding = self.chunk_key_encoding old_chunk_coords = set(all_chunk_coords(self.metadata.shape, chunk_shape)) new_chunk_coords = set(all_chunk_coords(new_shape, chunk_shape)) @@ -388,7 +371,7 @@ async def _delete_key(key: str) -> None: await concurrent_map( [ - (chunk_key_encoding.encode_chunk_key(chunk_coords),) + (chunk_key_encoding.encode_key(chunk_coords),) for chunk_coords in old_chunk_coords.difference(new_chunk_coords) ], _delete_key, @@ -399,7 +382,7 @@ async def _delete_key(key: str) -> None: await (self.store_path / ZARR_JSON).set_async(new_metadata.to_bytes()) return evolve(self, metadata=new_metadata) - async def update_attributes(self, new_attributes: Dict[str, Any]) -> Array: + async def update_attributes(self, new_attributes: Attributes) -> Array: new_metadata = evolve(self.metadata, attributes=new_attributes) # Write new metadata @@ -432,7 +415,7 @@ def create( ] = ("default", "/"), codecs: Optional[Iterable[CodecMetadata]] = None, dimension_names: Optional[Iterable[str]] = None, - attributes: Optional[Dict[str, Any]] = None, + attributes: Optional[Attributes] = None, runtime_configuration: RuntimeConfiguration = RuntimeConfiguration(), exists_ok: bool = False, ) -> Array: @@ -512,6 +495,10 @@ def dtype(self) -> np.dtype: def attrs(self) -> dict: return self._async_array.attrs + @property + def metadata(self) -> ArrayMetadata: + return self._async_array.metadata + @property def store_path(self) -> str: return self._async_array.store_path @@ -534,7 +521,7 @@ def resize(self, new_shape: ChunkCoords) -> Array: self._async_array.runtime_configuration.asyncio_loop, ) - def update_attributes(self, new_attributes: Dict[str, Any]) -> Array: + def update_attributes(self, new_attributes: Attributes) -> Array: return sync( self._async_array.update_attributes(new_attributes), self._async_array.runtime_configuration.asyncio_loop, diff --git a/zarr/v3/codecs.py b/zarr/v3/codecs.py deleted file mode 100644 index ff913c42b2..0000000000 --- a/zarr/v3/codecs.py +++ /dev/null @@ -1,514 +0,0 @@ -from __future__ import annotations - -from functools import reduce -from typing import TYPE_CHECKING, Iterable, List, Literal, Optional, Tuple, Union -from warnings import warn - -import numcodecs -import numpy as np -from attr import asdict, evolve, frozen -from crc32c import crc32c -from numcodecs.blosc import Blosc -from numcodecs.gzip import GZip -from zstandard import ZstdCompressor, ZstdDecompressor - -from zarr.v3.abc.codec import Codec, ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec -from zarr.v3.common import BytesLike, to_thread -from zarr.v3.metadata import ( - BloscCodecConfigurationMetadata, - BloscCodecMetadata, - BytesCodecConfigurationMetadata, - BytesCodecMetadata, - CodecMetadata, - Crc32cCodecMetadata, - GzipCodecConfigurationMetadata, - GzipCodecMetadata, - ShardingCodecConfigurationMetadata, - ShardingCodecMetadata, - TransposeCodecConfigurationMetadata, - TransposeCodecMetadata, - 
ZstdCodecConfigurationMetadata, - ZstdCodecMetadata, -) - -if TYPE_CHECKING: - from zarr.v3.metadata import CoreArrayMetadata - -# See https://zarr.readthedocs.io/en/stable/tutorial.html#configuring-blosc -numcodecs.blosc.use_threads = False - - -@frozen -class CodecPipeline: - codecs: List[Codec] - - @classmethod - def from_metadata( - cls, - codecs_metadata: Iterable[CodecMetadata], - array_metadata: CoreArrayMetadata, - ) -> CodecPipeline: - out: List[Codec] = [] - for codec_metadata in codecs_metadata or []: - if codec_metadata.name == "endian": - codec_metadata = evolve(codec_metadata, name="bytes") # type: ignore - - codec: Codec - if codec_metadata.name == "blosc": - codec = BloscCodec.from_metadata(codec_metadata, array_metadata) - elif codec_metadata.name == "gzip": - codec = GzipCodec.from_metadata(codec_metadata, array_metadata) - elif codec_metadata.name == "zstd": - codec = ZstdCodec.from_metadata(codec_metadata, array_metadata) - elif codec_metadata.name == "transpose": - codec = TransposeCodec.from_metadata(codec_metadata, array_metadata) - elif codec_metadata.name == "bytes": - codec = BytesCodec.from_metadata(codec_metadata, array_metadata) - elif codec_metadata.name == "crc32c": - codec = Crc32cCodec.from_metadata(codec_metadata, array_metadata) - elif codec_metadata.name == "sharding_indexed": - from zarr.v3.sharding import ShardingCodec - - codec = ShardingCodec.from_metadata(codec_metadata, array_metadata) - else: - raise RuntimeError(f"Unsupported codec: {codec_metadata}") - - out.append(codec) - array_metadata = codec.resolve_metadata() - CodecPipeline._validate_codecs(out, array_metadata) - return cls(out) - - @staticmethod - def _validate_codecs(codecs: List[Codec], array_metadata: CoreArrayMetadata) -> None: - from zarr.v3.sharding import ShardingCodec - - assert any( - isinstance(codec, ArrayBytesCodec) for codec in codecs - ), "Exactly one array-to-bytes codec is required." - - prev_codec: Optional[Codec] = None - for codec in codecs: - if prev_codec is not None: - assert not isinstance(codec, ArrayBytesCodec) or not isinstance( - prev_codec, ArrayBytesCodec - ), ( - f"ArrayBytesCodec '{type(codec)}' cannot follow after " - + f"ArrayBytesCodec '{type(prev_codec)}' because exactly " - + "1 ArrayBytesCodec is allowed." - ) - assert not isinstance(codec, ArrayBytesCodec) or not isinstance( - prev_codec, BytesBytesCodec - ), ( - f"ArrayBytesCodec '{type(codec)}' cannot follow after " - + f"BytesBytesCodec '{type(prev_codec)}'." - ) - assert not isinstance(codec, ArrayArrayCodec) or not isinstance( - prev_codec, ArrayBytesCodec - ), ( - f"ArrayArrayCodec '{type(codec)}' cannot follow after " - + f"ArrayBytesCodec '{type(prev_codec)}'." - ) - assert not isinstance(codec, ArrayArrayCodec) or not isinstance( - prev_codec, BytesBytesCodec - ), ( - f"ArrayArrayCodec '{type(codec)}' cannot follow after " - + f"BytesBytesCodec '{type(prev_codec)}'." - ) - - if isinstance(codec, ShardingCodec): - assert len(codec.configuration.chunk_shape) == len(array_metadata.shape), ( - "The shard's `chunk_shape` and array's `shape` need to have the " - + "same number of dimensions." - ) - assert all( - s % c == 0 - for s, c in zip( - array_metadata.chunk_shape, - codec.configuration.chunk_shape, - ) - ), ( - "The array's `chunk_shape` needs to be divisible by the " - + "shard's inner `chunk_shape`." 
- ) - prev_codec = codec - - if any(isinstance(codec, ShardingCodec) for codec in codecs) and len(codecs) > 1: - warn( - "Combining a `sharding_indexed` codec disables partial reads and " - + "writes, which may lead to inefficient performance." - ) - - def _array_array_codecs(self) -> List[ArrayArrayCodec]: - return [codec for codec in self.codecs if isinstance(codec, ArrayArrayCodec)] - - def _array_bytes_codec(self) -> ArrayBytesCodec: - return next(codec for codec in self.codecs if isinstance(codec, ArrayBytesCodec)) - - def _bytes_bytes_codecs(self) -> List[BytesBytesCodec]: - return [codec for codec in self.codecs if isinstance(codec, BytesBytesCodec)] - - async def decode(self, chunk_bytes: BytesLike) -> np.ndarray: - for bb_codec in self._bytes_bytes_codecs()[::-1]: - chunk_bytes = await bb_codec.decode(chunk_bytes) - - chunk_array = await self._array_bytes_codec().decode(chunk_bytes) - - for aa_codec in self._array_array_codecs()[::-1]: - chunk_array = await aa_codec.decode(chunk_array) - - return chunk_array - - async def encode(self, chunk_array: np.ndarray) -> Optional[BytesLike]: - for aa_codec in self._array_array_codecs(): - chunk_array_maybe = await aa_codec.encode(chunk_array) - if chunk_array_maybe is None: - return None - chunk_array = chunk_array_maybe - - chunk_bytes_maybe = await self._array_bytes_codec().encode(chunk_array) - if chunk_bytes_maybe is None: - return None - chunk_bytes = chunk_bytes_maybe - - for bb_codec in self._bytes_bytes_codecs(): - chunk_bytes_maybe = await bb_codec.encode(chunk_bytes) - if chunk_bytes_maybe is None: - return None - chunk_bytes = chunk_bytes_maybe - - return chunk_bytes - - def compute_encoded_size(self, byte_length: int) -> int: - return reduce(lambda acc, codec: codec.compute_encoded_size(acc), self.codecs, byte_length) - - -@frozen -class BloscCodec(BytesBytesCodec): - array_metadata: CoreArrayMetadata - configuration: BloscCodecConfigurationMetadata - blosc_codec: Blosc - is_fixed_size = False - - @classmethod - def from_metadata( - cls, codec_metadata: BloscCodecMetadata, array_metadata: CoreArrayMetadata - ) -> BloscCodec: - configuration = codec_metadata.configuration - if configuration.typesize == 0: - configuration = evolve(configuration, typesize=array_metadata.data_type.byte_count) - config_dict = asdict(codec_metadata.configuration) - config_dict.pop("typesize", None) - map_shuffle_str_to_int = {"noshuffle": 0, "shuffle": 1, "bitshuffle": 2} - config_dict["shuffle"] = map_shuffle_str_to_int[config_dict["shuffle"]] - return cls( - array_metadata=array_metadata, - configuration=configuration, - blosc_codec=Blosc.from_config(config_dict), - ) - - async def decode( - self, - chunk_bytes: bytes, - ) -> BytesLike: - return await to_thread(self.blosc_codec.decode, chunk_bytes) - - async def encode( - self, - chunk_bytes: bytes, - ) -> Optional[BytesLike]: - chunk_array = np.frombuffer(chunk_bytes, dtype=self.array_metadata.dtype) - return await to_thread(self.blosc_codec.encode, chunk_array) - - def compute_encoded_size(self, _input_byte_length: int) -> int: - raise NotImplementedError - - -@frozen -class BytesCodec(ArrayBytesCodec): - array_metadata: CoreArrayMetadata - configuration: BytesCodecConfigurationMetadata - is_fixed_size = True - - @classmethod - def from_metadata( - cls, codec_metadata: BytesCodecMetadata, array_metadata: CoreArrayMetadata - ) -> BytesCodec: - assert ( - array_metadata.dtype.itemsize == 1 or codec_metadata.configuration.endian is not None - ), "The `endian` configuration needs to be specified for 
multi-byte data types." - return cls( - array_metadata=array_metadata, - configuration=codec_metadata.configuration, - ) - - def _get_byteorder(self, array: np.ndarray) -> Literal["big", "little"]: - if array.dtype.byteorder == "<": - return "little" - elif array.dtype.byteorder == ">": - return "big" - else: - import sys - - return sys.byteorder - - async def decode( - self, - chunk_bytes: BytesLike, - ) -> np.ndarray: - if self.array_metadata.dtype.itemsize > 0: - if self.configuration.endian == "little": - prefix = "<" - else: - prefix = ">" - dtype = np.dtype(f"{prefix}{self.array_metadata.data_type.to_numpy_shortname()}") - else: - dtype = np.dtype(f"|{self.array_metadata.data_type.to_numpy_shortname()}") - chunk_array = np.frombuffer(chunk_bytes, dtype) - - # ensure correct chunk shape - if chunk_array.shape != self.array_metadata.chunk_shape: - chunk_array = chunk_array.reshape( - self.array_metadata.chunk_shape, - ) - return chunk_array - - async def encode( - self, - chunk_array: np.ndarray, - ) -> Optional[BytesLike]: - if chunk_array.dtype.itemsize > 1: - byteorder = self._get_byteorder(chunk_array) - if self.configuration.endian != byteorder: - new_dtype = chunk_array.dtype.newbyteorder(self.configuration.endian) - chunk_array = chunk_array.astype(new_dtype) - return chunk_array.tobytes() - - def compute_encoded_size(self, input_byte_length: int) -> int: - return input_byte_length - - -@frozen -class TransposeCodec(ArrayArrayCodec): - array_metadata: CoreArrayMetadata - order: Tuple[int, ...] - is_fixed_size = True - - @classmethod - def from_metadata( - cls, codec_metadata: TransposeCodecMetadata, array_metadata: CoreArrayMetadata - ) -> TransposeCodec: - configuration = codec_metadata.configuration - if configuration.order == "F": - order = tuple(array_metadata.ndim - x - 1 for x in range(array_metadata.ndim)) - - elif configuration.order == "C": - order = tuple(range(array_metadata.ndim)) - - else: - assert len(configuration.order) == array_metadata.ndim, ( - "The `order` tuple needs have as many entries as " - + f"there are dimensions in the array. Got: {configuration.order}" - ) - assert len(configuration.order) == len(set(configuration.order)), ( - "There must not be duplicates in the `order` tuple. " - + f"Got: {configuration.order}" - ) - assert all(0 <= x < array_metadata.ndim for x in configuration.order), ( - "All entries in the `order` tuple must be between 0 and " - + f"the number of dimensions in the array. 
Got: {configuration.order}" - ) - order = tuple(configuration.order) - - return cls( - array_metadata=array_metadata, - order=order, - ) - - def resolve_metadata(self) -> CoreArrayMetadata: - from zarr.v3.metadata import CoreArrayMetadata - - return CoreArrayMetadata( - shape=tuple( - self.array_metadata.shape[self.order[i]] for i in range(self.array_metadata.ndim) - ), - chunk_shape=tuple( - self.array_metadata.chunk_shape[self.order[i]] - for i in range(self.array_metadata.ndim) - ), - data_type=self.array_metadata.data_type, - fill_value=self.array_metadata.fill_value, - runtime_configuration=self.array_metadata.runtime_configuration, - ) - - async def decode( - self, - chunk_array: np.ndarray, - ) -> np.ndarray: - inverse_order = [0 for _ in range(self.array_metadata.ndim)] - for x, i in enumerate(self.order): - inverse_order[x] = i - chunk_array = chunk_array.transpose(inverse_order) - return chunk_array - - async def encode( - self, - chunk_array: np.ndarray, - ) -> Optional[np.ndarray]: - chunk_array = chunk_array.transpose(self.order) - return chunk_array - - def compute_encoded_size(self, input_byte_length: int) -> int: - return input_byte_length - - -@frozen -class GzipCodec(BytesBytesCodec): - array_metadata: CoreArrayMetadata - configuration: GzipCodecConfigurationMetadata - is_fixed_size = True - - @classmethod - def from_metadata( - cls, codec_metadata: GzipCodecMetadata, array_metadata: CoreArrayMetadata - ) -> GzipCodec: - return cls( - array_metadata=array_metadata, - configuration=codec_metadata.configuration, - ) - - async def decode( - self, - chunk_bytes: bytes, - ) -> BytesLike: - return await to_thread(GZip(self.configuration.level).decode, chunk_bytes) - - async def encode( - self, - chunk_bytes: bytes, - ) -> Optional[BytesLike]: - return await to_thread(GZip(self.configuration.level).encode, chunk_bytes) - - def compute_encoded_size(self, _input_byte_length: int) -> int: - raise NotImplementedError - - -@frozen -class ZstdCodec(BytesBytesCodec): - array_metadata: CoreArrayMetadata - configuration: ZstdCodecConfigurationMetadata - is_fixed_size = True - - @classmethod - def from_metadata( - cls, codec_metadata: ZstdCodecMetadata, array_metadata: CoreArrayMetadata - ) -> ZstdCodec: - return cls( - array_metadata=array_metadata, - configuration=codec_metadata.configuration, - ) - - def _compress(self, data: bytes) -> bytes: - ctx = ZstdCompressor( - level=self.configuration.level, write_checksum=self.configuration.checksum - ) - return ctx.compress(data) - - def _decompress(self, data: bytes) -> bytes: - ctx = ZstdDecompressor() - return ctx.decompress(data) - - async def decode( - self, - chunk_bytes: bytes, - ) -> BytesLike: - return await to_thread(self._decompress, chunk_bytes) - - async def encode( - self, - chunk_bytes: bytes, - ) -> Optional[BytesLike]: - return await to_thread(self._compress, chunk_bytes) - - def compute_encoded_size(self, _input_byte_length: int) -> int: - raise NotImplementedError - - -@frozen -class Crc32cCodec(BytesBytesCodec): - array_metadata: CoreArrayMetadata - is_fixed_size = True - - @classmethod - def from_metadata( - cls, codec_metadata: Crc32cCodecMetadata, array_metadata: CoreArrayMetadata - ) -> Crc32cCodec: - return cls(array_metadata=array_metadata) - - async def decode( - self, - chunk_bytes: bytes, - ) -> BytesLike: - crc32_bytes = chunk_bytes[-4:] - inner_bytes = chunk_bytes[:-4] - - assert np.uint32(crc32c(inner_bytes)).tobytes() == bytes(crc32_bytes) - return inner_bytes - - async def encode( - self, - chunk_bytes: 
bytes, - ) -> Optional[BytesLike]: - return chunk_bytes + np.uint32(crc32c(chunk_bytes)).tobytes() - - def compute_encoded_size(self, input_byte_length: int) -> int: - return input_byte_length + 4 - - -def blosc_codec( - typesize: int, - cname: Literal["lz4", "lz4hc", "blosclz", "zstd", "snappy", "zlib"] = "zstd", - clevel: int = 5, - shuffle: Literal["noshuffle", "shuffle", "bitshuffle"] = "noshuffle", - blocksize: int = 0, -) -> BloscCodecMetadata: - return BloscCodecMetadata( - configuration=BloscCodecConfigurationMetadata( - cname=cname, - clevel=clevel, - shuffle=shuffle, - blocksize=blocksize, - typesize=typesize, - ) - ) - - -def bytes_codec(endian: Optional[Literal["big", "little"]] = "little") -> BytesCodecMetadata: - return BytesCodecMetadata(configuration=BytesCodecConfigurationMetadata(endian)) - - -def transpose_codec(order: Union[Tuple[int, ...], Literal["C", "F"]]) -> TransposeCodecMetadata: - return TransposeCodecMetadata(configuration=TransposeCodecConfigurationMetadata(order)) - - -def gzip_codec(level: int = 5) -> GzipCodecMetadata: - return GzipCodecMetadata(configuration=GzipCodecConfigurationMetadata(level)) - - -def zstd_codec(level: int = 0, checksum: bool = False) -> ZstdCodecMetadata: - return ZstdCodecMetadata(configuration=ZstdCodecConfigurationMetadata(level, checksum)) - - -def crc32c_codec() -> Crc32cCodecMetadata: - return Crc32cCodecMetadata() - - -def sharding_codec( - chunk_shape: Tuple[int, ...], - codecs: Optional[List[CodecMetadata]] = None, - index_codecs: Optional[List[CodecMetadata]] = None, -) -> ShardingCodecMetadata: - codecs = codecs or [bytes_codec()] - index_codecs = index_codecs or [bytes_codec(), crc32c_codec()] - return ShardingCodecMetadata( - configuration=ShardingCodecConfigurationMetadata(chunk_shape, codecs, index_codecs) - ) diff --git a/zarr/v3/codecs/__init__.py b/zarr/v3/codecs/__init__.py new file mode 100644 index 0000000000..0ffe597a1b --- /dev/null +++ b/zarr/v3/codecs/__init__.py @@ -0,0 +1,235 @@ +from __future__ import annotations + +from functools import reduce +from typing import ( + TYPE_CHECKING, + Iterable, + List, + Literal, + Optional, + Tuple, + Union, +) +from warnings import warn + +import numpy as np +from attr import frozen + +from zarr.v3.abc.codec import Codec, ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec +from zarr.v3.common import RuntimeConfiguration +from zarr.v3.types import BytesLike +from zarr.v3.metadata.v3 import CodecMetadata +from zarr.v3.codecs.registry import get_codec_class + +if TYPE_CHECKING: + from zarr.v3.metadata import ChunkMetadata + from zarr.v3.codecs.sharding import ShardingCodecMetadata + from zarr.v3.codecs.blosc import BloscCodecMetadata + from zarr.v3.codecs.bytes import BytesCodecMetadata + from zarr.v3.codecs.transpose import TransposeCodecMetadata + from zarr.v3.codecs.gzip import GzipCodecMetadata + from zarr.v3.codecs.zstd import ZstdCodecMetadata + from zarr.v3.codecs.crc32c_ import Crc32cCodecMetadata + + +@frozen +class CodecPipeline: + codecs: List[Codec] + + @classmethod + def from_metadata( + cls, + codecs_metadata: Iterable[CodecMetadata], + array_metadata: ChunkMetadata, + ) -> CodecPipeline: + out: List[Codec] = [] + for codec_metadata in codecs_metadata or []: + codec_cls = get_codec_class(codec_metadata.name) + codec = codec_cls.from_metadata(codec_metadata, array_metadata) + out.append(codec) + array_metadata = codec.resolve_metadata() + CodecPipeline._validate_codecs(out, array_metadata) + return cls(out) + + @staticmethod + def _validate_codecs(codecs: 
List[Codec], array_metadata: ChunkMetadata) -> None: + from zarr.v3.codecs.sharding import ShardingCodec + + assert any( + isinstance(codec, ArrayBytesCodec) for codec in codecs + ), "Exactly one array-to-bytes codec is required." + + prev_codec: Optional[Codec] = None + for codec in codecs: + if prev_codec is not None: + assert not isinstance(codec, ArrayBytesCodec) or not isinstance( + prev_codec, ArrayBytesCodec + ), ( + f"ArrayBytesCodec '{type(codec)}' cannot follow after " + + f"ArrayBytesCodec '{type(prev_codec)}' because exactly " + + "1 ArrayBytesCodec is allowed." + ) + assert not isinstance(codec, ArrayBytesCodec) or not isinstance( + prev_codec, BytesBytesCodec + ), ( + f"ArrayBytesCodec '{type(codec)}' cannot follow after " + + f"BytesBytesCodec '{type(prev_codec)}'." + ) + assert not isinstance(codec, ArrayArrayCodec) or not isinstance( + prev_codec, ArrayBytesCodec + ), ( + f"ArrayArrayCodec '{type(codec)}' cannot follow after " + + f"ArrayBytesCodec '{type(prev_codec)}'." + ) + assert not isinstance(codec, ArrayArrayCodec) or not isinstance( + prev_codec, BytesBytesCodec + ), ( + f"ArrayArrayCodec '{type(codec)}' cannot follow after " + + f"BytesBytesCodec '{type(prev_codec)}'." + ) + + if isinstance(codec, ShardingCodec): + assert len(codec.configuration.chunk_shape) == len(array_metadata.array_shape), ( + "The shard's `chunk_shape` and array's `shape` need to have the " + + "same number of dimensions." + ) + assert all( + s % c == 0 + for s, c in zip( + array_metadata.chunk_shape, + codec.configuration.chunk_shape, + ) + ), ( + "The array's `chunk_shape` needs to be divisible by the " + + "shard's inner `chunk_shape`." + ) + prev_codec = codec + + if any(isinstance(codec, ShardingCodec) for codec in codecs) and len(codecs) > 1: + warn( + "Combining a `sharding_indexed` codec disables partial reads and " + + "writes, which may lead to inefficient performance." 
+ ) + + def _array_array_codecs(self) -> List[ArrayArrayCodec]: + return [codec for codec in self.codecs if isinstance(codec, ArrayArrayCodec)] + + def _array_bytes_codec(self) -> ArrayBytesCodec: + return next(codec for codec in self.codecs if isinstance(codec, ArrayBytesCodec)) + + def _bytes_bytes_codecs(self) -> List[BytesBytesCodec]: + return [codec for codec in self.codecs if isinstance(codec, BytesBytesCodec)] + + async def decode(self, chunk_bytes: BytesLike, config: RuntimeConfiguration) -> np.ndarray: + for bb_codec in self._bytes_bytes_codecs()[::-1]: + chunk_bytes = await bb_codec.decode(chunk_bytes, config=config) + + chunk_array = await self._array_bytes_codec().decode(chunk_bytes, config) + + for aa_codec in self._array_array_codecs()[::-1]: + chunk_array = await aa_codec.decode(chunk_array, config=config) + + return chunk_array + + async def encode( + self, chunk_array: np.ndarray, config: RuntimeConfiguration + ) -> Optional[BytesLike]: + for aa_codec in self._array_array_codecs(): + chunk_array_maybe = await aa_codec.encode(chunk_array, config=config) + if chunk_array_maybe is None: + return None + chunk_array = chunk_array_maybe + + chunk_bytes_maybe = await self._array_bytes_codec().encode(chunk_array, config=config) + if chunk_bytes_maybe is None: + return None + chunk_bytes = chunk_bytes_maybe + + for bb_codec in self._bytes_bytes_codecs(): + chunk_bytes_maybe = await bb_codec.encode(chunk_bytes, config) + if chunk_bytes_maybe is None: + return None + chunk_bytes = chunk_bytes_maybe + + return chunk_bytes + + def compute_encoded_size(self, byte_length: int) -> int: + return reduce(lambda acc, codec: codec.compute_encoded_size(acc), self.codecs, byte_length) + + +def blosc_codec( + typesize: int, + cname: Literal["lz4", "lz4hc", "blosclz", "zstd", "snappy", "zlib"] = "zstd", + clevel: int = 5, + shuffle: Literal["noshuffle", "shuffle", "bitshuffle"] = "noshuffle", + blocksize: int = 0, +) -> "BloscCodecMetadata": + from zarr.v3.codecs.blosc import BloscCodecMetadata, BloscCodecConfigurationMetadata + + return BloscCodecMetadata( + configuration=BloscCodecConfigurationMetadata( + cname=cname, + clevel=clevel, + shuffle=shuffle, + blocksize=blocksize, + typesize=typesize, + ) + ) + + +def bytes_codec(endian: Optional[Literal["big", "little"]] = "little") -> "BytesCodecMetadata": + from zarr.v3.codecs.bytes import BytesCodecMetadata, BytesCodecConfigurationMetadata + + return BytesCodecMetadata(configuration=BytesCodecConfigurationMetadata(endian)) + + +def transpose_codec( + order: Union[Tuple[int, ...], Literal["C", "F"]], ndim: Optional[int] = None +) -> "TransposeCodecMetadata": + from zarr.v3.codecs.transpose import TransposeCodecMetadata, TransposeCodecConfigurationMetadata + + if order == "C" or order == "F": + assert ( + isinstance(ndim, int) and ndim > 0 + ), 'When using "C" or "F" the `ndim` argument needs to be provided.' 
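+    # e.g. with ndim=3: "C" resolves to order=(0, 1, 2) and "F" to order=(2, 1, 0)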
+ if order == "C": + order = tuple(range(ndim)) + if order == "F": + order = tuple(ndim - i - 1 for i in range(ndim)) + + return TransposeCodecMetadata(configuration=TransposeCodecConfigurationMetadata(order)) + + +def gzip_codec(level: int = 5) -> "GzipCodecMetadata": + from zarr.v3.codecs.gzip import GzipCodecMetadata, GzipCodecConfigurationMetadata + + return GzipCodecMetadata(configuration=GzipCodecConfigurationMetadata(level)) + + +def zstd_codec(level: int = 0, checksum: bool = False) -> "ZstdCodecMetadata": + from zarr.v3.codecs.zstd import ZstdCodecMetadata, ZstdCodecConfigurationMetadata + + return ZstdCodecMetadata(configuration=ZstdCodecConfigurationMetadata(level, checksum)) + + +def crc32c_codec() -> "Crc32cCodecMetadata": + from zarr.v3.codecs.crc32c_ import Crc32cCodecMetadata + + return Crc32cCodecMetadata() + + +def sharding_codec( + chunk_shape: Tuple[int, ...], + codecs: Optional[List[CodecMetadata]] = None, + index_codecs: Optional[List[CodecMetadata]] = None, + index_location: Literal["start", "end"] = "end", +) -> "ShardingCodecMetadata": + from zarr.v3.codecs.sharding import ShardingCodecMetadata, ShardingCodecConfigurationMetadata + + codecs = codecs or [bytes_codec()] + index_codecs = index_codecs or [bytes_codec(), crc32c_codec()] + return ShardingCodecMetadata( + configuration=ShardingCodecConfigurationMetadata( + chunk_shape, codecs, index_codecs, index_location + ) + ) diff --git a/zarr/v3/codecs/blosc.py b/zarr/v3/codecs/blosc.py new file mode 100644 index 0000000000..81a056d90b --- /dev/null +++ b/zarr/v3/codecs/blosc.py @@ -0,0 +1,95 @@ +from __future__ import annotations + +from typing import ( + TYPE_CHECKING, + Dict, + Literal, + Optional, + Type, +) + +import numcodecs +import numpy as np +from attr import asdict, evolve, frozen, field +from numcodecs.blosc import Blosc + +from zarr.v3.abc.codec import BytesBytesCodec +from zarr.v3.common import RuntimeConfiguration +from zarr.v3.codecs.registry import register_codec +from zarr.v3.common import to_thread +from zarr.v3.metadata.v3 import CodecMetadata +from zarr.v3.types import BytesLike + +if TYPE_CHECKING: + from zarr.v3.metadata import ChunkMetadata + + +BloscShuffle = Literal["noshuffle", "shuffle", "bitshuffle"] + +# See https://zarr.readthedocs.io/en/stable/tutorial.html#configuring-blosc +numcodecs.blosc.use_threads = False + + +@frozen +class BloscCodecConfigurationMetadata: + typesize: int + cname: Literal["lz4", "lz4hc", "blosclz", "zstd", "snappy", "zlib"] = "zstd" + clevel: int = 5 + shuffle: BloscShuffle = "noshuffle" + blocksize: int = 0 + + +blosc_shuffle_int_to_str: Dict[int, BloscShuffle] = { + 0: "noshuffle", + 1: "shuffle", + 2: "bitshuffle", +} + + +@frozen +class BloscCodecMetadata(CodecMetadata): + configuration: BloscCodecConfigurationMetadata + name: Literal["blosc"] = field(default="blosc", init=False) + + +@frozen +class BloscCodec(BytesBytesCodec): + array_metadata: ChunkMetadata + configuration: BloscCodecConfigurationMetadata + blosc_codec: Blosc + is_fixed_size = False + + @classmethod + def from_metadata( + cls, codec_metadata: CodecMetadata, array_metadata: ChunkMetadata + ) -> BloscCodec: + assert isinstance(codec_metadata, BloscCodecMetadata) + configuration = codec_metadata.configuration + if configuration.typesize == 0: + configuration = evolve(configuration, typesize=array_metadata.data_type.byte_count) + config_dict = asdict(codec_metadata.configuration) + config_dict.pop("typesize", None) + map_shuffle_str_to_int = {"noshuffle": 0, "shuffle": 1, "bitshuffle": 2} + 
config_dict["shuffle"] = map_shuffle_str_to_int[config_dict["shuffle"]] + return cls( + array_metadata=array_metadata, + configuration=configuration, + blosc_codec=Blosc.from_config(config_dict), + ) + + @classmethod + def get_metadata_class(cls) -> Type[BloscCodecMetadata]: + return BloscCodecMetadata + + async def decode(self, chunk_bytes: bytes, config: RuntimeConfiguration) -> BytesLike: + return await to_thread(self.blosc_codec.decode, chunk_bytes) + + async def encode(self, chunk_bytes: bytes, config: RuntimeConfiguration) -> Optional[BytesLike]: + chunk_array = np.frombuffer(chunk_bytes, dtype=self.array_metadata.dtype) + return await to_thread(self.blosc_codec.encode, chunk_array) + + def compute_encoded_size(self, _input_byte_length: int) -> int: + raise NotImplementedError + + +register_codec("blosc", BloscCodec) diff --git a/zarr/v3/codecs/bytes.py b/zarr/v3/codecs/bytes.py new file mode 100644 index 0000000000..a5a665daa4 --- /dev/null +++ b/zarr/v3/codecs/bytes.py @@ -0,0 +1,140 @@ +from __future__ import annotations + +from typing import ( + TYPE_CHECKING, + Literal, + Optional, + Type, + TypedDict, +) + +import numpy as np + +from zarr.v3.abc.codec import ArrayBytesCodec +from zarr.v3.common import RuntimeConfiguration +from zarr.v3.codecs.registry import register_codec +from zarr.v3.types import BytesLike +from zarr.v3.metadata.v3 import CodecMetadata, to_numpy_shortname + +if TYPE_CHECKING: + from zarr.v3.common import ChunkMetadata, ChunkMetadataDict + + +class BytesCodecConfigurationMetadataDict(TypedDict): + endian: Optional[Literal["big", "little"]] + + +class BytesCodecConfigurationMetadata: + endian: Optional[Literal["big", "little"]] = "little" + + def __init__(self, endian: Optional[Literal["big", "little"]]): + self.endian = endian + + def to_dict(self) -> BytesCodecConfigurationMetadataDict: + return {"endian": self.endian} + + +class BytesCodecMetadataDict(TypedDict): + configuration: BytesCodecConfigurationMetadataDict + name: Literal["bytes"] + + +class BytesCodecMetadata: + configuration: BytesCodecConfigurationMetadata + name: Literal["bytes"] + + def __init__(self, configuration: BytesCodecConfigurationMetadata): + # note: the only degree of freedom for this class is the "endian" property of `BytesConfigurationMetadata` + self.configuration = configuration + self.name = "bytes" + + def to_dict(self) -> BytesCodecMetadataDict: + return {"configuration": self.configuration.to_dict(), "name": self.name} + + +class BytesCodecDict(TypedDict): + array_metadata: ChunkMetadataDict + configuration: BytesCodecConfigurationMetadataDict + + +class BytesCodec(ArrayBytesCodec): + array_metadata: ChunkMetadata + configuration: BytesCodecConfigurationMetadata + is_fixed_size = True + + def __init__( + self, array_metadata: ChunkMetadata, configuration: BytesCodecConfigurationMetadata + ) -> None: + self.array_metadata = array_metadata + self.configuration = configuration + + @classmethod + def from_metadata( + cls, codec_metadata: CodecMetadata, array_metadata: ChunkMetadata + ) -> BytesCodec: + assert isinstance(codec_metadata, BytesCodecMetadata) + assert ( + array_metadata.dtype.itemsize == 1 or codec_metadata.configuration.endian is not None + ), "The `endian` configuration needs to be specified for multi-byte data types." 
+ return cls( + array_metadata=array_metadata, + configuration=codec_metadata.configuration, + ) + + @classmethod + def get_metadata_class(cls) -> Type[BytesCodecMetadata]: + return BytesCodecMetadata + + def _get_byteorder(self, array: np.ndarray) -> Literal["big", "little"]: + if array.dtype.byteorder == "<": + return "little" + elif array.dtype.byteorder == ">": + return "big" + else: + import sys + + return sys.byteorder + + async def decode(self, chunk_bytes: BytesLike, config: RuntimeConfiguration) -> np.ndarray: + short_name = to_numpy_shortname(self.array_metadata.dtype) + if self.array_metadata.dtype.itemsize > 0: + if self.configuration.endian == "little": + prefix = "<" + else: + prefix = ">" + dtype = np.dtype(f"{prefix}{short_name}") + else: + dtype = np.dtype(f"|{short_name}") + chunk_array = np.frombuffer(chunk_bytes, dtype) + + # ensure correct chunk shape + if chunk_array.shape != self.array_metadata.chunk_shape: + chunk_array = chunk_array.reshape( + self.array_metadata.chunk_shape, + ) + return chunk_array + + async def encode( + self, chunk_array: np.ndarray, config: RuntimeConfiguration + ) -> Optional[BytesLike]: + if chunk_array.dtype.itemsize > 1: + byteorder = self._get_byteorder(chunk_array) + if self.configuration.endian != byteorder: + new_dtype = chunk_array.dtype.newbyteorder(self.configuration.endian) + chunk_array = chunk_array.astype(new_dtype) + return chunk_array.tobytes() + + def compute_encoded_size(self, input_byte_length: int) -> int: + return input_byte_length + + def to_dict(self) -> BytesCodecDict: + return { + "array_metadata": self.array_metadata.to_dict(), + "configuration": self.configuration.to_dict(), + } + + +register_codec("bytes", BytesCodec) + +# compatibility with earlier versions of ZEP1 +register_codec("endian", BytesCodec) diff --git a/zarr/v3/codecs/crc32c_.py b/zarr/v3/codecs/crc32c_.py new file mode 100644 index 0000000000..0e7a61f4a6 --- /dev/null +++ b/zarr/v3/codecs/crc32c_.py @@ -0,0 +1,59 @@ +from __future__ import annotations + +from typing import ( + TYPE_CHECKING, + Literal, + Optional, + Type, +) + +import numpy as np +from attr import frozen, field +from crc32c import crc32c + +from zarr.v3.abc.codec import BytesBytesCodec +from zarr.v3.common import RuntimeConfiguration +from zarr.v3.codecs.registry import register_codec +from zarr.v3.types import BytesLike +from zarr.v3.metadata.v3 import CodecMetadata + +if TYPE_CHECKING: + from zarr.v3.common import ChunkMetadata + + +@frozen +class Crc32cCodecMetadata: + name: Literal["crc32c"] = field(default="crc32c", init=False) + + +@frozen +class Crc32cCodec(BytesBytesCodec): + array_metadata: ChunkMetadata + is_fixed_size = True + + @classmethod + def from_metadata( + cls, codec_metadata: CodecMetadata, array_metadata: ChunkMetadata + ) -> Crc32cCodec: + assert isinstance(codec_metadata, Crc32cCodecMetadata) + return cls(array_metadata=array_metadata) + + @classmethod + def get_metadata_class(cls) -> Type[Crc32cCodecMetadata]: + return Crc32cCodecMetadata + + async def decode(self, chunk_bytes: bytes, config: RuntimeConfiguration) -> BytesLike: + crc32_bytes = chunk_bytes[-4:] + inner_bytes = chunk_bytes[:-4] + + assert np.uint32(crc32c(inner_bytes)).tobytes() == bytes(crc32_bytes) + return inner_bytes + + async def encode(self, chunk_bytes: bytes, config: RuntimeConfiguration) -> Optional[BytesLike]: + return chunk_bytes + np.uint32(crc32c(chunk_bytes)).tobytes() + + def compute_encoded_size(self, input_byte_length: int) -> int: + return input_byte_length + 4 + + 
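The checksum framing implemented above is small enough to state directly. A minimal sketch of the round trip, assuming the same `crc32c` package the codec imports; the 4-byte trailer uses `np.uint32(...).tobytes()`, i.e. the platform's native byte order, exactly as in `Crc32cCodec`:

import numpy as np
from crc32c import crc32c

def frame(payload: bytes) -> bytes:
    # append the checksum as a fixed 4-byte trailer (hence compute_encoded_size: n + 4)
    return payload + np.uint32(crc32c(payload)).tobytes()

def unframe(framed: bytes) -> bytes:
    # verify the trailer, then strip it, mirroring Crc32cCodec.decode
    payload, stored = framed[:-4], framed[-4:]
    assert np.uint32(crc32c(payload)).tobytes() == stored, "crc32c mismatch"
    return payload

assert unframe(frame(b"chunk-bytes")) == b"chunk-bytes"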
+register_codec("crc32c", Crc32cCodec) diff --git a/zarr/v3/codecs/gzip.py b/zarr/v3/codecs/gzip.py new file mode 100644 index 0000000000..89be32b679 --- /dev/null +++ b/zarr/v3/codecs/gzip.py @@ -0,0 +1,66 @@ +from __future__ import annotations + +from typing import ( + TYPE_CHECKING, + Literal, + Optional, + Type, +) + +from attr import frozen, field +from numcodecs.gzip import GZip + +from zarr.v3.abc.codec import BytesBytesCodec +from zarr.v3.common import RuntimeConfiguration +from zarr.v3.codecs.registry import register_codec +from zarr.v3.common import to_thread +from zarr.v3.metadata.v3 import CodecMetadata +from zarr.v3.types import BytesLike + +if TYPE_CHECKING: + from zarr.v3.metadata import ChunkMetadata + + +@frozen +class GzipCodecConfigurationMetadata: + level: int = 5 + + +@frozen +class GzipCodecMetadata: + configuration: GzipCodecConfigurationMetadata + name: Literal["gzip"] = field(default="gzip", init=False) + + +@frozen +class GzipCodec(BytesBytesCodec): + array_metadata: ChunkMetadata + configuration: GzipCodecConfigurationMetadata + is_fixed_size = True + + @classmethod + def from_metadata( + cls, codec_metadata: CodecMetadata, array_metadata: ChunkMetadata + ) -> GzipCodec: + assert isinstance(codec_metadata, GzipCodecMetadata) + + return cls( + array_metadata=array_metadata, + configuration=codec_metadata.configuration, + ) + + @classmethod + def get_metadata_class(cls) -> Type[GzipCodecMetadata]: + return GzipCodecMetadata + + async def decode(self, chunk_bytes: bytes, config: RuntimeConfiguration) -> BytesLike: + return await to_thread(GZip(self.configuration.level).decode, chunk_bytes) + + async def encode(self, chunk_bytes: bytes, config: RuntimeConfiguration) -> Optional[BytesLike]: + return await to_thread(GZip(self.configuration.level).encode, chunk_bytes) + + def compute_encoded_size(self, _input_byte_length: int) -> int: + raise NotImplementedError + + +register_codec("gzip", GzipCodec) diff --git a/zarr/v3/codecs/registry.py b/zarr/v3/codecs/registry.py new file mode 100644 index 0000000000..bc824c6e90 --- /dev/null +++ b/zarr/v3/codecs/registry.py @@ -0,0 +1,56 @@ +from __future__ import annotations + +from typing import Dict, NamedTuple, Type +from importlib.metadata import EntryPoint, entry_points as get_entry_points + +from zarr.v3.abc.codec import Codec +from zarr.v3.metadata.v3 import CodecMetadata + + +class CodecRegistryItem(NamedTuple): + codec_cls: Type[Codec] + codec_metadata_cls: Type[CodecMetadata] + + +__codec_registry: Dict[str, CodecRegistryItem] = {} +__lazy_load_codecs: Dict[str, EntryPoint] = {} + + +def _collect_entrypoints() -> None: + entry_points = get_entry_points() + if hasattr(entry_points, "select"): + # If entry_points() has a select method, use that. 
Python 3.10+ + for e in entry_points.select(group="zarr.codecs"): + __lazy_load_codecs[e.name] = e + else: + # Otherwise, fallback to using get + for e in entry_points.get("zarr.codecs", []): + __lazy_load_codecs[e.name] = e + + +def register_codec(key: str, codec_cls: Type[Codec]) -> None: + __codec_registry[key] = CodecRegistryItem(codec_cls, codec_cls.get_metadata_class()) + + +def _get_codec_item(key: str) -> CodecRegistryItem: + item = __codec_registry.get(key) + if item is None: + if key in __lazy_load_codecs: + # logger.debug("Auto loading codec '%s' from entrypoint", codec_id) + cls = __lazy_load_codecs[key].load() + register_codec(key, cls) + item = __codec_registry.get(key) + if item: + return item + raise KeyError(key) + + +def get_codec_metadata_class(key: str) -> Type[CodecMetadata]: + return _get_codec_item(key).codec_metadata_cls + + +def get_codec_class(key: str) -> Type[Codec]: + return _get_codec_item(key).codec_cls + + +_collect_entrypoints() diff --git a/zarr/v3/sharding.py b/zarr/v3/codecs/sharding.py similarity index 73% rename from zarr/v3/sharding.py rename to zarr/v3/codecs/sharding.py index 3c5b4bd12d..f6f1b372d9 100644 --- a/zarr/v3/sharding.py +++ b/zarr/v3/codecs/sharding.py @@ -1,35 +1,59 @@ from __future__ import annotations -from typing import Iterator, List, Mapping, NamedTuple, Optional, Set, Tuple +from typing import ( + Awaitable, + Callable, + Iterator, + List, + Literal, + Mapping, + NamedTuple, + Optional, + Set, + Tuple, + Type, +) +from attr import field, frozen import numpy as np -from attrs import frozen - -from zarr.v3.codecs import ArrayBytesCodec, CodecPipeline -from zarr.v3.common import ( - BytesLike, - ChunkCoords, - SliceSelection, - concurrent_map, - product, +from zarr.v3.abc.codec import ( + ArrayBytesCodec, + ArrayBytesCodecPartialDecodeMixin, + ArrayBytesCodecPartialEncodeMixin, ) -from zarr.v3.indexing import ( + +from zarr.v3.codecs import CodecPipeline +from zarr.v3.codecs.registry import register_codec +from zarr.v3.common import ChunkMetadata, concurrent_map, product, RuntimeConfiguration +from zarr.v3.array.indexing import ( BasicIndexer, c_order_iter, is_total_slice, morton_order_iter, ) -from zarr.v3.metadata import ( - CoreArrayMetadata, - DataType, - ShardingCodecConfigurationMetadata, - ShardingCodecMetadata, +from zarr.v3.metadata.v3 import ( + CodecMetadata, ) from zarr.v3.store import StorePath +from zarr.v3.types import BytesLike, ChunkCoords, SliceSelection MAX_UINT_64 = 2**64 - 1 +@frozen +class ShardingCodecConfigurationMetadata: + chunk_shape: ChunkCoords + codecs: List["CodecMetadata"] + index_codecs: List["CodecMetadata"] + index_location: Literal["start", "end"] = "end" + + +@frozen +class ShardingCodecMetadata: + configuration: ShardingCodecConfigurationMetadata + name: Literal["sharding_indexed"] = field(default="sharding_indexed", init=False) + + class _ShardIndex(NamedTuple): # dtype uint64, shape (chunks_per_shard_0, chunks_per_shard_1, ..., 2) offsets_and_lengths: np.ndarray @@ -49,7 +73,7 @@ def get_chunk_slice(self, chunk_coords: ChunkCoords) -> Optional[Tuple[int, int] if (chunk_start, chunk_len) == (MAX_UINT_64, MAX_UINT_64): return None else: - return (int(chunk_start), int(chunk_start + chunk_len)) + return (int(chunk_start), int(chunk_start) + int(chunk_len)) def set_chunk_slice(self, chunk_coords: ChunkCoords, chunk_slice: Optional[slice]) -> None: localized_chunk = self._localize_chunk(chunk_coords) @@ -94,10 +118,18 @@ class _ShardProxy(Mapping): buf: BytesLike @classmethod - async def 
from_bytes(cls, buf: BytesLike, codec: ShardingCodec) -> _ShardProxy: + async def from_bytes( + cls, buf: BytesLike, codec: ShardingCodec, config: RuntimeConfiguration + ) -> _ShardProxy: + shard_index_size = codec._shard_index_size() obj = cls() obj.buf = memoryview(buf) - obj.index = await codec._decode_shard_index(obj.buf[-codec._shard_index_size() :]) + if codec.configuration.index_location == "start": + shard_index_bytes = obj.buf[:shard_index_size] + else: + shard_index_bytes = obj.buf[-shard_index_size:] + + obj.index = await codec._decode_shard_index(shard_index_bytes, config=config) return obj @classmethod @@ -156,14 +188,31 @@ def append(self, chunk_coords: ChunkCoords, value: BytesLike): self.buf.extend(value) self.index.set_chunk_slice(chunk_coords, slice(chunk_start, chunk_start + chunk_length)) - def finalize(self, index_bytes: BytesLike) -> BytesLike: - self.buf.extend(index_bytes) - return self.buf + async def finalize( + self, + index_location: Literal["start", "end"], + index_encoder: Callable[[_ShardIndex], Awaitable[BytesLike]], + config: RuntimeConfiguration, + ) -> BytesLike: + index_bytes = await index_encoder(self.index, config) + if index_location == "start": + self.index.offsets_and_lengths[..., 0] += len(index_bytes) + index_bytes = await index_encoder( + self.index, config=config + ) # encode again with corrected offsets + out_buf = bytearray(index_bytes) + out_buf.extend(self.buf) + else: + out_buf = self.buf + out_buf.extend(index_bytes) + return out_buf @frozen -class ShardingCodec(ArrayBytesCodec): - array_metadata: CoreArrayMetadata +class ShardingCodec( + ArrayBytesCodec, ArrayBytesCodecPartialDecodeMixin, ArrayBytesCodecPartialEncodeMixin +): + array_metadata: ChunkMetadata configuration: ShardingCodecConfigurationMetadata codec_pipeline: CodecPipeline index_codec_pipeline: CodecPipeline @@ -172,9 +221,11 @@ class ShardingCodec(ArrayBytesCodec): @classmethod def from_metadata( cls, - codec_metadata: ShardingCodecMetadata, - array_metadata: CoreArrayMetadata, + codec_metadata: CodecMetadata, + array_metadata: ChunkMetadata, ) -> ShardingCodec: + assert isinstance(codec_metadata, ShardingCodecMetadata) + chunks_per_shard = tuple( s // c for s, c in zip( @@ -183,24 +234,22 @@ def from_metadata( ) ) # rewriting the metadata to scope it to the shard - shard_metadata = CoreArrayMetadata( - shape=array_metadata.chunk_shape, + shard_metadata = ChunkMetadata( + array_shape=array_metadata.chunk_shape, chunk_shape=codec_metadata.configuration.chunk_shape, - data_type=array_metadata.data_type, + dtype=array_metadata.dtype, fill_value=array_metadata.fill_value, - runtime_configuration=array_metadata.runtime_configuration, ) codec_pipeline = CodecPipeline.from_metadata( codec_metadata.configuration.codecs, shard_metadata ) index_codec_pipeline = CodecPipeline.from_metadata( codec_metadata.configuration.index_codecs, - CoreArrayMetadata( - shape=chunks_per_shard + (2,), + ChunkMetadata( + array_shape=chunks_per_shard + (2,), chunk_shape=chunks_per_shard + (2,), - data_type=DataType.uint64, + dtype=np.dtype("uint64"), fill_value=MAX_UINT_64, - runtime_configuration=array_metadata.runtime_configuration, ), ) return cls( @@ -211,10 +260,11 @@ def from_metadata( chunks_per_shard=chunks_per_shard, ) - async def decode( - self, - shard_bytes: BytesLike, - ) -> np.ndarray: + @classmethod + def get_metadata_class(cls) -> Type[ShardingCodecMetadata]: + return ShardingCodecMetadata + + async def decode(self, shard_bytes: BytesLike, config: RuntimeConfiguration) -> np.ndarray: # 
print("decode") shard_shape = self.array_metadata.chunk_shape chunk_shape = self.configuration.chunk_shape @@ -229,9 +279,9 @@ async def decode( out = np.zeros( shard_shape, dtype=self.array_metadata.dtype, - order=self.array_metadata.runtime_configuration.order, + order=config.order, ) - shard_dict = await _ShardProxy.from_bytes(shard_bytes, self) + shard_dict = await _ShardProxy.from_bytes(shard_bytes, self, config=config) if shard_dict.index.is_all_empty(): out.fill(self.array_metadata.fill_value) @@ -240,27 +290,18 @@ async def decode( # decoding chunks and writing them into the output buffer await concurrent_map( [ - ( - shard_dict, - chunk_coords, - chunk_selection, - out_selection, - out, - ) + (shard_dict, chunk_coords, chunk_selection, out_selection, out, config) for chunk_coords, chunk_selection, out_selection in indexer ], self._read_chunk, - self.array_metadata.runtime_configuration.concurrency, + config.concurrency, ) return out async def decode_partial( - self, - store_path: StorePath, - selection: SliceSelection, + self, store_path: StorePath, selection: SliceSelection, config: RuntimeConfiguration ) -> Optional[np.ndarray]: - # print("decode_partial") shard_shape = self.array_metadata.chunk_shape chunk_shape = self.configuration.chunk_shape @@ -274,7 +315,7 @@ async def decode_partial( out = np.zeros( indexer.shape, dtype=self.array_metadata.dtype, - order=self.array_metadata.runtime_configuration.order, + order=config.order, ) indexed_chunks = list(indexer) @@ -284,13 +325,13 @@ async def decode_partial( shard_dict: Mapping[ChunkCoords, BytesLike] = {} if self._is_total_shard(all_chunk_coords): # read entire shard - shard_dict_maybe = await self._load_full_shard_maybe(store_path) + shard_dict_maybe = await self._load_full_shard_maybe(store_path, config=config) if shard_dict_maybe is None: return None shard_dict = shard_dict_maybe else: # read some chunks within the shard - shard_index = await self._load_shard_index_maybe(store_path) + shard_index = await self._load_shard_index_maybe(store_path, config=config) if shard_index is None: return None shard_dict = {} @@ -304,17 +345,11 @@ async def decode_partial( # decoding chunks and writing them into the output buffer await concurrent_map( [ - ( - shard_dict, - chunk_coords, - chunk_selection, - out_selection, - out, - ) + (shard_dict, chunk_coords, chunk_selection, out_selection, out, config) for chunk_coords, chunk_selection, out_selection in indexed_chunks ], self._read_chunk, - self.array_metadata.runtime_configuration.concurrency, + config.concurrency, ) return out @@ -326,18 +361,18 @@ async def _read_chunk( chunk_selection: SliceSelection, out_selection: SliceSelection, out: np.ndarray, + config: RuntimeConfiguration, ): chunk_bytes = shard_dict.get(chunk_coords, None) if chunk_bytes is not None: - chunk_array = await self.codec_pipeline.decode(chunk_bytes) + chunk_array = await self.codec_pipeline.decode(chunk_bytes, config=config) tmp = chunk_array[chunk_selection] out[out_selection] = tmp else: out[out_selection] = self.array_metadata.fill_value async def encode( - self, - shard_array: np.ndarray, + self, shard_array: np.ndarray, config: RuntimeConfiguration ) -> Optional[BytesLike]: shard_shape = self.array_metadata.chunk_shape chunk_shape = self.configuration.chunk_shape @@ -369,7 +404,7 @@ async def _write_chunk( if not np.array_equiv(chunk_array, self.array_metadata.fill_value): return ( chunk_coords, - await self.codec_pipeline.encode(chunk_array), + await self.codec_pipeline.encode(chunk_array, config=config), 
) return (chunk_coords, None) @@ -380,7 +415,7 @@ async def _write_chunk( for chunk_coords, chunk_selection, out_selection in indexer ], _write_chunk, - self.array_metadata.runtime_configuration.concurrency, + config.concurrency, ) if len(encoded_chunks) == 0: return None @@ -390,20 +425,23 @@ async def _write_chunk( if chunk_bytes is not None: shard_builder.append(chunk_coords, chunk_bytes) - return shard_builder.finalize(await self._encode_shard_index(shard_builder.index)) + return await shard_builder.finalize( + self.configuration.index_location, self._encode_shard_index, config=config + ) async def encode_partial( self, store_path: StorePath, shard_array: np.ndarray, selection: SliceSelection, + config: RuntimeConfiguration, ) -> None: # print("encode_partial") shard_shape = self.array_metadata.chunk_shape chunk_shape = self.configuration.chunk_shape old_shard_dict = ( - await self._load_full_shard_maybe(store_path) + await self._load_full_shard_maybe(store_path, config) ) or _ShardProxy.create_empty(self.chunks_per_shard) new_shard_builder = _ShardBuilder.create_empty(self.chunks_per_shard) tombstones: Set[ChunkCoords] = set() @@ -438,14 +476,14 @@ async def _write_chunk( chunk_array.fill(self.array_metadata.fill_value) else: chunk_array = ( - await self.codec_pipeline.decode(chunk_bytes) + await self.codec_pipeline.decode(chunk_bytes, config=config) ).copy() # make a writable copy chunk_array[chunk_selection] = shard_array[out_selection] if not np.array_equiv(chunk_array, self.array_metadata.fill_value): return ( chunk_coords, - await self.codec_pipeline.encode(chunk_array), + await self.codec_pipeline.encode(chunk_array, config=config), ) else: return (chunk_coords, None) @@ -460,7 +498,7 @@ async def _write_chunk( for chunk_coords, chunk_selection, out_selection in indexer ], _write_chunk, - self.array_metadata.runtime_configuration.concurrency, + config.concurrency, ) for chunk_coords, chunk_bytes in encoded_chunks: @@ -477,7 +515,9 @@ async def _write_chunk( await store_path.delete_async() else: await store_path.set_async( - shard_builder.finalize(await self._encode_shard_index(shard_builder.index)) + await shard_builder.finalize( + self.configuration.index_location, self._encode_shard_index, config=config + ) ) def _is_total_shard(self, all_chunk_coords: Set[ChunkCoords]) -> bool: @@ -485,21 +525,33 @@ def _is_total_shard(self, all_chunk_coords: Set[ChunkCoords]) -> bool: chunk_coords in all_chunk_coords for chunk_coords in c_order_iter(self.chunks_per_shard) ) - async def _decode_shard_index(self, index_bytes: BytesLike) -> _ShardIndex: - return _ShardIndex(await self.index_codec_pipeline.decode(index_bytes)) + async def _decode_shard_index( + self, index_bytes: BytesLike, config: RuntimeConfiguration + ) -> _ShardIndex: + return _ShardIndex(await self.index_codec_pipeline.decode(index_bytes, config=config)) - async def _encode_shard_index(self, index: _ShardIndex) -> BytesLike: - index_bytes = await self.index_codec_pipeline.encode(index.offsets_and_lengths) + async def _encode_shard_index( + self, index: _ShardIndex, config: RuntimeConfiguration + ) -> BytesLike: + index_bytes = await self.index_codec_pipeline.encode( + index.offsets_and_lengths, config=config + ) assert index_bytes is not None return index_bytes def _shard_index_size(self) -> int: return self.index_codec_pipeline.compute_encoded_size(16 * product(self.chunks_per_shard)) - async def _load_shard_index_maybe(self, store_path: StorePath) -> Optional[_ShardIndex]: - index_bytes = await 
store_path.get_async((-self._shard_index_size(), None)) + async def _load_shard_index_maybe( + self, store_path: StorePath, config: RuntimeConfiguration + ) -> Optional[_ShardIndex]: + shard_index_size = self._shard_index_size() + if self.configuration.index_location == "start": + index_bytes = await store_path.get_async((0, shard_index_size)) + else: + index_bytes = await store_path.get_async((-shard_index_size, None)) if index_bytes is not None: - return await self._decode_shard_index(index_bytes) + return await self._decode_shard_index(index_bytes, config=config) return None async def _load_shard_index(self, store_path: StorePath) -> _ShardIndex: @@ -507,10 +559,17 @@ async def _load_shard_index(self, store_path: StorePath) -> _ShardIndex: self.chunks_per_shard ) - async def _load_full_shard_maybe(self, store_path: StorePath) -> Optional[_ShardProxy]: + async def _load_full_shard_maybe( + self, store_path: StorePath, config: RuntimeConfiguration + ) -> Optional[_ShardProxy]: shard_bytes = await store_path.get_async() - return await _ShardProxy.from_bytes(shard_bytes, self) if shard_bytes else None + return ( + await _ShardProxy.from_bytes(shard_bytes, self, config=config) if shard_bytes else None + ) def compute_encoded_size(self, input_byte_length: int) -> int: return input_byte_length + self._shard_index_size() + + +register_codec("sharding_indexed", ShardingCodec) diff --git a/zarr/v3/codecs/transpose.py b/zarr/v3/codecs/transpose.py new file mode 100644 index 0000000000..84be503555 --- /dev/null +++ b/zarr/v3/codecs/transpose.py @@ -0,0 +1,111 @@ +from __future__ import annotations + +from typing import ( + TYPE_CHECKING, + Literal, + Optional, + Tuple, + Type, +) + +import numpy as np +from attr import frozen, field + +from zarr.v3.abc.codec import ArrayArrayCodec +from zarr.v3.common import RuntimeConfiguration +from zarr.v3.codecs.registry import register_codec +from zarr.v3.metadata.v3 import CodecMetadata + +if TYPE_CHECKING: + from zarr.v3.common import ChunkMetadata + + +@frozen +class TransposeCodecConfigurationMetadata: + order: Tuple[int, ...] + + +@frozen +class TransposeCodecMetadata: + configuration: TransposeCodecConfigurationMetadata + name: Literal["transpose"] = field(default="transpose", init=False) + + +@frozen +class TransposeCodec(ArrayArrayCodec): + array_metadata: ChunkMetadata + order: Tuple[int, ...] + is_fixed_size = True + + @classmethod + def from_metadata( + cls, codec_metadata: CodecMetadata, array_metadata: ChunkMetadata + ) -> TransposeCodec: + assert isinstance(codec_metadata, TransposeCodecMetadata) + + configuration = codec_metadata.configuration + # Compatibility with older versions of ZEP1 + if configuration.order == "F": # type: ignore + order = tuple(array_metadata.ndim - x - 1 for x in range(array_metadata.ndim)) + + elif configuration.order == "C": # type: ignore + order = tuple(range(array_metadata.ndim)) + + else: + assert len(configuration.order) == array_metadata.ndim, ( + "The `order` tuple needs to have as many entries as " + + f"there are dimensions in the array. Got: {configuration.order}" + ) + assert len(configuration.order) == len(set(configuration.order)), ( + "There must not be duplicates in the `order` tuple. " + + f"Got: {configuration.order}" + ) + assert all(0 <= x < array_metadata.ndim for x in configuration.order), ( + "All entries in the `order` tuple must be between 0 and " + + f"the number of dimensions in the array. 
Got: {configuration.order}" + ) + order = tuple(configuration.order) + + return cls( + array_metadata=array_metadata, + order=order, + ) + + @classmethod + def get_metadata_class(cls) -> Type[TransposeCodecMetadata]: + return TransposeCodecMetadata + + def resolve_metadata(self) -> ChunkMetadata: + from zarr.v3.common import ChunkMetadata + + return ChunkMetadata( + array_shape=tuple( + self.array_metadata.array_shape[self.order[i]] + for i in range(self.array_metadata.ndim) + ), + chunk_shape=tuple( + self.array_metadata.chunk_shape[self.order[i]] + for i in range(self.array_metadata.ndim) + ), + dtype=self.array_metadata.dtype, + fill_value=self.array_metadata.fill_value, + ) + + async def decode(self, chunk_array: np.ndarray, config: RuntimeConfiguration) -> np.ndarray: + inverse_order = [0 for _ in range(self.array_metadata.ndim)] + for x, i in enumerate(self.order): + inverse_order[x] = i + chunk_array = chunk_array.transpose(inverse_order) + return chunk_array + + async def encode( + self, chunk_array: np.ndarray, config: RuntimeConfiguration + ) -> Optional[np.ndarray]: + chunk_array = chunk_array.transpose(self.order) + return chunk_array + + def compute_encoded_size(self, input_byte_length: int) -> int: + return input_byte_length + + +register_codec("transpose", TransposeCodec) diff --git a/zarr/v3/codecs/zstd.py b/zarr/v3/codecs/zstd.py new file mode 100644 index 0000000000..df9f9d883c --- /dev/null +++ b/zarr/v3/codecs/zstd.py @@ -0,0 +1,75 @@ +from __future__ import annotations + +from typing import ( + TYPE_CHECKING, + Literal, + Optional, + Type, +) + +from attr import frozen, field +from zstandard import ZstdCompressor, ZstdDecompressor + +from zarr.v3.abc.codec import BytesBytesCodec +from zarr.v3.codecs.registry import register_codec +from zarr.v3.common import to_thread +from zarr.v3.metadata.v3 import CodecMetadata +from zarr.v3.types import BytesLike + +if TYPE_CHECKING: + from zarr.v3.metadata import ChunkMetadata + + +@frozen +class ZstdCodecConfigurationMetadata: + level: int = 0 + checksum: bool = False + + +@frozen +class ZstdCodecMetadata: + configuration: ZstdCodecConfigurationMetadata + name: Literal["zstd"] = field(default="zstd", init=False) + + +@frozen +class ZstdCodec(BytesBytesCodec): + array_metadata: ChunkMetadata + configuration: ZstdCodecConfigurationMetadata + is_fixed_size = True + + @classmethod + def from_metadata( + cls, codec_metadata: CodecMetadata, array_metadata: ChunkMetadata + ) -> ZstdCodec: + assert isinstance(codec_metadata, ZstdCodecMetadata) + return cls( + array_metadata=array_metadata, + configuration=codec_metadata.configuration, + ) + + @classmethod + def get_metadata_class(cls) -> Type[ZstdCodecMetadata]: + return ZstdCodecMetadata + + def _compress(self, data: bytes) -> bytes: + ctx = ZstdCompressor( + level=self.configuration.level, write_checksum=self.configuration.checksum + ) + return ctx.compress(data) + + def _decompress(self, data: bytes) -> bytes: + ctx = ZstdDecompressor() + return ctx.decompress(data) + + async def decode(self, chunk_bytes: bytes, config: RuntimeConfiguration) -> BytesLike: + return await to_thread(self._decompress, chunk_bytes) + + async def encode(self, chunk_bytes: bytes, config: RuntimeConfiguration) -> Optional[BytesLike]: + return await to_thread(self._compress, chunk_bytes) + + def compute_encoded_size(self, _input_byte_length: int) -> int: + raise NotImplementedError + + +register_codec("zstd", ZstdCodec) diff --git a/zarr/v3/common.py b/zarr/v3/common.py index 0e55a7c1fd..b3dfa45425 100644 --- 
a/zarr/v3/common.py +++ b/zarr/v3/common.py @@ -1,6 +1,7 @@ from __future__ import annotations import asyncio +from asyncio import AbstractEventLoop import contextvars import functools from typing import ( @@ -13,70 +14,53 @@ Optional, Tuple, TypeVar, + TypedDict, Union, ) +from attr import frozen import numpy as np from cattr import Converter +from zarr.v3.types import Attributes, ChunkCoords ZARR_JSON = "zarr.json" ZARRAY_JSON = ".zarray" ZGROUP_JSON = ".zgroup" ZATTRS_JSON = ".zattrs" -BytesLike = Union[bytes, bytearray, memoryview] -ChunkCoords = Tuple[int, ...] -SliceSelection = Tuple[slice, ...] -Selection = Union[slice, SliceSelection] - def make_cattr(): - from zarr.v3.metadata import ( - BloscCodecMetadata, - BytesCodecMetadata, - ChunkKeyEncodingMetadata, + from zarr.v3.metadata.v3 import ( + DefaultChunkKeyEncoding, CodecMetadata, - Crc32cCodecMetadata, - DefaultChunkKeyEncodingMetadata, - GzipCodecMetadata, - ShardingCodecMetadata, - TransposeCodecMetadata, - V2ChunkKeyEncodingMetadata, - ZstdCodecMetadata, + ChunkKeyEncoding, + V2ChunkKeyEncoding, ) + from zarr.v3.codecs.registry import get_codec_metadata_class converter = Converter() - def _structure_chunk_key_encoding_metadata(d: Dict[str, Any], _t) -> ChunkKeyEncodingMetadata: + def _structure_attributes(d: Dict[str, Any], _t) -> Attributes: + return d + + converter.register_structure_hook_factory( + lambda t: str(t) + == "typing.Union[typing.Dict[str, ForwardRef('Attributes')], typing.List[ForwardRef('Attributes')], str, int, float, bool, NoneType]", + lambda t: _structure_attributes, + ) + + def _structure_chunk_key_encoding_metadata(d: Dict[str, Any], _t) -> ChunkKeyEncoding: if d["name"] == "default": - return converter.structure(d, DefaultChunkKeyEncodingMetadata) + return converter.structure(d, DefaultChunkKeyEncoding) if d["name"] == "v2": - return converter.structure(d, V2ChunkKeyEncodingMetadata) + return converter.structure(d, V2ChunkKeyEncoding) raise KeyError - converter.register_structure_hook( - ChunkKeyEncodingMetadata, _structure_chunk_key_encoding_metadata - ) + converter.register_structure_hook(ChunkKeyEncoding, _structure_chunk_key_encoding_metadata) def _structure_codec_metadata(d: Dict[str, Any], _t=None) -> CodecMetadata: - if d["name"] == "endian": - d["name"] = "bytes" - - if d["name"] == "blosc": - return converter.structure(d, BloscCodecMetadata) - if d["name"] == "bytes": - return converter.structure(d, BytesCodecMetadata) - if d["name"] == "transpose": - return converter.structure(d, TransposeCodecMetadata) - if d["name"] == "gzip": - return converter.structure(d, GzipCodecMetadata) - if d["name"] == "zstd": - return converter.structure(d, ZstdCodecMetadata) - if d["name"] == "sharding_indexed": - return converter.structure(d, ShardingCodecMetadata) - if d["name"] == "crc32c": - return converter.structure(d, Crc32cCodecMetadata) - raise KeyError + codec_metadata_cls = get_codec_metadata_class(d["name"]) + return converter.structure(d, codec_metadata_cls) converter.register_structure_hook(CodecMetadata, _structure_codec_metadata) @@ -156,3 +140,49 @@ async def to_thread(func, /, *args, **kwargs): ctx = contextvars.copy_context() func_call = functools.partial(ctx.run, func, *args, **kwargs) return await loop.run_in_executor(None, func_call) + + +@frozen +class RuntimeConfiguration: + order: Literal["C", "F"] = "C" + concurrency: Optional[int] = None + asyncio_loop: Optional[AbstractEventLoop] = None + + +def runtime_configuration( + order: Literal["C", "F"], concurrency: Optional[int] = None +) -> 
RuntimeConfiguration: + return RuntimeConfiguration(order=order, concurrency=concurrency) + + +class ChunkMetadataDict(TypedDict): + array_shape: Tuple[int, ...] + chunk_shape: Tuple[int, ...] + dtype: str + fill_value: Any + + +class ChunkMetadata: + array_shape: Tuple[int, ...] + chunk_shape: Tuple[int, ...] + # data_type: DataType + dtype: np.dtype + fill_value: Any + + def __init__(self, array_shape, chunk_shape, dtype, fill_value) -> None: + self.array_shape = array_shape + self.chunk_shape = chunk_shape + self.dtype = dtype + self.fill_value = fill_value + + @property + def ndim(self) -> int: + return len(self.array_shape) + + def to_dict(self) -> ChunkMetadataDict: + return { + "array_shape": self.array_shape, + "chunk_shape": self.chunk_shape, + "fill_value": self.fill_value, + "dtype": self.dtype.str, + } diff --git a/zarr/v3/group.py b/zarr/v3/group.py index aa43c706a5..34791ff2ff 100644 --- a/zarr/v3/group.py +++ b/zarr/v3/group.py @@ -5,16 +5,17 @@ from attr import asdict, evolve, field, frozen -from zarr.v3.array import Array +from zarr.v3.array.v3 import Array from zarr.v3.common import ZARR_JSON, make_cattr -from zarr.v3.metadata import RuntimeConfiguration +from zarr.v3.common import RuntimeConfiguration from zarr.v3.store import StoreLike, StorePath, make_store_path from zarr.v3.sync import sync +from zarr.v3.types import Attributes @frozen class GroupMetadata: - attributes: Dict[str, Any] = field(factory=dict) + attributes: Attributes = field(factory=dict) zarr_format: Literal[3] = 3 node_type: Literal["group"] = "group" @@ -37,7 +38,7 @@ async def create_async( cls, store: StoreLike, *, - attributes: Optional[Dict[str, Any]] = None, + attributes: Optional[Attributes] = None, exists_ok: bool = False, runtime_configuration: RuntimeConfiguration = RuntimeConfiguration(), ) -> Group: @@ -57,7 +58,7 @@ def create( cls, store: StoreLike, *, - attributes: Optional[Dict[str, Any]] = None, + attributes: Optional[Attributes] = None, exists_ok: bool = False, runtime_configuration: RuntimeConfiguration = RuntimeConfiguration(), ) -> Group: @@ -162,14 +163,14 @@ def create_array(self, path: str, **kwargs) -> Array: self.runtime_configuration.asyncio_loop, ) - async def update_attributes_async(self, new_attributes: Dict[str, Any]) -> Group: + async def update_attributes_async(self, new_attributes: Attributes) -> Group: new_metadata = evolve(self.metadata, attributes=new_attributes) # Write new metadata await (self.store_path / ZARR_JSON).set_async(new_metadata.to_bytes()) return evolve(self, metadata=new_metadata) - def update_attributes(self, new_attributes: Dict[str, Any]) -> Group: + def update_attributes(self, new_attributes: Attributes) -> Group: return sync( self.update_attributes_async(new_attributes), self.runtime_configuration.asyncio_loop, diff --git a/zarr/v3/group/group.py b/zarr/v3/group/group.py new file mode 100644 index 0000000000..0ebebf7a00 --- /dev/null +++ b/zarr/v3/group/group.py @@ -0,0 +1,179 @@ +from __future__ import annotations + +import json +from typing import Any, Dict, Literal, Optional, Union + +from attr import asdict, evolve, field, frozen + +from zarr.v3.array.v3 import Array +from zarr.v3.common import ZARR_JSON, Attributes, make_cattr +from zarr.v3.common import RuntimeConfiguration +from zarr.v3.store import StoreLike, StorePath, make_store_path +from zarr.v3.sync import sync + + +@frozen +class GroupMetadata: + attributes: Attributes = field(factory=dict) + zarr_format: Literal[3] = 3 + node_type: Literal["group"] = "group" + + def to_bytes(self) 
-> bytes: + return json.dumps(asdict(self)).encode() + + @classmethod + def from_json(cls, zarr_json: Any) -> GroupMetadata: + return make_cattr().structure(zarr_json, GroupMetadata) + + +@frozen +class Group: + metadata: GroupMetadata + store_path: StorePath + runtime_configuration: RuntimeConfiguration + + @classmethod + async def create_async( + cls, + store: StoreLike, + *, + attributes: Optional[Dict[str, Any]] = None, + exists_ok: bool = False, + runtime_configuration: RuntimeConfiguration = RuntimeConfiguration(), + ) -> Group: + store_path = make_store_path(store) + if not exists_ok: + assert not await (store_path / ZARR_JSON).exists_async() + group = cls( + metadata=GroupMetadata(attributes=attributes or {}), + store_path=store_path, + runtime_configuration=runtime_configuration, + ) + await group._save_metadata() + return group + + @classmethod + def create( + cls, + store: StoreLike, + *, + attributes: Optional[Dict[str, Any]] = None, + exists_ok: bool = False, + runtime_configuration: RuntimeConfiguration = RuntimeConfiguration(), + ) -> Group: + return sync( + cls.create_async( + store, + attributes=attributes, + exists_ok=exists_ok, + runtime_configuration=runtime_configuration, + ), + runtime_configuration.asyncio_loop, + ) + + @classmethod + async def open_async( + cls, + store: StoreLike, + runtime_configuration: RuntimeConfiguration = RuntimeConfiguration(), + ) -> Group: + store_path = make_store_path(store) + zarr_json_bytes = await (store_path / ZARR_JSON).get_async() + assert zarr_json_bytes is not None + return cls.from_json(store_path, json.loads(zarr_json_bytes), runtime_configuration) + + @classmethod + def open( + cls, + store: StoreLike, + runtime_configuration: RuntimeConfiguration = RuntimeConfiguration(), + ) -> Group: + return sync( + cls.open_async(store, runtime_configuration), + runtime_configuration.asyncio_loop, + ) + + @classmethod + def from_json( + cls, + store_path: StorePath, + zarr_json: Any, + runtime_configuration: RuntimeConfiguration, + ) -> Group: + group = cls( + metadata=GroupMetadata.from_json(zarr_json), + store_path=store_path, + runtime_configuration=runtime_configuration, + ) + return group + + @classmethod + async def open_or_array( + cls, + store: StoreLike, + runtime_configuration: RuntimeConfiguration = RuntimeConfiguration(), + ) -> Union[Array, Group]: + store_path = make_store_path(store) + zarr_json_bytes = await (store_path / ZARR_JSON).get_async() + if zarr_json_bytes is None: + raise KeyError + zarr_json = json.loads(zarr_json_bytes) + if zarr_json["node_type"] == "group": + return cls.from_json(store_path, zarr_json, runtime_configuration) + if zarr_json["node_type"] == "array": + return Array.from_json( + store_path, zarr_json, runtime_configuration=runtime_configuration + ) + raise KeyError + + async def _save_metadata(self) -> None: + await (self.store_path / ZARR_JSON).set_async(self.metadata.to_bytes()) + + async def get_async(self, path: str) -> Union[Array, Group]: + return await self.__class__.open_or_array( + self.store_path / path, self.runtime_configuration + ) + + def __getitem__(self, path: str) -> Union[Array, Group]: + return sync(self.get_async(path), self.runtime_configuration.asyncio_loop) + + async def create_group_async(self, path: str, **kwargs) -> Group: + runtime_configuration = kwargs.pop("runtime_configuration", self.runtime_configuration) + return await self.__class__.create_async( + self.store_path / path, + runtime_configuration=runtime_configuration, + **kwargs, + ) + + def create_group(self, 
path: str, **kwargs) -> Group: + return sync(self.create_group_async(path, **kwargs), self.runtime_configuration.asyncio_loop) + + async def create_array_async(self, path: str, **kwargs) -> Array: + runtime_configuration = kwargs.pop("runtime_configuration", self.runtime_configuration) + return await Array.create_async( + self.store_path / path, + runtime_configuration=runtime_configuration, + **kwargs, + ) + + def create_array(self, path: str, **kwargs) -> Array: + return sync( + self.create_array_async(path, **kwargs), + self.runtime_configuration.asyncio_loop, + ) + + async def update_attributes_async(self, new_attributes: Dict[str, Any]) -> Group: + new_metadata = evolve(self.metadata, attributes=new_attributes) + + # Write new metadata + await (self.store_path / ZARR_JSON).set_async(new_metadata.to_bytes()) + return evolve(self, metadata=new_metadata) + + def update_attributes(self, new_attributes: Dict[str, Any]) -> Group: + return sync( + self.update_attributes_async(new_attributes), + self.runtime_configuration.asyncio_loop, + ) + + def __repr__(self): + return f"<Group {self.store_path}>" diff --git a/zarr/v3/group/group_v2.py b/zarr/v3/group/group_v2.py new file mode 100644 index 0000000000..8d74156d11 --- /dev/null +++ b/zarr/v3/group/group_v2.py @@ -0,0 +1,218 @@ +from __future__ import annotations + +import asyncio +import json +from typing import TYPE_CHECKING, Any, Dict, Literal, Optional, Union + +from attr import asdict, evolve, frozen + +from zarr.v3.array.v2 import Array +from zarr.v3.common import ZARRAY_JSON, ZATTRS_JSON, ZGROUP_JSON, Attributes, make_cattr +from zarr.v3.common import RuntimeConfiguration +from zarr.v3.store import StoreLike, StorePath, make_store_path +from zarr.v3.sync import sync + +if TYPE_CHECKING: + from zarr.v3.group.group import Group + + +@frozen +class GroupV2Metadata: + zarr_format: Literal[2] = 2 + + def to_bytes(self) -> bytes: + return json.dumps(asdict(self)).encode() + + @classmethod + def from_json(cls, zarr_json: Any) -> GroupV2Metadata: + return make_cattr().structure(zarr_json, cls) + + +@frozen +class GroupV2: + metadata: GroupV2Metadata + store_path: StorePath + runtime_configuration: RuntimeConfiguration + attributes: Optional[Dict[str, Any]] = None + + @classmethod + async def create_async( + cls, + store: StoreLike, + *, + attributes: Optional[Dict[str, Any]] = None, + exists_ok: bool = False, + runtime_configuration: RuntimeConfiguration = RuntimeConfiguration(), + ) -> GroupV2: + store_path = make_store_path(store) + if not exists_ok: + assert not await (store_path / ZGROUP_JSON).exists_async() + group = cls( + metadata=GroupV2Metadata(), + attributes=attributes, + store_path=store_path, + runtime_configuration=runtime_configuration, + ) + await group._save_metadata() + return group + + @classmethod + def create( + cls, + store: StoreLike, + *, + attributes: Optional[Dict[str, Any]] = None, + exists_ok: bool = False, + runtime_configuration: RuntimeConfiguration = RuntimeConfiguration(), + ) -> GroupV2: + return sync( + cls.create_async( + store, + attributes=attributes, + exists_ok=exists_ok, + runtime_configuration=runtime_configuration, + ), + runtime_configuration.asyncio_loop if runtime_configuration else None, + ) + + @classmethod + async def open_async( + cls, + store: StoreLike, + runtime_configuration: RuntimeConfiguration = RuntimeConfiguration(), + ) -> GroupV2: + store_path = make_store_path(store) + zgroup_bytes = await (store_path / ZGROUP_JSON).get_async() + assert zgroup_bytes is not None + zattrs_bytes = await (store_path / ZATTRS_JSON).get_async() +
metadata = json.loads(zgroup_bytes) + attributes = json.loads(zattrs_bytes) if zattrs_bytes is not None else None + + return cls.from_json( + store_path, + metadata, + runtime_configuration, + attributes, + ) + + @classmethod + def open( + cls, + store_path: StorePath, + runtime_configuration: RuntimeConfiguration = RuntimeConfiguration(), + ) -> GroupV2: + return sync( + cls.open_async(store_path, runtime_configuration), + runtime_configuration.asyncio_loop, + ) + + @classmethod + def from_json( + cls, + store_path: StorePath, + zarr_json: Any, + runtime_configuration: RuntimeConfiguration, + attributes: Optional[Dict[str, Any]] = None, + ) -> GroupV2: + group = cls( + metadata=GroupV2Metadata.from_json(zarr_json), + store_path=store_path, + runtime_configuration=runtime_configuration, + attributes=attributes, + ) + return group + + @staticmethod + async def open_or_array( + store: StoreLike, + runtime_configuration: RuntimeConfiguration = RuntimeConfiguration(), + ) -> Union[Array, GroupV2]: + store_path = make_store_path(store) + zgroup_bytes, zattrs_bytes = await asyncio.gather( + (store_path / ZGROUP_JSON).get_async(), + (store_path / ZATTRS_JSON).get_async(), + ) + attributes = json.loads(zattrs_bytes) if zattrs_bytes is not None else None + if zgroup_bytes is not None: + return GroupV2.from_json( + store_path, json.loads(zgroup_bytes), runtime_configuration, attributes + ) + zarray_bytes = await (store_path / ZARRAY_JSON).get_async() + if zarray_bytes is not None: + return Array.from_json( + store_path, json.loads(zarray_bytes), attributes, runtime_configuration + ) + raise KeyError + + async def _save_metadata(self) -> None: + await (self.store_path / ZGROUP_JSON).set_async(self.metadata.to_bytes()) + if self.attributes is not None and len(self.attributes) > 0: + await (self.store_path / ZATTRS_JSON).set_async( + json.dumps(self.attributes).encode(), + ) + else: + await (self.store_path / ZATTRS_JSON).delete_async() + + async def get_async(self, path: str) -> Union[Array, GroupV2]: + return await self.__class__.open_or_array( + self.store_path / path, self.runtime_configuration + ) + + def __getitem__(self, path: str) -> Union[Array, GroupV2]: + return sync(self.get_async(path), self.runtime_configuration.asyncio_loop) + + async def create_group_async(self, path: str, **kwargs) -> GroupV2: + runtime_configuration = kwargs.pop("runtime_configuration", self.runtime_configuration) + return await self.__class__.create_async( + self.store_path / path, + runtime_configuration=runtime_configuration, + **kwargs, + ) + + def create_group(self, path: str, **kwargs) -> GroupV2: + return sync(self.create_group_async(path, **kwargs), self.runtime_configuration.asyncio_loop) + + async def create_array_async(self, path: str, **kwargs) -> Array: + runtime_configuration = kwargs.pop("runtime_configuration", self.runtime_configuration) + return await Array.create_async( + self.store_path / path, + runtime_configuration=runtime_configuration, + **kwargs, + ) + + def create_array(self, path: str, **kwargs) -> Array: + return sync( + self.create_array_async(path, **kwargs), + self.runtime_configuration.asyncio_loop, + ) + + async def convert_to_v3_async(self) -> Group: + from zarr.v3.common import ZARR_JSON + from zarr.v3.group.group import Group, GroupMetadata + + new_metadata = GroupMetadata(attributes=self.attributes or {}) + new_metadata_bytes = new_metadata.to_bytes() + + await (self.store_path / ZARR_JSON).set_async(new_metadata_bytes) + + return Group.from_json( + store_path=self.store_path, +
zarr_json=json.loads(new_metadata_bytes), + runtime_configuration=self.runtime_configuration, + ) + + async def update_attributes_async(self, new_attributes: Dict[str, Any]) -> GroupV2: + await (self.store_path / ZATTRS_JSON).set_async(json.dumps(new_attributes).encode()) + return evolve(self, attributes=new_attributes) + + def update_attributes(self, new_attributes: Dict[str, Any]) -> GroupV2: + return sync( + self.update_attributes_async(new_attributes), + self.runtime_configuration.asyncio_loop, + ) + + def convert_to_v3(self) -> Group: + return sync(self.convert_to_v3_async(), loop=self.runtime_configuration.asyncio_loop) + + def __repr__(self): + return f"<Group_v2 {self.store_path}>" diff --git a/zarr/v3/group_v2.py b/zarr/v3/group_v2.py index 3b1a369ae2..7d3af58a2a 100644 --- a/zarr/v3/group_v2.py +++ b/zarr/v3/group_v2.py @@ -6,11 +6,12 @@ from attr import asdict, evolve, frozen -from zarr.v3.array_v2 import ArrayV2 +from zarr.v3.array.v2 import AsyncArray from zarr.v3.common import ZARRAY_JSON, ZATTRS_JSON, ZGROUP_JSON, make_cattr -from zarr.v3.metadata import RuntimeConfiguration +from zarr.v3.common import RuntimeConfiguration from zarr.v3.store import StoreLike, StorePath, make_store_path from zarr.v3.sync import sync +from zarr.v3.types import Attributes if TYPE_CHECKING: from zarr.v3.group import Group @@ -33,14 +34,14 @@ class GroupV2: metadata: GroupV2Metadata store_path: StorePath runtime_configuration: RuntimeConfiguration - attributes: Optional[Dict[str, Any]] = None + attributes: Optional[Attributes] = None @classmethod async def create_async( cls, store: StoreLike, *, - attributes: Optional[Dict[str, Any]] = None, + attributes: Optional[Attributes] = None, exists_ok: bool = False, runtime_configuration: RuntimeConfiguration = RuntimeConfiguration(), ) -> GroupV2: @@ -61,7 +62,7 @@ def create( cls, store: StoreLike, *, - attributes: Optional[Dict[str, Any]] = None, + attributes: Optional[Attributes] = None, exists_ok: bool = False, runtime_configuration: RuntimeConfiguration = RuntimeConfiguration(), ) -> GroupV2: @@ -112,7 +113,7 @@ def from_json( store_path: StorePath, zarr_json: Any, runtime_configuration: RuntimeConfiguration, - attributes: Optional[Dict[str, Any]] = None, + attributes: Optional[Attributes] = None, ) -> GroupV2: group = cls( metadata=GroupV2Metadata.from_json(zarr_json), @@ -126,7 +127,7 @@ async def open_or_array( store: StoreLike, runtime_configuration: RuntimeConfiguration = RuntimeConfiguration(), - ) -> Union[ArrayV2, GroupV2]: + ) -> Union[AsyncArray, GroupV2]: store_path = make_store_path(store) zgroup_bytes, zattrs_bytes = await asyncio.gather( (store_path / ZGROUP_JSON).get_async(), @@ -139,7 +140,7 @@ async def open_or_array( ) zarray_bytes = await (store_path / ZARRAY_JSON).get_async() if zarray_bytes is not None: - return ArrayV2.from_json( + return AsyncArray.from_json( store_path, json.loads(zarray_bytes), attributes, runtime_configuration ) raise KeyError @@ -153,12 +154,12 @@ async def _save_metadata(self) -> None: else: await (self.store_path / ZATTRS_JSON).delete_async() - async def get_async(self, path: str) -> Union[ArrayV2, GroupV2]: + async def get_async(self, path: str) -> Union[AsyncArray, GroupV2]: return await self.__class__.open_or_array( self.store_path / path, self.runtime_configuration ) - def __getitem__(self, path: str) -> Union[ArrayV2, GroupV2]: + def __getitem__(self, path: str) -> Union[AsyncArray, GroupV2]: return sync(self.get_async(path), self.runtime_configuration.asyncio_loop) async def create_group_async(self, path: str,
**kwargs) -> GroupV2: @@ -172,15 +173,15 @@ async def create_group_async(self, path: str, **kwargs) -> GroupV2: def create_group(self, path: str, **kwargs) -> GroupV2: return sync(self.create_group_async(path), self.runtime_configuration.asyncio_loop) - async def create_array_async(self, path: str, **kwargs) -> ArrayV2: + async def create_array_async(self, path: str, **kwargs) -> AsyncArray: runtime_configuration = kwargs.pop("runtime_configuration", self.runtime_configuration) - return await ArrayV2.create_async( + return await AsyncArray.create( self.store_path / path, runtime_configuration=runtime_configuration, **kwargs, ) - def create_array(self, path: str, **kwargs) -> ArrayV2: + def create_array(self, path: str, **kwargs) -> AsyncArray: return sync( self.create_array_async(path, **kwargs), self.runtime_configuration.asyncio_loop, @@ -201,11 +202,11 @@ async def convert_to_v3_async(self) -> Group: runtime_configuration=self.runtime_configuration, ) - async def update_attributes_async(self, new_attributes: Dict[str, Any]) -> GroupV2: + async def update_attributes_async(self, new_attributes: Attributes) -> GroupV2: await (self.store_path / ZATTRS_JSON).set_async(json.dumps(new_attributes).encode()) return evolve(self, attributes=new_attributes) - def update_attributes(self, new_attributes: Dict[str, Any]) -> GroupV2: + def update_attributes(self, new_attributes: Attributes) -> GroupV2: return sync( self.update_attributes_async(new_attributes), self.runtime_configuration.asyncio_loop, diff --git a/zarr/v3/metadata.py b/zarr/v3/metadata.py index 1fc43b19f0..88589efe97 100644 --- a/zarr/v3/metadata.py +++ b/zarr/v3/metadata.py @@ -3,10 +3,11 @@ import json from asyncio import AbstractEventLoop from enum import Enum -from typing import Any, Dict, List, Literal, Optional, Tuple, Union +from typing import Any, Dict, List, Literal, Optional, Protocol, Tuple, Union import numpy as np from attr import asdict, field, frozen +from zarr.v3.common import ChunkMetadata from zarr.v3.common import ChunkCoords, make_cattr @@ -142,107 +143,19 @@ def encode_chunk_key(self, chunk_coords: ChunkCoords) -> str: ChunkKeyEncodingMetadata = Union[DefaultChunkKeyEncodingMetadata, V2ChunkKeyEncodingMetadata] -BloscShuffle = Literal["noshuffle", "shuffle", "bitshuffle"] - - -@frozen -class BloscCodecConfigurationMetadata: - typesize: int - cname: Literal["lz4", "lz4hc", "blosclz", "zstd", "snappy", "zlib"] = "zstd" - clevel: int = 5 - shuffle: BloscShuffle = "noshuffle" - blocksize: int = 0 - - -blosc_shuffle_int_to_str: Dict[int, BloscShuffle] = { - 0: "noshuffle", - 1: "shuffle", - 2: "bitshuffle", -} - - -@frozen -class BloscCodecMetadata: - configuration: BloscCodecConfigurationMetadata - name: Literal["blosc"] = "blosc" - - -@frozen -class BytesCodecConfigurationMetadata: - endian: Optional[Literal["big", "little"]] = "little" - - -@frozen -class BytesCodecMetadata: - configuration: BytesCodecConfigurationMetadata - name: Literal["bytes"] = "bytes" - - -@frozen -class TransposeCodecConfigurationMetadata: - order: Union[Literal["C", "F"], Tuple[int, ...]] = "C" - - -@frozen -class TransposeCodecMetadata: - configuration: TransposeCodecConfigurationMetadata - name: Literal["transpose"] = "transpose" - - -@frozen -class GzipCodecConfigurationMetadata: - level: int = 5 - - -@frozen -class GzipCodecMetadata: - configuration: GzipCodecConfigurationMetadata - name: Literal["gzip"] = "gzip" - - -@frozen -class ZstdCodecConfigurationMetadata: - level: int = 0 - checksum: bool = False - - -@frozen -class 
ZstdCodecMetadata: - configuration: ZstdCodecConfigurationMetadata - name: Literal["zstd"] = "zstd" - - -@frozen -class Crc32cCodecMetadata: - name: Literal["crc32c"] = "crc32c" - - -@frozen -class ShardingCodecConfigurationMetadata: - chunk_shape: ChunkCoords - codecs: List["CodecMetadata"] - index_codecs: List["CodecMetadata"] - - -@frozen -class ShardingCodecMetadata: - configuration: ShardingCodecConfigurationMetadata - name: Literal["sharding_indexed"] = "sharding_indexed" +class CodecMetadata(Protocol): + @property + def name(self) -> str: + pass -CodecMetadata = Union[ - BloscCodecMetadata, - BytesCodecMetadata, - TransposeCodecMetadata, - GzipCodecMetadata, - ZstdCodecMetadata, - ShardingCodecMetadata, - Crc32cCodecMetadata, -] +class ShardingCodecIndexLocation(Enum): + start = "start" + end = "end" -@frozen -class CoreArrayMetadata: +""" @frozen +class ChunkMetadata: shape: ChunkCoords chunk_shape: ChunkCoords data_type: DataType @@ -256,7 +169,8 @@ def dtype(self) -> np.dtype: @property def ndim(self) -> int: return len(self.shape) - + """ +""" @frozen class ArrayMetadata: @@ -279,8 +193,8 @@ def dtype(self) -> np.dtype: def ndim(self) -> int: return len(self.shape) - def get_core_metadata(self, runtime_configuration: RuntimeConfiguration) -> CoreArrayMetadata: - return CoreArrayMetadata( + def get_core_metadata(self, runtime_configuration: RuntimeConfiguration) -> ChunkMetadata: + return ChunkMetadata( shape=self.shape, chunk_shape=self.chunk_grid.configuration.chunk_shape, data_type=self.data_type, @@ -290,7 +204,7 @@ def get_core_metadata(self, runtime_configuration: RuntimeConfiguration) -> Core def to_bytes(self) -> bytes: def _json_convert(o): - if isinstance(o, DataType): + if isinstance(o, Enum): return o.name raise TypeError @@ -337,3 +251,4 @@ def _json_convert(o): @classmethod def from_json(cls, zarr_json: Any) -> ArrayV2Metadata: return make_cattr().structure(zarr_json, cls) + """ diff --git a/zarr/v3/metadata/__init__.py b/zarr/v3/metadata/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/zarr/v3/metadata/v2.py b/zarr/v3/metadata/v2.py new file mode 100644 index 0000000000..f8ed352aba --- /dev/null +++ b/zarr/v3/metadata/v2.py @@ -0,0 +1,151 @@ +""" +Models for objects described in zarr version 2 +""" + +from dataclasses import dataclass +import json +from typing import Any, Dict, List, Literal, Optional, Tuple, Union +from numcodecs.abc import Codec +import numpy as np + +from zarr.v3.types import Attributes + +from typing import TypedDict + +V2CodecDict = Dict[str, Attributes] + + +class ArrayMetadataDict(TypedDict): + shape: Tuple[int, ...] + dtype: np.dtype + chunks: tuple[int, ...] + fill_value: Any + filters: Optional[list[V2CodecDict]] + compressor: V2CodecDict + zarr_format: Literal["2"] + + +class ArrayMetadata: + """ + A representation of v2 array metadata with no behavior besides + input validation and to / from JSON serialization + """ + + shape: Tuple[int, ...] + dtype: np.dtype + chunks: Tuple[int, ...] + fill_value: Any + filters: Optional[List[Codec]] + dimension_separator: Literal["/", "."] + order: Literal["C", "F"] + compressor: Optional[Codec] + zarr_format: Literal[2] = 2 + + def __init__( + self, shape, dtype, chunks, fill_value, filters, dimension_separator, order, compressor + ): + """ + The only thing we need to do here is validate inputs. 
+ """ + self.shape = parse_shape(shape) + self.dtype = parse_data_type(dtype) + self.chunks = parse_chunks(chunks) + self.fill_value = parse_fill_value(fill_value) + self.order = parse_order(order) + self.compressor = parse_compressor(compressor) + self.filters = parse_filters(filters) + self.dimension_separator = parse_dimension_separator(dimension_separator) + + self = parse_metadata(self) + + @classmethod + def from_json(cls, json: bytes) -> "ArrayMetadata": + ... + + def to_dict(self) -> ArrayMetadataDict: + if self.compressor is not None: + compressor = self.compressor.get_config() + else: + compressor = self.compressor + if self.filters is not None: + filters = [f.get_config() for f in self.filters] + else: + filters = self.filters + return { + "shape": self.shape, + "dtype": self.dtype.str, + "chunks": self.chunks, + "fill_value": self.fill_value, + "order": self.order, + "compressor": compressor, + "filters": filters, + "dimension_separator": self.dimension_separator, + } + + def to_json(self) -> bytes: + return json.dumps(self.to_dict()).encode() + + +@dataclass(frozen=True) +class GroupMetadata: + attrs: Attributes + + @classmethod + def from_json(cls, json: bytes) -> "GroupMetadata": + ... + + def to_json(self) -> bytes: + ... + + +def from_json(blob: bytes) -> Union[ArrayMetadata, GroupMetadata]: + """The class methods can very lightly wrap this function""" + ... + + +def to_json(obj: Union[ArrayMetadata, GroupMetadata]) -> bytes: + """The class methods can very lightly wrap this function""" + ... + + +def parse_shape(shape: Any) -> Tuple[int, ...]: + return shape + + +def parse_data_type(data_type: Any) -> np.dtype: + return data_type + + +def parse_chunks(chunks: Any) -> Tuple[int, ...]: + return chunks + + +def parse_order(order: Any) -> Literal["C", "F"]: + return order + + +def parse_fill_value(fill_value: Any) -> Any: + return fill_value + + +def parse_compressor(compressor: Any) -> Codec: + return compressor + + +def parse_filters(filters: Any) -> Optional[List[Codec]]: + return filters + + +def parse_dimension_separator(dimension_separator: Any) -> Literal["/", "."]: + return dimension_separator + + +def parse_attrs(attrs: Any) -> Attributes: + return attrs + + +def parse_metadata(metadata: ArrayMetadata): + """ + Check that all properties are consistent + """ + return metadata diff --git a/zarr/v3/metadata/v3.py b/zarr/v3/metadata/v3.py new file mode 100644 index 0000000000..3349da0880 --- /dev/null +++ b/zarr/v3/metadata/v3.py @@ -0,0 +1,296 @@ +""" +Models for objects described in zarr version 3 +""" + +from dataclasses import dataclass +import json +from typing import ( + Any, + Dict, + Literal, + Optional, + Protocol, + Tuple, + TypedDict, + Union, + runtime_checkable, +) + +import numpy as np +import attr + +from zarr.v3.types import Attributes + + +class NamedConfigDict(TypedDict): + name: str + configuration: Attributes + + +# not clear how useful these protocols are, but lets try it +@runtime_checkable +class NamedConfig(Protocol): + name: str + configuration: Any + + +@runtime_checkable +class CodecMetadata(Protocol): + name: str + + +class RegularChunkGridConfigDict(TypedDict): + chunk_shape: tuple[int, ...] + + +class RegularChunkGridConfig: + chunk_shape: Tuple[int, ...] 
+ + def __init__(self, chunk_shape) -> None: + self.chunk_shape = chunk_shape + + def to_dict(self) -> RegularChunkGridConfigDict: + return {"chunk_shape": self.chunk_shape} + + +class RegularChunkGridDict(TypedDict): + configuration: RegularChunkGridConfigDict + name: str + + +class RegularChunkGrid(NamedConfig): + configuration: RegularChunkGridConfig + name: Literal["regular"] = "regular" + + def __init__(self, configuration: RegularChunkGridConfig) -> None: + self.configuration = configuration + self.name = "regular" + + def to_dict(self) -> RegularChunkGridDict: + return {"configuration": self.configuration.to_dict(), "name": self.name} + + +class DefaultChunkKeyConfigDict(TypedDict): + separator: Literal[".", "/"] + + +class DefaultChunkKeyConfig: + separator: Literal[".", "/"] + + def __init__(self, *, separator: Literal[".", "/"] = "/") -> None: + self.separator = parse_dimension_separator(separator) + + def to_dict(self) -> DefaultChunkKeyConfigDict: + return {"separator": self.separator} + + +def parse_dimension_separator(separator: Any) -> Literal[".", "/"]: + if separator not in (".", "/"): + raise ValueError + return separator + + +class DefaultChunkKeyEncodingDict(TypedDict): + configuration: DefaultChunkKeyConfigDict + name: Literal["default", "v2"] + + +class DefaultChunkKeyEncoding(NamedConfig): + configuration: DefaultChunkKeyConfig + name: Literal["default", "v2"] + + def __init__(self, *, configuration=DefaultChunkKeyConfig(), name="default") -> None: + self.configuration = configuration + self.name = name + + def to_dict(self) -> DefaultChunkKeyEncodingDict: + return {"configuration": self.configuration.to_dict(), "name": self.name} + + +class V2ChunkKeyEncodingDict(TypedDict): + configuration: DefaultChunkKeyConfigDict + name: Literal["v2"] + + +class V2ChunkKeyEncoding(NamedConfig): + configuration: DefaultChunkKeyConfig = DefaultChunkKeyConfig() + name: Literal["v2"] = "v2" + + def __init__(self, configuration: DefaultChunkKeyConfig) -> None: + self.configuration = configuration + self.name = "v2" + + def to_dict(self) -> V2ChunkKeyEncodingDict: + return {"configuration": self.configuration.to_dict(), "name": self.name} + + +ChunkKeyEncoding = Union[DefaultChunkKeyEncoding, V2ChunkKeyEncoding] + + +class _ArrayMetadataDictBase(TypedDict): + """ + This is a private base class with all the required attributes. + Because `dimension_names` is an optional attribute, we need a subclass to express this. + See https://peps.python.org/pep-0655/ for a cleaner way + """ + + shape: Tuple[int, ...] + data_type: str + chunk_grid: RegularChunkGridDict + chunk_key_encoding: Union[DefaultChunkKeyEncodingDict, V2ChunkKeyEncodingDict] + fill_value: Any + codecs: list[NamedConfigDict] + zarr_format: Literal[3] + node_type: Literal["array"] + + +class ArrayMetadataDict(_ArrayMetadataDictBase, total=False): + """ + This inherits from a private base class with all the required attributes. + Because `dimension_names` is an optional attribute, we need a subclass to express this. + See https://peps.python.org/pep-0655/ for a cleaner way + """ + + dimension_names: list[str] + + +class ArrayMetadata: + """ + A representation of v3 array metadata with no behavior besides + input validation and to / from JSON serialization + """ + + shape: Tuple[int, ...]
+ data_type: np.dtype + chunk_grid: RegularChunkGrid + chunk_key_encoding: Union[DefaultChunkKeyEncoding, V2ChunkKeyEncoding] + fill_value: Any + codecs: list[CodecMetadata] + dimension_names: Optional[Tuple[str, ...]] + zarr_format: Literal[3] = 3 + node_type: Literal["array"] = "array" + + def __init__( + self, + *, + shape, + data_type, + chunk_grid, + chunk_key_encoding, + fill_value, + codecs, + dimension_names: Optional[Tuple[str, ...]] = None, + ): + """ + The only thing we need to do here is validate inputs. + """ + self.shape = parse_shape(shape) + self.data_type = parse_data_type(data_type) + self.chunk_grid = parse_chunk_grid(chunk_grid) + self.chunk_key_encoding = parse_chunk_key_encoding(chunk_key_encoding) + self.fill_value = parse_fill_value(fill_value) + self.codecs = parse_codecs(codecs) + self.dimension_names = parse_dimension_names(dimension_names) + self = parse_array_metadata(self) + + def to_dict(self) -> ArrayMetadataDict: + self_dict: ArrayMetadataDict = { + "shape": self.shape, + "data_type": self.data_type.str, + "chunk_grid": self.chunk_grid.to_dict(), + "fill_value": self.fill_value, + "chunk_key_encoding": self.chunk_key_encoding.to_dict(), + "codecs": [codec.to_dict() for codec in self.codecs], + "node_type": "array", + "zarr_format": 3, + } + if self.dimension_names is not None: + # dimension names cannot be null in JSON + self_dict["dimension_names"] = self.dimension_names + return self_dict + + def to_json(self) -> bytes: + return json.dumps(self.to_dict()).encode() + + @classmethod + def from_json(cls, json: bytes) -> "ArrayMetadata": + ... + + +class GroupMetadata: + @classmethod + def from_json(cls, json: bytes) -> "GroupMetadata": + ... + + def to_json(self) -> bytes: + ... + + +def from_json(blob: bytes) -> Union[ArrayMetadata, GroupMetadata]: + """The class methods can very lightly wrap this function""" + ... + + +def to_json(obj: Union[ArrayMetadata, GroupMetadata]) -> bytes: + """The class methods can very lightly wrap this function""" + ...
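+ + +# Reviewer sketch, not part of this patch: once `from_json` and the `parse_*` +# validators below are fleshed out, the intended metadata round-trip would look +# roughly like the following (the concrete values here are hypothetical): +# +# meta = ArrayMetadata( +# shape=(10,), +# data_type=np.dtype("uint8"), +# chunk_grid=RegularChunkGrid(configuration=RegularChunkGridConfig(chunk_shape=(10,))), +# chunk_key_encoding=DefaultChunkKeyEncoding(), +# fill_value=0, +# codecs=[], +# ) +# assert from_json(meta.to_json()).to_dict() == meta.to_dict()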
+ + +def parse_shape(shape: Any) -> Tuple[int, ...]: + return shape + + +def parse_data_type(data_type: Any) -> Any: + return data_type + + +def parse_chunk_grid(chunk_grid: Any) -> RegularChunkGrid: + return chunk_grid + + +def parse_chunk_key_encoding(chunk_key_encoding: Any) -> DefaultChunkKeyEncoding: + return chunk_key_encoding + + +def parse_fill_value(fill_value: Any) -> Any: + return fill_value + + +def parse_codecs(codecs: Any) -> list[CodecMetadata]: + return codecs + + +def parse_dimension_names(dimension_names: Optional[tuple[str, ...]]): + return dimension_names + + +def parse_array_metadata(metadata: ArrayMetadata): + """ + Check that all properties are consistent + """ + # todo: check that dimensional attributes like shape and dimension_names are consistent + return metadata + + +dtype_to_data_type = { + "|b1": "bool", + "bool": "bool", + "|i1": "int8", + "<i2": "int16", + "<i4": "int32", + "<i8": "int64", + "|u1": "uint8", + "<u2": "uint16", + "<u4": "uint32", + "<u8": "uint64", + "<f4": "float32", + "<f8": "float64", +} + + +def byte_count(dtype: np.dtype) -> int: + return dtype.itemsize + + +def to_numpy_shortname(dtype: np.dtype) -> str: + return dtype.str.lstrip("|").lstrip("^").lstrip("<").lstrip(">") diff --git a/zarr/v3/store.py b/zarr/v3/store.py index f7472c68d2..0c553700fb 100644 --- a/zarr/v3/store.py +++ b/zarr/v3/store.py @@ -10,9 +10,10 @@ import asyncio import io from pathlib import Path -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, Dict, List, MutableMapping, Optional, Tuple, Union -from zarr.v3.common import BytesLike, to_thread +from zarr.v3.common import to_thread +from zarr.v3.types import BytesLike if TYPE_CHECKING: from upath import UPath @@ -284,6 +285,53 @@ def __repr__(self) -> str: return f"RemoteStore({repr(str(self))})" +class MemoryStore(Store): + supports_partial_writes = True + store_dict: MutableMapping[str, bytes] + + def __init__(self, store_dict: Optional[MutableMapping[str, bytes]] = None): + self.store_dict = store_dict or {} + + async def get_async( + self, key: str, byte_range: Optional[Tuple[int, Optional[int]]] = None + ) -> Optional[BytesLike]: + assert isinstance(key, str) + try: + value = self.store_dict[key] + if byte_range is not None: + value = value[byte_range[0] : byte_range[1]] + return value + except KeyError: + return None + + async def set_async( + self, key: str, value: BytesLike, byte_range: Optional[Tuple[int, int]] = None + ) -> None: + assert isinstance(key, str) + + if byte_range is not None: + buf = bytearray(self.store_dict[key]) + buf[byte_range[0] : byte_range[1]] = value + self.store_dict[key] = buf + else: + self.store_dict[key] = value + + async def delete_async(self, key: str) -> None: + try: + del self.store_dict[key] + except KeyError: + pass + + async def exists_async(self, key: str) -> bool: + return key in self.store_dict + + def __str__(self) -> str: + return f"memory://{id(self.store_dict)}" + + def __repr__(self) -> str: + return f"MemoryStore({repr(str(self))})" + + StoreLike = Union[Store, StorePath, Path, str] diff --git a/zarr/v3/types.py b/zarr/v3/types.py new file mode 100644 index 0000000000..791dd218e2 --- /dev/null +++ b/zarr/v3/types.py @@ -0,0 +1,9 @@ +from typing import Dict, List, Tuple, Union + +BytesLike = Union[bytes, bytearray, memoryview] +ChunkCoords = Tuple[int, ...] +SliceSelection = Tuple[slice, ...] +Selection = Union[slice, SliceSelection] +Attributes = Dict[ + str, Union[Dict[str, "Attributes"], List["Attributes"], str, int, float, bool, None] +]
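For reviewers, a minimal sketch of the byte-range semantics implemented by the new MemoryStore in zarr/v3/store.py: get_async slices the stored value with absolute (start, stop) offsets, and set_async patches bytes in place because supports_partial_writes is True. The demo coroutine and the key name below are illustrative only, not part of the patch:

import asyncio

from zarr.v3.store import MemoryStore


async def demo() -> None:
    store = MemoryStore()
    await store.set_async("attrs/0", b"hello, zarr!")

    # byte_range is an absolute (start, stop) slice into the stored value
    assert await store.get_async("attrs/0", byte_range=(0, 5)) == b"hello"

    # a partial write patches bytes in place rather than replacing the value
    await store.set_async("attrs/0", b"HELLO", byte_range=(0, 5))
    assert await store.get_async("attrs/0") == b"HELLO, zarr!"

    await store.delete_async("attrs/0")
    assert not await store.exists_async("attrs/0")


asyncio.run(demo())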