diff --git a/pyproject.toml b/pyproject.toml index 886cd5a0bc..effcac83ed 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -273,6 +273,7 @@ filterwarnings = [ "ignore:PY_SSIZE_T_CLEAN will be required.*:DeprecationWarning", "ignore:The loop argument is deprecated since Python 3.8.*:DeprecationWarning", "ignore:Creating a zarr.buffer.gpu.*:UserWarning", + "ignore:Duplicate name:UserWarning", # from ZipFile ] markers = [ "gpu: mark a test as requiring CuPy and GPU" diff --git a/src/zarr/core/array_spec.py b/src/zarr/core/array_spec.py index e64a962bc3..1a251a0a4b 100644 --- a/src/zarr/core/array_spec.py +++ b/src/zarr/core/array_spec.py @@ -3,7 +3,7 @@ from dataclasses import dataclass from typing import TYPE_CHECKING, Any, Literal -from zarr.core.common import parse_dtype, parse_fill_value, parse_order, parse_shapelike +from zarr.core.common import parse_fill_value, parse_order, parse_shapelike if TYPE_CHECKING: import numpy as np @@ -29,12 +29,11 @@ def __init__( prototype: BufferPrototype, ) -> None: shape_parsed = parse_shapelike(shape) - dtype_parsed = parse_dtype(dtype) fill_value_parsed = parse_fill_value(fill_value) order_parsed = parse_order(order) object.__setattr__(self, "shape", shape_parsed) - object.__setattr__(self, "dtype", dtype_parsed) + object.__setattr__(self, "dtype", dtype) object.__setattr__(self, "fill_value", fill_value_parsed) object.__setattr__(self, "order", order_parsed) object.__setattr__(self, "prototype", prototype) diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index 8ebe5160bd..6847bd419f 100644 --- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -19,8 +19,6 @@ if TYPE_CHECKING: from collections.abc import Awaitable, Callable, Iterator -import numpy as np -import numpy.typing as npt ZARR_JSON = "zarr.json" ZARRAY_JSON = ".zarray" @@ -155,11 +153,6 @@ def parse_shapelike(data: int | Iterable[int]) -> tuple[int, ...]: return data_tuple -def parse_dtype(data: npt.DTypeLike) -> np.dtype[Any]: - # todo: real validation - return np.dtype(data) - - def parse_fill_value(data: Any) -> Any: # todo: real validation return data diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index af7821bea7..34bdbb537f 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -1,6 +1,7 @@ from __future__ import annotations from collections.abc import Iterable +from enum import Enum from typing import TYPE_CHECKING if TYPE_CHECKING: @@ -21,7 +22,7 @@ from zarr.core.array_spec import ArraySpec from zarr.core.chunk_grids import RegularChunkGrid from zarr.core.chunk_key_encodings import parse_separator -from zarr.core.common import ZARRAY_JSON, ZATTRS_JSON, parse_dtype, parse_shapelike +from zarr.core.common import ZARRAY_JSON, ZATTRS_JSON, parse_shapelike from zarr.core.config import config, parse_indexing_order from zarr.core.metadata.common import ArrayMetadata, parse_attributes @@ -100,9 +101,24 @@ def _json_convert( else: return o.descr if np.isscalar(o): - # convert numpy scalar to python type, and pass - # python types through - return getattr(o, "item", lambda: o)() + out: Any + if hasattr(o, "dtype") and o.dtype.kind == "M" and hasattr(o, "view"): + # https://github.com/zarr-developers/zarr-python/issues/2119 + # `.item()` on a datetime type might or might not return an + # integer, depending on the value. + # Explicitly cast to an int first, and then grab .item() + out = o.view("i8").item() + else: + # convert numpy scalar to python type, and pass + # python types through + out = getattr(o, "item", lambda: o)() + if isinstance(out, complex): + # python complex types are not JSON serializable, so we use the + # serialization defined in the zarr v3 spec + return [out.real, out.imag] + return out + if isinstance(o, Enum): + return o.name raise TypeError zarray_dict = self.to_dict() @@ -157,6 +173,11 @@ def update_attributes(self, attributes: dict[str, JSON]) -> Self: return replace(self, attributes=attributes) +def parse_dtype(data: npt.DTypeLike) -> np.dtype[Any]: + # todo: real validation + return np.dtype(data) + + def parse_zarr_format(data: object) -> Literal[2]: if data == 2: return 2 diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 10047cbb93..34a33103c6 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -24,7 +24,7 @@ from zarr.core.buffer import default_buffer_prototype from zarr.core.chunk_grids import ChunkGrid, RegularChunkGrid from zarr.core.chunk_key_encodings import ChunkKeyEncoding -from zarr.core.common import ZARR_JSON, parse_dtype, parse_named_configuration, parse_shapelike +from zarr.core.common import ZARR_JSON, parse_named_configuration, parse_shapelike from zarr.core.config import config from zarr.core.metadata.common import ArrayMetadata, parse_attributes from zarr.registry import get_codec_class @@ -215,6 +215,10 @@ def from_dict(cls, data: dict[str, JSON]) -> Self: # check that the node_type attribute is correct _ = parse_node_type_array(_data.pop("node_type")) + # check that the data_type attribute is valid + if _data["data_type"] not in DataType: + raise ValueError(f"Invalid V3 data_type: {_data['data_type']}") + # dimension_names key is optional, normalize missing to `None` _data["dimension_names"] = _data.pop("dimension_names", None) # attributes key is optional, normalize missing to `None` @@ -328,7 +332,17 @@ def parse_fill_value( raise ValueError(msg) msg = f"Cannot parse non-string sequence {fill_value} as a scalar with type {dtype}." raise TypeError(msg) - return dtype.type(fill_value) # type: ignore[arg-type] + + # Cast the fill_value to the given dtype + try: + casted_value = np.dtype(dtype).type(fill_value) + except (ValueError, OverflowError, TypeError) as e: + raise ValueError(f"fill value {fill_value!r} is not valid for dtype {dtype}") from e + # Check if the value is still representable by the dtype + if fill_value != casted_value: + raise ValueError(f"fill value {fill_value!r} is not valid for dtype {dtype}") + + return casted_value # For type checking @@ -345,8 +359,11 @@ class DataType(Enum): uint16 = "uint16" uint32 = "uint32" uint64 = "uint64" + float16 = "float16" float32 = "float32" float64 = "float64" + complex64 = "complex64" + complex128 = "complex128" @property def byte_count(self) -> int: @@ -360,8 +377,11 @@ def byte_count(self) -> int: DataType.uint16: 2, DataType.uint32: 4, DataType.uint64: 8, + DataType.float16: 2, DataType.float32: 4, DataType.float64: 8, + DataType.complex64: 8, + DataType.complex128: 16, } return data_type_byte_counts[self] @@ -381,8 +401,11 @@ def to_numpy_shortname(self) -> str: DataType.uint16: "u2", DataType.uint32: "u4", DataType.uint64: "u8", + DataType.float16: "f2", DataType.float32: "f4", DataType.float64: "f8", + DataType.complex64: "c8", + DataType.complex128: "c16", } return data_type_to_numpy[self] @@ -399,7 +422,24 @@ def from_dtype(cls, dtype: np.dtype[Any]) -> DataType: " np.dtype[Any]: + try: + dtype = np.dtype(data) + except TypeError as e: + raise ValueError(f"Invalid V3 data_type: {data}") from e + # check that this is a valid v3 data_type + try: + _ = DataType.from_dtype(dtype) + except KeyError as e: + raise ValueError(f"Invalid V3 data_type: {dtype}") from e + + return dtype diff --git a/src/zarr/testing/strategies.py b/src/zarr/testing/strategies.py index 83de3d92ce..00f93026ab 100644 --- a/src/zarr/testing/strategies.py +++ b/src/zarr/testing/strategies.py @@ -35,11 +35,14 @@ paths = st.lists(node_names, min_size=1).map(lambda x: "/".join(x)) | st.just("/") np_arrays = npst.arrays( # TODO: re-enable timedeltas once they are supported - dtype=npst.scalar_dtypes().filter(lambda x: x.kind != "m"), + dtype=npst.scalar_dtypes().filter( + lambda x: (x.kind not in ["m", "M"]) and (x.byteorder not in [">"]) + ), shape=npst.array_shapes(max_dims=4), ) stores = st.builds(MemoryStore, st.just({}), mode=st.just("w")) compressors = st.sampled_from([None, "default"]) +format = st.sampled_from([2, 3]) @st.composite # type: ignore[misc] @@ -69,12 +72,14 @@ def arrays( paths: st.SearchStrategy[None | str] = paths, array_names: st.SearchStrategy = array_names, attrs: st.SearchStrategy = attrs, + format: st.SearchStrategy = format, ) -> Array: store = draw(stores) nparray, chunks = draw(np_array_and_chunks(arrays=arrays)) path = draw(paths) name = draw(array_names) attributes = draw(attrs) + zarr_format = draw(format) # compressor = draw(compressors) # TODO: clean this up @@ -99,7 +104,7 @@ def arrays( expected_attrs = {} if attributes is None else attributes array_path = path + ("/" if not path.endswith("/") else "") + name - root = Group.create(store) + root = Group.create(store, zarr_format=zarr_format) fill_value_args: tuple[Any, ...] = tuple() if nparray.dtype.kind == "M": m = re.search(r"\[(.+)\]", nparray.dtype.str) diff --git a/tests/v3/test_metadata/test_v3.py b/tests/v3/test_metadata/test_v3.py index 0a545dfb9d..7bfa2e0192 100644 --- a/tests/v3/test_metadata/test_v3.py +++ b/tests/v3/test_metadata/test_v3.py @@ -1,11 +1,9 @@ from __future__ import annotations -import json import re from typing import TYPE_CHECKING, Literal from zarr.codecs.bytes import BytesCodec -from zarr.core.buffer import default_buffer_prototype from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding, V2ChunkKeyEncoding from zarr.core.metadata.v3 import ArrayV3Metadata @@ -19,7 +17,12 @@ import numpy as np import pytest -from zarr.core.metadata.v3 import parse_dimension_names, parse_fill_value, parse_zarr_format +from zarr.core.metadata.v3 import ( + parse_dimension_names, + parse_dtype, + parse_fill_value, + parse_zarr_format, +) bool_dtypes = ("bool",) @@ -234,22 +237,61 @@ def test_metadata_to_dict( assert observed == expected -@pytest.mark.parametrize("fill_value", [-1, 0, 1, 2932897]) -@pytest.mark.parametrize("precision", ["ns", "D"]) -async def test_datetime_metadata(fill_value: int, precision: str) -> None: +# @pytest.mark.parametrize("fill_value", [-1, 0, 1, 2932897]) +# @pytest.mark.parametrize("precision", ["ns", "D"]) +# async def test_datetime_metadata(fill_value: int, precision: str) -> None: +# metadata_dict = { +# "zarr_format": 3, +# "node_type": "array", +# "shape": (1,), +# "chunk_grid": {"name": "regular", "configuration": {"chunk_shape": (1,)}}, +# "data_type": f" None: metadata_dict = { "zarr_format": 3, "node_type": "array", "shape": (1,), "chunk_grid": {"name": "regular", "configuration": {"chunk_shape": (1,)}}, - "data_type": f" None: + metadata_dict = { + "zarr_format": 3, + "node_type": "array", + "shape": (1,), + "chunk_grid": {"name": "regular", "configuration": {"chunk_shape": (1,)}}, + "data_type": data_type, + "chunk_key_encoding": {"name": "default", "separator": "."}, + "codecs": (), + "fill_value": fill_value, # this is not a valid fill value for uint8 + } + with pytest.raises(ValueError, match=rf"fill value .* is not valid for dtype {data_type}"): + ArrayV3Metadata.from_dict(metadata_dict)