From d46de8407be43f33e83e390eb693cbbcc245103f Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 9 Jan 2025 16:17:30 -0500 Subject: [PATCH 01/10] (fix): allow structured dtype in `v2` --- src/zarr/core/config.py | 1 + src/zarr/core/metadata/v2.py | 4 +++- tests/test_config.py | 1 + tests/test_v2.py | 8 ++++++++ 4 files changed, 13 insertions(+), 1 deletion(-) diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index 7920d220a4..051e8c68e1 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -75,6 +75,7 @@ def reset(self) -> None: "numeric": None, "string": [{"id": "vlen-utf8"}], "bytes": [{"id": "vlen-bytes"}], + "raw": None, }, "v3_default_filters": {"numeric": [], "string": [], "bytes": []}, "v3_default_serializer": { diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 29cf15a119..f23d335609 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -376,8 +376,10 @@ def _default_filters( dtype_key = "numeric" elif dtype.kind in "U": dtype_key = "string" - elif dtype.kind in "OSV": + elif dtype.kind in "OS": dtype_key = "bytes" + elif dtype.kind == "V": + dtype_key = "raw" else: raise ValueError(f"Unsupported dtype kind {dtype.kind}") diff --git a/tests/test_config.py b/tests/test_config.py index c552ace840..1a2453d646 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -61,6 +61,7 @@ def test_config_defaults_set() -> None: "numeric": None, "string": [{"id": "vlen-utf8"}], "bytes": [{"id": "vlen-bytes"}], + "raw": None, }, "v3_default_filters": {"numeric": [], "string": [], "bytes": []}, "v3_default_serializer": { diff --git a/tests/test_v2.py b/tests/test_v2.py index 72127f4ede..216f4a1e1b 100644 --- a/tests/test_v2.py +++ b/tests/test_v2.py @@ -243,3 +243,11 @@ def test_default_filters_and_compressor(dtype_expected: Any) -> None: assert arr.metadata.compressor.codec_id == expected_compressor if expected_filter is not None: assert arr.metadata.filters[0].codec_id == expected_filter + +def test_structured_dtype() -> None: + a = np.array( + [(b"aaa", 1, 4.2), (b"bbb", 2, 8.4), (b"ccc", 3, 12.6)], + dtype=[("foo", "S3"), ("bar", "i4"), ("baz", "f8")], + ) + za = zarr.array(a, chunks=2, fill_value=None, zarr_format=2) + assert (a == za[:]).all() \ No newline at end of file From 2090b984175084669279b6cabd6d6d5fde5faa65 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 9 Jan 2025 16:20:20 -0500 Subject: [PATCH 02/10] (fix): `|V` test --- tests/test_v2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_v2.py b/tests/test_v2.py index 216f4a1e1b..3464bf0d20 100644 --- a/tests/test_v2.py +++ b/tests/test_v2.py @@ -107,7 +107,7 @@ async def test_v2_encode_decode(dtype): "compressor": None, "dtype": f"{dtype}0", "fill_value": "WA==", - "filters": [{"id": "vlen-bytes"}], + "filters": [{"id": "vlen-bytes"}] if dtype == "|S" else None, "order": "C", "shape": [3], "zarr_format": 2, From f3e2e2dbb7af73e8a9146aa9459768d6a7479163 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 9 Jan 2025 16:33:54 -0500 Subject: [PATCH 03/10] (fix): lint --- docs/user-guide/v3_migration.rst | 1 - tests/test_v2.py | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/user-guide/v3_migration.rst b/docs/user-guide/v3_migration.rst index bda1ae64ed..a285fda3a4 100644 --- a/docs/user-guide/v3_migration.rst +++ b/docs/user-guide/v3_migration.rst @@ -202,7 +202,6 @@ of Zarr-Python, please open (or comment on) a - The following features that were supported by Zarr-Python 2 have not been ported to Zarr-Python 3 yet: - * Structured arrays / dtypes (:issue:`2134`) * Fixed-length string dtypes (:issue:`2347`) * Datetime and timedelta dtypes (:issue:`2616`) * Object dtypes (:issue:`2617`) diff --git a/tests/test_v2.py b/tests/test_v2.py index 3464bf0d20..e64802a1a7 100644 --- a/tests/test_v2.py +++ b/tests/test_v2.py @@ -244,10 +244,11 @@ def test_default_filters_and_compressor(dtype_expected: Any) -> None: if expected_filter is not None: assert arr.metadata.filters[0].codec_id == expected_filter + def test_structured_dtype() -> None: a = np.array( [(b"aaa", 1, 4.2), (b"bbb", 2, 8.4), (b"ccc", 3, 12.6)], dtype=[("foo", "S3"), ("bar", "i4"), ("baz", "f8")], ) za = zarr.array(a, chunks=2, fill_value=None, zarr_format=2) - assert (a == za[:]).all() \ No newline at end of file + assert (a == za[:]).all() From a8d473bd9b413b468012716aa50a02e039ad8cee Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 9 Jan 2025 17:09:50 -0500 Subject: [PATCH 04/10] (fix): handle `fill_value` --- src/zarr/core/buffer/core.py | 4 +++- tests/test_v2.py | 10 ++++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/src/zarr/core/buffer/core.py b/src/zarr/core/buffer/core.py index 85a7351fc7..ccab103e0f 100644 --- a/src/zarr/core/buffer/core.py +++ b/src/zarr/core/buffer/core.py @@ -470,7 +470,9 @@ def all_equal(self, other: Any, equal_nan: bool = True) -> bool: # every single time we have to write data? _data, other = np.broadcast_arrays(self._data, other) return np.array_equal( - self._data, other, equal_nan=equal_nan if self._data.dtype.kind not in "USTO" else False + self._data, + other, + equal_nan=equal_nan if self._data.dtype.kind not in "USTOV" else False, ) def fill(self, value: Any) -> None: diff --git a/tests/test_v2.py b/tests/test_v2.py index e64802a1a7..714791ad89 100644 --- a/tests/test_v2.py +++ b/tests/test_v2.py @@ -245,10 +245,16 @@ def test_default_filters_and_compressor(dtype_expected: Any) -> None: assert arr.metadata.filters[0].codec_id == expected_filter -def test_structured_dtype() -> None: +@pytest.mark.parametrize("fill_value", [None, (b"", 0, 0.0)], ids=["no_fill", "fill"]) +def test_structured_dtype(fill_value, tmp_path) -> None: a = np.array( [(b"aaa", 1, 4.2), (b"bbb", 2, 8.4), (b"ccc", 3, 12.6)], dtype=[("foo", "S3"), ("bar", "i4"), ("baz", "f8")], ) - za = zarr.array(a, chunks=2, fill_value=None, zarr_format=2) + za = zarr.create( + shape=(3,), path=tmp_path, chunks=(2,), fill_value=fill_value, zarr_format=2, dtype=a.dtype + ) + if fill_value is not None: + assert (np.array([fill_value] * a.shape[0], dtype=a.dtype) == za[:]).all() + za[...] = a assert (a == za[:]).all() From 7898ef8ad0f6eb542f84759bb6e7ad25c1bef134 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 9 Jan 2025 17:14:47 -0500 Subject: [PATCH 05/10] (fix): put back structured array business --- docs/user-guide/v3_migration.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/user-guide/v3_migration.rst b/docs/user-guide/v3_migration.rst index a285fda3a4..bda1ae64ed 100644 --- a/docs/user-guide/v3_migration.rst +++ b/docs/user-guide/v3_migration.rst @@ -202,6 +202,7 @@ of Zarr-Python, please open (or comment on) a - The following features that were supported by Zarr-Python 2 have not been ported to Zarr-Python 3 yet: + * Structured arrays / dtypes (:issue:`2134`) * Fixed-length string dtypes (:issue:`2347`) * Datetime and timedelta dtypes (:issue:`2616`) * Object dtypes (:issue:`2617`) From 9c2efdd12670215c7b938041cc30dedd3d4e3329 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 10 Jan 2025 11:03:39 -0500 Subject: [PATCH 06/10] (fix): dtype encoding roundtrip --- src/zarr/core/metadata/v2.py | 9 ++++++++- tests/test_v2.py | 11 +++++++++-- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index f23d335609..a36dbb10ee 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -193,7 +193,12 @@ def to_dict(self) -> dict[str, JSON]: zarray_dict["fill_value"] = fill_value _ = zarray_dict.pop("dtype") - zarray_dict["dtype"] = self.dtype.str + dtype_json: JSON + if self.dtype.kind == "V": + dtype_json = tuple(self.dtype.descr) + else: + dtype_json = self.dtype.str + zarray_dict["dtype"] = dtype_json return zarray_dict @@ -220,6 +225,8 @@ def update_attributes(self, attributes: dict[str, JSON]) -> Self: def parse_dtype(data: npt.DTypeLike) -> np.dtype[Any]: + if isinstance(data, list): # this is a valid _VoidDTypeLike check + data = [tuple(d) for d in data] return np.dtype(data) diff --git a/tests/test_v2.py b/tests/test_v2.py index 714791ad89..24f53035a4 100644 --- a/tests/test_v2.py +++ b/tests/test_v2.py @@ -246,15 +246,22 @@ def test_default_filters_and_compressor(dtype_expected: Any) -> None: @pytest.mark.parametrize("fill_value", [None, (b"", 0, 0.0)], ids=["no_fill", "fill"]) -def test_structured_dtype(fill_value, tmp_path) -> None: +def test_structured_dtype_roundtrip(fill_value, tmp_path) -> None: a = np.array( [(b"aaa", 1, 4.2), (b"bbb", 2, 8.4), (b"ccc", 3, 12.6)], dtype=[("foo", "S3"), ("bar", "i4"), ("baz", "f8")], ) + array_path = tmp_path / "data.zarr" za = zarr.create( - shape=(3,), path=tmp_path, chunks=(2,), fill_value=fill_value, zarr_format=2, dtype=a.dtype + shape=(3,), + store=array_path, + chunks=(2,), + fill_value=fill_value, + zarr_format=2, + dtype=a.dtype, ) if fill_value is not None: assert (np.array([fill_value] * a.shape[0], dtype=a.dtype) == za[:]).all() za[...] = a + za = zarr.open_array(store=array_path) assert (a == za[:]).all() From b2071a71ed98b4228cc8af49e7395d78b266bd4e Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 13 Jan 2025 15:36:54 -0500 Subject: [PATCH 07/10] (fix): encode-decode test --- src/zarr/core/metadata/v2.py | 4 +++- tests/test_v2.py | 17 ++++++++++++----- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index a36dbb10ee..192db5b203 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -194,7 +194,9 @@ def to_dict(self) -> dict[str, JSON]: _ = zarray_dict.pop("dtype") dtype_json: JSON - if self.dtype.kind == "V": + # In the case of zarr v2, the simplest i.e., '|VXX' dtype is represented as a string + dtype_descr = self.dtype.descr + if self.dtype.kind == "V" and dtype_descr[0][0] != "" and len(dtype_descr) != 0: dtype_json = tuple(self.dtype.descr) else: dtype_json = self.dtype.str diff --git a/tests/test_v2.py b/tests/test_v2.py index a6dbe091fb..096a779ffa 100644 --- a/tests/test_v2.py +++ b/tests/test_v2.py @@ -82,8 +82,15 @@ def test_codec_pipeline() -> None: np.testing.assert_array_equal(result, expected) -@pytest.mark.parametrize("dtype", ["|S", "|V"]) -async def test_v2_encode_decode(dtype): +@pytest.mark.parametrize( + ("dtype", "expected_dtype", "fill_value", "fill_value_encoding"), + [ + ("|S", "|S0", b"X", "WA=="), + ("|V", "|V0", b"X", "WA=="), + ("|V10", "|V10", b"X", "WAAAAAAAAAAAAA=="), + ], +) +async def test_v2_encode_decode(dtype, expected_dtype, fill_value, fill_value_encoding) -> None: with config.set( { "array.v2_default_filters.bytes": [{"id": "vlen-bytes"}], @@ -97,7 +104,7 @@ async def test_v2_encode_decode(dtype): shape=(3,), chunks=(3,), dtype=dtype, - fill_value=b"X", + fill_value=fill_value, ) result = await store.get("foo/.zarray", zarr.core.buffer.default_buffer_prototype()) @@ -107,8 +114,8 @@ async def test_v2_encode_decode(dtype): expected = { "chunks": [3], "compressor": None, - "dtype": f"{dtype}0", - "fill_value": "WA==", + "dtype": expected_dtype, + "fill_value": fill_value_encoding, "filters": [{"id": "vlen-bytes"}] if dtype == "|S" else None, "order": "C", "shape": [3], From ff730a3df8dc1f86e00f4bb8ba57f3f2bcebc92c Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 13 Jan 2025 15:38:54 -0500 Subject: [PATCH 08/10] (chore): rel notes --- docs/release-notes.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/release-notes.rst b/docs/release-notes.rst index 47a0f9c2c2..38457126a1 100644 --- a/docs/release-notes.rst +++ b/docs/release-notes.rst @@ -10,6 +10,7 @@ New features Bug fixes ~~~~~~~~~ * Fixes ``order`` argument for Zarr format 2 arrays (:issue:`2679`). +* Backwards compatibility for Zarr format 2 structured arrays (:issue:`2134`) Behaviour changes ~~~~~~~~~~~~~~~~~ From 89cc8d93f9aca2b3ae1d3f69796ff0fb9f8a3728 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 13 Jan 2025 15:42:07 -0500 Subject: [PATCH 09/10] (fix): docstring test --- docs/user-guide/config.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/user-guide/config.rst b/docs/user-guide/config.rst index 871291b72b..3662f75dff 100644 --- a/docs/user-guide/config.rst +++ b/docs/user-guide/config.rst @@ -53,6 +53,7 @@ This is the current default configuration:: 'level': 0}}, 'v2_default_filters': {'bytes': [{'id': 'vlen-bytes'}], 'numeric': None, + 'raw': None, 'string': [{'id': 'vlen-utf8'}]}, 'v3_default_compressors': {'bytes': [{'configuration': {'checksum': False, 'level': 0}, From 7141de532488661478059ac9c453acc2308547a7 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Tue, 21 Jan 2025 15:37:23 -0500 Subject: [PATCH 10/10] Update release-notes.rst --- docs/release-notes.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/release-notes.rst b/docs/release-notes.rst index 4407d0b305..2276889cf6 100644 --- a/docs/release-notes.rst +++ b/docs/release-notes.rst @@ -7,6 +7,8 @@ Unreleased Bug fixes ~~~~~~~~~ +* Backwards compatibility for Zarr format 2 structured arrays (:issue:`2134`) + Features ~~~~~~~~ @@ -22,7 +24,6 @@ Other Bug fixes ~~~~~~~~~ * Fixes ``order`` argument for Zarr format 2 arrays (:issue:`2679`). -* Backwards compatibility for Zarr format 2 structured arrays (:issue:`2134`) * Fixes a bug that prevented reading Zarr format 2 data with consolidated metadata written using ``zarr-python`` version 2 (:issue:`2694`).