Skip to content

Commit a260ae9

Browse files
(fix): structured arrays for v2 (#2681)
--------- Co-authored-by: Martin Durant <[email protected]>
1 parent e9772ac commit a260ae9

File tree

7 files changed

+56
-9
lines changed

7 files changed

+56
-9
lines changed

docs/release-notes.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ Unreleased
77
Bug fixes
88
~~~~~~~~~
99

10+
* Restore backwards compatibility for Zarr format 2 arrays with structured dtypes (:issue:`2134`)
11+
1012
Features
1113
~~~~~~~~
1214

docs/user-guide/config.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ This is the current default configuration::
5353
'level': 0}},
5454
'v2_default_filters': {'bytes': [{'id': 'vlen-bytes'}],
5555
'numeric': None,
56+
'raw': None,
5657
'string': [{'id': 'vlen-utf8'}]},
5758
'v3_default_compressors': {'bytes': [{'configuration': {'checksum': False,
5859
'level': 0},

src/zarr/core/buffer/core.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -470,7 +470,9 @@ def all_equal(self, other: Any, equal_nan: bool = True) -> bool:
470470
# every single time we have to write data?
471471
_data, other = np.broadcast_arrays(self._data, other)
472472
return np.array_equal(
473-
self._data, other, equal_nan=equal_nan if self._data.dtype.kind not in "USTO" else False
473+
self._data,
474+
other,
475+
equal_nan=equal_nan if self._data.dtype.kind not in "USTOV" else False,
474476
)
475477

476478
def fill(self, value: Any) -> None:

src/zarr/core/config.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@ def reset(self) -> None:
7575
"numeric": None,
7676
"string": [{"id": "vlen-utf8"}],
7777
"bytes": [{"id": "vlen-bytes"}],
78+
"raw": None,
7879
},
7980
"v3_default_filters": {"numeric": [], "string": [], "bytes": []},
8081
"v3_default_serializer": {

src/zarr/core/metadata/v2.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,14 @@ def to_dict(self) -> dict[str, JSON]:
193193
zarray_dict["fill_value"] = fill_value
194194

195195
_ = zarray_dict.pop("dtype")
196-
zarray_dict["dtype"] = self.dtype.str
196+
dtype_json: JSON
197+
# In Zarr v2, a plain (unstructured) void dtype such as '|V10' is stored as a string; only structured dtypes with named fields are stored as a sequence of field descriptors
198+
dtype_descr = self.dtype.descr
199+
if self.dtype.kind == "V" and dtype_descr[0][0] != "" and len(dtype_descr) != 0:
200+
dtype_json = tuple(self.dtype.descr)
201+
else:
202+
dtype_json = self.dtype.str
203+
zarray_dict["dtype"] = dtype_json
197204

198205
return zarray_dict
199206

@@ -220,6 +227,8 @@ def update_attributes(self, attributes: dict[str, JSON]) -> Self:
220227

221228

222229
def parse_dtype(data: npt.DTypeLike) -> np.dtype[Any]:
230+
if isinstance(data, list): # this is a valid _VoidDTypeLike check
231+
data = [tuple(d) for d in data]
223232
return np.dtype(data)
224233

225234

@@ -376,8 +385,10 @@ def _default_filters(
376385
dtype_key = "numeric"
377386
elif dtype.kind in "U":
378387
dtype_key = "string"
379-
elif dtype.kind in "OSV":
388+
elif dtype.kind in "OS":
380389
dtype_key = "bytes"
390+
elif dtype.kind == "V":
391+
dtype_key = "raw"
381392
else:
382393
raise ValueError(f"Unsupported dtype kind {dtype.kind}")
383394

tests/test_config.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ def test_config_defaults_set() -> None:
6161
"numeric": None,
6262
"string": [{"id": "vlen-utf8"}],
6363
"bytes": [{"id": "vlen-bytes"}],
64+
"raw": None,
6465
},
6566
"v3_default_filters": {"numeric": [], "string": [], "bytes": []},
6667
"v3_default_serializer": {

tests/test_v2.py

Lines changed: 35 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -84,8 +84,15 @@ def test_codec_pipeline() -> None:
8484
np.testing.assert_array_equal(result, expected)
8585

8686

87-
@pytest.mark.parametrize("dtype", ["|S", "|V"])
88-
async def test_v2_encode_decode(dtype):
87+
@pytest.mark.parametrize(
88+
("dtype", "expected_dtype", "fill_value", "fill_value_encoding"),
89+
[
90+
("|S", "|S0", b"X", "WA=="),
91+
("|V", "|V0", b"X", "WA=="),
92+
("|V10", "|V10", b"X", "WAAAAAAAAAAAAA=="),
93+
],
94+
)
95+
async def test_v2_encode_decode(dtype, expected_dtype, fill_value, fill_value_encoding) -> None:
8996
with config.set(
9097
{
9198
"array.v2_default_filters.bytes": [{"id": "vlen-bytes"}],
@@ -95,7 +102,7 @@ async def test_v2_encode_decode(dtype):
95102
store = zarr.storage.MemoryStore()
96103
g = zarr.group(store=store, zarr_format=2)
97104
g.create_array(
98-
name="foo", shape=(3,), chunks=(3,), dtype=dtype, fill_value=b"X", compressor=None
105+
name="foo", shape=(3,), chunks=(3,), dtype=dtype, fill_value=fill_value, compressor=None
99106
)
100107

101108
result = await store.get("foo/.zarray", zarr.core.buffer.default_buffer_prototype())
@@ -105,9 +112,9 @@ async def test_v2_encode_decode(dtype):
105112
expected = {
106113
"chunks": [3],
107114
"compressor": None,
108-
"dtype": f"{dtype}0",
109-
"fill_value": "WA==",
110-
"filters": [{"id": "vlen-bytes"}],
115+
"dtype": expected_dtype,
116+
"fill_value": fill_value_encoding,
117+
"filters": [{"id": "vlen-bytes"}] if dtype == "|S" else None,
111118
"order": "C",
112119
"shape": [3],
113120
"zarr_format": 2,
@@ -284,3 +291,25 @@ def test_default_filters_and_compressor(dtype_expected: Any) -> None:
284291
assert arr.metadata.compressor.codec_id == expected_compressor
285292
if expected_filter is not None:
286293
assert arr.metadata.filters[0].codec_id == expected_filter
294+
295+
296+
@pytest.mark.parametrize("fill_value", [None, (b"", 0, 0.0)], ids=["no_fill", "fill"])
297+
def test_structured_dtype_roundtrip(fill_value, tmp_path) -> None:
298+
a = np.array(
299+
[(b"aaa", 1, 4.2), (b"bbb", 2, 8.4), (b"ccc", 3, 12.6)],
300+
dtype=[("foo", "S3"), ("bar", "i4"), ("baz", "f8")],
301+
)
302+
array_path = tmp_path / "data.zarr"
303+
za = zarr.create(
304+
shape=(3,),
305+
store=array_path,
306+
chunks=(2,),
307+
fill_value=fill_value,
308+
zarr_format=2,
309+
dtype=a.dtype,
310+
)
311+
if fill_value is not None:
312+
assert (np.array([fill_value] * a.shape[0], dtype=a.dtype) == za[:]).all()
313+
za[...] = a
314+
za = zarr.open_array(store=array_path)
315+
assert (a == za[:]).all()

0 commit comments

Comments
 (0)