From 3bf3622f5c34dec4f1a829bc9b455451bb21b11c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 30 Sep 2024 07:18:47 -0500 Subject: [PATCH 1/4] Use implicit fill values for zarr v2 --- src/zarr/codecs/pipeline.py | 14 ++++++++++++-- src/zarr/core/array.py | 2 +- src/zarr/core/metadata/v2.py | 9 ++++++++- tests/v3/test_metadata/test_v2.py | 2 +- tests/v3/test_v2.py | 11 +++++++++++ 5 files changed, 33 insertions(+), 5 deletions(-) diff --git a/src/zarr/codecs/pipeline.py b/src/zarr/codecs/pipeline.py index 6828377f97..c77c06bd1b 100644 --- a/src/zarr/codecs/pipeline.py +++ b/src/zarr/codecs/pipeline.py @@ -17,6 +17,7 @@ from zarr.core.common import ChunkCoords, concurrent_map from zarr.core.config import config from zarr.core.indexing import SelectorTuple, is_scalar, is_total_slice +from zarr.core.metadata.v2 import default_fill_value from zarr.registry import register_pipeline if TYPE_CHECKING: @@ -247,7 +248,13 @@ async def read_batch( if chunk_array is not None: out[out_selection] = chunk_array else: - out[out_selection] = chunk_spec.fill_value + fill_value = chunk_spec.fill_value + + # this should maybe be version specific? + if fill_value is None: + fill_value = default_fill_value(dtype=chunk_spec.dtype) + + out[out_selection] = fill_value else: chunk_bytes_batch = await concurrent_map( [ @@ -274,7 +281,10 @@ async def read_batch( tmp = tmp.squeeze(axis=drop_axes) out[out_selection] = tmp else: - out[out_selection] = chunk_spec.fill_value + fill_value = chunk_spec.fill_value + if fill_value is None: + fill_value = default_fill_value(dtype=chunk_spec.dtype) + out[out_selection] = fill_value def _merge_chunk_array( self, diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index e1de15c747..e44f5ede6d 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -376,7 +376,7 @@ async def _create_v2( chunks=chunks, order=order, dimension_separator=dimension_separator, - fill_value=0 if fill_value is None else fill_value, + fill_value=fill_value, compressor=compressor, filters=filters, attributes=attributes, diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index df7f2abaea..132a5c1283 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -62,7 +62,10 @@ def __init__( order_parsed = parse_indexing_order(order) dimension_separator_parsed = parse_separator(dimension_separator) filters_parsed = parse_filters(filters) - fill_value_parsed = parse_fill_value(fill_value, dtype=data_type_parsed) + if fill_value is None: + fill_value_parsed = None + else: + fill_value_parsed = parse_fill_value(fill_value, dtype=data_type_parsed) attributes_parsed = parse_attributes(attributes) object.__setattr__(self, "shape", shape_parsed) @@ -273,3 +276,7 @@ def parse_fill_value(fill_value: object, dtype: np.dtype[Any]) -> Any: raise ValueError(msg) from e return fill_value + + +def default_fill_value(dtype: np.dtype[Any]) -> Any: + return dtype.type(0) diff --git a/tests/v3/test_metadata/test_v2.py b/tests/v3/test_metadata/test_v2.py index 3ea702eecd..fa961e34cd 100644 --- a/tests/v3/test_metadata/test_v2.py +++ b/tests/v3/test_metadata/test_v2.py @@ -28,7 +28,7 @@ def test_parse_zarr_format_invalid(data: Any) -> None: @pytest.mark.parametrize("attributes", [None, {"foo": "bar"}]) @pytest.mark.parametrize("filters", [None, (), (numcodecs.GZip(),)]) @pytest.mark.parametrize("compressor", [None, numcodecs.GZip()]) -@pytest.mark.parametrize("fill_value", [0, 1]) +@pytest.mark.parametrize("fill_value", [None, 0, 1]) @pytest.mark.parametrize("order", ["C", "F"]) @pytest.mark.parametrize("dimension_separator", [".", "/", None]) def test_metadata_to_dict( diff --git a/tests/v3/test_v2.py b/tests/v3/test_v2.py index 943c425f54..7d751d5f53 100644 --- a/tests/v3/test_v2.py +++ b/tests/v3/test_v2.py @@ -31,6 +31,15 @@ def test_simple(store: StorePath) -> None: assert np.array_equal(data, a[:, :]) +def test_implicit_fill_value(store: StorePath) -> None: + arr = zarr.open_array(store=store, shape=(4,), fill_value=None, zarr_format=2) + assert arr.metadata.fill_value is None + assert arr.metadata.to_dict()["fill_value"] is None + result = arr[:] + expected = np.zeros(arr.shape, dtype=arr.dtype) + np.testing.assert_array_equal(result, expected) + + def test_codec_pipeline() -> None: # https://github.com/zarr-developers/zarr-python/issues/2243 store = MemoryStore(mode="w") @@ -46,3 +55,5 @@ def test_codec_pipeline() -> None: result = array[:] expected = np.ones(1) np.testing.assert_array_equal(result, expected) + + From a8caaa40ddf290361cfe7fced5a7b9c0f8afbad5 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 7 Oct 2024 08:42:21 -0500 Subject: [PATCH 2/4] Fixup --- src/zarr/codecs/pipeline.py | 6 +++++- src/zarr/core/metadata/v2.py | 17 +++++++++++++---- tests/v3/test_v2.py | 2 -- 3 files changed, 18 insertions(+), 7 deletions(-) diff --git a/src/zarr/codecs/pipeline.py b/src/zarr/codecs/pipeline.py index c77c06bd1b..14d2877274 100644 --- a/src/zarr/codecs/pipeline.py +++ b/src/zarr/codecs/pipeline.py @@ -250,8 +250,12 @@ async def read_batch( else: fill_value = chunk_spec.fill_value - # this should maybe be version specific? if fill_value is None: + # Zarr V2 allowed `fill_value` to be null in the metadata. + # Zarr V3 requires it to be set. This has already been + # validated when decoding the metadata, but we support reading + # Zarr V2 data and need to support the case where fill_value + # is None. fill_value = default_fill_value(dtype=chunk_spec.dtype) out[out_selection] = fill_value diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index ca42cd2cca..72c0f5c28a 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -62,10 +62,7 @@ def __init__( order_parsed = parse_indexing_order(order) dimension_separator_parsed = parse_separator(dimension_separator) filters_parsed = parse_filters(filters) - if fill_value is None: - fill_value_parsed = None - else: - fill_value_parsed = parse_fill_value(fill_value, dtype=data_type_parsed) + fill_value_parsed = parse_fill_value(fill_value, dtype=data_type_parsed) attributes_parsed = parse_attributes(attributes) object.__setattr__(self, "shape", shape_parsed) @@ -290,4 +287,16 @@ def parse_fill_value(fill_value: object, dtype: np.dtype[Any]) -> Any: def default_fill_value(dtype: np.dtype[Any]) -> Any: + """ + Get the default fill value for a type. + + Notes + ----- + This differs from :func:`parse_fill_value`, which parses a fill value + stored in the Array metadata into an in-memory value. This only gives + the default fill value for some type. + + This is useful for reading Zarr V2 arrays, which allow the fill + value to be unspecified. + """ return dtype.type(0) diff --git a/tests/v3/test_v2.py b/tests/v3/test_v2.py index 1a16980f2e..644883dd28 100644 --- a/tests/v3/test_v2.py +++ b/tests/v3/test_v2.py @@ -55,5 +55,3 @@ def test_codec_pipeline() -> None: result = array[:] expected = np.ones(1) np.testing.assert_array_equal(result, expected) - - From 8637c08cbebd6c468ce3177e8e3e9230977acac2 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 7 Oct 2024 08:56:57 -0500 Subject: [PATCH 3/4] update asserts --- src/zarr/testing/strategies.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/zarr/testing/strategies.py b/src/zarr/testing/strategies.py index 234454e289..8494d35939 100644 --- a/src/zarr/testing/strategies.py +++ b/src/zarr/testing/strategies.py @@ -140,7 +140,8 @@ def arrays( ) assert isinstance(a, Array) - assert a.fill_value is not None + if a.metadata.zarr_format == 3: + assert a.fill_value is not None assert isinstance(root[array_path], Array) assert nparray.shape == a.shape assert chunks == a.chunks From ff530c36e41fd725f5a4cdc5e7b35a92a9668e2b Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 8 Oct 2024 14:30:30 -0500 Subject: [PATCH 4/4] restore v2 behavior --- src/zarr/codecs/pipeline.py | 6 +++--- src/zarr/core/metadata/v2.py | 9 +++++++-- tests/v3/test_v2.py | 19 ++++++++++++++++--- 3 files changed, 26 insertions(+), 8 deletions(-) diff --git a/src/zarr/codecs/pipeline.py b/src/zarr/codecs/pipeline.py index 14d2877274..1226a04f06 100644 --- a/src/zarr/codecs/pipeline.py +++ b/src/zarr/codecs/pipeline.py @@ -17,7 +17,7 @@ from zarr.core.common import ChunkCoords, concurrent_map from zarr.core.config import config from zarr.core.indexing import SelectorTuple, is_scalar, is_total_slice -from zarr.core.metadata.v2 import default_fill_value +from zarr.core.metadata.v2 import _default_fill_value from zarr.registry import register_pipeline if TYPE_CHECKING: @@ -256,7 +256,7 @@ async def read_batch( # validated when decoding the metadata, but we support reading # Zarr V2 data and need to support the case where fill_value # is None. - fill_value = default_fill_value(dtype=chunk_spec.dtype) + fill_value = _default_fill_value(dtype=chunk_spec.dtype) out[out_selection] = fill_value else: @@ -287,7 +287,7 @@ async def read_batch( else: fill_value = chunk_spec.fill_value if fill_value is None: - fill_value = default_fill_value(dtype=chunk_spec.dtype) + fill_value = _default_fill_value(dtype=chunk_spec.dtype) out[out_selection] = fill_value def _merge_chunk_array( diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 7b68640b13..6d8f2a8ab1 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -302,7 +302,7 @@ def parse_fill_value(fill_value: object, dtype: np.dtype[Any]) -> Any: return fill_value -def default_fill_value(dtype: np.dtype[Any]) -> Any: +def _default_fill_value(dtype: np.dtype[Any]) -> Any: """ Get the default fill value for a type. @@ -315,4 +315,9 @@ def default_fill_value(dtype: np.dtype[Any]) -> Any: This is useful for reading Zarr V2 arrays, which allow the fill value to be unspecified. """ - return dtype.type(0) + if dtype.kind == "S": + return b"" + elif dtype.kind == "U": + return "" + else: + return dtype.type(0) diff --git a/tests/v3/test_v2.py b/tests/v3/test_v2.py index 42ef2b4345..d981fbc893 100644 --- a/tests/v3/test_v2.py +++ b/tests/v3/test_v2.py @@ -1,5 +1,6 @@ import json from collections.abc import Iterator +from typing import Any import numpy as np import pytest @@ -35,12 +36,24 @@ def test_simple(store: StorePath) -> None: assert np.array_equal(data, a[:, :]) -def test_implicit_fill_value(store: StorePath) -> None: - arr = zarr.open_array(store=store, shape=(4,), fill_value=None, zarr_format=2) +@pytest.mark.parametrize( + ("dtype", "fill_value"), + [ + ("bool", False), + ("int64", 0), + ("float64", 0.0), + ("|S1", b""), + ("|U1", ""), + ("object", 0), + (str, ""), + ], +) +def test_implicit_fill_value(store: StorePath, dtype: str, fill_value: Any) -> None: + arr = zarr.open_array(store=store, shape=(4,), fill_value=None, zarr_format=2, dtype=dtype) assert arr.metadata.fill_value is None assert arr.metadata.to_dict()["fill_value"] is None result = arr[:] - expected = np.zeros(arr.shape, dtype=arr.dtype) + expected = np.full(arr.shape, fill_value, dtype=dtype) np.testing.assert_array_equal(result, expected)