diff --git a/docs/release.rst b/docs/release.rst index 65bd94c45f..49861268d4 100644 --- a/docs/release.rst +++ b/docs/release.rst @@ -56,6 +56,11 @@ Bug fixes * Always use a ``tuple`` when indexing a NumPy ``ndarray``. By :user:`John Kirkham `, :issue:`376` +* Ensure when ``Array`` uses a ``dict``-based chunk store that it only contains + ``bytes`` to facilitate comparisons and protect against writes. Drop the copy + for the no filter/compressor case as this handles that case. + By :user:`John Kirkham `, :issue:`359` + Maintenance ~~~~~~~~~~~ diff --git a/zarr/core.py b/zarr/core.py index 0838117b89..29d54e4750 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -8,7 +8,7 @@ import numpy as np -from numcodecs.compat import ensure_ndarray +from numcodecs.compat import ensure_bytes, ensure_ndarray from zarr.util import (is_total_slice, human_readable_size, normalize_resize_args, @@ -1677,21 +1677,11 @@ def _chunk_setitem_nosync(self, chunk_coords, chunk_selection, value, fields=Non else: - if not self._compressor and not self._filters: - - # https://github.com/alimanfoo/zarr/issues/79 - # Ensure a copy is taken so we don't end up storing - # a view into someone else's array. - # N.B., this assumes that filters or compressor always - # take a copy and never attempt to apply encoding in-place. - chunk = np.array(value, dtype=self._dtype, order=self._order) - + # ensure array is contiguous + if self._order == 'F': + chunk = np.asfortranarray(value, dtype=self._dtype) else: - # ensure array is contiguous - if self._order == 'F': - chunk = np.asfortranarray(value, dtype=self._dtype) - else: - chunk = np.ascontiguousarray(value, dtype=self._dtype) + chunk = np.ascontiguousarray(value, dtype=self._dtype) else: # partially replace the contents of this chunk @@ -1788,6 +1778,10 @@ def _encode_chunk(self, chunk): else: cdata = chunk + # ensure in-memory data is immutable and easy to compare + if isinstance(self.chunk_store, dict): + cdata = ensure_bytes(cdata) + return cdata def __repr__(self): diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index 1c7d526c0c..7c168950df 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -23,6 +23,7 @@ from zarr.util import buffer_size from numcodecs import (Delta, FixedScaleOffset, Zlib, Blosc, BZ2, MsgPack, Pickle, Categorize, JSON, VLenUTF8, VLenBytes, VLenArray) +from numcodecs.compat import ensure_bytes, ensure_ndarray from numcodecs.tests.common import greetings @@ -83,6 +84,31 @@ def create_array(self, read_only=False, **kwargs): return Array(store, read_only=read_only, cache_metadata=cache_metadata, cache_attrs=cache_attrs) + def test_store_has_binary_values(self): + # Initialize array + np.random.seed(42) + z = self.create_array(shape=(1050,), chunks=100, dtype='f8', compressor=[]) + z[:] = np.random.random(z.shape) + + for v in z.chunk_store.values(): + try: + ensure_ndarray(v) + except TypeError: # pragma: no cover + pytest.fail("Non-bytes-like value: %s" % repr(v)) + + def test_store_has_bytes_values(self): + # Test that many stores do hold bytes values. + # Though this is not a strict requirement. + # Should be disabled by any stores that fail this as needed. + + # Initialize array + np.random.seed(42) + z = self.create_array(shape=(1050,), chunks=100, dtype='f8', compressor=[]) + z[:] = np.random.random(z.shape) + + # Check in-memory array only contains `bytes` + assert all([isinstance(v, binary_type) for v in z.chunk_store.values()]) + def test_nbytes_stored(self): # dict as store @@ -1401,6 +1427,9 @@ def create_array(read_only=False, **kwargs): return Array(store, read_only=read_only, cache_metadata=cache_metadata, cache_attrs=cache_attrs) + def test_store_has_bytes_values(self): + pass # returns values as memoryviews/buffers instead of bytes + def test_nbytes_stored(self): pass # not implemented @@ -1725,6 +1754,9 @@ def __init__(self): def keys(self): return self.inner.keys() + def values(self): + return self.inner.values() + def get(self, item, default=None): try: return self.inner[item] @@ -1735,7 +1767,7 @@ def __getitem__(self, item): return self.inner[item] def __setitem__(self, item, value): - self.inner[item] = value + self.inner[item] = ensure_bytes(value) def __delitem__(self, key): del self.inner[key] @@ -1846,3 +1878,7 @@ def create_array(read_only=False, **kwargs): init_array(store, **kwargs) return Array(store, read_only=read_only, cache_metadata=cache_metadata, cache_attrs=cache_attrs) + + def test_store_has_bytes_values(self): + # skip as the cache has no control over how the store provides values + pass