From 770344990fecc775d37624c3b51c71d528f1e72a Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Thu, 14 Feb 2019 00:54:05 -0500 Subject: [PATCH 01/10] Add `values` method to `CustomMapping` For doing a quick pass through the values within a `MutableMapping`, it is helpful to have the `values` method. This will be needed in some of the tests that we are adding. So add a quick implementation of `values` in the `CustomMapping`. --- zarr/tests/test_core.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index 1c7d526c0c..9fa30ff321 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -1725,6 +1725,9 @@ def __init__(self): def keys(self): return self.inner.keys() + def values(self): + return self.inner.values() + def get(self, item, default=None): try: return self.inner[item] From 5dba561fada4ff1c718efba1335802a57b55dcaa Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Thu, 14 Feb 2019 00:54:06 -0500 Subject: [PATCH 02/10] Test stores contain bytes-like data --- zarr/tests/test_core.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index 9fa30ff321..c52b7f51b7 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -23,6 +23,7 @@ from zarr.util import buffer_size from numcodecs import (Delta, FixedScaleOffset, Zlib, Blosc, BZ2, MsgPack, Pickle, Categorize, JSON, VLenUTF8, VLenBytes, VLenArray) +from numcodecs.compat import ensure_ndarray from numcodecs.tests.common import greetings @@ -83,6 +84,18 @@ def create_array(self, read_only=False, **kwargs): return Array(store, read_only=read_only, cache_metadata=cache_metadata, cache_attrs=cache_attrs) + def test_store_has_binary_values(self): + # Initialize array + np.random.seed(42) + z = self.create_array(shape=(1050,), chunks=100, dtype='f8', compressor=[]) + z[:] = np.random.random(z.shape) + + for v in z.chunk_store.values(): + try: + ensure_ndarray(v) + except TypeError: + pytest.fail("Non-bytes-like value: %s" % repr(v)) + def test_nbytes_stored(self): # dict as store From c36c0819ca8a798ddae35d31d70318fd612abdf4 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Thu, 14 Feb 2019 01:01:33 -0500 Subject: [PATCH 03/10] Add a test to check that all values are bytes Currently this is a somewhat loose requirement that not all stores enforce. However it is a useful check to have in some cases. Particularly this is a useful constraint with in-memory stores where there is a concern that the data in the store might be manipulated externally due to ndarray views. The bytes instances don't have this problem as they are immutable and own their data. While views can be take onto bytes instances, they will be read-only views and thus are not a concern. For other stores that place their data in some other storage backend (e.g. on disk), this is less of a concern. Also other stores may choose to represent their data in other ways (e.g. LMDB with `memoryview`s). Manipulating the loaded data from these stores is less of a concern since as it doesn't affect their actually contents, only an in-memory representation. In these cases, it may make sense to disable this test. --- zarr/tests/test_core.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index c52b7f51b7..add6732e2c 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -96,6 +96,19 @@ def test_store_has_binary_values(self): except TypeError: pytest.fail("Non-bytes-like value: %s" % repr(v)) + def test_store_has_bytes_values(self): + # Test that many stores do hold bytes values. + # Though this is not a strict requirement. + # Should be disabled by any stores that fail this as needed. + + # Initialize array + np.random.seed(42) + z = self.create_array(shape=(1050,), chunks=100, dtype='f8', compressor=[]) + z[:] = np.random.random(z.shape) + + # Check in-memory array only contains `bytes` + assert all([isinstance(v, binary_type) for v in z.chunk_store.values()]) + def test_nbytes_stored(self): # dict as store From 0eda42ba1036357ce1c09cb1b19e1e88483f8cd2 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Thu, 14 Feb 2019 01:01:41 -0500 Subject: [PATCH 04/10] Disable bytes value test for LMDBStore w/buffers It's possible to request that `LMDBStore` return `buffer`s/`memoryview`s instead of returning `bytes` for values. This is incredibly useful as LMDB is memory-mapped. So this avoids a copy when accessing the data. However this means it will fail `test_store_has_bytes_values`. Though this is ok as noted previously since that is not a hard requirement of stores. So we just disable that test for this `LMDBStore` case. The other `LMDBStore` case returns `bytes` instances instead (copying the data); so, it passes this test without issues. --- zarr/tests/test_core.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index add6732e2c..0a4f3ab490 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -1427,6 +1427,9 @@ def create_array(read_only=False, **kwargs): return Array(store, read_only=read_only, cache_metadata=cache_metadata, cache_attrs=cache_attrs) + def test_store_has_bytes_values(self): + pass # returns values as memoryviews/buffers instead of bytes + def test_nbytes_stored(self): pass # not implemented From ffa3df63bbda8f3e33347cfd985ef091c99252cf Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Thu, 14 Feb 2019 01:01:42 -0500 Subject: [PATCH 05/10] Have CustomMapping ensure it stores bytes Since people will coming looking to see how a store should be implemented, we should show them good behavior. Namely we should ensure the values provided to `__setitem__` are `bytes`-like and immutable (once stored). By coercing the data to `bytes` we ensure that the data is `bytes`-like and we ensure the data is immutable since `bytes` are immutable. --- zarr/tests/test_core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index 0a4f3ab490..98a41c32a1 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -23,7 +23,7 @@ from zarr.util import buffer_size from numcodecs import (Delta, FixedScaleOffset, Zlib, Blosc, BZ2, MsgPack, Pickle, Categorize, JSON, VLenUTF8, VLenBytes, VLenArray) -from numcodecs.compat import ensure_ndarray +from numcodecs.compat import ensure_bytes, ensure_ndarray from numcodecs.tests.common import greetings @@ -1767,7 +1767,7 @@ def __getitem__(self, item): return self.inner[item] def __setitem__(self, item, value): - self.inner[item] = value + self.inner[item] = ensure_bytes(value) def __delitem__(self, key): del self.inner[key] From de95d518d240d9313634a5717a29590baa3ae1fb Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Thu, 14 Feb 2019 01:01:43 -0500 Subject: [PATCH 06/10] Disable bytes value test for LRUStoreCache As the point of the `LRUStoreCache` is to merely hold onto values retrieved from the underlying store and keep them around in memory unaltered, the caching layer doesn't have any control over what type of values are returned to it. Thus it doesn't make much sense to test whether the values it returns are of `bytes` type or not. Though this is fine as there is not strict requirement that values of `bytes` type be returned by stores, so simply disable `test_store_has_bytes_values` for the `LRUStoreCache` test case with a note explaining this. --- zarr/tests/test_core.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index 98a41c32a1..41e9e9b3b1 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -1878,3 +1878,7 @@ def create_array(read_only=False, **kwargs): init_array(store, **kwargs) return Array(store, read_only=read_only, cache_metadata=cache_metadata, cache_attrs=cache_attrs) + + def test_store_has_bytes_values(self): + # skip as the cache has no control over how the store provides values + pass From ec495dbf23fbe5995699c8c34cddfa6dcd7ffc8c Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Thu, 14 Feb 2019 01:23:25 -0500 Subject: [PATCH 07/10] Ensure `bytes` in `_encode_chunk` for `dict` In `Array` when running `_encode_chunk` and using a `dict`-based chunk store, ensure that the chunk data is of `bytes` type. This is done to convert the underlying data to an immutable value to protect against external views onto the data from changing it (as is the case with NumPy arrays). Also this is done to ensure it is possible to compare `dict`-based stores easily. While coercing to a `bytes` object can introduce a copying, this generally won't happen if a compressor is involved as it will usually have returned a `bytes` object. Though filters may not, which could cause this to introduce an extra copy if only filters and no compressors are used. However this is an unlikely case and is not as important as guaranteeing the values own their own data and are read-only. Plus this should allow us to drop the preventive copy that happens earlier when storing values as this neatly handles that case of no filters and no compressors. --- zarr/core.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/zarr/core.py b/zarr/core.py index 0838117b89..1a35e193ff 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -8,7 +8,7 @@ import numpy as np -from numcodecs.compat import ensure_ndarray +from numcodecs.compat import ensure_bytes, ensure_ndarray from zarr.util import (is_total_slice, human_readable_size, normalize_resize_args, @@ -1788,6 +1788,10 @@ def _encode_chunk(self, chunk): else: cdata = chunk + # ensure in-memory data is immutable and easy to compare + if isinstance(self.chunk_store, dict): + cdata = ensure_bytes(cdata) + return cdata def __repr__(self): From 62337dc8f1c7e530d4984653bf608af2855f83be Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Thu, 14 Feb 2019 01:23:51 -0500 Subject: [PATCH 08/10] Skip copying when no compressor or filter is used This copy was taken primarily to protect in-memory stores from being mutated by external views of the array. However all stores we define (including in-memory ones like `DictStore`) already perform this safeguarding themselves on the values they store. For the builtin `dict` store, we perform this safeguarding for it (since `dict`'s don't do this themselves) by ensuring we only store `bytes` objects into them. As these are immutable and own their own data, there isn't a way to mutate their content after storing them. Thus this preventive copy here is not needed and can be dropped. --- zarr/core.py | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/zarr/core.py b/zarr/core.py index 1a35e193ff..29d54e4750 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -1677,21 +1677,11 @@ def _chunk_setitem_nosync(self, chunk_coords, chunk_selection, value, fields=Non else: - if not self._compressor and not self._filters: - - # https://github.com/alimanfoo/zarr/issues/79 - # Ensure a copy is taken so we don't end up storing - # a view into someone else's array. - # N.B., this assumes that filters or compressor always - # take a copy and never attempt to apply encoding in-place. - chunk = np.array(value, dtype=self._dtype, order=self._order) - + # ensure array is contiguous + if self._order == 'F': + chunk = np.asfortranarray(value, dtype=self._dtype) else: - # ensure array is contiguous - if self._order == 'F': - chunk = np.asfortranarray(value, dtype=self._dtype) - else: - chunk = np.ascontiguousarray(value, dtype=self._dtype) + chunk = np.ascontiguousarray(value, dtype=self._dtype) else: # partially replace the contents of this chunk From 13ada7f8cc4c68c6ec7f5a2c0ef5b4598a522ddd Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Thu, 14 Feb 2019 01:23:53 -0500 Subject: [PATCH 09/10] Note dict stores bytes and preemptive copy dropped --- docs/release.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/release.rst b/docs/release.rst index 65bd94c45f..49861268d4 100644 --- a/docs/release.rst +++ b/docs/release.rst @@ -56,6 +56,11 @@ Bug fixes * Always use a ``tuple`` when indexing a NumPy ``ndarray``. By :user:`John Kirkham `, :issue:`376` +* Ensure when ``Array`` uses a ``dict``-based chunk store that it only contains + ``bytes`` to facilitate comparisons and protect against writes. Drop the copy + for the no filter/compressor case as this handles that case. + By :user:`John Kirkham `, :issue:`359` + Maintenance ~~~~~~~~~~~ From 582ae76ad7103fdf9c971f5d050eb801c335b65a Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Thu, 14 Feb 2019 01:38:50 -0500 Subject: [PATCH 10/10] Ignore fail case coverage in bytes-like value test --- zarr/tests/test_core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index 41e9e9b3b1..7c168950df 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -93,7 +93,7 @@ def test_store_has_binary_values(self): for v in z.chunk_store.values(): try: ensure_ndarray(v) - except TypeError: + except TypeError: # pragma: no cover pytest.fail("Non-bytes-like value: %s" % repr(v)) def test_store_has_bytes_values(self):