Skip to content

Special case chunk encoding for dict chunk store #359

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Feb 16, 2019
5 changes: 5 additions & 0 deletions docs/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,11 @@ Bug fixes
* Always use a ``tuple`` when indexing a NumPy ``ndarray``.
By :user:`John Kirkham <jakirkham>`, :issue:`376`

* Ensure when ``Array`` uses a ``dict``-based chunk store that it only contains
``bytes`` to facilitate comparisons and protect against writes. Drop the copy
for the no filter/compressor case as this handles that case.
By :user:`John Kirkham <jakirkham>`, :issue:`359`

Maintenance
~~~~~~~~~~~

Expand Down
24 changes: 9 additions & 15 deletions zarr/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@


import numpy as np
from numcodecs.compat import ensure_ndarray
from numcodecs.compat import ensure_bytes, ensure_ndarray


from zarr.util import (is_total_slice, human_readable_size, normalize_resize_args,
Expand Down Expand Up @@ -1677,21 +1677,11 @@ def _chunk_setitem_nosync(self, chunk_coords, chunk_selection, value, fields=Non

else:

if not self._compressor and not self._filters:

# https://github.com/alimanfoo/zarr/issues/79
# Ensure a copy is taken so we don't end up storing
# a view into someone else's array.
# N.B., this assumes that filters or compressor always
# take a copy and never attempt to apply encoding in-place.
chunk = np.array(value, dtype=self._dtype, order=self._order)

# ensure array is contiguous
if self._order == 'F':
chunk = np.asfortranarray(value, dtype=self._dtype)
else:
# ensure array is contiguous
if self._order == 'F':
chunk = np.asfortranarray(value, dtype=self._dtype)
else:
chunk = np.ascontiguousarray(value, dtype=self._dtype)
chunk = np.ascontiguousarray(value, dtype=self._dtype)

else:
# partially replace the contents of this chunk
Expand Down Expand Up @@ -1788,6 +1778,10 @@ def _encode_chunk(self, chunk):
else:
cdata = chunk

# ensure in-memory data is immutable and easy to compare
if isinstance(self.chunk_store, dict):
cdata = ensure_bytes(cdata)

return cdata

def __repr__(self):
Expand Down
38 changes: 37 additions & 1 deletion zarr/tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from zarr.util import buffer_size
from numcodecs import (Delta, FixedScaleOffset, Zlib, Blosc, BZ2, MsgPack, Pickle,
Categorize, JSON, VLenUTF8, VLenBytes, VLenArray)
from numcodecs.compat import ensure_bytes, ensure_ndarray
from numcodecs.tests.common import greetings


Expand Down Expand Up @@ -83,6 +84,31 @@ def create_array(self, read_only=False, **kwargs):
return Array(store, read_only=read_only, cache_metadata=cache_metadata,
cache_attrs=cache_attrs)

def test_store_has_binary_values(self):
# Initialize array
np.random.seed(42)
z = self.create_array(shape=(1050,), chunks=100, dtype='f8', compressor=[])
z[:] = np.random.random(z.shape)

for v in z.chunk_store.values():
try:
ensure_ndarray(v)
except TypeError: # pragma: no cover
pytest.fail("Non-bytes-like value: %s" % repr(v))

def test_store_has_bytes_values(self):
# Test that many stores do hold bytes values.
# Though this is not a strict requirement.
# Should be disabled by any stores that fail this as needed.

# Initialize array
np.random.seed(42)
z = self.create_array(shape=(1050,), chunks=100, dtype='f8', compressor=[])
z[:] = np.random.random(z.shape)

# Check in-memory array only contains `bytes`
assert all([isinstance(v, binary_type) for v in z.chunk_store.values()])

def test_nbytes_stored(self):

# dict as store
Expand Down Expand Up @@ -1401,6 +1427,9 @@ def create_array(read_only=False, **kwargs):
return Array(store, read_only=read_only, cache_metadata=cache_metadata,
cache_attrs=cache_attrs)

def test_store_has_bytes_values(self):
pass # returns values as memoryviews/buffers instead of bytes

def test_nbytes_stored(self):
pass # not implemented

Expand Down Expand Up @@ -1725,6 +1754,9 @@ def __init__(self):
def keys(self):
return self.inner.keys()

def values(self):
return self.inner.values()

def get(self, item, default=None):
try:
return self.inner[item]
Expand All @@ -1735,7 +1767,7 @@ def __getitem__(self, item):
return self.inner[item]

def __setitem__(self, item, value):
self.inner[item] = value
self.inner[item] = ensure_bytes(value)

def __delitem__(self, key):
del self.inner[key]
Expand Down Expand Up @@ -1846,3 +1878,7 @@ def create_array(read_only=False, **kwargs):
init_array(store, **kwargs)
return Array(store, read_only=read_only, cache_metadata=cache_metadata,
cache_attrs=cache_attrs)

def test_store_has_bytes_values(self):
# skip as the cache has no control over how the store provides values
pass