From 7eed366397a4c8bf98f8bb4adac34fa022ac4532 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Fri, 30 Nov 2018 12:18:30 -0500 Subject: [PATCH 01/21] Bump Numcodecs requirement to 0.6.1 --- requirements_dev.txt | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements_dev.txt b/requirements_dev.txt index 2ad18f372c..d39ba9e9b8 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -1,3 +1,3 @@ asciitree==0.3.3 fasteners==0.14.1 -numcodecs==0.5.5 +numcodecs==0.6.1 diff --git a/setup.py b/setup.py index a5e8334e43..903af3bc04 100644 --- a/setup.py +++ b/setup.py @@ -26,7 +26,7 @@ 'asciitree', 'numpy>=1.7', 'fasteners', - 'numcodecs>=0.5.3', + 'numcodecs>=0.6.1', ], package_dir={'': '.'}, packages=['zarr', 'zarr.tests'], From 2552f620191cafa72429566f4a8ce4f49b4db4d3 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Fri, 30 Nov 2018 13:06:29 -0500 Subject: [PATCH 02/21] Assert MsgPack round-trips bytes objects correctly Previously MsgPack was turning bytes objects to unicode objects when round-tripping them. However this has been fixed in the latest version of Numcodecs. So correct this test now that MsgPack is working correctly. 
--- zarr/tests/test_core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index 11891f8fe9..544ec95c41 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -982,7 +982,7 @@ def test_object_arrays(self): z[0] = 'foo' assert z[0] == 'foo' z[1] = b'bar' - assert z[1] == 'bar' # msgpack gets this wrong + assert z[1] == b'bar' z[2] = 1 assert z[2] == 1 z[3] = [2, 4, 6, 'baz'] From aee5aceced5e5a3f2698f2363540f064c200f4a9 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Sat, 1 Dec 2018 14:09:40 +0000 Subject: [PATCH 03/21] properly guard against removal of object codec --- zarr/core.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/zarr/core.py b/zarr/core.py index b4da45cd99..bcae03cb9f 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -8,6 +8,7 @@ import numpy as np +from numcodecs.compat import ensure_contiguous_ndarray from zarr.util import (is_total_slice, human_readable_size, normalize_resize_args, @@ -1743,18 +1744,25 @@ def _decode_chunk(self, cdata): for f in self._filters[::-1]: chunk = f.decode(chunk) - # view as correct dtype + # view as numpy array with correct dtype if self._dtype == object: - if isinstance(chunk, np.ndarray): - chunk = chunk.astype(self._dtype) + # special case object dtype, because incorrect handling can lead to + # segfaults and other bad things happening + if isinstance(chunk, np.ndarray) and chunk.dtype == object: + # chunk is already of correct dtype, good to carry on + # flatten just to be sure we can reshape later + chunk = chunk.reshape(-1, order='A') else: + # If we end up here, someone must have hacked around with the filters. + # We cannot deal with object arrays unless there is an object + # codec in the filter chain, i.e., a filter that converts from object + # array to something else during encoding, and converts back to object + # array during decoding. 
raise RuntimeError('cannot read object array without object codec') - elif isinstance(chunk, np.ndarray): - chunk = chunk.view(self._dtype) else: - chunk = np.frombuffer(chunk, dtype=self._dtype) + chunk = ensure_contiguous_ndarray(chunk).view(self._dtype) - # reshape + # ensure correct chunk shape chunk = chunk.reshape(self._chunks, order=self._order) return chunk From bf4eee8cc763b1917e299fcfde04a5e5d9a0938b Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Sat, 1 Dec 2018 13:21:24 -0500 Subject: [PATCH 04/21] Ensure `chunk` in `_decode_chunk` is an `ndarray` --- zarr/core.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/zarr/core.py b/zarr/core.py index bcae03cb9f..94bd94edde 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -8,7 +8,7 @@ import numpy as np -from numcodecs.compat import ensure_contiguous_ndarray +from numcodecs.compat import ensure_ndarray, ensure_contiguous_ndarray from zarr.util import (is_total_slice, human_readable_size, normalize_resize_args, @@ -1745,10 +1745,11 @@ def _decode_chunk(self, cdata): chunk = f.decode(chunk) # view as numpy array with correct dtype + chunk = ensure_ndarray(chunk) if self._dtype == object: # special case object dtype, because incorrect handling can lead to # segfaults and other bad things happening - if isinstance(chunk, np.ndarray) and chunk.dtype == object: + if chunk.dtype == object: # chunk is already of correct dtype, good to carry on # flatten just to be sure we can reshape later chunk = chunk.reshape(-1, order='A') From b741fe12a0099cdcc0697a80b3ace31c82738cce Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Sat, 1 Dec 2018 15:47:44 -0500 Subject: [PATCH 05/21] Reshape `chunk` ourselves since it is an `ndarray` As we already ensured the `chunk` is an `ndarray` viewing the original data, there is no need for us to do that here as well. Plus the checks performed by `ensure_contiguous_ndarray` are not needed for our use case here. 
Particularly as we have already handled the unusual type cases above. We also don't need to constrain the buffer size. As such the only thing we really need is to flatten the array and make it contiguous, which is what we handle here directly. --- zarr/core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/zarr/core.py b/zarr/core.py index 94bd94edde..b5d0185faf 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -8,7 +8,7 @@ import numpy as np -from numcodecs.compat import ensure_ndarray, ensure_contiguous_ndarray +from numcodecs.compat import ensure_ndarray from zarr.util import (is_total_slice, human_readable_size, normalize_resize_args, @@ -1761,7 +1761,7 @@ def _decode_chunk(self, cdata): # array during decoding. raise RuntimeError('cannot read object array without object codec') else: - chunk = ensure_contiguous_ndarray(chunk).view(self._dtype) + chunk = chunk.reshape(-1, order='A').view(self._dtype) # ensure correct chunk shape chunk = chunk.reshape(self._chunks, order=self._order) From f3144ae6b4fdc929eb1390b1ed87ee5a35e6862f Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Sat, 1 Dec 2018 15:47:50 -0500 Subject: [PATCH 06/21] Refactor `reshape` from `_decode_chunk` As both the expected `object` case and the non-`object` case perform a `reshape` to flatten the data, go ahead and refactor that out of both cases and handle it generally. Simplifies the code a bit. 
--- zarr/core.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/zarr/core.py b/zarr/core.py index b5d0185faf..ab5de14512 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -1749,11 +1749,7 @@ def _decode_chunk(self, cdata): if self._dtype == object: # special case object dtype, because incorrect handling can lead to # segfaults and other bad things happening - if chunk.dtype == object: # chunk is already of correct dtype, good to carry on # flatten just to be sure we can reshape later - chunk = chunk.reshape(-1, order='A') - else: + if chunk.dtype != object: # If we end up here, someone must have hacked around with the filters. # We cannot deal with object arrays unless there is an object # codec in the filter chain, i.e., a filter that converts from object # array to something else during encoding, and converts back to object # array during decoding. raise RuntimeError('cannot read object array without object codec') else: - chunk = chunk.reshape(-1, order='A').view(self._dtype) + chunk = chunk.view(self._dtype) # ensure correct chunk shape + chunk = chunk.reshape(-1, order='A') chunk = chunk.reshape(self._chunks, order=self._order) return chunk From 3e3920af230e059e84f70563c4f215d60f845aed Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Sat, 1 Dec 2018 15:47:53 -0500 Subject: [PATCH 07/21] Consolidate type checks in `_decode_chunk` As refactoring of the `reshape` step has effectively dropped the expected `object` type case, the checks for different types are a little more complicated than needed. To fix this, basically invert and swap the case ordering. This way we can handle all generally expected types first and simply cast them. Then we can raise if an `object` type shows up and is unexpected. 
--- zarr/core.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/zarr/core.py b/zarr/core.py index ab5de14512..a2a07a29ba 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -1746,18 +1746,17 @@ def _decode_chunk(self, cdata): # view as numpy array with correct dtype chunk = ensure_ndarray(chunk) - if self._dtype == object: - # special case object dtype, because incorrect handling can lead to - # segfaults and other bad things happening - if chunk.dtype != object: - # If we end up here, someone must have hacked around with the filters. - # We cannot deal with object arrays unless there is an object - # codec in the filter chain, i.e., a filter that converts from object - # array to something else during encoding, and converts back to object - # array during decoding. - raise RuntimeError('cannot read object array without object codec') - else: + # special case object dtype, because incorrect handling can lead to + # segfaults and other bad things happening + if self._dtype != object: chunk = chunk.view(self._dtype) + elif chunk.dtype != object: + # If we end up here, someone must have hacked around with the filters. + # We cannot deal with object arrays unless there is an object + # codec in the filter chain, i.e., a filter that converts from object + # array to something else during encoding, and converts back to object + # array during decoding. + raise RuntimeError('cannot read object array without object codec') # ensure correct chunk shape chunk = chunk.reshape(-1, order='A') From a61842bdeeaf7988c1e1886b1beb4c461d829c90 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Sat, 1 Dec 2018 20:06:36 -0500 Subject: [PATCH 08/21] Ensure `DictStore` uses `bytes` to store blobs The `DictStore` is pretty reliant on the fact that values are immutable and can be easily compared. For example `__eq__` assumes that all contents can be compared easily. This works fine if the data is `bytes`. 
However it doesn't really work for `ndarray`s for example. Previously we would have stored whatever the user gave us here. This means comparisons could fall down in those cases as well (much as the example in the tutorial has highlighted on CI). Now we effectively require that the data be something that can either be coerced to `bytes` (e.g. via the new/old buffer protocol) or is `bytes` to begin with. Make sure not to force this requirement when nesting one `MutableMapping` within another. --- zarr/storage.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/zarr/storage.py b/zarr/storage.py index 6720b42d12..3bfc2e9919 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -40,6 +40,7 @@ from zarr.meta import encode_array_metadata, encode_group_metadata from zarr.compat import PY2, binary_type, OrderedDict_move_to_end from numcodecs.registry import codec_registry +from numcodecs.compat import ensure_bytes from zarr.errors import (err_contains_group, err_contains_array, err_bad_compressor, err_fspath_exists_notdir, err_read_only, MetadataError) @@ -554,6 +555,8 @@ def __getitem__(self, item): def __setitem__(self, item, value): with self.write_mutex: parent, key = self._require_parent(item) + if not isinstance(value, self.cls): + value = ensure_bytes(value) parent[key] = value def __delitem__(self, item): From 0e05be0b2a813fdd75a2418916c3c733a11a0392 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Sat, 1 Dec 2018 20:11:03 -0500 Subject: [PATCH 09/21] Drop `test_getsize_ext` This test case seems to be ill-posed. Anytime we store `object`s to `Array`s we require an `object_codec` to be specified. Otherwise we have no clean way to serialize the data. However this `DictStore` test breaks that assumption by explicitly storing an `object` type in it even though this would never work for the other stores (particularly when working with `Array`s). This includes in-memory Zarr `Array`s, which would be backed by `DictStore`. 
Given this, we go ahead and drop this test case. --- zarr/tests/test_storage.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/zarr/tests/test_storage.py b/zarr/tests/test_storage.py index 33c65f36c9..58c079b965 100644 --- a/zarr/tests/test_storage.py +++ b/zarr/tests/test_storage.py @@ -633,14 +633,6 @@ def test_setdel(self): store = self.create_store() setdel_hierarchy_checks(store) - def test_getsize_ext(self): - store = self.create_store() - store['a'] = list(range(10)) - store['b/c'] = list(range(10)) - assert -1 == store.getsize() - assert -1 == store.getsize('a') - assert -1 == store.getsize('b') - class TestDirectoryStore(StoreTests, unittest.TestCase): From bbf783eaa31f73c9d00e547ca34980ab48e49d18 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Sat, 1 Dec 2018 20:39:36 -0500 Subject: [PATCH 10/21] Change default store to `DictStore` Instead of using a Python `dict` as the `default` store for a Zarr `Array`, use the `DictStore`. This ensures that all blobs will be represented as `bytes` regardless of what the user provided as data. Thus things like comparisons of stores will work well in the default case. 
--- zarr/creation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/zarr/creation.py b/zarr/creation.py index 0184a4a5da..b46adc5b38 100644 --- a/zarr/creation.py +++ b/zarr/creation.py @@ -7,7 +7,7 @@ from zarr.core import Array -from zarr.storage import (DirectoryStore, init_array, contains_array, contains_group, +from zarr.storage import (DictStore, DirectoryStore, init_array, contains_array, contains_group, default_compressor, normalize_storage_path, ZipStore) from numcodecs.registry import codec_registry from zarr.errors import err_contains_array, err_contains_group, err_array_not_found @@ -125,7 +125,7 @@ def create(shape, chunks=True, dtype=None, compressor='default', return z -def normalize_store_arg(store, clobber=False, default=dict): +def normalize_store_arg(store, clobber=False, default=DictStore): if store is None: return default() elif isinstance(store, str): From b56c2dda0eb17436502c2dc92f93e5fd910d79a6 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Sat, 1 Dec 2018 20:52:16 -0500 Subject: [PATCH 11/21] Update `DictStore` docs to note `Array` uses it --- zarr/storage.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/zarr/storage.py b/zarr/storage.py index 3bfc2e9919..bac4ce7c7a 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -485,12 +485,11 @@ class DictStore(MutableMapping): >>> type(g.store) - Note that the default class when creating an array is the built-in - :class:`dict` class, i.e.:: + Also this is the default class when creating an array. E.g.:: >>> z = zarr.zeros(100) >>> type(z.store) - + Notes ----- From cf781adf70fffcd5fa8b0e32de224dca6f1ec54f Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Sat, 1 Dec 2018 22:34:52 -0500 Subject: [PATCH 12/21] Update `Array`'s `info` examples As we are now using `DictStore` to back the `Array`, we can correctly measure how much memory it is using. So update the examples in `info` and the tutorial to show how much memory is being used. 
Also update the store type listed in info as well. --- docs/tutorial.rst | 8 ++++---- zarr/core.py | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 606b5acef5..fe65145cb0 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -176,7 +176,7 @@ print some diagnostics, e.g.:: Read-only : False Compressor : Blosc(cname='zstd', clevel=3, shuffle=BITSHUFFLE, : blocksize=0) - Store type : builtins.dict + Store type : zarr.storage.DictStore No. bytes : 400000000 (381.5M) No. bytes stored : 3242241 (3.1M) Storage ratio : 123.4 @@ -268,7 +268,7 @@ Here is an example using a delta filter with the Blosc compressor:: Read-only : False Filter [0] : Delta(dtype=' Date: Sat, 1 Dec 2018 22:40:57 -0500 Subject: [PATCH 13/21] Drop `ensure_bytes` definition from `zarr.storage` As Numcodecs now includes a very versatile and effective `ensure_bytes` function, there is no need to define our own in `zarr.storage` as well. So go ahead and drop it. 
--- zarr/storage.py | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/zarr/storage.py b/zarr/storage.py index bac4ce7c7a..7a6639af53 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -445,23 +445,6 @@ def _init_group_metadata(store, overwrite=False, path=None, chunk_store=None): store[key] = encode_group_metadata(meta) -def ensure_bytes(s): - if isinstance(s, binary_type): - return s - if isinstance(s, np.ndarray): - if PY2: # pragma: py3 no cover - # noinspection PyArgumentList - return s.tostring(order='A') - else: # pragma: py2 no cover - # noinspection PyArgumentList - return s.tobytes(order='A') - if hasattr(s, 'tobytes'): - return s.tobytes() - if PY2 and hasattr(s, 'tostring'): # pragma: py3 no cover - return s.tostring() - return memoryview(s).tobytes() - - def _dict_store_keys(d, prefix='', cls=dict): for k in d.keys(): v = d[k] From cb14850ed0d1eeac8aeb7e40f48082dbfcd27a66 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Sat, 1 Dec 2018 22:48:37 -0500 Subject: [PATCH 14/21] Drop import of `binary_type` in `zarr.storage` As this is no longer being used by `ensure_bytes` as that function was dropped, go ahead and drop `binary_type` as well. 
--- zarr/storage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zarr/storage.py b/zarr/storage.py index 7a6639af53..9178e4a6b5 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -38,7 +38,7 @@ normalize_storage_path, buffer_size, normalize_fill_value, nolock, normalize_dtype) from zarr.meta import encode_array_metadata, encode_group_metadata -from zarr.compat import PY2, binary_type, OrderedDict_move_to_end +from zarr.compat import PY2, OrderedDict_move_to_end from numcodecs.registry import codec_registry from numcodecs.compat import ensure_bytes from zarr.errors import (err_contains_group, err_contains_array, err_bad_compressor, From d299b1be50fde8ddaecefc3b04a8eaf7c8dcef6c Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Sat, 1 Dec 2018 23:07:59 -0500 Subject: [PATCH 15/21] Take flattened array views to avoid some copies Make use of Numcodecs' `ensure_ndarray` to take `ndarray` views onto buffers to be stored in a few cases so as to reshape them and avoid a copy (thanks to the buffer protocol). 
--- zarr/storage.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/zarr/storage.py b/zarr/storage.py index 9178e4a6b5..2fb7ad5433 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -31,16 +31,13 @@ import warnings -import numpy as np - - from zarr.util import (normalize_shape, normalize_chunks, normalize_order, normalize_storage_path, buffer_size, normalize_fill_value, nolock, normalize_dtype) from zarr.meta import encode_array_metadata, encode_group_metadata from zarr.compat import PY2, OrderedDict_move_to_end from numcodecs.registry import codec_registry -from numcodecs.compat import ensure_bytes +from numcodecs.compat import ensure_bytes, ensure_ndarray from zarr.errors import (err_contains_group, err_contains_array, err_bad_compressor, err_fspath_exists_notdir, err_read_only, MetadataError) @@ -726,9 +723,8 @@ def __getitem__(self, key): def __setitem__(self, key, value): - # handle F-contiguous numpy arrays - if isinstance(value, np.ndarray) and value.flags.f_contiguous: - value = ensure_bytes(value) + # coerce to flat, contiguous array (ideally without copying) + value = ensure_ndarray(value).reshape(-1, order='A') # destination path for key file_path = os.path.join(self.path, key) @@ -1177,7 +1173,7 @@ def __getitem__(self, key): def __setitem__(self, key, value): if self.mode == 'r': err_read_only() - value = ensure_bytes(value) + value = ensure_ndarray(value).reshape(-1, order='A') with self.mutex: self.zf.writestr(key, value) From 205fa163597fa1162b17865f89a6593c703e9602 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Sat, 1 Dec 2018 23:32:01 -0500 Subject: [PATCH 16/21] Simplify `buffer_size` by using `ensure_ndarray` Rewrite `buffer_size` to just use Numcodecs' `ensure_ndarray` to get an `ndarray` that views the data. Once the `ndarray` is gotten, all that is needed is to access its `nbytes` member, which returns the number of bytes that it takes up. 
--- zarr/util.py | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/zarr/util.py b/zarr/util.py index b79865bfe8..ad882c41d5 100644 --- a/zarr/util.py +++ b/zarr/util.py @@ -1,6 +1,5 @@ # -*- coding: utf-8 -*- from __future__ import absolute_import, print_function, division -import operator from textwrap import TextWrapper, dedent import numbers import uuid @@ -10,10 +9,11 @@ from asciitree import BoxStyle, LeftAligned from asciitree.traversal import Traversal import numpy as np +from numcodecs.compat import ensure_ndarray from numcodecs.registry import codec_registry -from zarr.compat import PY2, reduce, text_type, binary_type +from zarr.compat import PY2, text_type, binary_type # codecs to use for object dtype convenience API @@ -314,17 +314,7 @@ def normalize_storage_path(path): def buffer_size(v): - from array import array as _stdlib_array - if PY2 and isinstance(v, _stdlib_array): # pragma: py3 no cover - # special case array.array because does not support buffer - # interface in PY2 - return v.buffer_info()[1] * v.itemsize - else: # pragma: py2 no cover - v = memoryview(v) - if v.shape: - return reduce(operator.mul, v.shape) * v.itemsize - else: - return v.itemsize + return ensure_ndarray(v).nbytes def info_text_report(items): From f6880b907d2da5334b0eaf43d79462bd1e36f4db Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Sat, 1 Dec 2018 23:32:31 -0500 Subject: [PATCH 17/21] Test `getsize` for unknown size --- zarr/tests/test_storage.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/zarr/tests/test_storage.py b/zarr/tests/test_storage.py index 58c079b965..ba6164da55 100644 --- a/zarr/tests/test_storage.py +++ b/zarr/tests/test_storage.py @@ -1088,6 +1088,10 @@ def test_getsize(): assert 7 == getsize(store) assert 5 == getsize(store, 'baz') + store = dict() + store['boo'] = None + assert -1 == getsize(store) + def test_migrate_1to2(): from zarr import meta_v1 From bcee828b7275a5c615124f2b196fcebfa46340fb Mon Sep 17 
00:00:00 2001 From: John Kirkham Date: Sat, 1 Dec 2018 23:42:56 -0500 Subject: [PATCH 18/21] Simplify `ensure_str` in `zarr.meta` If the data is already a `str` instance, turn `ensure_str` into a no-op. For all other cases, make use of Numcodecs' `ensure_bytes` to aid `ensure_str` in coercing data through the buffer protocol. If we are on Python 3, then decode the `bytes` object to a `str`. --- zarr/meta.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/zarr/meta.py b/zarr/meta.py index 9ce580eff2..7984efb701 100644 --- a/zarr/meta.py +++ b/zarr/meta.py @@ -5,9 +5,10 @@ import numpy as np +from numcodecs.compat import ensure_bytes -from zarr.compat import PY2, binary_type, Mapping +from zarr.compat import PY2, Mapping from zarr.errors import MetadataError @@ -15,14 +16,9 @@ def ensure_str(s): - if PY2: # pragma: py3 no cover - # noinspection PyUnresolvedReferences - if isinstance(s, buffer): # noqa - s = str(s) - else: # pragma: py2 no cover - if isinstance(s, memoryview): - s = s.tobytes() - if isinstance(s, binary_type): + if not isinstance(s, str): + s = ensure_bytes(s) + if not PY2: # pragma: py2 no cover s = s.decode('ascii') return s From dfa51f8d8e85f1230f8146f04b84ab74ed83e6e9 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Sat, 1 Dec 2018 23:55:30 -0500 Subject: [PATCH 19/21] Drop unknown size cases from `DictStore` As `DictStore` now must only store `bytes` or types coercible to bytes via the buffer protocol, there is no possibility for it to have unknown sizes as `bytes` always have a known size. So drop these cases where the size can be `-1`. 
--- zarr/storage.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/zarr/storage.py b/zarr/storage.py index 2fb7ad5433..6e57315377 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -634,17 +634,11 @@ def getsize(self, path=None): size = 0 for v in value.values(): if not isinstance(v, self.cls): - try: - size += buffer_size(v) - except TypeError: - return -1 + size += buffer_size(v) return size else: - try: - return buffer_size(value) - except TypeError: - return -1 + return buffer_size(value) def clear(self): with self.write_mutex: From d0b80126348c0d447e3662d4806b820853e30db9 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Sun, 2 Dec 2018 00:29:17 -0500 Subject: [PATCH 20/21] Cast datetime/timedelta arrays for buffer protocol Make sure that datetime/timedelta arrays are cast to a type that supports the buffer protocol. Ensure this is a type that can handle all of the datetime/timedelta values and has the same itemsize. --- zarr/storage.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/zarr/storage.py b/zarr/storage.py index 6e57315377..228e9cccb5 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -719,6 +719,8 @@ def __setitem__(self, key, value): # coerce to flat, contiguous array (ideally without copying) value = ensure_ndarray(value).reshape(-1, order='A') + if value.dtype.kind in 'mM': + value = value.view('i8') # destination path for key file_path = os.path.join(self.path, key) @@ -1168,6 +1170,8 @@ def __setitem__(self, key, value): if self.mode == 'r': err_read_only() value = ensure_ndarray(value).reshape(-1, order='A') + if value.dtype.kind in 'mM': + value = value.view('i8') with self.mutex: self.zf.writestr(key, value) From 0cf5e5b7ee9fbaf757869b73b566bbeb06cd7e7b Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Sun, 2 Dec 2018 00:33:41 -0500 Subject: [PATCH 21/21] Use `ensure_contiguous_ndarray` with stores Instead of using `ensure_ndarray`, use `ensure_contiguous_ndarray` with the stores. 
This ensures that datetime/timedeltas are handled by default. Also catches things like object arrays. Finally this handles flattening the array if needed. --- zarr/storage.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/zarr/storage.py b/zarr/storage.py index 228e9cccb5..e013924d91 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -37,7 +37,7 @@ from zarr.meta import encode_array_metadata, encode_group_metadata from zarr.compat import PY2, OrderedDict_move_to_end from numcodecs.registry import codec_registry -from numcodecs.compat import ensure_bytes, ensure_ndarray +from numcodecs.compat import ensure_bytes, ensure_contiguous_ndarray from zarr.errors import (err_contains_group, err_contains_array, err_bad_compressor, err_fspath_exists_notdir, err_read_only, MetadataError) @@ -718,9 +718,7 @@ def __getitem__(self, key): def __setitem__(self, key, value): # coerce to flat, contiguous array (ideally without copying) - value = ensure_ndarray(value).reshape(-1, order='A') - if value.dtype.kind in 'mM': - value = value.view('i8') + value = ensure_contiguous_ndarray(value) # destination path for key file_path = os.path.join(self.path, key) @@ -1169,9 +1167,7 @@ def __getitem__(self, key): def __setitem__(self, key, value): if self.mode == 'r': err_read_only() - value = ensure_ndarray(value).reshape(-1, order='A') - if value.dtype.kind in 'mM': - value = value.view('i8') + value = ensure_contiguous_ndarray(value) with self.mutex: self.zf.writestr(key, value)