
Bump Numcodecs requirement to 0.6.1 #347

Closed
wants to merge 21 commits into from
Changes from all commits (21 commits)
7eed366
Bump Numcodecs requirement to 0.6.1
jakirkham Nov 30, 2018
2552f62
Assert MsgPack round-trips bytes objects correctly
jakirkham Nov 30, 2018
aee5ace
properly guard against removal of object codec
alimanfoo Dec 1, 2018
bf4eee8
Ensure `chunk` in `_decode_chunk` is an `ndarray`
jakirkham Dec 1, 2018
b741fe1
Reshape `chunk` ourselves since it is an `ndarray`
jakirkham Dec 1, 2018
f3144ae
Refactor `reshape` from `_decode_chunk`
jakirkham Dec 1, 2018
3e3920a
Consolidate type checks in `_decode_chunk`
jakirkham Dec 1, 2018
a61842b
Ensure `DictStore` uses `bytes` to store blobs
jakirkham Dec 2, 2018
0e05be0
Drop `test_getsize_ext`
jakirkham Dec 2, 2018
bbf783e
Change default store to `DictStore`
jakirkham Dec 2, 2018
b56c2dd
Update `DictStore` docs to note `Array` uses it
jakirkham Dec 2, 2018
cf781ad
Update `Array`'s `info` examples
jakirkham Dec 2, 2018
39e3ab8
Drop `ensure_bytes` definition from `zarr.storage`
jakirkham Dec 2, 2018
cb14850
Drop import of `binary_type` in `zarr.storage`
jakirkham Dec 2, 2018
d299b1b
Take flattened array views to avoid some copies
jakirkham Dec 2, 2018
205fa16
Simplify `buffer_size` by using `ensure_ndarray`
jakirkham Dec 2, 2018
f6880b9
Test `getsize` for unknown size
jakirkham Dec 2, 2018
bcee828
Simplify `ensure_str` in `zarr.meta`
jakirkham Dec 2, 2018
dfa51f8
Drop unknown size cases from `DictStore`
jakirkham Dec 2, 2018
d0b8012
Cast datetime/timedelta arrays for buffer protocol
jakirkham Dec 2, 2018
0cf5e5b
Use `ensure_contiguous_ndarray` with stores
jakirkham Dec 2, 2018
8 changes: 4 additions & 4 deletions docs/tutorial.rst
@@ -176,7 +176,7 @@ print some diagnostics, e.g.::
Read-only : False
Compressor : Blosc(cname='zstd', clevel=3, shuffle=BITSHUFFLE,
: blocksize=0)
Store type : builtins.dict
Store type : zarr.storage.DictStore
No. bytes : 400000000 (381.5M)
No. bytes stored : 3242241 (3.1M)
Storage ratio : 123.4
@@ -268,7 +268,7 @@ Here is an example using a delta filter with the Blosc compressor::
Read-only : False
Filter [0] : Delta(dtype='<i4')
Compressor : Blosc(cname='zstd', clevel=1, shuffle=SHUFFLE, blocksize=0)
Store type : builtins.dict
Store type : zarr.storage.DictStore
No. bytes : 400000000 (381.5M)
No. bytes stored : 648605 (633.4K)
Storage ratio : 616.7
@@ -1181,7 +1181,7 @@ ratios, depending on the correlation structure within the data. E.g.::
Order : C
Read-only : False
Compressor : Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)
Store type : builtins.dict
Store type : zarr.storage.DictStore
No. bytes : 400000000 (381.5M)
No. bytes stored : 6696010 (6.4M)
Storage ratio : 59.7
@@ -1195,7 +1195,7 @@ ratios, depending on the correlation structure within the data. E.g.::
Order : F
Read-only : False
Compressor : Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)
Store type : builtins.dict
Store type : zarr.storage.DictStore
No. bytes : 400000000 (381.5M)
No. bytes stored : 4684636 (4.5M)
Storage ratio : 85.4
2 changes: 1 addition & 1 deletion requirements_dev.txt
@@ -1,3 +1,3 @@
asciitree==0.3.3
fasteners==0.14.1
numcodecs==0.5.5
numcodecs==0.6.1
2 changes: 1 addition & 1 deletion setup.py
@@ -26,7 +26,7 @@
'asciitree',
'numpy>=1.7',
'fasteners',
'numcodecs>=0.5.3',
'numcodecs>=0.6.1',
],
package_dir={'': '.'},
packages=['zarr', 'zarr.tests'],
33 changes: 19 additions & 14 deletions zarr/core.py
@@ -8,6 +8,7 @@


import numpy as np
from numcodecs.compat import ensure_ndarray


from zarr.util import (is_total_slice, human_readable_size, normalize_resize_args,
@@ -1743,18 +1744,22 @@ def _decode_chunk(self, cdata):
for f in self._filters[::-1]:
chunk = f.decode(chunk)

# view as correct dtype
if self._dtype == object:
if isinstance(chunk, np.ndarray):
chunk = chunk.astype(self._dtype)
else:
raise RuntimeError('cannot read object array without object codec')
elif isinstance(chunk, np.ndarray):
# view as numpy array with correct dtype
chunk = ensure_ndarray(chunk)
# special case object dtype, because incorrect handling can lead to
# segfaults and other bad things happening
if self._dtype != object:
chunk = chunk.view(self._dtype)
else:
chunk = np.frombuffer(chunk, dtype=self._dtype)

# reshape
elif chunk.dtype != object:
# If we end up here, someone must have hacked around with the filters.
# We cannot deal with object arrays unless there is an object
# codec in the filter chain, i.e., a filter that converts from object
# array to something else during encoding, and converts back to object
# array during decoding.
raise RuntimeError('cannot read object array without object codec')

# ensure correct chunk shape
chunk = chunk.reshape(-1, order='A')
chunk = chunk.reshape(self._chunks, order=self._order)

return chunk
@@ -1806,10 +1811,10 @@ def info(self):
Order : C
Read-only : False
Compressor : Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)
Store type : builtins.dict
Store type : zarr.storage.DictStore
No. bytes : 4000000 (3.8M)
No. bytes stored : ...
Storage ratio : ...
No. bytes stored : 320
Storage ratio : 12500.0
Chunks initialized : 0/10

"""
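For reference, a minimal sketch of the decode path this hunk converges on (the function name `decode_chunk_sketch` is illustrative, not zarr API): the decompressed buffer is wrapped with numcodecs' `ensure_ndarray`, viewed as the target dtype unless the dtype is object, and then reshaped via a flat view.

```python
import numpy as np
from numcodecs.compat import ensure_ndarray

def decode_chunk_sketch(buf, dtype, chunks, order='C'):
    # Wrap whatever decompression/filters returned (bytes, memoryview,
    # ndarray, ...) as an ndarray, without copying where possible.
    chunk = ensure_ndarray(buf)
    # Object dtype is special-cased: it is only valid if an object codec
    # in the filter chain already produced an object array.
    if dtype != object:
        chunk = chunk.view(dtype)
    elif chunk.dtype != object:
        raise RuntimeError('cannot read object array without object codec')
    # Flatten in memory order, then reshape to the chunk shape.
    chunk = chunk.reshape(-1, order='A')
    chunk = chunk.reshape(chunks, order=order)
    return chunk
```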
4 changes: 2 additions & 2 deletions zarr/creation.py
@@ -7,7 +7,7 @@


from zarr.core import Array
from zarr.storage import (DirectoryStore, init_array, contains_array, contains_group,
from zarr.storage import (DictStore, DirectoryStore, init_array, contains_array, contains_group,
default_compressor, normalize_storage_path, ZipStore)
from numcodecs.registry import codec_registry
from zarr.errors import err_contains_array, err_contains_group, err_array_not_found
Expand Down Expand Up @@ -125,7 +125,7 @@ def create(shape, chunks=True, dtype=None, compressor='default',
return z


def normalize_store_arg(store, clobber=False, default=dict):
def normalize_store_arg(store, clobber=False, default=DictStore):
if store is None:
return default()
elif isinstance(store, str):
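The practical effect of changing `normalize_store_arg`'s default is that arrays created without an explicit store now get a `DictStore` rather than a plain `dict`. An illustrative session:

```python
import zarr

# No store argument: the default in-memory store is now DictStore
z = zarr.zeros(100, chunks=10)
print(type(z.store))   # <class 'zarr.storage.DictStore'> (was <class 'dict'>)
```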
14 changes: 5 additions & 9 deletions zarr/meta.py
@@ -5,24 +5,20 @@


import numpy as np
from numcodecs.compat import ensure_bytes


from zarr.compat import PY2, binary_type, Mapping
from zarr.compat import PY2, Mapping
from zarr.errors import MetadataError


ZARR_FORMAT = 2


def ensure_str(s):
if PY2: # pragma: py3 no cover
# noinspection PyUnresolvedReferences
if isinstance(s, buffer): # noqa
s = str(s)
else: # pragma: py2 no cover
if isinstance(s, memoryview):
s = s.tobytes()
if isinstance(s, binary_type):
if not isinstance(s, str):
s = ensure_bytes(s)
if not PY2: # pragma: py2 no cover
s = s.decode('ascii')
return s

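A rough illustration of what the simplified `ensure_str` does on Python 3 (behaviour inferred from the hunk above; `ensure_str_sketch` and the sample inputs are illustrative): metadata blobs may come back from a store as bytes, memoryview or an ndarray, and are normalised to `str` via numcodecs' `ensure_bytes` before JSON decoding.

```python
from numcodecs.compat import ensure_bytes

def ensure_str_sketch(s):
    # Python 3 variant of the helper above: coerce any buffer-like value
    # to bytes, then decode; str values pass through untouched.
    if not isinstance(s, str):
        s = ensure_bytes(s).decode('ascii')
    return s

print(ensure_str_sketch(b'{"zarr_format": 2}'))              # '{"zarr_format": 2}'
print(ensure_str_sketch(memoryview(b'{"zarr_format": 2}')))  # '{"zarr_format": 2}'
```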
47 changes: 11 additions & 36 deletions zarr/storage.py
@@ -31,15 +31,13 @@
import warnings


import numpy as np


from zarr.util import (normalize_shape, normalize_chunks, normalize_order,
normalize_storage_path, buffer_size,
normalize_fill_value, nolock, normalize_dtype)
from zarr.meta import encode_array_metadata, encode_group_metadata
from zarr.compat import PY2, binary_type, OrderedDict_move_to_end
from zarr.compat import PY2, OrderedDict_move_to_end
from numcodecs.registry import codec_registry
from numcodecs.compat import ensure_bytes, ensure_contiguous_ndarray
from zarr.errors import (err_contains_group, err_contains_array, err_bad_compressor,
err_fspath_exists_notdir, err_read_only, MetadataError)

@@ -444,23 +442,6 @@ def _init_group_metadata(store, overwrite=False, path=None, chunk_store=None):
store[key] = encode_group_metadata(meta)


def ensure_bytes(s):
if isinstance(s, binary_type):
return s
if isinstance(s, np.ndarray):
if PY2: # pragma: py3 no cover
# noinspection PyArgumentList
return s.tostring(order='A')
else: # pragma: py2 no cover
# noinspection PyArgumentList
return s.tobytes(order='A')
if hasattr(s, 'tobytes'):
return s.tobytes()
if PY2 and hasattr(s, 'tostring'): # pragma: py3 no cover
return s.tostring()
return memoryview(s).tobytes()


def _dict_store_keys(d, prefix='', cls=dict):
for k in d.keys():
v = d[k]
@@ -484,12 +465,11 @@ class DictStore(MutableMapping):
>>> type(g.store)
<class 'zarr.storage.DictStore'>

Note that the default class when creating an array is the built-in
:class:`dict` class, i.e.::
Also this is the default class when creating an array. E.g.::

>>> z = zarr.zeros(100)
>>> type(z.store)
<class 'dict'>
<class 'zarr.storage.DictStore'>

Notes
-----
@@ -554,6 +534,8 @@ def __getitem__(self, item):
def __setitem__(self, item, value):
with self.write_mutex:
parent, key = self._require_parent(item)
if not isinstance(value, self.cls):
value = ensure_bytes(value)
parent[key] = value

def __delitem__(self, item):
@@ -652,17 +634,11 @@ def getsize(self, path=None):
size = 0
for v in value.values():
if not isinstance(v, self.cls):
try:
size += buffer_size(v)
except TypeError:
return -1
size += buffer_size(v)
return size

else:
try:
return buffer_size(value)
except TypeError:
return -1
return buffer_size(value)

def clear(self):
with self.write_mutex:
@@ -741,9 +717,8 @@ def __getitem__(self, key):

def __setitem__(self, key, value):

# handle F-contiguous numpy arrays
if isinstance(value, np.ndarray) and value.flags.f_contiguous:
value = ensure_bytes(value)
# coerce to flat, contiguous array (ideally without copying)
value = ensure_contiguous_ndarray(value)

# destination path for key
file_path = os.path.join(self.path, key)
@@ -1192,7 +1167,7 @@ def __getitem__(self, key):
def __setitem__(self, key, value):
if self.mode == 'r':
err_read_only()
value = ensure_bytes(value)
value = ensure_contiguous_ndarray(value)
with self.mutex:
self.zf.writestr(key, value)

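As a rough guide to what `ensure_contiguous_ndarray` buys the stores (details as of numcodecs 0.6.x; treat the specifics as assumptions): it accepts bytes-like values and C- or F-contiguous ndarrays and exposes them as a contiguous view over the same memory, so `DirectoryStore` and `ZipStore` can write values without forcing an intermediate `bytes` copy.

```python
import numpy as np
from numcodecs.compat import ensure_contiguous_ndarray

# bytes-like values become an array viewing the same memory
buf = ensure_contiguous_ndarray(b'spam')
print(buf.nbytes)          # 4

# contiguous ndarrays (C or F order) are accepted directly, so no
# tobytes() copy is needed before writing to disk or a zip entry
a = np.asfortranarray(np.arange(10, dtype='i4').reshape(2, 5))
print(ensure_contiguous_ndarray(a).nbytes)   # 40
```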
2 changes: 1 addition & 1 deletion zarr/tests/test_core.py
@@ -982,7 +982,7 @@ def test_object_arrays(self):
z[0] = 'foo'
assert z[0] == 'foo'
z[1] = b'bar'
assert z[1] == 'bar' # msgpack gets this wrong
assert z[1] == b'bar'
z[2] = 1
assert z[2] == 1
z[3] = [2, 4, 6, 'baz']
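The test change reflects the MsgPack fix picked up with numcodecs 0.6.1: bytes stored in an object array now round-trip as bytes instead of being coerced to str. An illustrative session (assuming the `object_codec` keyword used by zarr's object-array tests):

```python
import zarr
from numcodecs import MsgPack

z = zarr.create(shape=10, chunks=3, dtype=object, object_codec=MsgPack())
z[1] = b'bar'
print(z[1] == b'bar')   # True with numcodecs >= 0.6.1; older msgpack decoded this to 'bar'
```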
12 changes: 4 additions & 8 deletions zarr/tests/test_storage.py
@@ -633,14 +633,6 @@ def test_setdel(self):
store = self.create_store()
setdel_hierarchy_checks(store)

def test_getsize_ext(self):
store = self.create_store()
store['a'] = list(range(10))
store['b/c'] = list(range(10))
assert -1 == store.getsize()
assert -1 == store.getsize('a')
assert -1 == store.getsize('b')


class TestDirectoryStore(StoreTests, unittest.TestCase):

@@ -1096,6 +1088,10 @@ def test_getsize():
assert 7 == getsize(store)
assert 5 == getsize(store, 'baz')

store = dict()
store['boo'] = None
assert -1 == getsize(store)


def test_migrate_1to2():
from zarr import meta_v1
16 changes: 3 additions & 13 deletions zarr/util.py
@@ -1,6 +1,5 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, print_function, division
import operator
from textwrap import TextWrapper, dedent
import numbers
import uuid
@@ -10,10 +9,11 @@
from asciitree import BoxStyle, LeftAligned
from asciitree.traversal import Traversal
import numpy as np
from numcodecs.compat import ensure_ndarray
from numcodecs.registry import codec_registry


from zarr.compat import PY2, reduce, text_type, binary_type
from zarr.compat import PY2, text_type, binary_type


# codecs to use for object dtype convenience API
@@ -314,17 +314,7 @@ def normalize_storage_path(path):


def buffer_size(v):
from array import array as _stdlib_array
if PY2 and isinstance(v, _stdlib_array): # pragma: py3 no cover
# special case array.array because does not support buffer
# interface in PY2
return v.buffer_info()[1] * v.itemsize
else: # pragma: py2 no cover
v = memoryview(v)
if v.shape:
return reduce(operator.mul, v.shape) * v.itemsize
else:
return v.itemsize
return ensure_ndarray(v).nbytes


def info_text_report(items):
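A quick sanity check of the simplified helper: `ensure_ndarray` wraps any buffer-exporting object as an ndarray view, so `nbytes` gives the payload size directly (values shown assume standard NumPy itemsizes).

```python
import numpy as np
from numcodecs.compat import ensure_ndarray

def buffer_size(v):
    # Size in bytes of the memory behind any buffer-like object.
    return ensure_ndarray(v).nbytes

print(buffer_size(b'spam'))                         # 4
print(buffer_size(bytearray(100)))                  # 100
print(buffer_size(np.zeros((10, 10), dtype='f8')))  # 800
```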