diff --git a/docs/release.rst b/docs/release.rst index 0810946ee6..e9c592a860 100644 --- a/docs/release.rst +++ b/docs/release.rst @@ -12,6 +12,9 @@ Enhancements * array indexing with [] (getitem and setitem) now supports fancy indexing. By :user:`Juan Nunez-Iglesias `; :issue:`725`. +* write_empty_chunks=False deletes chunks consisting of only fill_value. + By :user:`Davis Bennett `; :issue:`738`. + .. _release_2.10.2: 2.10.2 diff --git a/zarr/core.py b/zarr/core.py index f53c2b9b05..b9600467c1 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -32,6 +32,7 @@ from zarr.meta import decode_array_metadata, encode_array_metadata from zarr.storage import array_meta_key, attrs_key, getsize, listdir from zarr.util import ( + all_equal, InfoReporter, check_array_shape, human_readable_size, @@ -75,6 +76,14 @@ class Array: If True and while the chunk_store is a FSStore and the compresion used is Blosc, when getting data from the array chunks will be partially read and decompressed when possible. + write_empty_chunks : bool, optional + If True (default), all chunks will be stored regardless of their + contents. If False, each chunk is compared to the array's fill + value prior to storing. If a chunk is uniformly equal to the fill + value, then that chunk is not stored, and the store entry for + that chunk's key is deleted. This setting enables sparser storage, + as only chunks with non-fill-value data are stored, at the expense + of overhead associated with checking the data of each chunk. ..
versionadded:: 2.7 @@ -107,6 +116,7 @@ class Array: info vindex oindex + write_empty_chunks Methods ------- @@ -139,6 +149,7 @@ def __init__( cache_metadata=True, cache_attrs=True, partial_decompress=False, + write_empty_chunks=True, ): # N.B., expect at this point store is fully initialized with all # configuration metadata fully specified and normalized @@ -155,6 +166,7 @@ self._cache_metadata = cache_metadata self._is_view = False self._partial_decompress = partial_decompress + self._write_empty_chunks = write_empty_chunks # initialize metadata self._load_metadata() @@ -455,6 +467,13 @@ def vindex(self): :func:`set_mask_selection` for documentation and examples.""" return self._vindex + @property + def write_empty_chunks(self) -> bool: + """A Boolean, True if chunks composed of the array's fill value + will be stored. If False, such chunks will not be stored. + """ + return self._write_empty_chunks + def __eq__(self, other): return ( isinstance(other, Array) and @@ -1626,9 +1645,18 @@ def _set_basic_selection_zd(self, selection, value, fields=None): else: chunk[selection] = value - # encode and store - cdata = self._encode_chunk(chunk) - self.chunk_store[ckey] = cdata + # remove chunk if write_empty_chunks is false and it only contains the fill value + if (not self.write_empty_chunks) and all_equal(self.fill_value, chunk): + try: + del self.chunk_store[ckey] + return + except Exception: # pragma: no cover + # deleting failed, fallback to overwriting + pass + + # encode and store + cdata = self._encode_chunk(chunk) + self.chunk_store[ckey] = cdata def _set_basic_selection_nd(self, selection, value, fields=None): # implementation of __setitem__ for array with at least one dimension @@ -1896,11 +1924,38 @@ def _chunk_getitems(self, lchunk_coords, lchunk_selection, out, lout_selection, out[out_select] = fill_value def _chunk_setitems(self, lchunk_coords, lchunk_selection, values, fields=None): - ckeys = [self._chunk_key(co) for co in
lchunk_coords] - cdatas = [self._process_for_setitem(key, sel, val, fields=fields) - for key, sel, val in zip(ckeys, lchunk_selection, values)] - values = {k: v for k, v in zip(ckeys, cdatas)} - self.chunk_store.setitems(values) + ckeys = map(self._chunk_key, lchunk_coords) + cdatas = {key: self._process_for_setitem(key, sel, val, fields=fields) + for key, sel, val in zip(ckeys, lchunk_selection, values)} + to_store = {} + if not self.write_empty_chunks: + empty_chunks = {k: v for k, v in cdatas.items() if all_equal(self.fill_value, v)} + self._chunk_delitems(empty_chunks.keys()) + nonempty_keys = cdatas.keys() - empty_chunks.keys() + to_store = {k: self._encode_chunk(cdatas[k]) for k in nonempty_keys} + else: + to_store = {k: self._encode_chunk(v) for k, v in cdatas.items()} + self.chunk_store.setitems(to_store) + + def _chunk_delitems(self, ckeys): + if hasattr(self.chunk_store, "delitems"): + self.chunk_store.delitems(ckeys) + else: # pragma: no cover + # exempting this branch from coverage as there are no extant stores + # that will trigger this condition, but it's possible that they + # will be developed in the future. + tuple(map(self._chunk_delitem, ckeys)) + return None + + def _chunk_delitem(self, ckey): + """ + Attempt to delete the value associated with ckey. + """ + try: + del self.chunk_store[ckey] + return + except KeyError: + return def _chunk_setitem(self, chunk_coords, chunk_selection, value, fields=None): """Replace part or whole of a chunk.
@@ -1931,8 +1986,12 @@ def _chunk_setitem(self, chunk_coords, chunk_selection, value, fields=None): def _chunk_setitem_nosync(self, chunk_coords, chunk_selection, value, fields=None): ckey = self._chunk_key(chunk_coords) cdata = self._process_for_setitem(ckey, chunk_selection, value, fields=fields) - # store - self.chunk_store[ckey] = cdata + + # attempt to delete chunk if it only contains the fill value + if (not self.write_empty_chunks) and all_equal(self.fill_value, cdata): + self._chunk_delitem(ckey) + else: + self.chunk_store[ckey] = self._encode_chunk(cdata) def _process_for_setitem(self, ckey, chunk_selection, value, fields=None): if is_total_slice(chunk_selection, self._chunks) and not fields: @@ -1988,8 +2047,7 @@ def _process_for_setitem(self, ckey, chunk_selection, value, fields=None): else: chunk[chunk_selection] = value - # encode chunk - return self._encode_chunk(chunk) + return chunk def _chunk_key(self, chunk_coords): return self._key_prefix + self._dimension_separator.join(map(str, chunk_coords)) @@ -2209,7 +2267,8 @@ def hexdigest(self, hashname="sha1"): def __getstate__(self): return (self._store, self._path, self._read_only, self._chunk_store, - self._synchronizer, self._cache_metadata, self._attrs.cache) + self._synchronizer, self._cache_metadata, self._attrs.cache, + self._partial_decompress, self._write_empty_chunks) def __setstate__(self, state): self.__init__(*state) diff --git a/zarr/creation.py b/zarr/creation.py index 0e2d2041ba..75ff1d0212 100644 --- a/zarr/creation.py +++ b/zarr/creation.py @@ -21,7 +21,7 @@ def create(shape, chunks=True, dtype=None, compressor='default', fill_value=0, order='C', store=None, synchronizer=None, overwrite=False, path=None, chunk_store=None, filters=None, cache_metadata=True, cache_attrs=True, read_only=False, - object_codec=None, dimension_separator=None, **kwargs): + object_codec=None, dimension_separator=None, write_empty_chunks=True, **kwargs): """Create an array. 
Parameters @@ -71,6 +71,15 @@ def create(shape, chunks=True, dtype=None, compressor='default', dimension_separator : {'.', '/'}, optional Separator placed between the dimensions of a chunk. .. versionadded:: 2.8 + write_empty_chunks : bool, optional + If True (default), all chunks will be stored regardless of their + contents. If False, each chunk is compared to the array's fill + value prior to storing. If a chunk is uniformly equal to the fill + value, then that chunk is not stored, and the store entry for + that chunk's key is deleted. This setting enables sparser storage, + as only chunks with non-fill-value data are stored, at the expense + of overhead associated with checking the data of each chunk. + Returns ------- @@ -142,7 +151,8 @@ def create(shape, chunks=True, dtype=None, compressor='default', # instantiate array z = Array(store, path=path, chunk_store=chunk_store, synchronizer=synchronizer, - cache_metadata=cache_metadata, cache_attrs=cache_attrs, read_only=read_only) + cache_metadata=cache_metadata, cache_attrs=cache_attrs, read_only=read_only, + write_empty_chunks=write_empty_chunks) return z @@ -400,6 +410,7 @@ def open_array( chunk_store=None, storage_options=None, partial_decompress=False, + write_empty_chunks=True, **kwargs ): """Open an array using file-mode-like semantics. @@ -454,8 +465,14 @@ def open_array( If True and while the chunk_store is a FSStore and the compresion used is Blosc, when getting data from the array chunks will be partially read and decompressed when possible. - - .. versionadded:: 2.7 + write_empty_chunks : bool, optional + If True (default), all chunks will be stored regardless of their + contents. If False, each chunk is compared to the array's fill + value prior to storing. If a chunk is uniformly equal to the fill + value, then that chunk is not stored, and the store entry for + that chunk's key is deleted.
This setting enables sparser storage, + as only chunks with non-fill-value data are stored, at the expense + of overhead associated with checking the data of each chunk. Returns ------- @@ -545,7 +562,7 @@ def open_array( # instantiate array z = Array(store, read_only=read_only, synchronizer=synchronizer, cache_metadata=cache_metadata, cache_attrs=cache_attrs, path=path, - chunk_store=chunk_store) + chunk_store=chunk_store, write_empty_chunks=write_empty_chunks) return z diff --git a/zarr/storage.py b/zarr/storage.py index 6ce2f88e1c..92be9df0aa 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -1154,6 +1154,15 @@ def __delitem__(self, key): else: del self.map[key] + def delitems(self, keys): + if self.mode == 'r': + raise ReadOnlyError() + # only remove the keys that exist in the store + nkeys = [self._normalize_key(key) for key in keys if key in self] + # rm errors if you pass an empty collection + if len(nkeys) > 0: + self.map.delitems(nkeys) + def __contains__(self, key): key = self._normalize_key(key) return key in self.map diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index be2feffe8a..4544a6cae9 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -86,9 +86,10 @@ def create_array(self, read_only=False, **kwargs): kwargs.setdefault('compressor', Zlib(level=1)) cache_metadata = kwargs.pop('cache_metadata', True) cache_attrs = kwargs.pop('cache_attrs', True) + write_empty_chunks = kwargs.pop('write_empty_chunks', True) init_array(store, **kwargs) return Array(store, read_only=read_only, cache_metadata=cache_metadata, - cache_attrs=cache_attrs) + cache_attrs=cache_attrs, write_empty_chunks=write_empty_chunks) def test_store_has_text_keys(self): # Initialize array @@ -939,7 +940,7 @@ def test_array_0d(self): # setup a = np.zeros(()) - z = self.create_array(shape=(), dtype=a.dtype, fill_value=0) + z = self.create_array(shape=(), dtype=a.dtype, fill_value=0, write_empty_chunks=False) # check properties assert a.ndim == z.ndim 
@@ -973,6 +974,8 @@ def test_array_0d(self): assert 42 == z[()] z[()] = 43 assert 43 == z[()] + z[()] = z.fill_value + assert z.fill_value == z[()] with pytest.raises(IndexError): z[0] = 42 with pytest.raises(IndexError): @@ -984,17 +987,47 @@ def test_array_0d(self): z.store.close() def test_nchunks_initialized(self): + for fill_value in (0, 1.0, np.nan): + if isinstance(fill_value, int): + dtype = 'int' + else: + dtype = 'float' + z = self.create_array(shape=100, + chunks=10, + fill_value=fill_value, + dtype=dtype, + write_empty_chunks=True) + + assert 0 == z.nchunks_initialized + # manually put something into the store to confuse matters + z.store['foo'] = b'bar' + assert 0 == z.nchunks_initialized + z[:] = 42 + assert 10 == z.nchunks_initialized + # manually remove the first chunk from the store + del z.chunk_store[z._chunk_key((0,))] + assert 9 == z.nchunks_initialized - z = self.create_array(shape=100, chunks=10) - assert 0 == z.nchunks_initialized - # manually put something into the store to confuse matters - z.store['foo'] = b'bar' - assert 0 == z.nchunks_initialized - z[:] = 42 - assert 10 == z.nchunks_initialized + if hasattr(z.store, 'close'): + z.store.close() - if hasattr(z.store, 'close'): - z.store.close() + # second round of similar tests with write_empty_chunks set to + # False + z = self.create_array(shape=100, + chunks=10, + fill_value=fill_value, + dtype=dtype, + write_empty_chunks=False) + z[:] = 42 + assert 10 == z.nchunks_initialized + # manually remove a chunk from the store + del z.chunk_store[z._chunk_key((0,))] + assert 9 == z.nchunks_initialized + z[:] = z.fill_value + assert 0 == z.nchunks_initialized + + if hasattr(z.store, 'close'): + z.store.close() def test_array_dtype_shape(self): @@ -1545,9 +1578,11 @@ def create_array(read_only=False, **kwargs): store = dict() cache_metadata = kwargs.pop('cache_metadata', True) cache_attrs = kwargs.pop('cache_attrs', True) + write_empty_chunks = kwargs.pop('write_empty_chunks', True) 
init_array(store, path='foo/bar', **kwargs) return Array(store, path='foo/bar', read_only=read_only, - cache_metadata=cache_metadata, cache_attrs=cache_attrs) + cache_metadata=cache_metadata, cache_attrs=cache_attrs, + write_empty_chunks=write_empty_chunks) def test_hexdigest(self): # Check basic 1-D array @@ -1600,9 +1635,11 @@ def create_array(read_only=False, **kwargs): chunk_store = dict() cache_metadata = kwargs.pop('cache_metadata', True) cache_attrs = kwargs.pop('cache_attrs', True) + write_empty_chunks = kwargs.pop('write_empty_chunks', True) init_array(store, chunk_store=chunk_store, **kwargs) return Array(store, read_only=read_only, chunk_store=chunk_store, - cache_metadata=cache_metadata, cache_attrs=cache_attrs) + cache_metadata=cache_metadata, cache_attrs=cache_attrs, + write_empty_chunks=write_empty_chunks) def test_hexdigest(self): # Check basic 1-D array @@ -1654,10 +1691,11 @@ def create_array(read_only=False, **kwargs): store = DirectoryStore(path) cache_metadata = kwargs.pop('cache_metadata', True) cache_attrs = kwargs.pop('cache_attrs', True) + write_empty_chunks = kwargs.pop('write_empty_chunks', True) kwargs.setdefault('compressor', Zlib(1)) init_array(store, **kwargs) return Array(store, read_only=read_only, cache_metadata=cache_metadata, - cache_attrs=cache_attrs) + cache_attrs=cache_attrs, write_empty_chunks=write_empty_chunks) def test_nbytes_stored(self): @@ -1685,9 +1723,10 @@ def create_array(self, read_only=False, **kwargs): kwargs.setdefault('compressor', Zlib(1)) cache_metadata = kwargs.pop('cache_metadata', True) cache_attrs = kwargs.pop('cache_attrs', True) + write_empty_chunks = kwargs.pop('write_empty_chunks', True) init_array(store, **kwargs) return Array(store, read_only=read_only, cache_metadata=cache_metadata, - cache_attrs=cache_attrs) + cache_attrs=cache_attrs, write_empty_chunks=write_empty_chunks) @pytest.mark.xfail def test_nbytes_stored(self): @@ -1708,10 +1747,11 @@ def create_array(read_only=False, **kwargs): store = 
NestedDirectoryStore(path) cache_metadata = kwargs.pop('cache_metadata', True) cache_attrs = kwargs.pop('cache_attrs', True) + write_empty_chunks = kwargs.pop('write_empty_chunks', True) kwargs.setdefault('compressor', Zlib(1)) init_array(store, **kwargs) return Array(store, read_only=read_only, cache_metadata=cache_metadata, - cache_attrs=cache_attrs) + cache_attrs=cache_attrs, write_empty_chunks=write_empty_chunks) def expected(self): return [ @@ -1732,10 +1772,11 @@ def create_array(read_only=False, **kwargs): store = N5Store(path) cache_metadata = kwargs.pop('cache_metadata', True) cache_attrs = kwargs.pop('cache_attrs', True) + write_empty_chunks = kwargs.pop('write_empty_chunks', True) kwargs.setdefault('compressor', Zlib(1)) init_array(store, **kwargs) return Array(store, read_only=read_only, cache_metadata=cache_metadata, - cache_attrs=cache_attrs) + cache_attrs=cache_attrs, write_empty_chunks=write_empty_chunks) def test_array_0d(self): # test behaviour for array with 0 dimensions @@ -1802,6 +1843,40 @@ def test_array_1d_fill_value(self): z = self.create_array(shape=(nvalues,), chunks=100, dtype=dtype, fill_value=1) + def test_nchunks_initialized(self): + fill_value = 0 + dtype = 'int' + z = self.create_array(shape=100, + chunks=10, + fill_value=fill_value, + dtype=dtype, + write_empty_chunks=True) + + assert 0 == z.nchunks_initialized + # manually put something into the store to confuse matters + z.store['foo'] = b'bar' + assert 0 == z.nchunks_initialized + z[:] = 42 + assert 10 == z.nchunks_initialized + # manually remove a chunk from the store + del z.chunk_store[z._chunk_key((0,))] + assert 9 == z.nchunks_initialized + + # second round of similar tests with write_empty_chunks set to + # False + z = self.create_array(shape=100, + chunks=10, + fill_value=fill_value, + dtype=dtype, + write_empty_chunks=False) + z[:] = 42 + assert 10 == z.nchunks_initialized + # manually remove a chunk from the store + del z.chunk_store[z._chunk_key((0,))] + assert 9 == 
z.nchunks_initialized + z[:] = z.fill_value + assert 0 == z.nchunks_initialized + def test_array_order(self): # N5 only supports 'C' at the moment @@ -2029,9 +2104,10 @@ def create_array(read_only=False, **kwargs): cache_metadata = kwargs.pop('cache_metadata', True) cache_attrs = kwargs.pop('cache_attrs', True) kwargs.setdefault('compressor', Zlib(1)) + write_empty_chunks = kwargs.pop('write_empty_chunks', True) init_array(store, **kwargs) return Array(store, read_only=read_only, cache_metadata=cache_metadata, - cache_attrs=cache_attrs) + cache_attrs=cache_attrs, write_empty_chunks=write_empty_chunks) class TestArrayWithDBMStore(TestArray): @@ -2043,10 +2119,11 @@ def create_array(read_only=False, **kwargs): store = DBMStore(path, flag='n') cache_metadata = kwargs.pop('cache_metadata', True) cache_attrs = kwargs.pop('cache_attrs', True) + write_empty_chunks = kwargs.pop('write_empty_chunks', True) kwargs.setdefault('compressor', Zlib(1)) init_array(store, **kwargs) return Array(store, read_only=read_only, cache_attrs=cache_attrs, - cache_metadata=cache_metadata) + cache_metadata=cache_metadata, write_empty_chunks=write_empty_chunks) def test_nbytes_stored(self): pass # not implemented @@ -2062,10 +2139,11 @@ def create_array(read_only=False, **kwargs): store = DBMStore(path, flag='n', open=bsddb3.btopen) cache_metadata = kwargs.pop('cache_metadata', True) cache_attrs = kwargs.pop('cache_attrs', True) + write_empty_chunks = kwargs.pop('write_empty_chunks', True) kwargs.setdefault('compressor', Zlib(1)) init_array(store, **kwargs) return Array(store, read_only=read_only, cache_metadata=cache_metadata, - cache_attrs=cache_attrs) + cache_attrs=cache_attrs, write_empty_chunks=write_empty_chunks) def test_nbytes_stored(self): pass # not implemented @@ -2081,10 +2159,11 @@ def create_array(read_only=False, **kwargs): store = LMDBStore(path, buffers=True) cache_metadata = kwargs.pop('cache_metadata', True) cache_attrs = kwargs.pop('cache_attrs', True) + write_empty_chunks 
= kwargs.pop('write_empty_chunks', True) kwargs.setdefault('compressor', Zlib(1)) init_array(store, **kwargs) return Array(store, read_only=read_only, cache_metadata=cache_metadata, - cache_attrs=cache_attrs) + cache_attrs=cache_attrs, write_empty_chunks=write_empty_chunks) def test_store_has_bytes_values(self): pass # returns values as memoryviews/buffers instead of bytes @@ -2103,10 +2182,11 @@ def create_array(read_only=False, **kwargs): store = LMDBStore(path, buffers=False) cache_metadata = kwargs.pop('cache_metadata', True) cache_attrs = kwargs.pop('cache_attrs', True) + write_empty_chunks = kwargs.pop('write_empty_chunks', True) kwargs.setdefault('compressor', Zlib(1)) init_array(store, **kwargs) return Array(store, read_only=read_only, cache_metadata=cache_metadata, - cache_attrs=cache_attrs) + cache_attrs=cache_attrs, write_empty_chunks=write_empty_chunks) def test_nbytes_stored(self): pass # not implemented @@ -2122,10 +2202,11 @@ def create_array(read_only=False, **kwargs): store = SQLiteStore(path) cache_metadata = kwargs.pop('cache_metadata', True) cache_attrs = kwargs.pop('cache_attrs', True) + write_empty_chunks = kwargs.pop('write_empty_chunks', True) kwargs.setdefault('compressor', Zlib(1)) init_array(store, **kwargs) return Array(store, read_only=read_only, cache_metadata=cache_metadata, - cache_attrs=cache_attrs) + cache_attrs=cache_attrs, write_empty_chunks=write_empty_chunks) def test_nbytes_stored(self): pass # not implemented @@ -2138,9 +2219,10 @@ def create_array(self, read_only=False, **kwargs): kwargs.setdefault('compressor', None) cache_metadata = kwargs.pop('cache_metadata', True) cache_attrs = kwargs.pop('cache_attrs', True) + write_empty_chunks = kwargs.pop('write_empty_chunks', True) init_array(store, **kwargs) return Array(store, read_only=read_only, cache_metadata=cache_metadata, - cache_attrs=cache_attrs) + cache_attrs=cache_attrs, write_empty_chunks=write_empty_chunks) def test_hexdigest(self): # Check basic 1-D array @@ -2174,9 
+2256,10 @@ def create_array(self, read_only=False, **kwargs): kwargs.setdefault('compressor', compressor) cache_metadata = kwargs.pop('cache_metadata', True) cache_attrs = kwargs.pop('cache_attrs', True) + write_empty_chunks = kwargs.pop('write_empty_chunks', True) init_array(store, **kwargs) return Array(store, read_only=read_only, cache_metadata=cache_metadata, - cache_attrs=cache_attrs) + cache_attrs=cache_attrs, write_empty_chunks=write_empty_chunks) def test_hexdigest(self): # Check basic 1-D array @@ -2210,9 +2293,10 @@ def create_array(self, read_only=False, **kwargs): kwargs.setdefault('compressor', compressor) cache_metadata = kwargs.pop('cache_metadata', True) cache_attrs = kwargs.pop('cache_attrs', True) + write_empty_chunks = kwargs.pop('write_empty_chunks', True) init_array(store, **kwargs) return Array(store, read_only=read_only, cache_metadata=cache_metadata, - cache_attrs=cache_attrs) + cache_attrs=cache_attrs, write_empty_chunks=write_empty_chunks) def test_hexdigest(self): # Check basic 1-D array @@ -2253,9 +2337,10 @@ def create_array(self, read_only=False, **kwargs): kwargs.setdefault('compressor', compressor) cache_metadata = kwargs.pop('cache_metadata', True) cache_attrs = kwargs.pop('cache_attrs', True) + write_empty_chunks = kwargs.pop('write_empty_chunks', True) init_array(store, **kwargs) return Array(store, read_only=read_only, cache_metadata=cache_metadata, - cache_attrs=cache_attrs) + cache_attrs=cache_attrs, write_empty_chunks=write_empty_chunks) def test_hexdigest(self): # Check basic 1-D array @@ -2296,9 +2381,10 @@ def create_array(read_only=False, **kwargs): kwargs.setdefault('compressor', compressor) cache_metadata = kwargs.pop('cache_metadata', True) cache_attrs = kwargs.pop('cache_attrs', True) + write_empty_chunks = kwargs.pop('write_empty_chunks', True) init_array(store, **kwargs) return Array(store, read_only=read_only, cache_attrs=cache_attrs, - cache_metadata=cache_metadata) + cache_metadata=cache_metadata, 
write_empty_chunks=write_empty_chunks) def test_hexdigest(self): # Check basic 1-D array @@ -2441,9 +2527,10 @@ def create_array(read_only=False, **kwargs): kwargs.setdefault('compressor', Zlib(1)) cache_metadata = kwargs.pop('cache_metadata', True) cache_attrs = kwargs.pop('cache_attrs', True) + write_empty_chunks = kwargs.pop('write_empty_chunks', True) init_array(store, **kwargs) return Array(store, read_only=read_only, cache_metadata=cache_metadata, - cache_attrs=cache_attrs) + cache_attrs=cache_attrs, write_empty_chunks=write_empty_chunks) def test_nbytes_stored(self): z = self.create_array(shape=1000, chunks=100) @@ -2460,9 +2547,10 @@ def create_array(read_only=False, **kwargs): kwargs.setdefault('compressor', Zlib(level=1)) cache_metadata = kwargs.pop('cache_metadata', True) cache_attrs = kwargs.pop('cache_attrs', True) + write_empty_chunks = kwargs.pop('write_empty_chunks', True) init_array(store, **kwargs) return Array(store, read_only=read_only, cache_metadata=cache_metadata, - cache_attrs=cache_attrs) + cache_attrs=cache_attrs, write_empty_chunks=write_empty_chunks) def test_cache_metadata(self): a1 = self.create_array(shape=100, chunks=10, dtype='i1', cache_metadata=False) @@ -2532,9 +2620,10 @@ def create_array(read_only=False, **kwargs): kwargs.setdefault('compressor', Zlib(level=1)) cache_metadata = kwargs.pop('cache_metadata', True) cache_attrs = kwargs.pop('cache_attrs', True) + write_empty_chunks = kwargs.pop('write_empty_chunks', True) init_array(store, **kwargs) return Array(store, read_only=read_only, cache_metadata=cache_metadata, - cache_attrs=cache_attrs) + cache_attrs=cache_attrs, write_empty_chunks=write_empty_chunks) def test_store_has_bytes_values(self): # skip as the cache has no control over how the store provides values @@ -2551,10 +2640,11 @@ def create_array(read_only=False, **kwargs): store = FSStore(path, key_separator=key_separator, auto_mkdir=True) cache_metadata = kwargs.pop('cache_metadata', True) cache_attrs = 
kwargs.pop('cache_attrs', True) + write_empty_chunks = kwargs.pop('write_empty_chunks', True) kwargs.setdefault('compressor', Blosc()) init_array(store, **kwargs) return Array(store, read_only=read_only, cache_metadata=cache_metadata, - cache_attrs=cache_attrs) + cache_attrs=cache_attrs, write_empty_chunks=write_empty_chunks) def expected(self): return [ @@ -2602,6 +2692,7 @@ def create_array(read_only=False, **kwargs): store = FSStore(path) cache_metadata = kwargs.pop("cache_metadata", True) cache_attrs = kwargs.pop("cache_attrs", True) + write_empty_chunks = kwargs.pop('write_empty_chunks', True) kwargs.setdefault("compressor", Blosc()) init_array(store, **kwargs) return Array( @@ -2610,6 +2701,7 @@ def create_array(read_only=False, **kwargs): cache_metadata=cache_metadata, cache_attrs=cache_attrs, partial_decompress=True, + write_empty_chunks=write_empty_chunks ) def test_hexdigest(self): @@ -2678,10 +2770,11 @@ def create_array(read_only=False, **kwargs): store = FSStore(path, key_separator=key_separator, auto_mkdir=True) cache_metadata = kwargs.pop('cache_metadata', True) cache_attrs = kwargs.pop('cache_attrs', True) + write_empty_chunks = kwargs.pop('write_empty_chunks', True) kwargs.setdefault('compressor', Blosc()) init_array(store, **kwargs) return Array(store, read_only=read_only, cache_metadata=cache_metadata, - cache_attrs=cache_attrs) + cache_attrs=cache_attrs, write_empty_chunks=write_empty_chunks) def expected(self): return [ @@ -2730,6 +2823,7 @@ def create_array(read_only=False, **kwargs): store = FSStore(path, key_separator=key_separator, auto_mkdir=True) cache_metadata = kwargs.pop("cache_metadata", True) cache_attrs = kwargs.pop("cache_attrs", True) + write_empty_chunks = kwargs.pop('write_empty_chunks', True) kwargs.setdefault("compressor", Blosc()) init_array(store, **kwargs) return Array( @@ -2738,6 +2832,7 @@ def create_array(read_only=False, **kwargs): cache_metadata=cache_metadata, cache_attrs=cache_attrs, partial_decompress=True, + 
write_empty_chunks=write_empty_chunks ) def expected(self): diff --git a/zarr/tests/test_storage.py b/zarr/tests/test_storage.py index 1412ec2099..51bc9bf782 100644 --- a/zarr/tests/test_storage.py +++ b/zarr/tests/test_storage.py @@ -1012,6 +1012,12 @@ def test_read_only(self): with pytest.raises(PermissionError): del store['foo'] + with pytest.raises(PermissionError): + store.delitems(['foo']) + + with pytest.raises(PermissionError): + store.setitems({'foo': b'baz'}) + with pytest.raises(PermissionError): store.clear() diff --git a/zarr/tests/test_sync.py b/zarr/tests/test_sync.py index 51b7fe0e10..274ce166be 100644 --- a/zarr/tests/test_sync.py +++ b/zarr/tests/test_sync.py @@ -99,10 +99,11 @@ def create_array(self, read_only=False, **kwargs): store = dict() cache_metadata = kwargs.pop('cache_metadata', True) cache_attrs = kwargs.pop('cache_attrs', True) + write_empty_chunks = kwargs.pop('write_empty_chunks', True) init_array(store, **kwargs) return Array(store, synchronizer=ThreadSynchronizer(), read_only=read_only, cache_metadata=cache_metadata, - cache_attrs=cache_attrs) + cache_attrs=cache_attrs, write_empty_chunks=write_empty_chunks) # noinspection PyMethodMayBeStatic def create_pool(self): @@ -141,12 +142,14 @@ def create_array(self, read_only=False, **kwargs): store = DirectoryStore(path) cache_metadata = kwargs.pop('cache_metadata', False) cache_attrs = kwargs.pop('cache_attrs', False) + write_empty_chunks = kwargs.pop('write_empty_chunks', True) init_array(store, **kwargs) sync_path = tempfile.mkdtemp() atexit.register(atexit_rmtree, sync_path) synchronizer = ProcessSynchronizer(sync_path) return Array(store, synchronizer=synchronizer, read_only=read_only, - cache_metadata=cache_metadata, cache_attrs=cache_attrs) + cache_metadata=cache_metadata, cache_attrs=cache_attrs, + write_empty_chunks=write_empty_chunks) # noinspection PyMethodMayBeStatic def create_pool(self): diff --git a/zarr/tests/test_util.py b/zarr/tests/test_util.py index 
fa1f18fa63..a65b26bae8 100644 --- a/zarr/tests/test_util.py +++ b/zarr/tests/test_util.py @@ -4,7 +4,7 @@ import numpy as np import pytest -from zarr.util import (guess_chunks, human_readable_size, info_html_report, +from zarr.util import (all_equal, flatten, guess_chunks, human_readable_size, info_html_report, info_text_report, is_total_slice, normalize_chunks, normalize_dimension_separator, normalize_fill_value, normalize_order, @@ -211,3 +211,30 @@ def fail(x): for x in range(11, 15): pytest.raises(PermissionError, fail, x) + + +def test_flatten(): + assert list(flatten(['0', ['1', ['2', ['3', [4, ]]]]])) == ['0', '1', '2', '3', 4] + assert list(flatten('foo')) == ['f', 'o', 'o'] + assert list(flatten(['foo'])) == ['foo'] + + +def test_all_equal(): + assert all_equal(0, np.zeros((10, 10, 10))) + assert not all_equal(1, np.zeros((10, 10, 10))) + + assert all_equal(1, np.ones((10, 10, 10))) + assert not all_equal(1, 1 + np.ones((10, 10, 10))) + + assert all_equal(np.nan, np.array([np.nan, np.nan])) + assert not all_equal(np.nan, np.array([np.nan, 1.0])) + + assert all_equal({'a': -1}, np.array([{'a': -1}, {'a': -1}], dtype='object')) + assert not all_equal({'a': -1}, np.array([{'a': -1}, {'a': 2}], dtype='object')) + + assert all_equal(np.timedelta64(999, 'D'), np.array([999, 999], dtype='timedelta64[D]')) + assert not all_equal(np.timedelta64(999, 'D'), np.array([999, 998], dtype='timedelta64[D]')) + + # all_equal(None, *) always returns False + assert not all_equal(None, np.array([None, None])) + assert not all_equal(None, np.array([None, 10])) diff --git a/zarr/util.py b/zarr/util.py index 2a2250433c..d092ffe0de 100644 --- a/zarr/util.py +++ b/zarr/util.py @@ -9,6 +9,7 @@ import numpy as np from asciitree import BoxStyle, LeftAligned from asciitree.traversal import Traversal +from collections.abc import Iterable from numcodecs.compat import ensure_ndarray, ensure_text from numcodecs.registry import codec_registry from numcodecs.blosc import cbuffer_sizes, 
cbuffer_metainfo @@ -16,6 +17,14 @@ from typing import Any, Callable, Dict, Optional, Tuple, Union +def flatten(arg: Iterable) -> Iterable: + for element in arg: + if isinstance(element, Iterable) and not isinstance(element, (str, bytes)): + yield from flatten(element) + else: + yield element + + # codecs to use for object dtype convenience API object_codecs = { str.__name__: 'vlen-utf8', @@ -650,3 +659,35 @@ def retry_call(callabl: Callable, time.sleep(wait) else: raise + + +def all_equal(value: Any, array: Any): + """ + Test if all the elements of an array are equivalent to a value. + If `value` is None, then this function does not do any comparison and + returns False. + """ + + if value is None: + return False + if not value: + # if `value` is falsey, then just 1 truthy value in `array` + # is sufficient to return False. We assume here that np.any is + # optimized to return on the first truthy value in `array`. + try: + return not np.any(array) + except TypeError: # pragma: no cover + pass + if np.issubdtype(array.dtype, np.object_): + # we have to flatten the result of np.equal to handle outputs like + # [np.array([True,True]), True, True] + return all(flatten(np.equal(value, array, dtype=array.dtype))) + else: + # Numpy errors if you call np.isnan on custom dtypes, so ensure + # we are working with floats before calling isnan + if np.issubdtype(array.dtype, np.floating) and np.isnan(value): + return np.all(np.isnan(array)) + else: + # using == raises warnings from numpy deprecated pattern, but + # using np.equal() raises type errors for structured dtypes... + return np.all(value == array)