diff --git a/.github/workflows/minimal.yml b/.github/workflows/minimal.yml
index 90c9a59585..ff68783d33 100644
--- a/.github/workflows/minimal.yml
+++ b/.github/workflows/minimal.yml
@@ -25,11 +25,11 @@ jobs:
       run: |
         conda activate minimal
         python -m pip install .
-        pytest -svx
+        pytest -svx --timeout=300
     - name: Fixture generation
       shell: "bash -l {0}"
       run: |
         conda activate minimal
         rm -rf fixture/
-        pytest -svx zarr/tests/test_dim_separator.py zarr/tests/test_storage.py
+        pytest -svx --timeout=300 zarr/tests/test_dim_separator.py zarr/tests/test_storage.py
         # This simulates fixture-less tests in conda and debian packaging
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 0c44de8b1f..e0d404b1a0 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -73,7 +73,7 @@ jobs:
         conda activate zarr-env
         mkdir ~/blob_emulator
         azurite -l ~/blob_emulator --debug debug.log 2>&1 > stdouterr.log &
-        pytest --cov=zarr --cov-config=.coveragerc --doctest-plus --cov-report xml --cov=./
+        pytest --cov=zarr --cov-config=.coveragerc --doctest-plus --cov-report xml --cov=./ --timeout=300
     - uses: codecov/codecov-action@v1
       with:
         #token: ${{ secrets.CODECOV_TOKEN }} # not required for public repos
diff --git a/docs/release.rst b/docs/release.rst
index 630f9de833..788aa309be 100644
--- a/docs/release.rst
+++ b/docs/release.rst
@@ -9,6 +9,9 @@ Unreleased
 Enhancements
 ~~~~~~~~~~~~
 
+* ``write_empty_chunks`` now defaults to ``False``.
+  By :user:`Juan Nunez-Iglesias <jni>`; :issue:`853`.
+
 * Allow to assign array ``fill_values`` and update metadata accordingly.
   :issue:`662`
 
 * array indexing with [] (getitem and setitem) now supports fancy indexing.
diff --git a/docs/tutorial.rst b/docs/tutorial.rst
index a292892ebd..906d5d9f08 100644
--- a/docs/tutorial.rst
+++ b/docs/tutorial.rst
@@ -1302,6 +1302,75 @@ bytes within chunks of an array may improve the compression ratio, depending
 on the structure of the data, the compression algorithm used, and which
 compression filters (e.g., byte-shuffle) have been applied.
 
+.. _tutorial_chunks_empty_chunks:
+
+Empty chunks
+~~~~~~~~~~~~
+
+As of version 2.11, it is possible to configure how Zarr handles the storage of
+chunks that are "empty" (i.e., every element in the chunk is equal to the array's
+fill value). When creating an array with ``write_empty_chunks=False`` (the default),
+Zarr will check whether a chunk is empty before compression and storage. If a chunk
+is empty, then Zarr does not store it, and instead deletes the chunk from storage
+if the chunk had been previously stored.
+
+This optimization prevents storing redundant objects and can speed up reads, but the
+cost is added computation during array writes, since the contents of each chunk must
+be compared to the fill value; whether these advantages materialize depends on the
+content of the array. If you know that your data will form chunks that are almost
+always non-empty, then there is no advantage to the optimization described above.
+In this case, creating an array with ``write_empty_chunks=True`` will instruct Zarr
+to write every chunk without checking for emptiness.
+
+The following example illustrates the effect of the ``write_empty_chunks`` flag on
+the time required to write an array with different values::
+
+    >>> import zarr
+    >>> import numpy as np
+    >>> import time
+    >>> from tempfile import TemporaryDirectory
+    >>> def timed_write(write_empty_chunks):
+    ...     """
+    ...     Measure the time required and number of objects created when writing
+    ...     to a Zarr array with random ints or fill value.
+    ...     """
+    ...     chunks = (8192,)
+    ...     shape = (chunks[0] * 1024,)
+    ...     data = np.random.randint(0, 255, shape)
+    ...     dtype = 'uint8'
+    ...
+    ...     with TemporaryDirectory() as store:
+    ...         arr = zarr.open(store,
+    ...                         shape=shape,
+    ...                         chunks=chunks,
+    ...                         dtype=dtype,
+    ...                         write_empty_chunks=write_empty_chunks,
+    ...                         fill_value=0,
+    ...                         mode='w')
+    ...         # initialize all chunks
+    ...         arr[:] = 100
+    ...         result = []
+    ...         for value in (data, arr.fill_value):
+    ...             start = time.time()
+    ...             arr[:] = value
+    ...             elapsed = time.time() - start
+    ...             result.append((elapsed, arr.nchunks_initialized))
+    ...
+    ...         return result
+    >>> for write_empty_chunks in (True, False):
+    ...     full, empty = timed_write(write_empty_chunks)
+    ...     print(f'\nwrite_empty_chunks={write_empty_chunks}:\n\tRandom Data: {full[0]:.4f}s, {full[1]} objects stored\n\t Empty Data: {empty[0]:.4f}s, {empty[1]} objects stored\n')
+
+    write_empty_chunks=True:
+        Random Data: 0.1252s, 1024 objects stored
+         Empty Data: 0.1060s, 1024 objects stored
+
+
+    write_empty_chunks=False:
+        Random Data: 0.1359s, 1024 objects stored
+         Empty Data: 0.0301s, 0 objects stored
+
+In this example, writing random data is slightly slower with ``write_empty_chunks=False``
+(because of the emptiness check), but writing empty data is substantially faster and
+generates far fewer objects in storage.
+
 .. _tutorial_rechunking:
 
 Changing chunk shapes (rechunking)
diff --git a/environment.yml b/environment.yml
index b47dd9238b..066319d750 100644
--- a/environment.yml
+++ b/environment.yml
@@ -8,6 +8,7 @@ dependencies:
   - pip
   - pip:
     - asciitree
-    - fasteners
+    - fasteners == 0.16.3
     - pytest
+    - pytest-timeout
     - setuptools_scm
diff --git a/requirements_dev_minimal.txt b/requirements_dev_minimal.txt
index 481e51f8fb..0395c8dd79 100644
--- a/requirements_dev_minimal.txt
+++ b/requirements_dev_minimal.txt
@@ -1,6 +1,6 @@
 # library requirements
 asciitree==0.3.3
-fasteners==0.17.3
+fasteners==0.16.3
 numcodecs==0.9.1
 msgpack-python==0.5.6
 setuptools-scm==6.4.2
diff --git a/setup.py b/setup.py
index a68c77a63f..4bc6943c1d 100644
--- a/setup.py
+++ b/setup.py
@@ -9,7 +9,7 @@
 dependencies = [
     'asciitree',
     'numpy>=1.7',
-    'fasteners',
+    'fasteners==0.16.3',
     'numcodecs>=0.6.4',
 ]
diff --git a/tox.ini b/tox.ini
index 9e0212cc5e..3adc147dac 100644
--- a/tox.ini
+++ b/tox.ini
@@ -10,6 +10,7 @@ envlist = py37-npy{117,latest}, py38, py39, docs
 install_command = pip install --no-binary=numcodecs {opts} {packages}
 setenv =
     PYTHONHASHSEED = 42
+    PYTEST_TIMEOUT = {env:PYTEST_TIMEOUT:300}
 passenv =
     ZARR_TEST_ABS
     ZARR_TEST_MONGO
diff --git a/windows_conda_dev.txt b/windows_conda_dev.txt
index 576674827d..8bdf5fb3da 100644
--- a/windows_conda_dev.txt
+++ b/windows_conda_dev.txt
@@ -1,5 +1,5 @@
 coverage
-fasteners
+fasteners==0.16.3
 flake8
 monotonic
 msgpack-python
diff --git a/zarr/_storage/absstore.py b/zarr/_storage/absstore.py
index f23b406e0b..98ac6328b1 100644
--- a/zarr/_storage/absstore.py
+++ b/zarr/_storage/absstore.py
@@ -17,26 +17,36 @@ class ABSStore(Store):
     ----------
     container : string
         The name of the ABS container to use.
+
+        .. deprecated::
+           Use ``client`` instead.
+
     prefix : string
         Location of the "directory" to use as the root of the storage hierarchy
         within the container.
+
     account_name : string
         The Azure blob storage account name.
+
+        .. deprecated:: 2.8.3
+           Use ``client`` instead.
+
     account_key : string
         The Azure blob storage account access key.
+
+        .. deprecated:: 2.8.3
+           Use ``client`` instead.
+
     blob_service_kwargs : dictionary
         Extra arguments to be passed into the azure blob client, for e.g. when
         using the emulator, pass in blob_service_kwargs={'is_emulated': True}.
+
+        .. deprecated:: 2.8.3
+           Use ``client`` instead.
+
     dimension_separator : {'.', '/'}, optional
         Separator placed between the dimensions of a chunk.
+
     client : azure.storage.blob.ContainerClient, optional
         And ``azure.storage.blob.ContainerClient`` to connect with. See
         `here `_  # noqa
diff --git a/zarr/core.py b/zarr/core.py
index 6f6b468e3b..e0fe4eb0e9 100644
--- a/zarr/core.py
+++ b/zarr/core.py
@@ -81,13 +81,13 @@ class Array:
         .. versionadded:: 2.7
 
     write_empty_chunks : bool, optional
-        If True (default), all chunks will be stored regardless of their
-        contents. If False, each chunk is compared to the array's fill
-        value prior to storing. If a chunk is uniformly equal to the fill
-        value, then that chunk is not be stored, and the store entry for
-        that chunk's key is deleted. This setting enables sparser storage,
-        as only chunks with non-fill-value data are stored, at the expense
-        of overhead associated with checking the data of each chunk.
+        If True, all chunks will be stored regardless of their contents. If
+        False (default), each chunk is compared to the array's fill value prior
+        to storing. If a chunk is uniformly equal to the fill value, then that
+        chunk is not stored, and the store entry for that chunk's key is
+        deleted. This setting enables sparser storage, as only chunks with
+        non-fill-value data are stored, at the expense of overhead associated
+        with checking the data of each chunk.
 
         .. versionadded:: 2.11
 
@@ -154,7 +154,7 @@ def __init__(
         cache_metadata=True,
         cache_attrs=True,
         partial_decompress=False,
-        write_empty_chunks=True,
+        write_empty_chunks=False,
     ):
         # N.B., expect at this point store is fully initialized with all
         # configuration metadata fully specified and normalized
diff --git a/zarr/creation.py b/zarr/creation.py
index d0dad231c4..7e7adcb157 100644
--- a/zarr/creation.py
+++ b/zarr/creation.py
@@ -19,7 +19,8 @@ def create(shape, chunks=True, dtype=None, compressor='default',
            fill_value=0, order='C', store=None, synchronizer=None,
            overwrite=False, path=None, chunk_store=None, filters=None,
            cache_metadata=True, cache_attrs=True, read_only=False,
-           object_codec=None, dimension_separator=None, write_empty_chunks=True, **kwargs):
+           object_codec=None, dimension_separator=None,
+           write_empty_chunks=False, **kwargs):
     """Create an array.
 
     Parameters
@@ -72,13 +73,14 @@ def create(shape, chunks=True, dtype=None, compressor='default',
         .. versionadded:: 2.8
 
     write_empty_chunks : bool, optional
-        If True (default), all chunks will be stored regardless of their
-        contents. If False, each chunk is compared to the array's fill
-        value prior to storing. If a chunk is uniformly equal to the fill
-        value, then that chunk is not be stored, and the store entry for
-        that chunk's key is deleted. This setting enables sparser storage,
-        as only chunks with non-fill-value data are stored, at the expense
-        of overhead associated with checking the data of each chunk.
+        If True, all chunks will be stored regardless of their contents. If
+        False (default), each chunk is compared to the array's fill value prior
+        to storing. If a chunk is uniformly equal to the fill value, then that
+        chunk is not stored, and the store entry for that chunk's key is
+        deleted. This setting enables sparser storage, as only chunks with
+        non-fill-value data are stored, at the expense of overhead associated
+        with checking the data of each chunk.
+
         .. versionadded:: 2.11
 
     Returns
@@ -389,7 +391,7 @@ def open_array(
     chunk_store=None,
     storage_options=None,
     partial_decompress=False,
-    write_empty_chunks=True,
+    write_empty_chunks=False,
     **kwargs
 ):
     """Open an array using file-mode-like semantics.
@@ -445,13 +447,14 @@ def open_array(
         is Blosc, when getting data from the array chunks will be partially
         read and decompressed when possible.
     write_empty_chunks : bool, optional
-        If True (default), all chunks will be stored regardless of their
-        contents. If False, each chunk is compared to the array's fill
-        value prior to storing. If a chunk is uniformly equal to the fill
-        value, then that chunk is not be stored, and the store entry for
-        that chunk's key is deleted. This setting enables sparser storage,
-        as only chunks with non-fill-value data are stored, at the expense
-        of overhead associated with checking the data of each chunk.
+        If True, all chunks will be stored regardless of their contents. If
+        False (default), each chunk is compared to the array's fill value prior
+        to storing. If a chunk is uniformly equal to the fill value, then that
+        chunk is not stored, and the store entry for that chunk's key is
+        deleted. This setting enables sparser storage, as only chunks with
+        non-fill-value data are stored, at the expense of overhead associated
+        with checking the data of each chunk.
+
         .. versionadded:: 2.11
 
     Returns
     -------
diff --git a/zarr/util.py b/zarr/util.py
index 04d350a68d..9f5f04f525 100644
--- a/zarr/util.py
+++ b/zarr/util.py
@@ -670,7 +670,7 @@ def all_equal(value: Any, array: Any):
         # optimized to return on the first truthy value in `array`.
         try:
             return not np.any(array)
-        except TypeError:  # pragma: no cover
+        except (TypeError, ValueError):  # pragma: no cover
            pass
     if np.issubdtype(array.dtype, np.object_):
         # we have to flatten the result of np.equal to handle outputs like
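
Reviewer note (not part of the patch): the behavioural change above can be exercised locally with a short, self-contained sketch. It assumes a zarr build that already includes this change, and it only uses API touched or exercised by the diff (``zarr.open`` with ``write_empty_chunks``, ``fill_value``, and ``Array.nchunks_initialized``); the shape, chunk size, and temporary-directory store are arbitrary choices for illustration::

    import zarr
    from tempfile import TemporaryDirectory

    # Write data that is uniformly equal to the fill value under both settings
    # and count how many chunk objects end up in the store.
    for write_empty_chunks in (False, True):
        with TemporaryDirectory() as path:
            arr = zarr.open(
                path,
                mode='w',
                shape=(1024,),
                chunks=(128,),
                dtype='uint8',
                fill_value=0,
                write_empty_chunks=write_empty_chunks,
            )
            arr[:] = arr.fill_value  # every chunk is "empty"
            print(write_empty_chunks, arr.nchunks_initialized)
            # Expected with this patch: 0 chunks stored under the new default
            # (False), 8 chunks (1024 / 128) stored when write_empty_chunks=True.

Passing ``write_empty_chunks=True`` explicitly restores the previous unconditional-write behaviour, which remains the better choice for workloads whose chunks are rarely empty, since the emptiness check adds per-chunk overhead as the docstrings above note.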