diff --git a/.gitignore b/.gitignore index 66b34ce262..46ec34fe9d 100644 --- a/.gitignore +++ b/.gitignore @@ -65,10 +65,13 @@ target/ # setuptools-scm zarr/version.py -# test data -*.zarr +# emacs *~ -*.zip -example* -doesnotexist -test_sync* + +# test data +#*.zarr +#*.zip +#example* +#doesnotexist +#test_sync* +data/* diff --git a/data/donotdelete b/data/donotdelete new file mode 100644 index 0000000000..1e9ef93e26 --- /dev/null +++ b/data/donotdelete @@ -0,0 +1 @@ +This directory is used for data files created during testing. \ No newline at end of file diff --git a/docs/api/convenience.rst b/docs/api/convenience.rst index 34286f9d1a..099c5f7f88 100644 --- a/docs/api/convenience.rst +++ b/docs/api/convenience.rst @@ -1,5 +1,5 @@ -Array creation (``zarr.convenience``) -===================================== +Convenience functions (``zarr.convenience``) +============================================ .. automodule:: zarr.convenience .. autofunction:: open .. autofunction:: save diff --git a/docs/api/storage.rst b/docs/api/storage.rst index 48a6b68e58..69845dbc39 100644 --- a/docs/api/storage.rst +++ b/docs/api/storage.rst @@ -2,17 +2,20 @@ Storage (``zarr.storage``) ========================== .. automodule:: zarr.storage -.. autofunction:: init_array -.. autofunction:: init_group - .. autoclass:: DictStore .. autoclass:: DirectoryStore .. autoclass:: TempStore .. autoclass:: NestedDirectoryStore .. autoclass:: ZipStore -.. autoclass:: DBMStore .. automethod:: close .. automethod:: flush +.. autoclass:: DBMStore + + .. automethod:: close + .. automethod:: sync + +.. autofunction:: init_array +.. autofunction:: init_group .. autofunction:: migrate_1to2 diff --git a/docs/conf.py b/docs/conf.py index 1f3954a8aa..58fe548cd2 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -34,7 +34,7 @@ def __getattr__(cls, name): sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES) - + # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the @@ -55,10 +55,12 @@ def __getattr__(cls, name): 'sphinx.ext.autosummary', 'sphinx.ext.viewcode', 'numpydoc', + 'sphinx_issues', ] numpydoc_show_class_members = False numpydoc_class_members_toctree = False +issues_github_path = 'alimanfoo/zarr' # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] diff --git a/docs/index.rst b/docs/index.rst index 80c7de664d..585dd7111c 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -12,7 +12,7 @@ Highlights * Create N-dimensional arrays with any NumPy dtype. * Chunk arrays along any dimension. -* Compress and/or filter chunks using any numcodecs_ codec. +* Compress and/or filter chunks using any NumCodecs_ codec. * Store arrays in memory, on disk, inside a Zip file, on S3, ... * Read an array concurrently from multiple threads or processes. * Write to an array concurrently from multiple threads or processes. 
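To make the highlights listed above concrete, here is a minimal sketch of creating a chunked, compressed array (an illustrative editorial example, not part of the patch; it mirrors calls shown later in the tutorial and assumes Zarr and NumCodecs are installed)::

    >>> import zarr
    >>> from numcodecs import Blosc  # any NumCodecs codec can be used as the compressor
    >>> z = zarr.zeros((10000, 10000), chunks=(1000, 1000), dtype='i4',
    ...                compressor=Blosc(cname='zstd', clevel=3))
    >>> z[:] = 42          # chunks are compressed transparently on write
    >>> z.chunks
    (1000, 1000)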
@@ -65,15 +65,19 @@ Contents Acknowledgments --------------- -The following people have contributed to the development of Zarr, by contributing code and/or -providing ideas, feedback and advice: +The following people have contributed to the development of Zarr by contributing code, +documentation, code reviews, comments and/or ideas: -* `Francesc Alted `_ -* `Stephan Hoyer `_ -* `John Kirkham `_ -* `Alistair Miles `_ -* `Matthew Rocklin `_ -* `Vincent Schut `_ +* :user:`Francesc Alted ` +* :user:`Martin Durant ` +* :user:`Stephan Hoyer ` +* :user:`John Kirkham ` +* :user:`Alistair Miles ` +* :user:`Mamy Ratsimbazafy ` +* :user:`Matthew Rocklin ` +* :user:`Vincent Schut ` +* :user:`Anthony Scopatz ` +* :user:`Prakhar Goel ` Zarr is inspired by `HDF5 `_, `h5py `_ and `bcolz `_. @@ -88,4 +92,4 @@ Indices and tables * :ref:`modindex` * :ref:`search` -.. _numcodecs: http://numcodecs.readthedocs.io/ +.. _NumCodecs: http://numcodecs.readthedocs.io/ diff --git a/docs/release.rst b/docs/release.rst index 2835cbb781..1e7c0cb766 100644 --- a/docs/release.rst +++ b/docs/release.rst @@ -1,41 +1,181 @@ Release notes ============= -Changes to ``__repr__``; new ``info`` property -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. release_2.2.0rc1 + +2.2.0rc1 +-------- + +Enhancements +~~~~~~~~~~~~ + +* **Advanced indexing**. The ``Array`` class has several new methods and + properties that enable a selection of items in an array to be retrieved or + updated. See the :ref:`tutorial_indexing` tutorial section for more + information. There is also a `notebook + `_ + with extended examples and performance benchmarks. :issue:`78`, :issue:`89`, + :issue:`112`, :issue:`172`. + +* **New package for compressor and filter codecs**. The classes previously + defined in the :mod:`zarr.codecs` module have been factored out into a + separate package called NumCodecs_. The NumCodecs_ package also includes + several new codec classes not previously available in Zarr, including + compressor codecs for Zstd and LZ4. This change is backwards-compatible with + existing code, as all codec classes defined by NumCodecs are imported into the + :mod:`zarr.codecs` namespace. However, it is recommended to import codecs from + the new package, see the tutorial sections on :ref:`tutorial_compress` and + :ref:`tutorial_filters` for examples. With contributions by + :user:`John Kirkham `; :issue:`74`, :issue:`102`, :issue:`120`, + :issue:`123`, :issue:`139`. + +* **New storage class for DBM-style databases**. The + :class:`zarr.storage.DBMStore` class enables any DBM-style database to be used + as the backing store for an array or group. See the tutorial section on + :ref:`tutorial_storage` for some examples. :issue:`133`, :issue:`186` + +* **New storage class using a nested directory structure for chunk files**. The + :class:`zarr.storage.NestedDirectoryStore` has been added, which is similar to + the existing :class:`zarr.storage.DirectoryStore` class but nests chunk files + for multidimensional arrays into sub-directories. :issue:`155`, :issue:`177` + +* **New tree() method for printing hierarchies**. The ``Group`` class has a new + :func:`zarr.hierarchy.Group.tree` method which enables a tree representation of + a group hierarchy to be printed. Also provides an interactive tree + representation when used within a Jupyter notebook. See the + :ref:`tutorial_diagnostics` tutorial section for examples. By + :user:`John Kirkham `; :issue:`82`, :issue:`140`, :issue:`184`. + +* **Visitor API**. 
The ``Group`` class now implements the h5py visitor API, see + docs for the :func:`zarr.hierarchy.Group.visit`, + :func:`zarr.hierarchy.Group.visititems` and + :func:`zarr.hierarchy.Group.visitvalues` methods. By + :user:`John Kirkham `, :issue:`92`, :issue:`122`. + +* **Viewing an array as a different dtype**. The ``Array`` class has a new + :func:`zarr.core.Array.astype` method, which is a convenience that enables an + array to be viewed as a different dtype. By :user:`John Kirkham `, + :issue:`94`, :issue:`96`. + +* **New open(), save(), load() convenience functions**. The function + :func:`zarr.convenience.open` provides a convenient way to open a persistent + array or group, using either a ``DirectoryStore`` or ``ZipStore`` as the backing + store. The functions :func:`zarr.convenience.save` and + :func:`zarr.convenience.load` are also available and provide a convenient way to + save an entire NumPy array to disk and load back into memory later. See the + tutorial section :ref:`tutorial_persist` for examples. :issue:`104`, + :issue:`105`, :issue:`141`, :issue:`181`. + +* **IPython completions**. The ``Group`` class now implements ``__dir__()`` and + ``_ipython_key_completions_()`` which enables tab-completion for group members + to be used in any IPython interactive environment. :issue:`170`. + +* **New info property; changes to __repr__**. The ``Group`` and + ``Array`` classes have a new ``info`` property which can be used to print + diagnostic information, including compression ratio where available. See the + tutorial section on :ref:`tutorial_diagnostics` for examples. The string + representation (``__repr__``) of these classes has been simplified to ensure + it is cheap and quick to compute in all circumstances. :issue:`83`, + :issue:`115`, :issue:`132`, :issue:`148`. + +* **Chunk options**. When creating an array, ``chunks=False`` can be specified, + which will result in an array with a single chunk only. Alternatively, + ``chunks=True`` will trigger an automatic chunk shape guess. See + :ref:`tutorial_chunks` for more on the ``chunks`` parameter. :issue:`106`, + :issue:`107`, :issue:`183`. + +* **Zero-dimensional arrays** and are now supported; by + :user:`Prakhar Goel `, :issue:`154`, :issue:`161`. + +* **Arrays with one or more zero-length dimensions** are now fully supported; by + :user:`Prakhar Goel `, :issue:`150`, :issue:`154`, :issue:`160`. + +Bug fixes +~~~~~~~~~ + +* Fixed bug where ``read_only`` keyword argument was ignored when creating an + array; :issue:`151`, :issue:`179`. + +* Fixed bugs when using a ``ZipStore`` opened in 'w' mode; :issue:`158`, + :issue:`182`. + +* Fill values can now be provided for fixed-length string arrays; :issue:`165`, + :issue:`176`. + +* Fixed a bug where the number of chunks initialized could be counted + incorrectly; :issue:`97`, :issue:`174`. + +* Fixed a bug related to the use of an ellipsis (...) in indexing statements; + :issue:`93`, :issue:`168`, :issue:`172`. + +* Fixed a bug preventing use of other integer types for indexing; :issue:`143`, + :issue:`147`. + +Documentation +~~~~~~~~~~~~~ + +* Some changes have been made to the :ref:`spec_v2` document to clarify + ambiguities and add some missing information. These changes do not modify any + of the material previously implemented, and so the changes have been made + in-place in the document without incrementing the document version number. 
The + specification now describes how bytes fill values should be encoded and + decoded for arrays with a fixed-length byte string data type (:issue:`165`, + :issue:`176`). The specification now also clarifies that datetime64 and + timedelta64 data types are not supported in this version (:issue:`85`). +* A new :ref:`tutorial_indexing` section has been added to the tutorial. +* A new :ref:`tutorial_strings` section has been added to the tutorial + (:issue:`135`, :issue:`175`). +* The :ref:`tutorial_chunks` tutorial section has been reorganised and updated. +* The :ref:`tutorial_persist` and :ref:`tutorial_storage` tutorial sections have + been updated with new examples (:issue:`100`, :issue:`101`, :issue:`103`). +* A new tutorial section on :ref:`tutorial_pickle` has been added (:issue:`91`). +* A new tutorial section on :ref:`tutorial_datetime` has been added. +* A new tutorial section on :ref:`tutorial_diagnostics` has been added. + +Maintenance +~~~~~~~~~~~ + +* A data fixture has been included in the test suite to ensure data format + compatibility is maintained; :issue:`83`, :issue:`146`. +* Various continuous integration updates and improvements; :issue:`118`, :issue:`124`, + :issue:`125`, :issue:`126`, :issue:`109`, :issue:`114`, :issue:`171`. + +Acknowledgments +~~~~~~~~~~~~~~~ + +Code was contributed to this release by :user:`John Kirkham ` and +:user:`Prakhar Goel `. + +Documentation was contributed to this release by :user:`Mamy Ratsimbazafy `. -The string representation (``__repr__``) of array and group objects has been been simplified -(`#83 `_, -`#115 `_, -`#132 `_). -Further diagnostic information can be obtained via a new ``info`` property. See the tutorial -section on :ref:`tutorial_tips_info` for examples. +Thank you to :user:`John Kirkham `, :user:`Stephan Hoyer `, +:user:`Francesc Alted `, and :user:`Matthew Rocklin ` for code +reviews and/or comments on pull requests. .. _release_2.1.4: 2.1.4 ----- -Resolved an issue where calling ``hasattr`` on a ``Group`` object erroneously returned a -``KeyError`` (`#88 `_, -`#95 `_, -`Vincent Schut `_) +* Resolved an issue where calling ``hasattr`` on a ``Group`` object erroneously + returned a ``KeyError``. By :user:`Vincent Schut `; :issue:`88`, + :issue:`95`. .. _release_2.1.3: 2.1.3 ----- -Resolved an issue with :func:`zarr.creation.array` where dtype was given as -None (`#80 `_). +* Resolved an issue with :func:`zarr.creation.array` where dtype was given as + None (:issue:`80`). .. _release_2.1.2: 2.1.2 ----- -Resolved an issue when no compression is used and chunks are stored in memory -(`#79 `_). +* Resolved an issue when no compression is used and chunks are stored in memory + (:issue:`79`). .. _release_2.1.1: @@ -54,24 +194,24 @@ fixed bug in pickling ``ThreadSynchronizer``. ----- * Group objects now support member deletion via ``del`` statement - (`#65 `_). + (:issue:`65`). * Added :class:`zarr.storage.TempStore` class for convenience to provide storage via a temporary directory - (`#59 `_). + (:issue:`59`). * Fixed performance issues with :class:`zarr.storage.ZipStore` class - (`#66 `_). + (:issue:`66`). * The Blosc extension has been modified to return bytes instead of array objects from compress and decompress function calls. This should improve compatibility and also provides a small performance increase for compressing high compression ratio data - (`#55 `_). + (:issue:`55`). * Added ``overwrite`` keyword argument to array and group creation methods on the :class:`zarr.hierarchy.Group` class - (`#71 `_). 
+ (:issue:`71`). * Added ``cache_metadata`` keyword argument to array creation methods. * The functions :func:`zarr.creation.open_array` and :func:`zarr.hierarchy.open_group` now accept any store as first argument - (`#56 `_). + (:issue:`56`). .. _release_2.0.1: @@ -112,8 +252,8 @@ The bundled Blosc library has been upgraded to version 1.11.0. Acknowledgments ~~~~~~~~~~~~~~~ -Thanks to Matthew Rocklin (mrocklin_), Stephan Hoyer (shoyer_) and -Francesc Alted (FrancescAlted_) for contributions and comments. +Thanks to :user:`Matthew Rocklin `, :user:`Stephan Hoyer ` and +:user:`Francesc Alted ` for contributions and comments. .. _release_1.1.0: @@ -149,7 +289,7 @@ abstraction layer between the core array logic and data storage (`#21 `_). In this release, any object that implements the ``MutableMapping`` interface can be used as an array store. See the tutorial sections on :ref:`tutorial_persist` -and :ref:`tutorial_tips_storage`, the :ref:`spec_v1`, and the +and :ref:`tutorial_storage`, the :ref:`spec_v1`, and the :mod:`zarr.storage` module documentation for more information. Please note also that the file organization and file name conventions @@ -216,7 +356,7 @@ The memory layout within chunks can now be set as either "C" (row-major) or "F" (column-major), which can help to provide better compression for some data (`#7 `_). See the tutorial -section on :ref:`tutorial_tips_order` for more information. +section on :ref:`tutorial_chunks_order` for more information. A bug has been fixed within the ``__getitem__`` and ``__setitem__`` machinery for slicing arrays, to properly handle getting and setting @@ -225,9 +365,9 @@ partial slices. Acknowledgments ~~~~~~~~~~~~~~~ -Thanks to Matthew Rocklin (mrocklin_), Stephan Hoyer (shoyer_), -Francesc Alted (FrancescAlted_), Anthony Scopatz (scopatz_) and Martin -Durant (martindurant_) for contributions and comments. +Thanks to :user:`Matthew Rocklin `, :user:`Stephan Hoyer `, +:user:`Francesc Alted `, :user:`Anthony Scopatz ` and +:user:`Martin Durant ` for contributions and comments. .. _release_0.4.0: @@ -245,8 +385,4 @@ See `v0.4.0 release notes on GitHub See `v0.3.0 release notes on GitHub `_. -.. _mrocklin: https://github.com/mrocklin -.. _shoyer: https://github.com/shoyer -.. _scopatz: https://github.com/scopatz -.. _martindurant: https://github.com/martindurant -.. _FrancescAlted: https://github.com/FrancescAlted +.. _NumCodecs: http://numcodecs.readthedocs.io/ diff --git a/docs/spec/v2.rst b/docs/spec/v2.rst index 88df4f9439..f34a8e3179 100644 --- a/docs/spec/v2.rst +++ b/docs/spec/v2.rst @@ -117,15 +117,18 @@ consists of 3 parts: ``">"``: big-endian; ``"|"``: not-relevant) * One character code giving the basic type of the array (``"b"``: Boolean (integer type where all values are only True or False); ``"i"``: integer; ``"u"``: unsigned - integer; ``"f"``: floating point; ``"c"``: complex floating point; ``"m"``: timedelta; - ``"M"``: datetime; ``"S"``: string (fixed-length sequence of char); ``"U"``: unicode - (fixed-length sequence of Py_UNICODE); ``"V"``: other (void * – each item is a - fixed-size chunk of memory)) + integer; ``"f"``: floating point; ``"c"``: complex floating point; ``"S"``: string + (fixed-length sequence of char); ``"U"``: unicode (fixed-length sequence of + Py_UNICODE); ``"V"``: other (void * – each item is a fixed-size chunk of memory)) * An integer specifying the number of bytes the type uses. The byte order MUST be specified. 
E.g., ``"i4"``, ``"|b1"`` and ``"|S12"`` are valid data type encodings. +Please note that NumPy's datetime64 ("M") and timedelta64 ("m") data types are **not** +currently supported. Please store data using an appropriate physical data type instead, +e.g., 64-bit integer. + Structured data types (i.e., with multiple named fields) are encoded as a list of two-element lists, following `NumPy array protocol type descriptions (descr) `_. For @@ -295,7 +298,7 @@ local file system as storage. Create an array:: >>> import zarr - >>> store = zarr.DirectoryStore('example') + >>> store = zarr.DirectoryStore('data/example.zarr') >>> a = zarr.create(shape=(20, 20), chunks=(10, 10), dtype='i4', ... fill_value=42, compressor=zarr.Zlib(level=1), ... store=store, overwrite=True) @@ -304,12 +307,12 @@ No chunks are initialized yet, so only the ".zarray" and ".zattrs" keys have been set in the store:: >>> import os - >>> sorted(os.listdir('example')) + >>> sorted(os.listdir('data/example.zarr')) ['.zarray', '.zattrs'] Inspect the array metadata:: - >>> print(open('example/.zarray').read()) + >>> print(open('data/example.zarr/.zarray').read()) { "chunks": [ 10, @@ -332,26 +335,26 @@ Inspect the array metadata:: Inspect the array attributes:: - >>> print(open('example/.zattrs').read()) + >>> print(open('data/example.zarr/.zattrs').read()) {} Chunks are initialized on demand. E.g., set some data:: >>> a[0:10, 0:10] = 1 - >>> sorted(os.listdir('example')) + >>> sorted(os.listdir('data/example.zarr')) ['.zarray', '.zattrs', '0.0'] Set some more data:: >>> a[0:10, 10:20] = 2 >>> a[10:20, :] = 3 - >>> sorted(os.listdir('example')) + >>> sorted(os.listdir('data/example.zarr')) ['.zarray', '.zattrs', '0.0', '0.1', '1.0', '1.1'] Manually decompress a single chunk for illustration:: >>> import zlib - >>> buf = zlib.decompress(open('example/0.0', 'rb').read()) + >>> buf = zlib.decompress(open('data/example.zarr/0.0', 'rb').read()) >>> import numpy as np >>> chunk = np.frombuffer(buf, dtype='>> chunk @@ -366,7 +369,7 @@ Modify the array attributes:: >>> a.attrs['foo'] = 42 >>> a.attrs['bar'] = 'apples' >>> a.attrs['baz'] = [1, 2, 3, 4] - >>> print(open('example/.zattrs').read()) + >>> print(open('data/example.zarr/.zattrs').read()) { "bar": "apples", "baz": [ @@ -389,7 +392,7 @@ however this is an implementation choice and is not required. 
Setup the store:: >>> import zarr - >>> store = zarr.DirectoryStore('example_hierarchy') + >>> store = zarr.DirectoryStore('data/group.zarr') Create the root group:: @@ -399,19 +402,19 @@ The metadata resource for the root group has been created, as well as a custom attributes resource:: >>> import os - >>> sorted(os.listdir('example_hierarchy')) + >>> sorted(os.listdir('data/group.zarr')) ['.zattrs', '.zgroup'] Inspect the group metadata:: - >>> print(open('example_hierarchy/.zgroup').read()) + >>> print(open('data/group.zarr/.zgroup').read()) { "zarr_format": 2 } Inspect the group attributes:: - >>> print(open('example_hierarchy/.zattrs').read()) + >>> print(open('data/group.zarr/.zattrs').read()) {} Create a sub-group:: @@ -420,9 +423,9 @@ Create a sub-group:: What has been stored:: - >>> sorted(os.listdir('example_hierarchy')) + >>> sorted(os.listdir('data/group.zarr')) ['.zattrs', '.zgroup', 'foo'] - >>> sorted(os.listdir('example_hierarchy/foo')) + >>> sorted(os.listdir('data/group.zarr/foo')) ['.zattrs', '.zgroup'] Create an array within the sub-group:: @@ -432,16 +435,16 @@ Create an array within the sub-group:: What has been stored:: - >>> sorted(os.listdir('example_hierarchy')) + >>> sorted(os.listdir('data/group.zarr')) ['.zattrs', '.zgroup', 'foo'] - >>> sorted(os.listdir('example_hierarchy/foo')) + >>> sorted(os.listdir('data/group.zarr/foo')) ['.zattrs', '.zgroup', 'bar'] - >>> sorted(os.listdir('example_hierarchy/foo/bar')) + >>> sorted(os.listdir('data/group.zarr/foo/bar')) ['.zarray', '.zattrs', '0.0', '0.1', '1.0', '1.1'] Here is the same example using a Zip file as storage:: - >>> store = zarr.ZipStore('example_hierarchy.zip', mode='w') + >>> store = zarr.ZipStore('data/group.zip', mode='w') >>> root_grp = zarr.group(store) >>> sub_grp = root_grp.create_group('foo') >>> a = sub_grp.create_dataset('bar', shape=(20, 20), chunks=(10, 10)) @@ -451,7 +454,7 @@ Here is the same example using a Zip file as storage:: What has been stored:: >>> import zipfile - >>> zf = zipfile.ZipFile('example_hierarchy.zip', mode='r') + >>> zf = zipfile.ZipFile('data/group.zip', mode='r') >>> for name in sorted(zf.namelist()): ... print(name) .zattrs diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 8b8d2cbe83..9f1fd15c20 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -3,40 +3,37 @@ Tutorial ======== -Zarr provides classes and functions for working with N-dimensional -arrays that behave like NumPy arrays but whose data is divided into -chunks and compressed. If you are already familiar with HDF5 -then Zarr arrays provide similar functionality, but with some -additional flexibility. +Zarr provides classes and functions for working with N-dimensional arrays that +behave like NumPy arrays but whose data is divided into chunks and each chunk is +compressed. If you are already familiar with HDF5 then Zarr arrays provide +similar functionality, but with some additional flexibility. .. _tutorial_create: Creating an array ----------------- -Zarr has a number of convenience functions for creating arrays. For -example:: +Zarr has several functions for creating arrays. For example:: >>> import zarr >>> z = zarr.zeros((10000, 10000), chunks=(1000, 1000), dtype='i4') >>> z -The code above creates a 2-dimensional array of 32-bit integers with -10000 rows and 10000 columns, divided into chunks where each chunk has -1000 rows and 1000 columns (and so there will be 100 chunks in total). 
+The code above creates a 2-dimensional array of 32-bit integers with 10000 rows +and 10000 columns, divided into chunks where each chunk has 1000 rows and 1000 +columns (and so there will be 100 chunks in total). -For a complete list of array creation routines see the -:mod:`zarr.creation` module documentation. +For a complete list of array creation routines see the :mod:`zarr.creation` +module documentation. .. _tutorial_array: Reading and writing data ------------------------ -Zarr arrays support a similar interface to NumPy arrays for reading -and writing data. For example, the entire array can be filled with a -scalar value:: +Zarr arrays support a similar interface to NumPy arrays for reading and writing +data. For example, the entire array can be filled with a scalar value:: >>> z[:] = 42 @@ -46,8 +43,8 @@ Regions of the array can also be written to, e.g.:: >>> z[0, :] = np.arange(10000) >>> z[:, 0] = np.arange(10000) -The contents of the array can be retrieved by slicing, which will load -the requested region into memory as a NumPy array, e.g.:: +The contents of the array can be retrieved by slicing, which will load the +requested region into memory as a NumPy array, e.g.:: >>> z[0, 0] 0 @@ -57,7 +54,7 @@ the requested region into memory as a NumPy array, e.g.:: array([ 0, 1, 2, ..., 9997, 9998, 9999], dtype=int32) >>> z[:, 0] array([ 0, 1, 2, ..., 9997, 9998, 9999], dtype=int32) - >>> z[...] + >>> z[:] array([[ 0, 1, 2, ..., 9997, 9998, 9999], [ 1, 42, 42, ..., 42, 42, 42], [ 2, 42, 42, ..., 42, 42, 42], @@ -71,23 +68,23 @@ the requested region into memory as a NumPy array, e.g.:: Persistent arrays ----------------- -In the examples above, compressed data for each chunk of the array was -stored in memory. Zarr arrays can also be stored on a file system, -enabling persistence of data between sessions. For example:: +In the examples above, compressed data for each chunk of the array was stored in +main memory. Zarr arrays can also be stored on a file system, enabling +persistence of data between sessions. For example:: - >>> z1 = zarr.open_array('example.zarr', mode='w', shape=(10000, 10000), - ... chunks=(1000, 1000), dtype='i4') + >>> z1 = zarr.open('data/example.zarr', mode='w', shape=(10000, 10000), + ... chunks=(1000, 1000), dtype='i4') -The array above will store its configuration metadata and all -compressed chunk data in a directory called 'example.zarr' relative to -the current working directory. The :func:`zarr.creation.open_array` function -provides a convenient way to create a new persistent array or continue -working with an existing array. Note that there is no need to close an -array, and data are automatically flushed to disk whenever an array is -modified. +The array above will store its configuration metadata and all compressed chunk +data in a directory called 'data/example.zarr' relative to the current working +directory. The :func:`zarr.convenience.open` function provides a convenient way +to create a new persistent array or continue working with an existing +array. Note that although the function is called "open", there is no need to +close an array: data are automatically flushed to disk, and files are +automatically closed whenever an array is modified. 
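As a small sketch of "continuing to work with an existing array" as described above (an editorial example, not part of the patch; output values are indicative), the same path can simply be re-opened later, here using the default read/write mode ``'a'``::

    >>> z3 = zarr.open('data/example.zarr', mode='a')  # re-open the existing array
    >>> z3.shape
    (10000, 10000)
    >>> z3.chunks
    (1000, 1000)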
-Persistent arrays support the same interface for reading and writing -data, e.g.:: +Persistent arrays support the same interface for reading and writing data, +e.g.:: >>> z1[:] = 42 >>> z1[0, :] = np.arange(10000) @@ -95,20 +92,30 @@ data, e.g.:: Check that the data have been written and can be read again:: - >>> z2 = zarr.open_array('example.zarr', mode='r') - >>> np.all(z1[...] == z2[...]) + >>> z2 = zarr.open('data/example.zarr', mode='r') + >>> np.all(z1[:] == z2[:]) True -Please note that there are a number of other options for persistent array storage, see the -section on :ref:`tutorial_tips_storage` below. +If you are just looking for a fast and convenient way to save NumPy arrays to +disk then load back into memory later, the functions +:func:`zarr.convenience.save` and :func:`zarr.convenience.load` may be +useful. E.g.:: + + >>> a = np.arange(10) + >>> zarr.save('data/example.zarr', a) + >>> zarr.load('data/example.zarr') + array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) + +Please note that there are a number of other options for persistent array +storage, see the section on :ref:`tutorial_storage` below. .. _tutorial_resize: Resizing and appending ---------------------- -A Zarr array can be resized, which means that any of its dimensions -can be increased or decreased in length. For example:: +A Zarr array can be resized, which means that any of its dimensions can be +increased or decreased in length. For example:: >>> z = zarr.zeros(shape=(10000, 10000), chunks=(1000, 1000)) >>> z[:] = 42 @@ -116,13 +123,12 @@ can be increased or decreased in length. For example:: >>> z.shape (20000, 10000) -Note that when an array is resized, the underlying data are not -rearranged in any way. If one or more dimensions are shrunk, any -chunks falling outside the new array shape will be deleted from the -underlying store. +Note that when an array is resized, the underlying data are not rearranged in +any way. If one or more dimensions are shrunk, any chunks falling outside the +new array shape will be deleted from the underlying store. -For convenience, Zarr arrays also provide an ``append()`` method, -which can be used to append data to any axis. E.g.:: +For convenience, Zarr arrays also provide an ``append()`` method, which can be +used to append data to any axis. E.g.:: >>> a = np.arange(10000000, dtype='i4').reshape(10000, 1000) >>> z = zarr.array(a, chunks=(1000, 100)) @@ -140,9 +146,10 @@ which can be used to append data to any axis. E.g.:: Compressors ----------- -A number of different compressors can be used with Zarr. A separate package called Numcodecs_ is -available which provides an interface to various compressor libraries including Blosc, Zstandard, -LZ4, Zlib, BZ2 and LZMA. Different compressors can be provided via the ``compressor`` keyword +A number of different compressors can be used with Zarr. A separate package +called NumCodecs_ is available which provides a common interface to various +compressor libraries including Blosc, Zstandard, LZ4, Zlib, BZ2 and +LZMA. Different compressors can be provided via the ``compressor`` keyword argument accepted by all array creation functions. For example:: >>> from numcodecs import Blosc @@ -152,11 +159,13 @@ argument accepted by all array creation functions. For example:: >>> z.compressor Blosc(cname='zstd', clevel=3, shuffle=BITSHUFFLE, blocksize=0) -This array above will use Blosc as the primary compressor, using the Zstandard algorithm -(compression level 3) internally within Blosc, and with the bitshuffle filter applied. 
+This array above will use Blosc as the primary compressor, using the Zstandard +algorithm (compression level 3) internally within Blosc, and with the +bit-shuffle filter applied. -When using a compressor, it can be useful to get some diagnostics on the compression ratio. Zarr -arrays provide a ``info`` property which can be used to print some diagnostics, e.g.:: +When using a compressor, it can be useful to get some diagnostics on the +compression ratio. Zarr arrays provide a ``info`` property which can be used to +print some diagnostics, e.g.:: >>> z.info Type : zarr.core.Array @@ -173,20 +182,21 @@ arrays provide a ``info`` property which can be used to print some diagnostics, Storage ratio : 87.6 Chunks initialized : 100/100 -If you don't specify a compressor, by default Zarr uses the Blosc compressor. Blosc is extremely -fast and can be configured in a variety of ways to improve the compression ratio for different -types of data. Blosc is in fact a "meta-compressor", which means that it can used a number of -different compression algorithms internally to compress the data. Blosc also provides highly -optimized implementations of byte and bit shuffle filters, which can significantly improve -compression ratios for some data. A list of the internal compression libraries available within -Blosc can be obtained via:: +If you don't specify a compressor, by default Zarr uses the Blosc +compressor. Blosc is generally very fast and can be configured in a variety of +ways to improve the compression ratio for different types of data. Blosc is in +fact a "meta-compressor", which means that it can use a number of different +compression algorithms internally to compress the data. Blosc also provides +highly optimized implementations of byte- and bit-shuffle filters, which can +improve compression ratios for some data. A list of the internal compression +libraries available within Blosc can be obtained via:: >>> from numcodecs import blosc >>> blosc.list_compressors() ['blosclz', 'lz4', 'lz4hc', 'snappy', 'zlib', 'zstd'] -In addition to Blosc, other compression libraries can also be used. For example, here is an array -using Zstandard compression, level 1:: +In addition to Blosc, other compression libraries can also be used. For example, +here is an array using Zstandard compression, level 1:: >>> from numcodecs import Zstd >>> z = zarr.array(np.arange(100000000, dtype='i4').reshape(10000, 10000), @@ -194,8 +204,8 @@ using Zstandard compression, level 1:: >>> z.compressor Zstd(level=1) -Here is an example using LZMA with a custom filter pipeline including -LZMA's built-in delta filter:: +Here is an example using LZMA with a custom filter pipeline including LZMA's +built-in delta filter:: >>> import lzma >>> lzma_filters = [dict(id=lzma.FILTER_DELTA, dist=4), @@ -234,16 +244,15 @@ Filters In some cases, compression can be improved by transforming the data in some way. For example, if nearby values tend to be correlated, then shuffling the bytes within each numerical value or storing the difference between adjacent -values may increase compression ratio. Some compressors provide built-in -filters that apply transformations to the data prior to compression. For -example, the Blosc compressor has highly optimized built-in implementations of -byte- and bit-shuffle filters, and the LZMA compressor has a built-in -implementation of a delta filter. 
However, to provide additional -flexibility for implementing and using filters in combination with different -compressors, Zarr also provides a mechanism for configuring filters outside of -the primary compressor. +values may increase compression ratio. Some compressors provide built-in filters +that apply transformations to the data prior to compression. For example, the +Blosc compressor has built-in implementations of byte- and bit-shuffle filters, +and the LZMA compressor has a built-in implementation of a delta +filter. However, to provide additional flexibility for implementing and using +filters in combination with different compressors, Zarr also provides a +mechanism for configuring filters outside of the primary compressor. -Here is an example using the delta filter with the Blosc compressor:: +Here is an example using a delta filter with the Blosc compressor:: >>> from numcodecs import Blosc, Delta >>> filters = [Delta(dtype='i4')] @@ -265,88 +274,8 @@ Here is an example using the delta filter with the Blosc compressor:: Storage ratio : 616.7 Chunks initialized : 100/100 -For more information about available filter codecs, see the -`Numcodecs `_ documentation. - -.. _tutorial_sync: - -Parallel computing and synchronization --------------------------------------- - -Zarr arrays can be used as either the source or sink for data in -parallel computations. Both multi-threaded and multi-process -parallelism are supported. The Python global interpreter lock (GIL) is -released for both compression and decompression operations, so Zarr -will not block other Python threads from running. - -A Zarr array can be read concurrently by multiple threads or processes. -No synchronization (i.e., locking) is required for concurrent reads. - -A Zarr array can also be written to concurrently by multiple threads -or processes. Some synchronization may be required, depending on the -way the data is being written. - -If each worker in a parallel computation is writing to a separate -region of the array, and if region boundaries are perfectly aligned -with chunk boundaries, then no synchronization is required. However, -if region and chunk boundaries are not perfectly aligned, then -synchronization is required to avoid two workers attempting to modify -the same chunk at the same time. - -To give a simple example, consider a 1-dimensional array of length 60, -``z``, divided into three chunks of 20 elements each. If three workers -are running and each attempts to write to a 20 element region (i.e., -``z[0:20]``, ``z[20:40]`` and ``z[40:60]``) then each worker will be -writing to a separate chunk and no synchronization is -required. However, if two workers are running and each attempts to -write to a 30 element region (i.e., ``z[0:30]`` and ``z[30:60]``) then -it is possible both workers will attempt to modify the middle chunk at -the same time, and synchronization is required to prevent data loss. - -Zarr provides support for chunk-level synchronization. E.g., create an -array with thread synchronization:: - - >>> z = zarr.zeros((10000, 10000), chunks=(1000, 1000), dtype='i4', - ... synchronizer=zarr.ThreadSynchronizer()) - >>> z - - -This array is safe to read or write within a multi-threaded program. - -Zarr also provides support for process synchronization via file locking, -provided that all processes have access to a shared file system. E.g.:: - - >>> synchronizer = zarr.ProcessSynchronizer('example.sync') - >>> z = zarr.open_array('example', mode='w', shape=(10000, 10000), - ... 
chunks=(1000, 1000), dtype='i4', - ... synchronizer=synchronizer) - >>> z - - -This array is safe to read or write from multiple processes. - -.. _tutorial_attrs: - -User attributes ---------------- - -Zarr arrays also support custom key/value attributes, which can be useful -for associating an array with application-specific metadata. For example:: - - >>> z = zarr.zeros((10000, 10000), chunks=(1000, 1000), dtype='i4') - >>> z.attrs['foo'] = 'bar' - >>> z.attrs['baz'] = 42 - >>> sorted(z.attrs) - ['baz', 'foo'] - >>> 'foo' in z.attrs - True - >>> z.attrs['foo'] - 'bar' - >>> z.attrs['baz'] - 42 - -Internally Zarr uses JSON to store array attributes, so attribute values -must be JSON serializable. +For more information about available filter codecs, see the `Numcodecs +`_ documentation. .. _tutorial_groups: @@ -357,82 +286,198 @@ Zarr supports hierarchical organization of arrays via groups. As with arrays, groups can be stored in memory, on disk, or via other storage systems that support a similar interface. -To create a group, use the :func:`zarr.hierarchy.group` function:: +To create a group, use the :func:`zarr.group` function:: - >>> root_group = zarr.group() - >>> root_group + >>> root = zarr.group() + >>> root -Groups have a similar API to the Group class from `h5py `_. -For example, groups can contain other groups:: +Groups have a similar API to the Group class from `h5py +`_. For example, groups can contain other groups:: - >>> foo_group = root_group.create_group('foo') - >>> bar_group = foo_group.create_group('bar') + >>> foo = root.create_group('foo') + >>> bar = foo.create_group('bar') Groups can also contain arrays, e.g.:: - >>> z1 = bar_group.zeros('baz', shape=(10000, 10000), chunks=(1000, 1000), dtype='i4', - ... compressor=zarr.Blosc(cname='zstd', clevel=1, shuffle=1)) + >>> z1 = bar.zeros('baz', shape=(10000, 10000), chunks=(1000, 1000), dtype='i4') >>> z1 -Arrays are known as "datasets" in HDF5 terminology. For compatibility with -h5py, Zarr groups also implement the :func:`zarr.hierarchy.Group.create_dataset` -and :func:`zarr.hierarchy.Group.require_dataset` methods, e.g.:: +Arrays are known as "datasets" in HDF5 terminology. For compatibility with h5py, +Zarr groups also implement the ``create_dataset()`` and ``require_dataset()`` +methods, e.g.:: - >>> z = bar_group.create_dataset('quux', shape=(10000, 10000), - ... chunks=(1000, 1000), dtype='i4', - ... 
compression='gzip', compression_opts=1) + >>> z = bar.create_dataset('quux', shape=(10000, 10000), chunks=(1000, 1000), dtype='i4') >>> z Members of a group can be accessed via the suffix notation, e.g.:: - >>> root_group['foo'] + >>> root['foo'] -The '/' character can be used to access multiple levels of the hierarchy, -e.g.:: +The '/' character can be used to access multiple levels of the hierarchy in one +call, e.g.:: - >>> root_group['foo/bar'] + >>> root['foo/bar'] - >>> root_group['foo/bar/baz'] + >>> root['foo/bar/baz'] -The :func:`zarr.hierarchy.open_group` provides a convenient way to create or -re-open a group stored in a directory on the file-system, with sub-groups -stored in sub-directories, e.g.:: +The :func:`zarr.hierarchy.Group.tree` method can be used to print a tree +representation of the hierarchy, e.g.:: - >>> persistent_group = zarr.open_group('example', mode='w') - >>> persistent_group + >>> root.tree() + / + └── foo + └── bar + ├── baz (10000, 10000) int32 + └── quux (10000, 10000) int32 + +The :func:`zarr.convenience.open` function provides a convenient way to create or +re-open a group stored in a directory on the file-system, with sub-groups stored in +sub-directories, e.g.:: + + >>> root = zarr.open('data/group.zarr', mode='w') + >>> root - >>> z = persistent_group.create_dataset('foo/bar/baz', shape=(10000, 10000), - ... chunks=(1000, 1000), dtype='i4') + >>> z = root.zeros('foo/bar/baz', shape=(10000, 10000), chunks=(1000, 1000), dtype='i4') >>> z -For more information on groups see the :mod:`zarr.hierarchy` API docs. +For more information on groups see the :mod:`zarr.hierarchy` and +:mod:`zarr.convenience` API docs. + +.. _tutorial_diagnostics: + +Array and group diagnostics +--------------------------- + +Diagnostic information about arrays and groups is available via the ``info`` +property. E.g.:: + + >>> root = zarr.group() + >>> foo = root.create_group('foo') + >>> bar = foo.zeros('bar', shape=1000000, chunks=100000, dtype='i8') + >>> bar[:] = 42 + >>> baz = foo.zeros('baz', shape=(1000, 1000), chunks=(100, 100), dtype='f4') + >>> baz[:] = 4.2 + >>> root.info + Name : / + Type : zarr.hierarchy.Group + Read-only : False + Store type : zarr.storage.DictStore + No. members : 1 + No. arrays : 0 + No. groups : 1 + Groups : foo + + >>> foo.info + Name : /foo + Type : zarr.hierarchy.Group + Read-only : False + Store type : zarr.storage.DictStore + No. members : 2 + No. arrays : 2 + No. groups : 0 + Arrays : bar, baz + + >>> bar.info + Name : /foo/bar + Type : zarr.core.Array + Data type : int64 + Shape : (1000000,) + Chunk shape : (100000,) + Order : C + Read-only : False + Compressor : Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0) + Store type : zarr.storage.DictStore + No. bytes : 8000000 (7.6M) + No. bytes stored : 37482 (36.6K) + Storage ratio : 213.4 + Chunks initialized : 10/10 + + >>> baz.info + Name : /foo/baz + Type : zarr.core.Array + Data type : float32 + Shape : (1000, 1000) + Chunk shape : (100, 100) + Order : C + Read-only : False + Compressor : Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0) + Store type : zarr.storage.DictStore + No. bytes : 4000000 (3.8M) + No. 
bytes stored : 23245 (22.7K) + Storage ratio : 172.1 + Chunks initialized : 100/100 + +Groups also have the :func:`zarr.hierarchy.Group.tree` method, e.g.:: + + >>> root.tree() + / + └── foo + ├── bar (1000000,) int64 + └── baz (1000, 1000) float32 + +If you're using Zarr within a Jupyter notebook, calling ``tree()`` will generate an +interactive tree representation, see the `repr_tree.ipynb notebook +`_ +for more examples. + +.. _tutorial_attrs: + +User attributes +--------------- + +Zarr arrays and groups support custom key/value attributes, which can be useful for +storing application-specific metadata. For example:: + + >>> root = zarr.group() + >>> root.attrs['foo'] = 'bar' + >>> z = root.zeros('zzz', shape=(10000, 10000)) + >>> z.attrs['baz'] = 42 + >>> z.attrs['qux'] = [1, 4, 7, 12] + >>> sorted(root.attrs) + ['foo'] + >>> 'foo' in root.attrs + True + >>> root.attrs['foo'] + 'bar' + >>> sorted(z.attrs) + ['baz', 'qux'] + >>> z.attrs['baz'] + 42 + >>> z.attrs['qux'] + [1, 4, 7, 12] + +Internally Zarr uses JSON to store array attributes, so attribute values must be +JSON serializable. .. _tutorial_indexing: Advanced indexing ----------------- -As of Zarr version 2.2, Zarr arrays support several methods for advanced or "fancy" indexing, -which enable a subset of data items to be extracted or updated in an array without loading the -entire array into memory. Note that although this functionality is similar to some of the -advanced indexing capabilities available on NumPy arrays and on h5py datasets, **the Zarr API for -advanced indexing is different from both NumPy and h5py**, so please read this section carefully. -For a complete description of the indexing API, see the documentation for the -:class:`zarr.core.Array` class. +As of version 2.2, Zarr arrays support several methods for advanced or "fancy" +indexing, which enable a subset of data items to be extracted or updated in an +array without loading the entire array into memory. + +Note that although this functionality is similar to some of the advanced +indexing capabilities available on NumPy arrays and on h5py datasets, **the Zarr +API for advanced indexing is different from both NumPy and h5py**, so please +read this section carefully. For a complete description of the indexing API, +see the documentation for the :class:`zarr.core.Array` class. Indexing with coordinate arrays ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Items from a Zarr array can be extracted by providing an integer array of coordinates. E.g.:: +Items from a Zarr array can be extracted by providing an integer array of +coordinates. E.g.:: >>> z = zarr.array(np.arange(10)) - >>> z[...] + >>> z[:] array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) >>> z.get_coordinate_selection([1, 4]) array([1, 4]) @@ -440,30 +485,32 @@ Items from a Zarr array can be extracted by providing an integer array of coordi Coordinate arrays can also be used to update data, e.g.:: >>> z.set_coordinate_selection([1, 4], [-1, -2]) - >>> z[...] + >>> z[:] array([ 0, -1, 2, 3, -2, 5, 6, 7, 8, 9]) -For multidimensional arrays, coordinates must be provided for each dimension, e.g.:: +For multidimensional arrays, coordinates must be provided for each dimension, +e.g.:: >>> z = zarr.array(np.arange(15).reshape(3, 5)) - >>> z[...] + >>> z[:] array([[ 0, 1, 2, 3, 4], [ 5, 6, 7, 8, 9], [10, 11, 12, 13, 14]]) >>> z.get_coordinate_selection(([0, 2], [1, 3])) array([ 1, 13]) >>> z.set_coordinate_selection(([0, 2], [1, 3]), [-1, -2]) - >>> z[...] 
+ >>> z[:] array([[ 0, -1, 2, 3, 4], [ 5, 6, 7, 8, 9], [10, 11, 12, -2, 14]]) -For convenience, coordinate indexing is also available via the ``vindex`` property, e.g.:: +For convenience, coordinate indexing is also available via the ``vindex`` +property, e.g.:: >>> z.vindex[[0, 2], [1, 3]] array([-1, -2]) >>> z.vindex[[0, 2], [1, 3]] = [-3, -4] - >>> z[...] + >>> z[:] array([[ 0, -3, 2, 3, 4], [ 5, 6, 7, 8, 9], [10, 11, 12, -4, 14]]) @@ -471,10 +518,10 @@ For convenience, coordinate indexing is also available via the ``vindex`` proper Indexing with a mask array ~~~~~~~~~~~~~~~~~~~~~~~~~~ -Items can also be extracted by providing a Boolean mask array. E.g.:: +Items can also be extracted by providing a Boolean mask. E.g.:: >>> z = zarr.array(np.arange(10)) - >>> z[...] + >>> z[:] array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) >>> sel = np.zeros_like(z, dtype=bool) >>> sel[1] = True @@ -482,13 +529,13 @@ Items can also be extracted by providing a Boolean mask array. E.g.:: >>> z.get_mask_selection(sel) array([1, 4]) >>> z.set_mask_selection(sel, [-1, -2]) - >>> z[...] + >>> z[:] array([ 0, -1, 2, 3, -2, 5, 6, 7, 8, 9]) -Here is a multidimensional example:: +Here's a multidimensional example:: >>> z = zarr.array(np.arange(15).reshape(3, 5)) - >>> z[...] + >>> z[:] array([[ 0, 1, 2, 3, 4], [ 5, 6, 7, 8, 9], [10, 11, 12, 13, 14]]) @@ -498,34 +545,36 @@ Here is a multidimensional example:: >>> z.get_mask_selection(sel) array([ 1, 13]) >>> z.set_mask_selection(sel, [-1, -2]) - >>> z[...] + >>> z[:] array([[ 0, -1, 2, 3, 4], [ 5, 6, 7, 8, 9], [10, 11, 12, -2, 14]]) -For convenience, mask indexing is also available via the ``vindex`` property, e.g.:: +For convenience, mask indexing is also available via the ``vindex`` property, +e.g.:: >>> z.vindex[sel] array([-1, -2]) >>> z.vindex[sel] = [-3, -4] - >>> z[...] + >>> z[:] array([[ 0, -3, 2, 3, 4], [ 5, 6, 7, 8, 9], [10, 11, 12, -4, 14]]) -Mask indexing is conceptually the same as coordinate indexing, and is implemented internally via -the same machinery. Both styles of indexing allow selecting arbitrary items from an array, also -known as point selection. +Mask indexing is conceptually the same as coordinate indexing, and is +implemented internally via the same machinery. Both styles of indexing allow +selecting arbitrary items from an array, also known as point selection. Orthogonal indexing ~~~~~~~~~~~~~~~~~~~ -Zarr arrays also support methods for orthogonal indexing, which allows selections to be made -along each dimension of an array independently. For example, this allows selecting a subset of -rows and/or columns from a 2-dimensional array. E.g.:: +Zarr arrays also support methods for orthogonal indexing, which allows +selections to be made along each dimension of an array independently. For +example, this allows selecting a subset of rows and/or columns from a +2-dimensional array. E.g.:: >>> z = zarr.array(np.arange(15).reshape(3, 5)) - >>> z[...] + >>> z[:] array([[ 0, 1, 2, 3, 4], [ 5, 6, 7, 8, 9], [10, 11, 12, 13, 14]]) @@ -543,13 +592,13 @@ rows and/or columns from a 2-dimensional array. E.g.:: Data can also be modified, e.g.:: >>> z.set_orthogonal_selection(([0, 2], [1, 3]), [[-1, -2], [-3, -4]]) - >>> z[...] 
+ >>> z[:] array([[ 0, -1, 2, -2, 4], [ 5, 6, 7, 8, 9], [10, -3, 12, -4, 14]]) -For convenience, the orthogonal indexing functionality is also available via the ``oindex`` -property, e.g.:: +For convenience, the orthogonal indexing functionality is also available via the +``oindex`` property, e.g.:: >>> z = zarr.array(np.arange(15).reshape(3, 5)) >>> z.oindex[[0, 2], :] # select first and third rows @@ -563,19 +612,19 @@ property, e.g.:: array([[ 1, 3], [11, 13]]) >>> z.oindex[[0, 2], [1, 3]] = [[-1, -2], [-3, -4]] - >>> z[...] + >>> z[:] array([[ 0, -1, 2, -2, 4], [ 5, 6, 7, 8, 9], [10, -3, 12, -4, 14]]) -Any combination of integer, slice, integer array and/or Boolean array can be used for orthogonal -indexing. +Any combination of integer, slice, 1D integer array and/or 1D Boolean array can +be used for orthogonal indexing. Indexing fields in structured arrays ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -All selection methods support a ``fields`` parameter which allows retrieving or replacing data -for a specific field in an array with a structured dtype. E.g.:: +All selection methods support a ``fields`` parameter which allows retrieving or +replacing data for a specific field in an array with a structured dtype. E.g.:: >>> a = np.array([(b'aaa', 1, 4.2), ... (b'bbb', 2, 8.4), @@ -593,84 +642,258 @@ for a specific field in an array with a structured dtype. E.g.:: array([(b'aaa', 4.2), (b'ccc', 12.6)], dtype=[('foo', 'S3'), ('baz', '>> z = zarr.open('data/example.zarr', mode='w', shape=1000000, dtype='i4') -Array and group information -~~~~~~~~~~~~~~~~~~~~~~~~~~~ +...is short-hand for:: -Diagnostic information about arrays and groups is available via the ``info`` property. E.g.:: + >>> store = zarr.DirectoryStore('data/example.zarr') + >>> z = zarr.create(store=store, overwrite=True, shape=1000000, dtype='i4') - >>> root_group = zarr.group() - >>> foo_group = root_group.create_group('foo') - >>> z = foo_group.zeros('bar', shape=1000000, chunks=100000) +...and the following code:: + + >>> root = zarr.open('data/example.zarr', mode='w') + +...is short-hand for:: + + >>> store = zarr.DirectoryStore('data/example.zarr') + >>> root = zarr.group(store=store, overwrite=True) + +Any other compatible storage class could be used in place of +:class:`zarr.storage.DirectoryStore` in the code examples above. For example, +here is an array stored directly into a Zip file, via the +:class:`zarr.storage.ZipStore` class:: + + >>> store = zarr.ZipStore('data/example.zip', mode='w') + >>> root = zarr.group(store=store) + >>> z = root.zeros('foo/bar', shape=(1000, 1000), chunks=(100, 100), dtype='i4') >>> z[:] = 42 - >>> root_group.info - Name : / - Type : zarr.hierarchy.Group - Read-only : False - Store type : zarr.storage.DictStore - No. members : 1 - No. arrays : 0 - No. groups : 1 - Groups : foo + >>> store.close() + >>> import os + >>> os.path.getsize('data/example.zip') + 32805 - >>> foo_group.info - Name : /foo - Type : zarr.hierarchy.Group - Read-only : False - Store type : zarr.storage.DictStore - No. members : 1 - No. arrays : 1 - No. 
groups : 0 - Arrays : bar +Re-open and check that data have been written:: + >>> store = zarr.ZipStore('data/example.zip', mode='r') + >>> root = zarr.group(store=store) + >>> z = root['foo/bar'] + >>> z[:] + array([[42, 42, 42, ..., 42, 42, 42], + [42, 42, 42, ..., 42, 42, 42], + [42, 42, 42, ..., 42, 42, 42], + ..., + [42, 42, 42, ..., 42, 42, 42], + [42, 42, 42, ..., 42, 42, 42], + [42, 42, 42, ..., 42, 42, 42]], dtype=int32) + >>> store.close() + +Note that there are some limitations on how Zip files can be used, because items +within a Zip file cannot be updated in place. This means that data in the array +should only be written once and write operations should be aligned with chunk +boundaries. Note also that the ``close()`` method must be called after writing +any data to the store, otherwise essential records will not be written to the +underlying zip file. + +Another storage alternative is the :class:`zarr.storage.DBMStore` class, added +in Zarr version 2.2. This class allows any DBM-style database to be used for +storing an array or group. Here is an example using a Berkeley DB B-tree +database for storage (requires `bsddb3 +`_ to be installed): + + >>> import bsddb3 + >>> store = zarr.DBMStore('data/example.db', open=bsddb3.btopen, flag='n') + >>> root = zarr.group(store=store) + >>> z = root.zeros('foo/bar', shape=(1000, 1000), chunks=(100, 100), dtype='i4') + >>> z[:] = 42 + >>> store.close() + >>> import os + >>> os.path.getsize('data/example.db') + 36864 + +Re-open and check that data have been written:: + + >>> store = zarr.DBMStore('data/example.db', open=bsddb3.btopen) + >>> root = zarr.group(store=store) + >>> z = root['foo/bar'] + >>> z[:] + array([[42, 42, 42, ..., 42, 42, 42], + [42, 42, 42, ..., 42, 42, 42], + [42, 42, 42, ..., 42, 42, 42], + ..., + [42, 42, 42, ..., 42, 42, 42], + [42, 42, 42, ..., 42, 42, 42], + [42, 42, 42, ..., 42, 42, 42]], dtype=int32) + >>> store.close() + +It is also possible to use distributed storage systems. The Dask project has +implementations of the ``MutableMapping`` interface for Amazon S3 (`S3Map +`_), Hadoop +Distributed File System (`HDFSMap +`_) and +Google Cloud Storage (`GCSMap +`_), which +can be used with Zarr. + +Here is an example using S3Map to read an array created previously:: + + >>> import s3fs + >>> import zarr + >>> s3 = s3fs.S3FileSystem(anon=True, client_kwargs=dict(region_name='eu-west-2')) + >>> store = s3fs.S3Map(root='zarr-demo/store', s3=s3, check=False) + >>> root = zarr.group(store=store) + >>> z = root['foo/bar/baz'] + >>> z + >>> z.info - Name : /foo/bar + Name : /foo/bar/baz Type : zarr.core.Array - Data type : float64 - Shape : (1000000,) - Chunk shape : (100000,) + Data type : |S1 + Shape : (21,) + Chunk shape : (7,) Order : C Read-only : False Compressor : Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0) - Store type : zarr.storage.DictStore - No. bytes : 8000000 (7.6M) - No. bytes stored : 38484 (37.6K) - Storage ratio : 207.9 - Chunks initialized : 10/10 + Store type : s3fs.mapping.S3Map + No. bytes : 21 + Chunks initialized : 3/3 + >>> z[:] + array([b'H', b'e', b'l', b'l', b'o', b' ', b'f', b'r', b'o', b'm', b' ', + b't', b'h', b'e', b' ', b'c', b'l', b'o', b'u', b'd', b'!'], + dtype='|S1') + >>> z[:].tostring() + b'Hello from the cloud!' -.. _tutorial_tips_copy: +.. _tutorial_strings: -Copying large arrays +String arrays +------------- + +There are several options for storing arrays of strings. 
+ +If your strings are all ASCII strings, and you know the maximum length of the string in +your dataset, then you can use an array with a fixed-length bytes dtype. E.g.:: + + >>> z = zarr.zeros(10, dtype='S6') + >>> z[0] = b'Hello' + >>> z[1] = b'world!' + >>> z[:] + array([b'Hello', b'world!', b'', b'', b'', b'', b'', b'', b'', b''], + dtype='|S6') + +A fixed-length unicode dtype is also available, e.g.:: + + >>> z = zarr.zeros(12, dtype='U20') + >>> greetings = ['¡Hola mundo!', 'Hej Världen!', 'Servus Woid!', 'Hei maailma!', + ... 'Xin chào thế giới', 'Njatjeta Botë!', 'Γεια σου κόσμε!', + ... 'こんにちは世界', '世界,你好!', 'Helló, világ!', 'Zdravo svete!', + ... 'เฮลโลเวิลด์'] + >>> z[:] = greetings + >>> z[:] + array(['¡Hola mundo!', 'Hej Världen!', 'Servus Woid!', 'Hei maailma!', + 'Xin chào thế giới', 'Njatjeta Botë!', 'Γεια σου κόσμε!', 'こんにちは世界', + '世界,你好!', 'Helló, világ!', 'Zdravo svete!', 'เฮลโลเวิลด์'], + dtype='>> import numcodecs + >>> z = zarr.zeros(12, dtype=object, filters=[numcodecs.Pickle()]) + >>> z[:] = greetings + >>> z[:] + array(['¡Hola mundo!', 'Hej Världen!', 'Servus Woid!', 'Hei maailma!', + 'Xin chào thế giới', 'Njatjeta Botë!', 'Γεια σου κόσμε!', 'こんにちは世界', + '世界,你好!', 'Helló, világ!', 'Zdravo svete!', 'เฮลโลเวิลด์'], dtype=object) + +...or alternatively using msgpack (requires `msgpack-python +`_ to be installed):: + + >>> z = zarr.zeros(12, dtype=object, filters=[numcodecs.MsgPack()]) + >>> z[:] = greetings + >>> z[:] + array(['¡Hola mundo!', 'Hej Världen!', 'Servus Woid!', 'Hei maailma!', + 'Xin chào thế giới', 'Njatjeta Botë!', 'Γεια σου κόσμε!', 'こんにちは世界', + '世界,你好!', 'Helló, világ!', 'Zdravo svete!', 'เฮลโลเวิลด์'], dtype=object) + +.. _tutorial_chunks: + +Chunk optimizations +------------------- + +.. _tutorial_chunks_shape: + +Chunk size and shape ~~~~~~~~~~~~~~~~~~~~ -Data can be copied between large arrays without needing much memory, +In general, chunks of at least 1 megabyte (1M) uncompressed size seem to provide +better performance, at least when using the Blosc compression library. + +The optimal chunk shape will depend on how you want to access the data. E.g., +for a 2-dimensional array, if you only ever take slices along the first +dimension, then chunk across the second dimenson. If you know you want to chunk +across an entire dimension you can use ``None`` within the ``chunks`` argument, e.g.:: - >>> z1 = zarr.empty((10000, 10000), chunks=(1000, 1000), dtype='i4') - >>> z1[:] = 42 - >>> z2 = zarr.empty_like(z1) - >>> z2[:] = z1 + >>> z1 = zarr.zeros((10000, 10000), chunks=(100, None), dtype='i4') + >>> z1.chunks + (100, 10000) + +Alternatively, if you only ever take slices along the second dimension, then +chunk across the first dimension, e.g.:: -Internally the example above works chunk-by-chunk, extracting only the -data from ``z1`` required to fill each chunk in ``z2``. The source of -the data (``z1``) could equally be an h5py Dataset. + >>> z2 = zarr.zeros((10000, 10000), chunks=(None, 100), dtype='i4') + >>> z2.chunks + (10000, 100) -.. _tutorial_tips_order: +If you require reasonable performance for both access patterns then you need to +find a compromise, e.g.:: -Changing memory layout -~~~~~~~~~~~~~~~~~~~~~~ + >>> z3 = zarr.zeros((10000, 10000), chunks=(1000, 1000), dtype='i4') + >>> z3.chunks + (1000, 1000) + +If you are feeling lazy, you can let Zarr guess a chunk shape for your data by +providing ``chunks=True``, although please note that the algorithm for guessing +a chunk shape is based on simple heuristics and may be far from optimal. 
E.g.:: -The order of bytes within each chunk of an array can be changed via -the ``order`` keyword argument, to use either C or Fortran layout. For -multi-dimensional arrays, these two layouts may provide different -compression ratios, depending on the correlation structure within the -data. E.g.:: + >>> z4 = zarr.zeros((10000, 10000), chunks=True, dtype='i4') + >>> z4.chunks + (313, 625) + +If you know you are always going to be loading the entire array into memory, you +can turn off chunks by providing ``chunks=False``, in which case there will be +one single chunk for the array:: + + >>> z5 = zarr.zeros((10000, 10000), chunks=False, dtype='i4') + >>> z5.chunks + (10000, 10000) + +.. _tutorial_chunks_order: + +Chunk memory layout +~~~~~~~~~~~~~~~~~~~ + +The order of bytes **within each chunk** of an array can be changed via the +``order`` keyword argument, to use either C or Fortran layout. For +multi-dimensional arrays, these two layouts may provide different compression +ratios, depending on the correlation structure within the data. E.g.:: >>> a = np.arange(100000000, dtype='i4').reshape(10000, 10000).T >>> c = zarr.array(a, chunks=(1000, 1000)) @@ -702,130 +925,168 @@ data. E.g.:: Storage ratio : 41.5 Chunks initialized : 100/100 -In the above example, Fortran order gives a better compression ratio. This -is an artifical example but illustrates the general point that changing the -order of bytes within chunks of an array may improve the compression ratio, -depending on the structure of the data, the compression algorithm used, and -which compression filters (e.g., byte shuffle) have been applied. +In the above example, Fortran order gives a better compression ratio. This is an +artifical example but illustrates the general point that changing the order of +bytes within chunks of an array may improve the compression ratio, depending on +the structure of the data, the compression algorithm used, and which compression +filters (e.g., byte-shuffle) have been applied. -.. _tutorial_tips_storage: +.. _tutorial_sync: -Storage alternatives -~~~~~~~~~~~~~~~~~~~~ +Parallel computing and synchronization +-------------------------------------- -Zarr can use any object that implements the ``MutableMapping`` interface as the store for a group -or an array. Some storage classes are provided in the :mod:`zarr.storage` module. For example, -the :class:`zarr.storage.DirectoryStore` class provides a ``MutableMapping`` interface to a -directory on the local file system. This is used under the hood by the -:func:`zarr.creation.open_array` and :func:`zarr.hierarchy.open_group` functions. In other words, -the following code:: +Zarr arrays can be used as either the source or sink for data in parallel +computations. Both multi-threaded and multi-process parallelism are +supported. The Python global interpreter lock (GIL) is released wherever +possible for both compression and decompression operations, so Zarr will +generally not block other Python threads from running. + +A Zarr array can be read concurrently by multiple threads or processes. No +synchronization (i.e., locking) is required for concurrent reads. + +A Zarr array can also be written to concurrently by multiple threads or +processes. Some synchronization may be required, depending on the way the data +is being written. + +If each worker in a parallel computation is writing to a separate region of the +array, and if region boundaries are perfectly aligned with chunk boundaries, +then no synchronization is required. 
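For example, here is a minimal sketch, assuming Python's ``concurrent.futures``
module is available, of several threads each filling a chunk-aligned region of an
array without any synchronizer::

    >>> import numpy as np
    >>> import zarr
    >>> from concurrent.futures import ThreadPoolExecutor
    >>> z = zarr.zeros(100, chunks=25, dtype='i4')
    >>> def fill(start, stop):
    ...     # each task writes exactly one chunk, so no two tasks touch the same chunk
    ...     z[start:stop] = np.arange(start, stop)
    >>> with ThreadPoolExecutor(max_workers=4) as pool:
    ...     futures = [pool.submit(fill, i, i + 25) for i in range(0, 100, 25)]
    >>> np.all(z[:] == np.arange(100))
    True

Because every write above is aligned with a chunk boundary, no locking is needed.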
However, if region and chunk boundaries are +not perfectly aligned, then synchronization is required to avoid two workers +attempting to modify the same chunk at the same time. + +To give a simple example, consider a 1-dimensional array of length 60, ``z``, +divided into three chunks of 20 elements each. If three workers are running and +each attempts to write to a 20 element region (i.e., ``z[0:20]``, ``z[20:40]`` +and ``z[40:60]``) then each worker will be writing to a separate chunk and no +synchronization is required. However, if two workers are running and each +attempts to write to a 30 element region (i.e., ``z[0:30]`` and ``z[30:60]``) +then it is possible both workers will attempt to modify the middle chunk at the +same time, and synchronization is required to prevent data loss. + +Zarr provides support for chunk-level synchronization. E.g., create an array +with thread synchronization:: - >>> z = zarr.open_array('example.zarr', mode='w', shape=1000000, dtype='i4') + >>> z = zarr.zeros((10000, 10000), chunks=(1000, 1000), dtype='i4', + ... synchronizer=zarr.ThreadSynchronizer()) + >>> z + -...is just short-hand for:: +This array is safe to read or write within a multi-threaded program. - >>> store = zarr.DirectoryStore('example.zarr') - >>> z = zarr.zeros(store=store, overwrite=True, shape=1000000, dtype='i4') +Zarr also provides support for process synchronization via file locking, +provided that all processes have access to a shared file system, and provided +that the underlying file system supports file locking (which is not the case for +some networked file systems). E.g.:: -...and the following code:: + >>> synchronizer = zarr.ProcessSynchronizer('data/example.sync') + >>> z = zarr.open_array('data/example', mode='w', shape=(10000, 10000), + ... chunks=(1000, 1000), dtype='i4', + ... synchronizer=synchronizer) + >>> z + - >>> grp = zarr.open_group('example.zarr', mode='w') +This array is safe to read or write from multiple processes, -...is just a short-hand for:: +.. _tutorial_pickle: - >>> store = zarr.DirectoryStore('example.zarr') - >>> grp = zarr.group(store=store, overwrite=True) +Pickle support +-------------- -Any other storage class could be used in place of :class:`zarr.storage.DirectoryStore`. For -example, here is an array stored directly into a Zip file:: +Zarr arrays and groups can be pickled, as long as the underlying store object can be +pickled. Instances of any of the storage classes provided in the :mod:`zarr.storage` +module can be pickled, as can the built-in ``dict`` class which can also be used for +storage. - >>> store = zarr.ZipStore('example.zip', mode='w') - >>> root_group = zarr.group(store=store) - >>> z = root_group.zeros('foo/bar', shape=(1000, 1000), chunks=(100, 100), dtype='i4') - >>> z[:] = 42 - >>> store.close() - >>> import os - >>> os.path.getsize('example.zip') - 32805 +Note that if an array or group is backed by an in-memory store like a ``dict`` or +:class:`zarr.storage.DictStore`, then when it is pickled all of the store data will be +included in the pickled data. However, if an array or group is backed by a persistent +store like a :class:`zarr.storage.DirectoryStore`, :class:`zarr.storage.ZipStore` or +:class:`zarr.storage.DBMStore` then the store data **are not** pickled. The only thing +that is pickled is the necessary parameters to allow the store to re-open any +underlying files or databases upon being unpickled. 
-Re-open and check that data have been written:: +E.g., pickle/unpickle an in-memory array:: - >>> store = zarr.ZipStore('example.zip', mode='r') - >>> root_group = zarr.group(store=store) - >>> z = root_group['foo/bar'] - >>> z[:] - array([[42, 42, 42, ..., 42, 42, 42], - [42, 42, 42, ..., 42, 42, 42], - [42, 42, 42, ..., 42, 42, 42], - ..., - [42, 42, 42, ..., 42, 42, 42], - [42, 42, 42, ..., 42, 42, 42], - [42, 42, 42, ..., 42, 42, 42]], dtype=int32) - >>> store.close() + >>> import pickle + >>> z1 = zarr.array(np.arange(100000)) + >>> s = pickle.dumps(z1) + >>> len(s) > 10000 # relatively large because data have been pickled + True + >>> z2 = pickle.loads(s) + >>> z1 == z2 + True + >>> np.all(z1[:] == z2[:]) + True -Note that there are some restrictions on how Zip files can be used, -because items within a Zip file cannot be updated in place. This means -that data in the array should only be written once and write -operations should be aligned with chunk boundaries. Note also that the ``close()`` method must be -called after writing any data to the store, otherwise essential records will not be written to -the underlying zip file. +E.g., pickle/unpickle an array stored on disk:: -The Dask project has implementations of the ``MutableMapping`` -interface for distributed storage systems, see the `S3Map -`_ -and `HDFSMap -`_ -classes. + >>> z3 = zarr.open('data/walnuts.zarr', mode='w', shape=100000, dtype='i8') + >>> z3[:] = np.arange(100000) + >>> s = pickle.dumps(z3) + >>> len(s) < 200 # small because no data have been pickled + True + >>> z4 = pickle.loads(s) + >>> z3 == z4 + True + >>> np.all(z3[:] == z4[:]) + True -.. _tutorial_tips_chunks: +.. _tutorial_datetime: -Chunk size and shape -~~~~~~~~~~~~~~~~~~~~ +Datetimes and timedeltas +------------------------ -In general, chunks of at least 1 megabyte (1M) seem to provide the best -performance, at least when using the Blosc compression library. +Please note that NumPy's ``datetime64`` and ``timedelta64`` dtypes are **not** currently +supported for Zarr arrays. If you would like to store datetime or timedelta data, you +can store the data in an array with an integer dtype, e.g.:: -The optimal chunk shape will depend on how you want to access the data. E.g., -for a 2-dimensional array, if you only ever take slices along the first -dimension, then chunk across the second dimenson. If you know you want to -chunk across an entire dimension you can use ``None`` within the ``chunks`` -argument, e.g.:: + >>> a = np.array(['2007-07-13', '2006-01-13', '2010-08-13'], dtype='datetime64[D]') + >>> z = zarr.array(a.view('i8')) + >>> z + + >>> z[:] + array([13707, 13161, 14834]) + >>> z[:].view(a.dtype) + array(['2007-07-13', '2006-01-13', '2010-08-13'], dtype='datetime64[D]') - >>> z1 = zarr.zeros((10000, 10000), chunks=(100, None), dtype='i4') - >>> z1.chunks - (100, 10000) +If you would like a convenient way to retrieve the data from this array viewed as the +original datetime64 dtype, try the :func:`zarr.core.Array.astype` method, e.g.:: -Alternatively, if you only ever take slices along the second dimension, then -chunk across the first dimension, e.g.:: + >>> zv = z.astype(a.dtype) + >>> zv[:] + array(['2007-07-13', '2006-01-13', '2010-08-13'], dtype='datetime64[D]') - >>> z2 = zarr.zeros((10000, 10000), chunks=(None, 100), dtype='i4') - >>> z2.chunks - (10000, 100) +.. 
_tutorial_tips: -If you require reasonable performance for both access patterns then you need -to find a compromise, e.g.:: +Usage tips +---------- - >>> z3 = zarr.zeros((10000, 10000), chunks=(1000, 1000), dtype='i4') - >>> z3.chunks - (1000, 1000) +.. _tutorial_tips_copy: + +Copying large arrays +~~~~~~~~~~~~~~~~~~~~ -If you are feeling lazy, you can let Zarr guess a chunk shape for your data, -although please note that the algorithm for guessing a chunk shape is based on -simple heuristics and may be far from optimal. E.g.:: +Data can be copied between large arrays without needing much memory, e.g.:: - >>> z4 = zarr.zeros((10000, 10000), dtype='i4') - >>> z4.chunks - (313, 625) + >>> z1 = zarr.empty((10000, 10000), chunks=(1000, 1000), dtype='i4') + >>> z1[:] = 42 + >>> z2 = zarr.empty_like(z1) + >>> z2[:] = z1 + +Internally the example above works chunk-by-chunk, extracting only the data from +``z1`` required to fill each chunk in ``z2``. The source of the data (``z1``) +could equally be an h5py Dataset. .. _tutorial_tips_blosc: Configuring Blosc ~~~~~~~~~~~~~~~~~ -The Blosc compressor is able to use multiple threads internally to -accelerate compression and decompression. By default, Zarr allows -Blosc to use up to 8 internal threads. The number of Blosc threads can -be changed to increase or decrease this number, e.g.:: +The Blosc compressor is able to use multiple threads internally to accelerate +compression and decompression. By default, Zarr allows Blosc to use up to 8 +internal threads. The number of Blosc threads can be changed to increase or +decrease this number, e.g.:: >>> from zarr import blosc >>> blosc.set_nthreads(2) diff --git a/requirements_dev.txt b/requirements_dev.txt index f54c565d2e..6ac0d20562 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -27,6 +27,7 @@ pyparsing==2.2.0 requests==2.18.4 requests-toolbelt==0.8.0 setuptools-scm==1.15.6 +s3fs==0.1.2 tox==2.9.1 tox-travis==0.8 tqdm==4.19.4 diff --git a/requirements_rtfd.txt b/requirements_rtfd.txt index 55730713f7..9381263874 100644 --- a/requirements_rtfd.txt +++ b/requirements_rtfd.txt @@ -2,7 +2,7 @@ asciitree setuptools setuptools_scm sphinx +sphinx-issues numpydoc mock numpy -cython diff --git a/zarr/convenience.py b/zarr/convenience.py index 5a477db3ce..991384e675 100644 --- a/zarr/convenience.py +++ b/zarr/convenience.py @@ -36,10 +36,10 @@ def open(store, mode='a', **kwargs): Examples -------- - Storing data in a directory 'example.zarr' on the local file system:: + Storing data in a directory 'data/example.zarr' on the local file system:: >>> import zarr - >>> store = 'example.zarr' + >>> store = 'data/example.zarr' >>> zw = zarr.open(store, mode='w', shape=100, dtype='i4') # open new array >>> zw @@ -111,14 +111,14 @@ def save_array(store, arr, **kwargs): >>> import zarr >>> import numpy as np >>> arr = np.arange(10000) - >>> zarr.save_array('example.zarr', arr) - >>> zarr.load('example.zarr') + >>> zarr.save_array('data/example.zarr', arr) + >>> zarr.load('data/example.zarr') array([ 0, 1, 2, ..., 9997, 9998, 9999]) Save an array to a single file (uses a :class:`ZipStore`):: - >>> zarr.save_array('example.zip', arr) - >>> zarr.load('example.zip') + >>> zarr.save_array('data/example.zip', arr) + >>> zarr.load('data/example.zip') array([ 0, 1, 2, ..., 9997, 9998, 9999]) """ @@ -153,8 +153,8 @@ def save_group(store, *args, **kwargs): >>> import numpy as np >>> a1 = np.arange(10000) >>> a2 = np.arange(10000, 0, -1) - >>> zarr.save_group('example.zarr', a1, a2) - >>> loader = 
zarr.load('example.zarr') + >>> zarr.save_group('data/example.zarr', a1, a2) + >>> loader = zarr.load('data/example.zarr') >>> loader >>> loader['arr_0'] @@ -164,8 +164,8 @@ def save_group(store, *args, **kwargs): Save several arrays using named keyword arguments:: - >>> zarr.save_group('example.zarr', foo=a1, bar=a2) - >>> loader = zarr.load('example.zarr') + >>> zarr.save_group('data/example.zarr', foo=a1, bar=a2) + >>> loader = zarr.load('data/example.zarr') >>> loader >>> loader['foo'] @@ -175,8 +175,8 @@ def save_group(store, *args, **kwargs): Store several arrays in a single zip file (uses a :class:`ZipStore`):: - >>> zarr.save_group('example.zip', foo=a1, bar=a2) - >>> loader = zarr.load('example.zip') + >>> zarr.save_group('data/example.zip', foo=a1, bar=a2) + >>> loader = zarr.load('data/example.zip') >>> loader >>> loader['foo'] @@ -208,7 +208,7 @@ def save_group(store, *args, **kwargs): def save(store, *args, **kwargs): - """Convenience function to save an array or arrays to the local file system. + """Convenience function to save an array or group of arrays to the local file system. Parameters ---------- @@ -226,24 +226,25 @@ def save(store, *args, **kwargs): >>> import zarr >>> import numpy as np >>> arr = np.arange(10000) - >>> zarr.save('example.zarr', arr) - >>> zarr.load('example.zarr') + >>> zarr.save('data/example.zarr', arr) + >>> zarr.load('data/example.zarr') array([ 0, 1, 2, ..., 9997, 9998, 9999]) - Save an array to a single file (uses a :class:`ZipStore`):: + Save an array to a Zip file (uses a :class:`ZipStore`):: - >>> zarr.save('example.zip', arr) - >>> zarr.load('example.zip') + >>> zarr.save('data/example.zip', arr) + >>> zarr.load('data/example.zip') array([ 0, 1, 2, ..., 9997, 9998, 9999]) - Save several arrays to a directory on the file system (uses a :class:`DirectoryStore`):: + Save several arrays to a directory on the file system (uses a + :class:`DirectoryStore` and stores arrays in a group):: >>> import zarr >>> import numpy as np >>> a1 = np.arange(10000) >>> a2 = np.arange(10000, 0, -1) - >>> zarr.save('example.zarr', a1, a2) - >>> loader = zarr.load('example.zarr') + >>> zarr.save('data/example.zarr', a1, a2) + >>> loader = zarr.load('data/example.zarr') >>> loader >>> loader['arr_0'] @@ -253,8 +254,8 @@ def save(store, *args, **kwargs): Save several arrays using named keyword arguments:: - >>> zarr.save('example.zarr', foo=a1, bar=a2) - >>> loader = zarr.load('example.zarr') + >>> zarr.save('data/example.zarr', foo=a1, bar=a2) + >>> loader = zarr.load('data/example.zarr') >>> loader >>> loader['foo'] @@ -264,8 +265,8 @@ def save(store, *args, **kwargs): Store several arrays in a single zip file (uses a :class:`ZipStore`):: - >>> zarr.save('example.zip', foo=a1, bar=a2) - >>> loader = zarr.load('example.zip') + >>> zarr.save('data/example.zip', foo=a1, bar=a2) + >>> loader = zarr.load('data/example.zip') >>> loader >>> loader['foo'] diff --git a/zarr/core.py b/zarr/core.py index b149ffa78c..9c830718a3 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -9,17 +9,18 @@ from zarr.util import (is_total_slice, human_readable_size, normalize_resize_args, - normalize_storage_path, normalize_shape, normalize_chunks, InfoReporter, - check_array_shape) + normalize_storage_path, normalize_shape, normalize_chunks, + InfoReporter, check_array_shape) from zarr.storage import array_meta_key, attrs_key, listdir, getsize from zarr.meta import decode_array_metadata, encode_array_metadata from zarr.attrs import Attributes from zarr.errors import PermissionError, 
err_read_only, err_array_not_found from zarr.compat import reduce from zarr.codecs import AsType, get_codec -from zarr.indexing import (OIndex, OrthogonalIndexer, BasicIndexer, VIndex, CoordinateIndexer, - MaskIndexer, check_fields, pop_fields, ensure_tuple, is_scalar, - is_contiguous_selection, err_too_many_indices, check_no_multi_fields) +from zarr.indexing import (OIndex, OrthogonalIndexer, BasicIndexer, VIndex, + CoordinateIndexer, MaskIndexer, check_fields, pop_fields, + ensure_tuple, is_scalar, is_contiguous_selection, + err_too_many_indices, check_no_multi_fields) # noinspection PyUnresolvedReferences @@ -420,8 +421,8 @@ def __getitem__(self, selection): Parameters ---------- selection : tuple - An integer index or slice or tuple of int/slice objects specifying the requested - item or region for each dimension of the array. + An integer index or slice or tuple of int/slice objects specifying the + requested item or region for each dimension of the array. Returns ------- @@ -527,8 +528,9 @@ def __getitem__(self, selection): ----- Slices with step > 1 are supported, but slices with negative step are not. - Currently the implementation for __getitem__ is provided by :func:`get_basic_selection`. - For advanced ("fancy") indexing, see the methods listed under See Also. + Currently the implementation for __getitem__ is provided by + :func:`get_basic_selection`. For advanced ("fancy") indexing, see the methods + listed under See Also. See Also -------- @@ -547,13 +549,13 @@ def get_basic_selection(self, selection=Ellipsis, out=None, fields=None): Parameters ---------- selection : tuple - A tuple specifying the requested item or region for each dimension of the array. May - be any combination of int and/or slice for multidimensional arrays. + A tuple specifying the requested item or region for each dimension of the + array. May be any combination of int and/or slice for multidimensional arrays. out : ndarray, optional If given, load the selected data directly into this array. fields : str or sequence of str, optional - For arrays with a structured dtype, one or more fields can be specified to extract - data for. + For arrays with a structured dtype, one or more fields can be specified to + extract data for. Returns ------- @@ -640,9 +642,9 @@ def get_basic_selection(self, selection=Ellipsis, out=None, fields=None): ----- Slices with step > 1 are supported, but slices with negative step are not. - Currently this method provides the implementation for accessing data via the square - bracket notation (__getitem__). See :func:`__getitem__` for examples using the - alternative notation. + Currently this method provides the implementation for accessing data via the + square bracket notation (__getitem__). See :func:`__getitem__` for examples + using the alternative notation. 
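    For instance, a minimal sketch of that equivalence, assuming a small
    one-dimensional array, might look like::

        >>> import numpy as np
        >>> import zarr
        >>> z = zarr.array(np.arange(10))
        >>> np.array_equal(z.get_basic_selection(slice(2, 5)), z[2:5])
        True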
See Also -------- @@ -661,9 +663,11 @@ def get_basic_selection(self, selection=Ellipsis, out=None, fields=None): # handle zero-dimensional arrays if self._shape == (): - return self._get_basic_selection_zd(selection=selection, out=out, fields=fields) + return self._get_basic_selection_zd(selection=selection, out=out, + fields=fields) else: - return self._get_basic_selection_nd(selection=selection, out=out, fields=fields) + return self._get_basic_selection_nd(selection=selection, out=out, + fields=fields) def _get_basic_selection_zd(self, selection, out=None, fields=None): # special case basic selection for zero-dimensional array @@ -708,21 +712,22 @@ def _get_basic_selection_nd(self, selection, out=None, fields=None): return self._get_selection(indexer=indexer, out=out, fields=fields) def get_orthogonal_selection(self, selection, out=None, fields=None): - """Retrieve data by making a selection for each dimension of the array. For example, - if an array has 2 dimensions, allows selecting specific rows and/or columns. The - selection for each dimension can be either an integer (indexing a single item), a slice, - an array of integers, or a Boolean array where True values indicate a selection. + """Retrieve data by making a selection for each dimension of the array. For + example, if an array has 2 dimensions, allows selecting specific rows and/or + columns. The selection for each dimension can be either an integer (indexing a + single item), a slice, an array of integers, or a Boolean array where True + values indicate a selection. Parameters ---------- selection : tuple - A selection for each dimension of the array. May be any combination of int, slice, - integer array or Boolean array. + A selection for each dimension of the array. May be any combination of int, + slice, integer array or Boolean array. out : ndarray, optional If given, load the selected data directly into this array. fields : str or sequence of str, optional - For arrays with a structured dtype, one or more fields can be specified to extract - data for. + For arrays with a structured dtype, one or more fields can be specified to + extract data for. Returns ------- @@ -737,8 +742,8 @@ def get_orthogonal_selection(self, selection, out=None, fields=None): >>> import numpy as np >>> z = zarr.array(np.arange(100).reshape(10, 10)) - Retrieve rows and columns via any combination of int, slice, integer array and/or Boolean - array:: + Retrieve rows and columns via any combination of int, slice, integer array and/or + Boolean array:: >>> z.get_orthogonal_selection(([1, 4], slice(None))) array([[10, 11, 12, 13, 14, 15, 16, 17, 18, 19], @@ -800,8 +805,8 @@ def get_orthogonal_selection(self, selection, out=None, fields=None): See Also -------- get_basic_selection, set_basic_selection, get_mask_selection, set_mask_selection, - get_coordinate_selection, set_coordinate_selection, set_orthogonal_selection, vindex, - oindex, __getitem__, __setitem__ + get_coordinate_selection, set_coordinate_selection, set_orthogonal_selection, + vindex, oindex, __getitem__, __setitem__ """ @@ -818,8 +823,8 @@ def get_orthogonal_selection(self, selection, out=None, fields=None): return self._get_selection(indexer=indexer, out=out, fields=fields) def get_coordinate_selection(self, selection, out=None, fields=None): - """Retrieve a selection of individual items, by providing the indices (coordinates) for - each selected item. + """Retrieve a selection of individual items, by providing the indices + (coordinates) for each selected item. 
Parameters ---------- @@ -828,8 +833,8 @@ def get_coordinate_selection(self, selection, out=None, fields=None): out : ndarray, optional If given, load the selected data directly into this array. fields : str or sequence of str, optional - For arrays with a structured dtype, one or more fields can be specified to extract - data for. + For arrays with a structured dtype, one or more fields can be specified to + extract data for. Returns ------- @@ -857,22 +862,22 @@ def get_coordinate_selection(self, selection, out=None, fields=None): Notes ----- - Coordinate indexing is also known as point selection, and is a form of vectorized or inner - indexing. + Coordinate indexing is also known as point selection, and is a form of vectorized + or inner indexing. - Slices are not supported. Coordinate arrays must be provided for all dimensions of the - array. + Slices are not supported. Coordinate arrays must be provided for all dimensions + of the array. - Coordinate arrays may be multidimensional, in which case the output array will also be - multidimensional. Coordinate arrays are broadcast against each other before being - applied. The shape of the output will be the same as the shape of each coordinate array - after broadcasting. + Coordinate arrays may be multidimensional, in which case the output array will + also be multidimensional. Coordinate arrays are broadcast against each other + before being applied. The shape of the output will be the same as the shape of + each coordinate array after broadcasting. See Also -------- get_basic_selection, set_basic_selection, get_mask_selection, set_mask_selection, - get_orthogonal_selection, set_orthogonal_selection, set_coordinate_selection, vindex, - oindex, __getitem__, __setitem__ + get_orthogonal_selection, set_orthogonal_selection, set_coordinate_selection, + vindex, oindex, __getitem__, __setitem__ """ @@ -898,20 +903,20 @@ def get_coordinate_selection(self, selection, out=None, fields=None): return out def get_mask_selection(self, selection, out=None, fields=None): - """Retrieve a selection of individual items, by providing a Boolean array of the same - shape as the array against which the selection is being made, where True values indicate - a selected item. + """Retrieve a selection of individual items, by providing a Boolean array of the + same shape as the array against which the selection is being made, where True + values indicate a selected item. Parameters ---------- selection : ndarray, bool - A Boolean array of the same shape as the array against which the selection is being - made. + A Boolean array of the same shape as the array against which the selection is + being made. out : ndarray, optional If given, load the selected data directly into this array. fields : str or sequence of str, optional - For arrays with a structured dtype, one or more fields can be specified to extract - data for. + For arrays with a structured dtype, one or more fields can be specified to + extract data for. Returns ------- @@ -942,15 +947,15 @@ def get_mask_selection(self, selection, out=None, fields=None): Notes ----- - Mask indexing is a form of vectorized or inner indexing, and is equivalent to coordinate - indexing. Internally the mask array is converted to coordinate arrays by calling - `np.nonzero`. + Mask indexing is a form of vectorized or inner indexing, and is equivalent to + coordinate indexing. Internally the mask array is converted to coordinate + arrays by calling `np.nonzero`. 
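    As a rough sketch of that equivalence, assuming a small one-dimensional
    array::

        >>> import numpy as np
        >>> import zarr
        >>> z = zarr.array(np.arange(10) * 10)
        >>> mask = np.zeros(10, dtype=bool)
        >>> mask[[2, 5]] = True
        >>> z.get_mask_selection(mask)
        array([20, 50])
        >>> z.get_coordinate_selection(np.nonzero(mask))
        array([20, 50])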
See Also -------- - get_basic_selection, set_basic_selection, set_mask_selection, get_orthogonal_selection, - set_orthogonal_selection, get_coordinate_selection, set_coordinate_selection, vindex, - oindex, __getitem__, __setitem__ + get_basic_selection, set_basic_selection, set_mask_selection, + get_orthogonal_selection, set_orthogonal_selection, get_coordinate_selection, + set_coordinate_selection, vindex, oindex, __getitem__, __setitem__ """ @@ -968,12 +973,12 @@ def get_mask_selection(self, selection, out=None, fields=None): def _get_selection(self, indexer, out=None, fields=None): - # We iterate over all chunks which overlap the selection and thus contain data that needs - # to be extracted. Each chunk is processed in turn, extracting the necessary data and - # storing into the correct location in the output array. + # We iterate over all chunks which overlap the selection and thus contain data + # that needs to be extracted. Each chunk is processed in turn, extracting the + # necessary data and storing into the correct location in the output array. - # N.B., it is an important optimisation that we only visit chunks which overlap the - # selection. This minimises the number of iterations in the main for loop. + # N.B., it is an important optimisation that we only visit chunks which overlap + # the selection. This minimises the number of iterations in the main for loop. # check fields are sensible out_dtype = check_fields(fields, self._dtype) @@ -1005,8 +1010,8 @@ def __setitem__(self, selection, value): Parameters ---------- selection : tuple - An integer index or slice or tuple of int/slice specifying the requested region for - each dimension of the array. + An integer index or slice or tuple of int/slice specifying the requested + region for each dimension of the array. value : scalar or array-like Value to be stored into the array. @@ -1065,9 +1070,10 @@ def __setitem__(self, selection, value): ----- Slices with step > 1 are supported, but slices with negative step are not. - Currently the implementation for __setitem__ is provided by :func:`set_basic_selection`, - which means that only integers and slices are supported within the selection. For - advanced ("fancy") indexing, see the methods listed under See Also. + Currently the implementation for __setitem__ is provided by + :func:`set_basic_selection`, which means that only integers and slices are + supported within the selection. For advanced ("fancy") indexing, see the + methods listed under See Also. See Also -------- @@ -1086,8 +1092,8 @@ def set_basic_selection(self, selection, value, fields=None): Parameters ---------- selection : tuple - An integer index or slice or tuple of int/slice specifying the requested region for - each dimension of the array. + An integer index or slice or tuple of int/slice specifying the requested + region for each dimension of the array. value : scalar or array-like Value to be stored into the array. fields : str or sequence of str, optional @@ -1134,8 +1140,8 @@ def set_basic_selection(self, selection, value, fields=None): [ 3, 42, 42, 42, 42], [ 4, 42, 42, 42, 42]]) - For arrays with a structured dtype, the `fields` parameter can be used to set data for - a specific field, e.g.:: + For arrays with a structured dtype, the `fields` parameter can be used to set + data for a specific field, e.g.:: >>> a = np.array([(b'aaa', 1, 4.2), ... 
(b'bbb', 2, 8.4), @@ -1150,14 +1156,14 @@ def set_basic_selection(self, selection, value, fields=None): Notes ----- This method provides the underlying implementation for modifying data via square - bracket notation, see :func:`__setitem__` for equivalent examples using the alternative - notation. + bracket notation, see :func:`__setitem__` for equivalent examples using the + alternative notation. See Also -------- - get_basic_selection, get_mask_selection, set_mask_selection, get_coordinate_selection, - set_coordinate_selection, get_orthogonal_selection, set_orthogonal_selection, vindex, - oindex, __getitem__, __setitem__ + get_basic_selection, get_mask_selection, set_mask_selection, + get_coordinate_selection, set_coordinate_selection, get_orthogonal_selection, + set_orthogonal_selection, vindex, oindex, __getitem__, __setitem__ """ @@ -1181,8 +1187,8 @@ def set_orthogonal_selection(self, selection, value, fields=None): Parameters ---------- selection : tuple - A selection for each dimension of the array. May be any combination of int, slice, - integer array or Boolean array. + A selection for each dimension of the array. May be any combination of int, + slice, integer array or Boolean array. value : scalar or array-like Value to be stored into the array. fields : str or sequence of str, optional @@ -1227,7 +1233,8 @@ def set_orthogonal_selection(self, selection, value, fields=None): [0, 2, 0, 0, 2], [1, 3, 1, 1, 3]]) - For convenience, this functionality is also available via the `oindex` property. E.g.:: + For convenience, this functionality is also available via the `oindex` property. + E.g.:: >>> z.oindex[[1, 4], [1, 4]] = 4 >>> z[...] @@ -1265,8 +1272,8 @@ def set_orthogonal_selection(self, selection, value, fields=None): self._set_selection(indexer, value, fields=fields) def set_coordinate_selection(self, selection, value, fields=None): - """Modify a selection of individual items, by providing the indices (coordinates) for - each item to be modified. + """Modify a selection of individual items, by providing the indices (coordinates) + for each item to be modified. Parameters ---------- @@ -1296,7 +1303,8 @@ def set_coordinate_selection(self, selection, value, fields=None): [0, 0, 0, 0, 0], [0, 0, 0, 0, 1]]) - For convenience, this functionality is also available via the `vindex` property. E.g.:: + For convenience, this functionality is also available via the `vindex` property. + E.g.:: >>> z.vindex[[1, 4], [1, 4]] = 2 >>> z[...] @@ -1308,17 +1316,17 @@ def set_coordinate_selection(self, selection, value, fields=None): Notes ----- - Coordinate indexing is also known as point selection, and is a form of vectorized or inner - indexing. + Coordinate indexing is also known as point selection, and is a form of vectorized + or inner indexing. - Slices are not supported. Coordinate arrays must be provided for all dimensions of the - array. + Slices are not supported. Coordinate arrays must be provided for all dimensions + of the array. 
See Also -------- get_basic_selection, set_basic_selection, get_mask_selection, set_mask_selection, - get_orthogonal_selection, set_orthogonal_selection, get_coordinate_selection, vindex, - oindex, __getitem__, __setitem__ + get_orthogonal_selection, set_orthogonal_selection, get_coordinate_selection, + vindex, oindex, __getitem__, __setitem__ """ @@ -1342,15 +1350,15 @@ def set_coordinate_selection(self, selection, value, fields=None): self._set_selection(indexer, value, fields=fields) def set_mask_selection(self, selection, value, fields=None): - """Modify a selection of individual items, by providing a Boolean array of the same - shape as the array against which the selection is being made, where True values indicate - a selected item. + """Modify a selection of individual items, by providing a Boolean array of the + same shape as the array against which the selection is being made, where True + values indicate a selected item. Parameters ---------- selection : ndarray, bool - A Boolean array of the same shape as the array against which the selection is being - made. + A Boolean array of the same shape as the array against which the selection is + being made. value : scalar or array-like Value to be stored into the array. fields : str or sequence of str, optional @@ -1378,7 +1386,8 @@ def set_mask_selection(self, selection, value, fields=None): [0, 0, 0, 0, 0], [0, 0, 0, 0, 1]]) - For convenience, this functionality is also available via the `vindex` property. E.g.:: + For convenience, this functionality is also available via the `vindex` property. + E.g.:: >>> z.vindex[sel] = 2 >>> z[...] @@ -1390,15 +1399,15 @@ def set_mask_selection(self, selection, value, fields=None): Notes ----- - Mask indexing is a form of vectorized or inner indexing, and is equivalent to coordinate - indexing. Internally the mask array is converted to coordinate arrays by calling - `np.nonzero`. + Mask indexing is a form of vectorized or inner indexing, and is equivalent to + coordinate indexing. Internally the mask array is converted to coordinate + arrays by calling `np.nonzero`. See Also -------- - get_basic_selection, set_basic_selection, get_mask_selection, get_orthogonal_selection, - set_orthogonal_selection, get_coordinate_selection, set_coordinate_selection, vindex, - oindex, __getitem__, __setitem__ + get_basic_selection, set_basic_selection, get_mask_selection, + get_orthogonal_selection, set_orthogonal_selection, get_coordinate_selection, + set_coordinate_selection, vindex, oindex, __getitem__, __setitem__ """ @@ -1465,12 +1474,12 @@ def _set_basic_selection_nd(self, selection, value, fields=None): def _set_selection(self, indexer, value, fields=None): - # We iterate over all chunks which overlap the selection and thus contain data that needs - # to be replaced. Each chunk is processed in turn, extracting the necessary data from the - # value array and storing into the chunk array. + # We iterate over all chunks which overlap the selection and thus contain data + # that needs to be replaced. Each chunk is processed in turn, extracting the + # necessary data from the value array and storing into the chunk array. - # N.B., it is an important optimisation that we only visit chunks which overlap the - # selection. This minimises the nuimber of iterations in the main for loop. + # N.B., it is an important optimisation that we only visit chunks which overlap + # the selection. This minimises the nuimber of iterations in the main for loop. 
# check fields are sensible check_fields(fields, self._dtype) @@ -1505,8 +1514,8 @@ def _set_selection(self, indexer, value, fields=None): # put data self._chunk_setitem(chunk_coords, chunk_selection, chunk_value, fields=fields) - def _chunk_getitem(self, chunk_coords, chunk_selection, out, out_selection, drop_axes=None, - fields=None): + def _chunk_getitem(self, chunk_coords, chunk_selection, out, out_selection, + drop_axes=None, fields=None): """Obtain part or whole of a chunk. Parameters @@ -1599,12 +1608,14 @@ def _chunk_setitem(self, chunk_coords, chunk_selection, value, fields=None): # synchronization if self._synchronizer is None: - self._chunk_setitem_nosync(chunk_coords, chunk_selection, value, fields=fields) + self._chunk_setitem_nosync(chunk_coords, chunk_selection, value, + fields=fields) else: # synchronize on the chunk ckey = self._chunk_key(chunk_coords) with self._synchronizer[ckey]: - self._chunk_setitem_nosync(chunk_coords, chunk_selection, value, fields=fields) + self._chunk_setitem_nosync(chunk_coords, chunk_selection, value, + fields=fields) def _chunk_setitem_nosync(self, chunk_coords, chunk_selection, value, fields=None): @@ -1656,8 +1667,8 @@ def _chunk_setitem_nosync(self, chunk_coords, chunk_selection, value, fields=Non chunk = np.empty(self._chunks, dtype=self._dtype, order=self._order) chunk.fill(self._fill_value) else: - # N.B., use zeros here so any region beyond the array has consistent and - # compressible data + # N.B., use zeros here so any region beyond the array has consistent + # and compressible data chunk = np.zeros(self._chunks, dtype=self._dtype, order=self._order) else: @@ -1669,8 +1680,8 @@ def _chunk_setitem_nosync(self, chunk_coords, chunk_selection, value, fields=Non # modify if fields: - # N.B., currently multi-field assignment is not supported in numpy, so this only - # works for a single field + # N.B., currently multi-field assignment is not supported in numpy, so + # this only works for a single field chunk[fields][chunk_selection] = value else: chunk[chunk_selection] = value @@ -1817,8 +1828,8 @@ def bytestr(n): return items def __getstate__(self): - return (self._store, self._path, self._read_only, self._chunk_store, self._synchronizer, - self._cache_metadata) + return (self._store, self._path, self._read_only, self._chunk_store, + self._synchronizer, self._cache_metadata) def __setstate__(self, state): self.__init__(*state) @@ -1952,8 +1963,9 @@ def _append_nosync(self, data, axis=0): data_shape_preserved = tuple(s for i, s in enumerate(data.shape) if i != axis) if self_shape_preserved != data_shape_preserved: - raise ValueError('shape of data to append is not compatible with the array; all ' - 'dimensions must match except for the dimension being appended') + raise ValueError('shape of data to append is not compatible with the array; ' + 'all dimensions must match except for the dimension being ' + 'appended') # remember old shape old_shape = self._shape diff --git a/zarr/creation.py b/zarr/creation.py index b4aef70c75..f053654830 100644 --- a/zarr/creation.py +++ b/zarr/creation.py @@ -385,12 +385,12 @@ def open_array(store, mode='a', shape=None, chunks=True, dtype=None, compressor= -------- >>> import numpy as np >>> import zarr - >>> z1 = zarr.open_array('example.zarr', mode='w', shape=(10000, 10000), + >>> z1 = zarr.open_array('data/example.zarr', mode='w', shape=(10000, 10000), ... 
chunks=(1000, 1000), fill_value=0) >>> z1[:] = np.arange(100000000).reshape(10000, 10000) >>> z1 - >>> z2 = zarr.open_array('example.zarr', mode='r') + >>> z2 = zarr.open_array('data/example.zarr', mode='r') >>> z2 >>> np.all(z1[:] == z2[:]) diff --git a/zarr/hierarchy.py b/zarr/hierarchy.py index fc936d15a1..bb4e33ce0f 100644 --- a/zarr/hierarchy.py +++ b/zarr/hierarchy.py @@ -945,7 +945,6 @@ def group(store=None, overwrite=False, chunk_store=None, synchronizer=None, path Examples -------- - Create a group in memory:: >>> import zarr @@ -955,7 +954,7 @@ def group(store=None, overwrite=False, chunk_store=None, synchronizer=None, path Create a group with a different store:: - >>> store = zarr.DirectoryStore('example') + >>> store = zarr.DirectoryStore('data/example.zarr') >>> g = zarr.group(store=store, overwrite=True) >>> g @@ -999,12 +998,12 @@ def open_group(store, mode='a', synchronizer=None, path=None): Examples -------- >>> import zarr - >>> root = zarr.open_group('example', mode='w') + >>> root = zarr.open_group('data/example.zarr', mode='w') >>> foo = root.create_group('foo') >>> bar = root.create_group('bar') >>> root - >>> root2 = zarr.open_group('example', mode='a') + >>> root2 = zarr.open_group('data/example.zarr', mode='a') >>> root2 >>> root == root2 diff --git a/zarr/storage.py b/zarr/storage.py index b4eba0bfbd..72d1774e72 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -1,7 +1,9 @@ # -*- coding: utf-8 -*- -""" -This module contains storage classes for use with Zarr arrays and groups. Note that any -object implementing the ``MutableMapping`` interface can be used as a Zarr array store. +"""This module contains storage classes for use with Zarr arrays and groups. + +Note that any object implementing the :class:`MutableMapping` interface from the +:mod:`collections` module in the Python standard library can be used as a Zarr +array store, as long as it accepts string (str) keys and bytes values. """ from __future__ import absolute_import, print_function, division @@ -147,7 +149,8 @@ def _require_parent_group(path, store, chunk_store, overwrite): def init_array(store, shape, chunks=True, dtype=None, compressor='default', fill_value=None, order='C', overwrite=False, path=None, chunk_store=None, filters=None): - """initialize an array store with the given configuration. + """Initialize an array store with the given configuration. Note that this is a low-level + function and there should be no need to call this directly from user code. Parameters ---------- @@ -188,7 +191,7 @@ def init_array(store, shape, chunks=True, dtype=None, compressor='default', Array metadata is stored as JSON:: - >>> print(str(store['.zarray'], 'ascii')) + >>> print(store['.zarray'].decode()) { "chunks": [ 1000, @@ -214,17 +217,16 @@ def init_array(store, shape, chunks=True, dtype=None, compressor='default', User-defined attributes are also stored as JSON, initially empty:: - >>> print(str(store['.zattrs'], 'ascii')) + >>> print(store['.zattrs'].decode()) {} Initialize an array using a storage path:: >>> store = dict() - >>> init_array(store, shape=100000000, chunks=1000000, dtype='i1', - ... 
path='foo') + >>> init_array(store, shape=100000000, chunks=1000000, dtype='i1', path='foo') >>> sorted(store.keys()) ['.zattrs', '.zgroup', 'foo/.zarray', 'foo/.zattrs'] - >>> print(str(store['foo/.zarray'], 'ascii')) + >>> print(store['foo/.zarray'].decode()) { "chunks": [ 1000000 @@ -248,9 +250,9 @@ def init_array(store, shape, chunks=True, dtype=None, compressor='default', Notes ----- - The initialisation process involves normalising all array metadata, - encoding as JSON and storing under the '.zarray' key. User attributes are - also initialized and stored as JSON under the '.zattrs' key. + The initialisation process involves normalising all array metadata, encoding + as JSON and storing under the '.zarray' key. User attributes are also + initialized and stored as JSON under the '.zattrs' key. """ @@ -284,6 +286,9 @@ def _init_array_metadata(store, shape, chunks=None, dtype=None, compressor='defa # normalize metadata shape = normalize_shape(shape) dtype = np.dtype(dtype) + if dtype.kind in 'mM': + raise ValueError('datetime64 and timedelta64 dtypes are not currently supported; ' + 'please store the data using int64 instead') chunks = normalize_chunks(chunks, shape, dtype.itemsize) order = normalize_order(order) fill_value = normalize_fill_value(fill_value, dtype) @@ -329,7 +334,8 @@ def _init_array_metadata(store, shape, chunks=None, dtype=None, compressor='defa def init_group(store, overwrite=False, path=None, chunk_store=None): - """initialize a group store. + """Initialize a group store. Note that this is a low-level function and there should be no + need to call this directly from user code. Parameters ---------- @@ -410,27 +416,24 @@ def _dict_store_keys(d, prefix='', cls=dict): class DictStore(MutableMapping): - """Extended mutable mapping interface to a hierarchy of dicts. + """Store class that uses a hierarchy of :class:`dict` objects, thus all data + will be held in main memory. Examples -------- - >>> import zarr - >>> store = zarr.DictStore() - >>> store['foo'] = b'bar' - >>> store['foo'] - b'bar' - >>> store['a/b/c'] = b'xxx' - >>> store['a/b/c'] - b'xxx' - >>> sorted(store.keys()) - ['a/b/c', 'foo'] - >>> store.listdir() - ['a', 'foo'] - >>> store.listdir('a/b') - ['c'] - >>> store.rmdir('a') - >>> sorted(store.keys()) - ['foo'] + This is the default class used when creating a group. E.g.:: + + >>> import zarr + >>> g = zarr.group() + >>> type(g.store) + + + Note that the default class when creating an array is the built-in + :class:`dict` class, i.e.:: + + >>> z = zarr.zeros(100) + >>> type(z.store) + """ @@ -575,42 +578,55 @@ def getsize(self, path=None): class DirectoryStore(MutableMapping): - """Mutable Mapping interface to a directory. Keys must be strings, - values must be bytes-like objects. + """Storage class using directories and files on a standard file system. Parameters ---------- path : string - Location of directory. + Location of directory to use as the root of the storage hierarchy. Examples -------- - >>> import zarr - >>> store = zarr.DirectoryStore('example_store') - >>> store['foo'] = b'bar' - >>> store['foo'] - b'bar' - >>> with open('example_store/foo', 'rb') as f: - ... f.read() - b'bar' - >>> store['a/b/c'] = b'xxx' - >>> store['a/b/c'] - b'xxx' - >>> with open('example_store/a/b/c', 'rb') as f: - ... 
f.read() - b'xxx' - >>> sorted(store.keys()) - ['a/b/c', 'foo'] - >>> store.listdir() - ['a', 'foo'] - >>> store.listdir('a/b') - ['c'] - >>> store.rmdir('a') - >>> sorted(store.keys()) - ['foo'] - >>> import os - >>> os.path.exists('example_store/a') - False + Store a single array:: + + >>> import zarr + >>> store = zarr.DirectoryStore('data/array.zarr') + >>> z = zarr.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True) + >>> z[...] = 42 + + Each chunk of the array is stored as a separate file on the file system, + i.e.:: + + >>> import os + >>> sorted(os.listdir('data/array.zarr')) + ['.zarray', '.zattrs', '0.0', '0.1', '1.0', '1.1'] + + Store a group:: + + >>> store = zarr.DirectoryStore('data/group.zarr') + >>> root = zarr.group(store=store, overwrite=True) + >>> foo = root.create_group('foo') + >>> bar = foo.zeros('bar', shape=(10, 10), chunks=(5, 5)) + >>> bar[...] = 42 + + When storing a group, levels in the group hierarchy will correspond to + directories on the file system, i.e.:: + + >>> sorted(os.listdir('data/group.zarr')) + ['.zattrs', '.zgroup', 'foo'] + >>> sorted(os.listdir('data/group.zarr/foo')) + ['.zattrs', '.zgroup', 'bar'] + >>> sorted(os.listdir('data/group.zarr/foo/bar')) + ['.zarray', '.zattrs', '0.0', '0.1', '1.0', '1.1'] + + Notes + ----- + Atomic writes are used, which means that data are first written to a + temporary file, then moved into place when the write is successfully + completed. + + Files are only held open while they are being read or written and are closed + immediately afterwards, so there is no need to manually close any files. """ @@ -761,7 +777,18 @@ def atexit_rmtree(path, class TempStore(DirectoryStore): - """Directory store using a temporary directory for storage.""" + """Directory store using a temporary directory for storage. + + Parameters + ---------- + suffix : string, optional + Suffix for the temporary directory name. + prefix : string, optional + Prefix for the temporary directory name. + dir : string, optional + Path to parent directory in which to create temporary directory. + + """ # noinspection PyShadowingBuiltins def __init__(self, suffix='', prefix='zarr', dir=None): @@ -786,58 +813,65 @@ def _map_ckey(key): class NestedDirectoryStore(DirectoryStore): - """Mutable Mapping interface to a directory, with special handling for chunk keys so - that chunk files for multidimensional arrays are stored in a nested directory tree. - Keys must be strings, values must be bytes-like objects. + """Storage class using directories and files on a standard file system, with + special handling for chunk keys so that chunk files for multidimensional + arrays are stored in a nested directory tree. Parameters ---------- path : string - Location of directory. + Location of directory to use as the root of the storage hierarchy. Examples -------- - Most keys are mapped to file paths as normal, e.g.:: + Store a single array:: >>> import zarr - >>> store = zarr.NestedDirectoryStore('example_nested_store') - >>> store['foo'] = b'bar' - >>> store['foo'] - b'bar' - >>> store['a/b/c'] = b'xxx' - >>> store['a/b/c'] - b'xxx' - >>> with open('example_nested_store/foo', 'rb') as f: - ... f.read() - b'bar' - >>> with open('example_nested_store/a/b/c', 'rb') as f: - ... f.read() - b'xxx' - - Chunk keys are handled in a special way, such that the '.' characters in the key - are mapped to directory path separators internally. 
     E.g.::
-
-        >>> store['bar/0.0'] = b'yyy'
-        >>> store['bar/0.0']
-        b'yyy'
-        >>> store['baz/2.1.12'] = b'zzz'
-        >>> store['baz/2.1.12']
-        b'zzz'
-        >>> with open('example_nested_store/bar/0/0', 'rb') as f:
-        ...     f.read()
-        b'yyy'
-        >>> with open('example_nested_store/baz/2/1/12', 'rb') as f:
-        ...     f.read()
-        b'zzz'
+        >>> store = zarr.NestedDirectoryStore('data/array.zarr')
+        >>> z = zarr.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True)
+        >>> z[...] = 42
+
+    Each chunk of the array is stored as a separate file on the file system,
+    note the multiple directory levels used for the chunk files::
+
+        >>> import os
+        >>> sorted(os.listdir('data/array.zarr'))
+        ['.zarray', '.zattrs', '0', '1']
+        >>> sorted(os.listdir('data/array.zarr/0'))
+        ['0', '1']
+        >>> sorted(os.listdir('data/array.zarr/1'))
+        ['0', '1']
+
+    Store a group::
+
+        >>> store = zarr.NestedDirectoryStore('data/group.zarr')
+        >>> root = zarr.group(store=store, overwrite=True)
+        >>> foo = root.create_group('foo')
+        >>> bar = foo.zeros('bar', shape=(10, 10), chunks=(5, 5))
+        >>> bar[...] = 42
+
+    When storing a group, levels in the group hierarchy will correspond to
+    directories on the file system, i.e.::
+
+        >>> sorted(os.listdir('data/group.zarr'))
+        ['.zattrs', '.zgroup', 'foo']
+        >>> sorted(os.listdir('data/group.zarr/foo'))
+        ['.zattrs', '.zgroup', 'bar']
+        >>> sorted(os.listdir('data/group.zarr/foo/bar'))
+        ['.zarray', '.zattrs', '0', '1']
+        >>> sorted(os.listdir('data/group.zarr/foo/bar/0'))
+        ['0', '1']
+        >>> sorted(os.listdir('data/group.zarr/foo/bar/1'))
+        ['0', '1']

     Notes
     -----
-    The standard DirectoryStore class stores all chunk files for an array together in a
-    single directory. On some file systems the potentially large number of files in a
-    single directory can cause performance issues. The NestedDirectoryStore class
-    provides an alternative where chunk files for multidimensional arrays will be
-    organised into a directory hierarchy, thus reducing the number of files in any one
-    directory.
+    The :class:`DirectoryStore` class stores all chunk files for an array
+    together in a single directory. On some file systems, the potentially large
+    number of files in a single directory can cause performance issues. The
+    :class:`NestedDirectoryStore` class provides an alternative where chunk
+    files for multidimensional arrays will be organised into a directory
+    hierarchy, thus reducing the number of files in any one directory.

     """
@@ -890,8 +924,7 @@ def listdir(self, path=None):

 # noinspection PyPep8Naming
 class ZipStore(MutableMapping):
-    """MutableMapping interface to a Zip file. Keys must be strings,
-    values must be bytes-like objects.
+    """Storage class using a Zip file.

     Parameters
     ----------
@@ -911,28 +944,63 @@ class ZipStore(MutableMapping):

     Examples
     --------
-    >>> import zarr
-    >>> store = zarr.ZipStore('example.zip', mode='w')
-    >>> store['foo'] = b'bar'
-    >>> store['foo']
-    b'bar'
-    >>> store['a/b/c'] = b'xxx'
-    >>> store['a/b/c']
-    b'xxx'
-    >>> sorted(store.keys())
-    ['a/b/c', 'foo']
-    >>> store.close()
-    >>> import zipfile
-    >>> zf = zipfile.ZipFile('example.zip', mode='r')
-    >>> sorted(zf.namelist())
-    ['a/b/c', 'foo']
+    Store a single array::
+
+        >>> import zarr
+        >>> store = zarr.ZipStore('data/array.zip', mode='w')
+        >>> z = zarr.zeros((10, 10), chunks=(5, 5), store=store)
+        >>> z[...] = 42
+        >>> store.close()  # don't forget to call this when you're done
+
+    Store a group::
+
+        >>> store = zarr.ZipStore('data/group.zip', mode='w')
+        >>> root = zarr.group(store=store)
+        >>> foo = root.create_group('foo')
+        >>> bar = foo.zeros('bar', shape=(10, 10), chunks=(5, 5))
+        >>> bar[...] = 42
+        >>> store.close()  # don't forget to call this when you're done
+
+    After modifying a ZipStore, the ``close()`` method must be called, otherwise
+    essential data will not be written to the underlying Zip file. The ZipStore
+    class also supports the context manager protocol, which ensures the ``close()``
+    method is called on leaving the context, e.g.::
+
+        >>> with zarr.ZipStore('data/array.zip', mode='w') as store:
+        ...     z = zarr.zeros((10, 10), chunks=(5, 5), store=store)
+        ...     z[...] = 42
+        ...     # no need to call store.close()

     Notes
     -----
-    When modifying a ZipStore the close() method must be called otherwise
-    essential data will not be written to the underlying zip file. The
-    ZipStore class also supports the context manager protocol, which ensures
-    the close() method is called on leaving the with statement.
+    Each chunk of an array is stored as a separate entry in the Zip file. Note
+    that Zip files do not provide any way to remove or replace existing entries.
+    If an attempt is made to replace an entry, then a warning is generated by
+    the Python standard library about a duplicate Zip file entry. This can be
+    triggered if you attempt to write data to a Zarr array more than once,
+    e.g.::
+
+        >>> store = zarr.ZipStore('data/example.zip', mode='w')
+        >>> z = zarr.zeros(100, chunks=10, store=store)
+        >>> z[...] = 42  # first write OK
+        >>> z[...] = 42  # second write generates warnings
+        >>> store.close()
+
+    This can also happen in a more subtle situation, where data are written only
+    once to a Zarr array, but the write operations are not aligned with chunk
+    boundaries, e.g.::
+
+        >>> store = zarr.ZipStore('data/example.zip', mode='w')
+        >>> z = zarr.zeros(100, chunks=10, store=store)
+        >>> z[5:15] = 42
+        >>> z[15:25] = 42  # write overlaps chunk previously written, generates warnings
+
+    To avoid creating duplicate entries, only write data once, and align writes
+    with chunk boundaries. This alignment is done automatically if you call
+    ``z[...] = ...`` or create an array from existing data via :func:`zarr.array`.
+
+    Alternatively, use a :class:`DirectoryStore` when writing the data, then
+    manually Zip the directory and use the Zip file for subsequent reads.

     """
@@ -1131,35 +1199,64 @@ class DBMStore(MutableMapping):
     mode : int
         File mode used if a new file is created.
     open : function, optional
-        Function to open the database file. If not provided, `dbm.open` will be used on
-        Python 3, and `anydbm.open` will be used on Python 2.
+        Function to open the database file. If not provided, :func:`dbm.open` will be
+        used on Python 3, and :func:`anydbm.open` will be used on Python 2.
     **open_kwargs
         Keyword arguments to pass the `open` function.

-    Notes
-    -----
-    Please note that, by default, this class will use the Python standard library
-    `dbm.open` function to open the database file (or `anydbm.open` on Python 2). There
-    are up to three different implementations of DBM-style databases available in any
-    Python installation, and which one is used may vary from one system to another.
-    Database file formats are not compatible between these different implementations.
-    Also some implementations are more efficient than others. If you want to ensure a
-    specific implementation is used, pass the corresponding open function, e.g.,
-    `dbm.gnu.open` to use the GNU DBM library.
-
     Examples
     --------
-    >>> import zarr
-    >>> store = zarr.DBMStore('example.dbm', flag='c')
-    >>> store['foo'] = b'bar'
-    >>> store['foo']
-    b'bar'
-    >>> store['a/b/c'] = b'xxx'
-    >>> store['a/b/c']
-    b'xxx'
-    >>> sorted(store.keys())
-    ['a/b/c', 'foo']
-    >>> store.close()
+    Store a single array::
+
+        >>> import zarr
+        >>> store = zarr.DBMStore('data/array.db')
+        >>> z = zarr.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True)
+        >>> z[...] = 42
+        >>> store.close()  # don't forget to call this when you're done
+
+    Store a group::
+
+        >>> store = zarr.DBMStore('data/group.db')
+        >>> root = zarr.group(store=store, overwrite=True)
+        >>> foo = root.create_group('foo')
+        >>> bar = foo.zeros('bar', shape=(10, 10), chunks=(5, 5))
+        >>> bar[...] = 42
+        >>> store.close()  # don't forget to call this when you're done
+
+    After modifying a DBMStore, the ``close()`` method must be called, otherwise
+    essential data may not be written to the underlying database file. The
+    DBMStore class also supports the context manager protocol, which ensures the
+    ``close()`` method is called on leaving the context, e.g.::
+
+        >>> with zarr.DBMStore('data/array.db') as store:
+        ...     z = zarr.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True)
+        ...     z[...] = 42
+        ...     # no need to call store.close()
+
+    A different database library can be used by passing a different function to
+    the `open` parameter. For example, if the `bsddb3
+    `_ package is installed, a
+    Berkeley DB database can be used::
+
+        >>> import bsddb3
+        >>> store = zarr.DBMStore('data/array.bdb', open=bsddb3.btopen)
+        >>> z = zarr.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True)
+        >>> z[...] = 42
+        >>> store.close()
+
+    Notes
+    -----
+    Please note that, by default, this class will use the Python standard
+    library `dbm.open` function to open the database file (or `anydbm.open` on
+    Python 2). There are up to three different implementations of DBM-style
+    databases available in any Python installation, and which one is used may
+    vary from one system to another. Database file formats are not compatible
+    between these different implementations. Also, some implementations are
+    more efficient than others. In particular, the "dumb" implementation will be
+    the fall-back on many systems, and has very poor performance for some usage
+    scenarios. If you want to ensure a specific implementation is used, pass the
+    corresponding open function, e.g., `dbm.gnu.open` to use the GNU DBM
+    library.

     """
@@ -1194,10 +1291,12 @@ def __setstate__(self, state):
         self.__init__(path=path, flag=flag, mode=mode, open=open, **open_kws)

     def close(self):
+        """Closes the underlying database file."""
         if hasattr(self.db, 'close'):
             self.db.close()

     def sync(self):
+        """Synchronizes data to the underlying database file."""
         if hasattr(self.db, 'sync'):
             self.db.sync()
diff --git a/zarr/tests/test_convenience.py b/zarr/tests/test_convenience.py
index 063aaa278c..7756f35aad 100644
--- a/zarr/tests/test_convenience.py
+++ b/zarr/tests/test_convenience.py
@@ -65,16 +65,16 @@ def test_open_group():
 def test_save_errors():
     with assert_raises(ValueError):
         # no arrays provided
-        save_group('example.zarr')
+        save_group('data/group.zarr')
     with assert_raises(ValueError):
         # no arrays provided
-        save('example.zarr')
+        save('data/group.zarr')


 def test_lazy_loader():
     foo = np.arange(100)
     bar = np.arange(100, 0, -1)
-    store = 'example.zarr'
+    store = 'data/group.zarr'
     save(store, foo=foo, bar=bar)
     loader = load(store)
     assert 'foo' in loader
diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py
index a0b5d7d808..bdea45a1f0 100644
--- a/zarr/tests/test_core.py
+++ b/zarr/tests/test_core.py
@@ -9,7 +9,7 @@

 import numpy as np

-from numpy.testing import assert_array_equal
+from numpy.testing import assert_array_equal, assert_array_almost_equal
 from nose.tools import (eq_ as eq, assert_is_instance, assert_raises, assert_true,
                         assert_false, assert_is, assert_is_none)
@@ -346,8 +346,8 @@ def test_array_2d(self):
         assert_array_equal(a[:, 7:], z[:, 7:])

     def test_array_2d_edge_case(self):
-        # this fails with filters - chunks extend beyond edge of array, messes with delta filter
-        # if no fill value?
+        # this fails with filters - chunks extend beyond edge of array, messes with delta
+        # filter if no fill value?
         shape = 1000, 10
         chunks = 300, 30
         dtype = 'i8'
@@ -791,7 +791,8 @@ def test_structured_array(self):
                       (b'ccc', 3, 12.6)],
                      dtype=[('foo', 'S3'), ('bar', 'i4'), ('baz', 'f8')])
         for fill_value in None, b'', (b'zzz', 0, 0.0):
-            z = self.create_array(shape=a.shape, chunks=2, dtype=a.dtype, fill_value=fill_value)
+            z = self.create_array(shape=a.shape, chunks=2, dtype=a.dtype,
+                                  fill_value=fill_value)
             eq(3, len(z))
             if fill_value is not None:
                 np_fill_value = np.array(fill_value, dtype=a.dtype)[()]
@@ -809,6 +810,31 @@ def test_structured_array(self):
             # dodgy fill value
             self.create_array(shape=a.shape, chunks=2, dtype=a.dtype, fill_value=42)

+    def test_dtypes(self):
+
+        # integers
+        for t in 'u1', 'u2', 'u4', 'u8', 'i1', 'i2', 'i4', 'i8':
+            z = self.create_array(shape=10, chunks=3, dtype=t)
+            assert z.dtype == np.dtype(t)
+            a = np.arange(z.shape[0], dtype=t)
+            z[:] = a
+            assert_array_equal(a, z[:])
+
+        # floats
+        for t in 'f2', 'f4', 'f8':
+            z = self.create_array(shape=10, chunks=3, dtype=t)
+            assert z.dtype == np.dtype(t)
+            a = np.linspace(0, 1, z.shape[0], dtype=t)
+            z[:] = a
+            assert_array_almost_equal(a, z[:])
+
+        # datetime, timedelta are not supported for the time being
+        for resolution in 'D', 'us', 'ns':
+            with assert_raises(ValueError):
+                self.create_array(shape=10, dtype='datetime64[{}]'.format(resolution))
+            with assert_raises(ValueError):
+                self.create_array(shape=10, dtype='timedelta64[{}]'.format(resolution))
+

 class TestArrayWithPath(TestArray):
@@ -1033,6 +1059,10 @@ def test_structured_array(self):
         # don't implement this one, cannot do delta on structured array
         pass

+    def test_dtypes(self):
+        # don't implement this one, delta messes up floats
+        pass
+

 # custom store, does not support getsize()
 class CustomMapping(object):
diff --git a/zarr/tests/test_creation.py b/zarr/tests/test_creation.py
index da680f7df0..5ef48247cf 100644
--- a/zarr/tests/test_creation.py
+++ b/zarr/tests/test_creation.py
@@ -175,7 +175,7 @@ def test_full():


 def test_open_array():
-    store = 'example'
+    store = 'data/array.zarr'

     # mode == 'w'
     z = open_array(store, mode='w', shape=100, chunks=10)
@@ -187,12 +187,12 @@
     assert_array_equal(np.full(100, fill_value=42), z[:])

     # mode in 'r', 'r+'
-    open_group('example_group', mode='w')
+    open_group('data/group.zarr', mode='w')
     for mode in 'r', 'r+':
         with assert_raises(ValueError):
             open_array('doesnotexist', mode=mode)
         with assert_raises(ValueError):
-            open_array('example_group', mode=mode)
+            open_array('data/group.zarr', mode=mode)
     z = open_array(store, mode='r')
     assert_is_instance(z, Array)
     assert_is_instance(z.store, DirectoryStore)
@@ -220,7 +220,7 @@
     eq((10,), z.chunks)
     assert_array_equal(np.full(100, fill_value=42), z[:])
     with assert_raises(ValueError):
-        open_array('example_group', mode='a')
+        open_array('data/group.zarr', mode='a')

     # mode in 'w-', 'x'
     for mode in 'w-', 'x':
@@ -235,7 +235,7 @@
         with assert_raises(ValueError):
             open_array(store, mode=mode)
         with assert_raises(ValueError):
-            open_array('example_group', mode=mode)
+            open_array('data/group.zarr', mode=mode)

     # with synchronizer
     z = open_array(store, synchronizer=ThreadSynchronizer())
diff --git a/zarr/tests/test_hierarchy.py b/zarr/tests/test_hierarchy.py
index fd5b362b7d..6ff9df9d97 100644
--- a/zarr/tests/test_hierarchy.py
+++ b/zarr/tests/test_hierarchy.py
@@ -915,7 +915,7 @@ def test_group():

 def test_open_group():
     # test the open_group() convenience function
-    store = 'example'
+    store = 'data/group.zarr'

     # mode == 'w'
     g = open_group(store, mode='w')
@@ -926,12 +926,12 @@ def test_open_group():
     eq(2, len(g))

     # mode in 'r', 'r+'
-    open_array('example_array', shape=100, chunks=10, mode='w')
+    open_array('data/array.zarr', shape=100, chunks=10, mode='w')
     for mode in 'r', 'r+':
         with assert_raises(ValueError):
             open_group('doesnotexist', mode=mode)
         with assert_raises(ValueError):
-            open_group('example_array', mode=mode)
+            open_group('data/array.zarr', mode=mode)
     g = open_group(store, mode='r')
     assert_is_instance(g, Group)
     eq(2, len(g))
@@ -952,7 +952,7 @@ def test_open_group():
         g.create_groups('foo', 'bar')
         eq(2, len(g))
     with assert_raises(ValueError):
-        open_group('example_array', mode='a')
+        open_group('data/array.zarr', mode='a')

     # mode in 'w-', 'x'
     for mode in 'w-', 'x':
@@ -966,7 +966,7 @@ def test_open_group():
         with assert_raises(ValueError):
             open_group(store, mode=mode)
         with assert_raises(ValueError):
-            open_group('example_array', mode=mode)
+            open_group('data/array.zarr', mode=mode)

     # open with path
     g = open_group(store, path='foo/bar')
diff --git a/zarr/tests/test_meta.py b/zarr/tests/test_meta.py
index 3b0c3900b8..dd9a91159b 100644
--- a/zarr/tests/test_meta.py
+++ b/zarr/tests/test_meta.py
@@ -163,10 +163,7 @@ def test_encode_decode_fill_values_bytes():

     for v in fills:

-        s = base64.standard_b64encode(v)
-        if not PY2:
-            s = str(s, 'ascii')
-
+        # setup and encode metadata
         meta = dict(
             shape=(100,),
             chunks=(10,),
@@ -176,7 +173,12 @@ def test_encode_decode_fill_values_bytes():
             filters=None,
             order='C'
         )
+        meta_enc = encode_array_metadata(meta)

+        # define expected metadata encoded as JSON
+        s = base64.standard_b64encode(v)
+        if not PY2:
+            s = s.decode()
         meta_json = '''{
             "chunks": [10],
             "compressor": {"id": "zlib", "level": 1},
@@ -189,14 +191,13 @@ def test_encode_decode_fill_values_bytes():
         }''' % (s, ZARR_FORMAT)

         # test encoding
-        meta_enc = encode_array_metadata(meta)
         assert_json_eq(meta_json, meta_enc)

         # test decoding
         meta_dec = decode_array_metadata(meta_enc)
         actual = meta_dec['fill_value']
-        np_v = np.array(v, dtype=dtype)[()]
-        eq(np_v, actual)
+        expect = np.array(v, dtype=dtype)[()]
+        eq(expect, actual)


 def test_decode_array_unsupported_format():
diff --git a/zarr/tests/test_storage.py b/zarr/tests/test_storage.py
index a1c259af68..b0daf1caa1 100644
--- a/zarr/tests/test_storage.py
+++ b/zarr/tests/test_storage.py
@@ -580,7 +580,7 @@ def create_store(self):

     def test_filesystem_path(self):
         # test behaviour with path that does not exist
-        path = 'example'
+        path = 'data/store'
         if os.path.exists(path):
             shutil.rmtree(path)
         store = DirectoryStore(path)
@@ -651,20 +651,20 @@ def create_store(self):
         return store

     def test_mode(self):
-        with ZipStore('example.zip', mode='w') as store:
+        with ZipStore('data/store.zip', mode='w') as store:
             store['foo'] = b'bar'
-        store = ZipStore('example.zip', mode='r')
+        store = ZipStore('data/store.zip', mode='r')
         with assert_raises(PermissionError):
             store['foo'] = b'bar'

     def test_flush(self):
-        store = ZipStore('example.zip', mode='w')
+        store = ZipStore('data/store.zip', mode='w')
         store['foo'] = b'bar'
         store.flush()
         assert store['foo'] == b'bar'
         store.close()

-        store = ZipStore('example.zip', mode='r')
+        store = ZipStore('data/store.zip', mode='r')
         with assert_raises(PermissionError):
             store.flush()
@@ -787,8 +787,8 @@ def test_migrate_1to2():

 def test_format_compatibility():

-    # This test is intended to catch any unintended changes that break the ability to read data
-    # stored with a previous minor version (which should be format-compatible).
+    # This test is intended to catch any unintended changes that break the ability to
+    # read data stored with a previous minor version (which should be format-compatible).

     # fixture data
     fixture = group(store=DirectoryStore('fixture'))
diff --git a/zarr/tests/test_util.py b/zarr/tests/test_util.py
index 37524ac60a..eec28450ee 100644
--- a/zarr/tests/test_util.py
+++ b/zarr/tests/test_util.py
@@ -2,11 +2,13 @@

 from __future__ import absolute_import, print_function, division

+import numpy as np
 from nose.tools import (eq_ as eq, assert_raises, assert_true, assert_false,
                         assert_is_instance)

+
 from zarr.util import (normalize_shape, normalize_chunks, is_total_slice,
                        normalize_resize_args, human_readable_size,
                        normalize_order, guess_chunks, info_html_report,
-                       info_text_report)
+                       info_text_report, normalize_fill_value)


 def test_normalize_shape():
@@ -99,6 +101,12 @@ def test_normalize_order():
         normalize_order('foo')


+def test_normalize_fill_value():
+    eq(b'', normalize_fill_value(0, dtype=np.dtype('S1')))
+    eq(b'', normalize_fill_value(0, dtype=np.dtype([('foo', 'i4'), ('bar', 'f8')])))
+    eq('', normalize_fill_value(0, dtype=np.dtype('U1')))
+
+
 def test_guess_chunks():
     shapes = (
         (100,),
diff --git a/zarr/util.py b/zarr/util.py
index 1470b014d8..94c536193d 100644
--- a/zarr/util.py
+++ b/zarr/util.py
@@ -190,29 +190,33 @@ def normalize_fill_value(fill_value, dtype):

         # no fill value
         pass

-    elif fill_value == 0 and dtype.kind == 'V':
+    elif fill_value == 0 and dtype.kind in 'SV':
         # special case because 0 used as default, but cannot be used for structured arrays
         fill_value = b''

     elif dtype.kind == 'U':
-        # special case unicode because of encoding issues on Windows if passed through numpy
+        # special case unicode because of encoding issues on Windows if passed through
+        # numpy
         # https://github.com/alimanfoo/zarr/pull/172#issuecomment-343782713
-        if PY2 and isinstance(fill_value, binary_type):  # pragma: py3 no cover
+        if fill_value == 0:
+            fill_value = ''
+
+        elif PY2 and isinstance(fill_value, binary_type):  # pragma: py3 no cover
             # this is OK on PY2, can be written as JSON
             pass

         elif not isinstance(fill_value, text_type):
-            raise ValueError('fill_value {!r} is not valid for dtype {}; must be a unicode string'
-                             .format(fill_value, dtype))
+            raise ValueError('fill_value {!r} is not valid for dtype {}; must be a '
+                             'unicode string'.format(fill_value, dtype))

     else:
         try:
             fill_value = np.array(fill_value, dtype=dtype)[()]
         except Exception as e:
             # re-raise with our own error message to be helpful
-            raise ValueError('fill_value {!r} is not valid for dtype {}; nested exception: {}'
-                             .format(fill_value, dtype, e))
+            raise ValueError('fill_value {!r} is not valid for dtype {}; nested '
+                             'exception: {}'.format(fill_value, dtype, e))

     return fill_value
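The util.py hunk above changes how default fill values are normalised for bytes, structured and unicode dtypes. As a rough standalone sketch of that behaviour (Python 3 only; the function name ``normalize_fill_value_sketch`` is hypothetical, the real patched function also handles Python 2 byte strings and wraps coercion errors), the cases exercised by the new ``test_normalize_fill_value`` test can be summarised as::

    import numpy as np

    def normalize_fill_value_sketch(fill_value, dtype):
        # None means no fill value at all
        if fill_value is None:
            return None
        # 0 is the default fill value but is not usable for bytes ('S') or
        # structured ('V') dtypes, so fall back to an empty byte string
        if fill_value == 0 and dtype.kind in 'SV':
            return b''
        # unicode dtypes keep a plain str to avoid Windows encoding issues
        if dtype.kind == 'U':
            if fill_value == 0:
                return ''
            if not isinstance(fill_value, str):
                raise ValueError('fill_value {!r} is not valid for dtype {}; must be a '
                                 'unicode string'.format(fill_value, dtype))
            return fill_value
        # everything else is coerced through numpy to obtain a scalar
        return np.array(fill_value, dtype=dtype)[()]

    assert normalize_fill_value_sketch(0, np.dtype('S1')) == b''
    assert normalize_fill_value_sketch(0, np.dtype([('foo', 'i4'), ('bar', 'f8')])) == b''
    assert normalize_fill_value_sketch(0, np.dtype('U1')) == ''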