diff --git a/docs/release.rst b/docs/release.rst index 7c73a02911..5929efb317 100644 --- a/docs/release.rst +++ b/docs/release.rst @@ -1,6 +1,20 @@ Release notes ============= +.. _release_2.8.0: + +2.8.0 +----- + +V2 Specification Update +~~~~~~~~~~~~~~~~~~~~~~~ + +* Introduce optional dimension_separator .zarray key for nested chunks. + By :user:`Josh Moore `; :issue:`715`, :issue:`716`. + +.. _release_2.7.0: + + .. _release_2.7.1: 2.7.1 diff --git a/zarr/core.py b/zarr/core.py index b6f82ff7d7..dded2fd74e 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -89,6 +89,7 @@ class Array: dtype compression compression_opts + dimension_separator fill_value order synchronizer @@ -194,6 +195,7 @@ def _load_metadata_nosync(self): self._dtype = meta['dtype'] self._fill_value = meta['fill_value'] self._order = meta['order'] + self._dimension_separator = meta.get('dimension_separator', '.') # setup compressor config = meta['compressor'] diff --git a/zarr/creation.py b/zarr/creation.py index 6fbdaf04c1..d017ec921d 100644 --- a/zarr/creation.py +++ b/zarr/creation.py @@ -13,13 +13,14 @@ from zarr.storage import (DirectoryStore, ZipStore, contains_array, contains_group, default_compressor, init_array, normalize_storage_path, FSStore) +from zarr.util import normalize_dimension_separator def create(shape, chunks=True, dtype=None, compressor='default', fill_value=0, order='C', store=None, synchronizer=None, overwrite=False, path=None, chunk_store=None, filters=None, cache_metadata=True, cache_attrs=True, read_only=False, - object_codec=None, **kwargs): + object_codec=None, dimension_separator=None, **kwargs): """Create an array. Parameters @@ -66,6 +67,9 @@ def create(shape, chunks=True, dtype=None, compressor='default', True if array should be protected against modification. object_codec : Codec, optional A codec to encode object arrays, only needed if dtype=object. + dimension_separator : {'.', '/'}, optional + Separator placed between the dimensions of a chunk. + .. 
versionadded:: 2.8 Returns ------- @@ -117,10 +121,16 @@ def create(shape, chunks=True, dtype=None, compressor='default', # API compatibility with h5py compressor, fill_value = _kwargs_compat(compressor, fill_value, kwargs) + # optional array metadata + if dimension_separator is None: + dimension_separator = getattr(store, "_dimension_separator", None) + dimension_separator = normalize_dimension_separator(dimension_separator) + # initialize array metadata init_array(store, shape=shape, chunks=chunks, dtype=dtype, compressor=compressor, fill_value=fill_value, order=order, overwrite=overwrite, path=path, - chunk_store=chunk_store, filters=filters, object_codec=object_codec) + chunk_store=chunk_store, filters=filters, object_codec=object_codec, + dimension_separator=dimension_separator) # instantiate array z = Array(store, path=path, chunk_store=chunk_store, synchronizer=synchronizer, diff --git a/zarr/hierarchy.py b/zarr/hierarchy.py index 39dc82c724..89804d445b 100644 --- a/zarr/hierarchy.py +++ b/zarr/hierarchy.py @@ -783,6 +783,8 @@ def create_dataset(self, name, **kwargs): lifetime of the object. If False, array metadata will be reloaded prior to all data access and modification operations (may incur overhead depending on storage and data access pattern). + dimension_separator : {'.', '/'}, optional + Separator placed between the dimensions of a chunk. 
Returns ------- diff --git a/zarr/meta.py b/zarr/meta.py index d1a1e43bbe..c8c5d77004 100644 --- a/zarr/meta.py +++ b/zarr/meta.py @@ -50,7 +50,9 @@ def decode_array_metadata(s: Union[MappingType, str]) -> MappingType[str, Any]: fill_value=fill_value, order=meta['order'], filters=meta['filters'], + dimension_separator=meta.get('dimension_separator', '.'), ) + except Exception as e: raise MetadataError('error decoding metadata: %s' % e) else: @@ -62,6 +64,9 @@ def encode_array_metadata(meta: MappingType[str, Any]) -> bytes: sdshape = () if dtype.subdtype is not None: dtype, sdshape = dtype.subdtype + + dimension_separator = meta.get('dimension_separator') + meta = dict( zarr_format=ZARR_FORMAT, shape=meta['shape'] + sdshape, @@ -72,6 +77,10 @@ def encode_array_metadata(meta: MappingType[str, Any]) -> bytes: order=meta['order'], filters=meta['filters'], ) + + if dimension_separator: + meta['dimension_separator'] = dimension_separator + return json_dumps(meta) diff --git a/zarr/n5.py b/zarr/n5.py index fa01005302..67e39357e7 100644 --- a/zarr/n5.py +++ b/zarr/n5.py @@ -355,6 +355,9 @@ def array_metadata_to_n5(array_metadata): compressor_config = compressor_config_to_n5(compressor_config) array_metadata['compression'] = compressor_config + if 'dimension_separator' in array_metadata: + del array_metadata['dimension_separator'] + return array_metadata diff --git a/zarr/storage.py b/zarr/storage.py index c332ee02f5..aa46e59e79 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -52,6 +52,7 @@ ) from zarr.meta import encode_array_metadata, encode_group_metadata from zarr.util import (buffer_size, json_loads, nolock, normalize_chunks, + normalize_dimension_separator, normalize_dtype, normalize_fill_value, normalize_order, normalize_shape, normalize_storage_path, retry_call) @@ -235,6 +236,7 @@ def init_array( chunk_store: MutableMapping = None, filters=None, object_codec=None, + dimension_separator=None, ): """Initialize an array store with the given configuration. 
Note that this is a low-level function and there should be no need to call this directly from user code. @@ -267,6 +269,8 @@ def init_array( Sequence of filters to use to encode chunk data prior to compression. object_codec : Codec, optional A codec to encode object arrays, only needed if dtype=object. + dimension_separator : {'.', '/'}, optional + Separator placed between the dimensions of a chunk. Examples -------- @@ -349,7 +353,8 @@ def init_array( compressor=compressor, fill_value=fill_value, order=order, overwrite=overwrite, path=path, chunk_store=chunk_store, filters=filters, - object_codec=object_codec) + object_codec=object_codec, + dimension_separator=dimension_separator) def _init_array_metadata( @@ -365,6 +370,7 @@ def _init_array_metadata( chunk_store=None, filters=None, object_codec=None, + dimension_separator=None, ): # guard conditions @@ -386,6 +392,11 @@ def _init_array_metadata( order = normalize_order(order) fill_value = normalize_fill_value(fill_value, dtype) + # optional array metadata + if dimension_separator is None: + dimension_separator = getattr(store, "_dimension_separator", None) + dimension_separator = normalize_dimension_separator(dimension_separator) + # compressor prep if shape == (): # no point in compressing a 0-dimensional array, only a single value @@ -433,7 +444,8 @@ def _init_array_metadata( # initialize metadata meta = dict(shape=shape, chunks=chunks, dtype=dtype, compressor=compressor_config, fill_value=fill_value, - order=order, filters=filters_config) + order=order, filters=filters_config, + dimension_separator=dimension_separator) key = _path_to_prefix(path) + array_meta_key store[key] = encode_array_metadata(meta) @@ -539,13 +551,14 @@ class MemoryStore(MutableMapping): """ - def __init__(self, root=None, cls=dict): + def __init__(self, root=None, cls=dict, dimension_separator=None): if root is None: self.root = cls() else: self.root = root self.cls = cls self.write_mutex = Lock() + self._dimension_separator = 
dimension_separator def __getstate__(self): return self.root, self.cls @@ -728,6 +741,8 @@ class DirectoryStore(MutableMapping): (e.g. 'foo' and 'FOO' will be treated as equivalent). This can be useful to avoid potential discrepancies between case-senstive and case-insensitive file system. Default value is False. + dimension_separator : {'.', '/'}, optional + Separator placed between the dimensions of a chunk. Examples -------- @@ -774,7 +789,7 @@ class DirectoryStore(MutableMapping): """ - def __init__(self, path, normalize_keys=False): + def __init__(self, path, normalize_keys=False, dimension_separator=None): # guard conditions path = os.path.abspath(path) @@ -783,6 +798,7 @@ def __init__(self, path, normalize_keys=False): self.path = path self.normalize_keys = normalize_keys + self._dimension_separator = dimension_separator def _normalize_key(self, key): return key.lower() if self.normalize_keys else key @@ -1012,30 +1028,44 @@ class FSStore(MutableMapping): like "s3://bucket/root" normalize_keys : bool key_separator : str - Character to use when constructing the target path strings - for data keys + public API for accessing dimension_separator. Never `None` + See dimension_separator for more information. mode : str "w" for writable, "r" for read-only exceptions : list of Exception subclasses When accessing data, any of these exceptions will be treated as a missing key + dimension_separator : {'.', '/'}, optional + Separator placed between the dimensions of a chunk. 
storage_options : passed to the fsspec implementation """ _META_KEYS = (attrs_key, group_meta_key, array_meta_key) - def __init__(self, url, normalize_keys=True, key_separator='.', + def __init__(self, url, normalize_keys=True, key_separator=None, mode='w', exceptions=(KeyError, PermissionError, IOError), + dimension_separator=None, **storage_options): import fsspec self.normalize_keys = normalize_keys - self.key_separator = key_separator self.map = fsspec.get_mapper(url, **storage_options) self.fs = self.map.fs # for direct operations self.path = self.fs._strip_protocol(url) self.mode = mode self.exceptions = exceptions + + # For backwards compatibility. Guaranteed to be non-None + if key_separator is not None: + dimension_separator = key_separator + + self.key_separator = dimension_separator + if self.key_separator is None: + self.key_separator = "." + + # Pass attributes to array creation + self._dimension_separator = dimension_separator + if self.fs.exists(self.path) and not self.fs.isdir(self.path): raise FSPathExistNotDir(url) @@ -1153,11 +1183,13 @@ class TempStore(DirectoryStore): (e.g. 'foo' and 'FOO' will be treated as equivalent). This can be useful to avoid potential discrepancies between case-senstive and case-insensitive file system. Default value is False. - + dimension_separator : {'.', '/'}, optional + Separator placed between the dimensions of a chunk. """ # noinspection PyShadowingBuiltins - def __init__(self, suffix='', prefix='zarr', dir=None, normalize_keys=False): + def __init__(self, suffix='', prefix='zarr', dir=None, normalize_keys=False, + dimension_separator=None): path = tempfile.mkdtemp(suffix=suffix, prefix=prefix, dir=dir) atexit.register(atexit_rmtree, path) super().__init__(path, normalize_keys=normalize_keys) @@ -1192,6 +1224,9 @@ class NestedDirectoryStore(DirectoryStore): (e.g. 'foo' and 'FOO' will be treated as equivalent). 
This can be useful to avoid potential discrepancies between case-senstive and case-insensitive file system. Default value is False. + dimension_separator : {'/'}, optional + Separator placed between the dimensions of a chunk. + Only supports "/" unlike other implementations. Examples -------- @@ -1248,8 +1283,14 @@ class NestedDirectoryStore(DirectoryStore): """ - def __init__(self, path, normalize_keys=False): + def __init__(self, path, normalize_keys=False, dimension_separator="/"): super().__init__(path, normalize_keys=normalize_keys) + if dimension_separator is None: + dimension_separator = "/" + elif dimension_separator != "/": + raise ValueError( + "NestedDirectoryStore only supports '/' as dimension_separator") + self._dimension_separator = dimension_separator def __getitem__(self, key): key = _nested_map_ckey(key) @@ -1314,6 +1355,8 @@ class ZipStore(MutableMapping): One of 'r' to read an existing file, 'w' to truncate and write a new file, 'a' to append to an existing file, or 'x' to exclusively create and write a new file. + dimension_separator : {'.', '/'}, optional + Separator placed between the dimensions of a chunk. Examples -------- @@ -1382,7 +1425,8 @@ class also supports the context manager protocol, which ensures the ``close()`` """ - def __init__(self, path, compression=zipfile.ZIP_STORED, allowZip64=True, mode='a'): + def __init__(self, path, compression=zipfile.ZIP_STORED, allowZip64=True, mode='a', + dimension_separator=None): # store properties path = os.path.abspath(path) @@ -1390,6 +1434,7 @@ def __init__(self, path, compression=zipfile.ZIP_STORED, allowZip64=True, mode=' self.compression = compression self.allowZip64 = allowZip64 self.mode = mode + self._dimension_separator = dimension_separator # Current understanding is that zipfile module in stdlib is not thread-safe, # and so locking is required for both read and write. 
However, this has not @@ -1602,6 +1647,8 @@ class DBMStore(MutableMapping): used on Python 3, and :func:`anydbm.open` will be used on Python 2. write_lock: bool, optional Use a lock to prevent concurrent writes from multiple threads (True by default). + dimension_separator : {'.', '/'}, optional + Separator placed between the dimensions of a chunk. **open_kwargs Keyword arguments to pass the `open` function. @@ -1666,6 +1713,7 @@ class DBMStore(MutableMapping): """ def __init__(self, path, flag='c', mode=0o666, open=None, write_lock=True, + dimension_separator=None, **open_kwargs): if open is None: import dbm @@ -1685,6 +1733,7 @@ def __init__(self, path, flag='c', mode=0o666, open=None, write_lock=True, else: self.write_mutex = nolock self.open_kwargs = open_kwargs + self._dimension_separator = dimension_separator def __getstate__(self): try: @@ -1786,6 +1835,8 @@ class LMDBStore(MutableMapping): buffers : bool, optional If True (default) use support for buffers, which should increase performance by reducing memory copies. + dimension_separator : {'.', '/'}, optional + Separator placed between the dimensions of a chunk. **kwargs Keyword arguments passed through to the `lmdb.open` function. @@ -1828,7 +1879,7 @@ class LMDBStore(MutableMapping): """ - def __init__(self, path, buffers=True, **kwargs): + def __init__(self, path, buffers=True, dimension_separator=None, **kwargs): import lmdb # set default memory map size to something larger than the lmdb default, which is @@ -1866,6 +1917,7 @@ def __init__(self, path, buffers=True, **kwargs): self.buffers = buffers self.path = path self.kwargs = kwargs + self._dimension_separator = dimension_separator def __getstate__(self): try: @@ -2147,6 +2199,8 @@ class ABSStore(MutableMapping): blob_service_kwargs : dictionary Extra arguments to be passed into the azure blob client, for e.g. when using the emulator, pass in blob_service_kwargs={'is_emulated': True}. 
+ dimension_separator : {'.', '/'}, optional + Separator placed between the dimensions of a chunk. Notes ----- @@ -2154,12 +2208,13 @@ class ABSStore(MutableMapping): """ def __init__(self, container, prefix='', account_name=None, account_key=None, - blob_service_kwargs=None): + blob_service_kwargs=None, dimension_separator=None): from azure.storage.blob import BlockBlobService self.container = container self.prefix = normalize_storage_path(prefix) self.account_name = account_name self.account_key = account_key + self._dimension_separator = dimension_separator if blob_service_kwargs is not None: self.blob_service_kwargs = blob_service_kwargs else: # pragma: no cover @@ -2301,6 +2356,8 @@ class SQLiteStore(MutableMapping): ---------- path : string Location of database file. + dimension_separator : {'.', '/'}, optional + Separator placed between the dimensions of a chunk. **kwargs Keyword arguments passed through to the `sqlite3.connect` function. @@ -2324,9 +2381,11 @@ class SQLiteStore(MutableMapping): >>> store.close() # don't forget to call this when you're done """ - def __init__(self, path, **kwargs): + def __init__(self, path, dimension_separator=None, **kwargs): import sqlite3 + self._dimension_separator = dimension_separator + # normalize path if path != ':memory:': path = os.path.abspath(path) @@ -2507,6 +2566,8 @@ class MongoDBStore(MutableMapping): Name of database collection : string Name of collection + dimension_separator : {'.', '/'}, optional + Separator placed between the dimensions of a chunk. **kwargs Keyword arguments passed through to the `pymongo.MongoClient` function. 
@@ -2520,11 +2581,12 @@ class MongoDBStore(MutableMapping): _value = 'value' def __init__(self, database='mongodb_zarr', collection='zarr_collection', - **kwargs): + dimension_separator=None, **kwargs): import pymongo self._database = database self._collection = collection + self._dimension_separator = dimension_separator self._kwargs = kwargs self.client = pymongo.MongoClient(**self._kwargs) @@ -2585,14 +2647,17 @@ class RedisStore(MutableMapping): ---------- prefix : string Name of prefix for Redis keys + dimension_separator : {'.', '/'}, optional + Separator placed between the dimensions of a chunk. **kwargs Keyword arguments passed through to the `redis.Redis` function. """ - def __init__(self, prefix='zarr', **kwargs): + def __init__(self, prefix='zarr', dimension_separator=None, **kwargs): import redis self._prefix = prefix self._kwargs = kwargs + self._dimension_separator = dimension_separator self.client = redis.Redis(**kwargs) diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index eaa87800ef..6914cff87e 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -532,39 +532,52 @@ def test_setitem_data_not_shared(self): if hasattr(z.store, 'close'): z.store.close() + def expected(self): + return [ + "063b02ff8d9d3bab6da932ad5828b506ef0a6578", + "f97b84dc9ffac807415f750100108764e837bb82", + "c7190ad2bea1e9d2e73eaa2d3ca9187be1ead261", + "14470724dca6c1837edddedc490571b6a7f270bc", + "2a1046dd99b914459b3e86be9dde05027a07d209", + ] + def test_hexdigest(self): + found = [] + # Check basic 1-D array z = self.create_array(shape=(1050,), chunks=100, dtype=' str: return order +def normalize_dimension_separator(sep: Optional[str]) -> Optional[str]: + if sep in (".", "/", None): + return sep + else: + raise ValueError( + "dimension_separator must be either '.' or '/', found: %r" % sep) + + def normalize_fill_value(fill_value, dtype: np.dtype): if fill_value is None: