Skip to content

Commit baf81b4

Browse files
lilyminium authored and rabernat committed
Manually specify chunks in open_zarr (#2530)
* added manual chunks for open_zarr * updated whats-new * fixed pep8 issues * removed whitespace * added deprecation warning * fixed pep8 issues * added warning for bad chunks * fixed lingering rebase conflicts * fixed pep8 issues * added stacklevel * fixed pep8 issues * Various fixes for explicit Dataset.indexes (#2858) * Various fixes for explicit Dataset.indexes Fixes GH2856 I've added internal consistency checks to the uses of ``assert_equal`` in our test suite, so this shouldn't happen again. * Fix indexes in Dataset.interp * 0.12.1 release * revert to 0.12.2 dev * update links to https (#2872) * Fix mypy typing error in cftime_offsets.py (#2878) * decreased pytest verbosity (#2881) * added manual chunks for open_zarr * updated whats-new * fixed pep8 issues * removed whitespace * added deprecation warning * fixed pep8 issues * added warning for bad chunks * fixed lingering rebase conflicts * fixed pep8 issues * added stacklevel * fixed pep8 issues * disallow unicode again * disallow unicode again
1 parent aebe60c commit baf81b4

File tree

3 files changed

+178
-27
lines changed

3 files changed

+178
-27
lines changed

doc/whats-new.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,11 @@ Other enhancements
140140
By `Keisuke Fujii <https://github.com/fujiisoup>`_.
141141
- Added :py:meth:`~xarray.Dataset.drop_dims` (:issue:`1949`).
142142
By `Kevin Squire <https://github.com/kmsquire>`_.
143+
- ``xr.open_zarr`` now accepts manually specified chunks with the ``chunks=``
144+
parameter. ``auto_chunk=True`` is equivalent to ``chunks='auto'`` for
145+
backwards compatibility. The ``overwrite_encoded_chunks`` parameter is
146+
added to remove the original zarr chunk encoding.
147+
By `Lily Wang <https://github.com/lilyminium>`_.
143148

144149
Bug fixes
145150
~~~~~~~~~

xarray/backends/zarr.py

Lines changed: 87 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import warnings
12
from collections import OrderedDict
23
from distutils.version import LooseVersion
34

@@ -352,10 +353,11 @@ def close(self):
352353
zarr.consolidate_metadata(self.ds.store)
353354

354355

355-
def open_zarr(store, group=None, synchronizer=None, auto_chunk=True,
356+
def open_zarr(store, group=None, synchronizer=None, chunks='auto',
356357
decode_cf=True, mask_and_scale=True, decode_times=True,
357358
concat_characters=True, decode_coords=True,
358-
drop_variables=None, consolidated=False):
359+
drop_variables=None, consolidated=False,
360+
overwrite_encoded_chunks=False, **kwargs):
359361
"""Load and decode a dataset from a Zarr store.
360362
361363
.. note:: Experimental
@@ -375,10 +377,15 @@ def open_zarr(store, group=None, synchronizer=None, auto_chunk=True,
375377
Array synchronizer provided to zarr
376378
group : str, obtional
377379
Group path. (a.k.a. `path` in zarr terminology.)
378-
auto_chunk : bool, optional
379-
Whether to automatically create dask chunks corresponding to each
380-
variable's zarr chunks. If False, zarr array data will lazily convert
381-
to numpy arrays upon access.
380+
chunks : int or dict or tuple or {None, 'auto'}, optional
381+
Chunk sizes along each dimension, e.g., ``5`` or
382+
``{'x': 5, 'y': 5}``. If `chunks='auto'`, dask chunks are created
383+
based on the variable's zarr chunks. If `chunks=None`, zarr array
384+
data will lazily convert to numpy arrays upon access. This accepts
385+
all the chunk specifications as Dask does.
386+
overwrite_encoded_chunks: bool, optional
387+
Whether to drop the zarr chunks encoded for each variable when a
388+
dataset is loaded with specified chunk sizes (default: False)
382389
decode_cf : bool, optional
383390
Whether to decode these variables, assuming they were saved according
384391
to CF conventions.
@@ -422,6 +429,24 @@ def open_zarr(store, group=None, synchronizer=None, auto_chunk=True,
422429
----------
423430
http://zarr.readthedocs.io/
424431
"""
432+
if 'auto_chunk' in kwargs:
433+
auto_chunk = kwargs.pop('auto_chunk')
434+
if auto_chunk:
435+
chunks = 'auto' # maintain backwards compatibility
436+
else:
437+
chunks = None
438+
439+
warnings.warn("auto_chunk is deprecated. Use chunks='auto' instead.",
440+
FutureWarning, stacklevel=2)
441+
442+
if kwargs:
443+
raise TypeError("open_zarr() got unexpected keyword arguments " +
444+
",".join(kwargs.keys()))
445+
446+
if not isinstance(chunks, (int, dict)):
447+
if chunks != 'auto' and chunks is not None:
448+
raise ValueError("chunks must be an int, dict, 'auto', or None. "
449+
"Instead found %s. " % chunks)
425450

426451
if not decode_cf:
427452
mask_and_scale = False
@@ -449,21 +474,60 @@ def maybe_decode_store(store, lock=False):
449474

450475
# auto chunking needs to be here and not in ZarrStore because variable
451476
# chunks do not survive decode_cf
452-
if auto_chunk:
453-
# adapted from Dataset.Chunk()
454-
def maybe_chunk(name, var):
455-
from dask.base import tokenize
456-
chunks = var.encoding.get('chunks')
457-
if (var.ndim > 0) and (chunks is not None):
458-
# does this cause any data to be read?
459-
token2 = tokenize(name, var._data)
460-
name2 = 'zarr-%s' % token2
461-
return var.chunk(chunks, name=name2, lock=None)
462-
else:
463-
return var
464-
465-
variables = OrderedDict([(k, maybe_chunk(k, v))
466-
for k, v in ds.variables.items()])
467-
return ds._replace_vars_and_dims(variables)
468-
else:
477+
# return trivial case
478+
if not chunks:
469479
return ds
480+
481+
# adapted from Dataset.Chunk()
482+
if isinstance(chunks, int):
483+
chunks = dict.fromkeys(ds.dims, chunks)
484+
485+
if isinstance(chunks, tuple) and len(chunks) == len(ds.dims):
486+
chunks = dict(zip(ds.dims, chunks))
487+
488+
def get_chunk(name, var, chunks):
489+
chunk_spec = dict(zip(var.dims, var.encoding.get('chunks')))
490+
491+
# Coordinate labels aren't chunked
492+
if var.ndim == 1 and var.dims[0] == name:
493+
return chunk_spec
494+
495+
if chunks == 'auto':
496+
return chunk_spec
497+
498+
for dim in var.dims:
499+
if dim in chunks:
500+
spec = chunks[dim]
501+
if isinstance(spec, int):
502+
spec = (spec,)
503+
if isinstance(spec, (tuple, list)) and chunk_spec[dim]:
504+
if any(s % chunk_spec[dim] for s in spec):
505+
warnings.warn("Specified Dask chunks %r would "
506+
"separate Zarr chunk shape %r for "
507+
"dimension %r. This significantly "
508+
"degrades performance. Consider "
509+
"rechunking after loading instead."
510+
% (chunks[dim], chunk_spec[dim], dim),
511+
stacklevel=2)
512+
chunk_spec[dim] = chunks[dim]
513+
return chunk_spec
514+
515+
def maybe_chunk(name, var, chunks):
516+
from dask.base import tokenize
517+
518+
chunk_spec = get_chunk(name, var, chunks)
519+
520+
if (var.ndim > 0) and (chunk_spec is not None):
521+
# does this cause any data to be read?
522+
token2 = tokenize(name, var._data)
523+
name2 = 'zarr-%s' % token2
524+
var = var.chunk(chunk_spec, name=name2, lock=None)
525+
if overwrite_encoded_chunks and var.chunks is not None:
526+
var.encoding['chunks'] = tuple(x[0] for x in var.chunks)
527+
return var
528+
else:
529+
return var
530+
531+
variables = OrderedDict([(k, maybe_chunk(k, v, chunks))
532+
for k, v in ds.variables.items()])
533+
return ds._replace_vars_and_dims(variables)

xarray/tests/test_backends.py

Lines changed: 86 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1391,27 +1391,109 @@ def test_auto_chunk(self):
13911391
original = create_test_data().chunk()
13921392

13931393
with self.roundtrip(
1394-
original, open_kwargs={'auto_chunk': False}) as actual:
1394+
original, open_kwargs={'chunks': None}) as actual:
13951395
for k, v in actual.variables.items():
13961396
# only index variables should be in memory
13971397
assert v._in_memory == (k in actual.dims)
13981398
# there should be no chunks
13991399
assert v.chunks is None
14001400

14011401
with self.roundtrip(
1402-
original, open_kwargs={'auto_chunk': True}) as actual:
1402+
original, open_kwargs={'chunks': 'auto'}) as actual:
14031403
for k, v in actual.variables.items():
14041404
# only index variables should be in memory
14051405
assert v._in_memory == (k in actual.dims)
14061406
# chunk size should be the same as original
14071407
assert v.chunks == original[k].chunks
14081408

1409+
def test_manual_chunk(self):
1410+
original = create_test_data().chunk({'dim1': 3, 'dim2': 4, 'dim3': 3})
1411+
1412+
# All of these should return non-chunked arrays
1413+
NO_CHUNKS = (None, 0, {})
1414+
for no_chunk in NO_CHUNKS:
1415+
open_kwargs = {'chunks': no_chunk}
1416+
with self.roundtrip(original, open_kwargs=open_kwargs) as actual:
1417+
for k, v in actual.variables.items():
1418+
# only index variables should be in memory
1419+
assert v._in_memory == (k in actual.dims)
1420+
# there should be no chunks
1421+
assert v.chunks is None
1422+
1423+
# uniform arrays
1424+
for i in range(2, 6):
1425+
rechunked = original.chunk(chunks=i)
1426+
open_kwargs = {'chunks': i}
1427+
with self.roundtrip(original, open_kwargs=open_kwargs) as actual:
1428+
for k, v in actual.variables.items():
1429+
# only index variables should be in memory
1430+
assert v._in_memory == (k in actual.dims)
1431+
# chunk size should be the same as rechunked
1432+
assert v.chunks == rechunked[k].chunks
1433+
1434+
chunks = {'dim1': 2, 'dim2': 3, 'dim3': 5}
1435+
rechunked = original.chunk(chunks=chunks)
1436+
1437+
open_kwargs = {'chunks': chunks, 'overwrite_encoded_chunks': True}
1438+
with self.roundtrip(original, open_kwargs=open_kwargs) as actual:
1439+
for k, v in actual.variables.items():
1440+
assert v.chunks == rechunked[k].chunks
1441+
1442+
with self.roundtrip(actual) as auto:
1443+
# encoding should have changed
1444+
for k, v in actual.variables.items():
1445+
assert v.chunks == rechunked[k].chunks
1446+
1447+
assert_identical(actual, auto)
1448+
assert_identical(actual.load(), auto.load())
1449+
1450+
def test_warning_on_bad_chunks(self):
1451+
original = create_test_data().chunk({'dim1': 4, 'dim2': 3, 'dim3': 5})
1452+
1453+
bad_chunks = (2, {'dim2': (3, 3, 2, 1)})
1454+
for chunks in bad_chunks:
1455+
kwargs = {'chunks': chunks}
1456+
with pytest.warns(UserWarning):
1457+
with self.roundtrip(original, open_kwargs=kwargs) as actual:
1458+
for k, v in actual.variables.items():
1459+
# only index variables should be in memory
1460+
assert v._in_memory == (k in actual.dims)
1461+
1462+
good_chunks = ({'dim2': 3}, {'dim3': 10})
1463+
for chunks in good_chunks:
1464+
kwargs = {'chunks': chunks}
1465+
with pytest.warns(None) as record:
1466+
with self.roundtrip(original, open_kwargs=kwargs) as actual:
1467+
for k, v in actual.variables.items():
1468+
# only index variables should be in memory
1469+
assert v._in_memory == (k in actual.dims)
1470+
assert len(record) == 0
1471+
1472+
def test_deprecate_auto_chunk(self):
1473+
original = create_test_data().chunk()
1474+
with pytest.warns(FutureWarning):
1475+
with self.roundtrip(
1476+
original, open_kwargs={'auto_chunk': True}) as actual:
1477+
for k, v in actual.variables.items():
1478+
# only index variables should be in memory
1479+
assert v._in_memory == (k in actual.dims)
1480+
# chunk size should be the same as original
1481+
assert v.chunks == original[k].chunks
1482+
1483+
with pytest.warns(FutureWarning):
1484+
with self.roundtrip(
1485+
original, open_kwargs={'auto_chunk': False}) as actual:
1486+
for k, v in actual.variables.items():
1487+
# only index variables should be in memory
1488+
assert v._in_memory == (k in actual.dims)
1489+
# there should be no chunks
1490+
assert v.chunks is None
1491+
14091492
def test_write_uneven_dask_chunks(self):
14101493
# regression for GH#2225
14111494
original = create_test_data().chunk({'dim1': 3, 'dim2': 4, 'dim3': 3})
1412-
14131495
with self.roundtrip(
1414-
original, open_kwargs={'auto_chunk': True}) as actual:
1496+
original, open_kwargs={'chunks': 'auto'}) as actual:
14151497
for k, v in actual.data_vars.items():
14161498
print(k)
14171499
assert v.chunks == actual[k].chunks

0 commit comments

Comments
 (0)