+import warnings
 from collections import OrderedDict
 from distutils.version import LooseVersion
@@ -352,10 +353,11 @@ def close(self):
             zarr.consolidate_metadata(self.ds.store)


-def open_zarr(store, group=None, synchronizer=None, auto_chunk=True,
+def open_zarr(store, group=None, synchronizer=None, chunks='auto',
               decode_cf=True, mask_and_scale=True, decode_times=True,
               concat_characters=True, decode_coords=True,
-              drop_variables=None, consolidated=False):
+              drop_variables=None, consolidated=False,
+              overwrite_encoded_chunks=False, **kwargs):
     """Load and decode a dataset from a Zarr store.

     .. note:: Experimental
@@ -375,10 +377,15 @@ def open_zarr(store, group=None, synchronizer=None, auto_chunk=True,
         Array synchronizer provided to zarr
     group : str, optional
         Group path. (a.k.a. `path` in zarr terminology.)
-    auto_chunk : bool, optional
-        Whether to automatically create dask chunks corresponding to each
-        variable's zarr chunks. If False, zarr array data will lazily convert
-        to numpy arrays upon access.
+    chunks : int or dict or tuple or {None, 'auto'}, optional
+        Chunk sizes along each dimension, e.g., ``5`` or
+        ``{'x': 5, 'y': 5}``. If `chunks='auto'`, dask chunks are created
+        based on the variable's zarr chunks. If `chunks=None`, zarr array
+        data will lazily convert to numpy arrays upon access. This accepts
+        all the chunk specifications that Dask accepts.
+    overwrite_encoded_chunks : bool, optional
+        Whether to drop the zarr chunks encoded for each variable when a
+        dataset is loaded with specified chunk sizes (default: False).
     decode_cf : bool, optional
         Whether to decode these variables, assuming they were saved according
         to CF conventions.
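A quick usage sketch of the reworked keyword (the store path and dimension names below are made up for illustration, not taken from this diff):

```python
import xarray as xr

# default: dask chunks mirror the on-disk zarr chunks
ds = xr.open_zarr('example.zarr', chunks='auto')

# explicit chunk sizes per dimension (assumes 'time' and 'x' exist in the store)
ds = xr.open_zarr('example.zarr', chunks={'time': 100, 'x': 50})

# no dask at all: variables lazily convert to numpy arrays on access
ds = xr.open_zarr('example.zarr', chunks=None)
```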
@@ -422,6 +429,24 @@ def open_zarr(store, group=None, synchronizer=None, auto_chunk=True,
     ----------
     http://zarr.readthedocs.io/
     """
+    if 'auto_chunk' in kwargs:
+        auto_chunk = kwargs.pop('auto_chunk')
+        if auto_chunk:
+            chunks = 'auto'  # maintain backwards compatibility
+        else:
+            chunks = None
+
+        warnings.warn("auto_chunk is deprecated. Use chunks='auto' instead.",
+                      FutureWarning, stacklevel=2)
+
+    if kwargs:
+        raise TypeError("open_zarr() got unexpected keyword arguments " +
+                        ",".join(kwargs.keys()))
+
+    if not isinstance(chunks, (int, dict)):
+        if chunks != 'auto' and chunks is not None:
+            raise ValueError("chunks must be an int, dict, 'auto', or None. "
+                             "Instead found %s. " % chunks)

     if not decode_cf:
         mask_and_scale = False
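For reference, a sketch of how the deprecation shim above behaves from a caller's perspective (the store path is hypothetical):

```python
import warnings
import xarray as xr

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    # the old keyword still works but is remapped to chunks='auto'
    # (or to chunks=None when auto_chunk=False is passed)
    ds = xr.open_zarr('example.zarr', auto_chunk=True)

assert any(issubclass(w.category, FutureWarning) for w in caught)
```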
@@ -449,21 +474,60 @@ def maybe_decode_store(store, lock=False):

     # auto chunking needs to be here and not in ZarrStore because variable
     # chunks do not survive decode_cf
-    if auto_chunk:
-        # adapted from Dataset.Chunk()
-        def maybe_chunk(name, var):
-            from dask.base import tokenize
-            chunks = var.encoding.get('chunks')
-            if (var.ndim > 0) and (chunks is not None):
-                # does this cause any data to be read?
-                token2 = tokenize(name, var._data)
-                name2 = 'zarr-%s' % token2
-                return var.chunk(chunks, name=name2, lock=None)
-            else:
-                return var
-
-        variables = OrderedDict([(k, maybe_chunk(k, v))
-                                 for k, v in ds.variables.items()])
-        return ds._replace_vars_and_dims(variables)
-    else:
+    # return trivial case
+    if not chunks:
         return ds
+
+    # adapted from Dataset.Chunk()
+    if isinstance(chunks, int):
+        chunks = dict.fromkeys(ds.dims, chunks)
+
+    if isinstance(chunks, tuple) and len(chunks) == len(ds.dims):
+        chunks = dict(zip(ds.dims, chunks))
+
+    def get_chunk(name, var, chunks):
+        chunk_spec = dict(zip(var.dims, var.encoding.get('chunks')))
+
+        # Coordinate labels aren't chunked
+        if var.ndim == 1 and var.dims[0] == name:
+            return chunk_spec
+
+        if chunks == 'auto':
+            return chunk_spec
+
+        for dim in var.dims:
+            if dim in chunks:
+                spec = chunks[dim]
+                if isinstance(spec, int):
+                    spec = (spec,)
+                if isinstance(spec, (tuple, list)) and chunk_spec[dim]:
+                    if any(s % chunk_spec[dim] for s in spec):
+                        warnings.warn("Specified Dask chunks %r would "
+                                      "separate Zarr chunk shape %r for "
+                                      "dimension %r. This significantly "
+                                      "degrades performance. Consider "
+                                      "rechunking after loading instead."
+                                      % (chunks[dim], chunk_spec[dim], dim),
+                                      stacklevel=2)
+                chunk_spec[dim] = chunks[dim]
+        return chunk_spec
+
+    def maybe_chunk(name, var, chunks):
+        from dask.base import tokenize
+
+        chunk_spec = get_chunk(name, var, chunks)
+
+        if (var.ndim > 0) and (chunk_spec is not None):
+            # does this cause any data to be read?
+            token2 = tokenize(name, var._data)
+            name2 = 'zarr-%s' % token2
+            var = var.chunk(chunk_spec, name=name2, lock=None)
+            if overwrite_encoded_chunks and var.chunks is not None:
+                var.encoding['chunks'] = tuple(x[0] for x in var.chunks)
+            return var
+        else:
+            return var
+
+    variables = OrderedDict([(k, maybe_chunk(k, v, chunks))
+                             for k, v in ds.variables.items()])
+    return ds._replace_vars_and_dims(variables)
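The warning in `get_chunk` fires when a requested dask chunk size is not an exact multiple of the encoded zarr chunk size, because each dask chunk would then straddle zarr chunk boundaries and force redundant reads. A minimal standalone illustration of that modulo check (dimension name and sizes are invented):

```python
import warnings


def check_alignment(requested, zarr_chunk, dim):
    # Mirror the test in get_chunk: any requested chunk length that is not
    # an exact multiple of the zarr chunk length splits zarr chunks.
    spec = (requested,) if isinstance(requested, int) else tuple(requested)
    if any(s % zarr_chunk for s in spec):
        warnings.warn("Specified Dask chunks %r would separate Zarr chunk "
                      "shape %r for dimension %r."
                      % (requested, zarr_chunk, dim))


check_alignment(100, 10, 'time')  # 100 % 10 == 0 -> no warning
check_alignment(15, 10, 'time')   # 15 % 10 != 0 -> warning
```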