
Commit 04597a8

dcherian authored and TomNicholas committed
mfdataset, concat now support the 'join' kwarg. (#3102)
* mfdataset, concat now support the 'join' kwarg. Closes #1354
* Add whats-new.rst
* Add concat tests
* doc improvements.
* update todo.
* mfdataset tests.
* manual_combine → combine_nested
* Add tests for combine_nested & combine_coords
* Update docstring.
* lint.
1 parent 1ab7569 commit 04597a8

File tree: 8 files changed (+225 −52 lines changed)


doc/whats-new.rst

Lines changed: 2 additions & 0 deletions
@@ -40,6 +40,8 @@ New functions/methods
 Enhancements
 ~~~~~~~~~~~~
 
+- :py:func:`~xarray.concat` and :py:func:`~xarray.open_mfdataset` now support the ``join`` kwarg.
+  It is passed down to :py:func:`~xarray.align`. By `Deepak Cherian <https://github.com/dcherian>`_.
 - In :py:meth:`~xarray.Dataset.to_zarr`, passing ``mode`` is not mandatory if
   ``append_dim`` is set, as it will automatically be set to ``'a'`` internally.
   By `David Brochart <https://github.com/davidbrochart>`_.
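A minimal usage sketch of the new kwarg (not part of the commit; the toy datasets below are illustrative):

import xarray as xr

# Two datasets whose 'x' indexes only partially overlap.
ds1 = xr.Dataset({"a": ("x", [1, 2, 3])}, coords={"x": [10, 20, 30], "t": 0})
ds2 = xr.Dataset({"a": ("x", [4, 5, 6])}, coords={"x": [20, 30, 40], "t": 1})

# 'join' controls how the non-concatenated 'x' index is reconciled.
outer = xr.concat([ds1, ds2], dim="t", join="outer")  # x -> [10, 20, 30, 40], gaps filled with NaN
inner = xr.concat([ds1, ds2], dim="t", join="inner")  # x -> [20, 30], shared labels only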

xarray/backends/api.py

Lines changed: 16 additions & 4 deletions
@@ -609,7 +609,7 @@ def open_mfdataset(paths, chunks=None, concat_dim='_not_supplied',
                    compat='no_conflicts', preprocess=None, engine=None,
                    lock=None, data_vars='all', coords='different',
                    combine='_old_auto', autoclose=None, parallel=False,
-                   **kwargs):
+                   join='outer', **kwargs):
     """Open multiple files as a single dataset.
 
     If combine='by_coords' then the function ``combine_by_coords`` is used to
@@ -704,6 +704,16 @@ def open_mfdataset(paths, chunks=None, concat_dim='_not_supplied',
     parallel : bool, optional
         If True, the open and preprocess steps of this function will be
         performed in parallel using ``dask.delayed``. Default is False.
+    join : {'outer', 'inner', 'left', 'right', 'exact'}, optional
+        String indicating how to combine differing indexes
+        (excluding concat_dim) in objects
+
+        - 'outer': use the union of object indexes
+        - 'inner': use the intersection of object indexes
+        - 'left': use indexes from the first object with each dimension
+        - 'right': use indexes from the last object with each dimension
+        - 'exact': instead of aligning, raise `ValueError` when indexes to be
+          aligned are not equal
     **kwargs : optional
         Additional arguments passed on to :py:func:`xarray.open_dataset`.
 
@@ -798,18 +808,20 @@ def open_mfdataset(paths, chunks=None, concat_dim='_not_supplied',
 
             combined = auto_combine(datasets, concat_dim=concat_dim,
                                     compat=compat, data_vars=data_vars,
-                                    coords=coords, from_openmfds=True)
+                                    coords=coords, join=join,
+                                    from_openmfds=True)
         elif combine == 'nested':
             # Combined nested list by successive concat and merge operations
             # along each dimension, using structure given by "ids"
             combined = _nested_combine(datasets, concat_dims=concat_dim,
                                        compat=compat, data_vars=data_vars,
-                                       coords=coords, ids=ids)
+                                       coords=coords, ids=ids, join=join)
        elif combine == 'by_coords':
            # Redo ordering from coordinates, ignoring how they were ordered
            # previously
            combined = combine_by_coords(datasets, compat=compat,
-                                        data_vars=data_vars, coords=coords)
+                                        data_vars=data_vars, coords=coords,
+                                        join=join)
        else:
            raise ValueError("{} is an invalid option for the keyword argument"
                             " ``combine``".format(combine))

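For context, a hedged sketch of the forwarded kwarg at the user-facing entry point (the file names below are hypothetical, not from the commit):

import xarray as xr

# Hypothetical files; each is assumed to hold the same variables on a
# slightly different spatial grid.
paths = ["obs_2000.nc", "obs_2001.nc"]

# 'join' is forwarded through auto_combine/_nested_combine/combine_by_coords
# down to align(); 'exact' turns a silent outer-join into an explicit error.
ds = xr.open_mfdataset(paths, combine="nested", concat_dim="time", join="exact")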
xarray/core/combine.py

Lines changed: 56 additions & 20 deletions
@@ -136,7 +136,7 @@ def _check_shape_tile_ids(combined_tile_ids):
 
 def _combine_nd(combined_ids, concat_dims, data_vars='all',
                 coords='different', compat='no_conflicts',
-                fill_value=dtypes.NA):
+                fill_value=dtypes.NA, join='outer'):
     """
     Combines an N-dimensional structure of datasets into one by applying a
     series of either concat and merge operations along each dimension.
@@ -177,13 +177,14 @@ def _combine_nd(combined_ids, concat_dims, data_vars='all',
                                              data_vars=data_vars,
                                              coords=coords,
                                              compat=compat,
-                                             fill_value=fill_value)
+                                             fill_value=fill_value,
+                                             join=join)
     (combined_ds,) = combined_ids.values()
     return combined_ds
 
 
 def _combine_all_along_first_dim(combined_ids, dim, data_vars, coords, compat,
-                                 fill_value=dtypes.NA):
+                                 fill_value=dtypes.NA, join='outer'):
 
     # Group into lines of datasets which must be combined along dim
     # need to sort by _new_tile_id first for groupby to work
@@ -197,12 +198,13 @@ def _combine_all_along_first_dim(combined_ids, dim, data_vars, coords, compat,
         combined_ids = OrderedDict(sorted(group))
         datasets = combined_ids.values()
         new_combined_ids[new_id] = _combine_1d(datasets, dim, compat,
-                                               data_vars, coords, fill_value)
+                                               data_vars, coords, fill_value,
+                                               join)
     return new_combined_ids
 
 
 def _combine_1d(datasets, concat_dim, compat='no_conflicts', data_vars='all',
-                coords='different', fill_value=dtypes.NA):
+                coords='different', fill_value=dtypes.NA, join='outer'):
     """
     Applies either concat or merge to 1D list of datasets depending on value
     of concat_dim
@@ -211,7 +213,7 @@ def _combine_1d(datasets, concat_dim, compat='no_conflicts', data_vars='all',
     if concat_dim is not None:
         try:
             combined = concat(datasets, dim=concat_dim, data_vars=data_vars,
-                              coords=coords, fill_value=fill_value)
+                              coords=coords, fill_value=fill_value, join=join)
         except ValueError as err:
             if "encountered unexpected variable" in str(err):
                 raise ValueError("These objects cannot be combined using only "
@@ -222,7 +224,8 @@ def _combine_1d(datasets, concat_dim, compat='no_conflicts', data_vars='all',
             else:
                 raise
     else:
-        combined = merge(datasets, compat=compat, fill_value=fill_value)
+        combined = merge(datasets, compat=compat, fill_value=fill_value,
+                         join=join)
 
     return combined
 
@@ -233,7 +236,7 @@ def _new_tile_id(single_id_ds_pair):
 
 
 def _nested_combine(datasets, concat_dims, compat, data_vars, coords, ids,
-                    fill_value=dtypes.NA):
+                    fill_value=dtypes.NA, join='outer'):
 
     if len(datasets) == 0:
         return Dataset()
@@ -254,12 +257,13 @@ def _nested_combine(datasets, concat_dims, compat, data_vars, coords, ids,
     # Apply series of concatenate or merge operations along each dimension
     combined = _combine_nd(combined_ids, concat_dims, compat=compat,
                            data_vars=data_vars, coords=coords,
-                           fill_value=fill_value)
+                           fill_value=fill_value, join=join)
     return combined
 
 
 def combine_nested(datasets, concat_dim, compat='no_conflicts',
-                   data_vars='all', coords='different', fill_value=dtypes.NA):
+                   data_vars='all', coords='different', fill_value=dtypes.NA,
+                   join='outer'):
     """
     Explicitly combine an N-dimensional grid of datasets into one by using a
     succession of concat and merge operations along each dimension of the grid.
@@ -312,6 +316,16 @@ def combine_nested(datasets, concat_dim, compat='no_conflicts',
         Details are in the documentation of concat
     fill_value : scalar, optional
         Value to use for newly missing values
+    join : {'outer', 'inner', 'left', 'right', 'exact'}, optional
+        String indicating how to combine differing indexes
+        (excluding concat_dim) in objects
+
+        - 'outer': use the union of object indexes
+        - 'inner': use the intersection of object indexes
+        - 'left': use indexes from the first object with each dimension
+        - 'right': use indexes from the last object with each dimension
+        - 'exact': instead of aligning, raise `ValueError` when indexes to be
+          aligned are not equal
 
     Returns
     -------
@@ -383,15 +397,15 @@ def combine_nested(datasets, concat_dim, compat='no_conflicts',
     # The IDs argument tells _manual_combine that datasets aren't yet sorted
     return _nested_combine(datasets, concat_dims=concat_dim, compat=compat,
                            data_vars=data_vars, coords=coords, ids=False,
-                           fill_value=fill_value)
+                           fill_value=fill_value, join=join)
 
 
 def vars_as_keys(ds):
     return tuple(sorted(ds))
 
 
 def combine_by_coords(datasets, compat='no_conflicts', data_vars='all',
-                      coords='different', fill_value=dtypes.NA):
+                      coords='different', fill_value=dtypes.NA, join='outer'):
     """
     Attempt to auto-magically combine the given datasets into one by using
     dimension coordinates.
@@ -439,6 +453,16 @@ def combine_by_coords(datasets, compat='no_conflicts', data_vars='all',
         Details are in the documentation of concat
     fill_value : scalar, optional
         Value to use for newly missing values
+    join : {'outer', 'inner', 'left', 'right', 'exact'}, optional
+        String indicating how to combine differing indexes
+        (excluding concat_dim) in objects
+
+        - 'outer': use the union of object indexes
+        - 'inner': use the intersection of object indexes
+        - 'left': use indexes from the first object with each dimension
+        - 'right': use indexes from the last object with each dimension
+        - 'exact': instead of aligning, raise `ValueError` when indexes to be
+          aligned are not equal
 
     Returns
     -------
@@ -498,7 +522,7 @@ def combine_by_coords(datasets, compat='no_conflicts', data_vars='all',
         # Concatenate along all of concat_dims one by one to create single ds
         concatenated = _combine_nd(combined_ids, concat_dims=concat_dims,
                                    data_vars=data_vars, coords=coords,
-                                   fill_value=fill_value)
+                                   fill_value=fill_value, join=join)
 
         # Check the overall coordinates are monotonically increasing
         for dim in concat_dims:
@@ -511,7 +535,7 @@ def combine_by_coords(datasets, compat='no_conflicts', data_vars='all',
         concatenated_grouped_by_data_vars.append(concatenated)
 
     return merge(concatenated_grouped_by_data_vars, compat=compat,
-                 fill_value=fill_value)
+                 fill_value=fill_value, join=join)
 
 
 # Everything beyond here is only needed until the deprecation cycle in #2616
@@ -523,7 +547,7 @@ def combine_by_coords(datasets, compat='no_conflicts', data_vars='all',
 
 def auto_combine(datasets, concat_dim='_not_supplied', compat='no_conflicts',
                  data_vars='all', coords='different', fill_value=dtypes.NA,
-                 from_openmfds=False):
+                 join='outer', from_openmfds=False):
     """
     Attempt to auto-magically combine the given datasets into one.
 
@@ -571,6 +595,16 @@ def auto_combine(datasets, concat_dim='_not_supplied', compat='no_conflicts',
         Details are in the documentation of concat
     fill_value : scalar, optional
        Value to use for newly missing values
+    join : {'outer', 'inner', 'left', 'right', 'exact'}, optional
+        String indicating how to combine differing indexes
+        (excluding concat_dim) in objects
+
+        - 'outer': use the union of object indexes
+        - 'inner': use the intersection of object indexes
+        - 'left': use indexes from the first object with each dimension
+        - 'right': use indexes from the last object with each dimension
+        - 'exact': instead of aligning, raise `ValueError` when indexes to be
+          aligned are not equal
 
     Returns
     -------
@@ -629,7 +663,8 @@ def auto_combine(datasets, concat_dim='_not_supplied', compat='no_conflicts',
 
     return _old_auto_combine(datasets, concat_dim=concat_dim,
                              compat=compat, data_vars=data_vars,
-                             coords=coords, fill_value=fill_value)
+                             coords=coords, fill_value=fill_value,
+                             join=join)
 
 
 def _dimension_coords_exist(datasets):
@@ -670,7 +705,7 @@ def _requires_concat_and_merge(datasets):
 def _old_auto_combine(datasets, concat_dim=_CONCAT_DIM_DEFAULT,
                       compat='no_conflicts',
                       data_vars='all', coords='different',
-                      fill_value=dtypes.NA):
+                      fill_value=dtypes.NA, join='outer'):
     if concat_dim is not None:
         dim = None if concat_dim is _CONCAT_DIM_DEFAULT else concat_dim
 
@@ -679,16 +714,17 @@ def _old_auto_combine(datasets, concat_dim=_CONCAT_DIM_DEFAULT,
 
         concatenated = [_auto_concat(list(datasets), dim=dim,
                                      data_vars=data_vars, coords=coords,
-                                     fill_value=fill_value)
+                                     fill_value=fill_value, join=join)
                         for vars, datasets in grouped]
     else:
         concatenated = datasets
-    merged = merge(concatenated, compat=compat, fill_value=fill_value)
+    merged = merge(concatenated, compat=compat, fill_value=fill_value,
+                   join=join)
     return merged
 
 
 def _auto_concat(datasets, dim=None, data_vars='all', coords='different',
-                 fill_value=dtypes.NA):
+                 fill_value=dtypes.NA, join='outer'):
     if len(datasets) == 1 and dim is None:
         # There is nothing more to combine, so kick out early.
         return datasets[0]
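A short, hedged sketch of the combine entry points with the new kwarg (toy datasets, not from the commit):

import xarray as xr

ds1 = xr.Dataset({"a": ("x", [1, 2])}, coords={"x": [0, 1]})
ds2 = xr.Dataset({"a": ("x", [3, 4])}, coords={"x": [1, 2]})

# 'x' is not the concat dimension here, so 'join' decides how its differing
# labels are reconciled before concatenation along the new 't' dimension.
combined = xr.combine_nested([ds1, ds2], concat_dim="t", join="outer")
print(combined.x.values)  # [0 1 2]; unmatched positions are filled (NaN by default)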

xarray/core/concat.py

Lines changed: 19 additions & 8 deletions
@@ -11,7 +11,7 @@
 
 def concat(objs, dim=None, data_vars='all', coords='different',
            compat='equals', positions=None, indexers=None, mode=None,
-           concat_over=None, fill_value=dtypes.NA):
+           concat_over=None, fill_value=dtypes.NA, join='outer'):
     """Concatenate xarray objects along a new or existing dimension.
 
     Parameters
@@ -52,7 +52,7 @@ def concat(objs, dim=None, data_vars='all', coords='different',
         * 'all': All coordinate variables will be concatenated, except
           those corresponding to other dimensions.
         * list of str: The listed coordinate variables will be concatenated,
-          in addition the 'minimal' coordinates.
+          in addition to the 'minimal' coordinates.
     compat : {'equals', 'identical'}, optional
         String indicating how to compare non-concatenated variables and
         dataset global attributes for potential conflicts. 'equals' means
@@ -65,6 +65,17 @@ def concat(objs, dim=None, data_vars='all', coords='different',
         supplied, objects are concatenated in the provided order.
     fill_value : scalar, optional
         Value to use for newly missing values
+    join : {'outer', 'inner', 'left', 'right', 'exact'}, optional
+        String indicating how to combine differing indexes
+        (excluding dim) in objects
+
+        - 'outer': use the union of object indexes
+        - 'inner': use the intersection of object indexes
+        - 'left': use indexes from the first object with each dimension
+        - 'right': use indexes from the last object with each dimension
+        - 'exact': instead of aligning, raise `ValueError` when indexes to be
+          aligned are not equal
+
     indexers, mode, concat_over : deprecated
 
     Returns
@@ -76,7 +87,7 @@ def concat(objs, dim=None, data_vars='all', coords='different',
     merge
     auto_combine
     """
-    # TODO: add join and ignore_index arguments copied from pandas.concat
+    # TODO: add ignore_index arguments copied from pandas.concat
     # TODO: support concatenating scalar coordinates even if the concatenated
     # dimension already exists
     from .dataset import Dataset
@@ -116,7 +127,7 @@ def concat(objs, dim=None, data_vars='all', coords='different',
     else:
         raise TypeError('can only concatenate xarray Dataset and DataArray '
                         'objects, got %s' % type(first_obj))
-    return f(objs, dim, data_vars, coords, compat, positions, fill_value)
+    return f(objs, dim, data_vars, coords, compat, positions, fill_value, join)
 
 
 def _calc_concat_dim_coord(dim):
@@ -212,7 +223,7 @@ def process_subset_opt(opt, subset):
 
 
 def _dataset_concat(datasets, dim, data_vars, coords, compat, positions,
-                    fill_value=dtypes.NA):
+                    fill_value=dtypes.NA, join='outer'):
     """
     Concatenate a sequence of datasets along a new or existing dimension
     """
@@ -225,7 +236,7 @@ def _dataset_concat(datasets, dim, data_vars, coords, compat, positions,
     dim, coord = _calc_concat_dim_coord(dim)
     # Make sure we're working on a copy (we'll be loading variables)
     datasets = [ds.copy() for ds in datasets]
-    datasets = align(*datasets, join='outer', copy=False, exclude=[dim],
+    datasets = align(*datasets, join=join, copy=False, exclude=[dim],
                      fill_value=fill_value)
 
     concat_over, equals = _calc_concat_over(datasets, dim, data_vars, coords)
@@ -318,7 +329,7 @@ def ensure_common_dims(vars):
 
 
 def _dataarray_concat(arrays, dim, data_vars, coords, compat,
-                      positions, fill_value=dtypes.NA):
+                      positions, fill_value=dtypes.NA, join='outer'):
     arrays = list(arrays)
 
     if data_vars != 'all':
@@ -337,5 +348,5 @@ def _dataarray_concat(arrays, dim, data_vars, coords, compat,
         datasets.append(arr._to_temp_dataset())
 
     ds = _dataset_concat(datasets, dim, data_vars, coords, compat,
-                         positions, fill_value=fill_value)
+                         positions, fill_value=fill_value, join=join)
     return arrays[0]._from_temp_dataset(ds, name)
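A hedged sketch of the DataArray path and the 'exact' option (toy arrays, not from the commit). _dataarray_concat wraps each array in a temporary Dataset, so the same join-aware align() call is reached:

import xarray as xr

da1 = xr.DataArray([1, 2, 3], dims="x", coords={"x": [0, 1, 2]})
da2 = xr.DataArray([4, 5, 6], dims="x", coords={"x": [1, 2, 3]})

inner = xr.concat([da1, da2], dim="y", join="inner")  # x reduced to the shared labels [1, 2]

try:
    xr.concat([da1, da2], dim="y", join="exact")
except ValueError:
    # 'exact' skips realignment and raises because the 'x' indexes differ.
    pass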

xarray/core/merge.py

Lines changed: 8 additions & 1 deletion
@@ -530,7 +530,14 @@ def merge(objects, compat='no_conflicts', join='outer', fill_value=dtypes.NA):
         must be equal. The returned dataset then contains the combination
         of all non-null values.
     join : {'outer', 'inner', 'left', 'right', 'exact'}, optional
-        How to combine objects with different indexes.
+        String indicating how to combine differing indexes in objects.
+
+        - 'outer': use the union of object indexes
+        - 'inner': use the intersection of object indexes
+        - 'left': use indexes from the first object with each dimension
+        - 'right': use indexes from the last object with each dimension
+        - 'exact': instead of aligning, raise `ValueError` when indexes to be
+          aligned are not equal
     fill_value : scalar, optional
         Value to use for newly missing values
 
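merge already accepted join before this commit; the hunk above only expands its docstring. A small illustrative sketch of the enumerated options (toy datasets, not from the commit):

import xarray as xr

ds1 = xr.Dataset({"a": ("x", [1, 2])}, coords={"x": [0, 1]})
ds2 = xr.Dataset({"b": ("x", [3, 4])}, coords={"x": [1, 2]})

xr.merge([ds1, ds2], join="outer")  # x -> [0, 1, 2]; unmatched entries become NaN
xr.merge([ds1, ds2], join="left")   # x -> [0, 1], taken from the first object only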
