
Commit 705e656

Squashed commit of the following:
commit 08f7f74 Merge: 53c0f4e 278d2e6
Author: dcherian <[email protected]>
Date:   Tue Oct 29 09:36:58 2019 -0600

    Merge remote-tracking branch 'upstream/master' into fix/dask-computes

    * upstream/master:
      upgrade black verison to 19.10b0 (pydata#3456)
      Remove outdated code related to compatibility with netcdftime (pydata#3450)

commit 53c0f4e
Author: dcherian <[email protected]>
Date:   Tue Oct 29 09:25:27 2019 -0600

    Add identity check to lazy_array_equiv

commit 5e742e4
Author: dcherian <[email protected]>
Date:   Tue Oct 29 09:22:15 2019 -0600

    update whats new

commit ee0d422 Merge: e99148e 74ca69a
Author: dcherian <[email protected]>
Date:   Tue Oct 29 09:18:38 2019 -0600

    Merge remote-tracking branch 'upstream/master' into fix/dask-computes

    * upstream/master:
      Remove deprecated behavior from dataset.drop docstring (pydata#3451)
      jupyterlab dark theme (pydata#3443)
      Drop groups associated with nans in group variable (pydata#3406)
      Allow ellipsis (...) in transpose (pydata#3421)
      Another groupby.reduce bugfix. (pydata#3403)
      add icomoon license (pydata#3448)

commit e99148e
Author: dcherian <[email protected]>
Date:   Tue Oct 29 09:17:58 2019 -0600

    add concat test

commit 4a66e7c
Author: dcherian <[email protected]>
Date:   Mon Oct 28 10:19:32 2019 -0600

    review suggestions.

commit 8739ddd
Author: dcherian <[email protected]>
Date:   Mon Oct 28 08:32:15 2019 -0600

    better docstring

commit e84cc97
Author: dcherian <[email protected]>
Date:   Sun Oct 27 20:22:13 2019 -0600

    Optimize dask array equality checks.

    Dask arrays with the same graph have the same name. We can use this
    to quickly compare dask-backed variables without computing.

    Fixes pydata#3068 and pydata#3311
1 parent 7612ecc commit 705e656
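The optimization in the bottom commit (e84cc97) relies on dask's deterministic naming: arrays built from identical task graphs get identical `.name` values, so equality can sometimes be decided without loading any data. A minimal illustration of that property, not part of the commit itself (assumes dask.array is installed; the variable names are made up):

import dask.array as da

x = da.zeros((10, 10), chunks=2)
y = da.zeros((10, 10), chunks=2)

# Same construction -> same deterministic graph name, so the arrays can be
# treated as equal without computing any values.
assert x.name == y.name

# A different graph gets a different name; equality is then undecidable
# lazily and a real (computed) comparison would be needed.
assert (x / 2).name != x.name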

File tree

6 files changed: +171 / -25 lines


doc/whats-new.rst

Lines changed: 3 additions & 0 deletions
@@ -59,6 +59,9 @@ Bug fixes
   but cloudpickle isn't (:issue:`3401`) by `Rhys Doyle <https://github.com/rdoyle45>`_
 - Fix grouping over variables with NaNs. (:issue:`2383`, :pull:`3406`).
   By `Deepak Cherian <https://github.com/dcherian>`_.
+- Use dask names to compare dask objects prior to comparing values after computation.
+  (:issue:`3068`, :issue:`3311`, :issue:`3454`, :pull:`3453`).
+  By `Deepak Cherian <https://github.com/dcherian>`_.
 - Sync with cftime by removing `dayofwk=-1` for cftime>=1.0.4.
   By `Anderson Banihirwe <https://github.com/andersy005>`_.
 - Fix :py:meth:`xarray.core.groupby.DataArrayGroupBy.reduce` and

xarray/core/concat.py

Lines changed: 16 additions & 0 deletions
@@ -2,6 +2,7 @@

 from . import dtypes, utils
 from .alignment import align
+from .duck_array_ops import lazy_array_equiv
 from .merge import _VALID_COMPAT, unique_variable
 from .variable import IndexVariable, Variable, as_variable
 from .variable import concat as concat_vars
@@ -189,6 +190,21 @@ def process_subset_opt(opt, subset):
             # all nonindexes that are not the same in each dataset
             for k in getattr(datasets[0], subset):
                 if k not in concat_over:
+                    equals[k] = None
+                    variables = [ds.variables[k] for ds in datasets]
+                    # first check without comparing values i.e. no computes
+                    for var in variables[1:]:
+                        equals[k] = getattr(variables[0], compat)(
+                            var, equiv=lazy_array_equiv
+                        )
+                        if not equals[k]:
+                            break
+
+                    if equals[k] is not None:
+                        if equals[k] is False:
+                            concat_over.add(k)
+                        continue
+
                     # Compare the variable of all datasets vs. the one
                     # of the first dataset. Perform the minimum amount of
                     # loads in order to avoid multiple loads from disk

xarray/core/duck_array_ops.py

Lines changed: 43 additions & 15 deletions
@@ -174,27 +174,53 @@ def as_shared_dtype(scalars_or_arrays):
     return [x.astype(out_type, copy=False) for x in arrays]


-def allclose_or_equiv(arr1, arr2, rtol=1e-5, atol=1e-8):
-    """Like np.allclose, but also allows values to be NaN in both arrays
+def lazy_array_equiv(arr1, arr2):
+    """Like array_equal, but doesn't actually compare values.
+    Returns True or False when equality can be determined without computing.
+    Returns None when equality cannot be determined (e.g. one or both of arr1, arr2 are numpy arrays)
     """
+    if arr1 is arr2:
+        return True
     arr1 = asarray(arr1)
     arr2 = asarray(arr2)
     if arr1.shape != arr2.shape:
         return False
-    return bool(isclose(arr1, arr2, rtol=rtol, atol=atol, equal_nan=True).all())
+    if (
+        dask_array
+        and isinstance(arr1, dask_array.Array)
+        and isinstance(arr2, dask_array.Array)
+    ):
+        # GH3068
+        if arr1.name == arr2.name:
+            return True
+    return None
+
+
+def allclose_or_equiv(arr1, arr2, rtol=1e-5, atol=1e-8):
+    """Like np.allclose, but also allows values to be NaN in both arrays
+    """
+    arr1 = asarray(arr1)
+    arr2 = asarray(arr2)
+    lazy_equiv = lazy_array_equiv(arr1, arr2)
+    if lazy_equiv is None:
+        return bool(isclose(arr1, arr2, rtol=rtol, atol=atol, equal_nan=True).all())
+    else:
+        return lazy_equiv


 def array_equiv(arr1, arr2):
     """Like np.array_equal, but also allows values to be NaN in both arrays
     """
     arr1 = asarray(arr1)
     arr2 = asarray(arr2)
-    if arr1.shape != arr2.shape:
-        return False
-    with warnings.catch_warnings():
-        warnings.filterwarnings("ignore", "In the future, 'NAT == x'")
-        flag_array = (arr1 == arr2) | (isnull(arr1) & isnull(arr2))
-        return bool(flag_array.all())
+    lazy_equiv = lazy_array_equiv(arr1, arr2)
+    if lazy_equiv is None:
+        with warnings.catch_warnings():
+            warnings.filterwarnings("ignore", "In the future, 'NAT == x'")
+            flag_array = (arr1 == arr2) | (isnull(arr1) & isnull(arr2))
+            return bool(flag_array.all())
+    else:
+        return lazy_equiv


 def array_notnull_equiv(arr1, arr2):
@@ -203,12 +229,14 @@ def array_notnull_equiv(arr1, arr2):
     """
     arr1 = asarray(arr1)
     arr2 = asarray(arr2)
-    if arr1.shape != arr2.shape:
-        return False
-    with warnings.catch_warnings():
-        warnings.filterwarnings("ignore", "In the future, 'NAT == x'")
-        flag_array = (arr1 == arr2) | isnull(arr1) | isnull(arr2)
-        return bool(flag_array.all())
+    lazy_equiv = lazy_array_equiv(arr1, arr2)
+    if lazy_equiv is None:
+        with warnings.catch_warnings():
+            warnings.filterwarnings("ignore", "In the future, 'NAT == x'")
+            flag_array = (arr1 == arr2) | isnull(arr1) | isnull(arr2)
+            return bool(flag_array.all())
+    else:
+        return lazy_equiv


 def count(data, axis=None):
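Taken together, these hunks give lazy_array_equiv a tri-state contract: True or False when equality is decidable from metadata alone, None when values must actually be compared. A rough, hedged sketch of how it behaves (not part of the commit; assumes numpy and dask are importable):

import numpy as np
import dask.array as da
from xarray.core.duck_array_ops import lazy_array_equiv

x = da.zeros((10, 10), chunks=2)

print(lazy_array_equiv(x, x))                            # True: same object
print(lazy_array_equiv(x, da.zeros((5, 5), chunks=2)))   # False: shapes differ
print(lazy_array_equiv(x, x + 1))                        # None: different dask name, must compute
print(lazy_array_equiv(np.zeros(3), np.zeros(3)))        # None: numpy inputs, must compare values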

xarray/core/merge.py

Lines changed: 13 additions & 4 deletions
@@ -19,6 +19,7 @@

 from . import dtypes, pdcompat
 from .alignment import deep_align
+from .duck_array_ops import lazy_array_equiv
 from .utils import Frozen, dict_equiv
 from .variable import Variable, as_variable, assert_unique_multiindex_level_names

@@ -123,16 +124,24 @@ def unique_variable(
         combine_method = "fillna"

     if equals is None:
-        out = out.compute()
+        # first check without comparing values i.e. no computes
         for var in variables[1:]:
-            equals = getattr(out, compat)(var)
+            equals = getattr(out, compat)(var, equiv=lazy_array_equiv)
             if not equals:
                 break

+        # now compare values with minimum number of computes
+        if not equals:
+            out = out.compute()
+            for var in variables[1:]:
+                equals = getattr(out, compat)(var)
+                if not equals:
+                    break
+
     if not equals:
         raise MergeError(
-            "conflicting values for variable {!r} on objects to be combined. "
-            "You can skip this check by specifying compat='override'.".format(name)
+            f"conflicting values for variable {name!r} on objects to be combined. "
+            "You can skip this check by specifying compat='override'."
         )

     if combine_method:

xarray/core/variable.py

Lines changed: 9 additions & 5 deletions
@@ -1231,7 +1231,9 @@ def transpose(self, *dims) -> "Variable":
             dims = self.dims[::-1]
         dims = tuple(infix_dims(dims, self.dims))
         axes = self.get_axis_num(dims)
-        if len(dims) < 2:  # no need to transpose if only one dimension
+        if len(dims) < 2 or dims == self.dims:
+            # no need to transpose if only one dimension
+            # or dims are in same order
             return self.copy(deep=False)

         data = as_indexable(self._data).transpose(axes)
@@ -1590,22 +1592,24 @@ def broadcast_equals(self, other, equiv=duck_array_ops.array_equiv):
             return False
         return self.equals(other, equiv=equiv)

-    def identical(self, other):
+    def identical(self, other, equiv=duck_array_ops.array_equiv):
         """Like equals, but also checks attributes.
         """
         try:
-            return utils.dict_equiv(self.attrs, other.attrs) and self.equals(other)
+            return utils.dict_equiv(self.attrs, other.attrs) and self.equals(
+                other, equiv=equiv
+            )
         except (TypeError, AttributeError):
             return False

-    def no_conflicts(self, other):
+    def no_conflicts(self, other, equiv=duck_array_ops.array_notnull_equiv):
         """True if the intersection of two Variable's non-null data is
         equal; otherwise false.

         Variables can thus still be equal if there are locations where either,
         or both, contain NaN values.
         """
-        return self.broadcast_equals(other, equiv=duck_array_ops.array_notnull_equiv)
+        return self.broadcast_equals(other, equiv=equiv)

     def quantile(self, q, dim=None, interpolation="linear", keep_attrs=None):
         """Compute the qth quantile of the data along the specified dimension.

xarray/tests/test_dask.py

Lines changed: 87 additions & 1 deletion
@@ -22,6 +22,7 @@
     assert_identical,
     raises_regex,
 )
+from ..core.duck_array_ops import lazy_array_equiv

 dask = pytest.importorskip("dask")
 da = pytest.importorskip("dask.array")
@@ -423,7 +424,53 @@ def test_concat_loads_variables(self):
         out.compute()
         assert kernel_call_count == 24

-        # Finally, test that riginals are unaltered
+        # Finally, test that originals are unaltered
+        assert ds1["d"].data is d1
+        assert ds1["c"].data is c1
+        assert ds2["d"].data is d2
+        assert ds2["c"].data is c2
+        assert ds3["d"].data is d3
+        assert ds3["c"].data is c3
+
+        # now check that concat() is correctly using dask name equality to skip loads
+        out = xr.concat(
+            [ds1, ds1, ds1], dim="n", data_vars="different", coords="different"
+        )
+        assert kernel_call_count == 24
+        # variables are not loaded in the output
+        assert isinstance(out["d"].data, dask.array.Array)
+        assert isinstance(out["c"].data, dask.array.Array)
+
+        out = xr.concat(
+            [ds1, ds1, ds1], dim="n", data_vars=[], coords=[], compat="identical"
+        )
+        assert kernel_call_count == 24
+        # variables are not loaded in the output
+        assert isinstance(out["d"].data, dask.array.Array)
+        assert isinstance(out["c"].data, dask.array.Array)
+
+        out = xr.concat(
+            [ds1, ds2.compute(), ds3],
+            dim="n",
+            data_vars="all",
+            coords="different",
+            compat="identical",
+        )
+        # c1,c3 must be computed for comparison since c2 is numpy;
+        # d2 is computed too
+        assert kernel_call_count == 28
+
+        out = xr.concat(
+            [ds1, ds2.compute(), ds3],
+            dim="n",
+            data_vars="all",
+            coords="all",
+            compat="identical",
+        )
+        # no extra computes
+        assert kernel_call_count == 30
+
+        # Finally, test that originals are unaltered
         assert ds1["d"].data is d1
         assert ds1["c"].data is c1
         assert ds2["d"].data is d2
@@ -1135,3 +1182,42 @@ def test_make_meta(map_ds):
     for variable in map_ds.data_vars:
         assert variable in meta.data_vars
         assert meta.data_vars[variable].shape == (0,) * meta.data_vars[variable].ndim
+
+
+def test_identical_coords_no_computes():
+    lons2 = xr.DataArray(da.zeros((10, 10), chunks=2), dims=("y", "x"))
+    a = xr.DataArray(
+        da.zeros((10, 10), chunks=2), dims=("y", "x"), coords={"lons": lons2}
+    )
+    b = xr.DataArray(
+        da.zeros((10, 10), chunks=2), dims=("y", "x"), coords={"lons": lons2}
+    )
+    with raise_if_dask_computes():
+        c = a + b
+    assert_identical(c, a)
+
+
+def test_lazy_array_equiv():
+    lons1 = xr.DataArray(da.zeros((10, 10), chunks=2), dims=("y", "x"))
+    lons2 = xr.DataArray(da.zeros((10, 10), chunks=2), dims=("y", "x"))
+    var1 = lons1.variable
+    var2 = lons2.variable
+    with raise_if_dask_computes():
+        lons1.equals(lons2)
+    with raise_if_dask_computes():
+        var1.equals(var2 / 2, equiv=lazy_array_equiv)
+    assert var1.equals(var2.compute(), equiv=lazy_array_equiv) is None
+    assert var1.compute().equals(var2.compute(), equiv=lazy_array_equiv) is None
+
+    with raise_if_dask_computes():
+        assert lons1.equals(lons1.transpose("y", "x"))
+
+    with raise_if_dask_computes():
+        for compat in [
+            "broadcast_equals",
+            "equals",
+            "override",
+            "identical",
+            "no_conflicts",
+        ]:
+            xr.merge([lons1, lons2], compat=compat)
