From 06c706e0f03cb640d8fdaf74493791a0c3ae9507 Mon Sep 17 00:00:00 2001 From: dcherian Date: Sat, 30 Nov 2019 20:14:13 -0500 Subject: [PATCH 1/6] Make dask names change when chunking Variables by different amounts. When rechunking by the current chunk size, name should not change. Add a __dask_tokenize__ method for ReprObject so that this behaviour is present when DataArrays are converted to temporary Datasets and back. --- doc/whats-new.rst | 4 ++++ xarray/core/dataset.py | 5 ++++- xarray/core/utils.py | 5 +++++ xarray/tests/test_dask.py | 18 +++++++++--------- xarray/tests/test_dataarray.py | 7 +++++++ xarray/tests/test_dataset.py | 16 ++++++++++++++++ 6 files changed, 45 insertions(+), 10 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 884c3cef91c..a332a25e841 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -33,6 +33,10 @@ New Features Bug fixes ~~~~~~~~~ +- Make sure dask names change when rechunking by different chunk sizes. Conversely, make sure they + stay the same when rechunking by the same chunk size. (:issue:`3350`) + By `Deepak Cherian `_. + Documentation ~~~~~~~~~~~~~ diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 61dde6a393b..b2524526dfd 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -1747,7 +1747,10 @@ def maybe_chunk(name, var, chunks): if not chunks: chunks = None if var.ndim > 0: - token2 = tokenize(name, token if token else var._data) + # when rechunking by different amounts, make sure dask names change + # by provinding chunks as an input to tokenize. + # subtle bugs result otherwise. see GH3350 + token2 = tokenize(name, token if token else var._data, chunks) name2 = f"{name_prefix}{name}-{token2}" return var.chunk(chunks, name=name2, lock=lock) else: diff --git a/xarray/core/utils.py b/xarray/core/utils.py index 6681375c18e..42229563e23 100644 --- a/xarray/core/utils.py +++ b/xarray/core/utils.py @@ -549,6 +549,11 @@ def __eq__(self, other) -> bool: def __hash__(self) -> int: return hash((ReprObject, self._value)) + def __dask_tokenize__(self): + from dask.base import normalize_token + + return normalize_token((type(self), self._value)) + @contextlib.contextmanager def close_on_error(f): diff --git a/xarray/tests/test_dask.py b/xarray/tests/test_dask.py index 4c1f317342f..9e98da2ed55 100644 --- a/xarray/tests/test_dask.py +++ b/xarray/tests/test_dask.py @@ -1081,7 +1081,7 @@ def func(obj): actual = xr.map_blocks(func, obj) expected = func(obj) assert_chunks_equal(expected.chunk(), actual) - xr.testing.assert_identical(actual.compute(), expected.compute()) + xr.testing.assert_identical(actual, expected) @pytest.mark.parametrize("obj", [make_da(), make_ds()]) @@ -1090,7 +1090,7 @@ def test_map_blocks_convert_args_to_list(obj): with raise_if_dask_computes(): actual = xr.map_blocks(operator.add, obj, [10]) assert_chunks_equal(expected.chunk(), actual) - xr.testing.assert_identical(actual.compute(), expected.compute()) + xr.testing.assert_identical(actual, expected) @pytest.mark.parametrize("obj", [make_da(), make_ds()]) @@ -1105,7 +1105,7 @@ def add_attrs(obj): with raise_if_dask_computes(): actual = xr.map_blocks(add_attrs, obj) - xr.testing.assert_identical(actual.compute(), expected.compute()) + xr.testing.assert_identical(actual, expected) def test_map_blocks_change_name(map_da): @@ -1118,7 +1118,7 @@ def change_name(obj): with raise_if_dask_computes(): actual = xr.map_blocks(change_name, map_da) - xr.testing.assert_identical(actual.compute(), expected.compute()) + xr.testing.assert_identical(actual, expected) @pytest.mark.parametrize("obj", [make_da(), make_ds()]) @@ -1127,7 +1127,7 @@ def test_map_blocks_kwargs(obj): with raise_if_dask_computes(): actual = xr.map_blocks(xr.full_like, obj, kwargs=dict(fill_value=np.nan)) assert_chunks_equal(expected.chunk(), actual) - xr.testing.assert_identical(actual.compute(), expected.compute()) + xr.testing.assert_identical(actual, expected) def test_map_blocks_to_array(map_ds): @@ -1135,7 +1135,7 @@ def test_map_blocks_to_array(map_ds): actual = xr.map_blocks(lambda x: x.to_array(), map_ds) # to_array does not preserve name, so cannot use assert_identical - assert_equal(actual.compute(), map_ds.to_array().compute()) + assert_equal(actual, map_ds.to_array().compute()) @pytest.mark.parametrize( @@ -1154,7 +1154,7 @@ def test_map_blocks_da_transformations(func, map_da): with raise_if_dask_computes(): actual = xr.map_blocks(func, map_da) - assert_identical(actual.compute(), func(map_da).compute()) + assert_identical(actual, func(map_da)) @pytest.mark.parametrize( @@ -1173,7 +1173,7 @@ def test_map_blocks_ds_transformations(func, map_ds): with raise_if_dask_computes(): actual = xr.map_blocks(func, map_ds) - assert_identical(actual.compute(), func(map_ds).compute()) + assert_identical(actual, func(map_ds).compute()) @pytest.mark.parametrize("obj", [make_da(), make_ds()]) @@ -1186,7 +1186,7 @@ def func(obj): expected = xr.map_blocks(func, obj) actual = obj.map_blocks(func) - assert_identical(expected.compute(), actual.compute()) + assert_identical(expected, actual) def test_make_meta(map_ds): diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index a1e34abd0d5..7e511af84b6 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -753,12 +753,19 @@ def test_chunk(self): blocked = unblocked.chunk() assert blocked.chunks == ((3,), (4,)) + first_dask_name = blocked.data.name blocked = unblocked.chunk(chunks=((2, 1), (2, 2))) assert blocked.chunks == ((2, 1), (2, 2)) + assert blocked.data.name != first_dask_name blocked = unblocked.chunk(chunks=(3, 3)) assert blocked.chunks == ((3,), (3, 1)) + assert blocked.data.name != first_dask_name + + # name doesn't change when rechunking by same amount + # this fails if ReprObject doesn't have __dask_tokenize__ defined + assert unblocked.chunk(2).data.name == unblocked.chunk(2).data.name assert blocked.load().chunks is None diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index d8282f58051..ce65d04cde1 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -936,19 +936,35 @@ def test_chunk(self): expected_chunks = {"dim1": (8,), "dim2": (9,), "dim3": (10,)} assert reblocked.chunks == expected_chunks + def get_dask_names(ds): + return {k: v.data.name for k, v in ds.items()} + + orig_dask_names = get_dask_names(reblocked) + reblocked = data.chunk({"time": 5, "dim1": 5, "dim2": 5, "dim3": 5}) # time is not a dim in any of the data_vars, so it # doesn't get chunked expected_chunks = {"dim1": (5, 3), "dim2": (5, 4), "dim3": (5, 5)} assert reblocked.chunks == expected_chunks + # make sure dask names change when rechunking by different amounts + # regression test for GH3350 + new_dask_names = get_dask_names(reblocked) + for k, v in new_dask_names.items(): + assert v != orig_dask_names[k] + reblocked = data.chunk(expected_chunks) assert reblocked.chunks == expected_chunks # reblock on already blocked data + orig_dask_names = get_dask_names(reblocked) reblocked = reblocked.chunk(expected_chunks) + new_dask_names = get_dask_names(reblocked) assert reblocked.chunks == expected_chunks assert_identical(reblocked, data) + # recuhnking with same chunk sizes should not change names + for k, v in new_dask_names.items(): + assert v == orig_dask_names[k] with raises_regex(ValueError, "some chunks"): data.chunk({"foo": 10}) From 96f47d4afb7aec38594efb0d643886c8f6d0228a Mon Sep 17 00:00:00 2001 From: dcherian Date: Sun, 1 Dec 2019 09:30:49 -0500 Subject: [PATCH 2/6] remove more computes. --- xarray/tests/test_dask.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/tests/test_dask.py b/xarray/tests/test_dask.py index 9e98da2ed55..9d638789b5f 100644 --- a/xarray/tests/test_dask.py +++ b/xarray/tests/test_dask.py @@ -1135,7 +1135,7 @@ def test_map_blocks_to_array(map_ds): actual = xr.map_blocks(lambda x: x.to_array(), map_ds) # to_array does not preserve name, so cannot use assert_identical - assert_equal(actual, map_ds.to_array().compute()) + assert_equal(actual, map_ds.to_array()) @pytest.mark.parametrize( @@ -1173,7 +1173,7 @@ def test_map_blocks_ds_transformations(func, map_ds): with raise_if_dask_computes(): actual = xr.map_blocks(func, map_ds) - assert_identical(actual, func(map_ds).compute()) + assert_identical(actual, func(map_ds)) @pytest.mark.parametrize("obj", [make_da(), make_ds()]) From cd8a283208fb3e49a28722103c6237e53b0559dc Mon Sep 17 00:00:00 2001 From: dcherian Date: Sun, 1 Dec 2019 09:30:49 -0500 Subject: [PATCH 3/6] remove more computes. --- xarray/tests/test_dask.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/xarray/tests/test_dask.py b/xarray/tests/test_dask.py index 9e98da2ed55..ab66093c25d 100644 --- a/xarray/tests/test_dask.py +++ b/xarray/tests/test_dask.py @@ -1081,7 +1081,7 @@ def func(obj): actual = xr.map_blocks(func, obj) expected = func(obj) assert_chunks_equal(expected.chunk(), actual) - xr.testing.assert_identical(actual, expected) + assert_identical(actual, expected) @pytest.mark.parametrize("obj", [make_da(), make_ds()]) @@ -1090,7 +1090,7 @@ def test_map_blocks_convert_args_to_list(obj): with raise_if_dask_computes(): actual = xr.map_blocks(operator.add, obj, [10]) assert_chunks_equal(expected.chunk(), actual) - xr.testing.assert_identical(actual, expected) + assert_identical(actual, expected) @pytest.mark.parametrize("obj", [make_da(), make_ds()]) @@ -1105,7 +1105,7 @@ def add_attrs(obj): with raise_if_dask_computes(): actual = xr.map_blocks(add_attrs, obj) - xr.testing.assert_identical(actual, expected) + assert_identical(actual, expected) def test_map_blocks_change_name(map_da): @@ -1118,7 +1118,7 @@ def change_name(obj): with raise_if_dask_computes(): actual = xr.map_blocks(change_name, map_da) - xr.testing.assert_identical(actual, expected) + assert_identical(actual, expected) @pytest.mark.parametrize("obj", [make_da(), make_ds()]) @@ -1127,7 +1127,7 @@ def test_map_blocks_kwargs(obj): with raise_if_dask_computes(): actual = xr.map_blocks(xr.full_like, obj, kwargs=dict(fill_value=np.nan)) assert_chunks_equal(expected.chunk(), actual) - xr.testing.assert_identical(actual, expected) + assert_identical(actual, expected) def test_map_blocks_to_array(map_ds): @@ -1135,7 +1135,7 @@ def test_map_blocks_to_array(map_ds): actual = xr.map_blocks(lambda x: x.to_array(), map_ds) # to_array does not preserve name, so cannot use assert_identical - assert_equal(actual, map_ds.to_array().compute()) + assert_equal(actual, map_ds.to_array()) @pytest.mark.parametrize( @@ -1173,7 +1173,7 @@ def test_map_blocks_ds_transformations(func, map_ds): with raise_if_dask_computes(): actual = xr.map_blocks(func, map_ds) - assert_identical(actual, func(map_ds).compute()) + assert_identical(actual, func(map_ds)) @pytest.mark.parametrize("obj", [make_da(), make_ds()]) From ba7cc8e4c77f10bad6df249b366843c9a4d80647 Mon Sep 17 00:00:00 2001 From: dcherian Date: Fri, 6 Dec 2019 21:31:43 -0700 Subject: [PATCH 4/6] fix whats-new --- doc/whats-new.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index ce796c4e5ee..89500818ff3 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -38,7 +38,6 @@ Bug fixes - Make sure dask names change when rechunking by different chunk sizes. Conversely, make sure they stay the same when rechunking by the same chunk size. (:issue:`3350`) By `Deepak Cherian `_. - - Fix plotting with transposed 2D non-dimensional coordinates. (:issue:`3138`, :pull:`3441`) By `Deepak Cherian `_. - Fix issue with Dask-backed datasets raising a ``KeyError`` on some computations involving ``map_blocks`` (:pull:`3598`) From 50e3f2c50eae9ccb96994882d529c3ee5327ff9e Mon Sep 17 00:00:00 2001 From: dcherian Date: Fri, 6 Dec 2019 21:33:28 -0700 Subject: [PATCH 5/6] internal change. --- doc/whats-new.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 89500818ff3..ac1b862c21f 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -35,9 +35,6 @@ New Features Bug fixes ~~~~~~~~~ -- Make sure dask names change when rechunking by different chunk sizes. Conversely, make sure they - stay the same when rechunking by the same chunk size. (:issue:`3350`) - By `Deepak Cherian `_. - Fix plotting with transposed 2D non-dimensional coordinates. (:issue:`3138`, :pull:`3441`) By `Deepak Cherian `_. - Fix issue with Dask-backed datasets raising a ``KeyError`` on some computations involving ``map_blocks`` (:pull:`3598`) @@ -63,6 +60,9 @@ Documentation Internal Changes ~~~~~~~~~~~~~~~~ +- Make sure dask names change when rechunking by different chunk sizes. Conversely, make sure they + stay the same when rechunking by the same chunk size. (:issue:`3350`) + By `Deepak Cherian `_. - 2x to 5x speed boost (on small arrays) for :py:meth:`Dataset.isel`, :py:meth:`DataArray.isel`, and :py:meth:`DataArray.__getitem__` when indexing by int, slice, list of int, scalar ndarray, or 1-dimensional ndarray. From 7154efacd7b5e3934716c85262ab7acd41712ab7 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Fri, 10 Jan 2020 15:37:01 +0000 Subject: [PATCH 6/6] Update xarray/core/utils.py Co-Authored-By: crusaderky --- xarray/core/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/utils.py b/xarray/core/utils.py index 42229563e23..e335365d5ca 100644 --- a/xarray/core/utils.py +++ b/xarray/core/utils.py @@ -547,7 +547,7 @@ def __eq__(self, other) -> bool: return False def __hash__(self) -> int: - return hash((ReprObject, self._value)) + return hash((type(self), self._value)) def __dask_tokenize__(self): from dask.base import normalize_token