From 88ee12a453af5dce948d2850b215b9d3c00e1cd0 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Mon, 5 Nov 2018 20:09:13 +0000 Subject: [PATCH 01/37] concatenates along a single dimension --- xarray/core/combine.py | 51 ++++++++++++++++++++++++++++++++++++ xarray/tests/test_combine.py | 40 ++++++++++++++++++++++++++++ 2 files changed, 91 insertions(+) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index 6853939c02d..ded6430f4aa 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -1,6 +1,7 @@ from __future__ import absolute_import, division, print_function import warnings +import toolz.itertoolz as itertoolz import pandas as pd @@ -369,6 +370,56 @@ def _auto_concat(datasets, dim=None, data_vars='all', coords='different'): _CONCAT_DIM_DEFAULT = '__infer_concat_dim__' +def _concat_nd(combined_IDs, concat_dims): + """ + Recursively concatenates an N-dimensional structure of datasets. + + No checks are performed on the consistency of the datasets, concat_dims or tile_IDs, + because it is assumed that this has already been done. + + Parameters + ---------- + combined_IDs : Dict[Tuple[int, ...]], xarray.Dataset] + Structure containing all datasets to be concatenated with "tile_IDs" as keys, which + specify position within the desired final concatenated result. + concat_dims : sequence of str + + Returns + ------- + + """ + + for dim in concat_dims: + combined_IDs = _concat_all_along_last_dim(combined_IDs, dim) + + return combined_IDs.item + + +def _concat_all_along_last_dim(combined_IDs, dim): + + grouped = itertoolz.groupby(_rest_of_tile_id, combined_IDs.items()) + + new_combined_IDs = {} + for new_ID, group in grouped.items(): + print(new_ID) + print(group) + to_concat = [ds for old_ID, ds in group] + print(to_concat) + + new_combined_IDs[new_ID] = concat(to_concat, dim) + + return new_combined_IDs + + +def _rest_of_tile_id(single_id_ds_pair): + + # probably replace with something like lambda x: x[0][1:] + + tile_id, ds = single_id_ds_pair + tile_id_except_first_element = tile_id[1:] + return tile_id_except_first_element + + def auto_combine(datasets, concat_dim=_CONCAT_DIM_DEFAULT, compat='no_conflicts', diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index 2004b1e660f..7edd00dced5 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -1,6 +1,7 @@ from __future__ import absolute_import, division, print_function from copy import deepcopy +from itertools import product import numpy as np import pandas as pd @@ -8,6 +9,7 @@ from xarray import DataArray, Dataset, Variable, auto_combine, concat from xarray.core.pycompat import OrderedDict, iteritems +from xarray.core.combine import _rest_of_tile_id, _concat_all_along_last_dim from . 
import ( InaccessibleArray, assert_array_equal, assert_equal, assert_identical, @@ -396,3 +398,41 @@ def test_auto_combine_no_concat(self): 'y': (('baz', 'z'), [[1, 2]])}, {'baz': [100]}) assert_identical(expected, actual) + + +@pytest.fixture(scope='module') +def create_combined_ids(): + return _create_combined_ids + + +def _create_combined_ids(shape): + tile_ids = _create_tile_ids(shape) + return {tile_id: create_test_data(0) for tile_id in tile_ids} + + +def _create_tile_ids(shape): + tile_ids = product(*(range(i) for i in shape)) + return list(tile_ids) + + +class TestConcatND(object): + def test_get_tile_ids(self, create_combined_ids): + shape = (1, 2, 3) + combined_ids = _create_combined_ids(shape) + print(combined_ids.keys()) + + for combined, tile_id in zip(combined_ids.items(), _create_tile_ids(shape)): + expected_new_tile_id = tile_id[1:] + assert _rest_of_tile_id(combined) == expected_new_tile_id + + def test_concat_once(self, create_combined_ids): + shape = (2,) + combined_ids = _create_combined_ids(shape) + print(combined_ids) + + result = _concat_all_along_last_dim(combined_ids, 'dim1') + print('-------------------') + print(result[()]) + expected_ds = concat([create_test_data(0), create_test_data(0)], 'dim1') + print(expected_ds) + assert_equal(result[()], expected_ds) From 1aaa0756f6b4d0e865744ab94bf39b55ec5aab02 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Tue, 6 Nov 2018 12:42:34 +0000 Subject: [PATCH 02/37] Wrote function to find correct tile_IDs from nested list of datasets --- xarray/core/combine.py | 81 +++++++++++++++++++++++++++--------- xarray/testing.py | 8 ++++ xarray/tests/__init__.py | 2 +- xarray/tests/test_combine.py | 64 ++++++++++++++++++++++++++-- 4 files changed, 130 insertions(+), 25 deletions(-) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index ded6430f4aa..d6e2aef6456 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -367,21 +367,69 @@ def _auto_concat(datasets, dim=None, data_vars='all', coords='different'): return concat(datasets, dim=dim, data_vars=data_vars, coords=coords) -_CONCAT_DIM_DEFAULT = '__infer_concat_dim__' +def _infer_tile_ids_from_nested_list(entry, current_pos, combined_tile_ids): + """ + Given a list of lists (of lists...) of datasets, returns a dictionary + with the index of each dataset in the nested list structure as the key. + + Recursively traverses the given structure, while keeping track of the current + position. + + Parameters + ---------- + entry : list[list[xarray.Dataset, xarray.Dataset, ...]] + List of lists of arbitrary depth, containing datasets in the order they + are to be concatenated. + + Returns + ------- + combined_tile_ids : dict[tuple(int, ...), xarray.Dataset] + """ + + from .dataset import Dataset + + if isinstance(entry, list): + # Check if list is redundant + if len(entry) == 1: + raise TypeError('Redundant list nesting at ' + 'position ' + str(current_pos)) + + # Dive down tree + current_pos.append(0) + for i, item in enumerate(entry): + current_pos[-1] = i + combined_tile_ids = _infer_tile_ids_from_nested_list(item, current_pos, + combined_tile_ids) + # Move back up tree + del current_pos[-1] + return combined_tile_ids + + elif isinstance(entry, Dataset): + # Termination condition + combined_tile_ids[tuple(current_pos)] = entry + return combined_tile_ids + + else: + raise TypeError("Element at position " + str(current_pos) + + " is neither a list nor an xarray.Dataset") + + +def _check_shape_tile_ids(combined_tile_ids): + ... 
def _concat_nd(combined_IDs, concat_dims): """ Recursively concatenates an N-dimensional structure of datasets. - No checks are performed on the consistency of the datasets, concat_dims or tile_IDs, - because it is assumed that this has already been done. + No checks are performed on the consistency of the datasets, concat_dims or + tile_IDs, because it is assumed that this has already been done. Parameters ---------- combined_IDs : Dict[Tuple[int, ...]], xarray.Dataset] - Structure containing all datasets to be concatenated with "tile_IDs" as keys, which - specify position within the desired final concatenated result. + Structure containing all datasets to be concatenated with "tile_IDs" as + keys, which specify position within the desired final concatenated result. concat_dims : sequence of str Returns @@ -390,34 +438,27 @@ def _concat_nd(combined_IDs, concat_dims): """ for dim in concat_dims: - combined_IDs = _concat_all_along_last_dim(combined_IDs, dim) + combined_IDs = _concat_all_along_first_dim(combined_IDs, dim) return combined_IDs.item -def _concat_all_along_last_dim(combined_IDs, dim): - - grouped = itertoolz.groupby(_rest_of_tile_id, combined_IDs.items()) - +def _concat_all_along_first_dim(combined_IDs, dim): + grouped = itertoolz.groupby(_tile_id_except_first_element, combined_IDs.items()) new_combined_IDs = {} for new_ID, group in grouped.items(): - print(new_ID) - print(group) to_concat = [ds for old_ID, ds in group] - print(to_concat) - new_combined_IDs[new_ID] = concat(to_concat, dim) - return new_combined_IDs -def _rest_of_tile_id(single_id_ds_pair): - +def _tile_id_except_first_element(single_id_ds_pair): # probably replace with something like lambda x: x[0][1:] - tile_id, ds = single_id_ds_pair - tile_id_except_first_element = tile_id[1:] - return tile_id_except_first_element + return tile_id[1:] + + +_CONCAT_DIM_DEFAULT = '__infer_concat_dim__' def auto_combine(datasets, diff --git a/xarray/testing.py b/xarray/testing.py index ee5a54cd7dc..03c5354cdff 100644 --- a/xarray/testing.py +++ b/xarray/testing.py @@ -138,3 +138,11 @@ def assert_allclose(a, b, rtol=1e-05, atol=1e-08, decode_bytes=True): else: raise TypeError('{} not supported by assertion comparison' .format(type(a))) + + +def assert_combined_tile_ids_equal(dict1, dict2): + assert len(dict1) == len(dict2) + for k, v in dict1.items(): + assert k in dict2.keys() + assert_equal(dict1[k], dict2[k]) + diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py index a45f71bbc3b..cd66ad82356 100644 --- a/xarray/tests/__init__.py +++ b/xarray/tests/__init__.py @@ -15,7 +15,7 @@ from xarray.core import utils from xarray.core.indexing import ExplicitlyIndexed from xarray.testing import (assert_equal, assert_identical, # noqa: F401 - assert_allclose) + assert_allclose, assert_combined_tile_ids_equal) from xarray.plot.utils import import_seaborn try: diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index 7edd00dced5..3ec602cb6f9 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -9,11 +9,13 @@ from xarray import DataArray, Dataset, Variable, auto_combine, concat from xarray.core.pycompat import OrderedDict, iteritems -from xarray.core.combine import _rest_of_tile_id, _concat_all_along_last_dim +from xarray.core.combine import ( + _tile_id_except_first_element, _concat_all_along_first_dim, + _infer_tile_ids_from_nested_list) from . 
import ( InaccessibleArray, assert_array_equal, assert_equal, assert_identical, - raises_regex, requires_dask) + assert_combined_tile_ids_equal, raises_regex, requires_dask) from .test_dataset import create_test_data @@ -400,6 +402,55 @@ def test_auto_combine_no_concat(self): assert_identical(expected, actual) +class TestTileIDsFromNestedList(object): + # TODO test ordering is correct by seeding datasets differently + def test_1d(self): + ds = create_test_data(0) + input = [ds, ds] + + expected = {(0,): ds, (1,): ds} + actual = _infer_tile_ids_from_nested_list(input, [], {}) + assert_combined_tile_ids_equal(expected, actual) + + def test_2d(self): + ds = create_test_data(0) + input = [[ds, ds], [ds, ds], [ds, ds]] + + expected = {(0, 0): ds, (0, 1): ds, + (1, 0): ds, (1, 1): ds, + (2, 0): ds, (2, 1): ds} + actual = _infer_tile_ids_from_nested_list(input, [], {}) + assert_combined_tile_ids_equal(expected, actual) + + def test_3d(self): + ds = create_test_data(0) + input = [[[ds, ds], [ds, ds], [ds, ds]], + [[ds, ds], [ds, ds], [ds, ds]]] + + expected = {(0, 0, 0): ds, (0, 0, 1): ds, + (0, 1, 0): ds, (0, 1, 1): ds, + (0, 2, 0): ds, (0, 2, 1): ds, + (1, 0, 0): ds, (1, 0, 1): ds, + (1, 1, 0): ds, (1, 1, 1): ds, + (1, 2, 0): ds, (1, 2, 1): ds} + actual = _infer_tile_ids_from_nested_list(input, [], {}) + assert_combined_tile_ids_equal(expected, actual) + + def test_redundant_nesting_gotcha(self): + ds = create_test_data(0) + input = [[ds], [ds]] + + expected = {(0,): ds, (1,): ds} + with pytest.raises(TypeError): + actual = _infer_tile_ids_from_nested_list(input, [], {}) + + def test_bad_element(self): + ds = create_test_data() + input = [ds, 'bad_element'] + with pytest.raises(TypeError): + _infer_tile_ids_from_nested_list(input, [], {}) + + @pytest.fixture(scope='module') def create_combined_ids(): return _create_combined_ids @@ -423,16 +474,21 @@ def test_get_tile_ids(self, create_combined_ids): for combined, tile_id in zip(combined_ids.items(), _create_tile_ids(shape)): expected_new_tile_id = tile_id[1:] - assert _rest_of_tile_id(combined) == expected_new_tile_id + assert _tile_id_except_first_element(combined) == expected_new_tile_id def test_concat_once(self, create_combined_ids): shape = (2,) combined_ids = _create_combined_ids(shape) print(combined_ids) - result = _concat_all_along_last_dim(combined_ids, 'dim1') + result = _concat_all_along_first_dim(combined_ids, 'dim1') print('-------------------') print(result[()]) expected_ds = concat([create_test_data(0), create_test_data(0)], 'dim1') print(expected_ds) assert_equal(result[()], expected_ds) + + @pytest.mark.skip + def test_concat_twice(self, create_combined_ids): + shape = (2, 3) + combined_ids = _create_combined_ids(shape) \ No newline at end of file From dbb371d7209011180d4621b77e93e8b1d70452da Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Wed, 7 Nov 2018 11:41:42 +0000 Subject: [PATCH 03/37] Wrote function to check that combined_tile_ids structure is valid --- xarray/core/combine.py | 23 ++++++++++++++--- xarray/tests/test_combine.py | 48 +++++++++++++++++++++++++++++++++--- 2 files changed, 64 insertions(+), 7 deletions(-) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index d6e2aef6456..37cdc3e7f12 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -392,7 +392,7 @@ def _infer_tile_ids_from_nested_list(entry, current_pos, combined_tile_ids): # Check if list is redundant if len(entry) == 1: raise TypeError('Redundant list nesting at ' - 'position ' + str(current_pos)) + 'position ' + 
str(tuple(current_pos))) # Dive down tree current_pos.append(0) @@ -410,12 +410,29 @@ def _infer_tile_ids_from_nested_list(entry, current_pos, combined_tile_ids): return combined_tile_ids else: - raise TypeError("Element at position " + str(current_pos) + + raise TypeError("Element at position " + str(tuple(current_pos)) + " is neither a list nor an xarray.Dataset") def _check_shape_tile_ids(combined_tile_ids): - ... + # TODO create custom exception class instead of using asserts? + + tile_ids = combined_tile_ids.keys() + + # Check all tuples are the same length + lengths = [len(id) for id in tile_ids] + assert set(lengths) == {lengths[0]} + + # Check each dimension has indices 0 to n represented with no gaps + for dim in range(lengths[0]): + indices = [id[dim] for id in tile_ids] + assert len(indices) > 1 + assert sorted(indices) == range(max(indices)) + + # Check only datasets are contained + from .dataset import Dataset + for v in combined_tile_ids.values(): + assert isinstance(v, Dataset) def _concat_nd(combined_IDs, concat_dims): diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index 3ec602cb6f9..91742864436 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -11,7 +11,7 @@ from xarray.core.pycompat import OrderedDict, iteritems from xarray.core.combine import ( _tile_id_except_first_element, _concat_all_along_first_dim, - _infer_tile_ids_from_nested_list) + _infer_tile_ids_from_nested_list, _check_shape_tile_ids) from . import ( InaccessibleArray, assert_array_equal, assert_equal, assert_identical, @@ -402,6 +402,14 @@ def test_auto_combine_no_concat(self): assert_identical(expected, actual) +# TODO get this fixture to work!! +@pytest.fixture(scope='module') +def ds(): + def _create_test_data(s=0): + return create_test_data(seed=s) + return _create_test_data + + class TestTileIDsFromNestedList(object): # TODO test ordering is correct by seeding datasets differently def test_1d(self): @@ -440,9 +448,8 @@ def test_redundant_nesting_gotcha(self): ds = create_test_data(0) input = [[ds], [ds]] - expected = {(0,): ds, (1,): ds} with pytest.raises(TypeError): - actual = _infer_tile_ids_from_nested_list(input, [], {}) + _infer_tile_ids_from_nested_list(input, [], {}) def test_bad_element(self): ds = create_test_data() @@ -450,6 +457,14 @@ def test_bad_element(self): with pytest.raises(TypeError): _infer_tile_ids_from_nested_list(input, [], {}) + def test_ragged_input(self): + ds = create_test_data(0) + input = [ds, [ds, ds]] + + expected = {(0,): ds, (1, 0): ds, (1, 1): ds} + actual = _infer_tile_ids_from_nested_list(input, [], {}) + assert_combined_tile_ids_equal(expected, actual) + @pytest.fixture(scope='module') def create_combined_ids(): @@ -491,4 +506,29 @@ def test_concat_once(self, create_combined_ids): @pytest.mark.skip def test_concat_twice(self, create_combined_ids): shape = (2, 3) - combined_ids = _create_combined_ids(shape) \ No newline at end of file + combined_ids = _create_combined_ids(shape) + + +class TestCheckShapeTileIDs(object): + def test_check_lengths(self): + ds = create_test_data(0) + combined_tile_ids = {(0,): ds, (0, 1): ds} + with pytest.raises(AssertionError): + _check_shape_tile_ids(combined_tile_ids) + + def test_check_non_zero_length_along_all_dims(self): + ds = create_test_data(0) + combined_tile_ids = {(0, 0): ds, (1, 0): ds} + with pytest.raises(AssertionError): + _check_shape_tile_ids(combined_tile_ids) + + def test_check_linearity(self): + ds = create_test_data(0) + combined_tile_ids = {(0,): ds, (2,): 
ds} + with pytest.raises(AssertionError): + _check_shape_tile_ids(combined_tile_ids) + + def test_check_contains_datasets(self): + combined_tile_ids = {(0,): 'a', (1,): 'b'} + with pytest.raises(AssertionError): + _check_shape_tile_ids(combined_tile_ids) From cc4d7438320d67931d5e5f4d655922d3f129ee2c Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Wed, 7 Nov 2018 16:20:07 +0000 Subject: [PATCH 04/37] Added test of 2d-concatenation --- xarray/core/combine.py | 10 +++++++--- xarray/tests/test_combine.py | 26 ++++++++++++++------------ 2 files changed, 21 insertions(+), 15 deletions(-) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index 37cdc3e7f12..4fa98a70858 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -457,19 +457,23 @@ def _concat_nd(combined_IDs, concat_dims): for dim in concat_dims: combined_IDs = _concat_all_along_first_dim(combined_IDs, dim) - return combined_IDs.item + combined_ds = combined_IDs[()] + + return combined_ds def _concat_all_along_first_dim(combined_IDs, dim): - grouped = itertoolz.groupby(_tile_id_except_first_element, combined_IDs.items()) + grouped = itertoolz.groupby(_new_tile_id, combined_IDs.items()) new_combined_IDs = {} + + # TODO Would there be any point in parallelizing this concatenation step? for new_ID, group in grouped.items(): to_concat = [ds for old_ID, ds in group] new_combined_IDs[new_ID] = concat(to_concat, dim) return new_combined_IDs -def _tile_id_except_first_element(single_id_ds_pair): +def _new_tile_id(single_id_ds_pair): # probably replace with something like lambda x: x[0][1:] tile_id, ds = single_id_ds_pair return tile_id[1:] diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index 91742864436..392df28ddd2 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -10,8 +10,8 @@ from xarray import DataArray, Dataset, Variable, auto_combine, concat from xarray.core.pycompat import OrderedDict, iteritems from xarray.core.combine import ( - _tile_id_except_first_element, _concat_all_along_first_dim, - _infer_tile_ids_from_nested_list, _check_shape_tile_ids) + _new_tile_id, _concat_all_along_first_dim, + _infer_tile_ids_from_nested_list, _check_shape_tile_ids, _concat_nd) from . 
import ( InaccessibleArray, assert_array_equal, assert_equal, assert_identical, @@ -485,28 +485,30 @@ class TestConcatND(object): def test_get_tile_ids(self, create_combined_ids): shape = (1, 2, 3) combined_ids = _create_combined_ids(shape) - print(combined_ids.keys()) for combined, tile_id in zip(combined_ids.items(), _create_tile_ids(shape)): expected_new_tile_id = tile_id[1:] - assert _tile_id_except_first_element(combined) == expected_new_tile_id + assert _new_tile_id(combined) == expected_new_tile_id def test_concat_once(self, create_combined_ids): shape = (2,) combined_ids = _create_combined_ids(shape) - print(combined_ids) - + ds = create_test_data(0) result = _concat_all_along_first_dim(combined_ids, 'dim1') - print('-------------------') - print(result[()]) - expected_ds = concat([create_test_data(0), create_test_data(0)], 'dim1') - print(expected_ds) - assert_equal(result[()], expected_ds) - @pytest.mark.skip + expected_ds = concat([ds, ds], 'dim1') + assert_combined_tile_ids_equal(result, {(): expected_ds}) + def test_concat_twice(self, create_combined_ids): shape = (2, 3) combined_ids = _create_combined_ids(shape) + result = _concat_nd(combined_ids, concat_dims=['dim1', 'dim2']) + + ds = create_test_data(0) + partway = concat([ds, ds], dim='dim1') + expected = concat([partway, partway, partway], dim='dim2') + + assert_equal(result, expected) class TestCheckShapeTileIDs(object): From d2fc7e723fe4b99032f9ac4445fb3fc4404b4477 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Thu, 8 Nov 2018 08:53:11 +0000 Subject: [PATCH 05/37] Tests now check that dataset ordering is correct --- xarray/tests/test_combine.py | 68 ++++++++++++++++-------------------- 1 file changed, 31 insertions(+), 37 deletions(-) diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index 392df28ddd2..f7372165eb2 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -402,45 +402,36 @@ def test_auto_combine_no_concat(self): assert_identical(expected, actual) -# TODO get this fixture to work!! 
-@pytest.fixture(scope='module') -def ds(): - def _create_test_data(s=0): - return create_test_data(seed=s) - return _create_test_data - - class TestTileIDsFromNestedList(object): - # TODO test ordering is correct by seeding datasets differently def test_1d(self): - ds = create_test_data(0) - input = [ds, ds] + ds = create_test_data + input = [ds(0), ds(1)] - expected = {(0,): ds, (1,): ds} + expected = {(0,): ds(0), (1,): ds(1)} actual = _infer_tile_ids_from_nested_list(input, [], {}) assert_combined_tile_ids_equal(expected, actual) def test_2d(self): - ds = create_test_data(0) - input = [[ds, ds], [ds, ds], [ds, ds]] + ds = create_test_data + input = [[ds(0), ds(1)], [ds(2), ds(3)], [ds(4), ds(5)]] - expected = {(0, 0): ds, (0, 1): ds, - (1, 0): ds, (1, 1): ds, - (2, 0): ds, (2, 1): ds} + expected = {(0, 0): ds(0), (0, 1): ds(1), + (1, 0): ds(2), (1, 1): ds(3), + (2, 0): ds(4), (2, 1): ds(5)} actual = _infer_tile_ids_from_nested_list(input, [], {}) assert_combined_tile_ids_equal(expected, actual) def test_3d(self): - ds = create_test_data(0) - input = [[[ds, ds], [ds, ds], [ds, ds]], - [[ds, ds], [ds, ds], [ds, ds]]] - - expected = {(0, 0, 0): ds, (0, 0, 1): ds, - (0, 1, 0): ds, (0, 1, 1): ds, - (0, 2, 0): ds, (0, 2, 1): ds, - (1, 0, 0): ds, (1, 0, 1): ds, - (1, 1, 0): ds, (1, 1, 1): ds, - (1, 2, 0): ds, (1, 2, 1): ds} + ds = create_test_data + input = [[[ds(0), ds(1)], [ds(2), ds(3)], [ds(4), ds(5)]], + [[ds(6), ds(7)], [ds(8), ds(9)], [ds(10), ds(11)]]] + + expected = {(0, 0, 0): ds(0), (0, 0, 1): ds(1), + (0, 1, 0): ds(2), (0, 1, 1): ds(3), + (0, 2, 0): ds(4), (0, 2, 1): ds(5), + (1, 0, 0): ds(6), (1, 0, 1): ds(7), + (1, 1, 0): ds(8), (1, 1, 1): ds(9), + (1, 2, 0): ds(10), (1, 2, 1): ds(11)} actual = _infer_tile_ids_from_nested_list(input, [], {}) assert_combined_tile_ids_equal(expected, actual) @@ -452,16 +443,16 @@ def test_redundant_nesting_gotcha(self): _infer_tile_ids_from_nested_list(input, [], {}) def test_bad_element(self): - ds = create_test_data() + ds = create_test_data(0) input = [ds, 'bad_element'] with pytest.raises(TypeError): _infer_tile_ids_from_nested_list(input, [], {}) def test_ragged_input(self): - ds = create_test_data(0) - input = [ds, [ds, ds]] + ds = create_test_data + input = [ds(0), [ds(1), ds(2)]] - expected = {(0,): ds, (1, 0): ds, (1, 1): ds} + expected = {(0,): ds(0), (1, 0): ds(1), (1, 1): ds(2)} actual = _infer_tile_ids_from_nested_list(input, [], {}) assert_combined_tile_ids_equal(expected, actual) @@ -473,7 +464,8 @@ def create_combined_ids(): def _create_combined_ids(shape): tile_ids = _create_tile_ids(shape) - return {tile_id: create_test_data(0) for tile_id in tile_ids} + nums = range(len(tile_ids)) + return {tile_id: create_test_data(num) for tile_id, num in zip(tile_ids, nums)} def _create_tile_ids(shape): @@ -493,10 +485,10 @@ def test_get_tile_ids(self, create_combined_ids): def test_concat_once(self, create_combined_ids): shape = (2,) combined_ids = _create_combined_ids(shape) - ds = create_test_data(0) + ds = create_test_data result = _concat_all_along_first_dim(combined_ids, 'dim1') - expected_ds = concat([ds, ds], 'dim1') + expected_ds = concat([ds(0), ds(1)], 'dim1') assert_combined_tile_ids_equal(result, {(): expected_ds}) def test_concat_twice(self, create_combined_ids): @@ -504,9 +496,11 @@ def test_concat_twice(self, create_combined_ids): combined_ids = _create_combined_ids(shape) result = _concat_nd(combined_ids, concat_dims=['dim1', 'dim2']) - ds = create_test_data(0) - partway = concat([ds, ds], dim='dim1') - expected = 
concat([partway, partway, partway], dim='dim2')
+        ds = create_test_data
+        partway1 = concat([ds(0), ds(3)], dim='dim1')
+        partway2 = concat([ds(1), ds(4)], dim='dim1')
+        partway3 = concat([ds(2), ds(5)], dim='dim1')
+        expected = concat([partway1, partway2, partway3], dim='dim2')

         assert_equal(result, expected)

From e3f3699a76d2ed2f88605ddd0cc2b46d6f89baa4 Mon Sep 17 00:00:00 2001
From: Thomas Nicholas
Date: Thu, 8 Nov 2018 09:11:17 +0000
Subject: [PATCH 06/37] Test concatenation along a new dimension

---
 xarray/tests/test_combine.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py
index f7372165eb2..f171f998a7e 100644
--- a/xarray/tests/test_combine.py
+++ b/xarray/tests/test_combine.py
@@ -482,13 +482,14 @@ def test_get_tile_ids(self, create_combined_ids):
             expected_new_tile_id = tile_id[1:]
             assert _new_tile_id(combined) == expected_new_tile_id

-    def test_concat_once(self, create_combined_ids):
+    @pytest.mark.parametrize("concat_dim", ['dim1', 'new_dim'])
+    def test_concat_once(self, create_combined_ids, concat_dim):
         shape = (2,)
         combined_ids = _create_combined_ids(shape)
         ds = create_test_data
-        result = _concat_all_along_first_dim(combined_ids, 'dim1')
+        result = _concat_all_along_first_dim(combined_ids, dim=concat_dim)

-        expected_ds = concat([ds(0), ds(1)], 'dim1')
+        expected_ds = concat([ds(0), ds(1)], dim=concat_dim)
         assert_combined_tile_ids_equal(result, {(): expected_ds})

     def test_concat_twice(self, create_combined_ids):

From 55bf6853bb41e5c8de1da749859acd9e9aa1e0b9 Mon Sep 17 00:00:00 2001
From: Thomas Nicholas
Date: Fri, 9 Nov 2018 09:14:32 +0000
Subject: [PATCH 07/37] Started generalising auto_combine to N-D by integrating the N-D concatenation algorithm

---
 xarray/core/combine.py | 131 ++++++++++++++++++++++++++++++-----------
 1 file changed, 97 insertions(+), 34 deletions(-)

diff --git a/xarray/core/combine.py b/xarray/core/combine.py
index 4fa98a70858..b7de0d15e0d 100644
--- a/xarray/core/combine.py
+++ b/xarray/core/combine.py
@@ -367,6 +367,30 @@ def _auto_concat(datasets, dim=None, data_vars='all', coords='different'):
     return concat(datasets, dim=dim, data_vars=data_vars, coords=coords)


+_CONCAT_DIM_DEFAULT = '__infer_concat_dim__'
+
+
+def _infer_concat_order_from_nested_list(datasets, concat_dims):
+
+    # TODO check that datasets is a list containing multiple elements
+
+    combined_ids = _infer_tile_ids_from_nested_list(datasets, [], {})
+
+    # Currently if concat_dims is not supplied then _auto_concat attempts to deduce it on every call
+    # TODO would be faster in this case to just work out the concat_dims once here
+    tile_id, ds = combined_ids[0]
+    n_dims = len(tile_id)
+    if concat_dims is None:
+        concat_dims = [_CONCAT_DIM_DEFAULT]*n_dims
+    else:
+        if len(concat_dims) != n_dims:
+            raise ValueError("concat_dims is of length " + str(len(concat_dims))
+                             + " but the datasets passed are nested in a "
+                             + str(n_dims) + "-dimensional structure")
+
+    return concat_dims, combined_ids
+
+
 def _infer_tile_ids_from_nested_list(entry, current_pos, combined_tile_ids):
     """
     Given a list of lists (of lists...) 
of datasets, returns a dictionary @@ -435,9 +459,15 @@ def _check_shape_tile_ids(combined_tile_ids): assert isinstance(v, Dataset) -def _concat_nd(combined_IDs, concat_dims): +def _data_vars(combined_id): + id, ds = combined_id + return tuple(sorted(ds.data_vars)) + + +def _combine_nd(combined_IDs, concat_dims, data_vars='all', + coords='different', compat='no_conflicts'): """ - Recursively concatenates an N-dimensional structure of datasets. + Concatenates and merges an N-dimensional structure of datasets. No checks are performed on the consistency of the datasets, concat_dims or tile_IDs, because it is assumed that this has already been done. @@ -446,31 +476,34 @@ def _concat_nd(combined_IDs, concat_dims): ---------- combined_IDs : Dict[Tuple[int, ...]], xarray.Dataset] Structure containing all datasets to be concatenated with "tile_IDs" as - keys, which specify position within the desired final concatenated result. + keys, which specify position within the desired final combined result. concat_dims : sequence of str + The dimensions along which the datasets should be concatenated. Must be + in order, and the length must match Returns ------- """ - for dim in concat_dims: - combined_IDs = _concat_all_along_first_dim(combined_IDs, dim) - - combined_ds = combined_IDs[()] + # Organise by data variables + grouped_by_data_vars = itertoolz.groupby(_data_vars, + combined_IDs.items()).values() + concatenated_datasets = [] + for tiled_datasets in grouped_by_data_vars: + concatenated_ids = tiled_datasets - return combined_ds + # Perform N-D dimensional concatenation + for concat_dim in concat_dims: + dim = None if concat_dim is _CONCAT_DIM_DEFAULT else concat_dim + concatenated_ids = _concat_along_first_dim(concatenated_ids, + dim=dim, + data_vars=data_vars, + coords=coords) + concatenated_datasets.append(concatenated_ids.values()) -def _concat_all_along_first_dim(combined_IDs, dim): - grouped = itertoolz.groupby(_new_tile_id, combined_IDs.items()) - new_combined_IDs = {} - - # TODO Would there be any point in parallelizing this concatenation step? - for new_ID, group in grouped.items(): - to_concat = [ds for old_ID, ds in group] - new_combined_IDs[new_ID] = concat(to_concat, dim) - return new_combined_IDs + return merge(concatenated_datasets, compat=compat) def _new_tile_id(single_id_ds_pair): @@ -479,13 +512,25 @@ def _new_tile_id(single_id_ds_pair): return tile_id[1:] -_CONCAT_DIM_DEFAULT = '__infer_concat_dim__' +def _concat_along_first_dim(combined_IDs, dim, data_vars='all', + coords='different'): + grouped = itertoolz.groupby(_new_tile_id, combined_IDs.items()) + new_combined_IDs = {} + + # TODO Would there be any point in parallelizing this concatenation step? + for new_ID, group in grouped.items(): + to_concat = [ds for old_ID, ds in group] + new_combined_IDs[new_ID] = _auto_concat(to_concat, dim=dim, + data_vars=data_vars, + coords=coords) + return new_combined_IDs def auto_combine(datasets, - concat_dim=_CONCAT_DIM_DEFAULT, + concat_dims=_CONCAT_DIM_DEFAULT, compat='no_conflicts', - data_vars='all', coords='different'): + data_vars='all', coords='different', + infer_order_from_coords=True): """Attempt to auto-magically combine the given datasets into one. This method attempts to combine a list of datasets into a single entity by @@ -504,10 +549,10 @@ def auto_combine(datasets, ---------- datasets : sequence of xarray.Dataset Dataset objects to merge. 
- concat_dim : str or DataArray or Index, optional - Dimension along which to concatenate variables, as used by + concat_dims : list of str or DataArray or Index, optional + Dimensions along which to concatenate variables, as used by :py:func:`xarray.concat`. You only need to provide this argument if - the dimension along which you want to concatenate is not a dimension + the dimensions along which you want to concatenate is not a dimension in the original datasets, e.g., if you want to stack a collection of 2D arrays along a third dimension. By default, xarray attempts to infer this argument by examining @@ -528,8 +573,14 @@ def auto_combine(datasets, of all non-null values. data_vars : {'minimal', 'different', 'all' or list of str}, optional Details are in the documentation of concat - coords : {'minimal', 'different', 'all' o list of str}, optional + coords : {'minimal', 'different', 'all' or list of str}, optional Details are in the documentation of concat + infer_order_from_coords : bool, optional + If true attempt to deduce the order in which the datasets should be + concatenated from their coordinates. To do this the coordinates should + be monotonic along the dimension to be concatenated. + If false instead read the order from the structure the datasets are + supplied in. This structure should be a nested list of lists. Returns ------- @@ -540,15 +591,27 @@ def auto_combine(datasets, concat Dataset.merge """ - from toolz import itertoolz - if concat_dim is not None: - dim = None if concat_dim is _CONCAT_DIM_DEFAULT else concat_dim - grouped = itertoolz.groupby(lambda ds: tuple(sorted(ds.data_vars)), - datasets).values() - concatenated = [_auto_concat(ds, dim=dim, - data_vars=data_vars, coords=coords) - for ds in grouped] + if concat_dims is not None: + + # TODO this could be where we would optionally check alignment, as in #2039 + + # Organise datasets in concatentation order in N-D + if infer_order_from_coords: + # TODO Use coordinates to determine tile_ID for each dataset in N-D + # i.e. (shoyer's (1) from discussion in #2159) + raise NotImplementedError + else: + # Determine tile_IDs by structure of input in N-D (i.e. 
ordering in list-of-lists) + concat_dims, combined_ids = _infer_concat_order_from_nested_list(datasets, concat_dims) + + # Check that the combined_ids are sensible + _check_shape_tile_ids(combined_ids) + + # Repeatedly concatenate then merge along each dimension + combined = _combine_nd(combined_ids, concat_dims, compat=compat, + data_vars=data_vars, coords=coords) else: + # Case of no concatenation wanted concatenated = datasets - merged = merge(concatenated, compat=compat) - return merged + combined = merge(concatenated, compat=compat) + return combined From 845206c96da16e2dd44e04f2b76450904c17965f Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Fri, 9 Nov 2018 18:58:07 +0000 Subject: [PATCH 08/37] All unit tests now passing --- xarray/backends/api.py | 6 +- xarray/core/combine.py | 87 +++++++++++++-------------- xarray/tests/test_backends.py | 4 +- xarray/tests/test_combine.py | 107 ++++++++++++++++++++++++---------- 4 files changed, 120 insertions(+), 84 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index ca440872d73..d3c5efb6ec2 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -480,7 +480,7 @@ def close(self): _CONCAT_DIM_DEFAULT = '__infer_concat_dim__' -def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT, +def open_mfdataset(paths, chunks=None, concat_dims=_CONCAT_DIM_DEFAULT, compat='no_conflicts', preprocess=None, engine=None, lock=None, data_vars='all', coords='different', autoclose=None, parallel=False, **kwargs): @@ -620,11 +620,11 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT, # close datasets in case of a ValueError try: - if concat_dim is _CONCAT_DIM_DEFAULT: + if concat_dims is _CONCAT_DIM_DEFAULT: combined = auto_combine(datasets, compat=compat, data_vars=data_vars, coords=coords) else: - combined = auto_combine(datasets, concat_dim=concat_dim, + combined = auto_combine(datasets, concat_dims=concat_dims, compat=compat, data_vars=data_vars, coords=coords) except ValueError: diff --git a/xarray/core/combine.py b/xarray/core/combine.py index b7de0d15e0d..67b514c6e3c 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -372,23 +372,21 @@ def _auto_concat(datasets, dim=None, data_vars='all', coords='different'): def _infer_concat_order_from_nested_list(datasets, concat_dims): - # TODO check that datasets is a list containing multiple elements - combined_ids = _infer_tile_ids_from_nested_list(datasets, [], {}) # Currently if concat_dims is not supplied then _auto_concat attempts to deduce it on every call - # TODO would be faster in this case to just work out the concat_dims once here - tile_id, ds = combined_ids[0] + # TODO might be faster in this case to just work out the concat_dims once here + tile_id, ds = list(combined_ids.items())[0] n_dims = len(tile_id) - if concat_dims is None: + if concat_dims is None or concat_dims == _CONCAT_DIM_DEFAULT: concat_dims = [_CONCAT_DIM_DEFAULT]*n_dims else: if len(concat_dims) != n_dims: - raise ValueError("concat_dims is of length " + str(len(concat_dims)) + raise ValueError("concat_dims has length " + str(len(concat_dims)) + " but the datasets passed are nested in a " + str(n_dims) + "-dimensional structure") - return concat_dims, combined_ids + return combined_ids, concat_dims def _infer_tile_ids_from_nested_list(entry, current_pos, combined_tile_ids): @@ -396,8 +394,8 @@ def _infer_tile_ids_from_nested_list(entry, current_pos, combined_tile_ids): Given a list of lists (of lists...) 
of datasets, returns a dictionary with the index of each dataset in the nested list structure as the key. - Recursively traverses the given structure, while keeping track of the current - position. + Recursively traverses the given structure, while keeping track of the + current position. Parameters ---------- @@ -413,17 +411,13 @@ def _infer_tile_ids_from_nested_list(entry, current_pos, combined_tile_ids): from .dataset import Dataset if isinstance(entry, list): - # Check if list is redundant - if len(entry) == 1: - raise TypeError('Redundant list nesting at ' - 'position ' + str(tuple(current_pos))) - - # Dive down tree + # Dive down tree and recursively open the next list current_pos.append(0) for i, item in enumerate(entry): current_pos[-1] = i - combined_tile_ids = _infer_tile_ids_from_nested_list(item, current_pos, - combined_tile_ids) + combined_tile_ids = _infer_tile_ids_from_nested_list\ + (item, current_pos, combined_tile_ids) + # Move back up tree del current_pos[-1] return combined_tile_ids @@ -435,11 +429,13 @@ def _infer_tile_ids_from_nested_list(entry, current_pos, combined_tile_ids): else: raise TypeError("Element at position " + str(tuple(current_pos)) + - " is neither a list nor an xarray.Dataset") + " is of type " + str(type(entry)) + ", which is " + "neither a list nor an xarray.Dataset") def _check_shape_tile_ids(combined_tile_ids): # TODO create custom exception class instead of using asserts? + # Is this function even necessary? tile_ids = combined_tile_ids.keys() @@ -447,12 +443,6 @@ def _check_shape_tile_ids(combined_tile_ids): lengths = [len(id) for id in tile_ids] assert set(lengths) == {lengths[0]} - # Check each dimension has indices 0 to n represented with no gaps - for dim in range(lengths[0]): - indices = [id[dim] for id in tile_ids] - assert len(indices) > 1 - assert sorted(indices) == range(max(indices)) - # Check only datasets are contained from .dataset import Dataset for v in combined_tile_ids.values(): @@ -489,11 +479,16 @@ def _combine_nd(combined_IDs, concat_dims, data_vars='all', # Organise by data variables grouped_by_data_vars = itertoolz.groupby(_data_vars, combined_IDs.items()).values() + concatenated_datasets = [] - for tiled_datasets in grouped_by_data_vars: - concatenated_ids = tiled_datasets + for tiled_datasets_group in grouped_by_data_vars: + + # Convert list of tuples back into a dictionary + concatenated_ids = dict(tiled_datasets_group) # Perform N-D dimensional concatenation + # Each iteration of this loop reduces the length of the tile_IDs tuples + # by one. It always removes the first for concat_dim in concat_dims: dim = None if concat_dim is _CONCAT_DIM_DEFAULT else concat_dim @@ -501,13 +496,13 @@ def _combine_nd(combined_IDs, concat_dims, data_vars='all', dim=dim, data_vars=data_vars, coords=coords) - concatenated_datasets.append(concatenated_ids.values()) - + concatenated_datasets = concatenated_datasets \ + + list(concatenated_ids.values()) return merge(concatenated_datasets, compat=compat) def _new_tile_id(single_id_ds_pair): - # probably replace with something like lambda x: x[0][1:] + # TODO maybe replace with something like lambda x: x[0][1:]? tile_id, ds = single_id_ds_pair return tile_id[1:] @@ -530,20 +525,12 @@ def auto_combine(datasets, concat_dims=_CONCAT_DIM_DEFAULT, compat='no_conflicts', data_vars='all', coords='different', - infer_order_from_coords=True): + infer_order_from_coords=False): """Attempt to auto-magically combine the given datasets into one. 
- This method attempts to combine a list of datasets into a single entity by - inspecting metadata and using a combination of concat and merge. - - It does not concatenate along more than one dimension or sort data under - any circumstances. It does align coordinates, but different variables on - datasets can cause it to fail under some scenarios. In complex cases, you - may need to clean up your data and use ``concat``/``merge`` explicitly. - - ``auto_combine`` works well if you have N years of data and M data - variables, and each combination of a distinct time period and set of data - variables is saved its own dataset. + This method attempts to combine a list (or nested list of lists) of + datasets into a single entity by inspecting metadata and using a + combination of concat and merge. Parameters ---------- @@ -552,9 +539,9 @@ def auto_combine(datasets, concat_dims : list of str or DataArray or Index, optional Dimensions along which to concatenate variables, as used by :py:func:`xarray.concat`. You only need to provide this argument if - the dimensions along which you want to concatenate is not a dimension - in the original datasets, e.g., if you want to stack a collection of - 2D arrays along a third dimension. + any of the dimensions along which you want to concatenate are not a + dimension in the original datasets, e.g., if you want to stack a + collection of 2D arrays along a third dimension. By default, xarray attempts to infer this argument by examining component files. Set ``concat_dim=None`` explicitly to disable concatenation. @@ -591,18 +578,24 @@ def auto_combine(datasets, concat Dataset.merge """ + + # TODO perform some of the checks from _calc_concat_dim_coord on concat_dims here? + if concat_dims is not None: - # TODO this could be where we would optionally check alignment, as in #2039 + # TODO this could be where we would optionally check alignment, as in #2039? # Organise datasets in concatentation order in N-D if infer_order_from_coords: # TODO Use coordinates to determine tile_ID for each dataset in N-D # i.e. (shoyer's (1) from discussion in #2159) raise NotImplementedError + # Once this is implemented I think it should be the default else: - # Determine tile_IDs by structure of input in N-D (i.e. ordering in list-of-lists) - concat_dims, combined_ids = _infer_concat_order_from_nested_list(datasets, concat_dims) + # Determine tile_IDs by structure of input in N-D + # (i.e. 
ordering in list-of-lists) + combined_ids, concat_dims = _infer_concat_order_from_nested_list\ + (datasets, concat_dims) # Check that the combined_ids are sensible _check_shape_tile_ids(combined_ids) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index fb9c43c0165..0edc7db4980 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2234,7 +2234,7 @@ def test_open_mfdataset_concat_dim_none(self): data = Dataset({'x': 0}) data.to_netcdf(tmp1) Dataset({'x': np.nan}).to_netcdf(tmp2) - with open_mfdataset([tmp1, tmp2], concat_dim=None) as actual: + with open_mfdataset([tmp1, tmp2], concat_dims=None) as actual: assert_identical(data, actual) def test_open_dataset(self): @@ -2261,7 +2261,7 @@ def test_open_single_dataset(self): {'baz': [100]}) with create_tmp_file() as tmp: original.to_netcdf(tmp) - with open_mfdataset([tmp], concat_dim=dim) as actual: + with open_mfdataset([tmp], concat_dims=[dim]) as actual: assert_identical(expected, actual) def test_dask_roundtrip(self): diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index f171f998a7e..8b6912e98d5 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -10,8 +10,9 @@ from xarray import DataArray, Dataset, Variable, auto_combine, concat from xarray.core.pycompat import OrderedDict, iteritems from xarray.core.combine import ( - _new_tile_id, _concat_all_along_first_dim, - _infer_tile_ids_from_nested_list, _check_shape_tile_ids, _concat_nd) + _new_tile_id, _concat_along_first_dim, + _infer_concat_order_from_nested_list, _infer_tile_ids_from_nested_list, + _check_shape_tile_ids, _combine_nd) from . import ( InaccessibleArray, assert_array_equal, assert_equal, assert_identical, @@ -308,6 +309,9 @@ def test_auto_combine(self): expected = Dataset({'x': [0, 1]}) assert_identical(expected, actual) + actual = auto_combine(actual) + assert_identical(expected, actual) + actual = auto_combine([actual]) assert_identical(expected, actual) @@ -354,7 +358,7 @@ def test_auto_combine_previously_failed(self): expected = Dataset({'a': (('t', 'x'), [[np.nan, 2, 3], [1, 2, np.nan]])}, {'x': [0, 1, 2]}) - actual = auto_combine(datasets, concat_dim='t') + actual = auto_combine(datasets, concat_dims=['t']) assert_identical(expected, actual) @requires_dask # only for toolz @@ -379,14 +383,14 @@ def test_auto_combine_no_concat(self): assert_identical(expected, actual) data = Dataset({'x': 0}) - actual = auto_combine([data, data, data], concat_dim=None) + actual = auto_combine([data, data, data], concat_dims=None) assert_identical(data, actual) # Single object, with a concat_dim explicitly provided # Test the issue reported in GH #1988 objs = [Dataset({'x': 0, 'y': 1})] dim = DataArray([100], name='baz', dims='baz') - actual = auto_combine(objs, concat_dim=dim) + actual = auto_combine(objs, concat_dims=[dim]) expected = Dataset({'x': ('baz', [0]), 'y': ('baz', [1])}, {'baz': [100]}) assert_identical(expected, actual) @@ -395,7 +399,7 @@ def test_auto_combine_no_concat(self): # expected for non-scalar values, too. 
objs = [Dataset({'x': ('z', [0, 1]), 'y': ('z', [1, 2])})] dim = DataArray([100], name='baz', dims='baz') - actual = auto_combine(objs, concat_dim=dim) + actual = auto_combine(objs, concat_dims=[dim]) expected = Dataset({'x': (('baz', 'z'), [[0, 1]]), 'y': (('baz', 'z'), [[1, 2]])}, {'baz': [100]}) @@ -435,20 +439,41 @@ def test_3d(self): actual = _infer_tile_ids_from_nested_list(input, [], {}) assert_combined_tile_ids_equal(expected, actual) - def test_redundant_nesting_gotcha(self): + def test_single_dataset(self): ds = create_test_data(0) - input = [[ds], [ds]] + input = [ds] - with pytest.raises(TypeError): - _infer_tile_ids_from_nested_list(input, [], {}) + expected = {(0,): ds} + actual = _infer_tile_ids_from_nested_list(input, [], {}) + assert_combined_tile_ids_equal(expected, actual) - def test_bad_element(self): + def test_redundant_nesting(self): + ds = create_test_data + input = [[ds(0)], [ds(1)]] + + expected = {(0, 0): ds(0), (1, 0): ds(1)} + actual = _infer_tile_ids_from_nested_list(input, [], {}) + assert_combined_tile_ids_equal(expected, actual) + + @pytest.mark.parametrize("bad_element", ['a', 2]) + def test_bad_element(self, bad_element): ds = create_test_data(0) - input = [ds, 'bad_element'] - with pytest.raises(TypeError): + input = [ds, bad_element] + with raises_regex(TypeError, 'Element at position .* is of type .*, ' + 'which is neither a list nor an ' + 'xarray.Dataset'): _infer_tile_ids_from_nested_list(input, [], {}) + def test_ignore_empty_list(self): + ds = create_test_data(0) + input = [ds, []] + expected = {(0,): ds} + actual = _infer_tile_ids_from_nested_list(input, [], {}) + assert_combined_tile_ids_equal(expected, actual) + def test_ragged_input(self): + # Auto_combine won't work on ragged input + # but this is just to increase test coverage ds = create_test_data input = [ds(0), [ds(1), ds(2)]] @@ -456,6 +481,15 @@ def test_ragged_input(self): actual = _infer_tile_ids_from_nested_list(input, [], {}) assert_combined_tile_ids_equal(expected, actual) + def test_infer_from_datasets(self): + ds = create_test_data + input = [ds(0), ds(1)] + + expected = {(0,): ds(0), (1,): ds(1)} + actual, concat_dims = _infer_concat_order_from_nested_list\ + (input, ['dim1']) + assert_combined_tile_ids_equal(expected, actual) + @pytest.fixture(scope='module') def create_combined_ids(): @@ -473,10 +507,10 @@ def _create_tile_ids(shape): return list(tile_ids) -class TestConcatND(object): - def test_get_tile_ids(self, create_combined_ids): +class TestCombineND(object): + def test_get_new_tile_ids(self, create_combined_ids): shape = (1, 2, 3) - combined_ids = _create_combined_ids(shape) + combined_ids = create_combined_ids(shape) for combined, tile_id in zip(combined_ids.items(), _create_tile_ids(shape)): expected_new_tile_id = tile_id[1:] @@ -485,17 +519,17 @@ def test_get_tile_ids(self, create_combined_ids): @pytest.mark.parametrize("concat_dim", ['dim1', 'new_dim']) def test_concat_once(self, create_combined_ids, concat_dim): shape = (2,) - combined_ids = _create_combined_ids(shape) + combined_ids = create_combined_ids(shape) ds = create_test_data - result = _concat_all_along_first_dim(combined_ids, dim=concat_dim) + result = _concat_along_first_dim(combined_ids, dim=concat_dim) expected_ds = concat([ds(0), ds(1)], dim=concat_dim) assert_combined_tile_ids_equal(result, {(): expected_ds}) def test_concat_twice(self, create_combined_ids): shape = (2, 3) - combined_ids = _create_combined_ids(shape) - result = _concat_nd(combined_ids, concat_dims=['dim1', 'dim2']) + combined_ids = 
create_combined_ids(shape) + result = _combine_nd(combined_ids, concat_dims=['dim1', 'dim2']) ds = create_test_data partway1 = concat([ds(0), ds(3)], dim='dim1') @@ -513,19 +547,28 @@ def test_check_lengths(self): with pytest.raises(AssertionError): _check_shape_tile_ids(combined_tile_ids) - def test_check_non_zero_length_along_all_dims(self): - ds = create_test_data(0) - combined_tile_ids = {(0, 0): ds, (1, 0): ds} - with pytest.raises(AssertionError): - _check_shape_tile_ids(combined_tile_ids) - - def test_check_linearity(self): - ds = create_test_data(0) - combined_tile_ids = {(0,): ds, (2,): ds} - with pytest.raises(AssertionError): - _check_shape_tile_ids(combined_tile_ids) - def test_check_contains_datasets(self): combined_tile_ids = {(0,): 'a', (1,): 'b'} with pytest.raises(AssertionError): _check_shape_tile_ids(combined_tile_ids) + + +class TestAutoCombineND(object): + # TODO there should be a lot more tests in here testing different cases + + def test_auto_combine_2d(self): + ds = create_test_data + + partway1 = concat([ds(0), ds(3)], dim='dim1') + partway2 = concat([ds(1), ds(4)], dim='dim1') + partway3 = concat([ds(2), ds(5)], dim='dim1') + expected = concat([partway1, partway2, partway3], dim='dim2') + + datasets = [[ds(0), ds(1), ds(2)], [ds(3), ds(4), ds(5)]] + result = auto_combine(datasets, concat_dims=['dim1', 'dim2']) + + assert_equal(result, expected) + + def test_ragged_input(self): + # TODO should throw an informative error if you try this + ... From f4e9aad81f154d5edbc0fc422678d6fa9025c0cf Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Sat, 10 Nov 2018 12:53:12 +0000 Subject: [PATCH 09/37] Fixed a failing test which I didn't notice because I don't have pseudoNetCDF --- xarray/tests/test_backends.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 0edc7db4980..d9109c4b67f 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2625,7 +2625,7 @@ def test_uamiv_format_mfread(self): ['example.uamiv', 'example.uamiv'], engine='pseudonetcdf', - concat_dim='TSTEP', + concat_dims=['TSTEP'], backend_kwargs={'format': 'uamiv'}) data1 = np.arange(20, dtype='f').reshape(1, 1, 4, 5) From 00004a14304d461c8784fca75a4c848cccbc7011 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Wed, 14 Nov 2018 12:44:00 +0000 Subject: [PATCH 10/37] Began updating open_mfdataset to handle N-D input --- xarray/backends/api.py | 44 ++++++++++++++++++++++++++++++++++-------- xarray/core/combine.py | 11 ++++++----- 2 files changed, 42 insertions(+), 13 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index d3c5efb6ec2..89aa989624b 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -10,7 +10,8 @@ from .. import Dataset, backends, conventions from ..core import indexing -from ..core.combine import auto_combine +from ..core.combine import (_infer_concat_order_from_positions, _combine_nd,\ + _check_shape_tile_ids, merge) from ..core.pycompat import basestring, path_type from ..core.utils import close_on_error, is_remote_uri, is_grib_path from .common import ArrayWriter @@ -483,6 +484,7 @@ def close(self): def open_mfdataset(paths, chunks=None, concat_dims=_CONCAT_DIM_DEFAULT, compat='no_conflicts', preprocess=None, engine=None, lock=None, data_vars='all', coords='different', + infer_order_from_coords=False, autoclose=None, parallel=False, **kwargs): """Open multiple files as a single dataset. 
@@ -502,7 +504,7 @@ def open_mfdataset(paths, chunks=None, concat_dims=_CONCAT_DIM_DEFAULT, By default, chunks will be chosen to load entire input files into memory at once. This has a major impact on performance: please see the full documentation for more details [2]. - concat_dim : None, str, DataArray or Index, optional + concat_dims : None, str, DataArray or Index, optional Dimension to concatenate files along. This argument is passed on to :py:func:`xarray.auto_combine` along with the dataset objects. You only need to provide this argument if the dimension along which you want to @@ -561,6 +563,12 @@ def open_mfdataset(paths, chunks=None, concat_dims=_CONCAT_DIM_DEFAULT, those corresponding to other dimensions. * list of str: The listed coordinate variables will be concatenated, in addition the 'minimal' coordinates. + infer_order_from_coords : bool, optional + If true attempt to deduce the order in which the datasets should be + concatenated from their coordinates. To do this the coordinates should + be monotonic along the dimension to be concatenated. + If false instead read the order from the structure the datasets are + supplied in. This structure should be a nested list of lists. parallel : bool, optional If True, the open and preprocess steps of this function will be performed in parallel using ``dask.delayed``. Default is False. @@ -594,6 +602,11 @@ def open_mfdataset(paths, chunks=None, concat_dims=_CONCAT_DIM_DEFAULT, if not paths: raise IOError('no files to open') + # If infer_order_from_coords=True then this is uneccessary, but quick as + # it will just loop over one list + combined_ids_paths, concat_dims = _infer_concat_order_from_positions(paths, concat_dims) # Use an OrderedDict? + ids, paths = list(combined_ids_paths.keys()), list(combined_ids_paths.values()) # Is this in order?? 
+ open_kwargs = dict(engine=engine, chunks=chunks or {}, lock=lock, autoclose=autoclose, **kwargs) @@ -620,13 +633,28 @@ def open_mfdataset(paths, chunks=None, concat_dims=_CONCAT_DIM_DEFAULT, # close datasets in case of a ValueError try: - if concat_dims is _CONCAT_DIM_DEFAULT: - combined = auto_combine(datasets, compat=compat, - data_vars=data_vars, coords=coords) + # TODO refactor this section to avoid duplicating any logic with auto_combine + if concat_dims is not None: + # Arrange datasets for concatenation + if infer_order_from_coords: + # Use coordinates to determine tile_ID for each dataset in N-D + # Ignore how they were ordered previously + raise NotImplementedError + # combined_ids, concat_dims = _infer_tile_ids_from_coords(datasets, concat_dims) + else: + # Already sorted so just use the ids already determined from the input shape + combined_ids = dict(zip(ids, datasets)) + + # Check that the combined_ids are sensible + _check_shape_tile_ids(combined_ids) + + # Repeatedly concatenate then merge along each dimension + combined = _combine_nd(combined_ids, concat_dims, compat=compat, + data_vars=data_vars, coords=coords) else: - combined = auto_combine(datasets, concat_dims=concat_dims, - compat=compat, - data_vars=data_vars, coords=coords) + # Case of no concatenation wanted + concatenated = datasets + combined = merge(concatenated, compat=compat) except ValueError: for ds in datasets: ds.close() diff --git a/xarray/core/combine.py b/xarray/core/combine.py index 67b514c6e3c..a05d0ac31ee 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -370,7 +370,7 @@ def _auto_concat(datasets, dim=None, data_vars='all', coords='different'): _CONCAT_DIM_DEFAULT = '__infer_concat_dim__' -def _infer_concat_order_from_nested_list(datasets, concat_dims): +def _infer_concat_order_from_positions(datasets, concat_dims): combined_ids = _infer_tile_ids_from_nested_list(datasets, [], {}) @@ -585,24 +585,25 @@ def auto_combine(datasets, # TODO this could be where we would optionally check alignment, as in #2039? - # Organise datasets in concatentation order in N-D + # Arrange datasets for concatenation if infer_order_from_coords: # TODO Use coordinates to determine tile_ID for each dataset in N-D # i.e. (shoyer's (1) from discussion in #2159) raise NotImplementedError # Once this is implemented I think it should be the default + # combined_ids, concat_dims = _infer_tile_ids_from_coords(datasets, concat_dims) else: # Determine tile_IDs by structure of input in N-D # (i.e. 
ordering in list-of-lists) - combined_ids, concat_dims = _infer_concat_order_from_nested_list\ - (datasets, concat_dims) + combined_ids, concat_dims = _infer_concat_order_from_positions \ + (datasets, concat_dims) # Check that the combined_ids are sensible _check_shape_tile_ids(combined_ids) # Repeatedly concatenate then merge along each dimension combined = _combine_nd(combined_ids, concat_dims, compat=compat, - data_vars=data_vars, coords=coords) + data_vars=data_vars, coords=coords) else: # Case of no concatenation wanted concatenated = datasets From b41e37494dea8224c544b04533222c9bfdc7f6a6 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Wed, 14 Nov 2018 13:58:45 +0000 Subject: [PATCH 11/37] Refactored to remove duplicate logic in open_mfdataset & auto_combine --- xarray/backends/api.py | 42 +++++++++--------------- xarray/core/combine.py | 73 +++++++++++++++++++++++++----------------- 2 files changed, 59 insertions(+), 56 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 89aa989624b..672e43e9e58 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -10,8 +10,7 @@ from .. import Dataset, backends, conventions from ..core import indexing -from ..core.combine import (_infer_concat_order_from_positions, _combine_nd,\ - _check_shape_tile_ids, merge) +from ..core.combine import _infer_concat_order_from_positions, _auto_combine from ..core.pycompat import basestring, path_type from ..core.utils import close_on_error, is_remote_uri, is_grib_path from .common import ArrayWriter @@ -602,8 +601,10 @@ def open_mfdataset(paths, chunks=None, concat_dims=_CONCAT_DIM_DEFAULT, if not paths: raise IOError('no files to open') - # If infer_order_from_coords=True then this is uneccessary, but quick as - # it will just loop over one list + # If infer_order_from_coords=True then this is uneccessary, but that's fine + # as it should be quick - in this case it will just loop over one list + # If infer_order_from_coords=False then this creates a flat list which is + # easier to iterate over, while saving the originally-supplied structure combined_ids_paths, concat_dims = _infer_concat_order_from_positions(paths, concat_dims) # Use an OrderedDict? ids, paths = list(combined_ids_paths.keys()), list(combined_ids_paths.values()) # Is this in order?? 
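At this point the nested list of paths has been flattened into paired ids and paths lists, and the datasets opened from those paths are later zipped back onto their tile IDs. A minimal standalone sketch of that round trip (illustrative only: plain strings stand in for file paths, dicts stand in for opened datasets, and the helper below mirrors, but is not, the patched traversal code):

    def tile_ids_from_nested_list(entry, current_pos=()):
        # Walk a nested list of arbitrary depth, yielding (tile_id, leaf)
        # pairs, where tile_id records the position of each leaf
        if isinstance(entry, list):
            for i, item in enumerate(entry):
                for pair in tile_ids_from_nested_list(item, current_pos + (i,)):
                    yield pair
        else:
            yield current_pos, entry

    # A 2x2 grid of hypothetical file names
    paths = [['up_left.nc', 'up_right.nc'],
             ['down_left.nc', 'down_right.nc']]

    combined_ids_paths = dict(tile_ids_from_nested_list(paths))
    ids = list(combined_ids_paths.keys())          # [(0, 0), (0, 1), (1, 0), (1, 1)]
    flat_paths = list(combined_ids_paths.values())

    # Stand-in for opening each file; in open_mfdataset these become Datasets
    datasets = [{'source': p} for p in flat_paths]

    # Re-attach each dataset to its position in the hypercube
    combined_ids = dict(zip(ids, datasets))
    assert combined_ids[(1, 0)] == {'source': 'down_left.nc'}

As for the "# Is this in order??" comment above: for a dict that is not modified in between, Python does guarantee that keys() and values() iterate in corresponding order, so the zip re-pairs each dataset with the tile ID inferred from its path.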
@@ -631,30 +632,17 @@ def open_mfdataset(paths, chunks=None, concat_dims=_CONCAT_DIM_DEFAULT, # the underlying datasets will still be stored as dask arrays datasets, file_objs = dask.compute(datasets, file_objs) - # close datasets in case of a ValueError + # Close datasets in case of a ValueError try: - # TODO refactor this section to avoid duplicating any logic with auto_combine - if concat_dims is not None: - # Arrange datasets for concatenation - if infer_order_from_coords: - # Use coordinates to determine tile_ID for each dataset in N-D - # Ignore how they were ordered previously - raise NotImplementedError - # combined_ids, concat_dims = _infer_tile_ids_from_coords(datasets, concat_dims) - else: - # Already sorted so just use the ids already determined from the input shape - combined_ids = dict(zip(ids, datasets)) - - # Check that the combined_ids are sensible - _check_shape_tile_ids(combined_ids) - - # Repeatedly concatenate then merge along each dimension - combined = _combine_nd(combined_ids, concat_dims, compat=compat, - data_vars=data_vars, coords=coords) - else: - # Case of no concatenation wanted - concatenated = datasets - combined = merge(concatenated, compat=compat) + if infer_order_from_coords: + # Discard ordering because it should be redone from coordinates + ids = False + + combined = _auto_combine(datasets, concat_dims=concat_dims, + compat=compat, + data_vars=data_vars, coords=coords, + infer_order_from_coords=infer_order_from_coords, + ids=ids) except ValueError: for ds in datasets: ds.close() diff --git a/xarray/core/combine.py b/xarray/core/combine.py index a05d0ac31ee..caa6d805232 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -512,7 +512,6 @@ def _concat_along_first_dim(combined_IDs, dim, data_vars='all', grouped = itertoolz.groupby(_new_tile_id, combined_IDs.items()) new_combined_IDs = {} - # TODO Would there be any point in parallelizing this concatenation step? for new_ID, group in grouped.items(): to_concat = [ds for old_ID, ds in group] new_combined_IDs[new_ID] = _auto_concat(to_concat, dim=dim, @@ -521,6 +520,45 @@ def _concat_along_first_dim(combined_IDs, dim, data_vars='all', return new_combined_IDs +def _auto_combine(datasets, concat_dims, compat, data_vars, coords, + infer_order_from_coords, ids): + """ + This function decides if any concatenation is necessary, and if so it calls + the logic to decide their concatenation order before concatenating. + """ + + if concat_dims is not None: + # Arrange datasets for concatenation + if infer_order_from_coords: + # Use coordinates to determine tile_ID for each dataset in N-D + # Ignore how they were ordered previously + raise NotImplementedError + # combined_ids, concat_dims = _infer_tile_ids_from_coords(datasets, concat_dims) + else: + # Use information from the shape of the user input + if not ids: + # Determine tile_IDs by structure of input in N-D + # (i.e. 
ordering in list-of-lists) + combined_ids, concat_dims = _infer_concat_order_from_positions\ + (datasets, concat_dims) + else: + # Already sorted so just use the ids already passed + combined_ids = dict(zip(ids, datasets)) + + # Check that the combined_ids are sensible + _check_shape_tile_ids(combined_ids) + + # Repeatedly concatenate then merge along each dimension + combined = _combine_nd(combined_ids, concat_dims, compat=compat, + data_vars=data_vars, coords=coords) + else: + # Case of no concatenation wanted + concatenated = datasets + combined = merge(concatenated, compat=compat) + + return combined + + def auto_combine(datasets, concat_dims=_CONCAT_DIM_DEFAULT, compat='no_conflicts', @@ -581,31 +619,8 @@ def auto_combine(datasets, # TODO perform some of the checks from _calc_concat_dim_coord on concat_dims here? - if concat_dims is not None: - - # TODO this could be where we would optionally check alignment, as in #2039? - - # Arrange datasets for concatenation - if infer_order_from_coords: - # TODO Use coordinates to determine tile_ID for each dataset in N-D - # i.e. (shoyer's (1) from discussion in #2159) - raise NotImplementedError - # Once this is implemented I think it should be the default - # combined_ids, concat_dims = _infer_tile_ids_from_coords(datasets, concat_dims) - else: - # Determine tile_IDs by structure of input in N-D - # (i.e. ordering in list-of-lists) - combined_ids, concat_dims = _infer_concat_order_from_positions \ - (datasets, concat_dims) - - # Check that the combined_ids are sensible - _check_shape_tile_ids(combined_ids) - - # Repeatedly concatenate then merge along each dimension - combined = _combine_nd(combined_ids, concat_dims, compat=compat, - data_vars=data_vars, coords=coords) - else: - # Case of no concatenation wanted - concatenated = datasets - combined = merge(concatenated, compat=compat) - return combined + # The IDs argument tells _auto_combine that the datasets are not sorted + return _auto_combine(datasets, concat_dims=concat_dims, compat=compat, + data_vars=data_vars, coords=coords, + infer_order_from_coords=infer_order_from_coords, + ids=False) \ No newline at end of file From 8672a79833236a893aba85771c96b02db06008ef Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Wed, 14 Nov 2018 16:32:28 +0000 Subject: [PATCH 12/37] Implemented Shoyers suggestion in #2553 to rewrite the recursive nested list traverser as an iterator --- xarray/core/combine.py | 46 +++++++++++++----------------------- xarray/tests/test_combine.py | 27 +++++++-------------- 2 files changed, 25 insertions(+), 48 deletions(-) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index caa6d805232..66ece818470 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -372,7 +372,7 @@ def _auto_concat(datasets, dim=None, data_vars='all', coords='different'): def _infer_concat_order_from_positions(datasets, concat_dims): - combined_ids = _infer_tile_ids_from_nested_list(datasets, [], {}) + combined_ids = dict(_infer_tile_ids_from_nested_list(datasets, ())) # Currently if concat_dims is not supplied then _auto_concat attempts to deduce it on every call # TODO might be faster in this case to just work out the concat_dims once here @@ -389,48 +389,34 @@ def _infer_concat_order_from_positions(datasets, concat_dims): return combined_ids, concat_dims -def _infer_tile_ids_from_nested_list(entry, current_pos, combined_tile_ids): +def _infer_tile_ids_from_nested_list(entry, current_pos): """ - Given a list of lists (of lists...) 
of datasets, returns a dictionary
-    with the index of each dataset in the nested list structure as the key.
+    Given a list of lists (of lists...) of objects, returns an iterator
+    which returns a tuple containing the index of each object in the nested
+    list structure as the key, and the object. This can then be called by the
+    dict constructor to create a dictionary of the objects organised by their
+    position in the original nested list.

     Recursively traverses the given structure, while keeping track of the
-    current position.
+    current position. Should work for any type of object which isn't a list.

     Parameters
     ----------
-    entry : list[list[xarray.Dataset, xarray.Dataset, ...]]
-        List of lists of arbitrary depth, containing datasets in the order they
-        are to be concatenated.
+    entry : list[list[obj, obj, ...]]
+        List of lists of arbitrary depth, containing objects in the order
+        they are to be concatenated.

     Returns
     -------
-    combined_tile_ids : dict[tuple(int, ...), xarray.Dataset]
+    combined_tile_ids : dict[tuple(int, ...), obj]
     """

-    from .dataset import Dataset
-
     if isinstance(entry, list):
-        # Dive down tree and recursively open the next list
-        current_pos.append(0)
         for i, item in enumerate(entry):
-            current_pos[-1] = i
-            combined_tile_ids = _infer_tile_ids_from_nested_list\
-                                (item, current_pos, combined_tile_ids)
-
-        # Move back up tree
-        del current_pos[-1]
-        return combined_tile_ids
-
-    elif isinstance(entry, Dataset):
-        # Termination condition
-        combined_tile_ids[tuple(current_pos)] = entry
-        return combined_tile_ids
-
+            for result in _infer_tile_ids_from_nested_list(item, current_pos + (i,)):
+                yield result
     else:
-        raise TypeError("Element at position " + str(tuple(current_pos)) +
-                        " is of type " + str(type(entry)) + ", which is "
-                        "neither a list nor an xarray.Dataset")
+        yield current_pos, entry


 def _check_shape_tile_ids(combined_tile_ids):
@@ -619,7 +605,7 @@ def auto_combine(datasets,

     # TODO perform some of the checks from _calc_concat_dim_coord on concat_dims here?

-    # The IDs argument tells _auto_combine that the datasets are not sorted
+    # The IDs argument tells _auto_combine that the datasets are not yet sorted
     return _auto_combine(datasets, concat_dims=concat_dims, compat=compat,
                          data_vars=data_vars, coords=coords,
                          infer_order_from_coords=infer_order_from_coords,
diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py
index 8b6912e98d5..0efeaeec30b 100644
--- a/xarray/tests/test_combine.py
+++ b/xarray/tests/test_combine.py
@@ -11,7 +11,7 @@ from xarray.core.pycompat import OrderedDict, iteritems
 from xarray.core.combine import (
     _new_tile_id, _concat_along_first_dim,
-    _infer_concat_order_from_nested_list, _infer_tile_ids_from_nested_list,
+    _infer_concat_order_from_positions, _infer_tile_ids_from_nested_list,
     _check_shape_tile_ids, _combine_nd)

 from .
import ( @@ -412,7 +412,7 @@ def test_1d(self): input = [ds(0), ds(1)] expected = {(0,): ds(0), (1,): ds(1)} - actual = _infer_tile_ids_from_nested_list(input, [], {}) + actual = dict(_infer_tile_ids_from_nested_list(input, ())) assert_combined_tile_ids_equal(expected, actual) def test_2d(self): @@ -422,7 +422,7 @@ def test_2d(self): expected = {(0, 0): ds(0), (0, 1): ds(1), (1, 0): ds(2), (1, 1): ds(3), (2, 0): ds(4), (2, 1): ds(5)} - actual = _infer_tile_ids_from_nested_list(input, [], {}) + actual = dict(_infer_tile_ids_from_nested_list(input, ())) assert_combined_tile_ids_equal(expected, actual) def test_3d(self): @@ -436,7 +436,7 @@ def test_3d(self): (1, 0, 0): ds(6), (1, 0, 1): ds(7), (1, 1, 0): ds(8), (1, 1, 1): ds(9), (1, 2, 0): ds(10), (1, 2, 1): ds(11)} - actual = _infer_tile_ids_from_nested_list(input, [], {}) + actual = dict(_infer_tile_ids_from_nested_list(input, ())) assert_combined_tile_ids_equal(expected, actual) def test_single_dataset(self): @@ -444,7 +444,7 @@ def test_single_dataset(self): input = [ds] expected = {(0,): ds} - actual = _infer_tile_ids_from_nested_list(input, [], {}) + actual = dict(_infer_tile_ids_from_nested_list(input, ())) assert_combined_tile_ids_equal(expected, actual) def test_redundant_nesting(self): @@ -452,23 +452,14 @@ def test_redundant_nesting(self): input = [[ds(0)], [ds(1)]] expected = {(0, 0): ds(0), (1, 0): ds(1)} - actual = _infer_tile_ids_from_nested_list(input, [], {}) + actual = dict(_infer_tile_ids_from_nested_list(input, ())) assert_combined_tile_ids_equal(expected, actual) - @pytest.mark.parametrize("bad_element", ['a', 2]) - def test_bad_element(self, bad_element): - ds = create_test_data(0) - input = [ds, bad_element] - with raises_regex(TypeError, 'Element at position .* is of type .*, ' - 'which is neither a list nor an ' - 'xarray.Dataset'): - _infer_tile_ids_from_nested_list(input, [], {}) - def test_ignore_empty_list(self): ds = create_test_data(0) input = [ds, []] expected = {(0,): ds} - actual = _infer_tile_ids_from_nested_list(input, [], {}) + actual = dict(_infer_tile_ids_from_nested_list(input, ())) assert_combined_tile_ids_equal(expected, actual) def test_ragged_input(self): @@ -478,7 +469,7 @@ def test_ragged_input(self): input = [ds(0), [ds(1), ds(2)]] expected = {(0,): ds(0), (1, 0): ds(1), (1, 1): ds(2)} - actual = _infer_tile_ids_from_nested_list(input, [], {}) + actual = dict(_infer_tile_ids_from_nested_list(input, ())) assert_combined_tile_ids_equal(expected, actual) def test_infer_from_datasets(self): @@ -486,7 +477,7 @@ def test_infer_from_datasets(self): input = [ds(0), ds(1)] expected = {(0,): ds(0), (1,): ds(1)} - actual, concat_dims = _infer_concat_order_from_nested_list\ + actual, concat_dims = _infer_concat_order_from_positions\ (input, ['dim1']) assert_combined_tile_ids_equal(expected, actual) From 4f56b240216849044d8d68c17e0ce88256393017 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Wed, 14 Nov 2018 16:33:08 +0000 Subject: [PATCH 13/37] --amend --- xarray/core/combine.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index 66ece818470..d724a8dfdce 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -413,7 +413,8 @@ def _infer_tile_ids_from_nested_list(entry, current_pos): if isinstance(entry, list): for i, item in enumerate(entry): - for result in _infer_tile_ids_from_nested_list(item, current_pos + (i,)): + for result in _infer_tile_ids_from_nested_list(item, + current_pos + (i,)): yield result else: yield 
current_pos, entry From 4cfaf2e1fb02dbb390cf64bbdfb37fca562afab9 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Wed, 14 Nov 2018 19:38:26 +0000 Subject: [PATCH 14/37] Now raises ValueError if input not ordered correctly before concatenation --- xarray/core/combine.py | 59 +++++++++++++++++++++++++----------- xarray/tests/test_combine.py | 8 ++--- 2 files changed, 43 insertions(+), 24 deletions(-) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index d724a8dfdce..c0a264f6335 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -374,8 +374,9 @@ def _infer_concat_order_from_positions(datasets, concat_dims): combined_ids = dict(_infer_tile_ids_from_nested_list(datasets, ())) - # Currently if concat_dims is not supplied then _auto_concat attempts to deduce it on every call - # TODO might be faster in this case to just work out the concat_dims once here + # Currently if concat_dims is not supplied then _auto_concat attempts to + # deduce it on every call + # TODO might be faster in this case to just work out concat_dims once here tile_id, ds = list(combined_ids.items())[0] n_dims = len(tile_id) if concat_dims is None or concat_dims == _CONCAT_DIM_DEFAULT: @@ -420,20 +421,39 @@ def _infer_tile_ids_from_nested_list(entry, current_pos): yield current_pos, entry -def _check_shape_tile_ids(combined_tile_ids): - # TODO create custom exception class instead of using asserts? - # Is this function even necessary? - +def _check_shape_tile_ids(combined_tile_ids, contains='datasets'): tile_ids = combined_tile_ids.keys() - # Check all tuples are the same length - lengths = [len(id) for id in tile_ids] - assert set(lengths) == {lengths[0]} + # TODO cover all of these with separate unit tests - # Check only datasets are contained - from .dataset import Dataset - for v in combined_tile_ids.values(): - assert isinstance(v, Dataset) + # Check all tuples are the same length + # i.e. check that all lists are nested to the same depth + nesting_depths = [len(id) for id in tile_ids] + if not set(nesting_depths) == {nesting_depths[0]}: + raise ValueError("The supplied objects do not form a hypercube because" + " sub-lists do not have consistent depths") + + # Check objects form a hypercube + # i.e. check all lists along one dimension are same length, monotonically- + # increasing with no repetitions + for dim in range(nesting_depths[0]): + try: + indices_along_dim = [id[dim] for id in tile_ids] + except IndexError: + raise ValueError("The supplied objects do not form a hypercube " + "because sub-lists do not have consistent " + "lengths along dimension {}".format(str(dim))) + + # TODO work out if this actually means something is wrong + if not set(indices_along_dim) == indices_along_dim: + raise ValueError("The supplied objects do not form a hypercube " + "because there are repeated concatenation " + "positions along concatenation dimension " + "{}".format(str(dim))) + + if not sorted(indices_along_dim) == indices_along_dim: + raise ValueError("The supplied objects have not been successfully " + "ordered along dimension {}".format(str(dim))) def _data_vars(combined_id): @@ -473,6 +493,7 @@ def _combine_nd(combined_IDs, concat_dims, data_vars='all', # Convert list of tuples back into a dictionary concatenated_ids = dict(tiled_datasets_group) + # TODO refactor this logic, possibly using method in np.blocks # Perform N-D dimensional concatenation # Each iteration of this loop reduces the length of the tile_IDs tuples # by one. 
It always removes the first @@ -517,10 +538,12 @@ def _auto_combine(datasets, concat_dims, compat, data_vars, coords, if concat_dims is not None: # Arrange datasets for concatenation if infer_order_from_coords: - # Use coordinates to determine tile_ID for each dataset in N-D - # Ignore how they were ordered previously raise NotImplementedError - # combined_ids, concat_dims = _infer_tile_ids_from_coords(datasets, concat_dims) + # TODO Use coordinates to determine tile_ID for each dataset in N-D + # Ignore how they were ordered previously + # Shoould look like + # combined_ids, concat_dims = _infer_tile_ids_from_coords(datasets, + # concat_dims) else: # Use information from the shape of the user input if not ids: @@ -604,10 +627,10 @@ def auto_combine(datasets, Dataset.merge """ - # TODO perform some of the checks from _calc_concat_dim_coord on concat_dims here? + # TODO do some of _calc_concat_dim_coord's checks on concat_dims here? # The IDs argument tells _auto_combine that the datasets are not yet sorted return _auto_combine(datasets, concat_dims=concat_dims, compat=compat, data_vars=data_vars, coords=coords, infer_order_from_coords=infer_order_from_coords, - ids=False) \ No newline at end of file + ids=False) diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index 0efeaeec30b..6aff75aefd4 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -532,15 +532,11 @@ def test_concat_twice(self, create_combined_ids): class TestCheckShapeTileIDs(object): + # TODO test all types of ValueErrors from _check_shape_tile_id def test_check_lengths(self): ds = create_test_data(0) combined_tile_ids = {(0,): ds, (0, 1): ds} - with pytest.raises(AssertionError): - _check_shape_tile_ids(combined_tile_ids) - - def test_check_contains_datasets(self): - combined_tile_ids = {(0,): 'a', (1,): 'b'} - with pytest.raises(AssertionError): + with pytest.raises(ValueError): _check_shape_tile_ids(combined_tile_ids) From 9fd1413692cec1cfd46c3898c413b0c16c822b8a Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Thu, 22 Nov 2018 10:20:45 +0000 Subject: [PATCH 15/37] Added some more prototype tests defining desired behaviour more clearly --- xarray/tests/test_combine.py | 43 +++++++++++++++++++++++++++++++++--- 1 file changed, 40 insertions(+), 3 deletions(-) diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index 6aff75aefd4..e129ccb041f 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -462,7 +462,7 @@ def test_ignore_empty_list(self): actual = dict(_infer_tile_ids_from_nested_list(input, ())) assert_combined_tile_ids_equal(expected, actual) - def test_ragged_input(self): + def test_uneven_depth_input(self): # Auto_combine won't work on ragged input # but this is just to increase test coverage ds = create_test_data @@ -472,6 +472,17 @@ def test_ragged_input(self): actual = dict(_infer_tile_ids_from_nested_list(input, ())) assert_combined_tile_ids_equal(expected, actual) + def test_uneven_length_input(self): + # Auto_combine won't work on ragged input + # but this is just to increase test coverage + ds = create_test_data + input = [[ds(0)], [ds(1), ds(2)]] + + expected = {(0, 0): ds(0), (1, 0): ds(1), (1, 1): ds(2)} + actual = dict(_infer_tile_ids_from_nested_list(input, ())) + print(actual) + assert_combined_tile_ids_equal(expected, actual) + def test_infer_from_datasets(self): ds = create_test_data input = [ds(0), ds(1)] @@ -533,10 +544,20 @@ def test_concat_twice(self, create_combined_ids): class 
TestCheckShapeTileIDs(object): # TODO test all types of ValueErrors from _check_shape_tile_id - def test_check_lengths(self): + def test_check_depths(self): ds = create_test_data(0) combined_tile_ids = {(0,): ds, (0, 1): ds} - with pytest.raises(ValueError): + with raises_regex(ValueError, 'sub-lists do not have ' + 'consistent depths'): + _check_shape_tile_ids(combined_tile_ids) + + def test_check_lengths(self): + ds = create_test_data(0) + combined_tile_ids = {(0, 0): ds, (0, 1): ds, + (1, 0): ds, (1, 1): ds, + (0, 1): ds} + with raises_regex(ValueError, 'sub-lists do not have ' + 'consistent lengths'): _check_shape_tile_ids(combined_tile_ids) @@ -556,6 +577,22 @@ def test_auto_combine_2d(self): assert_equal(result, expected) + @pytest.mark.skip def test_ragged_input(self): # TODO should throw an informative error if you try this ... + + def test_combine_redundant_nesting(self): + objs = [[Dataset({'x': [0]}), Dataset({'x': [1]})]] + actual = auto_combine(objs, concat_dims=[None, 'x']) + expected = Dataset({'x': [0, 1]}) + assert_identical(expected, actual) + + objs = [[Dataset({'x': [0]})], [Dataset({'x': [1]})]] + actual = auto_combine(objs, concat_dims=['x', None]) + expected = Dataset({'x': [0, 1]}) + assert_identical(expected, actual) + + @pytest.mark.skip + def test_mixed_default_concat_dims(self): + ... From 8ad01211609005e455e8f32149578c133ba06f74 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Sat, 24 Nov 2018 13:27:12 +0000 Subject: [PATCH 16/37] Now raises informative errors on invalid forms of input --- xarray/core/combine.py | 22 +++++----------------- xarray/tests/test_combine.py | 30 +++++++++++++++++------------- 2 files changed, 22 insertions(+), 30 deletions(-) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index c0a264f6335..c829d570124 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -2,6 +2,7 @@ import warnings import toolz.itertoolz as itertoolz +from collections import Counter import pandas as pd @@ -433,28 +434,15 @@ def _check_shape_tile_ids(combined_tile_ids, contains='datasets'): raise ValueError("The supplied objects do not form a hypercube because" " sub-lists do not have consistent depths") - # Check objects form a hypercube - # i.e. 
check all lists along one dimension are same length, monotonically- - # increasing with no repetitions + # Check all lists along one dimension are same length for dim in range(nesting_depths[0]): - try: - indices_along_dim = [id[dim] for id in tile_ids] - except IndexError: + indices_along_dim = [id[dim] for id in tile_ids] + occurrences = Counter(indices_along_dim) + if len(set(occurrences.values())) != 1: raise ValueError("The supplied objects do not form a hypercube " "because sub-lists do not have consistent " "lengths along dimension {}".format(str(dim))) - # TODO work out if this actually means something is wrong - if not set(indices_along_dim) == indices_along_dim: - raise ValueError("The supplied objects do not form a hypercube " - "because there are repeated concatenation " - "positions along concatenation dimension " - "{}".format(str(dim))) - - if not sorted(indices_along_dim) == indices_along_dim: - raise ValueError("The supplied objects have not been successfully " - "ordered along dimension {}".format(str(dim))) - def _data_vars(combined_id): id, ds = combined_id diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index e129ccb041f..3fa9650c2ca 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -406,6 +406,8 @@ def test_auto_combine_no_concat(self): assert_identical(expected, actual) +# TODO should we use @requires_dask? only for toolz? + class TestTileIDsFromNestedList(object): def test_1d(self): ds = create_test_data @@ -543,7 +545,6 @@ def test_concat_twice(self, create_combined_ids): class TestCheckShapeTileIDs(object): - # TODO test all types of ValueErrors from _check_shape_tile_id def test_check_depths(self): ds = create_test_data(0) combined_tile_ids = {(0,): ds, (0, 1): ds} @@ -553,9 +554,8 @@ def test_check_depths(self): def test_check_lengths(self): ds = create_test_data(0) - combined_tile_ids = {(0, 0): ds, (0, 1): ds, - (1, 0): ds, (1, 1): ds, - (0, 1): ds} + combined_tile_ids = {(0, 0): ds, (0, 1): ds , (0, 2): ds, + (1, 0): ds, (1, 1): ds} with raises_regex(ValueError, 'sub-lists do not have ' 'consistent lengths'): _check_shape_tile_ids(combined_tile_ids) @@ -577,12 +577,20 @@ def test_auto_combine_2d(self): assert_equal(result, expected) - @pytest.mark.skip - def test_ragged_input(self): - # TODO should throw an informative error if you try this - ... + def test_invalid_hypercube_input(self): + ds = create_test_data + + datasets = [[ds(0), ds(1), ds(2)], [ds(3), ds(4)]] + with raises_regex(ValueError, 'sub-lists do not have ' + 'consistent lengths'): + auto_combine(datasets, concat_dims=['dim1', 'dim2']) - def test_combine_redundant_nesting(self): + datasets = [[ds(0), ds(1)], [[ds(3), ds(4)]]] + with raises_regex(ValueError, 'sub-lists do not have ' + 'consistent depths'): + auto_combine(datasets, concat_dims=['dim1', 'dim2']) + + def test_combine_concat_one_dim_merge_another(self): objs = [[Dataset({'x': [0]}), Dataset({'x': [1]})]] actual = auto_combine(objs, concat_dims=[None, 'x']) expected = Dataset({'x': [0, 1]}) @@ -592,7 +600,3 @@ def test_combine_redundant_nesting(self): actual = auto_combine(objs, concat_dims=['x', None]) expected = Dataset({'x': [0, 1]}) assert_identical(expected, actual) - - @pytest.mark.skip - def test_mixed_default_concat_dims(self): - ... 
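The reworked _check_shape_tile_ids above validates that the tile IDs form a hypercube using collections.Counter. A standalone sketch of the same two checks, assuming strings stand in for datasets:

    from collections import Counter

    def check_shape(combined_tile_ids):
        tile_ids = combined_tile_ids.keys()

        # All tile ids must be tuples of the same length, i.e. every
        # sub-list was nested to the same depth
        nesting_depths = [len(tile_id) for tile_id in tile_ids]
        if len(set(nesting_depths)) > 1:
            raise ValueError('sub-lists do not have consistent depths')

        # Along each dimension every index must occur equally often,
        # i.e. the tile ids fill out a complete hypercube
        for dim in range(nesting_depths[0]):
            occurrences = Counter(tile_id[dim] for tile_id in tile_ids)
            if len(set(occurrences.values())) != 1:
                raise ValueError('sub-lists do not have consistent lengths '
                                 'along dimension ' + str(dim))

    check_shape({(0, 0): 'a', (0, 1): 'b', (1, 0): 'c', (1, 1): 'd'})  # passes

    try:
        check_shape({(0, 0): 'a', (0, 1): 'b', (1, 0): 'c'})  # missing (1, 1)
    except ValueError as err:
        print(err)  # sub-lists do not have consistent lengths along dimension 0

Counting occurrences per dimension catches ragged inputs such as [[ds(0), ds(1), ds(2)], [ds(3), ds(4)]] without having to reconstruct the nested structure.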
From 4b2c5443a637ce5ca1fe1f756921d0898b91cda4 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Sun, 25 Nov 2018 16:59:42 +0000 Subject: [PATCH 17/37] Refactoring to alos merge along each dimension --- xarray/backends/api.py | 8 ++- xarray/core/combine.py | 124 ++++++++++++++++++++++++----------- xarray/tests/test_combine.py | 19 +++++- 3 files changed, 107 insertions(+), 44 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 672e43e9e58..cc5aad2bfaf 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -601,12 +601,14 @@ def open_mfdataset(paths, chunks=None, concat_dims=_CONCAT_DIM_DEFAULT, if not paths: raise IOError('no files to open') - # If infer_order_from_coords=True then this is uneccessary, but that's fine + # If infer_order_from_coords=True then this is unnecessary, but that's fine # as it should be quick - in this case it will just loop over one list # If infer_order_from_coords=False then this creates a flat list which is # easier to iterate over, while saving the originally-supplied structure - combined_ids_paths, concat_dims = _infer_concat_order_from_positions(paths, concat_dims) # Use an OrderedDict? - ids, paths = list(combined_ids_paths.keys()), list(combined_ids_paths.values()) # Is this in order?? + combined_ids_paths, concat_dims = _infer_concat_order_from_positions\ + (paths, concat_dims) + ids, paths = list(combined_ids_paths.keys()), \ + list(combined_ids_paths.values()) open_kwargs = dict(engine=engine, chunks=chunks or {}, lock=lock, autoclose=autoclose, **kwargs) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index c829d570124..e0e18c8149d 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -380,8 +380,11 @@ def _infer_concat_order_from_positions(datasets, concat_dims): # TODO might be faster in this case to just work out concat_dims once here tile_id, ds = list(combined_ids.items())[0] n_dims = len(tile_id) - if concat_dims is None or concat_dims == _CONCAT_DIM_DEFAULT: + if concat_dims == _CONCAT_DIM_DEFAULT: concat_dims = [_CONCAT_DIM_DEFAULT]*n_dims + elif concat_dims is None: + # TODO should this be a list of n Nones? + pass else: if len(concat_dims) != n_dims: raise ValueError("concat_dims has length " + str(len(concat_dims)) @@ -425,8 +428,6 @@ def _infer_tile_ids_from_nested_list(entry, current_pos): def _check_shape_tile_ids(combined_tile_ids, contains='datasets'): tile_ids = combined_tile_ids.keys() - # TODO cover all of these with separate unit tests - # Check all tuples are the same length # i.e. check that all lists are nested to the same depth nesting_depths = [len(id) for id in tile_ids] @@ -449,7 +450,7 @@ def _data_vars(combined_id): return tuple(sorted(ds.data_vars)) -def _combine_nd(combined_IDs, concat_dims, data_vars='all', +def _combine_nd(combined_ids, concat_dims, data_vars='all', coords='different', compat='no_conflicts'): """ Concatenates and merges an N-dimensional structure of datasets. @@ -459,7 +460,7 @@ def _combine_nd(combined_IDs, concat_dims, data_vars='all', Parameters ---------- - combined_IDs : Dict[Tuple[int, ...]], xarray.Dataset] + combined_ids : Dict[Tuple[int, ...]], xarray.Dataset] Structure containing all datasets to be concatenated with "tile_IDs" as keys, which specify position within the desired final combined result. 
concat_dims : sequence of str @@ -471,49 +472,94 @@ def _combine_nd(combined_IDs, concat_dims, data_vars='all', """ + # TODO merge, don't concat if concat_dim is None + + # TODO refactor this logic, possibly using method in np.blocks + # Perform N-D dimensional concatenation + # Each iteration of this loop reduces the length of the tile_IDs tuples + # by one. It always removes the first + for concat_dim in concat_dims: + dim = None if concat_dim is _CONCAT_DIM_DEFAULT else concat_dim + + combined_ids = _auto_combine_along_first_dim(combined_ids, + dim=dim, + data_vars=data_vars, + coords=coords, + compat=compat) + combined_ds = next(iter(combined_ids.values())) + #print(combined_ds) + return combined_ds + + +def _auto_combine_all_along_first_dim(combined_ids, dim, data_vars, + coords, compat): # Organise by data variables grouped_by_data_vars = itertoolz.groupby(_data_vars, - combined_IDs.items()).values() + combined_ids.items()).values() - concatenated_datasets = [] + new_combined_ids_all_vars = [] for tiled_datasets_group in grouped_by_data_vars: - # Convert list of tuples back into a dictionary - concatenated_ids = dict(tiled_datasets_group) + # Groupby returns list of tuples - convert back into a dictionary + combined_ids = dict(tiled_datasets_group) - # TODO refactor this logic, possibly using method in np.blocks - # Perform N-D dimensional concatenation - # Each iteration of this loop reduces the length of the tile_IDs tuples - # by one. It always removes the first - for concat_dim in concat_dims: - dim = None if concat_dim is _CONCAT_DIM_DEFAULT else concat_dim + # Group into lines of datasets which must be combined along dim + grouped = itertoolz.groupby(_new_tile_id, combined_ids.items()) - concatenated_ids = _concat_along_first_dim(concatenated_ids, - dim=dim, - data_vars=data_vars, - coords=coords) - concatenated_datasets = concatenated_datasets \ - + list(concatenated_ids.values()) - return merge(concatenated_datasets, compat=compat) + new_combined_ids = {} + for new_id, group in grouped.items(): + new_combined_ids[new_id] = _auto_combine_1d(group, dim, data_vars, + coords, compat) + print("New combined IDs:") + print(new_combined_ids) + new_combined_ids_all_vars.append(new_combined_ids) -def _new_tile_id(single_id_ds_pair): - # TODO maybe replace with something like lambda x: x[0][1:]? 
- tile_id, ds = single_id_ds_pair - return tile_id[1:] + # merge the new_combined_ids dicts by using xarray.merge on elements with the same key + return _merge_by_new_ids(new_combined_ids_all_vars, compat=compat) + + +def _auto_combine_1d(to_combine, dim, data_vars, coords, compat): + if dim is not None: + combined = _auto_concat(to_combine, dim=dim, data_vars=data_vars, + coords=coords) + else: + print(to_combine) + combined = merge(to_combine, compat=compat) + + return combined + + +def _merge_by_new_ids(new_combined_ids_all_vars, compat): + # Merge different variables back together + # Merging a list of dicts + # Group by indexes + grouped_by_new_id = itertoolz.groupby(_tile_id, new_combined_ids_all_vars) + + print("Grouped by new ID:") + print(grouped_by_new_id) + + # Merge different variables back together, while retaining new tile IDs + merged_new_combined_ids = {} + for tile_id, group in grouped_by_new_id.items(): + + print(group) + to_merge = [list(combined_ds.values())[0] for combined_ds in group] + print(to_merge) + merged = merge(to_merge, compat=compat) + merged_new_combined_ids[tile_id] = merged + return merged_new_combined_ids -def _concat_along_first_dim(combined_IDs, dim, data_vars='all', - coords='different'): - grouped = itertoolz.groupby(_new_tile_id, combined_IDs.items()) - new_combined_IDs = {} - for new_ID, group in grouped.items(): - to_concat = [ds for old_ID, ds in group] - new_combined_IDs[new_ID] = _auto_concat(to_concat, dim=dim, - data_vars=data_vars, - coords=coords) - return new_combined_IDs +def _tile_id(new_combined_ids): + return next(iter(new_combined_ids.keys())) + + +def _new_tile_id(single_id_ds_pair): + # TODO maybe replace with something like lambda x: x[0][1:]? + tile_id, ds = single_id_ds_pair + return tile_id[1:] def _auto_combine(datasets, concat_dims, compat, data_vars, coords, @@ -529,7 +575,7 @@ def _auto_combine(datasets, concat_dims, compat, data_vars, coords, raise NotImplementedError # TODO Use coordinates to determine tile_ID for each dataset in N-D # Ignore how they were ordered previously - # Shoould look like + # Should look like # combined_ids, concat_dims = _infer_tile_ids_from_coords(datasets, # concat_dims) else: @@ -546,11 +592,13 @@ def _auto_combine(datasets, concat_dims, compat, data_vars, coords, # Check that the combined_ids are sensible _check_shape_tile_ids(combined_ids) + print(concat_dims) + # Repeatedly concatenate then merge along each dimension combined = _combine_nd(combined_ids, concat_dims, compat=compat, data_vars=data_vars, coords=coords) else: - # Case of no concatenation wanted + # Case of no concatenation wanted at all concatenated = datasets combined = merge(concatenated, compat=compat) diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index 3fa9650c2ca..dbd2c070251 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -10,7 +10,7 @@ from xarray import DataArray, Dataset, Variable, auto_combine, concat from xarray.core.pycompat import OrderedDict, iteritems from xarray.core.combine import ( - _new_tile_id, _concat_along_first_dim, + _new_tile_id, _auto_combine_along_first_dim, _infer_concat_order_from_positions, _infer_tile_ids_from_nested_list, _check_shape_tile_ids, _combine_nd) @@ -386,6 +386,13 @@ def test_auto_combine_no_concat(self): actual = auto_combine([data, data, data], concat_dims=None) assert_identical(data, actual) + tmp1 = Dataset({'x': 0}) + tmp2 = Dataset({'x': np.nan}) + actual = auto_combine([tmp1, tmp2], concat_dims=None) + 
assert_identical(tmp1, actual) + actual = auto_combine([tmp1, tmp2], concat_dims=[None]) + assert_identical(tmp1, actual) + # Single object, with a concat_dim explicitly provided # Test the issue reported in GH #1988 objs = [Dataset({'x': 0, 'y': 1})] @@ -520,12 +527,18 @@ def test_get_new_tile_ids(self, create_combined_ids): expected_new_tile_id = tile_id[1:] assert _new_tile_id(combined) == expected_new_tile_id + def test_merge_by_new_ids(self): + ... + @pytest.mark.parametrize("concat_dim", ['dim1', 'new_dim']) def test_concat_once(self, create_combined_ids, concat_dim): shape = (2,) combined_ids = create_combined_ids(shape) ds = create_test_data - result = _concat_along_first_dim(combined_ids, dim=concat_dim) + result = _auto_combine_along_first_dim(combined_ids, dim=concat_dim, + data_vars='all', + coords='different', + compat='no_conflicts') expected_ds = concat([ds(0), ds(1)], dim=concat_dim) assert_combined_tile_ids_equal(result, {(): expected_ds}) @@ -590,7 +603,7 @@ def test_invalid_hypercube_input(self): 'consistent depths'): auto_combine(datasets, concat_dims=['dim1', 'dim2']) - def test_combine_concat_one_dim_merge_another(self): + def test_combine_concat_over_redundant_nesting(self): objs = [[Dataset({'x': [0]}), Dataset({'x': [1]})]] actual = auto_combine(objs, concat_dims=[None, 'x']) expected = Dataset({'x': [0, 1]}) From 3d0061e382dc51badc1f731eb53df8f42df9a7c1 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Sun, 25 Nov 2018 17:57:05 +0000 Subject: [PATCH 18/37] Refactored to literally just apply the old auto_combine along each dimension --- xarray/core/combine.py | 116 +++++++++++------------------------ xarray/tests/test_combine.py | 51 ++++++++------- 2 files changed, 64 insertions(+), 103 deletions(-) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index e0e18c8149d..b476e77e850 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -380,11 +380,8 @@ def _infer_concat_order_from_positions(datasets, concat_dims): # TODO might be faster in this case to just work out concat_dims once here tile_id, ds = list(combined_ids.items())[0] n_dims = len(tile_id) - if concat_dims == _CONCAT_DIM_DEFAULT: - concat_dims = [_CONCAT_DIM_DEFAULT]*n_dims - elif concat_dims is None: - # TODO should this be a list of n Nones? - pass + if concat_dims == _CONCAT_DIM_DEFAULT or None: + concat_dims = [concat_dims]*n_dims else: if len(concat_dims) != n_dims: raise ValueError("concat_dims has length " + str(len(concat_dims)) @@ -428,6 +425,8 @@ def _infer_tile_ids_from_nested_list(entry, current_pos): def _check_shape_tile_ids(combined_tile_ids, contains='datasets'): tile_ids = combined_tile_ids.keys() + # TODO a check that only the expected types of objects are contained + # Check all tuples are the same length # i.e. check that all lists are nested to the same depth nesting_depths = [len(id) for id in tile_ids] @@ -472,93 +471,54 @@ def _combine_nd(combined_ids, concat_dims, data_vars='all', """ - # TODO merge, don't concat if concat_dim is None - # TODO refactor this logic, possibly using method in np.blocks # Perform N-D dimensional concatenation # Each iteration of this loop reduces the length of the tile_IDs tuples # by one. 
It always removes the first for concat_dim in concat_dims: - dim = None if concat_dim is _CONCAT_DIM_DEFAULT else concat_dim - - combined_ids = _auto_combine_along_first_dim(combined_ids, - dim=dim, - data_vars=data_vars, - coords=coords, - compat=compat) + combined_ids = _auto_combine_all_along_first_dim(combined_ids, + dim=concat_dim, + data_vars=data_vars, + coords=coords, + compat=compat) combined_ds = next(iter(combined_ids.values())) - #print(combined_ds) return combined_ds def _auto_combine_all_along_first_dim(combined_ids, dim, data_vars, coords, compat): - # Organise by data variables - grouped_by_data_vars = itertoolz.groupby(_data_vars, - combined_ids.items()).values() - - new_combined_ids_all_vars = [] - for tiled_datasets_group in grouped_by_data_vars: - - # Groupby returns list of tuples - convert back into a dictionary - combined_ids = dict(tiled_datasets_group) + # Group into lines of datasets which must be combined along dim + grouped = itertoolz.groupby(_new_tile_id, combined_ids.items()) - # Group into lines of datasets which must be combined along dim - grouped = itertoolz.groupby(_new_tile_id, combined_ids.items()) + new_combined_ids = {} + for new_id, group in grouped.items(): + combined_ids = dict(group) + datasets = list(combined_ids.values()) + new_combined_ids[new_id] = _auto_combine_1d(datasets, dim, compat, + data_vars, coords) - new_combined_ids = {} - for new_id, group in grouped.items(): - new_combined_ids[new_id] = _auto_combine_1d(group, dim, data_vars, - coords, compat) + return new_combined_ids - print("New combined IDs:") - print(new_combined_ids) - new_combined_ids_all_vars.append(new_combined_ids) - # merge the new_combined_ids dicts by using xarray.merge on elements with the same key - return _merge_by_new_ids(new_combined_ids_all_vars, compat=compat) - - -def _auto_combine_1d(to_combine, dim, data_vars, coords, compat): - if dim is not None: - combined = _auto_concat(to_combine, dim=dim, data_vars=data_vars, - coords=coords) +def _auto_combine_1d(datasets, concat_dim=_CONCAT_DIM_DEFAULT, + compat='no_conflicts', + data_vars='all', coords='different'): + # This is just the old auto_combine function (which only worked along 1D) + if concat_dim is not None: + dim = None if concat_dim is _CONCAT_DIM_DEFAULT else concat_dim + grouped = itertoolz.groupby(lambda ds: tuple(sorted(ds.data_vars)), + datasets).values() + concatenated = [_auto_concat(ds, dim=dim, + data_vars=data_vars, coords=coords) + for ds in grouped] else: - print(to_combine) - combined = merge(to_combine, compat=compat) - - return combined - - -def _merge_by_new_ids(new_combined_ids_all_vars, compat): - # Merge different variables back together - # Merging a list of dicts - # Group by indexes - grouped_by_new_id = itertoolz.groupby(_tile_id, new_combined_ids_all_vars) - - print("Grouped by new ID:") - print(grouped_by_new_id) - - # Merge different variables back together, while retaining new tile IDs - merged_new_combined_ids = {} - for tile_id, group in grouped_by_new_id.items(): - - print(group) - to_merge = [list(combined_ds.values())[0] for combined_ds in group] - print(to_merge) - merged = merge(to_merge, compat=compat) - merged_new_combined_ids[tile_id] = merged - - return merged_new_combined_ids - - -def _tile_id(new_combined_ids): - return next(iter(new_combined_ids.keys())) + concatenated = datasets + merged = merge(concatenated, compat=compat) + return merged def _new_tile_id(single_id_ds_pair): - # TODO maybe replace with something like lambda x: x[0][1:]? 
- tile_id, ds = single_id_ds_pair + tile_id, ds = single_id_ds_pair return tile_id[1:] @@ -589,19 +549,15 @@ def _auto_combine(datasets, concat_dims, compat, data_vars, coords, # Already sorted so just use the ids already passed combined_ids = dict(zip(ids, datasets)) - # Check that the combined_ids are sensible + # Check that the inferred shape is combinable _check_shape_tile_ids(combined_ids) - print(concat_dims) - # Repeatedly concatenate then merge along each dimension combined = _combine_nd(combined_ids, concat_dims, compat=compat, data_vars=data_vars, coords=coords) else: # Case of no concatenation wanted at all - concatenated = datasets - combined = merge(concatenated, compat=compat) - + combined = merge(datasets, compat=compat) return combined @@ -663,8 +619,6 @@ def auto_combine(datasets, Dataset.merge """ - # TODO do some of _calc_concat_dim_coord's checks on concat_dims here? - # The IDs argument tells _auto_combine that the datasets are not yet sorted return _auto_combine(datasets, concat_dims=concat_dims, compat=compat, data_vars=data_vars, coords=coords, diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index dbd2c070251..8abe0ac0f5d 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -10,9 +10,9 @@ from xarray import DataArray, Dataset, Variable, auto_combine, concat from xarray.core.pycompat import OrderedDict, iteritems from xarray.core.combine import ( - _new_tile_id, _auto_combine_along_first_dim, + _new_tile_id, _auto_combine_all_along_first_dim, _infer_concat_order_from_positions, _infer_tile_ids_from_nested_list, - _check_shape_tile_ids, _combine_nd) + _check_shape_tile_ids, _combine_nd, _auto_combine_1d) from . import ( InaccessibleArray, assert_array_equal, assert_equal, assert_identical, @@ -302,42 +302,41 @@ def test_concat_lazy(self): class TestAutoCombine(object): + @pytest.mark.parametrize("combine", [_auto_combine_1d, auto_combine]) @requires_dask # only for toolz - def test_auto_combine(self): + def test_auto_combine(self, combine): objs = [Dataset({'x': [0]}), Dataset({'x': [1]})] - actual = auto_combine(objs) + actual = combine(objs) expected = Dataset({'x': [0, 1]}) assert_identical(expected, actual) - actual = auto_combine(actual) - assert_identical(expected, actual) - - actual = auto_combine([actual]) + actual = combine([actual]) assert_identical(expected, actual) objs = [Dataset({'x': [0, 1]}), Dataset({'x': [2]})] - actual = auto_combine(objs) + actual = combine(objs) expected = Dataset({'x': [0, 1, 2]}) assert_identical(expected, actual) + # TODO find out why this fails!! 
        # ensure auto_combine handles non-sorted variables
        objs = [Dataset(OrderedDict([('x', ('a', [0])), ('y', ('a', [0]))])),
                Dataset(OrderedDict([('y', ('a', [1])), ('x', ('a', [1]))]))]
-        actual = auto_combine(objs)
+        actual = combine(objs)
        expected = Dataset({'x': ('a', [0, 1]), 'y': ('a', [0, 1])})
        assert_identical(expected, actual)

        objs = [Dataset({'x': [0], 'y': [0]}), Dataset({'y': [1], 'x': [1]})]
        with raises_regex(ValueError, 'too many .* dimensions'):
-            auto_combine(objs)
+            combine(objs)

        objs = [Dataset({'x': 0}), Dataset({'x': 1})]
        with raises_regex(ValueError, 'cannot infer dimension'):
-            auto_combine(objs)
+            combine(objs)

        objs = [Dataset({'x': [0], 'y': [0]}), Dataset({'x': [0]})]
        with pytest.raises(KeyError):
-            auto_combine(objs)
+            combine(objs)

     @requires_dask  # only for toolz
     def test_auto_combine_previously_failed(self):

From 60c93ba59fd6636a9f6ebe2bb460b9924d9ebec7 Mon Sep 17 00:00:00 2001
From: Thomas Nicholas
Date: Mon, 26 Nov 2018 11:28:23 +0000
Subject: [PATCH 19/37] Added unit tests for open_mfdataset

---
 xarray/backends/api.py        | 13 ++++---
 xarray/core/combine.py        |  8 ++---
 xarray/tests/test_backends.py | 65 +++++++++++++++++++++++++++++++++++
 xarray/tests/test_combine.py  | 10 +++++-
 4 files changed, 84 insertions(+), 12 deletions(-)

diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index cc5aad2bfaf..45ceb814a18 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -503,12 +503,12 @@ def open_mfdataset(paths, chunks=None, concat_dims=_CONCAT_DIM_DEFAULT,
     By default, chunks will be chosen to load entire input files into
     memory at once. This has a major impact on performance: please see the
     full documentation for more details [2].
- concat_dims : None, str, DataArray or Index, optional - Dimension to concatenate files along. This argument is passed on to + concat_dims : list of str, DataArray, Index or None, optional + Dimensions to concatenate files along. This argument is passed on to :py:func:`xarray.auto_combine` along with the dataset objects. You only - need to provide this argument if the dimension along which you want to - concatenate is not a dimension in the original datasets, e.g., if you - want to stack a collection of 2D arrays along a third dimension. + need to provide this argument if any of the dimensions along which you + want to concatenate is not a dimension in the original datasets, e.g., + if you want to stack a collection of 2D arrays along a third dimension. By default, xarray attempts to infer this argument by examining component files. Set ``concat_dim=None`` explicitly to disable concatenation. @@ -601,8 +601,7 @@ def open_mfdataset(paths, chunks=None, concat_dims=_CONCAT_DIM_DEFAULT, if not paths: raise IOError('no files to open') - # If infer_order_from_coords=True then this is unnecessary, but that's fine - # as it should be quick - in this case it will just loop over one list + # If infer_order_from_coords=True then this is unnecessary, but quick. # If infer_order_from_coords=False then this creates a flat list which is # easier to iterate over, while saving the originally-supplied structure combined_ids_paths, concat_dims = _infer_concat_order_from_positions\ diff --git a/xarray/core/combine.py b/xarray/core/combine.py index b476e77e850..bf44997e9f6 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -380,7 +380,7 @@ def _infer_concat_order_from_positions(datasets, concat_dims): # TODO might be faster in this case to just work out concat_dims once here tile_id, ds = list(combined_ids.items())[0] n_dims = len(tile_id) - if concat_dims == _CONCAT_DIM_DEFAULT or None: + if concat_dims == _CONCAT_DIM_DEFAULT or concat_dims == None: concat_dims = [concat_dims]*n_dims else: if len(concat_dims) != n_dims: @@ -468,7 +468,7 @@ def _combine_nd(combined_ids, concat_dims, data_vars='all', Returns ------- - + combined_ds : xarray.Dataset """ # TODO refactor this logic, possibly using method in np.blocks @@ -481,7 +481,7 @@ def _combine_nd(combined_ids, concat_dims, data_vars='all', data_vars=data_vars, coords=coords, compat=compat) - combined_ds = next(iter(combined_ids.values())) + combined_ds = list(combined_ids.values())[0] return combined_ds @@ -576,7 +576,7 @@ def auto_combine(datasets, ---------- datasets : sequence of xarray.Dataset Dataset objects to merge. - concat_dims : list of str or DataArray or Index, optional + concat_dims : list of str, DataArray, Index or None, optional Dimensions along which to concatenate variables, as used by :py:func:`xarray.concat`. 
You only need to provide this argument if any of the dimensions along which you want to concatenate are not a diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index d9109c4b67f..42893cf5d9e 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2135,12 +2135,42 @@ def test_open_mfdataset(self): assert actual.foo.variable.data.chunks == \ ((3, 2, 3, 2),) + with raises_regex(IOError, 'no files to open'): open_mfdataset('foo-bar-baz-*.nc') with raises_regex(ValueError, 'wild-card'): open_mfdataset('http://some/remote/uri') + def test_open_mfdataset_2d(self): + original = Dataset({'foo': (['x', 'y'], np.random.randn(10, 8))}) + with create_tmp_file() as tmp1: + with create_tmp_file() as tmp2: + with create_tmp_file() as tmp3: + with create_tmp_file() as tmp4: + original.isel(x=slice(5), + y=slice(4)).to_netcdf(tmp1) + original.isel(x=slice(5, 10), + y=slice(4)).to_netcdf(tmp2) + original.isel(x=slice(5), + y=slice(4, 8)).to_netcdf(tmp3) + original.isel(x=slice(5, 10), + y=slice(4, 8)).to_netcdf(tmp4) + with open_mfdataset([[tmp1, tmp2], + [tmp3, tmp4]], + concat_dims=['y', 'x']) as actual: + assert isinstance(actual.foo.variable.data, + da.Array) + assert actual.foo.variable.data.chunks == \ + ((5, 5), (4, 4)) + assert_identical(original, actual) + with open_mfdataset([[tmp1, tmp2], + [tmp3, tmp4]], + concat_dims=['y', 'x'], + chunks={'x': 3, 'y': 2}) as actual: + assert actual.foo.variable.data.chunks == \ + ((3, 2, 3, 2), (2, 2, 2, 2),) + @requires_pathlib def test_open_mfdataset_pathlib(self): original = Dataset({'foo': ('x', np.random.randn(10))}) @@ -2153,6 +2183,41 @@ def test_open_mfdataset_pathlib(self): with open_mfdataset([tmp1, tmp2]) as actual: assert_identical(original, actual) + @requires_pathlib + def test_open_mfdataset_2d_pathlib(self): + original = Dataset({'foo': (['x', 'y'], np.random.randn(10, 8))}) + with create_tmp_file() as tmp1: + with create_tmp_file() as tmp2: + with create_tmp_file() as tmp3: + with create_tmp_file() as tmp4: + tmp1 = Path(tmp1) + tmp2 = Path(tmp2) + tmp3 = Path(tmp3) + tmp4 = Path(tmp4) + original.isel(x=slice(5), + y=slice(4)).to_netcdf(tmp1) + original.isel(x=slice(5, 10), + y=slice(4)).to_netcdf(tmp2) + original.isel(x=slice(5), + y=slice(4, 8)).to_netcdf(tmp3) + original.isel(x=slice(5, 10), + y=slice(4, 8)).to_netcdf(tmp4) + with open_mfdataset([[tmp1, tmp2], + [tmp3, tmp4]], + concat_dims=['y', 'x']) as actual: + assert_identical(original, actual) + + @pytest.mark.xfail(reason="Not yet implemented") + def test_open_mfdataset(self): + original = Dataset({'foo': ('x', np.random.randn(10))}) + with create_tmp_file() as tmp1: + with create_tmp_file() as tmp2: + original.isel(x=slice(5)).to_netcdf(tmp1) + original.isel(x=slice(5, 10)).to_netcdf(tmp2) + with open_mfdataset([tmp1, tmp2], + infer_order_from_coords=True) as actual: + assert_identical(original, actual) + def test_attrs_mfdataset(self): original = Dataset({'foo': ('x', np.random.randn(10))}) with create_tmp_file() as tmp1: diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index 8abe0ac0f5d..0edc715698a 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -318,7 +318,6 @@ def test_auto_combine(self, combine): expected = Dataset({'x': [0, 1, 2]}) assert_identical(expected, actual) - # TODO find out why this fails!! 
# ensure auto_combine handles non-sorted variables objs = [Dataset(OrderedDict([('x', ('a', [0])), ('y', ('a', [0]))])), Dataset(OrderedDict([('y', ('a', [1])), ('x', ('a', [1]))]))] @@ -411,6 +410,15 @@ def test_auto_combine_no_concat(self): {'baz': [100]}) assert_identical(expected, actual) + @pytest.mark.xfail(reason="Not yet implemented") + def test_infer_order_from_coords(self): + data = create_test_data() + print(data) + objs = [data.isel(x=slice(4, 9)), data.isel(x=slice(4))] + actual = auto_combine(objs, infer_order_from_coords=True) + expected = data + assert_identical(expected, actual) + class TestTileIDsFromNestedList(object): def test_1d(self): From 18245384c12d77177c0c230b7826739ca5089234 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Mon, 26 Nov 2018 13:31:51 +0000 Subject: [PATCH 20/37] Removed TODOs --- xarray/core/combine.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index bf44997e9f6..2d5da1053fb 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -375,9 +375,6 @@ def _infer_concat_order_from_positions(datasets, concat_dims): combined_ids = dict(_infer_tile_ids_from_nested_list(datasets, ())) - # Currently if concat_dims is not supplied then _auto_concat attempts to - # deduce it on every call - # TODO might be faster in this case to just work out concat_dims once here tile_id, ds = list(combined_ids.items())[0] n_dims = len(tile_id) if concat_dims == _CONCAT_DIM_DEFAULT or concat_dims == None: @@ -422,11 +419,9 @@ def _infer_tile_ids_from_nested_list(entry, current_pos): yield current_pos, entry -def _check_shape_tile_ids(combined_tile_ids, contains='datasets'): +def _check_shape_tile_ids(combined_tile_ids): tile_ids = combined_tile_ids.keys() - # TODO a check that only the expected types of objects are contained - # Check all tuples are the same length # i.e. 
check that all lists are nested to the same depth nesting_depths = [len(id) for id in tile_ids] From d380815de26510fd977269e35b8b9648db279769 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Fri, 30 Nov 2018 18:58:02 +0000 Subject: [PATCH 21/37] Removed format strings --- xarray/core/combine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index 2d5da1053fb..d81849e037b 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -436,7 +436,7 @@ def _check_shape_tile_ids(combined_tile_ids): if len(set(occurrences.values())) != 1: raise ValueError("The supplied objects do not form a hypercube " "because sub-lists do not have consistent " - "lengths along dimension {}".format(str(dim))) + "lengths along dimension" + str(dim)) def _data_vars(combined_id): From c4bb8d0396498154da3521ffe7c9a81a4199cf7b Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Fri, 30 Nov 2018 20:07:54 +0000 Subject: [PATCH 22/37] test_get_new_tile_ids now doesn't assume dicts are ordered --- xarray/tests/test_combine.py | 34 +++++++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index 0edc715698a..b4c3c36f46a 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -7,7 +7,7 @@ import pandas as pd import pytest -from xarray import DataArray, Dataset, Variable, auto_combine, concat +from xarray import DataArray, Dataset, Variable, auto_combine, concat, merge from xarray.core.pycompat import OrderedDict, iteritems from xarray.core.combine import ( _new_tile_id, _auto_combine_all_along_first_dim, @@ -529,9 +529,9 @@ def test_get_new_tile_ids(self, create_combined_ids): shape = (1, 2, 3) combined_ids = create_combined_ids(shape) - for combined, tile_id in zip(combined_ids.items(), _create_tile_ids(shape)): - expected_new_tile_id = tile_id[1:] - assert _new_tile_id(combined) == expected_new_tile_id + expected_tile_ids = sorted(combined_ids.keys()) + actual_tile_ids = _create_tile_ids(shape) + assert expected_tile_ids == actual_tile_ids @pytest.mark.parametrize("concat_dim", ['dim1', 'new_dim']) def test_concat_once(self, create_combined_ids, concat_dim): @@ -547,16 +547,36 @@ def test_concat_once(self, create_combined_ids, concat_dim): expected_ds = concat([ds(0), ds(1)], dim=concat_dim) assert_combined_tile_ids_equal(result, {(): expected_ds}) - def test_concat_twice(self, create_combined_ids): + def test_concat_only_first_dim(self, create_combined_ids): shape = (2, 3) combined_ids = create_combined_ids(shape) - result = _combine_nd(combined_ids, concat_dims=['dim1', 'dim2']) + ds = create_test_data + result = _auto_combine_all_along_first_dim(combined_ids, + dim='dim1', + data_vars='all', + coords='different', + compat='no_conflicts') ds = create_test_data partway1 = concat([ds(0), ds(3)], dim='dim1') partway2 = concat([ds(1), ds(4)], dim='dim1') partway3 = concat([ds(2), ds(5)], dim='dim1') - expected = concat([partway1, partway2, partway3], dim='dim2') + expected_datasets = [partway1, partway2, partway3] + expected = {(i,): ds for i, ds in enumerate(expected_datasets)} + + assert_combined_tile_ids_equal(result, expected) + + @pytest.mark.parametrize("concat_dim", ['dim1', 'new_dim']) + def test_concat_twice(self, create_combined_ids, concat_dim): + shape = (2, 3) + combined_ids = create_combined_ids(shape) + result = _combine_nd(combined_ids, concat_dims=['dim1', concat_dim]) + + ds = create_test_data + partway1 = 
concat([ds(0), ds(3)], dim='dim1') + partway2 = concat([ds(1), ds(4)], dim='dim1') + partway3 = concat([ds(2), ds(5)], dim='dim1') + expected = concat([partway1, partway2, partway3], dim=concat_dim) assert_equal(result, expected) From 6b7f8891c9e3e113b511fded3e6b7d2801e9873e Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Fri, 30 Nov 2018 21:08:42 +0000 Subject: [PATCH 23/37] Fixed failing tests on python3.5 caused by accidentally assuming dict was ordered --- xarray/core/combine.py | 7 ++++--- xarray/tests/test_combine.py | 38 ++++++++++++++++++++++++------------ 2 files changed, 30 insertions(+), 15 deletions(-) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index d81849e037b..aaef8cd4162 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -2,7 +2,7 @@ import warnings import toolz.itertoolz as itertoolz -from collections import Counter +from collections import Counter, OrderedDict import pandas as pd @@ -470,6 +470,7 @@ def _combine_nd(combined_ids, concat_dims, data_vars='all', # Perform N-D dimensional concatenation # Each iteration of this loop reduces the length of the tile_IDs tuples # by one. It always removes the first + for concat_dim in concat_dims: combined_ids = _auto_combine_all_along_first_dim(combined_ids, dim=concat_dim, @@ -487,11 +488,11 @@ def _auto_combine_all_along_first_dim(combined_ids, dim, data_vars, new_combined_ids = {} for new_id, group in grouped.items(): - combined_ids = dict(group) + # TODO is there a way to unpack this object without using OrderedDict? + combined_ids = OrderedDict(sorted(group)) datasets = list(combined_ids.values()) new_combined_ids[new_id] = _auto_combine_1d(datasets, dim, compat, data_vars, coords) - return new_combined_ids diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index b4c3c36f46a..d44ba20ba1c 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -12,7 +12,7 @@ from xarray.core.combine import ( _new_tile_id, _auto_combine_all_along_first_dim, _infer_concat_order_from_positions, _infer_tile_ids_from_nested_list, - _check_shape_tile_ids, _combine_nd, _auto_combine_1d) + _check_shape_tile_ids, _combine_nd, _auto_combine_1d, _auto_combine) from . 
import ( InaccessibleArray, assert_array_equal, assert_equal, assert_identical, @@ -410,15 +410,6 @@ def test_auto_combine_no_concat(self): {'baz': [100]}) assert_identical(expected, actual) - @pytest.mark.xfail(reason="Not yet implemented") - def test_infer_order_from_coords(self): - data = create_test_data() - print(data) - objs = [data.isel(x=slice(4, 9)), data.isel(x=slice(4))] - actual = auto_combine(objs, infer_order_from_coords=True) - expected = data - assert_identical(expected, actual) - class TestTileIDsFromNestedList(object): def test_1d(self): @@ -515,7 +506,8 @@ def create_combined_ids(): def _create_combined_ids(shape): tile_ids = _create_tile_ids(shape) nums = range(len(tile_ids)) - return {tile_id: create_test_data(num) for tile_id, num in zip(tile_ids, nums)} + return {tile_id: create_test_data(num) + for tile_id, num in zip(tile_ids, nums)} def _create_tile_ids(shape): @@ -551,6 +543,7 @@ def test_concat_only_first_dim(self, create_combined_ids): shape = (2, 3) combined_ids = create_combined_ids(shape) ds = create_test_data + print(combined_ids) result = _auto_combine_all_along_first_dim(combined_ids, dim='dim1', data_vars='all', @@ -563,7 +556,7 @@ def test_concat_only_first_dim(self, create_combined_ids): partway3 = concat([ds(2), ds(5)], dim='dim1') expected_datasets = [partway1, partway2, partway3] expected = {(i,): ds for i, ds in enumerate(expected_datasets)} - + assert_combined_tile_ids_equal(result, expected) @pytest.mark.parametrize("concat_dim", ['dim1', 'new_dim']) @@ -648,3 +641,24 @@ def test_combine_concat_over_redundant_nesting(self): actual = auto_combine(objs, concat_dims=['x', None]) expected = Dataset({'x': [0, 1]}) assert_identical(expected, actual) + + +# TODO test for _new_tile_id + +class TestAutoCombineUsingCoords(object): + @pytest.mark.xfail(reason="Not yet implemented") + def test_infer_order_from_coords(self): + # Should pass once inferring order from coords is implemented + data = create_test_data() + objs = [data.isel(dim2=slice(4, 9)), data.isel(dim2=slice(4))] + actual = auto_combine(objs, infer_order_from_coords=True) + expected = data + assert_identical(expected, actual) + + def test_order_inferred_from_coords(self): + data = create_test_data() + objs = [data.isel(dim2=slice(4, 9)), data.isel(dim2=slice(4))] + with pytest.raises(NotImplementedError): + _auto_combine(objs, concat_dims=['dim2'],compat='no_conflicts', + data_vars='all', coords='different', + infer_order_from_coords=True, ids=True) From 58a3648cc2b4a7465f5cacdaa8e6eb67e54ff583 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Fri, 30 Nov 2018 21:21:23 +0000 Subject: [PATCH 24/37] Test for getting new tile id --- xarray/tests/test_combine.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index d44ba20ba1c..87d4efb57f5 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -517,6 +517,15 @@ def _create_tile_ids(shape): @requires_dask # only for toolz class TestCombineND(object): + @pytest.mark.parametrize("old_id, new_id", [((3,0,1), (0,1)), + ((0, 0), (0,)), + ((1,), ()), + ((0,), ()), + ((1, 0), (0,))]) + def test_new_tile_id(self, old_id, new_id): + ds = create_test_data + assert _new_tile_id((old_id, ds)) == new_id + def test_get_new_tile_ids(self, create_combined_ids): shape = (1, 2, 3) combined_ids = create_combined_ids(shape) @@ -643,8 +652,6 @@ def test_combine_concat_over_redundant_nesting(self): assert_identical(expected, actual) -# TODO test for 
_new_tile_id - class TestAutoCombineUsingCoords(object): @pytest.mark.xfail(reason="Not yet implemented") def test_infer_order_from_coords(self): From a12a34a9be51fb16cfebcf13ad0ccb0ef26a4edd Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Fri, 30 Nov 2018 21:51:38 +0000 Subject: [PATCH 25/37] Fixed itertoolz import so that it's compatible with older versions --- xarray/core/combine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index aaef8cd4162..274659b2d61 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -1,7 +1,7 @@ from __future__ import absolute_import, division, print_function import warnings -import toolz.itertoolz as itertoolz +from toolz import itertoolz from collections import Counter, OrderedDict import pandas as pd From ada1f4a4ef501acb571c1724a84f3da2e9b05ac2 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Sat, 1 Dec 2018 13:43:54 +0000 Subject: [PATCH 26/37] Increased test coverage --- xarray/core/combine.py | 7 +------ xarray/tests/test_backends.py | 8 ++++++-- xarray/tests/test_combine.py | 34 +++++++++++++++++++++++++--------- 3 files changed, 32 insertions(+), 17 deletions(-) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index 274659b2d61..cf832b1a9d2 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -2,7 +2,7 @@ import warnings from toolz import itertoolz -from collections import Counter, OrderedDict +from collections import Counter import pandas as pd @@ -439,11 +439,6 @@ def _check_shape_tile_ids(combined_tile_ids): "lengths along dimension" + str(dim)) -def _data_vars(combined_id): - id, ds = combined_id - return tuple(sorted(ds.data_vars)) - - def _combine_nd(combined_ids, concat_dims, data_vars='all', coords='different', compat='no_conflicts'): """ diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 42893cf5d9e..2dc950b9234 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2214,8 +2214,12 @@ def test_open_mfdataset(self): with create_tmp_file() as tmp2: original.isel(x=slice(5)).to_netcdf(tmp1) original.isel(x=slice(5, 10)).to_netcdf(tmp2) - with open_mfdataset([tmp1, tmp2], - infer_order_from_coords=True) as actual: + + with pytest.raises(NotImplementedError): + open_mfdataset([tmp1, tmp2], infer_order_from_coords=True) + + # With infer_order_from_coords=True this should pass in future + with open_mfdataset([tmp1, tmp2]) as actual: assert_identical(original, actual) def test_attrs_mfdataset(self): diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index 87d4efb57f5..9258dc0b5ee 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -497,6 +497,10 @@ def test_infer_from_datasets(self): (input, ['dim1']) assert_combined_tile_ids_equal(expected, actual) + input = [ds(0), ds(1)] + with pytest.raises(ValueError): + _infer_concat_order_from_positions(input, ['dim1', 'extra_dim']) + @pytest.fixture(scope='module') def create_combined_ids(): @@ -602,10 +606,7 @@ def test_check_lengths(self): @requires_dask # only for toolz class TestAutoCombineND(object): - # TODO there should be a lot more tests in here testing different cases - def test_single_dataset(self): - objs = [Dataset({'x': [0]}), Dataset({'x': [1]})] actual = auto_combine(objs) expected = Dataset({'x': [0, 1]}) @@ -640,6 +641,10 @@ def test_invalid_hypercube_input(self): 'consistent depths'): auto_combine(datasets, concat_dims=['dim1', 'dim2']) + datasets = [[ds(0), ds(1)], 
[ds(3), ds(4)]] + with raises_regex(ValueError, 'concat_dims has length'): + auto_combine(datasets, concat_dims=['dim1']) + def test_combine_concat_over_redundant_nesting(self): objs = [[Dataset({'x': [0]}), Dataset({'x': [1]})]] actual = auto_combine(objs, concat_dims=[None, 'x']) @@ -651,16 +656,18 @@ def test_combine_concat_over_redundant_nesting(self): expected = Dataset({'x': [0, 1]}) assert_identical(expected, actual) + objs = [[Dataset({'x': [0]})]] + actual = auto_combine(objs, concat_dims=[None, None]) + expected = Dataset({'x': [0]}) + assert_identical(expected, actual) + class TestAutoCombineUsingCoords(object): - @pytest.mark.xfail(reason="Not yet implemented") - def test_infer_order_from_coords(self): - # Should pass once inferring order from coords is implemented + def test_infer_order_from_coords_not_implemented(self): data = create_test_data() objs = [data.isel(dim2=slice(4, 9)), data.isel(dim2=slice(4))] - actual = auto_combine(objs, infer_order_from_coords=True) - expected = data - assert_identical(expected, actual) + with pytest.raises(NotImplementedError): + auto_combine(objs, infer_order_from_coords=True) def test_order_inferred_from_coords(self): data = create_test_data() @@ -669,3 +676,12 @@ def test_order_inferred_from_coords(self): _auto_combine(objs, concat_dims=['dim2'],compat='no_conflicts', data_vars='all', coords='different', infer_order_from_coords=True, ids=True) + + @pytest.mark.xfail(reason="Not yet implemented") + def test_infer_order_from_coords(self): + # Should pass once inferring order from coords is implemented + data = create_test_data() + objs = [data.isel(dim2=slice(4, 9)), data.isel(dim2=slice(4))] + actual = auto_combine(objs) # but with infer_order_from_coords=True + expected = data + assert_identical(expected, actual) From ef0a30e789521e03a7f5322f488be1dafc208481 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Sat, 1 Dec 2018 15:15:11 +0000 Subject: [PATCH 27/37] Added toolz as an explicit dependency to pass tests on python2.7 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 3b56d9265af..ebc69b18f1c 100644 --- a/setup.py +++ b/setup.py @@ -24,7 +24,7 @@ 'Topic :: Scientific/Engineering', ] -INSTALL_REQUIRES = ['numpy >= 1.12', 'pandas >= 0.19.2'] +INSTALL_REQUIRES = ['numpy >= 1.12', 'pandas >= 0.19.2', 'toolz >= 0.9.0'] TESTS_REQUIRE = ['pytest >= 2.7.1'] if sys.version_info[0] < 3: TESTS_REQUIRE.append('mock') From 3be70bc2f181c1bc1cf04dc4da0154854bc78239 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Sat, 1 Dec 2018 15:40:06 +0000 Subject: [PATCH 28/37] Updated 'what's new' --- doc/whats-new.rst | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 1da1da700e7..1d021a7f98e 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -21,7 +21,7 @@ What's New always be available to python 2.7 users. For more information see the following references - - `Xarray Github issue discussing dropping Python 2 `__ + - `Xarray Github issue discussing dropping Python 2 `__ - `Python 3 Statement `__ - `Tips on porting to Python 3 `__ @@ -33,6 +33,19 @@ v0.11.1 (unreleased) Breaking changes ~~~~~~~~~~~~~~~~ +- Auto-combine along N dimensions: + + - ``open_mfdataset`` and ``auto_combine`` can now combine datasets along any + number of dimensions, instead of just a 1D list of datasets. To combine + along multiple dimensions the datasets must be passed as a nested + list-of-lists. 
+ + Breaking because ``open_mfdataset`` and ``auto_combine`` now expect an + argument ``concat_dims`` instead of ``concat_dim``. ``concat_dims`` accepts + a list of valid ``concat_dim`` arguments, e.g. ``['dim1', 'dim2']``. + (:issue:`2159`) + By `Tom Nicholas `_. + Enhancements ~~~~~~~~~~~~ From f266bc36f082951f2f6b418f57b49ea45668f240 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Sat, 1 Dec 2018 15:52:40 +0000 Subject: [PATCH 29/37] No longer attempts to shortcut all concatenation at once if concat_dims=None --- xarray/core/combine.py | 53 ++++++++++++++++-------------------- xarray/tests/test_combine.py | 5 ++++ 2 files changed, 29 insertions(+), 29 deletions(-) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index cf832b1a9d2..668348ccafe 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -377,7 +377,7 @@ def _infer_concat_order_from_positions(datasets, concat_dims): tile_id, ds = list(combined_ids.items())[0] n_dims = len(tile_id) - if concat_dims == _CONCAT_DIM_DEFAULT or concat_dims == None: + if concat_dims == _CONCAT_DIM_DEFAULT or concat_dims is None: concat_dims = [concat_dims]*n_dims else: if len(concat_dims) != n_dims: @@ -516,39 +516,34 @@ def _new_tile_id(single_id_ds_pair): def _auto_combine(datasets, concat_dims, compat, data_vars, coords, infer_order_from_coords, ids): """ - This function decides if any concatenation is necessary, and if so it calls - the logic to decide their concatenation order before concatenating. + Calls logic to decide concatenation order before concatenating. """ - if concat_dims is not None: - # Arrange datasets for concatenation - if infer_order_from_coords: - raise NotImplementedError - # TODO Use coordinates to determine tile_ID for each dataset in N-D - # Ignore how they were ordered previously - # Should look like - # combined_ids, concat_dims = _infer_tile_ids_from_coords(datasets, - # concat_dims) - else: - # Use information from the shape of the user input - if not ids: - # Determine tile_IDs by structure of input in N-D - # (i.e. ordering in list-of-lists) - combined_ids, concat_dims = _infer_concat_order_from_positions\ + # Arrange datasets for concatenation + if infer_order_from_coords: + raise NotImplementedError + # TODO Use coordinates to determine tile_ID for each dataset in N-D + # Ignore how they were ordered previously + # Should look like: + # combined_ids, concat_dims = _infer_tile_ids_from_coords(datasets, + # concat_dims) + else: + # Use information from the shape of the user input + if not ids: + # Determine tile_IDs by structure of input in N-D + # (i.e. 
ordering in list-of-lists) + combined_ids, concat_dims = _infer_concat_order_from_positions\ (datasets, concat_dims) - else: - # Already sorted so just use the ids already passed - combined_ids = dict(zip(ids, datasets)) + else: + # Already sorted so just use the ids already passed + combined_ids = dict(zip(ids, datasets)) - # Check that the inferred shape is combinable - _check_shape_tile_ids(combined_ids) + # Check that the inferred shape is combinable + _check_shape_tile_ids(combined_ids) - # Repeatedly concatenate then merge along each dimension - combined = _combine_nd(combined_ids, concat_dims, compat=compat, - data_vars=data_vars, coords=coords) - else: - # Case of no concatenation wanted at all - combined = merge(datasets, compat=compat) + # Repeatedly concatenate then merge along each dimension + combined = _combine_nd(combined_ids, concat_dims, compat=compat, + data_vars=data_vars, coords=coords) return combined diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index 9258dc0b5ee..4225099aa05 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -661,6 +661,11 @@ def test_combine_concat_over_redundant_nesting(self): expected = Dataset({'x': [0]}) assert_identical(expected, actual) + objs = [[Dataset({'x': [0]})]] + actual = auto_combine(objs, concat_dims=None) + expected = Dataset({'x': [0]}) + assert_identical(expected, actual) + class TestAutoCombineUsingCoords(object): def test_infer_order_from_coords_not_implemented(self): From 878e1f9d3650d0e090acc9d9395015d9046fb950 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Sat, 1 Dec 2018 20:07:50 +0000 Subject: [PATCH 30/37] Rewrote using itertools.groupby instead of toolz.itertoolz.groupby to remove hidden dependency on toolz --- setup.py | 2 +- xarray/core/combine.py | 32 +++++++++++++++++--------------- xarray/tests/test_combine.py | 3 --- 3 files changed, 18 insertions(+), 19 deletions(-) diff --git a/setup.py b/setup.py index ebc69b18f1c..3b56d9265af 100644 --- a/setup.py +++ b/setup.py @@ -24,7 +24,7 @@ 'Topic :: Scientific/Engineering', ] -INSTALL_REQUIRES = ['numpy >= 1.12', 'pandas >= 0.19.2', 'toolz >= 0.9.0'] +INSTALL_REQUIRES = ['numpy >= 1.12', 'pandas >= 0.19.2'] TESTS_REQUIRE = ['pytest >= 2.7.1'] if sys.version_info[0] < 3: TESTS_REQUIRE.append('mock') diff --git a/xarray/core/combine.py b/xarray/core/combine.py index 668348ccafe..4ae7b519a84 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -1,12 +1,11 @@ from __future__ import absolute_import, division, print_function import warnings -from toolz import itertoolz +import itertools from collections import Counter import pandas as pd -from . import utils from .alignment import align from .merge import merge from .pycompat import OrderedDict, basestring, iteritems @@ -373,7 +372,7 @@ def _auto_concat(datasets, dim=None, data_vars='all', coords='different'): def _infer_concat_order_from_positions(datasets, concat_dims): - combined_ids = dict(_infer_tile_ids_from_nested_list(datasets, ())) + combined_ids = OrderedDict(_infer_tile_ids_from_nested_list(datasets, ())) tile_id, ds = list(combined_ids.items())[0] n_dims = len(tile_id) @@ -463,9 +462,9 @@ def _combine_nd(combined_ids, concat_dims, data_vars='all', # TODO refactor this logic, possibly using method in np.blocks # Perform N-D dimensional concatenation - # Each iteration of this loop reduces the length of the tile_IDs tuples - # by one. 
It always removes the first - + # Each iteration of this loop reduces the length of the tile_ids tuples + # by one. It always combines along the first dimension, removing the first + # element of the tuple for concat_dim in concat_dims: combined_ids = _auto_combine_all_along_first_dim(combined_ids, dim=concat_dim, @@ -479,13 +478,15 @@ def _combine_nd(combined_ids, concat_dims, data_vars='all', def _auto_combine_all_along_first_dim(combined_ids, dim, data_vars, coords, compat): # Group into lines of datasets which must be combined along dim - grouped = itertoolz.groupby(_new_tile_id, combined_ids.items()) + # need to sort by _new_tile_id first for groupby to work + # TODO remove all these sorted OrderedDicts once python >= 3.6 only + combined_ids = OrderedDict(sorted(combined_ids.items(), key=_new_tile_id)) + grouped = itertools.groupby(combined_ids.items(), key=_new_tile_id) new_combined_ids = {} - for new_id, group in grouped.items(): - # TODO is there a way to unpack this object without using OrderedDict? + for new_id, group in grouped: combined_ids = OrderedDict(sorted(group)) - datasets = list(combined_ids.values()) + datasets = combined_ids.values() new_combined_ids[new_id] = _auto_combine_1d(datasets, dim, compat, data_vars, coords) return new_combined_ids @@ -497,11 +498,12 @@ def _auto_combine_1d(datasets, concat_dim=_CONCAT_DIM_DEFAULT, # This is just the old auto_combine function (which only worked along 1D) if concat_dim is not None: dim = None if concat_dim is _CONCAT_DIM_DEFAULT else concat_dim - grouped = itertoolz.groupby(lambda ds: tuple(sorted(ds.data_vars)), - datasets).values() - concatenated = [_auto_concat(ds, dim=dim, + grouped = itertools.groupby(datasets, + key=lambda ds: tuple(sorted(ds.data_vars)), + ) + concatenated = [_auto_concat(list(ds_group), dim=dim, data_vars=data_vars, coords=coords) - for ds in grouped] + for id, ds_group in grouped] else: concatenated = datasets merged = merge(concatenated, compat=compat) @@ -536,7 +538,7 @@ def _auto_combine(datasets, concat_dims, compat, data_vars, coords, (datasets, concat_dims) else: # Already sorted so just use the ids already passed - combined_ids = dict(zip(ids, datasets)) + combined_ids = OrderedDict(zip(ids, datasets)) # Check that the inferred shape is combinable _check_shape_tile_ids(combined_ids) diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index 4225099aa05..1c1313ccf5a 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -485,7 +485,6 @@ def test_uneven_length_input(self): expected = {(0, 0): ds(0), (1, 0): ds(1), (1, 1): ds(2)} actual = dict(_infer_tile_ids_from_nested_list(input, ())) - print(actual) assert_combined_tile_ids_equal(expected, actual) def test_infer_from_datasets(self): @@ -555,8 +554,6 @@ def test_concat_once(self, create_combined_ids, concat_dim): def test_concat_only_first_dim(self, create_combined_ids): shape = (2, 3) combined_ids = create_combined_ids(shape) - ds = create_test_data - print(combined_ids) result = _auto_combine_all_along_first_dim(combined_ids, dim='dim1', data_vars='all', From e6f25a31799b75eaf22a2d05a730aa400736800c Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Sat, 1 Dec 2018 20:25:35 +0000 Subject: [PATCH 31/37] Fixed erroneous removal of utils import --- xarray/core/combine.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index 50607392b76..adb9ed32986 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -6,6 +6,7 @@ 
import pandas as pd +from . import utils from .alignment import align from .merge import merge from .pycompat import OrderedDict, basestring, iteritems @@ -44,7 +45,7 @@ def concat(objs, dim=None, data_vars='all', coords='different', * list of str: The listed data variables will be concatenated, in addition to the 'minimal' data variables. If objects are DataArrays, data_vars must be 'all'. - coords : {'minimal', 'different', 'all' or list of str}, optional + coords : {'minimal', 'different', 'all' o list of str}, optional These coordinate variables will be concatenated together: * 'minimal': Only coordinates in which the dimension already appears are included. From f85648548e325d9a352ffd8ac60261ef33cc2329 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Sun, 2 Dec 2018 19:31:22 +0000 Subject: [PATCH 32/37] Updated docstrings to include an example of multidimensional concatenation --- xarray/backends/api.py | 18 ++++++------- xarray/core/combine.py | 58 +++++++++++++++++++++++++++++++++++------- 2 files changed, 57 insertions(+), 19 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 417e2c016ee..dd176b33310 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -488,14 +488,18 @@ def open_mfdataset(paths, chunks=None, concat_dims=_CONCAT_DIM_DEFAULT, """Open multiple files as a single dataset. Requires dask to be installed. See documentation for details on dask [1]. + Uses ``auto_combine`` to combine the opened datasets - see + ``auto_combine`` for details. Attributes from the first dataset file are used for the combined dataset. Parameters ---------- paths : str or sequence Either a string glob in the form "path/to/my/files/*.nc" or an explicit - list of files to open. Paths can be given as strings or as pathlib - Paths. + list of files to open. Paths can be given as strings or as pathlib + Paths. If concatenation along more than one dimension is desired, then + ``paths`` must be a nested list-of-lists (see ``auto_combine`` for + details). chunks : int or dict, optional Dictionary with keys given by dimension names and values given by chunk sizes. In general, these should divide the dimensions of each dataset. @@ -510,8 +514,8 @@ def open_mfdataset(paths, chunks=None, concat_dims=_CONCAT_DIM_DEFAULT, want to concatenate is not a dimension in the original datasets, e.g., if you want to stack a collection of 2D arrays along a third dimension. By default, xarray attempts to infer this argument by examining - component files. Set ``concat_dim=None`` explicitly to disable - concatenation. + component files. Set ``concat_dims=[..., None, ...]`` explicitly to + disable concatenation along a particular dimension. compat : {'identical', 'equals', 'broadcast_equals', 'no_conflicts'}, optional String indicating how to compare variables of the same name for @@ -562,12 +566,6 @@ def open_mfdataset(paths, chunks=None, concat_dims=_CONCAT_DIM_DEFAULT, those corresponding to other dimensions. * list of str: The listed coordinate variables will be concatenated, in addition the 'minimal' coordinates. - infer_order_from_coords : bool, optional - If true attempt to deduce the order in which the datasets should be - concatenated from their coordinates. To do this the coordinates should - be monotonic along the dimension to be concatenated. - If false instead read the order from the structure the datasets are - supplied in. This structure should be a nested list of lists. 
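The move from ``toolz.itertoolz.groupby`` to ``itertools.groupby`` a few patches above changes the grouping semantics: ``itertools.groupby`` only merges *consecutive* items with equal keys, so the tile-ID/dataset pairs must first be sorted by the same key function, which is exactly what the added ``sorted(..., key=_new_tile_id)`` call provides. A standalone sketch of the idea, with hypothetical names and plain strings standing in for datasets:

    import itertools

    def new_tile_id(pair):
        # Same idea as combine._new_tile_id: group on everything except
        # the first index, i.e. on position in the remaining dimensions.
        tile_id, _ = pair
        return tile_id[1:]

    # Deliberately unordered input, since plain dicts on python < 3.6
    # give no ordering guarantee -- the bug patches 22 and 23 fixed.
    items = [((1, 0), 'c'), ((0, 1), 'b'), ((1, 1), 'd'), ((0, 0), 'a')]

    grouped = [(key, [v for _, v in group])
               for key, group in itertools.groupby(
                   sorted(items, key=new_tile_id), key=new_tile_id)]
    assert grouped == [((0,), ['c', 'a']), ((1,), ['b', 'd'])]

In the real ``_auto_combine_all_along_first_dim`` each group is then re-sorted by its full tile ID (the ``OrderedDict(sorted(group))`` line) so that the datasets are concatenated in the correct order along the first dimension.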
parallel : bool, optional If True, the open and preprocess steps of this function will be performed in parallel using ``dask.delayed``. Default is False. diff --git a/xarray/core/combine.py b/xarray/core/combine.py index adb9ed32986..5d14bf76520 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -561,6 +561,20 @@ def auto_combine(datasets, datasets into a single entity by inspecting metadata and using a combination of concat and merge. + Does not sort data under any circumstances. It does align coordinates, but + different variables on datasets can cause it to fail under some scenarios. + In complex cases, you may need to clean up your data and use concat/merge + explicitly. + + Works well if, for example, you have N years of data and M data variables, + and each combination of a distinct time period and set of data variables is + saved as its own dataset. + + Can concatenate along multiple dimensions. To do this the datasets must be + passed as a nested list-of-lists, with a depth equal to the length of + ``concat_dims``. ``auto_combine`` will concatenate along the top-level list + first. + Parameters ---------- datasets : sequence of xarray.Dataset @@ -572,8 +586,8 @@ def auto_combine(datasets, dimension in the original datasets, e.g., if you want to stack a collection of 2D arrays along a third dimension. By default, xarray attempts to infer this argument by examining - component files. Set ``concat_dim=None`` explicitly to disable - concatenation. + component files. Set ``concat_dims=[..., None, ...]`` explicitly to + disable concatenation along a particular dimension. compat : {'identical', 'equals', 'broadcast_equals', 'no_conflicts'}, optional String indicating how to compare variables of the same name for @@ -591,21 +605,47 @@ def auto_combine(datasets, Details are in the documentation of concat coords : {'minimal', 'different', 'all' or list of str}, optional Details are in the documentation of concat - infer_order_from_coords : bool, optional - If true attempt to deduce the order in which the datasets should be - concatenated from their coordinates. To do this the coordinates should - be monotonic along the dimension to be concatenated. - If false instead read the order from the structure the datasets are - supplied in. This structure should be a nested list of lists. Returns ------- combined : xarray.Dataset + Examples + -------- + + Collecting output from a parallel simulation: + + Collecting data from a simulation which decomposes its domain into 4 parts, + 2 each along both the x and y axes, requires organising the datasets into a + nested list, e.g. + + >>> x1y1 + + Dimensions: (x: 2, y: 2) + Coordinates: + lon (x, y) float64 -99.83 -99.32 -99.79 -99.23 + lat (x, y) float64 42.25 42.21 42.63 42.59 + Dimensions without coordinates: x, y + Data variables: + temperature (x, y) float64 11.04 23.57 20.77 ... + precipitation (x, y) float64 5.904 2.453 3.404 ... + + >>> ds_grid = [[x1y1, x1y2], [x2y1, x2y2]] + >>> combined = xr.auto_combine(ds_grid, concat_dims=['x', 'y']) + + Dimensions: (x: 4, y: 4) + Coordinates: + lon (x, y) float64 -99.83 -99.32 -99.79 -99.23 + lat (x, y) float64 42.25 42.21 42.63 42.59 + Dimensions without coordinates: x, y + Data variables: + temperature (x, y) float64 11.04 23.57 20.77 ... + precipitation (x, y) float64 5.904 2.453 3.404 ... 
+ See also -------- concat - Dataset.merge + merge """ # The IDs argument tells _auto_combine that the datasets are not yet sorted From 6305d83988be8d7807d90c580d5e01ce72293012 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Wed, 5 Dec 2018 16:26:15 +0000 Subject: [PATCH 33/37] Clarified auto_combine docstring for N-D behaviour --- xarray/core/combine.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index 5d14bf76520..19ad58b5224 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -461,7 +461,6 @@ def _combine_nd(combined_ids, concat_dims, data_vars='all', combined_ds : xarray.Dataset """ - # TODO refactor this logic, possibly using method in np.blocks # Perform N-D dimensional concatenation # Each iteration of this loop reduces the length of the tile_ids tuples # by one. It always combines along the first dimension, removing the first @@ -557,8 +556,8 @@ def auto_combine(datasets, infer_order_from_coords=False): """Attempt to auto-magically combine the given datasets into one. - This method attempts to combine a list (or nested list of lists) of - datasets into a single entity by inspecting metadata and using a + This method attempts to combine a group of datasets along any number of + dimensions into a single entity by inspecting metadata and using a combination of concat and merge. Does not sort data under any circumstances. It does align coordinates, but @@ -577,8 +576,11 @@ def auto_combine(datasets, Parameters ---------- - datasets : sequence of xarray.Dataset - Dataset objects to merge. + datasets : sequence of xarray.Dataset, or nested list of xarray.Dataset + objects. + Dataset objects to combine. + If concatenation along more than one dimension is desired, then + datasets must be supplied in a nested list-of-lists. concat_dims : list of str, DataArray, Index or None, optional Dimensions along which to concatenate variables, as used by :py:func:`xarray.concat`. You only need to provide this argument if @@ -588,6 +590,8 @@ def auto_combine(datasets, By default, xarray attempts to infer this argument by examining component files. Set ``concat_dims=[..., None, ...]`` explicitly to disable concatenation along a particular dimension. + Must be the same length as the depth of the list passed to + ``datasets``. 
compat : {'identical', 'equals', 'broadcast_equals', 'no_conflicts'}, optional String indicating how to compare variables of the same name for From ce59da139c678b5d20dedf9367635e7d12c6cd08 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Mon, 10 Dec 2018 15:14:01 +0000 Subject: [PATCH 34/37] Added unit test for nested list of Datasets with different variables --- xarray/tests/test_combine.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index 1c1313ccf5a..b523ec11e50 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -642,6 +642,24 @@ def test_invalid_hypercube_input(self): with raises_regex(ValueError, 'concat_dims has length'): auto_combine(datasets, concat_dims=['dim1']) + def test_merge_one_dim_concat_another(self): + objs = [[Dataset({'foo': ('x', [0, 1])}), Dataset({'bar': ('x', [10, 20])})], + [Dataset({'foo': ('x', [2, 3])}), Dataset({'bar': ('x', [30, 40])})]] + expected = Dataset({'foo': ('x', [0, 1, 2, 3]), + 'bar': ('x', [10, 20, 30, 40])}) + + actual = auto_combine(objs, concat_dims=['x', None]) + assert_identical(expected, actual) + + actual = auto_combine(objs) + assert_identical(expected, actual) + + # Proving it works symmetrically + objs = [[Dataset({'foo': ('x', [0, 1])}), Dataset({'foo': ('x', [2, 3])})], + [Dataset({'bar': ('x', [10, 20])}), Dataset({'bar': ('x', [30, 40])})]] + actual = auto_combine(objs, concat_dims=[None, 'x']) + assert_identical(expected, actual) + def test_combine_concat_over_redundant_nesting(self): objs = [[Dataset({'x': [0]}), Dataset({'x': [1]})]] actual = auto_combine(objs, concat_dims=[None, 'x']) From 9fb34cf8e402a02dbf374ca3084d74cf08bf1f71 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Mon, 10 Dec 2018 15:33:09 +0000 Subject: [PATCH 35/37] Minor spelling and pep8 fixes --- xarray/backends/api.py | 8 ++++---- xarray/core/combine.py | 12 +++++------- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index dd176b33310..1b4935f23b0 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -602,10 +602,10 @@ def open_mfdataset(paths, chunks=None, concat_dims=_CONCAT_DIM_DEFAULT, # If infer_order_from_coords=True then this is unnecessary, but quick. # If infer_order_from_coords=False then this creates a flat list which is # easier to iterate over, while saving the originally-supplied structure - combined_ids_paths, concat_dims = _infer_concat_order_from_positions\ - (paths, concat_dims) - ids, paths = list(combined_ids_paths.keys()), \ - list(combined_ids_paths.values()) + combined_ids_paths, concat_dims = _infer_concat_order_from_positions( + paths, concat_dims) + ids, paths = ( + list(combined_ids_paths.keys()), list(combined_ids_paths.values())) open_kwargs = dict(engine=engine, chunks=chunks or {}, lock=lock, autoclose=autoclose, **kwargs) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index 19ad58b5224..a0e01cda2b7 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -45,7 +45,7 @@ def concat(objs, dim=None, data_vars='all', coords='different', * list of str: The listed data variables will be concatenated, in addition to the 'minimal' data variables. If objects are DataArrays, data_vars must be 'all'. 
- coords : {'minimal', 'different', 'all' o list of str}, optional + coords : {'minimal', 'different', 'all' or list of str}, optional These coordinate variables will be concatenated together: * 'minimal': Only coordinates in which the dimension already appears are included. @@ -393,7 +393,7 @@ def _infer_tile_ids_from_nested_list(entry, current_pos): Given a list of lists (of lists...) of objects, returns a iterator which returns a tuple containing the index of each object in the nested list structure as the key, and the object. This can then be called by the - dict constructor to create a dictionary of the objects organised byt their + dict constructor to create a dictionary of the objects organised by their position in the original nested list. Recursively traverses the given structure, while keeping track of the @@ -424,14 +424,14 @@ def _check_shape_tile_ids(combined_tile_ids): # Check all tuples are the same length # i.e. check that all lists are nested to the same depth - nesting_depths = [len(id) for id in tile_ids] + nesting_depths = [len(tile_id) for tile_id in tile_ids] if not set(nesting_depths) == {nesting_depths[0]}: raise ValueError("The supplied objects do not form a hypercube because" " sub-lists do not have consistent depths") # Check all lists along one dimension are same length for dim in range(nesting_depths[0]): - indices_along_dim = [id[dim] for id in tile_ids] + indices_along_dim = [tile_id[dim] for tile_id in tile_ids] occurrences = Counter(indices_along_dim) if len(set(occurrences.values())) != 1: raise ValueError("The supplied objects do not form a hypercube " @@ -498,9 +498,7 @@ def _auto_combine_1d(datasets, concat_dim=_CONCAT_DIM_DEFAULT, # This is just the old auto_combine function (which only worked along 1D) if concat_dim is not None: dim = None if concat_dim is _CONCAT_DIM_DEFAULT else concat_dim - grouped = itertools.groupby(datasets, - key=lambda ds: tuple(sorted(ds.data_vars)), - ) + grouped = itertools.groupby(datasets, key=lambda ds: tuple(sorted(ds))) concatenated = [_auto_concat(list(ds_group), dim=dim, data_vars=data_vars, coords=coords) for id, ds_group in grouped] From f47486f7552909d0f724350dddf6402419f0e24d Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Tue, 11 Dec 2018 10:50:16 +0000 Subject: [PATCH 36/37] Reverted API so that N-D generalisation is hidden --- doc/whats-new.rst | 13 ----- setup.py | 1 - xarray/backends/api.py | 38 +++++++------ xarray/core/combine.py | 101 +++++++++++----------------------- xarray/tests/test_backends.py | 12 ++-- xarray/tests/test_combine.py | 32 +++++------ 6 files changed, 74 insertions(+), 123 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 45c3ce35639..c058c5ce068 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -33,19 +33,6 @@ v0.11.1 (unreleased) Breaking changes ~~~~~~~~~~~~~~~~ -- Auto-combine along N dimensions: - - - ``open_mfdataset`` and ``auto_combine`` can now combine datasets along any - number of dimensions, instead of just a 1D list of datasets. To combine - along multiple dimensions the datasets must be passed as a nested - list-of-lists. - - Breaking because ``open_mfdataset`` and ``auto_combine`` now expect an - argument ``concat_dims`` instead of ``concat_dim``. ``concat_dims`` accepts - a list of valid ``concat_dim`` arguments, e.g. ``['dim1', 'dim2']``. - (:issue:`2159`) - By `Tom Nicholas `_. 
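The ``_check_shape_tile_ids`` hunk just above validates the inferred structure in two stages: consistent nesting depth first, then a ``Counter`` over the indices along each dimension. A small sketch of the same two checks on hypothetical tile IDs for a ragged 2-D input:

    from collections import Counter

    # Hypothetical tile IDs for a ragged grid: the second row is short.
    tile_ids = [(0, 0), (0, 1), (1, 0)]

    # Stage 1: every ID must have the same length (same nesting depth).
    nesting_depths = [len(tile_id) for tile_id in tile_ids]
    assert len(set(nesting_depths)) == 1

    # Stage 2: along each dimension every index must occur equally
    # often, otherwise the input cannot form a hypercube. A ragged grid
    # trips this along both dimensions here.
    for dim in range(nesting_depths[0]):
        occurrences = Counter(tile_id[dim] for tile_id in tile_ids)
        if len(set(occurrences.values())) != 1:
            print('not a hypercube along dimension ' + str(dim))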
- Enhancements ~~~~~~~~~~~~ diff --git a/setup.py b/setup.py index a9104156764..eaf57dff81d 100644 --- a/setup.py +++ b/setup.py @@ -24,7 +24,6 @@ 'Topic :: Scientific/Engineering', ] - INSTALL_REQUIRES = ['numpy >= 1.12', 'pandas >= 0.19.2'] SETUP_REQUIRES = ['pytest-runner >= 4.2'] TESTS_REQUIRE = ['pytest >= 2.7.1'] diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 1b4935f23b0..008606d6eb8 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -480,26 +480,20 @@ def close(self): _CONCAT_DIM_DEFAULT = '__infer_concat_dim__' -def open_mfdataset(paths, chunks=None, concat_dims=_CONCAT_DIM_DEFAULT, +def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT, compat='no_conflicts', preprocess=None, engine=None, lock=None, data_vars='all', coords='different', infer_order_from_coords=False, autoclose=None, parallel=False, **kwargs): """Open multiple files as a single dataset. - Requires dask to be installed. See documentation for details on dask [1]. - Uses ``auto_combine`` to combine the opened datasets - see - ``auto_combine`` for details. Attributes from the first dataset file are used for the combined dataset. - Parameters ---------- paths : str or sequence Either a string glob in the form "path/to/my/files/*.nc" or an explicit - list of files to open. Paths can be given as strings or as pathlib - Paths. If concatenation along more than one dimension is desired, then - ``paths`` must be a nested list-of-lists (see ``auto_combine`` for - details). + list of files to open. Paths can be given as strings or as pathlib + Paths. chunks : int or dict, optional Dictionary with keys given by dimension names and values given by chunk sizes. In general, these should divide the dimensions of each dataset. @@ -507,20 +501,19 @@ def open_mfdataset(paths, chunks=None, concat_dims=_CONCAT_DIM_DEFAULT, By default, chunks will be chosen to load entire input files into memory at once. This has a major impact on performance: please see the full documentation for more details [2]. - concat_dims : list of str, DataArray, Index or None, optional - Dimensions to concatenate files along. This argument is passed on to + concat_dim : None, str, DataArray or Index, optional + Dimension to concatenate files along. This argument is passed on to :py:func:`xarray.auto_combine` along with the dataset objects. You only - need to provide this argument if any of the dimensions along which you - want to concatenate is not a dimension in the original datasets, e.g., - if you want to stack a collection of 2D arrays along a third dimension. + need to provide this argument if the dimension along which you want to + concatenate is not a dimension in the original datasets, e.g., if you + want to stack a collection of 2D arrays along a third dimension. By default, xarray attempts to infer this argument by examining - component files. Set ``concat_dims=[..., None, ...]`` explicitly to - disable concatenation along a particular dimension. + component files. Set ``concat_dim=None`` explicitly to disable + concatenation. compat : {'identical', 'equals', 'broadcast_equals', 'no_conflicts'}, optional String indicating how to compare variables of the same name for potential conflicts when merging: - - 'broadcast_equals': all values must be equal when variables are broadcast against each other to ensure common dimensions. - 'equals': all values and dimensions must be the same. @@ -583,6 +576,7 @@ def open_mfdataset(paths, chunks=None, concat_dims=_CONCAT_DIM_DEFAULT, References ---------- + .. 
[1] http://xarray.pydata.org/en/stable/dask.html .. [2] http://xarray.pydata.org/en/stable/dask.html#chunking-and-performance """ @@ -599,6 +593,16 @@ def open_mfdataset(paths, chunks=None, concat_dims=_CONCAT_DIM_DEFAULT, if not paths: raise IOError('no files to open') + # Coerce 1D input into ND to maintain backwards-compatible API until API + # for N-D combine decided + # (see https://github.com/pydata/xarray/pull/2553/#issuecomment-445892746) + if concat_dim is None or concat_dim == _CONCAT_DIM_DEFAULT: + concat_dims = concat_dim + elif not isinstance(concat_dim, list): + concat_dims = [concat_dim] + else: + concat_dims = concat_dim + # If infer_order_from_coords=True then this is unnecessary, but quick. # If infer_order_from_coords=False then this creates a flat list which is # easier to iterate over, while saving the originally-supplied structure diff --git a/xarray/core/combine.py b/xarray/core/combine.py index a0e01cda2b7..6b033dc20a9 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -548,53 +548,37 @@ def _auto_combine(datasets, concat_dims, compat, data_vars, coords, def auto_combine(datasets, - concat_dims=_CONCAT_DIM_DEFAULT, + concat_dim=_CONCAT_DIM_DEFAULT, compat='no_conflicts', data_vars='all', coords='different', infer_order_from_coords=False): """Attempt to auto-magically combine the given datasets into one. - - This method attempts to combine a group of datasets along any number of - dimensions into a single entity by inspecting metadata and using a - combination of concat and merge. - - Does not sort data under any circumstances. It does align coordinates, but - different variables on datasets can cause it to fail under some scenarios. - In complex cases, you may need to clean up your data and use concat/merge - explicitly. - - Works well if, for example, you have N years of data and M data variables, - and each combination of a distinct time period and set of data variables is - saved as its own dataset. - - Can concatenate along multiple dimensions. To do this the datasets must be - passed as a nested list-of-lists, with a depth equal to the length of - ``concat_dims``. ``auto_combine`` will concatenate along the top-level list - first. - + This method attempts to combine a list of datasets into a single entity by + inspecting metadata and using a combination of concat and merge. + It does not concatenate along more than one dimension or sort data under + any circumstances. It does align coordinates, but different variables on + datasets can cause it to fail under some scenarios. In complex cases, you + may need to clean up your data and use ``concat``/``merge`` explicitly. + ``auto_combine`` works well if you have N years of data and M data + variables, and each combination of a distinct time period and set of data + variables is saved its own dataset. Parameters ---------- - datasets : sequence of xarray.Dataset, or nested list of xarray.Dataset - objects. - Dataset objects to combine. - If concatenation along more than one dimension is desired, then - datasets must be supplied in a nested list-of-lists. - concat_dims : list of str, DataArray, Index or None, optional - Dimensions along which to concatenate variables, as used by + datasets : sequence of xarray.Dataset + Dataset objects to merge. + concat_dim : str or DataArray or Index, optional + Dimension along which to concatenate variables, as used by :py:func:`xarray.concat`. 
You only need to provide this argument if - any of the dimensions along which you want to concatenate are not a - dimension in the original datasets, e.g., if you want to stack a - collection of 2D arrays along a third dimension. + the dimension along which you want to concatenate is not a dimension + in the original datasets, e.g., if you want to stack a collection of + 2D arrays along a third dimension. By default, xarray attempts to infer this argument by examining - component files. Set ``concat_dims=[..., None, ...]`` explicitly to - disable concatenation along a particular dimension. - Must be the same length as the depth of the list passed to - ``datasets``. + component files. Set ``concat_dim=None`` explicitly to disable + concatenation. compat : {'identical', 'equals', 'broadcast_equals', 'no_conflicts'}, optional String indicating how to compare variables of the same name for potential conflicts: - - 'broadcast_equals': all values must be equal when variables are broadcast against each other to ensure common dimensions. - 'equals': all values and dimensions must be the same. @@ -606,50 +590,27 @@ def auto_combine(datasets, data_vars : {'minimal', 'different', 'all' or list of str}, optional Details are in the documentation of concat coords : {'minimal', 'different', 'all' or list of str}, optional - Details are in the documentation of concat - + Details are in the documentation of conca Returns ------- combined : xarray.Dataset - Examples - -------- - - Collecting output from a parallel simulation: - - Collecting data from a simulation which decomposes its domain into 4 parts, - 2 each along both the x and y axes, requires organising the datasets into a - nested list, e.g. - - >>> x1y1 - - Dimensions: (x: 2, y: 2) - Coordinates: - lon (x, y) float64 -99.83 -99.32 -99.79 -99.23 - lat (x, y) float64 42.25 42.21 42.63 42.59 - Dimensions without coordinates: x, y - Data variables: - temperature (x, y) float64 11.04 23.57 20.77 ... - precipitation (x, y) float64 5.904 2.453 3.404 ... - - >>> ds_grid = [[x1y1, x1y2], [x2y1, x2y2]] - >>> combined = xr.auto_combine(ds_grid, concat_dims=['x', 'y']) - - Dimensions: (x: 4, y: 4) - Coordinates: - lon (x, y) float64 -99.83 -99.32 -99.79 -99.23 - lat (x, y) float64 42.25 42.21 42.63 42.59 - Dimensions without coordinates: x, y - Data variables: - temperature (x, y) float64 11.04 23.57 20.77 ... - precipitation (x, y) float64 5.904 2.453 3.404 ... 
- See also -------- concat - merge + Dataset.merge """ + # Coerce 1D input into ND to maintain backwards-compatible API until API + # for N-D combine decided + # (see https://github.com/pydata/xarray/pull/2553/#issuecomment-445892746) + if concat_dim is None or concat_dim == _CONCAT_DIM_DEFAULT: + concat_dims = concat_dim + elif not isinstance(concat_dim, list): + concat_dims = [concat_dim] + else: + concat_dims = concat_dim + # The IDs argument tells _auto_combine that the datasets are not yet sorted return _auto_combine(datasets, concat_dims=concat_dims, compat=compat, data_vars=data_vars, coords=coords, diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 2dc950b9234..ea3d099e6ad 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2158,7 +2158,7 @@ def test_open_mfdataset_2d(self): y=slice(4, 8)).to_netcdf(tmp4) with open_mfdataset([[tmp1, tmp2], [tmp3, tmp4]], - concat_dims=['y', 'x']) as actual: + concat_dim=['y', 'x']) as actual: assert isinstance(actual.foo.variable.data, da.Array) assert actual.foo.variable.data.chunks == \ @@ -2166,7 +2166,7 @@ def test_open_mfdataset_2d(self): assert_identical(original, actual) with open_mfdataset([[tmp1, tmp2], [tmp3, tmp4]], - concat_dims=['y', 'x'], + concat_dim=['y', 'x'], chunks={'x': 3, 'y': 2}) as actual: assert actual.foo.variable.data.chunks == \ ((3, 2, 3, 2), (2, 2, 2, 2),) @@ -2204,7 +2204,7 @@ def test_open_mfdataset_2d_pathlib(self): y=slice(4, 8)).to_netcdf(tmp4) with open_mfdataset([[tmp1, tmp2], [tmp3, tmp4]], - concat_dims=['y', 'x']) as actual: + concat_dim=['y', 'x']) as actual: assert_identical(original, actual) @pytest.mark.xfail(reason="Not yet implemented") @@ -2303,7 +2303,7 @@ def test_open_mfdataset_concat_dim_none(self): data = Dataset({'x': 0}) data.to_netcdf(tmp1) Dataset({'x': np.nan}).to_netcdf(tmp2) - with open_mfdataset([tmp1, tmp2], concat_dims=None) as actual: + with open_mfdataset([tmp1, tmp2], concat_dim=None) as actual: assert_identical(data, actual) def test_open_dataset(self): @@ -2330,7 +2330,7 @@ def test_open_single_dataset(self): {'baz': [100]}) with create_tmp_file() as tmp: original.to_netcdf(tmp) - with open_mfdataset([tmp], concat_dims=[dim]) as actual: + with open_mfdataset([tmp], concat_dim=dim) as actual: assert_identical(expected, actual) def test_dask_roundtrip(self): @@ -2694,7 +2694,7 @@ def test_uamiv_format_mfread(self): ['example.uamiv', 'example.uamiv'], engine='pseudonetcdf', - concat_dims=['TSTEP'], + concat_dim=['TSTEP'], backend_kwargs={'format': 'uamiv'}) data1 = np.arange(20, dtype='f').reshape(1, 1, 4, 5) diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index b523ec11e50..8e90b705124 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -356,7 +356,7 @@ def test_auto_combine_previously_failed(self): expected = Dataset({'a': (('t', 'x'), [[np.nan, 2, 3], [1, 2, np.nan]])}, {'x': [0, 1, 2]}) - actual = auto_combine(datasets, concat_dims=['t']) + actual = auto_combine(datasets, concat_dim='t') assert_identical(expected, actual) @requires_dask # only for toolz @@ -381,21 +381,21 @@ def test_auto_combine_no_concat(self): assert_identical(expected, actual) data = Dataset({'x': 0}) - actual = auto_combine([data, data, data], concat_dims=None) + actual = auto_combine([data, data, data], concat_dim=None) assert_identical(data, actual) tmp1 = Dataset({'x': 0}) tmp2 = Dataset({'x': np.nan}) - actual = auto_combine([tmp1, tmp2], concat_dims=None) + actual = 
auto_combine([tmp1, tmp2], concat_dim=None)
         assert_identical(tmp1, actual)
 
-        actual = auto_combine([tmp1, tmp2], concat_dims=[None])
+        actual = auto_combine([tmp1, tmp2], concat_dim=[None])
         assert_identical(tmp1, actual)
 
         # Single object, with a concat_dim explicitly provided
         # Test the issue reported in GH #1988
         objs = [Dataset({'x': 0, 'y': 1})]
         dim = DataArray([100], name='baz', dims='baz')
-        actual = auto_combine(objs, concat_dims=[dim])
+        actual = auto_combine(objs, concat_dim=dim)
         expected = Dataset({'x': ('baz', [0]), 'y': ('baz', [1])},
                            {'baz': [100]})
         assert_identical(expected, actual)
@@ -404,7 +404,7 @@ def test_auto_combine_no_concat(self):
         # expected for non-scalar values, too.
         objs = [Dataset({'x': ('z', [0, 1]), 'y': ('z', [1, 2])})]
         dim = DataArray([100], name='baz', dims='baz')
-        actual = auto_combine(objs, concat_dims=[dim])
+        actual = auto_combine(objs, concat_dim=dim)
         expected = Dataset({'x': (('baz', 'z'), [[0, 1]]),
                             'y': (('baz', 'z'), [[1, 2]])},
                            {'baz': [100]})
@@ -621,7 +621,7 @@ def test_auto_combine_2d(self):
         expected = concat([partway1, partway2, partway3], dim='dim2')
 
         datasets = [[ds(0), ds(1), ds(2)], [ds(3), ds(4), ds(5)]]
-        result = auto_combine(datasets, concat_dims=['dim1', 'dim2'])
+        result = auto_combine(datasets, concat_dim=['dim1', 'dim2'])
 
         assert_equal(result, expected)
 
@@ -631,16 +631,16 @@ def test_invalid_hypercube_input(self):
         datasets = [[ds(0), ds(1), ds(2)], [ds(3), ds(4)]]
         with raises_regex(ValueError, 'sub-lists do not have '
                                       'consistent lengths'):
-            auto_combine(datasets, concat_dims=['dim1', 'dim2'])
+            auto_combine(datasets, concat_dim=['dim1', 'dim2'])
 
         datasets = [[ds(0), ds(1)], [[ds(3), ds(4)]]]
         with raises_regex(ValueError, 'sub-lists do not have '
                                       'consistent depths'):
-            auto_combine(datasets, concat_dims=['dim1', 'dim2'])
+            auto_combine(datasets, concat_dim=['dim1', 'dim2'])
 
         datasets = [[ds(0), ds(1)], [ds(3), ds(4)]]
         with raises_regex(ValueError, 'concat_dims has length'):
-            auto_combine(datasets, concat_dims=['dim1'])
+            auto_combine(datasets, concat_dim=['dim1'])
 
     def test_merge_one_dim_concat_another(self):
         objs = [[Dataset({'foo': ('x', [0, 1])}), Dataset({'bar': ('x', [10, 20])})],
         expected = Dataset({'foo': ('x', [0, 1, 2, 3]),
                             'bar': ('x', [10, 20, 30, 40])})
 
-        actual = auto_combine(objs, concat_dims=['x', None])
+        actual = auto_combine(objs, concat_dim=['x', None])
         assert_identical(expected, actual)
 
         actual = auto_combine(objs)
@@ -657,27 +657,27 @@ def test_merge_one_dim_concat_another(self):
         # Proving it works symmetrically
         objs = [[Dataset({'foo': ('x', [0, 1])}), Dataset({'foo': ('x', [2, 3])})],
                 [Dataset({'bar': ('x', [10, 20])}), Dataset({'bar': ('x', [30, 40])})]]
-        actual = auto_combine(objs, concat_dims=[None, 'x'])
+        actual = auto_combine(objs, concat_dim=[None, 'x'])
         assert_identical(expected, actual)
 
     def test_combine_concat_over_redundant_nesting(self):
         objs = [[Dataset({'x': [0]}), Dataset({'x': [1]})]]
-        actual = auto_combine(objs, concat_dims=[None, 'x'])
+        actual = auto_combine(objs, concat_dim=[None, 'x'])
         expected = Dataset({'x': [0, 1]})
         assert_identical(expected, actual)
 
         objs = [[Dataset({'x': [0]})], [Dataset({'x': [1]})]]
-        actual = auto_combine(objs, concat_dims=['x', None])
+        actual = auto_combine(objs, concat_dim=['x', None])
         expected = Dataset({'x': [0, 1]})
         assert_identical(expected, actual)
 
         objs = [[Dataset({'x': [0]})]]
-        actual = auto_combine(objs, concat_dims=[None, None])
+        actual = auto_combine(objs, concat_dim=[None, None])
         expected = Dataset({'x': [0]})
         assert_identical(expected, actual)
 
         objs = [[Dataset({'x': [0]})]]
-        actual = auto_combine(objs, concat_dims=None)
+        actual = auto_combine(objs, concat_dim=None)
         expected = Dataset({'x': [0]})
         assert_identical(expected, actual)

From ebbe47f450ed4407655bd9a4ed45274b140452dd Mon Sep 17 00:00:00 2001
From: Thomas Nicholas
Date: Wed, 12 Dec 2018 00:52:51 +0000
Subject: [PATCH 37/37] Removed infer_order_from_coords argument

---
 xarray/backends/api.py       |  2 +-
 xarray/core/combine.py       | 15 +++++++--------
 xarray/tests/test_combine.py |  8 +-------
 3 files changed, 9 insertions(+), 16 deletions(-)

diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index 008606d6eb8..fdc7badde20 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -483,7 +483,6 @@ def close(self):
 def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,
                    compat='no_conflicts', preprocess=None, engine=None,
                    lock=None, data_vars='all', coords='different',
-                   infer_order_from_coords=False,
                    autoclose=None, parallel=False, **kwargs):
     """Open multiple files as a single dataset.
 
     Requires dask to be installed. See documentation for details on dask [1].
@@ -602,6 +601,7 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,
         concat_dims = [concat_dim]
     else:
         concat_dims = concat_dim
+    infer_order_from_coords = False
 
     # If infer_order_from_coords=True then this is unnecessary, but quick.
     # If infer_order_from_coords=False then this creates a flat list which is
diff --git a/xarray/core/combine.py b/xarray/core/combine.py
index 6b033dc20a9..c9924b2ad1e 100644
--- a/xarray/core/combine.py
+++ b/xarray/core/combine.py
@@ -381,9 +381,10 @@ def _infer_concat_order_from_positions(datasets, concat_dims):
         concat_dims = [concat_dims]*n_dims
     else:
         if len(concat_dims) != n_dims:
-            raise ValueError("concat_dims has length " + str(len(concat_dims))
-                             + " but the datasets passed are nested in a " +
-                             str(n_dims) + "-dimensional structure")
+            raise ValueError("concat_dims has length {} but the datasets "
+                             "passed are nested in a {}-dimensional "
+                             "structure".format(str(len(concat_dims)),
+                                                str(n_dims)))
     return combined_ids, concat_dims
@@ -547,11 +548,8 @@ def _auto_combine(datasets, concat_dims, compat, data_vars, coords,
     return combined
 
 
-def auto_combine(datasets,
-                 concat_dim=_CONCAT_DIM_DEFAULT,
-                 compat='no_conflicts',
-                 data_vars='all', coords='different',
-                 infer_order_from_coords=False):
+def auto_combine(datasets, concat_dim=_CONCAT_DIM_DEFAULT,
+                 compat='no_conflicts', data_vars='all', coords='different'):
     """Attempt to auto-magically combine the given datasets into one.
 
     This method attempts to combine a list of datasets into a single entity by
     inspecting metadata and using a combination of concat and merge.
@@ -610,6 +608,7 @@ def auto_combine(datasets,
         concat_dims = [concat_dim]
     else:
         concat_dims = concat_dim
+    infer_order_from_coords = False
 
     # The IDs argument tells _auto_combine that the datasets are not yet sorted
     return _auto_combine(datasets, concat_dims=concat_dims, compat=compat,
diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py
index 8e90b705124..ec2288b1d2d 100644
--- a/xarray/tests/test_combine.py
+++ b/xarray/tests/test_combine.py
@@ -683,17 +683,11 @@ def test_combine_concat_over_redundant_nesting(self):
 
 
 class TestAutoCombineUsingCoords(object):
-    def test_infer_order_from_coords_not_implemented(self):
-        data = create_test_data()
-        objs = [data.isel(dim2=slice(4, 9)), data.isel(dim2=slice(4))]
-        with pytest.raises(NotImplementedError):
-            auto_combine(objs, infer_order_from_coords=True)
-
     def test_order_inferred_from_coords(self):
         data = create_test_data()
         objs = [data.isel(dim2=slice(4, 9)), data.isel(dim2=slice(4))]
         with pytest.raises(NotImplementedError):
-            _auto_combine(objs, concat_dims=['dim2'],compat='no_conflicts',
+            _auto_combine(objs, concat_dims=['dim2'], compat='no_conflicts',
                           data_vars='all', coords='different',
                           infer_order_from_coords=True, ids=True)
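
Usage sketch (not part of the patches): after this series, auto_combine accepts a
nested list of datasets and one concat_dim entry per nesting level, with None
meaning "merge at this level". A minimal example mirroring
test_merge_one_dim_concat_another above, assuming this branch is installed (the
released xarray signature may differ):

    import xarray as xr

    # 2x2 grid of datasets: the outer list level is concatenated along 'x',
    # while the inner level is merged (its concat_dim entry is None).
    objs = [[xr.Dataset({'foo': ('x', [0, 1])}), xr.Dataset({'bar': ('x', [10, 20])})],
            [xr.Dataset({'foo': ('x', [2, 3])}), xr.Dataset({'bar': ('x', [30, 40])})]]

    combined = xr.auto_combine(objs, concat_dim=['x', None])
    # combined should be identical to
    # Dataset({'foo': ('x', [0, 1, 2, 3]), 'bar': ('x', [10, 20, 30, 40])})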