diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index b1d5b92da4d..c058c5ce068 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -21,7 +21,7 @@ What's New
     always be available to python 2.7 users. For more information see the
     following references
 
-    - `Xarray Github issue discussing dropping Python 2 `__
+    - `Xarray Github issue discussing dropping Python 2 `__
     - `Python 3 Statement `__
     - `Tips on porting to Python 3 `__
 
diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index c1ace7774f9..fdc7badde20 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -10,7 +10,7 @@
 
 from .. import Dataset, backends, conventions
 from ..core import indexing
-from ..core.combine import auto_combine
+from ..core.combine import _infer_concat_order_from_positions, _auto_combine
 from ..core.pycompat import basestring, path_type
 from ..core.utils import close_on_error, is_remote_uri, is_grib_path
 from .common import ArrayWriter
@@ -485,10 +485,8 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,
                    lock=None, data_vars='all', coords='different',
                    autoclose=None, parallel=False, **kwargs):
     """Open multiple files as a single dataset.
-
     Requires dask to be installed. See documentation for details on dask [1].
     Attributes from the first dataset file are used for the combined dataset.
-
     Parameters
     ----------
     paths : str or sequence
@@ -515,7 +513,6 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,
         'no_conflicts'}, optional
         String indicating how to compare variables of the same name for
         potential conflicts when merging:
-
         - 'broadcast_equals': all values must be equal when variables are
           broadcast against each other to ensure common dimensions.
         - 'equals': all values and dimensions must be the same.
@@ -578,6 +575,7 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,
 
     References
     ----------
+    .. [1] http://xarray.pydata.org/en/stable/dask.html
    .. [2] http://xarray.pydata.org/en/stable/dask.html#chunking-and-performance
     """
@@ -594,6 +592,25 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,
     if not paths:
         raise IOError('no files to open')
 
+    # Coerce 1D input into ND to maintain backwards-compatible API until API
+    # for N-D combine decided
+    # (see https://github.com/pydata/xarray/pull/2553#issuecomment-445892746)
+    if concat_dim is None or concat_dim == _CONCAT_DIM_DEFAULT:
+        concat_dims = concat_dim
+    elif not isinstance(concat_dim, list):
+        concat_dims = [concat_dim]
+    else:
+        concat_dims = concat_dim
+    infer_order_from_coords = False
+
+    # If infer_order_from_coords=True then this is unnecessary, but quick.
+    # If infer_order_from_coords=False then this creates a flat list which is
+    # easier to iterate over, while saving the originally-supplied structure
+    combined_ids_paths, concat_dims = _infer_concat_order_from_positions(
+        paths, concat_dims)
+    ids, paths = (
+        list(combined_ids_paths.keys()), list(combined_ids_paths.values()))
+
     open_kwargs = dict(engine=engine, chunks=chunks or {}, lock=lock,
                        autoclose=autoclose, **kwargs)
 
@@ -618,15 +635,17 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,
         # the underlying datasets will still be stored as dask arrays
         datasets, file_objs = dask.compute(datasets, file_objs)
 
-    # close datasets in case of a ValueError
+    # Close datasets in case of a ValueError
     try:
-        if concat_dim is _CONCAT_DIM_DEFAULT:
-            combined = auto_combine(datasets, compat=compat,
-                                    data_vars=data_vars, coords=coords)
-        else:
-            combined = auto_combine(datasets, concat_dim=concat_dim,
-                                    compat=compat,
-                                    data_vars=data_vars, coords=coords)
+        if infer_order_from_coords:
+            # Discard ordering because it should be redone from coordinates
+            ids = False
+
+        combined = _auto_combine(datasets, concat_dims=concat_dims,
+                                 compat=compat,
+                                 data_vars=data_vars, coords=coords,
+                                 infer_order_from_coords=infer_order_from_coords,
+                                 ids=ids)
     except ValueError:
         for ds in datasets:
             ds.close()
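
Note: the net effect of the api.py changes above is that ``open_mfdataset`` now
accepts a nested list of files, one nesting level per dimension to concatenate
along. A minimal sketch of the intended call, assuming four hypothetical netCDF
files tiling a 2-D (x, y) grid; it mirrors ``test_open_mfdataset_2d`` in the
test changes further down, where the first entry of ``concat_dim`` names the
outermost nesting level:

    import xarray as xr

    # Inner lists tile along 'x'; the outer list tiles along 'y', so
    # concat_dim names the outermost dimension first. The file names are
    # hypothetical placeholders.
    ds = xr.open_mfdataset([['x0y0.nc', 'x1y0.nc'],
                            ['x0y1.nc', 'x1y1.nc']],
                           concat_dim=['y', 'x'])
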
diff --git a/xarray/core/combine.py b/xarray/core/combine.py
index ea156667430..c9924b2ad1e 100644
--- a/xarray/core/combine.py
+++ b/xarray/core/combine.py
@@ -1,6 +1,8 @@
 from __future__ import absolute_import, division, print_function
 
 import warnings
+import itertools
+from collections import Counter
 
 import pandas as pd
 
@@ -369,24 +371,195 @@ def _auto_concat(datasets, dim=None, data_vars='all', coords='different'):
 _CONCAT_DIM_DEFAULT = '__infer_concat_dim__'
 
 
-def auto_combine(datasets,
-                 concat_dim=_CONCAT_DIM_DEFAULT,
-                 compat='no_conflicts',
-                 data_vars='all', coords='different'):
-    """Attempt to auto-magically combine the given datasets into one.
+def _infer_concat_order_from_positions(datasets, concat_dims):
+
+    combined_ids = OrderedDict(_infer_tile_ids_from_nested_list(datasets, ()))
+
+    tile_id, ds = list(combined_ids.items())[0]
+    n_dims = len(tile_id)
+    if concat_dims == _CONCAT_DIM_DEFAULT or concat_dims is None:
+        concat_dims = [concat_dims] * n_dims
+    else:
+        if len(concat_dims) != n_dims:
+            raise ValueError("concat_dims has length {} but the datasets "
+                             "passed are nested in a {}-dimensional "
+                             "structure".format(str(len(concat_dims)),
+                                                str(n_dims)))
+
+    return combined_ids, concat_dims
+
+
+def _infer_tile_ids_from_nested_list(entry, current_pos):
+    """
+    Given a list of lists (of lists...) of objects, returns an iterator
+    which yields a tuple containing the index of each object in the nested
+    list structure as the key, and the object. This can then be passed to the
+    dict constructor to create a dictionary of the objects organised by their
+    position in the original nested list.
+
+    Recursively traverses the given structure, while keeping track of the
+    current position. Should work for any type of object which isn't a list.
+
+    Parameters
+    ----------
+    entry : list[list[obj, obj, ...]]
+        List of lists of arbitrary depth, containing objects in the order
+        they are to be concatenated.
+
+    Returns
+    -------
+    combined_tile_ids : dict[tuple(int, ...), obj]
+    """
+
+    if isinstance(entry, list):
+        for i, item in enumerate(entry):
+            for result in _infer_tile_ids_from_nested_list(item,
+                                                           current_pos + (i,)):
+                yield result
+    else:
+        yield current_pos, entry
+
+
+def _check_shape_tile_ids(combined_tile_ids):
+    tile_ids = combined_tile_ids.keys()
+
+    # Check all tuples are the same length
+    # i.e. check that all lists are nested to the same depth
+    nesting_depths = [len(tile_id) for tile_id in tile_ids]
+    if not set(nesting_depths) == {nesting_depths[0]}:
+        raise ValueError("The supplied objects do not form a hypercube because"
+                         " sub-lists do not have consistent depths")
+
+    # Check all lists along one dimension are same length
+    for dim in range(nesting_depths[0]):
+        indices_along_dim = [tile_id[dim] for tile_id in tile_ids]
+        occurrences = Counter(indices_along_dim)
+        if len(set(occurrences.values())) != 1:
+            raise ValueError("The supplied objects do not form a hypercube "
+                             "because sub-lists do not have consistent "
+                             "lengths along dimension " + str(dim))
+
+
+def _combine_nd(combined_ids, concat_dims, data_vars='all',
+                coords='different', compat='no_conflicts'):
+    """
+    Concatenates and merges an N-dimensional structure of datasets.
+
+    No checks are performed on the consistency of the datasets, concat_dims or
+    tile_IDs, because it is assumed that this has already been done.
+
+    Parameters
+    ----------
+    combined_ids : Dict[Tuple[int, ...], xarray.Dataset]
+        Structure containing all datasets to be concatenated with "tile_IDs" as
+        keys, which specify position within the desired final combined result.
+    concat_dims : sequence of str
+        The dimensions along which the datasets should be concatenated. Must be
+        in order, and the length must match the length of the tile_IDs used as
+        keys in combined_ids.
+
+    Returns
+    -------
+    combined_ds : xarray.Dataset
+    """
+
+    # Perform N-D concatenation
+    # Each iteration of this loop reduces the length of the tile_ids tuples
+    # by one.
+    # It always combines along the first dimension, removing the first
+    # element of the tuple
+    for concat_dim in concat_dims:
+        combined_ids = _auto_combine_all_along_first_dim(combined_ids,
+                                                         dim=concat_dim,
+                                                         data_vars=data_vars,
+                                                         coords=coords,
+                                                         compat=compat)
+    combined_ds = list(combined_ids.values())[0]
+    return combined_ds
+
+
+def _auto_combine_all_along_first_dim(combined_ids, dim, data_vars,
+                                      coords, compat):
+    # Group into lines of datasets which must be combined along dim
+    # need to sort by _new_tile_id first for groupby to work
+    # TODO remove all these sorted OrderedDicts once python >= 3.6 only
+    combined_ids = OrderedDict(sorted(combined_ids.items(), key=_new_tile_id))
+    grouped = itertools.groupby(combined_ids.items(), key=_new_tile_id)
+
+    new_combined_ids = {}
+    for new_id, group in grouped:
+        combined_ids = OrderedDict(sorted(group))
+        datasets = combined_ids.values()
+        new_combined_ids[new_id] = _auto_combine_1d(datasets, dim, compat,
+                                                    data_vars, coords)
+    return new_combined_ids
+
+
+def _auto_combine_1d(datasets, concat_dim=_CONCAT_DIM_DEFAULT,
+                     compat='no_conflicts',
+                     data_vars='all', coords='different'):
+    # This is just the old auto_combine function (which only worked along 1D)
+    if concat_dim is not None:
+        dim = None if concat_dim is _CONCAT_DIM_DEFAULT else concat_dim
+        grouped = itertools.groupby(datasets, key=lambda ds: tuple(sorted(ds)))
+        concatenated = [_auto_concat(list(ds_group), dim=dim,
+                                     data_vars=data_vars, coords=coords)
+                        for id, ds_group in grouped]
+    else:
+        concatenated = datasets
+    merged = merge(concatenated, compat=compat)
+    return merged
+
+
+def _new_tile_id(single_id_ds_pair):
+    tile_id, ds = single_id_ds_pair
+    return tile_id[1:]
+
+
+def _auto_combine(datasets, concat_dims, compat, data_vars, coords,
+                  infer_order_from_coords, ids):
+    """
+    Calls logic to decide concatenation order before concatenating.
+    """
+
+    # Arrange datasets for concatenation
+    if infer_order_from_coords:
+        raise NotImplementedError
+        # TODO Use coordinates to determine tile_ID for each dataset in N-D
+        # Ignore how they were ordered previously
+        # Should look like:
+        # combined_ids, concat_dims = _infer_tile_ids_from_coords(datasets,
+        #                                                         concat_dims)
+    else:
+        # Use information from the shape of the user input
+        if not ids:
+            # Determine tile_IDs by structure of input in N-D
+            # (i.e. ordering in list-of-lists)
+            combined_ids, concat_dims = _infer_concat_order_from_positions(
+                datasets, concat_dims)
+        else:
+            # Already sorted so just use the ids already passed
+            combined_ids = OrderedDict(zip(ids, datasets))
+
+    # Check that the inferred shape is combinable
+    _check_shape_tile_ids(combined_ids)
+
+    # Repeatedly concatenate then merge along each dimension
+    combined = _combine_nd(combined_ids, concat_dims, compat=compat,
+                           data_vars=data_vars, coords=coords)
+    return combined
+
+
+def auto_combine(datasets, concat_dim=_CONCAT_DIM_DEFAULT,
+                 compat='no_conflicts', data_vars='all', coords='different'):
+    """Attempt to auto-magically combine the given datasets into one.
 
     This method attempts to combine a list of datasets into a single entity by
     inspecting metadata and using a combination of concat and merge.
-
     It does not concatenate along more than one dimension or sort data under
     any circumstances. It does align coordinates, but different variables on
     datasets can cause it to fail under some scenarios. In complex cases, you
     may need to clean up your data and use ``concat``/``merge`` explicitly.
-
     ``auto_combine`` works well if you have N years of data and M data
     variables, and each combination of a distinct time period and set of data
     variables is saved as its own dataset.
-
     Parameters
     ----------
     datasets : sequence of xarray.Dataset
@@ -404,7 +577,6 @@ def auto_combine(datasets,
         'no_conflicts'}, optional
         String indicating how to compare variables of the same name for
         potential conflicts:
-
         - 'broadcast_equals': all values must be equal when variables are
           broadcast against each other to ensure common dimensions.
         - 'equals': all values and dimensions must be the same.
@@ -415,9 +587,8 @@ def auto_combine(datasets,
         of all non-null values.
     data_vars : {'minimal', 'different', 'all' or list of str}, optional
         Details are in the documentation of concat
-    coords : {'minimal', 'different', 'all' o list of str}, optional
-        Details are in the documentation of concat
-
+    coords : {'minimal', 'different', 'all' or list of str}, optional
+        Details are in the documentation of concat
     Returns
     -------
     combined : xarray.Dataset
@@ -427,15 +598,20 @@ def auto_combine(datasets,
         concat
     Dataset.merge
     """
-    from toolz import itertoolz
-    if concat_dim is not None:
-        dim = None if concat_dim is _CONCAT_DIM_DEFAULT else concat_dim
-        grouped = itertoolz.groupby(lambda ds: tuple(sorted(ds.data_vars)),
-                                    datasets).values()
-        concatenated = [_auto_concat(ds, dim=dim,
-                                     data_vars=data_vars, coords=coords)
-                        for ds in grouped]
+
+    # Coerce 1D input into ND to maintain backwards-compatible API until API
+    # for N-D combine decided
+    # (see https://github.com/pydata/xarray/pull/2553#issuecomment-445892746)
+    if concat_dim is None or concat_dim == _CONCAT_DIM_DEFAULT:
+        concat_dims = concat_dim
+    elif not isinstance(concat_dim, list):
+        concat_dims = [concat_dim]
     else:
-        concatenated = datasets
-    merged = merge(concatenated, compat=compat)
-    return merged
+        concat_dims = concat_dim
+    infer_order_from_coords = False
+
+    # The ids argument tells _auto_combine that the datasets are not yet sorted
+    return _auto_combine(datasets, concat_dims=concat_dims, compat=compat,
+                         data_vars=data_vars, coords=coords,
+                         infer_order_from_coords=infer_order_from_coords,
+                         ids=False)
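
Note: a quick sketch of the tile-ID bookkeeping that drives the combine.py
changes above (the helper is private, and the tiny datasets here are
stand-ins): every object in the nested list is keyed by its position, and
``_combine_nd`` then strips one index per concatenation pass.

    from xarray import Dataset
    from xarray.core.combine import _infer_tile_ids_from_nested_list

    a, b, c, d = [Dataset({'n': i}) for i in range(4)]

    # A 2x2 nested list maps each dataset to its hypercube position:
    # {(0, 0): a, (0, 1): b, (1, 0): c, (1, 1): d}
    tile_ids = dict(_infer_tile_ids_from_nested_list([[a, b], [c, d]], ()))
    assert sorted(tile_ids) == [(0, 0), (0, 1), (1, 0), (1, 1)]
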
diff --git a/xarray/testing.py b/xarray/testing.py
index ee5a54cd7dc..03c5354cdff 100644
--- a/xarray/testing.py
+++ b/xarray/testing.py
@@ -138,3 +138,11 @@ def assert_allclose(a, b, rtol=1e-05, atol=1e-08, decode_bytes=True):
     else:
         raise TypeError('{} not supported by assertion comparison'
                         .format(type(a)))
+
+
+def assert_combined_tile_ids_equal(dict1, dict2):
+    assert len(dict1) == len(dict2)
+    for k, v in dict1.items():
+        assert k in dict2.keys()
+        assert_equal(dict1[k], dict2[k])
+
diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py
index a45f71bbc3b..cd66ad82356 100644
--- a/xarray/tests/__init__.py
+++ b/xarray/tests/__init__.py
@@ -15,7 +15,7 @@
 from xarray.core import utils
 from xarray.core.indexing import ExplicitlyIndexed
 from xarray.testing import (assert_equal, assert_identical,  # noqa: F401
-                            assert_allclose)
+                            assert_allclose, assert_combined_tile_ids_equal)
 from xarray.plot.utils import import_seaborn
 
 try:
diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py
index fb9c43c0165..ea3d099e6ad 100644
--- a/xarray/tests/test_backends.py
+++ b/xarray/tests/test_backends.py
@@ -2135,12 +2135,42 @@ def test_open_mfdataset(self):
             assert actual.foo.variable.data.chunks == \
                 ((3, 2, 3, 2),)
+
        with raises_regex(IOError, 'no files to open'):
            open_mfdataset('foo-bar-baz-*.nc')
        with raises_regex(ValueError, 'wild-card'):
            open_mfdataset('http://some/remote/uri')
 
+    def test_open_mfdataset_2d(self):
+        original = Dataset({'foo': (['x', 'y'], np.random.randn(10, 8))})
+        with create_tmp_file() as tmp1:
+            with create_tmp_file() as tmp2:
+                with create_tmp_file() as tmp3:
+                    with create_tmp_file() as tmp4:
+                        original.isel(x=slice(5),
+                                      y=slice(4)).to_netcdf(tmp1)
+                        original.isel(x=slice(5, 10),
+                                      y=slice(4)).to_netcdf(tmp2)
+                        original.isel(x=slice(5),
+                                      y=slice(4, 8)).to_netcdf(tmp3)
+                        original.isel(x=slice(5, 10),
+                                      y=slice(4, 8)).to_netcdf(tmp4)
+                        with open_mfdataset([[tmp1, tmp2],
+                                             [tmp3, tmp4]],
+                                            concat_dim=['y', 'x']) as actual:
+                            assert isinstance(actual.foo.variable.data,
+                                              da.Array)
+                            assert actual.foo.variable.data.chunks == \
+                                ((5, 5), (4, 4))
+                            assert_identical(original, actual)
+                        with open_mfdataset([[tmp1, tmp2],
+                                             [tmp3, tmp4]],
+                                            concat_dim=['y', 'x'],
+                                            chunks={'x': 3, 'y': 2}) as actual:
+                            assert actual.foo.variable.data.chunks == \
+                                ((3, 2, 3, 2), (2, 2, 2, 2))
+
     @requires_pathlib
     def test_open_mfdataset_pathlib(self):
         original = Dataset({'foo': ('x', np.random.randn(10))})
@@ -2153,6 +2183,45 @@ def test_open_mfdataset_pathlib(self):
         with open_mfdataset([tmp1, tmp2]) as actual:
             assert_identical(original, actual)
 
+    @requires_pathlib
+    def test_open_mfdataset_2d_pathlib(self):
+        original = Dataset({'foo': (['x', 'y'], np.random.randn(10, 8))})
+        with create_tmp_file() as tmp1:
+            with create_tmp_file() as tmp2:
+                with create_tmp_file() as tmp3:
+                    with create_tmp_file() as tmp4:
+                        tmp1 = Path(tmp1)
+                        tmp2 = Path(tmp2)
+                        tmp3 = Path(tmp3)
+                        tmp4 = Path(tmp4)
+                        original.isel(x=slice(5),
+                                      y=slice(4)).to_netcdf(tmp1)
+                        original.isel(x=slice(5, 10),
+                                      y=slice(4)).to_netcdf(tmp2)
+                        original.isel(x=slice(5),
+                                      y=slice(4, 8)).to_netcdf(tmp3)
+                        original.isel(x=slice(5, 10),
+                                      y=slice(4, 8)).to_netcdf(tmp4)
+                        with open_mfdataset([[tmp1, tmp2],
+                                             [tmp3, tmp4]],
+                                            concat_dim=['y', 'x']) as actual:
+                            assert_identical(original, actual)
+
+    @pytest.mark.xfail(reason="Not yet implemented")
+    def test_open_mfdataset_infer_order_from_coords(self):
+        original = Dataset({'foo': ('x', np.random.randn(10))})
+        with create_tmp_file() as tmp1:
+            with create_tmp_file() as tmp2:
+                original.isel(x=slice(5)).to_netcdf(tmp1)
+                original.isel(x=slice(5, 10)).to_netcdf(tmp2)
+
+                with pytest.raises(NotImplementedError):
+                    open_mfdataset([tmp1, tmp2], infer_order_from_coords=True)
+
+                # With infer_order_from_coords=True this should pass in future
+                with open_mfdataset([tmp1, tmp2]) as actual:
+                    assert_identical(original, actual)
+
     def test_attrs_mfdataset(self):
         original = Dataset({'foo': ('x', np.random.randn(10))})
         with create_tmp_file() as tmp1:
@@ -2625,7 +2694,7 @@ def test_uamiv_format_mfread(self):
             ['example.uamiv',
              'example.uamiv'],
             engine='pseudonetcdf',
-            concat_dim='TSTEP',
+            concat_dim=['TSTEP'],
             backend_kwargs={'format': 'uamiv'})
 
         data1 = np.arange(20, dtype='f').reshape(1, 1, 4, 5)
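
Note: before the unit tests below, a sketch of the user-visible behaviour they
pin down: ``auto_combine`` now accepts a list of lists of datasets. The
``tile`` helper here is a hypothetical stand-in for real data.

    import numpy as np
    from xarray import Dataset, auto_combine

    def tile(x0, y0):
        # One 2x2 tile of a larger (x, y) grid; hypothetical test data.
        return Dataset({'foo': (['x', 'y'], np.zeros((2, 2)))},
                       coords={'x': [x0, x0 + 1], 'y': [y0, y0 + 1]})

    datasets = [[tile(0, 0), tile(0, 2)],  # inner lists vary along 'y'
                [tile(2, 0), tile(2, 2)]]  # the outer list varies along 'x'
    combined = auto_combine(datasets, concat_dim=['x', 'y'])
    assert combined.foo.shape == (4, 4)
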
diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py
index 2004b1e660f..ec2288b1d2d 100644
--- a/xarray/tests/test_combine.py
+++ b/xarray/tests/test_combine.py
@@ -1,17 +1,22 @@
 from __future__ import absolute_import, division, print_function
 
 from copy import deepcopy
+from itertools import product
 
 import numpy as np
 import pandas as pd
 import pytest
 
-from xarray import DataArray, Dataset, Variable, auto_combine, concat
+from xarray import DataArray, Dataset, Variable, auto_combine, concat, merge
 from xarray.core.pycompat import OrderedDict, iteritems
+from xarray.core.combine import (
+    _new_tile_id, _auto_combine_all_along_first_dim,
+    _infer_concat_order_from_positions, _infer_tile_ids_from_nested_list,
+    _check_shape_tile_ids, _combine_nd, _auto_combine_1d, _auto_combine)
 
 from . import (
     InaccessibleArray, assert_array_equal, assert_equal, assert_identical,
-    raises_regex, requires_dask)
+    assert_combined_tile_ids_equal, raises_regex, requires_dask)
 from .test_dataset import create_test_data
 
@@ -297,39 +302,40 @@ def test_concat_lazy(self):
 
 class TestAutoCombine(object):
 
+    @pytest.mark.parametrize("combine", [_auto_combine_1d, auto_combine])
     @requires_dask  # only for toolz
-    def test_auto_combine(self):
+    def test_auto_combine(self, combine):
         objs = [Dataset({'x': [0]}), Dataset({'x': [1]})]
-        actual = auto_combine(objs)
+        actual = combine(objs)
         expected = Dataset({'x': [0, 1]})
         assert_identical(expected, actual)
 
-        actual = auto_combine([actual])
+        actual = combine([actual])
         assert_identical(expected, actual)
 
         objs = [Dataset({'x': [0, 1]}), Dataset({'x': [2]})]
-        actual = auto_combine(objs)
+        actual = combine(objs)
         expected = Dataset({'x': [0, 1, 2]})
         assert_identical(expected, actual)
 
         # ensure auto_combine handles non-sorted variables
         objs = [Dataset(OrderedDict([('x', ('a', [0])), ('y', ('a', [0]))])),
                 Dataset(OrderedDict([('y', ('a', [1])), ('x', ('a', [1]))]))]
-        actual = auto_combine(objs)
+        actual = combine(objs)
         expected = Dataset({'x': ('a', [0, 1]), 'y': ('a', [0, 1])})
         assert_identical(expected, actual)
 
         objs = [Dataset({'x': [0], 'y': [0]}), Dataset({'y': [1], 'x': [1]})]
         with raises_regex(ValueError, 'too many .* dimensions'):
-            auto_combine(objs)
+            combine(objs)
 
         objs = [Dataset({'x': 0}), Dataset({'x': 1})]
         with raises_regex(ValueError, 'cannot infer dimension'):
-            auto_combine(objs)
+            combine(objs)
 
         objs = [Dataset({'x': [0], 'y': [0]}), Dataset({'x': [0]})]
         with pytest.raises(KeyError):
-            auto_combine(objs)
+            combine(objs)
 
     @requires_dask  # only for toolz
     def test_auto_combine_previously_failed(self):
@@ -378,6 +384,13 @@ def test_auto_combine_no_concat(self):
         actual = auto_combine([data, data, data], concat_dim=None)
         assert_identical(data, actual)
 
+        tmp1 = Dataset({'x': 0})
+        tmp2 = Dataset({'x': np.nan})
+        actual = auto_combine([tmp1, tmp2], concat_dim=None)
+        assert_identical(tmp1, actual)
+        actual = auto_combine([tmp1, tmp2], concat_dim=[None])
+        assert_identical(tmp1, actual)
+
         # Single object, with a concat_dim explicitly provided
         # Test the issue reported in GH #1988
         objs = [Dataset({'x': 0, 'y': 1})]
@@ -396,3 +409,293 @@ def test_auto_combine_no_concat(self):
             'y': (('baz', 'z'), [[1, 2]])},
             {'baz': [100]})
         assert_identical(expected, actual)
+
+
+class TestTileIDsFromNestedList(object):
+    def test_1d(self):
+        ds = create_test_data
+        input = [ds(0), ds(1)]
+
+        expected = {(0,): ds(0), (1,): ds(1)}
+        actual = dict(_infer_tile_ids_from_nested_list(input, ()))
+        assert_combined_tile_ids_equal(expected, actual)
+
+    def test_2d(self):
+        ds = create_test_data
+        input = [[ds(0), ds(1)], [ds(2), ds(3)], [ds(4), ds(5)]]
+
+        expected = {(0, 0): ds(0), (0, 1): ds(1),
+                    (1, 0): ds(2), (1, 1): ds(3),
+                    (2, 0): ds(4), (2, 1): ds(5)}
+        actual = dict(_infer_tile_ids_from_nested_list(input, ()))
+        assert_combined_tile_ids_equal(expected, actual)
+
+    def test_3d(self):
+        ds = create_test_data
+        input = [[[ds(0), ds(1)], [ds(2), ds(3)], [ds(4), ds(5)]],
+                 [[ds(6), ds(7)], [ds(8), ds(9)], [ds(10), ds(11)]]]
+
+        expected = {(0, 0, 0): ds(0), (0, 0, 1): ds(1),
+                    (0, 1, 0): ds(2), (0, 1, 1): ds(3),
+                    (0, 2, 0): ds(4), (0, 2, 1): ds(5),
+                    (1, 0, 0): ds(6), (1, 0, 1): ds(7),
+                    (1, 1, 0): ds(8), (1, 1, 1): ds(9),
+                    (1, 2, 0): ds(10), (1, 2, 1): ds(11)}
+        actual = dict(_infer_tile_ids_from_nested_list(input, ()))
+        assert_combined_tile_ids_equal(expected, actual)
+
+    def test_single_dataset(self):
+        ds = create_test_data(0)
+        input = [ds]
+
+        expected = {(0,): ds}
+        actual = dict(_infer_tile_ids_from_nested_list(input, ()))
+        assert_combined_tile_ids_equal(expected, actual)
+
+    def test_redundant_nesting(self):
+        ds = create_test_data
+        input = [[ds(0)], [ds(1)]]
+
+        expected = {(0, 0): ds(0), (1, 0): ds(1)}
+        actual = dict(_infer_tile_ids_from_nested_list(input, ()))
+        assert_combined_tile_ids_equal(expected, actual)
+
+    def test_ignore_empty_list(self):
+        ds = create_test_data(0)
+        input = [ds, []]
+        expected = {(0,): ds}
+        actual = dict(_infer_tile_ids_from_nested_list(input, ()))
+        assert_combined_tile_ids_equal(expected, actual)
+
+    def test_uneven_depth_input(self):
+        # Auto_combine won't work on ragged input
+        # but this is just to increase test coverage
+        ds = create_test_data
+        input = [ds(0), [ds(1), ds(2)]]
+
+        expected = {(0,): ds(0), (1, 0): ds(1), (1, 1): ds(2)}
+        actual = dict(_infer_tile_ids_from_nested_list(input, ()))
+        assert_combined_tile_ids_equal(expected, actual)
+
+    def test_uneven_length_input(self):
+        # Auto_combine won't work on ragged input
+        # but this is just to increase test coverage
+        ds = create_test_data
+        input = [[ds(0)], [ds(1), ds(2)]]
+
+        expected = {(0, 0): ds(0), (1, 0): ds(1), (1, 1): ds(2)}
+        actual = dict(_infer_tile_ids_from_nested_list(input, ()))
+        assert_combined_tile_ids_equal(expected, actual)
+
+    def test_infer_from_datasets(self):
+        ds = create_test_data
+        input = [ds(0), ds(1)]
+
+        expected = {(0,): ds(0), (1,): ds(1)}
+        actual, concat_dims = _infer_concat_order_from_positions(input,
+                                                                 ['dim1'])
+        assert_combined_tile_ids_equal(expected, actual)
+
+        input = [ds(0), ds(1)]
+        with pytest.raises(ValueError):
+            _infer_concat_order_from_positions(input, ['dim1', 'extra_dim'])
+
+
+@pytest.fixture(scope='module')
+def create_combined_ids():
+    return _create_combined_ids
+
+
+def _create_combined_ids(shape):
+    tile_ids = _create_tile_ids(shape)
+    nums = range(len(tile_ids))
+    return {tile_id: create_test_data(num)
+            for tile_id, num in zip(tile_ids, nums)}
+
+
+def _create_tile_ids(shape):
+    tile_ids = product(*(range(i) for i in shape))
+    return list(tile_ids)
+
+
+@requires_dask  # only for toolz
+class TestCombineND(object):
+    @pytest.mark.parametrize("old_id, new_id", [((3, 0, 1), (0, 1)),
+                                                ((0, 0), (0,)),
+                                                ((1,), ()),
+                                                ((0,), ()),
+                                                ((1, 0), (0,))])
+    def test_new_tile_id(self, old_id, new_id):
+        ds = create_test_data
+        assert _new_tile_id((old_id, ds)) == new_id
+
+    def test_get_new_tile_ids(self, create_combined_ids):
+        shape = (1, 2, 3)
+        combined_ids = create_combined_ids(shape)
+
+        expected_tile_ids = sorted(combined_ids.keys())
+        actual_tile_ids = _create_tile_ids(shape)
+        assert expected_tile_ids == actual_tile_ids
+
+    @pytest.mark.parametrize("concat_dim", ['dim1', 'new_dim'])
+    def test_concat_once(self, create_combined_ids, concat_dim):
+        shape = (2,)
+        combined_ids = create_combined_ids(shape)
+        ds = create_test_data
+        result = _auto_combine_all_along_first_dim(combined_ids,
+                                                   dim=concat_dim,
+                                                   data_vars='all',
+                                                   coords='different',
+                                                   compat='no_conflicts')
+
+        expected_ds = concat([ds(0), ds(1)], dim=concat_dim)
+        assert_combined_tile_ids_equal(result, {(): expected_ds})
+
+    def test_concat_only_first_dim(self, create_combined_ids):
+        shape = (2, 3)
+        combined_ids = create_combined_ids(shape)
+        result = _auto_combine_all_along_first_dim(combined_ids,
+                                                   dim='dim1',
+                                                   data_vars='all',
+                                                   coords='different',
+                                                   compat='no_conflicts')
+
+        ds = create_test_data
+        partway1 = concat([ds(0), ds(3)], dim='dim1')
+        partway2 = concat([ds(1), ds(4)], dim='dim1')
+        partway3 = concat([ds(2), ds(5)], dim='dim1')
+        expected_datasets = [partway1, partway2, partway3]
+        expected = {(i,): ds for i, ds in enumerate(expected_datasets)}
+
+        assert_combined_tile_ids_equal(result, expected)
+
+    @pytest.mark.parametrize("concat_dim", ['dim1', 'new_dim'])
+    def test_concat_twice(self, create_combined_ids, concat_dim):
+        shape = (2, 3)
+        combined_ids = create_combined_ids(shape)
+        result = _combine_nd(combined_ids, concat_dims=['dim1', concat_dim])
+
+        ds = create_test_data
+        partway1 = concat([ds(0), ds(3)], dim='dim1')
+        partway2 = concat([ds(1), ds(4)], dim='dim1')
+        partway3 = concat([ds(2), ds(5)], dim='dim1')
+        expected = concat([partway1, partway2, partway3], dim=concat_dim)
+
+        assert_equal(result, expected)
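
Note: to make the reduction that ``TestCombineND`` exercises concrete, here is
the tile-ID arithmetic in isolation (pure Python, no xarray): each pass of
``_combine_nd`` concatenates along the first remaining dimension, so every ID
loses its leading index until only the empty ID, i.e. the fully combined
result, is left.

    # Tile IDs for a hypercube of shape (2, 3)
    ids = [(0, 0), (0, 1), (0, 2), (1, 0), (1, 1), (1, 2)]

    # First pass: the 2 tiles sharing each trimmed ID are concatenated
    after_first = sorted({tile_id[1:] for tile_id in ids})
    assert after_first == [(0,), (1,), (2,)]

    # Second pass: a single empty ID remains, holding the combined dataset
    after_second = sorted({tile_id[1:] for tile_id in after_first})
    assert after_second == [()]
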
+
+
+class TestCheckShapeTileIDs(object):
+    def test_check_depths(self):
+        ds = create_test_data(0)
+        combined_tile_ids = {(0,): ds, (0, 1): ds}
+        with raises_regex(ValueError, 'sub-lists do not have '
+                                      'consistent depths'):
+            _check_shape_tile_ids(combined_tile_ids)
+
+    def test_check_lengths(self):
+        ds = create_test_data(0)
+        combined_tile_ids = {(0, 0): ds, (0, 1): ds, (0, 2): ds,
+                             (1, 0): ds, (1, 1): ds}
+        with raises_regex(ValueError, 'sub-lists do not have '
+                                      'consistent lengths'):
+            _check_shape_tile_ids(combined_tile_ids)
+
+
+@requires_dask  # only for toolz
+class TestAutoCombineND(object):
+    def test_single_dataset(self):
+        objs = [Dataset({'x': [0]}), Dataset({'x': [1]})]
+        actual = auto_combine(objs)
+        expected = Dataset({'x': [0, 1]})
+        assert_identical(expected, actual)
+
+        actual = auto_combine(actual)
+        assert_identical(expected, actual)
+
+    def test_auto_combine_2d(self):
+        ds = create_test_data
+
+        partway1 = concat([ds(0), ds(3)], dim='dim1')
+        partway2 = concat([ds(1), ds(4)], dim='dim1')
+        partway3 = concat([ds(2), ds(5)], dim='dim1')
+        expected = concat([partway1, partway2, partway3], dim='dim2')
+
+        datasets = [[ds(0), ds(1), ds(2)], [ds(3), ds(4), ds(5)]]
+        result = auto_combine(datasets, concat_dim=['dim1', 'dim2'])
+
+        assert_equal(result, expected)
+
+    def test_invalid_hypercube_input(self):
+        ds = create_test_data
+
+        datasets = [[ds(0), ds(1), ds(2)], [ds(3), ds(4)]]
+        with raises_regex(ValueError, 'sub-lists do not have '
+                                      'consistent lengths'):
+            auto_combine(datasets, concat_dim=['dim1', 'dim2'])
+
+        datasets = [[ds(0), ds(1)], [[ds(3), ds(4)]]]
+        with raises_regex(ValueError, 'sub-lists do not have '
+                                      'consistent depths'):
+            auto_combine(datasets, concat_dim=['dim1', 'dim2'])
+
+        datasets = [[ds(0), ds(1)], [ds(3), ds(4)]]
+        with raises_regex(ValueError, 'concat_dims has length'):
+            auto_combine(datasets, concat_dim=['dim1'])
+
+    def test_merge_one_dim_concat_another(self):
+        objs = [[Dataset({'foo': ('x', [0, 1])}),
+                 Dataset({'bar': ('x', [10, 20])})],
+                [Dataset({'foo': ('x', [2, 3])}),
+                 Dataset({'bar': ('x', [30, 40])})]]
+        expected = Dataset({'foo': ('x', [0, 1, 2, 3]),
+                            'bar': ('x', [10, 20, 30, 40])})
+
+        actual = auto_combine(objs, concat_dim=['x', None])
+        assert_identical(expected, actual)
+
+        actual = auto_combine(objs)
+        assert_identical(expected, actual)
+
+        # Proving it works symmetrically
+        objs = [[Dataset({'foo': ('x', [0, 1])}),
+                 Dataset({'foo': ('x', [2, 3])})],
+                [Dataset({'bar': ('x', [10, 20])}),
+                 Dataset({'bar': ('x', [30, 40])})]]
+        actual = auto_combine(objs, concat_dim=[None, 'x'])
+        assert_identical(expected, actual)
+
+    def test_combine_concat_over_redundant_nesting(self):
+        objs = [[Dataset({'x': [0]}), Dataset({'x': [1]})]]
+        actual = auto_combine(objs, concat_dim=[None, 'x'])
+        expected = Dataset({'x': [0, 1]})
+        assert_identical(expected, actual)
+
+        objs = [[Dataset({'x': [0]})], [Dataset({'x': [1]})]]
+        actual = auto_combine(objs, concat_dim=['x', None])
+        expected = Dataset({'x': [0, 1]})
+        assert_identical(expected, actual)
+
+        objs = [[Dataset({'x': [0]})]]
+        actual = auto_combine(objs, concat_dim=[None, None])
+        expected = Dataset({'x': [0]})
+        assert_identical(expected, actual)
+
+        objs = [[Dataset({'x': [0]})]]
+        actual = auto_combine(objs, concat_dim=None)
+        expected = Dataset({'x': [0]})
+        assert_identical(expected, actual)
+
+
+class TestAutoCombineUsingCoords(object):
+    def test_order_inferred_from_coords(self):
+        data = create_test_data()
+        objs = [data.isel(dim2=slice(4, 9)), data.isel(dim2=slice(4))]
+        with pytest.raises(NotImplementedError):
+            _auto_combine(objs, concat_dims=['dim2'], compat='no_conflicts',
+                          data_vars='all', coords='different',
+                          infer_order_from_coords=True, ids=True)
+
+    @pytest.mark.xfail(reason="Not yet implemented")
+    def test_infer_order_from_coords(self):
+        # Should pass once inferring order from coords is implemented
+        data = create_test_data()
+        objs = [data.isel(dim2=slice(4, 9)), data.isel(dim2=slice(4))]
+        actual = auto_combine(objs)  # but with infer_order_from_coords=True
+        expected = data
+        assert_identical(expected, actual)
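
Note: finally, a sketch of the mixed merge/concatenate behaviour pinned down by
``test_merge_one_dim_concat_another`` above; passing ``None`` at a given
position of ``concat_dim`` makes ``auto_combine`` merge, rather than
concatenate, along that level of nesting.

    from xarray import Dataset, auto_combine

    objs = [[Dataset({'foo': ('x', [0, 1])}),
             Dataset({'bar': ('x', [10, 20])})],
            [Dataset({'foo': ('x', [2, 3])}),
             Dataset({'bar': ('x', [30, 40])})]]

    # Outer level: concatenate along 'x'; inner level: merge variables
    combined = auto_combine(objs, concat_dim=['x', None])
    assert set(combined.data_vars) == {'foo', 'bar'}
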