diff --git a/asv_bench/benchmarks/combine.py b/asv_bench/benchmarks/combine.py new file mode 100644 index 00000000000..8670760abc1 --- /dev/null +++ b/asv_bench/benchmarks/combine.py @@ -0,0 +1,37 @@ +import numpy as np +import xarray as xr + + +class Combine: + """Benchmark concatenating and merging large datasets""" + + def setup(self): + """Create 4 datasets with two different variables""" + + t_size, x_size, y_size = 100, 900, 800 + t = np.arange(t_size) + data = np.random.randn(t_size, x_size, y_size) + + self.dsA0 = xr.Dataset( + {'A': xr.DataArray(data, coords={'T': t}, + dims=('T', 'X', 'Y'))}) + self.dsA1 = xr.Dataset( + {'A': xr.DataArray(data, coords={'T': t + t_size}, + dims=('T', 'X', 'Y'))}) + self.dsB0 = xr.Dataset( + {'B': xr.DataArray(data, coords={'T': t}, + dims=('T', 'X', 'Y'))}) + self.dsB1 = xr.Dataset( + {'B': xr.DataArray(data, coords={'T': t + t_size}, + dims=('T', 'X', 'Y'))}) + + def time_combine_manual(self): + datasets = [[self.dsA0, self.dsA1], [self.dsB0, self.dsB1]] + + xr.combine_manual(datasets, concat_dim=[None, 'T']) + + def time_auto_combine(self): + """Also has to load and arrange T coordinate""" + datasets = [self.dsA0, self.dsA1, self.dsB0, self.dsB1] + + xr.combine_auto(datasets) diff --git a/doc/api.rst b/doc/api.rst index 258d1748c1b..e1f9238c815 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -19,6 +19,9 @@ Top-level functions broadcast concat merge + auto_combine + combine_auto + combine_manual where set_options full_like diff --git a/doc/combining.rst b/doc/combining.rst index 388cc2ba5f3..852157e748f 100644 --- a/doc/combining.rst +++ b/doc/combining.rst @@ -11,9 +11,10 @@ Combining data import xarray as xr np.random.seed(123456) -* For combining datasets or data arrays along a dimension, see concatenate_. +* For combining datasets or data arrays along a single dimension, see concatenate_. * For combining datasets with different variables, see merge_. * For combining datasets or data arrays with different indexes or missing values, see combine_. +* For combining datasets or data arrays along multiple dimensions, see combining.multi_. .. _concatenate: @@ -77,7 +78,7 @@ Merge ~~~~~ To combine variables and coordinates between multiple ``DataArray`` and/or -``Dataset`` object, use :py:func:`~xarray.merge`. It can merge a list of +``Dataset`` objects, use :py:func:`~xarray.merge`. It can merge a list of ``Dataset``, ``DataArray`` or dictionaries of objects convertible to ``DataArray`` objects: @@ -237,3 +238,76 @@ coordinates as long as any non-missing values agree or are disjoint: Note that due to the underlying representation of missing values as floating point numbers (``NaN``), variable data type is not always preserved when merging in this manner. + +.. _combining.multi: + +Combining along multiple dimensions +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. note:: + + There are currently three combining functions with similar names: + :py:func:`~xarray.auto_combine`, :py:func:`~xarray.combine_auto`, and + :py:func:`~xarray.combine_manual`. This is because + ``auto_combine`` is in the process of being deprecated in favour of the other + two functions, which are more general. If your code currently relies on + ``auto_combine``, then you will be able to get similar functionality by using + ``combine_manual``. + +For combining many objects along multiple dimensions, xarray provides +:py:func:`~xarray.combine_manual` and :py:func:`~xarray.combine_auto`. 
These +functions use a combination of ``concat`` and ``merge`` across different +variables to combine many objects into one. + +:py:func:`~xarray.combine_manual`` requires specifying the order in which the +objects should be combined, while :py:func:`~xarray.combine_auto` attempts to +infer this ordering automatically from the coordinates in the data. + +:py:func:`~xarray.combine_manual` is useful when you know the spatial +relationship between each object in advance. The datasets must be provided in +the form of a nested list, which specifies their relative position and +ordering. A common task is collecting data from a parallelized simulation where +each processor wrote out data to a separate file. A domain which was decomposed +into 4 parts, 2 each along both the x and y axes, requires organising the +datasets into a doubly-nested list, e.g: + +.. ipython:: python + + arr = xr.DataArray(name='temperature', data=np.random.randint(5, size=(2, 2)), dims=['x', 'y']) + arr + ds_grid = [[arr, arr], [arr, arr]] + xr.combine_manual(ds_grid, concat_dim=['x', 'y']) + +:py:func:`~xarray.combine_manual` can also be used to explicitly merge datasets +with different variables. For example if we have 4 datasets, which are divided +along two times, and contain two different variables, we can pass ``None`` +to ``'concat_dim'`` to specify the dimension of the nested list over which +we wish to use ``merge`` instead of ``concat``: + +.. ipython:: python + + temp = xr.DataArray(name='temperature', data=np.random.randn(2), dims=['t']) + precip = xr.DataArray(name='precipitation', data=np.random.randn(2), dims=['t']) + ds_grid = [[temp, precip], [temp, precip]] + xr.combine_manual(ds_grid, concat_dim=['t', None]) + +:py:func:`~xarray.combine_auto` is for combining objects which have dimension +coordinates which specify their relationship to and order relative to one +another, for example a linearly-increasing 'time' dimension coordinate. + +Here we combine two datasets using their common dimension coordinates. Notice +they are concatenated in order based on the values in their dimension +coordinates, not on their position in the list passed to ``combine_auto``. + +.. ipython:: python + :okwarning: + + x1 = xr.DataArray(name='foo', data=np.random.randn(3), coords=[('x', [0, 1, 2])]) + x2 = xr.DataArray(name='foo', data=np.random.randn(3), coords=[('x', [3, 4, 5])]) + xr.combine_auto([x2, x1]) + +These functions can be used by :py:func:`~xarray.open_mfdataset` to open many +files as one dataset. The particular function used is specified by setting the +argument ``'combine'`` to ``'auto'`` or ``'manual'``. This is useful for +situations where your data is split across many files in multiple locations, +which have some known relationship between one another. \ No newline at end of file diff --git a/doc/io.rst b/doc/io.rst index b470284f071..ef7ba67e789 100644 --- a/doc/io.rst +++ b/doc/io.rst @@ -766,7 +766,10 @@ Combining multiple files NetCDF files are often encountered in collections, e.g., with different files corresponding to different model runs. xarray can straightforwardly combine such -files into a single Dataset by making use of :py:func:`~xarray.concat`. +files into a single Dataset by making use of :py:func:`~xarray.concat`, +:py:func:`~xarray.merge`, :py:func:`~xarray.combine_manual` and +:py:func:`~xarray.combine_auto`. For details on the difference between these +functions see :ref:`combining data`. .. 
note:: @@ -779,7 +782,8 @@ files into a single Dataset by making use of :py:func:`~xarray.concat`. This function automatically concatenates and merges multiple files into a single xarray dataset. It is the recommended way to open multiple files with xarray. - For more details, see :ref:`dask.io` and a `blog post`_ by Stephan Hoyer. + For more details, see :ref:`combining.multi`, :ref:`dask.io` and a + `blog post`_ by Stephan Hoyer. .. _dask: http://dask.pydata.org .. _blog post: http://stephanhoyer.com/2015/06/11/xray-dask-out-of-core-labeled-arrays/ diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 999aaa47843..ac45c24f845 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -56,6 +56,23 @@ Enhancements helpful for avoiding file-lock errors when trying to write to files opened using ``open_dataset()`` or ``open_dataarray()``. (:issue:`2887`) By `Dan Nowacki `_. +- Combining datasets along N dimensions: + Datasets can now be combined along any number of dimensions, + instead of just a one-dimensional list of datasets. + + The new ``combine_manual`` will accept the datasets as a a nested + list-of-lists, and combine by applying a series of concat and merge + operations. The new ``combine_auto`` will instead use the dimension + coordinates of the datasets to order them. + + ``open_mfdataset`` can use either ``combine_manual`` or ``combine_auto`` to + combine datasets along multiple dimensions, by specifying the argument + `combine='manual'` or `combine='auto'`. + + This means that the original function ``auto_combine`` is being deprecated. + To avoid FutureWarnings switch to using `combine_manual` or `combine_auto`, + (or set the `combine` argument in `open_mfdataset`). (:issue:`2159`) + By `Tom Nicholas `_. - Better warning message when supplying invalid objects to ``xr.merge`` (:issue:`2948`). By `Mathias Hauser `_. - Added ``strftime`` method to ``.dt`` accessor, making it simpler to hand a @@ -203,6 +220,10 @@ Other enhancements report showing what exactly differs between the two objects (dimensions / coordinates / variables / attributes) (:issue:`1507`). By `Benoit Bovy `_. +- Resampling of standard and non-standard calendars indexed by + :py:class:`~xarray.CFTimeIndex` is now possible. (:issue:`2191`). + By `Jwen Fai Low `_ and + `Spencer Clark `_. - Add ``tolerance`` option to ``resample()`` methods ``bfill``, ``pad``, ``nearest``. (:issue:`2695`) By `Hauke Schulz `_. diff --git a/xarray/__init__.py b/xarray/__init__.py index 9eaa705e108..22c12d02d71 100644 --- a/xarray/__init__.py +++ b/xarray/__init__.py @@ -7,7 +7,8 @@ from .core.alignment import align, broadcast, broadcast_arrays from .core.common import full_like, zeros_like, ones_like -from .core.combine import concat, auto_combine +from .core.concat import concat +from .core.combine import combine_auto, combine_manual, auto_combine from .core.computation import apply_ufunc, dot, where from .core.extensions import (register_dataarray_accessor, register_dataset_accessor) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 01188e92752..f3bab5d084d 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -7,10 +7,11 @@ import numpy as np -from .. import Dataset, backends, conventions +from .. import Dataset, DataArray, backends, conventions from ..core import indexing -from ..core.combine import ( - _CONCAT_DIM_DEFAULT, _auto_combine, _infer_concat_order_from_positions) +from .. 
import auto_combine +from ..core.combine import (combine_auto, _manual_combine, + _infer_concat_order_from_positions) from ..core.utils import close_on_error, is_grib_path, is_remote_uri from .common import ArrayWriter from .locks import _get_scheduler @@ -591,12 +592,20 @@ def close(self): f.close() -def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT, +def open_mfdataset(paths, chunks=None, concat_dim='_not_supplied', compat='no_conflicts', preprocess=None, engine=None, lock=None, data_vars='all', coords='different', - autoclose=None, parallel=False, **kwargs): + combine='_old_auto', autoclose=None, parallel=False, + **kwargs): """Open multiple files as a single dataset. + If combine='auto' then the function `combine_auto` is used to combine the + datasets into one before returning the result, and if combine='manual' then + `combine_manual` is used. The filepaths must be structured according to + which combining function is used, the details of which are given in the + documentation for ``combine_auto`` and ``combine_manual``. + By default the old (now deprecated) ``auto_combine`` will be used; please + specify either ``combine='auto'`` or ``combine='manual'`` in future. Requires dask to be installed. See documentation for details on dask [1]. Attributes from the first dataset file are used for the combined dataset. @@ -604,8 +613,10 @@ ---------- paths : str or sequence Either a string glob in the form "path/to/my/files/*.nc" or an explicit - list of files to open. Paths can be given as strings or as pathlib - Paths. + list of files to open. Paths can be given as strings or as pathlib + Paths. If concatenation along more than one dimension is desired, then + ``paths`` must be a nested list-of-lists (see ``combine_manual`` for + details). (A string glob will be expanded to a 1-dimensional list.) chunks : int or dict, optional Dictionary with keys given by dimension names and values given by chunk sizes. In general, these should divide the dimensions of each dataset. @@ -613,16 +624,20 @@ By default, chunks will be chosen to load entire input files into memory at once. This has a major impact on performance: please see the full documentation for more details [2]. - concat_dim : None, str, DataArray or Index, optional - Dimension to concatenate files along. This argument is passed on to - :py:func:`xarray.auto_combine` along with the dataset objects. You only - need to provide this argument if the dimension along which you want to - concatenate is not a dimension in the original datasets, e.g., if you - want to stack a collection of 2D arrays along a third dimension. - By default, xarray attempts to infer this argument by examining - component files. Set ``concat_dim=None`` explicitly to disable - concatenation. - compat : {'identical', 'equals', 'broadcast_equals', 'no_conflicts'}, optional + concat_dim : str, or list of str, DataArray, Index or None, optional + Dimensions to concatenate files along. You only + need to provide this argument if any of the dimensions along which you + want to concatenate is not a dimension in the original datasets, e.g., + if you want to stack a collection of 2D arrays along a third dimension. + Set ``concat_dim=[..., None, ...]`` explicitly to + disable concatenation along a particular dimension. 
+ combine : {'auto', 'manual'}, optional + Whether ``xarray.auto_combine`` or ``xarray.manual_combine`` is used to + combine all the data. If this argument is not provided, + `xarray.auto_combine` is used, but in the future this behavior will + switch to use `xarray.combine_auto`. + compat : {'identical', 'equals', 'broadcast_equals', + 'no_conflicts'}, optional String indicating how to compare variables of the same name for potential conflicts when merging: * 'broadcast_equals': all values must be equal when variables are @@ -649,20 +664,18 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT, active dask scheduler. data_vars : {'minimal', 'different', 'all' or list of str}, optional These data variables will be concatenated together: - - * 'minimal': Only data variables in which the dimension already - appears are included. - * 'different': Data variables which are not equal (ignoring - attributes) across all datasets are also concatenated (as well as - all for which dimension already appears). Beware: this option may - load the data payload of data variables into memory if they are not - already loaded. - * 'all': All data variables will be concatenated. - * list of str: The listed data variables will be concatenated, in - addition to the 'minimal' data variables. - coords : {'minimal', 'different', 'all' o list of str}, optional + * 'minimal': Only data variables in which the dimension already + appears are included. + * 'different': Data variables which are not equal (ignoring + attributes) across all datasets are also concatenated (as well as + all for which dimension already appears). Beware: this option may + load the data payload of data variables into memory if they are not + already loaded. + * 'all': All data variables will be concatenated. + * list of str: The listed data variables will be concatenated, in + addition to the 'minimal' data variables. + coords : {'minimal', 'different', 'all' or list of str}, optional These coordinate variables will be concatenated together: - * 'minimal': Only coordinates in which the dimension already appears are included. * 'different': Coordinates which are not equal (ignoring attributes) @@ -693,6 +706,8 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT, See Also -------- + combine_auto + combine_manual auto_combine open_dataset @@ -715,22 +730,17 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT, if not paths: raise IOError('no files to open') - # Coerce 1D input into ND to maintain backwards-compatible API until API - # for N-D combine decided - # (see https://github.com/pydata/xarray/pull/2553/#issuecomment-445892746) - if concat_dim is None or concat_dim is _CONCAT_DIM_DEFAULT: - concat_dims = concat_dim - elif not isinstance(concat_dim, list): - concat_dims = [concat_dim] - else: - concat_dims = concat_dim - infer_order_from_coords = False - - # If infer_order_from_coords=True then this is unnecessary, but quick. - # If infer_order_from_coords=False then this creates a flat list which is - # easier to iterate over, while saving the originally-supplied structure - combined_ids_paths, concat_dims = _infer_concat_order_from_positions( - paths, concat_dims) + # If combine='auto' then this is unnecessary, but quick. 
+ # If combine='manual' then this creates a flat list which is easier to + # iterate over, while saving the originally-supplied structure as "ids" + if combine == 'manual': + if str(concat_dim) == '_not_supplied': + raise ValueError("Must supply concat_dim when using " + "combine='manual'") + else: + if isinstance(concat_dim, (str, DataArray)) or concat_dim is None: + concat_dim = [concat_dim] + combined_ids_paths = _infer_concat_order_from_positions(paths) ids, paths = ( list(combined_ids_paths.keys()), list(combined_ids_paths.values())) @@ -758,18 +768,28 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT, # the underlying datasets will still be stored as dask arrays datasets, file_objs = dask.compute(datasets, file_objs) - # Close datasets in case of a ValueError + # Combine all datasets, closing them in case of a ValueError try: - if infer_order_from_coords: - # Discard ordering because it should be redone from coordinates - ids = False - - combined = _auto_combine( - datasets, concat_dims=concat_dims, - compat=compat, - data_vars=data_vars, coords=coords, - infer_order_from_coords=infer_order_from_coords, - ids=ids) + if combine == '_old_auto': + # Use the old auto_combine for now + # Remove this after deprecation cycle from #2616 is complete + combined = auto_combine(datasets, concat_dim=concat_dim, + compat=compat, data_vars=data_vars, + coords=coords) + elif combine == 'manual': + # Combined nested list by successive concat and merge operations + # along each dimension, using structure given by "ids" + combined = _manual_combine(datasets, concat_dims=concat_dim, + compat=compat, data_vars=data_vars, + coords=coords, ids=ids) + elif combine == 'auto': + # Redo ordering from coordinates, ignoring how they were ordered + # previously + combined = combine_auto(datasets, compat=compat, + data_vars=data_vars, coords=coords) + else: + raise ValueError("{} is an invalid option for the keyword argument" + " ``combine``".format(combine)) except ValueError: for ds in datasets: ds.close() diff --git a/xarray/core/combine.py b/xarray/core/combine.py index 0b18aa47dee..92d7992c000 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -1,399 +1,20 @@ import itertools import warnings from collections import Counter, OrderedDict +from textwrap import dedent import pandas as pd -from . import dtypes, utils -from .alignment import align -from .computation import result_name +from .dataarray import DataArray +from .dataset import Dataset +from .concat import concat +from . import dtypes from .merge import merge -from .variable import IndexVariable, Variable, as_variable -from .variable import concat as concat_vars -def concat(objs, dim=None, data_vars='all', coords='different', - compat='equals', positions=None, indexers=None, mode=None, - concat_over=None, fill_value=dtypes.NA): - """Concatenate xarray objects along a new or existing dimension. - - Parameters - ---------- - objs : sequence of Dataset and DataArray objects - xarray objects to concatenate together. Each object is expected to - consist of variables and coordinates with matching shapes except for - along the concatenated dimension. - dim : str or DataArray or pandas.Index - Name of the dimension to concatenate along. This can either be a new - dimension name, in which case it is added along axis=0, or an existing - dimension name, in which case the location of the dimension is - unchanged. 
If dimension is provided as a DataArray or Index, its name - is used as the dimension to concatenate along and the values are added - as a coordinate. - data_vars : {'minimal', 'different', 'all' or list of str}, optional - These data variables will be concatenated together: - * 'minimal': Only data variables in which the dimension already - appears are included. - * 'different': Data variables which are not equal (ignoring - attributes) across all datasets are also concatenated (as well as - all for which dimension already appears). Beware: this option may - load the data payload of data variables into memory if they are not - already loaded. - * 'all': All data variables will be concatenated. - * list of str: The listed data variables will be concatenated, in - addition to the 'minimal' data variables. - If objects are DataArrays, data_vars must be 'all'. - coords : {'minimal', 'different', 'all' or list of str}, optional - These coordinate variables will be concatenated together: - * 'minimal': Only coordinates in which the dimension already appears - are included. - * 'different': Coordinates which are not equal (ignoring attributes) - across all datasets are also concatenated (as well as all for which - dimension already appears). Beware: this option may load the data - payload of coordinate variables into memory if they are not already - loaded. - * 'all': All coordinate variables will be concatenated, except - those corresponding to other dimensions. - * list of str: The listed coordinate variables will be concatenated, - in addition the 'minimal' coordinates. - compat : {'equals', 'identical'}, optional - String indicating how to compare non-concatenated variables and - dataset global attributes for potential conflicts. 'equals' means - that all variable values and dimensions must be the same; - 'identical' means that variable attributes and global attributes - must also be equal. - positions : None or list of integer arrays, optional - List of integer arrays which specifies the integer positions to which - to assign each dataset along the concatenated dimension. If not - supplied, objects are concatenated in the provided order. 
- fill_value : scalar, optional - Value to use for newly missing values - indexers, mode, concat_over : deprecated - - Returns - ------- - concatenated : type of objs - - See also - -------- - merge - auto_combine - """ - # TODO: add join and ignore_index arguments copied from pandas.concat - # TODO: support concatenating scalar coordinates even if the concatenated - # dimension already exists - from .dataset import Dataset - from .dataarray import DataArray - - try: - first_obj, objs = utils.peek_at(objs) - except StopIteration: - raise ValueError('must supply at least one object to concatenate') - - if dim is None: - warnings.warn('the `dim` argument to `concat` will be required ' - 'in a future version of xarray; for now, setting it to ' - "the old default of 'concat_dim'", - FutureWarning, stacklevel=2) - dim = 'concat_dims' - - if indexers is not None: # pragma: nocover - warnings.warn('indexers has been renamed to positions; the alias ' - 'will be removed in a future version of xarray', - FutureWarning, stacklevel=2) - positions = indexers - - if mode is not None: - raise ValueError('`mode` is no longer a valid argument to ' - 'xarray.concat; it has been split into the ' - '`data_vars` and `coords` arguments') - if concat_over is not None: - raise ValueError('`concat_over` is no longer a valid argument to ' - 'xarray.concat; it has been split into the ' - '`data_vars` and `coords` arguments') - - if isinstance(first_obj, DataArray): - f = _dataarray_concat - elif isinstance(first_obj, Dataset): - f = _dataset_concat - else: - raise TypeError('can only concatenate xarray Dataset and DataArray ' - 'objects, got %s' % type(first_obj)) - return f(objs, dim, data_vars, coords, compat, positions, fill_value) - - -def _calc_concat_dim_coord(dim): - """ - Infer the dimension name and 1d coordinate variable (if appropriate) - for concatenating along the new dimension. - """ - from .dataarray import DataArray - - if isinstance(dim, str): - coord = None - elif not isinstance(dim, (DataArray, Variable)): - dim_name = getattr(dim, 'name', None) - if dim_name is None: - dim_name = 'concat_dim' - coord = IndexVariable(dim_name, dim) - dim = dim_name - elif not isinstance(dim, DataArray): - coord = as_variable(dim).to_index_variable() - dim, = coord.dims - else: - coord = dim - dim, = coord.dims - return dim, coord - - -def _calc_concat_over(datasets, dim, data_vars, coords): - """ - Determine which dataset variables need to be concatenated in the result, - and which can simply be taken from the first dataset. - """ - # Return values - concat_over = set() - equals = {} - - if dim in datasets[0]: - concat_over.add(dim) - for ds in datasets: - concat_over.update(k for k, v in ds.variables.items() - if dim in v.dims) - - def process_subset_opt(opt, subset): - if isinstance(opt, str): - if opt == 'different': - # all nonindexes that are not the same in each dataset - for k in getattr(datasets[0], subset): - if k not in concat_over: - # Compare the variable of all datasets vs. the one - # of the first dataset. Perform the minimum amount of - # loads in order to avoid multiple loads from disk - # while keeping the RAM footprint low. - v_lhs = datasets[0].variables[k].load() - # We'll need to know later on if variables are equal. 
- computed = [] - for ds_rhs in datasets[1:]: - v_rhs = ds_rhs.variables[k].compute() - computed.append(v_rhs) - if not v_lhs.equals(v_rhs): - concat_over.add(k) - equals[k] = False - # computed variables are not to be re-computed - # again in the future - for ds, v in zip(datasets[1:], computed): - ds.variables[k].data = v.data - break - else: - equals[k] = True - - elif opt == 'all': - concat_over.update(set(getattr(datasets[0], subset)) - - set(datasets[0].dims)) - elif opt == 'minimal': - pass - else: - raise ValueError("unexpected value for %s: %s" % (subset, opt)) - else: - invalid_vars = [k for k in opt - if k not in getattr(datasets[0], subset)] - if invalid_vars: - if subset == 'coords': - raise ValueError( - 'some variables in coords are not coordinates on ' - 'the first dataset: %s' % (invalid_vars,)) - else: - raise ValueError( - 'some variables in data_vars are not data variables ' - 'on the first dataset: %s' % (invalid_vars,)) - concat_over.update(opt) - - process_subset_opt(data_vars, 'data_vars') - process_subset_opt(coords, 'coords') - return concat_over, equals - - -def _dataset_concat(datasets, dim, data_vars, coords, compat, positions, - fill_value=dtypes.NA): - """ - Concatenate a sequence of datasets along a new or existing dimension - """ - from .dataset import Dataset - - if compat not in ['equals', 'identical']: - raise ValueError("compat=%r invalid: must be 'equals' " - "or 'identical'" % compat) - - dim, coord = _calc_concat_dim_coord(dim) - # Make sure we're working on a copy (we'll be loading variables) - datasets = [ds.copy() for ds in datasets] - datasets = align(*datasets, join='outer', copy=False, exclude=[dim], - fill_value=fill_value) - - concat_over, equals = _calc_concat_over(datasets, dim, data_vars, coords) - - def insert_result_variable(k, v): - assert isinstance(v, Variable) - if k in datasets[0].coords: - result_coord_names.add(k) - result_vars[k] = v - - # create the new dataset and add constant variables - result_vars = OrderedDict() - result_coord_names = set(datasets[0].coords) - result_attrs = datasets[0].attrs - result_encoding = datasets[0].encoding - - for k, v in datasets[0].variables.items(): - if k not in concat_over: - insert_result_variable(k, v) - - # check that global attributes and non-concatenated variables are fixed - # across all datasets - for ds in datasets[1:]: - if (compat == 'identical' and - not utils.dict_equiv(ds.attrs, result_attrs)): - raise ValueError('dataset global attributes not equal') - for k, v in ds.variables.items(): - if k not in result_vars and k not in concat_over: - raise ValueError('encountered unexpected variable %r' % k) - elif (k in result_coord_names) != (k in ds.coords): - raise ValueError('%r is a coordinate in some datasets but not ' - 'others' % k) - elif k in result_vars and k != dim: - # Don't use Variable.identical as it internally invokes - # Variable.equals, and we may already know the answer - if compat == 'identical' and not utils.dict_equiv( - v.attrs, result_vars[k].attrs): - raise ValueError( - 'variable %s not identical across datasets' % k) - - # Proceed with equals() - try: - # May be populated when using the "different" method - is_equal = equals[k] - except KeyError: - result_vars[k].load() - is_equal = v.equals(result_vars[k]) - if not is_equal: - raise ValueError( - 'variable %s not equal across datasets' % k) - - # we've already verified everything is consistent; now, calculate - # shared dimension sizes so we can expand the necessary variables - dim_lengths = [ds.dims.get(dim, 
1) for ds in datasets] - non_concat_dims = {} - for ds in datasets: - non_concat_dims.update(ds.dims) - non_concat_dims.pop(dim, None) - - def ensure_common_dims(vars): - # ensure each variable with the given name shares the same - # dimensions and the same shape for all of them except along the - # concat dimension - common_dims = tuple(pd.unique([d for v in vars for d in v.dims])) - if dim not in common_dims: - common_dims = (dim,) + common_dims - for var, dim_len in zip(vars, dim_lengths): - if var.dims != common_dims: - common_shape = tuple(non_concat_dims.get(d, dim_len) - for d in common_dims) - var = var.set_dims(common_dims, common_shape) - yield var - - # stack up each variable to fill-out the dataset (in order) - for k in datasets[0].variables: - if k in concat_over: - vars = ensure_common_dims([ds.variables[k] for ds in datasets]) - combined = concat_vars(vars, dim, positions) - insert_result_variable(k, combined) - - result = Dataset(result_vars, attrs=result_attrs) - result = result.set_coords(result_coord_names) - result.encoding = result_encoding - - if coord is not None: - # add concat dimension last to ensure that its in the final Dataset - result[coord.name] = coord - - return result - - -def _dataarray_concat(arrays, dim, data_vars, coords, compat, - positions, fill_value=dtypes.NA): - arrays = list(arrays) - - if data_vars != 'all': - raise ValueError('data_vars is not a valid argument when ' - 'concatenating DataArray objects') - - datasets = [] - for n, arr in enumerate(arrays): - if n == 0: - name = arr.name - elif name != arr.name: - if compat == 'identical': - raise ValueError('array names not identical') - else: - arr = arr.rename(name) - datasets.append(arr._to_temp_dataset()) - - ds = _dataset_concat(datasets, dim, data_vars, coords, compat, - positions, fill_value) - result = arrays[0]._from_temp_dataset(ds, name) - - result.name = result_name(arrays) - return result - - -def _auto_concat(datasets, dim=None, data_vars='all', coords='different', - fill_value=dtypes.NA): - if len(datasets) == 1 and dim is None: - # There is nothing more to combine, so kick out early. 
- return datasets[0] - else: - if dim is None: - ds0 = datasets[0] - ds1 = datasets[1] - concat_dims = set(ds0.dims) - if ds0.dims != ds1.dims: - dim_tuples = set(ds0.dims.items()) - set(ds1.dims.items()) - concat_dims = set(i for i, _ in dim_tuples) - if len(concat_dims) > 1: - concat_dims = set(d for d in concat_dims - if not ds0[d].equals(ds1[d])) - if len(concat_dims) > 1: - raise ValueError('too many different dimensions to ' - 'concatenate: %s' % concat_dims) - elif len(concat_dims) == 0: - raise ValueError('cannot infer dimension to concatenate: ' - 'supply the ``concat_dim`` argument ' - 'explicitly') - dim, = concat_dims - return concat(datasets, dim=dim, data_vars=data_vars, - coords=coords, fill_value=fill_value) - - -_CONCAT_DIM_DEFAULT = utils.ReprObject('') - - -def _infer_concat_order_from_positions(datasets, concat_dims): - +def _infer_concat_order_from_positions(datasets): combined_ids = OrderedDict(_infer_tile_ids_from_nested_list(datasets, ())) - - tile_id, ds = list(combined_ids.items())[0] - n_dims = len(tile_id) - if concat_dims == _CONCAT_DIM_DEFAULT or concat_dims is None: - concat_dims = [concat_dims] * n_dims - else: - if len(concat_dims) != n_dims: - raise ValueError("concat_dims has length {} but the datasets " - "passed are nested in a {}-dimensional " - "structure".format(str(len(concat_dims)), - str(n_dims))) - - return combined_ids, concat_dims + return combined_ids def _infer_tile_ids_from_nested_list(entry, current_pos): @@ -409,7 +30,7 @@ def _infer_tile_ids_from_nested_list(entry, current_pos): Parameters ---------- - entry : list[list[obj, obj, ...]] + entry : list[list[obj, obj, ...], ...] List of lists of arbitrary depth, containing objects in the order they are to be concatenated. @@ -427,12 +48,79 @@ def _infer_tile_ids_from_nested_list(entry, current_pos): yield current_pos, entry +def _infer_concat_order_from_coords(datasets): + + concat_dims = [] + tile_ids = [() for ds in datasets] + + # All datasets have same variables because they've been grouped as such + ds0 = datasets[0] + for dim in ds0.dims: + + # Check if dim is a coordinate dimension + if dim in ds0: + + # Need to read coordinate values to do ordering + indexes = [ds.indexes.get(dim) for ds in datasets] + if any(index is None for index in indexes): + raise ValueError("Every dimension needs a coordinate for " + "inferring concatenation order") + + # If dimension coordinate values are same on every dataset then + # should be leaving this dimension alone (it's just a "bystander") + if not all(index.equals(indexes[0]) for index in indexes[1:]): + + # Infer order datasets should be arranged in along this dim + concat_dims.append(dim) + + if all(index.is_monotonic_increasing for index in indexes): + ascending = True + elif all(index.is_monotonic_decreasing for index in indexes): + ascending = False + else: + raise ValueError("Coordinate variable {} is neither " + "monotonically increasing nor " + "monotonically decreasing on all datasets" + .format(dim)) + + # Assume that any two datasets whose coord along dim starts + # with the same value have the same coord values throughout. 
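+            # The first value of each dataset's index is therefore enough to determine its position along dim.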
+ if any(index.size == 0 for index in indexes): + raise ValueError('Cannot handle size zero dimensions') + first_items = pd.Index([index.take([0]) + for index in indexes]) + + # Sort datasets along dim + # We want rank but with identical elements given identical + # position indices - they should be concatenated along another + # dimension, not along this one + series = first_items.to_series() + rank = series.rank(method='dense', ascending=ascending) + order = rank.astype(int).values - 1 + + # Append positions along extra dimension to structure which + # encodes the multi-dimensional concatenation order + tile_ids = [tile_id + (position,) for tile_id, position + in zip(tile_ids, order)] + + if len(datasets) > 1 and not concat_dims: + raise ValueError("Could not find any dimension coordinates to use to " + "order the datasets for concatenation") + + combined_ids = OrderedDict(zip(tile_ids, datasets)) + + return combined_ids, concat_dims + + def _check_shape_tile_ids(combined_tile_ids): tile_ids = combined_tile_ids.keys() # Check all tuples are the same length # i.e. check that all lists are nested to the same depth nesting_depths = [len(tile_id) for tile_id in tile_ids] + if not nesting_depths: + nesting_depths = [0] if not set(nesting_depths) == {nesting_depths[0]}: raise ValueError("The supplied objects do not form a hypercube because" " sub-lists do not have consistent depths") @@ -451,7 +139,8 @@ def _combine_nd(combined_ids, concat_dims, data_vars='all', coords='different', compat='no_conflicts', fill_value=dtypes.NA): """ - Concatenates and merges an N-dimensional structure of datasets. + Combines an N-dimensional structure of datasets into one by applying a + series of either concat or merge operations along each dimension. No checks are performed on the consistency of the datasets, concat_dims or tile_IDs, because it is assumed that this has already been done. @@ -463,67 +152,80 @@ def _combine_nd(combined_ids, concat_dims, data_vars='all', keys, which specify position within the desired final combined result. concat_dims : sequence of str The dimensions along which the datasets should be concatenated. Must be - in order, and the length must match + in order, and the length must match the length of the tuples used as + keys in combined_ids. If the string is a dimension name then concat + along that dimension; if it is None then merge. Returns ------- combined_ds : xarray.Dataset """ - # Perform N-D dimensional concatenation + example_tile_id = next(iter(combined_ids.keys())) + + n_dims = len(example_tile_id) + if len(concat_dims) != n_dims: + raise ValueError("concat_dims has length {} but the datasets " + "passed are nested in a {}-dimensional structure" + .format(len(concat_dims), n_dims)) + # Each iteration of this loop reduces the length of the tile_ids tuples # by one. 
It always combines along the first dimension, removing the first # element of the tuple for concat_dim in concat_dims: - combined_ids = _auto_combine_all_along_first_dim(combined_ids, - dim=concat_dim, - data_vars=data_vars, - coords=coords, - compat=compat, - fill_value=fill_value) - combined_ds = list(combined_ids.values())[0] + combined_ids = _combine_all_along_first_dim(combined_ids, + dim=concat_dim, + data_vars=data_vars, + coords=coords, + compat=compat, + fill_value=fill_value) + (combined_ds,) = combined_ids.values() return combined_ds -def _auto_combine_all_along_first_dim(combined_ids, dim, data_vars, - coords, compat, fill_value=dtypes.NA): +def _combine_all_along_first_dim(combined_ids, dim, data_vars, coords, compat, + fill_value=dtypes.NA): + # Group into lines of datasets which must be combined along dim # need to sort by _new_tile_id first for groupby to work # TODO remove all these sorted OrderedDicts once python >= 3.6 only combined_ids = OrderedDict(sorted(combined_ids.items(), key=_new_tile_id)) grouped = itertools.groupby(combined_ids.items(), key=_new_tile_id) + # Combine all of these datasets along dim new_combined_ids = {} for new_id, group in grouped: combined_ids = OrderedDict(sorted(group)) datasets = combined_ids.values() - new_combined_ids[new_id] = _auto_combine_1d(datasets, dim, compat, - data_vars, coords, - fill_value) + new_combined_ids[new_id] = _combine_1d(datasets, dim, compat, + data_vars, coords, fill_value) return new_combined_ids -def vars_as_keys(ds): - return tuple(sorted(ds)) - +def _combine_1d(datasets, concat_dim, compat='no_conflicts', data_vars='all', + coords='different', fill_value=dtypes.NA): + """ + Applies either concat or merge to 1D list of datasets depending on value + of concat_dim + """ -def _auto_combine_1d(datasets, concat_dim=_CONCAT_DIM_DEFAULT, - compat='no_conflicts', - data_vars='all', coords='different', - fill_value=dtypes.NA): - # This is just the old auto_combine function (which only worked along 1D) if concat_dim is not None: - dim = None if concat_dim is _CONCAT_DIM_DEFAULT else concat_dim - sorted_datasets = sorted(datasets, key=vars_as_keys) - grouped_by_vars = itertools.groupby(sorted_datasets, key=vars_as_keys) - concatenated = [_auto_concat(list(ds_group), dim=dim, - data_vars=data_vars, coords=coords, - fill_value=fill_value) - for id, ds_group in grouped_by_vars] + try: + combined = concat(datasets, dim=concat_dim, data_vars=data_vars, + coords=coords, fill_value=fill_value) + except ValueError as err: + if "encountered unexpected variable" in str(err): + raise ValueError("These objects cannot be combined using only " + "xarray.combine_manual, instead either use " + "xarray.combine_auto, or do it manually " + "with xarray.concat, xarray.merge and " + "xarray.align") + else: + raise else: - concatenated = datasets - merged = merge(concatenated, compat=compat, fill_value=fill_value) - return merged + combined = merge(datasets, compat=compat, fill_value=fill_value) + + return combined def _new_tile_id(single_id_ds_pair): @@ -531,45 +233,304 @@ def _new_tile_id(single_id_ds_pair): return tile_id[1:] -def _auto_combine(datasets, concat_dims, compat, data_vars, coords, - infer_order_from_coords, ids, fill_value=dtypes.NA): - """ - Calls logic to decide concatenation order before concatenating. 
- """ +def _manual_combine(datasets, concat_dims, compat, data_vars, coords, ids, + fill_value=dtypes.NA): + + if len(datasets) == 0: + return Dataset() # Arrange datasets for concatenation - if infer_order_from_coords: - raise NotImplementedError - # TODO Use coordinates to determine tile_ID for each dataset in N-D - # Ignore how they were ordered previously - # Should look like: - # combined_ids, concat_dims = _infer_tile_ids_from_coords(datasets, - # concat_dims) + # Use information from the shape of the user input + if not ids: + # Determine tile_IDs by structure of input in N-D + # (i.e. ordering in list-of-lists) + combined_ids = _infer_concat_order_from_positions(datasets) else: - # Use information from the shape of the user input - if not ids: - # Determine tile_IDs by structure of input in N-D - # (i.e. ordering in list-of-lists) - combined_ids, concat_dims = _infer_concat_order_from_positions( - datasets, concat_dims) - else: - # Already sorted so just use the ids already passed - combined_ids = OrderedDict(zip(ids, datasets)) + # Already sorted so just use the ids already passed + combined_ids = OrderedDict(zip(ids, datasets)) # Check that the inferred shape is combinable _check_shape_tile_ids(combined_ids) - # Repeatedly concatenate then merge along each dimension + # Apply series of concatenate or merge operations along each dimension combined = _combine_nd(combined_ids, concat_dims, compat=compat, data_vars=data_vars, coords=coords, fill_value=fill_value) return combined -def auto_combine(datasets, concat_dim=_CONCAT_DIM_DEFAULT, - compat='no_conflicts', data_vars='all', coords='different', - fill_value=dtypes.NA): - """Attempt to auto-magically combine the given datasets into one. +def combine_manual(datasets, concat_dim, compat='no_conflicts', + data_vars='all', coords='different', fill_value=dtypes.NA): + """ + Explicitly combine an N-dimensional grid of datasets into one by using a + succession of concat and merge operations along each dimension of the grid. + + Does not sort the supplied datasets under any circumstances, so the + datasets must be passed in the order you wish them to be concatenated. It + does align coordinates, but different variables on datasets can cause it to + fail under some scenarios. In complex cases, you may need to clean up your + data and use concat/merge explicitly. + + To concatenate along multiple dimensions the datasets must be passed as a + nested list-of-lists, with a depth equal to the length of ``concat_dims``. + ``manual_combine`` will concatenate along the top-level list first. + + Useful for combining datasets from a set of nested directories, or for + collecting the output of a simulation parallelized along multiple + dimensions. + + Parameters + ---------- + datasets : list or nested list of xarray.Dataset objects. + Dataset objects to combine. + If concatenation or merging along more than one dimension is desired, + then datasets must be supplied in a nested list-of-lists. + concat_dim : str, or list of str, DataArray, Index or None + Dimensions along which to concatenate variables, as used by + :py:func:`xarray.concat`. + Set ``concat_dim=[..., None, ...]`` explicitly to disable concatenation + and merge instead along a particular dimension. + The position of ``None`` in the list specifies the dimension of the + nested-list input along which to merge. + Must be the same length as the depth of the list passed to + ``datasets``. 
+ compat : {'identical', 'equals', 'broadcast_equals', + 'no_conflicts'}, optional + String indicating how to compare variables of the same name for + potential merge conflicts: + + - 'broadcast_equals': all values must be equal when variables are + broadcast against each other to ensure common dimensions. + - 'equals': all values and dimensions must be the same. + - 'identical': all values, dimensions and attributes must be the + same. + - 'no_conflicts': only values which are not null in both datasets + must be equal. The returned dataset then contains the combination + of all non-null values. + data_vars : {'minimal', 'different', 'all' or list of str}, optional + Details are in the documentation of concat + coords : {'minimal', 'different', 'all' or list of str}, optional + Details are in the documentation of concat + fill_value : scalar, optional + Value to use for newly missing values + + Returns + ------- + combined : xarray.Dataset + + Examples + -------- + + A common task is collecting data from a parallelized simulation in which + each processor wrote out to a separate file. A domain which was decomposed + into 4 parts, 2 each along both the x and y axes, requires organising the + datasets into a doubly-nested list, e.g: + + >>> x1y1 + + Dimensions: (x: 2, y: 2) + Dimensions without coordinates: x, y + Data variables: + temperature (x, y) float64 11.04 23.57 20.77 ... + precipitation (x, y) float64 5.904 2.453 3.404 ... + + >>> ds_grid = [[x1y1, x1y2], [x2y1, x2y2]] + >>> combined = xr.combine_manual(ds_grid, concat_dim=['x', 'y']) + + Dimensions: (x: 4, y: 4) + Dimensions without coordinates: x, y + Data variables: + temperature (x, y) float64 11.04 23.57 20.77 ... + precipitation (x, y) float64 5.904 2.453 3.404 ... + + ``manual_combine`` can also be used to explicitly merge datasets with + different variables. For example if we have 4 datasets, which are divided + along two times, and contain two different variables, we can pass ``None`` + to ``concat_dim`` to specify the dimension of the nested list over which + we wish to use ``merge`` instead of ``concat``: + + >>> t1temp + + Dimensions: (t: 5) + Dimensions without coordinates: t + Data variables: + temperature (t) float64 11.04 23.57 20.77 ... + + >>> t1precip + + Dimensions: (t: 5) + Dimensions without coordinates: t + Data variables: + precipitation (t) float64 5.904 2.453 3.404 ... + + >>> ds_grid = [[t1temp, t1precip], [t2temp, t2precip]] + >>> combined = xr.combine_manual(ds_grid, concat_dim=['t', None]) + + Dimensions: (t: 10) + Dimensions without coordinates: t + Data variables: + temperature (t) float64 11.04 23.57 20.77 ... + precipitation (t) float64 5.904 2.453 3.404 ... + + See also + -------- + concat + merge + auto_combine + """ + if isinstance(concat_dim, (str, DataArray)) or concat_dim is None: + concat_dim = [concat_dim] + + # The IDs argument tells _manual_combine that datasets aren't yet sorted + return _manual_combine(datasets, concat_dims=concat_dim, compat=compat, + data_vars=data_vars, coords=coords, ids=False, + fill_value=fill_value) + + +def vars_as_keys(ds): + return tuple(sorted(ds)) + + +def combine_auto(datasets, compat='no_conflicts', data_vars='all', + coords='different', fill_value=dtypes.NA): + """ + Attempt to auto-magically combine the given datasets into one by using + dimension coordinates. + + This method attempts to combine a group of datasets along any number of + dimensions into a single entity by inspecting coords and metadata and using + a combination of concat and merge. 
+ + Will attempt to order the datasets such that the values in their dimension + coordinates are monotonic along all dimensions. If it cannot determine the + order in which to concatenate the datasets, it will raise a ValueError. + Non-coordinate dimensions will be ignored, as will any coordinate + dimensions which do not vary between each dataset. + + Aligns coordinates, but different variables on datasets can cause it + to fail under some scenarios. In complex cases, you may need to clean up + your data and use concat/merge explicitly (also see `manual_combine`). + + Works well if, for example, you have N years of data and M data variables, + and each combination of a distinct time period and set of data variables is + saved as its own dataset. Also useful for if you have a simulation which is + parallelized in multiple dimensions, but has global coordinates saved in + each file specifying the positions of points within the global domain. + + Parameters + ---------- + datasets : sequence of xarray.Dataset + Dataset objects to combine. + compat : {'identical', 'equals', 'broadcast_equals', + 'no_conflicts'}, optional + String indicating how to compare variables of the same name for + potential conflicts: + + - 'broadcast_equals': all values must be equal when variables are + broadcast against each other to ensure common dimensions. + - 'equals': all values and dimensions must be the same. + - 'identical': all values, dimensions and attributes must be the + same. + - 'no_conflicts': only values which are not null in both datasets + must be equal. The returned dataset then contains the combination + of all non-null values. + data_vars : {'minimal', 'different', 'all' or list of str}, optional + Details are in the documentation of concat + coords : {'minimal', 'different', 'all' or list of str}, optional + Details are in the documentation of concat + fill_value : scalar, optional + Value to use for newly missing values + + Returns + ------- + combined : xarray.Dataset + + See also + -------- + concat + merge + combine_manual + + Examples + -------- + + Combining two datasets using their common dimension coordinates. Notice + they are concatenated based on the values in their dimension coordinates, + not on their position in the list passed to `combine_auto`. + + >>> x1 + + Dimensions: (x: 3) + Coords: + * position (x) int64 0 1 2 + Data variables: + temperature (x) float64 11.04 23.57 20.77 ... + + >>> x2 + + Dimensions: (x: 3) + Coords: + * position (x) int64 3 4 5 + Data variables: + temperature (x) float64 6.97 8.13 7.42 ... + + >>> combined = xr.combine_auto([x2, x1]) + + Dimensions: (x: 6) + Coords: + * position (x) int64 0 1 2 3 4 5 + Data variables: + temperature (x) float64 11.04 23.57 20.77 ... 
+ """ + + # Group by data vars + sorted_datasets = sorted(datasets, key=vars_as_keys) + grouped_by_vars = itertools.groupby(sorted_datasets, key=vars_as_keys) + + # Perform the multidimensional combine on each group of data variables + # before merging back together + concatenated_grouped_by_data_vars = [] + for vars, datasets_with_same_vars in grouped_by_vars: + combined_ids, concat_dims = _infer_concat_order_from_coords( + list(datasets_with_same_vars)) + + _check_shape_tile_ids(combined_ids) + + # Concatenate along all of concat_dims one by one to create single ds + concatenated = _combine_nd(combined_ids, concat_dims=concat_dims, + data_vars=data_vars, coords=coords, + fill_value=fill_value) + + # Check the overall coordinates are monotonically increasing + for dim in concatenated.dims: + if dim in concatenated: + indexes = concatenated.indexes.get(dim) + if not (indexes.is_monotonic_increasing + or indexes.is_monotonic_decreasing): + raise ValueError("Resulting object does not have monotonic" + " global indexes along dimension {}" + .format(dim)) + concatenated_grouped_by_data_vars.append(concatenated) + + return merge(concatenated_grouped_by_data_vars, compat=compat, + fill_value=fill_value) + + +# Everything beyond here is only needed until the deprecation cycle in #2616 +# is completed + + +_CONCAT_DIM_DEFAULT = '__infer_concat_dim__' + + +def auto_combine(datasets, concat_dim='_not_supplied', compat='no_conflicts', + data_vars='all', coords='different', fill_value=dtypes.NA): + """ + Attempt to auto-magically combine the given datasets into one. + + This entire function is deprecated in favour of ``combine_manual`` and + ``combine_auto``. + This method attempts to combine a list of datasets into a single entity by inspecting metadata and using a combination of concat and merge. It does not concatenate along more than one dimension or sort data under @@ -593,10 +554,10 @@ def auto_combine(datasets, concat_dim=_CONCAT_DIM_DEFAULT, By default, xarray attempts to infer this argument by examining component files. Set ``concat_dim=None`` explicitly to disable concatenation. - compat : {'identical', 'equals', 'broadcast_equals', 'no_conflicts'}, optional + compat : {'identical', 'equals', 'broadcast_equals', + 'no_conflicts'}, optional String indicating how to compare variables of the same name for potential conflicts: - - 'broadcast_equals': all values must be equal when variables are broadcast against each other to ensure common dimensions. - 'equals': all values and dimensions must be the same. @@ -607,8 +568,8 @@ def auto_combine(datasets, concat_dim=_CONCAT_DIM_DEFAULT, of all non-null values. 
data_vars : {'minimal', 'different', 'all' or list of str}, optional Details are in the documentation of concat - coords : {'minimal', 'different', 'all' or list of str}, optional - Details are in the documentation of conca + coords : {'minimal', 'different', 'all' or list of str}, optional + Details are in the documentation of concat fill_value : scalar, optional Value to use for newly missing values @@ -620,21 +581,132 @@ -------- concat Dataset.merge - """ # noqa - - # Coerce 1D input into ND to maintain backwards-compatible API until API - # for N-D combine decided - # (see https://github.com/pydata/xarray/pull/2553/#issuecomment-445892746) - if concat_dim is None or concat_dim == _CONCAT_DIM_DEFAULT: - concat_dims = concat_dim - elif not isinstance(concat_dim, list): - concat_dims = [concat_dim] + """ + + basic_msg = """In xarray version 0.13 `auto_combine` will be deprecated.""" + warnings.warn(basic_msg, FutureWarning, stacklevel=2) + + if concat_dim == '_not_supplied': + concat_dim = _CONCAT_DIM_DEFAULT + message = '' + else: + message = dedent("""\ + Also `open_mfdataset` will no longer accept a `concat_dim` argument. + To get equivalent behaviour from now on, please use the new + `combine_manual` function instead (or the `combine='manual'` option to + `open_mfdataset`).""") + + if _dimension_coords_exist(datasets): + message += dedent("""\ + The datasets supplied have global dimension coordinates. You may want + to use the new `combine_auto` function (or the `combine='auto'` option + to `open_mfdataset`) to order the datasets before concatenation. + Alternatively, to continue concatenating based on the order the + datasets are supplied in, in future please use the new `combine_manual` + function (or the `combine='manual'` option to `open_mfdataset`).""") + else: + message += dedent("""\ + The datasets supplied do not have global dimension coordinates. In + future, to continue concatenating without supplying dimension + coordinates, please use the new `combine_manual` function (or the + `combine='manual'` option to `open_mfdataset`).""") + + if _requires_concat_and_merge(datasets): + manual_dims = [concat_dim, None] + message += dedent("""\ + The datasets supplied require both concatenation and merging. From + xarray version 0.14 this operation will require either using the + new `combine_manual` function (or the `combine='manual'` option to + `open_mfdataset`), with a nested list structure such that you can combine + along the dimensions {}. Alternatively, if your datasets have global + dimension coordinates then you can use the new `combine_auto` function. + """.format(manual_dims)) + + warnings.warn(message, FutureWarning, stacklevel=2) + + return _old_auto_combine(datasets, concat_dim=concat_dim, + compat=compat, data_vars=data_vars, + coords=coords, fill_value=fill_value) + + +def _dimension_coords_exist(datasets): + """ + Check if the datasets have consistent global dimension coordinates + which would in future be used by `auto_combine` for concatenation ordering. 
+ """ + + # Group by data vars + sorted_datasets = sorted(datasets, key=vars_as_keys) + grouped_by_vars = itertools.groupby(sorted_datasets, key=vars_as_keys) + + # Simulates performing the multidimensional combine on each group of data + # variables before merging back together + try: + for vars, datasets_with_same_vars in grouped_by_vars: + _infer_concat_order_from_coords(list(datasets_with_same_vars)) + return True + except ValueError: + # ValueError means datasets don't have global dimension coordinates + # Or something else went wrong in trying to determine them + return False + + +def _requires_concat_and_merge(datasets): + """ + Check if the datasets require the use of both xarray.concat and + xarray.merge, which in future might require the user to use + `manual_combine` instead. + """ + # Group by data vars + sorted_datasets = sorted(datasets, key=vars_as_keys) + grouped_by_vars = itertools.groupby(sorted_datasets, key=vars_as_keys) + + return len(list(grouped_by_vars)) > 1 + + +def _old_auto_combine(datasets, concat_dim=_CONCAT_DIM_DEFAULT, + compat='no_conflicts', + data_vars='all', coords='different', + fill_value=dtypes.NA): + if concat_dim is not None: + dim = None if concat_dim is _CONCAT_DIM_DEFAULT else concat_dim + + sorted_datasets = sorted(datasets, key=vars_as_keys) + grouped = itertools.groupby(sorted_datasets, key=vars_as_keys) + + concatenated = [_auto_concat(list(datasets), dim=dim, + data_vars=data_vars, coords=coords, + fill_value=fill_value) + for vars, datasets in grouped] else: - concat_dims = concat_dim - infer_order_from_coords = False - - # The IDs argument tells _auto_combine that the datasets are not yet sorted - return _auto_combine(datasets, concat_dims=concat_dims, compat=compat, - data_vars=data_vars, coords=coords, - infer_order_from_coords=infer_order_from_coords, - ids=False, fill_value=fill_value) + concatenated = datasets + merged = merge(concatenated, compat=compat, fill_value=fill_value) + return merged + + +def _auto_concat(datasets, dim=None, data_vars='all', coords='different', + fill_value=dtypes.NA): + if len(datasets) == 1 and dim is None: + # There is nothing more to combine, so kick out early. + return datasets[0] + else: + if dim is None: + ds0 = datasets[0] + ds1 = datasets[1] + concat_dims = set(ds0.dims) + if ds0.dims != ds1.dims: + dim_tuples = set(ds0.dims.items()) - set(ds1.dims.items()) + concat_dims = set(i for i, _ in dim_tuples) + if len(concat_dims) > 1: + concat_dims = set(d for d in concat_dims + if not ds0[d].equals(ds1[d])) + if len(concat_dims) > 1: + raise ValueError('too many different dimensions to ' + 'concatenate: %s' % concat_dims) + elif len(concat_dims) == 0: + raise ValueError('cannot infer dimension to concatenate: ' + 'supply the ``concat_dim`` argument ' + 'explicitly') + dim, = concat_dims + return concat(datasets, dim=dim, data_vars=data_vars, coords=coords, + fill_value=fill_value) diff --git a/xarray/core/concat.py b/xarray/core/concat.py new file mode 100644 index 00000000000..5698596dde7 --- /dev/null +++ b/xarray/core/concat.py @@ -0,0 +1,341 @@ +import warnings +from collections import OrderedDict + +import pandas as pd + +from . 
import utils, dtypes +from .alignment import align +from .variable import IndexVariable, Variable, as_variable +from .variable import concat as concat_vars + + +def concat(objs, dim=None, data_vars='all', coords='different', + compat='equals', positions=None, indexers=None, mode=None, + concat_over=None, fill_value=dtypes.NA): + """Concatenate xarray objects along a new or existing dimension. + + Parameters + ---------- + objs : sequence of Dataset and DataArray objects + xarray objects to concatenate together. Each object is expected to + consist of variables and coordinates with matching shapes except for + along the concatenated dimension. + dim : str or DataArray or pandas.Index + Name of the dimension to concatenate along. This can either be a new + dimension name, in which case it is added along axis=0, or an existing + dimension name, in which case the location of the dimension is + unchanged. If dimension is provided as a DataArray or Index, its name + is used as the dimension to concatenate along and the values are added + as a coordinate. + data_vars : {'minimal', 'different', 'all' or list of str}, optional + These data variables will be concatenated together: + * 'minimal': Only data variables in which the dimension already + appears are included. + * 'different': Data variables which are not equal (ignoring + attributes) across all datasets are also concatenated (as well as + all for which dimension already appears). Beware: this option may + load the data payload of data variables into memory if they are not + already loaded. + * 'all': All data variables will be concatenated. + * list of str: The listed data variables will be concatenated, in + addition to the 'minimal' data variables. + If objects are DataArrays, data_vars must be 'all'. + coords : {'minimal', 'different', 'all' or list of str}, optional + These coordinate variables will be concatenated together: + * 'minimal': Only coordinates in which the dimension already appears + are included. + * 'different': Coordinates which are not equal (ignoring attributes) + across all datasets are also concatenated (as well as all for which + dimension already appears). Beware: this option may load the data + payload of coordinate variables into memory if they are not already + loaded. + * 'all': All coordinate variables will be concatenated, except + those corresponding to other dimensions. + * list of str: The listed coordinate variables will be concatenated, + in addition the 'minimal' coordinates. + compat : {'equals', 'identical'}, optional + String indicating how to compare non-concatenated variables and + dataset global attributes for potential conflicts. 'equals' means + that all variable values and dimensions must be the same; + 'identical' means that variable attributes and global attributes + must also be equal. + positions : None or list of integer arrays, optional + List of integer arrays which specifies the integer positions to which + to assign each dataset along the concatenated dimension. If not + supplied, objects are concatenated in the provided order. 
+ fill_value : scalar, optional + Value to use for newly missing values + indexers, mode, concat_over : deprecated + + Returns + ------- + concatenated : type of objs + + See also + -------- + merge + auto_combine + """ + # TODO: add join and ignore_index arguments copied from pandas.concat + # TODO: support concatenating scalar coordinates even if the concatenated + # dimension already exists + from .dataset import Dataset + from .dataarray import DataArray + + try: + first_obj, objs = utils.peek_at(objs) + except StopIteration: + raise ValueError('must supply at least one object to concatenate') + + if dim is None: + warnings.warn('the `dim` argument to `concat` will be required ' + 'in a future version of xarray; for now, setting it to ' + "the old default of 'concat_dim'", + FutureWarning, stacklevel=2) + dim = 'concat_dims' + + if indexers is not None: # pragma: nocover + warnings.warn('indexers has been renamed to positions; the alias ' + 'will be removed in a future version of xarray', + FutureWarning, stacklevel=2) + positions = indexers + + if mode is not None: + raise ValueError('`mode` is no longer a valid argument to ' + 'xarray.concat; it has been split into the ' + '`data_vars` and `coords` arguments') + if concat_over is not None: + raise ValueError('`concat_over` is no longer a valid argument to ' + 'xarray.concat; it has been split into the ' + '`data_vars` and `coords` arguments') + + if isinstance(first_obj, DataArray): + f = _dataarray_concat + elif isinstance(first_obj, Dataset): + f = _dataset_concat + else: + raise TypeError('can only concatenate xarray Dataset and DataArray ' + 'objects, got %s' % type(first_obj)) + return f(objs, dim, data_vars, coords, compat, positions, fill_value) + + +def _calc_concat_dim_coord(dim): + """ + Infer the dimension name and 1d coordinate variable (if appropriate) + for concatenating along the new dimension. + """ + from .dataarray import DataArray + + if isinstance(dim, str): + coord = None + elif not isinstance(dim, (DataArray, Variable)): + dim_name = getattr(dim, 'name', None) + if dim_name is None: + dim_name = 'concat_dim' + coord = IndexVariable(dim_name, dim) + dim = dim_name + elif not isinstance(dim, DataArray): + coord = as_variable(dim).to_index_variable() + dim, = coord.dims + else: + coord = dim + dim, = coord.dims + return dim, coord + + +def _calc_concat_over(datasets, dim, data_vars, coords): + """ + Determine which dataset variables need to be concatenated in the result, + and which can simply be taken from the first dataset. + """ + # Return values + concat_over = set() + equals = {} + + if dim in datasets[0]: + concat_over.add(dim) + for ds in datasets: + concat_over.update(k for k, v in ds.variables.items() + if dim in v.dims) + + def process_subset_opt(opt, subset): + if isinstance(opt, str): + if opt == 'different': + # all nonindexes that are not the same in each dataset + for k in getattr(datasets[0], subset): + if k not in concat_over: + # Compare the variable of all datasets vs. the one + # of the first dataset. Perform the minimum amount of + # loads in order to avoid multiple loads from disk + # while keeping the RAM footprint low. + v_lhs = datasets[0].variables[k].load() + # We'll need to know later on if variables are equal. 
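+ # Note the first dataset is loaded in place with .load(), while the + # remaining datasets use .compute(), which returns a loaded copy; the + # copies are cached in `computed` so their values can be stored back + # onto those datasets below rather than computed a second time.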
+ computed = [] + for ds_rhs in datasets[1:]: + v_rhs = ds_rhs.variables[k].compute() + computed.append(v_rhs) + if not v_lhs.equals(v_rhs): + concat_over.add(k) + equals[k] = False + # computed variables are not to be re-computed + # again in the future + for ds, v in zip(datasets[1:], computed): + ds.variables[k].data = v.data + break + else: + equals[k] = True + + elif opt == 'all': + concat_over.update(set(getattr(datasets[0], subset)) - + set(datasets[0].dims)) + elif opt == 'minimal': + pass + else: + raise ValueError("unexpected value for %s: %s" % (subset, opt)) + else: + invalid_vars = [k for k in opt + if k not in getattr(datasets[0], subset)] + if invalid_vars: + if subset == 'coords': + raise ValueError( + 'some variables in coords are not coordinates on ' + 'the first dataset: %s' % (invalid_vars,)) + else: + raise ValueError( + 'some variables in data_vars are not data variables ' + 'on the first dataset: %s' % (invalid_vars,)) + concat_over.update(opt) + + process_subset_opt(data_vars, 'data_vars') + process_subset_opt(coords, 'coords') + return concat_over, equals + + +def _dataset_concat(datasets, dim, data_vars, coords, compat, positions, + fill_value=dtypes.NA): + """ + Concatenate a sequence of datasets along a new or existing dimension + """ + from .dataset import Dataset + + if compat not in ['equals', 'identical']: + raise ValueError("compat=%r invalid: must be 'equals' " + "or 'identical'" % compat) + + dim, coord = _calc_concat_dim_coord(dim) + # Make sure we're working on a copy (we'll be loading variables) + datasets = [ds.copy() for ds in datasets] + datasets = align(*datasets, join='outer', copy=False, exclude=[dim], + fill_value=fill_value) + + concat_over, equals = _calc_concat_over(datasets, dim, data_vars, coords) + + def insert_result_variable(k, v): + assert isinstance(v, Variable) + if k in datasets[0].coords: + result_coord_names.add(k) + result_vars[k] = v + + # create the new dataset and add constant variables + result_vars = OrderedDict() + result_coord_names = set(datasets[0].coords) + result_attrs = datasets[0].attrs + result_encoding = datasets[0].encoding + + for k, v in datasets[0].variables.items(): + if k not in concat_over: + insert_result_variable(k, v) + + # check that global attributes and non-concatenated variables are fixed + # across all datasets + for ds in datasets[1:]: + if (compat == 'identical' and + not utils.dict_equiv(ds.attrs, result_attrs)): + raise ValueError('dataset global attributes not equal') + for k, v in ds.variables.items(): + if k not in result_vars and k not in concat_over: + raise ValueError('encountered unexpected variable %r' % k) + elif (k in result_coord_names) != (k in ds.coords): + raise ValueError('%r is a coordinate in some datasets but not ' + 'others' % k) + elif k in result_vars and k != dim: + # Don't use Variable.identical as it internally invokes + # Variable.equals, and we may already know the answer + if compat == 'identical' and not utils.dict_equiv( + v.attrs, result_vars[k].attrs): + raise ValueError( + 'variable %s not identical across datasets' % k) + + # Proceed with equals() + try: + # May be populated when using the "different" method + is_equal = equals[k] + except KeyError: + result_vars[k].load() + is_equal = v.equals(result_vars[k]) + if not is_equal: + raise ValueError( + 'variable %s not equal across datasets' % k) + + # we've already verified everything is consistent; now, calculate + # shared dimension sizes so we can expand the necessary variables + dim_lengths = [ds.dims.get(dim, 
1) for ds in datasets] + non_concat_dims = {} + for ds in datasets: + non_concat_dims.update(ds.dims) + non_concat_dims.pop(dim, None) + + def ensure_common_dims(vars): + # ensure each variable with the given name shares the same + # dimensions and the same shape for all of them except along the + # concat dimension + common_dims = tuple(pd.unique([d for v in vars for d in v.dims])) + if dim not in common_dims: + common_dims = (dim,) + common_dims + for var, dim_len in zip(vars, dim_lengths): + if var.dims != common_dims: + common_shape = tuple(non_concat_dims.get(d, dim_len) + for d in common_dims) + var = var.set_dims(common_dims, common_shape) + yield var + + # stack up each variable to fill-out the dataset (in order) + for k in datasets[0].variables: + if k in concat_over: + vars = ensure_common_dims([ds.variables[k] for ds in datasets]) + combined = concat_vars(vars, dim, positions) + insert_result_variable(k, combined) + + result = Dataset(result_vars, attrs=result_attrs) + result = result.set_coords(result_coord_names) + result.encoding = result_encoding + + if coord is not None: + # add concat dimension last to ensure that its in the final Dataset + result[coord.name] = coord + + return result + + +def _dataarray_concat(arrays, dim, data_vars, coords, compat, + positions, fill_value=dtypes.NA): + arrays = list(arrays) + + if data_vars != 'all': + raise ValueError('data_vars is not a valid argument when ' + 'concatenating DataArray objects') + + datasets = [] + for n, arr in enumerate(arrays): + if n == 0: + name = arr.name + elif name != arr.name: + if compat == 'identical': + raise ValueError('array names not identical') + else: + arr = arr.rename(name) + datasets.append(arr._to_temp_dataset()) + + ds = _dataset_concat(datasets, dim, data_vars, coords, compat, + positions, fill_value=fill_value) + return arrays[0]._from_temp_dataset(ds, name) diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py index 108e85f729f..57bfc5f7ebb 100644 --- a/xarray/core/groupby.py +++ b/xarray/core/groupby.py @@ -7,7 +7,7 @@ from . 
import dtypes, duck_array_ops, nputils, ops, utils from .arithmetic import SupportsArithmetic -from .combine import concat +from .concat import concat from .common import ALL_DIMS, ImplementsArrayReduce, ImplementsDatasetReduce from .options import _get_keep_attrs from .pycompat import integer_types diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 89190eee590..efcaefa0049 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2198,7 +2198,8 @@ def test_open_mfdataset_manyfiles(readengine, nfiles, parallel, chunks, subds.to_netcdf(tmpfiles[ii], engine=writeengine) # check that calculation on opened datasets works properly - with open_mfdataset(tmpfiles, engine=readengine, parallel=parallel, + with open_mfdataset(tmpfiles, combine='manual', concat_dim='x', + engine=readengine, parallel=parallel, chunks=chunks) as actual: # check that using open_mfdataset returns dask arrays for variables @@ -2254,11 +2255,13 @@ def gen_datasets_with_common_coord_and_time(self): @pytest.mark.parametrize('opt', ['all', 'minimal', 'different']) def test_open_mfdataset_does_same_as_concat(self, opt): with self.setup_files_and_datasets() as (files, [ds1, ds2]): - with open_mfdataset(files, data_vars=opt) as ds: + with open_mfdataset(files, data_vars=opt, + combine='manual', concat_dim='t') as ds: kwargs = dict(data_vars=opt, dim='t') ds_expect = xr.concat([ds1, ds2], **kwargs) assert_identical(ds, ds_expect) - with open_mfdataset(files, coords=opt) as ds: + with open_mfdataset(files, coords=opt, + combine='manual', concat_dim='t') as ds: kwargs = dict(coords=opt, dim='t') ds_expect = xr.concat([ds1, ds2], **kwargs) assert_identical(ds, ds_expect) @@ -2268,7 +2271,8 @@ def test_common_coord_when_datavars_all(self): with self.setup_files_and_datasets() as (files, [ds1, ds2]): # open the files with the data_var option - with open_mfdataset(files, data_vars=opt) as ds: + with open_mfdataset(files, data_vars=opt, + combine='manual', concat_dim='t') as ds: coord_shape = ds[self.coord_name].shape coord_shape1 = ds1[self.coord_name].shape @@ -2285,7 +2289,8 @@ def test_common_coord_when_datavars_minimal(self): with self.setup_files_and_datasets() as (files, [ds1, ds2]): # open the files using data_vars option - with open_mfdataset(files, data_vars=opt) as ds: + with open_mfdataset(files, data_vars=opt, + combine='manual', concat_dim='t') as ds: coord_shape = ds[self.coord_name].shape coord_shape1 = ds1[self.coord_name].shape @@ -2301,12 +2306,14 @@ def test_invalid_data_vars_value_should_fail(self): with self.setup_files_and_datasets() as (files, _): with pytest.raises(ValueError): - with open_mfdataset(files, data_vars='minimum'): + with open_mfdataset(files, data_vars='minimum', + combine='auto'): pass # test invalid coord parameter with pytest.raises(ValueError): - with open_mfdataset(files, coords='minimum'): + with open_mfdataset(files, coords='minimum', + combine='auto'): pass @@ -2374,11 +2381,14 @@ def test_open_mfdataset(self): with create_tmp_file() as tmp2: original.isel(x=slice(5)).to_netcdf(tmp1) original.isel(x=slice(5, 10)).to_netcdf(tmp2) - with open_mfdataset([tmp1, tmp2]) as actual: + with open_mfdataset([tmp1, tmp2], concat_dim='x', + combine='manual') as actual: assert isinstance(actual.foo.variable.data, da.Array) assert actual.foo.variable.data.chunks == ((5, 5),) assert_identical(original, actual) - with open_mfdataset([tmp1, tmp2], chunks={'x': 3}) as actual: + with open_mfdataset([tmp1, tmp2], concat_dim='x', + combine='manual', + 
chunks={'x': 3}) as actual: assert actual.foo.variable.data.chunks == ((3, 2, 3, 2),) with raises_regex(IOError, 'no files to open'): @@ -2403,6 +2413,7 @@ def test_open_mfdataset_2d(self): y=slice(4, 8)).to_netcdf(tmp4) with open_mfdataset([[tmp1, tmp2], [tmp3, tmp4]], + combine='manual', concat_dim=['y', 'x']) as actual: assert isinstance(actual.foo.variable.data, da.Array) @@ -2411,6 +2422,7 @@ def test_open_mfdataset_2d(self): assert_identical(original, actual) with open_mfdataset([[tmp1, tmp2], [tmp3, tmp4]], + combine='manual', concat_dim=['y', 'x'], chunks={'x': 3, 'y': 2}) as actual: assert actual.foo.variable.data.chunks == \ @@ -2425,7 +2437,8 @@ def test_open_mfdataset_pathlib(self): tmp2 = Path(tmp2) original.isel(x=slice(5)).to_netcdf(tmp1) original.isel(x=slice(5, 10)).to_netcdf(tmp2) - with open_mfdataset([tmp1, tmp2]) as actual: + with open_mfdataset([tmp1, tmp2], concat_dim='x', + combine='manual') as actual: assert_identical(original, actual) @requires_pathlib @@ -2449,10 +2462,10 @@ def test_open_mfdataset_2d_pathlib(self): y=slice(4, 8)).to_netcdf(tmp4) with open_mfdataset([[tmp1, tmp2], [tmp3, tmp4]], + combine='manual', concat_dim=['y', 'x']) as actual: assert_identical(original, actual) - @pytest.mark.xfail(reason="Not yet implemented") def test_open_mfdataset_2(self): original = Dataset({'foo': ('x', np.random.randn(10))}) with create_tmp_file() as tmp1: @@ -2460,11 +2473,8 @@ def test_open_mfdataset_2(self): original.isel(x=slice(5)).to_netcdf(tmp1) original.isel(x=slice(5, 10)).to_netcdf(tmp2) - with pytest.raises(NotImplementedError): - open_mfdataset([tmp1, tmp2], infer_order_from_coords=True) - - # With infer_order_from_coords=True this should pass in future - with open_mfdataset([tmp1, tmp2]) as actual: + with open_mfdataset([tmp1, tmp2], concat_dim='x', + combine='manual') as actual: assert_identical(original, actual) def test_attrs_mfdataset(self): @@ -2477,7 +2487,8 @@ def test_attrs_mfdataset(self): ds2.attrs['test2'] = 'bar' ds1.to_netcdf(tmp1) ds2.to_netcdf(tmp2) - with open_mfdataset([tmp1, tmp2]) as actual: + with open_mfdataset([tmp1, tmp2], concat_dim='x', + combine='manual') as actual: # presumes that attributes inherited from # first dataset loaded assert actual.test1 == ds1.test1 @@ -2486,6 +2497,28 @@ def test_attrs_mfdataset(self): 'no attribute'): actual.test2 + def test_open_mfdataset_auto_combine(self): + original = Dataset({'foo': ('x', np.random.randn(10)), + 'x': np.arange(10)}) + with create_tmp_file() as tmp1: + with create_tmp_file() as tmp2: + original.isel(x=slice(5)).to_netcdf(tmp1) + original.isel(x=slice(5, 10)).to_netcdf(tmp2) + + with open_mfdataset([tmp2, tmp1], combine='auto') as actual: + assert_identical(original, actual) + + def test_open_mfdataset_combine_manual_no_concat_dim(self): + original = Dataset({'foo': ('x', np.random.randn(10)), + 'x': np.arange(10)}) + with create_tmp_file() as tmp1: + with create_tmp_file() as tmp2: + original.isel(x=slice(5)).to_netcdf(tmp1) + original.isel(x=slice(5, 10)).to_netcdf(tmp2) + + with raises_regex(ValueError, 'Must supply concat_dim'): + open_mfdataset([tmp2, tmp1], combine='manual') + @pytest.mark.xfail(reason='mfdataset loses encoding currently.') def test_encoding_mfdataset(self): original = Dataset({'foo': ('t', np.random.randn(10)), @@ -2516,7 +2549,8 @@ def preprocess(ds): return ds.assign_coords(z=0) expected = preprocess(original) - with open_mfdataset(tmp, preprocess=preprocess) as actual: + with open_mfdataset(tmp, preprocess=preprocess, + combine='auto') as actual: 
assert_identical(expected, actual) def test_save_mfdataset_roundtrip(self): @@ -2526,7 +2560,8 @@ def test_save_mfdataset_roundtrip(self): with create_tmp_file() as tmp1: with create_tmp_file() as tmp2: save_mfdataset(datasets, [tmp1, tmp2]) - with open_mfdataset([tmp1, tmp2]) as actual: + with open_mfdataset([tmp1, tmp2], concat_dim='x', + combine='manual') as actual: assert_identical(actual, original) def test_save_mfdataset_invalid(self): @@ -2552,14 +2587,15 @@ def test_save_mfdataset_pathlib_roundtrip(self): tmp1 = Path(tmp1) tmp2 = Path(tmp2) save_mfdataset(datasets, [tmp1, tmp2]) - with open_mfdataset([tmp1, tmp2]) as actual: + with open_mfdataset([tmp1, tmp2], concat_dim='x', + combine='manual') as actual: assert_identical(actual, original) def test_open_and_do_math(self): original = Dataset({'foo': ('x', np.random.randn(10))}) with create_tmp_file() as tmp: original.to_netcdf(tmp) - with open_mfdataset(tmp) as ds: + with open_mfdataset(tmp, combine='auto') as ds: actual = 1.0 * ds assert_allclose(original, actual, decode_bytes=False) @@ -2569,7 +2605,8 @@ def test_open_mfdataset_concat_dim_none(self): data = Dataset({'x': 0}) data.to_netcdf(tmp1) Dataset({'x': np.nan}).to_netcdf(tmp2) - with open_mfdataset([tmp1, tmp2], concat_dim=None) as actual: + with open_mfdataset([tmp1, tmp2], concat_dim=None, + combine='manual') as actual: assert_identical(data, actual) def test_open_dataset(self): @@ -2596,7 +2633,8 @@ def test_open_single_dataset(self): {'baz': [100]}) with create_tmp_file() as tmp: original.to_netcdf(tmp) - with open_mfdataset([tmp], concat_dim=dim) as actual: + with open_mfdataset([tmp], concat_dim=dim, + combine='manual') as actual: assert_identical(expected, actual) def test_open_multi_dataset(self): @@ -2619,7 +2657,8 @@ def test_open_multi_dataset(self): create_tmp_file() as tmp2: original.to_netcdf(tmp1) original.to_netcdf(tmp2) - with open_mfdataset([tmp1, tmp2], concat_dim=dim) as actual: + with open_mfdataset([tmp1, tmp2], concat_dim=dim, + combine='manual') as actual: assert_identical(expected, actual) def test_dask_roundtrip(self): @@ -2638,10 +2677,10 @@ def test_deterministic_names(self): with create_tmp_file() as tmp: data = create_test_data() data.to_netcdf(tmp) - with open_mfdataset(tmp) as ds: + with open_mfdataset(tmp, combine='auto') as ds: original_names = dict((k, v.data.name) for k, v in ds.data_vars.items()) - with open_mfdataset(tmp) as ds: + with open_mfdataset(tmp, combine='auto') as ds: repeat_names = dict((k, v.data.name) for k, v in ds.data_vars.items()) for var_name, dask_name in original_names.items(): @@ -2671,7 +2710,8 @@ def test_save_mfdataset_compute_false_roundtrip(self): engine=self.engine, compute=False) assert isinstance(delayed_obj, Delayed) delayed_obj.compute() - with open_mfdataset([tmp1, tmp2]) as actual: + with open_mfdataset([tmp1, tmp2], combine='manual', + concat_dim='x') as actual: assert_identical(actual, original) def test_load_dataset(self): @@ -2692,6 +2732,59 @@ def test_load_dataarray(self): ds.to_netcdf(tmp) +@requires_scipy_or_netCDF4 +class TestOpenMFDataSetDeprecation: + """ + Set of tests to check that FutureWarnings are correctly raised until the + deprecation cycle is complete. 
#2616 + """ + def test_open_mfdataset_with_concat_dim(self): + ds1, ds2 = Dataset({'x': [0]}), Dataset({'x': [1]}) + with create_tmp_file() as tmp1: + with create_tmp_file() as tmp2: + ds1.to_netcdf(tmp1) + ds2.to_netcdf(tmp2) + + with pytest.warns(FutureWarning, match="`concat_dim`"): + open_mfdataset([tmp1, tmp2], concat_dim='x') + + def test_auto_combine_with_merge_and_concat(self): + ds1, ds2 = Dataset({'x': [0]}), Dataset({'x': [1]}) + ds3 = Dataset({'z': ((), 99)}) + with create_tmp_file() as tmp1: + with create_tmp_file() as tmp2: + with create_tmp_file() as tmp3: + ds1.to_netcdf(tmp1) + ds2.to_netcdf(tmp2) + ds3.to_netcdf(tmp3) + + with pytest.warns(FutureWarning, + match="require both concatenation"): + open_mfdataset([tmp1, tmp2, tmp3]) + + def test_auto_combine_with_coords(self): + ds1 = Dataset({'foo': ('x', [0])}, coords={'x': ('x', [0])}) + ds2 = Dataset({'foo': ('x', [1])}, coords={'x': ('x', [1])}) + with create_tmp_file() as tmp1: + with create_tmp_file() as tmp2: + ds1.to_netcdf(tmp1) + ds2.to_netcdf(tmp2) + + with pytest.warns(FutureWarning, match="supplied have global"): + open_mfdataset([tmp1, tmp2]) + + def test_auto_combine_without_coords(self): + ds1, ds2 = Dataset({'foo': ('x', [0])}), Dataset({'foo': ('x', [1])}) + with create_tmp_file() as tmp1: + with create_tmp_file() as tmp2: + ds1.to_netcdf(tmp1) + ds2.to_netcdf(tmp2) + + with pytest.warns(FutureWarning, + match="supplied do not have global"): + open_mfdataset([tmp1, tmp2]) + + @requires_scipy_or_netCDF4 @requires_pydap class TestPydap: @@ -3010,7 +3103,7 @@ def test_uamiv_format_mfread(self): ['example.uamiv', 'example.uamiv'], engine='pseudonetcdf', - concat_dim=['TSTEP'], + concat_dim='TSTEP', backend_kwargs={'format': 'uamiv'}) data1 = np.arange(20, dtype='f').reshape(1, 1, 4, 5) diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index e9b63dd18fc..adbd85675fa 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -1,465 +1,22 @@ from collections import OrderedDict -from copy import deepcopy from itertools import product +from datetime import datetime import numpy as np -import pandas as pd import pytest -from xarray import DataArray, Dataset, Variable, auto_combine, concat +from xarray import DataArray, Dataset, concat, combine_auto, combine_manual +from xarray import auto_combine from xarray.core import dtypes from xarray.core.combine import ( - _auto_combine, _auto_combine_1d, _auto_combine_all_along_first_dim, - _check_shape_tile_ids, _combine_nd, _infer_concat_order_from_positions, - _infer_tile_ids_from_nested_list, _new_tile_id) + _new_tile_id, _check_shape_tile_ids, _combine_all_along_first_dim, + _combine_nd, _infer_concat_order_from_positions, + _infer_concat_order_from_coords) -from . import ( - InaccessibleArray, assert_array_equal, assert_equal, assert_identical, - raises_regex, requires_dask) +from . 
import (assert_identical, assert_equal, raises_regex) from .test_dataset import create_test_data -class TestConcatDataset: - def test_concat(self): - # TODO: simplify and split this test case - - # drop the third dimension to keep things relatively understandable - data = create_test_data() - for k in list(data.variables): - if 'dim3' in data[k].dims: - del data[k] - - split_data = [data.isel(dim1=slice(3)), - data.isel(dim1=slice(3, None))] - assert_identical(data, concat(split_data, 'dim1')) - - def rectify_dim_order(dataset): - # return a new dataset with all variable dimensions transposed into - # the order in which they are found in `data` - return Dataset(dict((k, v.transpose(*data[k].dims)) - for k, v in dataset.data_vars.items()), - dataset.coords, attrs=dataset.attrs) - - for dim in ['dim1', 'dim2']: - datasets = [g for _, g in data.groupby(dim, squeeze=False)] - assert_identical(data, concat(datasets, dim)) - - dim = 'dim2' - assert_identical( - data, concat(datasets, data[dim])) - assert_identical( - data, concat(datasets, data[dim], coords='minimal')) - - datasets = [g for _, g in data.groupby(dim, squeeze=True)] - concat_over = [k for k, v in data.coords.items() - if dim in v.dims and k != dim] - actual = concat(datasets, data[dim], coords=concat_over) - assert_identical(data, rectify_dim_order(actual)) - - actual = concat(datasets, data[dim], coords='different') - assert_identical(data, rectify_dim_order(actual)) - - # make sure the coords argument behaves as expected - data.coords['extra'] = ('dim4', np.arange(3)) - for dim in ['dim1', 'dim2']: - datasets = [g for _, g in data.groupby(dim, squeeze=True)] - actual = concat(datasets, data[dim], coords='all') - expected = np.array([data['extra'].values - for _ in range(data.dims[dim])]) - assert_array_equal(actual['extra'].values, expected) - - actual = concat(datasets, data[dim], coords='different') - assert_equal(data['extra'], actual['extra']) - actual = concat(datasets, data[dim], coords='minimal') - assert_equal(data['extra'], actual['extra']) - - # verify that the dim argument takes precedence over - # concatenating dataset variables of the same name - dim = (2 * data['dim1']).rename('dim1') - datasets = [g for _, g in data.groupby('dim1', squeeze=False)] - expected = data.copy() - expected['dim1'] = dim - assert_identical(expected, concat(datasets, dim)) - - def test_concat_data_vars(self): - data = Dataset({'foo': ('x', np.random.randn(10))}) - objs = [data.isel(x=slice(5)), data.isel(x=slice(5, None))] - for data_vars in ['minimal', 'different', 'all', [], ['foo']]: - actual = concat(objs, dim='x', data_vars=data_vars) - assert_identical(data, actual) - - def test_concat_coords(self): - data = Dataset({'foo': ('x', np.random.randn(10))}) - expected = data.assign_coords(c=('x', [0] * 5 + [1] * 5)) - objs = [data.isel(x=slice(5)).assign_coords(c=0), - data.isel(x=slice(5, None)).assign_coords(c=1)] - for coords in ['different', 'all', ['c']]: - actual = concat(objs, dim='x', coords=coords) - assert_identical(expected, actual) - for coords in ['minimal', []]: - with raises_regex(ValueError, 'not equal across'): - concat(objs, dim='x', coords=coords) - - def test_concat_constant_index(self): - # GH425 - ds1 = Dataset({'foo': 1.5}, {'y': 1}) - ds2 = Dataset({'foo': 2.5}, {'y': 1}) - expected = Dataset({'foo': ('y', [1.5, 2.5]), 'y': [1, 1]}) - for mode in ['different', 'all', ['foo']]: - actual = concat([ds1, ds2], 'y', data_vars=mode) - assert_identical(expected, actual) - with raises_regex(ValueError, 'not equal across 
datasets'): - concat([ds1, ds2], 'y', data_vars='minimal') - - def test_concat_size0(self): - data = create_test_data() - split_data = [data.isel(dim1=slice(0, 0)), data] - actual = concat(split_data, 'dim1') - assert_identical(data, actual) - - actual = concat(split_data[::-1], 'dim1') - assert_identical(data, actual) - - def test_concat_autoalign(self): - ds1 = Dataset({'foo': DataArray([1, 2], coords=[('x', [1, 2])])}) - ds2 = Dataset({'foo': DataArray([1, 2], coords=[('x', [1, 3])])}) - actual = concat([ds1, ds2], 'y') - expected = Dataset({'foo': DataArray([[1, 2, np.nan], [1, np.nan, 2]], - dims=['y', 'x'], - coords={'x': [1, 2, 3]})}) - assert_identical(expected, actual) - - def test_concat_errors(self): - data = create_test_data() - split_data = [data.isel(dim1=slice(3)), - data.isel(dim1=slice(3, None))] - - with raises_regex(ValueError, 'must supply at least one'): - concat([], 'dim1') - - with raises_regex(ValueError, 'are not coordinates'): - concat([data, data], 'new_dim', coords=['not_found']) - - with raises_regex(ValueError, 'global attributes not'): - data0, data1 = deepcopy(split_data) - data1.attrs['foo'] = 'bar' - concat([data0, data1], 'dim1', compat='identical') - assert_identical( - data, concat([data0, data1], 'dim1', compat='equals')) - - with raises_regex(ValueError, 'encountered unexpected'): - data0, data1 = deepcopy(split_data) - data1['foo'] = ('bar', np.random.randn(10)) - concat([data0, data1], 'dim1') - - with raises_regex(ValueError, 'compat.* invalid'): - concat(split_data, 'dim1', compat='foobar') - - with raises_regex(ValueError, 'unexpected value for'): - concat([data, data], 'new_dim', coords='foobar') - - with raises_regex( - ValueError, 'coordinate in some datasets but not others'): - concat([Dataset({'x': 0}), Dataset({'x': [1]})], dim='z') - - with raises_regex( - ValueError, 'coordinate in some datasets but not others'): - concat([Dataset({'x': 0}), Dataset({}, {'x': 1})], dim='z') - - with raises_regex(ValueError, 'no longer a valid'): - concat([data, data], 'new_dim', mode='different') - with raises_regex(ValueError, 'no longer a valid'): - concat([data, data], 'new_dim', concat_over='different') - - def test_concat_promote_shape(self): - # mixed dims within variables - objs = [Dataset({}, {'x': 0}), Dataset({'x': [1]})] - actual = concat(objs, 'x') - expected = Dataset({'x': [0, 1]}) - assert_identical(actual, expected) - - objs = [Dataset({'x': [0]}), Dataset({}, {'x': 1})] - actual = concat(objs, 'x') - assert_identical(actual, expected) - - # mixed dims between variables - objs = [Dataset({'x': [2], 'y': 3}), Dataset({'x': [4], 'y': 5})] - actual = concat(objs, 'x') - expected = Dataset({'x': [2, 4], 'y': ('x', [3, 5])}) - assert_identical(actual, expected) - - # mixed dims in coord variable - objs = [Dataset({'x': [0]}, {'y': -1}), - Dataset({'x': [1]}, {'y': ('x', [-2])})] - actual = concat(objs, 'x') - expected = Dataset({'x': [0, 1]}, {'y': ('x', [-1, -2])}) - assert_identical(actual, expected) - - # scalars with mixed lengths along concat dim -- values should repeat - objs = [Dataset({'x': [0]}, {'y': -1}), - Dataset({'x': [1, 2]}, {'y': -2})] - actual = concat(objs, 'x') - expected = Dataset({'x': [0, 1, 2]}, {'y': ('x', [-1, -2, -2])}) - assert_identical(actual, expected) - - # broadcast 1d x 1d -> 2d - objs = [Dataset({'z': ('x', [-1])}, {'x': [0], 'y': [0]}), - Dataset({'z': ('y', [1])}, {'x': [1], 'y': [0]})] - actual = concat(objs, 'x') - expected = Dataset({'z': (('x', 'y'), [[-1], [1]])}, - {'x': [0, 1], 'y': [0]}) - 
assert_identical(actual, expected) - - def test_concat_do_not_promote(self): - # GH438 - objs = [Dataset({'y': ('t', [1])}, {'x': 1, 't': [0]}), - Dataset({'y': ('t', [2])}, {'x': 1, 't': [0]})] - expected = Dataset({'y': ('t', [1, 2])}, {'x': 1, 't': [0, 0]}) - actual = concat(objs, 't') - assert_identical(expected, actual) - - objs = [Dataset({'y': ('t', [1])}, {'x': 1, 't': [0]}), - Dataset({'y': ('t', [2])}, {'x': 2, 't': [0]})] - with pytest.raises(ValueError): - concat(objs, 't', coords='minimal') - - def test_concat_dim_is_variable(self): - objs = [Dataset({'x': 0}), Dataset({'x': 1})] - coord = Variable('y', [3, 4]) - expected = Dataset({'x': ('y', [0, 1]), 'y': [3, 4]}) - actual = concat(objs, coord) - assert_identical(actual, expected) - - def test_concat_multiindex(self): - x = pd.MultiIndex.from_product([[1, 2, 3], ['a', 'b']]) - expected = Dataset({'x': x}) - actual = concat([expected.isel(x=slice(2)), - expected.isel(x=slice(2, None))], 'x') - assert expected.equals(actual) - assert isinstance(actual.x.to_index(), pd.MultiIndex) - - @pytest.mark.parametrize('fill_value', [dtypes.NA, 2, 2.0]) - def test_concat_fill_value(self, fill_value): - datasets = [Dataset({'a': ('x', [2, 3]), 'x': [1, 2]}), - Dataset({'a': ('x', [1, 2]), 'x': [0, 1]})] - if fill_value == dtypes.NA: - # if we supply the default, we expect the missing value for a - # float array - fill_value = np.nan - expected = Dataset({'a': (('t', 'x'), - [[fill_value, 2, 3], [1, 2, fill_value]])}, - {'x': [0, 1, 2]}) - actual = concat(datasets, dim='t', fill_value=fill_value) - assert_identical(actual, expected) - - -class TestConcatDataArray: - def test_concat(self): - ds = Dataset({'foo': (['x', 'y'], np.random.random((2, 3))), - 'bar': (['x', 'y'], np.random.random((2, 3)))}, - {'x': [0, 1]}) - foo = ds['foo'] - bar = ds['bar'] - - # from dataset array: - expected = DataArray(np.array([foo.values, bar.values]), - dims=['w', 'x', 'y'], coords={'x': [0, 1]}) - actual = concat([foo, bar], 'w') - assert_equal(expected, actual) - # from iteration: - grouped = [g for _, g in foo.groupby('x')] - stacked = concat(grouped, ds['x']) - assert_identical(foo, stacked) - # with an index as the 'dim' argument - stacked = concat(grouped, ds.indexes['x']) - assert_identical(foo, stacked) - - actual = concat([foo[0], foo[1]], pd.Index([0, 1]) - ).reset_coords(drop=True) - expected = foo[:2].rename({'x': 'concat_dim'}) - assert_identical(expected, actual) - - actual = concat([foo[0], foo[1]], [0, 1]).reset_coords(drop=True) - expected = foo[:2].rename({'x': 'concat_dim'}) - assert_identical(expected, actual) - - with raises_regex(ValueError, 'not identical'): - concat([foo, bar], dim='w', compat='identical') - - with raises_regex(ValueError, 'not a valid argument'): - concat([foo, bar], dim='w', data_vars='minimal') - - def test_concat_encoding(self): - # Regression test for GH1297 - ds = Dataset({'foo': (['x', 'y'], np.random.random((2, 3))), - 'bar': (['x', 'y'], np.random.random((2, 3)))}, - {'x': [0, 1]}) - foo = ds['foo'] - foo.encoding = {"complevel": 5} - ds.encoding = {"unlimited_dims": 'x'} - assert concat([foo, foo], dim="x").encoding == foo.encoding - assert concat([ds, ds], dim="x").encoding == ds.encoding - - @pytest.mark.parametrize("colors, expected_name", - [(['blue', 'green', 'red'], None), - (['red', 'red', 'red'], 'red')]) - def test_concat_determine_name(self, colors, expected_name): - das = [DataArray(np.random.random((2, 2)), dims=['x', 'y'], name=k) - for k in colors] - result = concat(das, dim="band") - assert 
result.name is expected_name - - @requires_dask - def test_concat_lazy(self): - import dask.array as da - - arrays = [DataArray( - da.from_array(InaccessibleArray(np.zeros((3, 3))), 3), - dims=['x', 'y']) for _ in range(2)] - # should not raise - combined = concat(arrays, dim='z') - assert combined.shape == (2, 3, 3) - assert combined.dims == ('z', 'x', 'y') - - @pytest.mark.parametrize('fill_value', [dtypes.NA, 2, 2.0]) - def test_concat_fill_value(self, fill_value): - foo = DataArray([1, 2], coords=[('x', [1, 2])]) - bar = DataArray([1, 2], coords=[('x', [1, 3])]) - if fill_value == dtypes.NA: - # if we supply the default, we expect the missing value for a - # float array - fill_value = np.nan - expected = DataArray([[1, 2, fill_value], [1, fill_value, 2]], - dims=['y', 'x'], coords={'x': [1, 2, 3]}) - actual = concat((foo, bar), dim='y', fill_value=fill_value) - assert_identical(actual, expected) - - -class TestAutoCombine: - - @pytest.mark.parametrize("combine", [_auto_combine_1d, auto_combine]) - @requires_dask # only for toolz - def test_auto_combine(self, combine): - objs = [Dataset({'x': [0]}), Dataset({'x': [1]})] - actual = combine(objs) - expected = Dataset({'x': [0, 1]}) - assert_identical(expected, actual) - - actual = combine([actual]) - assert_identical(expected, actual) - - objs = [Dataset({'x': [0, 1]}), Dataset({'x': [2]})] - actual = combine(objs) - expected = Dataset({'x': [0, 1, 2]}) - assert_identical(expected, actual) - - # ensure auto_combine handles non-sorted variables - objs = [Dataset(OrderedDict([('x', ('a', [0])), ('y', ('a', [0]))])), - Dataset(OrderedDict([('y', ('a', [1])), ('x', ('a', [1]))]))] - actual = combine(objs) - expected = Dataset({'x': ('a', [0, 1]), 'y': ('a', [0, 1])}) - assert_identical(expected, actual) - - objs = [Dataset({'x': [0], 'y': [0]}), Dataset({'y': [1], 'x': [1]})] - with raises_regex(ValueError, 'too many .* dimensions'): - combine(objs) - - objs = [Dataset({'x': 0}), Dataset({'x': 1})] - with raises_regex(ValueError, 'cannot infer dimension'): - combine(objs) - - objs = [Dataset({'x': [0], 'y': [0]}), Dataset({'x': [0]})] - with pytest.raises(KeyError): - combine(objs) - - @requires_dask # only for toolz - def test_auto_combine_previously_failed(self): - # In the above scenario, one file is missing, containing the data for - # one year's data for one variable. - datasets = [Dataset({'a': ('x', [0]), 'x': [0]}), - Dataset({'b': ('x', [0]), 'x': [0]}), - Dataset({'a': ('x', [1]), 'x': [1]})] - expected = Dataset({'a': ('x', [0, 1]), 'b': ('x', [0, np.nan])}, - {'x': [0, 1]}) - actual = auto_combine(datasets) - assert_identical(expected, actual) - - # Your data includes "time" and "station" dimensions, and each year's - # data has a different set of stations. 
- datasets = [Dataset({'a': ('x', [2, 3]), 'x': [1, 2]}), - Dataset({'a': ('x', [1, 2]), 'x': [0, 1]})] - expected = Dataset({'a': (('t', 'x'), - [[np.nan, 2, 3], [1, 2, np.nan]])}, - {'x': [0, 1, 2]}) - actual = auto_combine(datasets, concat_dim='t') - assert_identical(expected, actual) - - @requires_dask # only for toolz - def test_auto_combine_still_fails(self): - # concat can't handle new variables (yet): - # https://github.com/pydata/xarray/issues/508 - datasets = [Dataset({'x': 0}, {'y': 0}), - Dataset({'x': 1}, {'y': 1, 'z': 1})] - with pytest.raises(ValueError): - auto_combine(datasets, 'y') - - @requires_dask # only for toolz - def test_auto_combine_no_concat(self): - objs = [Dataset({'x': 0}), Dataset({'y': 1})] - actual = auto_combine(objs) - expected = Dataset({'x': 0, 'y': 1}) - assert_identical(expected, actual) - - objs = [Dataset({'x': 0, 'y': 1}), Dataset({'y': np.nan, 'z': 2})] - actual = auto_combine(objs) - expected = Dataset({'x': 0, 'y': 1, 'z': 2}) - assert_identical(expected, actual) - - data = Dataset({'x': 0}) - actual = auto_combine([data, data, data], concat_dim=None) - assert_identical(data, actual) - - tmp1 = Dataset({'x': 0}) - tmp2 = Dataset({'x': np.nan}) - actual = auto_combine([tmp1, tmp2], concat_dim=None) - assert_identical(tmp1, actual) - actual = auto_combine([tmp1, tmp2], concat_dim=[None]) - assert_identical(tmp1, actual) - - # Single object, with a concat_dim explicitly provided - # Test the issue reported in GH #1988 - objs = [Dataset({'x': 0, 'y': 1})] - dim = DataArray([100], name='baz', dims='baz') - actual = auto_combine(objs, concat_dim=dim) - expected = Dataset({'x': ('baz', [0]), 'y': ('baz', [1])}, - {'baz': [100]}) - assert_identical(expected, actual) - - # Just making sure that auto_combine is doing what is - # expected for non-scalar values, too. 
- objs = [Dataset({'x': ('z', [0, 1]), 'y': ('z', [1, 2])})] - dim = DataArray([100], name='baz', dims='baz') - actual = auto_combine(objs, concat_dim=dim) - expected = Dataset({'x': (('baz', 'z'), [[0, 1]]), - 'y': (('baz', 'z'), [[1, 2]])}, - {'baz': [100]}) - assert_identical(expected, actual) - - @pytest.mark.parametrize('fill_value', [dtypes.NA, 2, 2.0]) - def test_auto_combine_fill_value(self, fill_value): - datasets = [Dataset({'a': ('x', [2, 3]), 'x': [1, 2]}), - Dataset({'a': ('x', [1, 2]), 'x': [0, 1]})] - if fill_value == dtypes.NA: - # if we supply the default, we expect the missing value for a - # float array - fill_value = np.nan - expected = Dataset({'a': (('t', 'x'), - [[fill_value, 2, 3], [1, 2, fill_value]])}, - {'x': [0, 1, 2]}) - actual = auto_combine(datasets, concat_dim='t', fill_value=fill_value) - assert_identical(expected, actual) - - def assert_combined_tile_ids_equal(dict1, dict2): assert len(dict1) == len(dict2) for k, v in dict1.items(): @@ -473,7 +30,7 @@ def test_1d(self): input = [ds(0), ds(1)] expected = {(0,): ds(0), (1,): ds(1)} - actual = dict(_infer_tile_ids_from_nested_list(input, ())) + actual = _infer_concat_order_from_positions(input) assert_combined_tile_ids_equal(expected, actual) def test_2d(self): @@ -483,7 +40,7 @@ def test_2d(self): expected = {(0, 0): ds(0), (0, 1): ds(1), (1, 0): ds(2), (1, 1): ds(3), (2, 0): ds(4), (2, 1): ds(5)} - actual = dict(_infer_tile_ids_from_nested_list(input, ())) + actual = _infer_concat_order_from_positions(input) assert_combined_tile_ids_equal(expected, actual) def test_3d(self): @@ -497,7 +54,7 @@ def test_3d(self): (1, 0, 0): ds(6), (1, 0, 1): ds(7), (1, 1, 0): ds(8), (1, 1, 1): ds(9), (1, 2, 0): ds(10), (1, 2, 1): ds(11)} - actual = dict(_infer_tile_ids_from_nested_list(input, ())) + actual = _infer_concat_order_from_positions(input) assert_combined_tile_ids_equal(expected, actual) def test_single_dataset(self): @@ -505,7 +62,7 @@ def test_single_dataset(self): input = [ds] expected = {(0,): ds} - actual = dict(_infer_tile_ids_from_nested_list(input, ())) + actual = _infer_concat_order_from_positions(input) assert_combined_tile_ids_equal(expected, actual) def test_redundant_nesting(self): @@ -513,14 +70,14 @@ def test_redundant_nesting(self): input = [[ds(0)], [ds(1)]] expected = {(0, 0): ds(0), (1, 0): ds(1)} - actual = dict(_infer_tile_ids_from_nested_list(input, ())) + actual = _infer_concat_order_from_positions(input) assert_combined_tile_ids_equal(expected, actual) def test_ignore_empty_list(self): ds = create_test_data(0) input = [ds, []] expected = {(0,): ds} - actual = dict(_infer_tile_ids_from_nested_list(input, ())) + actual = _infer_concat_order_from_positions(input) assert_combined_tile_ids_equal(expected, actual) def test_uneven_depth_input(self): @@ -530,7 +87,7 @@ def test_uneven_depth_input(self): input = [ds(0), [ds(1), ds(2)]] expected = {(0,): ds(0), (1, 0): ds(1), (1, 1): ds(2)} - actual = dict(_infer_tile_ids_from_nested_list(input, ())) + actual = _infer_concat_order_from_positions(input) assert_combined_tile_ids_equal(expected, actual) def test_uneven_length_input(self): @@ -540,7 +97,7 @@ def test_uneven_length_input(self): input = [[ds(0)], [ds(1), ds(2)]] expected = {(0, 0): ds(0), (1, 0): ds(1), (1, 1): ds(2)} - actual = dict(_infer_tile_ids_from_nested_list(input, ())) + actual = _infer_concat_order_from_positions(input) assert_combined_tile_ids_equal(expected, actual) def test_infer_from_datasets(self): @@ -548,13 +105,105 @@ def test_infer_from_datasets(self): input = [ds(0), 
ds(1)] expected = {(0,): ds(0), (1,): ds(1)} - actual, concat_dims = _infer_concat_order_from_positions(input, [ - 'dim1']) + actual = _infer_concat_order_from_positions(input) assert_combined_tile_ids_equal(expected, actual) - input = [ds(0), ds(1)] - with pytest.raises(ValueError): - _infer_concat_order_from_positions(input, ['dim1', 'extra_dim']) + +class TestTileIDsFromCoords: + def test_1d(self): + ds0 = Dataset({'x': [0, 1]}) + ds1 = Dataset({'x': [2, 3]}) + + expected = {(0,): ds0, (1,): ds1} + actual, concat_dims = _infer_concat_order_from_coords([ds1, ds0]) + assert_combined_tile_ids_equal(expected, actual) + assert concat_dims == ['x'] + + def test_2d(self): + ds0 = Dataset({'x': [0, 1], 'y': [10, 20, 30]}) + ds1 = Dataset({'x': [2, 3], 'y': [10, 20, 30]}) + ds2 = Dataset({'x': [0, 1], 'y': [40, 50, 60]}) + ds3 = Dataset({'x': [2, 3], 'y': [40, 50, 60]}) + ds4 = Dataset({'x': [0, 1], 'y': [70, 80, 90]}) + ds5 = Dataset({'x': [2, 3], 'y': [70, 80, 90]}) + + expected = {(0, 0): ds0, (1, 0): ds1, + (0, 1): ds2, (1, 1): ds3, + (0, 2): ds4, (1, 2): ds5} + actual, concat_dims = _infer_concat_order_from_coords([ds1, ds0, ds3, + ds5, ds2, ds4]) + assert_combined_tile_ids_equal(expected, actual) + assert concat_dims == ['x', 'y'] + + def test_no_dimension_coords(self): + ds0 = Dataset({'foo': ('x', [0, 1])}) + ds1 = Dataset({'foo': ('x', [2, 3])}) + with raises_regex(ValueError, "Could not find any dimension"): + _infer_concat_order_from_coords([ds1, ds0]) + + def test_coord_not_monotonic(self): + ds0 = Dataset({'x': [0, 1]}) + ds1 = Dataset({'x': [3, 2]}) + with raises_regex(ValueError, "Coordinate variable x is neither " + "monotonically increasing nor"): + _infer_concat_order_from_coords([ds1, ds0]) + + def test_coord_monotonically_decreasing(self): + ds0 = Dataset({'x': [3, 2]}) + ds1 = Dataset({'x': [1, 0]}) + + expected = {(0,): ds0, (1,): ds1} + actual, concat_dims = _infer_concat_order_from_coords([ds1, ds0]) + assert_combined_tile_ids_equal(expected, actual) + assert concat_dims == ['x'] + + def test_no_concatenation_needed(self): + ds = Dataset({'foo': ('x', [0, 1])}) + expected = {(): ds} + actual, concat_dims = _infer_concat_order_from_coords([ds]) + assert_combined_tile_ids_equal(expected, actual) + assert concat_dims == [] + + def test_2d_plus_bystander_dim(self): + ds0 = Dataset({'x': [0, 1], 'y': [10, 20, 30], 't': [0.1, 0.2]}) + ds1 = Dataset({'x': [2, 3], 'y': [10, 20, 30], 't': [0.1, 0.2]}) + ds2 = Dataset({'x': [0, 1], 'y': [40, 50, 60], 't': [0.1, 0.2]}) + ds3 = Dataset({'x': [2, 3], 'y': [40, 50, 60], 't': [0.1, 0.2]}) + + expected = {(0, 0): ds0, (1, 0): ds1, + (0, 1): ds2, (1, 1): ds3} + actual, concat_dims = _infer_concat_order_from_coords([ds1, ds0, + ds3, ds2]) + assert_combined_tile_ids_equal(expected, actual) + assert concat_dims == ['x', 'y'] + + def test_string_coords(self): + ds0 = Dataset({'person': ['Alice', 'Bob']}) + ds1 = Dataset({'person': ['Caroline', 'Daniel']}) + + expected = {(0,): ds0, (1,): ds1} + actual, concat_dims = _infer_concat_order_from_coords([ds1, ds0]) + assert_combined_tile_ids_equal(expected, actual) + assert concat_dims == ['person'] + + # Decided against natural sorting of string coords GH #2616 + def test_lexicographic_sort_string_coords(self): + ds0 = Dataset({'simulation': ['run8', 'run9']}) + ds1 = Dataset({'simulation': ['run10', 'run11']}) + + expected = {(0,): ds1, (1,): ds0} + actual, concat_dims = _infer_concat_order_from_coords([ds1, ds0]) + assert_combined_tile_ids_equal(expected, actual) + assert concat_dims == 
['simulation'] + + def test_datetime_coords(self): + ds0 = Dataset({'time': [datetime(2000, 3, 6), datetime(2001, 3, 7)]}) + ds1 = Dataset({'time': [datetime(1999, 1, 1), datetime(1999, 2, 4)]}) + + expected = {(0,): ds1, (1,): ds0} + actual, concat_dims = _infer_concat_order_from_coords([ds0, ds1]) + assert_combined_tile_ids_equal(expected, actual) + assert concat_dims == ['time'] @pytest.fixture(scope='module') @@ -574,8 +223,7 @@ def _create_tile_ids(shape): return list(tile_ids) -@requires_dask # only for toolz -class TestCombineND: +class TestNewTileIDs: @pytest.mark.parametrize("old_id, new_id", [((3, 0, 1), (0, 1)), ((0, 0), (0,)), ((1,), ()), @@ -593,16 +241,17 @@ def test_get_new_tile_ids(self, create_combined_ids): actual_tile_ids = _create_tile_ids(shape) assert expected_tile_ids == actual_tile_ids + +class TestCombineND: @pytest.mark.parametrize("concat_dim", ['dim1', 'new_dim']) def test_concat_once(self, create_combined_ids, concat_dim): shape = (2,) combined_ids = create_combined_ids(shape) ds = create_test_data - result = _auto_combine_all_along_first_dim(combined_ids, - dim=concat_dim, - data_vars='all', - coords='different', - compat='no_conflicts') + result = _combine_all_along_first_dim(combined_ids, dim=concat_dim, + data_vars='all', + coords='different', + compat='no_conflicts') expected_ds = concat([ds(0), ds(1)], dim=concat_dim) assert_combined_tile_ids_equal(result, {(): expected_ds}) @@ -610,11 +259,10 @@ def test_concat_once(self, create_combined_ids, concat_dim): def test_concat_only_first_dim(self, create_combined_ids): shape = (2, 3) combined_ids = create_combined_ids(shape) - result = _auto_combine_all_along_first_dim(combined_ids, - dim='dim1', - data_vars='all', - coords='different', - compat='no_conflicts') + result = _combine_all_along_first_dim(combined_ids, dim='dim1', + data_vars='all', + coords='different', + compat='no_conflicts') ds = create_test_data partway1 = concat([ds(0), ds(3)], dim='dim1') @@ -657,17 +305,138 @@ def test_check_lengths(self): _check_shape_tile_ids(combined_tile_ids) -@requires_dask # only for toolz -class TestAutoCombineND: - def test_single_dataset(self): +class TestManualCombine: + def test_manual_concat(self): objs = [Dataset({'x': [0]}), Dataset({'x': [1]})] - actual = auto_combine(objs) expected = Dataset({'x': [0, 1]}) + actual = combine_manual(objs, concat_dim='x') + assert_identical(expected, actual) + actual = combine_manual(objs, concat_dim=['x']) assert_identical(expected, actual) - actual = auto_combine(actual) + actual = combine_manual([actual], concat_dim=None) assert_identical(expected, actual) + actual = combine_manual([actual], concat_dim='x') + assert_identical(expected, actual) + + objs = [Dataset({'x': [0, 1]}), Dataset({'x': [2]})] + actual = combine_manual(objs, concat_dim='x') + expected = Dataset({'x': [0, 1, 2]}) + assert_identical(expected, actual) + + # ensure manual_combine handles non-sorted variables + objs = [Dataset(OrderedDict([('x', ('a', [0])), ('y', ('a', [0]))])), + Dataset(OrderedDict([('y', ('a', [1])), ('x', ('a', [1]))]))] + actual = combine_manual(objs, concat_dim='a') + expected = Dataset({'x': ('a', [0, 1]), 'y': ('a', [0, 1])}) + assert_identical(expected, actual) + + objs = [Dataset({'x': [0], 'y': [0]}), Dataset({'x': [0]})] + with pytest.raises(KeyError): + combine_manual(objs, concat_dim='x') + + def test_empty_input(self): + assert_identical(Dataset(), combine_manual([], concat_dim='x')) + + # Fails because of concat's weird treatment of dimension coords, see #2975 + 
@pytest.mark.xfail + def test_manual_concat_too_many_dims_at_once(self): + objs = [Dataset({'x': [0], 'y': [1]}), Dataset({'y': [0], 'x': [1]})] + with pytest.raises(ValueError, match="not equal across datasets"): + combine_manual(objs, concat_dim='x', coords='minimal') + + def test_manual_concat_along_new_dim(self): + objs = [Dataset({'a': ('x', [10]), 'x': [0]}), + Dataset({'a': ('x', [20]), 'x': [0]})] + expected = Dataset({'a': (('t', 'x'), [[10], [20]]), 'x': [0]}) + actual = combine_manual(objs, concat_dim='t') + assert_identical(expected, actual) + + # Same but with a DataArray as new dim, see GH #1988 and #2647 + dim = DataArray([100, 150], name='baz', dims='baz') + expected = Dataset({'a': (('baz', 'x'), [[10], [20]]), + 'x': [0], 'baz': [100, 150]}) + actual = combine_manual(objs, concat_dim=dim) + assert_identical(expected, actual) + + def test_manual_merge(self): + data = Dataset({'x': 0}) + actual = combine_manual([data, data, data], concat_dim=None) + assert_identical(data, actual) + + ds1 = Dataset({'a': ('x', [1, 2]), 'x': [0, 1]}) + ds2 = Dataset({'a': ('x', [2, 3]), 'x': [1, 2]}) + expected = Dataset({'a': ('x', [1, 2, 3]), 'x': [0, 1, 2]}) + actual = combine_manual([ds1, ds2], concat_dim=None) + assert_identical(expected, actual) + actual = combine_manual([ds1, ds2], concat_dim=[None]) + assert_identical(expected, actual) + + tmp1 = Dataset({'x': 0}) + tmp2 = Dataset({'x': np.nan}) + actual = combine_manual([tmp1, tmp2], concat_dim=None) + assert_identical(tmp1, actual) + actual = combine_manual([tmp1, tmp2], concat_dim=[None]) + assert_identical(tmp1, actual) + + # Single object, with a concat_dim explicitly provided + # Test the issue reported in GH #1988 + objs = [Dataset({'x': 0, 'y': 1})] + dim = DataArray([100], name='baz', dims='baz') + actual = combine_manual(objs, concat_dim=[dim]) + expected = Dataset({'x': ('baz', [0]), 'y': ('baz', [1])}, + {'baz': [100]}) + assert_identical(expected, actual) + + # Just making sure that combine_manual is doing what is + # expected for non-scalar values, too.
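+ # Each array-valued variable should gain the new 'baz' dimension (and + # its coordinate) as its leading axis, just as the scalar variables + # above did.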
+ objs = [Dataset({'x': ('z', [0, 1]), 'y': ('z', [1, 2])})] + dim = DataArray([100], name='baz', dims='baz') + actual = combine_manual(objs, concat_dim=[dim]) + expected = Dataset({'x': (('baz', 'z'), [[0, 1]]), + 'y': (('baz', 'z'), [[1, 2]])}, + {'baz': [100]}) + assert_identical(expected, actual) + + def test_concat_multiple_dims(self): + objs = [[Dataset({'a': (('x', 'y'), [[0]])}), + Dataset({'a': (('x', 'y'), [[1]])})], + [Dataset({'a': (('x', 'y'), [[2]])}), + Dataset({'a': (('x', 'y'), [[3]])})]] + actual = combine_manual(objs, concat_dim=['x', 'y']) + expected = Dataset({'a': (('x', 'y'), [[0, 1], [2, 3]])}) + assert_identical(expected, actual) + + def test_concat_name_symmetry(self): + """Inspired by the discussion on GH issue #2777""" + + da1 = DataArray(name='a', data=[[0]], dims=['x', 'y']) + da2 = DataArray(name='b', data=[[1]], dims=['x', 'y']) + da3 = DataArray(name='a', data=[[2]], dims=['x', 'y']) + da4 = DataArray(name='b', data=[[3]], dims=['x', 'y']) + + x_first = combine_manual([[da1, da2], [da3, da4]], + concat_dim=['x', 'y']) + y_first = combine_manual([[da1, da3], [da2, da4]], + concat_dim=['y', 'x']) + + assert_identical(x_first, y_first) + + def test_concat_one_dim_merge_another(self): + data = create_test_data() + data1 = data.copy(deep=True) + data2 = data.copy(deep=True) + + objs = [[data1.var1.isel(dim2=slice(4)), + data2.var1.isel(dim2=slice(4, 9))], + [data1.var2.isel(dim2=slice(4)), + data2.var2.isel(dim2=slice(4, 9))]] + + expected = data[['var1', 'var2']] + actual = combine_manual(objs, concat_dim=[None, 'dim2']) + assert expected.identical(actual) + def test_auto_combine_2d(self): ds = create_test_data @@ -677,26 +446,36 @@ def test_auto_combine_2d(self): expected = concat([partway1, partway2, partway3], dim='dim2') datasets = [[ds(0), ds(1), ds(2)], [ds(3), ds(4), ds(5)]] - result = auto_combine(datasets, concat_dim=['dim1', 'dim2']) - + result = combine_manual(datasets, concat_dim=['dim1', 'dim2']) assert_equal(result, expected) + def test_manual_combine_missing_data_new_dim(self): + # Your data includes "time" and "station" dimensions, and each year's + # data has a different set of stations. 
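+ # Concatenating along the new 't' dimension should outer-join the + # partially-overlapping 'x' coordinates, leaving NaN wherever a station + # is missing from a given year.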
+ datasets = [Dataset({'a': ('x', [2, 3]), 'x': [1, 2]}), + Dataset({'a': ('x', [1, 2]), 'x': [0, 1]})] + expected = Dataset({'a': (('t', 'x'), + [[np.nan, 2, 3], [1, 2, np.nan]])}, + {'x': [0, 1, 2]}) + actual = combine_manual(datasets, concat_dim='t') + assert_identical(expected, actual) + def test_invalid_hypercube_input(self): ds = create_test_data datasets = [[ds(0), ds(1), ds(2)], [ds(3), ds(4)]] with raises_regex(ValueError, 'sub-lists do not have ' 'consistent lengths'): - auto_combine(datasets, concat_dim=['dim1', 'dim2']) + combine_manual(datasets, concat_dim=['dim1', 'dim2']) datasets = [[ds(0), ds(1)], [[ds(3), ds(4)]]] with raises_regex(ValueError, 'sub-lists do not have ' 'consistent depths'): - auto_combine(datasets, concat_dim=['dim1', 'dim2']) + combine_manual(datasets, concat_dim=['dim1', 'dim2']) datasets = [[ds(0), ds(1)], [ds(3), ds(4)]] with raises_regex(ValueError, 'concat_dims has length'): - auto_combine(datasets, concat_dim=['dim1']) + combine_manual(datasets, concat_dim=['dim1']) def test_merge_one_dim_concat_another(self): objs = [[Dataset({'foo': ('x', [0, 1])}), @@ -706,10 +485,7 @@ def test_merge_one_dim_concat_another(self): expected = Dataset({'foo': ('x', [0, 1, 2, 3]), 'bar': ('x', [10, 20, 30, 40])}) - actual = auto_combine(objs, concat_dim=['x', None], compat='equals') - assert_identical(expected, actual) - - actual = auto_combine(objs) + actual = combine_manual(objs, concat_dim=['x', None], compat='equals') assert_identical(expected, actual) # Proving it works symmetrically @@ -717,57 +493,284 @@ def test_merge_one_dim_concat_another(self): Dataset({'foo': ('x', [2, 3])})], [Dataset({'bar': ('x', [10, 20])}), Dataset({'bar': ('x', [30, 40])})]] - actual = auto_combine(objs, concat_dim=[None, 'x'], compat='equals') - assert_identical(expected, actual) - - def test_internal_ordering(self): - # This gives a MergeError if _auto_combine_1d is not sorting by - # data_vars correctly, see GH #2662 - objs = [Dataset({'foo': ('x', [0, 1])}), - Dataset({'bar': ('x', [10, 20])}), - Dataset({'foo': ('x', [2, 3])}), - Dataset({'bar': ('x', [30, 40])})] - actual = auto_combine(objs, concat_dim='x', compat='equals') - expected = Dataset({'foo': ('x', [0, 1, 2, 3]), - 'bar': ('x', [10, 20, 30, 40])}) + actual = combine_manual(objs, concat_dim=[None, 'x'], compat='equals') assert_identical(expected, actual) def test_combine_concat_over_redundant_nesting(self): objs = [[Dataset({'x': [0]}), Dataset({'x': [1]})]] - actual = auto_combine(objs, concat_dim=[None, 'x']) + actual = combine_manual(objs, concat_dim=[None, 'x']) expected = Dataset({'x': [0, 1]}) assert_identical(expected, actual) objs = [[Dataset({'x': [0]})], [Dataset({'x': [1]})]] - actual = auto_combine(objs, concat_dim=['x', None]) + actual = combine_manual(objs, concat_dim=['x', None]) expected = Dataset({'x': [0, 1]}) assert_identical(expected, actual) objs = [[Dataset({'x': [0]})]] - actual = auto_combine(objs, concat_dim=[None, None]) + actual = combine_manual(objs, concat_dim=[None, None]) expected = Dataset({'x': [0]}) assert_identical(expected, actual) - objs = [[Dataset({'x': [0]})]] - actual = auto_combine(objs, concat_dim=None) - expected = Dataset({'x': [0]}) + def test_manual_combine_but_need_auto_combine(self): + objs = [Dataset({'x': [0, 1]}), Dataset({'x': [2], 'wall': [0]})] + with raises_regex(ValueError, 'cannot be combined'): + combine_manual(objs, concat_dim='x') + + @pytest.mark.parametrize('fill_value', [dtypes.NA, 2, 2.0]) + def test_combine_manual_fill_value(self, fill_value): + 
datasets = [Dataset({'a': ('x', [2, 3]), 'x': [1, 2]}), + Dataset({'a': ('x', [1, 2]), 'x': [0, 1]})] + if fill_value == dtypes.NA: + # if we supply the default, we expect the missing value for a + # float array + fill_value = np.nan + expected = Dataset({'a': (('t', 'x'), + [[fill_value, 2, 3], [1, 2, fill_value]])}, + {'x': [0, 1, 2]}) + actual = combine_manual(datasets, concat_dim='t', + fill_value=fill_value) assert_identical(expected, actual) -class TestAutoCombineUsingCoords: - def test_order_inferred_from_coords(self): - data = create_test_data() - objs = [data.isel(dim2=slice(4, 9)), data.isel(dim2=slice(4))] - with pytest.raises(NotImplementedError): - _auto_combine(objs, concat_dims=['dim2'], compat='no_conflicts', - data_vars='all', coords='different', - infer_order_from_coords=True, ids=True) +class TestCombineAuto: + def test_combine_auto(self): + objs = [Dataset({'x': [0]}), Dataset({'x': [1]})] + actual = combine_auto(objs) + expected = Dataset({'x': [0, 1]}) + assert_identical(expected, actual) + + actual = combine_auto([actual]) + assert_identical(expected, actual) + + objs = [Dataset({'x': [0, 1]}), Dataset({'x': [2]})] + actual = combine_auto(objs) + expected = Dataset({'x': [0, 1, 2]}) + assert_identical(expected, actual) + + # ensure combine_auto handles non-sorted variables + objs = [Dataset({'x': ('a', [0]), 'y': ('a', [0]), 'a': [0]}), + Dataset({'x': ('a', [1]), 'y': ('a', [1]), 'a': [1]})] + actual = combine_auto(objs) + expected = Dataset({'x': ('a', [0, 1]), 'y': ('a', [0, 1]), + 'a': [0, 1]}) + assert_identical(expected, actual) + + objs = [Dataset({'x': [0], 'y': [0]}), Dataset({'y': [1], 'x': [1]})] + actual = combine_auto(objs) + expected = Dataset({'x': [0, 1], 'y': [0, 1]}) + assert_equal(actual, expected) + + objs = [Dataset({'x': 0}), Dataset({'x': 1})] + with raises_regex(ValueError, 'Could not find any dimension ' + 'coordinates'): + combine_auto(objs) + + objs = [Dataset({'x': [0], 'y': [0]}), Dataset({'x': [0]})] + with raises_regex(ValueError, 'Every dimension needs a coordinate'): + combine_auto(objs) + + def test_empty_input(self): + assert_identical(Dataset(), combine_auto([])) - @pytest.mark.xfail(reason="Not yet implemented") def test_infer_order_from_coords(self): - # Should pass once inferring order from coords is implemented data = create_test_data() objs = [data.isel(dim2=slice(4, 9)), data.isel(dim2=slice(4))] - actual = auto_combine(objs) # but with infer_order_from_coords=True + actual = combine_auto(objs) expected = data + assert expected.broadcast_equals(actual) + + def test_combine_auto_previously_failed(self): + # One file is missing, i.e. the data for one year of one of the + # variables is absent. 
+ datasets = [Dataset({'a': ('x', [0]), 'x': [0]}), + Dataset({'b': ('x', [0]), 'x': [0]}), + Dataset({'a': ('x', [1]), 'x': [1]})] + expected = Dataset({'a': ('x', [0, 1]), 'b': ('x', [0, np.nan])}, + {'x': [0, 1]}) + actual = combine_auto(datasets) + assert_identical(expected, actual) + + def test_combine_auto_still_fails(self): + # concat can't handle new variables (yet): + # https://github.com/pydata/xarray/issues/508 + datasets = [Dataset({'x': 0}, {'y': 0}), + Dataset({'x': 1}, {'y': 1, 'z': 1})] + with pytest.raises(ValueError): + combine_auto(datasets, 'y') + + def test_combine_auto_no_concat(self): + objs = [Dataset({'x': 0}), Dataset({'y': 1})] + actual = combine_auto(objs) + expected = Dataset({'x': 0, 'y': 1}) + assert_identical(expected, actual) + + objs = [Dataset({'x': 0, 'y': 1}), Dataset({'y': np.nan, 'z': 2})] + actual = combine_auto(objs) + expected = Dataset({'x': 0, 'y': 1, 'z': 2}) + assert_identical(expected, actual) + + def test_check_for_impossible_ordering(self): + ds0 = Dataset({'x': [0, 1, 5]}) + ds1 = Dataset({'x': [2, 3]}) + with raises_regex(ValueError, "does not have monotonic global indexes" + " along dimension x"): + combine_auto([ds1, ds0]) + + +@pytest.mark.filterwarnings("ignore:In xarray version 0.13 `auto_combine` " + "will be deprecated") +@pytest.mark.filterwarnings("ignore:Also `open_mfdataset` will no longer") +@pytest.mark.filterwarnings("ignore:The datasets supplied") +class TestAutoCombineOldAPI: + """ + Set of tests which check that the old 1-dimensional auto_combine behaviour is + still preserved. #2616 + """ + def test_auto_combine(self): + objs = [Dataset({'x': [0]}), Dataset({'x': [1]})] + actual = auto_combine(objs) + expected = Dataset({'x': [0, 1]}) + assert_identical(expected, actual) + + actual = auto_combine([actual]) + assert_identical(expected, actual) + + objs = [Dataset({'x': [0, 1]}), Dataset({'x': [2]})] + actual = auto_combine(objs) + expected = Dataset({'x': [0, 1, 2]}) + assert_identical(expected, actual) + + # ensure auto_combine handles non-sorted variables + objs = [Dataset(OrderedDict([('x', ('a', [0])), ('y', ('a', [0]))])), + Dataset(OrderedDict([('y', ('a', [1])), ('x', ('a', [1]))]))] + actual = auto_combine(objs) + expected = Dataset({'x': ('a', [0, 1]), 'y': ('a', [0, 1])}) + assert_identical(expected, actual) + + objs = [Dataset({'x': [0], 'y': [0]}), Dataset({'y': [1], 'x': [1]})] + with raises_regex(ValueError, 'too many .* dimensions'): + auto_combine(objs) + + objs = [Dataset({'x': 0}), Dataset({'x': 1})] + with raises_regex(ValueError, 'cannot infer dimension'): + auto_combine(objs) + + objs = [Dataset({'x': [0], 'y': [0]}), Dataset({'x': [0]})] + with pytest.raises(KeyError): + auto_combine(objs) + + def test_auto_combine_previously_failed(self): + # One file is missing, i.e. the data for one year of one of the + # variables is absent. + datasets = [Dataset({'a': ('x', [0]), 'x': [0]}), + Dataset({'b': ('x', [0]), 'x': [0]}), + Dataset({'a': ('x', [1]), 'x': [1]})] + expected = Dataset({'a': ('x', [0, 1]), 'b': ('x', [0, np.nan])}, + {'x': [0, 1]}) + actual = auto_combine(datasets) + assert_identical(expected, actual) + + # Your data includes "time" and "station" dimensions, and each year's + # data has a different set of stations. 
+ datasets = [Dataset({'a': ('x', [2, 3]), 'x': [1, 2]}), + Dataset({'a': ('x', [1, 2]), 'x': [0, 1]})] + expected = Dataset({'a': (('t', 'x'), + [[np.nan, 2, 3], [1, 2, np.nan]])}, + {'x': [0, 1, 2]}) + actual = auto_combine(datasets, concat_dim='t') + assert_identical(expected, actual) + + def test_auto_combine_still_fails(self): + # concat can't handle new variables (yet): + # https://github.com/pydata/xarray/issues/508 + datasets = [Dataset({'x': 0}, {'y': 0}), + Dataset({'x': 1}, {'y': 1, 'z': 1})] + with pytest.raises(ValueError): + auto_combine(datasets, 'y') + + def test_auto_combine_no_concat(self): + objs = [Dataset({'x': 0}), Dataset({'y': 1})] + actual = auto_combine(objs) + expected = Dataset({'x': 0, 'y': 1}) + assert_identical(expected, actual) + + objs = [Dataset({'x': 0, 'y': 1}), Dataset({'y': np.nan, 'z': 2})] + actual = auto_combine(objs) + expected = Dataset({'x': 0, 'y': 1, 'z': 2}) + assert_identical(expected, actual) + + data = Dataset({'x': 0}) + actual = auto_combine([data, data, data], concat_dim=None) + assert_identical(data, actual) + + # Single object, with a concat_dim explicitly provided + # Test the issue reported in GH #1988 + objs = [Dataset({'x': 0, 'y': 1})] + dim = DataArray([100], name='baz', dims='baz') + actual = auto_combine(objs, concat_dim=dim) + expected = Dataset({'x': ('baz', [0]), 'y': ('baz', [1])}, + {'baz': [100]}) assert_identical(expected, actual) + + # Just making sure that auto_combine is doing what is + # expected for non-scalar values, too. + objs = [Dataset({'x': ('z', [0, 1]), 'y': ('z', [1, 2])})] + dim = DataArray([100], name='baz', dims='baz') + actual = auto_combine(objs, concat_dim=dim) + expected = Dataset({'x': (('baz', 'z'), [[0, 1]]), + 'y': (('baz', 'z'), [[1, 2]])}, + {'baz': [100]}) + assert_identical(expected, actual) + + def test_auto_combine_order_by_appearance_not_coords(self): + objs = [Dataset({'foo': ('x', [0])}, coords={'x': ('x', [1])}), + Dataset({'foo': ('x', [1])}, coords={'x': ('x', [0])})] + actual = auto_combine(objs) + expected = Dataset({'foo': ('x', [0, 1])}, + coords={'x': ('x', [1, 0])}) + assert_identical(expected, actual) + + @pytest.mark.parametrize('fill_value', [dtypes.NA, 2, 2.0]) + def test_auto_combine_fill_value(self, fill_value): + datasets = [Dataset({'a': ('x', [2, 3]), 'x': [1, 2]}), + Dataset({'a': ('x', [1, 2]), 'x': [0, 1]})] + if fill_value == dtypes.NA: + # if we supply the default, we expect the missing value for a + # float array + fill_value = np.nan + expected = Dataset({'a': (('t', 'x'), + [[fill_value, 2, 3], [1, 2, fill_value]])}, + {'x': [0, 1, 2]}) + actual = auto_combine(datasets, concat_dim='t', fill_value=fill_value) + assert_identical(expected, actual) + + +class TestAutoCombineDeprecation: + """ + Set of tests to check that FutureWarnings are correctly raised until the + deprecation cycle is complete. 
#2616 + """ + def test_auto_combine_with_concat_dim(self): + objs = [Dataset({'x': [0]}), Dataset({'x': [1]})] + with pytest.warns(FutureWarning, match="`concat_dim`"): + auto_combine(objs, concat_dim='x') + + def test_auto_combine_with_merge_and_concat(self): + objs = [Dataset({'x': [0]}), + Dataset({'x': [1]}), + Dataset({'z': ((), 99)})] + with pytest.warns(FutureWarning, match="require both concatenation"): + auto_combine(objs) + + def test_auto_combine_with_coords(self): + objs = [Dataset({'foo': ('x', [0])}, coords={'x': ('x', [0])}), + Dataset({'foo': ('x', [1])}, coords={'x': ('x', [1])})] + with pytest.warns(FutureWarning, match="supplied have global"): + auto_combine(objs) + + def test_auto_combine_without_coords(self): + objs = [Dataset({'foo': ('x', [0])}), + Dataset({'foo': ('x', [1])})] + with pytest.warns(FutureWarning, match="supplied do not have global"): + auto_combine(objs) diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py new file mode 100644 index 00000000000..31d5b9c6f72 --- /dev/null +++ b/xarray/tests/test_concat.py @@ -0,0 +1,320 @@ +from copy import deepcopy + +import numpy as np +import pandas as pd +import pytest + +from xarray import DataArray, Dataset, Variable, concat +from xarray.core import dtypes +from . import ( + InaccessibleArray, assert_array_equal, + assert_equal, assert_identical, raises_regex, requires_dask) +from .test_dataset import create_test_data + + +class TestConcatDataset(object): + def test_concat(self): + # TODO: simplify and split this test case + + # drop the third dimension to keep things relatively understandable + data = create_test_data() + for k in list(data.variables): + if 'dim3' in data[k].dims: + del data[k] + + split_data = [data.isel(dim1=slice(3)), + data.isel(dim1=slice(3, None))] + assert_identical(data, concat(split_data, 'dim1')) + + def rectify_dim_order(dataset): + # return a new dataset with all variable dimensions transposed into + # the order in which they are found in `data` + return Dataset(dict((k, v.transpose(*data[k].dims)) + for k, v in dataset.data_vars.items()), + dataset.coords, attrs=dataset.attrs) + + for dim in ['dim1', 'dim2']: + datasets = [g for _, g in data.groupby(dim, squeeze=False)] + assert_identical(data, concat(datasets, dim)) + + dim = 'dim2' + assert_identical( + data, concat(datasets, data[dim])) + assert_identical( + data, concat(datasets, data[dim], coords='minimal')) + + datasets = [g for _, g in data.groupby(dim, squeeze=True)] + concat_over = [k for k, v in data.coords.items() + if dim in v.dims and k != dim] + actual = concat(datasets, data[dim], coords=concat_over) + assert_identical(data, rectify_dim_order(actual)) + + actual = concat(datasets, data[dim], coords='different') + assert_identical(data, rectify_dim_order(actual)) + + # make sure the coords argument behaves as expected + data.coords['extra'] = ('dim4', np.arange(3)) + for dim in ['dim1', 'dim2']: + datasets = [g for _, g in data.groupby(dim, squeeze=True)] + actual = concat(datasets, data[dim], coords='all') + expected = np.array([data['extra'].values + for _ in range(data.dims[dim])]) + assert_array_equal(actual['extra'].values, expected) + + actual = concat(datasets, data[dim], coords='different') + assert_equal(data['extra'], actual['extra']) + actual = concat(datasets, data[dim], coords='minimal') + assert_equal(data['extra'], actual['extra']) + + # verify that the dim argument takes precedence over + # concatenating dataset variables of the same name + dim = (2 * data['dim1']).rename('dim1') + 
datasets = [g for _, g in data.groupby('dim1', squeeze=False)] + expected = data.copy() + expected['dim1'] = dim + assert_identical(expected, concat(datasets, dim)) + + def test_concat_data_vars(self): + data = Dataset({'foo': ('x', np.random.randn(10))}) + objs = [data.isel(x=slice(5)), data.isel(x=slice(5, None))] + for data_vars in ['minimal', 'different', 'all', [], ['foo']]: + actual = concat(objs, dim='x', data_vars=data_vars) + assert_identical(data, actual) + + def test_concat_coords(self): + data = Dataset({'foo': ('x', np.random.randn(10))}) + expected = data.assign_coords(c=('x', [0] * 5 + [1] * 5)) + objs = [data.isel(x=slice(5)).assign_coords(c=0), + data.isel(x=slice(5, None)).assign_coords(c=1)] + for coords in ['different', 'all', ['c']]: + actual = concat(objs, dim='x', coords=coords) + assert_identical(expected, actual) + for coords in ['minimal', []]: + with raises_regex(ValueError, 'not equal across'): + concat(objs, dim='x', coords=coords) + + def test_concat_constant_index(self): + # GH425 + ds1 = Dataset({'foo': 1.5}, {'y': 1}) + ds2 = Dataset({'foo': 2.5}, {'y': 1}) + expected = Dataset({'foo': ('y', [1.5, 2.5]), 'y': [1, 1]}) + for mode in ['different', 'all', ['foo']]: + actual = concat([ds1, ds2], 'y', data_vars=mode) + assert_identical(expected, actual) + with raises_regex(ValueError, 'not equal across datasets'): + concat([ds1, ds2], 'y', data_vars='minimal') + + def test_concat_size0(self): + data = create_test_data() + split_data = [data.isel(dim1=slice(0, 0)), data] + actual = concat(split_data, 'dim1') + assert_identical(data, actual) + + actual = concat(split_data[::-1], 'dim1') + assert_identical(data, actual) + + def test_concat_autoalign(self): + ds1 = Dataset({'foo': DataArray([1, 2], coords=[('x', [1, 2])])}) + ds2 = Dataset({'foo': DataArray([1, 2], coords=[('x', [1, 3])])}) + actual = concat([ds1, ds2], 'y') + expected = Dataset({'foo': DataArray([[1, 2, np.nan], [1, np.nan, 2]], + dims=['y', 'x'], + coords={'x': [1, 2, 3]})}) + assert_identical(expected, actual) + + def test_concat_errors(self): + data = create_test_data() + split_data = [data.isel(dim1=slice(3)), + data.isel(dim1=slice(3, None))] + + with raises_regex(ValueError, 'must supply at least one'): + concat([], 'dim1') + + with raises_regex(ValueError, 'are not coordinates'): + concat([data, data], 'new_dim', coords=['not_found']) + + with raises_regex(ValueError, 'global attributes not'): + data0, data1 = deepcopy(split_data) + data1.attrs['foo'] = 'bar' + concat([data0, data1], 'dim1', compat='identical') + assert_identical( + data, concat([data0, data1], 'dim1', compat='equals')) + + with raises_regex(ValueError, 'encountered unexpected'): + data0, data1 = deepcopy(split_data) + data1['foo'] = ('bar', np.random.randn(10)) + concat([data0, data1], 'dim1') + + with raises_regex(ValueError, 'compat.* invalid'): + concat(split_data, 'dim1', compat='foobar') + + with raises_regex(ValueError, 'unexpected value for'): + concat([data, data], 'new_dim', coords='foobar') + + with raises_regex( + ValueError, 'coordinate in some datasets but not others'): + concat([Dataset({'x': 0}), Dataset({'x': [1]})], dim='z') + + with raises_regex( + ValueError, 'coordinate in some datasets but not others'): + concat([Dataset({'x': 0}), Dataset({}, {'x': 1})], dim='z') + + with raises_regex(ValueError, 'no longer a valid'): + concat([data, data], 'new_dim', mode='different') + with raises_regex(ValueError, 'no longer a valid'): + concat([data, data], 'new_dim', concat_over='different') + + def 
test_concat_promote_shape(self): + # mixed dims within variables + objs = [Dataset({}, {'x': 0}), Dataset({'x': [1]})] + actual = concat(objs, 'x') + expected = Dataset({'x': [0, 1]}) + assert_identical(actual, expected) + + objs = [Dataset({'x': [0]}), Dataset({}, {'x': 1})] + actual = concat(objs, 'x') + assert_identical(actual, expected) + + # mixed dims between variables + objs = [Dataset({'x': [2], 'y': 3}), Dataset({'x': [4], 'y': 5})] + actual = concat(objs, 'x') + expected = Dataset({'x': [2, 4], 'y': ('x', [3, 5])}) + assert_identical(actual, expected) + + # mixed dims in coord variable + objs = [Dataset({'x': [0]}, {'y': -1}), + Dataset({'x': [1]}, {'y': ('x', [-2])})] + actual = concat(objs, 'x') + expected = Dataset({'x': [0, 1]}, {'y': ('x', [-1, -2])}) + assert_identical(actual, expected) + + # scalars with mixed lengths along concat dim -- values should repeat + objs = [Dataset({'x': [0]}, {'y': -1}), + Dataset({'x': [1, 2]}, {'y': -2})] + actual = concat(objs, 'x') + expected = Dataset({'x': [0, 1, 2]}, {'y': ('x', [-1, -2, -2])}) + assert_identical(actual, expected) + + # broadcast 1d x 1d -> 2d + objs = [Dataset({'z': ('x', [-1])}, {'x': [0], 'y': [0]}), + Dataset({'z': ('y', [1])}, {'x': [1], 'y': [0]})] + actual = concat(objs, 'x') + expected = Dataset({'z': (('x', 'y'), [[-1], [1]])}, + {'x': [0, 1], 'y': [0]}) + assert_identical(actual, expected) + + def test_concat_do_not_promote(self): + # GH438 + objs = [Dataset({'y': ('t', [1])}, {'x': 1, 't': [0]}), + Dataset({'y': ('t', [2])}, {'x': 1, 't': [0]})] + expected = Dataset({'y': ('t', [1, 2])}, {'x': 1, 't': [0, 0]}) + actual = concat(objs, 't') + assert_identical(expected, actual) + + objs = [Dataset({'y': ('t', [1])}, {'x': 1, 't': [0]}), + Dataset({'y': ('t', [2])}, {'x': 2, 't': [0]})] + with pytest.raises(ValueError): + concat(objs, 't', coords='minimal') + + def test_concat_dim_is_variable(self): + objs = [Dataset({'x': 0}), Dataset({'x': 1})] + coord = Variable('y', [3, 4]) + expected = Dataset({'x': ('y', [0, 1]), 'y': [3, 4]}) + actual = concat(objs, coord) + assert_identical(actual, expected) + + def test_concat_multiindex(self): + x = pd.MultiIndex.from_product([[1, 2, 3], ['a', 'b']]) + expected = Dataset({'x': x}) + actual = concat([expected.isel(x=slice(2)), + expected.isel(x=slice(2, None))], 'x') + assert expected.equals(actual) + assert isinstance(actual.x.to_index(), pd.MultiIndex) + + @pytest.mark.parametrize('fill_value', [dtypes.NA, 2, 2.0]) + def test_concat_fill_value(self, fill_value): + datasets = [Dataset({'a': ('x', [2, 3]), 'x': [1, 2]}), + Dataset({'a': ('x', [1, 2]), 'x': [0, 1]})] + if fill_value == dtypes.NA: + # if we supply the default, we expect the missing value for a + # float array + fill_value = np.nan + expected = Dataset({'a': (('t', 'x'), + [[fill_value, 2, 3], + [1, 2, fill_value]])}, + {'x': [0, 1, 2]}) + actual = concat(datasets, dim='t', fill_value=fill_value) + assert_identical(actual, expected) + + +class TestConcatDataArray(object): + def test_concat(self): + ds = Dataset({'foo': (['x', 'y'], np.random.random((2, 3))), + 'bar': (['x', 'y'], np.random.random((2, 3)))}, + {'x': [0, 1]}) + foo = ds['foo'] + bar = ds['bar'] + + # from dataset array: + expected = DataArray(np.array([foo.values, bar.values]), + dims=['w', 'x', 'y'], coords={'x': [0, 1]}) + actual = concat([foo, bar], 'w') + assert_equal(expected, actual) + # from iteration: + grouped = [g for _, g in foo.groupby('x')] + stacked = concat(grouped, ds['x']) + assert_identical(foo, stacked) + # with an index as 
the 'dim' argument + stacked = concat(grouped, ds.indexes['x']) + assert_identical(foo, stacked) + + actual = concat([foo[0], foo[1]], pd.Index([0, 1]) + ).reset_coords(drop=True) + expected = foo[:2].rename({'x': 'concat_dim'}) + assert_identical(expected, actual) + + actual = concat([foo[0], foo[1]], [0, 1]).reset_coords(drop=True) + expected = foo[:2].rename({'x': 'concat_dim'}) + assert_identical(expected, actual) + + with raises_regex(ValueError, 'not identical'): + concat([foo, bar], dim='w', compat='identical') + + with raises_regex(ValueError, 'not a valid argument'): + concat([foo, bar], dim='w', data_vars='minimal') + + def test_concat_encoding(self): + # Regression test for GH1297 + ds = Dataset({'foo': (['x', 'y'], np.random.random((2, 3))), + 'bar': (['x', 'y'], np.random.random((2, 3)))}, + {'x': [0, 1]}) + foo = ds['foo'] + foo.encoding = {"complevel": 5} + ds.encoding = {"unlimited_dims": 'x'} + assert concat([foo, foo], dim="x").encoding == foo.encoding + assert concat([ds, ds], dim="x").encoding == ds.encoding + + @requires_dask + def test_concat_lazy(self): + import dask.array as da + + arrays = [DataArray( + da.from_array(InaccessibleArray(np.zeros((3, 3))), 3), + dims=['x', 'y']) for _ in range(2)] + # should not raise + combined = concat(arrays, dim='z') + assert combined.shape == (2, 3, 3) + assert combined.dims == ('z', 'x', 'y') + + @pytest.mark.parametrize('fill_value', [dtypes.NA, 2, 2.0]) + def test_concat_fill_value(self, fill_value): + foo = DataArray([1, 2], coords=[('x', [1, 2])]) + bar = DataArray([1, 2], coords=[('x', [1, 3])]) + if fill_value == dtypes.NA: + # if we supply the default, we expect the missing value for a + # float array + fill_value = np.nan + expected = DataArray([[1, 2, fill_value], [1, fill_value, 2]], + dims=['y', 'x'], coords={'x': [1, 2, 3]}) + actual = concat((foo, bar), dim='y', fill_value=fill_value) + assert_identical(actual, expected)
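As a rough usage sketch of the API exercised by the tests above (assuming a build of xarray in which the combine_manual and combine_auto functions added by this changeset are importable from the top level; the variable names temp0/temp1/rain0/rain1 below are invented for illustration and do not appear in the diff):

import numpy as np
import xarray as xr

# combine_manual: placement comes from position in the nested list, and a
# None entry in concat_dim means "merge at this nesting level" rather than
# concatenate (cf. test_concat_one_dim_merge_another above).
temp0 = xr.DataArray(name='temp', data=np.zeros(2), dims=['t'], coords={'t': [0, 1]})
temp1 = xr.DataArray(name='temp', data=np.ones(2), dims=['t'], coords={'t': [2, 3]})
rain0 = xr.DataArray(name='rain', data=np.zeros(2), dims=['t'], coords={'t': [0, 1]})
rain1 = xr.DataArray(name='rain', data=np.ones(2), dims=['t'], coords={'t': [2, 3]})
combined = xr.combine_manual([[temp0, temp1], [rain0, rain1]], concat_dim=[None, 't'])

# combine_auto: ordering is inferred from the dimension coordinates, so the
# inputs may be passed in any order (cf. test_infer_order_from_coords above).
ordered = xr.combine_auto([xr.Dataset({'x': [2, 3]}), xr.Dataset({'x': [0, 1]})])

Under those assumptions, `combined` is a Dataset holding both variables concatenated along t = [0, 1, 2, 3], and `ordered` has x = [0, 1, 2, 3] regardless of the order of the input list.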