diff --git a/doc/user-guide/combining.rst b/doc/user-guide/combining.rst
index 53d5fc17cbd..ebadd6d7c60 100644
--- a/doc/user-guide/combining.rst
+++ b/doc/user-guide/combining.rst
@@ -43,7 +43,6 @@ new dimension by stacking lower dimensional arrays together:
 
 .. ipython:: python
 
-    da.sel(x="a")
     xr.concat([da.isel(x=0), da.isel(x=1)], "x")
 
 If the second argument to ``concat`` is a new dimension name, the arrays will
@@ -52,7 +51,10 @@ dimension:
 
 .. ipython:: python
 
-    xr.concat([da.isel(x=0), da.isel(x=1)], "new_dim")
+    da0 = da.isel(x=0).drop_vars("x")
+    da1 = da.isel(x=1).drop_vars("x")
+
+    xr.concat([da0, da1], "new_dim")
 
 The second argument to ``concat`` can also be an :py:class:`~pandas.Index` or
 :py:class:`~xarray.DataArray` object as well as a string, in which case it is
@@ -60,7 +62,7 @@ used to label the values along the new dimension:
 
 .. ipython:: python
 
-    xr.concat([da.isel(x=0), da.isel(x=1)], pd.Index([-90, -100], name="new_dim"))
+    xr.concat([da0, da1], pd.Index([-90, -100], name="new_dim"))
 
 Of course, ``concat`` also works on ``Dataset`` objects:
 
@@ -75,6 +77,12 @@ between datasets. With the default parameters, xarray will load some coordinate
 variables into memory to compare them between datasets. This may be prohibitively
 expensive if you are manipulating your dataset lazily using :ref:`dask`.
 
+.. note::
+
+    In a future version of xarray the default values for many of these options
+    will change. You can opt into the new default values early using
+    ``xr.set_options(use_new_combine_kwarg_defaults=True)``.
+
 .. _merge:
 
 Merge
@@ -94,10 +102,18 @@ If you merge another dataset (or a dictionary including data array objects),
 by default the resulting dataset will be aligned on the **union** of all index
 coordinates:
 
+.. note::
+
+    In a future version of xarray the default values for ``join`` and ``compat``
+    will change. This change will mean that xarray will no longer attempt
+    to align the indexes of the objects being merged. You can opt into the new
+    default values early using ``xr.set_options(use_new_combine_kwarg_defaults=True)``,
+    or explicitly set ``join='outer'`` to preserve the old behavior.
+
 .. ipython:: python
 
     other = xr.Dataset({"bar": ("x", [1, 2, 3, 4]), "x": list("abcd")})
-    xr.merge([ds, other])
+    xr.merge([ds, other], join="outer")
 
 This ensures that ``merge`` is non-destructive. ``xarray.MergeError`` is raised
 if you attempt to merge two variables with the same name but different values:
@@ -114,6 +130,16 @@ if you attempt to merge two variables with the same name but different values:
            array([[ 1.4691123 ,  0.71713666, -0.5090585 ],
                   [-0.13563237,  2.21211203,  0.82678535]])
 
+.. note::
+
+    In a future version of xarray the default value for ``compat`` will change
+    from ``compat='no_conflicts'`` to ``compat='override'``. In this scenario
+    the values in the first object override all the values in other objects.
+
+    .. ipython:: python
+
+        xr.merge([ds, ds + 1], compat="override")
+
 The same non-destructive merging between ``DataArray`` index coordinates is
 used in the :py:class:`~xarray.Dataset` constructor:
 
@@ -144,6 +170,11 @@ For datasets, ``ds0.combine_first(ds1)`` works similarly to
 there are conflicting values in variables to be merged, whereas
 ``.combine_first`` defaults to the calling object's values.
 
+.. note::
+
+    In a future version of xarray the default options for ``xr.merge`` will change
+    such that the behavior matches ``combine_first``.
+
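+For example, you can preview the new defaults today (a sketch reusing the
+``ds`` and ``other`` objects defined above; shown verbatim rather than
+executed):
+
+.. ipython:: python
+    :verbatim:
+
+    In [1]: with xr.set_options(use_new_combine_kwarg_defaults=True):
+       ...:     merged = xr.merge([ds, other], join="outer")
+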
 .. _update:
 
 Update
@@ -236,7 +267,7 @@ coordinates as long as any non-missing values agree or are disjoint:
 
     ds1 = xr.Dataset({"a": ("x", [10, 20, 30, np.nan])}, {"x": [1, 2, 3, 4]})
     ds2 = xr.Dataset({"a": ("x", [np.nan, 30, 40, 50])}, {"x": [2, 3, 4, 5]})
-    xr.merge([ds1, ds2], compat="no_conflicts")
+    xr.merge([ds1, ds2], join="outer", compat="no_conflicts")
 
 Note that due to the underlying representation of missing values as floating
 point numbers (``NaN``), variable data type is not always preserved when merging
@@ -295,13 +326,12 @@ they are concatenated in order based on the values in their dimension
 coordinates, not on their position in the list passed to ``combine_by_coords``.
 
 .. ipython:: python
-    :okwarning:
 
     x1 = xr.DataArray(name="foo", data=np.random.randn(3), coords=[("x", [0, 1, 2])])
     x2 = xr.DataArray(name="foo", data=np.random.randn(3), coords=[("x", [3, 4, 5])])
     xr.combine_by_coords([x2, x1])
 
-These functions can be used by :py:func:`~xarray.open_mfdataset` to open many
+These functions are used by :py:func:`~xarray.open_mfdataset` to open many
 files as one dataset. The particular function used is specified by setting the
 argument ``'combine'`` to ``'by_coords'`` or ``'nested'``. This is useful for
 situations where your data is split across many files in multiple locations,
diff --git a/doc/user-guide/terminology.rst b/doc/user-guide/terminology.rst
index c581fcb374d..295fbcd8b51 100644
--- a/doc/user-guide/terminology.rst
+++ b/doc/user-guide/terminology.rst
@@ -217,7 +217,7 @@ complete examples, please consult the relevant documentation.*
         )
 
         # combine the datasets
-        combined_ds = xr.combine_by_coords([ds1, ds2])
+        combined_ds = xr.combine_by_coords([ds1, ds2], join="outer")
         combined_ds
 
 lazy
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index c8fbecf82af..2d0596644ef 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -33,6 +33,15 @@ Breaking changes
 
 Deprecations
 ~~~~~~~~~~~~
 
+- Start deprecation cycle for changing the default keyword arguments to ``concat``, ``merge``, ``combine``, ``open_mfdataset``.
+  Emits a ``FutureWarning`` when the old defaults are in use and the new defaults would result in different behavior.
+  Adds an option, ``use_new_combine_kwarg_defaults``, to opt in to the new defaults immediately.
+  New values are:
+  - ``data_vars``: "minimal"
+  - ``coords``: "minimal"
+  - ``compat``: "override"
+  - ``join``: "exact"
+  By `Julia Signell <https://github.com/jsignell>`_.
 
 Bug fixes
 ~~~~~~~~~
@@ -8121,13 +8130,17 @@ Backwards incompatible changes
    Now, the default always concatenates data variables:
 
    .. ipython:: python
-      :suppress:
-
-      ds = xray.Dataset({"x": 0})
+      :verbatim:
 
-   .. ipython:: python
+      In [1]: ds = xray.Dataset({"x": 0})
 
-      xray.concat([ds, ds], dim="y")
+      In [2]: xray.concat([ds, ds], dim="y")
+      Out[2]:
+      <xarray.Dataset> Size: 16B
+      Dimensions:  (y: 2)
+      Dimensions without coordinates: y
+      Data variables:
+          x        (y) int64 16B 0 0
 
    To obtain the old behavior, supply the argument ``concat_over=[]``.
diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index f30f4e54705..1c3c31d0379 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -34,7 +34,7 @@
 )
 from xarray.backends.locks import _get_scheduler
 from xarray.coders import CFDatetimeCoder, CFTimedeltaCoder
-from xarray.core import indexing
+from xarray.core import dtypes, indexing
 from xarray.core.dataarray import DataArray
 from xarray.core.dataset import Dataset
 from xarray.core.datatree import DataTree
@@ -50,6 +50,13 @@
     _nested_combine,
     combine_by_coords,
 )
+from xarray.util.deprecation_helpers import (
+    _COMPAT_DEFAULT,
+    _COORDS_DEFAULT,
+    _DATA_VARS_DEFAULT,
+    _JOIN_DEFAULT,
+    CombineKwargDefault,
+)
 
 if TYPE_CHECKING:
     try:
@@ -1404,14 +1411,16 @@ def open_mfdataset(
         | Sequence[Index]
         | None
     ) = None,
-    compat: CompatOptions = "no_conflicts",
+    compat: CompatOptions | CombineKwargDefault = _COMPAT_DEFAULT,
     preprocess: Callable[[Dataset], Dataset] | None = None,
     engine: T_Engine | None = None,
-    data_vars: Literal["all", "minimal", "different"] | list[str] = "all",
-    coords="different",
+    data_vars: Literal["all", "minimal", "different"]
+    | list[str]
+    | CombineKwargDefault = _DATA_VARS_DEFAULT,
+    coords=_COORDS_DEFAULT,
     combine: Literal["by_coords", "nested"] = "by_coords",
     parallel: bool = False,
-    join: JoinOptions = "outer",
+    join: JoinOptions | CombineKwargDefault = _JOIN_DEFAULT,
     attrs_file: str | os.PathLike | None = None,
     combine_attrs: CombineAttrsOptions = "override",
     **kwargs,
@@ -1598,9 +1607,6 @@ def open_mfdataset(
     paths1d: list[str | ReadBuffer]
     if combine == "nested":
-        if isinstance(concat_dim, str | DataArray) or concat_dim is None:
-            concat_dim = [concat_dim]  # type: ignore[assignment]
-
         # This creates a flat list which is easier to iterate over, whilst
         # encoding the originally-supplied structure as "ids".
         # The "ids" are not used at all if combine='by_coords`.
@@ -1649,13 +1655,14 @@
             # along each dimension, using structure given by "ids"
             combined = _nested_combine(
                 datasets,
-                concat_dims=concat_dim,
+                concat_dim=concat_dim,
                 compat=compat,
                 data_vars=data_vars,
                 coords=coords,
                 ids=ids,
                 join=join,
                 combine_attrs=combine_attrs,
+                fill_value=dtypes.NA,
             )
     elif combine == "by_coords":
         # Redo ordering from coordinates, ignoring how they were ordered
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
index 5a7f757ba8a..93e03234fcb 100644
--- a/xarray/core/dataset.py
+++ b/xarray/core/dataset.py
@@ -121,7 +121,13 @@
     merge_coordinates_without_align,
     merge_data_and_coords,
 )
-from xarray.util.deprecation_helpers import _deprecate_positional_args, deprecate_dims
+from xarray.util.deprecation_helpers import (
+    _COMPAT_DEFAULT,
+    _JOIN_DEFAULT,
+    CombineKwargDefault,
+    _deprecate_positional_args,
+    deprecate_dims,
+)
 
 if TYPE_CHECKING:
     from dask.dataframe import DataFrame as DaskDataFrame
@@ -5287,7 +5293,14 @@ def stack_dataarray(da):
 
         # concatenate the arrays
         stackable_vars = [stack_dataarray(da) for da in self.data_vars.values()]
-        data_array = concat(stackable_vars, dim=new_dim)
+        data_array = concat(
+            stackable_vars,
+            dim=new_dim,
+            data_vars="all",
+            coords="different",
+            compat="equals",
+            join="outer",
+        )
 
         if name is not None:
             data_array.name = name
@@ -5531,8 +5544,8 @@ def merge(
         self,
         other: CoercibleMapping | DataArray,
         overwrite_vars: Hashable | Iterable[Hashable] = frozenset(),
-        compat: CompatOptions = "no_conflicts",
-        join: JoinOptions = "outer",
+        compat: CompatOptions | CombineKwargDefault = _COMPAT_DEFAULT,
+        join: JoinOptions | CombineKwargDefault = _JOIN_DEFAULT,
         fill_value: Any = xrdtypes.NA,
         combine_attrs: CombineAttrsOptions = "override",
     ) -> Self:
diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py
index e1541a98af0..ef48c1454ba 100644
--- a/xarray/core/groupby.py
+++ b/xarray/core/groupby.py
@@ -1622,7 +1622,14 @@ def _combine(self, applied, shortcut=False):
         if shortcut:
             combined = self._concat_shortcut(applied, dim, positions)
         else:
-            combined = concat(applied, dim)
+            combined = concat(
+                applied,
+                dim,
+                data_vars="all",
+                coords="different",
+                compat="equals",
+                join="outer",
+            )
         combined = _maybe_reorder(combined, dim, positions, N=self.group1d.size)
 
         if isinstance(combined, type(self._obj)):
@@ -1783,7 +1790,14 @@ def _combine(self, applied):
         """Recombine the applied objects like the original."""
         applied_example, applied = peek_at(applied)
         dim, positions = self._infer_concat_args(applied_example)
-        combined = concat(applied, dim)
+        combined = concat(
+            applied,
+            dim,
+            data_vars="all",
+            coords="different",
+            compat="equals",
+            join="outer",
+        )
         combined = _maybe_reorder(combined, dim, positions, N=self.group1d.size)
         # assign coord when the applied function does not return that coord
         if dim not in applied_example.dims:
diff --git a/xarray/core/options.py b/xarray/core/options.py
index adaa563d09b..d8635e309d9 100644
--- a/xarray/core/options.py
+++ b/xarray/core/options.py
@@ -30,6 +30,7 @@
     "keep_attrs",
     "warn_for_unclosed_files",
     "use_bottleneck",
+    "use_new_combine_kwarg_defaults",
     "use_numbagg",
     "use_opt_einsum",
     "use_flox",
@@ -59,6 +60,7 @@ class T_Options(TypedDict):
     warn_for_unclosed_files: bool
     use_bottleneck: bool
     use_flox: bool
+    use_new_combine_kwarg_defaults: bool
     use_numbagg: bool
     use_opt_einsum: bool
@@ -87,6 +89,7 @@
     "warn_for_unclosed_files": False,
     "use_bottleneck": True,
     "use_flox": True,
+    "use_new_combine_kwarg_defaults": False,
"use_numbagg": True, "use_opt_einsum": True, } @@ -117,6 +120,7 @@ def _positive_integer(value: Any) -> bool: "file_cache_maxsize": _positive_integer, "keep_attrs": lambda choice: choice in [True, False, "default"], "use_bottleneck": lambda value: isinstance(value, bool), + "use_new_combine_kwarg_defaults": lambda value: isinstance(value, bool), "use_numbagg": lambda value: isinstance(value, bool), "use_opt_einsum": lambda value: isinstance(value, bool), "use_flox": lambda value: isinstance(value, bool), @@ -256,6 +260,15 @@ class set_options: use_flox : bool, default: True Whether to use ``numpy_groupies`` and `flox`` to accelerate groupby and resampling reductions. + use_new_combine_kwarg_defaults : bool, default False + Whether to use new kwarg default values for combine functions: + :py:func:`~xarray.concat`, :py:func:`~xarray.merge`, + :py:func:`~xarray.open_mfdataset`. New values are: + + * ``data_vars``: "minimal" + * ``coords``: "minimal" + * ``compat``: "override" + * ``join``: "exact" use_numbagg : bool, default: True Whether to use ``numbagg`` to accelerate reductions. Takes precedence over ``use_bottleneck`` when both are True. diff --git a/xarray/core/parallel.py b/xarray/core/parallel.py index 996325e179a..92b7e8d2eeb 100644 --- a/xarray/core/parallel.py +++ b/xarray/core/parallel.py @@ -351,7 +351,9 @@ def _wrapper( result = func(*converted_args, **kwargs) merged_coordinates = merge( - [arg.coords for arg in args if isinstance(arg, Dataset | DataArray)] + [arg.coords for arg in args if isinstance(arg, Dataset | DataArray)], + join="exact", + compat="override", ).coords # check all dims are present @@ -439,7 +441,11 @@ def _wrapper( # rechunk any numpy variables appropriately xarray_objs = tuple(arg.chunk(arg.chunksizes) for arg in xarray_objs) - merged_coordinates = merge([arg.coords for arg in aligned]).coords + merged_coordinates = merge( + [arg.coords for arg in aligned], + join="exact", + compat="override", + ).coords _, npargs = unzip( sorted( @@ -472,7 +478,9 @@ def _wrapper( ) coordinates = merge( - (preserved_coords, template.coords.to_dataset()[new_coord_vars]) + (preserved_coords, template.coords.to_dataset()[new_coord_vars]), + join="outer", + compat="override", ).coords output_chunks: Mapping[Hashable, tuple[int, ...]] = { dim: input_chunks[dim] for dim in template.dims if dim in input_chunks diff --git a/xarray/plot/dataarray_plot.py b/xarray/plot/dataarray_plot.py index ee49928aa01..25294536a2c 100644 --- a/xarray/plot/dataarray_plot.py +++ b/xarray/plot/dataarray_plot.py @@ -196,7 +196,13 @@ def _prepare_plot1d_data( dim = coords_to_plot.get(v, None) if (dim is not None) and (dim in darray.dims): darray_nan = np.nan * darray.isel({dim: -1}) - darray = concat([darray, darray_nan], dim=dim) + darray = concat( + [darray, darray_nan], + dim=dim, + coords="minimal", + compat="override", + join="exact", + ) dims_T.append(coords_to_plot[v]) # Lines should never connect to the same coordinate when stacked, diff --git a/xarray/structure/alignment.py b/xarray/structure/alignment.py index ea90519143c..fe7b0c166df 100644 --- a/xarray/structure/alignment.py +++ b/xarray/structure/alignment.py @@ -5,7 +5,7 @@ from collections import defaultdict from collections.abc import Callable, Hashable, Iterable, Mapping from contextlib import suppress -from typing import TYPE_CHECKING, Any, Final, Generic, TypeVar, cast, overload +from typing import TYPE_CHECKING, Any, Final, Generic, TypeVar, cast, get_args, overload import numpy as np import pandas as pd @@ -19,9 +19,10 @@ 
     indexes_all_equal,
     safe_cast_to_index,
 )
-from xarray.core.types import T_Alignable
-from xarray.core.utils import is_dict_like, is_full_slice
+from xarray.core.types import JoinOptions, T_Alignable
+from xarray.core.utils import emit_user_level_warning, is_dict_like, is_full_slice
 from xarray.core.variable import Variable, as_compatible_data, calculate_dimensions
+from xarray.util.deprecation_helpers import CombineKwargDefault
 
 if TYPE_CHECKING:
     from xarray.core.dataarray import DataArray
@@ -116,7 +117,7 @@ class Aligner(Generic[T_Alignable]):
     objects: tuple[T_Alignable, ...]
     results: tuple[T_Alignable, ...]
     objects_matching_indexes: tuple[dict[MatchingIndexKey, Index], ...]
-    join: str
+    join: JoinOptions | CombineKwargDefault
     exclude_dims: frozenset[Hashable]
     exclude_vars: frozenset[Hashable]
     copy: bool
@@ -136,7 +137,7 @@ class Aligner(Generic[T_Alignable]):
     def __init__(
         self,
         objects: Iterable[T_Alignable],
-        join: str = "inner",
+        join: JoinOptions | CombineKwargDefault = "inner",
         indexes: Mapping[Any, Any] | None = None,
         exclude_dims: str | Iterable[Hashable] = frozenset(),
         exclude_vars: Iterable[Hashable] = frozenset(),
@@ -149,7 +150,9 @@ def __init__(
         self.objects = tuple(objects)
         self.objects_matching_indexes = ()
 
-        if join not in ["inner", "outer", "override", "exact", "left", "right"]:
+        if not isinstance(join, CombineKwargDefault) and join not in get_args(
+            JoinOptions
+        ):
             raise ValueError(f"invalid value for join: {join}")
         self.join = join
 
@@ -403,12 +406,34 @@ def update_dicts(
                 else:
                     need_reindex = False
                 if need_reindex:
+                    if (
+                        isinstance(self.join, CombineKwargDefault)
+                        and self.join != "exact"
+                    ):
+                        emit_user_level_warning(
+                            self.join.warning_message(
+                                "This change will result in the following ValueError: "
+                                "cannot be aligned with join='exact' because "
+                                "index/labels/sizes are not equal along "
+                                "these coordinates (dimensions): "
+                                + ", ".join(
+                                    f"{name!r} {dims!r}" for name, dims in key[0]
+                                ),
+                                recommend_set_options=False,
+                            ),
+                            FutureWarning,
+                        )
                     if self.join == "exact":
                         raise AlignmentError(
                             "cannot align objects with join='exact' where "
                             "index/labels/sizes are not equal along "
                             "these coordinates (dimensions): "
                             + ", ".join(f"{name!r} {dims!r}" for name, dims in key[0])
+                            + (
+                                self.join.error_message()
+                                if isinstance(self.join, CombineKwargDefault)
+                                else ""
+                            )
                         )
                 joiner = self._get_index_joiner(index_cls)
                 joined_index = joiner(matching_indexes)
@@ -579,7 +604,7 @@ def align(
     obj1: T_Obj1,
     /,
     *,
-    join: JoinOptions = "inner",
+    join: JoinOptions | CombineKwargDefault = "inner",
     copy: bool = True,
     indexes=None,
     exclude: str | Iterable[Hashable] = frozenset(),
@@ -593,7 +618,7 @@ def align(
     obj2: T_Obj2,
     /,
     *,
-    join: JoinOptions = "inner",
+    join: JoinOptions | CombineKwargDefault = "inner",
     copy: bool = True,
     indexes=None,
     exclude: str | Iterable[Hashable] = frozenset(),
@@ -608,7 +633,7 @@ def align(
     obj3: T_Obj3,
     /,
     *,
-    join: JoinOptions = "inner",
+    join: JoinOptions | CombineKwargDefault = "inner",
     copy: bool = True,
     indexes=None,
     exclude: str | Iterable[Hashable] = frozenset(),
@@ -624,7 +649,7 @@ def align(
     obj4: T_Obj4,
     /,
     *,
-    join: JoinOptions = "inner",
+    join: JoinOptions | CombineKwargDefault = "inner",
     copy: bool = True,
     indexes=None,
     exclude: str | Iterable[Hashable] = frozenset(),
@@ -641,7 +666,7 @@ def align(
     obj5: T_Obj5,
     /,
     *,
-    join: JoinOptions = "inner",
+    join: JoinOptions | CombineKwargDefault = "inner",
     copy: bool = True,
     indexes=None,
     exclude: str | Iterable[Hashable] = frozenset(),
@@ -652,7 +677,7 @@
 @overload
 def align(
     *objects: T_Alignable,
-    join: JoinOptions = "inner",
+    join: JoinOptions | CombineKwargDefault = "inner",
     copy: bool = True,
     indexes=None,
     exclude: str | Iterable[Hashable] = frozenset(),
@@ -662,7 +687,7 @@
 
 def align(
     *objects: T_Alignable,
-    join: JoinOptions = "inner",
+    join: JoinOptions | CombineKwargDefault = "inner",
     copy: bool = True,
     indexes=None,
     exclude: str | Iterable[Hashable] = frozenset(),
@@ -870,7 +895,7 @@ def align(
 
 def deep_align(
     objects: Iterable[Any],
-    join: JoinOptions = "inner",
+    join: JoinOptions | CombineKwargDefault = "inner",
     copy: bool = True,
     indexes=None,
     exclude: str | Iterable[Hashable] = frozenset(),
diff --git a/xarray/structure/combine.py b/xarray/structure/combine.py
index 01c14dffee4..7adc2ef3c8b 100644
--- a/xarray/structure/combine.py
+++ b/xarray/structure/combine.py
@@ -12,6 +12,13 @@
 from xarray.core.utils import iterate_nested
 from xarray.structure.concat import concat
 from xarray.structure.merge import merge
+from xarray.util.deprecation_helpers import (
+    _COMPAT_DEFAULT,
+    _COORDS_DEFAULT,
+    _DATA_VARS_DEFAULT,
+    _JOIN_DEFAULT,
+    CombineKwargDefault,
+)
 
 if TYPE_CHECKING:
     from xarray.core.types import (
@@ -200,12 +207,12 @@ def _check_shape_tile_ids(combined_tile_ids):
 def _combine_nd(
     combined_ids,
     concat_dims,
-    data_vars="all",
-    coords="different",
-    compat: CompatOptions = "no_conflicts",
-    fill_value=dtypes.NA,
-    join: JoinOptions = "outer",
-    combine_attrs: CombineAttrsOptions = "drop",
+    data_vars,
+    coords,
+    compat: CompatOptions | CombineKwargDefault,
+    fill_value,
+    join: JoinOptions | CombineKwargDefault,
+    combine_attrs: CombineAttrsOptions,
 ):
     """
     Combines an N-dimensional structure of datasets into one by applying a
@@ -262,10 +269,10 @@ def _combine_all_along_first_dim(
     dim,
     data_vars,
     coords,
-    compat: CompatOptions,
-    fill_value=dtypes.NA,
-    join: JoinOptions = "outer",
-    combine_attrs: CombineAttrsOptions = "drop",
+    compat: CompatOptions | CombineKwargDefault,
+    fill_value,
+    join: JoinOptions | CombineKwargDefault,
+    combine_attrs: CombineAttrsOptions,
 ):
     # Group into lines of datasets which must be combined along dim
     grouped = groupby_defaultdict(list(combined_ids.items()), key=_new_tile_id)
@@ -276,7 +283,14 @@ def _combine_all_along_first_dim(
         combined_ids = dict(sorted(group))
         datasets = combined_ids.values()
         new_combined_ids[new_id] = _combine_1d(
-            datasets, dim, compat, data_vars, coords, fill_value, join, combine_attrs
+            datasets,
+            concat_dim=dim,
+            compat=compat,
+            data_vars=data_vars,
+            coords=coords,
+            fill_value=fill_value,
+            join=join,
+            combine_attrs=combine_attrs,
         )
     return new_combined_ids
 
@@ -284,12 +298,12 @@ def _combine_all_along_first_dim(
 def _combine_1d(
     datasets,
     concat_dim,
-    compat: CompatOptions = "no_conflicts",
-    data_vars="all",
-    coords="different",
-    fill_value=dtypes.NA,
-    join: JoinOptions = "outer",
-    combine_attrs: CombineAttrsOptions = "drop",
+    compat: CompatOptions | CombineKwargDefault,
+    data_vars,
+    coords,
+    fill_value,
+    join: JoinOptions | CombineKwargDefault,
+    combine_attrs: CombineAttrsOptions,
 ):
     """
     Applies either concat or merge to 1D list of datasets depending on value
@@ -338,18 +352,21 @@ def _new_tile_id(single_id_ds_pair):
 
 def _nested_combine(
     datasets,
-    concat_dims,
+    concat_dim,
     compat,
     data_vars,
     coords,
     ids,
-    fill_value=dtypes.NA,
-    join: JoinOptions = "outer",
-    combine_attrs: CombineAttrsOptions = "drop",
+    fill_value,
+    join: JoinOptions | CombineKwargDefault,
+    combine_attrs: CombineAttrsOptions,
 ):
     if len(datasets) == 0:
         return Dataset()
 
+    if isinstance(concat_dim, str | DataArray) or concat_dim is None:
+        concat_dim = [concat_dim]
+
     # Arrange datasets for concatenation
     # Use information from the shape of the user input
     if not ids:
@@ -366,7 +383,7 @@ def _nested_combine(
     # Apply series of concatenate or merge operations along each dimension
     combined = _combine_nd(
         combined_ids,
-        concat_dims,
+        concat_dims=concat_dim,
         compat=compat,
         data_vars=data_vars,
         coords=coords,
@@ -384,11 +401,11 @@ def _nested_combine(
 def combine_nested(
     datasets: DATASET_HYPERCUBE,
     concat_dim: str | DataArray | None | Sequence[str | DataArray | pd.Index | None],
-    compat: str = "no_conflicts",
-    data_vars: str = "all",
-    coords: str = "different",
+    compat: str | CombineKwargDefault = _COMPAT_DEFAULT,
+    data_vars: str | CombineKwargDefault = _DATA_VARS_DEFAULT,
+    coords: str | CombineKwargDefault = _COORDS_DEFAULT,
     fill_value: object = dtypes.NA,
-    join: JoinOptions = "outer",
+    join: JoinOptions | CombineKwargDefault = _JOIN_DEFAULT,
     combine_attrs: CombineAttrsOptions = "drop",
 ) -> Dataset:
     """
@@ -581,13 +598,10 @@ def combine_nested(
     if mixed_datasets_and_arrays:
         raise ValueError("Can't combine datasets with unnamed arrays.")
 
-    if isinstance(concat_dim, str | DataArray) or concat_dim is None:
-        concat_dim = [concat_dim]
-
     # The IDs argument tells _nested_combine that datasets aren't yet sorted
     return _nested_combine(
         datasets,
-        concat_dims=concat_dim,
+        concat_dim=concat_dim,
         compat=compat,
         data_vars=data_vars,
         coords=coords,
@@ -619,12 +633,12 @@ def groupby_defaultdict(
 def _combine_single_variable_hypercube(
     datasets,
-    fill_value=dtypes.NA,
-    data_vars="all",
-    coords="different",
-    compat: CompatOptions = "no_conflicts",
-    join: JoinOptions = "outer",
-    combine_attrs: CombineAttrsOptions = "no_conflicts",
+    fill_value,
+    data_vars,
+    coords,
+    compat: CompatOptions | CombineKwargDefault,
+    join: JoinOptions | CombineKwargDefault,
+    combine_attrs: CombineAttrsOptions,
 ):
     """
     Attempt to combine a list of Datasets into a hypercube using their
@@ -678,11 +692,13 @@ def _combine_single_variable_hypercube(
 
 def combine_by_coords(
     data_objects: Iterable[Dataset | DataArray] = [],
-    compat: CompatOptions = "no_conflicts",
-    data_vars: Literal["all", "minimal", "different"] | list[str] = "all",
-    coords: str = "different",
+    compat: CompatOptions | CombineKwargDefault = _COMPAT_DEFAULT,
+    data_vars: Literal["all", "minimal", "different"]
+    | list[str]
+    | CombineKwargDefault = _DATA_VARS_DEFAULT,
+    coords: str | CombineKwargDefault = _COORDS_DEFAULT,
     fill_value: object = dtypes.NA,
-    join: JoinOptions = "outer",
+    join: JoinOptions | CombineKwargDefault = _JOIN_DEFAULT,
     combine_attrs: CombineAttrsOptions = "no_conflicts",
 ) -> Dataset | DataArray:
     """
@@ -862,7 +878,7 @@ def combine_by_coords(
         temperature    (y, x) float64 96B 10.98 14.3 12.06 ... 1.743 0.4044 16.65
         precipitation  (y, x) float64 96B 0.4376 0.8918 0.9637 ... 0.4615 0.7805
 
-    >>> xr.combine_by_coords([x3, x1])
+    >>> xr.combine_by_coords([x3, x1], join="outer")
     <xarray.Dataset> Size: 464B
     Dimensions:        (y: 4, x: 6)
     Coordinates:
@@ -882,7 +898,7 @@ def combine_by_coords(
         temperature    (y, x) float64 192B 10.98 14.3 12.06 ... 18.89 10.44 8.293
         precipitation  (y, x) float64 192B 0.4376 0.8918 0.9637 ... 0.01879 0.6176
 
-    >>> xr.combine_by_coords([x1, x2, x3])
+    >>> xr.combine_by_coords([x1, x2, x3], join="outer")
     <xarray.Dataset> Size: 464B
     Dimensions:        (y: 4, x: 6)
     Coordinates:
diff --git a/xarray/structure/concat.py b/xarray/structure/concat.py
index 81269320e1c..c76ccdceb5f 100644
--- a/xarray/structure/concat.py
+++ b/xarray/structure/concat.py
@@ -11,6 +11,7 @@
 from xarray.core.duck_array_ops import lazy_array_equiv
 from xarray.core.indexes import Index, PandasIndex
 from xarray.core.types import T_DataArray, T_Dataset, T_Variable
+from xarray.core.utils import emit_user_level_warning
 from xarray.core.variable import Variable
 from xarray.core.variable import concat as concat_vars
 from xarray.structure.alignment import align, reindex_variables
@@ -20,6 +21,13 @@
     merge_attrs,
     merge_collected,
 )
+from xarray.util.deprecation_helpers import (
+    _COMPAT_CONCAT_DEFAULT,
+    _COORDS_DEFAULT,
+    _DATA_VARS_DEFAULT,
+    _JOIN_DEFAULT,
+    CombineKwargDefault,
+)
 
 if TYPE_CHECKING:
     from xarray.core.types import (
@@ -37,12 +45,12 @@ def concat(
     objs: Iterable[T_Dataset],
     dim: Hashable | T_Variable | T_DataArray | pd.Index | Any,
-    data_vars: T_DataVars = "all",
-    coords: ConcatOptions | list[Hashable] = "different",
-    compat: CompatOptions = "equals",
+    data_vars: T_DataVars | CombineKwargDefault = _DATA_VARS_DEFAULT,
+    coords: ConcatOptions | list[Hashable] | CombineKwargDefault = _COORDS_DEFAULT,
+    compat: CompatOptions | CombineKwargDefault = _COMPAT_CONCAT_DEFAULT,
     positions: Iterable[Iterable[int]] | None = None,
     fill_value: object = dtypes.NA,
-    join: JoinOptions = "outer",
+    join: JoinOptions | CombineKwargDefault = _JOIN_DEFAULT,
     combine_attrs: CombineAttrsOptions = "override",
     create_index_for_new_dim: bool = True,
 ) -> T_Dataset: ...
@@ -52,12 +60,12 @@ def concat(
     objs: Iterable[T_DataArray],
     dim: Hashable | T_Variable | T_DataArray | pd.Index | Any,
-    data_vars: T_DataVars = "all",
-    coords: ConcatOptions | list[Hashable] = "different",
-    compat: CompatOptions = "equals",
+    data_vars: T_DataVars | CombineKwargDefault = _DATA_VARS_DEFAULT,
+    coords: ConcatOptions | list[Hashable] | CombineKwargDefault = _COORDS_DEFAULT,
+    compat: CompatOptions | CombineKwargDefault = _COMPAT_CONCAT_DEFAULT,
     positions: Iterable[Iterable[int]] | None = None,
     fill_value: object = dtypes.NA,
-    join: JoinOptions = "outer",
+    join: JoinOptions | CombineKwargDefault = _JOIN_DEFAULT,
     combine_attrs: CombineAttrsOptions = "override",
     create_index_for_new_dim: bool = True,
 ) -> T_DataArray: ...
@@ -66,12 +74,12 @@ def concat(
     objs,
     dim,
-    data_vars: T_DataVars = "all",
-    coords="different",
-    compat: CompatOptions = "equals",
+    data_vars: T_DataVars | CombineKwargDefault = _DATA_VARS_DEFAULT,
+    coords: ConcatOptions | list[Hashable] | CombineKwargDefault = _COORDS_DEFAULT,
+    compat: CompatOptions | CombineKwargDefault = _COMPAT_CONCAT_DEFAULT,
     positions=None,
     fill_value=dtypes.NA,
-    join: JoinOptions = "outer",
+    join: JoinOptions | CombineKwargDefault = _JOIN_DEFAULT,
     combine_attrs: CombineAttrsOptions = "override",
     create_index_for_new_dim: bool = True,
 ):
@@ -199,7 +207,7 @@ def concat(
       * x        (x) <U1 8B 'a' 'b'
       * y        (y) int64 24B 10 20 30
 
-    >>> xr.concat([da.isel(x=0), da.isel(x=1)], "x")
+    >>> xr.concat([da.isel(x=0), da.isel(x=1)], "x", coords="minimal")
     <xarray.DataArray (x: 2, y: 3)> Size: 48B
     array([[0, 1, 2],
            [3, 4, 5]])
@@ -207,7 +215,7 @@ def concat(
       * x        (x) <U1 8B 'a' 'b'
       * y        (y) int64 24B 10 20 30
 
-    >>> xr.concat([da.isel(x=0), da.isel(x=1)], "new_dim")
+    >>> xr.concat([da.isel(x=0), da.isel(x=1)], "new_dim", coords="all")
     <xarray.DataArray (new_dim: 2, y: 3)> Size: 48B
     array([[0, 1, 2],
            [3, 4, 5]])
@@ -216,7 +224,11 @@ def concat(
       * y        (y) int64 24B 10 20 30
     Dimensions without coordinates: new_dim
 
-    >>> xr.concat([da.isel(x=0), da.isel(x=1)], pd.Index([-90, -100], name="new_dim"))
+    >>> xr.concat(
+    ...     [da.isel(x=0), da.isel(x=1)],
+    ...     pd.Index([-90, -100], name="new_dim"),
+    ...     coords="all",
+    ... )
     <xarray.DataArray (new_dim: 2, y: 3)> Size: 48B
     array([[0, 1, 2],
            [3, 4, 5]])
@@ -255,7 +267,9 @@ def concat(
     except StopIteration as err:
         raise ValueError("must supply at least one object to concatenate") from err
 
-    if compat not in set(_VALID_COMPAT) - {"minimal"}:
+    if not isinstance(compat, CombineKwargDefault) and compat not in set(
+        _VALID_COMPAT
+    ) - {"minimal"}:
         raise ValueError(
             f"compat={compat!r} invalid: must be 'broadcast_equals', 'equals', 'identical', 'no_conflicts' or 'override'"
         )
@@ -320,7 +334,14 @@ def _calc_concat_dim_index(
     return dim, index
 
 
-def _calc_concat_over(datasets, dim, dim_names, data_vars: T_DataVars, coords, compat):
+def _calc_concat_over(
+    datasets,
+    dim,
+    dim_names,
+    data_vars: T_DataVars | CombineKwargDefault,
+    coords,
+    compat,
+):
     """
     Determine which dataset variables need to be concatenated in the result,
     """
@@ -344,11 +365,31 @@
         concat_dim_lengths.append(ds.sizes.get(dim, 1))
 
     def process_subset_opt(opt, subset):
-        if isinstance(opt, str):
+        original = set(concat_over)
+        compat_str = (
+            compat._value if isinstance(compat, CombineKwargDefault) else compat
+        )
+        if isinstance(opt, str | CombineKwargDefault):
             if opt == "different":
+                if isinstance(compat, CombineKwargDefault) and compat != "override":
+                    if not isinstance(opt, CombineKwargDefault):
+                        emit_user_level_warning(
+                            compat.warning_message(
+                                "This change will result in the following ValueError: "
+                                f"Cannot specify both {subset}='different' and compat='override'.",
+                                recommend_set_options=False,
+                            ),
+                            FutureWarning,
+                        )
                 if compat == "override":
                     raise ValueError(
                         f"Cannot specify both {subset}='different' and compat='override'."
+                        + (
+                            compat.error_message()
+                            if isinstance(compat, CombineKwargDefault)
+                            else ""
+                        )
                     )
                 # all nonindexes that are not the same in each dataset
                 for k in getattr(datasets[0], subset):
@@ -372,7 +413,7 @@ def process_subset_opt(opt, subset):
                         # first check without comparing values i.e. no computes
                         for var in variables[1:]:
-                            equals[k] = getattr(variables[0], compat)(
+                            equals[k] = getattr(variables[0], compat_str)(
                                 var, equiv=lazy_array_equiv
                             )
                             if equals[k] is not True:
@@ -395,7 +436,7 @@ def process_subset_opt(opt, subset):
                         for ds_rhs in datasets[1:]:
                             v_rhs = ds_rhs.variables[k].compute()
                             computed.append(v_rhs)
-                            if not getattr(v_lhs, compat)(v_rhs):
+                            if not getattr(v_lhs, compat_str)(v_rhs):
                                 concat_over.add(k)
                                 equals[k] = False
                                 # computed variables are not to be re-computed
@@ -418,6 +459,18 @@ def process_subset_opt(opt, subset):
                 pass
             else:
                 raise ValueError(f"unexpected value for {subset}: {opt}")
+
+            if (
+                isinstance(opt, CombineKwargDefault)
+                and opt != "minimal"
+                and original != concat_over
+            ):
+                warnings.append(
+                    opt.warning_message(
+                        "This is likely to lead to different results when multiple datasets "
+                        "have matching variables with overlapping values.",
+                    )
+                )
         else:
             valid_vars = tuple(getattr(datasets[0], subset))
             invalid_vars = [k for k in opt if k not in valid_vars]
@@ -436,8 +489,13 @@ def process_subset_opt(opt, subset):
             )
             concat_over.update(opt)
 
+    warnings: list[str] = []
     process_subset_opt(data_vars, "data_vars")
     process_subset_opt(coords, "coords")
+
+    for warning in warnings:
+        emit_user_level_warning(warning, FutureWarning)
+
     return concat_over, equals, concat_dim_lengths
 
 
@@ -479,14 +537,14 @@ def _parse_datasets(
 def _dataset_concat(
     datasets: Iterable[T_Dataset],
     dim: str | T_Variable | T_DataArray | pd.Index,
-    data_vars: T_DataVars,
-    coords: str | list[str],
-    compat: CompatOptions,
+    data_vars: T_DataVars | CombineKwargDefault,
+    coords: str | list[Hashable] | CombineKwargDefault,
+    compat: CompatOptions | CombineKwargDefault,
     positions: Iterable[Iterable[int]] | None,
-    fill_value: Any = dtypes.NA,
-    join: JoinOptions = "outer",
-    combine_attrs: CombineAttrsOptions = "override",
-    create_index_for_new_dim: bool = True,
+    fill_value: Any,
+    join: JoinOptions | CombineKwargDefault,
+    combine_attrs: CombineAttrsOptions,
+    create_index_for_new_dim: bool,
 ) -> T_Dataset:
     """
     Concatenate a sequence of datasets along a new or existing dimension
@@ -718,14 +776,14 @@ def get_indexes(name):
 def _dataarray_concat(
     arrays: Iterable[T_DataArray],
     dim: str | T_Variable | T_DataArray | pd.Index,
-    data_vars: T_DataVars,
-    coords: str | list[str],
-    compat: CompatOptions,
+    data_vars: T_DataVars | CombineKwargDefault,
+    coords: str | list[Hashable] | CombineKwargDefault,
+    compat: CompatOptions | CombineKwargDefault,
     positions: Iterable[Iterable[int]] | None,
-    fill_value: object = dtypes.NA,
-    join: JoinOptions = "outer",
-    combine_attrs: CombineAttrsOptions = "override",
-    create_index_for_new_dim: bool = True,
+    fill_value: object,
+    join: JoinOptions | CombineKwargDefault,
+    combine_attrs: CombineAttrsOptions,
+    create_index_for_new_dim: bool,
 ) -> T_DataArray:
     from xarray.core.dataarray import DataArray
 
@@ -736,7 +794,14 @@ def _dataarray_concat(
             "The elements in the input list need to be either all 'Dataset's or all 'DataArray's"
         )
 
-    if data_vars != "all":
+    # Allow passing old or new default even though we always use `data_vars="all"`
+    # when passing off to `_dataset_concat`. This allows people to explicitly
+    # set the data_vars value to the new default without worrying about whether
+    # they have datasets or dataarrays.
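+    # Anything else (e.g. "different" or a list of names) is rejected below,
+    # since per-variable selection has no meaning for DataArray inputs.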
+    if not isinstance(data_vars, CombineKwargDefault) and data_vars not in [
+        "all",
+        "minimal",
+    ]:
         raise ValueError(
             "data_vars is not a valid argument when concatenating DataArray objects"
         )
@@ -754,11 +819,11 @@ def _dataarray_concat(
 
     ds = _dataset_concat(
         datasets,
-        dim,
-        data_vars,
-        coords,
-        compat,
-        positions,
+        dim=dim,
+        data_vars="all",
+        coords=coords,
+        compat=compat,
+        positions=positions,
         fill_value=fill_value,
         join=join,
         combine_attrs=combine_attrs,
diff --git a/xarray/structure/merge.py b/xarray/structure/merge.py
index 7d773ce0b4b..eb584322e0c 100644
--- a/xarray/structure/merge.py
+++ b/xarray/structure/merge.py
@@ -14,9 +14,20 @@
     filter_indexes_from_coords,
     indexes_equal,
 )
-from xarray.core.utils import Frozen, compat_dict_union, dict_equiv, equivalent
+from xarray.core.utils import (
+    Frozen,
+    compat_dict_union,
+    dict_equiv,
+    emit_user_level_warning,
+    equivalent,
+)
 from xarray.core.variable import Variable, as_variable, calculate_dimensions
 from xarray.structure.alignment import deep_align
+from xarray.util.deprecation_helpers import (
+    _COMPAT_DEFAULT,
+    _JOIN_DEFAULT,
+    CombineKwargDefault,
+)
 
 if TYPE_CHECKING:
     from xarray.core.coordinates import Coordinates
@@ -88,7 +99,7 @@ class MergeError(ValueError):
 def unique_variable(
     name: Hashable,
     variables: list[Variable],
-    compat: CompatOptions = "broadcast_equals",
+    compat: CompatOptions | CombineKwargDefault = "broadcast_equals",
     equals: bool | None = None,
 ) -> Variable:
     """Return the unique variable from a list of variables or raise MergeError.
@@ -131,9 +142,12 @@ def unique_variable(
         combine_method = "fillna"
 
     if equals is None:
+        compat_str = (
+            compat._value if isinstance(compat, CombineKwargDefault) else compat
+        )
         # first check without comparing values i.e. no computes
         for var in variables[1:]:
-            equals = getattr(out, compat)(var, equiv=lazy_array_equiv)
+            equals = getattr(out, compat_str)(var, equiv=lazy_array_equiv)
             if equals is not True:
                 break
 
@@ -141,7 +155,7 @@ def unique_variable(
             # now compare values with minimum number of computes
             out = out.compute()
             for var in variables[1:]:
-                equals = getattr(out, compat)(var)
+                equals = getattr(out, compat_str)(var)
                 if not equals:
                     break
 
@@ -159,7 +173,7 @@ def unique_variable(
 
 
 def _assert_compat_valid(compat):
-    if compat not in _VALID_COMPAT:
+    if not isinstance(compat, CombineKwargDefault) and compat not in _VALID_COMPAT:
         raise ValueError(f"compat={compat!r} invalid: must be {set(_VALID_COMPAT)}")
 
 
@@ -201,7 +215,7 @@ def _assert_prioritized_valid(
 def merge_collected(
     grouped: dict[Any, list[MergeElement]],
     prioritized: Mapping[Any, MergeElement] | None = None,
-    compat: CompatOptions = "minimal",
+    compat: CompatOptions | CombineKwargDefault = "minimal",
     combine_attrs: CombineAttrsOptions = "override",
     equals: dict[Any, bool] | None = None,
 ) -> tuple[dict[Hashable, Variable], dict[Hashable, Index]]:
@@ -295,6 +309,20 @@ def merge_collected(
                     merged_vars[name] = unique_variable(
                         name, variables, compat, equals.get(name, None)
                     )
+                    # This is very likely to result in false positives, but there is no way
+                    # to tell if the output will change without computing.
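+                    # The check below therefore errs on the noisy side: it warns
+                    # whenever several variables were merged under the default
+                    # compat, even if their values happen to agree.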
+                    if (
+                        isinstance(compat, CombineKwargDefault)
+                        and compat == "no_conflicts"
+                        and len(variables) > 1
+                    ):
+                        emit_user_level_warning(
+                            compat.warning_message(
+                                "This is likely to lead to different results when "
+                                "combining overlapping variables with the same name.",
+                            ),
+                            FutureWarning,
+                        )
                 except MergeError:
                     if compat != "minimal":
                         # we need more than "minimal" compatibility (for which
@@ -499,7 +527,7 @@ def coerce_pandas_values(objects: Iterable[CoercibleMapping]) -> list[DatasetLik
 def _get_priority_vars_and_indexes(
     objects: Sequence[DatasetLike],
     priority_arg: int | None,
-    compat: CompatOptions = "equals",
+    compat: CompatOptions | CombineKwargDefault = "equals",
 ) -> dict[Hashable, MergeElement]:
     """Extract the priority variable from a list of mappings.
@@ -631,8 +659,8 @@ class _MergeResult(NamedTuple):
 
 def merge_core(
     objects: Iterable[CoercibleMapping],
-    compat: CompatOptions = "broadcast_equals",
-    join: JoinOptions = "outer",
+    compat: CompatOptions | CombineKwargDefault,
+    join: JoinOptions | CombineKwargDefault,
     combine_attrs: CombineAttrsOptions = "override",
     priority_arg: int | None = None,
     explicit_coords: Iterable[Hashable] | None = None,
@@ -691,7 +719,11 @@ def merge_core(
     coerced = coerce_pandas_values(objects)
     aligned = deep_align(
-        coerced, join=join, copy=False, indexes=indexes, fill_value=fill_value
+        coerced,
+        join=join,
+        copy=False,
+        indexes=indexes,
+        fill_value=fill_value,
     )
 
     for pos, obj in skip_align_objs:
@@ -700,7 +732,10 @@ def merge_core(
     collected = collect_variables_and_indexes(aligned, indexes=indexes)
     prioritized = _get_priority_vars_and_indexes(aligned, priority_arg, compat=compat)
     variables, out_indexes = merge_collected(
-        collected, prioritized, compat=compat, combine_attrs=combine_attrs
+        collected,
+        prioritized,
+        compat=compat,
+        combine_attrs=combine_attrs,
     )
 
     dims = calculate_dimensions(variables)
@@ -731,8 +766,8 @@ def merge_core(
 
 def merge(
     objects: Iterable[DataArray | CoercibleMapping],
-    compat: CompatOptions = "no_conflicts",
-    join: JoinOptions = "outer",
+    compat: CompatOptions | CombineKwargDefault = _COMPAT_DEFAULT,
+    join: JoinOptions | CombineKwargDefault = _JOIN_DEFAULT,
     fill_value: object = dtypes.NA,
     combine_attrs: CombineAttrsOptions = "override",
 ) -> Dataset:
@@ -843,7 +878,7 @@ def merge(
       * time     (time) float64 16B 30.0 60.0
      * lon      (lon) float64 16B 100.0 150.0
 
-    >>> xr.merge([x, y, z])
+    >>> xr.merge([x, y, z], join="outer")
     <xarray.Dataset> Size: 256B
     Dimensions:  (lat: 3, lon: 3, time: 2)
     Coordinates:
@@ -855,7 +890,7 @@ def merge(
         var2     (lat, lon) float64 72B 5.0 nan 6.0 nan nan nan 7.0 nan 8.0
         var3     (time, lon) float64 48B 0.0 nan 3.0 4.0 nan 9.0
 
-    >>> xr.merge([x, y, z], compat="identical")
+    >>> xr.merge([x, y, z], compat="identical", join="outer")
     <xarray.Dataset> Size: 256B
     Dimensions:  (lat: 3, lon: 3, time: 2)
     Coordinates:
@@ -867,7 +902,7 @@ def merge(
         var2     (lat, lon) float64 72B 5.0 nan 6.0 nan nan nan 7.0 nan 8.0
         var3     (time, lon) float64 48B 0.0 nan 3.0 4.0 nan 9.0
 
-    >>> xr.merge([x, y, z], compat="equals")
+    >>> xr.merge([x, y, z], compat="equals", join="outer")
     <xarray.Dataset> Size: 256B
     Dimensions:  (lat: 3, lon: 3, time: 2)
     Coordinates:
@@ -879,7 +914,7 @@ def merge(
         var2     (lat, lon) float64 72B 5.0 nan 6.0 nan nan nan 7.0 nan 8.0
         var3     (time, lon) float64 48B 0.0 nan 3.0 4.0 nan 9.0
 
-    >>> xr.merge([x, y, z], compat="equals", fill_value=-999.0)
+    >>> xr.merge([x, y, z], compat="equals", join="outer", fill_value=-999.0)
     <xarray.Dataset> Size: 256B
     Dimensions:  (lat: 3, lon: 3, time: 2)
     Coordinates:
@@ -976,8 +1011,8 @@ def merge(
     merge_result = merge_core(
         dict_like_objects,
-        compat,
-        join,
+        compat=compat,
+        join=join,
         combine_attrs=combine_attrs,
         fill_value=fill_value,
     )
@@ -988,8 +1023,8 @@ def dataset_merge_method(
     dataset: Dataset,
     other: CoercibleMapping,
     overwrite_vars: Hashable | Iterable[Hashable],
-    compat: CompatOptions,
-    join: JoinOptions,
+    compat: CompatOptions | CombineKwargDefault,
+    join: JoinOptions | CombineKwargDefault,
     fill_value: Any,
     combine_attrs: CombineAttrsOptions,
 ) -> _MergeResult:
@@ -1022,8 +1057,8 @@ def dataset_merge_method(
 
     return merge_core(
         objs,
-        compat,
-        join,
+        compat=compat,
+        join=join,
         priority_arg=priority_arg,
         fill_value=fill_value,
         combine_attrs=combine_attrs,
@@ -1055,6 +1090,8 @@ def dataset_update_method(dataset: Dataset, other: CoercibleMapping) -> _MergeRe
 
     return merge_core(
         [dataset, other],
+        compat="broadcast_equals",
+        join="outer",
         priority_arg=1,
         indexes=dataset.xindexes,
         combine_attrs="override",
@@ -1076,6 +1113,7 @@ def merge_data_and_coords(data_vars: DataVars, coords) -> _MergeResult:
         [data_vars, coords],
         compat="broadcast_equals",
         join="outer",
+        combine_attrs="override",
         explicit_coords=tuple(coords),
         indexes=coords.xindexes,
         priority_arg=1,
diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py
index 95c53786f86..645d92d58b1 100644
--- a/xarray/tests/test_backends.py
+++ b/xarray/tests/test_backends.py
@@ -4653,13 +4653,14 @@ def setup_files_and_datasets(self, fuzz=0):
         # to test join='exact'
         ds1["x"] = ds1.x + fuzz
 
-        with create_tmp_file() as tmpfile1:
-            with create_tmp_file() as tmpfile2:
-                # save data to the temporary files
-                ds1.to_netcdf(tmpfile1)
-                ds2.to_netcdf(tmpfile2)
+        with set_options(use_new_combine_kwarg_defaults=True):
+            with create_tmp_file() as tmpfile1:
+                with create_tmp_file() as tmpfile2:
+                    # save data to the temporary files
+                    ds1.to_netcdf(tmpfile1)
+                    ds2.to_netcdf(tmpfile2)
 
-                yield [tmpfile1, tmpfile2], [ds1, ds2]
+                    yield [tmpfile1, tmpfile2], [ds1, ds2]
 
     def gen_datasets_with_common_coord_and_time(self):
         # create coordinate data
@@ -4696,11 +4697,19 @@ def test_open_mfdataset_does_same_as_concat(
         if combine == "by_coords":
             files.reverse()
         with open_mfdataset(
-            files, data_vars=opt, combine=combine, concat_dim=concat_dim, join=join
+            files,
+            data_vars=opt,
+            combine=combine,
+            concat_dim=concat_dim,
+            join=join,
+            compat="no_conflicts",
         ) as ds:
-            ds_expect = xr.concat([ds1, ds2], data_vars=opt, dim="t", join=join)
+            ds_expect = xr.concat(
+                [ds1, ds2], data_vars=opt, dim="t", join=join, compat="equals"
+            )
             assert_identical(ds, ds_expect)
 
+    @pytest.mark.parametrize("use_new_combine_kwarg_defaults", [True, False])
     @pytest.mark.parametrize(
         ["combine_attrs", "attrs", "expected", "expect_error"],
         (
@@ -4728,7 +4737,12 @@ def test_open_mfdataset_does_same_as_concat(
         ),
     )
     def test_open_mfdataset_dataset_combine_attrs(
-        self, combine_attrs, attrs, expected, expect_error
+        self,
+        use_new_combine_kwarg_defaults,
+        combine_attrs,
+        attrs,
+        expected,
+        expect_error,
     ):
         with self.setup_files_and_datasets() as (files, [ds1, ds2]):
             # Give the files an inconsistent attribute
@@ -4738,22 +4752,28 @@ def test_open_mfdataset_dataset_combine_attrs(
                 ds.close()
                 ds.to_netcdf(f)
 
-            if expect_error:
-                with pytest.raises(xr.MergeError):
-                    xr.open_mfdataset(
-                        files,
-                        combine="nested",
-                        concat_dim="t",
-                        combine_attrs=combine_attrs,
-                    )
-            else:
-                with xr.open_mfdataset(
-                    files,
-                    combine="nested",
-                    concat_dim="t",
-                    combine_attrs=combine_attrs,
-                ) as ds:
-                    assert ds.attrs == expected
+            with set_options(
+                use_new_combine_kwarg_defaults=use_new_combine_kwarg_defaults
+            ):
+                warning: contextlib.AbstractContextManager = (
+                    pytest.warns(FutureWarning)
+                    if not use_new_combine_kwarg_defaults
+                    else contextlib.nullcontext()
+                )
+                error: contextlib.AbstractContextManager = (
+                    pytest.raises(xr.MergeError)
+                    if expect_error
+                    else contextlib.nullcontext()
+                )
+                with warning:
+                    with error:
+                        with xr.open_mfdataset(
+                            files,
+                            combine="nested",
+                            concat_dim="t",
+                            combine_attrs=combine_attrs,
+                        ) as ds:
+                            assert ds.attrs == expected
 
     def test_open_mfdataset_dataset_attr_by_coords(self) -> None:
         """
@@ -4782,30 +4802,65 @@ def test_open_mfdataset_dataarray_attr_by_coords(self) -> None:
             ds.close()
             ds.to_netcdf(f)
 
-        with xr.open_mfdataset(files, combine="nested", concat_dim="t") as ds:
+        with xr.open_mfdataset(
+            files, data_vars="minimal", combine="nested", concat_dim="t"
+        ) as ds:
             assert ds["v1"].test_dataarray_attr == 0
 
     @pytest.mark.parametrize(
         "combine, concat_dim", [("nested", "t"), ("by_coords", None)]
     )
-    @pytest.mark.parametrize("opt", ["all", "minimal", "different"])
+    @pytest.mark.parametrize(
+        "kwargs",
+        [
+            {"data_vars": "all"},
+            {"data_vars": "minimal"},
+            {
+                "data_vars": "all",
+                "coords": "different",
+                "compat": "no_conflicts",
+            },  # old defaults
+            {
+                "data_vars": "minimal",
+                "coords": "minimal",
+                "compat": "override",
+            },  # new defaults
+            {"data_vars": "different", "compat": "no_conflicts"},
+            {},
+        ],
+    )
     def test_open_mfdataset_exact_join_raises_error(
-        self, combine, concat_dim, opt
+        self, combine, concat_dim, kwargs
     ) -> None:
-        with self.setup_files_and_datasets(fuzz=0.1) as (files, [ds1, ds2]):
+        with self.setup_files_and_datasets(fuzz=0.1) as (files, _):
             if combine == "by_coords":
                 files.reverse()
             with pytest.raises(
-                ValueError, match=r"cannot align objects.*join.*exact.*"
+                ValueError, match="cannot align objects with join='exact'"
             ):
                 open_mfdataset(
                     files,
-                    data_vars=opt,
+                    **kwargs,
                    combine=combine,
                     concat_dim=concat_dim,
                     join="exact",
                 )
 
+    def test_open_mfdataset_defaults_with_exact_join_warns_as_well_as_raising(
+        self,
+    ) -> None:
+        with self.setup_files_and_datasets(fuzz=0.1) as (files, _):
+            with set_options(use_new_combine_kwarg_defaults=False):
+                files.reverse()
+                with pytest.warns(
+                    FutureWarning,
+                    match="will change from data_vars='all' to data_vars='minimal'",
+                ):
+                    with pytest.raises(
+                        ValueError, match="cannot align objects with join='exact'"
+                    ):
+                        open_mfdataset(files, combine="by_coords", join="exact")
+
     def test_common_coord_when_datavars_all(self) -> None:
         opt: Final = "all"
 
@@ -4853,6 +4908,52 @@ def test_invalid_data_vars_value_should_fail(self) -> None:
             with open_mfdataset(files, coords="minimum", combine="by_coords"):
                 pass
 
+    @pytest.mark.parametrize(
+        "combine, concat_dim", [("nested", "t"), ("by_coords", None)]
+    )
+    @pytest.mark.parametrize(
+        "kwargs", [{"data_vars": "different"}, {"coords": "different"}]
+    )
+    def test_open_mfdataset_warns_when_kwargs_set_to_different(
+        self, combine, concat_dim, kwargs
+    ) -> None:
+        with self.setup_files_and_datasets() as (files, [ds1, ds2]):
+            if combine == "by_coords":
+                files.reverse()
+            with pytest.raises(
+                ValueError, match="Previously the default was compat='no_conflicts'"
+            ):
+                open_mfdataset(files, combine=combine, concat_dim=concat_dim, **kwargs)
+            with pytest.raises(
+                ValueError, match="Previously the default was compat='equals'"
+            ):
+                xr.concat([ds1, ds2], dim="t", **kwargs)
+
+            with set_options(use_new_combine_kwarg_defaults=False):
+                expectation: contextlib.AbstractContextManager = (
+                    pytest.warns(
+                        FutureWarning,
+                        match="will change from data_vars='all'",
+                    )
+                    if "data_vars" not in kwargs
+                    else contextlib.nullcontext()
+                )
+
+                with pytest.warns(
+                    FutureWarning,
+                    match="will change from compat='equals'",
+                ):
+                    with expectation:
+                        ds_expect = xr.concat([ds1, ds2], dim="t", **kwargs)
+                with pytest.warns(
+                    FutureWarning, match="will change from compat='no_conflicts'"
+                ):
+                    with expectation:
+                        with open_mfdataset(
+                            files, combine=combine, concat_dim=concat_dim, **kwargs
+                        ) as ds:
+                            assert_identical(ds, ds_expect)
+
 
 @requires_dask
 @requires_scipy
@@ -5108,11 +5209,58 @@ def test_encoding_mfdataset(self) -> None:
             ds2.t.encoding["units"] = "days since 2000-01-01"
             ds1.to_netcdf(tmp1)
             ds2.to_netcdf(tmp2)
-            with open_mfdataset([tmp1, tmp2], combine="nested") as actual:
+            with open_mfdataset(
+                [tmp1, tmp2], combine="nested", compat="no_conflicts", join="outer"
+            ) as actual:
                 assert actual.t.encoding["units"] == original.t.encoding["units"]
                 assert actual.t.encoding["units"] == ds1.t.encoding["units"]
                 assert actual.t.encoding["units"] != ds2.t.encoding["units"]
 
+    def test_encoding_mfdataset_new_defaults(self) -> None:
+        original = Dataset(
+            {
+                "foo": ("t", np.random.randn(10)),
+                "t": ("t", pd.date_range(start="2010-01-01", periods=10, freq="1D")),
+            }
+        )
+        original.t.encoding["units"] = "days since 2010-01-01"
+
+        with create_tmp_file() as tmp1:
+            with create_tmp_file() as tmp2:
+                ds1 = original.isel(t=slice(5))
+                ds2 = original.isel(t=slice(5, 10))
+                ds1.t.encoding["units"] = "days since 2010-01-01"
+                ds2.t.encoding["units"] = "days since 2000-01-01"
+                ds1.to_netcdf(tmp1)
+                ds2.to_netcdf(tmp2)
+
+                with set_options(use_new_combine_kwarg_defaults=False):
+                    with pytest.warns(
+                        FutureWarning,
+                        match="will change from join='outer' to join='exact'",
+                    ):
+                        with pytest.warns(
+                            FutureWarning,
+                            match="will change from compat='no_conflicts' to compat='override'",
+                        ):
+                            with open_mfdataset([tmp1, tmp2], combine="nested") as old:
+                                assert (
+                                    old.t.encoding["units"]
+                                    == original.t.encoding["units"]
+                                )
+                                assert (
+                                    old.t.encoding["units"] == ds1.t.encoding["units"]
+                                )
+                                assert (
+                                    old.t.encoding["units"] != ds2.t.encoding["units"]
+                                )
+
+                with set_options(use_new_combine_kwarg_defaults=True):
+                    with pytest.raises(
+                        ValueError, match="Error might be related to new default"
+                    ):
+                        open_mfdataset([tmp1, tmp2], combine="nested")
+
     def test_preprocess_mfdataset(self) -> None:
         original = Dataset({"foo": ("x", np.random.randn(10))})
         with create_tmp_file() as tmp:
@@ -5195,25 +5343,21 @@ def test_open_and_do_math(self) -> None:
             actual = 1.0 * ds
             assert_allclose(original, actual, decode_bytes=False)
 
-    def test_open_mfdataset_concat_dim_none(self) -> None:
-        with create_tmp_file() as tmp1:
-            with create_tmp_file() as tmp2:
-                data = Dataset({"x": 0})
-                data.to_netcdf(tmp1)
-                Dataset({"x": np.nan}).to_netcdf(tmp2)
-                with open_mfdataset(
-                    [tmp1, tmp2], concat_dim=None, combine="nested"
-                ) as actual:
-                    assert_identical(data, actual)
-
-    def test_open_mfdataset_concat_dim_default_none(self) -> None:
-        with create_tmp_file() as tmp1:
-            with create_tmp_file() as tmp2:
-                data = Dataset({"x": 0})
-                data.to_netcdf(tmp1)
-                Dataset({"x": np.nan}).to_netcdf(tmp2)
-                with open_mfdataset([tmp1, tmp2], combine="nested") as actual:
-                    assert_identical(data, actual)
+    @pytest.mark.parametrize(
+        "kwargs",
+        [pytest.param({"concat_dim": None}, id="none"), pytest.param({}, id="default")],
+    )
+    def test_open_mfdataset_concat_dim(self, kwargs) -> None:
+        with set_options(use_new_combine_kwarg_defaults=True):
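+            # Under the new defaults these scalar datasets are merged with
+            # compat="override", so the value from the first file is kept and
+            # the NaN in the second file does not propagate.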
+            with create_tmp_file() as tmp1:
+                with create_tmp_file() as tmp2:
+                    data = Dataset({"x": 0})
+                    data.to_netcdf(tmp1)
+                    Dataset({"x": np.nan}).to_netcdf(tmp2)
+                    with open_mfdataset(
+                        [tmp1, tmp2], **kwargs, combine="nested"
+                    ) as actual:
+                        assert_identical(data, actual)
 
     def test_open_dataset(self) -> None:
         original = Dataset({"foo": ("x", np.random.randn(10))})
@@ -5240,7 +5384,9 @@ def test_open_single_dataset(self) -> None:
         )
         with create_tmp_file() as tmp:
             original.to_netcdf(tmp)
-            with open_mfdataset([tmp], concat_dim=dim, combine="nested") as actual:
+            with open_mfdataset(
+                [tmp], concat_dim=dim, data_vars="all", combine="nested"
+            ) as actual:
                 assert_identical(expected, actual)
 
     def test_open_multi_dataset(self) -> None:
@@ -5264,7 +5410,7 @@ def test_open_multi_dataset(self) -> None:
             original.to_netcdf(tmp1)
             original.to_netcdf(tmp2)
             with open_mfdataset(
-                [tmp1, tmp2], concat_dim=dim, combine="nested"
+                [tmp1, tmp2], concat_dim=dim, data_vars="all", combine="nested"
             ) as actual:
                 assert_identical(expected, actual)
 
@@ -6743,19 +6889,20 @@ def test_zarr_safe_chunk_region(self, mode: Literal["r+", "a"]):
 @requires_h5netcdf
 @requires_fsspec
 def test_h5netcdf_storage_options() -> None:
-    with create_tmp_files(2, allow_cleanup_failure=ON_WINDOWS) as (f1, f2):
-        ds1 = create_test_data()
-        ds1.to_netcdf(f1, engine="h5netcdf")
+    with set_options(use_new_combine_kwarg_defaults=True):
+        with create_tmp_files(2, allow_cleanup_failure=ON_WINDOWS) as (f1, f2):
+            ds1 = create_test_data()
+            ds1.to_netcdf(f1, engine="h5netcdf")
 
-        ds2 = create_test_data()
-        ds2.to_netcdf(f2, engine="h5netcdf")
+            ds2 = create_test_data()
+            ds2.to_netcdf(f2, engine="h5netcdf")
 
-        files = [f"file://{f}" for f in [f1, f2]]
-        with xr.open_mfdataset(
-            files,
-            engine="h5netcdf",
-            concat_dim="time",
-            combine="nested",
-            storage_options={"skip_instance_cache": False},
-        ) as ds:
-            assert_identical(xr.concat([ds1, ds2], dim="time"), ds)
+        files = [f"file://{f}" for f in [f1, f2]]
+        with xr.open_mfdataset(
+            files,
+            engine="h5netcdf",
+            concat_dim="time",
+            combine="nested",
+            storage_options={"skip_instance_cache": False},
+        ) as ds:
+            assert_identical(xr.concat([ds1, ds2], dim="time"), ds)
diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py
index 80a795c4c52..a3697290fc8 100644
--- a/xarray/tests/test_combine.py
+++ b/xarray/tests/test_combine.py
@@ -13,6 +13,7 @@
     combine_nested,
     concat,
     merge,
+    set_options,
 )
 from xarray.core import dtypes
 from xarray.structure.combine import (
@@ -290,9 +291,12 @@ def test_concat_once(self, create_combined_ids, concat_dim):
             data_vars="all",
             coords="different",
             compat="no_conflicts",
+            fill_value=dtypes.NA,
+            join="outer",
+            combine_attrs="drop",
         )
 
-        expected_ds = concat([ds(0), ds(1)], dim=concat_dim)
+        expected_ds = concat([ds(0), ds(1)], data_vars="all", dim=concat_dim)
         assert_combined_tile_ids_equal(result, {(): expected_ds})
 
     def test_concat_only_first_dim(self, create_combined_ids):
@@ -304,6 +308,9 @@ def test_concat_only_first_dim(self, create_combined_ids):
             data_vars="all",
             coords="different",
             compat="no_conflicts",
+            fill_value=dtypes.NA,
+            join="outer",
+            combine_attrs="drop",
         )
 
         ds = create_test_data
@@ -319,13 +326,24 @@ def test_concat_only_first_dim(self, create_combined_ids):
     def test_concat_twice(self, create_combined_ids, concat_dim):
         shape = (2, 3)
         combined_ids = create_combined_ids(shape)
-        result = _combine_nd(combined_ids, concat_dims=["dim1", concat_dim])
+        result = _combine_nd(
+            combined_ids,
+            concat_dims=["dim1", concat_dim],
+            data_vars="all",
+            coords="different",
+            compat="no_conflicts",
+            fill_value=dtypes.NA,
+            join="outer",
+            combine_attrs="drop",
+        )
 
         ds = create_test_data
         partway1 = concat([ds(0), ds(3)], dim="dim1")
         partway2 = concat([ds(1), ds(4)], dim="dim1")
         partway3 = concat([ds(2), ds(5)], dim="dim1")
-        expected = concat([partway1, partway2, partway3], dim=concat_dim)
+        expected = concat(
+            [partway1, partway2, partway3], data_vars="all", dim=concat_dim
+        )
 
         assert_equal(result, expected)
 
@@ -417,7 +435,7 @@ def test_nested_concat_along_new_dim(self):
             Dataset({"a": ("x", [20]), "x": [0]}),
         ]
         expected = Dataset({"a": (("t", "x"), [[10], [20]]), "x": [0]})
-        actual = combine_nested(objs, concat_dim="t")
+        actual = combine_nested(objs, data_vars="all", concat_dim="t")
         assert_identical(expected, actual)
 
         # Same but with a DataArray as new dim, see GH #1988 and #2647
@@ -425,42 +443,51 @@ def test_nested_concat_along_new_dim(self):
         expected = Dataset(
             {"a": (("baz", "x"), [[10], [20]]), "x": [0], "baz": [100, 150]}
         )
-        actual = combine_nested(objs, concat_dim=dim)
+        actual = combine_nested(objs, data_vars="all", concat_dim=dim)
         assert_identical(expected, actual)
 
-    def test_nested_merge(self):
+    def test_nested_merge_with_self(self):
         data = Dataset({"x": 0})
-        actual = combine_nested([data, data, data], concat_dim=None)
+        actual = combine_nested(
+            [data, data, data], compat="no_conflicts", concat_dim=None
+        )
         assert_identical(data, actual)
 
+    def test_nested_merge_with_overlapping_values(self):
         ds1 = Dataset({"a": ("x", [1, 2]), "x": [0, 1]})
         ds2 = Dataset({"a": ("x", [2, 3]), "x": [1, 2]})
         expected = Dataset({"a": ("x", [1, 2, 3]), "x": [0, 1, 2]})
-        actual = combine_nested([ds1, ds2], concat_dim=None)
+        actual = combine_nested(
+            [ds1, ds2], join="outer", compat="no_conflicts", concat_dim=None
+        )
         assert_identical(expected, actual)
-        actual = combine_nested([ds1, ds2], concat_dim=[None])
+        actual = combine_nested(
+            [ds1, ds2], join="outer", compat="no_conflicts", concat_dim=[None]
+        )
         assert_identical(expected, actual)
 
+    def test_nested_merge_with_nan(self):
         tmp1 = Dataset({"x": 0})
         tmp2 = Dataset({"x": np.nan})
-        actual = combine_nested([tmp1, tmp2], concat_dim=None)
+        actual = combine_nested([tmp1, tmp2], compat="no_conflicts", concat_dim=None)
         assert_identical(tmp1, actual)
-        actual = combine_nested([tmp1, tmp2], concat_dim=[None])
+        actual = combine_nested([tmp1, tmp2], compat="no_conflicts", concat_dim=[None])
         assert_identical(tmp1, actual)
 
-        # Single object, with a concat_dim explicitly provided
+    def test_nested_merge_with_concat_dim_explicitly_provided(self):
         # Test the issue reported in GH #1988
         objs = [Dataset({"x": 0, "y": 1})]
         dim = DataArray([100], name="baz", dims="baz")
-        actual = combine_nested(objs, concat_dim=[dim])
+        actual = combine_nested(objs, concat_dim=[dim], data_vars="all")
         expected = Dataset({"x": ("baz", [0]), "y": ("baz", [1])}, {"baz": [100]})
         assert_identical(expected, actual)
 
+    def test_nested_merge_with_non_scalars(self):
         # Just making sure that auto_combine is doing what is
         # expected for non-scalar values, too.
objs = [Dataset({"x": ("z", [0, 1]), "y": ("z", [1, 2])})] dim = DataArray([100], name="baz", dims="baz") - actual = combine_nested(objs, concat_dim=[dim]) + actual = combine_nested(objs, concat_dim=[dim], data_vars="all") expected = Dataset( {"x": (("baz", "z"), [[0, 1]]), "y": (("baz", "z"), [[1, 2]])}, {"baz": [100]}, @@ -510,10 +537,15 @@ def test_auto_combine_2d(self): partway1 = concat([ds(0), ds(3)], dim="dim1") partway2 = concat([ds(1), ds(4)], dim="dim1") partway3 = concat([ds(2), ds(5)], dim="dim1") - expected = concat([partway1, partway2, partway3], dim="dim2") + expected = concat([partway1, partway2, partway3], data_vars="all", dim="dim2") datasets = [[ds(0), ds(1), ds(2)], [ds(3), ds(4), ds(5)]] - result = combine_nested(datasets, concat_dim=["dim1", "dim2"]) + result = combine_nested( + datasets, + data_vars="all", + compat="no_conflicts", + concat_dim=["dim1", "dim2"], + ) assert_equal(result, expected) def test_auto_combine_2d_combine_attrs_kwarg(self): @@ -522,7 +554,7 @@ def test_auto_combine_2d_combine_attrs_kwarg(self): partway1 = concat([ds(0), ds(3)], dim="dim1") partway2 = concat([ds(1), ds(4)], dim="dim1") partway3 = concat([ds(2), ds(5)], dim="dim1") - expected = concat([partway1, partway2, partway3], dim="dim2") + expected = concat([partway1, partway2, partway3], data_vars="all", dim="dim2") expected_dict = {} expected_dict["drop"] = expected.copy(deep=True) @@ -553,12 +585,20 @@ def test_auto_combine_2d_combine_attrs_kwarg(self): with pytest.raises(ValueError, match=r"combine_attrs='identical'"): result = combine_nested( - datasets, concat_dim=["dim1", "dim2"], combine_attrs="identical" + datasets, + concat_dim=["dim1", "dim2"], + data_vars="all", + compat="no_conflicts", + combine_attrs="identical", ) for combine_attrs in expected_dict: result = combine_nested( - datasets, concat_dim=["dim1", "dim2"], combine_attrs=combine_attrs + datasets, + concat_dim=["dim1", "dim2"], + data_vars="all", + compat="no_conflicts", + combine_attrs=combine_attrs, ) assert_identical(result, expected_dict[combine_attrs]) @@ -572,7 +612,7 @@ def test_combine_nested_missing_data_new_dim(self): expected = Dataset( {"a": (("t", "x"), [[np.nan, 2, 3], [1, 2, np.nan]])}, {"x": [0, 1, 2]} ) - actual = combine_nested(datasets, concat_dim="t") + actual = combine_nested(datasets, data_vars="all", join="outer", concat_dim="t") assert_identical(expected, actual) def test_invalid_hypercube_input(self): @@ -650,7 +690,13 @@ def test_combine_nested_fill_value(self, fill_value): }, {"x": [0, 1, 2]}, ) - actual = combine_nested(datasets, concat_dim="t", fill_value=fill_value) + actual = combine_nested( + datasets, + concat_dim="t", + data_vars="all", + join="outer", + fill_value=fill_value, + ) assert_identical(expected, actual) def test_combine_nested_unnamed_data_arrays(self): @@ -710,26 +756,30 @@ def test_combine_by_coords(self): expected = Dataset({"x": [0, 1, 2]}) assert_identical(expected, actual) + def test_combine_by_coords_handles_non_sorted_variables(self): # ensure auto_combine handles non-sorted variables objs = [ Dataset({"x": ("a", [0]), "y": ("a", [0]), "a": [0]}), Dataset({"x": ("a", [1]), "y": ("a", [1]), "a": [1]}), ] - actual = combine_by_coords(objs) + actual = combine_by_coords(objs, join="outer") expected = Dataset({"x": ("a", [0, 1]), "y": ("a", [0, 1]), "a": [0, 1]}) assert_identical(expected, actual) + def test_combine_by_coords_multiple_variables(self): objs = [Dataset({"x": [0], "y": [0]}), Dataset({"y": [1], "x": [1]})] - actual = combine_by_coords(objs) + actual = 
combine_by_coords(objs, join="outer") expected = Dataset({"x": [0, 1], "y": [0, 1]}) assert_equal(actual, expected) + def test_combine_by_coords_for_scalar_variables(self): objs = [Dataset({"x": 0}), Dataset({"x": 1})] with pytest.raises( ValueError, match=r"Could not find any dimension coordinates" ): combine_by_coords(objs) + def test_combine_by_coords_requires_coord_or_index(self): objs = [Dataset({"x": [0], "y": [0]}), Dataset({"x": [0]})] with pytest.raises( ValueError, @@ -945,7 +995,9 @@ def test_combine_by_coords_combine_attrs_variables( with pytest.raises(MergeError, match="combine_attrs"): combine_by_coords([data1, data2], combine_attrs=combine_attrs) else: - actual = combine_by_coords([data1, data2], combine_attrs=combine_attrs) + actual = combine_by_coords( + [data1, data2], data_vars="all", combine_attrs=combine_attrs + ) expected = Dataset( { "x": ("a", [0, 1], expected_attrs), @@ -959,7 +1011,7 @@ def test_combine_by_coords_combine_attrs_variables( def test_infer_order_from_coords(self): data = create_test_data() objs = [data.isel(dim2=slice(4, 9)), data.isel(dim2=slice(4))] - actual = combine_by_coords(objs) + actual = combine_by_coords(objs, data_vars="all", compat="no_conflicts") expected = data assert expected.broadcast_equals(actual) @@ -997,7 +1049,7 @@ def test_combine_by_coords_previously_failed(self): Dataset({"a": ("x", [1]), "x": [1]}), ] expected = Dataset({"a": ("x", [0, 1]), "b": ("x", [0, np.nan])}, {"x": [0, 1]}) - actual = combine_by_coords(datasets) + actual = combine_by_coords(datasets, join="outer") assert_identical(expected, actual) def test_combine_by_coords_still_fails(self): @@ -1014,7 +1066,7 @@ def test_combine_by_coords_no_concat(self): assert_identical(expected, actual) objs = [Dataset({"x": 0, "y": 1}), Dataset({"y": np.nan, "z": 2})] - actual = combine_by_coords(objs) + actual = combine_by_coords(objs, compat="no_conflicts") expected = Dataset({"x": 0, "y": 1, "z": 2}) assert_identical(expected, actual) @@ -1032,7 +1084,7 @@ def test_combine_by_coords_incomplete_hypercube(self): x1 = Dataset({"a": (("y", "x"), [[1]])}, coords={"y": [0], "x": [0]}) x2 = Dataset({"a": (("y", "x"), [[1]])}, coords={"y": [1], "x": [0]}) x3 = Dataset({"a": (("y", "x"), [[1]])}, coords={"y": [0], "x": [1]}) - actual = combine_by_coords([x1, x2, x3]) + actual = combine_by_coords([x1, x2, x3], join="outer") expected = Dataset( {"a": (("y", "x"), [[1, 1], [1, np.nan]])}, coords={"y": [0, 1], "x": [0, 1]}, @@ -1040,8 +1092,10 @@ def test_combine_by_coords_incomplete_hypercube(self): assert_identical(expected, actual) # test that this fails if fill_value is None - with pytest.raises(ValueError): - combine_by_coords([x1, x2, x3], fill_value=None) + with pytest.raises( + ValueError, match="supplied objects do not form a hypercube" + ): + combine_by_coords([x1, x2, x3], join="outer", fill_value=None) def test_combine_by_coords_override_order(self) -> None: # regression test for https://github.com/pydata/xarray/issues/8828 @@ -1111,7 +1165,7 @@ def test_combine_by_coords_all_named_dataarrays(self): named_da1 = DataArray(name="a", data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x") named_da2 = DataArray(name="b", data=[3.0, 4.0], coords={"x": [2, 3]}, dims="x") - actual = combine_by_coords([named_da1, named_da2]) + actual = combine_by_coords([named_da1, named_da2], join="outer") expected = Dataset( { "a": DataArray(data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x"), @@ -1124,11 +1178,146 @@ def test_combine_by_coords_all_dataarrays_with_the_same_name(self): 
named_da1 = DataArray(name="a", data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x") named_da2 = DataArray(name="a", data=[3.0, 4.0], coords={"x": [2, 3]}, dims="x") - actual = combine_by_coords([named_da1, named_da2]) - expected = merge([named_da1, named_da2]) + actual = combine_by_coords( + [named_da1, named_da2], compat="no_conflicts", join="outer" + ) + expected = merge([named_da1, named_da2], compat="no_conflicts", join="outer") assert_identical(expected, actual) +class TestNewDefaults: + def test_concat_along_existing_dim(self): + concat_dim = "dim1" + ds = create_test_data + with set_options(use_new_combine_kwarg_defaults=False): + old = concat([ds(0), ds(1)], dim=concat_dim) + with set_options(use_new_combine_kwarg_defaults=True): + new = concat([ds(0), ds(1)], dim=concat_dim) + + assert_identical(old, new) + + def test_concat_along_new_dim(self): + concat_dim = "new_dim" + ds = create_test_data + with set_options(use_new_combine_kwarg_defaults=False): + with pytest.warns( + FutureWarning, + match="will change from data_vars='all' to data_vars='minimal'", + ): + old = concat([ds(0), ds(1)], dim=concat_dim) + with set_options(use_new_combine_kwarg_defaults=True): + new = concat([ds(0), ds(1)], dim=concat_dim) + + with pytest.raises(AssertionError): + assert_identical(old, new) + + def test_nested_merge_with_overlapping_values(self): + ds1 = Dataset({"a": ("x", [1, 2]), "x": [0, 1]}) + ds2 = Dataset({"a": ("x", [2, 3]), "x": [1, 2]}) + expected = Dataset({"a": ("x", [1, 2, 3]), "x": [0, 1, 2]}) + with set_options(use_new_combine_kwarg_defaults=False): + with pytest.warns( + FutureWarning, match="will change from join='outer' to join='exact'" + ): + with pytest.warns( + FutureWarning, + match="will change from compat='no_conflicts' to compat='override'", + ): + old = combine_nested([ds1, ds2], concat_dim=None) + with set_options(use_new_combine_kwarg_defaults=True): + with pytest.raises(ValueError, match="might be related to new default"): + combine_nested([ds1, ds2], concat_dim=None) + + assert_identical(old, expected) + + def test_nested_merge_with_nan_order_matters(self): + ds1 = Dataset({"x": 0}) + ds2 = Dataset({"x": np.nan}) + with set_options(use_new_combine_kwarg_defaults=False): + with pytest.warns( + FutureWarning, + match="will change from compat='no_conflicts' to compat='override'", + ): + old = combine_nested([ds1, ds2], concat_dim=None) + with set_options(use_new_combine_kwarg_defaults=True): + new = combine_nested([ds1, ds2], concat_dim=None) + + assert_identical(ds1, old) + assert_identical(old, new) + + with set_options(use_new_combine_kwarg_defaults=False): + with pytest.warns( + FutureWarning, + match="will change from compat='no_conflicts' to compat='override'", + ): + old = combine_nested([ds2, ds1], concat_dim=None) + with set_options(use_new_combine_kwarg_defaults=True): + new = combine_nested([ds2, ds1], concat_dim=None) + + assert_identical(ds1, old) + with pytest.raises(AssertionError): + assert_identical(old, new) + + def test_nested_merge_with_concat_dim_explicitly_provided(self): + # Test the issue reported in GH #1988 + objs = [Dataset({"x": 0, "y": 1})] + dim = DataArray([100], name="baz", dims="baz") + expected = Dataset({"x": ("baz", [0]), "y": ("baz", [1])}, {"baz": [100]}) + + with set_options(use_new_combine_kwarg_defaults=False): + with pytest.warns( + FutureWarning, + match="will change from data_vars='all' to data_vars='minimal'", + ): + old = combine_nested(objs, concat_dim=dim) + with set_options(use_new_combine_kwarg_defaults=True): + new = 
combine_nested(objs, concat_dim=dim) + + assert_identical(expected, old) + with pytest.raises(AssertionError): + assert_identical(old, new) + + def test_combine_nested_missing_data_new_dim(self): + # Your data includes "time" and "station" dimensions, and each year's + # data has a different set of stations. + datasets = [ + Dataset({"a": ("x", [2, 3]), "x": [1, 2]}), + Dataset({"a": ("x", [1, 2]), "x": [0, 1]}), + ] + expected = Dataset( + {"a": (("t", "x"), [[np.nan, 2, 3], [1, 2, np.nan]])}, {"x": [0, 1, 2]} + ) + with set_options(use_new_combine_kwarg_defaults=False): + with pytest.warns( + FutureWarning, match="will change from join='outer' to join='exact'" + ): + with pytest.warns( + FutureWarning, + match="will change from data_vars='all' to data_vars='minimal'", + ): + old = combine_nested(datasets, concat_dim="t") + with set_options(use_new_combine_kwarg_defaults=True): + with pytest.raises(ValueError, match="might be related to new default"): + combine_nested(datasets, concat_dim="t") + + assert_identical(expected, old) + + def test_combine_by_coords_multiple_variables(self): + objs = [Dataset({"x": [0], "y": [0]}), Dataset({"y": [1], "x": [1]})] + expected = Dataset({"x": [0, 1], "y": [0, 1]}) + + with set_options(use_new_combine_kwarg_defaults=False): + with pytest.warns( + FutureWarning, match="will change from join='outer' to join='exact'" + ): + old = combine_by_coords(objs) + with set_options(use_new_combine_kwarg_defaults=True): + with pytest.raises(ValueError, match="might be related to new default"): + combine_by_coords(objs) + + assert_identical(old, expected) + + @requires_cftime def test_combine_by_coords_distant_cftime_dates(): # Regression test for https://github.com/pydata/xarray/issues/3535 diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 49c6490d819..86527eee1b0 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -1,6 +1,7 @@ from __future__ import annotations from collections.abc import Callable +from contextlib import AbstractContextManager, nullcontext from copy import deepcopy from typing import TYPE_CHECKING, Any, Literal @@ -8,7 +9,7 @@ import pandas as pd import pytest -from xarray import DataArray, Dataset, Variable, concat +from xarray import DataArray, Dataset, Variable, concat, set_options from xarray.core import dtypes from xarray.core.coordinates import Coordinates from xarray.core.indexes import PandasIndex @@ -133,9 +134,9 @@ def test_concat_compat() -> None: for var in ["has_x", "no_x_y"]: assert "y" not in result[var].dims and "y" not in result[var].coords with pytest.raises(ValueError, match=r"'q' not present in all datasets"): - concat([ds1, ds2], dim="q") + concat([ds1, ds2], dim="q", data_vars="all", join="outer") with pytest.raises(ValueError, match=r"'q' not present in all datasets"): - concat([ds2, ds1], dim="q") + concat([ds2, ds1], dim="q", data_vars="all", join="outer") def test_concat_missing_var() -> None: @@ -214,8 +215,12 @@ def test_concat_second_empty() -> None: actual = concat([ds1, ds2], dim="y", coords="all") assert_identical(actual, expected) + +def test_concat_second_empty_with_scalar_data_var_only_on_first() -> None: # Check concatenating scalar data_var only present in ds1 - ds1["b"] = 0.1 + ds1 = Dataset(data_vars={"a": ("y", [0.1]), "b": 0.1}, coords={"x": 0.1}) + ds2 = Dataset(coords={"x": 0.1}) + expected = Dataset( data_vars={"a": ("y", [0.1, np.nan]), "b": ("y", [0.1, np.nan])}, coords={"x": ("y", [0.1, 0.1])}, @@ -226,7 +231,9 @@ def 
test_concat_second_empty() -> None: expected = Dataset( data_vars={"a": ("y", [0.1, np.nan]), "b": 0.1}, coords={"x": 0.1} ) - actual = concat([ds1, ds2], dim="y", coords="different", data_vars="different") + actual = concat( + [ds1, ds2], dim="y", coords="different", data_vars="different", compat="equals" + ) assert_identical(actual, expected) @@ -266,7 +273,7 @@ def test_concat_multiple_datasets_missing_vars(include_day: bool) -> None: datasets = create_concat_datasets( len(vars_to_drop), seed=123, include_day=include_day ) - expected = concat(datasets, dim="day") + expected = concat(datasets, dim="day", data_vars="all") for i, name in enumerate(vars_to_drop): if include_day: @@ -280,7 +287,7 @@ def test_concat_multiple_datasets_missing_vars(include_day: bool) -> None: for ds, varname in zip(datasets, vars_to_drop, strict=True) ] - actual = concat(datasets, dim="day") + actual = concat(datasets, dim="day", data_vars="all") assert list(actual.data_vars.keys()) == [ "pressure", @@ -491,7 +498,7 @@ def rectify_dim_order(self, data: Dataset, dataset) -> Dataset: ) def test_concat_simple(self, data: Dataset, dim, coords) -> None: datasets = [g for _, g in data.groupby(dim, squeeze=False)] - assert_identical(data, concat(datasets, dim, coords=coords)) + assert_identical(data, concat(datasets, dim, coords=coords, compat="equals")) def test_concat_merge_variables_present_in_some_datasets( self, data: Dataset @@ -512,7 +519,7 @@ def test_concat_merge_variables_present_in_some_datasets( assert_identical(expected, actual) # expand foo - actual = concat([data0, data1], "dim1") + actual = concat([data0, data1], "dim1", data_vars="all") foo = np.ones((8, 10), dtype=data1.foo.dtype) * np.nan foo[3:] = data1.foo.values[None, ...] expected = data.copy().assign(foo=(["dim1", "bar"], foo)) @@ -536,7 +543,9 @@ def test_concat_coords_kwarg( data.coords["extra"] = ("dim4", np.arange(3)) datasets = [g.squeeze() for _, g in data.groupby(dim, squeeze=False)] - actual = concat(datasets, data[dim], coords=coords) + actual = concat( + datasets, data[dim], coords=coords, data_vars="all", compat="equals" + ) if coords == "all": expected = np.array([data["extra"].values for _ in range(data.sizes[dim])]) assert_array_equal(actual["extra"].values, expected) @@ -568,41 +577,56 @@ def test_concat_data_vars_typing(self) -> None: actual = concat(objs, dim="x", data_vars="minimal") assert_identical(data, actual) - def test_concat_data_vars(self) -> None: + @pytest.mark.parametrize("data_vars", ["minimal", "different", "all", [], ["foo"]]) + def test_concat_data_vars(self, data_vars) -> None: data = Dataset({"foo": ("x", np.random.randn(10))}) objs: list[Dataset] = [data.isel(x=slice(5)), data.isel(x=slice(5, None))] - for data_vars in ["minimal", "different", "all", [], ["foo"]]: - actual = concat(objs, dim="x", data_vars=data_vars) - assert_identical(data, actual) + actual = concat(objs, dim="x", data_vars=data_vars, compat="equals") + assert_identical(data, actual) - def test_concat_coords(self): - # TODO: annotating this func fails + @pytest.mark.parametrize("coords", ["different", "all", ["c"]]) + def test_concat_coords(self, coords) -> None: data = Dataset({"foo": ("x", np.random.randn(10))}) expected = data.assign_coords(c=("x", [0] * 5 + [1] * 5)) objs = [ data.isel(x=slice(5)).assign_coords(c=0), data.isel(x=slice(5, None)).assign_coords(c=1), ] - for coords in ["different", "all", ["c"]]: + if coords == "different": + actual = concat(objs, dim="x", coords=coords, compat="equals") + else: actual = concat(objs, 
dim="x", coords=coords) - assert_identical(expected, actual) - for coords in ["minimal", []]: - with pytest.raises(merge.MergeError, match="conflicting values"): - concat(objs, dim="x", coords=coords) + assert_identical(expected, actual) - def test_concat_constant_index(self): - # TODO: annotating this func fails + @pytest.mark.parametrize("coords", ["minimal", []]) + def test_concat_coords_raises_merge_error(self, coords) -> None: + data = Dataset({"foo": ("x", np.random.randn(10))}) + objs = [ + data.isel(x=slice(5)).assign_coords(c=0), + data.isel(x=slice(5, None)).assign_coords(c=1), + ] + with pytest.raises(merge.MergeError, match="conflicting values"): + concat(objs, dim="x", coords=coords, compat="equals") + + @pytest.mark.parametrize("data_vars", ["different", "all", ["foo"]]) + def test_concat_constant_index(self, data_vars) -> None: # GH425 ds1 = Dataset({"foo": 1.5}, {"y": 1}) ds2 = Dataset({"foo": 2.5}, {"y": 1}) expected = Dataset({"foo": ("y", [1.5, 2.5]), "y": [1, 1]}) - for mode in ["different", "all", ["foo"]]: - actual = concat([ds1, ds2], "y", data_vars=mode) - assert_identical(expected, actual) + if data_vars == "different": + actual = concat([ds1, ds2], "y", data_vars=data_vars, compat="equals") + else: + actual = concat([ds1, ds2], "y", data_vars=data_vars) + assert_identical(expected, actual) + + def test_concat_constant_index_minimal_raises_merge_error(self) -> None: + ds1 = Dataset({"foo": 1.5}, {"y": 1}) + ds2 = Dataset({"foo": 2.5}, {"y": 1}) with pytest.raises(merge.MergeError, match="conflicting values"): # previously dim="y", and raised error which makes no sense. # "foo" has dimension "y" so minimal should concatenate it? - concat([ds1, ds2], "new_dim", data_vars="minimal") + concat([ds1, ds2], "new_dim", data_vars="minimal", compat="equals") def test_concat_size0(self) -> None: data = create_test_data() @@ -616,7 +640,7 @@ def test_concat_size0(self) -> None: def test_concat_autoalign(self) -> None: ds1 = Dataset({"foo": DataArray([1, 2], coords=[("x", [1, 2])])}) ds2 = Dataset({"foo": DataArray([1, 2], coords=[("x", [1, 3])])}) - actual = concat([ds1, ds2], "y") + actual = concat([ds1, ds2], "y", data_vars="all", join="outer") expected = Dataset( { "foo": DataArray( @@ -628,8 +652,7 @@ def test_concat_autoalign(self) -> None: ) assert_identical(expected, actual) - def test_concat_errors(self): - # TODO: annotating this func fails + def test_concat_errors(self) -> None: data = create_test_data() split_data = [data.isel(dim1=slice(3)), data.isel(dim1=slice(3, None))] @@ -659,13 +682,13 @@ def test_concat_errors(self): assert_identical(data, concat([data0, data1], "dim1", compat="equals")) with pytest.raises(ValueError, match=r"compat.* invalid"): - concat(split_data, "dim1", compat="foobar") + concat(split_data, "dim1", compat="foobar") # type: ignore[call-overload] with pytest.raises(ValueError, match=r"compat.* invalid"): concat(split_data, "dim1", compat="minimal") with pytest.raises(ValueError, match=r"unexpected value for"): - concat([data, data], "new_dim", coords="foobar") + concat([data, data], "new_dim", coords="foobar") # type: ignore[call-overload] with pytest.raises( ValueError, match=r"coordinate in some datasets but not others" @@ -844,8 +867,7 @@ def test_concat_combine_attrs_kwarg_variables( assert_identical(actual, expected) - def test_concat_promote_shape(self) -> None: - # mixed dims within variables + def test_concat_promote_shape_with_mixed_dims_within_variables(self) -> None: objs = [Dataset({}, {"x": 0}), Dataset({"x": [1]})] actual = 
concat(objs, "x") expected = Dataset({"x": [0, 1]}) @@ -855,25 +877,28 @@ def test_concat_promote_shape(self) -> None: actual = concat(objs, "x") assert_identical(actual, expected) - # mixed dims between variables + def test_concat_promote_shape_with_mixed_dims_between_variables(self) -> None: objs = [Dataset({"x": [2], "y": 3}), Dataset({"x": [4], "y": 5})] - actual = concat(objs, "x") + actual = concat(objs, "x", data_vars="all") expected = Dataset({"x": [2, 4], "y": ("x", [3, 5])}) assert_identical(actual, expected) - # mixed dims in coord variable + def test_concat_promote_shape_with_mixed_dims_in_coord_variable(self) -> None: objs = [Dataset({"x": [0]}, {"y": -1}), Dataset({"x": [1]}, {"y": ("x", [-2])})] actual = concat(objs, "x") expected = Dataset({"x": [0, 1]}, {"y": ("x", [-1, -2])}) assert_identical(actual, expected) - # scalars with mixed lengths along concat dim -- values should repeat + def test_concat_promote_shape_for_scalars_with_mixed_lengths_along_concat_dim( + self, + ) -> None: + # values should repeat objs = [Dataset({"x": [0]}, {"y": -1}), Dataset({"x": [1, 2]}, {"y": -2})] - actual = concat(objs, "x") + actual = concat(objs, "x", coords="different", compat="equals") expected = Dataset({"x": [0, 1, 2]}, {"y": ("x", [-1, -2, -2])}) assert_identical(actual, expected) - # broadcast 1d x 1d -> 2d + def test_concat_promote_shape_broadcast_1d_x_1d_goes_to_2d(self) -> None: objs = [ Dataset({"z": ("x", [-1])}, {"x": [0], "y": [0]}), Dataset({"z": ("y", [1])}, {"x": [1], "y": [0]}), @@ -882,6 +907,7 @@ def test_concat_promote_shape(self) -> None: expected = Dataset({"z": (("x", "y"), [[-1], [1]])}, {"x": [0, 1], "y": [0]}) assert_identical(actual, expected) + def test_concat_promote_shape_with_scalar_coordinates(self) -> None: # regression GH6384 objs = [ Dataset({}, {"x": pd.Interval(-1, 0, closed="right")}), @@ -898,6 +924,7 @@ def test_concat_promote_shape(self) -> None: ) assert_identical(actual, expected) + def test_concat_promote_shape_with_coordinates_of_particular_dtypes(self) -> None: # regression GH6416 (coord dtype) and GH6434 time_data1 = np.array(["2022-01-01", "2022-02-01"], dtype="datetime64[ns]") time_data2 = np.array("2022-03-01", dtype="datetime64[ns]") @@ -931,14 +958,14 @@ def test_concat_dim_is_variable(self) -> None: objs = [Dataset({"x": 0}), Dataset({"x": 1})] coord = Variable("y", [3, 4], attrs={"foo": "bar"}) expected = Dataset({"x": ("y", [0, 1]), "y": coord}) - actual = concat(objs, coord) + actual = concat(objs, coord, data_vars="all") assert_identical(actual, expected) def test_concat_dim_is_dataarray(self) -> None: objs = [Dataset({"x": 0}), Dataset({"x": 1})] coord = DataArray([3, 4], dims="y", attrs={"foo": "bar"}) expected = Dataset({"x": ("y", [0, 1]), "y": coord}) - actual = concat(objs, coord) + actual = concat(objs, coord, data_vars="all") assert_identical(actual, expected) def test_concat_multiindex(self) -> None: @@ -984,7 +1011,9 @@ def test_concat_fill_value(self, fill_value) -> None: }, {"x": [0, 1, 2]}, ) - actual = concat(datasets, dim="t", fill_value=fill_value) + actual = concat( + datasets, dim="t", fill_value=fill_value, data_vars="all", join="outer" + ) assert_identical(actual, expected) @pytest.mark.parametrize("dtype", [str, bytes]) @@ -1006,7 +1035,7 @@ def test_concat_str_dtype(self, dtype, dim) -> None: "x2": np.array(["c", "d"], dtype=dtype), } ) - actual = concat([da1, da2], dim=dim) + actual = concat([da1, da2], dim=dim, join="outer") assert np.issubdtype(actual.x2.dtype, dtype) @@ -1031,7 +1060,7 @@ def 
test_concat_avoids_index_auto_creation(self) -> None: assert combined.indexes == {} # should not raise on stack - combined = concat(datasets, dim="z") + combined = concat(datasets, dim="z", data_vars="all") assert combined["a"].shape == (2, 3, 3) assert combined["a"].dims == ("z", "x", "y") @@ -1096,11 +1125,13 @@ def test_concat(self) -> None: stacked = concat(grouped, pd.Index(ds["x"], name="x")) assert_identical(foo, stacked) - actual2 = concat([foo[0], foo[1]], pd.Index([0, 1])).reset_coords(drop=True) + actual2 = concat([foo[0], foo[1]], pd.Index([0, 1]), coords="all").reset_coords( + drop=True + ) expected = foo[:2].rename({"x": "concat_dim"}) assert_identical(expected, actual2) - actual3 = concat([foo[0], foo[1]], [0, 1]).reset_coords(drop=True) + actual3 = concat([foo[0], foo[1]], [0, 1], coords="all").reset_coords(drop=True) expected = foo[:2].rename({"x": "concat_dim"}) assert_identical(expected, actual3) @@ -1108,7 +1139,7 @@ def test_concat(self) -> None: concat([foo, bar], dim="w", compat="identical") with pytest.raises(ValueError, match=r"not a valid argument"): - concat([foo, bar], dim="w", data_vars="minimal") + concat([foo, bar], dim="w", data_vars="different") def test_concat_encoding(self) -> None: # Regression test for GH1297 @@ -1162,7 +1193,7 @@ def test_concat_avoids_index_auto_creation(self) -> None: assert combined.indexes == {} # should not raise on stack - combined = concat(arrays, dim="z") + combined = concat(arrays, dim="z", coords="different", compat="equals") assert combined.shape == (2, 3, 3) assert combined.dims == ("z", "x", "y") @@ -1182,7 +1213,7 @@ def test_concat_fill_value(self, fill_value) -> None: dims=["y", "x"], coords={"x": [1, 2, 3]}, ) - actual = concat((foo, bar), dim="y", fill_value=fill_value) + actual = concat((foo, bar), dim="y", fill_value=fill_value, join="outer") assert_identical(actual, expected) def test_concat_join_kwarg(self) -> None: @@ -1260,7 +1291,7 @@ def test_concat_str_dtype(self, dtype, dim) -> None: dims=["x1", "x2"], coords={"x1": np.array([1, 2]), "x2": np.array(["c", "d"], dtype=dtype)}, ) - actual = concat([da1, da2], dim=dim) + actual = concat([da1, da2], dim=dim, join="outer") assert np.issubdtype(actual.x2.dtype, dtype) @@ -1285,16 +1316,17 @@ def test_concat_attrs_first_variable(attr1, attr2) -> None: assert concat_attrs == attr1 -def test_concat_merge_single_non_dim_coord(): - # TODO: annotating this func fails +def test_concat_merge_single_non_dim_coord() -> None: da1 = DataArray([1, 2, 3], dims="x", coords={"x": [1, 2, 3], "y": 1}) da2 = DataArray([4, 5, 6], dims="x", coords={"x": [4, 5, 6]}) expected = DataArray(range(1, 7), dims="x", coords={"x": range(1, 7), "y": 1}) - for coords in ["different", "minimal"]: - actual = concat([da1, da2], "x", coords=coords) - assert_identical(actual, expected) + actual = concat([da1, da2], "x", coords="minimal", compat="override") + assert_identical(actual, expected) + + actual = concat([da1, da2], "x", coords="different", compat="equals") + assert_identical(actual, expected) with pytest.raises(ValueError, match=r"'y' not present in all datasets."): concat([da1, da2], dim="x", coords="all") @@ -1302,9 +1334,12 @@ def test_concat_merge_single_non_dim_coord(): da1 = DataArray([1, 2, 3], dims="x", coords={"x": [1, 2, 3], "y": 1}) da2 = DataArray([4, 5, 6], dims="x", coords={"x": [4, 5, 6]}) da3 = DataArray([7, 8, 9], dims="x", coords={"x": [7, 8, 9], "y": 1}) - for coords in ["different", "all"]: - with pytest.raises(ValueError, match=r"'y' not present in all datasets"): - 
concat([da1, da2, da3], dim="x", coords=coords) + + with pytest.raises(ValueError, match=r"'y' not present in all datasets"): + concat([da1, da2, da3], dim="x", coords="all") + + with pytest.raises(ValueError, match=r"'y' not present in all datasets"): + concat([da1, da2, da3], dim="x", coords="different", compat="equals") def test_concat_preserve_coordinate_order() -> None: @@ -1379,3 +1414,114 @@ def test_concat_index_not_same_dim() -> None: match=r"Cannot concatenate along dimension 'x' indexes with dimensions.*", ): concat([ds1, ds2], dim="x") + + +class TestNewDefaults: + def test_concat_second_empty_with_scalar_data_var_only_on_first(self) -> None: + ds1 = Dataset(data_vars={"a": ("y", [0.1]), "b": 0.1}, coords={"x": 0.1}) + ds2 = Dataset(coords={"x": 0.1}) + + expected = Dataset( + data_vars={"a": ("y", [0.1, np.nan]), "b": 0.1}, coords={"x": 0.1} + ) + with set_options(use_new_combine_kwarg_defaults=False): + with pytest.warns( + FutureWarning, + match="will change from compat='equals' to compat='override'", + ): + actual = concat( + [ds1, ds2], dim="y", coords="different", data_vars="different" + ) + assert_identical(actual, expected) + with set_options(use_new_combine_kwarg_defaults=True): + with pytest.raises(ValueError, match="might be related to new default"): + concat([ds1, ds2], dim="y", coords="different", data_vars="different") + + def test_concat_multiple_datasets_missing_vars(self) -> None: + vars_to_drop = [ + "temperature", + "pressure", + "humidity", + "precipitation", + "cloud_cover", + ] + + datasets = create_concat_datasets( + len(vars_to_drop), seed=123, include_day=False + ) + # set up the test data + datasets = [ + ds.drop_vars(varname) + for ds, varname in zip(datasets, vars_to_drop, strict=True) + ] + with set_options(use_new_combine_kwarg_defaults=False): + with pytest.warns( + FutureWarning, + match="will change from data_vars='all' to data_vars='minimal'", + ): + old = concat(datasets, dim="day") + with set_options(use_new_combine_kwarg_defaults=True): + new = concat(datasets, dim="day") + + with pytest.raises(AssertionError): + assert_identical(old, new) + + @pytest.mark.parametrize("coords", ["different", "minimal", "all"]) + def test_concat_coords_kwarg( + self, coords: Literal["all", "minimal", "different"] + ) -> None: + data = create_test_data().drop_dims("dim3") + + # make sure the coords argument behaves as expected + data.coords["extra"] = ("dim4", np.arange(3)) + datasets = [g.squeeze() for _, g in data.groupby("dim1", squeeze=False)] + + with set_options(use_new_combine_kwarg_defaults=False): + with pytest.warns( + FutureWarning, + match="will change from data_vars='all' to data_vars='minimal'", + ): + expectation: AbstractContextManager = ( + pytest.warns( + FutureWarning, + match="will change from compat='equals' to compat='override'", + ) + if coords == "different" + else nullcontext() + ) + with expectation: + old = concat(datasets, data["dim1"], coords=coords) + + with set_options(use_new_combine_kwarg_defaults=True): + if coords == "different": + with pytest.raises(ValueError): + concat(datasets, data["dim1"], coords=coords) + else: + new = concat(datasets, data["dim1"], coords=coords) + with pytest.raises(AssertionError): + assert_identical(old, new) + + def test_concat_promote_shape_for_scalars_with_mixed_lengths_along_concat_dim( + self, + ) -> None: + # values should repeat + objs = [Dataset({"x": [0]}, {"y": -1}), Dataset({"x": [1, 2]}, {"y": -2})] + expected = Dataset({"x": [0, 1, 2]}, {"y": ("x", [-1, -2, -2])}) + with 
set_options(use_new_combine_kwarg_defaults=False): + with pytest.warns( + FutureWarning, + match="will change from coords='different' to coords='minimal'", + ): + old = concat(objs, "x") + assert_identical(old, expected) + with set_options(use_new_combine_kwarg_defaults=True): + new = concat(objs, "x") + with pytest.raises(AssertionError): + assert_identical(new, old) + with pytest.raises(ValueError, match="might be related to new default"): + concat(objs, "x", coords="different") + with pytest.raises(merge.MergeError, match="conflicting values"): + concat(objs, "x", compat="equals") + + new = concat(objs, "x", coords="different", compat="equals") + assert_identical(old, new) diff --git a/xarray/tests/test_dask.py b/xarray/tests/test_dask.py index dede0b01f1d..aeb33b70d22 100644 --- a/xarray/tests/test_dask.py +++ b/xarray/tests/test_dask.py @@ -446,7 +446,11 @@ def test_concat_loads_variables(self): assert kernel_call_count == 0 out = xr.concat( - [ds1, ds2, ds3], dim="n", data_vars="different", coords="different" + [ds1, ds2, ds3], + dim="n", + data_vars="different", + coords="different", + compat="equals", ) # each kernel is computed exactly once assert kernel_call_count == 6 @@ -488,7 +492,11 @@ def test_concat_loads_variables(self): # stop computing variables as it would not have any benefit ds4 = Dataset(data_vars={"d": ("x", [2.0])}, coords={"c": ("x", [2.0])}) out = xr.concat( - [ds1, ds2, ds4, ds3], dim="n", data_vars="different", coords="different" + [ds1, ds2, ds4, ds3], + dim="n", + data_vars="different", + coords="different", + compat="equals", ) # the variables of ds1 and ds2 were computed, but those of ds3 didn't assert kernel_call_count == 22 @@ -509,7 +517,11 @@ def test_concat_loads_variables(self): # now check that concat() is correctly using dask name equality to skip loads out = xr.concat( - [ds1, ds1, ds1], dim="n", data_vars="different", coords="different" + [ds1, ds1, ds1], + dim="n", + data_vars="different", + coords="different", + compat="equals", ) assert kernel_call_count == 24 # variables are not loaded in the output @@ -1369,7 +1381,9 @@ def test_map_blocks_ds_transformations(func, map_ds): def test_map_blocks_da_ds_with_template(obj): func = lambda x: x.isel(x=[1]) # a simple .isel(x=[1, 5, 9]) puts all those in a single chunk. 
- template = xr.concat([obj.isel(x=[i]) for i in [1, 5, 9]], dim="x") + template = xr.concat( + [obj.isel(x=[i]) for i in [1, 5, 9]], data_vars="minimal", dim="x" + ) with raise_if_dask_computes(): actual = xr.map_blocks(func, obj, template=template) assert_identical(actual, template) @@ -1442,7 +1456,9 @@ def test_map_blocks_errors_bad_template(obj): xr.map_blocks( lambda a: a.isel(x=[1]).assign_coords(x=[120]), # assign bad index values obj, - template=xr.concat([obj.isel(x=[i]) for i in [1, 5, 9]], dim="x"), + template=xr.concat( + [obj.isel(x=[i]) for i in [1, 5, 9]], data_vars="minimal", dim="x" + ), ).compute() diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index e7acdcdd4f3..b52fde1ca5f 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -1439,12 +1439,25 @@ def test_selection_multiindex_from_level(self) -> None: # GH: 3512 da = DataArray([0, 1], dims=["x"], coords={"x": [0, 1], "y": "a"}) db = DataArray([2, 3], dims=["x"], coords={"x": [0, 1], "y": "b"}) - data = xr.concat([da, db], dim="x").set_index(xy=["x", "y"]) + data = xr.concat( + [da, db], dim="x", coords="different", compat="equals" + ).set_index(xy=["x", "y"]) assert data.dims == ("xy",) actual = data.sel(y="a") expected = data.isel(xy=[0, 1]).unstack("xy").squeeze("y") assert_equal(actual, expected) + def test_concat_with_default_coords_warns(self) -> None: + da = DataArray([0, 1], dims=["x"], coords={"x": [0, 1], "y": "a"}) + db = DataArray([2, 3], dims=["x"], coords={"x": [0, 1], "y": "b"}) + + with pytest.warns(FutureWarning): + original = xr.concat([da, db], dim="x") + with set_options(use_new_combine_kwarg_defaults=True): + new = xr.concat([da, db], dim="x") + + assert original.y.shape != new.y.shape + def test_virtual_default_coords(self) -> None: array = DataArray(np.zeros((5,)), dims="x") expected = DataArray(range(5), dims="x", name="x") diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index bacad96a213..3637fc668b4 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -6195,7 +6195,7 @@ def test_dataset_math_auto_align(self) -> None: assert_equal(actual, expected) actual = ds + ds[["bar"]] - expected = (2 * ds[["bar"]]).merge(ds.coords) + expected = (2 * ds[["bar"]]).merge(ds.coords, compat="override") assert_identical(expected, actual) assert_identical(ds + Dataset(), ds.coords.to_dataset()) @@ -6631,12 +6631,12 @@ def test_combine_first(self) -> None: coords={"x": ["a", "b", "c"]}, ) assert_equal(actual, expected) - assert_equal(actual, xr.merge([dsx0, dsx1])) + assert_equal(actual, xr.merge([dsx0, dsx1], join="outer")) # works just like xr.merge([self, other]) dsy2 = DataArray([2, 2, 2], [("x", ["b", "c", "d"])]).to_dataset(name="dsy2") actual = dsx0.combine_first(dsy2) - expected = xr.merge([dsy2, dsx0]) + expected = xr.merge([dsy2, dsx0], join="outer") assert_equal(actual, expected) def test_sortby(self) -> None: diff --git a/xarray/tests/test_duck_array_wrapping.py b/xarray/tests/test_duck_array_wrapping.py index 42440385928..340669b795f 100644 --- a/xarray/tests/test_duck_array_wrapping.py +++ b/xarray/tests/test_duck_array_wrapping.py @@ -163,7 +163,7 @@ def test_concat(self): assert isinstance(result.data, self.Array) def test_merge(self): - result = xr.merge([self.x1, self.x2], compat="override") + result = xr.merge([self.x1, self.x2], compat="override", join="outer") assert isinstance(result.foo.data, self.Array) def test_where(self): diff --git a/xarray/tests/test_groupby.py 
b/xarray/tests/test_groupby.py index 0f260eb381d..dae4864cff1 100644 --- a/xarray/tests/test_groupby.py +++ b/xarray/tests/test_groupby.py @@ -2433,6 +2433,7 @@ def test_resample_min_count(self) -> None: for i in range(3) ], dim=actual["time"], + data_vars="all", ) assert_allclose(expected, actual) diff --git a/xarray/tests/test_merge.py b/xarray/tests/test_merge.py index 302d26df8f3..f226991eda2 100644 --- a/xarray/tests/test_merge.py +++ b/xarray/tests/test_merge.py @@ -5,6 +5,7 @@ import xarray as xr from xarray.core import dtypes +from xarray.core.options import set_options from xarray.structure import merge from xarray.structure.merge import MergeError from xarray.testing import assert_equal, assert_identical @@ -37,15 +38,17 @@ def test_merge_arrays(self): expected = data[["var1", "var2"]] assert_identical(actual, expected) - def test_merge_datasets(self): - data = create_test_data(add_attrs=False, use_extension_array=True) + @pytest.mark.parametrize("use_new_combine_kwarg_defaults", [True, False]) + def test_merge_datasets(self, use_new_combine_kwarg_defaults): + with set_options(use_new_combine_kwarg_defaults=use_new_combine_kwarg_defaults): + data = create_test_data(add_attrs=False, use_extension_array=True) - actual = xr.merge([data[["var1"]], data[["var2"]]]) - expected = data[["var1", "var2"]] - assert_identical(actual, expected) + actual = xr.merge([data[["var1"]], data[["var2"]]]) + expected = data[["var1", "var2"]] + assert_identical(actual, expected) - actual = xr.merge([data, data]) - assert_identical(actual, data) + actual = xr.merge([data, data], compat="no_conflicts") + assert_identical(actual, data) def test_merge_dataarray_unnamed(self): data = xr.DataArray([1, 2], dims="x") @@ -192,9 +195,13 @@ def test_merge_arrays_attrs_variables( if expect_exception: with pytest.raises(MergeError, match="combine_attrs"): - actual = xr.merge([data1, data2], combine_attrs=combine_attrs) + actual = xr.merge( + [data1, data2], compat="no_conflicts", combine_attrs=combine_attrs + ) else: - actual = xr.merge([data1, data2], combine_attrs=combine_attrs) + actual = xr.merge( + [data1, data2], compat="no_conflicts", combine_attrs=combine_attrs + ) expected = xr.Dataset( {"var1": ("dim1", [], expected_attrs)}, coords={"dim1": ("dim1", [], expected_attrs)}, @@ -267,8 +274,12 @@ def test_merge_no_conflicts_single_var(self): ds1 = xr.Dataset({"a": ("x", [1, 2]), "x": [0, 1]}) ds2 = xr.Dataset({"a": ("x", [2, 3]), "x": [1, 2]}) expected = xr.Dataset({"a": ("x", [1, 2, 3]), "x": [0, 1, 2]}) - assert expected.identical(xr.merge([ds1, ds2], compat="no_conflicts")) - assert expected.identical(xr.merge([ds2, ds1], compat="no_conflicts")) + assert expected.identical( + xr.merge([ds1, ds2], compat="no_conflicts", join="outer") + ) + assert expected.identical( + xr.merge([ds2, ds1], compat="no_conflicts", join="outer") + ) assert ds1.identical(xr.merge([ds1, ds2], compat="no_conflicts", join="left")) assert ds2.identical(xr.merge([ds1, ds2], compat="no_conflicts", join="right")) expected = xr.Dataset({"a": ("x", [2]), "x": [1]}) @@ -278,11 +289,11 @@ def test_merge_no_conflicts_single_var(self): with pytest.raises(xr.MergeError): ds3 = xr.Dataset({"a": ("x", [99, 3]), "x": [1, 2]}) - xr.merge([ds1, ds3], compat="no_conflicts") + xr.merge([ds1, ds3], compat="no_conflicts", join="outer") with pytest.raises(xr.MergeError): ds3 = xr.Dataset({"a": ("y", [2, 3]), "y": [1, 2]}) - xr.merge([ds1, ds3], compat="no_conflicts") + xr.merge([ds1, ds3], compat="no_conflicts", join="outer") def 
test_merge_no_conflicts_multi_var(self): data = create_test_data(add_attrs=False) @@ -304,17 +315,19 @@ def test_merge_no_conflicts_multi_var(self): def test_merge_no_conflicts_preserve_attrs(self): data = xr.Dataset({"x": ([], 0, {"foo": "bar"})}) - actual = xr.merge([data, data], combine_attrs="no_conflicts") + actual = xr.merge( + [data, data], compat="no_conflicts", combine_attrs="no_conflicts" + ) assert_identical(data, actual) def test_merge_no_conflicts_broadcast(self): datasets = [xr.Dataset({"x": ("y", [0])}), xr.Dataset({"x": np.nan})] - actual = xr.merge(datasets) + actual = xr.merge(datasets, compat="no_conflicts") expected = xr.Dataset({"x": ("y", [0])}) assert_identical(expected, actual) datasets = [xr.Dataset({"x": ("y", [np.nan])}), xr.Dataset({"x": 0})] - actual = xr.merge(datasets) + actual = xr.merge(datasets, compat="no_conflicts") assert_identical(expected, actual) @@ -329,28 +342,28 @@ def test_merge(self): actual = ds2.merge(ds1) assert_identical(expected, actual) - - actual = data.merge(data) + with pytest.warns(FutureWarning): # this is a false alarm + actual = data.merge(data) assert_identical(data, actual) - actual = data.reset_coords(drop=True).merge(data) + actual = data.reset_coords(drop=True).merge(data, compat="no_conflicts") assert_identical(data, actual) - actual = data.merge(data.reset_coords(drop=True)) + actual = data.merge(data.reset_coords(drop=True), compat="no_conflicts") assert_identical(data, actual) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="conflicting values for variable"): ds1.merge(ds2.rename({"var3": "var1"})) with pytest.raises(ValueError, match=r"should be coordinates or not"): - data.reset_coords().merge(data) + data.reset_coords().merge(data, compat="no_conflicts") with pytest.raises(ValueError, match=r"should be coordinates or not"): - data.merge(data.reset_coords()) + data.merge(data.reset_coords(), compat="no_conflicts") - def test_merge_broadcast_equals(self): + def test_merge_compat_broadcast_equals(self): ds1 = xr.Dataset({"x": 0}) ds2 = xr.Dataset({"x": ("y", [0, 0])}) - actual = ds1.merge(ds2) + actual = ds1.merge(ds2, compat="broadcast_equals") assert_identical(ds2, actual) - actual = ds2.merge(ds1) + actual = ds2.merge(ds1, compat="broadcast_equals") assert_identical(ds2, actual) actual = ds1.copy() @@ -359,7 +372,7 @@ def test_merge_broadcast_equals(self): ds1 = xr.Dataset({"x": np.nan}) ds2 = xr.Dataset({"x": ("y", [np.nan, np.nan])}) - actual = ds1.merge(ds2) + actual = ds1.merge(ds2, compat="broadcast_equals") assert_identical(ds2, actual) def test_merge_compat(self): @@ -393,14 +406,14 @@ def test_merge_compat_minimal(self) -> None: expected = xr.Dataset(coords={"foo": [1, 2, 3]}) assert_identical(actual, expected) - def test_merge_auto_align(self): + def test_merge_join(self): ds1 = xr.Dataset({"a": ("x", [1, 2]), "x": [0, 1]}) ds2 = xr.Dataset({"b": ("x", [3, 4]), "x": [1, 2]}) expected = xr.Dataset( {"a": ("x", [1, 2, np.nan]), "b": ("x", [np.nan, 3, 4])}, {"x": [0, 1, 2]} ) - assert expected.identical(ds1.merge(ds2)) - assert expected.identical(ds2.merge(ds1)) + assert expected.identical(ds1.merge(ds2, join="outer")) + assert expected.identical(ds2.merge(ds1, join="outer")) expected = expected.isel(x=slice(2)) assert expected.identical(ds1.merge(ds2, join="left")) @@ -428,17 +441,19 @@ def test_merge_fill_value(self, fill_value): {"a": ("x", [1, 2, fill_value_a]), "b": ("x", [fill_value_b, 3, 4])}, {"x": [0, 1, 2]}, ) - assert expected.identical(ds1.merge(ds2, fill_value=fill_value)) - 
assert expected.identical(ds2.merge(ds1, fill_value=fill_value)) - assert expected.identical(xr.merge([ds1, ds2], fill_value=fill_value)) + assert expected.identical(ds1.merge(ds2, join="outer", fill_value=fill_value)) + assert expected.identical(ds2.merge(ds1, join="outer", fill_value=fill_value)) + assert expected.identical( + xr.merge([ds1, ds2], join="outer", fill_value=fill_value) + ) def test_merge_no_conflicts(self): ds1 = xr.Dataset({"a": ("x", [1, 2]), "x": [0, 1]}) ds2 = xr.Dataset({"a": ("x", [2, 3]), "x": [1, 2]}) expected = xr.Dataset({"a": ("x", [1, 2, 3]), "x": [0, 1, 2]}) - assert expected.identical(ds1.merge(ds2, compat="no_conflicts")) - assert expected.identical(ds2.merge(ds1, compat="no_conflicts")) + assert expected.identical(ds1.merge(ds2, compat="no_conflicts", join="outer")) + assert expected.identical(ds2.merge(ds1, compat="no_conflicts", join="outer")) assert ds1.identical(ds1.merge(ds2, compat="no_conflicts", join="left")) @@ -449,11 +464,11 @@ def test_merge_no_conflicts(self): with pytest.raises(xr.MergeError): ds3 = xr.Dataset({"a": ("x", [99, 3]), "x": [1, 2]}) - ds1.merge(ds3, compat="no_conflicts") + ds1.merge(ds3, compat="no_conflicts", join="outer") with pytest.raises(xr.MergeError): ds3 = xr.Dataset({"a": ("y", [2, 3]), "y": [1, 2]}) - ds1.merge(ds3, compat="no_conflicts") + ds1.merge(ds3, compat="no_conflicts", join="outer") def test_merge_dataarray(self): ds = xr.Dataset({"a": 0}) @@ -491,3 +506,80 @@ def test_merge_combine_attrs( actual = ds1.merge(ds2, combine_attrs=combine_attrs) expected = xr.Dataset(attrs=expected_attrs) assert_identical(actual, expected) + + +class TestNewDefaults: + def test_merge_datasets_false_warning(self): + data = create_test_data(add_attrs=False, use_extension_array=True) + + with set_options(use_new_combine_kwarg_defaults=False): + with pytest.warns( + FutureWarning, + match="will change from compat='no_conflicts' to compat='override'", + ): + old = xr.merge([data, data]) + + with set_options(use_new_combine_kwarg_defaults=True): + new = xr.merge([data, data]) + + assert_identical(old, new) + + def test_merge(self): + data = create_test_data() + ds1 = data[["var1"]] + ds2 = data[["var3"]] + expected = data[["var1", "var3"]] + with set_options(use_new_combine_kwarg_defaults=True): + actual = ds1.merge(ds2) + assert_identical(expected, actual) + + actual = ds2.merge(ds1) + assert_identical(expected, actual) + + actual = data.merge(data) + assert_identical(data, actual) + + ds1.merge(ds2.rename({"var3": "var1"})) + + with pytest.raises(ValueError, match=r"should be coordinates or not"): + data.reset_coords().merge(data) + with pytest.raises(ValueError, match=r"should be coordinates or not"): + data.merge(data.reset_coords()) + + def test_merge_broadcast_equals(self): + ds1 = xr.Dataset({"x": 0}) + ds2 = xr.Dataset({"x": ("y", [0, 0])}) + + with set_options(use_new_combine_kwarg_defaults=False): + with pytest.warns( + FutureWarning, + match="will change from compat='no_conflicts' to compat='override'", + ): + old = ds1.merge(ds2) + + with set_options(use_new_combine_kwarg_defaults=True): + new = ds1.merge(ds2) + + assert_identical(ds2, old) + with pytest.raises(AssertionError): + assert_identical(old, new) + + def test_merge_auto_align(self): + ds1 = xr.Dataset({"a": ("x", [1, 2]), "x": [0, 1]}) + ds2 = xr.Dataset({"b": ("x", [3, 4]), "x": [1, 2]}) + expected = xr.Dataset( + {"a": ("x", [1, 2, np.nan]), "b": ("x", [np.nan, 3, 4])}, {"x": [0, 1, 2]} + ) + with set_options(use_new_combine_kwarg_defaults=False): + with 
pytest.warns( + FutureWarning, match="will change from join='outer' to join='exact'" + ): + assert expected.identical(ds1.merge(ds2)) + with pytest.warns( + FutureWarning, match="will change from join='outer' to join='exact'" + ): + assert expected.identical(ds2.merge(ds1)) + + with set_options(use_new_combine_kwarg_defaults=True): + with pytest.raises(ValueError, match="might be related to new default"): + expected.identical(ds2.merge(ds1)) diff --git a/xarray/tests/test_units.py b/xarray/tests/test_units.py index ede065eac37..a26f737ae09 100644 --- a/xarray/tests/test_units.py +++ b/xarray/tests/test_units.py @@ -734,6 +734,9 @@ def test_broadcast_dataset(dtype): "coords", ), ) +@pytest.mark.filterwarnings( + "ignore:.*the default value for coords will change:FutureWarning" +) def test_combine_by_coords(variant, unit, error, dtype): original_unit = unit_registry.m @@ -811,6 +814,12 @@ def test_combine_by_coords(variant, unit, error, dtype): "coords", ), ) +@pytest.mark.filterwarnings( + "ignore:.*the default value for join will change:FutureWarning" +) +@pytest.mark.filterwarnings( + "ignore:.*the default value for compat will change:FutureWarning" +) def test_combine_nested(variant, unit, error, dtype): original_unit = unit_registry.m @@ -1051,6 +1060,12 @@ def test_concat_dataset(variant, unit, error, dtype): "coords", ), ) +@pytest.mark.filterwarnings( + "ignore:.*the default value for join will change:FutureWarning" +) +@pytest.mark.filterwarnings( + "ignore:.*the default value for compat will change:FutureWarning" +) def test_merge_dataarray(variant, unit, error, dtype): original_unit = unit_registry.m @@ -1155,6 +1170,12 @@ def test_merge_dataarray(variant, unit, error, dtype): "coords", ), ) +@pytest.mark.filterwarnings( + "ignore:.*the default value for join will change:FutureWarning" +) +@pytest.mark.filterwarnings( + "ignore:.*the default value for compat will change:FutureWarning" +) def test_merge_dataset(variant, unit, error, dtype): original_unit = unit_registry.m @@ -5573,6 +5594,12 @@ def test_content_manipulation(self, func, variant, dtype): "coords", ), ) + @pytest.mark.filterwarnings( + "ignore:.*the default value for join will change:FutureWarning" + ) + @pytest.mark.filterwarnings( + "ignore:.*the default value for compat will change:FutureWarning" + ) def test_merge(self, variant, unit, error, dtype): left_variants = { "data": (unit_registry.m, 1, 1), diff --git a/xarray/util/deprecation_helpers.py b/xarray/util/deprecation_helpers.py index 1064082872d..dd4c044ec7f 100644 --- a/xarray/util/deprecation_helpers.py +++ b/xarray/util/deprecation_helpers.py @@ -35,9 +35,10 @@ import warnings from collections.abc import Callable from functools import wraps -from typing import TypeVar +from typing import Any, TypeVar -from xarray.core.utils import emit_user_level_warning +from xarray.core.options import OPTIONS +from xarray.core.utils import ReprObject, emit_user_level_warning T = TypeVar("T", bound=Callable) @@ -145,3 +146,76 @@ def wrapper(*args, **kwargs): # We're quite confident we're just returning `T` from this function, so it's fine to ignore typing # within the function. return wrapper # type: ignore[return-value] + + +class CombineKwargDefault: + """Object that handles deprecation cycle for kwarg default values. 
+
+    Similar to ReprObject, but resolves to the old or the new default value
+    depending on the ``use_new_combine_kwarg_defaults`` option. This lets a
+    function keep the sentinel as its keyword default, detect that the caller
+    did not set the kwarg explicitly, and emit a FutureWarning about the
+    upcoming change in defaults.
+    """
+
+    _old: str
+    _new: str
+    _name: str
+
+    def __init__(self, *, name: str, old: str, new: str):
+        self._name = name
+        self._old = old
+        self._new = new
+
+    def __repr__(self) -> str:
+        return self._value
+
+    def __eq__(self, other: ReprObject | Any) -> bool:
+        return (
+            self._value == other._value
+            if isinstance(other, ReprObject)
+            else self._value == other
+        )
+
+    @property
+    def _value(self) -> str:
+        return self._new if OPTIONS["use_new_combine_kwarg_defaults"] else self._old
+
+    def __hash__(self) -> int:
+        return hash(self._value)
+
+    def __dask_tokenize__(self) -> object:
+        # Tokenize by the resolved value so dask caching stays consistent
+        # with the active option.
+        from dask.base import normalize_token
+
+        return normalize_token((type(self), self._value))
+
+    def warning_message(self, message: str, recommend_set_options: bool = True) -> str:
+        if recommend_set_options:
+            recommendation = (
+                " To opt in to new defaults and get rid of these warnings now, "
+                "use `set_options(use_new_combine_kwarg_defaults=True)` or "
+                f"set {self._name} explicitly."
+            )
+        else:
+            recommendation = (
+                f" The recommendation is to set {self._name} explicitly for this case."
+            )
+
+        return (
+            f"In a future version of xarray the default value for {self._name} will "
+            + f"change from {self._name}={self._old!r} to {self._name}={self._new!r}. "
+            + message
+            + recommendation
+        )
+
+    def error_message(self) -> str:
+        return (
+            f" Error might be related to new default ({self._name}={self._new!r}). "
+            f"Previously the default was {self._name}={self._old!r}. "
+            f"The recommendation is to set {self._name} explicitly for this case."
+        )
+
+
+_DATA_VARS_DEFAULT = CombineKwargDefault(name="data_vars", old="all", new="minimal")
+_COORDS_DEFAULT = CombineKwargDefault(name="coords", old="different", new="minimal")
+_COMPAT_CONCAT_DEFAULT = CombineKwargDefault(
+    name="compat", old="equals", new="override"
+)
+_COMPAT_DEFAULT = CombineKwargDefault(name="compat", old="no_conflicts", new="override")
+_JOIN_DEFAULT = CombineKwargDefault(name="join", old="outer", new="exact")
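
Reviewer note: a minimal sketch of how these sentinels are meant to be consumed
by a combine-style function. `demo_combine` and its body are hypothetical and
not part of this diff; `emit_user_level_warning` is the existing helper that
this patch imports at the top of `deprecation_helpers.py`.

    from xarray.core.utils import emit_user_level_warning
    from xarray.util.deprecation_helpers import _JOIN_DEFAULT, CombineKwargDefault


    def demo_combine(objects, join=_JOIN_DEFAULT):
        # The sentinel survives as the keyword default, so we can tell that
        # the caller did not set `join` explicitly and warn about the change.
        if isinstance(join, CombineKwargDefault):
            emit_user_level_warning(
                join.warning_message(
                    "This is most likely to change results for misaligned indexes."
                ),
                FutureWarning,
            )
        # Comparisons resolve through OPTIONS: "outer" under the old defaults,
        # "exact" once use_new_combine_kwarg_defaults is enabled.
        return "exact alignment" if join == "exact" else "outer join"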
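
And the option-dependent equality, hashing, and repr that make the sentinel
transparent to downstream code (illustrative; assumes this branch is installed):

    import xarray as xr
    from xarray.util.deprecation_helpers import _JOIN_DEFAULT

    assert _JOIN_DEFAULT == "outer"  # old default while the option is off
    with xr.set_options(use_new_combine_kwarg_defaults=True):
        assert _JOIN_DEFAULT == "exact"  # same object now reports the new default
        assert repr(_JOIN_DEFAULT) == "exact"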