Skip to content

combine_first by using apply_ufunc in ops.fillna #1204

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 18 commits into from
Jan 23, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions doc/combining.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ Combining data

* For combining datasets or data arrays along a dimension, see concatenate_.
* For combining datasets with different variables, see merge_.
* For combining datasets or data arrays with different indexes or missing values, see combine_.

.. _concatenate:

Expand Down Expand Up @@ -116,6 +117,40 @@ used in the :py:class:`~xarray.Dataset` constructor:

xr.Dataset({'a': arr[:-1], 'b': arr[1:]})

.. _combine:

Combine
~~~~~~~

The instance method ``combine_first`` combines two datasets/data arrays,
preferring non-null values from the calling object and using values from the
object passed as the argument to fill holes. The resulting coordinates are the
union of coordinate labels. Cells left vacant by the outer join are filled with NaN.

Mimics the behavior of ``pandas.DataFrame.combine_first``.

For data array,

.. ipython:: python

ar0 = DataArray([[0, 0], [0, 0]], [('x', ['a', 'b']), ('y', [-1, 0])])
ar1 = DataArray([[1, 1], [1, 1]], [('x', ['b', 'c']), ('y', [0, 1])])
ar2 = DataArray([2], [('x', ['d'])])
ar0.combine_first(ar1)
ar1.combine_first(ar0)
ar0.combine_first(ar2)

For datasets, ``ds0.combine_first(ds1)`` works similarly to ``xr.merge([ds0, ds1])``,
except that ``xr.merge`` raises a ``MergeError`` when there are conflicting values
in merging data variables, whereas ``.combine_first`` defaults to the calling object's values.

.. ipython:: python

ds0 = Dataset({'a': ('x', [1, 2]), 'x': [0, 1]})
ds1 = Dataset({'a': ('x', [99, 3]), 'x': [1, 2]})
ds0.combine_first(ds1)
xr.merge([ds0, ds1])

.. _update:

Update
Expand Down
7 changes: 7 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,13 @@ Deprecations

Enhancements
~~~~~~~~~~~~

- Added the xarray equivalent of `pandas.DataFrame.combine_first` as an instance
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

just a note -- this will need to go in a new section for 0.9.1, since 0.9.0 will probably be released first.

method to DataArray/Dataset objects, facilitated by the new `ops.fillna` with
`join` and `dataset_join` options.
(see :ref:`combine`)
By `Chun-Wei Yuan <https://github.com/chunweiyuan>`_.

- Added the ability to change default automatic alignment (arithmetic_join="inner")
for binary operations via :py:func:`~xarray.set_options()`
(see :ref:`automatic alignment`).
Expand Down
87 changes: 68 additions & 19 deletions xarray/core/computation.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
import operator
import re

import numpy as np

from . import ops
from .alignment import deep_align
from .merge import expand_and_merge_variables
Expand All @@ -16,6 +18,8 @@


_DEFAULT_FROZEN_SET = frozenset()
_DEFAULT_FILL_VALUE = object()


# see http://docs.scipy.org/doc/numpy/reference/c-api.generalized-ufuncs.html
DIMENSION_NAME = r'\w+'
Expand Down Expand Up @@ -202,6 +206,7 @@ def apply_dataarray_ufunc(func, *args, **kwargs):
signature = kwargs.pop('signature')
join = kwargs.pop('join', 'inner')
exclude_dims = kwargs.pop('exclude_dims', _DEFAULT_FROZEN_SET)
keep_attrs = kwargs.pop('keep_attrs', False)
if kwargs:
raise TypeError('apply_dataarray_ufunc() got unexpected keyword '
'arguments: %s' % list(kwargs))
Expand All @@ -217,12 +222,18 @@ def apply_dataarray_ufunc(func, *args, **kwargs):
result_var = func(*data_vars)

if signature.n_outputs > 1:
return tuple(DataArray(variable, coords, name=name, fastpath=True)
for variable, coords in zip(result_var, result_coords))
out = tuple(DataArray(variable, coords, name=name, fastpath=True)
for variable, coords in zip(result_var, result_coords))
else:
coords, = result_coords
return DataArray(result_var, coords, name=name, fastpath=True)
out = DataArray(result_var, coords, name=name, fastpath=True)

if keep_attrs and isinstance(args[0], DataArray):
if isinstance(out, tuple):
out = tuple(ds._copy_attrs_from(args[0]) for ds in out)
else:
out._copy_attrs_from(args[0])
return out

def ordered_set_union(all_keys):
# type: List[Iterable] -> Iterable
Expand Down Expand Up @@ -326,32 +337,53 @@ def _fast_dataset(variables, coord_variables):

def apply_dataset_ufunc(func, *args, **kwargs):
"""apply_dataset_ufunc(func, *args, signature, join='inner',
fill_value=None, exclude_dims=frozenset()):
dataset_join='inner', fill_value=None,
exclude_dims=frozenset(), keep_attrs=False):

If dataset_join != 'inner', a non-default fill_value must be supplied
by the user. Otherwise a TypeError is raised.
"""
from .dataset import Dataset
signature = kwargs.pop('signature')
join = kwargs.pop('join', 'inner')
dataset_join = kwargs.pop('dataset_join', 'inner')
fill_value = kwargs.pop('fill_value', None)
exclude_dims = kwargs.pop('exclude_dims', _DEFAULT_FROZEN_SET)
keep_attrs = kwargs.pop('keep_attrs', False)
first_obj = args[0] # we'll copy attrs from this in case keep_attrs=True

if dataset_join != 'inner' and fill_value is _DEFAULT_FILL_VALUE:
raise TypeError('To apply an operation to datasets with different ',
'data variables, you must supply the ',
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add if dataset_join != 'inner' to the docstring

'dataset_fill_value argument.')

if kwargs:
raise TypeError('apply_dataset_ufunc() got unexpected keyword '
'arguments: %s' % list(kwargs))

if len(args) > 1:
args = deep_align(args, join=join, copy=False, exclude=exclude_dims,
raise_on_invalid=False)

list_of_coords = build_output_coords(args, signature, exclude_dims)

args = [getattr(arg, 'data_vars', arg) for arg in args]

result_vars = apply_dict_of_variables_ufunc(
func, *args, signature=signature, join=join, fill_value=fill_value)
func, *args, signature=signature, join=dataset_join,
fill_value=fill_value)

if signature.n_outputs > 1:
return tuple(_fast_dataset(*args)
for args in zip(result_vars, list_of_coords))
out = tuple(_fast_dataset(*args)
for args in zip(result_vars, list_of_coords))
else:
coord_vars, = list_of_coords
return _fast_dataset(result_vars, coord_vars)
out = _fast_dataset(result_vars, coord_vars)

if keep_attrs and isinstance(first_obj, Dataset):
if isinstance(out, tuple):
out = tuple(ds._copy_attrs_from(first_obj) for ds in out)
else:
out._copy_attrs_from(first_obj)
return out


def _iter_over_selections(obj, dim, values):
Expand Down Expand Up @@ -530,7 +562,8 @@ def apply_array_ufunc(func, *args, **kwargs):

def apply_ufunc(func, *args, **kwargs):
"""apply_ufunc(func, *args, signature=None, join='inner',
exclude_dims=frozenset(), dataset_fill_value=None,
exclude_dims=frozenset(), dataset_join='inner',
dataset_fill_value=_DEFAULT_FILL_VALUE, keep_attrs=False,
kwargs=None, dask_array='forbidden')

Apply a vectorized function for unlabeled arrays to xarray objects.
Expand Down Expand Up @@ -581,14 +614,23 @@ def apply_ufunc(func, *args, **kwargs):
- 'inner': use the intersection of object indexes
- 'left': use indexes from the first object with each dimension
- 'right': use indexes from the last object with each dimension
dataset_join : {'outer', 'inner', 'left', 'right'}, optional
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you fix the function signature at the top of the docstring, also to add in the new arguments?

Method for joining variables of Dataset objects with mismatched
data variables.
- 'outer': take variables from both Dataset objects
- 'inner': take only overlapped variables
- 'left': take only variables from the first object
- 'right': take only variables from the last object
dataset_fill_value : optional
Value used in place of missing variables on Dataset inputs when the
datasets do not share the exact same ``data_vars``. Required if
``dataset_join != 'inner'``, otherwise ignored.
keep_attrs : bool, optional
Whether to copy attributes from the first argument to the output.
exclude_dims : set, optional
Dimensions to exclude from alignment and broadcasting. Any inputs
coordinates along these dimensions will be dropped. Each excluded
dimension must be a core dimension in the function signature.
dataset_fill_value : optional
Value used in place of missing variables on Dataset inputs when the
datasets do not share the exact same ``data_vars``. Only relevant if
``join != 'inner'``.
kwargs: dict, optional
Optional keyword arguments passed directly on to call ``func``.
dask_array: 'forbidden' or 'allowed', optional
Expand Down Expand Up @@ -664,8 +706,10 @@ def stack(objects, dim, new_coord):

signature = kwargs.pop('signature', None)
join = kwargs.pop('join', 'inner')
dataset_join = kwargs.pop('dataset_join', 'inner')
keep_attrs = kwargs.pop('keep_attrs', False)
exclude_dims = kwargs.pop('exclude_dims', frozenset())
dataset_fill_value = kwargs.pop('dataset_fill_value', None)
dataset_fill_value = kwargs.pop('dataset_fill_value', _DEFAULT_FILL_VALUE)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe we should rename this argument to data_vars_fill_value, to help clarify that it is paired with data_vars_join? I would put them next to each other in the docstring, too.

Or maybe the pair should be dataset_fill_value/dataset_join? dataset_data_vars_fill_value/dataset_data_vars_join is probably too long :).

kwargs_ = kwargs.pop('kwargs', None)
dask_array = kwargs.pop('dask_array', 'forbidden')
if kwargs:
Expand Down Expand Up @@ -697,16 +741,21 @@ def stack(objects, dim, new_coord):
this_apply = functools.partial(
apply_ufunc, func, signature=signature, join=join,
dask_array=dask_array, exclude_dims=exclude_dims,
dataset_fill_value=dataset_fill_value)
dataset_fill_value=dataset_fill_value,
dataset_join=dataset_join,
keep_attrs=keep_attrs)
return apply_groupby_ufunc(this_apply, *args)
elif any(is_dict_like(a) for a in args):
return apply_dataset_ufunc(variables_ufunc, *args, signature=signature,
join=join, exclude_dims=exclude_dims,
fill_value=dataset_fill_value)
fill_value=dataset_fill_value,
dataset_join=dataset_join,
keep_attrs=keep_attrs)
elif any(isinstance(a, DataArray) for a in args):
return apply_dataarray_ufunc(variables_ufunc, *args,
signature=signature,
join=join, exclude_dims=exclude_dims)
join=join, exclude_dims=exclude_dims,
keep_attrs=keep_attrs)
elif any(isinstance(a, Variable) for a in args):
return variables_ufunc(*args)
else:
Expand Down
21 changes: 19 additions & 2 deletions xarray/core/dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -1097,10 +1097,27 @@ def fillna(self, value):
if utils.is_dict_like(value):
raise TypeError('cannot provide fill value as a dictionary with '
'fillna on a DataArray')
out = self._fillna(value)
out.attrs = self.attrs
out = ops.fillna(self, value)
return out

def combine_first(self, other):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure it's worth making a separate method here to save a few characters.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think either we have combine_first for both Dataset and DataArray, or we lump everything into fillna. The use of combine_first is just that there's this concrete direct analogy from pandas. What's the decision here?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The fact that data_vars_join could sensibly differ for combine_first (as you have implemented it) suggests that a second method is reasonable. Consistency with pandas is also a nice factor. So I guess I've come around!

"""Combine two DataArray objects, with union of coordinates.

This operation follows the normal broadcasting and alignment rules of
``join='outer'``. Default to non-null values of array calling the
method. Use np.nan to fill in vacant cells after alignment.

Parameters
----------
other : DataArray
Used to fill all matching missing values in this array.

Returns
-------
DataArray
"""
return ops.fillna(self, other, join="outer")

def reduce(self, func, dim=None, axis=None, keep_attrs=False, **kwargs):
"""Reduce this array by applying `func` along some dimension(s).

Expand Down
48 changes: 32 additions & 16 deletions xarray/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -1955,8 +1955,31 @@ def fillna(self, value):
-------
Dataset
"""
out = self._fillna(value)
out._copy_attrs_from(self)
if utils.is_dict_like(value):
value_keys = getattr(value, 'data_vars', value).keys()
if not set(value_keys) <= set(self.data_vars.keys()):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we will need to separate the join argument to apply_ufunc into two parts:

  1. The join used for aligning indexes.
  2. The join used for aligning data variables between datasets.

The latter should probably be renamed to something like data_vars_join

raise ValueError('all variables in the argument to `fillna` '
'must be contained in the original dataset')
out = ops.fillna(self, value)
return out
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This would be a good time to remove all the fillna specific logic from Dataset._calculate_binary_op that is no longer used


def combine_first(self, other):
    """Combine two Datasets, defaulting to the data_vars of self.

    Delegates to ``ops.fillna`` with ``join='outer'`` and
    ``dataset_join='outer'``. The new coordinates follow the normal
    broadcasting and alignment rules of ``join='outer'``. Vacant cells in
    the expanded coordinates are filled with np.nan.

    Parameters
    ----------
    other : Dataset
        Used to fill all matching missing values in this dataset.

    Returns
    -------
    Dataset
    """
    return ops.fillna(self, other, join="outer", dataset_join="outer")

def reduce(self, func, dim=None, keep_attrs=False, numeric_only=False,
Expand Down Expand Up @@ -2288,7 +2311,7 @@ def func(self, *args, **kwargs):
return func

@staticmethod
def _binary_op(f, reflexive=False, join=None, fillna=False):
def _binary_op(f, reflexive=False, join=None):
@functools.wraps(f)
def func(self, other):
if isinstance(other, groupby.GroupBy):
Expand All @@ -2297,8 +2320,7 @@ def func(self, other):
if hasattr(other, 'indexes'):
self, other = align(self, other, join=align_type, copy=False)
g = f if not reflexive else lambda x, y: f(y, x)
ds = self._calculate_binary_op(g, other, join=align_type,
fillna=fillna)
ds = self._calculate_binary_op(g, other, join=align_type)
return ds
return func

Expand All @@ -2321,14 +2343,9 @@ def func(self, other):
return func

def _calculate_binary_op(self, f, other, join='inner',
inplace=False, fillna=False):
inplace=False):

def apply_over_both(lhs_data_vars, rhs_data_vars, lhs_vars, rhs_vars):
if fillna and join != 'left':
raise ValueError('`fillna` must be accompanied by left join')
if fillna and not set(rhs_data_vars) <= set(lhs_data_vars):
raise ValueError('all variables in the argument to `fillna` '
'must be contained in the original dataset')
if inplace and set(lhs_data_vars) != set(rhs_data_vars):
raise ValueError('datasets must have the same data variables '
'for in-place arithmetic operations: %s, %s'
Expand All @@ -2340,12 +2357,10 @@ def apply_over_both(lhs_data_vars, rhs_data_vars, lhs_vars, rhs_vars):
if k in rhs_data_vars:
dest_vars[k] = f(lhs_vars[k], rhs_vars[k])
elif join in ["left", "outer"]:
dest_vars[k] = (lhs_vars[k] if fillna else
f(lhs_vars[k], np.nan))
dest_vars[k] = f(lhs_vars[k], np.nan)
for k in rhs_data_vars:
if k not in dest_vars and join in ["right", "outer"]:
dest_vars[k] = (rhs_vars[k] if fillna else
f(rhs_vars[k], np.nan))
dest_vars[k] = f(rhs_vars[k], np.nan)
return dest_vars

if utils.is_dict_like(other) and not isinstance(other, Dataset):
Expand All @@ -2372,7 +2387,8 @@ def apply_over_both(lhs_data_vars, rhs_data_vars, lhs_vars, rhs_vars):
def _copy_attrs_from(self, other):
    """Copy attributes from ``other`` onto this object, in place.

    The top-level ``attrs`` of ``other`` are assigned to ``self``, and for
    every variable name in ``other.variables`` that is also present in
    ``self.variables`` the variable-level ``attrs`` are assigned as well.
    Variables that exist only in ``other`` are skipped, so the objects do
    not need to hold the same set of variables.

    Parameters
    ----------
    other : object with ``attrs`` and ``variables``
        Source of the attributes.

    Returns
    -------
    self
        Returned so the call can be used inside an expression (for example
        when rebuilding a tuple of outputs); existing callers that ignore
        the return value are unaffected.
    """
    self.attrs = other.attrs
    for v in other.variables:
        # Only touch variables both objects share; the result of an
        # operation may have dropped some of the source's variables.
        if v in self.variables:
            self.variables[v].attrs = other.variables[v].attrs
    return self

def diff(self, dim, n=1, label='upper'):
"""Calculate the n-th order discrete difference along given axis.
Expand Down
3 changes: 2 additions & 1 deletion xarray/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -382,7 +382,8 @@ def fillna(self, value):
Dataset.fillna
DataArray.fillna
"""
return self._fillna(value)
out = ops.fillna(self, value)
return out

def where(self, cond):
"""Return an object of the same shape with all entries where cond is
Expand Down
Loading