-
-
Notifications
You must be signed in to change notification settings - Fork 1.1k
combine_first by using apply_ufunc in ops.fillna #1204
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
8c6a9ab
2fdd135
c190f84
02a4a14
f0c0866
b172eda
6e65c8b
67a599f
8c46c51
3c59009
a72e2a1
8bf856e
56c752a
4bf4efe
367e3ca
793552e
f5ebf78
998cd71
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,6 +8,8 @@ | |
import operator | ||
import re | ||
|
||
import numpy as np | ||
|
||
from . import ops | ||
from .alignment import deep_align | ||
from .merge import expand_and_merge_variables | ||
|
@@ -16,6 +18,8 @@ | |
|
||
|
||
_DEFAULT_FROZEN_SET = frozenset() | ||
_DEFAULT_FILL_VALUE = object() | ||
|
||
|
||
# see http://docs.scipy.org/doc/numpy/reference/c-api.generalized-ufuncs.html | ||
DIMENSION_NAME = r'\w+' | ||
|
@@ -202,6 +206,7 @@ def apply_dataarray_ufunc(func, *args, **kwargs): | |
signature = kwargs.pop('signature') | ||
join = kwargs.pop('join', 'inner') | ||
exclude_dims = kwargs.pop('exclude_dims', _DEFAULT_FROZEN_SET) | ||
keep_attrs = kwargs.pop('keep_attrs', False) | ||
if kwargs: | ||
raise TypeError('apply_dataarray_ufunc() got unexpected keyword ' | ||
'arguments: %s' % list(kwargs)) | ||
|
@@ -217,12 +222,18 @@ def apply_dataarray_ufunc(func, *args, **kwargs): | |
result_var = func(*data_vars) | ||
|
||
if signature.n_outputs > 1: | ||
return tuple(DataArray(variable, coords, name=name, fastpath=True) | ||
for variable, coords in zip(result_var, result_coords)) | ||
out = tuple(DataArray(variable, coords, name=name, fastpath=True) | ||
for variable, coords in zip(result_var, result_coords)) | ||
else: | ||
coords, = result_coords | ||
return DataArray(result_var, coords, name=name, fastpath=True) | ||
out = DataArray(result_var, coords, name=name, fastpath=True) | ||
|
||
if keep_attrs and isinstance(args[0], DataArray): | ||
if isinstance(out, tuple): | ||
out = tuple(ds._copy_attrs_from(args[0]) for ds in out) | ||
else: | ||
out._copy_attrs_from(args[0]) | ||
return out | ||
|
||
def ordered_set_union(all_keys): | ||
# type: List[Iterable] -> Iterable | ||
|
@@ -326,32 +337,53 @@ def _fast_dataset(variables, coord_variables): | |
|
||
def apply_dataset_ufunc(func, *args, **kwargs):
    """apply_dataset_ufunc(func, *args, signature, join='inner',
                           dataset_join='inner', fill_value=None,
                           exclude_dims=frozenset(), keep_attrs=False):

    Apply ``func`` to the data variables of ``args`` and wrap the result(s)
    with ``_fast_dataset``.

    ``join`` is forwarded to ``deep_align`` (alignment of indexes), while
    ``dataset_join`` is forwarded to ``apply_dict_of_variables_ufunc`` as its
    ``join`` argument (joining of mismatched data variables).

    A TypeError is raised if ``dataset_join != 'inner'`` and ``fill_value``
    is the ``_DEFAULT_FILL_VALUE`` sentinel, i.e. no real fill value was
    supplied.  A TypeError is also raised for unexpected keyword arguments.

    If ``keep_attrs`` is true and the first argument is a Dataset, its
    attributes are copied onto every output.

    Returns a single object when ``signature.n_outputs`` is 1, otherwise a
    tuple with one object per output.
    """
    from .dataset import Dataset
    signature = kwargs.pop('signature')
    join = kwargs.pop('join', 'inner')
    dataset_join = kwargs.pop('dataset_join', 'inner')
    # NOTE(review): the default here is None, not _DEFAULT_FILL_VALUE, so the
    # check below only fires when a caller passes the sentinel explicitly
    # (as apply_ufunc does with its dataset_fill_value default) -- confirm
    # this is the intended behaviour for direct callers.
    fill_value = kwargs.pop('fill_value', None)
    exclude_dims = kwargs.pop('exclude_dims', _DEFAULT_FROZEN_SET)
    keep_attrs = kwargs.pop('keep_attrs', False)
    first_obj = args[0]  # we'll copy attrs from this in case keep_attrs=True

    if dataset_join != 'inner' and fill_value is _DEFAULT_FILL_VALUE:
        # Adjacent string literals (no commas) so that the exception carries
        # one readable message rather than a tuple of three fragments.
        raise TypeError('To apply an operation to datasets with different '
                        'data variables, you must supply the '
                        'dataset_fill_value argument.')

    if kwargs:
        raise TypeError('apply_dataset_ufunc() got unexpected keyword '
                        'arguments: %s' % list(kwargs))

    if len(args) > 1:
        args = deep_align(args, join=join, copy=False, exclude=exclude_dims,
                          raise_on_invalid=False)

    list_of_coords = build_output_coords(args, signature, exclude_dims)

    # Dataset-like arguments contribute their data_vars; anything else is
    # passed through unchanged.
    args = [getattr(arg, 'data_vars', arg) for arg in args]

    result_vars = apply_dict_of_variables_ufunc(
        func, *args, signature=signature, join=dataset_join,
        fill_value=fill_value)

    if signature.n_outputs > 1:
        out = tuple(_fast_dataset(variables, coord_vars)
                    for variables, coord_vars
                    in zip(result_vars, list_of_coords))
    else:
        coord_vars, = list_of_coords
        out = _fast_dataset(result_vars, coord_vars)

    if keep_attrs and isinstance(first_obj, Dataset):
        # _copy_attrs_from is called purely for its in-place side effect.
        # Its return value must not be collected into ``out``: doing so
        # would replace the computed datasets with those return values.
        outputs = out if isinstance(out, tuple) else (out,)
        for ds in outputs:
            ds._copy_attrs_from(first_obj)
    return out
|
||
|
||
def _iter_over_selections(obj, dim, values): | ||
|
@@ -530,7 +562,8 @@ def apply_array_ufunc(func, *args, **kwargs): | |
|
||
def apply_ufunc(func, *args, **kwargs): | ||
"""apply_ufunc(func, *args, signature=None, join='inner', | ||
exclude_dims=frozenset(), dataset_fill_value=None, | ||
exclude_dims=frozenset(), dataset_join='inner', | ||
dataset_fill_value=_DEFAULT_FILL_VALUE, keep_attrs=False, | ||
kwargs=None, dask_array='forbidden') | ||
|
||
Apply a vectorized function for unlabeled arrays to xarray objects. | ||
|
@@ -581,14 +614,23 @@ def apply_ufunc(func, *args, **kwargs): | |
- 'inner': use the intersection of object indexes | ||
- 'left': use indexes from the first object with each dimension | ||
- 'right': use indexes from the last object with each dimension | ||
dataset_join : {'outer', 'inner', 'left', 'right'}, optional | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you fix the function signature at the top of the docstring, and also add the new arguments there? |
||
Method for joining variables of Dataset objects with mismatched | ||
data variables. | ||
- 'outer': take variables from both Dataset objects | ||
- 'inner': take only overlapped variables | ||
- 'left': take only variables from the first object | ||
- 'right': take only variables from the last object | ||
dataset_fill_value : optional | ||
Value used in place of missing variables on Dataset inputs when the | ||
datasets do not share the exact same ``data_vars``. Required if | ||
``dataset_join != 'inner'``, otherwise ignored. | ||
keep_attrs: boolean, Optional | ||
Whether to copy attributes from the first argument to the output. | ||
exclude_dims : set, optional | ||
Dimensions to exclude from alignment and broadcasting. Any inputs | ||
coordinates along these dimensions will be dropped. Each excluded | ||
dimension must be a core dimension in the function signature. | ||
dataset_fill_value : optional | ||
Value used in place of missing variables on Dataset inputs when the | ||
datasets do not share the exact same ``data_vars``. Only relevant if | ||
``join != 'inner'``. | ||
kwargs: dict, optional | ||
Optional keyword arguments passed directly on to call ``func``. | ||
dask_array: 'forbidden' or 'allowed', optional | ||
|
@@ -664,8 +706,10 @@ def stack(objects, dim, new_coord): | |
|
||
signature = kwargs.pop('signature', None) | ||
join = kwargs.pop('join', 'inner') | ||
dataset_join = kwargs.pop('dataset_join', 'inner') | ||
keep_attrs = kwargs.pop('keep_attrs', False) | ||
exclude_dims = kwargs.pop('exclude_dims', frozenset()) | ||
dataset_fill_value = kwargs.pop('dataset_fill_value', None) | ||
dataset_fill_value = kwargs.pop('dataset_fill_value', _DEFAULT_FILL_VALUE) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Maybe we should rename this argument to […]. Or maybe the pair should be […]. |
||
kwargs_ = kwargs.pop('kwargs', None) | ||
dask_array = kwargs.pop('dask_array', 'forbidden') | ||
if kwargs: | ||
|
@@ -697,16 +741,21 @@ def stack(objects, dim, new_coord): | |
this_apply = functools.partial( | ||
apply_ufunc, func, signature=signature, join=join, | ||
dask_array=dask_array, exclude_dims=exclude_dims, | ||
dataset_fill_value=dataset_fill_value) | ||
dataset_fill_value=dataset_fill_value, | ||
dataset_join=dataset_join, | ||
keep_attrs=keep_attrs) | ||
return apply_groupby_ufunc(this_apply, *args) | ||
elif any(is_dict_like(a) for a in args): | ||
return apply_dataset_ufunc(variables_ufunc, *args, signature=signature, | ||
join=join, exclude_dims=exclude_dims, | ||
fill_value=dataset_fill_value) | ||
fill_value=dataset_fill_value, | ||
dataset_join=dataset_join, | ||
keep_attrs=keep_attrs) | ||
elif any(isinstance(a, DataArray) for a in args): | ||
return apply_dataarray_ufunc(variables_ufunc, *args, | ||
signature=signature, | ||
join=join, exclude_dims=exclude_dims) | ||
join=join, exclude_dims=exclude_dims, | ||
keep_attrs=keep_attrs) | ||
elif any(isinstance(a, Variable) for a in args): | ||
return variables_ufunc(*args) | ||
else: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1097,10 +1097,27 @@ def fillna(self, value): | |
if utils.is_dict_like(value): | ||
raise TypeError('cannot provide fill value as a dictionary with ' | ||
'fillna on a DataArray') | ||
out = self._fillna(value) | ||
out.attrs = self.attrs | ||
out = ops.fillna(self, value) | ||
return out | ||
|
||
def combine_first(self, other): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'm not sure it's worth making a separate method here to save a few characters. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think either we have […] There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The fact that […] |
||
"""Combine two DataArray objects, with union of coordinates. | ||
|
||
This operation follows the normal broadcasting and alignment rules of | ||
``join='outer'``. Default to non-null values of array calling the | ||
method. Use np.nan to fill in vacant cells after alignment. | ||
|
||
Parameters | ||
---------- | ||
other : DataArray | ||
Used to fill all matching missing values in this array. | ||
|
||
Returns | ||
------- | ||
DataArray | ||
""" | ||
return ops.fillna(self, other, join="outer") | ||
|
||
def reduce(self, func, dim=None, axis=None, keep_attrs=False, **kwargs): | ||
"""Reduce this array by applying `func` along some dimension(s). | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1955,8 +1955,31 @@ def fillna(self, value): | |
------- | ||
Dataset | ||
""" | ||
out = self._fillna(value) | ||
out._copy_attrs_from(self) | ||
if utils.is_dict_like(value): | ||
value_keys = getattr(value, 'data_vars', value).keys() | ||
if not set(value_keys) <= set(self.data_vars.keys()): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think we will need to separate the […]
The latter should probably be renamed to something like […] |
||
raise ValueError('all variables in the argument to `fillna` ' | ||
'must be contained in the original dataset') | ||
out = ops.fillna(self, value) | ||
return out | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This would be a good time to remove all the […] |
||
|
||
def combine_first(self, other):
    """Combine two Datasets, default to data_vars of self.

    The new coordinates follow the normal broadcasting and alignment rules
    of ``join='outer'``.  Vacant cells in the expanded coordinates are
    filled with np.nan.

    Parameters
    ----------
    other : Dataset
        Used to fill all matching missing values in this dataset.

    Returns
    -------
    Dataset
    """
    # join="outer" controls index alignment; dataset_join="outer" controls
    # how mismatched data variables are joined.  Both are handed to
    # ops.fillna unchanged.
    return ops.fillna(self, other, join="outer", dataset_join="outer")
|
||
def reduce(self, func, dim=None, keep_attrs=False, numeric_only=False, | ||
|
@@ -2288,7 +2311,7 @@ def func(self, *args, **kwargs): | |
return func | ||
|
||
@staticmethod | ||
def _binary_op(f, reflexive=False, join=None, fillna=False): | ||
def _binary_op(f, reflexive=False, join=None): | ||
@functools.wraps(f) | ||
def func(self, other): | ||
if isinstance(other, groupby.GroupBy): | ||
|
@@ -2297,8 +2320,7 @@ def func(self, other): | |
if hasattr(other, 'indexes'): | ||
self, other = align(self, other, join=align_type, copy=False) | ||
g = f if not reflexive else lambda x, y: f(y, x) | ||
ds = self._calculate_binary_op(g, other, join=align_type, | ||
fillna=fillna) | ||
ds = self._calculate_binary_op(g, other, join=align_type) | ||
return ds | ||
return func | ||
|
||
|
@@ -2321,14 +2343,9 @@ def func(self, other): | |
return func | ||
|
||
def _calculate_binary_op(self, f, other, join='inner', | ||
inplace=False, fillna=False): | ||
inplace=False): | ||
|
||
def apply_over_both(lhs_data_vars, rhs_data_vars, lhs_vars, rhs_vars): | ||
if fillna and join != 'left': | ||
raise ValueError('`fillna` must be accompanied by left join') | ||
if fillna and not set(rhs_data_vars) <= set(lhs_data_vars): | ||
raise ValueError('all variables in the argument to `fillna` ' | ||
'must be contained in the original dataset') | ||
if inplace and set(lhs_data_vars) != set(rhs_data_vars): | ||
raise ValueError('datasets must have the same data variables ' | ||
'for in-place arithmetic operations: %s, %s' | ||
|
@@ -2340,12 +2357,10 @@ def apply_over_both(lhs_data_vars, rhs_data_vars, lhs_vars, rhs_vars): | |
if k in rhs_data_vars: | ||
dest_vars[k] = f(lhs_vars[k], rhs_vars[k]) | ||
elif join in ["left", "outer"]: | ||
dest_vars[k] = (lhs_vars[k] if fillna else | ||
f(lhs_vars[k], np.nan)) | ||
dest_vars[k] = f(lhs_vars[k], np.nan) | ||
for k in rhs_data_vars: | ||
if k not in dest_vars and join in ["right", "outer"]: | ||
dest_vars[k] = (rhs_vars[k] if fillna else | ||
f(rhs_vars[k], np.nan)) | ||
dest_vars[k] = f(rhs_vars[k], np.nan) | ||
return dest_vars | ||
|
||
if utils.is_dict_like(other) and not isinstance(other, Dataset): | ||
|
@@ -2372,7 +2387,8 @@ def apply_over_both(lhs_data_vars, rhs_data_vars, lhs_vars, rhs_vars): | |
def _copy_attrs_from(self, other): | ||
self.attrs = other.attrs | ||
for v in other.variables: | ||
self.variables[v].attrs = other.variables[v].attrs | ||
if v in self.variables: | ||
self.variables[v].attrs = other.variables[v].attrs | ||
|
||
def diff(self, dim, n=1, label='upper'): | ||
"""Calculate the n-th order discrete difference along given axis. | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
just a note -- this will need to go in a new section for 0.9.1, since 0.9.0 will probably be released first.