diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index 0c62cca6093..7f561381b42 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -36,6 +36,12 @@ Documentation
 Enhancements
 ~~~~~~~~~~~~
 
+- min_count option is newly supported in :py:meth:`~xarray.DataArray.sum`,
+  :py:meth:`~xarray.DataArray.prod`, :py:meth:`~xarray.Dataset.sum` and
+  :py:meth:`~xarray.Dataset.prod`.
+  (:issue:`2230`)
+  By `Keisuke Fujii `_.
+
 - :py:meth:`plot()` now accepts the kwargs
   ``xscale, yscale, xlim, ylim, xticks, yticks`` just like Pandas. Also
   ``xincrease=False, yincrease=False`` now use matplotlib's axis inverting
   methods instead of setting limits.
   By `Deepak Cherian `_. (:issue:`2224`)
@@ -65,6 +71,9 @@ Bug fixes
 - Tests can be run in parallel with pytest-xdist
   By `Tony Tung `_.
 
+- Follow up the renaming in dask: ``dask.ghost`` is now ``dask.overlap``.
+  By `Keisuke Fujii `_.
+
 - Now raises a ValueError when there is a conflict between dimension names and
   level names of MultiIndex. (:issue:`2299`)
   By `Keisuke Fujii `_.
diff --git a/xarray/core/common.py b/xarray/core/common.py
index 3f934fcc769..55aca5f557f 100644
--- a/xarray/core/common.py
+++ b/xarray/core/common.py
@@ -2,6 +2,7 @@
 import warnings
 from distutils.version import LooseVersion
+from textwrap import dedent
 
 import numpy as np
 import pandas as pd
@@ -27,20 +28,20 @@ def wrapped_func(self, dim=None, axis=None, keep_attrs=False,
                          allow_lazy=True, **kwargs)
             return wrapped_func
 
-    _reduce_extra_args_docstring = \
-        """dim : str or sequence of str, optional
+    _reduce_extra_args_docstring = dedent("""\
+        dim : str or sequence of str, optional
             Dimension(s) over which to apply `{name}`.
         axis : int or sequence of int, optional
             Axis(es) over which to apply `{name}`. Only one of the 'dim'
             and 'axis' arguments can be supplied. If neither are supplied, then
-            `{name}` is calculated over axes."""
+            `{name}` is calculated over axes.""")
 
-    _cum_extra_args_docstring = \
-        """dim : str or sequence of str, optional
+    _cum_extra_args_docstring = dedent("""\
+        dim : str or sequence of str, optional
             Dimension over which to apply `{name}`.
         axis : int or sequence of int, optional
             Axis over which to apply `{name}`. Only one of the 'dim'
-            and 'axis' arguments can be supplied."""
+            and 'axis' arguments can be supplied.""")
 
 
 class ImplementsDatasetReduce(object):
@@ -308,12 +309,12 @@ def assign_coords(self, **kwargs):
         assigned : same type as caller
             A new object with the new coordinates in addition to the existing
             data.
-
+
         Examples
         --------
-
+
         Convert longitude coordinates from 0-359 to -180-179:
-
+
         >>> da = xr.DataArray(np.random.rand(4),
         ...                   coords=[np.array([358, 359, 0, 1])],
         ...                   dims='lon')
@@ -445,11 +446,11 @@ def groupby(self, group, squeeze=True):
         grouped : GroupBy
             A `GroupBy` object patterned after `pandas.GroupBy` that can be
             iterated over in the form of `(unique_value, grouped_array)` pairs.
-
+
         Examples
         --------
         Calculate daily anomalies for daily data:
-
+
         >>> da = xr.DataArray(np.linspace(0, 1826, num=1827),
         ...                   coords=[pd.date_range('1/1/2000', '31/12/2004',
         ...                   freq='D')],
         ...                   dims='time')
@@ -465,7 +466,7 @@ def groupby(self, group, squeeze=True):
         Coordinates:
           * time       (time) datetime64[ns] 2000-01-01 2000-01-02 2000-01-03 ...
             dayofyear  (time) int64 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 ...
-
+
         See Also
         --------
         core.groupby.DataArrayGroupBy
@@ -589,7 +590,7 @@ def resample(self, freq=None, dim=None, how=None, skipna=None,
                  closed=None, label=None, base=0, keep_attrs=False,
                  **indexer):
         """Returns a Resample object for performing resampling operations.
 
-        Handles both downsampling and upsampling. If any intervals contain no
+        Handles both downsampling and upsampling. If any intervals contain no
         values from the original object, they will be given the value ``NaN``.
 
         Parameters
         ----------
@@ -616,11 +617,11 @@ def resample(self, freq=None, dim=None, how=None, skipna=None,
         -------
         resampled : same type as caller
             This object resampled.
-
+
         Examples
         --------
         Downsample monthly time-series data to seasonal data:
-
+
         >>> da = xr.DataArray(np.linspace(0, 11, num=12),
         ...                   coords=[pd.date_range('15/12/1999',
         ...                   periods=12, freq=pd.DateOffset(months=1))],
         ...                   dims='time')
@@ -637,13 +638,13 @@ def resample(self, freq=None, dim=None, how=None, skipna=None,
           * time     (time) datetime64[ns] 1999-12-01 2000-03-01 2000-06-01 2000-09-01
 
         Upsample monthly time-series data to daily data:
-
+
         >>> da.resample(time='1D').interpolate('linear')
         array([ 0. , 0.032258, 0.064516, ..., 10.935484, 10.967742, 11. ])
         Coordinates:
           * time     (time) datetime64[ns] 1999-12-15 1999-12-16 1999-12-17 ...
-
+
         References
         ----------
@@ -957,8 +958,8 @@ def contains_cftime_datetimes(var):
         sample = sample.item()
         return isinstance(sample, cftime_datetime)
     else:
-        return False
-
+        return False
+
 
 def _contains_datetime_like_objects(var):
     """Check if a variable contains datetime like objects (either
diff --git a/xarray/core/dtypes.py b/xarray/core/dtypes.py
index 7326b936e2e..7ad44472f06 100644
--- a/xarray/core/dtypes.py
+++ b/xarray/core/dtypes.py
@@ -98,6 +98,9 @@ def maybe_promote(dtype):
     return np.dtype(dtype), fill_value
 
 
+NAT_TYPES = (np.datetime64('NaT'), np.timedelta64('NaT'))
+
+
 def get_fill_value(dtype):
     """Return an appropriate fill value for this dtype.
diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py
index 3bd105064da..17eb310f8db 100644
--- a/xarray/core/duck_array_ops.py
+++ b/xarray/core/duck_array_ops.py
@@ -17,14 +17,6 @@
 from .nputils import nanfirst, nanlast
 from .pycompat import dask_array_type
 
-try:
-    import bottleneck as bn
-    has_bottleneck = True
-except ImportError:
-    # use numpy methods instead
-    bn = np
-    has_bottleneck = False
-
 try:
     import dask.array as dask_array
     from . import dask_array_compat
@@ -175,7 +167,7 @@ def array_notnull_equiv(arr1, arr2):
 def count(data, axis=None):
     """Count the number of non-NA in this array along the given axis or axes
     """
-    return sum(~isnull(data), axis=axis)
+    return np.sum(~isnull(data), axis=axis)
 
 
 def where(condition, x, y):
@@ -213,159 +205,69 @@ def _ignore_warnings_if(condition):
         yield
 
 
-def _nansum_object(value, axis=None, **kwargs):
-    """ In house nansum for object array """
-    value = fillna(value, 0)
-    return _dask_or_eager_func('sum')(value, axis=axis, **kwargs)
-
-
-def _nan_minmax_object(func, get_fill_value, value, axis=None, **kwargs):
-    """ In house nanmin and nanmax for object array """
-    fill_value = get_fill_value(value.dtype)
-    valid_count = count(value, axis=axis)
-    filled_value = fillna(value, fill_value)
-    data = _dask_or_eager_func(func)(filled_value, axis=axis, **kwargs)
-    if not hasattr(data, 'dtype'):  # scalar case
-        data = dtypes.fill_value(value.dtype) if valid_count == 0 else data
-        return np.array(data, dtype=value.dtype)
-    return where_method(data, valid_count != 0)
-
-
-def _nan_argminmax_object(func, get_fill_value, value, axis=None, **kwargs):
-    """ In house nanargmin, nanargmax for object arrays. Always return integer
-    type """
-    fill_value = get_fill_value(value.dtype)
-    valid_count = count(value, axis=axis)
-    value = fillna(value, fill_value)
-    data = _dask_or_eager_func(func)(value, axis=axis, **kwargs)
-    # dask seems return non-integer type
-    if isinstance(value, dask_array_type):
-        data = data.astype(int)
-
-    if (valid_count == 0).any():
-        raise ValueError('All-NaN slice encountered')
-
-    return np.array(data, dtype=int)
-
-
-def _nanmean_ddof_object(ddof, value, axis=None, **kwargs):
-    """ In house nanmean. ddof argument will be used in _nanvar method """
-    valid_count = count(value, axis=axis)
-    value = fillna(value, 0)
-    # As dtype inference is impossible for object dtype, we assume float
-    # https://github.com/dask/dask/issues/3162
-    dtype = kwargs.pop('dtype', None)
-    if dtype is None and value.dtype.kind == 'O':
-        dtype = value.dtype if value.dtype.kind in ['cf'] else float
-
-    data = _dask_or_eager_func('sum')(value, axis=axis, dtype=dtype, **kwargs)
-    data = data / (valid_count - ddof)
-    return where_method(data, valid_count != 0)
-
-
-def _nanvar_object(value, axis=None, **kwargs):
-    ddof = kwargs.pop('ddof', 0)
-    kwargs_mean = kwargs.copy()
-    kwargs_mean.pop('keepdims', None)
-    value_mean = _nanmean_ddof_object(ddof=0, value=value, axis=axis,
-                                      keepdims=True, **kwargs_mean)
-    squared = (value.astype(value_mean.dtype) - value_mean)**2
-    return _nanmean_ddof_object(ddof, squared, axis=axis, **kwargs)
-
-
-_nan_object_funcs = {
-    'sum': _nansum_object,
-    'min': partial(_nan_minmax_object, 'min', dtypes.get_pos_infinity),
-    'max': partial(_nan_minmax_object, 'max', dtypes.get_neg_infinity),
-    'argmin': partial(_nan_argminmax_object, 'argmin',
-                      dtypes.get_pos_infinity),
-    'argmax': partial(_nan_argminmax_object, 'argmax',
-                      dtypes.get_neg_infinity),
-    'mean': partial(_nanmean_ddof_object, 0),
-    'var': _nanvar_object,
-}
-
-
-def _create_nan_agg_method(name, numeric_only=False, np_compat=False,
-                           no_bottleneck=False, coerce_strings=False):
+def _create_nan_agg_method(name, coerce_strings=False):
+    from . import nanops
+
     def f(values, axis=None, skipna=None, **kwargs):
         if kwargs.pop('out', None) is not None:
             raise TypeError('`out` is not valid for {}'.format(name))
 
-        # If dtype is supplied, we use numpy's method.
-        dtype = kwargs.get('dtype', None)
         values = asarray(values)
 
-        # dask requires dtype argument for object dtype
-        if (values.dtype == 'object' and name in ['sum', ]):
-            kwargs['dtype'] = values.dtype if dtype is None else dtype
-
         if coerce_strings and values.dtype.kind in 'SU':
             values = values.astype(object)
 
+        func = None
         if skipna or (skipna is None and values.dtype.kind in 'cfO'):
-            if values.dtype.kind not in ['u', 'i', 'f', 'c']:
-                func = _nan_object_funcs.get(name, None)
-                using_numpy_nan_func = True
-                if func is None or values.dtype.kind not in 'Ob':
-                    raise NotImplementedError(
-                        'skipna=True not yet implemented for %s with dtype %s'
-                        % (name, values.dtype))
-            else:
-                nanname = 'nan' + name
-                if (isinstance(axis, tuple) or not values.dtype.isnative or
-                        no_bottleneck or (dtype is not None and
-                                          np.dtype(dtype) != values.dtype)):
-                    # bottleneck can't handle multiple axis arguments or
-                    # non-native endianness
-                    if np_compat:
-                        eager_module = npcompat
-                    else:
-                        eager_module = np
-                else:
-                    kwargs.pop('dtype', None)
-                    eager_module = bn
-                func = _dask_or_eager_func(nanname, eager_module)
-                using_numpy_nan_func = (eager_module is np or
-                                        eager_module is npcompat)
+            nanname = 'nan' + name
+            func = getattr(nanops, nanname)
         else:
             func = _dask_or_eager_func(name)
-            using_numpy_nan_func = False
-        with _ignore_warnings_if(using_numpy_nan_func):
-            try:
-                return func(values, axis=axis, **kwargs)
-            except AttributeError:
-                if isinstance(values, dask_array_type):
-                    try:  # dask/dask#3133 dask sometimes needs dtype argument
-                        return func(values, axis=axis, dtype=values.dtype,
-                                    **kwargs)
-                    except AttributeError:
-                        msg = '%s is not yet implemented on dask arrays' % name
-                else:
-                    assert using_numpy_nan_func
-                    msg = ('%s is not available with skipna=False with the '
-                           'installed version of numpy; upgrade to numpy 1.12 '
-                           'or newer to use skipna=True or skipna=None' % name)
-                raise NotImplementedError(msg)
-    f.numeric_only = numeric_only
+
+        try:
+            return func(values, axis=axis, **kwargs)
+        except AttributeError:
+            if isinstance(values, dask_array_type):
+                try:  # dask/dask#3133 dask sometimes needs dtype argument
+                    # if func does not accept dtype, it raises TypeError
+                    return func(values, axis=axis, dtype=values.dtype,
+                                **kwargs)
+                except (AttributeError, TypeError):
+                    msg = '%s is not yet implemented on dask arrays' % name
+            else:
+                msg = ('%s is not available with skipna=False with the '
+                       'installed version of numpy; upgrade to numpy 1.12 '
+                       'or newer to use skipna=True or skipna=None' % name)
+            raise NotImplementedError(msg)
+
+    f.__name__ = name
     return f
 
 
+# The attributes `numeric_only` and `available_min_count` are used for docs;
+# see ops.inject_reduce_methods
 argmax = _create_nan_agg_method('argmax', coerce_strings=True)
 argmin = _create_nan_agg_method('argmin', coerce_strings=True)
 max = _create_nan_agg_method('max', coerce_strings=True)
 min = _create_nan_agg_method('min', coerce_strings=True)
-sum = _create_nan_agg_method('sum', numeric_only=True)
-mean = _create_nan_agg_method('mean', numeric_only=True)
-std = _create_nan_agg_method('std', numeric_only=True)
-var = _create_nan_agg_method('var', numeric_only=True)
-median = _create_nan_agg_method('median', numeric_only=True)
-prod = _create_nan_agg_method('prod', numeric_only=True, no_bottleneck=True)
-cumprod_1d = _create_nan_agg_method(
-    'cumprod', numeric_only=True, no_bottleneck=True)
-cumsum_1d = _create_nan_agg_method(
-    'cumsum', numeric_only=True, no_bottleneck=True)
+sum = _create_nan_agg_method('sum')
+sum.numeric_only = True
+sum.available_min_count = True
+mean = _create_nan_agg_method('mean')
+mean.numeric_only = True
+std = _create_nan_agg_method('std')
+std.numeric_only = True
+var = _create_nan_agg_method('var')
+var.numeric_only = True
+median = _create_nan_agg_method('median')
+median.numeric_only = True
+prod = _create_nan_agg_method('prod')
+prod.numeric_only = True
+prod.available_min_count = True
+cumprod_1d = _create_nan_agg_method('cumprod')
+cumprod_1d.numeric_only = True
+cumsum_1d = _create_nan_agg_method('cumsum')
+cumsum_1d.numeric_only = True
 
 
 def _nd_cum_func(cum_func, array, axis, **kwargs):
diff --git a/xarray/core/nanops.py b/xarray/core/nanops.py
new file mode 100644
index 00000000000..2309ed9619d
--- /dev/null
+++ b/xarray/core/nanops.py
@@ -0,0 +1,208 @@
+from __future__ import absolute_import, division, print_function
+
+import numpy as np
+
+from . import dtypes
+from .pycompat import dask_array_type
+from .duck_array_ops import (count, isnull, fillna, where_method,
+                             _dask_or_eager_func)
+from . import nputils
+
+try:
+    import dask.array as dask_array
+except ImportError:
+    dask_array = None
+
+
+def _replace_nan(a, val):
+    """
+    Replace NaN in a by val, and return the replaced array and the NaN
+    positions
+    """
+    mask = isnull(a)
+    return where_method(val, mask, a), mask
+
+
+def _maybe_null_out(result, axis, mask, min_count=1):
+    """
+    xarray version of pandas.core.nanops._maybe_null_out
+    """
+    if hasattr(axis, '__len__'):  # if tuple or list
+        raise ValueError('min_count is not available for reduction '
+                         'with more than one dimension.')
+
+    if axis is not None and getattr(result, 'ndim', False):
+        null_mask = (mask.shape[axis] - mask.sum(axis) - min_count) < 0
+        if null_mask.any():
+            dtype, fill_value = dtypes.maybe_promote(result.dtype)
+            result = result.astype(dtype)
+            result[null_mask] = fill_value
+
+    elif getattr(result, 'dtype', None) not in dtypes.NAT_TYPES:
+        null_mask = mask.size - mask.sum()
+        if null_mask < min_count:
+            result = np.nan
+
+    return result
+
+
+def _nan_argminmax_object(func, fill_value, value, axis=None, **kwargs):
+    """ In house nanargmin, nanargmax for object arrays. Always return integer
+    type
+    """
+    valid_count = count(value, axis=axis)
+    value = fillna(value, fill_value)
+    data = _dask_or_eager_func(func)(value, axis=axis, **kwargs)
+
+    # TODO This will evaluate dask arrays and might be costly.
+    if (valid_count == 0).any():
+        raise ValueError('All-NaN slice encountered')
+
+    return data
+
+
+def _nan_minmax_object(func, fill_value, value, axis=None, **kwargs):
+    """ In house nanmin and nanmax for object array """
+    valid_count = count(value, axis=axis)
+    filled_value = fillna(value, fill_value)
+    data = getattr(np, func)(filled_value, axis=axis, **kwargs)
+    if not hasattr(data, 'dtype'):  # scalar case
+        data = dtypes.fill_value(value.dtype) if valid_count == 0 else data
+        return np.array(data, dtype=value.dtype)
+    return where_method(data, valid_count != 0)
+
+
+def nanmin(a, axis=None, out=None):
+    if a.dtype.kind == 'O':
+        return _nan_minmax_object(
+            'min', dtypes.get_pos_infinity(a.dtype), a, axis)
+
+    module = dask_array if isinstance(a, dask_array_type) else nputils
+    return module.nanmin(a, axis=axis)
+
+
+def nanmax(a, axis=None, out=None):
+    if a.dtype.kind == 'O':
+        return _nan_minmax_object(
+            'max', dtypes.get_neg_infinity(a.dtype), a, axis)
+
+    module = dask_array if isinstance(a, dask_array_type) else nputils
+    return module.nanmax(a, axis=axis)
+
+
+def nanargmin(a, axis=None):
+    fill_value = dtypes.get_pos_infinity(a.dtype)
+    if a.dtype.kind == 'O':
+        return _nan_argminmax_object('argmin', fill_value, a, axis=axis)
+    a, mask = _replace_nan(a, fill_value)
+    if isinstance(a, dask_array_type):
+        res = dask_array.argmin(a, axis=axis)
+    else:
+        res = np.argmin(a, axis=axis)
+
+    if mask is not None:
+        mask = mask.all(axis=axis)
+        if mask.any():
+            raise ValueError("All-NaN slice encountered")
+    return res
+
+
+def nanargmax(a, axis=None):
+    fill_value = dtypes.get_neg_infinity(a.dtype)
+    if a.dtype.kind == 'O':
+        return _nan_argminmax_object('argmax', fill_value, a, axis=axis)
+
+    a, mask = _replace_nan(a, fill_value)
+    if isinstance(a, dask_array_type):
+        res = dask_array.argmax(a, axis=axis)
+    else:
+        res = np.argmax(a, axis=axis)
+
+    if mask is not None:
+        mask = mask.all(axis=axis)
+        if mask.any():
+            raise ValueError("All-NaN slice encountered")
+    return res
+
+
+def nansum(a, axis=None, dtype=None, out=None, min_count=None):
+    a, mask = _replace_nan(a, 0)
+    result = _dask_or_eager_func('sum')(a, axis=axis, dtype=dtype)
+    if min_count is not None:
+        return _maybe_null_out(result, axis, mask, min_count)
+    else:
+        return result
+
+
+def _nanmean_ddof_object(ddof, value, axis=None, **kwargs):
+    """ In house nanmean. ddof argument will be used in _nanvar method """
+    from .duck_array_ops import (count, fillna, _dask_or_eager_func,
+                                 where_method)
+
+    valid_count = count(value, axis=axis)
+    value = fillna(value, 0)
+    # As dtype inference is impossible for object dtype, we assume float
+    # https://github.com/dask/dask/issues/3162
+    dtype = kwargs.pop('dtype', None)
+    if dtype is None and value.dtype.kind == 'O':
+        dtype = value.dtype if value.dtype.kind in ['cf'] else float
+
+    data = _dask_or_eager_func('sum')(value, axis=axis, dtype=dtype, **kwargs)
+    data = data / (valid_count - ddof)
+    return where_method(data, valid_count != 0)
+
+
+def nanmean(a, axis=None, dtype=None, out=None):
+    if a.dtype.kind == 'O':
+        return _nanmean_ddof_object(0, a, axis=axis, dtype=dtype)
+
+    if isinstance(a, dask_array_type):
+        return dask_array.nanmean(a, axis=axis, dtype=dtype)
+
+    return np.nanmean(a, axis=axis, dtype=dtype)
+
+
+def nanmedian(a, axis=None, out=None):
+    return _dask_or_eager_func('nanmedian', eager_module=nputils)(a, axis=axis)
+
+
+def _nanvar_object(value, axis=None, **kwargs):
+    ddof = kwargs.pop('ddof', 0)
+    kwargs_mean = kwargs.copy()
+    kwargs_mean.pop('keepdims', None)
+    value_mean = _nanmean_ddof_object(ddof=0, value=value, axis=axis,
+                                      keepdims=True, **kwargs_mean)
+    squared = (value.astype(value_mean.dtype) - value_mean)**2
+    return _nanmean_ddof_object(ddof, squared, axis=axis, **kwargs)
+
+
+def nanvar(a, axis=None, dtype=None, out=None, ddof=0):
+    if a.dtype.kind == 'O':
+        return _nanvar_object(a, axis=axis, dtype=dtype, ddof=ddof)
+
+    return _dask_or_eager_func('nanvar', eager_module=nputils)(
+        a, axis=axis, dtype=dtype, ddof=ddof)
+
+
+def nanstd(a, axis=None, dtype=None, out=None):
+    return _dask_or_eager_func('nanstd', eager_module=nputils)(
+        a, axis=axis, dtype=dtype)
+
+
+def nanprod(a, axis=None, dtype=None, out=None, min_count=None):
+    a, mask = _replace_nan(a, 1)
+    result = _dask_or_eager_func('nanprod')(a, axis=axis, dtype=dtype, out=out)
+    if min_count is not None:
+        return _maybe_null_out(result, axis, mask, min_count)
+    else:
+        return result
+
+
+def nancumsum(a, axis=None, dtype=None, out=None):
+    return _dask_or_eager_func('nancumsum', eager_module=nputils)(
+        a, axis=axis, dtype=dtype)
+
+
+def nancumprod(a, axis=None, dtype=None, out=None):
+    return _dask_or_eager_func('nancumprod', eager_module=nputils)(
+        a, axis=axis, dtype=dtype)
diff --git a/xarray/core/nputils.py b/xarray/core/nputils.py
index 6df2d34bfe3..a8d596abd86 100644
--- a/xarray/core/nputils.py
+++ b/xarray/core/nputils.py
@@ -5,6 +5,14 @@
 import numpy as np
 import pandas as pd
 
+try:
+    import bottleneck as bn
+    _USE_BOTTLENECK = True
+except ImportError:
+    # use numpy methods instead
+    bn = np
+    _USE_BOTTLENECK = False
+
 
 def _validate_axis(data, axis):
     ndim = data.ndim
@@ -195,3 +203,36 @@ def _rolling_window(a, window, axis=-1):
     rolling = np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides,
                                               writeable=False)
     return np.swapaxes(rolling, -2, axis)
+
+
+def _create_bottleneck_method(name, npmodule=np):
+    def f(values, axis=None, **kwds):
+        dtype = kwds.get('dtype', None)
+        bn_func = getattr(bn, name, None)
+
+        if (_USE_BOTTLENECK and bn_func is not None and
+                not isinstance(axis, tuple) and
+                values.dtype.kind in 'uifc' and
+                values.dtype.isnative and
+                (dtype is None or np.dtype(dtype) == values.dtype)):
+            # bottleneck does not take care of dtype or min_count
+            kwds.pop('dtype', None)
+            result = bn_func(values, axis=axis, **kwds)
+        else:
+            result = getattr(npmodule, name)(values, axis=axis, **kwds)
+
+        return result
+
+    f.__name__ = name
+    return f
+
+
+nanmin = _create_bottleneck_method('nanmin')
+nanmax = _create_bottleneck_method('nanmax')
+nanmean = _create_bottleneck_method('nanmean')
+nanmedian = _create_bottleneck_method('nanmedian')
+nanvar = _create_bottleneck_method('nanvar')
+nanstd = _create_bottleneck_method('nanstd')
+nanprod = _create_bottleneck_method('nanprod')
+nancumsum = _create_bottleneck_method('nancumsum')
+nancumprod = _create_bottleneck_method('nancumprod')
diff --git a/xarray/core/ops.py b/xarray/core/ops.py
index d9e8ceb65d5..a0dd2212a8f 100644
--- a/xarray/core/ops.py
+++ b/xarray/core/ops.py
@@ -86,7 +86,7 @@
         If True, skip missing values (as marked by NaN). By default, only
         skips missing values for float dtypes; other dtypes either do not
         have a sentinel missing value (int) or skipna=True has not been
-        implemented (object, datetime64 or timedelta64).
+        implemented (object, datetime64 or timedelta64).{min_count_docs}
     keep_attrs : bool, optional
         If True, the attributes (`attrs`) will be copied from the original
         object to the new one. If False (default), the new object will be
@@ -102,6 +102,12 @@
         indicated dimension(s) removed.
 """
 
+_MINCOUNT_DOCSTRING = """
+min_count : int, default None
+    The required number of valid values to perform the operation.
+    If fewer than min_count non-NA values are present the result will
+    be NA. New in version 0.10.8: Added with the default being None."""
+
 _ROLLING_REDUCE_DOCSTRING_TEMPLATE = """\
 Reduce this {da_or_ds}'s data windows by applying `{name}` along its dimension.
 
@@ -236,11 +242,15 @@ def inject_reduce_methods(cls):
                 [('count', duck_array_ops.count, False)])
     for name, f, include_skipna in methods:
         numeric_only = getattr(f, 'numeric_only', False)
+        available_min_count = getattr(f, 'available_min_count', False)
+        min_count_docs = _MINCOUNT_DOCSTRING if available_min_count else ''
+
         func = cls._reduce_method(f, include_skipna, numeric_only)
         func.__name__ = name
         func.__doc__ = _REDUCE_DOCSTRING_TEMPLATE.format(
             name=name, cls=cls.__name__,
-            extra_args=cls._reduce_extra_args_docstring.format(name=name))
+            extra_args=cls._reduce_extra_args_docstring.format(name=name),
+            min_count_docs=min_count_docs)
         setattr(cls, name, func)
diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py
index 2950e97cc75..ec431266c76 100644
--- a/xarray/tests/test_dataarray.py
+++ b/xarray/tests/test_dataarray.py
@@ -3329,7 +3329,9 @@ def test_isin(da):
 
 def test_rolling_iter(da):
     rolling_obj = da.rolling(time=7)
-    rolling_obj_mean = rolling_obj.mean()
+    with warnings.catch_warnings():
+        warnings.filterwarnings('ignore', 'Mean of empty slice')
+        rolling_obj_mean = rolling_obj.mean()
 
     assert len(rolling_obj.window_labels) == len(da['time'])
     assert_identical(rolling_obj.window_labels, da['time'])
@@ -3337,8 +3339,10 @@ def test_rolling_iter(da):
     for i, (label, window_da) in enumerate(rolling_obj):
         assert label == da['time'].isel(time=i)
 
-        actual = rolling_obj_mean.isel(time=i)
-        expected = window_da.mean('time')
+        with warnings.catch_warnings():
+            warnings.filterwarnings('ignore', 'Mean of empty slice')
+            actual = rolling_obj_mean.isel(time=i)
+            expected = window_da.mean('time')
 
         # TODO add assert_allclose_with_nan, which compares nan position
         # as well as the closeness of the values.
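
The behaviour added by ``min_count`` (documented in the ops.py hunk above) mirrors
pandas: a reduction returns NA unless at least ``min_count`` valid values are
present along the reduced dimension. A minimal usage sketch, assuming this patch
is applied; the commented values are what the new nansum/_maybe_null_out path
produces:

import numpy as np
import xarray as xr

da = xr.DataArray([np.nan, np.nan, 1.0], dims='x')
print(da.sum(dim='x').values)               # 1.0 -- skipna alone keeps the lone valid value
print(da.sum(dim='x', min_count=2).values)  # nan -- fewer than two valid values present
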
diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py
index 08d71d462d8..c1e3713af9c 100644
--- a/xarray/tests/test_dataset.py
+++ b/xarray/tests/test_dataset.py
@@ -2727,6 +2727,20 @@ def test_resample_and_first(self):
             result = actual.reduce(method)
             assert_equal(expected, result)
 
+    def test_resample_min_count(self):
+        times = pd.date_range('2000-01-01', freq='6H', periods=10)
+        ds = Dataset({'foo': (['time', 'x', 'y'], np.random.randn(10, 5, 3)),
+                      'bar': ('time', np.random.randn(10), {'meta': 'data'}),
+                      'time': times})
+        # inject nan
+        ds['foo'] = xr.where(ds['foo'] > 2.0, np.nan, ds['foo'])
+
+        actual = ds.resample(time='1D').sum(min_count=1)
+        expected = xr.concat([
+            ds.isel(time=slice(i * 4, (i + 1) * 4)).sum('time', min_count=1)
+            for i in range(3)], dim=actual['time'])
+        assert_equal(expected, actual)
+
     def test_resample_by_mean_with_keep_attrs(self):
         times = pd.date_range('2000-01-01', freq='6H', periods=10)
         ds = Dataset({'foo': (['time', 'x', 'y'], np.random.randn(10, 5, 3)),
@@ -3365,7 +3379,6 @@ def test_reduce(self):
                               (('dim2', 'time'), ['dim1', 'dim3']),
                               ((), ['dim1', 'dim2', 'dim3', 'time'])]:
             actual = data.min(dim=reduct).dims
-            print(reduct, actual, expected)
             self.assertItemsEqual(actual, expected)
 
         assert_equal(data.mean(dim=[]), data)
@@ -3420,7 +3433,6 @@ def test_reduce_cumsum_test_dims(self):
                 ('time', ['dim1', 'dim2', 'dim3'])
         ]:
             actual = getattr(data, cumfunc)(dim=reduct).dims
-            print(reduct, actual, expected)
             self.assertItemsEqual(actual, expected)
 
     def test_reduce_non_numeric(self):
diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py
index f3f93491822..3f32fc49fd2 100644
--- a/xarray/tests/test_duck_array_ops.py
+++ b/xarray/tests/test_duck_array_ops.py
@@ -1,12 +1,16 @@
 from __future__ import absolute_import, division, print_function
 
+from distutils.version import LooseVersion
+
 import numpy as np
+import pandas as pd
 import pytest
+from textwrap import dedent
 from numpy import array, nan
 import warnings
 
-from xarray import DataArray, concat
-from xarray.core import duck_array_ops
+from xarray import DataArray, Dataset, concat
+from xarray.core import duck_array_ops, dtypes
 from xarray.core.duck_array_ops import (
     array_notnull_equiv, concatenate, count, first, last, mean,
     rolling_window, stack, where)
@@ -100,7 +104,10 @@ def test_concatenate_type_promotion(self):
         assert_array_equal(result, np.array([1, 'b'], dtype=object))
 
     def test_all_nan_arrays(self):
-        assert np.isnan(mean([np.nan, np.nan]))
+        with warnings.catch_warnings():
+            warnings.filterwarnings('ignore', 'All-NaN slice')
+            warnings.filterwarnings('ignore', 'Mean of empty slice')
+            assert np.isnan(mean([np.nan, np.nan]))
 
 
 def test_cumsum_1d():
@@ -197,10 +204,15 @@ def construct_dataarray(dim_num, dtype, contains_nan, dask):
         array = rng.choice(['a', 'b', 'c', 'd'], size=shapes)
     else:
         raise ValueError
-    da = DataArray(array, dims=dims, coords={'x': np.arange(16)}, name='da')
 
     if contains_nan:
-        da = da.reindex(x=np.arange(20))
+        inds = rng.choice(range(array.size), int(array.size * 0.2))
+        dtype, fill_value = dtypes.maybe_promote(array.dtype)
+        array = array.astype(dtype)
+        array.flat[inds] = fill_value
+
+    da = DataArray(array, dims=dims, coords={'x': np.arange(16)}, name='da')
+
     if dask and has_dask:
         chunks = {d: 4 for d in dims}
         da = da.chunk(chunks)
@@ -234,10 +246,16 @@ def series_reduce(da, func, dim, **kwargs):
         return concat(da1, dim=d)
 
 
+def assert_dask_array(da, dask):
+    if dask and da.ndim > 0:
+        assert isinstance(da.data, dask_array_type)
+
+
 @pytest.mark.parametrize('dim_num', [1, 2])
 @pytest.mark.parametrize('dtype', [float, int, np.float32, np.bool_])
 @pytest.mark.parametrize('dask', [False, True])
 @pytest.mark.parametrize('func', ['sum', 'min', 'max', 'mean', 'var'])
+# TODO test cumsum, cumprod
 @pytest.mark.parametrize('skipna', [False, True])
 @pytest.mark.parametrize('aggdim', [None, 'x'])
 def test_reduce(dim_num, dtype, dask, func, skipna, aggdim):
@@ -251,6 +269,9 @@ def test_reduce(dim_num, dtype, dask, func, skipna, aggdim):
     if dask and not has_dask:
         pytest.skip('requires dask')
 
+    if dask and skipna is False and dtype in [np.bool_]:
+        pytest.skip('dask does not compute object-typed array')
+
     rtol = 1e-04 if dtype == np.float32 else 1e-05
 
     da = construct_dataarray(dim_num, dtype, contains_nan=True, dask=dask)
@@ -259,6 +280,7 @@ def test_reduce(dim_num, dtype, dask, func, skipna, aggdim):
     # TODO: remove these after resolving
     # https://github.com/dask/dask/issues/3245
     with warnings.catch_warnings():
+        warnings.filterwarnings('ignore', 'Mean of empty slice')
         warnings.filterwarnings('ignore', 'All-NaN slice')
         warnings.filterwarnings('ignore', 'invalid value encountered in')
 
@@ -272,6 +294,7 @@ def test_reduce(dim_num, dtype, dask, func, skipna, aggdim):
                 expected = getattr(np, func)(da.values, axis=axis)
 
                 actual = getattr(da, func)(skipna=skipna, dim=aggdim)
+                assert_dask_array(actual, dask)
                 assert np.allclose(actual.values, np.array(expected),
                                    rtol=1.0e-4, equal_nan=True)
             except (TypeError, AttributeError, ZeroDivisionError):
@@ -279,14 +302,21 @@ def test_reduce(dim_num, dtype, dask, func, skipna, aggdim):
                 # nanmean for object dtype
                 pass
 
-        # make sure the compatiblility with pandas' results.
         actual = getattr(da, func)(skipna=skipna, dim=aggdim)
+
+        # for dask case, make sure the result is the same for numpy backend
+        expected = getattr(da.compute(), func)(skipna=skipna, dim=aggdim)
+        assert_allclose(actual, expected, rtol=rtol)
+
+        # make sure the compatibility with pandas' results.
        if func == 'var':
             expected = series_reduce(da, func, skipna=skipna, dim=aggdim,
                                      ddof=0)
             assert_allclose(actual, expected, rtol=rtol)
 
             # also check ddof!=0 case
             actual = getattr(da, func)(skipna=skipna, dim=aggdim, ddof=5)
+            if dask:
+                assert isinstance(da.data, dask_array_type)
             expected = series_reduce(da, func, skipna=skipna, dim=aggdim,
                                      ddof=5)
             assert_allclose(actual, expected, rtol=rtol)
@@ -297,11 +327,14 @@ def test_reduce(dim_num, dtype, dask, func, skipna, aggdim):
         # make sure the dtype argument
         if func not in ['max', 'min']:
             actual = getattr(da, func)(skipna=skipna, dim=aggdim, dtype=float)
+            assert_dask_array(actual, dask)
             assert actual.dtype == float
 
         # without nan
         da = construct_dataarray(dim_num, dtype, contains_nan=False, dask=dask)
         actual = getattr(da, func)(skipna=skipna)
+        if dask:
+            assert isinstance(da.data, dask_array_type)
         expected = getattr(np, 'nan{}'.format(func))(da.values)
         if actual.dtype == object:
             assert actual.values == np.array(expected)
@@ -338,13 +371,6 @@ def test_argmin_max(dim_num, dtype, contains_nan, dask, func, skipna, aggdim):
     with warnings.catch_warnings():
         warnings.filterwarnings('ignore', 'All-NaN slice')
 
-        if aggdim == 'y' and contains_nan and skipna:
-            with pytest.raises(ValueError):
-                actual = da.isel(**{
-                    aggdim: getattr(da, 'arg' + func)(
-                        dim=aggdim, skipna=skipna).compute()})
-            return
-
         actual = da.isel(**{aggdim: getattr(da, 'arg' + func)
                             (dim=aggdim, skipna=skipna).compute()})
         expected = getattr(da, func)(dim=aggdim, skipna=skipna)
@@ -354,6 +380,7 @@ def test_argmin_max(dim_num, dtype, contains_nan, dask, func, skipna, aggdim):
 
 def test_argmin_max_error():
     da = construct_dataarray(2, np.bool_, contains_nan=True, dask=False)
+    da[0] = np.nan
     with pytest.raises(ValueError):
         da.argmin(dim='y')
@@ -388,3 +415,122 @@ def test_dask_rolling(axis, window, center):
         with pytest.raises(ValueError):
             rolling_window(dx, axis=axis, window=100, center=center,
                            fill_value=np.nan)
+
+
+@pytest.mark.parametrize('dim_num', [1, 2])
+@pytest.mark.parametrize('dtype', [float, int, np.float32, np.bool_])
+@pytest.mark.parametrize('dask', [False, True])
+@pytest.mark.parametrize('func', ['sum', 'prod'])
+@pytest.mark.parametrize('aggdim', [None, 'x'])
+def test_min_count(dim_num, dtype, dask, func, aggdim):
+    if dask and not has_dask:
+        pytest.skip('requires dask')
+
+    da = construct_dataarray(dim_num, dtype, contains_nan=True, dask=dask)
+    min_count = 3
+
+    actual = getattr(da, func)(dim=aggdim, skipna=True, min_count=min_count)
+
+    if LooseVersion(pd.__version__) >= LooseVersion('0.22.0'):
+        # min_count is only implemented in pandas >= 0.22
+        expected = series_reduce(da, func, skipna=True, dim=aggdim,
+                                 min_count=min_count)
+        assert_allclose(actual, expected)
+
+    assert_dask_array(actual, dask)
+
+
+@pytest.mark.parametrize('func', ['sum', 'prod'])
+def test_min_count_dataset(func):
+    da = construct_dataarray(2, dtype=float, contains_nan=True, dask=False)
+    ds = Dataset({'var1': da}, coords={'scalar': 0})
+    actual = getattr(ds, func)(dim='x', skipna=True, min_count=3)['var1']
+    expected = getattr(ds['var1'], func)(dim='x', skipna=True, min_count=3)
+    assert_allclose(actual, expected)
+
+
+@pytest.mark.parametrize('dtype', [float, int, np.float32, np.bool_])
+@pytest.mark.parametrize('dask', [False, True])
+@pytest.mark.parametrize('func', ['sum', 'prod'])
+def test_multiple_dims(dtype, dask, func):
+    if dask and not has_dask:
+        pytest.skip('requires dask')
+    da = construct_dataarray(3, dtype, contains_nan=True, dask=dask)
+
+    actual = getattr(da, func)(('x', 'y'))
+    expected = getattr(getattr(da, func)('x'), func)('y')
+    assert_allclose(actual, expected)
+
+
+def test_docs():
+    # with min_count
+    actual = DataArray.sum.__doc__
+    expected = dedent("""\
+        Reduce this DataArray's data by applying `sum` along some dimension(s).
+
+        Parameters
+        ----------
+        dim : str or sequence of str, optional
+            Dimension(s) over which to apply `sum`.
+        axis : int or sequence of int, optional
+            Axis(es) over which to apply `sum`. Only one of the 'dim'
+            and 'axis' arguments can be supplied. If neither are supplied, then
+            `sum` is calculated over axes.
+        skipna : bool, optional
+            If True, skip missing values (as marked by NaN). By default, only
+            skips missing values for float dtypes; other dtypes either do not
+            have a sentinel missing value (int) or skipna=True has not been
+            implemented (object, datetime64 or timedelta64).
+        min_count : int, default None
+            The required number of valid values to perform the operation.
+            If fewer than min_count non-NA values are present the result will
+            be NA. New in version 0.10.8: Added with the default being None.
+        keep_attrs : bool, optional
+            If True, the attributes (`attrs`) will be copied from the original
+            object to the new one. If False (default), the new object will be
+            returned without attributes.
+        **kwargs : dict
+            Additional keyword arguments passed on to the appropriate array
+            function for calculating `sum` on this object's data.
+
+        Returns
+        -------
+        reduced : DataArray
+            New DataArray object with `sum` applied to its data and the
+            indicated dimension(s) removed.
+        """)
+    assert actual == expected
+
+    # without min_count
+    actual = DataArray.std.__doc__
+    expected = dedent("""\
+        Reduce this DataArray's data by applying `std` along some dimension(s).
+
+        Parameters
+        ----------
+        dim : str or sequence of str, optional
+            Dimension(s) over which to apply `std`.
+        axis : int or sequence of int, optional
+            Axis(es) over which to apply `std`. Only one of the 'dim'
+            and 'axis' arguments can be supplied. If neither are supplied, then
+            `std` is calculated over axes.
+        skipna : bool, optional
+            If True, skip missing values (as marked by NaN). By default, only
+            skips missing values for float dtypes; other dtypes either do not
+            have a sentinel missing value (int) or skipna=True has not been
+            implemented (object, datetime64 or timedelta64).
+        keep_attrs : bool, optional
+            If True, the attributes (`attrs`) will be copied from the original
+            object to the new one. If False (default), the new object will be
+            returned without attributes.
+        **kwargs : dict
+            Additional keyword arguments passed on to the appropriate array
+            function for calculating `std` on this object's data.
+
+        Returns
+        -------
+        reduced : DataArray
+            New DataArray object with `std` applied to its data and the
+            indicated dimension(s) removed.
+        """)
+    assert actual == expected
diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py
index cdb578aff6c..3db5e6adc4b 100644
--- a/xarray/tests/test_variable.py
+++ b/xarray/tests/test_variable.py
@@ -1503,8 +1503,8 @@ def test_reduce_funcs(self):
         assert_identical(v.all(dim='x'), Variable([], False))
 
         v = Variable('t', pd.date_range('2000-01-01', periods=3))
-        with pytest.raises(NotImplementedError):
-            v.argmax(skipna=True)
+        assert v.argmax(skipna=True) == 2
+
         assert_identical(
             v.max(), Variable([], pd.Timestamp('2000-01-03')))
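
The nputils hunk above routes each nan-aggregation through bottleneck only when
that is safe: bottleneck is installed and implements the function, the input is a
native-endian numeric array, the reduction is over at most one axis, and no dtype
override is requested; everything else falls back to numpy. A standalone sketch of
that dispatch rule (nansum_dispatch is a hypothetical name; the patch itself wraps
this logic in _create_bottleneck_method):

import numpy as np

try:
    import bottleneck as bn
    _USE_BOTTLENECK = True
except ImportError:
    bn = np  # fall back to numpy's implementations
    _USE_BOTTLENECK = False


def nansum_dispatch(values, axis=None, dtype=None):
    # Use bottleneck only for a single axis, native-endian numeric data and
    # no dtype override; bottleneck does not handle dtype or min_count.
    bn_func = getattr(bn, 'nansum', None)
    if (_USE_BOTTLENECK and bn_func is not None and
            not isinstance(axis, tuple) and
            values.dtype.kind in 'uifc' and
            values.dtype.isnative and
            (dtype is None or np.dtype(dtype) == values.dtype)):
        return bn_func(values, axis=axis)
    return np.nansum(values, axis=axis, dtype=dtype)


print(nansum_dispatch(np.array([1.0, np.nan, 2.0])))  # 3.0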