diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 0a3f949b496..bc5a5bb5ea4 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -43,7 +43,16 @@ Documentation Enhancements ~~~~~~~~~~~~ -- reduce methods such as :py:func:`DataArray.sum()` now accepts ``dtype`` +- Reduce methods such as :py:func:`DataArray.sum()` now handles object-type array. + + .. ipython:: python + + da = xr.DataArray(np.array([True, False, np.nan], dtype=object), dims='x') + da.sum() + + (:issue:`1866`) + By `Keisuke Fujii `_. +- Reduce methods such as :py:func:`DataArray.sum()` now accepts ``dtype`` arguments. (:issue:`1838`) By `Keisuke Fujii `_. - Added nodatavals attribute to DataArray when using :py:func:`~xarray.open_rasterio`. (:issue:`1736`). diff --git a/xarray/core/dtypes.py b/xarray/core/dtypes.py index e811d189568..8dac39612e4 100644 --- a/xarray/core/dtypes.py +++ b/xarray/core/dtypes.py @@ -1,4 +1,5 @@ import numpy as np +import functools from . import utils @@ -7,6 +8,29 @@ NA = utils.ReprObject('') +@functools.total_ordering +class AlwaysGreaterThan(object): + def __gt__(self, other): + return True + + def __eq__(self, other): + return isinstance(other, type(self)) + + +@functools.total_ordering +class AlwaysLessThan(object): + def __lt__(self, other): + return True + + def __eq__(self, other): + return isinstance(other, type(self)) + + +# Equivalence to np.inf (-np.inf) for object-type +INF = AlwaysGreaterThan() +NINF = AlwaysLessThan() + + # Pairs of types that, if both found, should be promoted to object dtype # instead of following NumPy's own type-promotion rules. These type promotion # rules match pandas instead. For reference, see the NumPy type hierarchy: @@ -18,6 +42,29 @@ ] +@functools.total_ordering +class AlwaysGreaterThan(object): + def __gt__(self, other): + return True + + def __eq__(self, other): + return isinstance(other, type(self)) + + +@functools.total_ordering +class AlwaysLessThan(object): + def __lt__(self, other): + return True + + def __eq__(self, other): + return isinstance(other, type(self)) + + +# Equivalence to np.inf (-np.inf) for object-type +INF = AlwaysGreaterThan() +NINF = AlwaysLessThan() + + def maybe_promote(dtype): """Simpler equivalent of pandas.core.common._maybe_promote @@ -66,6 +113,46 @@ def get_fill_value(dtype): return fill_value +def get_pos_infinity(dtype): + """Return an appropriate positive infinity for this dtype. + + Parameters + ---------- + dtype : np.dtype + + Returns + ------- + fill_value : positive infinity value corresponding to this dtype. + """ + if issubclass(dtype.type, (np.floating, np.integer)): + return np.inf + + if issubclass(dtype.type, np.complexfloating): + return np.inf + 1j * np.inf + + return INF + + +def get_neg_infinity(dtype): + """Return an appropriate positive infinity for this dtype. + + Parameters + ---------- + dtype : np.dtype + + Returns + ------- + fill_value : positive infinity value corresponding to this dtype. + """ + if issubclass(dtype.type, (np.floating, np.integer)): + return -np.inf + + if issubclass(dtype.type, np.complexfloating): + return -np.inf - 1j * np.inf + + return NINF + + def is_datetime_like(dtype): """Check if a dtype is a subclass of the numpy datetime types """ diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index 3e9a2e6d154..6f5548800a2 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -197,6 +197,79 @@ def _ignore_warnings_if(condition): yield +def _nansum_object(value, axis=None, **kwargs): + """ In house nansum for object array """ + value = fillna(value, 0) + return _dask_or_eager_func('sum')(value, axis=axis, **kwargs) + + +def _nan_minmax_object(func, get_fill_value, value, axis=None, **kwargs): + """ In house nanmin and nanmax for object array """ + fill_value = get_fill_value(value.dtype) + valid_count = count(value, axis=axis) + filled_value = fillna(value, fill_value) + data = _dask_or_eager_func(func)(filled_value, axis=axis, **kwargs) + if not hasattr(data, 'dtype'): # scalar case + data = dtypes.fill_value(value.dtype) if valid_count == 0 else data + return np.array(data, dtype=value.dtype) + return where_method(data, valid_count != 0) + + +def _nan_argminmax_object(func, get_fill_value, value, axis=None, **kwargs): + """ In house nanargmin, nanargmax for object arrays. Always return integer + type """ + fill_value = get_fill_value(value.dtype) + valid_count = count(value, axis=axis) + value = fillna(value, fill_value) + data = _dask_or_eager_func(func)(value, axis=axis, **kwargs) + # dask seems return non-integer type + if isinstance(value, dask_array_type): + data = data.astype(int) + + if (valid_count == 0).any(): + raise ValueError('All-NaN slice encountered') + + return np.array(data, dtype=int) + + +def _nanmean_ddof_object(ddof, value, axis=None, **kwargs): + """ In house nanmean. ddof argument will be used in _nanvar method """ + valid_count = count(value, axis=axis) + value = fillna(value, 0) + # As dtype inference is impossible for object dtype, we assume float + # https://github.com/dask/dask/issues/3162 + dtype = kwargs.pop('dtype', None) + if dtype is None and value.dtype.kind == 'O': + dtype = value.dtype if value.dtype.kind in ['cf'] else float + + data = _dask_or_eager_func('sum')(value, axis=axis, dtype=dtype, **kwargs) + data = data / (valid_count - ddof) + return where_method(data, valid_count != 0) + + +def _nanvar_object(value, axis=None, **kwargs): + ddof = kwargs.pop('ddof', 0) + kwargs_mean = kwargs.copy() + kwargs_mean.pop('keepdims', None) + value_mean = _nanmean_ddof_object(ddof=0, value=value, axis=axis, + keepdims=True, **kwargs_mean) + squared = (value.astype(value_mean.dtype) - value_mean)**2 + return _nanmean_ddof_object(ddof, squared, axis=axis, **kwargs) + + +_nan_object_funcs = { + 'sum': _nansum_object, + 'min': partial(_nan_minmax_object, 'min', dtypes.get_pos_infinity), + 'max': partial(_nan_minmax_object, 'max', dtypes.get_neg_infinity), + 'argmin': partial(_nan_argminmax_object, 'argmin', + dtypes.get_pos_infinity), + 'argmax': partial(_nan_argminmax_object, 'argmax', + dtypes.get_neg_infinity), + 'mean': partial(_nanmean_ddof_object, 0), + 'var': _nanvar_object, +} + + def _create_nan_agg_method(name, numeric_only=False, np_compat=False, no_bottleneck=False, coerce_strings=False, keep_dims=False): @@ -211,27 +284,31 @@ def f(values, axis=None, skipna=None, **kwargs): if coerce_strings and values.dtype.kind in 'SU': values = values.astype(object) - if skipna or (skipna is None and values.dtype.kind in 'cf'): + if skipna or (skipna is None and values.dtype.kind in 'cfO'): if values.dtype.kind not in ['u', 'i', 'f', 'c']: - raise NotImplementedError( - 'skipna=True not yet implemented for %s with dtype %s' - % (name, values.dtype)) - nanname = 'nan' + name - if (isinstance(axis, tuple) or not values.dtype.isnative or - no_bottleneck or - (dtype is not None and np.dtype(dtype) != values.dtype)): - # bottleneck can't handle multiple axis arguments or non-native - # endianness - if np_compat: - eager_module = npcompat - else: - eager_module = np + func = _nan_object_funcs.get(name, None) + using_numpy_nan_func = True + if func is None or values.dtype.kind not in 'Ob': + raise NotImplementedError( + 'skipna=True not yet implemented for %s with dtype %s' + % (name, values.dtype)) else: - kwargs.pop('dtype', None) - eager_module = bn - func = _dask_or_eager_func(nanname, eager_module) - using_numpy_nan_func = (eager_module is np or - eager_module is npcompat) + nanname = 'nan' + name + if (isinstance(axis, tuple) or not values.dtype.isnative or + no_bottleneck or (dtype is not None and + np.dtype(dtype) != values.dtype)): + # bottleneck can't handle multiple axis arguments or + # non-native endianness + if np_compat: + eager_module = npcompat + else: + eager_module = np + else: + kwargs.pop('dtype', None) + eager_module = bn + func = _dask_or_eager_func(nanname, eager_module) + using_numpy_nan_func = (eager_module is np or + eager_module is npcompat) else: func = _dask_or_eager_func(name) using_numpy_nan_func = False @@ -240,7 +317,11 @@ def f(values, axis=None, skipna=None, **kwargs): return func(values, axis=axis, **kwargs) except AttributeError: if isinstance(values, dask_array_type): - msg = '%s is not yet implemented on dask arrays' % name + try: # dask/dask#3133 dask sometimes needs dtype argument + return func(values, axis=axis, dtype=values.dtype, + **kwargs) + except AttributeError: + msg = '%s is not yet implemented on dask arrays' % name else: assert using_numpy_nan_func msg = ('%s is not available with skipna=False with the ' diff --git a/xarray/tests/test_dtypes.py b/xarray/tests/test_dtypes.py index 51c1aaa4c0c..1b236e0160d 100644 --- a/xarray/tests/test_dtypes.py +++ b/xarray/tests/test_dtypes.py @@ -46,3 +46,9 @@ def error(): # would get promoted to float32 actual = dtypes.result_type(array, np.array([0.5, 1.0], dtype=np.float32)) assert actual == np.float64 + + +@pytest.mark.parametrize('obj', [1.0, np.inf, 'ab', 1.0 + 1.0j, True]) +def test_inf(obj): + assert dtypes.INF > obj + assert dtypes.NINF < obj diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py index 54eaf46e054..d68a7a382de 100644 --- a/xarray/tests/test_duck_array_ops.py +++ b/xarray/tests/test_duck_array_ops.py @@ -1,15 +1,19 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from pytest import mark +import pytest import numpy as np from numpy import array, nan +from distutils.version import LooseVersion from . import assert_array_equal from xarray.core.duck_array_ops import ( first, last, count, mean, array_notnull_equiv, where, stack, concatenate ) +from xarray import DataArray +from xarray.testing import assert_allclose +from xarray import concat -from . import TestCase, raises_regex +from . import TestCase, raises_regex, has_dask class TestOps(TestCase): @@ -97,7 +101,7 @@ def test_all_nan_arrays(self): class TestArrayNotNullEquiv(): - @mark.parametrize("arr1, arr2", [ + @pytest.mark.parametrize("arr1, arr2", [ (np.array([1, 2, 3]), np.array([1, 2, 3])), (np.array([1, 2, np.nan]), np.array([1, np.nan, 3])), (np.array([np.nan, 2, np.nan]), np.array([1, np.nan, np.nan])), @@ -115,7 +119,7 @@ def test_wrong_shape(self): b = np.array([[1, 2], [np.nan, 4]]) assert not array_notnull_equiv(a, b) - @mark.parametrize("val1, val2, val3, null", [ + @pytest.mark.parametrize("val1, val2, val3, null", [ (1, 2, 3, None), (1., 2., 3., np.nan), (1., 2., 3., None), @@ -125,3 +129,175 @@ def test_types(self, val1, val2, val3, null): arr1 = np.array([val1, null, val3, null]) arr2 = np.array([val1, val2, null, null]) assert array_notnull_equiv(arr1, arr2) + + +def construct_dataarray(dim_num, dtype, contains_nan, dask): + # dimnum <= 3 + rng = np.random.RandomState(0) + shapes = [16, 8, 4][:dim_num] + dims = ('x', 'y', 'z')[:dim_num] + + if np.issubdtype(dtype, np.floating): + array = rng.randn(*shapes).astype(dtype) + elif np.issubdtype(dtype, np.integer): + array = rng.randint(0, 10, size=shapes).astype(dtype) + elif np.issubdtype(dtype, np.bool_): + array = rng.randint(0, 1, size=shapes).astype(dtype) + elif dtype == str: + array = rng.choice(['a', 'b', 'c', 'd'], size=shapes) + else: + raise ValueError + da = DataArray(array, dims=dims, coords={'x': np.arange(16)}, name='da') + + if contains_nan: + da = da.reindex(x=np.arange(20)) + if dask and has_dask: + chunks = {d: 4 for d in dims} + da = da.chunk(chunks) + + return da + + +def from_series_or_scalar(se): + try: + return DataArray.from_series(se) + except AttributeError: # scalar case + return DataArray(se) + + +def series_reduce(da, func, dim, **kwargs): + """ convert DataArray to pd.Series, apply pd.func, then convert back to + a DataArray. Multiple dims cannot be specified.""" + if dim is None or da.ndim == 1: + se = da.to_series() + return from_series_or_scalar(getattr(se, func)(**kwargs)) + else: + da1 = [] + dims = list(da.dims) + dims.remove(dim) + d = dims[0] + for i in range(len(da[d])): + da1.append(series_reduce(da.isel(**{d: i}), func, dim, **kwargs)) + + if d in da.coords: + return concat(da1, dim=da[d]) + return concat(da1, dim=d) + + +@pytest.mark.parametrize('dim_num', [1, 2]) +@pytest.mark.parametrize('dtype', [float, int, np.float32, np.bool_]) +@pytest.mark.parametrize('dask', [False, True]) +@pytest.mark.parametrize('func', ['sum', 'min', 'max', 'mean', 'var']) +@pytest.mark.parametrize('skipna', [False, True]) +@pytest.mark.parametrize('aggdim', [None, 'x']) +def test_reduce(dim_num, dtype, dask, func, skipna, aggdim): + + if aggdim == 'y' and dim_num < 2: + return + + if dtype == np.bool_ and func == 'mean': + return # numpy does not support this + + if dask and not has_dask: + return + + rtol = 1e-04 if dtype == np.float32 else 1e-05 + + da = construct_dataarray(dim_num, dtype, contains_nan=True, dask=dask) + axis = None if aggdim is None else da.get_axis_num(aggdim) + + if dask and not skipna and func in ['var', 'std'] and dtype == np.bool_: + # TODO this might be dask's bug + return + + if (LooseVersion(np.__version__) >= LooseVersion('1.13.0') and + da.dtype.kind == 'O' and skipna): + # Numpy < 1.13 does not handle object-type array. + try: + if skipna: + expected = getattr(np, 'nan{}'.format(func))(da.values, + axis=axis) + else: + expected = getattr(np, func)(da.values, axis=axis) + + actual = getattr(da, func)(skipna=skipna, dim=aggdim) + assert np.allclose(actual.values, np.array(expected), rtol=1.0e-4, + equal_nan=True) + except (TypeError, AttributeError, ZeroDivisionError): + # TODO currently, numpy does not support some methods such as + # nanmean for object dtype + pass + + # make sure the compatiblility with pandas' results. + actual = getattr(da, func)(skipna=skipna, dim=aggdim) + if func == 'var': + expected = series_reduce(da, func, skipna=skipna, dim=aggdim, ddof=0) + assert_allclose(actual, expected, rtol=rtol) + # also check ddof!=0 case + actual = getattr(da, func)(skipna=skipna, dim=aggdim, ddof=5) + expected = series_reduce(da, func, skipna=skipna, dim=aggdim, ddof=5) + assert_allclose(actual, expected, rtol=rtol) + else: + expected = series_reduce(da, func, skipna=skipna, dim=aggdim) + assert_allclose(actual, expected, rtol=rtol) + + # make sure the dtype argument + if func not in ['max', 'min']: + actual = getattr(da, func)(skipna=skipna, dim=aggdim, dtype=float) + assert actual.dtype == float + + # without nan + da = construct_dataarray(dim_num, dtype, contains_nan=False, dask=dask) + actual = getattr(da, func)(skipna=skipna) + expected = getattr(np, 'nan{}'.format(func))(da.values) + if actual.dtype == object: + assert actual.values == np.array(expected) + else: + assert np.allclose(actual.values, np.array(expected), rtol=rtol) + + +@pytest.mark.parametrize('dim_num', [1, 2]) +@pytest.mark.parametrize('dtype', [float, int, np.float32, np.bool_, str]) +@pytest.mark.parametrize('contains_nan', [True, False]) +@pytest.mark.parametrize('dask', [False, True]) +@pytest.mark.parametrize('func', ['min', 'max']) +@pytest.mark.parametrize('skipna', [False, True]) +@pytest.mark.parametrize('aggdim', ['x', 'y']) +def test_argmin_max(dim_num, dtype, contains_nan, dask, func, skipna, aggdim): + # pandas-dev/pandas#16830, we do not check consistency with pandas but + # just make sure da[da.argmin()] == da.min() + + if aggdim == 'y' and dim_num < 2: + return + + if dask and not has_dask: + return + + if contains_nan: + if not skipna: + # numpy's argmin (not nanargmin) does not handle object-dtype + return + if skipna and np.dtype(dtype).kind in 'iufc': + # numpy's nanargmin raises ValueError for all nan axis + return + + da = construct_dataarray(dim_num, dtype, contains_nan=contains_nan, + dask=dask) + + if aggdim == 'y' and contains_nan and skipna: + with pytest.raises(ValueError): + actual = da.isel(**{ + aggdim: getattr(da, 'arg'+func)(dim=aggdim, + skipna=skipna).compute()}) + return + + actual = da.isel(**{ + aggdim: getattr(da, 'arg'+func)(dim=aggdim, skipna=skipna).compute()}) + expected = getattr(da, func)(dim=aggdim, skipna=skipna) + assert_allclose(actual.drop(actual.coords), expected.drop(expected.coords)) + + +def test_argmin_max_error(): + da = construct_dataarray(2, np.bool_, contains_nan=True, dask=False) + with pytest.raises(ValueError): + da.argmin(dim='y')