From 1b9c05fd327327d05c9639173564288cd58afcc6 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Sat, 3 Feb 2018 07:58:50 +0900 Subject: [PATCH 01/19] First support of sum, min, max for object-typed arrays --- xarray/core/duck_array_ops.py | 71 ++++++++++++++++++------- xarray/tests/test_duck_array_ops.py | 81 +++++++++++++++++++++++++++-- 2 files changed, 127 insertions(+), 25 deletions(-) diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index 2058ce86a99..6a8a0d3f813 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -171,6 +171,29 @@ def _ignore_warnings_if(condition): yield +def _nansum(value, axis=None, **kwargs): + """ Our in house nansum. This is used for object array """ + value = fillna(value, 0.0) + return _dask_or_eager_func('sum')(value, axis=axis, **kwargs) + + +def _nanmin_or_nansum(func, fill_value, value, axis=None, **kwargs): + """ Our in house nansum. This is used for object array """ + nan_count = count(value, axis=axis) + value = fillna(value, fill_value) + data = _dask_or_eager_func(func)(value, axis=axis, **kwargs) + if not hasattr(data, 'dtype'): # scalar case + return np.nan if data == fill_value else data + # convert all nan part axis to nan + return where_method(data, nan_count != 0) + + +_nan_funcs = {'sum': _nansum, + 'min': partial(_nanmin_or_nansum, 'min', np.inf), + 'max': partial(_nanmin_or_nansum, 'max', -np.inf), + } + + def _create_nan_agg_method(name, numeric_only=False, np_compat=False, no_bottleneck=False, coerce_strings=False, keep_dims=False): @@ -185,27 +208,31 @@ def f(values, axis=None, skipna=None, **kwargs): if coerce_strings and values.dtype.kind in 'SU': values = values.astype(object) - if skipna or (skipna is None and values.dtype.kind in 'cf'): + if skipna or (skipna is None and values.dtype.kind in 'cfo'): if values.dtype.kind not in ['u', 'i', 'f', 'c']: - raise NotImplementedError( - 'skipna=True not yet implemented for %s with dtype %s' - % (name, values.dtype)) - nanname = 'nan' + name - if (isinstance(axis, tuple) or not values.dtype.isnative or - no_bottleneck or - (dtype is not None and np.dtype(dtype) != values.dtype)): - # bottleneck can't handle multiple axis arguments or non-native - # endianness - if np_compat: - eager_module = npcompat - else: - eager_module = np + func = _nan_funcs.get(name, None) + using_numpy_nan_func = True + if func is None: + raise NotImplementedError( + 'skipna=True not yet implemented for %s with dtype %s' + % (name, values.dtype)) else: - kwargs.pop('dtype', None) - eager_module = bn - func = _dask_or_eager_func(nanname, eager_module) - using_numpy_nan_func = (eager_module is np or - eager_module is npcompat) + nanname = 'nan' + name + if (isinstance(axis, tuple) or not values.dtype.isnative or + no_bottleneck or (dtype is not None and + np.dtype(dtype) != values.dtype)): + # bottleneck can't handle multiple axis arguments or + # non-native endianness + if np_compat: + eager_module = npcompat + else: + eager_module = np + else: + kwargs.pop('dtype', None) + eager_module = bn + func = _dask_or_eager_func(nanname, eager_module) + using_numpy_nan_func = (eager_module is np or + eager_module is npcompat) else: func = _dask_or_eager_func(name) using_numpy_nan_func = False @@ -214,7 +241,11 @@ def f(values, axis=None, skipna=None, **kwargs): return func(values, axis=axis, **kwargs) except AttributeError: if isinstance(values, dask_array_type): - msg = '%s is not yet implemented on dask arrays' % name + try: # dask needs dtype argument for some cases + return func(values, axis=axis, dtype=values.dtype, + **kwargs) + except AttributeError: + msg = '%s is not yet implemented on dask arrays' % name else: assert using_numpy_nan_func msg = ('%s is not available with skipna=False with the ' diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py index 9fb1b1aad40..c77fa7239bc 100644 --- a/xarray/tests/test_duck_array_ops.py +++ b/xarray/tests/test_duck_array_ops.py @@ -1,15 +1,17 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from pytest import mark +import pytest import numpy as np from numpy import array, nan from . import assert_array_equal from xarray.core.duck_array_ops import ( - first, last, count, mean, array_notnull_equiv, + first, last, count, mean, array_notnull_equiv, _nansum ) +from xarray import DataArray +from xarray.core import npcompat -from . import TestCase, raises_regex +from . import TestCase, raises_regex, has_dask class TestOps(TestCase): @@ -81,7 +83,7 @@ def test_all_nan_arrays(self): class TestArrayNotNullEquiv(): - @mark.parametrize("arr1, arr2", [ + @pytest.mark.parametrize("arr1, arr2", [ (np.array([1, 2, 3]), np.array([1, 2, 3])), (np.array([1, 2, np.nan]), np.array([1, np.nan, 3])), (np.array([np.nan, 2, np.nan]), np.array([1, np.nan, np.nan])), @@ -99,7 +101,7 @@ def test_wrong_shape(self): b = np.array([[1, 2], [np.nan, 4]]) assert not array_notnull_equiv(a, b) - @mark.parametrize("val1, val2, val3, null", [ + @pytest.mark.parametrize("val1, val2, val3, null", [ (1, 2, 3, None), (1., 2., 3., np.nan), (1., 2., 3., None), @@ -109,3 +111,72 @@ def test_types(self, val1, val2, val3, null): arr1 = np.array([val1, null, val3, null]) arr2 = np.array([val1, val2, null, null]) assert array_notnull_equiv(arr1, arr2) + + +def test_nansum(): + rng = np.random.RandomState(0) + array = rng.randn(15, 30).astype(bool).astype(object) + array[1, 3] = np.nan + array[3, 10] = np.nan + reduced = _nansum(array) + reference = np.nansum(array) + array_notnull_equiv(reduced, reference) + + +def construct_dataarray(dtype, contains_nan, dask): + rng = np.random.RandomState(0) + da = DataArray(rng.randn(15, 30), dims=('x', 'y'), + coords={'x': np.arange(15)}, name='da').astype(dtype) + + if contains_nan: + da = da.reindex(x=np.arange(20)) + if dask and has_dask: + da = da.chunk({'x': 5, 'y': 10}) + + return da + + +def assert_allclose_with_nan(a, b, **kwargs): + """ Extension of np.allclose with nan-including array """ + for a1, b1 in zip(a.ravel(), b.ravel()): + assert (np.isnan(a1) and np.isnan(b1)) or np.allclose(a1, b1, + **kwargs) + + +@pytest.mark.parametrize('dtype', [float, int, np.float32, np.bool_]) +@pytest.mark.parametrize('dask', [False, True]) +@pytest.mark.parametrize('func', ['sum', 'min', 'max']) # TODO support more +@pytest.mark.parametrize('skipna', [False, True]) +@pytest.mark.parametrize('dim', [None, 'x', 'y']) +def test_reduce(dtype, dask, func, skipna, dim): + + da = construct_dataarray(dtype, contains_nan=True, dask=dask) + axis = None if dim is None else da.get_axis_num(dim) + + if dask and not has_dask: + return + + if skipna: + try: # TODO currently, we only support methods that numpy supports + expected = getattr(np, 'nan{}'.format(func))(da.values, axis=axis) + except (TypeError, AttributeError): + with pytest.raises(NotImplementedError): + actual = getattr(da, func)(skipna=skipna, dim=dim) + return + else: + expected = getattr(np, func)(da.values, axis=axis) + + actual = getattr(da, func)(skipna=skipna, dim=dim) + assert_allclose_with_nan(actual.values, np.array(expected)) + + # compatible with pandas + se = da.to_dataframe() + actual = getattr(da, func)(skipna=skipna) + expected = getattr(se, func)(skipna=skipna) + assert_allclose_with_nan(actual.values, np.array(expected)) + + # without nan + da = construct_dataarray(dtype, contains_nan=False, dask=dask) + expected = getattr(np, 'nan{}'.format(func))(da.values) + actual = getattr(da, func)(skipna=skipna) + assert np.allclose(actual.values, np.array(expected)) From 4f2f209e63ef0a38a6068cf021d5d81ed6ca5cb0 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Sat, 3 Feb 2018 08:11:41 +0900 Subject: [PATCH 02/19] typo --- xarray/core/duck_array_ops.py | 10 +++++----- xarray/tests/test_duck_array_ops.py | 3 +-- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index 6a8a0d3f813..b2c08ee301a 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -172,13 +172,13 @@ def _ignore_warnings_if(condition): def _nansum(value, axis=None, **kwargs): - """ Our in house nansum. This is used for object array """ + """ In house nansum. This is used for object array """ value = fillna(value, 0.0) return _dask_or_eager_func('sum')(value, axis=axis, **kwargs) -def _nanmin_or_nansum(func, fill_value, value, axis=None, **kwargs): - """ Our in house nansum. This is used for object array """ +def _nanmin_or_nanmax(func, fill_value, value, axis=None, **kwargs): + """ In house nanmin or nanmax. This is used for object array """ nan_count = count(value, axis=axis) value = fillna(value, fill_value) data = _dask_or_eager_func(func)(value, axis=axis, **kwargs) @@ -189,8 +189,8 @@ def _nanmin_or_nansum(func, fill_value, value, axis=None, **kwargs): _nan_funcs = {'sum': _nansum, - 'min': partial(_nanmin_or_nansum, 'min', np.inf), - 'max': partial(_nanmin_or_nansum, 'max', -np.inf), + 'min': partial(_nanmin_or_nanmax, 'min', np.inf), + 'max': partial(_nanmin_or_nanmax, 'max', -np.inf), } diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py index c77fa7239bc..83634b116b9 100644 --- a/xarray/tests/test_duck_array_ops.py +++ b/xarray/tests/test_duck_array_ops.py @@ -139,8 +139,7 @@ def construct_dataarray(dtype, contains_nan, dask): def assert_allclose_with_nan(a, b, **kwargs): """ Extension of np.allclose with nan-including array """ for a1, b1 in zip(a.ravel(), b.ravel()): - assert (np.isnan(a1) and np.isnan(b1)) or np.allclose(a1, b1, - **kwargs) + assert (np.isnan(a1) and np.isnan(b1)) or np.allclose(a1, b1, **kwargs) @pytest.mark.parametrize('dtype', [float, int, np.float32, np.bool_]) From 4c455046518044b26b00379f1d5188fcece3df69 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Sat, 3 Feb 2018 08:15:29 +0900 Subject: [PATCH 03/19] flake8 --- xarray/tests/test_duck_array_ops.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py index 83634b116b9..4a9c5ed7e1e 100644 --- a/xarray/tests/test_duck_array_ops.py +++ b/xarray/tests/test_duck_array_ops.py @@ -6,10 +6,9 @@ from numpy import array, nan from . import assert_array_equal from xarray.core.duck_array_ops import ( - first, last, count, mean, array_notnull_equiv, _nansum + first, last, count, mean, array_notnull_equiv, ) from xarray import DataArray -from xarray.core import npcompat from . import TestCase, raises_regex, has_dask From e01d0f8773b6180c9a9a14dca17f7f9ccf4868cf Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Sun, 4 Feb 2018 17:59:11 +0900 Subject: [PATCH 04/19] Pandas compatiblity test. Added nanmean for object-type array --- xarray/core/duck_array_ops.py | 29 +++++++++++++-- xarray/tests/test_duck_array_ops.py | 58 ++++++++++++++--------------- 2 files changed, 55 insertions(+), 32 deletions(-) diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index b2c08ee301a..c2b48499b01 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -179,18 +179,41 @@ def _nansum(value, axis=None, **kwargs): def _nanmin_or_nanmax(func, fill_value, value, axis=None, **kwargs): """ In house nanmin or nanmax. This is used for object array """ - nan_count = count(value, axis=axis) + valid_count = count(value, axis=axis) value = fillna(value, fill_value) data = _dask_or_eager_func(func)(value, axis=axis, **kwargs) if not hasattr(data, 'dtype'): # scalar case return np.nan if data == fill_value else data # convert all nan part axis to nan - return where_method(data, nan_count != 0) + return where_method(data, valid_count != 0) + + +def _nanmean(value, axis=None, **kwargs): + """ In house nanmean. This is used for object array """ + valid_count = count(value, axis=axis) + value = fillna(value, 0.0) + # TODO numpy does not support object-type array, so we cast them to float + dtype = kwargs.get('dtype', None) + if dtype is None: + dtype = value.dtype if value.dtype.kind in ['cf'] else float + data = _dask_or_eager_func('mean')(value, axis=axis, dtype=dtype, **kwargs) + if not hasattr(data, 'dtype'): # scalar case + return np.nan if data == 0.0 else data + + # adjust the sample size + if axis is None: + size = data.size + else: + size = np.prod(data.shape[axis]) + data = data / valid_count * size + # convert all nan part axis to nan + return where_method(data, valid_count != 0) _nan_funcs = {'sum': _nansum, 'min': partial(_nanmin_or_nanmax, 'min', np.inf), 'max': partial(_nanmin_or_nanmax, 'max', -np.inf), + 'mean': _nanmean, } @@ -241,7 +264,7 @@ def f(values, axis=None, skipna=None, **kwargs): return func(values, axis=axis, **kwargs) except AttributeError: if isinstance(values, dask_array_type): - try: # dask needs dtype argument for some cases + try: # dask/dask#3133 dask sometimes needs dtype argument return func(values, axis=axis, dtype=values.dtype, **kwargs) except AttributeError: diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py index 4a9c5ed7e1e..64528048cf6 100644 --- a/xarray/tests/test_duck_array_ops.py +++ b/xarray/tests/test_duck_array_ops.py @@ -112,25 +112,19 @@ def test_types(self, val1, val2, val3, null): assert array_notnull_equiv(arr1, arr2) -def test_nansum(): +def construct_dataarray(dim_num, dtype, contains_nan, dask): + # dimnum <= 3 rng = np.random.RandomState(0) - array = rng.randn(15, 30).astype(bool).astype(object) - array[1, 3] = np.nan - array[3, 10] = np.nan - reduced = _nansum(array) - reference = np.nansum(array) - array_notnull_equiv(reduced, reference) - - -def construct_dataarray(dtype, contains_nan, dask): - rng = np.random.RandomState(0) - da = DataArray(rng.randn(15, 30), dims=('x', 'y'), + shapes = [15, 30, 10][:dim_num] + dims = ('x', 'y', 'z')[:dim_num] + da = DataArray(rng.randn(*shapes), dims=dims, coords={'x': np.arange(15)}, name='da').astype(dtype) if contains_nan: da = da.reindex(x=np.arange(20)) if dask and has_dask: - da = da.chunk({'x': 5, 'y': 10}) + chunks = {d: 5 for d in dims} + da = da.chunk(chunks) return da @@ -141,40 +135,46 @@ def assert_allclose_with_nan(a, b, **kwargs): assert (np.isnan(a1) and np.isnan(b1)) or np.allclose(a1, b1, **kwargs) +@pytest.mark.parametrize('dim_num', [1, 2, 3]) @pytest.mark.parametrize('dtype', [float, int, np.float32, np.bool_]) @pytest.mark.parametrize('dask', [False, True]) -@pytest.mark.parametrize('func', ['sum', 'min', 'max']) # TODO support more +@pytest.mark.parametrize('func', ['sum', 'min', 'max', 'mean']) @pytest.mark.parametrize('skipna', [False, True]) -@pytest.mark.parametrize('dim', [None, 'x', 'y']) -def test_reduce(dtype, dask, func, skipna, dim): +@pytest.mark.parametrize('aggdim', [None, 'x', 'y']) +def test_reduce(dim_num, dtype, dask, func, skipna, aggdim): + if aggdim == 'y' and dim_num < 2: + return + + if dtype == np.bool_ and func == 'mean': + return # numpy does not support this - da = construct_dataarray(dtype, contains_nan=True, dask=dask) - axis = None if dim is None else da.get_axis_num(dim) + da = construct_dataarray(dim_num, dtype, contains_nan=True, dask=dask) + axis = None if aggdim is None else da.get_axis_num(aggdim) if dask and not has_dask: return if skipna: - try: # TODO currently, we only support methods that numpy supports + try: expected = getattr(np, 'nan{}'.format(func))(da.values, axis=axis) except (TypeError, AttributeError): - with pytest.raises(NotImplementedError): - actual = getattr(da, func)(skipna=skipna, dim=dim) + # TODO currently, numpy does not support nanmean for object dtype return else: expected = getattr(np, func)(da.values, axis=axis) - actual = getattr(da, func)(skipna=skipna, dim=dim) - assert_allclose_with_nan(actual.values, np.array(expected)) + actual = getattr(da, func)(skipna=skipna, dim=aggdim) + assert_allclose_with_nan(actual.values, np.array(expected), rtol=1.0e-4) - # compatible with pandas - se = da.to_dataframe() - actual = getattr(da, func)(skipna=skipna) - expected = getattr(se, func)(skipna=skipna) - assert_allclose_with_nan(actual.values, np.array(expected)) + # compatible with pandas for 1d case + if dim_num == 1: + se = da.to_dataframe() + actual = getattr(da, func)(skipna=skipna, dim=aggdim) + expected = getattr(se, func)(skipna=skipna) + assert_allclose_with_nan(actual.values, np.array(expected)) # without nan - da = construct_dataarray(dtype, contains_nan=False, dask=dask) + da = construct_dataarray(dim_num, dtype, contains_nan=False, dask=dask) expected = getattr(np, 'nan{}'.format(func))(da.values) actual = getattr(da, func)(skipna=skipna) assert np.allclose(actual.values, np.array(expected)) From de9c05cff9001161a1df3e94252ff809e2adcff3 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Sun, 4 Feb 2018 19:16:06 +0900 Subject: [PATCH 05/19] Improve test --- xarray/tests/test_duck_array_ops.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py index 64528048cf6..b0368a48cf9 100644 --- a/xarray/tests/test_duck_array_ops.py +++ b/xarray/tests/test_duck_array_ops.py @@ -142,6 +142,7 @@ def assert_allclose_with_nan(a, b, **kwargs): @pytest.mark.parametrize('skipna', [False, True]) @pytest.mark.parametrize('aggdim', [None, 'x', 'y']) def test_reduce(dim_num, dtype, dask, func, skipna, aggdim): + if aggdim == 'y' and dim_num < 2: return @@ -154,20 +155,21 @@ def test_reduce(dim_num, dtype, dask, func, skipna, aggdim): if dask and not has_dask: return - if skipna: - try: + try: + if skipna: expected = getattr(np, 'nan{}'.format(func))(da.values, axis=axis) - except (TypeError, AttributeError): - # TODO currently, numpy does not support nanmean for object dtype - return - else: - expected = getattr(np, func)(da.values, axis=axis) + else: + expected = getattr(np, func)(da.values, axis=axis) - actual = getattr(da, func)(skipna=skipna, dim=aggdim) - assert_allclose_with_nan(actual.values, np.array(expected), rtol=1.0e-4) + actual = getattr(da, func)(skipna=skipna, dim=aggdim) + assert_allclose_with_nan(actual.values, np.array(expected), + rtol=1.0e-4) + except (TypeError, AttributeError): + # TODO currently, numpy does not support nanmean for object dtype + pass # compatible with pandas for 1d case - if dim_num == 1: + if dim_num == 1 or aggdim is None: se = da.to_dataframe() actual = getattr(da, func)(skipna=skipna, dim=aggdim) expected = getattr(se, func)(skipna=skipna) From ebeea7999519fa52eaed133c13bcdfad7134d2f4 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Sun, 4 Feb 2018 20:11:01 +0900 Subject: [PATCH 06/19] Support nanvar, nanstd --- xarray/core/duck_array_ops.py | 32 +++++++++++++++++++++-------- xarray/tests/test_duck_array_ops.py | 13 +++++++++--- xarray/tests/test_variable.py | 1 + 3 files changed, 34 insertions(+), 12 deletions(-) diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index c2b48499b01..d918dd30aba 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -188,8 +188,8 @@ def _nanmin_or_nanmax(func, fill_value, value, axis=None, **kwargs): return where_method(data, valid_count != 0) -def _nanmean(value, axis=None, **kwargs): - """ In house nanmean. This is used for object array """ +def _nanmean_ddof(ddof, value, axis=None, **kwargs): + """ In house nanmean. ddof argument will be used in _nanvar method """ valid_count = count(value, axis=axis) value = fillna(value, 0.0) # TODO numpy does not support object-type array, so we cast them to float @@ -197,23 +197,37 @@ def _nanmean(value, axis=None, **kwargs): if dtype is None: dtype = value.dtype if value.dtype.kind in ['cf'] else float data = _dask_or_eager_func('mean')(value, axis=axis, dtype=dtype, **kwargs) - if not hasattr(data, 'dtype'): # scalar case - return np.nan if data == 0.0 else data # adjust the sample size if axis is None: - size = data.size + size = value.size else: - size = np.prod(data.shape[axis]) - data = data / valid_count * size + size = np.prod(value.shape[axis]) + data = data / (valid_count - ddof) * size # convert all nan part axis to nan return where_method(data, valid_count != 0) +def _nanvar(value, axis=None, **kwargs): + ddof = kwargs.pop('ddof', 0) + kwargs_mean = kwargs.copy() + kwargs_mean.pop('keepdims', None) + value_mean = _nanmean_ddof(0, value, axis=axis, keepdims=True, **kwargs) + squared = _dask_or_eager_func('square')(value.astype(value_mean.dtype) - + value_mean) + return _nanmean_ddof(ddof, squared, axis=axis, **kwargs) + + +def _nanstd(value, axis=None, **kwargs): + return _dask_or_eager_func('sqrt')(_nanvar(value, axis=axis, **kwargs)) + + _nan_funcs = {'sum': _nansum, 'min': partial(_nanmin_or_nanmax, 'min', np.inf), 'max': partial(_nanmin_or_nanmax, 'max', -np.inf), - 'mean': _nanmean, + 'mean': partial(_nanmean_ddof, 0), + 'var': _nanvar, + 'std': _nanstd, } @@ -235,7 +249,7 @@ def f(values, axis=None, skipna=None, **kwargs): if values.dtype.kind not in ['u', 'i', 'f', 'c']: func = _nan_funcs.get(name, None) using_numpy_nan_func = True - if func is None: + if func is None or values.dtype.kind != 'o': raise NotImplementedError( 'skipna=True not yet implemented for %s with dtype %s' % (name, values.dtype)) diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py index b0368a48cf9..f579614564e 100644 --- a/xarray/tests/test_duck_array_ops.py +++ b/xarray/tests/test_duck_array_ops.py @@ -138,7 +138,7 @@ def assert_allclose_with_nan(a, b, **kwargs): @pytest.mark.parametrize('dim_num', [1, 2, 3]) @pytest.mark.parametrize('dtype', [float, int, np.float32, np.bool_]) @pytest.mark.parametrize('dask', [False, True]) -@pytest.mark.parametrize('func', ['sum', 'min', 'max', 'mean']) +@pytest.mark.parametrize('func', ['sum', 'min', 'max', 'mean', 'var', 'std']) @pytest.mark.parametrize('skipna', [False, True]) @pytest.mark.parametrize('aggdim', [None, 'x', 'y']) def test_reduce(dim_num, dtype, dask, func, skipna, aggdim): @@ -155,6 +155,10 @@ def test_reduce(dim_num, dtype, dask, func, skipna, aggdim): if dask and not has_dask: return + if dask and not skipna and func in ['var', 'std'] and dtype == np.bool_: + # TODO this might be dask's bug + return + try: if skipna: expected = getattr(np, 'nan{}'.format(func))(da.values, axis=axis) @@ -164,7 +168,7 @@ def test_reduce(dim_num, dtype, dask, func, skipna, aggdim): actual = getattr(da, func)(skipna=skipna, dim=aggdim) assert_allclose_with_nan(actual.values, np.array(expected), rtol=1.0e-4) - except (TypeError, AttributeError): + except (TypeError, AttributeError, ZeroDivisionError): # TODO currently, numpy does not support nanmean for object dtype pass @@ -172,7 +176,10 @@ def test_reduce(dim_num, dtype, dask, func, skipna, aggdim): if dim_num == 1 or aggdim is None: se = da.to_dataframe() actual = getattr(da, func)(skipna=skipna, dim=aggdim) - expected = getattr(se, func)(skipna=skipna) + if func in ['var', 'std']: + expected = getattr(se, func)(skipna=skipna, ddof=0) + else: + expected = getattr(se, func)(skipna=skipna) assert_allclose_with_nan(actual.values, np.array(expected)) # without nan diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index 5a89627a0f9..fe3e07a9c6d 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -1452,6 +1452,7 @@ def test_reduce_funcs(self): v = Variable('t', pd.date_range('2000-01-01', periods=3)) with pytest.raises(NotImplementedError): + print(v.dtype.kind) v.max(skipna=True) assert_identical( v.max(), Variable([], pd.Timestamp('2000-01-03'))) From bb3b3b07bfe6eb1df56ae20e5d061678cb5ca194 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Sun, 4 Feb 2018 21:28:46 +0900 Subject: [PATCH 07/19] Fix bug in _create_nan_agg_method --- xarray/core/duck_array_ops.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index d918dd30aba..a5b621f81bf 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -245,11 +245,11 @@ def f(values, axis=None, skipna=None, **kwargs): if coerce_strings and values.dtype.kind in 'SU': values = values.astype(object) - if skipna or (skipna is None and values.dtype.kind in 'cfo'): + if skipna or (skipna is None and values.dtype.kind in 'cfO'): if values.dtype.kind not in ['u', 'i', 'f', 'c']: func = _nan_funcs.get(name, None) using_numpy_nan_func = True - if func is None or values.dtype.kind != 'o': + if func is None or values.dtype.kind not in 'Ob': raise NotImplementedError( 'skipna=True not yet implemented for %s with dtype %s' % (name, values.dtype)) From d194a8c8171630d9f489d659bbff1a5bd36ecba8 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Mon, 5 Feb 2018 17:59:35 +0900 Subject: [PATCH 08/19] Added nanargmin/nanargmax --- xarray/core/duck_array_ops.py | 31 +++++++++++++++--- xarray/tests/test_duck_array_ops.py | 49 ++++++++++++++++++++++++++--- 2 files changed, 70 insertions(+), 10 deletions(-) diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index a5b621f81bf..775f21e2ac9 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -177,17 +177,36 @@ def _nansum(value, axis=None, **kwargs): return _dask_or_eager_func('sum')(value, axis=axis, **kwargs) -def _nanmin_or_nanmax(func, fill_value, value, axis=None, **kwargs): - """ In house nanmin or nanmax. This is used for object array """ +def _nan_minmax(func, fill_value, value, axis=None, **kwargs): + """ In house nan-reduce used for nanmin, nanmax. This is used for object + array """ valid_count = count(value, axis=axis) value = fillna(value, fill_value) data = _dask_or_eager_func(func)(value, axis=axis, **kwargs) if not hasattr(data, 'dtype'): # scalar case - return np.nan if data == fill_value else data + value = np.nan if valid_count == 0 else data + return np.array(value) # return 0d-array # convert all nan part axis to nan return where_method(data, valid_count != 0) +def _nan_argminmax(func, fill_value, value, axis=None, **kwargs): + """ In house nan-reduce used for nanargmin, nanargmax. This is used for + object array. This always return integer type """ + valid_count = count(value, axis=axis) + value = fillna(value, fill_value) + data = _dask_or_eager_func(func)(value, axis=axis, **kwargs) + # dask seems return non-integer + if isinstance(value, dask_array_type): + data = data.astype(int) + if not hasattr(data, 'dtype'): # scalar case + # TODO should we raise ValueError if all-nan slice encountered? + value = -1 if valid_count == 0 else int(data) + return np.array(value) # return 0d-array + # convert all nan part axis to nan + return where_method(data, valid_count != 0, -1) + + def _nanmean_ddof(ddof, value, axis=None, **kwargs): """ In house nanmean. ddof argument will be used in _nanvar method """ valid_count = count(value, axis=axis) @@ -223,8 +242,10 @@ def _nanstd(value, axis=None, **kwargs): _nan_funcs = {'sum': _nansum, - 'min': partial(_nanmin_or_nanmax, 'min', np.inf), - 'max': partial(_nanmin_or_nanmax, 'max', -np.inf), + 'min': partial(_nan_minmax, 'min', np.inf), + 'max': partial(_nan_minmax, 'max', -np.inf), + 'argmin': partial(_nan_argminmax, 'argmin', np.inf), + 'argmax': partial(_nan_argminmax, 'argmax', -np.inf), 'mean': partial(_nanmean_ddof, 0), 'var': _nanvar, 'std': _nanstd, diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py index f579614564e..0e4ebabd38b 100644 --- a/xarray/tests/test_duck_array_ops.py +++ b/xarray/tests/test_duck_array_ops.py @@ -117,6 +117,7 @@ def construct_dataarray(dim_num, dtype, contains_nan, dask): rng = np.random.RandomState(0) shapes = [15, 30, 10][:dim_num] dims = ('x', 'y', 'z')[:dim_num] + da = DataArray(rng.randn(*shapes), dims=dims, coords={'x': np.arange(15)}, name='da').astype(dtype) @@ -149,12 +150,12 @@ def test_reduce(dim_num, dtype, dask, func, skipna, aggdim): if dtype == np.bool_ and func == 'mean': return # numpy does not support this - da = construct_dataarray(dim_num, dtype, contains_nan=True, dask=dask) - axis = None if aggdim is None else da.get_axis_num(aggdim) - if dask and not has_dask: return + da = construct_dataarray(dim_num, dtype, contains_nan=True, dask=dask) + axis = None if aggdim is None else da.get_axis_num(aggdim) + if dask and not skipna and func in ['var', 'std'] and dtype == np.bool_: # TODO this might be dask's bug return @@ -176,14 +177,52 @@ def test_reduce(dim_num, dtype, dask, func, skipna, aggdim): if dim_num == 1 or aggdim is None: se = da.to_dataframe() actual = getattr(da, func)(skipna=skipna, dim=aggdim) + assert isinstance(actual, DataArray) if func in ['var', 'std']: expected = getattr(se, func)(skipna=skipna, ddof=0) + + assert_allclose_with_nan(actual.values, np.array(expected)) + # also check ddof!=0 case + actual = getattr(da, func)(skipna=skipna, dim=aggdim, ddof=5) + expected = getattr(se, func)(skipna=skipna, ddof=5) + assert_allclose_with_nan(actual.values, np.array(expected)) else: expected = getattr(se, func)(skipna=skipna) - assert_allclose_with_nan(actual.values, np.array(expected)) + assert_allclose_with_nan(actual.values, np.array(expected)) # without nan da = construct_dataarray(dim_num, dtype, contains_nan=False, dask=dask) - expected = getattr(np, 'nan{}'.format(func))(da.values) actual = getattr(da, func)(skipna=skipna) + expected = getattr(np, 'nan{}'.format(func))(da.values) assert np.allclose(actual.values, np.array(expected)) + + +@pytest.mark.parametrize('dim_num', [1, 2]) +@pytest.mark.parametrize('dtype', [float, int, np.float32, np.bool_]) +@pytest.mark.parametrize('contains_nan', [True, False]) +@pytest.mark.parametrize('dask', [False, True]) +@pytest.mark.parametrize('func', ['min', 'max']) +@pytest.mark.parametrize('skipna', [False, True]) +@pytest.mark.parametrize('aggdim', ['x', 'y']) +def test_argmin_max(dim_num, dtype, contains_nan, dask, func, skipna, aggdim): + # Due to #****, we does not check consistency with pandas + # just make sure da[da.argmin ()] == da.min() + + if aggdim == 'y' and dim_num < 2: + return + + if dask and not has_dask: + return + + if (contains_nan and (dtype == np.bool_ and not skipna and contains_nan) or + dtype in [float, int, np.float32]): + # numpy's argmin does not handle object-dtype + return + + da = construct_dataarray(dim_num, dtype, contains_nan=contains_nan, + dask=dask) + actual = da.isel(**{aggdim: + getattr(da, 'arg'+func)(dim=aggdim, + skipna=skipna).compute()}) + expected = getattr(da, func)(dim=aggdim, skipna=skipna) + assert_allclose_with_nan(actual.values, expected.values) From 33724f43b53704b33be0879255c061dbeab4a58e Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Tue, 6 Feb 2018 18:03:53 +0900 Subject: [PATCH 09/19] Support numpy<1.13. --- xarray/core/duck_array_ops.py | 2 +- xarray/core/npcompat.py | 94 ++++++++++++++++++++++++++++- xarray/tests/test_duck_array_ops.py | 34 ++++++----- 3 files changed, 114 insertions(+), 16 deletions(-) diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index 775f21e2ac9..2b1820b1daa 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -65,7 +65,7 @@ def fail_on_dask_array_input(values, msg=None, func_name=None): around = _dask_or_eager_func('around') -isclose = _dask_or_eager_func('isclose') +isclose = _dask_or_eager_func('isclose', npcompat) notnull = _dask_or_eager_func('notnull', pd) _isnull = _dask_or_eager_func('isnull', pd) diff --git a/xarray/core/npcompat.py b/xarray/core/npcompat.py index bbe7b745621..af2826908df 100644 --- a/xarray/core/npcompat.py +++ b/xarray/core/npcompat.py @@ -4,7 +4,7 @@ import numpy as np try: - from numpy import nancumsum, nancumprod, flip + from numpy import nancumsum, nancumprod, flip, isclose except ImportError: # pragma: no cover # Code copied from newer versions of NumPy (v1.12). # Used under the terms of NumPy's license, see licenses/NUMPY_LICENSE. @@ -245,3 +245,95 @@ def flip(m, axis): raise ValueError("axis=%i is invalid for the %i-dimensional " "input array" % (axis, m.ndim)) return m[tuple(indexer)] + + def isclose(a, b, rtol=1.e-5, atol=1.e-8, equal_nan=False): + """ + Returns a boolean array where two arrays are element-wise equal within a + tolerance. + The tolerance values are positive, typically very small numbers. The + relative difference (`rtol` * abs(`b`)) and the absolute difference + `atol` are added together to compare against the absolute difference + between `a` and `b`. + Parameters + ---------- + a, b : array_like + Input arrays to compare. + rtol : float + The relative tolerance parameter (see Notes). + atol : float + The absolute tolerance parameter (see Notes). + equal_nan : bool + Whether to compare NaN's as equal. If True, NaN's in `a` will be + considered equal to NaN's in `b` in the output array. + Returns + ------- + y : array_like + Returns a boolean array of where `a` and `b` are equal within the + given tolerance. If both `a` and `b` are scalars, returns a single + boolean value. + See Also + -------- + allclose + Notes + ----- + .. versionadded:: 1.7.0 + For finite values, isclose uses the following equation to test whether + two floating point values are equivalent. + absolute(`a` - `b`) <= (`atol` + `rtol` * absolute(`b`)) + The above equation is not symmetric in `a` and `b`, so that + `isclose(a, b)` might be different from `isclose(b, a)` in + some rare cases. + Examples + -------- + >>> np.isclose([1e10,1e-7], [1.00001e10,1e-8]) + array([True, False]) + >>> np.isclose([1e10,1e-8], [1.00001e10,1e-9]) + array([True, True]) + >>> np.isclose([1e10,1e-8], [1.0001e10,1e-9]) + array([False, True]) + >>> np.isclose([1.0, np.nan], [1.0, np.nan]) + array([True, False]) + >>> np.isclose([1.0, np.nan], [1.0, np.nan], equal_nan=True) + array([True, True]) + """ + def within_tol(x, y, atol, rtol): + with np.errstate(invalid='ignore'): + result = np.less_equal(abs(x-y), atol + rtol * abs(y)) + if np.isscalar(a) and np.isscalar(b): + result = bool(result) + return result + + x = np.array(a, copy=False, subok=True, ndmin=1) + y = np.array(b, copy=False, subok=True, ndmin=1) + + # Make sure y is an inexact type to avoid bad behavior on abs(MIN_INT). + # This will cause casting of x later. Also, make sure to allow subclasses + # (e.g., for numpy.ma). + dt = np.multiarray.result_type(y, 1.) + y = np.array(y, dtype=dt, copy=False, subok=True) + + xfin = np.isfinite(x) + yfin = np.isfinite(y) + if all(xfin) and all(yfin): + return within_tol(x, y, atol, rtol) + else: + finite = xfin & yfin + cond = np.zeros_like(finite, subok=True) + # Because we're using boolean indexing, x & y must be the same shape. + # Ideally, we'd just do x, y = broadcast_arrays(x, y). It's in + # lib.stride_tricks, though, so we can't import it here. + x = x * np.ones_like(cond) + y = y * np.ones_like(cond) + # Avoid subtraction with infinite/nan values... + cond[finite] = within_tol(x[finite], y[finite], atol, rtol) + # Check for equality of infinite values... + cond[~finite] = (x[~finite] == y[~finite]) + if equal_nan: + # Make NaN == NaN + both_nan = np.isnan(x) & np.isnan(y) + cond[both_nan] = both_nan[both_nan] + + if np.isscalar(a) and np.isscalar(b): + return bool(cond) + else: + return cond diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py index 0e4ebabd38b..ef845d71282 100644 --- a/xarray/tests/test_duck_array_ops.py +++ b/xarray/tests/test_duck_array_ops.py @@ -4,6 +4,7 @@ import pytest import numpy as np from numpy import array, nan +from distutils.version import LooseVersion from . import assert_array_equal from xarray.core.duck_array_ops import ( first, last, count, mean, array_notnull_equiv, @@ -160,18 +161,23 @@ def test_reduce(dim_num, dtype, dask, func, skipna, aggdim): # TODO this might be dask's bug return - try: - if skipna: - expected = getattr(np, 'nan{}'.format(func))(da.values, axis=axis) - else: - expected = getattr(np, func)(da.values, axis=axis) - - actual = getattr(da, func)(skipna=skipna, dim=aggdim) - assert_allclose_with_nan(actual.values, np.array(expected), - rtol=1.0e-4) - except (TypeError, AttributeError, ZeroDivisionError): - # TODO currently, numpy does not support nanmean for object dtype - pass + if (LooseVersion(np.__version__) >= LooseVersion('1.13.0') and + da.dtype.kind == 'O' and skipna): + # Numpy < 1.13 does not handle object-type for + try: + if skipna: + expected = getattr(np, 'nan{}'.format(func))(da.values, + axis=axis) + else: + expected = getattr(np, func)(da.values, axis=axis) + + actual = getattr(da, func)(skipna=skipna, dim=aggdim) + assert_allclose_with_nan(actual.values, np.array(expected), + rtol=1.0e-4) + except (TypeError, AttributeError, ZeroDivisionError): + # TODO currently, numpy does not support some methods such as + # nanmean for object dtype + pass # compatible with pandas for 1d case if dim_num == 1 or aggdim is None: @@ -205,8 +211,8 @@ def test_reduce(dim_num, dtype, dask, func, skipna, aggdim): @pytest.mark.parametrize('skipna', [False, True]) @pytest.mark.parametrize('aggdim', ['x', 'y']) def test_argmin_max(dim_num, dtype, contains_nan, dask, func, skipna, aggdim): - # Due to #****, we does not check consistency with pandas - # just make sure da[da.argmin ()] == da.min() + # pandas-dev/pandas#16830, we does not check consistency with pandas + # just make sure da[da.argmin()] == da.min() if aggdim == 'y' and dim_num < 2: return From 9616915515265395a3ee5b91567695c089495e36 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Wed, 7 Feb 2018 08:38:16 +0900 Subject: [PATCH 10/19] Update tests. --- xarray/core/duck_array_ops.py | 16 ++--- xarray/core/npcompat.py | 94 +---------------------------- xarray/tests/test_duck_array_ops.py | 38 ++++++------ xarray/tests/test_variable.py | 1 - 4 files changed, 29 insertions(+), 120 deletions(-) diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index 2b1820b1daa..a2a91f74d00 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -65,7 +65,7 @@ def fail_on_dask_array_input(values, msg=None, func_name=None): around = _dask_or_eager_func('around') -isclose = _dask_or_eager_func('isclose', npcompat) +isclose = _dask_or_eager_func('isclose') notnull = _dask_or_eager_func('notnull', pd) _isnull = _dask_or_eager_func('isnull', pd) @@ -181,11 +181,11 @@ def _nan_minmax(func, fill_value, value, axis=None, **kwargs): """ In house nan-reduce used for nanmin, nanmax. This is used for object array """ valid_count = count(value, axis=axis) - value = fillna(value, fill_value) - data = _dask_or_eager_func(func)(value, axis=axis, **kwargs) + filled_value = fillna(value, fill_value) + data = _dask_or_eager_func(func)(filled_value, axis=axis, **kwargs) if not hasattr(data, 'dtype'): # scalar case - value = np.nan if valid_count == 0 else data - return np.array(value) # return 0d-array + data = np.nan if valid_count == 0 else data + return np.array(data, dtype=value.dtype) # return 0d-array # convert all nan part axis to nan return where_method(data, valid_count != 0) @@ -201,8 +201,8 @@ def _nan_argminmax(func, fill_value, value, axis=None, **kwargs): data = data.astype(int) if not hasattr(data, 'dtype'): # scalar case # TODO should we raise ValueError if all-nan slice encountered? - value = -1 if valid_count == 0 else int(data) - return np.array(value) # return 0d-array + data = -1 if valid_count == 0 else int(data) + return np.array(data) # return 0d-array # convert all nan part axis to nan return where_method(data, valid_count != 0, -1) @@ -213,7 +213,7 @@ def _nanmean_ddof(ddof, value, axis=None, **kwargs): value = fillna(value, 0.0) # TODO numpy does not support object-type array, so we cast them to float dtype = kwargs.get('dtype', None) - if dtype is None: + if dtype is None and value.dtype.kind == 'O': dtype = value.dtype if value.dtype.kind in ['cf'] else float data = _dask_or_eager_func('mean')(value, axis=axis, dtype=dtype, **kwargs) diff --git a/xarray/core/npcompat.py b/xarray/core/npcompat.py index af2826908df..bbe7b745621 100644 --- a/xarray/core/npcompat.py +++ b/xarray/core/npcompat.py @@ -4,7 +4,7 @@ import numpy as np try: - from numpy import nancumsum, nancumprod, flip, isclose + from numpy import nancumsum, nancumprod, flip except ImportError: # pragma: no cover # Code copied from newer versions of NumPy (v1.12). # Used under the terms of NumPy's license, see licenses/NUMPY_LICENSE. @@ -245,95 +245,3 @@ def flip(m, axis): raise ValueError("axis=%i is invalid for the %i-dimensional " "input array" % (axis, m.ndim)) return m[tuple(indexer)] - - def isclose(a, b, rtol=1.e-5, atol=1.e-8, equal_nan=False): - """ - Returns a boolean array where two arrays are element-wise equal within a - tolerance. - The tolerance values are positive, typically very small numbers. The - relative difference (`rtol` * abs(`b`)) and the absolute difference - `atol` are added together to compare against the absolute difference - between `a` and `b`. - Parameters - ---------- - a, b : array_like - Input arrays to compare. - rtol : float - The relative tolerance parameter (see Notes). - atol : float - The absolute tolerance parameter (see Notes). - equal_nan : bool - Whether to compare NaN's as equal. If True, NaN's in `a` will be - considered equal to NaN's in `b` in the output array. - Returns - ------- - y : array_like - Returns a boolean array of where `a` and `b` are equal within the - given tolerance. If both `a` and `b` are scalars, returns a single - boolean value. - See Also - -------- - allclose - Notes - ----- - .. versionadded:: 1.7.0 - For finite values, isclose uses the following equation to test whether - two floating point values are equivalent. - absolute(`a` - `b`) <= (`atol` + `rtol` * absolute(`b`)) - The above equation is not symmetric in `a` and `b`, so that - `isclose(a, b)` might be different from `isclose(b, a)` in - some rare cases. - Examples - -------- - >>> np.isclose([1e10,1e-7], [1.00001e10,1e-8]) - array([True, False]) - >>> np.isclose([1e10,1e-8], [1.00001e10,1e-9]) - array([True, True]) - >>> np.isclose([1e10,1e-8], [1.0001e10,1e-9]) - array([False, True]) - >>> np.isclose([1.0, np.nan], [1.0, np.nan]) - array([True, False]) - >>> np.isclose([1.0, np.nan], [1.0, np.nan], equal_nan=True) - array([True, True]) - """ - def within_tol(x, y, atol, rtol): - with np.errstate(invalid='ignore'): - result = np.less_equal(abs(x-y), atol + rtol * abs(y)) - if np.isscalar(a) and np.isscalar(b): - result = bool(result) - return result - - x = np.array(a, copy=False, subok=True, ndmin=1) - y = np.array(b, copy=False, subok=True, ndmin=1) - - # Make sure y is an inexact type to avoid bad behavior on abs(MIN_INT). - # This will cause casting of x later. Also, make sure to allow subclasses - # (e.g., for numpy.ma). - dt = np.multiarray.result_type(y, 1.) - y = np.array(y, dtype=dt, copy=False, subok=True) - - xfin = np.isfinite(x) - yfin = np.isfinite(y) - if all(xfin) and all(yfin): - return within_tol(x, y, atol, rtol) - else: - finite = xfin & yfin - cond = np.zeros_like(finite, subok=True) - # Because we're using boolean indexing, x & y must be the same shape. - # Ideally, we'd just do x, y = broadcast_arrays(x, y). It's in - # lib.stride_tricks, though, so we can't import it here. - x = x * np.ones_like(cond) - y = y * np.ones_like(cond) - # Avoid subtraction with infinite/nan values... - cond[finite] = within_tol(x[finite], y[finite], atol, rtol) - # Check for equality of infinite values... - cond[~finite] = (x[~finite] == y[~finite]) - if equal_nan: - # Make NaN == NaN - both_nan = np.isnan(x) & np.isnan(y) - cond[both_nan] = both_nan[both_nan] - - if np.isscalar(a) and np.isscalar(b): - return bool(cond) - else: - return cond diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py index ef845d71282..05e3860a6d2 100644 --- a/xarray/tests/test_duck_array_ops.py +++ b/xarray/tests/test_duck_array_ops.py @@ -10,6 +10,7 @@ first, last, count, mean, array_notnull_equiv, ) from xarray import DataArray +from xarray.testing import assert_allclose from . import TestCase, raises_regex, has_dask @@ -131,10 +132,11 @@ def construct_dataarray(dim_num, dtype, contains_nan, dask): return da -def assert_allclose_with_nan(a, b, **kwargs): - """ Extension of np.allclose with nan-including array """ - for a1, b1 in zip(a.ravel(), b.ravel()): - assert (np.isnan(a1) and np.isnan(b1)) or np.allclose(a1, b1, **kwargs) +def from_series_or_scalar(se): + try: + return DataArray.from_series(se) + except AttributeError: # scalar case + return DataArray(se) @pytest.mark.parametrize('dim_num', [1, 2, 3]) @@ -172,8 +174,8 @@ def test_reduce(dim_num, dtype, dask, func, skipna, aggdim): expected = getattr(np, func)(da.values, axis=axis) actual = getattr(da, func)(skipna=skipna, dim=aggdim) - assert_allclose_with_nan(actual.values, np.array(expected), - rtol=1.0e-4) + assert np.allclose(actual.values, np.array(expected), rtol=1.0e-4, + equal_nan=True) except (TypeError, AttributeError, ZeroDivisionError): # TODO currently, numpy does not support some methods such as # nanmean for object dtype @@ -181,20 +183,21 @@ def test_reduce(dim_num, dtype, dask, func, skipna, aggdim): # compatible with pandas for 1d case if dim_num == 1 or aggdim is None: - se = da.to_dataframe() + se = da.to_series() actual = getattr(da, func)(skipna=skipna, dim=aggdim) assert isinstance(actual, DataArray) if func in ['var', 'std']: - expected = getattr(se, func)(skipna=skipna, ddof=0) - - assert_allclose_with_nan(actual.values, np.array(expected)) + expected = from_series_or_scalar( + getattr(se, func)(skipna=skipna, ddof=0)) + assert_allclose(actual, expected) # also check ddof!=0 case actual = getattr(da, func)(skipna=skipna, dim=aggdim, ddof=5) - expected = getattr(se, func)(skipna=skipna, ddof=5) - assert_allclose_with_nan(actual.values, np.array(expected)) + expected = from_series_or_scalar( + getattr(se, func)(skipna=skipna, ddof=5)) + assert_allclose(actual, expected) else: - expected = getattr(se, func)(skipna=skipna) - assert_allclose_with_nan(actual.values, np.array(expected)) + expected = from_series_or_scalar(getattr(se, func)(skipna=skipna)) + assert_allclose(actual, expected) # without nan da = construct_dataarray(dim_num, dtype, contains_nan=False, dask=dask) @@ -227,8 +230,7 @@ def test_argmin_max(dim_num, dtype, contains_nan, dask, func, skipna, aggdim): da = construct_dataarray(dim_num, dtype, contains_nan=contains_nan, dask=dask) - actual = da.isel(**{aggdim: - getattr(da, 'arg'+func)(dim=aggdim, - skipna=skipna).compute()}) + actual = da.isel(**{ + aggdim: getattr(da, 'arg'+func)(dim=aggdim, skipna=skipna).compute()}) expected = getattr(da, func)(dim=aggdim, skipna=skipna) - assert_allclose_with_nan(actual.values, expected.values) + assert_allclose(actual.drop(actual.coords), expected.drop(expected.coords)) diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index fe3e07a9c6d..5a89627a0f9 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -1452,7 +1452,6 @@ def test_reduce_funcs(self): v = Variable('t', pd.date_range('2000-01-01', periods=3)) with pytest.raises(NotImplementedError): - print(v.dtype.kind) v.max(skipna=True) assert_identical( v.max(), Variable([], pd.Timestamp('2000-01-03'))) From 670ae8a499d8da2ad69bda920ddd78122be23f58 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Wed, 7 Feb 2018 10:03:52 +0900 Subject: [PATCH 11/19] Some cleanups and whatsnew --- doc/whats-new.rst | 11 ++++++++++- xarray/core/duck_array_ops.py | 11 ++++------- xarray/tests/test_duck_array_ops.py | 13 ++++++++----- 3 files changed, 22 insertions(+), 13 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 153d2c32959..e487065a4bf 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -41,7 +41,16 @@ Documentation Enhancements ~~~~~~~~~~~~ -- reduce methods such as :py:func:`DataArray.sum()` now accepts ``dtype`` +- Reduce methods such as :py:func:`DataArray.sum()` now handles object-type array. + + .. ipython:: python + + da = xray.DataArray(np.array([True, False, np.nan], dtype=object), dims='x') + da.sum() + + (:issue:`1866`) + By `Keisuke Fujii `_. +- Reduce methods such as :py:func:`DataArray.sum()` now accepts ``dtype`` arguments. (:issue:`1838`) By `Keisuke Fujii `_. - Added nodatavals attribute to DataArray when using :py:func:`~xarray.open_rasterio`. (:issue:`1736`). diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index a2a91f74d00..33053e919dc 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -185,8 +185,7 @@ def _nan_minmax(func, fill_value, value, axis=None, **kwargs): data = _dask_or_eager_func(func)(filled_value, axis=axis, **kwargs) if not hasattr(data, 'dtype'): # scalar case data = np.nan if valid_count == 0 else data - return np.array(data, dtype=value.dtype) # return 0d-array - # convert all nan part axis to nan + return np.array(data, dtype=value.dtype) return where_method(data, valid_count != 0) @@ -196,14 +195,13 @@ def _nan_argminmax(func, fill_value, value, axis=None, **kwargs): valid_count = count(value, axis=axis) value = fillna(value, fill_value) data = _dask_or_eager_func(func)(value, axis=axis, **kwargs) - # dask seems return non-integer + # dask seems return non-integer type if isinstance(value, dask_array_type): data = data.astype(int) if not hasattr(data, 'dtype'): # scalar case # TODO should we raise ValueError if all-nan slice encountered? data = -1 if valid_count == 0 else int(data) - return np.array(data) # return 0d-array - # convert all nan part axis to nan + return np.array(data) return where_method(data, valid_count != 0, -1) @@ -211,7 +209,7 @@ def _nanmean_ddof(ddof, value, axis=None, **kwargs): """ In house nanmean. ddof argument will be used in _nanvar method """ valid_count = count(value, axis=axis) value = fillna(value, 0.0) - # TODO numpy does not support object-type array, so we cast them to float + # TODO numpy's mean does not support object-type array, so we assume float dtype = kwargs.get('dtype', None) if dtype is None and value.dtype.kind == 'O': dtype = value.dtype if value.dtype.kind in ['cf'] else float @@ -223,7 +221,6 @@ def _nanmean_ddof(ddof, value, axis=None, **kwargs): else: size = np.prod(value.shape[axis]) data = data / (valid_count - ddof) * size - # convert all nan part axis to nan return where_method(data, valid_count != 0) diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py index 05e3860a6d2..731da2ded9c 100644 --- a/xarray/tests/test_duck_array_ops.py +++ b/xarray/tests/test_duck_array_ops.py @@ -165,7 +165,7 @@ def test_reduce(dim_num, dtype, dask, func, skipna, aggdim): if (LooseVersion(np.__version__) >= LooseVersion('1.13.0') and da.dtype.kind == 'O' and skipna): - # Numpy < 1.13 does not handle object-type for + # Numpy < 1.13 does not handle object-type array. try: if skipna: expected = getattr(np, 'nan{}'.format(func))(da.values, @@ -223,10 +223,13 @@ def test_argmin_max(dim_num, dtype, contains_nan, dask, func, skipna, aggdim): if dask and not has_dask: return - if (contains_nan and (dtype == np.bool_ and not skipna and contains_nan) or - dtype in [float, int, np.float32]): - # numpy's argmin does not handle object-dtype - return + if contains_nan: + if not skipna: + # numpy's argmin (not nanargmin) does not handle object-dtype + return + if skipna and np.dtype(dtype).kind in 'iufc': + # numpy's nanargmin raises ValueError for all nan axis + return da = construct_dataarray(dim_num, dtype, contains_nan=contains_nan, dask=dask) From 4da55c48a47be99a0ca06f2e1ee4ed17f3dac449 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Thu, 8 Feb 2018 21:27:45 +0900 Subject: [PATCH 12/19] Simplify tests. Drop support std. --- xarray/core/duck_array_ops.py | 19 ++++----- xarray/tests/test_duck_array_ops.py | 65 ++++++++++++++++++----------- 2 files changed, 48 insertions(+), 36 deletions(-) diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index 33053e919dc..70a8a8b96cd 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -172,14 +172,13 @@ def _ignore_warnings_if(condition): def _nansum(value, axis=None, **kwargs): - """ In house nansum. This is used for object array """ + """ In house nansum for object array """ value = fillna(value, 0.0) return _dask_or_eager_func('sum')(value, axis=axis, **kwargs) def _nan_minmax(func, fill_value, value, axis=None, **kwargs): - """ In house nan-reduce used for nanmin, nanmax. This is used for object - array """ + """ In house nanmin and nanmax for object array """ valid_count = count(value, axis=axis) filled_value = fillna(value, fill_value) data = _dask_or_eager_func(func)(filled_value, axis=axis, **kwargs) @@ -190,8 +189,8 @@ def _nan_minmax(func, fill_value, value, axis=None, **kwargs): def _nan_argminmax(func, fill_value, value, axis=None, **kwargs): - """ In house nan-reduce used for nanargmin, nanargmax. This is used for - object array. This always return integer type """ + """ In house nanargmin, nanargmax for object arrays. Always return integer + type """ valid_count = count(value, axis=axis) value = fillna(value, fill_value) data = _dask_or_eager_func(func)(value, axis=axis, **kwargs) @@ -228,16 +227,13 @@ def _nanvar(value, axis=None, **kwargs): ddof = kwargs.pop('ddof', 0) kwargs_mean = kwargs.copy() kwargs_mean.pop('keepdims', None) - value_mean = _nanmean_ddof(0, value, axis=axis, keepdims=True, **kwargs) + value_mean = _nanmean_ddof(ddof=0, value=value, axis=axis, keepdims=True, + **kwargs_mean) squared = _dask_or_eager_func('square')(value.astype(value_mean.dtype) - value_mean) return _nanmean_ddof(ddof, squared, axis=axis, **kwargs) -def _nanstd(value, axis=None, **kwargs): - return _dask_or_eager_func('sqrt')(_nanvar(value, axis=axis, **kwargs)) - - _nan_funcs = {'sum': _nansum, 'min': partial(_nan_minmax, 'min', np.inf), 'max': partial(_nan_minmax, 'max', -np.inf), @@ -245,7 +241,6 @@ def _nanstd(value, axis=None, **kwargs): 'argmax': partial(_nan_argminmax, 'argmax', -np.inf), 'mean': partial(_nanmean_ddof, 0), 'var': _nanvar, - 'std': _nanstd, } @@ -294,7 +289,7 @@ def f(values, axis=None, skipna=None, **kwargs): with _ignore_warnings_if(using_numpy_nan_func): try: return func(values, axis=axis, **kwargs) - except AttributeError: + except AttributeError as e: if isinstance(values, dask_array_type): try: # dask/dask#3133 dask sometimes needs dtype argument return func(values, axis=axis, dtype=values.dtype, diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py index 731da2ded9c..5c70293337c 100644 --- a/xarray/tests/test_duck_array_ops.py +++ b/xarray/tests/test_duck_array_ops.py @@ -11,6 +11,7 @@ ) from xarray import DataArray from xarray.testing import assert_allclose +from xarray import concat from . import TestCase, raises_regex, has_dask @@ -117,16 +118,16 @@ def test_types(self, val1, val2, val3, null): def construct_dataarray(dim_num, dtype, contains_nan, dask): # dimnum <= 3 rng = np.random.RandomState(0) - shapes = [15, 30, 10][:dim_num] + shapes = [16, 8, 4][:dim_num] dims = ('x', 'y', 'z')[:dim_num] da = DataArray(rng.randn(*shapes), dims=dims, - coords={'x': np.arange(15)}, name='da').astype(dtype) + coords={'x': np.arange(16)}, name='da').astype(dtype) if contains_nan: da = da.reindex(x=np.arange(20)) if dask and has_dask: - chunks = {d: 5 for d in dims} + chunks = {d: 4 for d in dims} da = da.chunk(chunks) return da @@ -139,12 +140,31 @@ def from_series_or_scalar(se): return DataArray(se) -@pytest.mark.parametrize('dim_num', [1, 2, 3]) +def series_reduce(da, func, dim, **kwargs): + """ convert DataArray to pd.Series, apply pd.func, then convert back to + a DataArray. Multiple dims cannot be specified.""" + if dim is None or da.ndim == 1: + se = da.to_series() + return from_series_or_scalar(getattr(se, func)(**kwargs)) + else: + da1 = [] + dims = list(da.dims) + dims.remove(dim) + d = dims[0] + for i in range(len(da[d])): + da1.append(series_reduce(da.isel(**{d: i}), func, dim, **kwargs)) + + if d in da.coords: + return concat(da1, dim=da[d]) + return concat(da1, dim=d) + + +@pytest.mark.parametrize('dim_num', [1, 2]) @pytest.mark.parametrize('dtype', [float, int, np.float32, np.bool_]) @pytest.mark.parametrize('dask', [False, True]) -@pytest.mark.parametrize('func', ['sum', 'min', 'max', 'mean', 'var', 'std']) +@pytest.mark.parametrize('func', ['sum', 'min', 'max', 'mean', 'var']) @pytest.mark.parametrize('skipna', [False, True]) -@pytest.mark.parametrize('aggdim', [None, 'x', 'y']) +@pytest.mark.parametrize('aggdim', [None, 'x']) def test_reduce(dim_num, dtype, dask, func, skipna, aggdim): if aggdim == 'y' and dim_num < 2: @@ -156,6 +176,8 @@ def test_reduce(dim_num, dtype, dask, func, skipna, aggdim): if dask and not has_dask: return + rtol = 1e-04 if dtype == np.float32 else 1e-05 + da = construct_dataarray(dim_num, dtype, contains_nan=True, dask=dask) axis = None if aggdim is None else da.get_axis_num(aggdim) @@ -181,29 +203,24 @@ def test_reduce(dim_num, dtype, dask, func, skipna, aggdim): # nanmean for object dtype pass - # compatible with pandas for 1d case - if dim_num == 1 or aggdim is None: - se = da.to_series() - actual = getattr(da, func)(skipna=skipna, dim=aggdim) - assert isinstance(actual, DataArray) - if func in ['var', 'std']: - expected = from_series_or_scalar( - getattr(se, func)(skipna=skipna, ddof=0)) - assert_allclose(actual, expected) - # also check ddof!=0 case - actual = getattr(da, func)(skipna=skipna, dim=aggdim, ddof=5) - expected = from_series_or_scalar( - getattr(se, func)(skipna=skipna, ddof=5)) - assert_allclose(actual, expected) - else: - expected = from_series_or_scalar(getattr(se, func)(skipna=skipna)) - assert_allclose(actual, expected) + # make sure the compatiblility with pandas + actual = getattr(da, func)(skipna=skipna, dim=aggdim) + if func == 'var': + expected = series_reduce(da, func, skipna=skipna, dim=aggdim, ddof=0) + assert_allclose(actual, expected, rtol=rtol) + # also check ddof!=0 case + actual = getattr(da, func)(skipna=skipna, dim=aggdim, ddof=5) + expected = series_reduce(da, func, skipna=skipna, dim=aggdim, ddof=5) + assert_allclose(actual, expected, rtol=rtol) + else: + expected = series_reduce(da, func, skipna=skipna, dim=aggdim) + assert_allclose(actual, expected, rtol=rtol) # without nan da = construct_dataarray(dim_num, dtype, contains_nan=False, dask=dask) actual = getattr(da, func)(skipna=skipna) expected = getattr(np, 'nan{}'.format(func))(da.values) - assert np.allclose(actual.values, np.array(expected)) + assert np.allclose(actual.values, np.array(expected), rtol=rtol) @pytest.mark.parametrize('dim_num', [1, 2]) From 9fb17156d3e8a9bd002003e461fccef630fb07c4 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Thu, 8 Feb 2018 22:06:26 +0900 Subject: [PATCH 13/19] flake8 --- xarray/core/duck_array_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index 70a8a8b96cd..4f8c835a797 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -289,7 +289,7 @@ def f(values, axis=None, skipna=None, **kwargs): with _ignore_warnings_if(using_numpy_nan_func): try: return func(values, axis=axis, **kwargs) - except AttributeError as e: + except AttributeError: if isinstance(values, dask_array_type): try: # dask/dask#3133 dask sometimes needs dtype argument return func(values, axis=axis, dtype=values.dtype, From de1c613dbf24954436b893614e4ca8cc66d1c578 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Fri, 9 Feb 2018 08:41:17 +0900 Subject: [PATCH 14/19] xray -> xr --- doc/whats-new.rst | 2 +- xarray/tests/test_duck_array_ops.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index e487065a4bf..2edf83f069b 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -45,7 +45,7 @@ Enhancements .. ipython:: python - da = xray.DataArray(np.array([True, False, np.nan], dtype=object), dims='x') + da = xr.DataArray(np.array([True, False, np.nan], dtype=object), dims='x') da.sum() (:issue:`1866`) diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py index 5c70293337c..9f22dedda67 100644 --- a/xarray/tests/test_duck_array_ops.py +++ b/xarray/tests/test_duck_array_ops.py @@ -203,7 +203,7 @@ def test_reduce(dim_num, dtype, dask, func, skipna, aggdim): # nanmean for object dtype pass - # make sure the compatiblility with pandas + # make sure the compatiblility with pandas' results. actual = getattr(da, func)(skipna=skipna, dim=aggdim) if func == 'var': expected = series_reduce(da, func, skipna=skipna, dim=aggdim, ddof=0) @@ -231,7 +231,7 @@ def test_reduce(dim_num, dtype, dask, func, skipna, aggdim): @pytest.mark.parametrize('skipna', [False, True]) @pytest.mark.parametrize('aggdim', ['x', 'y']) def test_argmin_max(dim_num, dtype, contains_nan, dask, func, skipna, aggdim): - # pandas-dev/pandas#16830, we does not check consistency with pandas + # pandas-dev/pandas#16830, we do not check consistency with pandas but # just make sure da[da.argmin()] == da.min() if aggdim == 'y' and dim_num < 2: From c31fb49491c0464067e45382a620f6549690d0c0 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Mon, 12 Feb 2018 17:06:45 +0900 Subject: [PATCH 15/19] string array support --- xarray/core/dtypes.py | 37 ++++++++++++++++++++++++++++++++--- xarray/core/duck_array_ops.py | 36 ++++++++++++++++++---------------- 2 files changed, 53 insertions(+), 20 deletions(-) diff --git a/xarray/core/dtypes.py b/xarray/core/dtypes.py index ccbe48edc32..3f9b3e58681 100644 --- a/xarray/core/dtypes.py +++ b/xarray/core/dtypes.py @@ -1,4 +1,5 @@ import numpy as np +import functools from . import utils @@ -7,6 +8,30 @@ NA = utils.ReprObject('') +@functools.total_ordering +class AlwaysLessThan(object): + def __lt__(self, other): + return True + + def __eq__(self, other): + return isinstance(other, type(self)) + + +INF = AlwaysLessThan() + + +@functools.total_ordering +class AlwaysGreaterThan(object): + def __gt__(self, other): + return True + + def __eq__(self, other): + return isinstance(other, type(self)) + + +NINF = AlwaysGreaterThan() + + def maybe_promote(dtype): """Simpler equivalent of pandas.core.common._maybe_promote @@ -40,7 +65,7 @@ def maybe_promote(dtype): return np.dtype(dtype), fill_value -def get_fill_value(dtype): +def get_fill_value(dtype, fill_value_typ=None): """Return an appropriate fill value for this dtype. Parameters @@ -51,8 +76,14 @@ def get_fill_value(dtype): ------- fill_value : Missing value corresponding to this dtype. """ - _, fill_value = maybe_promote(dtype) - return fill_value + if fill_value_typ is None: + _, fill_value = maybe_promote(dtype) + return fill_value + + if np.issubdtype(dtype, np.floating): + return np.inf if fill_value_typ == '+inf' else -np.inf + + return INF if fill_value_typ == '+inf' else NINF def is_datetime_like(dtype): diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index 4f8c835a797..f7c2ea37e3e 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -171,13 +171,13 @@ def _ignore_warnings_if(condition): yield -def _nansum(value, axis=None, **kwargs): +def _nansum_object(value, axis=None, **kwargs): """ In house nansum for object array """ value = fillna(value, 0.0) return _dask_or_eager_func('sum')(value, axis=axis, **kwargs) -def _nan_minmax(func, fill_value, value, axis=None, **kwargs): +def _nan_minmax_object(func, fill_value, value, axis=None, **kwargs): """ In house nanmin and nanmax for object array """ valid_count = count(value, axis=axis) filled_value = fillna(value, fill_value) @@ -188,9 +188,10 @@ def _nan_minmax(func, fill_value, value, axis=None, **kwargs): return where_method(data, valid_count != 0) -def _nan_argminmax(func, fill_value, value, axis=None, **kwargs): +def _nan_argminmax_object(func, fill_value_typ, value, axis=None, **kwargs): """ In house nanargmin, nanargmax for object arrays. Always return integer type """ + fill_value = dtypes.get_fill_value(fill_value_typ) valid_count = count(value, axis=axis) value = fillna(value, fill_value) data = _dask_or_eager_func(func)(value, axis=axis, **kwargs) @@ -204,7 +205,7 @@ def _nan_argminmax(func, fill_value, value, axis=None, **kwargs): return where_method(data, valid_count != 0, -1) -def _nanmean_ddof(ddof, value, axis=None, **kwargs): +def _nanmean_ddof_object(ddof, fill_value_typ, axis=None, **kwargs): """ In house nanmean. ddof argument will be used in _nanvar method """ valid_count = count(value, axis=axis) value = fillna(value, 0.0) @@ -223,25 +224,26 @@ def _nanmean_ddof(ddof, value, axis=None, **kwargs): return where_method(data, valid_count != 0) -def _nanvar(value, axis=None, **kwargs): +def _nanvar_object(value, axis=None, **kwargs): ddof = kwargs.pop('ddof', 0) kwargs_mean = kwargs.copy() kwargs_mean.pop('keepdims', None) - value_mean = _nanmean_ddof(ddof=0, value=value, axis=axis, keepdims=True, - **kwargs_mean) + value_mean = _nanmean_ddof_object(ddof=0, value=value, axis=axis, + keepdims=True, **kwargs_mean) squared = _dask_or_eager_func('square')(value.astype(value_mean.dtype) - value_mean) - return _nanmean_ddof(ddof, squared, axis=axis, **kwargs) + return _nanmean_ddof_object(ddof, squared, axis=axis, **kwargs) -_nan_funcs = {'sum': _nansum, - 'min': partial(_nan_minmax, 'min', np.inf), - 'max': partial(_nan_minmax, 'max', -np.inf), - 'argmin': partial(_nan_argminmax, 'argmin', np.inf), - 'argmax': partial(_nan_argminmax, 'argmax', -np.inf), - 'mean': partial(_nanmean_ddof, 0), - 'var': _nanvar, - } +_nan_object_funcs = { + 'sum': _nansum_object, + 'min': partial(_nan_minmax_object, 'min', fill_value_typ='+inf'), + 'max': partial(_nan_minmax_object, 'max', fill_value_typ='-inf'), + 'argmin': partial(_nan_argminmax_object, 'argmin', fill_value_typ='+inf'), + 'argmax': partial(_nan_argminmax_object, 'argmax', fill_value_typ='-inf'), + 'mean': partial(_nanmean_ddof_object, 0), + 'var': _nanvar_object, +} def _create_nan_agg_method(name, numeric_only=False, np_compat=False, @@ -260,7 +262,7 @@ def f(values, axis=None, skipna=None, **kwargs): if skipna or (skipna is None and values.dtype.kind in 'cfO'): if values.dtype.kind not in ['u', 'i', 'f', 'c']: - func = _nan_funcs.get(name, None) + func = _nan_object_funcs.get(name, None) using_numpy_nan_func = True if func is None or values.dtype.kind not in 'Ob': raise NotImplementedError( From 4dcc1aac06995396315ce15201199cdf89646622 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Mon, 12 Feb 2018 18:12:36 +0900 Subject: [PATCH 16/19] Support str dtype. Refactor nanmean --- xarray/core/dtypes.py | 15 ++++++------ xarray/core/duck_array_ops.py | 37 +++++++++++++---------------- xarray/tests/test_duck_array_ops.py | 20 +++++++++++++--- 3 files changed, 40 insertions(+), 32 deletions(-) diff --git a/xarray/core/dtypes.py b/xarray/core/dtypes.py index 3f9b3e58681..5396d6f2ee8 100644 --- a/xarray/core/dtypes.py +++ b/xarray/core/dtypes.py @@ -9,27 +9,26 @@ @functools.total_ordering -class AlwaysLessThan(object): - def __lt__(self, other): +class AlwaysGreaterThan(object): + def __gt__(self, other): return True def __eq__(self, other): return isinstance(other, type(self)) -INF = AlwaysLessThan() - - @functools.total_ordering -class AlwaysGreaterThan(object): - def __gt__(self, other): +class AlwaysLessThan(object): + def __lt__(self, other): return True def __eq__(self, other): return isinstance(other, type(self)) -NINF = AlwaysGreaterThan() +# Equivalence to np.inf (-np.inf) for object-type +INF = AlwaysGreaterThan() +NINF = AlwaysLessThan() def maybe_promote(dtype): diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index f7c2ea37e3e..7040f4133ad 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -173,17 +173,18 @@ def _ignore_warnings_if(condition): def _nansum_object(value, axis=None, **kwargs): """ In house nansum for object array """ - value = fillna(value, 0.0) + value = fillna(value, 0) return _dask_or_eager_func('sum')(value, axis=axis, **kwargs) -def _nan_minmax_object(func, fill_value, value, axis=None, **kwargs): +def _nan_minmax_object(func, fill_value_typ, value, axis=None, **kwargs): """ In house nanmin and nanmax for object array """ + fill_value = dtypes.get_fill_value(value.dtype, fill_value_typ) valid_count = count(value, axis=axis) filled_value = fillna(value, fill_value) data = _dask_or_eager_func(func)(filled_value, axis=axis, **kwargs) if not hasattr(data, 'dtype'): # scalar case - data = np.nan if valid_count == 0 else data + data = dtypes.fill_value(value.dtype) if valid_count == 0 else data return np.array(data, dtype=value.dtype) return where_method(data, valid_count != 0) @@ -191,7 +192,7 @@ def _nan_minmax_object(func, fill_value, value, axis=None, **kwargs): def _nan_argminmax_object(func, fill_value_typ, value, axis=None, **kwargs): """ In house nanargmin, nanargmax for object arrays. Always return integer type """ - fill_value = dtypes.get_fill_value(fill_value_typ) + fill_value = dtypes.get_fill_value(value.dtype, fill_value_typ) valid_count = count(value, axis=axis) value = fillna(value, fill_value) data = _dask_or_eager_func(func)(value, axis=axis, **kwargs) @@ -205,22 +206,17 @@ def _nan_argminmax_object(func, fill_value_typ, value, axis=None, **kwargs): return where_method(data, valid_count != 0, -1) -def _nanmean_ddof_object(ddof, fill_value_typ, axis=None, **kwargs): +def _nanmean_ddof_object(ddof, value, axis=None, **kwargs): """ In house nanmean. ddof argument will be used in _nanvar method """ valid_count = count(value, axis=axis) - value = fillna(value, 0.0) - # TODO numpy's mean does not support object-type array, so we assume float - dtype = kwargs.get('dtype', None) + value = fillna(value, 0) + # As dtype inference is impossible for object dtype, we assume float + dtype = kwargs.pop('dtype', None) if dtype is None and value.dtype.kind == 'O': dtype = value.dtype if value.dtype.kind in ['cf'] else float - data = _dask_or_eager_func('mean')(value, axis=axis, dtype=dtype, **kwargs) - # adjust the sample size - if axis is None: - size = value.size - else: - size = np.prod(value.shape[axis]) - data = data / (valid_count - ddof) * size + data = _dask_or_eager_func('sum')(value, axis=axis, dtype=dtype, **kwargs) + data = data / (valid_count - ddof) return where_method(data, valid_count != 0) @@ -230,17 +226,16 @@ def _nanvar_object(value, axis=None, **kwargs): kwargs_mean.pop('keepdims', None) value_mean = _nanmean_ddof_object(ddof=0, value=value, axis=axis, keepdims=True, **kwargs_mean) - squared = _dask_or_eager_func('square')(value.astype(value_mean.dtype) - - value_mean) + squared = (value.astype(value_mean.dtype) - value_mean)**2 return _nanmean_ddof_object(ddof, squared, axis=axis, **kwargs) _nan_object_funcs = { 'sum': _nansum_object, - 'min': partial(_nan_minmax_object, 'min', fill_value_typ='+inf'), - 'max': partial(_nan_minmax_object, 'max', fill_value_typ='-inf'), - 'argmin': partial(_nan_argminmax_object, 'argmin', fill_value_typ='+inf'), - 'argmax': partial(_nan_argminmax_object, 'argmax', fill_value_typ='-inf'), + 'min': partial(_nan_minmax_object, 'min', '+inf'), + 'max': partial(_nan_minmax_object, 'max', '-inf'), + 'argmin': partial(_nan_argminmax_object, 'argmin', '+inf'), + 'argmax': partial(_nan_argminmax_object, 'argmax', '-inf'), 'mean': partial(_nanmean_ddof_object, 0), 'var': _nanvar_object, } diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py index 9f22dedda67..97d9e907c83 100644 --- a/xarray/tests/test_duck_array_ops.py +++ b/xarray/tests/test_duck_array_ops.py @@ -121,8 +121,17 @@ def construct_dataarray(dim_num, dtype, contains_nan, dask): shapes = [16, 8, 4][:dim_num] dims = ('x', 'y', 'z')[:dim_num] - da = DataArray(rng.randn(*shapes), dims=dims, - coords={'x': np.arange(16)}, name='da').astype(dtype) + if np.issubdtype(dtype, np.floating): + array = rng.randn(*shapes).astype(dtype) + elif np.issubdtype(dtype, np.integer): + array = rng.randint(0, 10, size=shapes).astype(dtype) + elif np.issubdtype(dtype, np.bool_): + array = rng.randint(0, 1, size=shapes).astype(dtype) + elif dtype == str: + array = rng.choice(['a', 'b', 'c', 'd'], size=shapes) + else: + raise ValueError + da = DataArray(array, dims=dims, coords={'x': np.arange(16)}, name='da') if contains_nan: da = da.reindex(x=np.arange(20)) @@ -216,6 +225,11 @@ def test_reduce(dim_num, dtype, dask, func, skipna, aggdim): expected = series_reduce(da, func, skipna=skipna, dim=aggdim) assert_allclose(actual, expected, rtol=rtol) + # make sure the dtype argument + if func not in ['max', 'min']: + actual = getattr(da, func)(skipna=skipna, dim=aggdim, dtype=float) + assert actual.dtype == float + # without nan da = construct_dataarray(dim_num, dtype, contains_nan=False, dask=dask) actual = getattr(da, func)(skipna=skipna) @@ -224,7 +238,7 @@ def test_reduce(dim_num, dtype, dask, func, skipna, aggdim): @pytest.mark.parametrize('dim_num', [1, 2]) -@pytest.mark.parametrize('dtype', [float, int, np.float32, np.bool_]) +@pytest.mark.parametrize('dtype', [float, int, np.float32, np.bool_, str]) @pytest.mark.parametrize('contains_nan', [True, False]) @pytest.mark.parametrize('dask', [False, True]) @pytest.mark.parametrize('func', ['min', 'max']) From f93a61893cbda7f3d41272881b876df60aa84020 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Tue, 13 Feb 2018 08:32:42 +0900 Subject: [PATCH 17/19] added get_pos_inifinity and get_neg_inifinity --- xarray/core/dtypes.py | 40 ++++++++++++++++++++++++----- xarray/core/duck_array_ops.py | 21 ++++++++++----- xarray/tests/test_duck_array_ops.py | 8 ++++++ 3 files changed, 56 insertions(+), 13 deletions(-) diff --git a/xarray/core/dtypes.py b/xarray/core/dtypes.py index 5396d6f2ee8..6fb28ee90ee 100644 --- a/xarray/core/dtypes.py +++ b/xarray/core/dtypes.py @@ -64,7 +64,7 @@ def maybe_promote(dtype): return np.dtype(dtype), fill_value -def get_fill_value(dtype, fill_value_typ=None): +def get_fill_value(dtype): """Return an appropriate fill value for this dtype. Parameters @@ -75,14 +75,42 @@ def get_fill_value(dtype, fill_value_typ=None): ------- fill_value : Missing value corresponding to this dtype. """ - if fill_value_typ is None: - _, fill_value = maybe_promote(dtype) - return fill_value + _, fill_value = maybe_promote(dtype) + return fill_value + +def get_pos_infinity(dtype): + """Return an appropriate positive infinity for this dtype. + + Parameters + ---------- + dtype : np.dtype + + Returns + ------- + fill_value : positive infinity value corresponding to this dtype. + """ + if np.issubdtype(dtype, np.floating): + return np.inf + + return INF + + +def get_neg_infinity(dtype): + """Return an appropriate positive infinity for this dtype. + + Parameters + ---------- + dtype : np.dtype + + Returns + ------- + fill_value : positive infinity value corresponding to this dtype. + """ if np.issubdtype(dtype, np.floating): - return np.inf if fill_value_typ == '+inf' else -np.inf + return -np.inf - return INF if fill_value_typ == '+inf' else NINF + return NINF def is_datetime_like(dtype): diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index 7040f4133ad..46867df3861 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -179,7 +179,10 @@ def _nansum_object(value, axis=None, **kwargs): def _nan_minmax_object(func, fill_value_typ, value, axis=None, **kwargs): """ In house nanmin and nanmax for object array """ - fill_value = dtypes.get_fill_value(value.dtype, fill_value_typ) + if fill_value_typ == '+inf': + fill_value = dtypes.get_pos_infinity(value.dtype) + else: + fill_value = dtypes.get_neg_infinity(value.dtype) valid_count = count(value, axis=axis) filled_value = fillna(value, fill_value) data = _dask_or_eager_func(func)(filled_value, axis=axis, **kwargs) @@ -192,18 +195,21 @@ def _nan_minmax_object(func, fill_value_typ, value, axis=None, **kwargs): def _nan_argminmax_object(func, fill_value_typ, value, axis=None, **kwargs): """ In house nanargmin, nanargmax for object arrays. Always return integer type """ - fill_value = dtypes.get_fill_value(value.dtype, fill_value_typ) + if fill_value_typ == '+inf': + fill_value = dtypes.get_pos_infinity(value.dtype) + else: + fill_value = dtypes.get_neg_infinity(value.dtype) valid_count = count(value, axis=axis) value = fillna(value, fill_value) data = _dask_or_eager_func(func)(value, axis=axis, **kwargs) # dask seems return non-integer type if isinstance(value, dask_array_type): data = data.astype(int) - if not hasattr(data, 'dtype'): # scalar case - # TODO should we raise ValueError if all-nan slice encountered? - data = -1 if valid_count == 0 else int(data) - return np.array(data) - return where_method(data, valid_count != 0, -1) + + if (valid_count == 0).any(): + raise ValueError('All-NaN slice encountered') + + return np.array(data, dtype=int) def _nanmean_ddof_object(ddof, value, axis=None, **kwargs): @@ -211,6 +217,7 @@ def _nanmean_ddof_object(ddof, value, axis=None, **kwargs): valid_count = count(value, axis=axis) value = fillna(value, 0) # As dtype inference is impossible for object dtype, we assume float + # https://github.com/dask/dask/issues/3162 dtype = kwargs.pop('dtype', None) if dtype is None and value.dtype.kind == 'O': dtype = value.dtype if value.dtype.kind in ['cf'] else float diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py index 97d9e907c83..6121a23e313 100644 --- a/xarray/tests/test_duck_array_ops.py +++ b/xarray/tests/test_duck_array_ops.py @@ -264,6 +264,14 @@ def test_argmin_max(dim_num, dtype, contains_nan, dask, func, skipna, aggdim): da = construct_dataarray(dim_num, dtype, contains_nan=contains_nan, dask=dask) + + if aggdim == 'y' and contains_nan and skipna: + with pytest.raises(ValueError): + actual = da.isel(**{ + aggdim: getattr(da, 'arg'+func)(dim=aggdim, + skipna=skipna).compute()}) + return + actual = da.isel(**{ aggdim: getattr(da, 'arg'+func)(dim=aggdim, skipna=skipna).compute()}) expected = getattr(da, func)(dim=aggdim, skipna=skipna) From 28f0e0adee1cf21e49c914d611d0fd4afa37b6e3 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Thu, 15 Feb 2018 18:13:38 +0900 Subject: [PATCH 18/19] Use function for get_fill_value instead of str. Add test to make sure it raises ValueError in argmin/argmax. --- xarray/core/dtypes.py | 56 +++++++++++++++++++++++++++-- xarray/core/duck_array_ops.py | 24 ++++++------- xarray/tests/test_duck_array_ops.py | 11 +++++- 3 files changed, 74 insertions(+), 17 deletions(-) diff --git a/xarray/core/dtypes.py b/xarray/core/dtypes.py index 12ce395c53b..8dac39612e4 100644 --- a/xarray/core/dtypes.py +++ b/xarray/core/dtypes.py @@ -8,6 +8,29 @@ NA = utils.ReprObject('') +@functools.total_ordering +class AlwaysGreaterThan(object): + def __gt__(self, other): + return True + + def __eq__(self, other): + return isinstance(other, type(self)) + + +@functools.total_ordering +class AlwaysLessThan(object): + def __lt__(self, other): + return True + + def __eq__(self, other): + return isinstance(other, type(self)) + + +# Equivalence to np.inf (-np.inf) for object-type +INF = AlwaysGreaterThan() +NINF = AlwaysLessThan() + + # Pairs of types that, if both found, should be promoted to object dtype # instead of following NumPy's own type-promotion rules. These type promotion # rules match pandas instead. For reference, see the NumPy type hierarchy: @@ -19,6 +42,29 @@ ] +@functools.total_ordering +class AlwaysGreaterThan(object): + def __gt__(self, other): + return True + + def __eq__(self, other): + return isinstance(other, type(self)) + + +@functools.total_ordering +class AlwaysLessThan(object): + def __lt__(self, other): + return True + + def __eq__(self, other): + return isinstance(other, type(self)) + + +# Equivalence to np.inf (-np.inf) for object-type +INF = AlwaysGreaterThan() +NINF = AlwaysLessThan() + + def maybe_promote(dtype): """Simpler equivalent of pandas.core.common._maybe_promote @@ -78,9 +124,12 @@ def get_pos_infinity(dtype): ------- fill_value : positive infinity value corresponding to this dtype. """ - if np.issubdtype(dtype, np.floating): + if issubclass(dtype.type, (np.floating, np.integer)): return np.inf + if issubclass(dtype.type, np.complexfloating): + return np.inf + 1j * np.inf + return INF @@ -95,9 +144,12 @@ def get_neg_infinity(dtype): ------- fill_value : positive infinity value corresponding to this dtype. """ - if np.issubdtype(dtype, np.floating): + if issubclass(dtype.type, (np.floating, np.integer)): return -np.inf + if issubclass(dtype.type, np.complexfloating): + return -np.inf - 1j * np.inf + return NINF diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index a624722129c..6f5548800a2 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -203,12 +203,9 @@ def _nansum_object(value, axis=None, **kwargs): return _dask_or_eager_func('sum')(value, axis=axis, **kwargs) -def _nan_minmax_object(func, fill_value_typ, value, axis=None, **kwargs): +def _nan_minmax_object(func, get_fill_value, value, axis=None, **kwargs): """ In house nanmin and nanmax for object array """ - if fill_value_typ == '+inf': - fill_value = dtypes.get_pos_infinity(value.dtype) - else: - fill_value = dtypes.get_neg_infinity(value.dtype) + fill_value = get_fill_value(value.dtype) valid_count = count(value, axis=axis) filled_value = fillna(value, fill_value) data = _dask_or_eager_func(func)(filled_value, axis=axis, **kwargs) @@ -218,13 +215,10 @@ def _nan_minmax_object(func, fill_value_typ, value, axis=None, **kwargs): return where_method(data, valid_count != 0) -def _nan_argminmax_object(func, fill_value_typ, value, axis=None, **kwargs): +def _nan_argminmax_object(func, get_fill_value, value, axis=None, **kwargs): """ In house nanargmin, nanargmax for object arrays. Always return integer type """ - if fill_value_typ == '+inf': - fill_value = dtypes.get_pos_infinity(value.dtype) - else: - fill_value = dtypes.get_neg_infinity(value.dtype) + fill_value = get_fill_value(value.dtype) valid_count = count(value, axis=axis) value = fillna(value, fill_value) data = _dask_or_eager_func(func)(value, axis=axis, **kwargs) @@ -265,10 +259,12 @@ def _nanvar_object(value, axis=None, **kwargs): _nan_object_funcs = { 'sum': _nansum_object, - 'min': partial(_nan_minmax_object, 'min', '+inf'), - 'max': partial(_nan_minmax_object, 'max', '-inf'), - 'argmin': partial(_nan_argminmax_object, 'argmin', '+inf'), - 'argmax': partial(_nan_argminmax_object, 'argmax', '-inf'), + 'min': partial(_nan_minmax_object, 'min', dtypes.get_pos_infinity), + 'max': partial(_nan_minmax_object, 'max', dtypes.get_neg_infinity), + 'argmin': partial(_nan_argminmax_object, 'argmin', + dtypes.get_pos_infinity), + 'argmax': partial(_nan_argminmax_object, 'argmax', + dtypes.get_neg_infinity), 'mean': partial(_nanmean_ddof_object, 0), 'var': _nanvar_object, } diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py index 9d3b3ff65f8..d68a7a382de 100644 --- a/xarray/tests/test_duck_array_ops.py +++ b/xarray/tests/test_duck_array_ops.py @@ -250,7 +250,10 @@ def test_reduce(dim_num, dtype, dask, func, skipna, aggdim): da = construct_dataarray(dim_num, dtype, contains_nan=False, dask=dask) actual = getattr(da, func)(skipna=skipna) expected = getattr(np, 'nan{}'.format(func))(da.values) - assert np.allclose(actual.values, np.array(expected), rtol=rtol) + if actual.dtype == object: + assert actual.values == np.array(expected) + else: + assert np.allclose(actual.values, np.array(expected), rtol=rtol) @pytest.mark.parametrize('dim_num', [1, 2]) @@ -292,3 +295,9 @@ def test_argmin_max(dim_num, dtype, contains_nan, dask, func, skipna, aggdim): aggdim: getattr(da, 'arg'+func)(dim=aggdim, skipna=skipna).compute()}) expected = getattr(da, func)(dim=aggdim, skipna=skipna) assert_allclose(actual.drop(actual.coords), expected.drop(expected.coords)) + + +def test_argmin_max_error(): + da = construct_dataarray(2, np.bool_, contains_nan=True, dask=False) + with pytest.raises(ValueError): + da.argmin(dim='y') From e46d07de2dcaf7df1bf12e94c8ad70aa8a7cb10b Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Thu, 15 Feb 2018 20:35:54 +0900 Subject: [PATCH 19/19] Tests for dtypes.INF --- xarray/tests/test_dtypes.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/xarray/tests/test_dtypes.py b/xarray/tests/test_dtypes.py index 51c1aaa4c0c..1b236e0160d 100644 --- a/xarray/tests/test_dtypes.py +++ b/xarray/tests/test_dtypes.py @@ -46,3 +46,9 @@ def error(): # would get promoted to float32 actual = dtypes.result_type(array, np.array([0.5, 1.0], dtype=np.float32)) assert actual == np.float64 + + +@pytest.mark.parametrize('obj', [1.0, np.inf, 'ab', 1.0 + 1.0j, True]) +def test_inf(obj): + assert dtypes.INF > obj + assert dtypes.NINF < obj