From 630187f94a1949d140148ae93fe4e9a8ec23d4fb Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 13 Aug 2015 11:08:44 -0400 Subject: [PATCH] API: Series.sum() will now return 0.0 for all-NaN series; this is for compat with numpy >= 1.8.2 and bottleneck >= 1.0, #9422 note that passing skipna=False will still return a NaN --- doc/source/whatsnew/v0.17.0.txt | 14 +++++++ pandas/__init__.py | 1 + pandas/core/groupby.py | 30 ++++++++----- pandas/core/nanops.py | 46 +++++++++++++++++--- pandas/tests/test_frame.py | 44 +++++++++++++++----- pandas/tests/test_nanops.py | 74 ++++++++++++++++++++++++++++----- pandas/tests/test_panel.py | 18 +++++--- pandas/tests/test_series.py | 4 +- 8 files changed, 187 insertions(+), 44 deletions(-) diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index 13764543ec665..8078e459f2ee2 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -551,6 +551,20 @@ Other API Changes - Improved error message when concatenating an empty iterable of dataframes (:issue:`9157`) +- ``Series.sum()`` will now return 0.0, and ``Series.prod()`` will return 1.0 for all-NaN series rather than ``NaN``; this is for compat with ``numpy`` >= 1.8.2 and ``bottleneck`` >= 1.0 (:issue:`9422`). + + .. ipython:: python + + s = Series([np.nan]) + s.sum() + s.sum(skipna=False) + s.prod() + s.prod(skipna=False) + + .. warning:: + + ``bottleneck`` is used for these calculations. If you have ``bottleneck`` < 1.0, then these will all return ``NaN``. + .. _whatsnew_0170.deprecations: Deprecations diff --git a/pandas/__init__.py b/pandas/__init__.py index dbc697410da80..61ced12a36ae1 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -27,6 +27,7 @@ _np_version = np.version.short_version _np_version_under1p8 = LooseVersion(_np_version) < '1.8' _np_version_under1p9 = LooseVersion(_np_version) < '1.9' +_np_version_under1p10 = LooseVersion(_np_version) < '1.10' from pandas.info import __doc__ diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index d23cb39c15548..baefc91a9fb5b 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -102,11 +102,11 @@ class SpecificationError(GroupByError): def _groupby_function(name, alias, npfunc, numeric_only=True, - _convert=False): + fillna=None, _convert=False): def f(self): self._set_selection_from_grouper() try: - return self._cython_agg_general(alias, numeric_only=numeric_only) + return self._cython_agg_general(alias, numeric_only=numeric_only, fillna=fillna) except AssertionError as e: raise SpecificationError(str(e)) except Exception: @@ -793,8 +793,8 @@ def size(self): """ return self.grouper.size() - sum = _groupby_function('sum', 'add', np.sum) - prod = _groupby_function('prod', 'prod', np.prod) + sum = _groupby_function('sum', 'add', np.sum, fillna=0.0) + prod = _groupby_function('prod', 'prod', np.prod, fillna=1.0) min = _groupby_function('min', 'min', np.min, numeric_only=False) max = _groupby_function('max', 'max', np.max, numeric_only=False) first = _groupby_function('first', 'first', _first_compat, @@ -1118,7 +1118,7 @@ def _try_cast(self, result, obj): return result - def _cython_agg_general(self, how, numeric_only=True): + def _cython_agg_general(self, how, numeric_only=True, fillna=None): output = {} for name, obj in self._iterate_slices(): is_numeric = is_numeric_dtype(obj.dtype) @@ -1126,7 +1126,7 @@ def _cython_agg_general(self, how, numeric_only=True): continue try: - result, names = self.grouper.aggregate(obj.values, how) + result, names = self.grouper.aggregate(obj.values, how, fillna=fillna) except AssertionError as e: raise GroupByError(str(e)) output[name] = self._try_cast(result, obj) @@ -1511,7 +1511,7 @@ def wrapper(*args, **kwargs): (how, dtype_str)) return func, dtype_str - def aggregate(self, values, how, axis=0): + def aggregate(self, values, how, axis=0, fillna=None): arity = self._cython_arity.get(how, 1) vdim = values.ndim @@ -1534,14 +1534,18 @@ def aggregate(self, values, how, axis=0): values = values.view('int64') # GH 7754 is_numeric = True + fillna = None elif is_bool_dtype(values.dtype): values = _algos.ensure_float64(values) + fillna = None elif com.is_integer_dtype(values): values = values.astype('int64', copy=False) + fillna = None elif is_numeric: values = _algos.ensure_float64(values) else: values = values.astype(object) + fillna = None try: agg_func, dtype_str = self._get_aggregate_function(how, values) @@ -1564,6 +1568,10 @@ def aggregate(self, values, how, axis=0): result = self._aggregate(result, counts, values, agg_func, is_numeric) + # if we have a non-None fillna, then replace + if fillna is not None: + result[np.isnan(result)] = fillna + if com.is_integer_dtype(result): if len(result[result == tslib.iNaT]) > 0: result = result.astype('float64') @@ -2581,8 +2589,8 @@ def _iterate_slices(self): continue yield val, slicer(val) - def _cython_agg_general(self, how, numeric_only=True): - new_items, new_blocks = self._cython_agg_blocks(how, numeric_only=numeric_only) + def _cython_agg_general(self, how, numeric_only=True, fillna=None): + new_items, new_blocks = self._cython_agg_blocks(how, numeric_only=numeric_only, fillna=fillna) return self._wrap_agged_blocks(new_items, new_blocks) def _wrap_agged_blocks(self, items, blocks): @@ -2608,7 +2616,7 @@ def _wrap_agged_blocks(self, items, blocks): _block_agg_axis = 0 - def _cython_agg_blocks(self, how, numeric_only=True): + def _cython_agg_blocks(self, how, numeric_only=True, fillna=None): data, agg_axis = self._get_data_to_aggregate() new_blocks = [] @@ -2620,7 +2628,7 @@ def _cython_agg_blocks(self, how, numeric_only=True): values = block._try_operate(block.values) - result, _ = self.grouper.aggregate(values, how, axis=agg_axis) + result, _ = self.grouper.aggregate(values, how, axis=agg_axis, fillna=fillna) # see if we can cast the block back to the original dtype result = block._try_coerce_and_cast_result(result) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index c70fb6339517d..d004ab26d1424 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -9,7 +9,7 @@ _USE_BOTTLENECK = False import pandas.hashtable as _hash -from pandas import compat, lib, algos, tslib +from pandas import compat, lib, algos, tslib, _np_version_under1p10 from pandas.compat import builtins from pandas.core.common import (isnull, notnull, _values_from_object, _maybe_upcast_putmask, @@ -243,12 +243,14 @@ def nanall(values, axis=None, skipna=True): @disallow('M8') @bottleneck_switch(zero_value=0) def nansum(values, axis=None, skipna=True): + dtype = values.dtype values, mask, dtype, dtype_max = _get_values(values, skipna, 0) dtype_sum = dtype_max if is_float_dtype(dtype): dtype_sum = dtype the_sum = values.sum(axis, dtype=dtype_sum) - the_sum = _maybe_null_out(the_sum, axis, mask) + the_sum = _maybe_null_out(the_sum, axis, mask, allow_all_null=not skipna, + dtype=dtype, fill_value=0) return _wrap_results(the_sum, dtype) @@ -549,12 +551,14 @@ def nankurt(values, axis=None, skipna=True): @disallow('M8','m8') def nanprod(values, axis=None, skipna=True): + dtype = values.dtype mask = isnull(values) if skipna and not is_any_int_dtype(values): values = values.copy() values[mask] = 1 result = values.prod(axis) - return _maybe_null_out(result, axis, mask) + return _maybe_null_out(result, axis, mask, allow_all_null=not skipna, dtype=dtype, + fill_value=1) def _maybe_arg_null_out(result, axis, mask, skipna): @@ -588,7 +592,29 @@ def _get_counts(mask, axis, dtype=float): return np.array(count, dtype=dtype) -def _maybe_null_out(result, axis, mask): +def _maybe_null_out(result, axis, mask, allow_all_null=True, dtype=None, fill_value=None): + + + # 9422 + # if we have all nulls we normally return a + # null, but for numpy >= 1.8.2 and bottleneck >= 1.0 + # nansum/nanprod are set to be the fill_values + if not allow_all_null and dtype is not None: + + if is_complex_dtype(dtype) or not is_float_dtype(dtype): + + # we don't mask complex + # object or non-floats + # if numpy changes this, we will as well + + # IOW, np.nansum(np.array([np.nan],dtype='object')) is np.nan + # https://github.com/numpy/numpy/issues/6209 + allow_all_null = True + fill_value = np.nan + + else: + fill_value = np.nan + if axis is not None and getattr(result, 'ndim', False): null_mask = (mask.shape[axis] - mask.sum(axis)) == 0 if np.any(null_mask): @@ -596,11 +622,19 @@ def _maybe_null_out(result, axis, mask): result = result.astype('c16') else: result = result.astype('f8') + + # mark nans result[null_mask] = np.nan + + # masker if for only all nan + if not allow_all_null: + null_mask = mask.all(axis) + if null_mask.any(): + result[null_mask] = fill_value else: null_mask = mask.size - mask.sum() - if null_mask == 0: - result = np.nan + if null_mask == 0 and (mask.size > 0 or allow_all_null): + result = fill_value return result diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 465f1da05ebde..f6004737ab97d 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -12230,10 +12230,10 @@ def test_count(self): assert_series_equal(result, expected) def test_sum(self): - self._check_stat_op('sum', np.sum, has_numeric_only=True) + self._check_stat_op('sum', np.sum, has_numeric_only=True, fillna=0.0) # mixed types (with upcasting happening) - self._check_stat_op('sum', np.sum, frame=self.mixed_float.astype('float32'), + self._check_stat_op('sum', np.sum, frame=self.mixed_float.astype('float32'), fillna=0.0, has_numeric_only=True, check_dtype=False, check_less_precise=True) def test_stat_operators_attempt_obj_array(self): @@ -12247,23 +12247,32 @@ def test_stat_operators_attempt_obj_array(self): df1 = DataFrame(data, index=['foo', 'bar', 'baz'], dtype='O') methods = ['sum', 'mean', 'prod', 'var', 'std', 'skew', 'min', 'max'] + fills = [0.0, np.nan, 1.0, np.nan, np.nan, np.nan, np.nan, np.nan] # GH #676 df2 = DataFrame({0: [np.nan, 2], 1: [np.nan, 3], 2: [np.nan, 4]}, dtype=object) for df in [df1, df2]: - for meth in methods: + for meth, fill in zip(methods, fills): self.assertEqual(df.values.dtype, np.object_) result = getattr(df, meth)(1) + + # 9422 + # all-NaN object array is still NaN, while floats are not :< expected = getattr(df.astype('f8'), meth)(1) + if not np.isnan(fill): + mask = df.isnull().all(1) + if mask.any(): + expected[mask] = np.nan + assert_series_equal(result, expected) def test_mean(self): self._check_stat_op('mean', np.mean, check_dates=True) def test_product(self): - self._check_stat_op('product', np.prod) + self._check_stat_op('product', np.prod, fillna=1.0) def test_median(self): def wrapper(x): @@ -12435,7 +12444,7 @@ def alt(x): def _check_stat_op(self, name, alternative, frame=None, has_skipna=True, has_numeric_only=False, check_dtype=True, check_dates=False, - check_less_precise=False): + check_less_precise=False, fillna=None): if frame is None: frame = self.frame # set some NAs @@ -12478,11 +12487,20 @@ def wrapper(x): wrapper = alternative result0 = f(axis=0) - result1 = f(axis=1) - assert_series_equal(result0, frame.apply(skipna_wrapper), + expected0 = frame.apply(skipna_wrapper) + assert_series_equal(result0, expected0, check_dtype=check_dtype, check_less_precise=check_less_precise) - assert_series_equal(result1, frame.apply(skipna_wrapper, axis=1), + + result1 = f(axis=1) + + # 9422 + # all-nan rows get the fillna + expected1 = frame.apply(skipna_wrapper, axis=1) + if fillna is not None: + expected1[isnull(frame).all(axis=1)] = fillna + + assert_series_equal(result1, expected1, check_dtype=False, check_less_precise=check_less_precise) @@ -12513,8 +12531,14 @@ def wrapper(x): all_na = self.frame * np.NaN r0 = getattr(all_na, name)(axis=0) r1 = getattr(all_na, name)(axis=1) - self.assertTrue(np.isnan(r0).all()) - self.assertTrue(np.isnan(r1).all()) + + # 9422 + if fillna is not None: + self.assertTrue((r0==fillna).all()) + self.assertTrue((r1==fillna).all()) + else: + self.assertTrue(np.isnan(r0).all()) + self.assertTrue(np.isnan(r1).all()) def test_mode(self): df = pd.DataFrame({"A": [12, 12, 11, 12, 19, 11], diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index a903b76b3ac7f..36aa2d833024f 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -4,18 +4,28 @@ from functools import partial import numpy as np -from pandas import Series +from pandas import Series, _np_version_under1p10 + from pandas.core.common import isnull, is_integer_dtype import pandas.core.nanops as nanops import pandas.util.testing as tm -nanops._USE_BOTTLENECK = False +_USE_BOTTLENECK = nanops._USE_BOTTLENECK +class Base(object): -class TestnanopsDataFrame(tm.TestCase): def setUp(self): + nanops._USE_BOTTLENECK = False np.random.seed(11235) + def tearDown(self): + nanops._USE_BOTTLENECK = _USE_BOTTLENECK + +class TestnanopsDataFrame(Base, tm.TestCase): + + def setUp(self): + super(TestnanopsDataFrame, self).setUp() + self.arr_shape = (11, 7, 5) self.arr_float = np.random.randn(*self.arr_shape) @@ -172,15 +182,29 @@ def _coerce_tds(targ, res): tm.assert_almost_equal(targ.imag, res.imag) def check_fun_data(self, testfunc, targfunc, - testarval, targarval, targarnanval, **kwargs): + testarval, targarval, targarnanval, nanfunc=None, **kwargs): + + otargfunc = targfunc for axis in list(range(targarval.ndim))+[None]: for skipna in [False, True]: + targartempval = targarval if skipna else targarnanval try: + + # we need a different comp function if + # we have a provided nanfunc (e.g. nansum) + # and we are skipna=False + if nanfunc is not None: + if skipna: + targfunc = nanfunc + else: + targfunc = otargfunc + targ = targfunc(targartempval, axis=axis, **kwargs) res = testfunc(testarval, axis=axis, skipna=skipna, **kwargs) self.check_results(targ, res, axis) + if skipna: res = testfunc(testarval, axis=axis) self.check_results(targ, res, axis) @@ -205,9 +229,9 @@ def check_fun_data(self, testfunc, targfunc, targarnanval2 = np.take(targarnanval, 0, axis=-1) except ValueError: return - self.check_fun_data(testfunc, targfunc, + self.check_fun_data(testfunc, otargfunc, testarval2, targarval2, targarnanval2, - **kwargs) + nanfunc=nanfunc, **kwargs) def check_fun(self, testfunc, targfunc, testar, targar=None, targarnan=None, @@ -228,7 +252,7 @@ def check_fun(self, testfunc, targfunc, 'targarnan: %s' % targarnan) raise - def check_funs(self, testfunc, targfunc, + def check_funs(self, testfunc, targfunc, nanfunc=None, allow_complex=True, allow_all_nan=True, allow_str=True, allow_date=True, allow_tdelta=True, allow_obj=True, **kwargs): @@ -242,7 +266,7 @@ def check_funs(self, testfunc, targfunc, self.arr_bool.astype('O')] if allow_all_nan: - self.check_fun(testfunc, targfunc, 'arr_nan', **kwargs) + self.check_fun(testfunc, targfunc, 'arr_nan', nanfunc=nanfunc, **kwargs) if allow_complex: self.check_fun(testfunc, targfunc, 'arr_complex', **kwargs) @@ -315,9 +339,15 @@ def test_nanall(self): allow_all_nan=False, allow_str=False, allow_date=False, allow_tdelta=False) def test_nansum(self): - self.check_funs(nanops.nansum, np.sum, + self.check_funs(nanops.nansum, np.sum, nanfunc=np.nansum, allow_str=False, allow_date=False, allow_tdelta=True) + # validate that nansum of all nans is 0, True for numpy >= 1.8.2 & bottleneck >= 1.0 + # 9422 + s = Series([np.nan]) + self.assertEqual(s.sum(skipna=True),0.0) + self.assertIs(s.sum(skipna=False),np.nan) + def test_nanmean(self): self.check_funs(nanops.nanmean, np.mean, allow_complex=False, allow_obj=False, @@ -450,9 +480,30 @@ def test_nankurt(self): allow_complex=False, allow_str=False, allow_date=False, allow_tdelta=False) def test_nanprod(self): - self.check_funs(nanops.nanprod, np.prod, + + # use nanprod if it exists + # otherwise by construction + nanfunc = getattr(np,'nanprod',None) + if nanfunc is None: + def nanprod(x, axis, **kwargs): + result = x.prod(axis=axis) + if np.isnan(result).all(): + if np.isscalar(result): + result = 1 + else: + result[np.isnan(result)] = 1 + return result + nanfunc = nanprod + + self.check_funs(nanops.nanprod, np.prod, nanfunc=nanfunc, allow_str=False, allow_date=False, allow_tdelta=False) + # validate that nanprod of all nans is 1.0 + # 9422 + s = Series([np.nan]) + self.assertEqual(s.prod(skipna=True),1.0) + self.assertIs(s.prod(skipna=False),np.nan) + def check_nancorr_nancov_2d(self, checkfun, targ0, targ1, **kwargs): res00 = checkfun(self.arr_float_2d, self.arr_float1_2d, **kwargs) @@ -769,7 +820,8 @@ def test__bn_ok_dtype(self): self.assertFalse(nanops._bn_ok_dtype(self.arr_obj.dtype, 'test')) -class TestEnsureNumeric(tm.TestCase): +class TestEnsureNumeric(Base, tm.TestCase): + def test_numeric_values(self): # Test integer self.assertEqual(nanops._ensure_numeric(1), 1, 'Failed for int') diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index 9cdc769dd7d74..e504a63490f0b 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -70,13 +70,13 @@ def test_count(self): self._check_stat_op('count', f, obj=self.panel, has_skipna=False) def test_sum(self): - self._check_stat_op('sum', np.sum) + self._check_stat_op('sum', np.sum, fillna=0.0) def test_mean(self): self._check_stat_op('mean', np.mean) def test_prod(self): - self._check_stat_op('prod', np.prod) + self._check_stat_op('prod', np.prod, fillna=1.0) def test_median(self): def wrapper(x): @@ -139,7 +139,7 @@ def alt(x): # self._check_stat_op('skew', alt) - def _check_stat_op(self, name, alternative, obj=None, has_skipna=True): + def _check_stat_op(self, name, alternative, obj=None, has_skipna=True, fillna=None): if obj is None: obj = self.panel @@ -161,14 +161,22 @@ def wrapper(x): for i in range(obj.ndim): result = f(axis=i, skipna=False) - assert_frame_equal(result, obj.apply(wrapper, axis=i)) + expected = obj.apply(wrapper, axis=i) + assert_frame_equal(result, expected) else: skipna_wrapper = alternative wrapper = alternative for i in range(obj.ndim): result = f(axis=i) - assert_frame_equal(result, obj.apply(skipna_wrapper, axis=i)) + expected = obj.apply(skipna_wrapper, axis=i) + + # 9422 + # all-nan rows get the fillna + if fillna is not None: + expected[isnull(obj).all(axis=i)] = fillna + + assert_frame_equal(result, expected) self.assertRaises(Exception, f, axis=obj.ndim) diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 56146df37a27f..c99b08c8ea99c 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -2762,7 +2762,9 @@ def testit(): self.assertTrue(bn.__version__ >= LooseVersion('1.0')) self.assertEqual(f(allna),0.0) except: - self.assertTrue(np.isnan(f(allna))) + + # 10815 pandas does as well + self.assertEqual(f(allna),0.0) # dtype=object with None, it works! s = Series([1, 2, 3, None, 5])