From 630187f94a1949d140148ae93fe4e9a8ec23d4fb Mon Sep 17 00:00:00 2001
From: Jeff Reback <jeff@reback.net>
Date: Thu, 13 Aug 2015 11:08:44 -0400
Subject: [PATCH] API: Series.sum() will now return 0.0 for all-NaN series;
 this is for compat with numpy >= 1.8.2 and bottleneck >= 1.0, #9422

     Series.prod() likewise returns 1.0 for all-NaN series; note that passing skipna=False will still return NaN
---
 doc/source/whatsnew/v0.17.0.txt | 14 +++++++
 pandas/__init__.py              |  1 +
 pandas/core/groupby.py          | 30 ++++++++-----
 pandas/core/nanops.py           | 46 +++++++++++++++++---
 pandas/tests/test_frame.py      | 44 +++++++++++++++-----
 pandas/tests/test_nanops.py     | 74 ++++++++++++++++++++++++++++-----
 pandas/tests/test_panel.py      | 18 +++++---
 pandas/tests/test_series.py     |  4 +-
 8 files changed, 187 insertions(+), 44 deletions(-)

diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt
index 13764543ec665..8078e459f2ee2 100644
--- a/doc/source/whatsnew/v0.17.0.txt
+++ b/doc/source/whatsnew/v0.17.0.txt
@@ -551,6 +551,20 @@ Other API Changes
 
 - Improved error message when concatenating an empty iterable of dataframes (:issue:`9157`)
 
+- ``Series.sum()`` will now return 0.0, and ``Series.prod()`` will return 1.0 for all-NaN series rather than ``NaN``; this is for compat with ``numpy`` >= 1.8.2 and ``bottleneck`` >= 1.0 (:issue:`9422`).
+
+   .. ipython:: python
+
+      s = Series([np.nan])
+      s.sum()
+      s.sum(skipna=False)
+      s.prod()
+      s.prod(skipna=False)
+
+   .. warning::
+
+      ``bottleneck`` is used for these calculations. If you have ``bottleneck`` < 1.0, then these will all return ``NaN``.
+
 .. _whatsnew_0170.deprecations:
 
 Deprecations
diff --git a/pandas/__init__.py b/pandas/__init__.py
index dbc697410da80..61ced12a36ae1 100644
--- a/pandas/__init__.py
+++ b/pandas/__init__.py
@@ -27,6 +27,7 @@
 _np_version = np.version.short_version
 _np_version_under1p8 = LooseVersion(_np_version) < '1.8'
 _np_version_under1p9 = LooseVersion(_np_version) < '1.9'
+_np_version_under1p10 = LooseVersion(_np_version) < '1.10'
 
 
 from pandas.info import __doc__
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index d23cb39c15548..baefc91a9fb5b 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -102,11 +102,11 @@ class SpecificationError(GroupByError):
 
 
 def _groupby_function(name, alias, npfunc, numeric_only=True,
-                      _convert=False):
+                      fillna=None, _convert=False):
     def f(self):
         self._set_selection_from_grouper()
         try:
-            return self._cython_agg_general(alias, numeric_only=numeric_only)
+            return self._cython_agg_general(alias, numeric_only=numeric_only, fillna=fillna)
         except AssertionError as e:
             raise SpecificationError(str(e))
         except Exception:
@@ -793,8 +793,8 @@ def size(self):
         """
         return self.grouper.size()
 
-    sum = _groupby_function('sum', 'add', np.sum)
-    prod = _groupby_function('prod', 'prod', np.prod)
+    sum = _groupby_function('sum', 'add', np.sum, fillna=0.0)
+    prod = _groupby_function('prod', 'prod', np.prod, fillna=1.0)
     min = _groupby_function('min', 'min', np.min, numeric_only=False)
     max = _groupby_function('max', 'max', np.max, numeric_only=False)
     first = _groupby_function('first', 'first', _first_compat,
@@ -1118,7 +1118,7 @@ def _try_cast(self, result, obj):
 
         return result
 
-    def _cython_agg_general(self, how, numeric_only=True):
+    def _cython_agg_general(self, how, numeric_only=True, fillna=None):
         output = {}
         for name, obj in self._iterate_slices():
             is_numeric = is_numeric_dtype(obj.dtype)
@@ -1126,7 +1126,7 @@ def _cython_agg_general(self, how, numeric_only=True):
                 continue
 
             try:
-                result, names = self.grouper.aggregate(obj.values, how)
+                result, names = self.grouper.aggregate(obj.values, how, fillna=fillna)
             except AssertionError as e:
                 raise GroupByError(str(e))
             output[name] = self._try_cast(result, obj)
@@ -1511,7 +1511,7 @@ def wrapper(*args, **kwargs):
                                       (how, dtype_str))
         return func, dtype_str
 
-    def aggregate(self, values, how, axis=0):
+    def aggregate(self, values, how, axis=0, fillna=None):
         arity = self._cython_arity.get(how, 1)
 
         vdim = values.ndim
@@ -1534,14 +1534,18 @@ def aggregate(self, values, how, axis=0):
             values = values.view('int64')
             # GH 7754
             is_numeric = True
+            fillna = None
         elif is_bool_dtype(values.dtype):
             values = _algos.ensure_float64(values)
+            fillna = None
         elif com.is_integer_dtype(values):
             values = values.astype('int64', copy=False)
+            fillna = None
         elif is_numeric:
             values = _algos.ensure_float64(values)
         else:
             values = values.astype(object)
+            fillna = None
 
         try:
             agg_func, dtype_str = self._get_aggregate_function(how, values)
@@ -1564,6 +1568,10 @@ def aggregate(self, values, how, axis=0):
 
         result = self._aggregate(result, counts, values, agg_func, is_numeric)
 
+        # if we have a non-None fillna, then replace
+        if fillna is not None:
+            result[np.isnan(result)] = fillna
+
         if com.is_integer_dtype(result):
             if len(result[result == tslib.iNaT]) > 0:
                 result = result.astype('float64')
@@ -2581,8 +2589,8 @@ def _iterate_slices(self):
                 continue
             yield val, slicer(val)
 
-    def _cython_agg_general(self, how, numeric_only=True):
-        new_items, new_blocks = self._cython_agg_blocks(how, numeric_only=numeric_only)
+    def _cython_agg_general(self, how, numeric_only=True, fillna=None):
+        new_items, new_blocks = self._cython_agg_blocks(how, numeric_only=numeric_only, fillna=fillna)
         return self._wrap_agged_blocks(new_items, new_blocks)
 
     def _wrap_agged_blocks(self, items, blocks):
@@ -2608,7 +2616,7 @@ def _wrap_agged_blocks(self, items, blocks):
 
     _block_agg_axis = 0
 
-    def _cython_agg_blocks(self, how, numeric_only=True):
+    def _cython_agg_blocks(self, how, numeric_only=True, fillna=None):
         data, agg_axis = self._get_data_to_aggregate()
 
         new_blocks = []
@@ -2620,7 +2628,7 @@ def _cython_agg_blocks(self, how, numeric_only=True):
 
             values = block._try_operate(block.values)
 
-            result, _ = self.grouper.aggregate(values, how, axis=agg_axis)
+            result, _ = self.grouper.aggregate(values, how, axis=agg_axis, fillna=fillna)
 
             # see if we can cast the block back to the original dtype
             result = block._try_coerce_and_cast_result(result)
diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
index c70fb6339517d..d004ab26d1424 100644
--- a/pandas/core/nanops.py
+++ b/pandas/core/nanops.py
@@ -9,7 +9,7 @@
     _USE_BOTTLENECK = False
 
 import pandas.hashtable as _hash
-from pandas import compat, lib, algos, tslib
+from pandas import compat, lib, algos, tslib, _np_version_under1p10
 from pandas.compat import builtins
 from pandas.core.common import (isnull, notnull, _values_from_object,
                                 _maybe_upcast_putmask,
@@ -243,12 +243,14 @@ def nanall(values, axis=None, skipna=True):
 @disallow('M8')
 @bottleneck_switch(zero_value=0)
 def nansum(values, axis=None, skipna=True):
+    dtype = values.dtype
     values, mask, dtype, dtype_max = _get_values(values, skipna, 0)
     dtype_sum = dtype_max
     if is_float_dtype(dtype):
         dtype_sum = dtype
     the_sum = values.sum(axis, dtype=dtype_sum)
-    the_sum = _maybe_null_out(the_sum, axis, mask)
+    the_sum = _maybe_null_out(the_sum, axis, mask, allow_all_null=not skipna,
+                              dtype=dtype, fill_value=0)
 
     return _wrap_results(the_sum, dtype)
 
@@ -549,12 +551,14 @@ def nankurt(values, axis=None, skipna=True):
 
 @disallow('M8','m8')
 def nanprod(values, axis=None, skipna=True):
+    dtype = values.dtype
     mask = isnull(values)
     if skipna and not is_any_int_dtype(values):
         values = values.copy()
         values[mask] = 1
     result = values.prod(axis)
-    return _maybe_null_out(result, axis, mask)
+    return _maybe_null_out(result, axis, mask, allow_all_null=not skipna, dtype=dtype,
+                           fill_value=1)
 
 
 def _maybe_arg_null_out(result, axis, mask, skipna):
@@ -588,7 +592,29 @@ def _get_counts(mask, axis, dtype=float):
         return np.array(count, dtype=dtype)
 
 
-def _maybe_null_out(result, axis, mask):
+def _maybe_null_out(result, axis, mask, allow_all_null=True, dtype=None, fill_value=None):
+
+
+    # GH 9422
+    # if all values are null we normally return a null,
+    # but for compat with numpy >= 1.8.2 and bottleneck >= 1.0
+    # nansum/nanprod return their fill_value instead
+    if not allow_all_null and dtype is not None:
+
+        if is_complex_dtype(dtype) or not is_float_dtype(dtype):
+
+            # we don't mask complex
+            # object or non-floats
+            # if numpy changes this, we will as well
+
+            # IOW, np.nansum(np.array([np.nan],dtype='object')) is np.nan
+            # https://github.com/numpy/numpy/issues/6209
+            allow_all_null = True
+            fill_value = np.nan
+
+    else:
+        fill_value = np.nan
+
     if axis is not None and getattr(result, 'ndim', False):
         null_mask = (mask.shape[axis] - mask.sum(axis)) == 0
         if np.any(null_mask):
@@ -596,11 +622,19 @@ def _maybe_null_out(result, axis, mask):
                 result = result.astype('c16')
             else:
                 result = result.astype('f8')
+
+            # mark nans
             result[null_mask] = np.nan
+
+            # only the all-NaN slices get the fill_value
+            if not allow_all_null:
+                null_mask = mask.all(axis)
+                if null_mask.any():
+                    result[null_mask] = fill_value
     else:
         null_mask = mask.size - mask.sum()
-        if null_mask == 0:
-            result = np.nan
+        if null_mask == 0 and (mask.size > 0 or allow_all_null):
+            result = fill_value
 
     return result
 
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
index 465f1da05ebde..f6004737ab97d 100644
--- a/pandas/tests/test_frame.py
+++ b/pandas/tests/test_frame.py
@@ -12230,10 +12230,10 @@ def test_count(self):
         assert_series_equal(result, expected)
 
     def test_sum(self):
-        self._check_stat_op('sum', np.sum, has_numeric_only=True)
+        self._check_stat_op('sum', np.sum, has_numeric_only=True, fillna=0.0)
 
         # mixed types (with upcasting happening)
-        self._check_stat_op('sum', np.sum, frame=self.mixed_float.astype('float32'),
+        self._check_stat_op('sum', np.sum, frame=self.mixed_float.astype('float32'), fillna=0.0,
                             has_numeric_only=True, check_dtype=False, check_less_precise=True)
 
     def test_stat_operators_attempt_obj_array(self):
@@ -12247,23 +12247,32 @@ def test_stat_operators_attempt_obj_array(self):
         df1 = DataFrame(data, index=['foo', 'bar', 'baz'],
                         dtype='O')
         methods = ['sum', 'mean', 'prod', 'var', 'std', 'skew', 'min', 'max']
+        fills = [0.0, np.nan, 1.0, np.nan, np.nan, np.nan, np.nan, np.nan]
 
         # GH #676
         df2 = DataFrame({0: [np.nan, 2], 1: [np.nan, 3],
                         2: [np.nan, 4]}, dtype=object)
 
         for df in [df1, df2]:
-            for meth in methods:
+            for meth, fill in zip(methods, fills):
                 self.assertEqual(df.values.dtype, np.object_)
                 result = getattr(df, meth)(1)
+
+                # 9422
+                # all-NaN object array is still NaN, while floats are not :<
                 expected = getattr(df.astype('f8'), meth)(1)
+                if not np.isnan(fill):
+                    mask = df.isnull().all(1)
+                    if mask.any():
+                        expected[mask] = np.nan
+
                 assert_series_equal(result, expected)
 
     def test_mean(self):
         self._check_stat_op('mean', np.mean, check_dates=True)
 
     def test_product(self):
-        self._check_stat_op('product', np.prod)
+        self._check_stat_op('product', np.prod, fillna=1.0)
 
     def test_median(self):
         def wrapper(x):
@@ -12435,7 +12444,7 @@ def alt(x):
 
     def _check_stat_op(self, name, alternative, frame=None, has_skipna=True,
                        has_numeric_only=False, check_dtype=True, check_dates=False,
-                       check_less_precise=False):
+                       check_less_precise=False, fillna=None):
         if frame is None:
             frame = self.frame
             # set some NAs
@@ -12478,11 +12487,20 @@ def wrapper(x):
             wrapper = alternative
 
         result0 = f(axis=0)
-        result1 = f(axis=1)
-        assert_series_equal(result0, frame.apply(skipna_wrapper),
+        expected0 = frame.apply(skipna_wrapper)
+        assert_series_equal(result0, expected0,
                             check_dtype=check_dtype,
                             check_less_precise=check_less_precise)
-        assert_series_equal(result1, frame.apply(skipna_wrapper, axis=1),
+
+        result1 = f(axis=1)
+
+        # 9422
+        # all-nan rows get the fillna
+        expected1 = frame.apply(skipna_wrapper, axis=1)
+        if fillna is not None:
+            expected1[isnull(frame).all(axis=1)] = fillna
+
+        assert_series_equal(result1, expected1,
                             check_dtype=False,
                             check_less_precise=check_less_precise)
 
@@ -12513,8 +12531,14 @@ def wrapper(x):
             all_na = self.frame * np.NaN
             r0 = getattr(all_na, name)(axis=0)
             r1 = getattr(all_na, name)(axis=1)
-            self.assertTrue(np.isnan(r0).all())
-            self.assertTrue(np.isnan(r1).all())
+
+            # 9422
+            if fillna is not None:
+                self.assertTrue((r0==fillna).all())
+                self.assertTrue((r1==fillna).all())
+            else:
+                self.assertTrue(np.isnan(r0).all())
+                self.assertTrue(np.isnan(r1).all())
 
     def test_mode(self):
         df = pd.DataFrame({"A": [12, 12, 11, 12, 19, 11],
diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py
index a903b76b3ac7f..36aa2d833024f 100644
--- a/pandas/tests/test_nanops.py
+++ b/pandas/tests/test_nanops.py
@@ -4,18 +4,28 @@
 from functools import partial
 
 import numpy as np
-from pandas import Series
+from pandas import Series, _np_version_under1p10
+
 from pandas.core.common import isnull, is_integer_dtype
 import pandas.core.nanops as nanops
 import pandas.util.testing as tm
 
-nanops._USE_BOTTLENECK = False
+_USE_BOTTLENECK = nanops._USE_BOTTLENECK
 
+class Base(object):
 
-class TestnanopsDataFrame(tm.TestCase):
     def setUp(self):
+        nanops._USE_BOTTLENECK = False
         np.random.seed(11235)
 
+    def tearDown(self):
+        nanops._USE_BOTTLENECK = _USE_BOTTLENECK
+
+class TestnanopsDataFrame(Base, tm.TestCase):
+
+    def setUp(self):
+        super(TestnanopsDataFrame, self).setUp()
+
         self.arr_shape = (11, 7, 5)
 
         self.arr_float = np.random.randn(*self.arr_shape)
@@ -172,15 +182,29 @@ def _coerce_tds(targ, res):
             tm.assert_almost_equal(targ.imag, res.imag)
 
     def check_fun_data(self, testfunc, targfunc,
-                       testarval, targarval, targarnanval, **kwargs):
+                       testarval, targarval, targarnanval, nanfunc=None, **kwargs):
+
+        otargfunc = targfunc
         for axis in list(range(targarval.ndim))+[None]:
             for skipna in [False, True]:
+
                 targartempval = targarval if skipna else targarnanval
                 try:
+
+                    # if a nanfunc (e.g. np.nansum) was provided, compare
+                    # against it when skipna=True, and against the
+                    # original targfunc when skipna=False
+                    if nanfunc is not None:
+                        if skipna:
+                            targfunc = nanfunc
+                        else:
+                            targfunc = otargfunc
+
                     targ = targfunc(targartempval, axis=axis, **kwargs)
                     res = testfunc(testarval, axis=axis, skipna=skipna,
                                    **kwargs)
                     self.check_results(targ, res, axis)
+
                     if skipna:
                         res = testfunc(testarval, axis=axis)
                         self.check_results(targ, res, axis)
@@ -205,9 +229,9 @@ def check_fun_data(self, testfunc, targfunc,
             targarnanval2 = np.take(targarnanval, 0, axis=-1)
         except ValueError:
             return
-        self.check_fun_data(testfunc, targfunc,
+        self.check_fun_data(testfunc, otargfunc,
                             testarval2, targarval2, targarnanval2,
-                            **kwargs)
+                            nanfunc=nanfunc, **kwargs)
 
     def check_fun(self, testfunc, targfunc,
                   testar, targar=None, targarnan=None,
@@ -228,7 +252,7 @@ def check_fun(self, testfunc, targfunc,
                          'targarnan: %s' % targarnan)
             raise
 
-    def check_funs(self, testfunc, targfunc,
+    def check_funs(self, testfunc, targfunc, nanfunc=None,
                    allow_complex=True, allow_all_nan=True, allow_str=True,
                    allow_date=True, allow_tdelta=True, allow_obj=True,
                    **kwargs):
@@ -242,7 +266,7 @@ def check_funs(self, testfunc, targfunc,
                 self.arr_bool.astype('O')]
 
         if allow_all_nan:
-            self.check_fun(testfunc, targfunc, 'arr_nan', **kwargs)
+            self.check_fun(testfunc, targfunc, 'arr_nan', nanfunc=nanfunc, **kwargs)
 
         if allow_complex:
             self.check_fun(testfunc, targfunc, 'arr_complex', **kwargs)
@@ -315,9 +339,15 @@ def test_nanall(self):
                         allow_all_nan=False, allow_str=False, allow_date=False, allow_tdelta=False)
 
     def test_nansum(self):
-        self.check_funs(nanops.nansum, np.sum,
+        self.check_funs(nanops.nansum, np.sum, nanfunc=np.nansum,
                         allow_str=False, allow_date=False, allow_tdelta=True)
 
+        # validate that nansum of all nans is 0, True for numpy >= 1.8.2 & bottleneck >= 1.0
+        # 9422
+        s = Series([np.nan])
+        self.assertEqual(s.sum(skipna=True),0.0)
+        self.assertIs(s.sum(skipna=False),np.nan)
+
     def test_nanmean(self):
         self.check_funs(nanops.nanmean, np.mean,
                         allow_complex=False, allow_obj=False,
@@ -450,9 +480,30 @@ def test_nankurt(self):
                         allow_complex=False, allow_str=False, allow_date=False, allow_tdelta=False)
 
     def test_nanprod(self):
-        self.check_funs(nanops.nanprod, np.prod,
+
+        # use np.nanprod if this numpy provides it,
+        # otherwise define a local fallback
+        nanfunc = getattr(np,'nanprod',None)
+        if nanfunc is None:
+            def nanprod(x, axis, **kwargs):
+                result = x.prod(axis=axis)
+                if np.isnan(result).all():
+                    if np.isscalar(result):
+                        result = 1
+                    else:
+                        result[np.isnan(result)] = 1
+                return result
+            nanfunc = nanprod
+
+        self.check_funs(nanops.nanprod, np.prod, nanfunc=nanfunc,
                         allow_str=False, allow_date=False, allow_tdelta=False)
 
+        # validate that nanprod of all nans is 1.0
+        # 9422
+        s = Series([np.nan])
+        self.assertEqual(s.prod(skipna=True),1.0)
+        self.assertIs(s.prod(skipna=False),np.nan)
+
     def check_nancorr_nancov_2d(self, checkfun, targ0, targ1, **kwargs):
         res00 = checkfun(self.arr_float_2d, self.arr_float1_2d,
                          **kwargs)
@@ -769,7 +820,8 @@ def test__bn_ok_dtype(self):
         self.assertFalse(nanops._bn_ok_dtype(self.arr_obj.dtype, 'test'))
 
 
-class TestEnsureNumeric(tm.TestCase):
+class TestEnsureNumeric(Base, tm.TestCase):
+
     def test_numeric_values(self):
         # Test integer
         self.assertEqual(nanops._ensure_numeric(1), 1, 'Failed for int')
diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py
index 9cdc769dd7d74..e504a63490f0b 100644
--- a/pandas/tests/test_panel.py
+++ b/pandas/tests/test_panel.py
@@ -70,13 +70,13 @@ def test_count(self):
         self._check_stat_op('count', f, obj=self.panel, has_skipna=False)
 
     def test_sum(self):
-        self._check_stat_op('sum', np.sum)
+        self._check_stat_op('sum', np.sum, fillna=0.0)
 
     def test_mean(self):
         self._check_stat_op('mean', np.mean)
 
     def test_prod(self):
-        self._check_stat_op('prod', np.prod)
+        self._check_stat_op('prod', np.prod, fillna=1.0)
 
     def test_median(self):
         def wrapper(x):
@@ -139,7 +139,7 @@ def alt(x):
 
     #     self._check_stat_op('skew', alt)
 
-    def _check_stat_op(self, name, alternative, obj=None, has_skipna=True):
+    def _check_stat_op(self, name, alternative, obj=None, has_skipna=True, fillna=None):
         if obj is None:
             obj = self.panel
 
@@ -161,14 +161,22 @@ def wrapper(x):
 
             for i in range(obj.ndim):
                 result = f(axis=i, skipna=False)
-                assert_frame_equal(result, obj.apply(wrapper, axis=i))
+                expected = obj.apply(wrapper, axis=i)
+                assert_frame_equal(result, expected)
         else:
             skipna_wrapper = alternative
             wrapper = alternative
 
         for i in range(obj.ndim):
             result = f(axis=i)
-            assert_frame_equal(result, obj.apply(skipna_wrapper, axis=i))
+            expected = obj.apply(skipna_wrapper, axis=i)
+
+            # 9422
+            # all-nan rows get the fillna
+            if fillna is not None:
+                expected[isnull(obj).all(axis=i)] = fillna
+
+            assert_frame_equal(result, expected)
 
         self.assertRaises(Exception, f, axis=obj.ndim)
 
diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py
index 56146df37a27f..c99b08c8ea99c 100644
--- a/pandas/tests/test_series.py
+++ b/pandas/tests/test_series.py
@@ -2762,7 +2762,9 @@ def testit():
                     self.assertTrue(bn.__version__ >= LooseVersion('1.0'))
                     self.assertEqual(f(allna),0.0)
                 except:
-                    self.assertTrue(np.isnan(f(allna)))
+
+                    # 10815: pandas itself now returns 0.0 here as well
+                    self.assertEqual(f(allna),0.0)
 
             # dtype=object with None, it works!
             s = Series([1, 2, 3, None, 5])