Skip to content

Commit 630187f

Browse files
committed
API: Series.sum() will now return 0.0 (and Series.prod() 1.0) for all-NaN series; this is for compatibility with numpy >= 1.8.2 and bottleneck >= 1.0, #9422
Note that passing skipna=False will still return NaN.
1 parent fa2378b commit 630187f

File tree

8 files changed

+187
-44
lines changed

8 files changed

+187
-44
lines changed

doc/source/whatsnew/v0.17.0.txt

+14
Original file line numberDiff line numberDiff line change
@@ -551,6 +551,20 @@ Other API Changes
551551

552552
- Improved error message when concatenating an empty iterable of dataframes (:issue:`9157`)
553553

554+
- ``Series.sum()`` will now return 0.0, and ``Series.prod()`` will return 1.0, for all-NaN series rather than ``NaN``; this is for compatibility with ``numpy`` >= 1.8.2 and ``bottleneck`` >= 1.0 (:issue:`9422`). Passing ``skipna=False`` still returns ``NaN``.
555+
556+
.. ipython:: python
557+
558+
s = Series([np.nan])
559+
s.sum()
560+
s.sum(skipna=False)
561+
s.prod()
562+
s.prod(skipna=False)
563+
564+
.. warning::
565+
566+
``bottleneck`` is used for these calculations. If you have ``bottleneck`` < 1.0 installed, then all of the calls shown above will return ``NaN``.
567+
554568
.. _whatsnew_0170.deprecations:
555569

556570
Deprecations

pandas/__init__.py

+1
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
_np_version = np.version.short_version
2828
_np_version_under1p8 = LooseVersion(_np_version) < '1.8'
2929
_np_version_under1p9 = LooseVersion(_np_version) < '1.9'
30+
_np_version_under1p10 = LooseVersion(_np_version) < '1.10'
3031

3132

3233
from pandas.info import __doc__

pandas/core/groupby.py

+19-11
Original file line numberDiff line numberDiff line change
@@ -102,11 +102,11 @@ class SpecificationError(GroupByError):
102102

103103

104104
def _groupby_function(name, alias, npfunc, numeric_only=True,
105-
_convert=False):
105+
fillna=None, _convert=False):
106106
def f(self):
107107
self._set_selection_from_grouper()
108108
try:
109-
return self._cython_agg_general(alias, numeric_only=numeric_only)
109+
return self._cython_agg_general(alias, numeric_only=numeric_only, fillna=fillna)
110110
except AssertionError as e:
111111
raise SpecificationError(str(e))
112112
except Exception:
@@ -793,8 +793,8 @@ def size(self):
793793
"""
794794
return self.grouper.size()
795795

796-
sum = _groupby_function('sum', 'add', np.sum)
797-
prod = _groupby_function('prod', 'prod', np.prod)
796+
sum = _groupby_function('sum', 'add', np.sum, fillna=0.0)
797+
prod = _groupby_function('prod', 'prod', np.prod, fillna=1.0)
798798
min = _groupby_function('min', 'min', np.min, numeric_only=False)
799799
max = _groupby_function('max', 'max', np.max, numeric_only=False)
800800
first = _groupby_function('first', 'first', _first_compat,
@@ -1118,15 +1118,15 @@ def _try_cast(self, result, obj):
11181118

11191119
return result
11201120

1121-
def _cython_agg_general(self, how, numeric_only=True):
1121+
def _cython_agg_general(self, how, numeric_only=True, fillna=None):
11221122
output = {}
11231123
for name, obj in self._iterate_slices():
11241124
is_numeric = is_numeric_dtype(obj.dtype)
11251125
if numeric_only and not is_numeric:
11261126
continue
11271127

11281128
try:
1129-
result, names = self.grouper.aggregate(obj.values, how)
1129+
result, names = self.grouper.aggregate(obj.values, how, fillna=fillna)
11301130
except AssertionError as e:
11311131
raise GroupByError(str(e))
11321132
output[name] = self._try_cast(result, obj)
@@ -1511,7 +1511,7 @@ def wrapper(*args, **kwargs):
15111511
(how, dtype_str))
15121512
return func, dtype_str
15131513

1514-
def aggregate(self, values, how, axis=0):
1514+
def aggregate(self, values, how, axis=0, fillna=None):
15151515
arity = self._cython_arity.get(how, 1)
15161516

15171517
vdim = values.ndim
@@ -1534,14 +1534,18 @@ def aggregate(self, values, how, axis=0):
15341534
values = values.view('int64')
15351535
# GH 7754
15361536
is_numeric = True
1537+
fillna = None
15371538
elif is_bool_dtype(values.dtype):
15381539
values = _algos.ensure_float64(values)
1540+
fillna = None
15391541
elif com.is_integer_dtype(values):
15401542
values = values.astype('int64', copy=False)
1543+
fillna = None
15411544
elif is_numeric:
15421545
values = _algos.ensure_float64(values)
15431546
else:
15441547
values = values.astype(object)
1548+
fillna = None
15451549

15461550
try:
15471551
agg_func, dtype_str = self._get_aggregate_function(how, values)
@@ -1564,6 +1568,10 @@ def aggregate(self, values, how, axis=0):
15641568

15651569
result = self._aggregate(result, counts, values, agg_func, is_numeric)
15661570

1571+
# if we have a non-None fillna, replace the NaN results with it
1572+
if fillna is not None:
1573+
result[np.isnan(result)] = fillna
1574+
15671575
if com.is_integer_dtype(result):
15681576
if len(result[result == tslib.iNaT]) > 0:
15691577
result = result.astype('float64')
@@ -2581,8 +2589,8 @@ def _iterate_slices(self):
25812589
continue
25822590
yield val, slicer(val)
25832591

2584-
def _cython_agg_general(self, how, numeric_only=True):
2585-
new_items, new_blocks = self._cython_agg_blocks(how, numeric_only=numeric_only)
2592+
def _cython_agg_general(self, how, numeric_only=True, fillna=None):
2593+
new_items, new_blocks = self._cython_agg_blocks(how, numeric_only=numeric_only, fillna=fillna)
25862594
return self._wrap_agged_blocks(new_items, new_blocks)
25872595

25882596
def _wrap_agged_blocks(self, items, blocks):
@@ -2608,7 +2616,7 @@ def _wrap_agged_blocks(self, items, blocks):
26082616

26092617
_block_agg_axis = 0
26102618

2611-
def _cython_agg_blocks(self, how, numeric_only=True):
2619+
def _cython_agg_blocks(self, how, numeric_only=True, fillna=None):
26122620
data, agg_axis = self._get_data_to_aggregate()
26132621

26142622
new_blocks = []
@@ -2620,7 +2628,7 @@ def _cython_agg_blocks(self, how, numeric_only=True):
26202628

26212629
values = block._try_operate(block.values)
26222630

2623-
result, _ = self.grouper.aggregate(values, how, axis=agg_axis)
2631+
result, _ = self.grouper.aggregate(values, how, axis=agg_axis, fillna=fillna)
26242632

26252633
# see if we can cast the block back to the original dtype
26262634
result = block._try_coerce_and_cast_result(result)

pandas/core/nanops.py

+40-6
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
_USE_BOTTLENECK = False
1010

1111
import pandas.hashtable as _hash
12-
from pandas import compat, lib, algos, tslib
12+
from pandas import compat, lib, algos, tslib, _np_version_under1p10
1313
from pandas.compat import builtins
1414
from pandas.core.common import (isnull, notnull, _values_from_object,
1515
_maybe_upcast_putmask,
@@ -243,12 +243,14 @@ def nanall(values, axis=None, skipna=True):
243243
@disallow('M8')
244244
@bottleneck_switch(zero_value=0)
245245
def nansum(values, axis=None, skipna=True):
246+
dtype = values.dtype
246247
values, mask, dtype, dtype_max = _get_values(values, skipna, 0)
247248
dtype_sum = dtype_max
248249
if is_float_dtype(dtype):
249250
dtype_sum = dtype
250251
the_sum = values.sum(axis, dtype=dtype_sum)
251-
the_sum = _maybe_null_out(the_sum, axis, mask)
252+
the_sum = _maybe_null_out(the_sum, axis, mask, allow_all_null=not skipna,
253+
dtype=dtype, fill_value=0)
252254

253255
return _wrap_results(the_sum, dtype)
254256

@@ -549,12 +551,14 @@ def nankurt(values, axis=None, skipna=True):
549551

550552
@disallow('M8','m8')
551553
def nanprod(values, axis=None, skipna=True):
554+
dtype = values.dtype
552555
mask = isnull(values)
553556
if skipna and not is_any_int_dtype(values):
554557
values = values.copy()
555558
values[mask] = 1
556559
result = values.prod(axis)
557-
return _maybe_null_out(result, axis, mask)
560+
return _maybe_null_out(result, axis, mask, allow_all_null=not skipna, dtype=dtype,
561+
fill_value=1)
558562

559563

560564
def _maybe_arg_null_out(result, axis, mask, skipna):
@@ -588,19 +592,49 @@ def _get_counts(mask, axis, dtype=float):
588592
return np.array(count, dtype=dtype)
589593

590594

591-
def _maybe_null_out(result, axis, mask):
595+
def _maybe_null_out(result, axis, mask, allow_all_null=True, dtype=None, fill_value=None):
596+
597+
598+
# 9422
599+
# if we have all nulls we normally return a
600+
# null, but for numpy >= 1.8.2 and bottleneck >= 1.0
601+
# nansum/nanprod return the fill_value instead (0 for sum, 1 for prod)
602+
if not allow_all_null and dtype is not None:
603+
604+
if is_complex_dtype(dtype) or not is_float_dtype(dtype):
605+
606+
# we don't mask complex
607+
# object or non-floats
608+
# if numpy changes this, we will as well
609+
610+
# IOW, np.nansum(np.array([np.nan],dtype='object')) is np.nan
611+
# https://github.com/numpy/numpy/issues/6209
612+
allow_all_null = True
613+
fill_value = np.nan
614+
615+
else:
616+
fill_value = np.nan
617+
592618
if axis is not None and getattr(result, 'ndim', False):
593619
null_mask = (mask.shape[axis] - mask.sum(axis)) == 0
594620
if np.any(null_mask):
595621
if np.iscomplexobj(result):
596622
result = result.astype('c16')
597623
else:
598624
result = result.astype('f8')
625+
626+
# mark nans
599627
result[null_mask] = np.nan
628+
629+
# this mask is only for the all-NaN entries
630+
if not allow_all_null:
631+
null_mask = mask.all(axis)
632+
if null_mask.any():
633+
result[null_mask] = fill_value
600634
else:
601635
null_mask = mask.size - mask.sum()
602-
if null_mask == 0:
603-
result = np.nan
636+
if null_mask == 0 and (mask.size > 0 or allow_all_null):
637+
result = fill_value
604638

605639
return result
606640

pandas/tests/test_frame.py

+34-10
Original file line numberDiff line numberDiff line change
@@ -12230,10 +12230,10 @@ def test_count(self):
1223012230
assert_series_equal(result, expected)
1223112231

1223212232
def test_sum(self):
12233-
self._check_stat_op('sum', np.sum, has_numeric_only=True)
12233+
self._check_stat_op('sum', np.sum, has_numeric_only=True, fillna=0.0)
1223412234

1223512235
# mixed types (with upcasting happening)
12236-
self._check_stat_op('sum', np.sum, frame=self.mixed_float.astype('float32'),
12236+
self._check_stat_op('sum', np.sum, frame=self.mixed_float.astype('float32'), fillna=0.0,
1223712237
has_numeric_only=True, check_dtype=False, check_less_precise=True)
1223812238

1223912239
def test_stat_operators_attempt_obj_array(self):
@@ -12247,23 +12247,32 @@ def test_stat_operators_attempt_obj_array(self):
1224712247
df1 = DataFrame(data, index=['foo', 'bar', 'baz'],
1224812248
dtype='O')
1224912249
methods = ['sum', 'mean', 'prod', 'var', 'std', 'skew', 'min', 'max']
12250+
fills = [0.0, np.nan, 1.0, np.nan, np.nan, np.nan, np.nan, np.nan]
1225012251

1225112252
# GH #676
1225212253
df2 = DataFrame({0: [np.nan, 2], 1: [np.nan, 3],
1225312254
2: [np.nan, 4]}, dtype=object)
1225412255

1225512256
for df in [df1, df2]:
12256-
for meth in methods:
12257+
for meth, fill in zip(methods, fills):
1225712258
self.assertEqual(df.values.dtype, np.object_)
1225812259
result = getattr(df, meth)(1)
12260+
12261+
# 9422
12262+
# all-NaN object array is still NaN, while floats are not :<
1225912263
expected = getattr(df.astype('f8'), meth)(1)
12264+
if not np.isnan(fill):
12265+
mask = df.isnull().all(1)
12266+
if mask.any():
12267+
expected[mask] = np.nan
12268+
1226012269
assert_series_equal(result, expected)
1226112270

1226212271
def test_mean(self):
1226312272
self._check_stat_op('mean', np.mean, check_dates=True)
1226412273

1226512274
def test_product(self):
12266-
self._check_stat_op('product', np.prod)
12275+
self._check_stat_op('product', np.prod, fillna=1.0)
1226712276

1226812277
def test_median(self):
1226912278
def wrapper(x):
@@ -12435,7 +12444,7 @@ def alt(x):
1243512444

1243612445
def _check_stat_op(self, name, alternative, frame=None, has_skipna=True,
1243712446
has_numeric_only=False, check_dtype=True, check_dates=False,
12438-
check_less_precise=False):
12447+
check_less_precise=False, fillna=None):
1243912448
if frame is None:
1244012449
frame = self.frame
1244112450
# set some NAs
@@ -12478,11 +12487,20 @@ def wrapper(x):
1247812487
wrapper = alternative
1247912488

1248012489
result0 = f(axis=0)
12481-
result1 = f(axis=1)
12482-
assert_series_equal(result0, frame.apply(skipna_wrapper),
12490+
expected0 = frame.apply(skipna_wrapper)
12491+
assert_series_equal(result0, expected0,
1248312492
check_dtype=check_dtype,
1248412493
check_less_precise=check_less_precise)
12485-
assert_series_equal(result1, frame.apply(skipna_wrapper, axis=1),
12494+
12495+
result1 = f(axis=1)
12496+
12497+
# 9422
12498+
# all-nan rows get the fillna
12499+
expected1 = frame.apply(skipna_wrapper, axis=1)
12500+
if fillna is not None:
12501+
expected1[isnull(frame).all(axis=1)] = fillna
12502+
12503+
assert_series_equal(result1, expected1,
1248612504
check_dtype=False,
1248712505
check_less_precise=check_less_precise)
1248812506

@@ -12513,8 +12531,14 @@ def wrapper(x):
1251312531
all_na = self.frame * np.NaN
1251412532
r0 = getattr(all_na, name)(axis=0)
1251512533
r1 = getattr(all_na, name)(axis=1)
12516-
self.assertTrue(np.isnan(r0).all())
12517-
self.assertTrue(np.isnan(r1).all())
12534+
12535+
# 9422
12536+
if fillna is not None:
12537+
self.assertTrue((r0==fillna).all())
12538+
self.assertTrue((r1==fillna).all())
12539+
else:
12540+
self.assertTrue(np.isnan(r0).all())
12541+
self.assertTrue(np.isnan(r1).all())
1251812542

1251912543
def test_mode(self):
1252012544
df = pd.DataFrame({"A": [12, 12, 11, 12, 19, 11],

0 commit comments

Comments
 (0)