diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index 13764543ec665..6b5ffded6c553 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -701,3 +701,5 @@ Bug Fixes - Bug in ``iloc`` allowing memory outside bounds of a Series to be accessed with negative integers (:issue:`10779`) - Bug in ``read_msgpack`` where encoding is not respected (:issue:`10580`) - Bug preventing access to the first index when using ``iloc`` with a list containing the appropriate negative integer (:issue:`10547`, :issue:`10779`) + +- Bug in stat functions (``sum``, ``mean``, etc.) returning a Python scalar rather than a NumPy scalar for an empty ``Series`` (:issue:`9733`) diff --git a/pandas/core/common.py b/pandas/core/common.py index aaa341240f538..f31bb5675f1a5 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -2484,11 +2484,18 @@ def is_integer_dtype(arr_or_dtype): return (issubclass(tipo, np.integer) and not issubclass(tipo, (np.datetime64, np.timedelta64))) + def is_int64_dtype(arr_or_dtype): tipo = _get_dtype_type(arr_or_dtype) return issubclass(tipo, np.int64) +def is_unsigned_integer_dtype(arr_or_dtype): + tipo = _get_dtype_type(arr_or_dtype) + return (issubclass(tipo, np.unsignedinteger) and + not issubclass(tipo, (np.datetime64, np.timedelta64))) + + def is_int_or_datetime_dtype(arr_or_dtype): tipo = _get_dtype_type(arr_or_dtype) return (issubclass(tipo, np.integer) or diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index c70fb6339517d..ce5dee32c47fb 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -18,6 +18,7 @@ is_float, is_integer, is_complex, is_float_dtype, is_complex_dtype, is_integer_dtype, + is_unsigned_integer_dtype, is_bool_dtype, is_object_dtype, is_datetime64_dtype, is_timedelta64_dtype, is_datetime_or_timedelta_dtype, _get_dtype, @@ -67,21 +68,7 @@ def f(values, axis=None, skipna=True, **kwds): if k not in kwds: kwds[k] = v try: - if self.zero_value is not None and values.size == 0: - if 
values.ndim == 1: - - # wrap the 0's if needed - if is_timedelta64_dtype(values): - return lib.Timedelta(0) - return 0 - else: - result_shape = (values.shape[:axis] + - values.shape[axis + 1:]) - result = np.empty(result_shape) - result.fill(0) - return result - - if _USE_BOTTLENECK and skipna and _bn_ok_dtype(values.dtype, + if values.size != 0 and _USE_BOTTLENECK and skipna and _bn_ok_dtype(values.dtype, bn_name): result = bn_func(values, axis=axis, **kwds) @@ -187,7 +174,10 @@ def _get_values(values, skipna, fill_value=None, fill_value_typ=None, # return a platform independent precision dtype dtype_max = dtype if is_integer_dtype(dtype) or is_bool_dtype(dtype): - dtype_max = np.int64 + if is_unsigned_integer_dtype(dtype): + dtype_max = np.uint64 + else: + dtype_max = np.int64 elif is_float_dtype(dtype): dtype_max = np.float64 @@ -241,14 +231,14 @@ def nanall(values, axis=None, skipna=True): @disallow('M8') -@bottleneck_switch(zero_value=0) +@bottleneck_switch() def nansum(values, axis=None, skipna=True): values, mask, dtype, dtype_max = _get_values(values, skipna, 0) dtype_sum = dtype_max if is_float_dtype(dtype): dtype_sum = dtype the_sum = values.sum(axis, dtype=dtype_sum) - the_sum = _maybe_null_out(the_sum, axis, mask) + the_sum = _maybe_null_out(the_sum, axis, mask, False) return _wrap_results(the_sum, dtype) @@ -414,7 +404,7 @@ def nanmin(values, axis=None, skipna=True): result = values.min(axis) result = _wrap_results(result, dtype) - return _maybe_null_out(result, axis, mask) + return _maybe_null_out(result, axis, mask, True) @bottleneck_switch() @@ -445,7 +435,7 @@ def nanmax(values, axis=None, skipna=True): result = values.max(axis) result = _wrap_results(result, dtype) - return _maybe_null_out(result, axis, mask) + return _maybe_null_out(result, axis, mask, True) def nanargmax(values, axis=None, skipna=True): @@ -554,7 +544,7 @@ def nanprod(values, axis=None, skipna=True): values = values.copy() values[mask] = 1 result = values.prod(axis) - return 
_maybe_null_out(result, axis, mask) + return _maybe_null_out(result, axis, mask, False) def _maybe_arg_null_out(result, axis, mask, skipna): @@ -588,9 +578,11 @@ def _get_counts(mask, axis, dtype=float): return np.array(count, dtype=dtype) -def _maybe_null_out(result, axis, mask): +def _maybe_null_out(result, axis, mask, null_on_empty): if axis is not None and getattr(result, 'ndim', False): null_mask = (mask.shape[axis] - mask.sum(axis)) == 0 + if not null_on_empty: + null_mask = null_mask & (mask.shape[axis] > 0) if np.any(null_mask): if np.iscomplexobj(result): result = result.astype('c16') @@ -599,9 +591,8 @@ def _maybe_null_out(result, axis, mask): result[null_mask] = np.nan else: null_mask = mask.size - mask.sum() - if null_mask == 0: - result = np.nan - + if null_mask == 0 and (mask.size > 0 or null_on_empty): + return np.nan return result diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 56146df37a27f..41613195d9575 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -3616,15 +3616,44 @@ def test_ops_consistency_on_empty(self): # GH 7869 # consistency on empty - # float - result = Series(dtype=float).sum() - self.assertEqual(result,0) + # Test type of empty Series - result = Series(dtype=float).mean() - self.assertTrue(isnull(result)) + ops = ['median', 'mean', 'sum', 'prod'] - result = Series(dtype=float).median() - self.assertTrue(isnull(result)) + # First test numpy types + # Just make sure that numpy and pandas have the same return type + for dtype in ['int8', 'int16', 'int32', 'int64', 'uint8', 'uint16', 'uint32', 'uint64', 'float16', 'float32', + 'float64', 'complex64', 'complex128']: + s = Series(dtype=dtype) + for op in ops: + result = getattr(s, op)() + np_type = getattr(np, dtype) + reference = getattr(np, op)(np_type([])) + if np.isnan(reference): + self.assertTrue(np.isnan(result), + msg="%s on empty %s Series: expecting nan, got %s" % (op, dtype, str(result))) + else: + 
self.assertEqual(result.dtype, reference.dtype, + msg="%s on empty %s Series: returned type %s, expected %s" % + (op, dtype, str(result.dtype), str(reference.dtype))) + self.assertEqual(result, reference, + msg='%s on empty %s Series: expected %s but received %s' % + (op, dtype, str(reference), str(result))) + + # Test str/unicode types + str_series = Series(dtype='str') + unicode_series = Series(dtype='unicode') + for op in ['median', 'mean', 'prod']: + # TODO: these operations should raise type errors + # self.assertRaises(TypeError, getattr(str_series, op)(), + # msg="%s on empty str Series should raise TypeError" % op) + # self.assertRaises(TypeError, getattr(unicode_series, op)(), + # msg="%s on empty unicode Series should raise TypeError" % op) + pass + + # TODO: these operations should return empty strings + # self.assertEqual('', str_series.sum()) + # self.assertEqual('', unicode_series.sum()) # timedelta64[ns] result = Series(dtype='m8[ns]').sum()