ENH: make sure return dtypes for nan funcs are consistent #10251

Merged
merged 1 commit on Jun 5, 2015
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.16.2.txt
@@ -57,7 +57,7 @@ Bug Fixes
 - Bug where read_hdf store.select modifies the passed columns list when
   multi-indexed (:issue:`7212`)
 - Bug in ``Categorical`` repr with ``display.width`` of ``None`` in Python 3 (:issue:`10087`)
-
+- Bug where some of the nan funcs do not have consistent return dtypes (:issue:`10251`)
 - Bug in groupby.apply aggregation for Categorical not preserving categories (:issue:`10138`)
 - Bug in ``mean()`` where integer dtypes can overflow (:issue:`10172`)
 - Bug where Panel.from_dict does not set dtype when specified (:issue:`10058`)
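For reference, a minimal usage sketch of the behaviour this whatsnew entry describes. It mirrors the test_returned_dtype test added below; the commented dtypes are the results expected under this patch, not captured output:

import numpy as np
import pandas as pd

s_f32 = pd.Series(range(10), dtype=np.float32)
s_i64 = pd.Series(range(10), dtype=np.int64)

# float input is expected to keep its dtype through the nan-aware reductions
print(s_f32.mean().dtype, s_f32.std().dtype)    # float32 float32
# integer input still promotes to float64 for mean/std/var/skew/kurt
print(s_i64.mean().dtype)                       # float64
# min/max preserve the input dtype in both cases
print(s_i64.min().dtype, s_f32.max().dtype)     # int64 float32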
59 changes: 35 additions & 24 deletions pandas/core/nanops.py
@@ -244,7 +244,10 @@ def nanall(values, axis=None, skipna=True):
 @bottleneck_switch(zero_value=0)
 def nansum(values, axis=None, skipna=True):
     values, mask, dtype, dtype_max = _get_values(values, skipna, 0)
-    the_sum = values.sum(axis, dtype=dtype_max)
+    dtype_sum = dtype_max
+    if is_float_dtype(dtype):
+        dtype_sum = dtype
+    the_sum = values.sum(axis, dtype=dtype_sum)
     the_sum = _maybe_null_out(the_sum, axis, mask)

     return _wrap_results(the_sum, dtype)
@@ -288,7 +291,7 @@ def get_median(x):
             return np.nan
         return algos.median(_values_from_object(x[mask]))

-    if values.dtype != np.float64:
+    if not is_float_dtype(values):
         values = values.astype('f8')
     values[mask] = np.nan

Expand Down Expand Up @@ -317,10 +320,10 @@ def get_median(x):
return _wrap_results(get_median(values) if notempty else np.nan, dtype)


def _get_counts_nanvar(mask, axis, ddof):
count = _get_counts(mask, axis)

d = count-ddof
def _get_counts_nanvar(mask, axis, ddof, dtype=float):
dtype = _get_dtype(dtype)
count = _get_counts(mask, axis, dtype=dtype)
d = count - dtype.type(ddof)

# always return NaN, never inf
if np.isscalar(count):
@@ -341,15 +344,19 @@ def _nanvar(values, axis=None, skipna=True, ddof=1):
     if is_any_int_dtype(values):
         values = values.astype('f8')

-    count, d = _get_counts_nanvar(mask, axis, ddof)
+    if is_float_dtype(values):
+        count, d = _get_counts_nanvar(mask, axis, ddof, values.dtype)
+    else:
+        count, d = _get_counts_nanvar(mask, axis, ddof)

     if skipna:
         values = values.copy()
         np.putmask(values, mask, 0)

     X = _ensure_numeric(values.sum(axis))
     XX = _ensure_numeric((values ** 2).sum(axis))
-    return np.fabs((XX - X ** 2 / count) / d)
+    result = np.fabs((XX - X * X / count) / d)
+    return result

 @disallow('M8')
 @bottleneck_switch(ddof=1)
@@ -375,9 +382,9 @@ def nansem(values, axis=None, skipna=True, ddof=1):
     mask = isnull(values)
     if not is_float_dtype(values.dtype):
         values = values.astype('f8')
-    count, _ = _get_counts_nanvar(mask, axis, ddof)
+    count, _ = _get_counts_nanvar(mask, axis, ddof, values.dtype)

-    return np.sqrt(var)/np.sqrt(count)
+    return np.sqrt(var) / np.sqrt(count)


 @bottleneck_switch()
@@ -469,23 +476,25 @@ def nanskew(values, axis=None, skipna=True):
     mask = isnull(values)
     if not is_float_dtype(values.dtype):
         values = values.astype('f8')
-
-    count = _get_counts(mask, axis)
+        count = _get_counts(mask, axis)
+    else:
+        count = _get_counts(mask, axis, dtype=values.dtype)

     if skipna:
         values = values.copy()
         np.putmask(values, mask, 0)

+    typ = values.dtype.type
     A = values.sum(axis) / count
-    B = (values ** 2).sum(axis) / count - A ** 2
-    C = (values ** 3).sum(axis) / count - A ** 3 - 3 * A * B
+    B = (values ** 2).sum(axis) / count - A ** typ(2)
+    C = (values ** 3).sum(axis) / count - A ** typ(3) - typ(3) * A * B

     # floating point error
     B = _zero_out_fperr(B)
     C = _zero_out_fperr(C)

-    result = ((np.sqrt((count ** 2 - count)) * C) /
-              ((count - 2) * np.sqrt(B) ** 3))
+    result = ((np.sqrt(count * count - count) * C) /
+              ((count - typ(2)) * np.sqrt(B) ** typ(3)))

     if isinstance(result, np.ndarray):
         result = np.where(B == 0, 0, result)
@@ -504,17 +513,19 @@ def nankurt(values, axis=None, skipna=True):
     mask = isnull(values)
     if not is_float_dtype(values.dtype):
         values = values.astype('f8')
-
-    count = _get_counts(mask, axis)
+        count = _get_counts(mask, axis)
+    else:
+        count = _get_counts(mask, axis, dtype=values.dtype)

     if skipna:
         values = values.copy()
         np.putmask(values, mask, 0)

+    typ = values.dtype.type
     A = values.sum(axis) / count
-    B = (values ** 2).sum(axis) / count - A ** 2
-    C = (values ** 3).sum(axis) / count - A ** 3 - 3 * A * B
-    D = (values ** 4).sum(axis) / count - A ** 4 - 6 * B * A * A - 4 * C * A
+    B = (values ** 2).sum(axis) / count - A ** typ(2)
+    C = (values ** 3).sum(axis) / count - A ** typ(3) - typ(3) * A * B
+    D = (values ** 4).sum(axis) / count - A ** typ(4) - typ(6) * B * A * A - typ(4) * C * A

     B = _zero_out_fperr(B)
     D = _zero_out_fperr(D)
@@ -526,8 +537,8 @@
         if B == 0:
             return 0

-    result = (((count * count - 1.) * D / (B * B) - 3 * ((count - 1.) ** 2)) /
-              ((count - 2.) * (count - 3.)))
+    result = (((count * count - typ(1)) * D / (B * B) - typ(3) * ((count - typ(1)) ** typ(2))) /
+              ((count - typ(2)) * (count - typ(3))))

     if isinstance(result, np.ndarray):
         result = np.where(B == 0, 0, result)
@@ -598,7 +609,7 @@ def _zero_out_fperr(arg):
     if isinstance(arg, np.ndarray):
         return np.where(np.abs(arg) < 1e-14, 0, arg)
     else:
-        return 0 if np.abs(arg) < 1e-14 else arg
+        return arg.dtype.type(0) if np.abs(arg) < 1e-14 else arg


 @disallow('M8','m8')
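The nanops.py changes above all follow one pattern: every scalar that enters the computation (the counts, ddof, and the small constants in the skew/kurtosis formulas) is cast to the dtype of the input values, so intermediate arithmetic is not silently upcast to float64. A standalone NumPy sketch of that pattern, illustrative only and not the pandas implementation (naive_nanvar is a hypothetical helper):

import numpy as np

def naive_nanvar(values, ddof=1):
    # Keep every scalar (count, ddof, fill value) in the same dtype as the
    # data so intermediate arithmetic is not upcast to float64.
    values = np.asarray(values)
    mask = np.isnan(values)
    typ = values.dtype.type
    count = typ((~mask).sum())
    d = count - typ(ddof)
    safe = np.where(mask, typ(0), values)
    X = safe.sum()
    XX = (safe * safe).sum()
    return np.abs((XX - X * X / count) / d)

x = np.array([1.0, 2.0, np.nan, 4.0], dtype=np.float32)
print(naive_nanvar(x), naive_nanvar(x).dtype)   # expected to stay float32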
22 changes: 13 additions & 9 deletions pandas/tests/test_nanops.py
@@ -4,7 +4,7 @@
 from functools import partial

 import numpy as np
-
+from pandas import Series
 from pandas.core.common import isnull, is_integer_dtype
 import pandas.core.nanops as nanops
 import pandas.util.testing as tm
@@ -327,7 +327,6 @@ def test_nanmean_overflow(self):
         # GH 10155
         # In the previous implementation mean can overflow for int dtypes, it
         # is now consistent with numpy
-        from pandas import Series

         # numpy < 1.9.0 is not computing this correctly
         from distutils.version import LooseVersion
@@ -340,14 +339,19 @@ def test_nanmean_overflow(self):
         self.assertEqual(result, np_result)
         self.assertTrue(result.dtype == np.float64)

-        # check returned dtype
-        for dtype in [np.int16, np.int32, np.int64, np.float16, np.float32, np.float64]:
+    def test_returned_dtype(self):
+        for dtype in [np.int16, np.int32, np.int64, np.float32, np.float64, np.float128]:
             s = Series(range(10), dtype=dtype)
-            result = s.mean()
-            if is_integer_dtype(dtype):
-                self.assertTrue(result.dtype == np.float64)
-            else:
-                self.assertTrue(result.dtype == dtype)
+            group_a = ['mean', 'std', 'var', 'skew', 'kurt']
+            group_b = ['min', 'max']
+            for method in group_a + group_b:
+                result = getattr(s, method)()
+                if is_integer_dtype(dtype) and method in group_a:
+                    self.assertTrue(result.dtype == np.float64,
+                                    "return dtype expected from %s is np.float64, got %s instead" % (method, result.dtype))
+                else:
+                    self.assertTrue(result.dtype == dtype,
+                                    "return dtype expected from %s is %s, got %s instead" % (method, dtype, result.dtype))

     def test_nanmedian(self):
         self.check_funs(nanops.nanmedian, np.median,
17 changes: 8 additions & 9 deletions pandas/tests/test_series.py
@@ -528,7 +528,6 @@ def test_nansum_buglet(self):
         assert_almost_equal(result, 1)

     def test_overflow(self):
-
         # GH 6915
         # overflowing on the smaller int dtypes
         for dtype in ['int32','int64']:
@@ -551,25 +550,25 @@ def test_overflow(self):
             result = s.max()
             self.assertEqual(int(result),v[-1])

-        for dtype in ['float32','float64']:
-            v = np.arange(5000000,dtype=dtype)
+        for dtype in ['float32', 'float64']:
+            v = np.arange(5000000, dtype=dtype)
             s = Series(v)

             # no bottleneck
             result = s.sum(skipna=False)
-            self.assertTrue(np.allclose(float(result),v.sum(dtype='float64')))
+            self.assertEqual(result, v.sum(dtype=dtype))
             result = s.min(skipna=False)
-            self.assertTrue(np.allclose(float(result),0.0))
+            self.assertTrue(np.allclose(float(result), 0.0))
             result = s.max(skipna=False)
-            self.assertTrue(np.allclose(float(result),v[-1]))
+            self.assertTrue(np.allclose(float(result), v[-1]))

             # use bottleneck if available
             result = s.sum()
-            self.assertTrue(np.allclose(float(result),v.sum(dtype='float64')))
+            self.assertEqual(result, v.sum(dtype=dtype))
             result = s.min()
-            self.assertTrue(np.allclose(float(result),0.0))
+            self.assertTrue(np.allclose(float(result), 0.0))
             result = s.max()
-            self.assertTrue(np.allclose(float(result),v[-1]))
+            self.assertTrue(np.allclose(float(result), v[-1]))

 class SafeForSparse(object):
     pass
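A note on the test_overflow change above: a float32 accumulation and a float64 accumulation of the same five million values generally differ, so an exact comparison only makes sense against the NumPy sum computed in the same dtype, which is what the test now does. A quick illustration (the two printed sums are expected to differ slightly):

import numpy as np

v = np.arange(5000000, dtype=np.float32)
print(v.sum(dtype=np.float32))   # accumulated in float32
print(v.sum(dtype=np.float64))   # accumulated in float64; exact value is 12499997500000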