From d27e37af3bef8c88bc7b1d7ff7ac482cc71b9bb8 Mon Sep 17 00:00:00 2001 From: jreback Date: Sun, 5 Oct 2014 16:06:45 -0400 Subject: [PATCH] BUG: allow std to work with timedeltas (GH8471) --- doc/source/v0.15.0.txt | 2 +- pandas/core/generic.py | 86 ++++++++++--------------- pandas/core/nanops.py | 36 +++++++---- pandas/tests/test_nanops.py | 4 ++ pandas/tseries/tests/test_timedeltas.py | 21 ++++-- 5 files changed, 79 insertions(+), 70 deletions(-) diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt index 1b7b05847a901..5fdf9341bcd0e 100644 --- a/doc/source/v0.15.0.txt +++ b/doc/source/v0.15.0.txt @@ -638,7 +638,7 @@ TimedeltaIndex/Scalar We introduce a new scalar type ``Timedelta``, which is a subclass of ``datetime.timedelta``, and behaves in a similar manner, but allows compatibility with ``np.timedelta64`` types as well as a host of custom representation, parsing, and attributes. This type is very similar to how ``Timestamp`` works for ``datetimes``. It is a nice-API box for the type. See the :ref:`docs <timedeltas.timedeltas>`. -(:issue:`3009`, :issue:`4533`, :issue:`8209`, :issue:`8187`, :issue:`8190`, :issue:`7869`, :issue:`7661`, :issue:`8345`) +(:issue:`3009`, :issue:`4533`, :issue:`8209`, :issue:`8187`, :issue:`8190`, :issue:`7869`, :issue:`7661`, :issue:`8345`, :issue:`8471`) .. warning:: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index ffedeb9ade355..89736c27bf312 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3950,60 +3950,42 @@ def mad(self, axis=None, skipna=None, level=None, **kwargs): return np.abs(demeaned).mean(axis=axis, skipna=skipna) cls.mad = mad - @Substitution(outname='variance', - desc="Return unbiased variance over requested " - "axis.\n\nNormalized by N-1 by default. 
" - "This can be changed using the ddof argument") - @Appender(_num_doc) - def var(self, axis=None, skipna=None, level=None, ddof=1, **kwargs): - if skipna is None: - skipna = True - if axis is None: - axis = self._stat_axis_number - if level is not None: - return self._agg_by_level('var', axis=axis, level=level, - skipna=skipna, ddof=ddof) + def _make_stat_function_ddof(name, desc, f): - return self._reduce(nanops.nanvar, axis=axis, skipna=skipna, - ddof=ddof) - cls.var = var - - @Substitution(outname='stdev', - desc="Return unbiased standard deviation over requested " - "axis.\n\nNormalized by N-1 by default. " - "This can be changed using the ddof argument") - @Appender(_num_doc) - def std(self, axis=None, skipna=None, level=None, ddof=1, **kwargs): - if skipna is None: - skipna = True - if axis is None: - axis = self._stat_axis_number - if level is not None: - return self._agg_by_level('std', axis=axis, level=level, - skipna=skipna, ddof=ddof) - result = self.var(axis=axis, skipna=skipna, ddof=ddof) - if getattr(result, 'ndim', 0) > 0: - return result.apply(np.sqrt) - return np.sqrt(result) - cls.std = std - - @Substitution(outname='standarderror', - desc="Return unbiased standard error of the mean over " - "requested axis.\n\nNormalized by N-1 by default. 
" - "This can be changed using the ddof argument") - @Appender(_num_doc) - def sem(self, axis=None, skipna=None, level=None, ddof=1, **kwargs): - if skipna is None: - skipna = True - if axis is None: - axis = self._stat_axis_number - if level is not None: - return self._agg_by_level('sem', axis=axis, level=level, - skipna=skipna, ddof=ddof) + @Substitution(outname=name, desc=desc) + @Appender(_num_doc) + def stat_func(self, axis=None, skipna=None, level=None, ddof=1, + **kwargs): + if skipna is None: + skipna = True + if axis is None: + axis = self._stat_axis_number + if level is not None: + return self._agg_by_level(name, axis=axis, level=level, + skipna=skipna, ddof=ddof) + return self._reduce(f, axis=axis, + skipna=skipna, ddof=ddof) + stat_func.__name__ = name + return stat_func - return self._reduce(nanops.nansem, axis=axis, skipna=skipna, - ddof=ddof) - cls.sem = sem + cls.sem = _make_stat_function_ddof( + 'sem', + "Return unbiased standard error of the mean over " + "requested axis.\n\nNormalized by N-1 by default. " + "This can be changed using the ddof argument", + nanops.nansem) + cls.var = _make_stat_function_ddof( + 'var', + "Return unbiased variance over requested " + "axis.\n\nNormalized by N-1 by default. " + "This can be changed using the ddof argument", + nanops.nanvar) + cls.std = _make_stat_function_ddof( + 'std', + "Return unbiased standard deviation over requested " + "axis.\n\nNormalized by N-1 by default. 
" + "This can be changed using the ddof argument", + nanops.nanstd) @Substitution(outname='compounded', desc="Return the compound percentage of the values for " diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 3d6fa915d6b99..9703dba40a18a 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -228,7 +228,7 @@ def _wrap_results(result, dtype): if not isinstance(result, np.ndarray): result = lib.Timedelta(result) else: - result = result.view(dtype) + result = result.astype('i8').view(dtype) return result @@ -295,7 +295,7 @@ def get_median(x): if values.ndim > 1: # there's a non-empty array to apply over otherwise numpy raises if notempty: - return np.apply_along_axis(get_median, axis, values) + return _wrap_results(np.apply_along_axis(get_median, axis, values), dtype) # must return the correct shape, but median is not defined for the # empty set so return nans of shape "everything but the passed axis" @@ -305,7 +305,7 @@ def get_median(x): dims = np.arange(values.ndim) ret = np.empty(shp[dims != axis]) ret.fill(np.nan) - return ret + return _wrap_results(ret, dtype) # otherwise return a scalar value return _wrap_results(get_median(values) if notempty else np.nan, dtype) @@ -329,15 +329,8 @@ def _get_counts_nanvar(mask, axis, ddof): return count, d -@disallow('M8','m8') -@bottleneck_switch(ddof=1) -def nanvar(values, axis=None, skipna=True, ddof=1): - - # we are going to allow timedelta64[ns] here - # but NOT going to coerce them to the Timedelta type - # as this could cause overflow - # so var cannot be computed (but std can!) 
- +def _nanvar(values, axis=None, skipna=True, ddof=1): + # private nanvar calculator mask = isnull(values) if not _is_floating_dtype(values): values = values.astype('f8') @@ -352,6 +345,23 @@ def nanvar(values, axis=None, skipna=True, ddof=1): XX = _ensure_numeric((values ** 2).sum(axis)) return np.fabs((XX - X ** 2 / count) / d) +@disallow('M8') +@bottleneck_switch(ddof=1) +def nanstd(values, axis=None, skipna=True, ddof=1): + + result = np.sqrt(_nanvar(values, axis=axis, skipna=skipna, ddof=ddof)) + return _wrap_results(result, values.dtype) + +@disallow('M8','m8') +@bottleneck_switch(ddof=1) +def nanvar(values, axis=None, skipna=True, ddof=1): + + # we are going to allow timedelta64[ns] here + # but NOT going to coerce them to the Timedelta type + # as this could cause overflow + # so var cannot be computed (but std can!) + return _nanvar(values, axis=axis, skipna=skipna, ddof=ddof) + @disallow('M8','m8') def nansem(values, axis=None, skipna=True, ddof=1): var = nanvar(values, axis, skipna, ddof=ddof) @@ -517,7 +527,7 @@ def nankurt(values, axis=None, skipna=True): return result -@disallow('M8') +@disallow('M8','m8') def nanprod(values, axis=None, skipna=True): mask = isnull(values) if skipna and not _is_any_int_dtype(values): diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index 509ef4925bb66..3ec00fee1d151 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -332,6 +332,10 @@ def test_nanvar(self): self.check_funs_ddof(nanops.nanvar, np.var, allow_complex=False, allow_date=False, allow_tdelta=False) + def test_nanstd(self): + self.check_funs_ddof(nanops.nanstd, np.std, + allow_complex=False, allow_date=False, allow_tdelta=True) + def test_nansem(self): tm.skip_if_no_package('scipy.stats') self.check_funs_ddof(nanops.nansem, np.var, diff --git a/pandas/tseries/tests/test_timedeltas.py b/pandas/tseries/tests/test_timedeltas.py index 3d87751c296f3..282301499dcbc 100644 --- a/pandas/tseries/tests/test_timedeltas.py 
+++ b/pandas/tseries/tests/test_timedeltas.py @@ -479,6 +479,9 @@ def test_timedelta_ops(self): expected = to_timedelta(timedelta(seconds=9)) self.assertEqual(result, expected) + result = td.to_frame().mean() + self.assertEqual(result[0], expected) + result = td.quantile(.1) expected = Timedelta(np.timedelta64(2600,'ms')) self.assertEqual(result, expected) @@ -487,18 +490,28 @@ def test_timedelta_ops(self): expected = to_timedelta('00:00:08') self.assertEqual(result, expected) + result = td.to_frame().median() + self.assertEqual(result[0], expected) + # GH 6462 # consistency in returned values for sum result = td.sum() expected = to_timedelta('00:01:21') self.assertEqual(result, expected) - # you can technically do a std, but var overflows - # so this is tricky - self.assertRaises(TypeError, lambda : td.std()) + result = td.to_frame().sum() + self.assertEqual(result[0], expected) + + # std + result = td.std() + expected = to_timedelta(Series(td.dropna().values).std()) + self.assertEqual(result, expected) + + result = td.to_frame().std() + self.assertEqual(result[0], expected) # invalid ops - for op in ['skew','kurt','sem','var']: + for op in ['skew','kurt','sem','var','prod']: self.assertRaises(TypeError, lambda : getattr(td,op)()) def test_timedelta_ops_scalar(self):