From c187ac935926694d446445baa9494476acae9ca2 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 7 Sep 2015 18:37:55 -0400 Subject: [PATCH] PERF: use NaT comparisons in int64/datetimelikes #11010 --- doc/source/whatsnew/v0.17.0.txt | 3 +-- pandas/core/groupby.py | 2 -- pandas/src/generate_code.py | 13 ++++++----- pandas/src/generated.pyx | 39 ++++++++++++++++++--------------- 4 files changed, 29 insertions(+), 28 deletions(-) diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index cbcee664d8be4..7100f78cb3c7a 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -1009,11 +1009,10 @@ Bug Fixes - Bug in ``to_json`` which was causing segmentation fault when serializing 0-rank ndarray (:issue:`9576`) - Bug in plotting functions may raise ``IndexError`` when plotted on ``GridSpec`` (:issue:`10819`) - Bug in plot result may show unnecessary minor ticklabels (:issue:`10657`) -- Bug in ``groupby`` incorrect computation for aggregation on ``DataFrame`` with ``NaT`` (E.g ``first``, ``last``, ``min``). (:issue:`10590`) +- Bug in ``groupby`` incorrect computation for aggregation on ``DataFrame`` with ``NaT`` (E.g ``first``, ``last``, ``min``). (:issue:`10590`, :issue:`11010`) - Bug when constructing ``DataFrame`` where passing a dictionary with only scalar values and specifying columns did not raise an error (:issue:`10856`) - Bug in ``.var()`` causing roundoff errors for highly similar values (:issue:`10242`) - Bug in ``DataFrame.plot(subplots=True)`` with duplicated columns outputs incorrect result (:issue:`10962`) - Bug in ``Index`` arithmetic may result in incorrect class (:issue:`10638`) - Bug in ``date_range`` results in empty if freq is negative annualy, quarterly and monthly (:issue:`11018`) - Bug in ``DatetimeIndex`` cannot infer negative freq (:issue:`11018`) - diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 1f5855e63dee8..0293fc655742e 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1523,8 +1523,6 @@ def aggregate(self, values, how, axis=0): if is_datetime_or_timedelta_dtype(values.dtype): values = values.view('int64') - values[values == tslib.iNaT] = np.nan - # GH 7754 is_numeric = True elif is_bool_dtype(values.dtype): values = _algos.ensure_float64(values) diff --git a/pandas/src/generate_code.py b/pandas/src/generate_code.py index b055d75df4cf4..8c5c7d709e5f1 100644 --- a/pandas/src/generate_code.py +++ b/pandas/src/generate_code.py @@ -739,7 +739,7 @@ def group_last_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, val = values[i, j] # not nan - if val == val: + if val == val and val != %(nan_val)s: nobs[lab, j] += 1 resx[lab, j] = val @@ -785,7 +785,7 @@ def group_nth_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, val = values[i, j] # not nan - if val == val: + if val == val and val != %(nan_val)s: nobs[lab, j] += 1 if nobs[lab, j] == rank: resx[lab, j] = val @@ -1013,7 +1013,7 @@ def group_max_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, val = values[i, j] # not nan - if val == val: + if val == val and val != %(nan_val)s: nobs[lab, j] += 1 if val > maxx[lab, j]: maxx[lab, j] = val @@ -1027,7 +1027,7 @@ def group_max_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, val = values[i, 0] # not nan - if val == val: + if val == val and val != %(nan_val)s: nobs[lab, 0] += 1 if val > maxx[lab, 0]: maxx[lab, 0] = val @@ -1076,7 +1076,8 @@ def group_min_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, val = values[i, j] # not nan - if val == val: + if val == val and val != %(nan_val)s: + nobs[lab, j] += 1 if val < minx[lab, j]: minx[lab, j] = val @@ -1090,7 +1091,7 @@ def group_min_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, val = values[i, 0] # not nan - if val == val: + if val == val and val != %(nan_val)s: nobs[lab, 0] += 1 if val < minx[lab, 0]: minx[lab, 0] = val diff --git a/pandas/src/generated.pyx b/pandas/src/generated.pyx index 2f2fd528999d6..767e7d6292b6d 100644 --- a/pandas/src/generated.pyx +++ b/pandas/src/generated.pyx @@ -7315,7 +7315,7 @@ def group_last_float64(ndarray[float64_t, ndim=2] out, val = values[i, j] # not nan - if val == val: + if val == val and val != NAN: nobs[lab, j] += 1 resx[lab, j] = val @@ -7360,7 +7360,7 @@ def group_last_float32(ndarray[float32_t, ndim=2] out, val = values[i, j] # not nan - if val == val: + if val == val and val != NAN: nobs[lab, j] += 1 resx[lab, j] = val @@ -7405,7 +7405,7 @@ def group_last_int64(ndarray[int64_t, ndim=2] out, val = values[i, j] # not nan - if val == val: + if val == val and val != iNaT: nobs[lab, j] += 1 resx[lab, j] = val @@ -7451,7 +7451,7 @@ def group_nth_float64(ndarray[float64_t, ndim=2] out, val = values[i, j] # not nan - if val == val: + if val == val and val != NAN: nobs[lab, j] += 1 if nobs[lab, j] == rank: resx[lab, j] = val @@ -7497,7 +7497,7 @@ def group_nth_float32(ndarray[float32_t, ndim=2] out, val = values[i, j] # not nan - if val == val: + if val == val and val != NAN: nobs[lab, j] += 1 if nobs[lab, j] == rank: resx[lab, j] = val @@ -7543,7 +7543,7 @@ def group_nth_int64(ndarray[int64_t, ndim=2] out, val = values[i, j] # not nan - if val == val: + if val == val and val != iNaT: nobs[lab, j] += 1 if nobs[lab, j] == rank: resx[lab, j] = val @@ -7592,7 +7592,8 @@ def group_min_float64(ndarray[float64_t, ndim=2] out, val = values[i, j] # not nan - if val == val: + if val == val and val != NAN: + nobs[lab, j] += 1 if val < minx[lab, j]: minx[lab, j] = val @@ -7606,7 +7607,7 @@ def group_min_float64(ndarray[float64_t, ndim=2] out, val = values[i, 0] # not nan - if val == val: + if val == val and val != NAN: nobs[lab, 0] += 1 if val < minx[lab, 0]: minx[lab, 0] = val @@ -7654,7 +7655,8 @@ def group_min_float32(ndarray[float32_t, ndim=2] out, val = values[i, j] # not nan - if val == val: + if val == val and val != NAN: + nobs[lab, j] += 1 if val < minx[lab, j]: minx[lab, j] = val @@ -7668,7 +7670,7 @@ def group_min_float32(ndarray[float32_t, ndim=2] out, val = values[i, 0] # not nan - if val == val: + if val == val and val != NAN: nobs[lab, 0] += 1 if val < minx[lab, 0]: minx[lab, 0] = val @@ -7716,7 +7718,8 @@ def group_min_int64(ndarray[int64_t, ndim=2] out, val = values[i, j] # not nan - if val == val: + if val == val and val != iNaT: + nobs[lab, j] += 1 if val < minx[lab, j]: minx[lab, j] = val @@ -7730,7 +7733,7 @@ def group_min_int64(ndarray[int64_t, ndim=2] out, val = values[i, 0] # not nan - if val == val: + if val == val and val != iNaT: nobs[lab, 0] += 1 if val < minx[lab, 0]: minx[lab, 0] = val @@ -7779,7 +7782,7 @@ def group_max_float64(ndarray[float64_t, ndim=2] out, val = values[i, j] # not nan - if val == val: + if val == val and val != NAN: nobs[lab, j] += 1 if val > maxx[lab, j]: maxx[lab, j] = val @@ -7793,7 +7796,7 @@ def group_max_float64(ndarray[float64_t, ndim=2] out, val = values[i, 0] # not nan - if val == val: + if val == val and val != NAN: nobs[lab, 0] += 1 if val > maxx[lab, 0]: maxx[lab, 0] = val @@ -7841,7 +7844,7 @@ def group_max_float32(ndarray[float32_t, ndim=2] out, val = values[i, j] # not nan - if val == val: + if val == val and val != NAN: nobs[lab, j] += 1 if val > maxx[lab, j]: maxx[lab, j] = val @@ -7855,7 +7858,7 @@ def group_max_float32(ndarray[float32_t, ndim=2] out, val = values[i, 0] # not nan - if val == val: + if val == val and val != NAN: nobs[lab, 0] += 1 if val > maxx[lab, 0]: maxx[lab, 0] = val @@ -7903,7 +7906,7 @@ def group_max_int64(ndarray[int64_t, ndim=2] out, val = values[i, j] # not nan - if val == val: + if val == val and val != iNaT: nobs[lab, j] += 1 if val > maxx[lab, j]: maxx[lab, j] = val @@ -7917,7 +7920,7 @@ def group_max_int64(ndarray[int64_t, ndim=2] out, val = values[i, 0] # not nan - if val == val: + if val == val and val != iNaT: nobs[lab, 0] += 1 if val > maxx[lab, 0]: maxx[lab, 0] = val