From 12004a0dd121b3650a78266c545e8150355f3329 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 13 Feb 2023 19:03:35 -0800 Subject: [PATCH 1/3] BUG: GroupBy.quantile with datetimelike and NaT --- pandas/core/groupby/groupby.py | 23 ++++++++++++++++------- pandas/tests/groupby/test_quantile.py | 21 +++++++++++++++++++++ 2 files changed, 37 insertions(+), 7 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 763494666d870..65f05bf185478 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -73,7 +73,6 @@ class providing the base-class of operations. from pandas.core.dtypes.cast import ensure_dtype_can_hold_na from pandas.core.dtypes.common import ( is_bool_dtype, - is_datetime64_dtype, is_float_dtype, is_hashable, is_integer, @@ -81,7 +80,7 @@ class providing the base-class of operations. is_numeric_dtype, is_object_dtype, is_scalar, - is_timedelta64_dtype, + needs_i8_conversion, ) from pandas.core.dtypes.missing import ( isna, @@ -3192,12 +3191,11 @@ def pre_processor(vals: ArrayLike) -> tuple[np.ndarray, DtypeObj | None]: inference = np.dtype(np.int64) elif is_bool_dtype(vals.dtype) and isinstance(vals, ExtensionArray): out = vals.to_numpy(dtype=float, na_value=np.nan) - elif is_datetime64_dtype(vals.dtype): + elif needs_i8_conversion(vals.dtype): inference = vals.dtype - out = np.asarray(vals).astype(float) - elif is_timedelta64_dtype(vals.dtype): - inference = vals.dtype - out = np.asarray(vals).astype(float) + # In this case we need to delay the casting until after the + # np.lexsort below. 
+ return vals, inference elif isinstance(vals, ExtensionArray) and is_float_dtype(vals): inference = np.dtype(np.float64) out = vals.to_numpy(dtype=float, na_value=np.nan) @@ -3236,6 +3234,10 @@ def post_processor( is_integer_dtype(inference) and interpolation in {"linear", "midpoint"} ): + if needs_i8_conversion(inference): + vals = vals.astype("i8").view(orig_vals._ndarray.dtype) + return orig_vals._from_backing_data(vals) + assert isinstance(inference, np.dtype) # for mypy return vals.astype(inference) @@ -3272,6 +3274,8 @@ def blk_func(values: ArrayLike) -> ArrayLike: mask = isna(values) result_mask = None + is_datetimelike = needs_i8_conversion(values.dtype) + vals, inference = pre_processor(values) ncols = 1 @@ -3289,6 +3293,11 @@ def blk_func(values: ArrayLike) -> ArrayLike: order = (vals, shaped_labels) sort_arr = np.lexsort(order).astype(np.intp, copy=False) + if is_datetimelike: + # This casting needs to happen after the lexsort in order + # to ensure that NaTs are placed at the end and not the front + vals = vals.view("i8").astype(np.float64) + if vals.ndim == 1: # Ea is always 1d func( diff --git a/pandas/tests/groupby/test_quantile.py b/pandas/tests/groupby/test_quantile.py index 8cba3a8afdfae..949acf0c4b6af 100644 --- a/pandas/tests/groupby/test_quantile.py +++ b/pandas/tests/groupby/test_quantile.py @@ -445,3 +445,24 @@ def test_timestamp_groupby_quantile(): ) tm.assert_frame_equal(result, expected) + + +def test_groupby_quantile_dt64tz_period(): + dti = pd.date_range("2016-01-01", periods=1000) + ser = pd.Series(dti) + df = ser.to_frame() + df[1] = dti.tz_localize("US/Pacific") + df[2] = dti.to_period("D") + df[3] = dti - dti[0] + df.iloc[-1] = pd.NaT + + by = np.tile(np.arange(5), 200) + gb = df.groupby(by) + + result = gb.quantile(0.5) + + # Check that we match the group-by-group result + exp = {i: df.iloc[i::5].quantile(0.5) for i in range(5)} + expected = DataFrame(exp).T + + tm.assert_frame_equal(result, expected) From 
8ec3d1cd2dd27ac9b21f3cb313c66c2032593d36 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 13 Feb 2023 19:05:54 -0800 Subject: [PATCH 2/3] GH refs --- doc/source/whatsnew/v2.0.0.rst | 2 ++ pandas/tests/groupby/test_quantile.py | 1 + 2 files changed, 3 insertions(+) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index d1b965e64e43b..bd6e2608f97ae 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -1186,6 +1186,8 @@ Datetimelike - Bug in :func:`DataFrame.from_records` when given a :class:`DataFrame` input with timezone-aware datetime64 columns incorrectly dropping the timezone-awareness (:issue:`51162`) - Bug in :func:`to_datetime` was raising ``decimal.InvalidOperation`` when parsing date strings with ``errors='coerce'`` (:issue:`51084`) - Bug in :func:`to_datetime` with both ``unit`` and ``origin`` specified returning incorrect results (:issue:`42624`) +- Bug in :meth:`GroupBy.quantile` with datetime or timedelta dtypes giving incorrect results for groups containing ``NaT`` (:issue:`51373`) +- Bug in :meth:`GroupBy.quantile` incorrectly raising with :class:`PeriodDtype` or :class:`DatetimeTZDtype` (:issue:`51373`) - Timedelta diff --git a/pandas/tests/groupby/test_quantile.py b/pandas/tests/groupby/test_quantile.py index 949acf0c4b6af..4c5011f8c683d 100644 --- a/pandas/tests/groupby/test_quantile.py +++ b/pandas/tests/groupby/test_quantile.py @@ -448,6 +448,7 @@ def test_timestamp_groupby_quantile(): def test_groupby_quantile_dt64tz_period(): + # GH#51373 dti = pd.date_range("2016-01-01", periods=1000) ser = pd.Series(dti) df = ser.to_frame() From a6ff14464efa8b142805772f618f0968d7ef8228 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 13 Feb 2023 20:34:08 -0800 Subject: [PATCH 3/3] mypy, 32bit fixups --- pandas/core/groupby/groupby.py | 18 +++++++++++++++--- pandas/tests/groupby/test_quantile.py | 1 + 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/groupby.py 
b/pandas/core/groupby/groupby.py index 65f05bf185478..b25c767db42ff 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -3195,7 +3195,11 @@ def pre_processor(vals: ArrayLike) -> tuple[np.ndarray, DtypeObj | None]: inference = vals.dtype # In this case we need to delay the casting until after the # np.lexsort below. - return vals, inference + # error: Incompatible return value type (got + # "Tuple[Union[ExtensionArray, ndarray[Any, Any]], Union[Any, + # ExtensionDtype]]", expected "Tuple[ndarray[Any, Any], + # Optional[Union[dtype[Any], ExtensionDtype]]]") + return vals, inference # type: ignore[return-value] elif isinstance(vals, ExtensionArray) and is_float_dtype(vals): inference = np.dtype(np.float64) out = vals.to_numpy(dtype=float, na_value=np.nan) @@ -3235,8 +3239,16 @@ def post_processor( and interpolation in {"linear", "midpoint"} ): if needs_i8_conversion(inference): - vals = vals.astype("i8").view(orig_vals._ndarray.dtype) - return orig_vals._from_backing_data(vals) + # error: Item "ExtensionArray" of "Union[ExtensionArray, + # ndarray[Any, Any]]" has no attribute "_ndarray" + vals = vals.astype("i8").view( + orig_vals._ndarray.dtype # type: ignore[union-attr] + ) + # error: Item "ExtensionArray" of "Union[ExtensionArray, + # ndarray[Any, Any]]" has no attribute "_from_backing_data" + return orig_vals._from_backing_data( # type: ignore[union-attr] + vals + ) assert isinstance(inference, np.dtype) # for mypy return vals.astype(inference) diff --git a/pandas/tests/groupby/test_quantile.py b/pandas/tests/groupby/test_quantile.py index 4c5011f8c683d..79354e550d3f6 100644 --- a/pandas/tests/groupby/test_quantile.py +++ b/pandas/tests/groupby/test_quantile.py @@ -465,5 +465,6 @@ def test_groupby_quantile_dt64tz_period(): # Check that we match the group-by-group result exp = {i: df.iloc[i::5].quantile(0.5) for i in range(5)} expected = DataFrame(exp).T + expected.index = expected.index.astype(np.int_) tm.assert_frame_equal(result, 
expected)