From 279c4d1df2e01f4d4cbd0cfbc8b52ebd3567e9ad Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 4 Dec 2020 08:01:29 -0800 Subject: [PATCH 1/7] REF: groupby op casting without try/except --- pandas/core/groupby/ops.py | 19 ++++++++++++++++++- .../tests/arrays/integer/test_arithmetic.py | 5 ++++- pandas/tests/resample/test_datetime_index.py | 4 +++- 3 files changed, 25 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index d98c55755042e..9aaa777c454b0 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -45,6 +45,7 @@ is_datetime64_any_dtype, is_datetime64tz_dtype, is_extension_array_dtype, + is_float_dtype, is_integer_dtype, is_numeric_dtype, is_period_dtype, @@ -507,7 +508,23 @@ def _ea_wrap_cython_operation( res_values = self._cython_operation( kind, values, how, axis, min_count, **kwargs ) - result = maybe_cast_result(result=res_values, obj=orig_values, how=how) + if how in ["mean", "median", "var"]: + # preserve float64 dtype + return res_values + + dtype = maybe_cast_result_dtype(orig_values.dtype, how) + if is_extension_array_dtype(dtype): + cls = dtype.construct_array_type() + return cls._from_sequence(res_values) + return res_values + + elif is_float_dtype(values.dtype): + # FloatingArray + values = values.to_numpy(na_value=np.nan) + res_values = self._cython_operation( + kind, values, how, axis, min_count, **kwargs + ) + result = type(orig_values)._from_sequence(res_values) return result raise NotImplementedError(values.dtype) diff --git a/pandas/tests/arrays/integer/test_arithmetic.py b/pandas/tests/arrays/integer/test_arithmetic.py index 4b8d95ae83e4f..1677f0864063b 100644 --- a/pandas/tests/arrays/integer/test_arithmetic.py +++ b/pandas/tests/arrays/integer/test_arithmetic.py @@ -277,7 +277,10 @@ def test_reduce_to_float(op): result = getattr(df.groupby("A"), op)() expected = pd.DataFrame( - {"B": np.array([1.0, 3.0]), "C": integer_array([1, 3], dtype="Int64")}, + { + "B": np.array([1.0, 3.0]), + "C": integer_array([1, 3], dtype="Int64").astype(np.float64), + }, index=pd.Index(["a", "b"], name="A"), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 3e41dab39e71d..c10b3f9ffd4d2 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -124,7 +124,9 @@ def test_resample_integerarray(): result = ts.resample("3T").mean() expected = Series( - [1, 4, 7], index=pd.date_range("1/1/2000", periods=3, freq="3T"), dtype="Int64" + [1, 4, 7], + index=pd.date_range("1/1/2000", periods=3, freq="3T"), + dtype=np.float64, ) tm.assert_series_equal(result, expected) From e6dc52952342f9066c332e5b14181689c6a5dbaf Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 4 Dec 2020 09:08:02 -0800 Subject: [PATCH 2/7] Float64 --- pandas/core/dtypes/cast.py | 3 +++ pandas/core/groupby/ops.py | 4 ---- pandas/tests/arrays/integer/test_arithmetic.py | 2 +- pandas/tests/groupby/test_function.py | 2 +- pandas/tests/resample/test_datetime_index.py | 2 +- 5 files changed, 6 insertions(+), 7 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 12974d56dacdc..d314221f1b763 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -357,12 +357,15 @@ def maybe_cast_result_dtype(dtype: DtypeObj, how: str) -> DtypeObj: The desired dtype of the result. """ from pandas.core.arrays.boolean import BooleanDtype + from pandas.core.arrays.floating import Float64Dtype from pandas.core.arrays.integer import Int64Dtype if how in ["add", "cumsum", "sum"] and (dtype == np.dtype(bool)): return np.dtype(np.int64) elif how in ["add", "cumsum", "sum"] and isinstance(dtype, BooleanDtype): return Int64Dtype() + elif how in ["mean", "median", "var"] and isinstance(dtype, Int64Dtype): + return Float64Dtype() return dtype diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 9aaa777c454b0..85d9c7d7f8aea 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -508,10 +508,6 @@ def _ea_wrap_cython_operation( res_values = self._cython_operation( kind, values, how, axis, min_count, **kwargs ) - if how in ["mean", "median", "var"]: - # preserve float64 dtype - return res_values - dtype = maybe_cast_result_dtype(orig_values.dtype, how) if is_extension_array_dtype(dtype): cls = dtype.construct_array_type() diff --git a/pandas/tests/arrays/integer/test_arithmetic.py b/pandas/tests/arrays/integer/test_arithmetic.py index 1677f0864063b..617cb6407d857 100644 --- a/pandas/tests/arrays/integer/test_arithmetic.py +++ b/pandas/tests/arrays/integer/test_arithmetic.py @@ -279,7 +279,7 @@ def test_reduce_to_float(op): expected = pd.DataFrame( { "B": np.array([1.0, 3.0]), - "C": integer_array([1, 3], dtype="Int64").astype(np.float64), + "C": pd.array([1, 3], dtype="Float64"), }, index=pd.Index(["a", "b"], name="A"), ) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 60aae003d2956..f3860b78e2994 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1072,7 +1072,7 @@ def test_apply_to_nullable_integer_returns_float(values, function): output = 0.5 if function == "var" else 1.5 arr = np.array([output] * 3, dtype=float) idx = Index([1, 2, 3], dtype=object, name="a") - expected = DataFrame({"b": arr}, index=idx) + expected = DataFrame({"b": arr}, index=idx).astype("Float64") groups = DataFrame(values, dtype="Int64").groupby("a") diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index c10b3f9ffd4d2..8bf40c924ec86 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -126,7 +126,7 @@ def test_resample_integerarray(): expected = Series( [1, 4, 7], index=pd.date_range("1/1/2000", periods=3, freq="3T"), - dtype=np.float64, + dtype="Float64", ) tm.assert_series_equal(result, expected) From ea79027c808bc162f76020654dcdf4ecd3d254bb Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 5 Dec 2020 17:16:08 +0100 Subject: [PATCH 3/7] add tests for expected dtype of cython agg ops with nullable dtypes --- pandas/tests/groupby/aggregate/test_cython.py | 68 +++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index c907391917ca8..c97e7477cf643 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -5,6 +5,8 @@ import numpy as np import pytest +from pandas.core.dtypes.common import is_float_dtype + import pandas as pd from pandas import DataFrame, Index, NaT, Series, Timedelta, Timestamp, bdate_range import pandas._testing as tm @@ -312,3 +314,69 @@ def test_cython_agg_nullable_int(op_name): # so for now just checking the values by casting to float result = result.astype("float64") tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("with_na", [True, False]) +@pytest.mark.parametrize( + "op_name, action", + [ + # ("count", "always_int"), + ("sum", "large_int"), + # ("std", "always_float"), + ("var", "always_float"), + # ("sem", "always_float"), + ("mean", "always_float"), + ("median", "always_float"), + ("prod", "large_int"), + ("min", "preserve"), + ("max", "preserve"), + ("first", "preserve"), + ("last", "preserve"), + ], +) +@pytest.mark.parametrize( + "data", + [ + pd.array([1, 2, 3, 4], dtype="Int64"), + pd.array([1, 2, 3, 4], dtype="Int8"), + pd.array([0.1, 0.2, 0.3, 0.4], dtype="Float32"), + pd.array([0.1, 0.2, 0.3, 0.4], dtype="Float64"), + pd.array([True, True, False, False], dtype="boolean"), + ], +) +def test_cython_agg_EA_known_dtypes(data, op_name, action, with_na): + if with_na: + data[3] = pd.NA + + df = DataFrame({"key": ["a", "a", "b", "b"], "col": data}) + grouped = df.groupby("key") + + if action == "always_int": + # always Int64 + expected_dtype = pd.Int64Dtype() + elif action == "large_int": + # for any int/bool use Int64, for float preserve dtype + if is_float_dtype(data.dtype): + expected_dtype = data.dtype + else: + expected_dtype = pd.Int64Dtype() + elif action == "always_float": + # for any int/bool use Float64, for float preserve dtype + if is_float_dtype(data.dtype): + expected_dtype = data.dtype + else: + expected_dtype = pd.Float64Dtype() + elif action == "preserve": + expected_dtype = data.dtype + + result = getattr(grouped, op_name)() + result["col"].dtype == expected_dtype + + result = grouped.aggregate(op_name) + result["col"].dtype == expected_dtype + + result = getattr(grouped["col"], op_name)() + result.dtype == expected_dtype + + result = grouped["col"].aggregate(op_name) + result.dtype == expected_dtype From 97fcd22657cac1f665f30e03ad902cd14f5f9656 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 5 Dec 2020 17:29:16 +0100 Subject: [PATCH 4/7] fix casting to float numpy array for FloatingArray --- pandas/core/groupby/ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 85d9c7d7f8aea..8020ac862ec50 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -516,7 +516,7 @@ def _ea_wrap_cython_operation( elif is_float_dtype(values.dtype): # FloatingArray - values = values.to_numpy(na_value=np.nan) + values = values.to_numpy(values.dtype.numpy_dtype, na_value=np.nan) res_values = self._cython_operation( kind, values, how, axis, min_count, **kwargs ) From b04d91fb73fdd3f8fc90393f662ceb95a7b9b9fe Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 5 Dec 2020 17:47:04 +0100 Subject: [PATCH 5/7] fix tests --- pandas/tests/groupby/aggregate/test_cython.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index c97e7477cf643..8799f6faa775c 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -370,13 +370,13 @@ def test_cython_agg_EA_known_dtypes(data, op_name, action, with_na): expected_dtype = data.dtype result = getattr(grouped, op_name)() - result["col"].dtype == expected_dtype + assert result["col"].dtype == expected_dtype result = grouped.aggregate(op_name) - result["col"].dtype == expected_dtype + assert result["col"].dtype == expected_dtype result = getattr(grouped["col"], op_name)() - result.dtype == expected_dtype + assert result.dtype == expected_dtype result = grouped["col"].aggregate(op_name) - result.dtype == expected_dtype + assert result.dtype == expected_dtype From 202bee801df3654a6333092ca2c674d4f4b233a7 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 5 Dec 2020 17:52:24 +0100 Subject: [PATCH 6/7] update rules of known result dtypes --- pandas/core/dtypes/cast.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index d314221f1b763..1437c7e59c54b 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -358,13 +358,16 @@ def maybe_cast_result_dtype(dtype: DtypeObj, how: str) -> DtypeObj: """ from pandas.core.arrays.boolean import BooleanDtype from pandas.core.arrays.floating import Float64Dtype - from pandas.core.arrays.integer import Int64Dtype - - if how in ["add", "cumsum", "sum"] and (dtype == np.dtype(bool)): - return np.dtype(np.int64) - elif how in ["add", "cumsum", "sum"] and isinstance(dtype, BooleanDtype): - return Int64Dtype() - elif how in ["mean", "median", "var"] and isinstance(dtype, Int64Dtype): + from pandas.core.arrays.integer import Int64Dtype, _IntegerDtype + + if how in ["add", "cumsum", "sum", "prod"]: + if dtype == np.dtype(bool): + return np.dtype(np.int64) + elif isinstance(dtype, (BooleanDtype, _IntegerDtype)): + return Int64Dtype() + elif how in ["mean", "median", "var"] and isinstance( + dtype, (BooleanDtype, _IntegerDtype) + ): return Float64Dtype() return dtype From 2566ec4dc07050effe490765aff247e3d4c68216 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 7 Dec 2020 18:57:10 -0800 Subject: [PATCH 7/7] retain dtype --- pandas/core/groupby/ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index cbff65f6a55c2..7724e3930f7df 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -525,7 +525,7 @@ def _ea_wrap_cython_operation( dtype = maybe_cast_result_dtype(orig_values.dtype, how) if is_extension_array_dtype(dtype): cls = dtype.construct_array_type() - return cls._from_sequence(res_values) + return cls._from_sequence(res_values, dtype=dtype) return res_values elif is_float_dtype(values.dtype):