From 279c4d1df2e01f4d4cbd0cfbc8b52ebd3567e9ad Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Fri, 4 Dec 2020 08:01:29 -0800
Subject: [PATCH 1/7] REF: groupby op casting without try/except

---
 pandas/core/groupby/ops.py                    | 19 ++++++++++++++++++-
 .../tests/arrays/integer/test_arithmetic.py   |  5 ++++-
 pandas/tests/resample/test_datetime_index.py  |  4 +++-
 3 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index d98c55755042e..9aaa777c454b0 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -45,6 +45,7 @@
     is_datetime64_any_dtype,
     is_datetime64tz_dtype,
     is_extension_array_dtype,
+    is_float_dtype,
     is_integer_dtype,
     is_numeric_dtype,
     is_period_dtype,
@@ -507,7 +508,23 @@ def _ea_wrap_cython_operation(
             res_values = self._cython_operation(
                 kind, values, how, axis, min_count, **kwargs
             )
-            result = maybe_cast_result(result=res_values, obj=orig_values, how=how)
+            if how in ["mean", "median", "var"]:
+                # preserve float64 dtype
+                return res_values
+
+            dtype = maybe_cast_result_dtype(orig_values.dtype, how)
+            if is_extension_array_dtype(dtype):
+                cls = dtype.construct_array_type()
+                return cls._from_sequence(res_values)
+            return res_values
+
+        elif is_float_dtype(values.dtype):
+            # FloatingArray
+            values = values.to_numpy(na_value=np.nan)
+            res_values = self._cython_operation(
+                kind, values, how, axis, min_count, **kwargs
+            )
+            result = type(orig_values)._from_sequence(res_values)
             return result
 
         raise NotImplementedError(values.dtype)
diff --git a/pandas/tests/arrays/integer/test_arithmetic.py b/pandas/tests/arrays/integer/test_arithmetic.py
index 4b8d95ae83e4f..1677f0864063b 100644
--- a/pandas/tests/arrays/integer/test_arithmetic.py
+++ b/pandas/tests/arrays/integer/test_arithmetic.py
@@ -277,7 +277,10 @@ def test_reduce_to_float(op):
     result = getattr(df.groupby("A"), op)()
 
     expected = pd.DataFrame(
-        {"B": np.array([1.0, 3.0]), "C": integer_array([1, 3], dtype="Int64")},
+        {
+            "B": np.array([1.0, 3.0]),
+            "C": integer_array([1, 3], dtype="Int64").astype(np.float64),
+        },
         index=pd.Index(["a", "b"], name="A"),
     )
     tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py
index 3e41dab39e71d..c10b3f9ffd4d2 100644
--- a/pandas/tests/resample/test_datetime_index.py
+++ b/pandas/tests/resample/test_datetime_index.py
@@ -124,7 +124,9 @@ def test_resample_integerarray():
 
     result = ts.resample("3T").mean()
     expected = Series(
-        [1, 4, 7], index=pd.date_range("1/1/2000", periods=3, freq="3T"), dtype="Int64"
+        [1, 4, 7],
+        index=pd.date_range("1/1/2000", periods=3, freq="3T"),
+        dtype=np.float64,
     )
     tm.assert_series_equal(result, expected)
 

From e6dc52952342f9066c332e5b14181689c6a5dbaf Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Fri, 4 Dec 2020 09:08:02 -0800
Subject: [PATCH 2/7] Return Float64 for mean/median/var of Int64 groupby

---
 pandas/core/dtypes/cast.py                     | 3 +++
 pandas/core/groupby/ops.py                     | 4 ----
 pandas/tests/arrays/integer/test_arithmetic.py | 2 +-
 pandas/tests/groupby/test_function.py          | 2 +-
 pandas/tests/resample/test_datetime_index.py   | 2 +-
 5 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
index 12974d56dacdc..d314221f1b763 100644
--- a/pandas/core/dtypes/cast.py
+++ b/pandas/core/dtypes/cast.py
@@ -357,12 +357,15 @@ def maybe_cast_result_dtype(dtype: DtypeObj, how: str) -> DtypeObj:
         The desired dtype of the result.
     """
     from pandas.core.arrays.boolean import BooleanDtype
+    from pandas.core.arrays.floating import Float64Dtype
     from pandas.core.arrays.integer import Int64Dtype
 
     if how in ["add", "cumsum", "sum"] and (dtype == np.dtype(bool)):
         return np.dtype(np.int64)
     elif how in ["add", "cumsum", "sum"] and isinstance(dtype, BooleanDtype):
         return Int64Dtype()
+    elif how in ["mean", "median", "var"] and isinstance(dtype, Int64Dtype):
+        return Float64Dtype()
     return dtype
 
 
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 9aaa777c454b0..85d9c7d7f8aea 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -508,10 +508,6 @@ def _ea_wrap_cython_operation(
             res_values = self._cython_operation(
                 kind, values, how, axis, min_count, **kwargs
             )
-            if how in ["mean", "median", "var"]:
-                # preserve float64 dtype
-                return res_values
-
             dtype = maybe_cast_result_dtype(orig_values.dtype, how)
             if is_extension_array_dtype(dtype):
                 cls = dtype.construct_array_type()
diff --git a/pandas/tests/arrays/integer/test_arithmetic.py b/pandas/tests/arrays/integer/test_arithmetic.py
index 1677f0864063b..617cb6407d857 100644
--- a/pandas/tests/arrays/integer/test_arithmetic.py
+++ b/pandas/tests/arrays/integer/test_arithmetic.py
@@ -279,7 +279,7 @@ def test_reduce_to_float(op):
     expected = pd.DataFrame(
         {
             "B": np.array([1.0, 3.0]),
-            "C": integer_array([1, 3], dtype="Int64").astype(np.float64),
+            "C": pd.array([1, 3], dtype="Float64"),
         },
         index=pd.Index(["a", "b"], name="A"),
     )
diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py
index 60aae003d2956..f3860b78e2994 100644
--- a/pandas/tests/groupby/test_function.py
+++ b/pandas/tests/groupby/test_function.py
@@ -1072,7 +1072,7 @@ def test_apply_to_nullable_integer_returns_float(values, function):
     output = 0.5 if function == "var" else 1.5
     arr = np.array([output] * 3, dtype=float)
     idx = Index([1, 2, 3], dtype=object, name="a")
-    expected = DataFrame({"b": arr}, index=idx)
+    expected = DataFrame({"b": arr}, index=idx).astype("Float64")
 
     groups = DataFrame(values, dtype="Int64").groupby("a")
 
diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py
index c10b3f9ffd4d2..8bf40c924ec86 100644
--- a/pandas/tests/resample/test_datetime_index.py
+++ b/pandas/tests/resample/test_datetime_index.py
@@ -126,7 +126,7 @@ def test_resample_integerarray():
     expected = Series(
         [1, 4, 7],
         index=pd.date_range("1/1/2000", periods=3, freq="3T"),
-        dtype=np.float64,
+        dtype="Float64",
     )
     tm.assert_series_equal(result, expected)
 

From ea79027c808bc162f76020654dcdf4ecd3d254bb Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Sat, 5 Dec 2020 17:16:08 +0100
Subject: [PATCH 3/7] add tests for expected dtype of cython agg ops with
 nullable dtypes

---
 pandas/tests/groupby/aggregate/test_cython.py | 68 +++++++++++++++++++
 1 file changed, 68 insertions(+)

diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py
index c907391917ca8..c97e7477cf643 100644
--- a/pandas/tests/groupby/aggregate/test_cython.py
+++ b/pandas/tests/groupby/aggregate/test_cython.py
@@ -5,6 +5,8 @@
 import numpy as np
 import pytest
 
+from pandas.core.dtypes.common import is_float_dtype
+
 import pandas as pd
 from pandas import DataFrame, Index, NaT, Series, Timedelta, Timestamp, bdate_range
 import pandas._testing as tm
@@ -312,3 +314,69 @@ def test_cython_agg_nullable_int(op_name):
         # so for now just checking the values by casting to float
         result = result.astype("float64")
     tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("with_na", [True, False])
+@pytest.mark.parametrize(
+    "op_name, action",
+    [
+        # ("count", "always_int"),
+        ("sum", "large_int"),
+        # ("std", "always_float"),
+        ("var", "always_float"),
+        # ("sem", "always_float"),
+        ("mean", "always_float"),
+        ("median", "always_float"),
+        ("prod", "large_int"),
+        ("min", "preserve"),
+        ("max", "preserve"),
+        ("first", "preserve"),
+        ("last", "preserve"),
+    ],
+)
+@pytest.mark.parametrize(
+    "data",
+    [
+        pd.array([1, 2, 3, 4], dtype="Int64"),
+        pd.array([1, 2, 3, 4], dtype="Int8"),
+        pd.array([0.1, 0.2, 0.3, 0.4], dtype="Float32"),
+        pd.array([0.1, 0.2, 0.3, 0.4], dtype="Float64"),
+        pd.array([True, True, False, False], dtype="boolean"),
+    ],
+)
+def test_cython_agg_EA_known_dtypes(data, op_name, action, with_na):
+    if with_na:
+        data[3] = pd.NA
+
+    df = DataFrame({"key": ["a", "a", "b", "b"], "col": data})
+    grouped = df.groupby("key")
+
+    if action == "always_int":
+        # always Int64
+        expected_dtype = pd.Int64Dtype()
+    elif action == "large_int":
+        # for any int/bool use Int64, for float preserve dtype
+        if is_float_dtype(data.dtype):
+            expected_dtype = data.dtype
+        else:
+            expected_dtype = pd.Int64Dtype()
+    elif action == "always_float":
+        # for any int/bool use Float64, for float preserve dtype
+        if is_float_dtype(data.dtype):
+            expected_dtype = data.dtype
+        else:
+            expected_dtype = pd.Float64Dtype()
+    elif action == "preserve":
+        expected_dtype = data.dtype
+
+    result = getattr(grouped, op_name)()
+    result["col"].dtype == expected_dtype
+
+    result = grouped.aggregate(op_name)
+    result["col"].dtype == expected_dtype
+
+    result = getattr(grouped["col"], op_name)()
+    result.dtype == expected_dtype
+
+    result = grouped["col"].aggregate(op_name)
+    result.dtype == expected_dtype

From 97fcd22657cac1f665f30e03ad902cd14f5f9656 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Sat, 5 Dec 2020 17:29:16 +0100
Subject: [PATCH 4/7] fix casting to float numpy array for FloatingArray

---
 pandas/core/groupby/ops.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 85d9c7d7f8aea..8020ac862ec50 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -516,7 +516,7 @@ def _ea_wrap_cython_operation(
 
         elif is_float_dtype(values.dtype):
             # FloatingArray
-            values = values.to_numpy(na_value=np.nan)
+            values = values.to_numpy(values.dtype.numpy_dtype, na_value=np.nan)
             res_values = self._cython_operation(
                 kind, values, how, axis, min_count, **kwargs
             )

From b04d91fb73fdd3f8fc90393f662ceb95a7b9b9fe Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Sat, 5 Dec 2020 17:47:04 +0100
Subject: [PATCH 5/7] TST: add missing asserts to EA dtype checks

---
 pandas/tests/groupby/aggregate/test_cython.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py
index c97e7477cf643..8799f6faa775c 100644
--- a/pandas/tests/groupby/aggregate/test_cython.py
+++ b/pandas/tests/groupby/aggregate/test_cython.py
@@ -370,13 +370,13 @@ def test_cython_agg_EA_known_dtypes(data, op_name, action, with_na):
         expected_dtype = data.dtype
 
     result = getattr(grouped, op_name)()
-    result["col"].dtype == expected_dtype
+    assert result["col"].dtype == expected_dtype
 
     result = grouped.aggregate(op_name)
-    result["col"].dtype == expected_dtype
+    assert result["col"].dtype == expected_dtype
 
     result = getattr(grouped["col"], op_name)()
-    result.dtype == expected_dtype
+    assert result.dtype == expected_dtype
 
     result = grouped["col"].aggregate(op_name)
-    result.dtype == expected_dtype
+    assert result.dtype == expected_dtype

From 202bee801df3654a6333092ca2c674d4f4b233a7 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Sat, 5 Dec 2020 17:52:24 +0100
Subject: [PATCH 6/7] update rules of known result dtypes

---
 pandas/core/dtypes/cast.py | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
index d314221f1b763..1437c7e59c54b 100644
--- a/pandas/core/dtypes/cast.py
+++ b/pandas/core/dtypes/cast.py
@@ -358,13 +358,16 @@ def maybe_cast_result_dtype(dtype: DtypeObj, how: str) -> DtypeObj:
     """
     from pandas.core.arrays.boolean import BooleanDtype
     from pandas.core.arrays.floating import Float64Dtype
-    from pandas.core.arrays.integer import Int64Dtype
-
-    if how in ["add", "cumsum", "sum"] and (dtype == np.dtype(bool)):
-        return np.dtype(np.int64)
-    elif how in ["add", "cumsum", "sum"] and isinstance(dtype, BooleanDtype):
-        return Int64Dtype()
-    elif how in ["mean", "median", "var"] and isinstance(dtype, Int64Dtype):
+    from pandas.core.arrays.integer import Int64Dtype, _IntegerDtype
+
+    if how in ["add", "cumsum", "sum", "prod"]:
+        if dtype == np.dtype(bool):
+            return np.dtype(np.int64)
+        elif isinstance(dtype, (BooleanDtype, _IntegerDtype)):
+            return Int64Dtype()
+    elif how in ["mean", "median", "var"] and isinstance(
+        dtype, (BooleanDtype, _IntegerDtype)
+    ):
         return Float64Dtype()
     return dtype
 

From 2566ec4dc07050effe490765aff247e3d4c68216 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Mon, 7 Dec 2020 18:57:10 -0800
Subject: [PATCH 7/7] Pass dtype to _from_sequence to retain result dtype

---
 pandas/core/groupby/ops.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index cbff65f6a55c2..7724e3930f7df 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -525,7 +525,7 @@ def _ea_wrap_cython_operation(
             dtype = maybe_cast_result_dtype(orig_values.dtype, how)
             if is_extension_array_dtype(dtype):
                 cls = dtype.construct_array_type()
-                return cls._from_sequence(res_values)
+                return cls._from_sequence(res_values, dtype=dtype)
             return res_values
 
         elif is_float_dtype(values.dtype):