From ebf73ee134486c8a076af3ba81ef2ec0e281ca2c Mon Sep 17 00:00:00 2001
From: Richard Shadrach <rhshadrach@gmail.com>
Date: Mon, 27 Feb 2023 17:37:55 -0500
Subject: [PATCH] REGR: Performance regression in axis=1 DataFrame ops

---
 pandas/core/frame.py                  | 115 ++++++++++++++++-------
 pandas/tests/frame/test_reductions.py | 126 +++++++++++++++++++++++++-
 2 files changed, 205 insertions(+), 36 deletions(-)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index e6af875ab1c23..c711da9fc7066 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -141,12 +141,16 @@
     is_integer_dtype,
     is_iterator,
     is_list_like,
+    is_numeric_dtype,
     is_scalar,
     is_sequence,
     needs_i8_conversion,
     pandas_dtype,
 )
-from pandas.core.dtypes.dtypes import ExtensionDtype
+from pandas.core.dtypes.dtypes import (
+    BaseMaskedDtype,
+    ExtensionDtype,
+)
 from pandas.core.dtypes.missing import (
     isna,
     notna,
@@ -10450,9 +10454,6 @@ def _reduce(
         assert filter_type is None or filter_type == "bool", filter_type
         out_dtype = "bool" if filter_type == "bool" else None
 
-        if axis is not None:
-            axis = self._get_axis_number(axis)
-
         def func(values: np.ndarray):
             # We only use this in the case that operates on self.values
             return op(values, axis=axis, skipna=skipna, **kwds)
@@ -10482,38 +10483,82 @@ def _get_data() -> DataFrame:
             df = _get_data()
         if axis is None:
             return func(df.values)
-        elif axis == 1:
-            if len(df.index) == 0:
-                # Taking a transpose would result in no columns, losing the dtype.
-                # In the empty case, reducing along axis 0 or 1 gives the same
-                # result dtype, so reduce with axis=0 and ignore values
-                result = df._reduce(
-                    op,
-                    name,
-                    axis=0,
-                    skipna=skipna,
-                    numeric_only=False,
-                    filter_type=filter_type,
-                    **kwds,
-                ).iloc[:0]
-                result.index = df.index
-                return result
-            df = df.T
-
-        # After possibly _get_data and transposing, we are now in the
-        #  simple case where we can use BlockManager.reduce
-        res = df._mgr.reduce(blk_func)
-        out = df._constructor(res).iloc[0]
-        if out_dtype is not None:
-            out = out.astype(out_dtype)
-        elif (df._mgr.get_dtypes() == object).any():
-            out = out.astype(object)
-        elif len(self) == 0 and name in ("sum", "prod"):
-            # Even if we are object dtype, follow numpy and return
-            #  float64, see test_apply_funcs_over_empty
-            out = out.astype(np.float64)
 
-        return out
+        axis = self._get_axis_number(axis)
+        assert axis in [0, 1]
+
+        if len(df._mgr) > 0:
+            common_dtype = find_common_type(list(df._mgr.get_dtypes()))
+            is_masked_ea = isinstance(common_dtype, BaseMaskedDtype)
+            is_np = isinstance(common_dtype, np.dtype)
+        else:
+            common_dtype = None
+
+        if axis == 0 or common_dtype is None or not (is_masked_ea or is_np):
+            if axis == 1:
+                if len(df.index) == 0:
+                    # Taking a transpose would result in no columns, losing the dtype.
+                    # In the empty case, reducing along axis 0 or 1 gives the same
+                    # result dtype, so reduce with axis=0 and ignore values
+                    result = df._reduce(
+                        op,
+                        name,
+                        axis=0,
+                        skipna=skipna,
+                        numeric_only=False,
+                        filter_type=filter_type,
+                        **kwds,
+                    ).iloc[:0]
+                    result.index = df.index
+                    return result
+                df = df.T
+
+            # After possibly _get_data and transposing, we are now in the
+            #  simple case where we can use BlockManager.reduce
+            res = df._mgr.reduce(blk_func)
+            out = df._constructor(res).iloc[0]
+            if out_dtype is not None:
+                out = out.astype(out_dtype)
+            elif (df._mgr.get_dtypes() == object).any():
+                out = out.astype(object)
+            elif len(self) == 0 and name in ["sum", "prod"]:
+                # Even if we are object dtype, follow numpy and return
+                #  float64, see test_apply_funcs_over_empty
+                out = out.astype(np.float64)
+
+            return out
+
+        if is_np or not is_numeric_dtype(common_dtype):
+            values = df.values
+        else:
+            # TODO: Better way to extract frame values as float64?
+            values = df.fillna(np.nan).astype("float64").values
+        result = func(values)
+
+        result_dtype = None
+        if filter_type == "bool" and notna(result).all():
+            result = result.astype(np.bool_)
+        elif is_np and common_dtype == "object":
+            result_dtype = "object"
+        elif is_masked_ea:
+            if name in ("sum",) and is_bool_dtype(common_dtype):
+                result_dtype = "Int64"
+            elif name in (
+                "var",
+                "std",
+                "kurt",
+                "mean",
+                "median",
+                "sem",
+                "skew",
+            ):
+                result_dtype = "Float64"
+            elif name not in ("argmax", "argmin", "count", "nunique"):
+                result_dtype = common_dtype
+
+        labels = self._get_agg_axis(axis)
+        result = self._constructor_sliced(result, index=labels, dtype=result_dtype)
+        return result
 
     def _reduce_axis1(self, name: str, func, skipna: bool) -> Series:
         """
diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py
index 28809e2ecb788..bf2dcf0d3a29a 100644
--- a/pandas/tests/frame/test_reductions.py
+++ b/pandas/tests/frame/test_reductions.py
@@ -9,7 +9,12 @@
 from pandas.compat import is_platform_windows
 import pandas.util._test_decorators as td
 
-from pandas.core.dtypes.common import is_categorical_dtype
+from pandas.core.dtypes.common import (
+    is_categorical_dtype,
+    is_float_dtype,
+    is_integer_dtype,
+    is_unsigned_integer_dtype,
+)
 
 import pandas as pd
 from pandas import (
@@ -1648,6 +1653,125 @@ def test_minmax_extensionarray(method, numeric_only):
     tm.assert_series_equal(result, expected)
 
 
+@pytest.mark.parametrize(
+    "method",
+    [
+        "all",
+        "any",
+        "count",
+        "idxmax",
+        "idxmin",
+        "kurt",
+        "kurtosis",
+        "max",
+        "mean",
+        "median",
+        "min",
+        "nunique",
+        "prod",
+        "product",
+        "sem",
+        "skew",
+        "std",
+        "sum",
+        "var",
+    ],
+)
+@pytest.mark.parametrize("min_count", [0, 2])
+def test_numeric_ea_axis_1(method, skipna, min_count, any_numeric_ea_dtype):
+    """Axis=1 reductions on numeric EA frames match a float reference result."""
+    df = DataFrame(
+        {
+            "a": Series([0, 1, 2, 3], dtype=any_numeric_ea_dtype),
+            "b": Series([0, 1, pd.NA, 3], dtype=any_numeric_ea_dtype),
+        },
+    )
+    # Same data as plain floats (pd.NA -> np.nan); used to compute the reference.
+    expected_df = DataFrame({"a": [0.0, 1.0, 2.0, 3.0], "b": [0.0, 1.0, np.nan, 3.0]})
+    # The expected result dtype depends on the reduction being tested.
+    if method in ("count", "nunique"):
+        expected_dtype = "int64"
+    elif method in ("any", "all"):
+        expected_dtype = "bool"
+    elif method in ("var", "std", "skew", "kurt", "mean", "median", "kurtosis", "sem"):
+        expected_dtype = "Float64"
+    else:
+        expected_dtype = any_numeric_ea_dtype
+
+    # count/nunique take no skipna; min_count applies only to sum/prod/product.
+    kwargs = {}
+    if method not in ("count", "nunique"):
+        kwargs["skipna"] = skipna
+    if method in ("prod", "product", "sum"):
+        kwargs["min_count"] = min_count
+    result = getattr(df, method)(axis=1, **kwargs)
+    expected = getattr(expected_df, method)(axis=1, **kwargs)
+    # idxmax/idxmin yield column labels, so the dtype cast is skipped for them.
+    if method not in ("idxmax", "idxmin"):
+        expected = expected.astype(expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "method, expected_value",
+    [
+        ("all", True),
+        ("any", True),
+        ("count", 1),
+        ("idxmax", "a"),
+        ("idxmin", "a"),
+        ("kurt", np.nan),
+        ("kurtosis", np.nan),
+        ("max", 1),
+        ("mean", 1.0),
+        ("median", 1.0),
+        ("min", 1),
+        ("nunique", 1),
+        ("prod", 1),
+        ("product", 1),
+        ("sem", np.nan),
+        ("skew", np.nan),
+        ("std", np.nan),
+        ("sum", 1),
+        ("var", np.nan),
+    ],
+)
+def test_numeric_np_axis_1(method, expected_value, any_real_numpy_dtype):
+    """Check value and dtype of axis=1 reductions on a 1x1 NumPy-dtype frame."""
+    df = DataFrame({"a": Series([1], dtype=any_real_numpy_dtype)})
+    # Expected result dtype for each reduction, given the input column dtype.
+    if method in ("count", "nunique"):
+        expected_dtype = "int64"
+    elif method in ("prod", "product") and is_unsigned_integer_dtype(
+        any_real_numpy_dtype
+    ):
+        # Unsigned products are expected to stay unsigned, widened to 64 bits.
+        expected_dtype = "uint64"
+    elif method in ("sum", "prod", "product") and is_integer_dtype(
+        any_real_numpy_dtype
+    ):
+        # Other integer sums/products are expected as int64.
+        expected_dtype = "int64"
+    elif method in ("any", "all"):
+        expected_dtype = "bool"
+    elif method in ("var", "std", "skew", "kurt", "mean", "median", "kurtosis", "sem"):
+        if is_float_dtype(any_real_numpy_dtype):
+            # Float inputs are expected to keep their own bit width.
+            itemsize = max(2, df["a"].dtype.itemsize)
+            expected_dtype = f"float{8 * itemsize}"
+        else:
+            expected_dtype = "float64"
+    elif method in ("idxmax", "idxmin"):
+        expected_dtype = None
+    else:
+        expected_dtype = any_real_numpy_dtype
+
+    result = getattr(df, method)(axis=1)
+    # dtype=None (idxmax/idxmin) lets Series infer the dtype from the label.
+    expected = Series([expected_value], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+
 @pytest.mark.parametrize("ts_value", [Timestamp("2000-01-01"), pd.NaT])
 def test_frame_mixed_numeric_object_with_timestamp(ts_value):
     # GH 13912