From ebf73ee134486c8a076af3ba81ef2ec0e281ca2c Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Mon, 27 Feb 2023 17:37:55 -0500 Subject: [PATCH] REGR: Performance regression in axis=1 DataFrame ops --- pandas/core/frame.py | 115 ++++++++++++++++------- pandas/tests/frame/test_reductions.py | 126 +++++++++++++++++++++++++- 2 files changed, 205 insertions(+), 36 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e6af875ab1c23..c711da9fc7066 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -141,12 +141,16 @@ is_integer_dtype, is_iterator, is_list_like, + is_numeric_dtype, is_scalar, is_sequence, needs_i8_conversion, pandas_dtype, ) -from pandas.core.dtypes.dtypes import ExtensionDtype +from pandas.core.dtypes.dtypes import ( + BaseMaskedDtype, + ExtensionDtype, +) from pandas.core.dtypes.missing import ( isna, notna, @@ -10450,9 +10454,6 @@ def _reduce( assert filter_type is None or filter_type == "bool", filter_type out_dtype = "bool" if filter_type == "bool" else None - if axis is not None: - axis = self._get_axis_number(axis) - def func(values: np.ndarray): # We only use this in the case that operates on self.values return op(values, axis=axis, skipna=skipna, **kwds) @@ -10482,38 +10483,82 @@ def _get_data() -> DataFrame: df = _get_data() if axis is None: return func(df.values) - elif axis == 1: - if len(df.index) == 0: - # Taking a transpose would result in no columns, losing the dtype. - # In the empty case, reducing along axis 0 or 1 gives the same - # result dtype, so reduce with axis=0 and ignore values - result = df._reduce( - op, - name, - axis=0, - skipna=skipna, - numeric_only=False, - filter_type=filter_type, - **kwds, - ).iloc[:0] - result.index = df.index - return result - df = df.T - - # After possibly _get_data and transposing, we are now in the - # simple case where we can use BlockManager.reduce - res = df._mgr.reduce(blk_func) - out = df._constructor(res).iloc[0] - if out_dtype is not None: - out = out.astype(out_dtype) - elif (df._mgr.get_dtypes() == object).any(): - out = out.astype(object) - elif len(self) == 0 and name in ("sum", "prod"): - # Even if we are object dtype, follow numpy and return - # float64, see test_apply_funcs_over_empty - out = out.astype(np.float64) - return out + axis = self._get_axis_number(axis) + assert axis in [0, 1] + + if len(df._mgr) > 0: + common_dtype = find_common_type(list(df._mgr.get_dtypes())) + is_masked_ea = isinstance(common_dtype, BaseMaskedDtype) + is_np = isinstance(common_dtype, np.dtype) + else: + common_dtype = None + + if axis == 0 or common_dtype is None or not (is_masked_ea or is_np): + if axis == 1: + if len(df.index) == 0: + # Taking a transpose would result in no columns, losing the dtype. + # In the empty case, reducing along axis 0 or 1 gives the same + # result dtype, so reduce with axis=0 and ignore values + result = df._reduce( + op, + name, + axis=0, + skipna=skipna, + numeric_only=False, + filter_type=filter_type, + **kwds, + ).iloc[:0] + result.index = df.index + return result + df = df.T + + # After possibly _get_data and transposing, we are now in the + # simple case where we can use BlockManager.reduce + res = df._mgr.reduce(blk_func) + out = df._constructor(res).iloc[0] + if out_dtype is not None: + out = out.astype(out_dtype) + elif (df._mgr.get_dtypes() == object).any(): + out = out.astype(object) + elif len(self) == 0 and name in ["sum", "prod"]: + # Even if we are object dtype, follow numpy and return + # float64, see test_apply_funcs_over_empty + out = out.astype(np.float64) + + return out + + if is_np or not is_numeric_dtype(common_dtype): + values = df.values + else: + # TODO: Better way to extract frame values as float64? + values = df.fillna(np.nan).astype("float64").values + result = func(values) + + result_dtype = None + if filter_type == "bool" and notna(result).all(): + result = result.astype(np.bool_) + elif is_np and common_dtype == "object": + result_dtype = "object" + elif is_masked_ea: + if name in ("sum",) and is_bool_dtype(common_dtype): + result_dtype = "Int64" + elif name in ( + "var", + "std", + "kurt", + "mean", + "median", + "sem", + "skew", + ): + result_dtype = "Float64" + elif name not in ("argmax", "argmin", "count", "nunique"): + result_dtype = common_dtype + + labels = self._get_agg_axis(axis) + result = self._constructor_sliced(result, index=labels, dtype=result_dtype) + return result def _reduce_axis1(self, name: str, func, skipna: bool) -> Series: """ diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 28809e2ecb788..bf2dcf0d3a29a 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -9,7 +9,12 @@ from pandas.compat import is_platform_windows import pandas.util._test_decorators as td -from pandas.core.dtypes.common import is_categorical_dtype +from pandas.core.dtypes.common import ( + is_categorical_dtype, + is_float_dtype, + is_integer_dtype, + is_unsigned_integer_dtype, +) import pandas as pd from pandas import ( @@ -1648,6 +1653,125 @@ def test_minmax_extensionarray(method, numeric_only): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize( + "method", + [ + "all", + "any", + "count", + "idxmax", + "idxmin", + "kurt", + "kurtosis", + "max", + "mean", + "median", + "min", + "nunique", + "prod", + "product", + "sem", + "skew", + "std", + "sum", + "var", + ], +) +@pytest.mark.parametrize("min_count", [0, 2]) +def test_numeric_ea_axis_1(method, skipna, min_count, any_numeric_ea_dtype): + df = DataFrame( + { + "a": Series([0, 1, 2, 3], dtype=any_numeric_ea_dtype), + "b": Series([0, 1, pd.NA, 3], dtype=any_numeric_ea_dtype), + }, + ) + expected_df = DataFrame( + { + "a": [0.0, 1.0, 2.0, 3.0], + "b": [0.0, 1.0, np.nan, 3.0], + }, + ) + if method in ("count", "nunique"): + expected_dtype = "int64" + elif method in ("any", "all"): + expected_dtype = "bool" + elif method in ("var", "std", "skew", "kurt", "mean", "median", "kurtosis", "sem"): + expected_dtype = "Float64" + else: + expected_dtype = any_numeric_ea_dtype + + kwargs = {} + if method not in ("count", "nunique", "quantile"): + kwargs["skipna"] = skipna + if method in ("prod", "product", "sum"): + kwargs["min_count"] = min_count + result = getattr(df, method)(axis=1, **kwargs) + expected = getattr(expected_df, method)(axis=1, **kwargs) + if method not in ("idxmax", "idxmin"): + expected = expected.astype(expected_dtype) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "method, expected_value", + [ + ("all", True), + ("any", True), + ("count", 1), + ("idxmax", "a"), + ("idxmin", "a"), + ("kurt", np.nan), + ("kurtosis", np.nan), + ("max", 1), + ("mean", 1.0), + ("median", 1.0), + ("min", 1), + ("nunique", 1), + ("prod", 1), + ("product", 1), + ("sem", np.nan), + ("skew", np.nan), + ("std", np.nan), + ("sum", 1), + ("var", np.nan), + ], +) +def test_numeric_np_axis_1(method, expected_value, any_real_numpy_dtype): + df = DataFrame( + { + "a": Series([1], dtype=any_real_numpy_dtype), + }, + ) + if method in ("count", "nunique"): + expected_dtype = "int64" + elif method in ("prod", "product") and is_unsigned_integer_dtype( + any_real_numpy_dtype + ): + expected_dtype = "uint64" + elif method in ("sum", "prod", "product") and is_integer_dtype( + any_real_numpy_dtype + ): + expected_dtype = "int64" + elif method in ("any", "all"): + expected_dtype = "bool" + elif method in ("var", "std", "skew", "kurt", "mean", "median", "kurtosis", "sem"): + if is_float_dtype(any_real_numpy_dtype): + itemsize = max(2, df["a"].dtype.itemsize) + expected_dtype = f"float{8 * itemsize}" + else: + expected_dtype = "float64" + elif method in ("idxmax", "idxmin"): + expected_dtype = None + else: + expected_dtype = any_real_numpy_dtype + + result = getattr(df, method)(axis=1) + expected = Series([expected_value], dtype=expected_dtype) + if method not in ("idxmax", "idxmin"): + expected = expected.astype(expected_dtype) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("ts_value", [Timestamp("2000-01-01"), pd.NaT]) def test_frame_mixed_numeric_object_with_timestamp(ts_value): # GH 13912