
REGR: Performance regression in axis=1 DataFrame ops #51923

Closed
wants to merge 1 commit
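The PR page does not include a reproducer, so as a rough illustration only, the kind of operation affected looks like the following. The frame shape, dtypes, and timing harness below are assumptions, not taken from issue #51923:

```python
import timeit

import numpy as np
import pandas as pd

# A wide-ish frame, once with plain numpy float64 columns and once with the
# nullable (masked) Float64 extension dtype.
rng = np.random.default_rng(0)
df_np = pd.DataFrame(rng.standard_normal((10_000, 100)))
df_ea = df_np.astype("Float64")

# Row-wise reductions are the operations whose performance regressed; the
# change in this PR adds a fast path that reduces a single 2D ndarray
# instead of transposing the frame and reducing column by column.
print(timeit.timeit(lambda: df_np.sum(axis=1), number=20))
print(timeit.timeit(lambda: df_ea.sum(axis=1), number=20))
```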
115 changes: 80 additions & 35 deletions pandas/core/frame.py
@@ -141,12 +141,16 @@
is_integer_dtype,
is_iterator,
is_list_like,
is_numeric_dtype,
is_scalar,
is_sequence,
needs_i8_conversion,
pandas_dtype,
)
from pandas.core.dtypes.dtypes import ExtensionDtype
from pandas.core.dtypes.dtypes import (
BaseMaskedDtype,
ExtensionDtype,
)
from pandas.core.dtypes.missing import (
isna,
notna,
@@ -10450,9 +10454,6 @@ def _reduce(
assert filter_type is None or filter_type == "bool", filter_type
out_dtype = "bool" if filter_type == "bool" else None

if axis is not None:
axis = self._get_axis_number(axis)

def func(values: np.ndarray):
# We only use this in the case that operates on self.values
return op(values, axis=axis, skipna=skipna, **kwds)
@@ -10482,38 +10483,82 @@ def _get_data() -> DataFrame:
df = _get_data()
if axis is None:
return func(df.values)
elif axis == 1:
if len(df.index) == 0:
# Taking a transpose would result in no columns, losing the dtype.
# In the empty case, reducing along axis 0 or 1 gives the same
# result dtype, so reduce with axis=0 and ignore values
result = df._reduce(
op,
name,
axis=0,
skipna=skipna,
numeric_only=False,
filter_type=filter_type,
**kwds,
).iloc[:0]
result.index = df.index
return result
df = df.T

# After possibly _get_data and transposing, we are now in the
# simple case where we can use BlockManager.reduce
res = df._mgr.reduce(blk_func)
out = df._constructor(res).iloc[0]
if out_dtype is not None:
out = out.astype(out_dtype)
elif (df._mgr.get_dtypes() == object).any():
out = out.astype(object)
elif len(self) == 0 and name in ("sum", "prod"):
# Even if we are object dtype, follow numpy and return
# float64, see test_apply_funcs_over_empty
out = out.astype(np.float64)

return out
axis = self._get_axis_number(axis)
assert axis in [0, 1]

if len(df._mgr) > 0:
Review comment (Member): can this be len(df.columns)?

Review comment (Member): also, we only need to care about common_dtype in the axis=1 case, right? Can we avoid these checks otherwise?
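As a quick illustration of what this block computes and of the review suggestion, a hypothetical sketch follows; df.dtypes stands in for the internal df._mgr.get_dtypes(), and the imports mirror the ones added in this diff:

```python
import numpy as np
import pandas as pd
from pandas.core.dtypes.cast import find_common_type
from pandas.core.dtypes.dtypes import BaseMaskedDtype

df = pd.DataFrame(
    {
        "a": pd.array([1, 2, 3], dtype="Int64"),
        "b": pd.array([4, None, 6], dtype="Int32"),
    }
)
# len(df._mgr) counts items (columns), the same thing len(df.columns) reports.
assert len(df._mgr) == len(df.columns) == 2

common_dtype = find_common_type(list(df.dtypes))
print(common_dtype)                               # Int64
print(isinstance(common_dtype, BaseMaskedDtype))  # True  -> masked-EA fast path
print(isinstance(common_dtype, np.dtype))         # False -> not a plain numpy dtype
```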

common_dtype = find_common_type(list(df._mgr.get_dtypes()))
is_masked_ea = isinstance(common_dtype, BaseMaskedDtype)
is_np = isinstance(common_dtype, np.dtype)
else:
common_dtype = None

if axis == 0 or common_dtype is None or not (is_masked_ea or is_np):
if axis == 1:
if len(df.index) == 0:
# Taking a transpose would result in no columns, losing the dtype.
# In the empty case, reducing along axis 0 or 1 gives the same
# result dtype, so reduce with axis=0 and ignore values
result = df._reduce(
op,
name,
axis=0,
skipna=skipna,
numeric_only=False,
filter_type=filter_type,
**kwds,
).iloc[:0]
result.index = df.index
return result
df = df.T

# After possibly _get_data and transposing, we are now in the
# simple case where we can use BlockManager.reduce
res = df._mgr.reduce(blk_func)
out = df._constructor(res).iloc[0]
if out_dtype is not None:
out = out.astype(out_dtype)
elif (df._mgr.get_dtypes() == object).any():
out = out.astype(object)
elif len(self) == 0 and name in ["sum", "prod"]:
# Even if we are object dtype, follow numpy and return
# float64, see test_apply_funcs_over_empty
out = out.astype(np.float64)

return out

if is_np or not is_numeric_dtype(common_dtype):
values = df.values
Review comment (Member): not really relevant here, but in general df._values is better than df.values; it preserves e.g. DatetimeArray[tzaware].
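A small sketch of the difference the comment above refers to, using a hypothetical single-column frame (df._values is internal, not public API):

```python
import pandas as pd

df = pd.DataFrame({"t": pd.date_range("2000-01-01", periods=3, tz="UTC")})

print(df.values.dtype)   # object: tz-aware timestamps are boxed into an object ndarray
print(df._values.dtype)  # datetime64[ns, UTC]: the DatetimeArray dtype is preserved
```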

else:
# TODO: Better way to extract frame values as float64?
values = df.fillna(np.nan).astype("float64").values
result = func(values)

result_dtype = None
if filter_type == "bool" and notna(result).all():
result = result.astype(np.bool_)
elif is_np and common_dtype == "object":
result_dtype = "object"
elif is_masked_ea:
if name in ("sum",) and is_bool_dtype(common_dtype):
result_dtype = "Int64"
elif name in (
"var",
"std",
"kurt",
"mean",
"median",
"sem",
"skew",
):
result_dtype = "Float64"
elif name not in ("argmax", "argmin", "count", "nunique"):
result_dtype = common_dtype

labels = self._get_agg_axis(axis)
result = self._constructor_sliced(result, index=labels, dtype=result_dtype)
return result

def _reduce_axis1(self, name: str, func, skipna: bool) -> Series:
"""
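Stepping back from the diff, the shape of the new fast path can be sketched as a standalone function for the sum case. This is a simplified, hypothetical condensation of the logic above, not the actual implementation; it only handles masked-EA input with the default skipna=True:

```python
import numpy as np
import pandas as pd
from pandas.core.dtypes.cast import find_common_type
from pandas.core.dtypes.dtypes import BaseMaskedDtype


def rowwise_sum_fast(df: pd.DataFrame) -> pd.Series:
    """Reduce along axis=1 over one 2D float64 ndarray instead of
    transposing the frame and reducing block by block."""
    common_dtype = find_common_type(list(df.dtypes))
    if isinstance(common_dtype, BaseMaskedDtype):
        # Masked EA columns: fill missing values with NaN, extract a plain
        # float64 ndarray, reduce with numpy, then re-attach the nullable
        # result dtype (sum keeps the common dtype, e.g. Int64).
        values = df.fillna(np.nan).astype("float64").values
        result = np.nansum(values, axis=1)
        return pd.Series(result, index=df.index, dtype=common_dtype)
    # Anything else: fall back to the regular implementation.
    return df.sum(axis=1)


df = pd.DataFrame(
    {
        "a": pd.array([0, 1, None], dtype="Int64"),
        "b": pd.array([1, 2, 3], dtype="Int64"),
    }
)
print(rowwise_sum_fast(df))  # Int64 Series: [1, 3, 3]
```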
126 changes: 125 additions & 1 deletion pandas/tests/frame/test_reductions.py
@@ -9,7 +9,12 @@
from pandas.compat import is_platform_windows
import pandas.util._test_decorators as td

from pandas.core.dtypes.common import is_categorical_dtype
from pandas.core.dtypes.common import (
is_categorical_dtype,
is_float_dtype,
is_integer_dtype,
is_unsigned_integer_dtype,
)

import pandas as pd
from pandas import (
@@ -1648,6 +1653,125 @@ def test_minmax_extensionarray(method, numeric_only):
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize(
"method",
[
"all",
"any",
"count",
"idxmax",
"idxmin",
"kurt",
"kurtosis",
"max",
"mean",
"median",
"min",
"nunique",
"prod",
"product",
"sem",
"skew",
"std",
"sum",
"var",
],
)
@pytest.mark.parametrize("min_count", [0, 2])
def test_numeric_ea_axis_1(method, skipna, min_count, any_numeric_ea_dtype):
df = DataFrame(
{
"a": Series([0, 1, 2, 3], dtype=any_numeric_ea_dtype),
"b": Series([0, 1, pd.NA, 3], dtype=any_numeric_ea_dtype),
},
)
expected_df = DataFrame(
{
"a": [0.0, 1.0, 2.0, 3.0],
"b": [0.0, 1.0, np.nan, 3.0],
},
)
if method in ("count", "nunique"):
expected_dtype = "int64"
elif method in ("any", "all"):
expected_dtype = "bool"
elif method in ("var", "std", "skew", "kurt", "mean", "median", "kurtosis", "sem"):
expected_dtype = "Float64"
else:
expected_dtype = any_numeric_ea_dtype

kwargs = {}
if method not in ("count", "nunique", "quantile"):
kwargs["skipna"] = skipna
if method in ("prod", "product", "sum"):
kwargs["min_count"] = min_count
result = getattr(df, method)(axis=1, **kwargs)
expected = getattr(expected_df, method)(axis=1, **kwargs)
if method not in ("idxmax", "idxmin"):
expected = expected.astype(expected_dtype)
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize(
"method, expected_value",
[
("all", True),
("any", True),
("count", 1),
("idxmax", "a"),
("idxmin", "a"),
("kurt", np.nan),
("kurtosis", np.nan),
("max", 1),
("mean", 1.0),
("median", 1.0),
("min", 1),
("nunique", 1),
("prod", 1),
("product", 1),
("sem", np.nan),
("skew", np.nan),
("std", np.nan),
("sum", 1),
("var", np.nan),
],
)
def test_numeric_np_axis_1(method, expected_value, any_real_numpy_dtype):
df = DataFrame(
{
"a": Series([1], dtype=any_real_numpy_dtype),
},
)
if method in ("count", "nunique"):
expected_dtype = "int64"
elif method in ("prod", "product") and is_unsigned_integer_dtype(
any_real_numpy_dtype
):
expected_dtype = "uint64"
elif method in ("sum", "prod", "product") and is_integer_dtype(
any_real_numpy_dtype
):
expected_dtype = "int64"
elif method in ("any", "all"):
expected_dtype = "bool"
elif method in ("var", "std", "skew", "kurt", "mean", "median", "kurtosis", "sem"):
if is_float_dtype(any_real_numpy_dtype):
itemsize = max(2, df["a"].dtype.itemsize)
expected_dtype = f"float{8 * itemsize}"
else:
expected_dtype = "float64"
elif method in ("idxmax", "idxmin"):
expected_dtype = None
else:
expected_dtype = any_real_numpy_dtype

result = getattr(df, method)(axis=1)
expected = Series([expected_value], dtype=expected_dtype)
if method not in ("idxmax", "idxmin"):
expected = expected.astype(expected_dtype)
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("ts_value", [Timestamp("2000-01-01"), pd.NaT])
def test_frame_mixed_numeric_object_with_timestamp(ts_value):
# GH 13912