Skip to content

REGR: Performance of DataFrame axis=1 reduction ops with EA #51955

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1197,7 +1197,6 @@ Numeric
- Bug in :meth:`Series.__add__` casting to object for list and masked :class:`Series` (:issue:`22962`)
- Bug in :meth:`~arrays.ArrowExtensionArray.mode` where ``dropna=False`` was not respected when there were ``NA`` values (:issue:`50982`)
- Bug in :meth:`DataFrame.query` with ``engine="numexpr"`` where column names ``min`` or ``max`` would raise a ``TypeError`` (:issue:`50937`)
- Bug in :meth:`DataFrame.min` and :meth:`DataFrame.max` with tz-aware data containing ``pd.NaT`` and ``axis=1`` would return incorrect results (:issue:`51242`)

Conversion
^^^^^^^^^^
Expand Down
64 changes: 32 additions & 32 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@
is_integer_dtype,
is_iterator,
is_list_like,
is_object_dtype,
is_scalar,
is_sequence,
needs_i8_conversion,
Expand Down Expand Up @@ -10917,44 +10918,43 @@ def _get_data() -> DataFrame:
data = self._get_bool_data()
return data

# Case with EAs see GH#35881
df = self
if numeric_only:
df = _get_data()
if axis is None:
return func(df.values)
elif axis == 1:
if len(df.index) == 0:
# Taking a transpose would result in no columns, losing the dtype.
# In the empty case, reducing along axis 0 or 1 gives the same
# result dtype, so reduce with axis=0 and ignore values
result = df._reduce(
op,
name,
axis=0,
skipna=skipna,
numeric_only=False,
filter_type=filter_type,
**kwds,
).iloc[:0]
result.index = df.index
return result
df = df.T

# After possibly _get_data and transposing, we are now in the
# simple case where we can use BlockManager.reduce
res = df._mgr.reduce(blk_func)
out = df._constructor(res).iloc[0]
if out_dtype is not None:
out = out.astype(out_dtype)
elif (df._mgr.get_dtypes() == object).any():
out = out.astype(object)
elif len(self) == 0 and name in ("sum", "prod"):
# Even if we are object dtype, follow numpy and return
# float64, see test_apply_funcs_over_empty
out = out.astype(np.float64)
elif axis == 0:
res = df._mgr.reduce(blk_func)
out = df._constructor(res).iloc[0]
if out_dtype is not None:
out = out.astype(out_dtype)
elif axis == 0 and len(self) == 0 and name in ["sum", "prod"]:
# Even if we are object dtype, follow numpy and return
# float64, see test_apply_funcs_over_empty
out = out.astype(np.float64)
elif (df._mgr.get_dtypes() == object).any():
out = out.astype(object)

return out

values = df.values
result = func(values)

if hasattr(result, "dtype"):
if filter_type == "bool" and notna(result).all():
result = result.astype(np.bool_)
elif (df._mgr.get_dtypes() == object).any():
result = result.astype(object)
elif filter_type is None and is_object_dtype(result.dtype):
try:
result = result.astype(np.float64)
except (ValueError, TypeError):
# try to coerce to the original dtypes item by item if we can
pass

return out
labels = self._get_agg_axis(axis)
result = self._constructor_sliced(result, index=labels)
return result

def _reduce_axis1(self, name: str, func, skipna: bool) -> Series:
"""
Expand Down
1 change: 1 addition & 0 deletions pandas/tests/frame/test_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -1464,6 +1464,7 @@ def test_preserve_timezone(self, initial: str, method):
result = getattr(df, method)(axis=1)
tm.assert_series_equal(result, expected)

@pytest.mark.xfail(reason="GH#51955 - avoid perf regression in axis=1 ops")
@pytest.mark.parametrize("method", ["min", "max"])
def test_minmax_tzaware_skipna_axis_1(self, method, skipna):
# GH#51242
Expand Down