Skip to content

DEPR: Enforce default of numeric_only=False in DataFrame methods #49622

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -554,6 +554,8 @@ Removal of prior version deprecations/changes
- Removed ``na_sentinel`` argument from :func:`factorize`, :meth:`.Index.factorize`, and :meth:`.ExtensionArray.factorize` (:issue:`47157`)
- Changed behavior of :meth:`DataFrameGroupBy.apply` and :meth:`SeriesGroupBy.apply` so that ``group_keys`` is respected even if a transformer is detected (:issue:`34998`)
- Enforced deprecation of ``numeric_only=None`` (the default) in DataFrame reductions that would silently drop columns that raised; ``numeric_only`` now defaults to ``False`` (:issue:`41480`)
- Changed default of ``numeric_only`` to ``False`` in all DataFrame methods with that argument (:issue:`46096`, :issue:`46906`)
- Changed default of ``numeric_only`` to ``False`` in :meth:`Series.rank` (:issue:`47561`)
-

.. ---------------------------------------------------------------------------
Expand Down
64 changes: 0 additions & 64 deletions pandas/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
cast,
overload,
)
import warnings

import numpy as np

Expand All @@ -37,7 +36,6 @@
RandomState,
T,
)
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
from pandas.core.dtypes.common import (
Expand Down Expand Up @@ -631,65 +629,3 @@ def fill_missing_names(names: Sequence[Hashable | None]) -> list[Hashable]:
list of column names with the None values replaced.
"""
return [f"level_{i}" if name is None else name for i, name in enumerate(names)]


def resolve_numeric_only(numeric_only: bool | None | lib.NoDefault) -> bool:
    """Translate the ``numeric_only`` argument into the value to act on.

    See GH#46560 for details on the deprecation.

    Parameters
    ----------
    numeric_only : bool, None, or lib.no_default
        Value passed to the method.

    Returns
    -------
    ``True`` when ``numeric_only`` is ``lib.no_default``, ``False`` when it
    is ``None``, otherwise ``numeric_only`` itself, unchanged.
    """
    # lib.no_default is the default of methods that behave like
    # numeric_only=True and only gained the argument in 1.5.0.
    if numeric_only is lib.no_default:
        return True

    # None is the default of methods that already had the argument before
    # 1.5.0 and attempt every column first.
    if numeric_only is None:
        return False

    # An explicit value from the caller is passed through as-is.
    return numeric_only


def deprecate_numeric_only_default(
    cls: type, name: str, deprecate_none: bool = False
) -> None:
    """Issue the FutureWarning announcing the new ``numeric_only`` default.

    See GH#46560 for details on the deprecation.

    Parameters
    ----------
    cls : type
        pandas type that is generating the warning; its ``__name__`` is
        interpolated into the message.
    name : str
        Name of the method that is generating the warning.
    deprecate_none : bool, default False
        Whether to also warn about the deprecation of specifying
        ``numeric_only=None``.
    """
    # For "all" and "any" the message refers to the argument as bool_only.
    arg_name = "bool_only" if name in ["all", "any"] else "numeric_only"

    # Collect the message fragments in order and join them once at the end.
    parts = [
        f"The default value of {arg_name} in {cls.__name__}.{name} is "
        "deprecated. In a future version, it will default to False. "
    ]
    if deprecate_none:
        parts.append(f"In addition, specifying '{arg_name}=None' is deprecated. ")
    parts.append(
        f"Select only valid columns or specify the value of {arg_name} to silence "
        "this warning."
    )

    warnings.warn("".join(parts), FutureWarning, stacklevel=find_stack_level())
83 changes: 28 additions & 55 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,6 @@
is_integer_dtype,
is_iterator,
is_list_like,
is_numeric_dtype,
is_object_dtype,
is_scalar,
is_sequence,
Expand Down Expand Up @@ -9939,7 +9938,7 @@ def corr(
self,
method: CorrelationMethod = "pearson",
min_periods: int = 1,
numeric_only: bool | lib.NoDefault = lib.no_default,
numeric_only: bool = False,
) -> DataFrame:
"""
Compute pairwise correlation of columns, excluding NA/null values.
Expand All @@ -9960,14 +9959,13 @@ def corr(
Minimum number of observations required per pair of columns
to have a valid result. Currently only available for Pearson
and Spearman correlation.
numeric_only : bool, default True
numeric_only : bool, default False
Include only `float`, `int` or `boolean` data.

.. versionadded:: 1.5.0

.. deprecated:: 1.5.0
The default value of ``numeric_only`` will be ``False`` in a future
version of pandas.
.. versionchanged:: 2.0.0
The default value of ``numeric_only`` is now ``False``.

Returns
-------
Expand Down Expand Up @@ -10007,11 +10005,7 @@ def corr(
dogs 1.0 NaN
cats NaN 1.0
""" # noqa:E501
numeric_only_bool = com.resolve_numeric_only(numeric_only)
data = self._get_numeric_data() if numeric_only_bool else self
if numeric_only is lib.no_default and len(data.columns) < len(self.columns):
com.deprecate_numeric_only_default(type(self), "corr")

data = self._get_numeric_data() if numeric_only else self
cols = data.columns
idx = cols.copy()
mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False)
Expand Down Expand Up @@ -10058,7 +10052,7 @@ def cov(
self,
min_periods: int | None = None,
ddof: int | None = 1,
numeric_only: bool | lib.NoDefault = lib.no_default,
numeric_only: bool = False,
) -> DataFrame:
"""
Compute pairwise covariance of columns, excluding NA/null values.
Expand Down Expand Up @@ -10090,14 +10084,13 @@ def cov(

.. versionadded:: 1.1.0

numeric_only : bool, default True
numeric_only : bool, default False
Include only `float`, `int` or `boolean` data.

.. versionadded:: 1.5.0

.. deprecated:: 1.5.0
The default value of ``numeric_only`` will be ``False`` in a future
version of pandas.
.. versionchanged:: 2.0.0
The default value of ``numeric_only`` is now ``False``.

Returns
-------
Expand Down Expand Up @@ -10168,11 +10161,7 @@ def cov(
b NaN 1.248003 0.191417
c -0.150812 0.191417 0.895202
"""
numeric_only_bool = com.resolve_numeric_only(numeric_only)
data = self._get_numeric_data() if numeric_only_bool else self
if numeric_only is lib.no_default and len(data.columns) < len(self.columns):
com.deprecate_numeric_only_default(type(self), "cov")

data = self._get_numeric_data() if numeric_only else self
cols = data.columns
idx = cols.copy()
mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False)
Expand All @@ -10196,7 +10185,7 @@ def corrwith(
axis: Axis = 0,
drop: bool = False,
method: CorrelationMethod = "pearson",
numeric_only: bool | lib.NoDefault = lib.no_default,
numeric_only: bool = False,
) -> Series:
"""
Compute pairwise correlation.
Expand Down Expand Up @@ -10224,14 +10213,13 @@ def corrwith(
* callable: callable with input two 1d ndarrays
and returning a float.

numeric_only : bool, default True
numeric_only : bool, default False
Include only `float`, `int` or `boolean` data.

.. versionadded:: 1.5.0

.. deprecated:: 1.5.0
The default value of ``numeric_only`` will be ``False`` in a future
version of pandas.
.. versionchanged:: 2.0.0
The default value of ``numeric_only`` is now ``False``.

Returns
-------
Expand Down Expand Up @@ -10264,15 +10252,12 @@ def corrwith(
dtype: float64
""" # noqa:E501
axis = self._get_axis_number(axis)
numeric_only_bool = com.resolve_numeric_only(numeric_only)
this = self._get_numeric_data() if numeric_only_bool else self
if numeric_only is lib.no_default and len(this.columns) < len(self.columns):
com.deprecate_numeric_only_default(type(self), "corrwith")
this = self._get_numeric_data() if numeric_only else self

if isinstance(other, Series):
return this.apply(lambda x: other.corr(x, method=method), axis=axis)

if numeric_only_bool:
if numeric_only:
other = other._get_numeric_data()
left, right = this.align(other, join="inner", copy=False)

Expand All @@ -10286,14 +10271,14 @@ def corrwith(
right = right + left * 0

# demeaned data
ldem = left - left.mean(numeric_only=numeric_only_bool)
rdem = right - right.mean(numeric_only=numeric_only_bool)
ldem = left - left.mean(numeric_only=numeric_only)
rdem = right - right.mean(numeric_only=numeric_only)

num = (ldem * rdem).sum()
dom = (
(left.count() - 1)
* left.std(numeric_only=numeric_only_bool)
* right.std(numeric_only=numeric_only_bool)
* left.std(numeric_only=numeric_only)
* right.std(numeric_only=numeric_only)
)

correl = num / dom
Expand Down Expand Up @@ -10546,12 +10531,6 @@ def _get_data() -> DataFrame:
# float64, see test_apply_funcs_over_empty
out = out.astype(np.float64)

if numeric_only is None and out.shape[0] != df.shape[1]:
# columns have been dropped GH#41480
com.deprecate_numeric_only_default(
type(self), name, deprecate_none=True
)

return out

assert not numeric_only and axis == 1
Expand Down Expand Up @@ -10801,7 +10780,7 @@ def quantile(
self,
q: float = ...,
axis: Axis = ...,
numeric_only: bool | lib.NoDefault = ...,
numeric_only: bool = ...,
interpolation: QuantileInterpolation = ...,
) -> Series:
...
Expand All @@ -10811,7 +10790,7 @@ def quantile(
self,
q: AnyArrayLike | Sequence[float],
axis: Axis = ...,
numeric_only: bool | lib.NoDefault = ...,
numeric_only: bool = ...,
interpolation: QuantileInterpolation = ...,
) -> Series | DataFrame:
...
Expand All @@ -10821,7 +10800,7 @@ def quantile(
self,
q: float | AnyArrayLike | Sequence[float] = ...,
axis: Axis = ...,
numeric_only: bool | lib.NoDefault = ...,
numeric_only: bool = ...,
interpolation: QuantileInterpolation = ...,
) -> Series | DataFrame:
...
Expand All @@ -10830,7 +10809,7 @@ def quantile(
self,
q: float | AnyArrayLike | Sequence[float] = 0.5,
axis: Axis = 0,
numeric_only: bool | lib.NoDefault = no_default,
numeric_only: bool = False,
interpolation: QuantileInterpolation = "linear",
method: Literal["single", "table"] = "single",
) -> Series | DataFrame:
Expand All @@ -10843,13 +10822,11 @@ def quantile(
Value between 0 <= q <= 1, the quantile(s) to compute.
axis : {0 or 'index', 1 or 'columns'}, default 0
Equals 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
numeric_only : bool, default True
If False, the quantile of datetime and timedelta data will be
computed as well.
numeric_only : bool, default False
Include only `float`, `int` or `boolean` data.

.. deprecated:: 1.5.0
The default value of ``numeric_only`` will be ``False`` in a future
version of pandas.
.. versionchanged:: 2.0.0
The default value of ``numeric_only`` is now ``False``.

interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
This optional parameter specifies the interpolation method to use,
Expand Down Expand Up @@ -10921,10 +10898,6 @@ def quantile(
"""
validate_percentile(q)
axis = self._get_axis_number(axis)
any_not_numeric = any(not is_numeric_dtype(x) for x in self.dtypes)
if numeric_only is no_default and any_not_numeric:
com.deprecate_numeric_only_default(type(self), "quantile")
numeric_only = com.resolve_numeric_only(numeric_only)

if not is_list_like(q):
# BlockManager.quantile expects listlike, so we wrap and unwrap here
Expand Down
40 changes: 6 additions & 34 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -8853,7 +8853,7 @@ def rank(
self: NDFrameT,
axis: Axis = 0,
method: str = "average",
numeric_only: bool_t | None | lib.NoDefault = lib.no_default,
numeric_only: bool_t = False,
na_option: str = "keep",
ascending: bool_t = True,
pct: bool_t = False,
Expand All @@ -8878,8 +8878,12 @@ def rank(
* first: ranks assigned in order they appear in the array
* dense: like 'min', but rank always increases by 1 between groups.

numeric_only : bool, optional
numeric_only : bool, default False
For DataFrame objects, rank only numeric columns if set to True.

.. versionchanged:: 2.0.0
The default value of ``numeric_only`` is now ``False``.

na_option : {'keep', 'top', 'bottom'}, default 'keep'
How to rank NaN values:

Expand Down Expand Up @@ -8953,20 +8957,6 @@ def rank(
3 spider 8.0 4.0 4.0 4.0 1.000
4 snake NaN NaN NaN 5.0 NaN
"""
warned = False
if numeric_only is None:
# GH#45036
warnings.warn(
f"'numeric_only=None' in {type(self).__name__}.rank is deprecated "
"and will raise in a future version. Pass either 'True' or "
"'False'. 'False' will be the default.",
FutureWarning,
stacklevel=find_stack_level(),
)
warned = True
elif numeric_only is lib.no_default:
numeric_only = None

axis_int = self._get_axis_number(axis)

if na_option not in {"keep", "top", "bottom"}:
Expand Down Expand Up @@ -9002,24 +8992,6 @@ def ranker(data):
ranks_obj = self._constructor(ranks, **data._construct_axes_dict())
return ranks_obj.__finalize__(self, method="rank")

# if numeric_only is None, and we can't get anything, we try with
# numeric_only=True
if numeric_only is None:
try:
return ranker(self)
except TypeError:
numeric_only = True
if not warned:
# Only warn here if we didn't already issue a warning above
# GH#45036
warnings.warn(
f"Dropping of nuisance columns in {type(self).__name__}.rank "
"is deprecated; in a future version this will raise TypeError. "
"Select only valid columns before calling rank.",
FutureWarning,
stacklevel=find_stack_level(),
)

if numeric_only:
if self.ndim == 1 and not is_numeric_dtype(self.dtype):
# GH#47500
Expand Down
Loading