From a79bec6ddd66048c4dbef08426082f78635c4811 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 31 Oct 2024 11:16:04 +0100 Subject: [PATCH] String dtype: implement sum reduction (#59853) (cherry picked from commit 2fdb16b347fc34f78213868a8a973447ac79ab2d) --- doc/source/whatsnew/v2.3.0.rst | 2 +- pandas/core/array_algos/masked_reductions.py | 4 ++ pandas/core/arrays/arrow/array.py | 32 ++++++++++ pandas/core/arrays/string_.py | 18 +++++- pandas/core/arrays/string_arrow.py | 6 +- pandas/tests/apply/test_frame_apply.py | 10 --- pandas/tests/apply/test_invalid_arg.py | 39 ++++++------ pandas/tests/arrays/string_/test_string.py | 2 - pandas/tests/extension/test_arrow.py | 26 ++------ pandas/tests/extension/test_string.py | 2 +- pandas/tests/frame/test_reductions.py | 63 ++++++------------- pandas/tests/groupby/test_groupby.py | 15 +---- pandas/tests/groupby/test_raises.py | 3 +- .../tests/groupby/transform/test_transform.py | 11 +--- pandas/tests/series/test_reductions.py | 38 ++++------- 15 files changed, 121 insertions(+), 150 deletions(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index 00503766b062f..cc561a888f843 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -32,7 +32,7 @@ enhancement1 Other enhancements ^^^^^^^^^^^^^^^^^^ -- +- The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`) - .. --------------------------------------------------------------------------- diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py index 335fa1afc0f4e..6bf97729a79b1 100644 --- a/pandas/core/array_algos/masked_reductions.py +++ b/pandas/core/array_algos/masked_reductions.py @@ -62,6 +62,10 @@ def _reductions( ): return libmissing.NA + if values.dtype == np.dtype(object): + # object dtype does not support `where` without passing an initial + values = values[~mask] + return func(values, axis=axis, **kwargs) return func(values, where=~mask, axis=axis, **kwargs) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index f3d7a3cc6d694..51136961c0fb3 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -69,6 +69,7 @@ unpack_tuple_and_ellipses, validate_indices, ) +from pandas.core.nanops import check_below_min_count from pandas.core.strings.base import BaseStringArrayMethods from pandas.io._util import _arrow_dtype_mapping @@ -1694,6 +1695,37 @@ def pyarrow_meth(data, skip_nulls, **kwargs): denominator = pc.sqrt_checked(pc.count(self._pa_array)) return pc.divide_checked(numerator, denominator) + elif name == "sum" and ( + pa.types.is_string(pa_type) or pa.types.is_large_string(pa_type) + ): + + def pyarrow_meth(data, skip_nulls, min_count=0): # type: ignore[misc] + mask = pc.is_null(data) if data.null_count > 0 else None + if skip_nulls: + if min_count > 0 and check_below_min_count( + (len(data),), + None if mask is None else mask.to_numpy(), + min_count, + ): + return pa.scalar(None, type=data.type) + if data.null_count > 0: + # binary_join returns null if there is any null -> + # have to filter out any nulls + data = data.filter(pc.invert(mask)) + else: + if mask is not None or check_below_min_count( + (len(data),), None, min_count + ): + return pa.scalar(None, type=data.type) + + if pa.types.is_large_string(data.type): + # binary_join only supports string, not large_string + data = data.cast(pa.string()) + data_list = pa.ListArray.from_arrays( + [0, len(data)], data.combine_chunks() + )[0] + return pc.binary_join(data_list, "") + else: pyarrow_name = { "median": "quantile", diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 5b69344bac0c8..faad516b53a4a 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -812,8 +812,8 @@ def _reduce( else: return nanops.nanall(self._ndarray, skipna=skipna) - if name in ["min", "max"]: - result = getattr(self, name)(skipna=skipna, axis=axis) + if name in ["min", "max", "sum"]: + result = getattr(self, name)(skipna=skipna, axis=axis, **kwargs) if keepdims: return self._from_sequence([result], dtype=self.dtype) return result @@ -839,6 +839,20 @@ def max(self, axis=None, skipna: bool = True, **kwargs) -> Scalar: ) return self._wrap_reduction_result(axis, result) + def sum( + self, + *, + axis: AxisInt | None = None, + skipna: bool = True, + min_count: int = 0, + **kwargs, + ) -> Scalar: + nv.validate_sum((), kwargs) + result = masked_reductions.sum( + values=self._ndarray, mask=self.isna(), skipna=skipna + ) + return self._wrap_reduction_result(axis, result) + def value_counts(self, dropna: bool = True) -> Series: from pandas.core.algorithms import value_counts_internal as value_counts diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 9389f7cffca9f..e7dd4f9dc5718 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -430,7 +430,11 @@ def _reduce( return result.astype(np.bool_) return result - result = self._reduce_calc(name, skipna=skipna, keepdims=keepdims, **kwargs) + if name in ("min", "max", "sum", "argmin", "argmax"): + result = self._reduce_calc(name, skipna=skipna, keepdims=keepdims, **kwargs) + else: + raise TypeError(f"Cannot perform reduction '{name}' with string dtype") + if name in ("argmin", "argmax") and isinstance(result, pa.Array): return self._convert_int_result(result) elif isinstance(result, pa.Array): diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index 6a328dfb39be5..b7eac6b8f0ea1 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -4,10 +4,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - -from pandas.compat import HAS_PYARROW - from pandas.core.dtypes.dtypes import CategoricalDtype import pandas as pd @@ -1173,7 +1169,6 @@ def test_agg_with_name_as_column_name(): tm.assert_series_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_agg_multiple_mixed(): # GH 20909 mdf = DataFrame( @@ -1202,9 +1197,6 @@ def test_agg_multiple_mixed(): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" -) def test_agg_multiple_mixed_raises(): # GH 20909 mdf = DataFrame( @@ -1294,7 +1286,6 @@ def test_agg_reduce(axis, float_frame): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_nuiscance_columns(): # GH 15015 df = DataFrame( @@ -1471,7 +1462,6 @@ def test_apply_datetime_tz_issue(engine, request): tm.assert_series_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("df", [DataFrame({"A": ["a", None], "B": ["c", "d"]})]) @pytest.mark.parametrize("method", ["min", "max", "sum"]) def test_mixed_column_raises(df, method, using_infer_string): diff --git a/pandas/tests/apply/test_invalid_arg.py b/pandas/tests/apply/test_invalid_arg.py index 1c5b170c8753f..8963265b0a800 100644 --- a/pandas/tests/apply/test_invalid_arg.py +++ b/pandas/tests/apply/test_invalid_arg.py @@ -12,9 +12,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - -from pandas.compat import HAS_PYARROW from pandas.errors import SpecificationError from pandas import ( @@ -212,10 +209,6 @@ def transform(row): data.apply(transform, axis=1) -# we should raise a proper TypeError instead of propagating the pyarrow error -@pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" -) @pytest.mark.parametrize( "df, func, expected", tm.get_cython_table_params( @@ -225,21 +218,25 @@ def transform(row): def test_agg_cython_table_raises_frame(df, func, expected, axis, using_infer_string): # GH 21224 if using_infer_string: - import pyarrow as pa + if df.dtypes.iloc[0].storage == "pyarrow": + import pyarrow as pa - expected = (expected, pa.lib.ArrowNotImplementedError) + # TODO(infer_string) + # should raise a proper TypeError instead of propagating the pyarrow error - msg = "can't multiply sequence by non-int of type 'str'|has no kernel" + expected = (expected, pa.lib.ArrowNotImplementedError) + else: + expected = (expected, NotImplementedError) + + msg = ( + "can't multiply sequence by non-int of type 'str'|has no kernel|cannot perform" + ) warn = None if isinstance(func, str) else FutureWarning with pytest.raises(expected, match=msg): with tm.assert_produces_warning(warn, match="using DataFrame.cumprod"): df.agg(func, axis=axis) -# we should raise a proper TypeError instead of propagating the pyarrow error -@pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" -) @pytest.mark.parametrize( "series, func, expected", chain( @@ -263,11 +260,15 @@ def test_agg_cython_table_raises_series(series, func, expected, using_infer_stri msg = r"Cannot convert \['a' 'b' 'c'\] to numeric" if using_infer_string: - import pyarrow as pa - - expected = (expected, pa.lib.ArrowNotImplementedError) - - msg = msg + "|does not support|has no kernel" + if series.dtype.storage == "pyarrow": + import pyarrow as pa + + # TODO(infer_string) + # should raise a proper TypeError instead of propagating the pyarrow error + expected = (expected, pa.lib.ArrowNotImplementedError) + else: + expected = (expected, NotImplementedError) + msg = msg + "|does not support|has no kernel|Cannot perform|cannot perform" warn = None if isinstance(func, str) else FutureWarning with pytest.raises(expected, match=msg): diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 265b9fc40629b..73e8bde827d50 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -444,7 +444,6 @@ def test_astype_float(dtype, any_float_dtype): @pytest.mark.parametrize("skipna", [True, False]) -@pytest.mark.xfail(reason="Not implemented StringArray.sum") def test_reduce(skipna, dtype): arr = pd.Series(["a", "b", "c"], dtype=dtype) result = arr.sum(skipna=skipna) @@ -452,7 +451,6 @@ def test_reduce(skipna, dtype): @pytest.mark.parametrize("skipna", [True, False]) -@pytest.mark.xfail(reason="Not implemented StringArray.sum") def test_reduce_missing(skipna, dtype): arr = pd.Series([None, "a", None, "b", "c", None], dtype=dtype) result = arr.sum(skipna=skipna) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 60e7bd83432c5..0ce7a66e0e00c 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -459,10 +459,11 @@ def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: pass else: return False + elif pa.types.is_binary(pa_dtype) and op_name == "sum": + return False elif ( pa.types.is_string(pa_dtype) or pa.types.is_binary(pa_dtype) ) and op_name in [ - "sum", "mean", "median", "prod", @@ -553,6 +554,7 @@ def test_reduce_series_boolean( return super().test_reduce_series_boolean(data, all_boolean_reductions, skipna) def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool): + pa_type = arr._pa_array.type if op_name in ["max", "min"]: cmp_dtype = arr.dtype elif arr.dtype.name == "decimal128(7, 3)[pyarrow]": @@ -562,6 +564,8 @@ def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool): cmp_dtype = "float64[pyarrow]" elif op_name in ["median", "var", "std", "mean", "skew"]: cmp_dtype = "float64[pyarrow]" + elif op_name == "sum" and pa.types.is_string(pa_type): + cmp_dtype = arr.dtype else: cmp_dtype = { "i": "int64[pyarrow]", @@ -585,26 +589,6 @@ def test_median_not_approximate(self, typ): result = pd.Series([1, 2], dtype=f"{typ}[pyarrow]").median() assert result == 1.5 - def test_in_numeric_groupby(self, data_for_grouping): - dtype = data_for_grouping.dtype - if is_string_dtype(dtype): - df = pd.DataFrame( - { - "A": [1, 1, 2, 2, 3, 3, 1, 4], - "B": data_for_grouping, - "C": [1, 1, 1, 1, 1, 1, 1, 1], - } - ) - - expected = pd.Index(["C"]) - msg = re.escape(f"agg function failed [how->sum,dtype->{dtype}") - with pytest.raises(TypeError, match=msg): - df.groupby("A").sum() - result = df.groupby("A").sum(numeric_only=True).columns - tm.assert_index_equal(result, expected) - else: - super().test_in_numeric_groupby(data_for_grouping) - def test_construct_from_string_own_name(self, dtype, request): pa_dtype = dtype.pyarrow_dtype if pa.types.is_decimal(pa_dtype): diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 07c3b4224e76f..6af095e33396c 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -191,7 +191,7 @@ def _get_expected_exception( def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: return ( - op_name in ["min", "max"] + op_name in ["min", "max", "sum"] or ser.dtype.na_value is np.nan # type: ignore[union-attr] and op_name in ("any", "all") ) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index a4263279a7bd5..824d53c8d5d13 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -226,7 +226,6 @@ def float_frame_with_na(): class TestDataFrameAnalytics: # --------------------------------------------------------------------- # Reductions - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize( "opname", @@ -246,17 +245,11 @@ class TestDataFrameAnalytics: pytest.param("kurt", marks=td.skip_if_no("scipy")), ], ) - def test_stat_op_api_float_string_frame( - self, float_string_frame, axis, opname, using_infer_string - ): - if ( - (opname in ("sum", "min", "max") and axis == 0) - or opname - in ( - "count", - "nunique", - ) - ) and not (using_infer_string and opname == "sum"): + def test_stat_op_api_float_string_frame(self, float_string_frame, axis, opname): + if (opname in ("sum", "min", "max") and axis == 0) or opname in ( + "count", + "nunique", + ): getattr(float_string_frame, opname)(axis=axis) else: if opname in ["var", "std", "sem", "skew", "kurt"]: @@ -283,10 +276,11 @@ def test_stat_op_api_float_string_frame( msg = "'[><]=' not supported between instances of 'float' and 'str'" elif opname == "median": msg = re.compile( - r"Cannot convert \[.*\] to numeric|does not support", flags=re.S + r"Cannot convert \[.*\] to numeric|does not support|Cannot perform", + flags=re.S, ) if not isinstance(msg, re.Pattern): - msg = msg + "|does not support" + msg = msg + "|does not support|Cannot perform reduction" with pytest.raises(TypeError, match=msg): getattr(float_string_frame, opname)(axis=axis) if opname != "nunique": @@ -432,7 +426,6 @@ def test_stat_operators_attempt_obj_array(self, method, df, axis): expected[expected.isna()] = None tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("op", ["mean", "std", "var", "skew", "kurt", "sem"]) def test_mixed_ops(self, op): # GH#16116 @@ -449,26 +442,16 @@ def test_mixed_ops(self, op): "could not convert", "can't multiply sequence by non-int", "does not support", + "Cannot perform", ] ) with pytest.raises(TypeError, match=msg): getattr(df, op)() with pd.option_context("use_bottleneck", False): - msg = "|".join( - [ - "Could not convert", - "could not convert", - "can't multiply sequence by non-int", - "does not support", - ] - ) with pytest.raises(TypeError, match=msg): getattr(df, op)() - @pytest.mark.xfail( - using_string_dtype(), reason="sum doesn't work for arrow strings" - ) def test_reduce_mixed_frame(self): # GH 6806 df = DataFrame( @@ -610,7 +593,6 @@ def test_sem(self, datetime_frame): result = nanops.nansem(arr, axis=0) assert not (result < 0).any() - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "dropna, expected", [ @@ -632,7 +614,7 @@ def test_sem(self, datetime_frame): "A": [12], "B": [10.0], "C": [np.nan], - "D": np.array([np.nan], dtype=object), + "D": Series([np.nan], dtype="str"), "E": Categorical([np.nan], categories=["a"]), "F": DatetimeIndex([pd.NaT], dtype="M8[ns]"), "G": to_timedelta([pd.NaT]), @@ -674,7 +656,7 @@ def test_mode_dropna(self, dropna, expected): "A": [12, 12, 19, 11], "B": [10, 10, np.nan, 3], "C": [1, np.nan, np.nan, np.nan], - "D": Series([np.nan, np.nan, "a", np.nan], dtype=object), + "D": Series([np.nan, np.nan, "a", np.nan], dtype="str"), "E": Categorical([np.nan, np.nan, "a", np.nan]), "F": DatetimeIndex(["NaT", "2000-01-02", "NaT", "NaT"], dtype="M8[ns]"), "G": to_timedelta(["1 days", "nan", "nan", "nan"]), @@ -694,7 +676,6 @@ def test_mode_dropna(self, dropna, expected): expected = DataFrame(expected) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_mode_sortwarning(self, using_infer_string): # Check for the warning that is raised when the mode # results cannot be sorted @@ -702,8 +683,13 @@ def test_mode_sortwarning(self, using_infer_string): df = DataFrame({"A": [np.nan, np.nan, "a", "a"]}) expected = DataFrame({"A": ["a", np.nan]}) - warning = None if using_infer_string else UserWarning - with tm.assert_produces_warning(warning): + # TODO(infer_string) avoid this UserWarning for python storage + warning = ( + None + if using_infer_string and df.A.dtype.storage == "pyarrow" + else UserWarning + ) + with tm.assert_produces_warning(warning, match="Unable to sort modes"): result = df.mode(dropna=False) result = result.sort_values(by="A").reset_index(drop=True) @@ -1367,13 +1353,10 @@ def test_any_all_extra(self): result = df[["C"]].all(axis=None).item() assert result is True - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize("bool_agg_func", ["any", "all"]) @pytest.mark.parametrize("skipna", [True, False]) - def test_any_all_object_dtype( - self, axis, bool_agg_func, skipna, using_infer_string - ): + def test_any_all_object_dtype(self, axis, bool_agg_func, skipna): # GH#35450 df = DataFrame( data=[ @@ -1383,13 +1366,8 @@ def test_any_all_object_dtype( [np.nan, np.nan, "5", np.nan], ] ) - if using_infer_string: - # na in object is True while in string pyarrow numpy it's false - val = not axis == 0 and not skipna and bool_agg_func == "all" - else: - val = True result = getattr(df, bool_agg_func)(axis=axis, skipna=skipna) - expected = Series([True, True, val, True]) + expected = Series([True, True, True, True]) tm.assert_series_equal(result, expected) # GH#50947 deprecates this but it is not emitting a warning in some builds. @@ -1969,7 +1947,6 @@ def test_sum_timedelta64_skipna_false(using_array_manager, request): tm.assert_series_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="sum doesn't work with arrow strings") def test_mixed_frame_with_integer_sum(): # https://github.com/pandas-dev/pandas/issues/34520 df = DataFrame([["a", 1]], columns=list("ab")) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 57e691b3c508d..13269ea9c0920 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -8,15 +8,12 @@ from pandas._config import using_string_dtype -from pandas.compat import HAS_PYARROW from pandas.errors import ( PerformanceWarning, SpecificationError, ) import pandas.util._test_decorators as td -from pandas.core.dtypes.common import is_string_dtype - import pandas as pd from pandas import ( Categorical, @@ -1744,23 +1741,15 @@ def g(group): tm.assert_series_equal(result, expected) -# TODO harmonize error messages -@pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False -) @pytest.mark.parametrize("grouper", ["A", ["A", "B"]]) -def test_set_group_name(df, grouper, using_infer_string): +def test_set_group_name(df, grouper): def f(group): assert group.name is not None return group def freduce(group): assert group.name is not None - if using_infer_string and grouper == "A" and is_string_dtype(group.dtype): - with pytest.raises(TypeError, match="does not support"): - group.sum() - else: - return group.sum() + return group.sum() def freducex(x): return freduce(x) diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index d5b7a3f25d0eb..4ebb26b0289ec 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -219,7 +219,6 @@ def func(x): getattr(gb, how)(func) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("how", ["agg", "transform"]) @pytest.mark.parametrize("groupby_func_np", [np.sum, np.mean]) def test_groupby_raises_string_np( @@ -236,7 +235,7 @@ def test_groupby_raises_string_np( np.sum: (None, ""), np.mean: ( TypeError, - re.escape("agg function failed [how->mean,dtype->object]"), + "agg function failed|Cannot perform reduction 'mean' with string dtype", ), }[groupby_func_np] diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 690eb6f410798..2aada753e27f4 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -5,7 +5,6 @@ from pandas._config import using_string_dtype from pandas._libs import lib -from pandas.compat import HAS_PYARROW from pandas.core.dtypes.common import ensure_platform_int @@ -512,10 +511,7 @@ def test_transform_nuisance_raises(df, using_infer_string): gbc = grouped["B"] msg = "Could not convert" if using_infer_string: - if df.columns.dtype.storage == "pyarrow": - msg = "with dtype str does not support reduction 'mean'" - else: - msg = "Cannot perform reduction 'mean' with string dtype" + msg = "Cannot perform reduction 'mean' with string dtype" with pytest.raises(TypeError, match=msg): gbc.transform(lambda x: np.mean(x)) @@ -620,10 +616,7 @@ def test_groupby_transform_with_int(using_infer_string): ) msg = "Could not convert" if using_infer_string: - if HAS_PYARROW: - msg = "with dtype str does not support reduction 'mean'" - else: - msg = "Cannot perform reduction 'mean' with string dtype" + msg = "Cannot perform reduction 'mean' with string dtype" with np.errstate(all="ignore"): with pytest.raises(TypeError, match=msg): df.groupby("A").transform(lambda x: (x - x.mean()) / x.std()) diff --git a/pandas/tests/series/test_reductions.py b/pandas/tests/series/test_reductions.py index fcae835e4c3e2..5415f220cadd4 100644 --- a/pandas/tests/series/test_reductions.py +++ b/pandas/tests/series/test_reductions.py @@ -1,10 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - -from pandas.compat import HAS_PYARROW - import pandas as pd from pandas import Series import pandas._testing as tm @@ -167,65 +163,55 @@ def test_validate_stat_keepdims(): np.sum(ser, keepdims=True) -@pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" -) def test_mean_with_convertible_string_raises(using_array_manager, using_infer_string): # GH#44008 ser = Series(["1", "2"]) - if using_infer_string: - msg = "does not support" - with pytest.raises(TypeError, match=msg): - ser.sum() - else: - assert ser.sum() == "12" - msg = "Could not convert string '12' to numeric|does not support" + assert ser.sum() == "12" + + msg = "Could not convert string '12' to numeric|does not support|Cannot perform" with pytest.raises(TypeError, match=msg): ser.mean() df = ser.to_frame() if not using_array_manager: - msg = r"Could not convert \['12'\] to numeric|does not support" + msg = r"Could not convert \['12'\] to numeric|does not support|Cannot perform" with pytest.raises(TypeError, match=msg): df.mean() -@pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" -) def test_mean_dont_convert_j_to_complex(using_array_manager): # GH#36703 df = pd.DataFrame([{"db": "J", "numeric": 123}]) if using_array_manager: msg = "Could not convert string 'J' to numeric" else: - msg = r"Could not convert \['J'\] to numeric|does not support" + msg = r"Could not convert \['J'\] to numeric|does not support|Cannot perform" with pytest.raises(TypeError, match=msg): df.mean() with pytest.raises(TypeError, match=msg): df.agg("mean") - msg = "Could not convert string 'J' to numeric|does not support" + msg = "Could not convert string 'J' to numeric|does not support|Cannot perform" with pytest.raises(TypeError, match=msg): df["db"].mean() - msg = "Could not convert string 'J' to numeric|ufunc 'divide'" + msg = "Could not convert string 'J' to numeric|ufunc 'divide'|Cannot perform" with pytest.raises(TypeError, match=msg): np.mean(df["db"].astype("string").array) -@pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" -) def test_median_with_convertible_string_raises(using_array_manager): # GH#34671 this _could_ return a string "2", but definitely not float 2.0 - msg = r"Cannot convert \['1' '2' '3'\] to numeric|does not support" + msg = r"Cannot convert \['1' '2' '3'\] to numeric|does not support|Cannot perform" ser = Series(["1", "2", "3"]) with pytest.raises(TypeError, match=msg): ser.median() if not using_array_manager: - msg = r"Cannot convert \[\['1' '2' '3'\]\] to numeric|does not support" + msg = ( + r"Cannot convert \[\['1' '2' '3'\]\] to numeric|does not support" + "|Cannot perform" + ) df = ser.to_frame() with pytest.raises(TypeError, match=msg): df.median()