def fallback_performancewarning(version: str | None = None):
    """
    Emit a PerformanceWarning announcing a fall back to a non-pyarrow
    code path.

    Parameters
    ----------
    version : str, optional
        Minimum pyarrow version to mention in the message. When given, an
        upgrade hint naming this version is appended to the base message;
        when None, only the base message is used.
    """
    # The hint is only meaningful when the caller names a pyarrow version.
    upgrade_hint = (
        ""
        if version is None
        else f" Upgrade to pyarrow >={version} to possibly suppress this warning."
    )
    warnings.warn(
        "Falling back on a non-pyarrow code path which may decrease performance."
        + upgrade_hint,
        PerformanceWarning,
        stacklevel=find_stack_level(),
    )
pc.replace_substring @@ -494,6 +502,7 @@ def _str_match( self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None ): if pa_version_under4p0: + fallback_performancewarning(version="4") return super()._str_match(pat, case, flags, na) if not pat.startswith("^"): @@ -504,6 +513,7 @@ def _str_fullmatch( self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None ): if pa_version_under4p0: + fallback_performancewarning(version="4") return super()._str_fullmatch(pat, case, flags, na) if not pat.endswith("$") or pat.endswith("//$"): @@ -536,6 +546,7 @@ def _str_isnumeric(self): def _str_isspace(self): if pa_version_under2p0: + fallback_performancewarning(version="2") return super()._str_isspace() result = pc.utf8_is_space(self._data) @@ -551,6 +562,7 @@ def _str_isupper(self): def _str_len(self): if pa_version_under4p0: + fallback_performancewarning(version="4") return super()._str_len() result = pc.utf8_length(self._data) @@ -564,6 +576,7 @@ def _str_upper(self): def _str_strip(self, to_strip=None): if pa_version_under4p0: + fallback_performancewarning(version="4") return super()._str_strip(to_strip) if to_strip is None: @@ -574,6 +587,7 @@ def _str_strip(self, to_strip=None): def _str_lstrip(self, to_strip=None): if pa_version_under4p0: + fallback_performancewarning(version="4") return super()._str_lstrip(to_strip) if to_strip is None: @@ -584,6 +598,7 @@ def _str_lstrip(self, to_strip=None): def _str_rstrip(self, to_strip=None): if pa_version_under4p0: + fallback_performancewarning(version="4") return super()._str_rstrip(to_strip) if to_strip is None: diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 59b9d2f2f8908..c21319f6de6ef 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -2,9 +2,13 @@ This module tests the functionality of StringArray and ArrowStringArray. 
Tests for the str accessors are in pandas/tests/strings/test_string_array.py """ +from contextlib import nullcontext + import numpy as np import pytest +from pandas.compat import pa_version_under2p0 +from pandas.errors import PerformanceWarning import pandas.util._test_decorators as td from pandas.core.dtypes.common import is_dtype_equal @@ -14,6 +18,13 @@ from pandas.core.arrays.string_arrow import ArrowStringArray +def maybe_perf_warn(using_pyarrow): + if using_pyarrow: + return tm.assert_produces_warning(PerformanceWarning, match="Falling back") + else: + return nullcontext() + + @pytest.fixture def dtype(string_storage): return pd.StringDtype(storage=string_storage) @@ -557,18 +568,22 @@ def test_to_numpy_na_value(dtype, nulls_fixture): def test_isin(dtype, fixed_now_ts): s = pd.Series(["a", "b", None], dtype=dtype) - result = s.isin(["a", "c"]) + with maybe_perf_warn(dtype == "pyarrow" and pa_version_under2p0): + result = s.isin(["a", "c"]) expected = pd.Series([True, False, False]) tm.assert_series_equal(result, expected) - result = s.isin(["a", pd.NA]) + with maybe_perf_warn(dtype == "pyarrow" and pa_version_under2p0): + result = s.isin(["a", pd.NA]) expected = pd.Series([True, False, True]) tm.assert_series_equal(result, expected) - result = s.isin([]) + with maybe_perf_warn(dtype == "pyarrow" and pa_version_under2p0): + result = s.isin([]) expected = pd.Series([False, False, False]) tm.assert_series_equal(result, expected) - result = s.isin(["a", fixed_now_ts]) + with maybe_perf_warn(dtype == "pyarrow" and pa_version_under2p0): + result = s.isin(["a", fixed_now_ts]) expected = pd.Series([True, False, False]) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index 067bcf5969587..c1d96ca7993e1 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -1,9 +1,13 @@ +from contextlib import nullcontext from datetime import 
def maybe_perf_warn(using_pyarrow):
    """
    Pick the context manager for a call that may fall back from pyarrow.

    Parameters
    ----------
    using_pyarrow : bool
        Truthy when the wrapped call is expected to emit the
        "Falling back" PerformanceWarning.

    Returns
    -------
    context manager
        ``tm.assert_produces_warning(PerformanceWarning, match="Falling back")``
        when ``using_pyarrow`` is truthy, otherwise ``nullcontext()``.
    """
    if not using_pyarrow:
        # No fallback expected: run the wrapped code without a warning check.
        return nullcontext()
    return tm.assert_produces_warning(PerformanceWarning, match="Falling back")
np.array(["foo", "xyz", "fooommm__foo", "mmm_"], dtype=np.object_), dtype=any_string_dtype, ) - result = values.str.contains(pat) + with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + result = values.str.contains(pat) expected = Series(np.array([False, False, True, True]), dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -147,7 +162,10 @@ def test_contains_na_kwarg_for_nullable_string_dtype( # https://github.com/pandas-dev/pandas/pull/41025#issuecomment-824062416 values = Series(["a", "b", "c", "a", np.nan], dtype=nullable_string_dtype) - result = values.str.contains("a", na=na, regex=regex) + with maybe_perf_warn( + nullable_string_dtype == "string[pyarrow]" and pa_version_under4p0 and regex + ): + result = values.str.contains("a", na=na, regex=regex) expected = Series([True, False, False, True, expected], dtype="boolean") tm.assert_series_equal(result, expected) @@ -167,28 +185,32 @@ def test_contains_moar(any_string_dtype): ) tm.assert_series_equal(result, expected) - result = s.str.contains("a", case=False) + with maybe_perf_warn(any_string_dtype == "string[pyarrow]"): + result = s.str.contains("a", case=False) expected = Series( [True, False, False, True, True, False, np.nan, True, False, True], dtype=expected_dtype, ) tm.assert_series_equal(result, expected) - result = s.str.contains("Aa") + with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + result = s.str.contains("Aa") expected = Series( [False, False, False, True, False, False, np.nan, False, False, False], dtype=expected_dtype, ) tm.assert_series_equal(result, expected) - result = s.str.contains("ba") + with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + result = s.str.contains("ba") expected = Series( [False, False, False, True, False, False, np.nan, False, False, False], dtype=expected_dtype, ) tm.assert_series_equal(result, expected) - result = s.str.contains("ba", case=False) + 
with maybe_perf_warn(any_string_dtype == "string[pyarrow]"): + result = s.str.contains("ba", case=False) expected = Series( [False, False, False, True, True, False, np.nan, True, False, False], dtype=expected_dtype, @@ -200,23 +222,27 @@ def test_contains_nan(any_string_dtype): # PR #14171 s = Series([np.nan, np.nan, np.nan], dtype=any_string_dtype) - result = s.str.contains("foo", na=False) + with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + result = s.str.contains("foo", na=False) expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean" expected = Series([False, False, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) - result = s.str.contains("foo", na=True) + with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + result = s.str.contains("foo", na=True) expected = Series([True, True, True], dtype=expected_dtype) tm.assert_series_equal(result, expected) - result = s.str.contains("foo", na="foo") + with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + result = s.str.contains("foo", na="foo") if any_string_dtype == "object": expected = Series(["foo", "foo", "foo"], dtype=np.object_) else: expected = Series([True, True, True], dtype="boolean") tm.assert_series_equal(result, expected) - result = s.str.contains("foo") + with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + result = s.str.contains("foo") expected_dtype = "object" if any_string_dtype == "object" else "boolean" expected = Series([np.nan, np.nan, np.nan], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -261,13 +287,19 @@ def test_startswith_nullable_string_dtype(nullable_string_dtype, na): ["om", None, "foo_nom", "nom", "bar_foo", None, "foo", "regex", "rege."], dtype=nullable_string_dtype, ) - result = values.str.startswith("foo", na=na) + with maybe_perf_warn( + nullable_string_dtype == "string[pyarrow]" and 
pa_version_under4p0 + ): + result = values.str.startswith("foo", na=na) exp = Series( [False, na, True, False, False, na, True, False, False], dtype="boolean" ) tm.assert_series_equal(result, exp) - result = values.str.startswith("rege.", na=na) + with maybe_perf_warn( + nullable_string_dtype == "string[pyarrow]" and pa_version_under4p0 + ): + result = values.str.startswith("rege.", na=na) exp = Series( [False, na, False, False, False, na, False, False, True], dtype="boolean" ) @@ -313,13 +345,19 @@ def test_endswith_nullable_string_dtype(nullable_string_dtype, na): ["om", None, "foo_nom", "nom", "bar_foo", None, "foo", "regex", "rege."], dtype=nullable_string_dtype, ) - result = values.str.endswith("foo", na=na) + with maybe_perf_warn( + nullable_string_dtype == "string[pyarrow]" and pa_version_under4p0 + ): + result = values.str.endswith("foo", na=na) exp = Series( [False, na, False, False, True, na, True, False, False], dtype="boolean" ) tm.assert_series_equal(result, exp) - result = values.str.endswith("rege.", na=na) + with maybe_perf_warn( + nullable_string_dtype == "string[pyarrow]" and pa_version_under4p0 + ): + result = values.str.endswith("rege.", na=na) exp = Series( [False, na, False, False, False, na, False, False, True], dtype="boolean" ) @@ -334,7 +372,8 @@ def test_endswith_nullable_string_dtype(nullable_string_dtype, na): def test_replace(any_string_dtype): ser = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype) - result = ser.str.replace("BAD[_]*", "", regex=True) + with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + result = ser.str.replace("BAD[_]*", "", regex=True) expected = Series(["foobar", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -343,11 +382,13 @@ def test_replace_max_replacements(any_string_dtype): ser = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype) expected = Series(["foobarBAD", np.nan], dtype=any_string_dtype) - result = 
ser.str.replace("BAD[_]*", "", n=1, regex=True) + with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + result = ser.str.replace("BAD[_]*", "", n=1, regex=True) tm.assert_series_equal(result, expected) expected = Series(["foo__barBAD", np.nan], dtype=any_string_dtype) - result = ser.str.replace("BAD", "", n=1, regex=False) + with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + result = ser.str.replace("BAD", "", n=1, regex=False) tm.assert_series_equal(result, expected) @@ -363,7 +404,8 @@ def test_replace_mixed_object(): def test_replace_unicode(any_string_dtype): ser = Series([b"abcd,\xc3\xa0".decode("utf-8")], dtype=any_string_dtype) expected = Series([b"abcd, \xc3\xa0".decode("utf-8")], dtype=any_string_dtype) - result = ser.str.replace(r"(?<=\w),(?=\w)", ", ", flags=re.UNICODE, regex=True) + with maybe_perf_warn(any_string_dtype == "string[pyarrow]"): + result = ser.str.replace(r"(?<=\w),(?=\w)", ", ", flags=re.UNICODE, regex=True) tm.assert_series_equal(result, expected) @@ -383,7 +425,8 @@ def test_replace_callable(any_string_dtype): # test with callable repl = lambda m: m.group(0).swapcase() - result = ser.str.replace("[a-z][A-Z]{2}", repl, n=2, regex=True) + with maybe_perf_warn(any_string_dtype == "string[pyarrow]"): + result = ser.str.replace("[a-z][A-Z]{2}", repl, n=2, regex=True) expected = Series(["foObaD__baRbaD", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -401,7 +444,8 @@ def test_replace_callable_raises(any_string_dtype, repl): r"(?(3)required )positional arguments?" 
) with pytest.raises(TypeError, match=msg): - values.str.replace("a", repl) + with maybe_perf_warn(any_string_dtype == "string[pyarrow]"): + values.str.replace("a", repl) def test_replace_callable_named_groups(any_string_dtype): @@ -409,7 +453,8 @@ def test_replace_callable_named_groups(any_string_dtype): ser = Series(["Foo Bar Baz", np.nan], dtype=any_string_dtype) pat = r"(?P\w+) (?P\w+) (?P\w+)" repl = lambda m: m.group("middle").swapcase() - result = ser.str.replace(pat, repl, regex=True) + with maybe_perf_warn(any_string_dtype == "string[pyarrow]"): + result = ser.str.replace(pat, repl, regex=True) expected = Series(["bAR", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -420,11 +465,13 @@ def test_replace_compiled_regex(any_string_dtype): # test with compiled regex pat = re.compile(r"BAD_*") - result = ser.str.replace(pat, "", regex=True) + with maybe_perf_warn(any_string_dtype == "string[pyarrow]"): + result = ser.str.replace(pat, "", regex=True) expected = Series(["foobar", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) - result = ser.str.replace(pat, "", n=1, regex=True) + with maybe_perf_warn(any_string_dtype == "string[pyarrow]"): + result = ser.str.replace(pat, "", n=1, regex=True) expected = Series(["foobarBAD", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -443,7 +490,8 @@ def test_replace_compiled_regex_unicode(any_string_dtype): ser = Series([b"abcd,\xc3\xa0".decode("utf-8")], dtype=any_string_dtype) expected = Series([b"abcd, \xc3\xa0".decode("utf-8")], dtype=any_string_dtype) pat = re.compile(r"(?<=\w),(?=\w)", flags=re.UNICODE) - result = ser.str.replace(pat, ", ") + with maybe_perf_warn(any_string_dtype == "string[pyarrow]"): + result = ser.str.replace(pat, ", ") tm.assert_series_equal(result, expected) @@ -470,7 +518,8 @@ def test_replace_compiled_regex_callable(any_string_dtype): ser = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype) repl = lambda 
m: m.group(0).swapcase() pat = re.compile("[a-z][A-Z]{2}") - result = ser.str.replace(pat, repl, n=2) + with maybe_perf_warn(any_string_dtype == "string[pyarrow]"): + result = ser.str.replace(pat, repl, n=2) expected = Series(["foObaD__baRbaD", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -482,7 +531,8 @@ def test_replace_literal(regex, expected, any_string_dtype): # GH16808 literal replace (regex=False vs regex=True) ser = Series(["f.o", "foo", np.nan], dtype=any_string_dtype) expected = Series(expected, dtype=any_string_dtype) - result = ser.str.replace("f.", "ba", regex=regex) + with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + result = ser.str.replace("f.", "ba", regex=regex) tm.assert_series_equal(result, expected) @@ -518,7 +568,8 @@ def test_replace_moar(any_string_dtype): ) tm.assert_series_equal(result, expected) - result = ser.str.replace("A", "YYY", case=False) + with maybe_perf_warn(any_string_dtype == "string[pyarrow]"): + result = ser.str.replace("A", "YYY", case=False) expected = Series( [ "YYY", @@ -536,7 +587,8 @@ def test_replace_moar(any_string_dtype): ) tm.assert_series_equal(result, expected) - result = ser.str.replace("^.a|dog", "XX-XX ", case=False, regex=True) + with maybe_perf_warn(any_string_dtype == "string[pyarrow]"): + result = ser.str.replace("^.a|dog", "XX-XX ", case=False, regex=True) expected = Series( [ "A", @@ -559,11 +611,13 @@ def test_replace_not_case_sensitive_not_regex(any_string_dtype): # https://github.com/pandas-dev/pandas/issues/41602 ser = Series(["A.", "a.", "Ab", "ab", np.nan], dtype=any_string_dtype) - result = ser.str.replace("a", "c", case=False, regex=False) + with maybe_perf_warn(any_string_dtype == "string[pyarrow]"): + result = ser.str.replace("a", "c", case=False, regex=False) expected = Series(["c.", "c.", "cb", "cb", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) - result = ser.str.replace("a.", "c.", case=False, 
regex=False) + with maybe_perf_warn(any_string_dtype == "string[pyarrow]"): + result = ser.str.replace("a.", "c.", case=False, regex=False) expected = Series(["c.", "c.", "Ab", "ab", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -575,7 +629,12 @@ def test_replace_regex_default_warning(any_string_dtype): "The default value of regex will change from True to False in a " "future version\\.$" ) - with tm.assert_produces_warning(FutureWarning, match=msg): + + with tm.assert_produces_warning( + FutureWarning, + match=msg, + raise_on_extra_warnings=any_string_dtype != "string[pyarrow]", + ): result = s.str.replace("^.$", "a") expected = Series(["a", "a", "ac", np.nan, ""], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -596,7 +655,10 @@ def test_replace_regex_single_character(regex, any_string_dtype): "version. In addition, single character regular expressions will *not* " "be treated as literal strings when regex=True." ) - with tm.assert_produces_warning(FutureWarning, match=msg): + pyarrow_warn = any_string_dtype == "string[pyarrow]" and pa_version_under4p0 + with tm.assert_produces_warning( + FutureWarning, match=msg, raise_on_extra_warnings=not pyarrow_warn + ): result = s.str.replace(".", "a", regex=regex) else: result = s.str.replace(".", "a", regex=regex) @@ -615,29 +677,34 @@ def test_match(any_string_dtype): expected_dtype = "object" if any_string_dtype == "object" else "boolean" values = Series(["fooBAD__barBAD", np.nan, "foo"], dtype=any_string_dtype) - result = values.str.match(".*(BAD[_]+).*(BAD)") + with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + result = values.str.match(".*(BAD[_]+).*(BAD)") expected = Series([True, np.nan, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) values = Series( ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype ) - result = values.str.match(".*BAD[_]+.*BAD") + with 
maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + result = values.str.match(".*BAD[_]+.*BAD") expected = Series([True, True, np.nan, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) - result = values.str.match("BAD[_]+.*BAD") + with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + result = values.str.match("BAD[_]+.*BAD") expected = Series([False, True, np.nan, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) values = Series( ["fooBAD__barBAD", "^BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype ) - result = values.str.match("^BAD[_]+.*BAD") + with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + result = values.str.match("^BAD[_]+.*BAD") expected = Series([False, False, np.nan, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) - result = values.str.match("\\^BAD[_]+.*BAD") + with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + result = values.str.match("\\^BAD[_]+.*BAD") expected = Series([False, True, np.nan, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -668,12 +735,14 @@ def test_match_na_kwarg(any_string_dtype): # GH #6609 s = Series(["a", "b", np.nan], dtype=any_string_dtype) - result = s.str.match("a", na=False) + with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + result = s.str.match("a", na=False) expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean" expected = Series([True, False, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) - result = s.str.match("a") + with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + result = s.str.match("a") expected_dtype = "object" if any_string_dtype == "object" else "boolean" expected = Series([True, False, np.nan], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ 
-681,7 +750,8 @@ def test_match_na_kwarg(any_string_dtype): def test_match_case_kwarg(any_string_dtype): values = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype) - result = values.str.match("ab", case=False) + with maybe_perf_warn(any_string_dtype == "string[pyarrow]"): + result = values.str.match("ab", case=False) expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean" expected = Series([True, True, True, True], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -697,7 +767,8 @@ def test_fullmatch(any_string_dtype): ser = Series( ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype ) - result = ser.str.fullmatch(".*BAD[_]+.*BAD") + with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + result = ser.str.fullmatch(".*BAD[_]+.*BAD") expected_dtype = "object" if any_string_dtype == "object" else "boolean" expected = Series([True, False, np.nan, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -707,7 +778,8 @@ def test_fullmatch_na_kwarg(any_string_dtype): ser = Series( ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype ) - result = ser.str.fullmatch(".*BAD[_]+.*BAD", na=False) + with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + result = ser.str.fullmatch(".*BAD[_]+.*BAD", na=False) expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean" expected = Series([True, False, False, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -719,15 +791,18 @@ def test_fullmatch_case_kwarg(any_string_dtype): expected = Series([True, False, False, False], dtype=expected_dtype) - result = ser.str.fullmatch("ab", case=True) + with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + result = ser.str.fullmatch("ab", case=True) tm.assert_series_equal(result, expected) expected = Series([True, True, False, False], dtype=expected_dtype) 
- result = ser.str.fullmatch("ab", case=False) + with maybe_perf_warn(any_string_dtype == "string[pyarrow]"): + result = ser.str.fullmatch("ab", case=False) tm.assert_series_equal(result, expected) - result = ser.str.fullmatch("ab", flags=re.IGNORECASE) + with maybe_perf_warn(any_string_dtype == "string[pyarrow]"): + result = ser.str.fullmatch("ab", flags=re.IGNORECASE) tm.assert_series_equal(result, expected) @@ -904,13 +979,17 @@ def test_flags_kwarg(any_string_dtype): pat = r"([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})" + using_pyarrow = any_string_dtype == "string[pyarrow]" + result = data.str.extract(pat, flags=re.IGNORECASE, expand=True) assert result.iloc[0].tolist() == ["dave", "google", "com"] - result = data.str.match(pat, flags=re.IGNORECASE) + with maybe_perf_warn(using_pyarrow): + result = data.str.match(pat, flags=re.IGNORECASE) assert result[0] - result = data.str.fullmatch(pat, flags=re.IGNORECASE) + with maybe_perf_warn(using_pyarrow): + result = data.str.fullmatch(pat, flags=re.IGNORECASE) assert result[0] result = data.str.findall(pat, flags=re.IGNORECASE) @@ -920,6 +999,8 @@ def test_flags_kwarg(any_string_dtype): assert result[0] == 1 msg = "has match groups" - with tm.assert_produces_warning(UserWarning, match=msg): + with tm.assert_produces_warning( + UserWarning, match=msg, raise_on_extra_warnings=not using_pyarrow + ): result = data.str.contains(pat, flags=re.IGNORECASE) assert result[0] diff --git a/pandas/tests/strings/test_string_array.py b/pandas/tests/strings/test_string_array.py index 90c26a747abdd..8628aafefa4b1 100644 --- a/pandas/tests/strings/test_string_array.py +++ b/pandas/tests/strings/test_string_array.py @@ -10,6 +10,7 @@ ) +@pytest.mark.filterwarnings("ignore:Falling back") def test_string_array(nullable_string_dtype, any_string_method): method_name, args, kwargs = any_string_method diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index b061ef0bd8a15..6d6d69280b9dd 100644 --- 
def maybe_perf_warn(using_pyarrow):
    """
    Build the context manager used around calls that might warn about a
    pyarrow fallback.

    Parameters
    ----------
    using_pyarrow : bool
        Truthy when the wrapped call is expected to emit the
        "Falling back" PerformanceWarning.

    Returns
    -------
    context manager
        A warning-asserting context when ``using_pyarrow`` is truthy,
        otherwise a do-nothing ``nullcontext()``.
    """
    return (
        tm.assert_produces_warning(PerformanceWarning, match="Falling back")
        if using_pyarrow
        else nullcontext()
    )
tm.assert_series_equal(empty_bool, empty.str.match("^a")) + with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + tm.assert_series_equal(empty_bool, empty.str.match("^a")) tm.assert_frame_equal( DataFrame(columns=[0], dtype=any_string_dtype), empty.str.extract("()", expand=True), @@ -199,7 +218,8 @@ def test_empty_str_methods(any_string_dtype): ) tm.assert_frame_equal(empty_df, empty.str.get_dummies()) tm.assert_series_equal(empty_str, empty_str.str.join("")) - tm.assert_series_equal(empty_int, empty.str.len()) + with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + tm.assert_series_equal(empty_int, empty.str.len()) tm.assert_series_equal(empty_object, empty_str.str.findall("a")) tm.assert_series_equal(empty_int, empty.str.find("a")) tm.assert_series_equal(empty_int, empty.str.rfind("a")) @@ -213,9 +233,12 @@ def test_empty_str_methods(any_string_dtype): tm.assert_frame_equal(empty_df, empty.str.rpartition("a")) tm.assert_series_equal(empty_str, empty.str.slice(stop=1)) tm.assert_series_equal(empty_str, empty.str.slice(step=1)) - tm.assert_series_equal(empty_str, empty.str.strip()) - tm.assert_series_equal(empty_str, empty.str.lstrip()) - tm.assert_series_equal(empty_str, empty.str.rstrip()) + with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + tm.assert_series_equal(empty_str, empty.str.strip()) + with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + tm.assert_series_equal(empty_str, empty.str.lstrip()) + with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + tm.assert_series_equal(empty_str, empty.str.rstrip()) tm.assert_series_equal(empty_str, empty.str.wrap(42)) tm.assert_series_equal(empty_str, empty.str.get(0)) tm.assert_series_equal(empty_object, empty_bytes.str.decode("ascii")) @@ -224,7 +247,8 @@ def test_empty_str_methods(any_string_dtype): tm.assert_series_equal(empty_bool, 
empty.str.isalnum()) tm.assert_series_equal(empty_bool, empty.str.isalpha()) tm.assert_series_equal(empty_bool, empty.str.isdigit()) - tm.assert_series_equal(empty_bool, empty.str.isspace()) + with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under2p0): + tm.assert_series_equal(empty_bool, empty.str.isspace()) tm.assert_series_equal(empty_bool, empty.str.islower()) tm.assert_series_equal(empty_bool, empty.str.isupper()) tm.assert_series_equal(empty_bool, empty.str.istitle()) @@ -275,7 +299,12 @@ def test_ismethods(method, expected, any_string_dtype): ) expected_dtype = "bool" if any_string_dtype == "object" else "boolean" expected = Series(expected, dtype=expected_dtype) - result = getattr(ser.str, method)() + with maybe_perf_warn( + any_string_dtype == "string[pyarrow]" + and pa_version_under2p0 + and method == "isspace" + ): + result = getattr(ser.str, method)() tm.assert_series_equal(result, expected) # compare with standard library @@ -345,7 +374,8 @@ def test_len(any_string_dtype): ["foo", "fooo", "fooooo", np.nan, "fooooooo", "foo\n", "あ"], dtype=any_string_dtype, ) - result = ser.str.len() + with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + result = ser.str.len() expected_dtype = "float64" if any_string_dtype == "object" else "Int64" expected = Series([3, 4, 6, np.nan, 8, 4, 1], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -432,7 +462,8 @@ def test_pipe_failures(any_string_dtype): expected = Series([["A", "B", "C"]], dtype=object) tm.assert_series_equal(result, expected) - result = ser.str.replace("|", " ", regex=False) + with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + result = ser.str.replace("|", " ", regex=False) expected = Series(["A B C"], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -534,7 +565,8 @@ def test_strip_lstrip_rstrip_mixed_object(method, exp): def test_strip_lstrip_rstrip_args(any_string_dtype, 
method, exp): ser = Series(["xxABCxx", "xx BNSD", "LDFJH xx"], dtype=any_string_dtype) - result = getattr(ser.str, method)("x") + with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + result = getattr(ser.str, method)("x") expected = Series(exp, dtype=any_string_dtype) tm.assert_series_equal(result, expected)