From 8eb75e2c9365eef2c7e6b110af171dd6b0d85e27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Sat, 6 Sep 2025 18:55:44 -0300 Subject: [PATCH 1/4] fix(arrow): fix `str.replace` behaviour for named group --- pandas/core/arrays/_arrow_string_mixins.py | 14 ++++- pandas/core/arrays/string_arrow.py | 12 ++++- pandas/tests/strings/test_find_replace.py | 62 ++++++++++++++++++++++ 3 files changed, 85 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index ad118d6be6b18..90fecd8ee692a 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -174,10 +174,20 @@ def _str_replace( flags: int = 0, regex: bool = True, ) -> Self: - if isinstance(pat, re.Pattern) or callable(repl) or not case or flags: + if ( + isinstance(pat, re.Pattern) + or callable(repl) + or not case + or flags + or ( + isinstance(repl, str) + and (r"\g<" in repl or re.search(r"\\\d", repl) is not None) + ) + ): raise NotImplementedError( "replace is not supported with a re.Pattern, callable repl, " - "case=False, or flags!=0" + "case=False, flags!=0, or when the replacement string contains " + "named group references (\\g<...>, \\d+)" ) func = pc.replace_substring_regex if regex else pc.replace_substring diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index e396ce91a293a..226e62cfd0d54 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -423,7 +423,17 @@ def _str_replace( flags: int = 0, regex: bool = True, ): - if isinstance(pat, re.Pattern) or callable(repl) or not case or flags: + if ( + isinstance(pat, re.Pattern) + or callable(repl) + or not case + or flags + or ( # substitution contains a named group pattern + # https://docs.python.org/3/library/re.html + isinstance(repl, str) + and (r"\g<" in repl or re.search(r"\\\d", repl) is not None) + ) + ): return 
super()._str_replace(pat, repl, n, case, flags, regex) return ArrowStringArrayMixin._str_replace( diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index 093aa1aac27e2..027db8f5e9ec0 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -592,6 +592,68 @@ def test_replace_callable_raises(any_string_dtype, repl): values.str.replace("a", repl, regex=True) +@pytest.mark.parametrize( + "repl, expected_list", + [ + ( + r"\g<three> \g<two> \g<one>", + ["Three Two One", "Baz Bar Foo"], + ), + ( + r"\g<3> \g<2> \g<1>", + ["Three Two One", "Baz Bar Foo"], + ), + ( + r"\g<2>0", + ["Two0", "Bar0"], + ), + ( + r"\g<2>0 \1", + ["Two0 One", "Bar0 Foo"], + ), + ], + ids=[ + "named_groups_full_swap", + "numbered_groups_full_swap", + "single_group_with_literal", + "mixed_group_reference_with_literal", + ], +) +@pytest.mark.parametrize("use_compile", [True, False]) +def test_replace_named_groups_regex_swap( + any_string_dtype, use_compile, repl, expected_list +): + # GH#57636 + ser = Series(["One Two Three", "Foo Bar Baz"], dtype=any_string_dtype) + pattern = r"(?P<one>\w+) (?P<two>\w+) (?P<three>\w+)" + if use_compile: + pattern = re.compile(pattern) + result = ser.str.replace(pattern, repl, regex=True) + expected = Series(expected_list, dtype=any_string_dtype) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "repl", + [ + r"\g<20>", + r"\20", + ], +) +@pytest.mark.parametrize("use_compile", [True, False]) +def test_replace_named_groups_regex_swap_expected_fail( + any_string_dtype, repl, use_compile +): + # GH#57636 + pattern = r"(?P<one>\w+) (?P<two>\w+) (?P<three>\w+)" + if use_compile: + pattern = re.compile(pattern) + ser = Series(["One Two Three", "Foo Bar Baz"], dtype=any_string_dtype) + + with pytest.raises(re.error, match="invalid group reference"): + ser.str.replace(pattern, repl, regex=True) + + def test_replace_callable_named_groups(any_string_dtype): # test regex named groups ser = Series(["Foo
Bar Baz", np.nan], dtype=any_string_dtype) From 2dd8c2327cc4fb4f3748c03bd6568ff3f9d2c18f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Sat, 6 Sep 2025 22:13:43 -0300 Subject: [PATCH 2/4] doc: add entry in whatsnew --- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 7ec50137c3039..a298e2ab0e619 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -925,6 +925,7 @@ Conversion Strings ^^^^^^^ +- Bug in :meth:`Series.str.replace` using named capture groups (e.g., ``\g<name>``) with ``string[pyarrow]`` dtype would raise an error (:issue:`57636`) - Bug in :meth:`Series.value_counts` would not respect ``sort=False`` for series having ``string`` dtype (:issue:`55224`) - From 8a2b77b38a89543c5847e6c7f869faa2790633d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Fri, 12 Sep 2025 14:38:51 -0300 Subject: [PATCH 3/4] docs: move whatsnew to `v2.3.3` --- doc/source/whatsnew/v2.3.3.rst | 1 + doc/source/whatsnew/v3.0.0.rst | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.3.3.rst b/doc/source/whatsnew/v2.3.3.rst index cbde6f52d4472..75449db09fd81 100644 --- a/doc/source/whatsnew/v2.3.3.rst +++ b/doc/source/whatsnew/v2.3.3.rst @@ -22,6 +22,7 @@ become the default string dtype in pandas 3.0.
See Bug fixes ^^^^^^^^^ +- Fix bug in :meth:`Series.str.replace` using named capture groups (e.g., ``\g<name>``) with ``string[pyarrow]`` dtype would raise an error (:issue:`57636`) - Fix regression in ``~Series.str.contains``, ``~Series.str.match`` and ``~Series.str.fullmatch`` with a compiled regex and custom flags (:issue:`62240`) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index a298e2ab0e619..7ec50137c3039 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -925,7 +925,6 @@ Conversion Strings ^^^^^^^ -- Bug in :meth:`Series.str.replace` using named capture groups (e.g., ``\g<name>``) with ``string[pyarrow]`` dtype would raise an error (:issue:`57636`) - Bug in :meth:`Series.value_counts` would not respect ``sort=False`` for series having ``string`` dtype (:issue:`55224`) - From 5c96aa24e07aba84955a07ec064a6d09f55bf5b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Fri, 12 Sep 2025 15:05:34 -0300 Subject: [PATCH 4/4] docs: replace `string[pyarrow]` with Arrow-backed in whatsnew --- doc/source/whatsnew/v2.3.3.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.3.3.rst b/doc/source/whatsnew/v2.3.3.rst index 75449db09fd81..aaed7544d9975 100644 --- a/doc/source/whatsnew/v2.3.3.rst +++ b/doc/source/whatsnew/v2.3.3.rst @@ -22,7 +22,7 @@ become the default string dtype in pandas 3.0. See Bug fixes ^^^^^^^^^ -- Fix bug in :meth:`Series.str.replace` using named capture groups (e.g., ``\g<name>``) with ``string[pyarrow]`` dtype would raise an error (:issue:`57636`) +- Fix bug in :meth:`Series.str.replace` using named capture groups (e.g., ``\g<name>``) with the Arrow-backed dtype would raise an error (:issue:`57636`) - Fix regression in ``~Series.str.contains``, ``~Series.str.match`` and ``~Series.str.fullmatch`` with a compiled regex and custom flags (:issue:`62240`)