From b9c04bb69fbe6736f762e7065052b1a376f9294e Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Wed, 13 Dec 2017 14:43:43 +0200 Subject: [PATCH 1/2] bpo-32308: Replace empty matches adjacent to a previous non-empty match in re.sub(). --- Doc/howto/regex.rst | 4 ++-- Doc/library/re.rst | 14 +++++++---- Doc/whatsnew/3.7.rst | 13 ++++++++--- Lib/test/test_re.py | 23 ++++++++----------- .../2017-12-13-20-31-30.bpo-32308.CUbsb2.rst | 2 ++ Modules/_sre.c | 4 ++-- 6 files changed, 35 insertions(+), 25 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2017-12-13-20-31-30.bpo-32308.CUbsb2.rst diff --git a/Doc/howto/regex.rst b/Doc/howto/regex.rst index fa8c6939408100..87a6b1aba59f9f 100644 --- a/Doc/howto/regex.rst +++ b/Doc/howto/regex.rst @@ -1140,12 +1140,12 @@ new string value and the number of replacements that were performed:: >>> p.subn('colour', 'no colours at all') ('no colours at all', 0) -Empty matches are replaced only when they're not adjacent to a previous match. +Empty matches are replaced only when they're not adjacent to a previous empty match. :: >>> p = re.compile('x*') >>> p.sub('-', 'abxd') - '-a-b-d-' + '-a-b--d-' If *replacement* is a string, any backslash escapes in it are processed. That is, ``\n`` is converted to a single newline character, ``\r`` is converted to a diff --git a/Doc/library/re.rst b/Doc/library/re.rst index dae1d7ea10a031..9b175f4e96756b 100644 --- a/Doc/library/re.rst +++ b/Doc/library/re.rst @@ -708,12 +708,15 @@ form. That way, separator components are always found at the same relative indices within the result list. - The pattern can match empty strings. :: + Empty matches for the pattern split the string only when not adjacent + to a previous empty match. 
>>> re.split(r'\b', 'Words, words, words.') ['', 'Words', ', ', 'words', ', ', 'words', '.'] + >>> re.split(r'\W*', '...words...') + ['', '', 'w', 'o', 'r', 'd', 's', '', ''] >>> re.split(r'(\W*)', '...words...') - ['', '...', 'w', '', 'o', '', 'r', '', 'd', '', 's', '...', ''] + ['', '...', '', '', 'w', '', 'o', '', 'r', '', 'd', '', 's', '...', '', '', ''] .. versionchanged:: 3.1 Added the optional flags argument. @@ -778,8 +781,8 @@ form. The optional argument *count* is the maximum number of pattern occurrences to be replaced; *count* must be a non-negative integer. If omitted or zero, all occurrences will be replaced. Empty matches for the pattern are replaced only - when not adjacent to a previous match, so ``sub('x*', '-', 'abc')`` returns - ``'-a-b-c-'``. + when not adjacent to a previous empty match, so ``sub('x*', '-', 'abxd')`` returns + ``'-a-b--d-'``. In string-type *repl* arguments, in addition to the character escapes and backreferences described above, @@ -805,6 +808,9 @@ form. Unknown escapes in *repl* consisting of ``'\'`` and an ASCII letter now are errors. + Empty matches for the pattern are replaced when adjacent to a previous + non-empty match. + .. function:: subn(pattern, repl, string, count=0, flags=0) diff --git a/Doc/whatsnew/3.7.rst b/Doc/whatsnew/3.7.rst index 81a88a0c82e54f..63300e7b0197ac 100644 --- a/Doc/whatsnew/3.7.rst +++ b/Doc/whatsnew/3.7.rst @@ -841,8 +841,9 @@ Changes in the Python API * The result of splitting a string on a :mod:`regular expression ` that could match an empty string has been changed. For example splitting on ``r'\s*'`` will now split not only on whitespaces as it - did previously, but also between any pair of non-whitespace - characters. The previous behavior can be restored by changing the pattern + did previously, but also on empty strings before all non-whitespace + characters and just before the end of the string. + The previous behavior can be restored by changing the pattern to ``r'\s+'``. 
A :exc:`FutureWarning` was emitted for such patterns since Python 3.5. @@ -853,7 +854,13 @@ Changes in the Python API positions 2--3. To match only blank lines, the pattern should be rewritten as ``r'(?m)^[^\S\n]*$'``. - (Contributed by Serhiy Storchaka in :issue:`25054`.) + :func:`re.sub()` now replaces empty matches adjacent to a previous + non-empty match. For example ``re.sub('x*', '-', 'abxd')`` returns now + ``'-a-b--d-'`` instead of ``'-a-b--d-'`` (the first minus between `b` and + `d` replaces the `x`, and the second minus replaces 0 `x`' between + `x` and `d`). + + (Contributed by Serhiy Storchaka in :issue:`25054` and :issue:`32308`.) * :class:`tracemalloc.Traceback` frames are now sorted from oldest to most recent to be more consistent with :mod:`traceback`. diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index aaed3d893aaf94..9fed4bef8809fc 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -213,11 +213,6 @@ def test_bug_114660(self): self.assertEqual(re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello there'), 'hello there') - def test_bug_462270(self): - # Test for empty sub() behaviour, see SF bug #462270 - self.assertEqual(re.sub('x*', '-', 'abxd'), '-a-b-d-') - self.assertEqual(re.sub('x+', '-', 'abxd'), 'ab-d') - def test_symbolic_groups(self): re.compile(r'(?P<a>x)(?P=a)(?(a)y)') re.compile(r'(?P<a1>x)(?P=a1)(?(a1)y)') @@ -331,10 +326,10 @@ def test_re_split(self): ['', 'a', '', '', 'c']) for sep, expected in [ - (':*', ['', 'a', 'b', 'c', '']), - ('(?::*)', ['', 'a', 'b', 'c', '']), - ('(:*)', ['', ':', 'a', ':', 'b', '::', 'c', '', '']), - ('(:)*', ['', ':', 'a', ':', 'b', ':', 'c', None, '']), + (':*', ['', '', 'a', '', 'b', '', 'c', '']), + ('(?::*)', ['', '', 'a', '', 'b', '', 'c', '']), + ('(:*)', ['', ':', '', '', 'a', ':', '', '', 'b', '::', '', '', 'c', '', '']), + ('(:)*', ['', ':', '', None, 'a', ':', '', None, 'b', ':', '', None, 'c', None, '']), ]: with self.subTest(sep=sep): self.assertTypedEqual(re.split(sep, ':a:b::c'), expected) 
@@ -357,7 +352,7 @@ def test_qualified_re_split(self): self.assertEqual(re.split("(:+)", ":a:b::c", maxsplit=2), ['', ':', 'a', ':', 'b::c']) self.assertEqual(re.split("(:*)", ":a:b::c", maxsplit=2), - ['', ':', 'a', ':', 'b::c']) + ['', ':', '', '', 'a:b::c']) def test_re_findall(self): self.assertEqual(re.findall(":+", "abc"), []) @@ -1753,13 +1748,13 @@ def test_match_repr(self): def test_zerowidth(self): # Issues 852532, 1647489, 3262, 25054. self.assertEqual(re.split(r"\b", "a::bc"), ['', 'a', '::', 'bc', '']) - self.assertEqual(re.split(r"\b|:+", "a::bc"), ['', 'a', '', 'bc', '']) - self.assertEqual(re.split(r"(? Date: Thu, 14 Dec 2017 20:22:49 +0200 Subject: [PATCH 2/2] Fix docs markup. --- Doc/whatsnew/3.7.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Doc/whatsnew/3.7.rst b/Doc/whatsnew/3.7.rst index 63300e7b0197ac..c5c10d85a2d9d0 100644 --- a/Doc/whatsnew/3.7.rst +++ b/Doc/whatsnew/3.7.rst @@ -856,9 +856,9 @@ Changes in the Python API :func:`re.sub()` now replaces empty matches adjacent to a previous non-empty match. For example ``re.sub('x*', '-', 'abxd')`` returns now - ``'-a-b--d-'`` instead of ``'-a-b--d-'`` (the first minus between `b` and - `d` replaces the `x`, and the second minus replaces 0 `x`' between - `x` and `d`). + ``'-a-b--d-'`` instead of ``'-a-b-d-'`` (the first minus between 'b' and + 'd' replaces 'x', and the second minus replaces an empty string between + 'x' and 'd'). (Contributed by Serhiy Storchaka in :issue:`25054` and :issue:`32308`.)