Skip to content

Commit a0093c0

Browse files
gh-91760: Deprecate group names and numbers which will be invalid in future
Only a sequence of ASCII digits not starting with 0 (except group 0) will be accepted as a numerical reference. Group names in bytes patterns and replacement strings will only be allowed to contain ASCII letters, digits, and underscores.
1 parent 944fffe commit a0093c0

File tree

5 files changed

+136
-19
lines changed

5 files changed

+136
-19
lines changed

Doc/library/re.rst

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -417,6 +417,9 @@ The special characters are:
417417
| | * ``\1`` |
418418
+---------------------------------------+----------------------------------+
419419

420+
.. deprecated:: 3.11
421+
Group names containing non-ASCII characters in bytes patterns.
422+
420423
.. index:: single: (?P=; in regular expressions
421424

422425
``(?P=name)``
@@ -486,6 +489,9 @@ The special characters are:
486489
will match with ``'<[email protected]>'`` as well as ``'[email protected]'``, but
487490
not with ``'<[email protected]'`` nor ``'[email protected]>'``.
488491

492+
.. deprecated:: 3.11
493+
Group *id* containing anything except ASCII digits or starting with ``0``.
494+
489495

490496
The special sequences consist of ``'\'`` and a character from the list below.
491497
If the ordinary character is not an ASCII digit or an ASCII letter, then the
@@ -995,6 +1001,11 @@ form.
9951001
Empty matches for the pattern are replaced when adjacent to a previous
9961002
non-empty match.
9971003

1004+
.. deprecated:: 3.11
1005+
Group *id* containing anything except ASCII digits or starting with ``0``
1006+
(except group 0).
1007+
Group names containing non-ASCII characters in bytes replacement strings.
1008+
9981009

9991010
.. function:: subn(pattern, repl, string, count=0, flags=0)
10001011

Doc/whatsnew/3.11.rst

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -922,6 +922,15 @@ Deprecated
922922

923923
(Contributed by Brett Cannon in :issue:`47061`.)
924924

925+
* More strict rules will be applied for numerical group references
926+
and group names in regular expressions in future Python versions.
927+
Only a sequence of ASCII digits not starting with ``0`` (except group 0) will be
928+
accepted as a numerical reference.
929+
Group names in bytes patterns and replacement strings will only be allowed to
930+
contain ASCII letters, digits, and underscores.
931+
For now, a deprecation warning is emitted for such syntax.
932+
(Contributed by Serhiy Storchaka in :issue:`91760`.)
933+
925934

926935
Removed
927936
=======

Lib/re/_parser.py

Lines changed: 48 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -293,8 +293,22 @@ def seek(self, index):
293293
self.__next()
294294

295295
def error(self, msg, offset=0):
296+
if not self.istext:
297+
msg = msg.encode('ascii', 'backslashreplace').decode('ascii')
296298
return error(msg, self.string, self.tell() - offset)
297299

300+
def checkgroupname(self, name, offset, nested):
301+
if not name.isidentifier():
302+
msg = "bad character in group name %r" % name
303+
raise self.error(msg, len(name) + offset)
304+
if not (self.istext or name.isascii()):
305+
import warnings
306+
warnings.warn(
307+
"bad character in group name %a at position %d" %
308+
(name, self.tell() - len(name) - offset),
309+
DeprecationWarning, stacklevel=nested + 7
310+
)
311+
298312
def _class_escape(source, escape):
299313
# handle escape code inside character class
300314
code = ESCAPES.get(escape)
@@ -707,15 +721,11 @@ def _parse(source, state, verbose, nested, first=False):
707721
if sourcematch("<"):
708722
# named group: skip forward to end of name
709723
name = source.getuntil(">", "group name")
710-
if not name.isidentifier():
711-
msg = "bad character in group name %r" % name
712-
raise source.error(msg, len(name) + 1)
724+
source.checkgroupname(name, 1, nested)
713725
elif sourcematch("="):
714726
# named backreference
715727
name = source.getuntil(")", "group name")
716-
if not name.isidentifier():
717-
msg = "bad character in group name %r" % name
718-
raise source.error(msg, len(name) + 1)
728+
source.checkgroupname(name, 1, nested)
719729
gid = state.groupdict.get(name)
720730
if gid is None:
721731
msg = "unknown group name %r" % name
@@ -776,12 +786,7 @@ def _parse(source, state, verbose, nested, first=False):
776786
elif char == "(":
777787
# conditional backreference group
778788
condname = source.getuntil(")", "group name")
779-
if condname.isidentifier():
780-
condgroup = state.groupdict.get(condname)
781-
if condgroup is None:
782-
msg = "unknown group name %r" % condname
783-
raise source.error(msg, len(condname) + 1)
784-
else:
789+
if not condname.isidentifier():
785790
try:
786791
condgroup = int(condname)
787792
if condgroup < 0:
@@ -795,6 +800,21 @@ def _parse(source, state, verbose, nested, first=False):
795800
if condgroup >= MAXGROUPS:
796801
msg = "invalid group reference %d" % condgroup
797802
raise source.error(msg, len(condname) + 1)
803+
if not (condname.isdecimal() and condname.isascii() and
804+
(condname[0] != "0" or condname == "0")):
805+
import warnings
806+
warnings.warn(
807+
"bad character in group name %s at position %d" %
808+
(repr(condname) if source.istext else ascii(condname),
809+
source.tell() - len(condname) - 1),
810+
DeprecationWarning, stacklevel=nested + 6
811+
)
812+
else:
813+
source.checkgroupname(condname, 1, nested)
814+
condgroup = state.groupdict.get(condname)
815+
if condgroup is None:
816+
msg = "unknown group name %r" % condname
817+
raise source.error(msg, len(condname) + 1)
798818
state.checklookbehindgroup(condgroup, source)
799819
item_yes = _parse(source, state, verbose, nested + 1)
800820
if source.match("|"):
@@ -1006,16 +1026,10 @@ def addgroup(index, pos):
10061026
# group
10071027
c = this[1]
10081028
if c == "g":
1009-
name = ""
10101029
if not s.match("<"):
10111030
raise s.error("missing <")
10121031
name = s.getuntil(">", "group name")
1013-
if name.isidentifier():
1014-
try:
1015-
index = groupindex[name]
1016-
except KeyError:
1017-
raise IndexError("unknown group name %r" % name) from None
1018-
else:
1032+
if not name.isidentifier():
10191033
try:
10201034
index = int(name)
10211035
if index < 0:
@@ -1026,6 +1040,21 @@ def addgroup(index, pos):
10261040
if index >= MAXGROUPS:
10271041
raise s.error("invalid group reference %d" % index,
10281042
len(name) + 1)
1043+
if not (name.isdecimal() and name.isascii() and
1044+
(name[0] != "0" or name == "0")):
1045+
import warnings
1046+
warnings.warn(
1047+
"bad character in group name %s at position %d" %
1048+
(repr(name) if s.istext else ascii(name),
1049+
s.tell() - len(name) - 1),
1050+
DeprecationWarning, stacklevel=5
1051+
)
1052+
else:
1053+
s.checkgroupname(name, 1, -1)
1054+
try:
1055+
index = groupindex[name]
1056+
except KeyError:
1057+
raise IndexError("unknown group name %r" % name) from None
10291058
addgroup(index, len(name) + 1)
10301059
elif c == "0":
10311060
if s.next in OCTDIGITS:

Lib/test/test_re.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,7 @@ def test_basic_re_sub(self):
135135
self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<1>', 'xx'), 'xxxx')
136136
self.assertEqual(re.sub('(?P<unk>x)', r'\g<unk>\g<unk>', 'xx'), 'xxxx')
137137
self.assertEqual(re.sub('(?P<unk>x)', r'\g<1>\g<1>', 'xx'), 'xxxx')
138+
self.assertEqual(re.sub('()x', r'\g<0>\g<0>', 'xx'), 'xxxx')
138139

139140
self.assertEqual(re.sub('a', r'\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
140141
self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
@@ -274,6 +275,21 @@ def test_symbolic_groups_errors(self):
274275
self.checkPatternError('(?P<©>x)', "bad character in group name '©'", 4)
275276
self.checkPatternError('(?P=©)', "bad character in group name '©'", 4)
276277
self.checkPatternError('(?(©)y)', "bad character in group name '©'", 3)
278+
with self.assertWarnsRegex(DeprecationWarning,
279+
r"bad character in group name '\\xc2\\xb5' "
280+
r"at position 4") as w:
281+
re.compile(b'(?P<\xc2\xb5>x)')
282+
self.assertEqual(w.warnings[0].filename, __file__)
283+
with self.assertWarnsRegex(DeprecationWarning,
284+
r"bad character in group name '\\xc2\\xb5' "
285+
r"at position 4"):
286+
self.checkPatternError(b'(?P=\xc2\xb5)',
287+
r"unknown group name '\xc2\xb5'", 4)
288+
with self.assertWarnsRegex(DeprecationWarning,
289+
r"bad character in group name '\\xc2\\xb5' "
290+
r"at position 3"):
291+
self.checkPatternError(b'(?(\xc2\xb5)y)',
292+
r"unknown group name '\xc2\xb5'", 3)
277293

278294
def test_symbolic_refs(self):
279295
self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\g<b>', 'xx'), '')
@@ -306,12 +322,39 @@ def test_symbolic_refs_errors(self):
306322
re.sub('(?P<a>x)', r'\g<ab>', 'xx')
307323
self.checkTemplateError('(?P<a>x)', r'\g<-1>', 'xx',
308324
"bad character in group name '-1'", 3)
325+
with self.assertWarnsRegex(DeprecationWarning,
326+
r"bad character in group name '\+1' "
327+
r"at position 3") as w:
328+
re.sub('(?P<a>x)', r'\g<+1>', 'xx')
329+
self.assertEqual(w.warnings[0].filename, __file__)
330+
with self.assertWarnsRegex(DeprecationWarning,
331+
r"bad character in group name '01' "
332+
r"at position 3"):
333+
re.sub('(?P<a>x)', r'\g<01>', 'xx')
334+
with self.assertWarnsRegex(DeprecationWarning,
335+
r"bad character in group name '1_0' "
336+
r"at position 3"):
337+
re.sub('()'*10, r'\g<1_0>', 'xx')
338+
with self.assertWarnsRegex(DeprecationWarning,
339+
r"bad character in group name ' 1 ' "
340+
r"at position 3"):
341+
re.sub('(?P<a>x)', r'\g< 1 >', 'xx')
309342
self.checkTemplateError('(?P<a>x)', r'\g<©>', 'xx',
310343
"bad character in group name '©'", 3)
344+
with self.assertWarnsRegex(DeprecationWarning,
345+
r"bad character in group name '\\xc2\\xb5' "
346+
r"at position 3") as w:
347+
with self.assertRaisesRegex(IndexError, "unknown group name '\xc2\xb5'"):
348+
re.sub(b'(?P<a>x)', b'\\g<\xc2\xb5>', b'xx')
349+
self.assertEqual(w.warnings[0].filename, __file__)
311350
self.checkTemplateError('(?P<a>x)', r'\g<㊀>', 'xx',
312351
"bad character in group name '㊀'", 3)
313352
self.checkTemplateError('(?P<a>x)', r'\g<¹>', 'xx',
314353
"bad character in group name '¹'", 3)
354+
with self.assertWarnsRegex(DeprecationWarning,
355+
r"bad character in group name '१' "
356+
r"at position 3"):
357+
re.sub('(?P<a>x)', r'\g<१>', 'xx')
315358

316359
def test_re_subn(self):
317360
self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
@@ -577,10 +620,31 @@ def test_re_groupref_exists_errors(self):
577620
self.checkPatternError(r'(?P<a>)(?(0)a|b)', 'bad group number', 10)
578621
self.checkPatternError(r'()(?(-1)a|b)',
579622
"bad character in group name '-1'", 5)
623+
with self.assertWarnsRegex(DeprecationWarning,
624+
r"bad character in group name '\+1' "
625+
r"at position 5") as w:
626+
re.compile(r'()(?(+1)a|b)')
627+
self.assertEqual(w.warnings[0].filename, __file__)
628+
with self.assertWarnsRegex(DeprecationWarning,
629+
r"bad character in group name '01' "
630+
r"at position 5"):
631+
re.compile(r'()(?(01)a|b)')
632+
with self.assertWarnsRegex(DeprecationWarning,
633+
r"bad character in group name '1_0' "
634+
r"at position 23"):
635+
re.compile(r'()'*10 + r'(?(1_0)a|b)')
636+
with self.assertWarnsRegex(DeprecationWarning,
637+
r"bad character in group name ' 1 ' "
638+
r"at position 5"):
639+
re.compile(r'()(?( 1 )a|b)')
580640
self.checkPatternError(r'()(?(㊀)a|b)',
581641
"bad character in group name '㊀'", 5)
582642
self.checkPatternError(r'()(?(¹)a|b)',
583643
"bad character in group name '¹'", 5)
644+
with self.assertWarnsRegex(DeprecationWarning,
645+
r"bad character in group name '१' "
646+
r"at position 5"):
647+
re.compile(r'()(?(१)a|b)')
584648
self.checkPatternError(r'()(?(1',
585649
"missing ), unterminated name", 5)
586650
self.checkPatternError(r'()(?(1)a',
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
More strict rules will be applied for numerical group references and group
2+
names in regular expressions. For now, a deprecation warning is emitted for
3+
group references and group names which will be errors in future Python
4+
versions.

0 commit comments

Comments
 (0)