Skip to content

Commit 59e8fba

Browse files
epicfaace authored and miss-islington committed
[3.8] bpo-21315: Fix parsing of encoded words with missing leading ws (GH-13425) (GH-15655)
* [bpo-21315](https://bugs.python.org/issue21315): Fix parsing of encoded words with missing leading ws. Because of the missing leading whitespace, the encoded word would get parsed as an unstructured token. This patch fixes that by looking for encoded words when splitting tokens with whitespace. Missing trailing whitespace after an encoded word now registers a defect instead. Original patch suggestion by David R. Murray on [bpo-21315](https://bugs.python.org/issue21315). (cherry picked from commit 66c4f3f) Co-authored-by: Abhilash Raj <[email protected]> (cherry picked from commit dc20fc4) Co-authored-by: Miss Islington (bot) <[email protected]> https://bugs.python.org/issue21315
1 parent 58067d2 commit 59e8fba

File tree

4 files changed

+49
-3
lines changed

4 files changed

+49
-3
lines changed

Lib/email/_header_value_parser.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,18 @@
9696
def quote_string(value):
9797
return '"'+str(value).replace('\\', '\\\\').replace('"', r'\"')+'"'
9898

99+
# Match a RFC 2047 word, looks like =?utf-8?q?someword?=
100+
rfc2047_matcher = re.compile(r'''
101+
=\? # literal =?
102+
[^?]* # charset
103+
\? # literal ?
104+
[qQbB] # literal 'q' or 'b', case insensitive
105+
\? # literal ?
106+
.*? # encoded word
107+
\?= # literal ?=
108+
''', re.VERBOSE | re.MULTILINE)
109+
110+
99111
#
100112
# TokenList and its subclasses
101113
#
@@ -1054,6 +1066,10 @@ def get_encoded_word(value):
10541066
_validate_xtext(vtext)
10551067
ew.append(vtext)
10561068
text = ''.join(remainder)
1069+
# Encoded words should be followed by a WS
1070+
if value and value[0] not in WSP:
1071+
ew.defects.append(errors.InvalidHeaderDefect(
1072+
"missing trailing whitespace after encoded-word"))
10571073
return ew, value
10581074

10591075
def get_unstructured(value):
@@ -1106,6 +1122,11 @@ def get_unstructured(value):
11061122
unstructured.append(token)
11071123
continue
11081124
tok, *remainder = _wsp_splitter(value, 1)
1125+
# Split in the middle of an atom if there is a rfc2047 encoded word
1126+
# which does not have WSP on both sides. The defect will be registered
1127+
# the next time through the loop.
1128+
if rfc2047_matcher.search(tok):
1129+
tok, *remainder = value.partition('=?')
11091130
vtext = ValueTerminal(tok, 'vtext')
11101131
_validate_xtext(vtext)
11111132
unstructured.append(vtext)

Lib/test/test_email/test__header_value_parser.py

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,7 @@ def test_get_encoded_word_gets_first_even_if_no_space(self):
118118
'=?us-ascii?q?first?==?utf-8?q?second?=',
119119
'first',
120120
'first',
121-
[],
121+
[errors.InvalidHeaderDefect],
122122
'=?utf-8?q?second?=')
123123

124124
def test_get_encoded_word_sets_extra_attributes(self):
@@ -361,6 +361,25 @@ def test_get_unstructured_no_whitespace_between_ews(self):
361361
'=?utf-8?q?foo?==?utf-8?q?bar?=',
362362
'foobar',
363363
'foobar',
364+
[errors.InvalidHeaderDefect,
365+
errors.InvalidHeaderDefect],
366+
'')
367+
368+
def test_get_unstructured_ew_without_leading_whitespace(self):
369+
self._test_get_x(
370+
self._get_unst,
371+
'nowhitespace=?utf-8?q?somevalue?=',
372+
'nowhitespacesomevalue',
373+
'nowhitespacesomevalue',
374+
[errors.InvalidHeaderDefect],
375+
'')
376+
377+
def test_get_unstructured_ew_without_trailing_whitespace(self):
378+
self._test_get_x(
379+
self._get_unst,
380+
'=?utf-8?q?somevalue?=nowhitespace',
381+
'somevaluenowhitespace',
382+
'somevaluenowhitespace',
364383
[errors.InvalidHeaderDefect],
365384
'')
366385

@@ -550,7 +569,8 @@ def test_encoded_word_inside_quotes(self):
550569
'"=?utf-8?Q?not_really_valid?="',
551570
'"not really valid"',
552571
'not really valid',
553-
[errors.InvalidHeaderDefect],
572+
[errors.InvalidHeaderDefect,
573+
errors.InvalidHeaderDefect],
554574
'')
555575

556576
# get_comment

Lib/test/test_email/test_headerregistry.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1180,7 +1180,8 @@ class TestAddressHeader(TestHeaderBase):
11801180

11811181
'rfc2047_atom_in_quoted_string_is_decoded':
11821182
('"=?utf-8?q?=C3=89ric?=" <[email protected]>',
1183-
[errors.InvalidHeaderDefect],
1183+
[errors.InvalidHeaderDefect,
1184+
errors.InvalidHeaderDefect],
11841185
'Éric <[email protected]>',
11851186
'Éric',
11861187
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
Email headers containing RFC2047 encoded words are parsed despite the missing
2+
whitespace, and a defect is registered. Also, missing trailing whitespace after
3+
encoded words is now registered as a defect.
4+

0 commit comments

Comments
 (0)