Skip to content

Commit dc20fc4

Browse files
miss-islington authored and maxking committed
bpo-21315: Fix parsing of encoded words with missing leading ws. (GH-13425) (#13846)
* bpo-21315: Fix parsing of encoded words with missing leading ws. Because of the missing leading whitespace, an encoded word would get parsed as an unstructured token. This patch fixes that by looking for encoded words when splitting tokens with whitespace. Missing trailing whitespace after an encoded word now registers a defect instead. Original patch suggestion by David R. Murray on bpo-21315. (cherry picked from commit 66c4f3f) Co-authored-by: Abhilash Raj <[email protected]>
1 parent 28be388 commit dc20fc4

File tree

4 files changed

+49
-3
lines changed

4 files changed

+49
-3
lines changed

Lib/email/_header_value_parser.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,18 @@
9797
def quote_string(value):
9898
return '"'+str(value).replace('\\', '\\\\').replace('"', r'\"')+'"'
9999

100+
# Match an RFC 2047 encoded word, which looks like =?utf-8?q?someword?=
101+
rfc2047_matcher = re.compile(r'''
102+
=\? # literal =?
103+
[^?]* # charset
104+
\? # literal ?
105+
[qQbB] # literal 'q' or 'b', case insensitive
106+
\? # literal ?
107+
.*? # encoded word
108+
\?= # literal ?=
109+
''', re.VERBOSE | re.MULTILINE)
110+
111+
100112
#
101113
# TokenList and its subclasses
102114
#
@@ -1050,6 +1062,10 @@ def get_encoded_word(value):
10501062
_validate_xtext(vtext)
10511063
ew.append(vtext)
10521064
text = ''.join(remainder)
1065+
# Encoded words should be followed by a WS
1066+
if value and value[0] not in WSP:
1067+
ew.defects.append(errors.InvalidHeaderDefect(
1068+
"missing trailing whitespace after encoded-word"))
10531069
return ew, value
10541070

10551071
def get_unstructured(value):
@@ -1102,6 +1118,11 @@ def get_unstructured(value):
11021118
unstructured.append(token)
11031119
continue
11041120
tok, *remainder = _wsp_splitter(value, 1)
1121+
# Split in the middle of an atom if there is an RFC 2047 encoded word
1122+
# which does not have WSP on both sides. The defect will be registered
1123+
# the next time through the loop.
1124+
if rfc2047_matcher.search(tok):
1125+
tok, *remainder = value.partition('=?')
11051126
vtext = ValueTerminal(tok, 'vtext')
11061127
_validate_xtext(vtext)
11071128
unstructured.append(vtext)

Lib/test/test_email/test__header_value_parser.py

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,7 @@ def test_get_encoded_word_gets_first_even_if_no_space(self):
118118
'=?us-ascii?q?first?==?utf-8?q?second?=',
119119
'first',
120120
'first',
121-
[],
121+
[errors.InvalidHeaderDefect],
122122
'=?utf-8?q?second?=')
123123

124124
def test_get_encoded_word_sets_extra_attributes(self):
@@ -361,6 +361,25 @@ def test_get_unstructured_no_whitespace_between_ews(self):
361361
'=?utf-8?q?foo?==?utf-8?q?bar?=',
362362
'foobar',
363363
'foobar',
364+
[errors.InvalidHeaderDefect,
365+
errors.InvalidHeaderDefect],
366+
'')
367+
368+
def test_get_unstructured_ew_without_leading_whitespace(self):
369+
self._test_get_x(
370+
self._get_unst,
371+
'nowhitespace=?utf-8?q?somevalue?=',
372+
'nowhitespacesomevalue',
373+
'nowhitespacesomevalue',
374+
[errors.InvalidHeaderDefect],
375+
'')
376+
377+
def test_get_unstructured_ew_without_trailing_whitespace(self):
378+
self._test_get_x(
379+
self._get_unst,
380+
'=?utf-8?q?somevalue?=nowhitespace',
381+
'somevaluenowhitespace',
382+
'somevaluenowhitespace',
364383
[errors.InvalidHeaderDefect],
365384
'')
366385

@@ -546,7 +565,8 @@ def test_encoded_word_inside_quotes(self):
546565
'"=?utf-8?Q?not_really_valid?="',
547566
'"not really valid"',
548567
'not really valid',
549-
[errors.InvalidHeaderDefect],
568+
[errors.InvalidHeaderDefect,
569+
errors.InvalidHeaderDefect],
550570
'')
551571

552572
# get_comment

Lib/test/test_email/test_headerregistry.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1180,7 +1180,8 @@ class TestAddressHeader(TestHeaderBase):
11801180

11811181
'rfc2047_atom_in_quoted_string_is_decoded':
11821182
('"=?utf-8?q?=C3=89ric?=" <[email protected]>',
1183-
[errors.InvalidHeaderDefect],
1183+
[errors.InvalidHeaderDefect,
1184+
errors.InvalidHeaderDefect],
11841185
'Éric <[email protected]>',
11851186
'Éric',
11861187
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
Email headers containing RFC2047 encoded words are parsed despite the missing
2+
whitespace, and a defect is registered. Also, missing trailing whitespace after
3+
encoded words is now registered as a defect.
4+

0 commit comments

Comments
 (0)