30
30
commentclose = re .compile (r'--!?>' )
31
31
commentabruptclose = re .compile (r'-?>' )
32
32
# Note:
33
- # 1) if you change tagfind/attrfind remember to update locatestarttagend too;
34
- # 2) if you change tagfind/attrfind and/or locatestarttagend the parser will
33
+ # 1) if you change tagfind/attrfind remember to update locatetagend too;
34
+ # 2) if you change tagfind/attrfind and/or locatetagend the parser will
35
35
# explode, so don't do it.
36
- # see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
37
- # and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
38
- tagfind_tolerant = re .compile (r'([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*' )
39
- attrfind_tolerant = re .compile (
40
- r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
41
- r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*' )
36
+ # see the HTML5 specs section "13.2.5.6 Tag open state",
37
+ # "13.2.5.8 Tag name state" and "13.2.5.33 Attribute name state".
38
+ # https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
39
+ # https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
40
+ # https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state
41
+ tagfind_tolerant = re .compile (r'([a-zA-Z][^\t\n\r\f />]*)(?:[\t\n\r\f ]|/(?!>))*' )
42
+ attrfind_tolerant = re .compile (r"""
43
+ (
44
+ (?<=['"\t\n\r\f /])[^\t\n\r\f />][^\t\n\r\f /=>]* # attribute name
45
+ )
46
+ (= # value indicator
47
+ ('[^']*' # LITA-enclosed value
48
+ |"[^"]*" # LIT-enclosed value
49
+ |(?!['"])[^>\t\n\r\f ]* # bare value
50
+ )
51
+ )?
52
+ (?:[\t\n\r\f ]|/(?!>))* # possibly followed by a space
53
+ """ , re .VERBOSE )
54
+ locatetagend = re .compile (r"""
55
+ [a-zA-Z][^\t\n\r\f />]* # tag name
56
+ [\t\n\r\f /]* # optional whitespace before attribute name
57
+ (?:(?<=['"\t\n\r\f /])[^\t\n\r\f />][^\t\n\r\f /=>]* # attribute name
58
+ (?:= # value indicator
59
+ (?:'[^']*' # LITA-enclosed value
60
+ |"[^"]*" # LIT-enclosed value
61
+ |(?!['"])[^>\t\n\r\f ]* # bare value
62
+ )
63
+ )?
64
+ [\t\n\r\f /]* # possibly followed by a space
65
+ )*
66
+ >?
67
+ """ , re .VERBOSE )
68
+ # The following variables are not used, but are temporarily left for
69
+ # backward compatibility.
42
70
locatestarttagend_tolerant = re .compile (r"""
43
71
<[a-zA-Z][^\t\n\r\f />\x00]* # tag name
44
72
(?:[\s/]* # optional whitespace before attribute name
55
83
\s* # trailing whitespace
56
84
""" , re .VERBOSE )
57
85
endendtag = re .compile ('>' )
58
- # the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
59
- # </ and the tag name, so maybe this should be fixed
60
86
endtagfind = re .compile (r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>' )
61
87
62
88
@@ -123,7 +149,8 @@ def get_starttag_text(self):
123
149
124
150
def set_cdata_mode(self, elem):
    """Switch the parser into CDATA-content mode for element *elem*.

    Stores the lower-cased element name in ``self.cdata_elem`` and
    replaces ``self.interesting`` with a pattern that matches only the
    opening of the corresponding end tag: ``</`` immediately followed by
    the element name, which must in turn be followed (lookahead, not
    consumed) by tab, LF, CR, FF, space, ``/`` or ``>``.  Matching is
    case-insensitive, restricted to ASCII case folding.
    """
    self.cdata_elem = elem.lower()
    # Escape the name so that any regex metacharacter in *elem* is matched
    # literally instead of altering the pattern.
    self.interesting = re.compile(
        r'</%s(?=[\t\n\r\f />])' % re.escape(self.cdata_elem),
        re.IGNORECASE | re.ASCII)
127
154
128
155
def clear_cdata_mode (self ):
129
156
self .interesting = interesting_normal
@@ -148,7 +175,7 @@ def goahead(self, end):
148
175
# & near the end and see if it's followed by a space or ;.
149
176
amppos = rawdata .rfind ('&' , max (i , n - 34 ))
150
177
if (amppos >= 0 and
151
- not re .compile (r'[\s ;]' ).search (rawdata , amppos )):
178
+ not re .compile (r'[\t\n\r\f ;]' ).search (rawdata , amppos )):
152
179
break # wait till we get all the text
153
180
j = n
154
181
else :
@@ -261,7 +288,7 @@ def goahead(self, end):
261
288
else :
262
289
assert 0 , "interesting.search() lied"
263
290
# end while
264
- if end and i < n and not self . cdata_elem :
291
+ if end and i < n :
265
292
if self .convert_charrefs and not self .cdata_elem :
266
293
self .handle_data (unescape (rawdata [i :n ]))
267
294
else :
@@ -307,7 +334,7 @@ def parse_comment(self, i, report=True):
307
334
return match .end ()
308
335
309
336
# Internal -- parse bogus comment, return length or -1 if not terminated
310
- # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
337
+ # see https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state
311
338
def parse_bogus_comment (self , i , report = 1 ):
312
339
rawdata = self .rawdata
313
340
assert rawdata [i :i + 2 ] in ('<!' , '</' ), ('unexpected call to '
@@ -333,6 +360,8 @@ def parse_pi(self, i):
333
360
334
361
# Internal -- handle starttag, return end or -1 if not terminated
335
362
def parse_starttag (self , i ):
363
+ # See the HTML5 specs section "13.2.5.8 Tag name state"
364
+ # https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
336
365
self .__starttag_text = None
337
366
endpos = self .check_for_whole_start_tag (i )
338
367
if endpos < 0 :
@@ -385,76 +414,42 @@ def parse_starttag(self, i):
385
414
# or -1 if incomplete.
386
415
def check_for_whole_start_tag(self, i):
    """Return the end index of the start tag beginning at *i*, or -1.

    ``locatetagend`` is matched against ``self.rawdata`` starting at
    ``i + 1`` (i.e. one character past *i*, which is expected to be the
    ``<`` of the tag -- the caller is responsible for that).  If the last
    character consumed by the match is ``>``, the index just past it is
    returned; otherwise the tag is not complete yet and -1 is returned.

    Raises AssertionError if no tag can be located at ``i + 1`` at all.
    """
    rawdata = self.rawdata
    match = locatetagend.match(rawdata, i + 1)
    if match is None:
        # Explicit raise rather than a bare ``assert`` so the check is
        # still performed when Python runs with -O.
        raise AssertionError("we should not get here!")
    j = match.end()
    if rawdata[j - 1] != ">":
        # The match stopped before a closing ">": incomplete tag.
        return -1
    return j
418
423
419
424
# Internal -- parse endtag, return end or -1 if incomplete
420
425
def parse_endtag(self, i):
    """Parse the end tag starting at *i*; return its end index or -1.

    See the HTML5 specs section "13.2.5.7 End tag open state"
    https://html.spec.whatwg.org/multipage/parsing.html#end-tag-open-state

    ``self.rawdata[i:i+2]`` must be ``"</"``.  Outcomes:

    * no ``>`` anywhere after ``i + 2``: return -1 (incomplete);
    * ``endtagopen`` does not match at *i*: ``</>`` is skipped silently
      (returns ``i + 3``), anything else is delegated to
      ``parse_bogus_comment`` and its result is returned;
    * ``locatetagend`` stops before a ``>``: return -1 (incomplete);
    * otherwise ``handle_endtag`` is called with the lower-cased tag
      name, CDATA mode is cleared, and the index just past the tag is
      returned.
    """
    rawdata = self.rawdata
    assert rawdata[i:i + 2] == "</", "unexpected call to parse_endtag"
    if rawdata.find('>', i + 2) < 0:  # fast check
        return -1
    if not endtagopen.match(rawdata, i):  # </ + letter
        if rawdata[i + 2:i + 3] == '>':  # </> is ignored
            # "missing-end-tag-name" parser error
            return i + 3
        else:
            return self.parse_bogus_comment(i)

    # Locate the end of the whole tag, skipping over any attributes.
    match = locatetagend.match(rawdata, i + 2)
    assert match
    j = match.end()
    if rawdata[j - 1] != ">":
        # The match stopped before a closing ">": incomplete tag.
        return -1

    # find the name: "13.2.5.8 Tag name state"
    # https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
    match = tagfind_tolerant.match(rawdata, i + 2)
    assert match
    tag = match.group(1).lower()
    self.handle_endtag(tag)
    self.clear_cdata_mode()
    return j
458
453
459
454
# Overridable -- finish processing of start+end tag: <tag.../>
460
455
def handle_startendtag (self , tag , attrs ):
0 commit comments