@@ -19,15 +19,13 @@ function dbg( $message, $indent = 0 ) {
19
19
}
20
20
}
21
21
22
+ // It's an object because sometimes the identity matters
22
23
class WP_HTML_Tag_Token {
23
24
24
25
public $ tag ;
25
26
26
- public $ bookmark ;
27
-
28
- public function __construct ( $ tag , $ bookmark = null ) {
27
+ public function __construct ( $ tag ) {
29
28
$ this ->tag = $ tag ;
30
- $ this ->bookmark = $ bookmark ;
31
29
}
32
30
33
31
}
@@ -65,22 +63,32 @@ public function __construct( $html ) {
65
63
66
64
public function parse () {
67
65
echo ("HTML before main loop: \n" );
68
- echo ($ this ->html );
66
+ // echo($this->html);
69
67
echo ("\n" );
68
+ $ i = 0 ;
70
69
while ($ this ->next_element_node ()) {
71
70
// ... twiddle thumbs ...
71
+ if (++$ i % 10000 === 0 )
72
+ {
73
+ echo $ this ->get_tag ()." oe: " . count ($ this ->open_elements ) . " " ;
74
+ echo "afe: " . count ($ this ->active_formatting_elements ) . " \n" ;
75
+ echo "Peak mem: " . round (memory_get_peak_usage (true ) / 1024 / 1024 , 2 ) . "MB \n" ;
76
+ // print_r($this->open_elements);
77
+ // die();
78
+ }
72
79
}
73
80
while ( count ($ this ->open_elements ) > 1 ) {
74
81
$ this ->pop_open_element ();
75
82
}
76
83
77
84
echo ("\n" );
78
85
echo ("\$this->HTML after main loop: \n" );
79
- echo ($ this ->get_updated_html ().'' );
86
+ // echo($this->get_updated_html().'');
80
87
echo "\n\n" ;
81
88
82
89
echo "Mem peak usage: " . (memory_get_peak_usage (true ) / 1024 / 1024 ) . "MB \n" ;
83
90
echo ("\n--------------- \n\n" );
91
+ return $ this ->get_updated_html ();
84
92
}
85
93
86
94
public function next_element_node () {
@@ -90,6 +98,9 @@ public function next_element_node() {
90
98
if ( ! $ this ->is_tag_closer () ) {
91
99
dbg ( "Found {$ this ->current_token ->tag } tag opener " );
92
100
switch ( $ this ->current_token ->tag ) {
101
+ case 'HTML ' :
102
+ $ this ->drop_current_tag_token ();
103
+ break ;
93
104
case 'ADDRESS ' :
94
105
case 'ARTICLE ' :
95
106
case 'ASIDE ' :
@@ -268,16 +279,23 @@ public function next_element_node() {
268
279
case 'TABLE ' :
269
280
$ this ->insert_element ( $ this ->current_token );
270
281
break ;
282
+
283
+ // Void elements.
284
+ // Some require reconstructing the active formatting elements.
271
285
case 'AREA ' :
272
286
case 'BR ' :
273
287
case 'EMBED ' :
274
288
case 'IMG ' :
275
289
case 'KEYGEN ' :
276
290
case 'WBR ' :
277
291
$ this ->reconstruct_active_formatting_elements ();
278
- $ this ->insert_element ( $ this ->current_token );
279
- $ this ->pop_open_element ( false );
280
- break ;
292
+ // But others don't.
293
+ case 'META ' :
294
+ case 'LINK ' :
295
+ case 'BASE ' :
296
+ case 'COL ' :
297
+ case 'FRAME ' :
298
+ case 'INPUT ' :
281
299
case 'PARAM ' :
282
300
case 'SOURCE ' :
283
301
case 'TRACK ' :
@@ -450,6 +468,22 @@ public function next_element_node() {
450
468
$ this ->pop_until_tag ( $ this ->current_token ->tag , false );
451
469
$ this ->clear_active_formatting_elements_up_to_last_marker ();
452
470
break ;
471
+
472
+ /*
473
+ * @divergence from spec:
474
+ * Close all the open tags when a table-related
475
+ * tag closer is encountered
476
+ */
477
+ case 'TBODY ' :
478
+ case 'TFOOT ' :
479
+ case 'THEAD ' :
480
+ case 'TD ' :
481
+ case 'TH ' :
482
+ case 'TR ' :
483
+ case 'TABLE ' :
484
+ $ this ->pop_until_tag ( $ this ->current_token ->tag , false );
485
+ break ;
486
+
453
487
case 'BR ' :
454
488
// This should never happen since Tag_Processor corrects that
455
489
default :
@@ -462,20 +496,33 @@ public function next_element_node() {
462
496
463
497
private function next_tag_token () {
464
498
$ tag_token = null ;
499
+ $ bookmark = null ;
465
500
$ text_start = $ this ->tag_ends_at + 1 ;
466
- if ($ this ->next_tag (array ('tag_closers ' => 'visit ' ))) {
467
- // @TODO don't create a bookmark for every single tag
468
- $ bookmark = '__internal_ ' . ( $ this ->element_bookmark_idx ++ );
469
- $ this ->set_bookmark ($ bookmark );
470
- $ tag_token = new WP_HTML_Tag_Token (
471
- $ this ->get_tag (),
472
- $ bookmark
473
- );
474
- $ text_end = $ this ->bookmarks [$ bookmark ]->start ;
475
- } else {
476
- $ text_end = strlen ($ this ->html );
501
+ if (!$ this ->next_tag (array ('tag_closers ' => 'visit ' ))) {
502
+ $ this ->process_text ($ text_start , strlen ($ this ->html ));
503
+ $ this ->current_token = null ;
504
+ $ this ->current_token_start = strlen ($ this ->html );
505
+ $ this ->current_token_end = strlen ($ this ->html );
506
+ return false ;
477
507
}
478
508
509
+ // @TODO don't create a bookmark for every single tag
510
+ $ bookmark = '__internal_ ' . ( $ this ->element_bookmark_idx ++ );
511
+ $ this ->set_bookmark ($ bookmark );
512
+ $ tag_token = new WP_HTML_Tag_Token ($ this ->get_tag ());
513
+ $ text_end = $ this ->bookmarks [$ bookmark ]->start ;
514
+
515
+ $ this ->process_text ($ text_start , $ text_end );
516
+
517
+ $ this ->current_token = $ tag_token ;
518
+ $ this ->current_token_start = $ this ->bookmarks [$ bookmark ]->start ;
519
+ $ this ->current_token_end = $ this ->bookmarks [$ bookmark ]->end ;
520
+ $ this ->release_bookmark ($ bookmark );
521
+
522
+ return true ;
523
+ }
524
+
525
+ private function process_text ($ text_start , $ text_end ) {
479
526
if ($ text_start < $ text_end ) {
480
527
$ this ->current_token = substr ($ this ->html , $ text_start , $ text_end - $ text_start );
481
528
$ this ->current_token_start = $ text_start ;
@@ -484,18 +531,6 @@ private function next_tag_token() {
484
531
dbg ( "Appending text to reconstructed HTML " , 1 );
485
532
$ this ->reconstruct_active_formatting_elements ();
486
533
}
487
-
488
- if ( ! $ tag_token ) {
489
- $ this ->current_token = null ;
490
- $ this ->current_token_start = strlen ($ this ->html );
491
- $ this ->current_token_end = strlen ($ this ->html );
492
- return false ;
493
- }
494
-
495
- $ this ->current_token = $ tag_token ;
496
- $ this ->current_token_start = $ this ->bookmarks [$ tag_token ->bookmark ]->start ;
497
- $ this ->current_token_end = $ this ->bookmarks [$ tag_token ->bookmark ]->end ;
498
- return true ;
499
534
}
500
535
501
536
private function process_any_other_end_tag ( WP_HTML_Tag_Token $ token ) {
@@ -745,7 +780,7 @@ private function close_p_element($insert_p_tag_closer = true) {
745
780
)
746
781
);
747
782
// If the current node is not a p element, then this is a parse error.
748
- if ( $ this ->get_tag () !== 'P ' ) {
783
+ if ( $ this ->current_node ()-> tag !== 'P ' ) {
749
784
$ this ->parse_error ();
750
785
}
751
786
$ this ->pop_until_tag ( 'P ' , false );
@@ -773,7 +808,7 @@ private function should_generate_implied_end_tags( $options = null ) {
773
808
return true ;
774
809
}
775
810
776
- $ thoroughly = null !== $ options && isset ( $ options ['thoroughly ' ] ) && $ options ['thoroughly ' ];
811
+ $ thoroughly = true ; // null !== $options && isset( $options['thoroughly'] ) && $options['thoroughly'];
777
812
if ( $ thoroughly ) {
778
813
switch ( $ current_tag_name ) {
779
814
case 'TBODY ' :
@@ -1128,17 +1163,26 @@ private static function is_formatting_element( $tag_name ) {
1128
1163
1129
1164
}
1130
1165
1131
- // $dir = realpath( __DIR__ . '/../../../index.html' );
1166
+ $ dir = realpath ( __DIR__ . '/../../../index.html ' );
1132
1167
1133
- // $htmlspec = file_get_contents( $dir );
1134
- // $p = new WP_HTML_Processor( $htmlspec );
1135
- // $p->parse();
1168
+ $ htmlspec = file_get_contents ( $ dir );
1169
+ $ p = new WP_HTML_Processor ( $ htmlspec );
1170
+ $ p ->parse ();
1171
+
1172
+ die ();
1136
1173
1174
+ // $p = new WP_HTML_Processor( '<dd><dt>' );
1175
+ // $p->parse();
1137
1176
// die();
1177
+ // $p = new WP_HTML_Processor( '<p>1<title>HTML Standard</title><meta content=#3c790a name=theme-color>3<b>4</b>5</p>' );
1178
+ // $p->parse();
1179
+ $ p = new WP_HTML_Processor ( '<p>1<table><tbody><tr><td>HTML</td><td>Standard</table></p><div>test</div> ' );
1180
+ echo $ p ->parse ();
1181
+ die ();
1138
1182
1139
- $ p = new WP_HTML_Processor ( '<dd><dt > ' );
1183
+ $ p = new WP_HTML_Processor ( '<p>1<script>HTML Standard</script>3<b>4</b>5</p > ' );
1140
1184
$ p ->parse ();
1141
- die ();
1185
+
1142
1186
$ p = new WP_HTML_Processor ( '<p>1<b>2<i>3</b>4</i>5</p> ' );
1143
1187
$ p ->parse ();
1144
1188
0 commit comments