Skip to content

Commit 37659fb

Browse files
committed
MVP parser capable of parsing the entire HTML spec
1 parent 26c6f21 commit 37659fb

File tree

1 file changed

+84
-40
lines changed

1 file changed

+84
-40
lines changed

src/wp-includes/html-api/class-wp-html-text-processor.php

Lines changed: 84 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -19,15 +19,13 @@ function dbg( $message, $indent = 0 ) {
1919
}
2020
}
2121

22+
// A token is an object (rather than a plain tag-name string) because sometimes its identity matters
2223
class WP_HTML_Tag_Token {
2324

2425
public $tag;
2526

26-
public $bookmark;
27-
28-
public function __construct( $tag, $bookmark = null ) {
27+
public function __construct( $tag ) {
2928
$this->tag = $tag;
30-
$this->bookmark = $bookmark;
3129
}
3230

3331
}
@@ -65,22 +63,32 @@ public function __construct( $html ) {
6563

6664
public function parse() {
6765
echo("HTML before main loop:\n");
68-
echo($this->html);
66+
// echo($this->html);
6967
echo("\n");
68+
$i = 0;
7069
while ($this->next_element_node()) {
7170
// ... twiddle thumbs ...
71+
if(++$i % 10000 === 0)
72+
{
73+
echo $this->get_tag()." oe: " . count($this->open_elements) . " ";
74+
echo "afe: " . count($this->active_formatting_elements) . " \n";
75+
echo "Peak mem:" . round(memory_get_peak_usage(true) / 1024 / 1024, 2) . "MB\n";
76+
// print_r($this->open_elements);
77+
// die();
78+
}
7279
}
7380
while ( count($this->open_elements) > 1 ) {
7481
$this->pop_open_element();
7582
}
7683

7784
echo("\n");
7885
echo("\$this->HTML after main loop:\n");
79-
echo($this->get_updated_html().'');
86+
// echo($this->get_updated_html().'');
8087
echo "\n\n";
8188

8289
echo "Mem peak usage:" . (memory_get_peak_usage(true) / 1024 / 1024) . "MB\n";
8390
echo("\n---------------\n\n");
91+
return $this->get_updated_html();
8492
}
8593

8694
public function next_element_node() {
@@ -90,6 +98,9 @@ public function next_element_node() {
9098
if ( ! $this->is_tag_closer() ) {
9199
dbg( "Found {$this->current_token->tag} tag opener" );
92100
switch ( $this->current_token->tag ) {
101+
case 'HTML':
102+
$this->drop_current_tag_token();
103+
break;
93104
case 'ADDRESS':
94105
case 'ARTICLE':
95106
case 'ASIDE':
@@ -268,16 +279,23 @@ public function next_element_node() {
268279
case 'TABLE':
269280
$this->insert_element( $this->current_token );
270281
break;
282+
283+
// Void elements.
284+
// Some require reconstructing the active formatting elements.
271285
case 'AREA':
272286
case 'BR':
273287
case 'EMBED':
274288
case 'IMG':
275289
case 'KEYGEN':
276290
case 'WBR':
277291
$this->reconstruct_active_formatting_elements();
278-
$this->insert_element( $this->current_token );
279-
$this->pop_open_element( false );
280-
break;
292+
// But others don't.
293+
case 'META':
294+
case 'LINK':
295+
case 'BASE':
296+
case 'COL':
297+
case 'FRAME':
298+
case 'INPUT':
281299
case 'PARAM':
282300
case 'SOURCE':
283301
case 'TRACK':
@@ -450,6 +468,22 @@ public function next_element_node() {
450468
$this->pop_until_tag( $this->current_token->tag, false );
451469
$this->clear_active_formatting_elements_up_to_last_marker();
452470
break;
471+
472+
/*
473+
* @divergence from spec:
474+
* Close all the open tags when a table-related
475+
* tag closer is encountered
476+
*/
477+
case 'TBODY':
478+
case 'TFOOT':
479+
case 'THEAD':
480+
case 'TD':
481+
case 'TH':
482+
case 'TR':
483+
case 'TABLE':
484+
$this->pop_until_tag( $this->current_token->tag, false );
485+
break;
486+
453487
case 'BR':
454488
// A BR tag closer should never be seen here since Tag_Processor corrects it first
455489
default:
@@ -462,20 +496,33 @@ public function next_element_node() {
462496

463497
private function next_tag_token() {
464498
$tag_token = null;
499+
$bookmark = null;
465500
$text_start = $this->tag_ends_at + 1;
466-
if ($this->next_tag(array('tag_closers' => 'visit'))) {
467-
// @TODO don't create a bookmark for every single tag
468-
$bookmark = '__internal_' . ( $this->element_bookmark_idx++ );
469-
$this->set_bookmark($bookmark);
470-
$tag_token = new WP_HTML_Tag_Token(
471-
$this->get_tag(),
472-
$bookmark
473-
);
474-
$text_end = $this->bookmarks[$bookmark]->start;
475-
} else {
476-
$text_end = strlen($this->html);
501+
if (!$this->next_tag(array('tag_closers' => 'visit'))) {
502+
$this->process_text($text_start, strlen($this->html));
503+
$this->current_token = null;
504+
$this->current_token_start = strlen($this->html);
505+
$this->current_token_end = strlen($this->html);
506+
return false;
477507
}
478508

509+
// @TODO don't create a bookmark for every single tag
510+
$bookmark = '__internal_' . ( $this->element_bookmark_idx++ );
511+
$this->set_bookmark($bookmark);
512+
$tag_token = new WP_HTML_Tag_Token($this->get_tag());
513+
$text_end = $this->bookmarks[$bookmark]->start;
514+
515+
$this->process_text($text_start, $text_end);
516+
517+
$this->current_token = $tag_token;
518+
$this->current_token_start = $this->bookmarks[$bookmark]->start;
519+
$this->current_token_end = $this->bookmarks[$bookmark]->end;
520+
$this->release_bookmark($bookmark);
521+
522+
return true;
523+
}
524+
525+
private function process_text($text_start, $text_end) {
479526
if ($text_start < $text_end) {
480527
$this->current_token = substr($this->html, $text_start, $text_end - $text_start);
481528
$this->current_token_start = $text_start;
@@ -484,18 +531,6 @@ private function next_tag_token() {
484531
dbg( "Appending text to reconstructed HTML", 1 );
485532
$this->reconstruct_active_formatting_elements();
486533
}
487-
488-
if ( ! $tag_token ) {
489-
$this->current_token = null;
490-
$this->current_token_start = strlen($this->html);
491-
$this->current_token_end = strlen($this->html);
492-
return false;
493-
}
494-
495-
$this->current_token = $tag_token;
496-
$this->current_token_start = $this->bookmarks[$tag_token->bookmark]->start;
497-
$this->current_token_end = $this->bookmarks[$tag_token->bookmark]->end;
498-
return true;
499534
}
500535

501536
private function process_any_other_end_tag( WP_HTML_Tag_Token $token ) {
@@ -745,7 +780,7 @@ private function close_p_element($insert_p_tag_closer = true) {
745780
)
746781
);
747782
// If the current node is not a p element, then this is a parse error.
748-
if ( $this->get_tag() !== 'P' ) {
783+
if ( $this->current_node()->tag !== 'P' ) {
749784
$this->parse_error();
750785
}
751786
$this->pop_until_tag( 'P', false );
@@ -773,7 +808,7 @@ private function should_generate_implied_end_tags( $options = null ) {
773808
return true;
774809
}
775810

776-
$thoroughly = null !== $options && isset( $options['thoroughly'] ) && $options['thoroughly'];
811+
$thoroughly = true; //null !== $options && isset( $options['thoroughly'] ) && $options['thoroughly'];
777812
if ( $thoroughly ) {
778813
switch ( $current_tag_name ) {
779814
case 'TBODY':
@@ -1128,17 +1163,26 @@ private static function is_formatting_element( $tag_name ) {
11281163

11291164
}
11301165

1131-
// $dir = realpath( __DIR__ . '/../../../index.html' );
1166+
$dir = realpath( __DIR__ . '/../../../index.html' );
11321167

1133-
// $htmlspec = file_get_contents( $dir );
1134-
// $p = new WP_HTML_Processor( $htmlspec );
1135-
// $p->parse();
1168+
$htmlspec = file_get_contents( $dir );
1169+
$p = new WP_HTML_Processor( $htmlspec );
1170+
$p->parse();
1171+
1172+
die();
11361173

1174+
// $p = new WP_HTML_Processor( '<dd><dt>' );
1175+
// $p->parse();
11371176
// die();
1177+
// $p = new WP_HTML_Processor( '<p>1<title>HTML Standard</title><meta content=#3c790a name=theme-color>3<b>4</b>5</p>' );
1178+
// $p->parse();
1179+
$p = new WP_HTML_Processor( '<p>1<table><tbody><tr><td>HTML</td><td>Standard</table></p><div>test</div>' );
1180+
echo $p->parse();
1181+
die();
11381182

1139-
$p = new WP_HTML_Processor( '<dd><dt>' );
1183+
$p = new WP_HTML_Processor( '<p>1<script>HTML Standard</script>3<b>4</b>5</p>' );
11401184
$p->parse();
1141-
die();
1185+
11421186
$p = new WP_HTML_Processor( '<p>1<b>2<i>3</b>4</i>5</p>' );
11431187
$p->parse();
11441188

0 commit comments

Comments
 (0)