Skip to content

Commit 5fcf00d

Browse files
dmsnell and adamziel committed
Tag Processor: Add bookmark system for tracking semantic locations in document
It can be helpful to track a location in an HTML document while updates are being made to it such that we can instruct the Tag Processor to seek to the location of one of the bookmarks. In this patch we're introducing a bookmarks system to do just that. Bookmarks are referenced by name and handled internally by a tracking object which will follow all updates made to the document. It will be possible to rewind or jump around a document by setting a bookmark and then calling `seek( $bookmark_name )` to move there. Co-authored-by: Adam Zielinski <[email protected]> Co-authored-by: Dennis Snell <[email protected]>
1 parent 32ba7bd commit 5fcf00d

File tree

5 files changed

+684
-48
lines changed

5 files changed

+684
-48
lines changed
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
<?php
2+
/**
3+
* HTML Span: Represents a textual span inside an HTML document.
4+
*
5+
* @package WordPress
6+
* @subpackage HTML
7+
* @since 6.2.0
8+
*/
9+
10+
/**
11+
* Represents a textual span inside an HTML document.
12+
*
13+
* This is a two-tuple in disguise, used to avoid the memory
14+
* overhead involved in using an array for the same purpose.
15+
*
16+
* This class is for internal usage of the WP_HTML_Tag_Processor class.
17+
*
18+
* @access private
19+
* @since 6.2.0
20+
*
21+
* @see WP_HTML_Tag_Processor
22+
*/
23+
class WP_HTML_Span {
	/**
	 * Position, counted in bytes from the start of the document, where the span begins.
	 *
	 * @since 6.2.0
	 * @var int
	 */
	public $start;

	/**
	 * Position, counted in bytes from the start of the document, where the span ends.
	 *
	 * @since 6.2.0
	 * @var int
	 */
	public $end;

	/**
	 * Builds a span from its two byte offsets.
	 *
	 * Both values are stored exactly as given; nothing is validated here.
	 *
	 * @since 6.2.0
	 *
	 * @param int $start Byte offset into document where the span begins.
	 * @param int $end   Byte offset into document where the span ends.
	 */
	public function __construct( $start, $end ) {
		$this->end   = $end;
		$this->start = $start;
	}
}

lib/experimental/html/class-wp-html-tag-processor.php

Lines changed: 243 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,25 @@
180180
* @since 6.2.0
181181
*/
182182
class WP_HTML_Tag_Processor {
183+
/**
184+
* The maximum number of bookmarks allowed to exist at
185+
* any given time.
186+
*
187+
* @see set_bookmark()
188+
* @since 6.2.0
189+
* @var int
190+
*/
191+
const MAX_BOOKMARKS = 10;
192+
193+
/**
194+
* Maximum number of times seek() can be called.
195+
* Prevents accidental infinite loops.
196+
*
197+
* @see seek()
198+
* @since 6.2.0
199+
* @var int
200+
*/
201+
const MAX_SEEK_OPS = 1000;
183202

184203
/**
185204
* The HTML document to parse.
@@ -349,11 +368,11 @@ class WP_HTML_Tag_Processor {
349368
*
350369
* Example:
351370
* <code>
352-
* // Add the `WP-block-group` class, remove the `WP-group` class.
353-
* $class_changes = [
371+
* // Add the `wp-block-group` class, remove the `wp-group` class.
372+
* $classname_updates = [
354373
* // Indexed by a comparable class name
355-
* 'wp-block-group' => new WP_Class_Name_Operation( 'WP-block-group', WP_Class_Name_Operation::ADD ),
356-
* 'wp-group' => new WP_Class_Name_Operation( 'WP-group', WP_Class_Name_Operation::REMOVE )
374+
* 'wp-block-group' => WP_HTML_Tag_Processor::ADD_CLASS,
375+
* 'wp-group' => WP_HTML_Tag_Processor::REMOVE_CLASS
357376
* ];
358377
* </code>
359378
*
@@ -362,6 +381,15 @@ class WP_HTML_Tag_Processor {
362381
*/
363382
private $classname_updates = array();
364383

384+
/**
385+
* Tracks a semantic location in the original HTML which
386+
* shifts with updates as they are applied to the document.
387+
*
388+
* @since 6.2.0
389+
* @var WP_HTML_Span[]
390+
*/
391+
private $bookmarks = array();
392+
365393
const ADD_CLASS = true;
366394
const REMOVE_CLASS = false;
367395
const SKIP_CLASS = null;
@@ -396,6 +424,16 @@ class WP_HTML_Tag_Processor {
396424
*/
397425
private $attribute_updates = array();
398426

427+
/**
428+
* Tracks how many times we've performed a `seek()`
429+
* so that we can prevent accidental infinite loops.
430+
*
431+
* @see seek()
432+
* @since 6.2.0
433+
* @var int
434+
*/
435+
private $seek_count = 0;
436+
399437
/**
400438
* Constructor.
401439
*
@@ -479,6 +517,123 @@ public function next_tag( $query = null ) {
479517
return true;
480518
}
481519

520+
521+
/**
522+
* Sets a bookmark in the HTML document.
523+
*
524+
* Bookmarks represent specific places or tokens in the HTML
525+
* document, such as a tag opener or closer. When applying
526+
* edits to a document, such as setting an attribute, the
527+
* text offsets of that token may shift; the bookmark is
528+
* kept updated with those shifts and remains stable unless
529+
* the entire span of text in which the token sits is removed.
530+
*
531+
* Release bookmarks when they are no longer needed.
532+
*
533+
* Example:
534+
* ```
535+
* <main><h2>Surprising fact you may not know!</h2></main>
536+
* ^ ^
537+
* \-|-- this `H2` opener bookmark tracks the token
538+
*
539+
* <main class="clickbait"><h2>Surprising fact you may no…
540+
* ^ ^
541+
* \-|-- it shifts with edits
542+
* ```
543+
*
544+
* Bookmarks provide the ability to seek to a previously-scanned
545+
* place in the HTML document. This avoids the need to re-scan
546+
* the entire thing.
547+
*
548+
* Example:
549+
* ```
550+
* <ul><li>One</li><li>Two</li><li>Three</li></ul>
551+
* ^^^^
552+
* want to note this last item
553+
*
554+
* $p = new WP_HTML_Tag_Processor( $html );
555+
* $in_list = false;
556+
* while ( $p->next_tag( [ 'tag_closers' => $in_list ? 'visit' : 'skip' ] ) ) {
557+
* if ( 'UL' === $p->get_tag() ) {
558+
* if ( $p->is_tag_closer() ) {
559+
* $in_list = false;
560+
* $p->set_bookmark( 'resume' );
561+
* if ( $p->seek( 'last-li' ) ) {
562+
* $p->add_class( 'last-li' );
563+
* }
564+
* $p->seek( 'resume' );
565+
* $p->release_bookmark( 'last-li' );
566+
* $p->release_bookmark( 'resume' );
567+
* } else {
568+
* $in_list = true;
569+
* }
570+
* }
571+
*
572+
* if ( 'LI' === $p->get_tag() ) {
573+
* $p->set_bookmark( 'last-li' );
574+
* }
575+
* }
576+
* ```
577+
*
578+
* Because bookmarks maintain their position they don't
579+
* expose any internal offsets for the HTML document
580+
* and can't be used with normal string functions.
581+
*
582+
* Because bookmarks allocate memory and require processing
583+
* for every applied update they are limited and require
584+
* a name. They should not be created inside a loop.
585+
*
586+
* Bookmarks are a powerful tool to enable complicated behavior;
587+
* consider double-checking that you need this tool if you are
588+
* reaching for it, as inappropriate use could lead to broken
589+
* HTML structure or unwanted processing overhead.
590+
*
591+
* @param string $name Identifies this particular bookmark.
592+
* @return bool True if the bookmark was stored; false if no tag is currently matched or the bookmark limit was reached.
593+
* @throws Exception Throws when creating a new bookmark would exceed MAX_BOOKMARKS, if WP_DEBUG set.
594+
*/
595+
public function set_bookmark( $name ) {
	// Without a recorded tag name offset there is nothing to bookmark.
	if ( null === $this->tag_name_starts_at ) {
		return false;
	}

	// Re-using an existing name overwrites that bookmark; only new names count against the limit.
	$is_new_bookmark = ! array_key_exists( $name, $this->bookmarks );
	if ( $is_new_bookmark && count( $this->bookmarks ) >= self::MAX_BOOKMARKS ) {
		if ( defined( 'WP_DEBUG' ) && WP_DEBUG ) {
			// Report the real cause: the limit was hit while creating, not while seeking.
			throw new Exception( "Cannot set HTML bookmark {$name}: too many bookmarks already exist." );
		}
		return false;
	}

	// The span starts one byte before the tag name and runs to the recorded end of the tag.
	$this->bookmarks[ $name ] = new WP_HTML_Span(
		$this->tag_name_starts_at - 1,
		$this->tag_ends_at
	);

	return true;
}
614+
615+
616+
/**
617+
* Removes a bookmark if you no longer need to use it.
618+
*
619+
* Releasing a bookmark frees up the small performance
620+
* overhead they require, mainly in the form of compute
621+
* costs when modifying the document.
622+
*
623+
* @param string $name Name of the bookmark to remove.
624+
* @return bool True if a bookmark with that name existed and was removed; false otherwise.
625+
*/
626+
public function release_bookmark( $name ) {
	// Remember whether the name was tracked so the caller learns if anything was removed.
	$was_tracked = array_key_exists( $name, $this->bookmarks );

	if ( $was_tracked ) {
		unset( $this->bookmarks[ $name ] );
	}

	return $was_tracked;
}
635+
636+
482637
/**
483638
* Skips the contents of the title and textarea tags until an appropriate
484639
* tag closer is found.
@@ -1104,9 +1259,77 @@ private function apply_attributes_updates() {
11041259
$this->updated_bytes = $diff->end;
11051260
}
11061261

1262+
foreach ( $this->bookmarks as $bookmark ) {
1263+
/**
1264+
* As we loop through $this->attribute_updates, we keep comparing
1265+
* $bookmark->start and $bookmark->end to $diff->start. We can't
1266+
* change it and still expect the correct result, so let's accumulate
1267+
* the deltas separately and apply them all at once after the loop.
1268+
*/
1269+
$head_delta = 0;
1270+
$tail_delta = 0;
1271+
1272+
foreach ( $this->attribute_updates as $diff ) {
1273+
$update_head = $bookmark->start >= $diff->start;
1274+
$update_tail = $bookmark->end >= $diff->start;
1275+
1276+
if ( ! $update_head && ! $update_tail ) {
1277+
break;
1278+
}
1279+
1280+
$delta = strlen( $diff->text ) - ( $diff->end - $diff->start );
1281+
1282+
if ( $update_head ) {
1283+
$head_delta += $delta;
1284+
}
1285+
1286+
if ( $update_tail ) {
1287+
$tail_delta += $delta;
1288+
}
1289+
}
1290+
1291+
$bookmark->start += $head_delta;
1292+
$bookmark->end += $tail_delta;
1293+
}
1294+
11071295
$this->attribute_updates = array();
11081296
}
11091297

1298+
/**
1299+
* Move the current pointer in the Tag Processor to a given bookmark's location.
1300+
*
1301+
* In order to prevent accidental infinite loops, there's a
1302+
* maximum limit on the number of times seek() can be called.
1303+
*
1304+
* @param string $bookmark_name Jump to the place in the document identified by this bookmark name.
1305+
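* @return bool False if the bookmark is unknown or the seek limit was exceeded; otherwise the result of next_tag() from the bookmark's location.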
1306+
* @throws Exception Throws on invalid bookmark name, or when seek() has been called more than MAX_SEEK_OPS times, if WP_DEBUG set.
1307+
*/
1308+
public function seek( $bookmark_name ) {
	// Reject names that were never bookmarked (or were already released).
	$is_known_bookmark = array_key_exists( $bookmark_name, $this->bookmarks );
	if ( ! $is_known_bookmark ) {
		if ( defined( 'WP_DEBUG' ) && WP_DEBUG ) {
			throw new Exception( 'Invalid bookmark name' );
		}
		return false;
	}

	// Only seeks on valid bookmarks count toward the limit.
	$this->seek_count += 1;
	if ( $this->seek_count > self::MAX_SEEK_OPS ) {
		if ( defined( 'WP_DEBUG' ) && WP_DEBUG ) {
			throw new Exception( 'Too many calls to seek() - this can lead to performance issues.' );
		}
		return false;
	}

	// Apply pending edits first; the bookmark offset must be read only after this call.
	$this->get_updated_html();

	// Rewind both cursors to the bookmark's start, then re-scan the tag from there.
	$target_offset       = $this->bookmarks[ $bookmark_name ]->start;
	$this->parsed_bytes  = $target_offset;
	$this->updated_bytes = $target_offset;
	$this->updated_html  = substr( $this->html, 0, $target_offset );

	return $this->next_tag();
}
1332+
11101333
/**
11111334
* Sort function to arrange objects with a start property in ascending order.
11121335
*
@@ -1411,47 +1634,31 @@ public function __toString() {
14111634
* @return string The processed HTML.
14121635
*/
14131636
public function get_updated_html() {
1414-
// Short-circuit if there are no updates to apply.
1637+
// Short-circuit if there are no new updates to apply.
14151638
if ( ! count( $this->classname_updates ) && ! count( $this->attribute_updates ) ) {
14161639
return $this->updated_html . substr( $this->html, $this->updated_bytes );
14171640
}
14181641

1419-
/*
1420-
* Parsing is in progress – let's apply the attribute updates without moving on to the next tag.
1421-
*
1422-
* In practice:
1423-
* 1. Apply the attributes updates to the original HTML
1424-
* 2. Replace the original HTML with the updated HTML
1425-
* 3. Point this tag processor to the current tag name's end in that updated HTML
1426-
*/
1427-
1428-
// Find tag name's end in the updated markup.
1429-
$markup_updated_up_to_a_tag_name_end = $this->updated_html . substr( $this->html, $this->updated_bytes, $this->tag_name_starts_at + $this->tag_name_length - $this->updated_bytes );
1430-
$updated_tag_name_ends_at = strlen( $markup_updated_up_to_a_tag_name_end );
1431-
$updated_tag_name_starts_at = $updated_tag_name_ends_at - $this->tag_name_length;
1642+
// Otherwise: apply the updates, rewind before the current tag, and parse it again.
1643+
$delta_between_updated_html_end_and_current_tag_end = substr(
1644+
$this->html,
1645+
$this->updated_bytes,
1646+
$this->tag_name_starts_at + $this->tag_name_length - $this->updated_bytes
1647+
);
1648+
$updated_html_up_to_current_tag_name_end = $this->updated_html . $delta_between_updated_html_end_and_current_tag_end;
14321649

1433-
// Apply attributes updates.
1434-
$this->updated_html = $markup_updated_up_to_a_tag_name_end;
1435-
$this->updated_bytes = $this->tag_name_starts_at + $this->tag_name_length;
1650+
// 1. Apply the attributes updates to the original HTML
14361651
$this->class_name_updates_to_attributes_updates();
14371652
$this->apply_attributes_updates();
14381653

1439-
// Replace $this->html with the updated markup.
1440-
$this->html = $this->updated_html . substr( $this->html, $this->updated_bytes );
1654+
// 2. Replace the original HTML with the updated HTML
1655+
$this->html = $this->updated_html . substr( $this->html, $this->updated_bytes );
1656+
$this->updated_html = $updated_html_up_to_current_tag_name_end;
1657+
$this->updated_bytes = strlen( $this->updated_html );
14411658

1442-
// Rewind this processor to the tag name's end.
1443-
$this->tag_name_starts_at = $updated_tag_name_starts_at;
1444-
$this->parsed_bytes = $updated_tag_name_ends_at;
1445-
1446-
// Restore the previous version of the updated_html as we are not finished with the current_tag yet.
1447-
$this->updated_html = $markup_updated_up_to_a_tag_name_end;
1448-
$this->updated_bytes = $updated_tag_name_ends_at;
1449-
1450-
// Parse the attributes in the updated markup.
1451-
$this->attributes = array();
1452-
while ( $this->parse_next_attribute() ) {
1453-
continue;
1454-
}
1659+
// 3. Point this tag processor at the original tag opener and consume it
1660+
$this->parsed_bytes = strlen( $updated_html_up_to_current_tag_name_end ) - $this->tag_name_length - 2;
1661+
$this->next_tag();
14551662

14561663
return $this->html;
14571664
}

0 commit comments

Comments
 (0)