180
180
* @since 6.2.0
181
181
*/
182
182
class WP_HTML_Tag_Processor {
183
+ /**
184
+ * The maximum number of bookmarks allowed to exist at
185
+ * any given time.
186
+ *
187
+ * @see set_bookmark()
188
+ * @since 6.2.0
189
+ * @var int
190
+ */
191
+ const MAX_BOOKMARKS = 10 ;
192
+
193
+ /**
194
+ * Maximum number of times seek() can be called.
195
+ * Prevents accidental infinite loops.
196
+ *
197
+ * @see seek()
198
+ * @since 6.2.0
199
+ * @var int
200
+ */
201
+ const MAX_SEEK_OPS = 1000 ;
183
202
184
203
/**
185
204
* The HTML document to parse.
@@ -349,11 +368,11 @@ class WP_HTML_Tag_Processor {
349
368
*
350
369
* Example:
351
370
* <code>
352
- * // Add the `WP -block-group` class, remove the `WP -group` class.
353
- * $class_changes = [
371
+ * // Add the `wp-block-group` class, remove the `wp-group` class.
372
+ * $classname_updates = [
354
373
* // Indexed by a comparable class name
355
- * 'wp-block-group' => new WP_Class_Name_Operation( 'WP-block-group', WP_Class_Name_Operation::ADD ) ,
356
- * 'wp-group' => new WP_Class_Name_Operation( 'WP-group', WP_Class_Name_Operation::REMOVE )
374
+ * 'wp-block-group' => WP_HTML_Tag_Processor::ADD_CLASS ,
375
+ * 'wp-group' => WP_HTML_Tag_Processor::REMOVE_CLASS
357
376
* ];
358
377
* </code>
359
378
*
@@ -362,6 +381,15 @@ class WP_HTML_Tag_Processor {
362
381
*/
363
382
private $ classname_updates = array ();
364
383
384
+ /**
385
+ * Tracks a semantic location in the original HTML which
386
+ * shifts with updates as they are applied to the document.
387
+ *
388
+ * @since 6.2.0
389
+ * @var WP_HTML_Span[]
390
+ */
391
+ private $ bookmarks = array ();
392
+
365
393
const ADD_CLASS = true ;
366
394
const REMOVE_CLASS = false ;
367
395
const SKIP_CLASS = null ;
@@ -396,6 +424,16 @@ class WP_HTML_Tag_Processor {
396
424
*/
397
425
private $ attribute_updates = array ();
398
426
427
+ /**
428
+ * Tracks how many times we've performed a `seek()`
429
+ * so that we can prevent accidental infinite loops.
430
+ *
431
+ * @see seek()
432
+ * @since 6.2.0
433
+ * @var int
434
+ */
435
+ private $ seek_count = 0 ;
436
+
399
437
/**
400
438
* Constructor.
401
439
*
@@ -479,6 +517,123 @@ public function next_tag( $query = null ) {
479
517
return true ;
480
518
}
481
519
520
+
521
+ /**
522
+ * Sets a bookmark in the HTML document.
523
+ *
524
+ * Bookmarks represent specific places or tokens in the HTML
525
+ * document, such as a tag opener or closer. When applying
526
+ * edits to a document, such as setting an attribute, the
527
+ * text offsets of that token may shift; the bookmark is
528
+ * kept updated with those shifts and remains stable unless
529
+ * the entire span of text in which the token sits is removed.
530
+ *
531
+ * Release bookmarks when they are no longer needed.
532
+ *
533
+ * Example:
534
+ * ```
535
+ * <main><h2>Surprising fact you may not know!</h2></main>
536
+ * ^ ^
537
+ * \-|-- this `H2` opener bookmark tracks the token
538
+ *
539
+ * <main class="clickbait"><h2>Surprising fact you may no…
540
+ * ^ ^
541
+ * \-|-- it shifts with edits
542
+ * ```
543
+ *
544
+ * Bookmarks provide the ability to seek to a previously-scanned
545
+ * place in the HTML document. This avoids the need to re-scan
546
+ * the entire thing.
547
+ *
548
+ * Example:
549
+ * ```
550
+ * <ul><li>One</li><li>Two</li><li>Three</li></ul>
551
+ * ^^^^
552
+ * want to note this last item
553
+ *
554
+ * $p = new WP_HTML_Tag_Processor( $html );
555
+ * $in_list = false;
556
+ * while ( $p->next_tag( [ 'tag_closers' => $in_list ? 'visit' : 'skip' ] ) ) {
557
+ * if ( 'UL' === $p->get_tag() ) {
558
+ * if ( $p->is_tag_closer() ) {
559
+ * $in_list = false;
560
+ * $p->set_bookmark( 'resume' );
561
+ * if ( $p->seek( 'last-li' ) ) {
562
+ * $p->add_class( 'last-li' );
563
+ * }
564
+ * $p->seek( 'resume' );
565
+ * $p->release_bookmark( 'last-li' );
566
+ * $p->release_bookmark( 'resume' );
567
+ * } else {
568
+ * $in_list = true;
569
+ * }
570
+ * }
571
+ *
572
+ * if ( 'LI' === $p->get_tag() ) {
573
+ * $p->set_bookmark( 'last-li' );
574
+ * }
575
+ * }
576
+ * ```
577
+ *
578
+ * Because bookmarks maintain their position they don't
579
+ * expose any internal offsets for the HTML document
580
+ * and can't be used with normal string functions.
581
+ *
582
+ * Because bookmarks allocate memory and require processing
583
+ * for every applied update they are limited and require
584
+ * a name. They should not be created inside a loop.
585
+ *
586
+ * Bookmarks are a powerful tool to enable complicated behavior;
587
+ * consider double-checking that you need this tool if you are
588
+ * reaching for it, as inappropriate use could lead to broken
589
+ * HTML structure or unwanted processing overhead.
590
+ *
591
+ * @param string $name Identifies this particular bookmark.
592
+ * @return bool Whether the bookmark was successfully created.
593
+ * @throws Exception Throws when the maximum number of bookmarks is exceeded if WP_DEBUG set.
594
+ */
595
public function set_bookmark( $name ) {
	// A bookmark always refers to a matched tag; without one there is nothing to mark.
	if ( null === $this->tag_name_starts_at ) {
		return false;
	}

	/*
	 * Re-using an existing name overwrites that bookmark and is always allowed;
	 * only a brand-new name counts against the MAX_BOOKMARKS limit.
	 */
	if ( ! array_key_exists( $name, $this->bookmarks ) && count( $this->bookmarks ) >= self::MAX_BOOKMARKS ) {
		if ( defined( 'WP_DEBUG' ) && WP_DEBUG ) {
			throw new Exception( "Too many HTML bookmarks: cannot create bookmark {$name}." );
		}
		return false;
	}

	/*
	 * The span starts one byte before the tag name and runs to the end of the tag.
	 *
	 * NOTE(review): one byte before the name is the `<` of a tag opener; for a
	 * tag closer (`</…`) that byte would presumably be the `/` — confirm that
	 * bookmarks on closers resolve to the intended position when sought.
	 */
	$this->bookmarks[ $name ] = new WP_HTML_Span(
		$this->tag_name_starts_at - 1,
		$this->tag_ends_at
	);

	return true;
}
614
+
615
+
616
+ /**
617
+ * Removes a bookmark if you no longer need to use it.
618
+ *
619
+ * Releasing a bookmark frees up the small performance
620
+ * overhead they require, mainly in the form of compute
621
+ * costs when modifying the document.
622
+ *
623
+ * @param string $name Name of the bookmark to remove.
624
+ * @return bool
625
+ */
626
public function release_bookmark( $name ) {
	// Report whether the bookmark was known; only a known bookmark is dropped.
	$is_known = array_key_exists( $name, $this->bookmarks );

	if ( $is_known ) {
		unset( $this->bookmarks[ $name ] );
	}

	return $is_known;
}
635
+
636
+
482
637
/**
483
638
* Skips the contents of the title and textarea tags until an appropriate
484
639
* tag closer is found.
@@ -1104,9 +1259,77 @@ private function apply_attributes_updates() {
1104
1259
$ this ->updated_bytes = $ diff ->end ;
1105
1260
}
1106
1261
1262
+ foreach ( $ this ->bookmarks as $ bookmark ) {
1263
+ /**
1264
+ * As we loop through $this->attribute_updates, we keep comparing
1265
+ * $bookmark->start and $bookmark->end to $diff->start. We can't
1266
+ * change it and still expect the correct result, so let's accumulate
1267
+ * the deltas separately and apply them all at once after the loop.
1268
+ */
1269
+ $ head_delta = 0 ;
1270
+ $ tail_delta = 0 ;
1271
+
1272
+ foreach ( $ this ->attribute_updates as $ diff ) {
1273
+ $ update_head = $ bookmark ->start >= $ diff ->start ;
1274
+ $ update_tail = $ bookmark ->end >= $ diff ->start ;
1275
+
1276
+ if ( ! $ update_head && ! $ update_tail ) {
1277
+ break ;
1278
+ }
1279
+
1280
+ $ delta = strlen ( $ diff ->text ) - ( $ diff ->end - $ diff ->start );
1281
+
1282
+ if ( $ update_head ) {
1283
+ $ head_delta += $ delta ;
1284
+ }
1285
+
1286
+ if ( $ update_tail ) {
1287
+ $ tail_delta += $ delta ;
1288
+ }
1289
+ }
1290
+
1291
+ $ bookmark ->start += $ head_delta ;
1292
+ $ bookmark ->end += $ tail_delta ;
1293
+ }
1294
+
1107
1295
$ this ->attribute_updates = array ();
1108
1296
}
1109
1297
1298
+ /**
1299
+ * Move the current pointer in the Tag Processor to a given bookmark's location.
1300
+ *
1301
+ * In order to prevent accidental infinite loops, there's a
1302
+ * maximum limit on the number of times seek() can be called.
1303
+ *
1304
+ * @param string $bookmark_name Jump to the place in the document identified by this bookmark name.
1305
+ * @return bool
1306
+ * @throws Exception Throws on invalid bookmark name if WP_DEBUG set.
1307
+ */
1308
public function seek( $bookmark_name ) {
	// Unknown bookmark: fail loudly in debug mode, quietly otherwise.
	if ( ! array_key_exists( $bookmark_name, $this->bookmarks ) ) {
		if ( defined( 'WP_DEBUG' ) && WP_DEBUG ) {
			throw new Exception( 'Invalid bookmark name' );
		}
		return false;
	}

	// Every attempt on a known bookmark counts towards the limit, including the one that exceeds it.
	if ( ++$this->seek_count > self::MAX_SEEK_OPS ) {
		if ( defined( 'WP_DEBUG' ) && WP_DEBUG ) {
			throw new Exception( 'Too many calls to seek() - this can lead to performance issues.' );
		}
		return false;
	}

	// Flush out any pending updates to the document.
	$this->get_updated_html();

	/*
	 * Point this tag processor before the sought tag opener and consume it.
	 * The output buffer is reset to the document prefix that ends at the bookmark
	 * so that parsing and output stay aligned.
	 */
	$this->parsed_bytes  = $this->bookmarks[ $bookmark_name ]->start;
	$this->updated_bytes = $this->parsed_bytes;
	$this->updated_html  = substr( $this->html, 0, $this->updated_bytes );

	return $this->next_tag();
}
1332
+
1110
1333
/**
1111
1334
* Sort function to arrange objects with a start property in ascending order.
1112
1335
*
@@ -1411,47 +1634,31 @@ public function __toString() {
1411
1634
* @return string The processed HTML.
1412
1635
*/
1413
1636
public function get_updated_html() {
	// Short-circuit if there are no new updates to apply.
	if ( ! count( $this->classname_updates ) && ! count( $this->attribute_updates ) ) {
		return $this->updated_html . substr( $this->html, $this->updated_bytes );
	}

	/*
	 * Otherwise: apply the updates, rewind before the current tag, and parse it again.
	 *
	 * Capture the output produced so far plus the untouched input up to the end
	 * of the current tag name *before* applying the updates, because applying
	 * them advances $this->updated_html and $this->updated_bytes.
	 */
	$delta_between_updated_html_end_and_current_tag_end = substr(
		$this->html,
		$this->updated_bytes,
		$this->tag_name_starts_at + $this->tag_name_length - $this->updated_bytes
	);
	$updated_html_up_to_current_tag_name_end            = $this->updated_html . $delta_between_updated_html_end_and_current_tag_end;

	// 1. Apply the attributes updates to the original HTML.
	$this->class_name_updates_to_attributes_updates();
	$this->apply_attributes_updates();

	// 2. Replace the original HTML with the updated HTML.
	$this->html          = $this->updated_html . substr( $this->html, $this->updated_bytes );
	$this->updated_html  = $updated_html_up_to_current_tag_name_end;
	$this->updated_bytes = strlen( $this->updated_html );

	/*
	 * 3. Point this tag processor at the original tag opener and consume it.
	 *
	 * Back up from the end of the tag name by the name's length plus two bytes.
	 * When the current tag starts at the very beginning of the document that
	 * arithmetic goes negative, and PHP string functions interpret a negative
	 * offset as counting from the end of the string, so clamp it at zero.
	 *
	 * NOTE(review): presumably next_tag() scans forward from parsed_bytes for
	 * the next `<` — confirm against its implementation.
	 */
	$this->parsed_bytes = max(
		0,
		strlen( $updated_html_up_to_current_tag_name_end ) - $this->tag_name_length - 2
	);
	$this->next_tag();

	return $this->html;
}
0 commit comments