Skip to content

Commit dabde02

Browse files
committed
HTML API: Defer applying attribute updates until necessary.
When making repeated updates to a document, the Tag Processor will end up copying the entire document once for every update. This can lead to catastrophic behavior in the worst case. However, when batch-applying updates it's able to copy chunks of the document in one thread and only end up copying the entire document once for the entire batch. Previously the Tag Processor has been eagerly applying updates, but in this patch it defers applying those updates as long as possible. Developed in #6120. Discussed in https://core.trac.wordpress.org/ticket/60697. Props: dmsnell, bernhard-reiter, jonsurrell, westonruter. Fixes #60697. Follow-up to [55706], [56941], [57348]. git-svn-id: https://develop.svn.wordpress.org/trunk@57805 602fd350-edb4-49c9-b593-d223f7449a82
1 parent e3a5206 commit dabde02

3 files changed

Lines changed: 116 additions & 12 deletions

File tree

src/wp-includes/html-api/class-wp-html-tag-processor.php

Lines changed: 61 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -837,8 +837,27 @@ public function next_tag( $query = null ) {
837837
* @return bool Whether a token was parsed.
838838
*/
839839
public function next_token() {
840+
return $this->base_class_next_token();
841+
}
842+
843+
/**
844+
* Internal method which finds the next token in the HTML document.
845+
*
846+
* This method is a protected internal function which implements the logic for
847+
* finding the next token in a document. It exists so that the parser can update
848+
* its state without affecting the location of the cursor in the document and
849+
* without triggering subclass methods for things like `next_token()`, e.g. when
850+
* applying patches before searching for the next token.
851+
*
852+
* @since 6.5.0
853+
*
854+
* @access private
855+
*
856+
* @return bool Whether a token was parsed.
857+
*/
858+
private function base_class_next_token() {
840859
$was_at = $this->bytes_already_parsed;
841-
$this->get_updated_html();
860+
$this->after_tag();
842861

843862
// Don't proceed if there's nothing more to scan.
844863
if (
@@ -2041,6 +2060,45 @@ private function skip_whitespace() {
20412060
* @since 6.2.0
20422061
*/
20432062
private function after_tag() {
2063+
/*
2064+
* There could be lexical updates enqueued for an attribute that
2065+
* also exists on the next tag. In order to avoid conflating the
2066+
* attributes across the two tags, lexical updates with names
2067+
* need to be flushed to raw lexical updates.
2068+
*/
2069+
$this->class_name_updates_to_attributes_updates();
2070+
2071+
/*
2072+
* Purge updates if there are too many. The actual count isn't
2073+
* scientific, but a few values from 100 to a few thousand were
2074+
* tested to find a practically-useful limit.
2075+
*
2076+
* If the update queue grows too big, then the Tag Processor
2077+
* will spend more time iterating through them and lose the
2078+
* efficiency gains of deferring applying them.
2079+
*/
2080+
if ( 1000 < count( $this->lexical_updates ) ) {
2081+
$this->get_updated_html();
2082+
}
2083+
2084+
foreach ( $this->lexical_updates as $name => $update ) {
2085+
/*
2086+
* Any updates appearing after the cursor should be applied
2087+
* before proceeding, otherwise they may be overlooked.
2088+
*/
2089+
if ( $update->start >= $this->bytes_already_parsed ) {
2090+
$this->get_updated_html();
2091+
break;
2092+
}
2093+
2094+
if ( is_int( $name ) ) {
2095+
continue;
2096+
}
2097+
2098+
$this->lexical_updates[] = $update;
2099+
unset( $this->lexical_updates[ $name ] );
2100+
}
2101+
20442102
$this->token_starts_at = null;
20452103
$this->token_length = null;
20462104
$this->tag_name_starts_at = null;
@@ -2230,7 +2288,7 @@ private function apply_attributes_updates( $shift_this_point = 0 ) {
22302288
$shift = strlen( $diff->text ) - $diff->length;
22312289

22322290
// Adjust the cursor position by however much an update affects it.
2233-
if ( $diff->start <= $this->bytes_already_parsed ) {
2291+
if ( $diff->start < $this->bytes_already_parsed ) {
22342292
$this->bytes_already_parsed += $shift;
22352293
}
22362294

@@ -3164,15 +3222,7 @@ public function get_updated_html() {
31643222
* └←─┘ back up by strlen("em") + 1 ==> 3
31653223
*/
31663224
$this->bytes_already_parsed = $before_current_tag;
3167-
$this->parse_next_tag();
3168-
// Reparse the attributes.
3169-
while ( $this->parse_next_attribute() ) {
3170-
continue;
3171-
}
3172-
3173-
$tag_ends_at = strpos( $this->html, '>', $this->bytes_already_parsed );
3174-
$this->token_length = $tag_ends_at - $this->token_starts_at;
3175-
$this->bytes_already_parsed = $tag_ends_at;
3225+
$this->base_class_next_token();
31763226

31773227
return $this->html;
31783228
}

tests/phpunit/tests/html-api/wpHtmlTagProcessor-bookmark.php

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -293,21 +293,30 @@ public function test_bookmarks_complex_use_case() {
293293

294294
/**
295295
* @ticket 56299
296+
* @ticket 60697
296297
*
297298
* @covers WP_HTML_Tag_Processor::seek
298299
*/
299300
public function test_updates_bookmark_for_additions_after_both_sides() {
300301
$processor = new WP_HTML_Tag_Processor( '<div>First</div><div>Second</div>' );
301302
$processor->next_tag();
303+
$processor->set_attribute( 'id', 'one' );
302304
$processor->set_bookmark( 'first' );
303305
$processor->next_tag();
306+
$processor->set_attribute( 'id', 'two' );
304307
$processor->add_class( 'second' );
305308

306309
$processor->seek( 'first' );
307310
$processor->add_class( 'first' );
308311

309312
$this->assertSame(
310-
'<div class="first">First</div><div class="second">Second</div>',
313+
'one',
314+
$processor->get_attribute( 'id' ),
315+
'Should have remembered attribute change from before the seek.'
316+
);
317+
318+
$this->assertSame(
319+
'<div class="first" id="one">First</div><div class="second" id="two">Second</div>',
311320
$processor->get_updated_html(),
312321
'The bookmark was updated incorrectly in response to HTML markup updates'
313322
);

tests/phpunit/tests/html-api/wpHtmlTagProcessor.php

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2727,4 +2727,49 @@ public function test_single_text_node_with_taglike_text() {
27272727
$this->assertSame( '#text', $processor->get_token_type(), 'Did not find text node.' );
27282728
$this->assertSame( 'test< /A>', $processor->get_modifiable_text(), 'Did not find complete text node.' );
27292729
}
2730+
2731+
/**
2732+
* Ensures that updates which are enqueued in front of the cursor
2733+
* are applied before moving forward in the document.
2734+
*
2735+
* @ticket 60697
2736+
*/
2737+
public function test_applies_updates_before_proceeding() {
2738+
$html = '<div><img></div><div><img></div>';
2739+
2740+
$subclass = new class( $html ) extends WP_HTML_Tag_Processor {
2741+
/**
2742+
* Inserts raw text after the current token.
2743+
*
2744+
* @param string $new_html Raw text to insert.
2745+
*/
2746+
public function insert_after( $new_html ) {
2747+
$this->set_bookmark( 'here' );
2748+
$this->lexical_updates[] = new WP_HTML_Text_Replacement(
2749+
$this->bookmarks['here']->start + $this->bookmarks['here']->length + 1,
2750+
0,
2751+
$new_html
2752+
);
2753+
}
2754+
};
2755+
2756+
$subclass->next_tag( 'img' );
2757+
$subclass->insert_after( '<p>snow-capped</p>' );
2758+
2759+
$subclass->next_tag();
2760+
$this->assertSame(
2761+
'P',
2762+
$subclass->get_tag(),
2763+
'Should have matched inserted HTML as next tag.'
2764+
);
2765+
2766+
$subclass->next_tag( 'img' );
2767+
$subclass->set_attribute( 'alt', 'mountain' );
2768+
2769+
$this->assertSame(
2770+
'<div><img><p>snow-capped</p></div><div><img alt="mountain"></div>',
2771+
$subclass->get_updated_html(),
2772+
'Should have properly applied the update from in front of the cursor.'
2773+
);
2774+
}
27302775
}

0 commit comments

Comments
 (0)