Skip to content

Commit e5f428e

Browse files
committed
HTML API: Trigger active format reconstruction when reaching text nodes.
When encountering text nodes in an HTML document, the HTML parser needs to run the active format reconstruction algorithm, even if it doesn't stop to visit those text nodes. This is because the formats, which might need reconstructing, will impact the breadcrumbs of all downstream nodes from the text node. This patch runs that algorithm whenever the parser reaches a text node, so that the active formats are properly reconstructed. It also enables the visiting of other token types, as is possible in the Tag Processor. Developed in #6054 Discussed in https://core.trac.wordpress.org/ticket/60170 Reviewed by swissspidy. Merges [57806] to the 6.5 branch. Props: dmsnell, jonsurrell, westonruter. Fixes: #60455. Follow-up to: [57348]. git-svn-id: https://develop.svn.wordpress.org/branches/6.5@57823 602fd350-edb4-49c9-b593-d223f7449a82
1 parent e327f6b commit e5f428e

2 files changed

Lines changed: 104 additions & 46 deletions

File tree

src/wp-includes/html-api/class-wp-html-processor.php

Lines changed: 101 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -361,6 +361,10 @@ public function get_last_error() {
361361
public function next_tag( $query = null ) {
362362
if ( null === $query ) {
363363
while ( $this->step() ) {
364+
if ( '#tag' !== $this->get_token_type() ) {
365+
continue;
366+
}
367+
364368
if ( ! $this->is_tag_closer() ) {
365369
return true;
366370
}
@@ -384,6 +388,10 @@ public function next_tag( $query = null ) {
384388

385389
if ( ! ( array_key_exists( 'breadcrumbs', $query ) && is_array( $query['breadcrumbs'] ) ) ) {
386390
while ( $this->step() ) {
391+
if ( '#tag' !== $this->get_token_type() ) {
392+
continue;
393+
}
394+
387395
if ( ! $this->is_tag_closer() ) {
388396
return true;
389397
}
@@ -405,6 +413,10 @@ public function next_tag( $query = null ) {
405413
$match_offset = isset( $query['match_offset'] ) ? (int) $query['match_offset'] : 1;
406414

407415
while ( $match_offset > 0 && $this->step() ) {
416+
if ( '#tag' !== $this->get_token_type() ) {
417+
continue;
418+
}
419+
408420
if ( $this->matches_breadcrumbs( $breadcrumbs ) && 0 === --$match_offset ) {
409421
return true;
410422
}
@@ -428,13 +440,7 @@ public function next_tag( $query = null ) {
428440
* @return bool
429441
*/
430442
public function next_token() {
431-
$found_a_token = parent::next_token();
432-
433-
if ( '#tag' === $this->get_token_type() ) {
434-
$this->step( self::PROCESS_CURRENT_NODE );
435-
}
436-
437-
return $found_a_token;
443+
return $this->step();
438444
}
439445

440446
/**
@@ -463,10 +469,6 @@ public function next_token() {
463469
* @return bool Whether the currently-matched tag is found at the given nested structure.
464470
*/
465471
public function matches_breadcrumbs( $breadcrumbs ) {
466-
if ( ! $this->get_tag() ) {
467-
return false;
468-
}
469-
470472
// Everything matches when there are zero constraints.
471473
if ( 0 === count( $breadcrumbs ) ) {
472474
return true;
@@ -529,25 +531,35 @@ public function step( $node_to_process = self::PROCESS_NEXT_NODE ) {
529531
* is provided in the opening tag, otherwise it expects a tag closer.
530532
*/
531533
$top_node = $this->state->stack_of_open_elements->current_node();
532-
if ( $top_node && self::is_void( $top_node->node_name ) ) {
534+
if (
535+
$top_node && (
536+
// Void elements.
537+
self::is_void( $top_node->node_name ) ||
538+
// Comments, text nodes, and other atomic tokens.
539+
'#' === $top_node->node_name[0] ||
540+
// Doctype declarations.
541+
'html' === $top_node->node_name
542+
)
543+
) {
533544
$this->state->stack_of_open_elements->pop();
534545
}
535546
}
536547

537548
if ( self::PROCESS_NEXT_NODE === $node_to_process ) {
538-
while ( parent::next_token() && '#tag' !== $this->get_token_type() ) {
539-
continue;
540-
}
549+
parent::next_token();
541550
}
542551

543552
// Finish stepping when there are no more tokens in the document.
544-
if ( null === $this->get_tag() ) {
553+
if (
554+
WP_HTML_Tag_Processor::STATE_INCOMPLETE_INPUT === $this->parser_state ||
555+
WP_HTML_Tag_Processor::STATE_COMPLETE === $this->parser_state
556+
) {
545557
return false;
546558
}
547559

548560
$this->state->current_token = new WP_HTML_Token(
549-
$this->bookmark_tag(),
550-
$this->get_tag(),
561+
$this->bookmark_token(),
562+
$this->get_token_name(),
551563
$this->has_self_closing_flag(),
552564
$this->release_internal_bookmark_on_destruct
553565
);
@@ -591,10 +603,6 @@ public function step( $node_to_process = self::PROCESS_NEXT_NODE ) {
591603
* @return string[]|null Array of tag names representing path to matched node, if matched, otherwise NULL.
592604
*/
593605
public function get_breadcrumbs() {
594-
if ( ! $this->get_tag() ) {
595-
return null;
596-
}
597-
598606
$breadcrumbs = array();
599607
foreach ( $this->state->stack_of_open_elements->walk_down() as $stack_item ) {
600608
$breadcrumbs[] = $stack_item->node_name;
@@ -619,11 +627,61 @@ public function get_breadcrumbs() {
619627
* @return bool Whether an element was found.
620628
*/
621629
private function step_in_body() {
622-
$tag_name = $this->get_tag();
623-
$op_sigil = $this->is_tag_closer() ? '-' : '+';
624-
$op = "{$op_sigil}{$tag_name}";
630+
$token_name = $this->get_token_name();
631+
$token_type = $this->get_token_type();
632+
$op_sigil = '#tag' === $token_type ? ( $this->is_tag_closer() ? '-' : '+' ) : '';
633+
$op = "{$op_sigil}{$token_name}";
625634

626635
switch ( $op ) {
636+
case '#comment':
637+
case '#funky-comment':
638+
case '#presumptuous-tag':
639+
$this->insert_html_element( $this->state->current_token );
640+
return true;
641+
642+
case '#text':
643+
$this->reconstruct_active_formatting_elements();
644+
645+
$current_token = $this->bookmarks[ $this->state->current_token->bookmark_name ];
646+
647+
/*
648+
* > A character token that is U+0000 NULL
649+
*
650+
* Any successive sequence of NULL bytes is ignored and won't
651+
* trigger active format reconstruction. Therefore, if the text
652+
* only comprises NULL bytes then the token should be ignored
653+
* here, but if there are any other characters in the stream
654+
* the active formats should be reconstructed.
655+
*/
656+
if (
657+
1 <= $current_token->length &&
658+
"\x00" === $this->html[ $current_token->start ] &&
659+
strspn( $this->html, "\x00", $current_token->start, $current_token->length ) === $current_token->length
660+
) {
661+
// Parse error: ignore the token.
662+
return $this->step();
663+
}
664+
665+
/*
666+
* Whitespace-only text does not affect the frameset-ok flag.
667+
* It is probably inter-element whitespace, but it may also
668+
* contain character references which decode only to whitespace.
669+
*/
670+
$text = $this->get_modifiable_text();
671+
if ( strlen( $text ) !== strspn( $text, " \t\n\f\r" ) ) {
672+
$this->state->frameset_ok = false;
673+
}
674+
675+
$this->insert_html_element( $this->state->current_token );
676+
return true;
677+
678+
case 'html':
679+
/*
680+
* > A DOCTYPE token
681+
* > Parse error. Ignore the token.
682+
*/
683+
return $this->step();
684+
627685
/*
628686
* > A start tag whose tag name is "button"
629687
*/
@@ -711,17 +769,17 @@ private function step_in_body() {
711769
case '-SECTION':
712770
case '-SUMMARY':
713771
case '-UL':
714-
if ( ! $this->state->stack_of_open_elements->has_element_in_scope( $tag_name ) ) {
772+
if ( ! $this->state->stack_of_open_elements->has_element_in_scope( $token_name ) ) {
715773
// @todo Report parse error.
716774
// Ignore the token.
717775
return $this->step();
718776
}
719777

720778
$this->generate_implied_end_tags();
721-
if ( $this->state->stack_of_open_elements->current_node()->node_name !== $tag_name ) {
779+
if ( $this->state->stack_of_open_elements->current_node()->node_name !== $token_name ) {
722780
// @todo Record parse error: this error doesn't impact parsing.
723781
}
724-
$this->state->stack_of_open_elements->pop_until( $tag_name );
782+
$this->state->stack_of_open_elements->pop_until( $token_name );
725783
return true;
726784

727785
/*
@@ -783,7 +841,7 @@ private function step_in_body() {
783841

784842
$this->generate_implied_end_tags();
785843

786-
if ( $this->state->stack_of_open_elements->current_node()->node_name !== $tag_name ) {
844+
if ( $this->state->stack_of_open_elements->current_node()->node_name !== $token_name ) {
787845
// @todo Record parse error: this error doesn't impact parsing.
788846
}
789847

@@ -799,7 +857,7 @@ private function step_in_body() {
799857
case '+LI':
800858
$this->state->frameset_ok = false;
801859
$node = $this->state->stack_of_open_elements->current_node();
802-
$is_li = 'LI' === $tag_name;
860+
$is_li = 'LI' === $token_name;
803861

804862
in_body_list_loop:
805863
/*
@@ -862,7 +920,7 @@ private function step_in_body() {
862920
* then this is a parse error; ignore the token.
863921
*/
864922
(
865-
'LI' === $tag_name &&
923+
'LI' === $token_name &&
866924
! $this->state->stack_of_open_elements->has_element_in_list_item_scope( 'LI' )
867925
) ||
868926
/*
@@ -872,8 +930,8 @@ private function step_in_body() {
872930
* parse error; ignore the token.
873931
*/
874932
(
875-
'LI' !== $tag_name &&
876-
! $this->state->stack_of_open_elements->has_element_in_scope( $tag_name )
933+
'LI' !== $token_name &&
934+
! $this->state->stack_of_open_elements->has_element_in_scope( $token_name )
877935
)
878936
) {
879937
/*
@@ -884,13 +942,13 @@ private function step_in_body() {
884942
return $this->step();
885943
}
886944

887-
$this->generate_implied_end_tags( $tag_name );
945+
$this->generate_implied_end_tags( $token_name );
888946

889-
if ( $tag_name !== $this->state->stack_of_open_elements->current_node()->node_name ) {
947+
if ( $token_name !== $this->state->stack_of_open_elements->current_node()->node_name ) {
890948
// @todo Indicate a parse error once it's possible. This error does not impact the logic here.
891949
}
892950

893-
$this->state->stack_of_open_elements->pop_until( $tag_name );
951+
$this->state->stack_of_open_elements->pop_until( $token_name );
894952
return true;
895953

896954
/*
@@ -1043,7 +1101,7 @@ private function step_in_body() {
10431101
*
10441102
* @see https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inbody
10451103
*/
1046-
switch ( $tag_name ) {
1104+
switch ( $token_name ) {
10471105
case 'APPLET':
10481106
case 'BASE':
10491107
case 'BASEFONT':
@@ -1091,7 +1149,7 @@ private function step_in_body() {
10911149
case 'TR':
10921150
case 'XMP':
10931151
$this->last_error = self::ERROR_UNSUPPORTED;
1094-
throw new WP_HTML_Unsupported_Exception( "Cannot process {$tag_name} element." );
1152+
throw new WP_HTML_Unsupported_Exception( "Cannot process {$token_name} element." );
10951153
}
10961154

10971155
if ( ! $this->is_tag_closer() ) {
@@ -1113,7 +1171,7 @@ private function step_in_body() {
11131171
* close anything beyond its containing `P` or `DIV` element.
11141172
*/
11151173
foreach ( $this->state->stack_of_open_elements->walk_up() as $node ) {
1116-
if ( $tag_name === $node->node_name ) {
1174+
if ( $token_name === $node->node_name ) {
11171175
break;
11181176
}
11191177

@@ -1123,7 +1181,7 @@ private function step_in_body() {
11231181
}
11241182
}
11251183

1126-
$this->generate_implied_end_tags( $tag_name );
1184+
$this->generate_implied_end_tags( $token_name );
11271185
if ( $node !== $this->state->stack_of_open_elements->current_node() ) {
11281186
// @todo Record parse error: this error doesn't impact parsing.
11291187
}
@@ -1142,19 +1200,16 @@ private function step_in_body() {
11421200
*/
11431201

11441202
/**
1145-
* Creates a new bookmark for the currently-matched tag and returns the generated name.
1203+
* Creates a new bookmark for the currently-matched token and returns the generated name.
11461204
*
11471205
* @since 6.4.0
1206+
* @since 6.5.0 Renamed from bookmark_tag() to bookmark_token().
11481207
*
11491208
* @throws Exception When unable to allocate requested bookmark.
11501209
*
11511210
* @return string|false Name of created bookmark, or false if unable to create.
11521211
*/
1153-
private function bookmark_tag() {
1154-
if ( ! $this->get_tag() ) {
1155-
return false;
1156-
}
1157-
1212+
private function bookmark_token() {
11581213
if ( ! parent::set_bookmark( ++$this->bookmark_counter ) ) {
11591214
$this->last_error = self::ERROR_EXCEEDED_MAX_BOOKMARKS;
11601215
throw new Exception( 'could not allocate bookmark' );

tests/phpunit/tests/html-api/wpHtmlProcessorSemanticRules.php

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,9 @@ public function test_in_body_skips_unexpected_button_closer() {
128128
$this->assertSame( 'DIV', $processor->get_tag(), 'Did not stop at initial DIV tag.' );
129129
$this->assertFalse( $processor->is_tag_closer(), 'Did not find that initial DIV tag is an opener.' );
130130

131+
$processor->step();
132+
$this->assertSame( '#text', $processor->get_token_type(), 'Should have found the text node.' );
133+
131134
/*
132135
* When encountering the BUTTON closing tag, there is no BUTTON in the stack of open elements.
133136
* It should be ignored as there's no BUTTON to close.

0 commit comments

Comments
 (0)