Skip to content

Commit 39f6ae2

Browse files
committed
Recreate branch from principle, scanning all tokens.
1 parent e01eaeb commit 39f6ae2

1 file changed

Lines changed: 96 additions & 86 deletions

File tree

src/wp-includes/html-api/class-wp-html-processor.php

Lines changed: 96 additions & 86 deletions
Original file line number | Diff line number | Diff line change
@@ -361,7 +361,11 @@ public function get_last_error() {
361361
public function next_tag( $query = null ) {
362362
if ( null === $query ) {
363363
while ( $this->step() ) {
364-
if ( '#tag' === $this->get_token_type() && ! $this->is_tag_closer() ) {
364+
if ( '#tag' !== $this->get_token_type() ) {
365+
continue;
366+
}
367+
368+
if ( ! $this->is_tag_closer() ) {
365369
return true;
366370
}
367371
}
@@ -384,7 +388,11 @@ public function next_tag( $query = null ) {
384388

385389
if ( ! ( array_key_exists( 'breadcrumbs', $query ) && is_array( $query['breadcrumbs'] ) ) ) {
386390
while ( $this->step() ) {
387-
if ( '#tag' === $this->get_token_type() && ! $this->is_tag_closer() ) {
391+
if ( '#tag' !== $this->get_token_type() ) {
392+
continue;
393+
}
394+
395+
if ( ! $this->is_tag_closer() ) {
388396
return true;
389397
}
390398
}
@@ -405,6 +413,10 @@ public function next_tag( $query = null ) {
405413
$match_offset = isset( $query['match_offset'] ) ? (int) $query['match_offset'] : 1;
406414

407415
while ( $match_offset > 0 && $this->step() ) {
416+
if ( '#tag' !== $this->get_token_type() ) {
417+
continue;
418+
}
419+
408420
if ( $this->matches_breadcrumbs( $breadcrumbs ) && 0 === --$match_offset ) {
409421
return true;
410422
}
@@ -457,10 +469,6 @@ public function next_token() {
457469
* @return bool Whether the currently-matched tag is found at the given nested structure.
458470
*/
459471
public function matches_breadcrumbs( $breadcrumbs ) {
460-
if ( ! $this->get_tag() ) {
461-
return false;
462-
}
463-
464472
// Everything matches when there are zero constraints.
465473
if ( 0 === count( $breadcrumbs ) ) {
466474
return true;
@@ -523,37 +531,35 @@ public function step( $node_to_process = self::PROCESS_NEXT_NODE ) {
523531
* is provided in the opening tag, otherwise it expects a tag closer.
524532
*/
525533
$top_node = $this->state->stack_of_open_elements->current_node();
526-
if ( $top_node && self::is_void( $top_node->node_name ) ) {
534+
if (
535+
$top_node && (
536+
// Void elements.
537+
self::is_void( $top_node->node_name ) ||
538+
// Comments, text nodes, and other atomic tokens.
539+
'#' === $top_node->node_name[0] ||
540+
// Doctype declarations.
541+
'html' === $top_node->node_name
542+
)
543+
) {
527544
$this->state->stack_of_open_elements->pop();
528545
}
529546
}
530547

531548
if ( self::PROCESS_NEXT_NODE === $node_to_process ) {
532-
/*
533-
* Currently tag and text nodes must be processed. Text nodes may
534-
* trigger active format reconstruction, otherwise they could be
535-
* skipped until the HTML Processor supports visiting text nodes.
536-
*/
537-
while ( parent::next_token() ) {
538-
$token_type = $this->get_token_type();
539-
540-
if ( '#tag' === $token_type || '#text' === $token_type ) {
541-
break;
542-
}
543-
}
549+
parent::next_token();
544550
}
545551

546552
// Finish stepping when there are no more tokens in the document.
547553
if (
548-
WP_HTML_Tag_Processor::STATE_COMPLETE === $this->parser_state ||
549-
WP_HTML_Tag_Processor::STATE_INCOMPLETE_INPUT === $this->parser_state
554+
WP_HTML_Tag_Processor::STATE_INCOMPLETE_INPUT === $this->parser_state ||
555+
WP_HTML_Tag_Processor::STATE_COMPLETE === $this->parser_state
550556
) {
551557
return false;
552558
}
553559

554560
$this->state->current_token = new WP_HTML_Token(
555-
$this->bookmark_tag(),
556-
$this->get_tag(),
561+
$this->bookmark_token(),
562+
$this->get_token_name(),
557563
$this->has_self_closing_flag(),
558564
$this->release_internal_bookmark_on_destruct
559565
);
@@ -597,10 +603,6 @@ public function step( $node_to_process = self::PROCESS_NEXT_NODE ) {
597603
* @return string[]|null Array of tag names representing path to matched node, if matched, otherwise NULL.
598604
*/
599605
public function get_breadcrumbs() {
600-
if ( ! $this->get_tag() ) {
601-
return null;
602-
}
603-
604606
$breadcrumbs = array();
605607
foreach ( $this->state->stack_of_open_elements->walk_down() as $stack_item ) {
606608
$breadcrumbs[] = $stack_item->node_name;
@@ -625,54 +627,61 @@ public function get_breadcrumbs() {
625627
* @return bool Whether an element was found.
626628
*/
627629
private function step_in_body() {
628-
$tag_name = $this->get_tag();
629-
$op_sigil = $this->is_tag_closer() ? '-' : '+';
630-
$op = "{$op_sigil}{$tag_name}";
630+
$token_name = $this->get_token_name();
631+
$token_type = $this->get_token_type();
632+
$op_sigil = '#tag' === $token_type ? ( $this->is_tag_closer() ? '-' : '+' ) : '';
633+
$op = "{$op_sigil}{$token_name}";
631634

632-
if ( null === $tag_name && '#text' === $this->get_token_type() ) {
633-
/*
634-
* This rule is necessary even without supporting text nodes in the
635-
* HTML Processor because the parser has to move past text nodes, and
636-
* there could be breadcrumb implications when the text triggers the
637-
* active format reconstruction.
638-
*/
639-
$this->reconstruct_active_formatting_elements();
635+
switch ( $op ) {
636+
case '#comment':
637+
case '#funky-comment':
638+
case '#presumptuous-tag':
639+
$this->insert_html_element( $this->state->current_token );
640+
return true;
640641

641-
$current_token = $this->bookmarks[ $this->state->current_token->bookmark_name ];
642+
case '#text':
643+
$this->reconstruct_active_formatting_elements();
642644

643-
/*
644-
* > A character token that is U+0000 NULL
645-
*
646-
* Any successive sequence of NULL bytes is ignored and won't
647-
* trigger active format reconstruction. Therefore, if the text
648-
* only comprises NULL bytes then the token should be ignored
649-
* here, but if there are any other characters in the stream
650-
* the active formats should be reconstructed.
651-
*/
652-
if (
653-
1 <= $current_token->length &&
654-
"\x00" === $this->html[ $current_token->start ] &&
655-
strspn( $this->html, "\x00", $current_token->start, $current_token->length ) === $current_token->length
656-
) {
657-
// Parse error: ignore the token.
658-
return $this->step();
659-
}
645+
$current_token = $this->bookmarks[ $this->state->current_token->bookmark_name ];
660646

661-
/*
662-
* Whitespace-only text does not affect the frameset-ok flag.
663-
* It is probably inter-element whitespace, but it may also
664-
* contain character references which decode only to whitespace.
665-
*/
666-
$text = $this->get_modifiable_text();
667-
if ( strlen( $text ) !== strspn( $text, " \t\n\f\r" ) ) {
668-
$this->state->frameset_ok = false;
669-
}
647+
/*
648+
* > A character token that is U+0000 NULL
649+
*
650+
* Any successive sequence of NULL bytes is ignored and won't
651+
* trigger active format reconstruction. Therefore, if the text
652+
* only comprises NULL bytes then the token should be ignored
653+
* here, but if there are any other characters in the stream
654+
* the active formats should be reconstructed.
655+
*/
656+
if (
657+
1 <= $current_token->length &&
658+
"\x00" === $this->html[ $current_token->start ] &&
659+
strspn( $this->html, "\x00", $current_token->start, $current_token->length ) === $current_token->length
660+
) {
661+
// Parse error: ignore the token.
662+
return $this->step();
663+
}
670664

671-
// @todo Add support for text nodes: insert node and return "true" where when supported.
672-
return $this->step();
673-
}
665+
/*
666+
* Whitespace-only text does not affect the frameset-ok flag.
667+
* It is probably inter-element whitespace, but it may also
668+
* contain character references which decode only to whitespace.
669+
*/
670+
$text = $this->get_modifiable_text();
671+
if ( strlen( $text ) !== strspn( $text, " \t\n\f\r" ) ) {
672+
$this->state->frameset_ok = false;
673+
}
674+
675+
$this->insert_html_element( $this->state->current_token );
676+
return true;
677+
678+
case 'html':
679+
/*
680+
* > A DOCTYPE token
681+
* > Parse error. Ignore the token.
682+
*/
683+
return $this->step();
674684

675-
switch ( $op ) {
676685
/*
677686
* > A start tag whose tag name is "button"
678687
*/
@@ -760,17 +769,17 @@ private function step_in_body() {
760769
case '-SECTION':
761770
case '-SUMMARY':
762771
case '-UL':
763-
if ( ! $this->state->stack_of_open_elements->has_element_in_scope( $tag_name ) ) {
772+
if ( ! $this->state->stack_of_open_elements->has_element_in_scope( $token_name ) ) {
764773
// @todo Report parse error.
765774
// Ignore the token.
766775
return $this->step();
767776
}
768777

769778
$this->generate_implied_end_tags();
770-
if ( $this->state->stack_of_open_elements->current_node()->node_name !== $tag_name ) {
779+
if ( $this->state->stack_of_open_elements->current_node()->node_name !== $token_name ) {
771780
// @todo Record parse error: this error doesn't impact parsing.
772781
}
773-
$this->state->stack_of_open_elements->pop_until( $tag_name );
782+
$this->state->stack_of_open_elements->pop_until( $token_name );
774783
return true;
775784

776785
/*
@@ -832,7 +841,7 @@ private function step_in_body() {
832841

833842
$this->generate_implied_end_tags();
834843

835-
if ( $this->state->stack_of_open_elements->current_node()->node_name !== $tag_name ) {
844+
if ( $this->state->stack_of_open_elements->current_node()->node_name !== $token_name ) {
836845
// @todo Record parse error: this error doesn't impact parsing.
837846
}
838847

@@ -848,7 +857,7 @@ private function step_in_body() {
848857
case '+LI':
849858
$this->state->frameset_ok = false;
850859
$node = $this->state->stack_of_open_elements->current_node();
851-
$is_li = 'LI' === $tag_name;
860+
$is_li = 'LI' === $token_name;
852861

853862
in_body_list_loop:
854863
/*
@@ -911,7 +920,7 @@ private function step_in_body() {
911920
* then this is a parse error; ignore the token.
912921
*/
913922
(
914-
'LI' === $tag_name &&
923+
'LI' === $token_name &&
915924
! $this->state->stack_of_open_elements->has_element_in_list_item_scope( 'LI' )
916925
) ||
917926
/*
@@ -921,8 +930,8 @@ private function step_in_body() {
921930
* parse error; ignore the token.
922931
*/
923932
(
924-
'LI' !== $tag_name &&
925-
! $this->state->stack_of_open_elements->has_element_in_scope( $tag_name )
933+
'LI' !== $token_name &&
934+
! $this->state->stack_of_open_elements->has_element_in_scope( $token_name )
926935
)
927936
) {
928937
/*
@@ -933,13 +942,13 @@ private function step_in_body() {
933942
return $this->step();
934943
}
935944

936-
$this->generate_implied_end_tags( $tag_name );
945+
$this->generate_implied_end_tags( $token_name );
937946

938-
if ( $tag_name !== $this->state->stack_of_open_elements->current_node()->node_name ) {
947+
if ( $token_name !== $this->state->stack_of_open_elements->current_node()->node_name ) {
939948
// @todo Indicate a parse error once it's possible. This error does not impact the logic here.
940949
}
941950

942-
$this->state->stack_of_open_elements->pop_until( $tag_name );
951+
$this->state->stack_of_open_elements->pop_until( $token_name );
943952
return true;
944953

945954
/*
@@ -1092,7 +1101,7 @@ private function step_in_body() {
10921101
*
10931102
* @see https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inbody
10941103
*/
1095-
switch ( $tag_name ) {
1104+
switch ( $token_name ) {
10961105
case 'APPLET':
10971106
case 'BASE':
10981107
case 'BASEFONT':
@@ -1140,7 +1149,7 @@ private function step_in_body() {
11401149
case 'TR':
11411150
case 'XMP':
11421151
$this->last_error = self::ERROR_UNSUPPORTED;
1143-
throw new WP_HTML_Unsupported_Exception( "Cannot process {$tag_name} element." );
1152+
throw new WP_HTML_Unsupported_Exception( "Cannot process {$token_name} element." );
11441153
}
11451154

11461155
if ( ! $this->is_tag_closer() ) {
@@ -1162,7 +1171,7 @@ private function step_in_body() {
11621171
* close anything beyond its containing `P` or `DIV` element.
11631172
*/
11641173
foreach ( $this->state->stack_of_open_elements->walk_up() as $node ) {
1165-
if ( $tag_name === $node->node_name ) {
1174+
if ( $token_name === $node->node_name ) {
11661175
break;
11671176
}
11681177

@@ -1172,7 +1181,7 @@ private function step_in_body() {
11721181
}
11731182
}
11741183

1175-
$this->generate_implied_end_tags( $tag_name );
1184+
$this->generate_implied_end_tags( $token_name );
11761185
if ( $node !== $this->state->stack_of_open_elements->current_node() ) {
11771186
// @todo Record parse error: this error doesn't impact parsing.
11781187
}
@@ -1191,15 +1200,16 @@ private function step_in_body() {
11911200
*/
11921201

11931202
/**
1194-
* Creates a new bookmark for the currently-matched tag and returns the generated name.
1203+
* Creates a new bookmark for the currently-matched token and returns the generated name.
11951204
*
11961205
* @since 6.4.0
1206+
* @since 6.5.0 Renamed from bookmark_tag() to bookmark_token().
11971207
*
11981208
* @throws Exception When unable to allocate requested bookmark.
11991209
*
12001210
* @return string|false Name of created bookmark, or false if unable to create.
12011211
*/
1202-
private function bookmark_tag() {
1212+
private function bookmark_token() {
12031213
if ( ! parent::set_bookmark( ++$this->bookmark_counter ) ) {
12041214
$this->last_error = self::ERROR_EXCEEDED_MAX_BOOKMARKS;
12051215
throw new Exception( 'could not allocate bookmark' );

0 commit comments

Comments
 (0)