@@ -361,7 +361,11 @@ public function get_last_error() {
361361 public function next_tag ( $ query = null ) {
362362 if ( null === $ query ) {
363363 while ( $ this ->step () ) {
364- if ( '#tag ' === $ this ->get_token_type () && ! $ this ->is_tag_closer () ) {
364+ if ( '#tag ' !== $ this ->get_token_type () ) {
365+ continue ;
366+ }
367+
368+ if ( ! $ this ->is_tag_closer () ) {
365369 return true ;
366370 }
367371 }
@@ -384,7 +388,11 @@ public function next_tag( $query = null ) {
384388
385389 if ( ! ( array_key_exists ( 'breadcrumbs ' , $ query ) && is_array ( $ query ['breadcrumbs ' ] ) ) ) {
386390 while ( $ this ->step () ) {
387- if ( '#tag ' === $ this ->get_token_type () && ! $ this ->is_tag_closer () ) {
391+ if ( '#tag ' !== $ this ->get_token_type () ) {
392+ continue ;
393+ }
394+
395+ if ( ! $ this ->is_tag_closer () ) {
388396 return true ;
389397 }
390398 }
@@ -405,6 +413,10 @@ public function next_tag( $query = null ) {
405413 $ match_offset = isset ( $ query ['match_offset ' ] ) ? (int ) $ query ['match_offset ' ] : 1 ;
406414
407415 while ( $ match_offset > 0 && $ this ->step () ) {
416+ if ( '#tag ' !== $ this ->get_token_type () ) {
417+ continue ;
418+ }
419+
408420 if ( $ this ->matches_breadcrumbs ( $ breadcrumbs ) && 0 === --$ match_offset ) {
409421 return true ;
410422 }
@@ -457,10 +469,6 @@ public function next_token() {
457469 * @return bool Whether the currently-matched tag is found at the given nested structure.
458470 */
459471 public function matches_breadcrumbs ( $ breadcrumbs ) {
460- if ( ! $ this ->get_tag () ) {
461- return false ;
462- }
463-
464472 // Everything matches when there are zero constraints.
465473 if ( 0 === count ( $ breadcrumbs ) ) {
466474 return true ;
@@ -523,37 +531,35 @@ public function step( $node_to_process = self::PROCESS_NEXT_NODE ) {
523531 * is provided in the opening tag, otherwise it expects a tag closer.
524532 */
525533 $ top_node = $ this ->state ->stack_of_open_elements ->current_node ();
526- if ( $ top_node && self ::is_void ( $ top_node ->node_name ) ) {
534+ if (
535+ $ top_node && (
536+ // Void elements.
537+ self ::is_void ( $ top_node ->node_name ) ||
538+ // Comments, text nodes, and other atomic tokens.
539+ '# ' === $ top_node ->node_name [0 ] ||
540+ // Doctype declarations.
541+ 'html ' === $ top_node ->node_name
542+ )
543+ ) {
527544 $ this ->state ->stack_of_open_elements ->pop ();
528545 }
529546 }
530547
531548 if ( self ::PROCESS_NEXT_NODE === $ node_to_process ) {
532- /*
533- * Currently tag and text nodes must be processed. Text nodes may
534- * trigger active format reconstruction, otherwise they could be
535- * skipped until the HTML Processor supports visiting text nodes.
536- */
537- while ( parent ::next_token () ) {
538- $ token_type = $ this ->get_token_type ();
539-
540- if ( '#tag ' === $ token_type || '#text ' === $ token_type ) {
541- break ;
542- }
543- }
549+ parent ::next_token ();
544550 }
545551
546552 // Finish stepping when there are no more tokens in the document.
547553 if (
548- WP_HTML_Tag_Processor::STATE_COMPLETE === $ this ->parser_state ||
549- WP_HTML_Tag_Processor::STATE_INCOMPLETE_INPUT === $ this ->parser_state
554+ WP_HTML_Tag_Processor::STATE_INCOMPLETE_INPUT === $ this ->parser_state ||
555+ WP_HTML_Tag_Processor::STATE_COMPLETE === $ this ->parser_state
550556 ) {
551557 return false ;
552558 }
553559
554560 $ this ->state ->current_token = new WP_HTML_Token (
555- $ this ->bookmark_tag (),
556- $ this ->get_tag (),
561+ $ this ->bookmark_token (),
562+ $ this ->get_token_name (),
557563 $ this ->has_self_closing_flag (),
558564 $ this ->release_internal_bookmark_on_destruct
559565 );
@@ -597,10 +603,6 @@ public function step( $node_to_process = self::PROCESS_NEXT_NODE ) {
597603 * @return string[]|null Array of tag names representing path to matched node, if matched, otherwise NULL.
598604 */
599605 public function get_breadcrumbs () {
600- if ( ! $ this ->get_tag () ) {
601- return null ;
602- }
603-
604606 $ breadcrumbs = array ();
605607 foreach ( $ this ->state ->stack_of_open_elements ->walk_down () as $ stack_item ) {
606608 $ breadcrumbs [] = $ stack_item ->node_name ;
@@ -625,54 +627,61 @@ public function get_breadcrumbs() {
625627 * @return bool Whether an element was found.
626628 */
627629 private function step_in_body () {
628- $ tag_name = $ this ->get_tag ();
629- $ op_sigil = $ this ->is_tag_closer () ? '- ' : '+ ' ;
630- $ op = "{$ op_sigil }{$ tag_name }" ;
630+ $ token_name = $ this ->get_token_name ();
631+ $ token_type = $ this ->get_token_type ();
632+ $ op_sigil = '#tag ' === $ token_type ? ( $ this ->is_tag_closer () ? '- ' : '+ ' ) : '' ;
633+ $ op = "{$ op_sigil }{$ token_name }" ;
631634
632- if ( null === $ tag_name && '#text ' === $ this ->get_token_type () ) {
633- /*
634- * This rule is necessary even without supporting text nodes in the
635- * HTML Processor because the parser has to move past text nodes, and
636- * there could be breadcrumb implications when the text triggers the
637- * active format reconstruction.
638- */
639- $ this ->reconstruct_active_formatting_elements ();
635+ switch ( $ op ) {
636+ case '#comment ' :
637+ case '#funky-comment ' :
638+ case '#presumptuous-tag ' :
639+ $ this ->insert_html_element ( $ this ->state ->current_token );
640+ return true ;
640641
641- $ current_token = $ this ->bookmarks [ $ this ->state ->current_token ->bookmark_name ];
642+ case '#text ' :
643+ $ this ->reconstruct_active_formatting_elements ();
642644
643- /*
644- * > A character token that is U+0000 NULL
645- *
646- * Any successive sequence of NULL bytes is ignored and won't
647- * trigger active format reconstruction. Therefore, if the text
648- * only comprises NULL bytes then the token should be ignored
649- * here, but if there are any other characters in the stream
650- * the active formats should be reconstructed.
651- */
652- if (
653- 1 <= $ current_token ->length &&
654- "\x00" === $ this ->html [ $ current_token ->start ] &&
655- strspn ( $ this ->html , "\x00" , $ current_token ->start , $ current_token ->length ) === $ current_token ->length
656- ) {
657- // Parse error: ignore the token.
658- return $ this ->step ();
659- }
645+ $ current_token = $ this ->bookmarks [ $ this ->state ->current_token ->bookmark_name ];
660646
661- /*
662- * Whitespace-only text does not affect the frameset-ok flag.
663- * It is probably inter-element whitespace, but it may also
664- * contain character references which decode only to whitespace.
665- */
666- $ text = $ this ->get_modifiable_text ();
667- if ( strlen ( $ text ) !== strspn ( $ text , " \t\n\f\r" ) ) {
668- $ this ->state ->frameset_ok = false ;
669- }
647+ /*
648+ * > A character token that is U+0000 NULL
649+ *
650+ * Any successive sequence of NULL bytes is ignored and won't
651+ * trigger active format reconstruction. Therefore, if the text
652+ * only comprises NULL bytes then the token should be ignored
653+ * here, but if there are any other characters in the stream
654+ * the active formats should be reconstructed.
655+ */
656+ if (
657+ 1 <= $ current_token ->length &&
658+ "\x00" === $ this ->html [ $ current_token ->start ] &&
659+ strspn ( $ this ->html , "\x00" , $ current_token ->start , $ current_token ->length ) === $ current_token ->length
660+ ) {
661+ // Parse error: ignore the token.
662+ return $ this ->step ();
663+ }
670664
671- // @todo Add support for text nodes: insert node and return "true" where when supported.
672- return $ this ->step ();
673- }
665+ /*
666+ * Whitespace-only text does not affect the frameset-ok flag.
667+ * It is probably inter-element whitespace, but it may also
668+ * contain character references which decode only to whitespace.
669+ */
670+ $ text = $ this ->get_modifiable_text ();
671+ if ( strlen ( $ text ) !== strspn ( $ text , " \t\n\f\r" ) ) {
672+ $ this ->state ->frameset_ok = false ;
673+ }
674+
675+ $ this ->insert_html_element ( $ this ->state ->current_token );
676+ return true ;
677+
678+ case 'html ' :
679+ /*
680+ * > A DOCTYPE token
681+ * > Parse error. Ignore the token.
682+ */
683+ return $ this ->step ();
674684
675- switch ( $ op ) {
676685 /*
677686 * > A start tag whose tag name is "button"
678687 */
@@ -760,17 +769,17 @@ private function step_in_body() {
760769 case '-SECTION ' :
761770 case '-SUMMARY ' :
762771 case '-UL ' :
763- if ( ! $ this ->state ->stack_of_open_elements ->has_element_in_scope ( $ tag_name ) ) {
772+ if ( ! $ this ->state ->stack_of_open_elements ->has_element_in_scope ( $ token_name ) ) {
764773 // @todo Report parse error.
765774 // Ignore the token.
766775 return $ this ->step ();
767776 }
768777
769778 $ this ->generate_implied_end_tags ();
770- if ( $ this ->state ->stack_of_open_elements ->current_node ()->node_name !== $ tag_name ) {
779+ if ( $ this ->state ->stack_of_open_elements ->current_node ()->node_name !== $ token_name ) {
771780 // @todo Record parse error: this error doesn't impact parsing.
772781 }
773- $ this ->state ->stack_of_open_elements ->pop_until ( $ tag_name );
782+ $ this ->state ->stack_of_open_elements ->pop_until ( $ token_name );
774783 return true ;
775784
776785 /*
@@ -832,7 +841,7 @@ private function step_in_body() {
832841
833842 $ this ->generate_implied_end_tags ();
834843
835- if ( $ this ->state ->stack_of_open_elements ->current_node ()->node_name !== $ tag_name ) {
844+ if ( $ this ->state ->stack_of_open_elements ->current_node ()->node_name !== $ token_name ) {
836845 // @todo Record parse error: this error doesn't impact parsing.
837846 }
838847
@@ -848,7 +857,7 @@ private function step_in_body() {
848857 case '+LI ' :
849858 $ this ->state ->frameset_ok = false ;
850859 $ node = $ this ->state ->stack_of_open_elements ->current_node ();
851- $ is_li = 'LI ' === $ tag_name ;
860+ $ is_li = 'LI ' === $ token_name ;
852861
853862 in_body_list_loop:
854863 /*
@@ -911,7 +920,7 @@ private function step_in_body() {
911920 * then this is a parse error; ignore the token.
912921 */
913922 (
914- 'LI ' === $ tag_name &&
923+ 'LI ' === $ token_name &&
915924 ! $ this ->state ->stack_of_open_elements ->has_element_in_list_item_scope ( 'LI ' )
916925 ) ||
917926 /*
@@ -921,8 +930,8 @@ private function step_in_body() {
921930 * parse error; ignore the token.
922931 */
923932 (
924- 'LI ' !== $ tag_name &&
925- ! $ this ->state ->stack_of_open_elements ->has_element_in_scope ( $ tag_name )
933+ 'LI ' !== $ token_name &&
934+ ! $ this ->state ->stack_of_open_elements ->has_element_in_scope ( $ token_name )
926935 )
927936 ) {
928937 /*
@@ -933,13 +942,13 @@ private function step_in_body() {
933942 return $ this ->step ();
934943 }
935944
936- $ this ->generate_implied_end_tags ( $ tag_name );
945+ $ this ->generate_implied_end_tags ( $ token_name );
937946
938- if ( $ tag_name !== $ this ->state ->stack_of_open_elements ->current_node ()->node_name ) {
947+ if ( $ token_name !== $ this ->state ->stack_of_open_elements ->current_node ()->node_name ) {
939948 // @todo Indicate a parse error once it's possible. This error does not impact the logic here.
940949 }
941950
942- $ this ->state ->stack_of_open_elements ->pop_until ( $ tag_name );
951+ $ this ->state ->stack_of_open_elements ->pop_until ( $ token_name );
943952 return true ;
944953
945954 /*
@@ -1092,7 +1101,7 @@ private function step_in_body() {
10921101 *
10931102 * @see https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inbody
10941103 */
1095- switch ( $ tag_name ) {
1104+ switch ( $ token_name ) {
10961105 case 'APPLET ' :
10971106 case 'BASE ' :
10981107 case 'BASEFONT ' :
@@ -1140,7 +1149,7 @@ private function step_in_body() {
11401149 case 'TR ' :
11411150 case 'XMP ' :
11421151 $ this ->last_error = self ::ERROR_UNSUPPORTED ;
1143- throw new WP_HTML_Unsupported_Exception ( "Cannot process {$ tag_name } element. " );
1152+ throw new WP_HTML_Unsupported_Exception ( "Cannot process {$ token_name } element. " );
11441153 }
11451154
11461155 if ( ! $ this ->is_tag_closer () ) {
@@ -1162,7 +1171,7 @@ private function step_in_body() {
11621171 * close anything beyond its containing `P` or `DIV` element.
11631172 */
11641173 foreach ( $ this ->state ->stack_of_open_elements ->walk_up () as $ node ) {
1165- if ( $ tag_name === $ node ->node_name ) {
1174+ if ( $ token_name === $ node ->node_name ) {
11661175 break ;
11671176 }
11681177
@@ -1172,7 +1181,7 @@ private function step_in_body() {
11721181 }
11731182 }
11741183
1175- $ this ->generate_implied_end_tags ( $ tag_name );
1184+ $ this ->generate_implied_end_tags ( $ token_name );
11761185 if ( $ node !== $ this ->state ->stack_of_open_elements ->current_node () ) {
11771186 // @todo Record parse error: this error doesn't impact parsing.
11781187 }
@@ -1191,15 +1200,16 @@ private function step_in_body() {
11911200 */
11921201
11931202 /**
1194- * Creates a new bookmark for the currently-matched tag and returns the generated name.
1203+ * Creates a new bookmark for the currently-matched token and returns the generated name.
11951204 *
11961205 * @since 6.4.0
1206+ * @since 6.5.0 Renamed from bookmark_tag() to bookmark_token().
11971207 *
11981208 * @throws Exception When unable to allocate requested bookmark.
11991209 *
12001210 * @return string|false Name of created bookmark, or false if unable to create.
12011211 */
1202- private function bookmark_tag () {
1212+ private function bookmark_token () {
12031213 if ( ! parent ::set_bookmark ( ++$ this ->bookmark_counter ) ) {
12041214 $ this ->last_error = self ::ERROR_EXCEEDED_MAX_BOOKMARKS ;
12051215 throw new Exception ( 'could not allocate bookmark ' );
0 commit comments