@@ -361,6 +361,10 @@ public function get_last_error() {
361361 public function next_tag ( $ query = null ) {
362362 if ( null === $ query ) {
363363 while ( $ this ->step () ) {
364+ if ( '#tag ' !== $ this ->get_token_type () ) {
365+ continue ;
366+ }
367+
364368 if ( ! $ this ->is_tag_closer () ) {
365369 return true ;
366370 }
@@ -384,6 +388,10 @@ public function next_tag( $query = null ) {
384388
385389 if ( ! ( array_key_exists ( 'breadcrumbs ' , $ query ) && is_array ( $ query ['breadcrumbs ' ] ) ) ) {
386390 while ( $ this ->step () ) {
391+ if ( '#tag ' !== $ this ->get_token_type () ) {
392+ continue ;
393+ }
394+
387395 if ( ! $ this ->is_tag_closer () ) {
388396 return true ;
389397 }
@@ -405,6 +413,10 @@ public function next_tag( $query = null ) {
405413 $ match_offset = isset ( $ query ['match_offset ' ] ) ? (int ) $ query ['match_offset ' ] : 1 ;
406414
407415 while ( $ match_offset > 0 && $ this ->step () ) {
416+ if ( '#tag ' !== $ this ->get_token_type () ) {
417+ continue ;
418+ }
419+
408420 if ( $ this ->matches_breadcrumbs ( $ breadcrumbs ) && 0 === --$ match_offset ) {
409421 return true ;
410422 }
@@ -428,13 +440,7 @@ public function next_tag( $query = null ) {
428440 * @return bool
429441 */
430442 public function next_token () {
431- $ found_a_token = parent ::next_token ();
432-
433- if ( '#tag ' === $ this ->get_token_type () ) {
434- $ this ->step ( self ::PROCESS_CURRENT_NODE );
435- }
436-
437- return $ found_a_token ;
// Replaces the removed lines above: instead of advancing the parent scanner
// here and calling step() only for '#tag' tokens, let step() advance to the
// next token itself and return its boolean result directly.
443+ return $ this ->step ();
438444 }
439445
440446 /**
@@ -463,10 +469,6 @@ public function next_token() {
463469 * @return bool Whether the currently-matched tag is found at the given nested structure.
464470 */
465471 public function matches_breadcrumbs ( $ breadcrumbs ) {
466- if ( ! $ this ->get_tag () ) {
467- return false ;
468- }
469-
470472 // Everything matches when there are zero constraints.
471473 if ( 0 === count ( $ breadcrumbs ) ) {
472474 return true ;
@@ -529,25 +531,35 @@ public function step( $node_to_process = self::PROCESS_NEXT_NODE ) {
529531 * is provided in the opening tag, otherwise it expects a tag closer.
530532 */
531533 $ top_node = $ this ->state ->stack_of_open_elements ->current_node ();
532- if ( $ top_node && self ::is_void ( $ top_node ->node_name ) ) {
534+ if (
535+ $ top_node && (
536+ // Void elements.
537+ self ::is_void ( $ top_node ->node_name ) ||
538+ // Comments, text nodes, and other atomic tokens.
539+ '# ' === $ top_node ->node_name [0 ] ||
540+ // Doctype declarations.
541+ 'html ' === $ top_node ->node_name
542+ )
543+ ) {
533544 $ this ->state ->stack_of_open_elements ->pop ();
534545 }
535546 }
536547
537548 if ( self ::PROCESS_NEXT_NODE === $ node_to_process ) {
538- while ( parent ::next_token () && '#tag ' !== $ this ->get_token_type () ) {
539- continue ;
540- }
549+ parent ::next_token ();
541550 }
542551
543552 // Finish stepping when there are no more tokens in the document.
544- if ( null === $ this ->get_tag () ) {
553+ if (
554+ WP_HTML_Tag_Processor::STATE_INCOMPLETE_INPUT === $ this ->parser_state ||
555+ WP_HTML_Tag_Processor::STATE_COMPLETE === $ this ->parser_state
556+ ) {
545557 return false ;
546558 }
547559
548560 $ this ->state ->current_token = new WP_HTML_Token (
549- $ this ->bookmark_tag (),
550- $ this ->get_tag (),
561+ $ this ->bookmark_token (),
562+ $ this ->get_token_name (),
551563 $ this ->has_self_closing_flag (),
552564 $ this ->release_internal_bookmark_on_destruct
553565 );
@@ -591,10 +603,6 @@ public function step( $node_to_process = self::PROCESS_NEXT_NODE ) {
591603 * @return string[]|null Array of tag names representing path to matched node, if matched, otherwise NULL.
592604 */
593605 public function get_breadcrumbs () {
594- if ( ! $ this ->get_tag () ) {
595- return null ;
596- }
597-
598606 $ breadcrumbs = array ();
599607 foreach ( $ this ->state ->stack_of_open_elements ->walk_down () as $ stack_item ) {
600608 $ breadcrumbs [] = $ stack_item ->node_name ;
@@ -619,11 +627,61 @@ public function get_breadcrumbs() {
619627 * @return bool Whether an element was found.
620628 */
621629 private function step_in_body () {
622- $ tag_name = $ this ->get_tag ();
623- $ op_sigil = $ this ->is_tag_closer () ? '- ' : '+ ' ;
624- $ op = "{$ op_sigil }{$ tag_name }" ;
630+ $ token_name = $ this ->get_token_name ();
631+ $ token_type = $ this ->get_token_type ();
632+ $ op_sigil = '#tag ' === $ token_type ? ( $ this ->is_tag_closer () ? '- ' : '+ ' ) : '' ;
633+ $ op = "{$ op_sigil }{$ token_name }" ;
625634
626635 switch ( $ op ) {
636+ case '#comment ' :
637+ case '#funky-comment ' :
638+ case '#presumptuous-tag ' :
639+ $ this ->insert_html_element ( $ this ->state ->current_token );
640+ return true ;
641+
642+ case '#text ' :
643+ $ this ->reconstruct_active_formatting_elements ();
644+
645+ $ current_token = $ this ->bookmarks [ $ this ->state ->current_token ->bookmark_name ];
646+
647+ /*
648+ * > A character token that is U+0000 NULL
649+ *
650+ * Any successive sequence of NULL bytes is ignored and won't
651+ * trigger active format reconstruction. Therefore, if the text
652+ * only comprises NULL bytes then the token should be ignored
653+ * here, but if there are any other characters in the stream
654+ * the active formats should be reconstructed.
655+ */
656+ if (
657+ 1 <= $ current_token ->length &&
658+ "\x00" === $ this ->html [ $ current_token ->start ] &&
659+ strspn ( $ this ->html , "\x00" , $ current_token ->start , $ current_token ->length ) === $ current_token ->length
660+ ) {
661+ // Parse error: ignore the token.
662+ return $ this ->step ();
663+ }
664+
665+ /*
666+ * Whitespace-only text does not affect the frameset-ok flag.
667+ * It is probably inter-element whitespace, but it may also
668+ * contain character references which decode only to whitespace.
669+ */
670+ $ text = $ this ->get_modifiable_text ();
671+ if ( strlen ( $ text ) !== strspn ( $ text , " \t\n\f\r" ) ) {
672+ $ this ->state ->frameset_ok = false ;
673+ }
674+
675+ $ this ->insert_html_element ( $ this ->state ->current_token );
676+ return true ;
677+
678+ case 'html ' :
679+ /*
680+ * > A DOCTYPE token
681+ * > Parse error. Ignore the token.
682+ */
683+ return $ this ->step ();
684+
627685 /*
628686 * > A start tag whose tag name is "button"
629687 */
@@ -711,17 +769,17 @@ private function step_in_body() {
711769 case '-SECTION ' :
712770 case '-SUMMARY ' :
713771 case '-UL ' :
714- if ( ! $ this ->state ->stack_of_open_elements ->has_element_in_scope ( $ tag_name ) ) {
772+ if ( ! $ this ->state ->stack_of_open_elements ->has_element_in_scope ( $ token_name ) ) {
715773 // @todo Report parse error.
716774 // Ignore the token.
717775 return $ this ->step ();
718776 }
719777
720778 $ this ->generate_implied_end_tags ();
721- if ( $ this ->state ->stack_of_open_elements ->current_node ()->node_name !== $ tag_name ) {
779+ if ( $ this ->state ->stack_of_open_elements ->current_node ()->node_name !== $ token_name ) {
722780 // @todo Record parse error: this error doesn't impact parsing.
723781 }
724- $ this ->state ->stack_of_open_elements ->pop_until ( $ tag_name );
782+ $ this ->state ->stack_of_open_elements ->pop_until ( $ token_name );
725783 return true ;
726784
727785 /*
@@ -783,7 +841,7 @@ private function step_in_body() {
783841
784842 $ this ->generate_implied_end_tags ();
785843
786- if ( $ this ->state ->stack_of_open_elements ->current_node ()->node_name !== $ tag_name ) {
844+ if ( $ this ->state ->stack_of_open_elements ->current_node ()->node_name !== $ token_name ) {
787845 // @todo Record parse error: this error doesn't impact parsing.
788846 }
789847
@@ -799,7 +857,7 @@ private function step_in_body() {
799857 case '+LI ' :
800858 $ this ->state ->frameset_ok = false ;
801859 $ node = $ this ->state ->stack_of_open_elements ->current_node ();
802- $ is_li = 'LI ' === $ tag_name ;
860+ $ is_li = 'LI ' === $ token_name ;
803861
804862 in_body_list_loop:
805863 /*
@@ -862,7 +920,7 @@ private function step_in_body() {
862920 * then this is a parse error; ignore the token.
863921 */
864922 (
865- 'LI ' === $ tag_name &&
923+ 'LI ' === $ token_name &&
866924 ! $ this ->state ->stack_of_open_elements ->has_element_in_list_item_scope ( 'LI ' )
867925 ) ||
868926 /*
@@ -872,8 +930,8 @@ private function step_in_body() {
872930 * parse error; ignore the token.
873931 */
874932 (
875- 'LI ' !== $ tag_name &&
876- ! $ this ->state ->stack_of_open_elements ->has_element_in_scope ( $ tag_name )
933+ 'LI ' !== $ token_name &&
934+ ! $ this ->state ->stack_of_open_elements ->has_element_in_scope ( $ token_name )
877935 )
878936 ) {
879937 /*
@@ -884,13 +942,13 @@ private function step_in_body() {
884942 return $ this ->step ();
885943 }
886944
887- $ this ->generate_implied_end_tags ( $ tag_name );
945+ $ this ->generate_implied_end_tags ( $ token_name );
888946
889- if ( $ tag_name !== $ this ->state ->stack_of_open_elements ->current_node ()->node_name ) {
947+ if ( $ token_name !== $ this ->state ->stack_of_open_elements ->current_node ()->node_name ) {
890948 // @todo Indicate a parse error once it's possible. This error does not impact the logic here.
891949 }
892950
893- $ this ->state ->stack_of_open_elements ->pop_until ( $ tag_name );
951+ $ this ->state ->stack_of_open_elements ->pop_until ( $ token_name );
894952 return true ;
895953
896954 /*
@@ -1043,7 +1101,7 @@ private function step_in_body() {
10431101 *
10441102 * @see https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inbody
10451103 */
1046- switch ( $ tag_name ) {
1104+ switch ( $ token_name ) {
10471105 case 'APPLET ' :
10481106 case 'BASE ' :
10491107 case 'BASEFONT ' :
@@ -1091,7 +1149,7 @@ private function step_in_body() {
10911149 case 'TR ' :
10921150 case 'XMP ' :
10931151 $ this ->last_error = self ::ERROR_UNSUPPORTED ;
1094- throw new WP_HTML_Unsupported_Exception ( "Cannot process {$ tag_name } element. " );
1152+ throw new WP_HTML_Unsupported_Exception ( "Cannot process {$ token_name } element. " );
10951153 }
10961154
10971155 if ( ! $ this ->is_tag_closer () ) {
@@ -1113,7 +1171,7 @@ private function step_in_body() {
11131171 * close anything beyond its containing `P` or `DIV` element.
11141172 */
11151173 foreach ( $ this ->state ->stack_of_open_elements ->walk_up () as $ node ) {
1116- if ( $ tag_name === $ node ->node_name ) {
1174+ if ( $ token_name === $ node ->node_name ) {
11171175 break ;
11181176 }
11191177
@@ -1123,7 +1181,7 @@ private function step_in_body() {
11231181 }
11241182 }
11251183
1126- $ this ->generate_implied_end_tags ( $ tag_name );
1184+ $ this ->generate_implied_end_tags ( $ token_name );
11271185 if ( $ node !== $ this ->state ->stack_of_open_elements ->current_node () ) {
11281186 // @todo Record parse error: this error doesn't impact parsing.
11291187 }
@@ -1142,19 +1200,16 @@ private function step_in_body() {
11421200 */
11431201
11441202 /**
1145- * Creates a new bookmark for the currently-matched tag and returns the generated name.
1203+ * Creates a new bookmark for the currently-matched token and returns the generated name.
11461204 *
11471205 * @since 6.4.0
1206+ * @since 6.5.0 Renamed from bookmark_tag() to bookmark_token().
11481207 *
11491208 * @throws Exception When unable to allocate requested bookmark.
11501209 *
11511210 * @return string|false Name of created bookmark, or false if unable to create.
11521211 */
1153- private function bookmark_tag () {
1154- if ( ! $ this ->get_tag () ) {
1155- return false ;
1156- }
1157-
1212+ private function bookmark_token () {
11581213 if ( ! parent ::set_bookmark ( ++$ this ->bookmark_counter ) ) {
11591214 $ this ->last_error = self ::ERROR_EXCEEDED_MAX_BOOKMARKS ;
11601215 throw new Exception ( 'could not allocate bookmark ' );
0 commit comments