Recreate branch from first principles, scanning all tokens in the HTML Processor.

dmsnell · dmsnell · commit 39f6ae28e1b2 · 2024-03-05T10:00:06.000-07:00
diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php
@@ -361,7 +361,11 @@ public function get_last_error() {
 	public function next_tag( $query = null ) {
 		if ( null === $query ) {
 			while ( $this->step() ) {
-				if ( '#tag' === $this->get_token_type() && ! $this->is_tag_closer() ) {
+				if ( '#tag' !== $this->get_token_type() ) {
+					continue;
+				}
+
+				if ( ! $this->is_tag_closer() ) {
 					return true;
 				}
 			}
@@ -384,7 +388,11 @@ public function next_tag( $query = null ) {
 
 		if ( ! ( array_key_exists( 'breadcrumbs', $query ) && is_array( $query['breadcrumbs'] ) ) ) {
 			while ( $this->step() ) {
-				if ( '#tag' === $this->get_token_type() && ! $this->is_tag_closer() ) {
+				if ( '#tag' !== $this->get_token_type() ) {
+					continue;
+				}
+
+				if ( ! $this->is_tag_closer() ) {
 					return true;
 				}
 			}
@@ -405,6 +413,10 @@ public function next_tag( $query = null ) {
 		$match_offset = isset( $query['match_offset'] ) ? (int) $query['match_offset'] : 1;
 
 		while ( $match_offset > 0 && $this->step() ) {
+			if ( '#tag' !== $this->get_token_type() ) {
+				continue;
+			}
+
 			if ( $this->matches_breadcrumbs( $breadcrumbs ) && 0 === --$match_offset ) {
 				return true;
 			}
@@ -457,10 +469,6 @@ public function next_token() {
 	 * @return bool Whether the currently-matched tag is found at the given nested structure.
 	 */
 	public function matches_breadcrumbs( $breadcrumbs ) {
-		if ( ! $this->get_tag() ) {
-			return false;
-		}
-
 		// Everything matches when there are zero constraints.
 		if ( 0 === count( $breadcrumbs ) ) {
 			return true;
@@ -523,37 +531,35 @@ public function step( $node_to_process = self::PROCESS_NEXT_NODE ) {
 			 *        is provided in the opening tag, otherwise it expects a tag closer.
 			 */
 			$top_node = $this->state->stack_of_open_elements->current_node();
-			if ( $top_node && self::is_void( $top_node->node_name ) ) {
+			if (
+				$top_node && (
+					// Void elements.
+					self::is_void( $top_node->node_name ) ||
+					// Comments, text nodes, and other atomic tokens.
+					'#' === $top_node->node_name[0] ||
+					// Doctype declarations.
+					'html' === $top_node->node_name
+				)
+			) {
 				$this->state->stack_of_open_elements->pop();
 			}
 		}
 
 		if ( self::PROCESS_NEXT_NODE === $node_to_process ) {
-			/*
-			 * Currently tag and text nodes must be processed. Text nodes may
-			 * trigger active format reconstruction, otherwise they could be
-			 * skipped until the HTML Processor supports visiting text nodes.
-			 */
-			while ( parent::next_token() ) {
-				$token_type = $this->get_token_type();
-
-				if ( '#tag' === $token_type || '#text' === $token_type ) {
-					break;
-				}
-			}
+			parent::next_token();
 		}
 
 		// Finish stepping when there are no more tokens in the document.
 		if (
-			WP_HTML_Tag_Processor::STATE_COMPLETE === $this->parser_state ||
-			WP_HTML_Tag_Processor::STATE_INCOMPLETE_INPUT === $this->parser_state
+			WP_HTML_Tag_Processor::STATE_INCOMPLETE_INPUT === $this->parser_state ||
+			WP_HTML_Tag_Processor::STATE_COMPLETE === $this->parser_state
 		) {
 			return false;
 		}
 
 		$this->state->current_token = new WP_HTML_Token(
-			$this->bookmark_tag(),
-			$this->get_tag(),
+			$this->bookmark_token(),
+			$this->get_token_name(),
 			$this->has_self_closing_flag(),
 			$this->release_internal_bookmark_on_destruct
 		);
@@ -597,10 +603,6 @@ public function step( $node_to_process = self::PROCESS_NEXT_NODE ) {
 	 * @return string[]|null Array of tag names representing path to matched node, if matched, otherwise NULL.
 	 */
 	public function get_breadcrumbs() {
-		if ( ! $this->get_tag() ) {
-			return null;
-		}
-
 		$breadcrumbs = array();
 		foreach ( $this->state->stack_of_open_elements->walk_down() as $stack_item ) {
 			$breadcrumbs[] = $stack_item->node_name;
@@ -625,54 +627,61 @@ public function get_breadcrumbs() {
 	 * @return bool Whether an element was found.
 	 */
 	private function step_in_body() {
-		$tag_name = $this->get_tag();
-		$op_sigil = $this->is_tag_closer() ? '-' : '+';
-		$op       = "{$op_sigil}{$tag_name}";
+		$token_name = $this->get_token_name();
+		$token_type = $this->get_token_type();
+		$op_sigil   = '#tag' === $token_type ? ( $this->is_tag_closer() ? '-' : '+' ) : '';
+		$op         = "{$op_sigil}{$token_name}";
 
-		if ( null === $tag_name && '#text' === $this->get_token_type() ) {
-			/*
-			 * This rule is necessary even without supporting text nodes in the
-			 * HTML Processor because the parser has to move past text nodes, and
-			 * there could be breadcrumb implications when the text triggers the
-			 * active format reconstruction.
-			 */
-			$this->reconstruct_active_formatting_elements();
+		switch ( $op ) {
+			case '#comment':
+			case '#funky-comment':
+			case '#presumptuous-tag':
+				$this->insert_html_element( $this->state->current_token );
+				return true;
 
-			$current_token = $this->bookmarks[ $this->state->current_token->bookmark_name ];
+			case '#text':
+				$this->reconstruct_active_formatting_elements();
 
-			/*
-			 * > A character token that is U+0000 NULL
-			 *
-			 * Any successive sequence of NULL bytes is ignored and won't
-			 * trigger active format reconstruction. Therefore, if the text
-			 * only comprises NULL bytes then the token should be ignored
-			 * here, but if there are any other characters in the stream
-			 * the active formats should be reconstructed.
-			 */
-			if (
-				1 <= $current_token->length &&
-				"\x00" === $this->html[ $current_token->start ] &&
-				strspn( $this->html, "\x00", $current_token->start, $current_token->length ) === $current_token->length
-			) {
-				// Parse error: ignore the token.
-				return $this->step();
-			}
+				$current_token = $this->bookmarks[ $this->state->current_token->bookmark_name ];
 
-			/*
-			 * Whitespace-only text does not affect the frameset-ok flag.
-			 * It is probably inter-element whitespace, but it may also
-			 * contain character references which decode only to whitespace.
-			 */
-			$text = $this->get_modifiable_text();
-			if ( strlen( $text ) !== strspn( $text, " \t\n\f\r" ) ) {
-				$this->state->frameset_ok = false;
-			}
+				/*
+				 * > A character token that is U+0000 NULL
+				 *
+				 * Any successive sequence of NULL bytes is ignored and won't
+				 * trigger active format reconstruction. Therefore, if the text
+				 * only comprises NULL bytes then the token should be ignored
+				 * here, but if there are any other characters in the stream
+				 * the active formats should be reconstructed.
+				 */
+				if (
+					1 <= $current_token->length &&
+					"\x00" === $this->html[ $current_token->start ] &&
+					strspn( $this->html, "\x00", $current_token->start, $current_token->length ) === $current_token->length
+				) {
+					// Parse error: ignore the token.
+					return $this->step();
+				}
 
-			// @todo Add support for text nodes: insert node and return "true" where when supported.
-			return $this->step();
-		}
+				/*
+				 * Whitespace-only text does not affect the frameset-ok flag.
+				 * It is probably inter-element whitespace, but it may also
+				 * contain character references which decode only to whitespace.
+				 */
+				$text = $this->get_modifiable_text();
+				if ( strlen( $text ) !== strspn( $text, " \t\n\f\r" ) ) {
+					$this->state->frameset_ok = false;
+				}
+
+				$this->insert_html_element( $this->state->current_token );
+				return true;
+
+			case 'html':
+				/*
+				 * > A DOCTYPE token
+				 * > Parse error. Ignore the token.
+				 */
+				return $this->step();
 
-		switch ( $op ) {
 			/*
 			 * > A start tag whose tag name is "button"
 			 */
@@ -760,17 +769,17 @@ private function step_in_body() {
 			case '-SECTION':
 			case '-SUMMARY':
 			case '-UL':
-				if ( ! $this->state->stack_of_open_elements->has_element_in_scope( $tag_name ) ) {
+				if ( ! $this->state->stack_of_open_elements->has_element_in_scope( $token_name ) ) {
 					// @todo Report parse error.
 					// Ignore the token.
 					return $this->step();
 				}
 
 				$this->generate_implied_end_tags();
-				if ( $this->state->stack_of_open_elements->current_node()->node_name !== $tag_name ) {
+				if ( $this->state->stack_of_open_elements->current_node()->node_name !== $token_name ) {
 					// @todo Record parse error: this error doesn't impact parsing.
 				}
-				$this->state->stack_of_open_elements->pop_until( $tag_name );
+				$this->state->stack_of_open_elements->pop_until( $token_name );
 				return true;
 
 			/*
@@ -832,7 +841,7 @@ private function step_in_body() {
 
 				$this->generate_implied_end_tags();
 
-				if ( $this->state->stack_of_open_elements->current_node()->node_name !== $tag_name ) {
+				if ( $this->state->stack_of_open_elements->current_node()->node_name !== $token_name ) {
 					// @todo Record parse error: this error doesn't impact parsing.
 				}
 
@@ -848,7 +857,7 @@ private function step_in_body() {
 			case '+LI':
 				$this->state->frameset_ok = false;
 				$node                     = $this->state->stack_of_open_elements->current_node();
-				$is_li                    = 'LI' === $tag_name;
+				$is_li                    = 'LI' === $token_name;
 
 				in_body_list_loop:
 				/*
@@ -911,7 +920,7 @@ private function step_in_body() {
 					 * then this is a parse error; ignore the token.
 					 */
 					(
-						'LI' === $tag_name &&
+						'LI' === $token_name &&
 						! $this->state->stack_of_open_elements->has_element_in_list_item_scope( 'LI' )
 					) ||
 					/*
@@ -921,8 +930,8 @@ private function step_in_body() {
 					 * parse error; ignore the token.
 					 */
 					(
-						'LI' !== $tag_name &&
-						! $this->state->stack_of_open_elements->has_element_in_scope( $tag_name )
+						'LI' !== $token_name &&
+						! $this->state->stack_of_open_elements->has_element_in_scope( $token_name )
 					)
 				) {
 					/*
@@ -933,13 +942,13 @@ private function step_in_body() {
 					return $this->step();
 				}
 
-				$this->generate_implied_end_tags( $tag_name );
+				$this->generate_implied_end_tags( $token_name );
 
-				if ( $tag_name !== $this->state->stack_of_open_elements->current_node()->node_name ) {
+				if ( $token_name !== $this->state->stack_of_open_elements->current_node()->node_name ) {
 					// @todo Indicate a parse error once it's possible. This error does not impact the logic here.
 				}
 
-				$this->state->stack_of_open_elements->pop_until( $tag_name );
+				$this->state->stack_of_open_elements->pop_until( $token_name );
 				return true;
 
 			/*
@@ -1092,7 +1101,7 @@ private function step_in_body() {
 		 *
 		 * @see https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inbody
 		 */
-		switch ( $tag_name ) {
+		switch ( $token_name ) {
 			case 'APPLET':
 			case 'BASE':
 			case 'BASEFONT':
@@ -1140,7 +1149,7 @@ private function step_in_body() {
 			case 'TR':
 			case 'XMP':
 				$this->last_error = self::ERROR_UNSUPPORTED;
-				throw new WP_HTML_Unsupported_Exception( "Cannot process {$tag_name} element." );
+				throw new WP_HTML_Unsupported_Exception( "Cannot process {$token_name} element." );
 		}
 
 		if ( ! $this->is_tag_closer() ) {
@@ -1162,7 +1171,7 @@ private function step_in_body() {
 			 * close anything beyond its containing `P` or `DIV` element.
 			 */
 			foreach ( $this->state->stack_of_open_elements->walk_up() as $node ) {
-				if ( $tag_name === $node->node_name ) {
+				if ( $token_name === $node->node_name ) {
 					break;
 				}
 
@@ -1172,7 +1181,7 @@ private function step_in_body() {
 				}
 			}
 
-			$this->generate_implied_end_tags( $tag_name );
+			$this->generate_implied_end_tags( $token_name );
 			if ( $node !== $this->state->stack_of_open_elements->current_node() ) {
 				// @todo Record parse error: this error doesn't impact parsing.
 			}
@@ -1191,15 +1200,16 @@ private function step_in_body() {
 	 */
 
 	/**
-	 * Creates a new bookmark for the currently-matched tag and returns the generated name.
+	 * Creates a new bookmark for the currently-matched token and returns the generated name.
 	 *
 	 * @since 6.4.0
+	 * @since 6.5.0 Renamed from bookmark_tag() to bookmark_token().
 	 *
 	 * @throws Exception When unable to allocate requested bookmark.
 	 *
 	 * @return string|false Name of created bookmark, or false if unable to create.
 	 */
-	private function bookmark_tag() {
+	private function bookmark_token() {
 		if ( ! parent::set_bookmark( ++$this->bookmark_counter ) ) {
 			$this->last_error = self::ERROR_EXCEEDED_MAX_BOOKMARKS;
 			throw new Exception( 'could not allocate bookmark' );