WordPress · dmsnell · Sep 11, 2024 · Sep 11, 2024 · Sep 11, 2024 · Sep 11, 2024
diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php
@@ -1029,6 +1029,208 @@ public function get_current_depth(): int {
 		return count( $this->breadcrumbs );
 	}
 
+	/**
+	 * Normalizes an HTML fragment by serializing it.
+	 *
+	 * This method assumes that the given HTML snippet is found in BODY context.
+	 * For normalizing full documents or fragments found in other contexts, create
+	 * a new processor using {@see WP_HTML_Processor::create_fragment} or
+	 * {@see WP_HTML_Processor::create_full_parser} and call {@see WP_HTML_Processor::serialize}
+	 * on the created instances.
+	 *
+	 * Many aspects of an input HTML fragment may be changed during normalization.
+	 *
+	 *  - Attribute values will be double-quoted.
+	 *  - Duplicate attributes will be removed.
+	 *  - Omitted tags will be added.
+	 *  - Tag and attribute names will be lower-cased,
+	 *    except for specific SVG and MathML tags or attributes.
+	 *  - Text will be re-encoded, null bytes handled,
+	 *    and invalid UTF-8 replaced with U+FFFD.
+	 *  - Any incomplete syntax trailing at the end will be omitted,
+	 *    for example, an unclosed comment opener will be removed.
+	 *
+	 * Example:
+	 *
+	 *     echo WP_HTML_Processor::normalize( '<a href=#anchor v=5 href="/" enabled>One</a another v=5><!--' );
+	 *     // <a href="#anchor" v="5" enabled>One</a>
+	 *
+	 *     echo WP_HTML_Processor::normalize( '<div></p>fun<table><td>cell</div>' );
+	 *     // <div><p></p>fun<table><tbody><tr><td>cell</td></tr></tbody></table></div>
+	 *
+	 *     echo WP_HTML_Processor::normalize( '<![CDATA[invalid comment]]> syntax < <> "oddities"' );
+	 *     // <!--[CDATA[invalid comment]]--> syntax &lt; &lt;&gt; &quot;oddities&quot;
+	 *
+	 * @since 6.7.0
+	 *
+	 * @param string $html Input HTML to normalize.
+	 * @return string|null Normalized output, or `null` if no processor could be created or serialization failed.
+	 */
+	public static function normalize( string $html ): ?string {
+		$processor = static::create_fragment( $html );
+		return null === $processor ? null : $processor->serialize();
+	}
+
+	/**
+	 * Returns normalized HTML for a fragment by serializing it.
+	 *
+	 * This differs from {@see WP_HTML_Processor::normalize} in that it starts with
+	 * a specific HTML Processor, which _must_ not have already started scanning;
+	 * it must be in the initial ready state and will be in the completed state once
+	 * serialization is complete.
+	 *
+	 * Many aspects of an input HTML fragment may be changed during normalization.
+	 *
+	 *  - Attribute values will be double-quoted.
+	 *  - Duplicate attributes will be removed.
+	 *  - Omitted tags will be added.
+	 *  - Tag and attribute names will be lower-cased,
+	 *    except for specific SVG and MathML tags or attributes.
+	 *  - Text will be re-encoded, null bytes handled,
+	 *    and invalid UTF-8 replaced with U+FFFD.
+	 *  - Any incomplete syntax trailing at the end will be omitted,
+	 *    for example, an unclosed comment opener will be removed.
+	 *
+	 * Example:
+	 *
+	 *     $processor = WP_HTML_Processor::create_fragment( '<a href=#anchor v=5 href="/" enabled>One</a another v=5><!--' );
+	 *     echo $processor->serialize();
+	 *     // <a href="#anchor" v="5" enabled>One</a>
+	 *
+	 *     $processor = WP_HTML_Processor::create_fragment( '<div></p>fun<table><td>cell</div>' );
+	 *     echo $processor->serialize();
+	 *     // <div><p></p>fun<table><tbody><tr><td>cell</td></tr></tbody></table></div>
+	 *
+	 *     $processor = WP_HTML_Processor::create_fragment( '<![CDATA[invalid comment]]> syntax < <> "oddities"' );
+	 *     echo $processor->serialize();
+	 *     // <!--[CDATA[invalid comment]]--> syntax &lt; &lt;&gt; &quot;oddities&quot;
+	 *
+	 * @since 6.7.0
+	 *
+	 * @return string|null Normalized HTML markup represented by the processor, or `null` if
+	 *                     scanning has already started or a parsing error was encountered.
+	 */
+	public function serialize(): ?string {
+		// Serialization walks every token, so it has to start from an unscanned processor.
+		if ( WP_HTML_Tag_Processor::STATE_READY !== $this->parser_state ) {
+			wp_trigger_error(
+				__METHOD__,
+				"An HTML Processor which has already started processing cannot serialize its contents. Serialize immediately after creating the instance.",
+				E_USER_ERROR
+			);
+			return null;
+		}
+
+		$html = '';
+		while ( $this->next_token() ) {
+			$token_type = $this->get_token_type();
+
+			switch ( $token_type ) {
+				case '#text':
+					$html .= htmlspecialchars( $this->get_modifiable_text(), ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5, 'UTF-8' );
+					break;
+
+				// Unlike the `<>` which is interpreted as plaintext, this is ignored entirely.
+				case '#presumptuous-tag':
+					break;
+
+				case '#funky-comment':
+					$html .= "<!--{$this->get_modifiable_text()}-->";
+					break;
+
+				case '#comment':
+					switch ( $this->get_comment_type() ) {
+						case WP_HTML_Tag_Processor::COMMENT_AS_CDATA_LOOKALIKE:
+							$html .= "<!--[CDATA[{$this->get_modifiable_text()}]]-->";
+							break;
+
+						case WP_HTML_Tag_Processor::COMMENT_AS_PI_NODE_LOOKALIKE:
+							$html .= "<!--?{$this->get_tag()}{$this->get_modifiable_text()}?-->";
+							break;
+
+						default:
+							$html .= "<!--{$this->get_modifiable_text()}-->";
+					}
+					break;
+
+				case '#cdata-section':
+					$html .= "<![CDATA[{$this->get_modifiable_text()}]]>";
+					break;
+
+				// NOTE(review): presumably meant for DOCTYPE tokens; confirm `get_token_type()` ever reports 'html'.
+				case 'html':
+					$html .= '<!DOCTYPE html>';
+					break;
+			}
+
+			// Everything below this point applies to tags only.
+			if ( '#tag' !== $token_type ) {
+				continue;
+			}
+
+			$tag_name       = $this->get_tag();
+			$in_html        = 'html' === $this->get_namespace();
+			$qualified_name = $in_html ? strtolower( $tag_name ) : $this->get_qualified_tag_name();
+
+			if ( $this->is_tag_closer() ) {
+				$html .= "</{$qualified_name}>";
+				continue;
+			}
+
+			// Without a list of attribute names the opening tag is emitted bare.
+			$attribute_names = $this->get_attribute_names_with_prefix( '' );
+			if ( ! isset( $attribute_names ) ) {
+				$html .= "<{$qualified_name}>";
+				continue;
+			}
+
+			$html .= "<{$qualified_name}";
+			foreach ( $attribute_names as $attribute_name ) {
+				$html .= " {$this->get_qualified_attribute_name( $attribute_name )}";
+				$value = $this->get_attribute( $attribute_name );
+
+				// Only string values are written; anything else leaves a bare attribute name.
+				if ( is_string( $value ) ) {
+					$html .= '="' . htmlspecialchars( $value, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5, 'UTF-8' ) . '"';
+				}
+			}
+
+			// The self-closing flag is only preserved outside of the HTML namespace.
+			if ( ! $in_html && $this->has_self_closing_flag() ) {
+				$html .= ' /';
+			}
+
+			$html .= '>';
+
+			// Flush out self-contained elements.
+			if ( $in_html && in_array( $tag_name, array( 'IFRAME', 'NOEMBED', 'NOFRAMES', 'SCRIPT', 'STYLE', 'TEXTAREA', 'TITLE', 'XMP' ), true ) ) {
+				$text = $this->get_modifiable_text();
+
+				if ( in_array( $tag_name, array( 'IFRAME', 'NOEMBED', 'NOFRAMES' ), true ) ) {
+					// The contents of these elements are dropped from the output.
+					$text = '';
+				} elseif ( 'SCRIPT' !== $tag_name && 'STYLE' !== $tag_name ) {
+					// TEXTAREA, TITLE, and XMP text is re-encoded; SCRIPT and STYLE text passes through untouched.
+					$text = htmlspecialchars( $text, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5, 'UTF-8' );
+				}
+
+				$html .= "{$text}</{$qualified_name}>";
+			}
+		}
+
+		// Refuse to return any output when the processor reported an error.
+		if ( null !== $this->get_last_error() ) {
+			wp_trigger_error(
+				__METHOD__,
+				"Cannot serialize HTML Processor with parsing error: {$this->get_last_error()}.",
+				E_USER_ERROR
+			);
+			return null;
+		}
+
+		return $html;
+	}
+
 	/**
 	 * Parses next element in the 'initial' insertion mode.
 	 *

diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php
@@ -1984,6 +1984,9 @@ private function parse_next_tag(): bool {
 				 *                     [#x10000-#xEFFFF]
 				 * > NameChar      ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
 				 *
+				 * @todo Processing instruction nodes in SGML may contain any kind of markup. XML defines a
+				 *       special case with `<?xml ... ?>` syntax, but the `?` is part of the bogus comment.
+				 *
 				 * @see https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PITarget
 				 */
 				if ( $this->token_length >= 5 && '?' === $html[ $closer_at - 1 ] ) {