@@ -1029,6 +1029,216 @@ public function get_current_depth(): int {
10291029 return count ( $ this ->breadcrumbs );
10301030 }
10311031
1032+ /**
1033+ * Normalizes an HTML fragment by serializing it.
1034+ *
1035+ * This method assumes that the given HTML snippet is found in BODY context.
1036+ * For normalizing full documents or fragments found in other contexts, create
1037+ * a new processor using {@see WP_HTML_Processor::create_fragment} or
1038+ * {@see WP_HTML_Processor::create_full_parser} and call {@see WP_HTML_Processor::serialize}
1039+ * on the created instances.
1040+ *
1041+ * Many aspects of an input HTML fragment may be changed during normalization.
1042+ *
1043+ * - Attribute values will be double-quoted.
1044+ * - Duplicate attributes will be removed.
1045+ * - Omitted tags will be added.
1046+ * - Tag and attribute name casing will be lower-cased,
1047+ * except for specific SVG and MathML tags or attributes.
1048+ * - Text will be re-encoded, null bytes handled,
1049+ * and invalid UTF-8 replaced with U+FFFD.
1050+ * - Any incomplete syntax trailing at the end will be omitted,
1051+ * for example, an unclosed comment opener will be removed.
1052+ *
1053+ * Example:
1054+ *
1055+ * echo WP_HTML_Processor::normalize( '<a href=#anchor v=5 href="/" enabled>One</a another v=5><!--' );
1056+ * // <a href="#anchor" v="5" enabled>One</a>
1057+ *
1058+ * echo WP_HTML_Processor::normalize( '<div></p>fun<table><td>cell</div>' );
1059+ * // <div><p></p>fun<table><tbody><tr><td>cell</td></tr></tbody></table></div>
1060+ *
1061+ * echo WP_HTML_Processor::normalize( '<![CDATA[invalid comment]]> syntax < <> "oddities"' );
1062+ * // <!--[CDATA[invalid comment]]--> syntax < <> "oddities"
1063+ *
1064+ * @since 6.7.0
1065+ *
1066+ * @param string $html Input HTML to normalize.
1067+ *
1068+ * @return string|null Normalized output, or `null` if unable to normalize.
1069+ */
1070+ public static function normalize ( string $ html ): ?string {
1071+ return static ::create_fragment ( $ html )->serialize ();
1072+ }
1073+
1074+ /**
1075+ * Returns normalized HTML for a fragment by serializing it.
1076+ *
1077+ * This differs from {@see WP_HTML_Processor::normalize} in that it starts with
1078+ * a specific HTML Processor, which _must_ not have already started scanning;
1079+ * it must be in the initial ready state and will be in the completed state once
1080+ * serialization is complete.
1081+ *
1082+ * Many aspects of an input HTML fragment may be changed during normalization.
1083+ *
1084+ * - Attribute values will be double-quoted.
1085+ * - Duplicate attributes will be removed.
1086+ * - Omitted tags will be added.
1087+ * - Tag and attribute name casing will be lower-cased,
1088+ * except for specific SVG and MathML tags or attributes.
1089+ * - Text will be re-encoded, null bytes handled,
1090+ * and invalid UTF-8 replaced with U+FFFD.
1091+ * - Any incomplete syntax trailing at the end will be omitted,
1092+ * for example, an unclosed comment opener will be removed.
1093+ *
1094+ * Example:
1095+ *
1096+ * $processor = WP_HTML_Processor::create_fragment( '<a href=#anchor v=5 href="/" enabled>One</a another v=5><!--' );
1097+ * echo $processor->serialize();
1098+ * // <a href="#anchor" v="5" enabled>One</a>
1099+ *
1100+ * $processor = WP_HTML_Processor::create_fragment( '<div></p>fun<table><td>cell</div>' );
1101+ * echo $processor->serialize();
1102+ * // <div><p></p>fun<table><tbody><tr><td>cell</td></tr></tbody></table></div>
1103+ *
1104+ * $processor = WP_HTML_Processor::create_fragment( '<![CDATA[invalid comment]]> syntax < <> "oddities"' );
1105+ * echo $processor->serialize();
1106+ * // <!--[CDATA[invalid comment]]--> syntax < <> "oddities"
1107+ *
1108+ * @since 6.7.0
1109+ *
1110+ * @return string|null Normalized HTML markup represented by processor,
1111+ * or `null` if unable to generate serialization.
1112+ */
1113+ public function serialize (): ?string {
1114+ if ( WP_HTML_Tag_Processor::STATE_READY !== $ this ->parser_state ) {
1115+ wp_trigger_error (
1116+ __METHOD__ ,
1117+ "An HTML Processor which has already started processing cannot serialize it's contents. Serialize immediately after creating the instance. " ,
1118+ E_USER_WARNING
1119+ );
1120+ return null ;
1121+ }
1122+
1123+ $ html = '' ;
1124+ while ( $ this ->next_token () ) {
1125+ $ html .= $ this ->serialize_token ();
1126+ }
1127+
1128+ if ( null !== $ this ->get_last_error () ) {
1129+ wp_trigger_error (
1130+ __METHOD__ ,
1131+ "Cannot serialize HTML Processor with parsing error: {$ this ->get_last_error ()}. " ,
1132+ E_USER_WARNING
1133+ );
1134+ return null ;
1135+ }
1136+
1137+ return $ html ;
1138+ }
1139+
1140+ /**
1141+ * Serializes the currently-matched token.
1142+ *
1143+ * This method produces a fully-normative HTML string for the currently-matched token,
1144+ * if able. If not matched at any token or if the token doesn't correspond to any HTML
1145+ * it will return an empty string (for example, presumptuous end tags are ignored).
1146+ *
1147+ * @see static::serialize()
1148+ *
1149+ * @since 6.7.0
1150+ *
1151+ * @return string Serialization of token, or empty string if no serialization exists.
1152+ */
1153+ protected function serialize_token (): string {
1154+ $ html = '' ;
1155+ $ token_type = $ this ->get_token_type ();
1156+
1157+ switch ( $ token_type ) {
1158+ case '#text ' :
1159+ $ html .= htmlspecialchars ( $ this ->get_modifiable_text (), ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5 , 'UTF-8 ' );
1160+ break ;
1161+
1162+ // Unlike the `<>` which is interpreted as plaintext, this is ignored entirely.
1163+ case '#presumptuous-tag ' :
1164+ break ;
1165+
1166+ case '#funky-comment ' :
1167+ case '#comment ' :
1168+ $ html .= "<!-- {$ this ->get_full_comment_text ()}--> " ;
1169+ break ;
1170+
1171+ case '#cdata-section ' :
1172+ $ html .= "<![CDATA[ {$ this ->get_modifiable_text ()}]]> " ;
1173+ break ;
1174+
1175+ case 'html ' :
1176+ $ html .= '<!DOCTYPE html> ' ;
1177+ break ;
1178+ }
1179+
1180+ if ( '#tag ' !== $ token_type ) {
1181+ return $ html ;
1182+ }
1183+
1184+ $ tag_name = str_replace ( "\x00" , "\u{FFFD}" , $ this ->get_tag () );
1185+ $ in_html = 'html ' === $ this ->get_namespace ();
1186+ $ qualified_name = $ in_html ? strtolower ( $ tag_name ) : $ this ->get_qualified_tag_name ();
1187+
1188+ if ( $ this ->is_tag_closer () ) {
1189+ $ html .= "</ {$ qualified_name }> " ;
1190+ return $ html ;
1191+ }
1192+
1193+ $ attribute_names = $ this ->get_attribute_names_with_prefix ( '' );
1194+ if ( ! isset ( $ attribute_names ) ) {
1195+ $ html .= "< {$ qualified_name }> " ;
1196+ return $ html ;
1197+ }
1198+
1199+ $ html .= "< {$ qualified_name }" ;
1200+ foreach ( $ attribute_names as $ attribute_name ) {
1201+ $ html .= " {$ this ->get_qualified_attribute_name ( $ attribute_name )}" ;
1202+ $ value = $ this ->get_attribute ( $ attribute_name );
1203+
1204+ if ( is_string ( $ value ) ) {
1205+ $ html .= '=" ' . htmlspecialchars ( $ value , ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5 ) . '" ' ;
1206+ }
1207+
1208+ $ html = str_replace ( "\x00" , "\u{FFFD}" , $ html );
1209+ }
1210+
1211+ if ( ! $ in_html && $ this ->has_self_closing_flag () ) {
1212+ $ html .= ' / ' ;
1213+ }
1214+
1215+ $ html .= '> ' ;
1216+
1217+ // Flush out self-contained elements.
1218+ if ( $ in_html && in_array ( $ tag_name , array ( 'IFRAME ' , 'NOEMBED ' , 'NOFRAMES ' , 'SCRIPT ' , 'STYLE ' , 'TEXTAREA ' , 'TITLE ' , 'XMP ' ), true ) ) {
1219+ $ text = $ this ->get_modifiable_text ();
1220+
1221+ switch ( $ tag_name ) {
1222+ case 'IFRAME ' :
1223+ case 'NOEMBED ' :
1224+ case 'NOFRAMES ' :
1225+ $ text = '' ;
1226+ break ;
1227+
1228+ case 'SCRIPT ' :
1229+ case 'STYLE ' :
1230+ break ;
1231+
1232+ default :
1233+ $ text = htmlspecialchars ( $ text , ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5 , 'UTF-8 ' );
1234+ }
1235+
1236+ $ html .= "{$ text }</ {$ qualified_name }> " ;
1237+ }
1238+
1239+ return $ html ;
1240+ }
1241+
10321242 /**
10331243 * Parses next element in the 'initial' insertion mode.
10341244 *
0 commit comments