327327 * until the next `>` becomes a bogus HTML comment, meaning there can be no CDATA
328328 * section in an HTML document containing `>`. The Tag Processor will first find
329329 * all valid and bogus HTML comments, and then if the comment _would_ have been a
330- * CDATA section _were they to exist_, it will re-classify the bogus comment as such .
330+ * CDATA section _were they to exist_, it will indicate this as the type of comment.
331331 *
332332 * [2]: XML allows a broader range of characters in a processing instruction's target name
333333 * and disallows "xml" as a name, since it's special. The Tag Processor only recognizes
@@ -494,7 +494,6 @@ class WP_HTML_Tag_Processor {
494494 * | *Matched tag* | Found an HTML tag; it's possible to modify its attributes. |
495495 * | *Text node* | Found a #text node; this is plaintext and modifiable. |
496496 * | *CDATA node* | Found a CDATA section; this is modifiable. |
497- * | *PI node* | Found a Processing Instruction; this is modifiable. |
498497 * | *Comment* | Found a comment or bogus comment; this is modifiable. |
499498 * | *Presumptuous* | Found an empty tag closer: `</>`. |
500499 * | *Funky comment* | Found a tag closer with an invalid tag name; this is modifiable. |
@@ -507,7 +506,6 @@ class WP_HTML_Tag_Processor {
507506 * @see WP_HTML_Tag_Processor::STATE_MATCHED_TAG
508507 * @see WP_HTML_Tag_Processor::STATE_TEXT_NODE
509508 * @see WP_HTML_Tag_Processor::STATE_CDATA_NODE
510- * @see WP_HTML_Tag_Processor::STATE_PI_NODE
511509 * @see WP_HTML_Tag_Processor::STATE_COMMENT
512510 * @see WP_HTML_Tag_Processor::STATE_DOCTYPE
513511 * @see WP_HTML_Tag_Processor::STATE_PRESUMPTUOUS_TAG
@@ -517,6 +515,19 @@ class WP_HTML_Tag_Processor {
517515 */
518516 protected $ parser_state = self ::STATE_READY ;
519517
518+ /**
519+ * What kind of syntax token became an HTML comment.
520+ *
521+ * Since there are many ways in which HTML syntax can create an HTML comment,
522+ * this indicates which of those caused it. This allows the Tag Processor to
523+ * represent more from the original input document than would appear in the DOM.
524+ *
525+ * @since 6.5.0
526+ *
527+ * @var string|null
528+ */
529+ protected $ comment_type = null ;
530+
520531 /**
521532 * How many bytes from the original HTML document have been read and parsed.
522533 *
@@ -1602,10 +1613,12 @@ private function parse_next_tag() {
16021613 * involves inserting an additional `-` into the token after the modifiable text.
16031614 */
16041615 $ this ->parser_state = self ::STATE_COMMENT ;
1616+ $ this ->comment_type = self ::COMMENT_AS_ABRUPTLY_CLOSED_COMMENT ;
16051617 $ this ->token_length = $ closer_at + $ span_of_dashes + 1 - $ this ->token_starts_at ;
16061618
16071619 // Only provide modifiable text if the token is long enough to contain it.
16081620 if ( $ span_of_dashes >= 2 ) {
1621+ $ this ->comment_type = self ::COMMENT_AS_HTML_COMMENT ;
16091622 $ this ->text_starts_at = $ this ->token_starts_at + 4 ;
16101623 $ this ->text_length = $ span_of_dashes - 2 ;
16111624 }
@@ -1631,6 +1644,7 @@ private function parse_next_tag() {
16311644
16321645 if ( $ closer_at + 2 < $ doc_length && '> ' === $ html [ $ closer_at + 2 ] ) {
16331646 $ this ->parser_state = self ::STATE_COMMENT ;
1647+ $ this ->comment_type = self ::COMMENT_AS_HTML_COMMENT ;
16341648 $ this ->token_length = $ closer_at + 3 - $ this ->token_starts_at ;
16351649 $ this ->text_starts_at = $ this ->token_starts_at + 4 ;
16361650 $ this ->text_length = $ closer_at - $ this ->text_starts_at ;
@@ -1644,6 +1658,7 @@ private function parse_next_tag() {
16441658 '> ' === $ html [ $ closer_at + 3 ]
16451659 ) {
16461660 $ this ->parser_state = self ::STATE_COMMENT ;
1661+ $ this ->comment_type = self ::COMMENT_AS_HTML_COMMENT ;
16471662 $ this ->token_length = $ closer_at + 4 - $ this ->token_starts_at ;
16481663 $ this ->text_starts_at = $ this ->token_starts_at + 4 ;
16491664 $ this ->text_length = $ closer_at - $ this ->text_starts_at ;
@@ -1696,6 +1711,7 @@ private function parse_next_tag() {
16961711 }
16971712
16981713 $ this ->parser_state = self ::STATE_COMMENT ;
1714+ $ this ->comment_type = self ::COMMENT_AS_INVALID_HTML ;
16991715 $ this ->token_length = $ closer_at + 1 - $ this ->token_starts_at ;
17001716 $ this ->text_starts_at = $ this ->token_starts_at + 2 ;
17011717 $ this ->text_length = $ closer_at - $ this ->text_starts_at ;
@@ -1727,7 +1743,8 @@ private function parse_next_tag() {
17271743 '[ ' === $ html [ $ this ->token_starts_at + 8 ] &&
17281744 '] ' === $ html [ $ closer_at - 1 ]
17291745 ) {
1730- $ this ->parser_state = self ::STATE_CDATA_NODE ;
1746+ $ this ->parser_state = self ::STATE_COMMENT ;
1747+ $ this ->comment_type = self ::COMMENT_AS_CDATA_LOOKALIKE ;
17311748 $ this ->text_starts_at += 7 ;
17321749 $ this ->text_length -= 9 ;
17331750 }
@@ -1764,6 +1781,7 @@ private function parse_next_tag() {
17641781 }
17651782
17661783 $ this ->parser_state = self ::STATE_COMMENT ;
1784+ $ this ->comment_type = self ::COMMENT_AS_INVALID_HTML ;
17671785 $ this ->token_length = $ closer_at + 1 - $ this ->token_starts_at ;
17681786 $ this ->text_starts_at = $ this ->token_starts_at + 2 ;
17691787 $ this ->text_length = $ closer_at - $ this ->text_starts_at ;
@@ -1800,7 +1818,8 @@ private function parse_next_tag() {
18001818 if ( 0 < $ pi_target_length ) {
18011819 $ pi_target_length += strspn ( $ comment_text , 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789:_-. ' , $ pi_target_length );
18021820
1803- $ this ->parser_state = self ::STATE_PI_NODE ;
1821+ $ this ->parser_state = self ::STATE_COMMENT ;
1822+ $ this ->comment_type = self ::COMMENT_AS_PI_NODE_LOOKALIKE ;
18041823 $ this ->tag_name_starts_at = $ this ->token_starts_at + 2 ;
18051824 $ this ->tag_name_length = $ pi_target_length ;
18061825 $ this ->text_starts_at += $ pi_target_length ;
@@ -2003,6 +2022,7 @@ private function after_tag() {
20032022 $ this ->text_length = 0 ;
20042023 $ this ->is_closing_tag = null ;
20052024 $ this ->attributes = array ();
2025+ $ this ->comment_type = null ;
20062026 $ this ->duplicate_attributes = null ;
20072027 }
20082028
@@ -2621,9 +2641,6 @@ public function get_token_type() {
26212641 case self ::STATE_DOCTYPE :
26222642 return '#doctype ' ;
26232643
2624- case self ::STATE_PI_NODE :
2625- return '#processing-instruction ' ;
2626-
26272644 default :
26282645 return $ this ->get_token_name ();
26292646 }
@@ -2661,9 +2678,6 @@ public function get_token_name() {
26612678 case self ::STATE_CDATA_NODE :
26622679 return '#cdata-section ' ;
26632680
2664- case self ::STATE_PI_NODE :
2665- return substr ( $ this ->html , $ this ->tag_name_starts_at , $ this ->tag_name_length );
2666-
26672681 case self ::STATE_COMMENT :
26682682 return '#comment ' ;
26692683
@@ -2678,6 +2692,27 @@ public function get_token_name() {
26782692 }
26792693 }
26802694
2695+ /**
2696+ * Indicates what kind of comment produced the comment node.
2697+ *
2698+ * Because there are different kinds of HTML syntax which produce
2699+ * comments, the Tag Processor tracks the source in order to expose
2700+ * it to callers.
2701+ *
2702+ * @todo finish this comment!
2703+ *
2704+ * @since 6.5.0
2705+ *
2706+ * @return string|null The tracked comment type when matched on a comment, otherwise null.
2707+ */
2708+ public function get_comment_type () {
2709+ if ( self ::STATE_COMMENT !== $ this ->parser_state ) {
2710+ return null ;
2711+ }
2712+
2713+ return $ this ->comment_type ;
2714+ }
2715+
26812716 /**
26822717 * Returns the modifiable text for a matched token, or an empty string.
26832718 *
@@ -2708,7 +2743,6 @@ public function get_modifiable_text() {
27082743 self ::STATE_CDATA_NODE === $ this ->parser_state ||
27092744 self ::STATE_COMMENT === $ this ->parser_state ||
27102745 self ::STATE_DOCTYPE === $ this ->parser_state ||
2711- self ::STATE_PI_NODE === $ this ->parser_state ||
27122746 self ::STATE_FUNKY_COMMENT === $ this ->parser_state
27132747 ) {
27142748 return $ text ;
@@ -3290,32 +3324,14 @@ private function matches() {
32903324 * Indicates that the parser has found a CDATA node and it's possible
32913325 * to read and modify its modifiable text. Note that in HTML there are
32923326 * no CDATA nodes outside foreign elements (SVG and MathML). Outside
3293- * of foreign elements, they are treated as HTML comments. Nonetheless,
3294- * the Tag Processor still recognizes them as they appear in the HTML
3295- * stream and exposes them for inspection and modification.
3327+ * of foreign elements, they are treated as HTML comments.
32963328 *
32973329 * @since 6.5.0
32983330 *
32993331 * @access private
33003332 */
33013333 const STATE_CDATA_NODE = 'STATE_CDATA_NODE ' ;
33023334
3303- /**
3304- * Parser Processing Instruction State.
3305- *
3306- * Indicates that the parser has found a Processing Instruction and
3307- * it's possible to read and modify its modifiable text. Note that in
3308- * HTML there are no Processing Instruction nodes and they are treated
3309- * as HTML comments. Nonetheless, the Tag Processor still recognizes
3310- * them as they appear in the HTML stream and exposes them for
3311- * inspection and modification.
3312- *
3313- * @since 6.5.0
3314- *
3315- * @access private
3316- */
3317- const STATE_PI_NODE = 'STATE_PI_NODE ' ;
3318-
33193335 /**
33203336 * Indicates that the parser has found an HTML comment and it's
33213337 * possible to read and modify its modifiable text.
@@ -3373,4 +3389,70 @@ private function matches() {
33733389 * @access private
33743390 */
33753391 const STATE_FUNKY_COMMENT = 'STATE_WP_FUNKY ' ;
3392+
3393+ /**
3394+ * Indicates that a comment was created when encountering an abruptly-closed HTML comment.
3395+ *
3396+ * Example:
3397+ *
3398+ * <!-->
3399+ * <!--->
3400+ * <!---->
3401+ *
3402+ * @since 6.5.0
3403+ */
3404+ const COMMENT_AS_ABRUPTLY_CLOSED_COMMENT = 'COMMENT_AS_ABRUPTLY_CLOSED_COMMENT ' ;
3405+
3406+ /**
3407+ * Indicates that a comment would be parsed as a CDATA node,
3408+ * were HTML to allow CDATA nodes outside of foreign elements.
3409+ *
3410+ * Example:
3411+ *
3412+ * <![CDATA[This is a CDATA node.]]>
3413+ *
3414+ * This is an HTML comment, but it looks like a CDATA node.
3415+ *
3416+ * @since 6.5.0
3417+ */
3418+ const COMMENT_AS_CDATA_LOOKALIKE = 'COMMENT_AS_CDATA_LOOKALIKE ' ;
3419+
3420+ /**
3421+ * Indicates that a comment was created when encountering
3422+ * normative HTML comment syntax.
3423+ *
3424+ * Example:
3425+ *
3426+ * <!-- this is a comment -->
3427+ *
3428+ * @since 6.5.0
3429+ */
3430+ const COMMENT_AS_HTML_COMMENT = 'COMMENT_AS_HTML_COMMENT ' ;
3431+
3432+ /**
3433+ * Indicates that a comment would be parsed as a Processing
3434+ * Instruction node, were they to exist within HTML.
3435+ *
3436+ * Example:
3437+ *
3438+ * <?wp __( 'Like' ) ?>
3439+ *
3440+ * This is an HTML comment, but it looks like a Processing Instruction node.
3441+ *
3442+ * @since 6.5.0
3443+ */
3444+ const COMMENT_AS_PI_NODE_LOOKALIKE = 'COMMENT_AS_PI_NODE_LOOKALIKE ' ;
3445+
3446+ /**
3447+ * Indicates that a comment was created when encountering invalid
3448+ * HTML input, a so-called "bogus comment."
3449+ *
3450+ * Example:
3451+ *
3452+ * <?nothing special>
3453+ * <!{nothing special}>
3454+ *
3455+ * @since 6.5.0
3456+ */
3457+ const COMMENT_AS_INVALID_HTML = 'COMMENT_AS_INVALID_HTML ' ;
33763458}
0 commit comments