Skip to content

Commit d62099b

Browse files
committed
WIP: Add comment types
1 parent c4fb5ff commit d62099b

1 file changed

Lines changed: 113 additions & 31 deletions

File tree

src/wp-includes/html-api/class-wp-html-tag-processor.php

Lines changed: 113 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -327,7 +327,7 @@
327327
* until the next `>` becomes a bogus HTML comment, meaning there can be no CDATA
328328
* section in an HTML document containing `>`. The Tag Processor will first find
329329
* all valid and bogus HTML comments, and then if the comment _would_ have been a
330-
* CDATA section _were they to exist_, it will re-classify the bogus comment as such.
330+
* CDATA section _were they to exist_, it will indicate this as the type of comment.
331331
*
332332
* [2]: XML allows a broader range of characters in a processing instruction's target name
333333
* and disallows "xml" as a name, since it's special. The Tag Processor only recognizes
@@ -494,7 +494,6 @@ class WP_HTML_Tag_Processor {
494494
* | *Matched tag* | Found an HTML tag; it's possible to modify its attributes. |
495495
* | *Text node* | Found a #text node; this is plaintext and modifiable. |
496496
* | *CDATA node* | Found a CDATA section; this is modifiable. |
497-
* | *PI node* | Found a Processing Instruction; this is modifiable. |
498497
* | *Comment* | Found a comment or bogus comment; this is modifiable. |
499498
* | *Presumptuous* | Found an empty tag closer: `</>`. |
500499
* | *Funky comment* | Found a tag closer with an invalid tag name; this is modifiable. |
@@ -507,7 +506,6 @@ class WP_HTML_Tag_Processor {
507506
* @see WP_HTML_Tag_Processor::STATE_MATCHED_TAG
508507
* @see WP_HTML_Tag_Processor::STATE_TEXT_NODE
509508
* @see WP_HTML_Tag_Processor::STATE_CDATA_NODE
510-
* @see WP_HTML_Tag_Processor::STATE_PI_NODE
511509
* @see WP_HTML_Tag_Processor::STATE_COMMENT
512510
* @see WP_HTML_Tag_Processor::STATE_DOCTYPE
513511
* @see WP_HTML_Tag_Processor::STATE_PRESUMPTUOUS_TAG
@@ -517,6 +515,19 @@ class WP_HTML_Tag_Processor {
517515
*/
518516
protected $parser_state = self::STATE_READY;
519517

518+
/**
519+
* What kind of syntax token became an HTML comment.
520+
*
521+
* Since there are many ways in which HTML syntax can create an HTML comment,
522+
* this indicates which of those caused it. This allows the Tag Processor to
523+
* represent more from the original input document than would appear in the DOM.
524+
*
525+
* @since 6.5.0
526+
*
527+
* @var string|null
528+
*/
529+
protected $comment_type = null;
530+
520531
/**
521532
* How many bytes from the original HTML document have been read and parsed.
522533
*
@@ -1602,10 +1613,12 @@ private function parse_next_tag() {
16021613
* involves inserting an additional `-` into the token after the modifiable text.
16031614
*/
16041615
$this->parser_state = self::STATE_COMMENT;
1616+
$this->comment_type = self::COMMENT_AS_ABRUPTLY_CLOSED_COMMENT;
16051617
$this->token_length = $closer_at + $span_of_dashes + 1 - $this->token_starts_at;
16061618

16071619
// Only provide modifiable text if the token is long enough to contain it.
16081620
if ( $span_of_dashes >= 2 ) {
1621+
$this->comment_type = self::COMMENT_AS_HTML_COMMENT;
16091622
$this->text_starts_at = $this->token_starts_at + 4;
16101623
$this->text_length = $span_of_dashes - 2;
16111624
}
@@ -1631,6 +1644,7 @@ private function parse_next_tag() {
16311644

16321645
if ( $closer_at + 2 < $doc_length && '>' === $html[ $closer_at + 2 ] ) {
16331646
$this->parser_state = self::STATE_COMMENT;
1647+
$this->comment_type = self::COMMENT_AS_HTML_COMMENT;
16341648
$this->token_length = $closer_at + 3 - $this->token_starts_at;
16351649
$this->text_starts_at = $this->token_starts_at + 4;
16361650
$this->text_length = $closer_at - $this->text_starts_at;
@@ -1644,6 +1658,7 @@ private function parse_next_tag() {
16441658
'>' === $html[ $closer_at + 3 ]
16451659
) {
16461660
$this->parser_state = self::STATE_COMMENT;
1661+
$this->comment_type = self::COMMENT_AS_HTML_COMMENT;
16471662
$this->token_length = $closer_at + 4 - $this->token_starts_at;
16481663
$this->text_starts_at = $this->token_starts_at + 4;
16491664
$this->text_length = $closer_at - $this->text_starts_at;
@@ -1696,6 +1711,7 @@ private function parse_next_tag() {
16961711
}
16971712

16981713
$this->parser_state = self::STATE_COMMENT;
1714+
$this->comment_type = self::COMMENT_AS_INVALID_HTML;
16991715
$this->token_length = $closer_at + 1 - $this->token_starts_at;
17001716
$this->text_starts_at = $this->token_starts_at + 2;
17011717
$this->text_length = $closer_at - $this->text_starts_at;
@@ -1727,7 +1743,8 @@ private function parse_next_tag() {
17271743
'[' === $html[ $this->token_starts_at + 8 ] &&
17281744
']' === $html[ $closer_at - 1 ]
17291745
) {
1730-
$this->parser_state = self::STATE_CDATA_NODE;
1746+
$this->parser_state = self::STATE_COMMENT;
1747+
$this->comment_type = self::COMMENT_AS_CDATA_LOOKALIKE;
17311748
$this->text_starts_at += 7;
17321749
$this->text_length -= 9;
17331750
}
@@ -1764,6 +1781,7 @@ private function parse_next_tag() {
17641781
}
17651782

17661783
$this->parser_state = self::STATE_COMMENT;
1784+
$this->comment_type = self::COMMENT_AS_INVALID_HTML;
17671785
$this->token_length = $closer_at + 1 - $this->token_starts_at;
17681786
$this->text_starts_at = $this->token_starts_at + 2;
17691787
$this->text_length = $closer_at - $this->text_starts_at;
@@ -1800,7 +1818,8 @@ private function parse_next_tag() {
18001818
if ( 0 < $pi_target_length ) {
18011819
$pi_target_length += strspn( $comment_text, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789:_-.', $pi_target_length );
18021820

1803-
$this->parser_state = self::STATE_PI_NODE;
1821+
$this->parser_state = self::STATE_COMMENT;
1822+
$this->comment_type = self::COMMENT_AS_PI_NODE_LOOKALIKE;
18041823
$this->tag_name_starts_at = $this->token_starts_at + 2;
18051824
$this->tag_name_length = $pi_target_length;
18061825
$this->text_starts_at += $pi_target_length;
@@ -2003,6 +2022,7 @@ private function after_tag() {
20032022
$this->text_length = 0;
20042023
$this->is_closing_tag = null;
20052024
$this->attributes = array();
2025+
$this->comment_type = null;
20062026
$this->duplicate_attributes = null;
20072027
}
20082028

@@ -2621,9 +2641,6 @@ public function get_token_type() {
26212641
case self::STATE_DOCTYPE:
26222642
return '#doctype';
26232643

2624-
case self::STATE_PI_NODE:
2625-
return '#processing-instruction';
2626-
26272644
default:
26282645
return $this->get_token_name();
26292646
}
@@ -2661,9 +2678,6 @@ public function get_token_name() {
26612678
case self::STATE_CDATA_NODE:
26622679
return '#cdata-section';
26632680

2664-
case self::STATE_PI_NODE:
2665-
return substr( $this->html, $this->tag_name_starts_at, $this->tag_name_length );
2666-
26672681
case self::STATE_COMMENT:
26682682
return '#comment';
26692683

@@ -2678,6 +2692,27 @@ public function get_token_name() {
26782692
}
26792693
}
26802694

2695+
/**
2696+
* Indicates what kind of comment produced the comment node.
2697+
*
2698+
* Because there are different kinds of HTML syntax which produce
2699+
* comments, the Tag Processor tracks the source in order to expose
2700+
* it to callers.
2701+
*
2702+
* @todo finish this comment!
2703+
*
2704+
* @since 6.5.0
2705+
*
2706+
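* @return string|null Which kind of syntax produced the matched comment (one of the `COMMENT_AS_*` constants), or null if not currently matched on a comment.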
2707+
*/
2708+
public function get_comment_type() {
2709+
if ( self::STATE_COMMENT !== $this->parser_state ) {
2710+
return null;
2711+
}
2712+
2713+
return $this->comment_type;
2714+
}
2715+
26812716
/**
26822717
* Returns the modifiable text for a matched token, or an empty string.
26832718
*
@@ -2708,7 +2743,6 @@ public function get_modifiable_text() {
27082743
self::STATE_CDATA_NODE === $this->parser_state ||
27092744
self::STATE_COMMENT === $this->parser_state ||
27102745
self::STATE_DOCTYPE === $this->parser_state ||
2711-
self::STATE_PI_NODE === $this->parser_state ||
27122746
self::STATE_FUNKY_COMMENT === $this->parser_state
27132747
) {
27142748
return $text;
@@ -3290,32 +3324,14 @@ private function matches() {
32903324
* Indicates that the parser has found a CDATA node and it's possible
32913325
* to read and modify its modifiable text. Note that in HTML there are
32923326
* no CDATA nodes outside foreign elements (SVG and MathML). Outside
3293-
* of foreign elements, they are treated as HTML comments. Nonetheless,
3294-
* the Tag Processor still recognizes them as they appear in the HTML
3295-
* stream and exposes them for inspection and modification.
3327+
* of foreign elements, they are treated as HTML comments.
32963328
*
32973329
* @since 6.5.0
32983330
*
32993331
* @access private
33003332
*/
33013333
const STATE_CDATA_NODE = 'STATE_CDATA_NODE';
33023334

3303-
/**
3304-
* Parser Processing Instruction State.
3305-
*
3306-
* Indicates that the parser has found a Processing Instruction and
3307-
* it's possible to read and modify its modifiable text. Note that in
3308-
* HTML there are no Processing Instruction nodes and they are treated
3309-
* as HTML comments. Nonetheless, the Tag Processor still recognizes
3310-
* them as they appear in the HTML stream and exposes them for
3311-
* inspection and modification.
3312-
*
3313-
* @since 6.5.0
3314-
*
3315-
* @access private
3316-
*/
3317-
const STATE_PI_NODE = 'STATE_PI_NODE';
3318-
33193335
/**
33203336
* Indicates that the parser has found an HTML comment and it's
33213337
* possible to read and modify its modifiable text.
@@ -3373,4 +3389,70 @@ private function matches() {
33733389
* @access private
33743390
*/
33753391
const STATE_FUNKY_COMMENT = 'STATE_WP_FUNKY';
3392+
3393+
/**
3394+
* Indicates that a comment was created when encountering an abruptly-closed HTML comment.
3395+
*
3396+
* Example:
3397+
*
3398+
* <!-->
3399+
* <!--->
3400+
* <!---->
3401+
*
3402+
* @since 6.5.0
3403+
*/
3404+
const COMMENT_AS_ABRUPTLY_CLOSED_COMMENT = 'COMMENT_AS_ABRUPTLY_CLOSED_COMMENT';
3405+
3406+
/**
3407+
* Indicates that a comment would be parsed as a CDATA node,
3408+
* were HTML to allow CDATA nodes outside of foreign elements.
3409+
*
3410+
* Example:
3411+
*
3412+
* <![CDATA[This is a CDATA node.]]>
3413+
*
3414+
* This is an HTML comment, but it looks like a CDATA node.
3415+
*
3416+
* @since 6.5.0
3417+
*/
3418+
const COMMENT_AS_CDATA_LOOKALIKE = 'COMMENT_AS_CDATA_LOOKALIKE';
3419+
3420+
/**
3421+
* Indicates that a comment was created when encountering
3422+
* normative HTML comment syntax.
3423+
*
3424+
* Example:
3425+
*
3426+
* <!-- this is a comment -->
3427+
*
3428+
* @since 6.5.0
3429+
*/
3430+
const COMMENT_AS_HTML_COMMENT = 'COMMENT_AS_HTML_COMMENT';
3431+
3432+
/**
3433+
* Indicates that a comment would be parsed as a Processing
3434+
* Instruction node, were they to exist within HTML.
3435+
*
3436+
* Example:
3437+
*
3438+
* <?wp __( 'Like' ) ?>
3439+
*
3440+
* This is an HTML comment, but it looks like a Processing Instruction node.
3441+
*
3442+
* @since 6.5.0
3443+
*/
3444+
const COMMENT_AS_PI_NODE_LOOKALIKE = 'COMMENT_AS_PI_NODE_LOOKALIKE';
3445+
3446+
/**
3447+
* Indicates that a comment was created when encountering invalid
3448+
* HTML input, a so-called "bogus comment."
3449+
*
3450+
* Example:
3451+
*
3452+
* <?nothing special>
3453+
* <!{nothing special}>
3454+
*
3455+
* @since 6.5.0
3456+
*/
3457+
const COMMENT_AS_INVALID_HTML = 'COMMENT_AS_INVALID_HTML';
33763458
}

0 commit comments

Comments
 (0)