@@ -1664,6 +1664,24 @@ private function parse_next_tag() {
16641664 $ this ->text_starts_at = $ this ->token_starts_at + 2 ;
16651665 $ this ->text_length = $ closer_at - $ this ->text_starts_at ;
16661666 $ this ->bytes_already_parsed = $ closer_at + 1 ;
1667+
1668+ // Identify nodes that would be CDATA if HTML had CDATA sections.
1669+ if (
1670+ $ this ->token_length >= 10 &&
1671+ '[ ' === $ html [ $ this ->token_starts_at + 2 ] &&
1672+ 'C ' === $ html [ $ this ->token_starts_at + 3 ] &&
1673+ 'D ' === $ html [ $ this ->token_starts_at + 4 ] &&
1674+ 'A ' === $ html [ $ this ->token_starts_at + 5 ] &&
1675+ 'T ' === $ html [ $ this ->token_starts_at + 6 ] &&
1676+ 'A ' === $ html [ $ this ->token_starts_at + 7 ] &&
1677+ '[ ' === $ html [ $ this ->token_starts_at + 8 ] &&
1678+ '] ' === $ html [ $ closer_at - 1 ]
1679+ ) {
1680+ $ this ->parser_state = self ::STATE_CDATA_NODE ;
1681+ $ this ->text_starts_at += 7 ;
1682+ $ this ->text_length -= 9 ;
1683+ }
1684+
16671685 return true ;
16681686 }
16691687
@@ -1700,6 +1718,41 @@ private function parse_next_tag() {
17001718 $ this ->text_starts_at = $ this ->token_starts_at + 2 ;
17011719 $ this ->text_length = $ closer_at - $ this ->text_starts_at ;
17021720 $ this ->bytes_already_parsed = $ closer_at + 1 ;
1721+
1722+ /*
1723+ * Identify nodes that would be Processing Instructions if HTML had them.
1724+ *
1725+ * XML allows for more target names, but this code only identifies
1726+ * a subset. This is more or less okay because ultimately these are
1727+ * HTML comments in the DOM and this safely supports _some_ kinds
1728+ * of PI Nodes without getting lost while parsing.
1729+ *
1730+ * This code identifies processing instruction nodes whose target
1731+ * name can be represented in single-byte UTF-8 / 7-bit ASCII.
1732+ *
1733+ * > NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] |
1734+ * [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] |
1735+ * [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] |
1736+ * [#x10000-#xEFFFF]
1737+ * > NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
1738+ *
1739+ * @see https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PITarget
1740+ */
1741+ if ( $ this ->token_length >= 5 && '? ' === $ html [ $ closer_at - 1 ] ) {
1742+ $ comment_text = substr ( $ html , $ this ->token_starts_at + 2 , $ this ->token_length - 4 );
1743+ $ pi_target_length = strspn ( $ comment_text , 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ:_ ' );
1744+
1745+ if ( 0 < $ pi_target_length ) {
1746+ $ pi_target_length += strspn ( $ comment_text ,'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789:_-. ' , $ pi_target_length );
1747+
1748+ $ this ->parser_state = self ::STATE_PI_NODE ;
1749+ $ this ->tag_name_starts_at = $ this ->token_starts_at + 2 ;
1750+ $ this ->tag_name_length = $ pi_target_length ;
1751+ $ this ->text_starts_at += $ pi_target_length ;
1752+ $ this ->text_length -= $ pi_target_length + 1 ;
1753+ }
1754+ }
1755+
17031756 return true ;
17041757 }
17051758
@@ -2507,6 +2560,9 @@ public function get_token_type() {
25072560 case self ::STATE_DOCTYPE :
25082561 return '#doctype ' ;
25092562
2563+ case self ::STATE_PI_NODE :
2564+ return '#processing-instruction ' ;
2565+
25102566 default :
25112567 return $ this ->get_token_name ();
25122568 }
@@ -2540,6 +2596,12 @@ public function get_token_name() {
25402596 case self ::STATE_TEXT_NODE :
25412597 return '#text ' ;
25422598
2599+ case self ::STATE_CDATA_NODE :
2600+ return '#cdata-section ' ;
2601+
2602+ case self ::STATE_PI_NODE :
2603+ return substr ( $ this ->html , $ this ->tag_name_starts_at , $ this ->tag_name_length );
2604+
25432605 case self ::STATE_COMMENT :
25442606 return '#comment ' ;
25452607
@@ -2580,7 +2642,15 @@ public function get_modifiable_text() {
25802642 $ at = $ this ->text_starts_at ;
25812643 $ length = $ this ->text_length ;
25822644 $ text = substr ( $ this ->html , $ at , $ length );
2583- $ text = html_entity_decode ( $ text , ENT_QUOTES | ENT_HTML5 | ENT_SUBSTITUTE );
2645+
2646+ if (
2647+ self ::STATE_CDATA_NODE === $ this ->parser_state ||
2648+ self ::STATE_PI_NODE === $ this ->parser_state
2649+ ) {
2650+ return $ text ;
2651+ }
2652+
2653+ $ text = html_entity_decode ( $ text , ENT_QUOTES | ENT_HTML5 | ENT_SUBSTITUTE );
25842654
25852655 if ( empty ( $ text ) ) {
25862656 return '' ;
@@ -3135,6 +3205,38 @@ private function matches() {
31353205 */
31363206 const STATE_TEXT_NODE = 'STATE_TEXT_NODE ' ;
31373207
3208+ /**
3209+ * Parser CDATA Node State.
3210+ *
3211+ * Indicates that the parser has found a CDATA node and it's possible
3212+ * to read and modify its modifiable text. Note that in HTML there are
3213+ * no CDATA nodes outside foreign elements (SVG and MathML). Outside
3214+ * of foreign elements, they are treated as HTML comments. Nonetheless,
3215+ * the Tag Processor still recognizes them as they appear in the HTML
3216+ * stream and exposes them for inspection and modification.
3217+ *
3218+ * @since 6.5.0
3219+ *
3220+ * @access private
3221+ */
3222+ const STATE_CDATA_NODE = 'STATE_CDATA_NODE ' ;
3223+
3224+ /**
3225+ * Parser Processing Instruction State.
3226+ *
3227+ * Indicates that the parser has found a Processing Instruction and
3228+ * it's possible to read and modify its modifiable text. Note that in
3229+ * HTML there are no Processing Instruction nodes and they are treated
3230+ * as HTML comments. Nonetheless, the Tag Processor still recognizes
3231+ * them as they appear in the HTML stream and exposes them for
3232+ * inspection and modification.
3233+ *
3234+ * @since 6.5.0
3235+ *
3236+ * @access private
3237+ */
3238+ const STATE_PI_NODE = 'STATE_PI_NODE ' ;
3239+
31383240 /**
31393241 * Indicates that the parser has found an HTML comment and it's
31403242 * possible to read and modify its modifiable text.
0 commit comments