Expand docblocks for CDATA/PINodes and re-add removed tests

dmsnell · dmsnell · commit e4417906ebff · 2024-01-15T11:22:09.000-06:00
diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php
@@ -313,13 +313,15 @@
  *  - Comment nodes and nodes that became comments because of some syntax error. The
  *    text for these nodes is the portion of the comment inside of the syntax. E.g. for
  *    `<!-- comment -->` the text is `" comment "` (note that the spaces are part of it).
- *    For `<![CDATA[some content]]>` the text is `"[CDATA[some content]]"`.
+ *  - `CDATA` sections, whose text is the content inside of the section itself. E.g. for
+ *    `<![CDATA[some content]]>` the text is `"some content"`.
  *  - "Funky comments," which are a special case of invalid closing tags whose name is
  *    invalid. The text for these nodes is the text that a browser would transform into
  *    an HTML when parsing. E.g. for `</%post_author>` the text is `%post_author`.
  *
  * And there are non-elements which are atomic in nature but have no modifiable text.
  *  - `DOCTYPE` nodes like `<DOCTYPE html>` which have no closing tag.
+ *  - XML Processing instruction nodes like `<?xml charset="utf8"?>` (with restrictions).
  *  - The empty end tag `</>` which is ignored in the browser and DOM but exposed
  *    to the HTML API.
  *
@@ -480,6 +482,8 @@ class WP_HTML_Tag_Processor {
 	 * | *Incomplete*    | The HTML ended in the middle of a token; nothing more can be parsed. |
 	 * | *Matched tag*   | Found an HTML tag; it's possible to modify its attributes.           |
 	 * | *Text node*     | Found a #text node; this is plaintext and modifiable.                |
+	 * | *CDATA node*    | Found a CDATA section; this is modifiable.                           |
+	 * | *PI node*       | Found a Processing Instruction; this is modifiable.                  |
 	 * | *Comment*       | Found a comment or bogus comment; this is modifiable.                |
 	 * | *Presumptuous*  | Found an empty tag closer: `</>`.                                    |
 	 * | *Funky comment* | Found a tag closer with an invalid tag name; this is modifiable.     |
@@ -491,6 +495,8 @@ class WP_HTML_Tag_Processor {
 	 * @see WP_HTML_Tag_Processor::STATE_INCOMPLETE
 	 * @see WP_HTML_Tag_Processor::STATE_MATCHED_TAG
 	 * @see WP_HTML_Tag_Processor::STATE_TEXT_NODE
+	 * @see WP_HTML_Tag_Processor::STATE_CDATA_NODE
+	 * @see WP_HTML_Tag_Processor::STATE_PI_NODE
 	 * @see WP_HTML_Tag_Processor::STATE_COMMENT
 	 * @see WP_HTML_Tag_Processor::STATE_DOCTYPE
 	 * @see WP_HTML_Tag_Processor::STATE_PRESUMPTUOUS_TAG
@@ -800,6 +806,7 @@ public function next_tag( $query = null ) {
 	 *  - a text node - the plaintext inside tags.
 	 *  - an HTML comment.
 	 *  - a DOCTYPE declaration.
+	 *  - a processing instruction, e.g. `<?xml version="1.0" ?>`.
 	 *
 	 * The Tag Processor currently only supports the tag token.
 	 *
@@ -1665,7 +1672,21 @@ private function parse_next_tag() {
 				$this->text_length          = $closer_at - $this->text_starts_at;
 				$this->bytes_already_parsed = $closer_at + 1;
 
-				// Identify nodes that would be CDATA if HTML had CDATA sections.
+				/*
+				 * Identify nodes that would be CDATA if HTML had CDATA sections.
+				 *
+				 * This section must occur after identifying the bogus comment end
+				 * because in an HTML parser it will span to the nearest `>`, even
+				 * if there's no `]]>` as would be required in an XML document. It
+				 * is therefore not possible to parse a CDATA section containing
+				 * a `>` in the HTML syntax.
+				 *
+				 * Inside foreign elements there is a discrepancy between browsers
+				 * and the specification on this.
+				 *
+				 * @todo Track whether the Tag Processor is inside a foreign element
+				 *       and require the proper closing `]]>` in those cases.
+				 */
 				if (
 					$this->token_length >= 10 &&
 					'[' === $html[ $this->token_starts_at + 2 ] &&
@@ -1722,13 +1743,18 @@ private function parse_next_tag() {
 				/*
 				 * Identify a Processing Instruction node were HTML to have them.
 				 *
-				 * XML allows for more target names, but this code only identifies
-				 * a subset. This is more or less okay because ultimately these are
-				 * HTML comments in the DOM and this safely supports _some_ kinds
-				 * of PI Nodes without getting lost while parsing.
+				 * This section must occur after identifying the bogus comment end
+				 * because in an HTML parser it will span to the nearest `>`, even
+				 * if there's no `?>` as would be required in an XML document. It
+				 * is therefore not possible to parse a Processing Instruction node
+				 * containing a `>` in the HTML syntax.
 				 *
-				 * This code identifies processing instruction nodes whose target
-				 * name can be represented in single-byte UTF-8 / 7-bit ASCII.
+				 * XML allows for more target names, but this code only identifies
+				 * those with ASCII-representable target names. This means that it
+				 * may identify some Processing Instruction nodes as bogus comments,
+				 * but it will not misinterpret the HTML structure. By limiting the
+				 * identification to these target names the Tag Processor can avoid
+				 * the need to start parsing UTF-8 sequences.
 				 *
 				 * > NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] |
 				 *                     [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] |
@@ -1743,13 +1769,13 @@ private function parse_next_tag() {
 					$pi_target_length = strspn( $comment_text, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ:_' );
 
 					if ( 0 < $pi_target_length ) {
-						 $pi_target_length += strspn( $comment_text,'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789:_-.', $pi_target_length );
+						$pi_target_length += strspn( $comment_text, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789:_-.', $pi_target_length );
 
-						 $this->parser_state       = self::STATE_PI_NODE;
-						 $this->tag_name_starts_at = $this->token_starts_at + 2;
-						 $this->tag_name_length    = $pi_target_length;
-						 $this->text_starts_at    += $pi_target_length;
-						 $this->text_length       -= $pi_target_length + 1;
+						$this->parser_state       = self::STATE_PI_NODE;
+						$this->tag_name_starts_at = $this->token_starts_at + 2;
+						$this->tag_name_length    = $pi_target_length;
+						$this->text_starts_at    += $pi_target_length;
+						$this->text_length       -= $pi_target_length + 1;
 					}
 				}
 
@@ -2544,6 +2570,8 @@ public function is_tag_closer() {
 	 * Possible values:
 	 *  - `#tag` when matched on a tag.
 	 *  - `#text` when matched on a text node.
+	 *  - `#cdata-section` when matched on a CDATA node.
+	 *  - `#processing-instruction` when matched on a processing instruction.
 	 *  - `#comment` when matched on a comment.
 	 *  - `#presumptuous-tag` when matched on an empty tag closer.
 	 *  - `#funky-comment` when matched on a funky comment.
@@ -2577,6 +2605,7 @@ public function get_token_type() {
 	 *
 	 * Dynamic names:
 	 *  - Uppercase tag name for tag matches.
+	 *  - Target name for processing instructions.
 	 *  - `html` for DOCTYPE declarations.
 	 *
 	 * Note that if the Tag Processor is not matched on a token
diff --git a/tests/phpunit/tests/html-api/wpHtmlTagProcessor-token-scanning.php b/tests/phpunit/tests/html-api/wpHtmlTagProcessor-token-scanning.php
@@ -305,6 +305,178 @@ public function data_rawtext_elements() {
 		);
 	}
 
+	/**
+	 * Ensures that a well-formed CDATA section is reported as `#cdata-section` and exposes only its inner content as text.
+	 *
+	 * @ticket 60170
+	 *
+	 * @since 6.5.0
+	 *
+	 * @covers WP_HTML_Tag_Processor::next_token
+	 */
+	public function test_basic_assertion_cdata_section() {
+		$processor = WP_HTML_Processor::create_fragment( '<![CDATA[this is a comment]]>' );
+		$processor->next_token();
+
+		$this->assertSame(
+			'#cdata-section',
+			$processor->get_token_name(),
+			"Should have found CDATA section name but found {$processor->get_token_name()} instead."
+		);
+
+		$this->assertNull(
+			$processor->get_tag(),
+			'Should not have been able to query tag name on non-element token.'
+		);
+
+		$this->assertNull(
+			$processor->get_attribute( 'type' ),
+			'Should not have been able to query attributes on non-element token.'
+		);
+
+		$this->assertSame(
+			'this is a comment',
+			$processor->get_modifiable_text(),
+			'Found incorrect modifiable text.'
+		);
+	}
+
+	/**
+	 * Ensures that a CDATA section containing `>` ends at that `>` as a bogus comment, leaving the remainder as the next token's text.
+	 *
+	 * @ticket 60170
+	 *
+	 * @since 6.5.0
+	 *
+	 * @covers WP_HTML_Tag_Processor::next_token
+	 */
+	public function test_basic_assertion_abruptly_closed_cdata_section() {
+		$processor = WP_HTML_Processor::create_fragment( '<![CDATA[this is > a comment]]>' );
+		$processor->next_token();
+
+		$this->assertSame(
+			'#comment',
+			$processor->get_token_name(),
+			"Should have found a bogus comment but found {$processor->get_token_name()} instead."
+		);
+
+		$this->assertNull(
+			$processor->get_tag(),
+			'Should not have been able to query tag name on non-element token.'
+		);
+
+		$this->assertNull(
+			$processor->get_attribute( 'type' ),
+			'Should not have been able to query attributes on non-element token.'
+		);
+
+		$this->assertSame(
+			'[CDATA[this is ',
+			$processor->get_modifiable_text(),
+			'Found incorrect modifiable text.'
+		);
+
+		$processor->next_token();
+
+		$this->assertSame(
+			' a comment]]>',
+			$processor->get_modifiable_text(),
+			'Should have found remaining syntax from abruptly-closed CDATA section.'
+		);
+	}
+
+	/**
+	 * Ensures that a well-formed Processing Instruction exposes its target as the token name and the remainder as text.
+	 *
+	 * @ticket 60170
+	 *
+	 * @since 6.5.0
+	 *
+	 * @covers WP_HTML_Tag_Processor::next_token
+	 */
+	public function test_basic_assertion_processing_instruction() {
+		$processor = WP_HTML_Processor::create_fragment( '<?wp-bit {"just": "kidding"}?>' );
+		$processor->next_token();
+
+		$this->assertSame(
+			'#processing-instruction',
+			$processor->get_token_type(),
+			"Should have found PI node but found {$processor->get_token_type()} instead."
+		);
+
+		$this->assertSame(
+			'wp-bit',
+			$processor->get_token_name(),
+			"Should have found PI target as name but found {$processor->get_token_name()} instead."
+		);
+
+		$this->assertNull(
+			$processor->get_tag(),
+			'Should not have been able to query tag name on non-element token.'
+		);
+
+		$this->assertNull(
+			$processor->get_attribute( 'type' ),
+			'Should not have been able to query attributes on non-element token.'
+		);
+
+		$this->assertSame(
+			' {"just": "kidding"}',
+			$processor->get_modifiable_text(),
+			'Found incorrect modifiable text.'
+		);
+	}
+
+	/**
+	 * Ensures that a Processing Instruction containing `>` ends at that `>` as a bogus comment, leaving the remainder as the next token's text.
+	 *
+	 * @ticket 60170
+	 *
+	 * @since 6.5.0
+	 *
+	 * @covers WP_HTML_Tag_Processor::next_token
+	 */
+	public function test_basic_assertion_abruptly_closed_processing_instruction() {
+		$processor = WP_HTML_Processor::create_fragment( '<?version=">=5.3.6"?>' );
+		$processor->next_token();
+
+		$this->assertSame(
+			'#comment',
+			$processor->get_token_type(),
+			"Should have found bogus comment but found {$processor->get_token_type()} instead."
+		);
+
+		$this->assertSame(
+			'#comment',
+			$processor->get_token_name(),
+			"Should have found #comment as name but found {$processor->get_token_name()} instead."
+		);
+
+		$this->assertNull(
+			$processor->get_tag(),
+			'Should not have been able to query tag name on non-element token.'
+		);
+
+		$this->assertNull(
+			$processor->get_attribute( 'type' ),
+			'Should not have been able to query attributes on non-element token.'
+		);
+
+		$this->assertSame(
+			'version="',
+			$processor->get_modifiable_text(),
+			'Found incorrect modifiable text.'
+		);
+
+		$processor->next_token();
+
+		$this->assertSame(
+			'=5.3.6"?>',
+			$processor->get_modifiable_text(),
+			'Should have found remaining syntax from abruptly-closed Processing Instruction.'
+		);
+	}
+
 	/**
 	 * Ensures that common comments are properly parsed.
 	 *
@@ -359,10 +531,11 @@ public function test_basic_assertion_common_comments( $html, $text ) {
 	 */
 	public function data_common_comments() {
 		return array(
-			'Shortest comment'    => array( '<!-->', '' ),
-			'Short comment'       => array( '<!--->', '' ),
-			'Invalid PI node'     => array( '<?/missing/>', '/missing/' ),
-			'Invalid ! directive' => array( '<!something else>', 'something else' ),
+			'Shortest comment'       => array( '<!-->', '' ),
+			'Short comment'          => array( '<!--->', '' ),
+			'PI node without target' => array( '<? missing?>', ' missing?' ),
+			'Invalid PI node'        => array( '<?/missing/>', '/missing/' ),
+			'Invalid ! directive'    => array( '<!something else>', 'something else' ),
 		);
 	}