Skip to content

Commit e441790

Browse files
committed
Expand docblocks for CDATA/PINodes and re-add removed tests
1 parent 3d71c07 commit e441790

2 files changed

Lines changed: 220 additions & 18 deletions

File tree

src/wp-includes/html-api/class-wp-html-tag-processor.php

Lines changed: 43 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -313,13 +313,15 @@
313313
* - Comment nodes and nodes that became comments because of some syntax error. The
314314
* text for these nodes is the portion of the comment inside of the syntax. E.g. for
315315
* `<!-- comment -->` the text is `" comment "` (note that the spaces are part of it).
316-
* For `<![CDATA[some content]]>` the text is `"[CDATA[some content]]"`.
316+
* - `CDATA` sections, whose text is the content inside of the section itself. E.g. for
317+
* `<![CDATA[some content]]>` the text is `"some content"`.
317318
* - "Funky comments," which are a special case of invalid closing tags whose name is
318319
* invalid. The text for these nodes is the text that a browser would transform into
319320
* an HTML comment when parsing. E.g. for `</%post_author>` the text is `%post_author`.
320321
*
321322
* And there are non-elements which are atomic in nature but have no modifiable text.
322323
* - `DOCTYPE` nodes like `<!DOCTYPE html>` which have no closing tag.
324+
* - XML Processing Instruction nodes like `<?xml charset="utf8"?>` (with restrictions).
323325
* - The empty end tag `</>` which is ignored in the browser and DOM but exposed
324326
* to the HTML API.
325327
*
@@ -480,6 +482,8 @@ class WP_HTML_Tag_Processor {
480482
* | *Incomplete* | The HTML ended in the middle of a token; nothing more can be parsed. |
481483
* | *Matched tag* | Found an HTML tag; it's possible to modify its attributes. |
482484
* | *Text node* | Found a #text node; this is plaintext and modifiable. |
485+
* | *CDATA node* | Found a CDATA section; this is modifiable. |
486+
* | *PI node* | Found a Processing Instruction; this is modifiable. |
483487
* | *Comment* | Found a comment or bogus comment; this is modifiable. |
484488
* | *Presumptuous* | Found an empty tag closer: `</>`. |
485489
* | *Funky comment* | Found a tag closer with an invalid tag name; this is modifiable. |
@@ -491,6 +495,8 @@ class WP_HTML_Tag_Processor {
491495
* @see WP_HTML_Tag_Processor::STATE_INCOMPLETE
492496
* @see WP_HTML_Tag_Processor::STATE_MATCHED_TAG
493497
* @see WP_HTML_Tag_Processor::STATE_TEXT_NODE
498+
* @see WP_HTML_Tag_Processor::STATE_CDATA_NODE
499+
* @see WP_HTML_Tag_Processor::STATE_PI_NODE
494500
* @see WP_HTML_Tag_Processor::STATE_COMMENT
495501
* @see WP_HTML_Tag_Processor::STATE_DOCTYPE
496502
* @see WP_HTML_Tag_Processor::STATE_PRESUMPTUOUS_TAG
@@ -800,6 +806,7 @@ public function next_tag( $query = null ) {
800806
* - a text node - the plaintext inside tags.
801807
* - an HTML comment.
802808
* - a DOCTYPE declaration.
809+
* - a processing instruction, e.g. `<?xml version="1.0" ?>`.
803810
*
804811
* The Tag Processor currently only supports the tag token.
805812
*
@@ -1665,7 +1672,21 @@ private function parse_next_tag() {
16651672
$this->text_length = $closer_at - $this->text_starts_at;
16661673
$this->bytes_already_parsed = $closer_at + 1;
16671674

1668-
// Identify nodes that would be CDATA if HTML had CDATA sections.
1675+
/*
1676+
* Identify nodes that would be CDATA if HTML had CDATA sections.
1677+
*
1678+
* This section must occur after identifying the bogus comment end
1679+
* because in an HTML parser it will span to the nearest `>`, even
1680+
* if there's no `]]>` as would be required in an XML document. It
1681+
* is therefore not possible to parse a CDATA section containing
1682+
* a `>` in the HTML syntax.
1683+
*
1684+
* Inside foreign elements there is a discrepancy between browsers
1685+
* and the specification on this.
1686+
*
1687+
* @todo Track whether the Tag Processor is inside a foreign element
1688+
* and require the proper closing `]]>` in those cases.
1689+
*/
16691690
if (
16701691
$this->token_length >= 10 &&
16711692
'[' === $html[ $this->token_starts_at + 2 ] &&
@@ -1722,13 +1743,18 @@ private function parse_next_tag() {
17221743
/*
17231744
* Identify Processing Instruction nodes, were HTML to have them.
17241745
*
1725-
* XML allows for more target names, but this code only identifies
1726-
* a subset. This is more or less okay because ultimately these are
1727-
* HTML comments in the DOM and this safely supports _some_ kinds
1728-
* of PI Nodes without getting lost while parsing.
1746+
* This section must occur after identifying the bogus comment end
1747+
* because in an HTML parser it will span to the nearest `>`, even
1748+
* if there's no `?>` as would be required in an XML document. It
1749+
* is therefore not possible to parse a Processing Instruction node
1750+
* containing a `>` in the HTML syntax.
17291751
*
1730-
* This code identifies processing instruction nodes whose target
1731-
* name can be represented in single-byte UTF-8 / 7-bit ASCII.
1752+
* XML allows for more target names, but this code only identifies
1753+
* those with ASCII-representable target names. This means that it
1754+
* may identify some Processing Instruction nodes as bogus comments,
1755+
* but it will not misinterpret the HTML structure. By limiting the
1756+
* identification to these target names the Tag Processor can avoid
1757+
* the need to start parsing UTF-8 sequences.
17321758
*
17331759
* > NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] |
17341760
* [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] |
@@ -1743,13 +1769,13 @@ private function parse_next_tag() {
17431769
$pi_target_length = strspn( $comment_text, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ:_' );
17441770

17451771
if ( 0 < $pi_target_length ) {
1746-
$pi_target_length += strspn( $comment_text,'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789:_-.', $pi_target_length );
1772+
$pi_target_length += strspn( $comment_text, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789:_-.', $pi_target_length );
17471773

1748-
$this->parser_state = self::STATE_PI_NODE;
1749-
$this->tag_name_starts_at = $this->token_starts_at + 2;
1750-
$this->tag_name_length = $pi_target_length;
1751-
$this->text_starts_at += $pi_target_length;
1752-
$this->text_length -= $pi_target_length + 1;
1774+
$this->parser_state = self::STATE_PI_NODE;
1775+
$this->tag_name_starts_at = $this->token_starts_at + 2;
1776+
$this->tag_name_length = $pi_target_length;
1777+
$this->text_starts_at += $pi_target_length;
1778+
$this->text_length -= $pi_target_length + 1;
17531779
}
17541780
}
17551781

@@ -2544,6 +2570,8 @@ public function is_tag_closer() {
25442570
* Possible values:
25452571
* - `#tag` when matched on a tag.
25462572
* - `#text` when matched on a text node.
2573+
* - `#cdata-section` when matched on a CDATA node.
2574+
* - `#processing-instruction` when matched on a processing instruction.
25472575
* - `#comment` when matched on a comment.
25482576
* - `#presumptuous-tag` when matched on an empty tag closer.
25492577
* - `#funky-comment` when matched on a funky comment.
@@ -2577,6 +2605,7 @@ public function get_token_type() {
25772605
*
25782606
* Dynamic names:
25792607
* - Uppercase tag name for tag matches.
2608+
* - Target name for processing instructions.
25802609
* - `html` for DOCTYPE declarations.
25812610
*
25822611
* Note that if the Tag Processor is not matched on a token

tests/phpunit/tests/html-api/wpHtmlTagProcessor-token-scanning.php

Lines changed: 177 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -305,6 +305,178 @@ public function data_rawtext_elements() {
305305
);
306306
}
307307

308+
/**
309+
* Ensures that normative CDATA sections are properly parsed.
310+
*
311+
* @ticket 60170
312+
*
313+
* @since 6.5.0
314+
*
315+
* @covers WP_HTML_Tag_Processor::next_token
316+
*/
317+
public function test_basic_assertion_cdata_section() {
318+
$processor = WP_HTML_Processor::create_fragment( '<![CDATA[this is a comment]]>' );
319+
$processor->next_token();
320+
321+
$this->assertSame(
322+
'#cdata-section',
323+
$processor->get_token_name(),
324+
"Should have found CDATA section name but found {$processor->get_token_name()} instead."
325+
);
326+
327+
$this->assertNull(
328+
$processor->get_tag(),
329+
'Should not have been able to query tag name on non-element token.'
330+
);
331+
332+
$this->assertNull(
333+
$processor->get_attribute( 'type' ),
334+
'Should not have been able to query attributes on non-element token.'
335+
);
336+
337+
$this->assertSame(
338+
'this is a comment',
339+
$processor->get_modifiable_text(),
340+
'Found incorrect modifiable text.'
341+
);
342+
}
343+
344+
/**
345+
* Ensures that abruptly-closed CDATA sections are properly parsed as comments.
346+
*
347+
* @ticket 60170
348+
*
349+
* @since 6.5.0
350+
*
351+
* @covers WP_HTML_Tag_Processor::next_token
352+
*/
353+
public function test_basic_assertion_abruptly_closed_cdata_section() {
354+
$processor = WP_HTML_Processor::create_fragment( '<![CDATA[this is > a comment]]>' );
355+
$processor->next_token();
356+
357+
$this->assertSame(
358+
'#comment',
359+
$processor->get_token_name(),
360+
"Should have found a bogus comment but found {$processor->get_token_name()} instead."
361+
);
362+
363+
$this->assertNull(
364+
$processor->get_tag(),
365+
'Should not have been able to query tag name on non-element token.'
366+
);
367+
368+
$this->assertNull(
369+
$processor->get_attribute( 'type' ),
370+
'Should not have been able to query attributes on non-element token.'
371+
);
372+
373+
$this->assertSame(
374+
'[CDATA[this is ',
375+
$processor->get_modifiable_text(),
376+
'Found incorrect modifiable text.'
377+
);
378+
379+
$processor->next_token();
380+
381+
$this->assertSame(
382+
' a comment]]>',
383+
$processor->get_modifiable_text(),
384+
'Should have found remaining syntax from abruptly-closed CDATA section.'
385+
);
386+
}
387+
388+
/**
389+
* Ensures that normative Processing Instruction nodes are properly parsed.
390+
*
391+
* @ticket 60170
392+
*
393+
* @since 6.5.0
394+
*
395+
* @covers WP_HTML_Tag_Processor::next_token
396+
*/
397+
public function test_basic_assertion_processing_instruction() {
398+
$processor = WP_HTML_Processor::create_fragment( '<?wp-bit {"just": "kidding"}?>' );
399+
$processor->next_token();
400+
401+
$this->assertSame(
402+
'#processing-instruction',
403+
$processor->get_token_type(),
404+
"Should have found PI node but found {$processor->get_token_type()} instead."
405+
);
406+
407+
$this->assertSame(
408+
'wp-bit',
409+
$processor->get_token_name(),
410+
"Should have found PI target as name but found {$processor->get_token_name()} instead."
411+
);
412+
413+
$this->assertNull(
414+
$processor->get_tag(),
415+
'Should not have been able to query tag name on non-element token.'
416+
);
417+
418+
$this->assertNull(
419+
$processor->get_attribute( 'type' ),
420+
'Should not have been able to query attributes on non-element token.'
421+
);
422+
423+
$this->assertSame(
424+
' {"just": "kidding"}',
425+
$processor->get_modifiable_text(),
426+
'Found incorrect modifiable text.'
427+
);
428+
}
429+
430+
/**
431+
* Ensures that abruptly-closed Processing Instruction nodes are properly parsed as comments.
432+
*
433+
* @ticket 60170
434+
*
435+
* @since 6.5.0
436+
*
437+
* @covers WP_HTML_Tag_Processor::next_token
438+
*/
439+
public function test_basic_assertion_abruptly_closed_processing_instruction() {
440+
$processor = WP_HTML_Processor::create_fragment( '<?version=">=5.3.6"?>' );
441+
$processor->next_token();
442+
443+
$this->assertSame(
444+
'#comment',
445+
$processor->get_token_type(),
446+
"Should have found bogus comment but found {$processor->get_token_type()} instead."
447+
);
448+
449+
$this->assertSame(
450+
'#comment',
451+
$processor->get_token_name(),
452+
"Should have found #comment as name but found {$processor->get_token_name()} instead."
453+
);
454+
455+
$this->assertNull(
456+
$processor->get_tag(),
457+
'Should not have been able to query tag name on non-element token.'
458+
);
459+
460+
$this->assertNull(
461+
$processor->get_attribute( 'type' ),
462+
'Should not have been able to query attributes on non-element token.'
463+
);
464+
465+
$this->assertSame(
466+
'version="',
467+
$processor->get_modifiable_text(),
468+
'Found incorrect modifiable text.'
469+
);
470+
471+
$processor->next_token();
472+
473+
$this->assertSame(
474+
'=5.3.6"?>',
475+
$processor->get_modifiable_text(),
476+
'Should have found remaining syntax from abruptly-closed Processing Instruction.'
477+
);
478+
}
479+
308480
/**
309481
* Ensures that common comments are properly parsed.
310482
*
@@ -359,10 +531,11 @@ public function test_basic_assertion_common_comments( $html, $text ) {
359531
*/
360532
public function data_common_comments() {
361533
return array(
362-
'Shortest comment' => array( '<!-->', '' ),
363-
'Short comment' => array( '<!--->', '' ),
364-
'Invalid PI node' => array( '<?/missing/>', '/missing/' ),
365-
'Invalid ! directive' => array( '<!something else>', 'something else' ),
534+
'Shortest comment' => array( '<!-->', '' ),
535+
'Short comment' => array( '<!--->', '' ),
536+
'PI node without target' => array( '<? missing?>', ' missing?' ),
537+
'Invalid PI node' => array( '<?/missing/>', '/missing/' ),
538+
'Invalid ! directive' => array( '<!something else>', 'something else' ),
366539
);
367540
}
368541

0 commit comments

Comments
 (0)