Skip to content

Commit 440fee0

Browse files
dmsnell and aslamdoctor
authored and committed
HTML API: Add get_full_comment_text() method.
Previously, there were a few cases where the modifiable text read from an HTML comment differed slightly from the parsed value of its inner text in a browser. This is due to the specific way that invalid HTML syntax tokens become "bogus comments." This patch introduces a new method to the Tag Processor to allow differentiating these specific cases, such as when copying or serializing HTML from one source to another. Similar code has already been in use in the html5lib tests, and this patch simplifies the test runner, evidencing the fact that this method was already needed. Developed in WordPress#7342 Discussed in https://core.trac.wordpress.org/ticket/62036 Props dmsnell, jonsurrell. See #62036. git-svn-id: https://develop.svn.wordpress.org/trunk@59075 602fd350-edb4-49c9-b593-d223f7449a82
1 parent 179daa9 commit 440fee0

2 files changed

Lines changed: 64 additions & 33 deletions

File tree

src/wp-includes/html-api/class-wp-html-tag-processor.php

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3385,6 +3385,58 @@ public function get_comment_type(): ?string {
33853385
return $this->comment_type;
33863386
}
33873387

3388+
/**
3389+
* Returns the text of a matched comment or null if not on a comment type node.
3390+
*
3391+
* This method returns the entire text content of a comment node as it
3392+
* would appear in the browser.
3393+
*
3394+
* This differs from {@see ::get_modifiable_text()} in that certain comment
3395+
* types in the HTML API cannot allow their entire comment text content to
3396+
* be modified. Namely, "bogus comments" of the form `<?not allowed in html>`
3397+
* will create a comment whose text content starts with `?`. Note that if
3398+
* that character were modified, it would be possible to change the node
3399+
* type.
3400+
*
3401+
* @since 6.7.0
3402+
*
3403+
* @return string|null The comment text as it would appear in the browser or null
3404+
* if not on a comment type node.
3405+
*/
3406+
public function get_full_comment_text(): ?string {
3407+
if ( self::STATE_FUNKY_COMMENT === $this->parser_state ) {
3408+
return $this->get_modifiable_text();
3409+
}
3410+
3411+
if ( self::STATE_COMMENT !== $this->parser_state ) {
3412+
return null;
3413+
}
3414+
3415+
switch ( $this->get_comment_type() ) {
3416+
case self::COMMENT_AS_HTML_COMMENT:
3417+
case self::COMMENT_AS_ABRUPTLY_CLOSED_COMMENT:
3418+
return $this->get_modifiable_text();
3419+
3420+
case self::COMMENT_AS_CDATA_LOOKALIKE:
3421+
return "[CDATA[{$this->get_modifiable_text()}]]";
3422+
3423+
case self::COMMENT_AS_PI_NODE_LOOKALIKE:
3424+
return "?{$this->get_tag()}{$this->get_modifiable_text()}?";
3425+
3426+
/*
3427+
* This represents the "bogus comment state" from HTML tokenization.
3428+
* This can be entered by `<?` or `<!`, where `?` is included in
3429+
* the comment text but `!` is not.
3430+
*/
3431+
case self::COMMENT_AS_INVALID_HTML:
3432+
$preceding_character = $this->html[ $this->text_starts_at - 1 ];
3433+
$comment_start = '?' === $preceding_character ? '?' : '';
3434+
return "{$comment_start}{$this->get_modifiable_text()}";
3435+
}
3436+
3437+
return null;
3438+
}
3439+
33883440
/**
33893441
* Subdivides a matched text node, splitting NULL byte sequences and decoded whitespace as
33903442
* distinct nodes prefixes.

tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php

Lines changed: 12 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -27,20 +27,17 @@ class Tests_HtmlApi_Html5lib extends WP_UnitTestCase {
2727
* Skip specific tests that may not be supported or have known issues.
2828
*/
2929
const SKIP_TESTS = array(
30-
'comments01/line0155' => 'Unimplemented: Need to access raw comment text on non-normative comments.',
31-
'comments01/line0169' => 'Unimplemented: Need to access raw comment text on non-normative comments.',
32-
'html5test-com/line0129' => 'Unimplemented: Need to access raw comment text on non-normative comments.',
33-
'noscript01/line0014' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.',
34-
'tests14/line0022' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.',
35-
'tests14/line0055' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.',
36-
'tests19/line0488' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.',
37-
'tests19/line0500' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.',
38-
'tests19/line1079' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.',
39-
'tests2/line0207' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.',
40-
'tests2/line0686' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.',
41-
'tests2/line0697' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.',
42-
'tests2/line0709' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.',
43-
'webkit01/line0231' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.',
30+
'noscript01/line0014' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.',
31+
'tests14/line0022' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.',
32+
'tests14/line0055' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.',
33+
'tests19/line0488' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.',
34+
'tests19/line0500' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.',
35+
'tests19/line1079' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.',
36+
'tests2/line0207' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.',
37+
'tests2/line0686' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.',
38+
'tests2/line0697' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.',
39+
'tests2/line0709' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.',
40+
'webkit01/line0231' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.',
4441
);
4542

4643
/**
@@ -315,26 +312,8 @@ static function ( $a, $b ) {
315312
break;
316313

317314
case '#comment':
318-
switch ( $processor->get_comment_type() ) {
319-
case WP_HTML_Processor::COMMENT_AS_ABRUPTLY_CLOSED_COMMENT:
320-
case WP_HTML_Processor::COMMENT_AS_HTML_COMMENT:
321-
case WP_HTML_Processor::COMMENT_AS_INVALID_HTML:
322-
$comment_text_content = $processor->get_modifiable_text();
323-
break;
324-
325-
case WP_HTML_Processor::COMMENT_AS_CDATA_LOOKALIKE:
326-
$comment_text_content = "[CDATA[{$processor->get_modifiable_text()}]]";
327-
break;
328-
329-
case WP_HTML_Processor::COMMENT_AS_PI_NODE_LOOKALIKE:
330-
$comment_text_content = "?{$processor->get_tag()}{$processor->get_modifiable_text()}?";
331-
break;
332-
333-
default:
334-
throw new Error( "Unhandled comment type for tree construction: {$processor->get_comment_type()}" );
335-
}
336315
// Comments must be "<" then "!-- " then the data then " -->".
337-
$output .= str_repeat( self::TREE_INDENT, $indent_level ) . "<!-- {$comment_text_content} -->\n";
316+
$output .= str_repeat( self::TREE_INDENT, $indent_level ) . "<!-- {$processor->get_full_comment_text()} -->\n";
338317
break;
339318

340319
default:

0 commit comments

Comments
 (0)