Skip to content

Commit ce08149

Browse files
committed
Merge PR #51: preserve rawtext contents
# Conflicts: # tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php
2 parents ce2af0e + ae3a8be commit ce08149

2 files changed

Lines changed: 132 additions & 6 deletions

File tree

src/wp-includes/html-api/class-wp-html-processor.php

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1495,6 +1495,8 @@ public function serialize(): ?string {
14951495
*
14961496
* @since 6.7.0
14971497
* @since 6.9.0 Converted from protected to public method.
1498+
* @since 7.1.0 Contents of IFRAME, NOEMBED, and NOFRAMES elements are
1499+
* serialized literally instead of being dropped.
14981500
*
14991501
* @return string Serialization of token, or empty string if no serialization exists.
15001502
*/
@@ -1636,17 +1638,38 @@ public function serialize_token(): string {
16361638
$text = $this->get_modifiable_text();
16371639

16381640
switch ( $tag_name ) {
1641+
/*
1642+
* The contents of these elements are emitted literally to preserve
1643+
* the document's contents, following the HTML serialization spec:
1644+
*
1645+
* > If the parent of current node is a style, script, xmp, iframe,
1646+
* > noembed, noframes, or plaintext element, or if the parent of
1647+
* > current node is a noscript element and scripting is enabled for
1648+
* > the node, then append the value of current node's data literally.
1649+
*
1650+
* This is safe because character references are never decoded in
1651+
* their contents. RAWTEXT contents (IFRAME, NOEMBED, NOFRAMES,
1652+
* STYLE, XMP) cannot contain their own closing tag, so the closer
1653+
* appended below cannot be matched early. SCRIPT data may contain
1654+
* escaped closers (e.g. following `<!-- <script>`), but re-parsing the
1655+
* identical bytes follows the same tokenization rules that produced
1656+
* this text, terminating at the appended closer all the same.
1657+
*
1658+
* @see https://html.spec.whatwg.org/multipage/parsing.html#serialising-html-fragments
1659+
*/
16391660
case 'IFRAME':
16401661
case 'NOEMBED':
16411662
case 'NOFRAMES':
1642-
$text = '';
1643-
break;
1644-
16451663
case 'SCRIPT':
16461664
case 'STYLE':
16471665
case 'XMP':
16481666
break;
16491667

1668+
/*
1669+
* The contents of TEXTAREA and TITLE are parsed as RCDATA, in which
1670+
* character references are decoded, so the decoded modifiable text
1671+
* must be re-escaped to preserve the document's contents.
1672+
*/
16501673
default:
16511674
$text = self::serialize_decoded_text( $text );
16521675
}

tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php

Lines changed: 106 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -260,16 +260,115 @@ public function test_style_contents_are_not_escaped() {
260260
/**
261261
* Ensures that XMP contents are not escaped, as they are not parsed like text nodes are.
262262
*
263-
* @ticket 62036
263+
* XMP contents are parsed as raw text: character references are never decoded.
264+
* Escaping the contents would change the document, e.g. a "<" would be replaced
265+
* by the literal text "&lt;" after serializing and re-parsing.
266+
*
267+
* @ticket 65372
264268
*/
265269
public function test_xmp_contents_are_not_escaped() {
266270
$this->assertSame(
267-
WP_HTML_Processor::normalize( "<xmp>apples > or\x00anges & pears < plums</xmp>" ),
268-
"<xmp>apples > or\u{FFFD}anges & pears < plums</xmp>",
271+
"<xmp>1 < 2 &amp; apples > or\u{FFFD}anges</xmp>",
272+
WP_HTML_Processor::normalize( "<xmp>1 < 2 &amp; apples > or\x00anges</xmp>" ),
269273
'Should have preserved text inside an XMP element, except for replacing NULL bytes.'
270274
);
271275
}
272276

277+
/**
278+
* Ensures that the contents of IFRAME, NOEMBED, and NOFRAMES elements are
279+
* preserved when serializing.
280+
*
281+
* These elements contain raw text which is part of the parsed document.
282+
* Dropping it would change the document's contents across a serialize and
283+
* re-parse cycle.
284+
*
285+
* @ticket 65372
286+
*
287+
* @dataProvider data_rawtext_elements_with_contents
288+
*
289+
* @param string $html Normalized HTML containing a rawtext element with contents.
290+
*/
291+
public function test_rawtext_element_contents_are_preserved_when_normalizing( string $html ) {
292+
$this->assertSame(
293+
$html,
294+
WP_HTML_Processor::normalize( $html ),
295+
'Should have preserved the rawtext element contents.'
296+
);
297+
}
298+
299+
/**
300+
* Data provider.
301+
*
302+
* @return array[]
303+
*/
304+
public static function data_rawtext_elements_with_contents() {
305+
return array(
306+
'IFRAME with following text' => array( '<iframe>x</iframe>y' ),
307+
'NOEMBED with following text' => array( '<noembed>x</noembed>y' ),
308+
'NOFRAMES with following text' => array( '<section><noframes>x</noframes>y</section>' ),
309+
'NOFRAMES before comment' => array( '<section><noframes>x</noframes><!----></section>' ),
310+
'IFRAME with markup-like contents' => array( '<iframe><div>inert</div></iframe>' ),
311+
'NOEMBED with character reference' => array( '<noembed>&amp;</noembed>' ),
312+
'IFRAME in foreign content' => array( '<svg><iframe>1 &lt; 2</iframe></svg>' ),
313+
);
314+
}
315+
316+
/**
317+
* Ensures that the contents of IFRAME, NOEMBED, and NOFRAMES elements are
318+
* preserved when serializing full documents, including NOFRAMES elements
319+
* in the HEAD or after a FRAMESET.
320+
*
321+
* @ticket 65372
322+
*
323+
* @dataProvider data_full_documents_with_rawtext_elements
324+
*
325+
* @param string $html Input HTML document.
326+
* @param string $expected Expected serialization of the full document.
327+
*/
328+
public function test_rawtext_element_contents_are_preserved_in_full_documents( string $html, string $expected ) {
329+
$processor = WP_HTML_Processor::create_full_parser( $html );
330+
331+
$this->assertSame(
332+
$expected,
333+
$processor->serialize(),
334+
'Should have preserved the rawtext element contents.'
335+
);
336+
}
337+
338+
/**
339+
* Data provider.
340+
*
341+
* @return array[]
342+
*/
343+
public static function data_full_documents_with_rawtext_elements() {
344+
return array(
345+
'IFRAME in BODY' => array(
346+
'<iframe>x</iframe>y',
347+
'<html><head></head><body><iframe>x</iframe>y</body></html>',
348+
),
349+
'NOEMBED in BODY' => array(
350+
'a<noembed>x</noembed>',
351+
'<html><head></head><body>a<noembed>x</noembed></body></html>',
352+
),
353+
'NOFRAMES in BODY' => array(
354+
'a<noframes>x</noframes>',
355+
'<html><head></head><body>a<noframes>x</noframes></body></html>',
356+
),
357+
'NOFRAMES in HEAD' => array(
358+
'<head><noframes>x</noframes></head>z',
359+
'<html><head><noframes>x</noframes></head><body>z</body></html>',
360+
),
361+
'NOFRAMES in FRAMESET' => array(
362+
'<html><frameset><noframes>x</noframes>',
363+
'<html><head></head><frameset><noframes>x</noframes></frameset></html>',
364+
),
365+
'IFRAME before a comment' => array(
366+
'<h3><div><small><dd><iframe>x</iframe><!---->',
367+
'<html><head></head><body><h3><div><small><dd><iframe>x</iframe><!----></dd></small></div></h3></body></html>',
368+
),
369+
);
370+
}
371+
273372
public function test_unexpected_closing_tags_are_removed() {
274373
$this->assertSame(
275374
WP_HTML_Processor::normalize( 'one</div>two</span>three' ),
@@ -447,6 +546,10 @@ public static function data_tokens_with_null_bytes() {
447546
'Foreign content text' => array( "<svg>one\x00two</svg>", "<svg>one\u{FFFD}two</svg>" ),
448547
'SCRIPT content' => array( "<script>alert(\x00)</script>", "<script>alert(\u{FFFD})</script>" ),
449548
'STYLE content' => array( "<style>\x00 {}</style>", "<style>\u{FFFD} {}</style>" ),
549+
'IFRAME content' => array( "<iframe>a\x00b</iframe>", "<iframe>a\u{FFFD}b</iframe>" ),
550+
'NOEMBED content' => array( "<noembed>a\x00b</noembed>", "<noembed>a\u{FFFD}b</noembed>" ),
551+
'NOFRAMES content' => array( "<noframes>a\x00b</noframes>", "<noframes>a\u{FFFD}b</noframes>" ),
552+
'XMP content' => array( "<xmp>a\x00b</xmp>", "<xmp>a\u{FFFD}b</xmp>" ),
450553
'Comment text' => array( "<!-- \x00 -->", "<!-- \u{FFFD} -->" ),
451554
);
452555
}

0 commit comments

Comments
 (0)