Skip to content

Commit ae3a8be

Browse files
committed
HTML API: Stop escaping XMP contents when serializing.
XMP contents are raw text in which character references are never decoded. Escaping them changed document contents across a serialize/re-parse cycle: `<xmp>1 < 2</xmp>` serialized as `<xmp>1 &lt; 2</xmp>`, which re-parses as the literal text "1 &lt; 2". XMP contents now serialize literally like the other raw text elements, following the HTML fragment serialization algorithm. See #65372.
1 parent b2abcaa commit ae3a8be

2 files changed

Lines changed: 27 additions & 3 deletions

File tree

src/wp-includes/html-api/class-wp-html-processor.php

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1347,8 +1347,8 @@ public function serialize(): ?string {
13471347
*
13481348
* @since 6.7.0
13491349
* @since 6.9.0 Converted from protected to public method.
1350-
* @since 7.1.0 Contents of IFRAME, NOEMBED, and NOFRAMES elements are
1351-
* serialized literally instead of being dropped.
1350+
* @since 7.1.0 Contents of IFRAME, NOEMBED, NOFRAMES, and XMP elements are
1351+
* serialized literally instead of being dropped or escaped.
13521352
*
13531353
* @return string Serialization of token, or empty string if no serialization exists.
13541354
*/
@@ -1503,7 +1503,7 @@ public function serialize_token(): string {
15031503
*
15041504
* This is safe because character references are never decoded in
15051505
* their contents. RAWTEXT contents (IFRAME, NOEMBED, NOFRAMES,
1506-
* STYLE) cannot contain their own closing tag, so the closer
1506+
* STYLE, XMP) cannot contain their own closing tag, so the closer
15071507
* appended below cannot be matched early. SCRIPT data may contain
15081508
* escaped closers (e.g. within `<!-- -->`), but re-parsing the
15091509
* identical bytes follows the same tokenization rules that produced
@@ -1516,8 +1516,14 @@ public function serialize_token(): string {
15161516
case 'NOFRAMES':
15171517
case 'SCRIPT':
15181518
case 'STYLE':
1519+
case 'XMP':
15191520
break;
15201521

1522+
/*
1523+
* The contents of TEXTAREA and TITLE are parsed as RCDATA, in which
1524+
* character references are decoded, so the decoded modifiable text
1525+
* must be re-escaped to preserve the document's contents.
1526+
*/
15211527
default:
15221528
$text = htmlspecialchars( $text, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5, 'UTF-8' );
15231529
}

tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,23 @@ public function test_style_contents_are_not_escaped() {
134134
);
135135
}
136136

137+
/**
138+
* Ensures that XMP contents are not escaped, because they are not parsed the way ordinary text nodes are.
139+
*
140+
* XMP contents are parsed as raw text: character references are never decoded.
141+
* Escaping the contents would change the document, e.g. a "<" would be replaced
142+
* by the literal text "&lt;" after serializing and re-parsing.
143+
*
144+
* @ticket 65372
145+
*/
146+
public function test_xmp_contents_are_not_escaped() {
147+
$this->assertSame(
148+
"<xmp>1 < 2 &amp; apples > or\u{FFFD}anges</xmp>",
149+
WP_HTML_Processor::normalize( "<xmp>1 < 2 &amp; apples > or\x00anges</xmp>" ),
150+
'Should have preserved text inside an XMP element, except for replacing NULL bytes.'
151+
);
152+
}
153+
137154
/**
138155
* Ensures that the contents of IFRAME, NOEMBED, and NOFRAMES elements are
139156
* preserved when serializing.
@@ -379,6 +396,7 @@ public static function data_tokens_with_null_bytes() {
379396
'IFRAME content' => array( "<iframe>a\x00b</iframe>", "<iframe>a\u{FFFD}b</iframe>" ),
380397
'NOEMBED content' => array( "<noembed>a\x00b</noembed>", "<noembed>a\u{FFFD}b</noembed>" ),
381398
'NOFRAMES content' => array( "<noframes>a\x00b</noframes>", "<noframes>a\u{FFFD}b</noframes>" ),
399+
'XMP content' => array( "<xmp>a\x00b</xmp>", "<xmp>a\u{FFFD}b</xmp>" ),
382400
'Comment text' => array( "<!-- \x00 -->", "<!-- \u{FFFD} -->" ),
383401
);
384402
}

0 commit comments

Comments
 (0)