Skip to content

Commit c57f20f

Browse files
dmsnellwestonrutersirreal
committed
HTML API: Trigger active format reconstruction when reaching text nodes.
When encountering text nodes in an HTML document, the HTML parser needs to run the active format reconstruction algorithm, even if it doesn't stop to visit those text nodes. This is because the formats, which might need reconstructing, will impact the breadcrumbs of all downstream nodes from the text node. In this patch, this process is triggered, but the text nodes are then skipped, since the HTML Processor doesn't currently support visiting them. Co-authored-by: Weston Ruter <westonruter@git.wordpress.org> Co-authored-by: Jon Surrell <jonsurrell@git.wordpress.org>
1 parent faf0ed0 commit c57f20f

2 files changed

Lines changed: 107 additions & 0 deletions

File tree

src/wp-includes/html-api/class-wp-html-processor.php

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -623,6 +623,49 @@ private function step_in_body() {
623623
$op_sigil = $this->is_tag_closer() ? '-' : '+';
624624
$op = "{$op_sigil}{$tag_name}";
625625

626+
if ( null === $tag_name && '#text' === $this->get_token_type() ) {
627+
/*
628+
* This rule is necessary even without supporting text nodes in the
629+
* HTML Processor because the parser has to move past text nodes, and
630+
* there could be breadcrumb implications when the text triggers the
631+
* active format reconstruction.
632+
*/
633+
$this->reconstruct_active_formatting_elements();
634+
635+
$current_token = $this->bookmarks[ $this->state->current_token->bookmark_name ];
636+
637+
/*
638+
* > A character token that is U+0000 NULL
639+
*
640+
* Any successive sequence of NULL bytes is ignored and won't
641+
* trigger active format reconstruction. Therefore, if the text
642+
* only comprises NULL bytes then the token should be ignored
643+
* here, but if there are any other characters in the stream
644+
* the active formats should be reconstructed.
645+
*/
646+
if (
647+
1 <= $current_token->length &&
648+
"\x00" === $this->html[ $current_token->start ] &&
649+
strspn( $this->html, "\x00", $current_token->start, $current_token->length ) === $current_token->length
650+
) {
651+
// Parse error: ignore the token.
652+
return $this->step();
653+
}
654+
655+
/*
656+
* Whitespace-only text does not affect the frameset-ok flag.
657+
* It is probably inter-element whitespace, but it may also
658+
* contain character references which decode only to whitespace.
659+
*/
660+
$text = $this->get_modifiable_text();
661+
if ( strlen( $text ) !== strspn( $text, " \t\n\f\r" ) ) {
662+
$this->state->frameset_ok = false;
663+
}
664+
665+
// @todo Add support for text nodes: insert node and return "true" here when supported.
666+
return $this->step();
667+
}
668+
626669
switch ( $op ) {
627670
/*
628671
* > A start tag whose tag name is "button"
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
<?php
2+
/**
3+
* Unit tests for the HTML API ensuring proper handling of behaviors related to
4+
* active format reconstruction.
5+
*
6+
* @package WordPress
7+
* @subpackage HTML-API
8+
*
9+
* @since 6.5.0
10+
*
11+
* @group html-api
12+
*
13+
* @coversDefaultClass WP_HTML_Processor
14+
*/
15+
class Tests_HtmlApi_WpHtmlSupportRequiredActiveFormatReconstruction extends WP_UnitTestCase {
	/**
	 * Ensures that active formats are properly reconstructed when visiting text nodes,
	 * verifying that the proper breadcrumbs are maintained when scanning through HTML.
	 *
	 * The fragment under test contains two text nodes ("One" and "Two"). The second
	 * one follows a P opener that appears while a B element is still open, which is
	 * the situation the test comments below describe as requiring reconstruction of
	 * the implicitly-closed B element.
	 *
	 * @ticket 60455
	 */
	public function test_reconstructs_active_formats_on_text_nodes() {
		$processor = WP_HTML_Processor::create_fragment( '<p><b>One<span><p>Two<an-element>' );

		/*
		 * Assert that the SPAN was actually found: without this check a failed
		 * lookup would only surface as a confusing breadcrumb mismatch below.
		 */
		$this->assertTrue(
			$processor->next_tag( 'span' ),
			'Should have found the SPAN element following the first text node.'
		);

		$this->assertSame(
			array( 'HTML', 'BODY', 'P', 'B', 'SPAN' ),
			$processor->get_breadcrumbs(),
			'Should have identified the stack of open elements for the first text node.'
		);

		$this->assertTrue(
			$processor->next_tag( 'p' ),
			'Should have found second P element.'
		);

		/*
		 * There are two ways this test could fail. One is to appropriately find the
		 * second text node but fail to reconstruct the implicitly-closed B element.
		 * The other way is to fail to abort when encountering the second text node
		 * because the kind of active format reconstruction isn't supported.
		 *
		 * At the time of writing this test, the HTML Processor bails whenever it
		 * needs to reconstruct active formats, unless there are no active formats.
		 * To ensure that this test properly works once that support is expanded,
		 * it's written to verify both circumstances. Once support is added, this
		 * can be simplified to only contain the first clause of the conditional.
		 */

		if ( $processor->next_tag( 'AN-ELEMENT' ) ) {
			$this->assertSame(
				array( 'HTML', 'BODY', 'P', 'B', 'AN-ELEMENT' ),
				$processor->get_breadcrumbs(),
				'Should have reconstructed the implicitly-closed B element.'
			);
		} else {
			$this->assertSame(
				WP_HTML_Processor::ERROR_UNSUPPORTED,
				$processor->get_last_error(),
				'Should have aborted for incomplete active format reconstruction when encountering the second text node.'
			);
		}
	}
}

0 commit comments

Comments
 (0)