Skip to content

Commit 8b2ed2f

Browse files
committed
HTML API: Add explicit handling or failure for all tags.
The HTML API HTML Processor does not yet support all tags. Many tags (e.g. list elements) have complicated rules in the [https://html.spec.whatwg.org/#parsing-main-inbody "in body" insertion mode]. Implementing these special rules is blocking the implementation of a catch-all rule for "any other tag", because we need to prevent tags with special rules from being handled by the catch-all. The specification's catch-all rule reads: "Any other start tag: Reconstruct the active formatting elements, if any. Insert an HTML element for the token. …" This change ensures the HTML Processor fails when handling special tags. This is the same as existing behavior, but will allow us to implement the catch-all "any other tag" handling without unintentionally handling special elements. Additionally, we add tests that assert the special elements are unhandled. As these tags are implemented, this should help to ensure they're removed from the unsupported tag list. Props jonsurrell, dmsnell. Fixes #60092. git-svn-id: https://develop.svn.wordpress.org/trunk@57248 602fd350-edb4-49c9-b593-d223f7449a82
1 parent b315d4e commit 8b2ed2f

4 files changed

Lines changed: 290 additions & 154 deletions

File tree

src/wp-includes/html-api/class-wp-html-processor.php

Lines changed: 124 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -100,15 +100,19 @@
100100
* The following list specifies the HTML tags that _are_ supported:
101101
*
102102
* - Containers: ADDRESS, BLOCKQUOTE, DETAILS, DIALOG, DIV, FOOTER, HEADER, MAIN, MENU, SPAN, SUMMARY.
103-
* - Form elements: BUTTON, FIELDSET, SEARCH.
103+
* - Custom elements: All custom elements are supported. :)
104+
* - Form elements: BUTTON, DATALIST, FIELDSET, LABEL, LEGEND, METER, PROGRESS, SEARCH.
104105
* - Formatting elements: B, BIG, CODE, EM, FONT, I, SMALL, STRIKE, STRONG, TT, U.
105106
* - Heading elements: H1, H2, H3, H4, H5, H6, HGROUP.
106107
* - Links: A.
107108
* - Lists: DL.
108-
* - Media elements: FIGCAPTION, FIGURE, IMG.
109+
* - Media elements: AUDIO, CANVAS, FIGCAPTION, FIGURE, IMG, MAP, PICTURE, VIDEO.
109110
* - Paragraph: P.
110-
* - Sectioning elements: ARTICLE, ASIDE, NAV, SECTION
111-
* - Deprecated elements: CENTER, DIR
111+
* - Phrasing elements: ABBR, BDI, BDO, CITE, DATA, DEL, DFN, INS, MARK, OUTPUT, Q, SAMP, SUB, SUP, TIME, VAR.
112+
* - Sectioning elements: ARTICLE, ASIDE, NAV, SECTION.
113+
* - Templating elements: SLOT.
114+
* - Text decoration: RUBY.
115+
* - Deprecated elements: ACRONYM, BLINK, CENTER, DIR, ISINDEX, MULTICOL, NEXTID, SPACER.
112116
*
113117
* ### Supported markup
114118
*
@@ -830,41 +834,132 @@ private function step_in_body() {
830834
$this->reconstruct_active_formatting_elements();
831835
$this->insert_html_element( $this->state->current_token );
832836
return true;
837+
}
838+
839+
/*
840+
* These tags require special handling in the 'in body' insertion mode
841+
* but that handling hasn't yet been implemented.
842+
*
843+
* As the rules for each tag are implemented, the corresponding tag
844+
* name should be removed from this list. An accompanying test should
845+
* help ensure this list is maintained.
846+
*
847+
* @see Tests_HtmlApi_WpHtmlProcessor::test_step_in_body_fails_on_unsupported_tags
848+
*
849+
* Since this switch structure throws a WP_HTML_Unsupported_Exception, it's
850+
* possible to handle "any other start tag" and "any other end tag" below,
851+
* as that guarantees execution doesn't proceed for the unimplemented tags.
852+
*
853+
* @see https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inbody
854+
*/
855+
switch ( $tag_name ) {
856+
case 'APPLET':
857+
case 'AREA':
858+
case 'BASE':
859+
case 'BASEFONT':
860+
case 'BGSOUND':
861+
case 'BODY':
862+
case 'BR':
863+
case 'CAPTION':
864+
case 'COL':
865+
case 'COLGROUP':
866+
case 'DD':
867+
case 'DT':
868+
case 'EMBED':
869+
case 'FORM':
870+
case 'FRAME':
871+
case 'FRAMESET':
872+
case 'HEAD':
873+
case 'HR':
874+
case 'HTML':
875+
case 'IFRAME':
876+
case 'INPUT':
877+
case 'KEYGEN':
878+
case 'LI':
879+
case 'LINK':
880+
case 'LISTING':
881+
case 'MARQUEE':
882+
case 'MATH':
883+
case 'META':
884+
case 'NOBR':
885+
case 'NOEMBED':
886+
case 'NOFRAMES':
887+
case 'NOSCRIPT':
888+
case 'OBJECT':
889+
case 'OL':
890+
case 'OPTGROUP':
891+
case 'OPTION':
892+
case 'PARAM':
893+
case 'PLAINTEXT':
894+
case 'PRE':
895+
case 'RB':
896+
case 'RP':
897+
case 'RT':
898+
case 'RTC':
899+
case 'SARCASM':
900+
case 'SCRIPT':
901+
case 'SELECT':
902+
case 'SOURCE':
903+
case 'STYLE':
904+
case 'SVG':
905+
case 'TABLE':
906+
case 'TBODY':
907+
case 'TD':
908+
case 'TEMPLATE':
909+
case 'TEXTAREA':
910+
case 'TFOOT':
911+
case 'TH':
912+
case 'THEAD':
913+
case 'TITLE':
914+
case 'TR':
915+
case 'TRACK':
916+
case 'UL':
917+
case 'WBR':
918+
case 'XMP':
919+
$this->last_error = self::ERROR_UNSUPPORTED;
920+
throw new WP_HTML_Unsupported_Exception( "Cannot process {$tag_name} element." );
921+
}
833922

923+
if ( ! $this->is_tag_closer() ) {
834924
/*
835925
* > Any other start tag
836926
*/
837-
case '+SPAN':
838-
$this->reconstruct_active_formatting_elements();
839-
$this->insert_html_element( $this->state->current_token );
840-
return true;
927+
$this->reconstruct_active_formatting_elements();
928+
$this->insert_html_element( $this->state->current_token );
929+
return true;
930+
} else {
931+
/*
932+
* > Any other end tag
933+
*/
841934

842935
/*
843-
* Any other end tag
936+
* Find the corresponding tag opener in the stack of open elements, if
937+
* it exists before reaching a special element, which provides a kind
938+
* of boundary in the stack. For example, a `</custom-tag>` should not
939+
* close anything beyond its containing `P` or `DIV` element.
844940
*/
845-
case '-SPAN':
846-
foreach ( $this->state->stack_of_open_elements->walk_up() as $item ) {
847-
// > If node is an HTML element with the same tag name as the token, then:
848-
if ( $item->node_name === $tag_name ) {
849-
$this->generate_implied_end_tags( $tag_name );
941+
foreach ( $this->state->stack_of_open_elements->walk_up() as $node ) {
942+
if ( $tag_name === $node->node_name ) {
943+
break;
944+
}
850945

851-
// > If node is not the current node, then this is a parse error.
946+
if ( self::is_special( $node->node_name ) ) {
947+
// This is a parse error, ignore the token.
948+
return $this->step();
949+
}
950+
}
852951

853-
$this->state->stack_of_open_elements->pop_until( $tag_name );
854-
return true;
855-
}
952+
$this->generate_implied_end_tags( $tag_name );
953+
if ( $node !== $this->state->stack_of_open_elements->current_node() ) {
954+
// @todo Record parse error: this error doesn't impact parsing.
955+
}
856956

857-
// > Otherwise, if node is in the special category, then this is a parse error; ignore the token, and return.
858-
if ( self::is_special( $item->node_name ) ) {
859-
return $this->step();
860-
}
957+
foreach ( $this->state->stack_of_open_elements->walk_up() as $item ) {
958+
$this->state->stack_of_open_elements->pop();
959+
if ( $node === $item ) {
960+
return true;
861961
}
862-
// Execution should not reach here; if it does then something went wrong.
863-
return false;
864-
865-
default:
866-
$this->last_error = self::ERROR_UNSUPPORTED;
867-
throw new WP_HTML_Unsupported_Exception( "Cannot process {$tag_name} element." );
962+
}
868963
}
869964
}
870965

@@ -1264,7 +1359,7 @@ private function run_adoption_agency_algorithm() {
12641359

12651360
// > If formatting element is not in the stack of open elements, then this is a parse error; remove the element from the list, and return.
12661361
if ( ! $this->state->stack_of_open_elements->contains_node( $formatting_element ) ) {
1267-
$this->state->active_formatting_elements->remove_node( $formatting_element->bookmark_name );
1362+
$this->state->active_formatting_elements->remove_node( $formatting_element );
12681363
return;
12691364
}
12701365

tests/phpunit/tests/html-api/wpHtmlProcessor.php

Lines changed: 92 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -60,22 +60,6 @@ public function test_get_tag_is_null_once_document_is_finished() {
6060
$this->assertNull( $p->get_tag() );
6161
}
6262

63-
/**
64-
* Ensures that if the HTML Processor encounters inputs that it can't properly handle,
65-
* that it stops processing the rest of the document. This prevents data corruption.
66-
*
67-
* @ticket 59167
68-
*
69-
* @covers WP_HTML_Processor::next_tag
70-
*/
71-
public function test_stops_processing_after_unsupported_elements() {
72-
$p = WP_HTML_Processor::create_fragment( '<p><x-not-supported></p><p></p>' );
73-
$p->next_tag( 'P' );
74-
$this->assertFalse( $p->next_tag(), 'Stepped into a tag after encountering X-NOT-SUPPORTED element when it should have aborted.' );
75-
$this->assertNull( $p->get_tag(), "Should have aborted processing, but still reported tag {$p->get_tag()} after properly failing to step into tag." );
76-
$this->assertFalse( $p->next_tag( 'P' ), 'Stepped into normal P element after X-NOT-SUPPORTED element when it should have aborted.' );
77-
}
78-
7963
/**
8064
* Ensures that the HTML Processor maintains its internal state through seek calls.
8165
*
@@ -147,4 +131,96 @@ public function test_fails_to_reconstruct_formatting_elements() {
147131
$this->assertTrue( $p->next_tag( 'EM' ), 'Could not find first EM.' );
148132
$this->assertFalse( $p->next_tag( 'EM' ), 'Should have aborted before finding second EM as it required reconstructing the first EM.' );
149133
}
134+
135+
/**
136+
* Ensures that special handling of unsupported tags is cleaned up
137+
* as handling is implemented. Otherwise there's risk of leaving special
138+
* handling (that is never reached) when tag handling is implemented.
139+
*
140+
* @ticket 60092
141+
*
142+
* @dataProvider data_unsupported_special_in_body_tags
143+
*
144+
* @covers WP_HTML_Processor::step_in_body
145+
*
146+
* @param string $tag_name Name of the tag to test.
147+
*/
148+
public function test_step_in_body_fails_on_unsupported_tags( $tag_name ) {
	// Smallest possible document for the tag: an opener immediately followed by its closer.
	$fragment = WP_HTML_Processor::create_fragment( '<' . $tag_name . '></' . $tag_name . '>' );

	// The processor must not step into a tag whose "in body" rules are unimplemented.
	$this->assertFalse( $fragment->next_tag(), 'Should fail to find tag: ' . $tag_name . '.' );

	/*
	 * The failure must be reported as "unsupported".
	 *
	 * PHPUnit's signature is ( $expected, $actual, $message ): the constant goes first
	 * so that a failure message labels the two values correctly. assertSame() is used
	 * so the comparison is strict on both type and value.
	 */
	$this->assertSame( WP_HTML_Processor::ERROR_UNSUPPORTED, $fragment->get_last_error(), 'Should have unsupported last error.' );
}
153+
154+
/**
155+
* Data provider.
156+
*
157+
* @return array[]
158+
*/
159+
public function data_unsupported_special_in_body_tags() {
160+
return array(
161+
'APPLET' => array( 'APPLET' ),
162+
'AREA' => array( 'AREA' ),
163+
'BASE' => array( 'BASE' ),
164+
'BASEFONT' => array( 'BASEFONT' ),
165+
'BGSOUND' => array( 'BGSOUND' ),
166+
'BODY' => array( 'BODY' ),
167+
'BR' => array( 'BR' ),
168+
'CAPTION' => array( 'CAPTION' ),
169+
'COL' => array( 'COL' ),
170+
'COLGROUP' => array( 'COLGROUP' ),
171+
'DD' => array( 'DD' ),
172+
'DT' => array( 'DT' ),
173+
'EMBED' => array( 'EMBED' ),
174+
'FORM' => array( 'FORM' ),
175+
'FRAME' => array( 'FRAME' ),
176+
'FRAMESET' => array( 'FRAMESET' ),
177+
'HEAD' => array( 'HEAD' ),
178+
'HR' => array( 'HR' ),
179+
'HTML' => array( 'HTML' ),
180+
'IFRAME' => array( 'IFRAME' ),
181+
'INPUT' => array( 'INPUT' ),
182+
'KEYGEN' => array( 'KEYGEN' ),
183+
'LI' => array( 'LI' ),
184+
'LINK' => array( 'LINK' ),
185+
'LISTING' => array( 'LISTING' ),
186+
'MARQUEE' => array( 'MARQUEE' ),
187+
'MATH' => array( 'MATH' ),
188+
'META' => array( 'META' ),
189+
'NOBR' => array( 'NOBR' ),
190+
'NOEMBED' => array( 'NOEMBED' ),
191+
'NOFRAMES' => array( 'NOFRAMES' ),
192+
'NOSCRIPT' => array( 'NOSCRIPT' ),
193+
'OBJECT' => array( 'OBJECT' ),
194+
'OL' => array( 'OL' ),
195+
'OPTGROUP' => array( 'OPTGROUP' ),
196+
'OPTION' => array( 'OPTION' ),
197+
'PARAM' => array( 'PARAM' ),
198+
'PLAINTEXT' => array( 'PLAINTEXT' ),
199+
'PRE' => array( 'PRE' ),
200+
'RB' => array( 'RB' ),
201+
'RP' => array( 'RP' ),
202+
'RT' => array( 'RT' ),
203+
'RTC' => array( 'RTC' ),
204+
'SARCASM' => array( 'SARCASM' ),
205+
'SCRIPT' => array( 'SCRIPT' ),
206+
'SELECT' => array( 'SELECT' ),
207+
'SOURCE' => array( 'SOURCE' ),
208+
'STYLE' => array( 'STYLE' ),
209+
'SVG' => array( 'SVG' ),
210+
'TABLE' => array( 'TABLE' ),
211+
'TBODY' => array( 'TBODY' ),
212+
'TD' => array( 'TD' ),
213+
'TEMPLATE' => array( 'TEMPLATE' ),
214+
'TEXTAREA' => array( 'TEXTAREA' ),
215+
'TFOOT' => array( 'TFOOT' ),
216+
'TH' => array( 'TH' ),
217+
'THEAD' => array( 'THEAD' ),
218+
'TITLE' => array( 'TITLE' ),
219+
'TR' => array( 'TR' ),
220+
'TRACK' => array( 'TRACK' ),
221+
'UL' => array( 'UL' ),
222+
'WBR' => array( 'WBR' ),
223+
'XMP' => array( 'XMP' ),
224+
);
225+
}
150226
}

0 commit comments

Comments
 (0)