Skip to content

Commit 7d47eb5

Browse files
committed
HTML API: Fixes for issues discovered while fuzzing.
Fuzz-testing was performed against the HTML API for finding edge cases that might be broken in the existing parsing code. A few issues were discovered with HTML normalization and warnings from out-of-bounds string reads. This patch contains new tests catching regressions on these behaviors and adds fixes for the discovered issues. A special-case for FORM closing tags inside TABLEs was added after investigation turned up that many normalization issues stem from this single issue, where `next_tag()` would already fail to proceed, but the normalization was continuing with already-created virtual tokens. This special-case should be investigated as more support is added to the HTML API to ensure that it couldn’t be removed for more robust core code.
1 parent 6782f0e commit 7d47eb5

7 files changed

Lines changed: 321 additions & 11 deletions

File tree

src/wp-includes/class-wp-token-map.php

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -440,6 +440,10 @@ public static function from_precomputed_table( $state ): ?WP_Token_Map {
440440
* @return bool Whether there's an entry for the given word in the map.
441441
*/
442442
public function contains( string $word, string $case_sensitivity = 'case-sensitive' ): bool {
443+
if ( str_contains( $word, "\x00" ) ) {
444+
return false;
445+
}
446+
443447
$ignore_case = 'ascii-case-insensitive' === $case_sensitivity;
444448

445449
if ( $this->key_length >= strlen( $word ) ) {
@@ -533,9 +537,17 @@ public function read_token( string $text, int $offset = 0, &$matched_token_byte_
533537

534538
// Search for a long word first, if the text is long enough, and if that fails, a short one.
535539
if ( $text_length > $this->key_length ) {
536-
$group_key = substr( $text, $offset, $this->key_length );
540+
/*
541+
* Keys cannot contain null bytes, which is taken care of for the full words,
542+
* but here it’s required to reject group keys with null bytes so that the
543+
* lookup doesn’t get off track when scanning the group string.
544+
*/
545+
if ( strcspn( $text, "\x00", $offset, $this->key_length ) < $this->key_length ) {
546+
return null;
547+
}
537548

538-
$group_at = $ignore_case ? stripos( $this->groups, $group_key ) : strpos( $this->groups, $group_key );
549+
$group_key = substr( $text, $offset, $this->key_length );
550+
$group_at = $ignore_case ? stripos( $this->groups, $group_key ) : strpos( $this->groups, $group_key );
539551
if ( false === $group_at ) {
540552
// Perhaps a short word then.
541553
return strlen( $this->small_words ) > 0

src/wp-includes/html-api/class-wp-html-open-elements.php

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -738,7 +738,11 @@ public function after_element_pop( WP_HTML_Token $item ): void {
738738
* When adding support for new elements, expand this switch to trap
739739
* cases where the precalculated value needs to change.
740740
*/
741-
switch ( $item->node_name ) {
741+
$namespaced_name = 'html' === $item->namespace
742+
? $item->node_name
743+
: "{$item->namespace} {$item->node_name}";
744+
745+
switch ( $namespaced_name ) {
742746
case 'APPLET':
743747
case 'BUTTON':
744748
case 'CAPTION':

src/wp-includes/html-api/class-wp-html-processor.php

Lines changed: 54 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -813,8 +813,14 @@ private function next_visitable_token(): bool {
813813
* until there are events or until there are no more
814814
* tokens works in the meantime and isn't obviously wrong.
815815
*/
816-
if ( empty( $this->element_queue ) && $this->step() ) {
817-
return $this->next_visitable_token();
816+
if ( empty( $this->element_queue ) ) {
817+
if ( $this->step() ) {
818+
return $this->next_visitable_token();
819+
}
820+
821+
if ( isset( $this->last_error ) ) {
822+
return false;
823+
}
818824
}
819825

820826
// Process the next event on the queue.
@@ -1401,6 +1407,7 @@ public function serialize_token(): string {
14011407
$tag_name = str_replace( "\x00", "\u{FFFD}", $this->get_tag() );
14021408
$in_html = 'html' === $this->get_namespace();
14031409
$qualified_name = $in_html ? strtolower( $tag_name ) : $this->get_qualified_tag_name();
1410+
$qualified_name = str_replace( "\x00", "\u{FFFD}", $qualified_name );
14041411

14051412
if ( $this->is_tag_closer() ) {
14061413
$html .= "</{$qualified_name}>";
@@ -1414,15 +1421,36 @@ public function serialize_token(): string {
14141421
}
14151422

14161423
$html .= "<{$qualified_name}";
1424+
1425+
$previous_attribute_was_true = false;
1426+
$seen_attribute_names = array();
14171427
foreach ( $attribute_names as $attribute_name ) {
1418-
$html .= " {$this->get_qualified_attribute_name( $attribute_name )}";
1428+
$qualified_attribute_name = $this->get_qualified_attribute_name( $attribute_name );
1429+
$qualified_attribute_name = str_replace( "\x00", "\u{FFFD}", $qualified_attribute_name );
1430+
$qualified_attribute_name = wp_scrub_utf8( $qualified_attribute_name );
1431+
if ( isset( $seen_attribute_names[ $qualified_attribute_name ] ) ) {
1432+
continue;
1433+
} else {
1434+
$seen_attribute_names[ $qualified_attribute_name ] = true;
1435+
}
1436+
1437+
if (
1438+
$previous_attribute_was_true &&
1439+
isset( $qualified_attribute_name[0] ) &&
1440+
'=' === $qualified_attribute_name[0]
1441+
) {
1442+
$html .= '=""';
1443+
}
1444+
1445+
$html .= " {$qualified_attribute_name}";
14191446
$value = $this->get_attribute( $attribute_name );
14201447

14211448
if ( is_string( $value ) ) {
14221449
$html .= '="' . htmlspecialchars( $value, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5 ) . '"';
14231450
}
14241451

1425-
$html = str_replace( "\x00", "\u{FFFD}", $html );
1452+
$previous_attribute_was_true = true === $value;
1453+
$html = str_replace( "\x00", "\u{FFFD}", $html );
14261454
}
14271455

14281456
if ( ! $in_html && $this->has_self_closing_flag() ) {
@@ -2667,8 +2695,7 @@ private function step_in_body(): bool {
26672695
*/
26682696
case '-FORM':
26692697
if ( ! $this->state->stack_of_open_elements->contains( 'TEMPLATE' ) ) {
2670-
$node = $this->state->form_element;
2671-
$this->state->form_element = null;
2698+
$node = $this->state->form_element;
26722699

26732700
/*
26742701
* > If node is null or if the stack of open elements does not have node
@@ -2681,10 +2708,20 @@ private function step_in_body(): bool {
26812708
null === $node ||
26822709
! $this->state->stack_of_open_elements->has_element_in_scope( 'FORM' )
26832710
) {
2684-
// Parse error: ignore the token.
2711+
/*
2712+
* Parse error: ignore the token.
2713+
*
2714+
* Keep the form pointer intact when the end tag is ignored, such as
2715+
* when a FORM closing tag appears inside an SVG TITLE integration
2716+
* point. Otherwise the ignored token changes parser state in a way
2717+
* that serialization cannot represent, allowing a later FORM opener
2718+
* to appear in the first normalization pass and disappear on the second.
2719+
*/
26852720
return $this->step();
26862721
}
26872722

2723+
$this->state->form_element = null;
2724+
26882725
$this->generate_implied_end_tags();
26892726
if ( $node !== $this->state->stack_of_open_elements->current_node() ) {
26902727
// @todo Indicate a parse error once it's possible. This error does not impact the logic here.
@@ -3492,6 +3529,16 @@ private function step_in_table(): bool {
34923529
$this->state->form_element = $this->state->current_token;
34933530
$this->state->stack_of_open_elements->pop();
34943531
return true;
3532+
3533+
/*
3534+
* > Anything else
3535+
*
3536+
* A FORM end tag in table insertion mode is processed through the "in body"
3537+
* rules with foster parenting enabled. Because this token does not insert
3538+
* DOM content, the in-body handling is sufficient here.
3539+
*/
3540+
case '-FORM':
3541+
return $this->step_in_body();
34953542
}
34963543

34973544
/*

src/wp-includes/html-api/class-wp-html-tag-processor.php

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1424,7 +1424,7 @@ private function skip_rcdata( string $tag_name ): bool {
14241424
$this->tag_name_starts_at = $at;
14251425

14261426
// Fail if there is no possible tag closer.
1427-
if ( false === $at || ( $at + $tag_length ) >= $doc_length ) {
1427+
if ( false === $at || ( $at + 2 + $tag_length ) > $doc_length ) {
14281428
return false;
14291429
}
14301430

@@ -1815,6 +1815,12 @@ private function parse_next_tag(): bool {
18151815

18161816
// Abruptly-closed empty comments are a sequence of dashes followed by `>`.
18171817
$span_of_dashes = strspn( $html, '-', $closer_at );
1818+
if ( $doc_length <= $span_of_dashes + $closer_at ) {
1819+
$this->parser_state = self::STATE_INCOMPLETE_INPUT;
1820+
1821+
return false;
1822+
}
1823+
18181824
if ( '>' === $html[ $closer_at + $span_of_dashes ] ) {
18191825
/*
18201826
* @todo When implementing `set_modifiable_text()` ensure that updates to this token

tests/phpunit/tests/html-api/wpHtmlDecoder.php

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,31 @@ public static function data_edge_cases() {
3636
);
3737
}
3838

39+
/**
40+
* Ensures that character references followed by NULL bytes do not emit native PHP errors.
41+
*
42+
* @ticket {TICKET_NUMBER}
43+
*/
44+
public function test_character_reference_with_null_byte_does_not_emit_native_errors() {
45+
$errors = array();
46+
set_error_handler(
47+
static function ( int $errno, string $errstr ) use ( &$errors ) {
48+
$errors[] = "{$errno}: {$errstr}";
49+
return true;
50+
}
51+
);
52+
53+
try {
54+
$decoded = WP_HTML_Decoder::decode_text_node( "&\x00b" );
55+
} finally {
56+
restore_error_handler();
57+
}
58+
59+
// Use assertSame() instead of assertEmpty() so PHPUnit shows captured error messages on failure.
60+
$this->assertSame( array(), $errors );
61+
$this->assertSame( "&\x00b", $decoded, 'Should have decoded the text without changing it.' );
62+
}
63+
3964
/**
4065
* Ensures proper detection of attribute prefixes ignoring ASCII case.
4166
*

tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php

Lines changed: 174 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -340,6 +340,180 @@ public function test_normalize_special_leading_newline_handling( string $input,
340340
$this->assertEqualHTML( $expected, $normalized_twice );
341341
}
342342

343+
/**
344+
* Ensures that fuzzer-discovered inputs do not emit native PHP errors.
345+
*
346+
* @ticket {TICKET_NUMBER}
347+
*
348+
* @dataProvider data_provider_fuzzer_native_error_cases
349+
*
350+
* @param string $input HTML input.
351+
* @param string|null $expected Expected normalized output, or null when unsupported.
352+
*/
353+
public function test_normalize_fuzzer_cases_do_not_emit_native_errors( string $input, ?string $expected ) {
354+
$errors = array();
355+
356+
/*
357+
* This test is checking for native PHP warnings/notices. Unsupported HTML may
358+
* intentionally cause wp_trigger_error() under WP_DEBUG, which is separate
359+
* from the native errors this regression test is trying to catch.
360+
*/
361+
add_filter( 'wp_trigger_error_trigger_error', '__return_false' );
362+
set_error_handler(
363+
static function ( int $errno, string $errstr ) use ( &$errors ) {
364+
$errors[] = "{$errno}: {$errstr}";
365+
return true;
366+
}
367+
);
368+
369+
try {
370+
$normalized = WP_HTML_Processor::normalize( $input );
371+
} finally {
372+
restore_error_handler();
373+
remove_filter( 'wp_trigger_error_trigger_error', '__return_false' );
374+
}
375+
376+
// Use assertSame() instead of assertEmpty() so PHPUnit shows captured error messages on failure.
377+
$this->assertSame( array(), $errors );
378+
$this->assertSame( $expected, $normalized, 'Should have normalized the input.' );
379+
}
380+
381+
/**
382+
* Data provider.
383+
*
384+
* @return array[]
385+
*/
386+
public static function data_provider_fuzzer_native_error_cases() {
387+
return array(
388+
'Unsupported active formatting' => array( '<A><I><A>', null ),
389+
);
390+
}
391+
392+
/**
393+
* Ensures that normalized fuzzer-discovered inputs remain supported.
394+
*
395+
* @ticket {TICKET_NUMBER}
396+
*
397+
* @dataProvider data_provider_normalized_fuzzer_cases_that_should_remain_supported
398+
*
399+
* @param string $input HTML input.
400+
*/
401+
public function test_normalized_fuzzer_cases_should_remain_supported( string $input ) {
402+
$errors = array();
403+
set_error_handler(
404+
static function ( int $errno, string $errstr ) use ( &$errors ) {
405+
$errors[] = "{$errno}: {$errstr}";
406+
return true;
407+
}
408+
);
409+
410+
try {
411+
$normalized = WP_HTML_Processor::normalize( $input );
412+
$normalized_twice = is_string( $normalized ) ? WP_HTML_Processor::normalize( $normalized ) : null;
413+
} finally {
414+
restore_error_handler();
415+
}
416+
417+
// Use assertSame() instead of assertEmpty() so PHPUnit shows captured error messages on failure.
418+
$this->assertSame( array(), $errors );
419+
$this->assertIsString( $normalized, 'Input HTML should normalize successfully.' );
420+
$this->assertIsString(
421+
$normalized_twice,
422+
'Normalized HTML should remain supported by the HTML Processor.'
423+
);
424+
}
425+
426+
/**
427+
* Data provider.
428+
*
429+
* @return array[]
430+
*/
431+
public static function data_provider_normalized_fuzzer_cases_that_should_remain_supported() {
432+
return array(
433+
'FORM in TABLE' => array( '<table><form>' ),
434+
'Mixed-case FORM in TABLE' => array( '<TABLE><Form>' ),
435+
'FORM in TABLE after SCRIPT' => array( '<table><script></script><form>' ),
436+
'Unclosed SVG TITLE after P in EM' => array( '<em><p><svg><title>' ),
437+
'Unclosed SVG TITLE after P in STRONG' => array(
438+
'<strong><p><svg ><title>',
439+
),
440+
);
441+
}
442+
443+
/**
444+
* Ensures that normalized fuzzer-discovered inputs are idempotent.
445+
*
446+
* @ticket {TICKET_NUMBER}
447+
*
448+
* @dataProvider data_provider_normalized_fuzzer_cases_that_should_be_idempotent
449+
*
450+
* @param string $input HTML input.
451+
*/
452+
public function test_normalized_fuzzer_cases_should_be_idempotent( string $input ) {
453+
$errors = array();
454+
set_error_handler(
455+
static function ( int $errno, string $errstr ) use ( &$errors ) {
456+
$errors[] = "{$errno}: {$errstr}";
457+
return true;
458+
}
459+
);
460+
461+
try {
462+
$normalized = WP_HTML_Processor::normalize( $input );
463+
$normalized_twice = is_string( $normalized ) ? WP_HTML_Processor::normalize( $normalized ) : null;
464+
} finally {
465+
restore_error_handler();
466+
}
467+
468+
// Use assertSame() instead of assertEmpty() so PHPUnit shows captured error messages on failure.
469+
$this->assertSame( array(), $errors );
470+
$this->assertIsString( $normalized, 'Input HTML should normalize successfully.' );
471+
$this->assertSame(
472+
$normalized,
473+
$normalized_twice,
474+
'Normalizing already-normalized HTML should not change it.'
475+
);
476+
}
477+
478+
/**
479+
* Data provider.
480+
*
481+
* @return array[]
482+
*/
483+
public static function data_provider_normalized_fuzzer_cases_that_should_be_idempotent() {
484+
return array(
485+
'Malformed quoted attribute boundary' => array( '<A "/=>' ),
486+
'Duplicate attribute after bare attribute' => array( '<A V=5 R V=""=>' ),
487+
'Duplicate DATA-ID after numeric attribute' => array( '<E DATA-ID=1 1 DATA-ID=""=>' ),
488+
'Duplicate attribute before tag end' => array( '<R V=5 R V=5 =>' ),
489+
'NULL byte in foreign tag name' => array( "<SVG><L\x00 D>" ),
490+
'Malformed closing-looking attribute' => array( '<a </=>' ),
491+
'Malformed self-closing attribute' => array( '<a h/=>' ),
492+
'Duplicate ID with quote boundary' => array( '<d ID=""" ID=""=>' ),
493+
'Mixed-case duplicate TITLE' => array( "<d TITLE=\"\"' title=\"\"=>" ),
494+
'Colon before self-closing slash' => array( '<e :/=>' ),
495+
'Duplicate class after bare attribute' => array( "<e class=y d class=''=>" ),
496+
'Duplicate DATA-ID after hyphen' => array( '<e data-id=1 - data-id="">' ),
497+
'Duplicate title after quotes' => array( "<e title=''' title=\"\"=>" ),
498+
'FORM with SVG TITLE text edge' => array( "<form ><svg ><title \"'></form><form>" ),
499+
'FORM with TABLE and SCRIPT' => array( '<form id><table te"><script></script><td srce" ID/></form><form claslicate">' ),
500+
'FORM with TABLE CAPTION' => array( '<form><table><caption></form><form >' ),
501+
'Short malformed G attribute C' => array( '<g c/=>' ),
502+
'Short malformed G attribute S' => array( '<g s/=>' ),
503+
'Duplicate SRC boundary' => array( '<g src=""g src="">' ),
504+
'Short malformed H attribute' => array( '<h f/=>' ),
505+
'Malformed SRC equals boundary' => array( '<i src=""= src=""=">' ),
506+
'Malformed slash in tag opener' => array( '<i/t/=>' ),
507+
'Malformed L colon attribute' => array( '<l :/=>' ),
508+
'Malformed L less-than attribute' => array( '<l/</=>' ),
509+
'Malformed N less-than attribute' => array( '<n </=>' ),
510+
'Unclosed SVG TITLE after P' => array( '<p><svg><title>' ),
511+
'Duplicate ALT boundary' => array( '<r alt=\'\'d alt=""=>' ),
512+
'NULL byte in SVG child tag' => array( "<svg><l\x00 '>" ),
513+
'NULL byte before slash in SVG child tag' => array( "<svg><l\x00/r>" ),
514+
);
515+
}
516+
343517
/**
344518
* Data provider.
345519
*

0 commit comments

Comments
 (0)