From 6301a405b9dfb8033fb5e1efa1eddad918b21add Mon Sep 17 00:00:00 2001
From: Dennis Snell
Date: Tue, 21 Apr 2026 08:01:06 -0500
Subject: [PATCH] Tests: Print invalid UTF-8 as ASCII to fix hosts test reporting failures.

Reported in WordPress/phpunit-test-runner#310

When serializing test output into XML, invalid UTF-8 bytes lead to a
failure to load those test results when they are read. This patch adds
code to remap those invalid bytes into an ASCII-readable form, whereby
each run of invalid bytes is wrapped in parentheses and encoded in
uppercase hex form. Both the expected and the actual value are remapped
when an assertion is about to fail. This ensures that a proper XML file
is generated from the testing results.
---
 tests/phpunit/tests/formatting/isEmail.php    | 55 +++++++++++++-
 .../tests/formatting/sanitizeEmail.php        | 73 +++++++++++++++++--
 2 files changed, 122 insertions(+), 6 deletions(-)

diff --git a/tests/phpunit/tests/formatting/isEmail.php b/tests/phpunit/tests/formatting/isEmail.php
index d79647885ceba..b793af2c4a70d 100644
--- a/tests/phpunit/tests/formatting/isEmail.php
+++ b/tests/phpunit/tests/formatting/isEmail.php
@@ -122,7 +122,60 @@ public static function data_invalid_email_provider() {
 		);
 
 		foreach ( $invalid_emails as $email ) {
-			yield $email => array( $email );
+			yield self::invalid_utf8_as_ascii( $email ) => array( $email );
+		}
+	}
+
+	/**
+	 * Transforms invalid byte sequences in UTF-8 into representations of
+	 * each byte value, according to the maximal subpart rule.
+	 *
+	 * Example:
+	 *
+	 *     // For valid UTF-8 the output is the input.
+	 *     'test' === invalid_utf8_as_ascii( 'test' );
+	 *
+	 *     // Invalid bytes are represented with their uppercase hex value.
+	 *     'a(0x80)b' === invalid_utf8_as_ascii( "a\x80b" );
+	 *
+	 *     // Invalid byte sequences form maximal subparts.
+	 *     '(0xC2)(0xEF 0xBF)' === invalid_utf8_as_ascii( "\xC2\xEF\xBF" );
+	 *
+	 * @param string $text Text which may contain spans of invalid UTF-8 bytes.
+	 * @return string Input text with each invalid span replaced by its parenthesized hex bytes.
+	 */
+	private static function invalid_utf8_as_ascii( string $text ): string {
+		$output        = '';
+		$at            = 0;
+		$was_at        = 0;
+		$end           = strlen( $text );
+		$invalid_bytes = 0;
+
+		while ( $at < $end ) {
+			if ( 0 === _wp_scan_utf8( $text, $at, $invalid_bytes ) && 0 === $invalid_bytes ) {
+				break;
+			}
+
+			if ( $at > $was_at ) {
+				$output .= substr( $text, $was_at, $at - $was_at );
+			}
+
+			if ( $invalid_bytes > 0 ) {
+				$output .= '(';
+
+				for ( $i = 0; $i < $invalid_bytes; $i++ ) {
+					$space   = $i > 0 ? ' ' : '';
+					$as_hex  = strtoupper( bin2hex( $text[ $at + $i ] ) );
+					$output .= "{$space}0x{$as_hex}";
+				}
+
+				$output .= ')';
+			}
+
+			$at    += $invalid_bytes;
+			$was_at = $at;
 		}
+
+		return $output;
 	}
 }
diff --git a/tests/phpunit/tests/formatting/sanitizeEmail.php b/tests/phpunit/tests/formatting/sanitizeEmail.php
index 6ca396f42dc26..5490374d0a5e7 100644
--- a/tests/phpunit/tests/formatting/sanitizeEmail.php
+++ b/tests/phpunit/tests/formatting/sanitizeEmail.php
@@ -17,11 +17,21 @@ class Tests_Formatting_SanitizeEmail extends WP_UnitTestCase {
 	 * @param string $expected The expected sanitized email address.
 	 */
 	public function test_returns_stripped_email_address( $address, $expected ) {
-		$this->assertSame(
-			$expected,
-			sanitize_email( $address ),
-			'Should have produced the known sanitized form of the email.'
-		);
+		$sanitized = sanitize_email( $address );
+
+		if ( $expected === $sanitized ) {
+			$this->assertSame(
+				$expected,
+				$sanitized,
+				'Should have produced the known sanitized form of the email.'
+			);
+		} else {
+			$this->assertSame(
+				self::invalid_utf8_as_ascii( $expected ),
+				self::invalid_utf8_as_ascii( $sanitized ),
+				'Should have produced the known sanitized form of the email.'
+			);
+		}
 	}
 
 	/**
@@ -39,4 +49,57 @@ public function data_sanitized_email_pairs() {
 			'all subdomains invalid utf8' => array( "abc@\x80.org", '' ),
 		);
 	}
+
+	/**
+	 * Transforms invalid byte sequences in UTF-8 into representations of
+	 * each byte value, according to the maximal subpart rule.
+	 *
+	 * Example:
+	 *
+	 *     // For valid UTF-8 the output is the input.
+	 *     'test' === invalid_utf8_as_ascii( 'test' );
+	 *
+	 *     // Invalid bytes are represented with their uppercase hex value.
+	 *     'a(0x80)b' === invalid_utf8_as_ascii( "a\x80b" );
+	 *
+	 *     // Invalid byte sequences form maximal subparts.
+	 *     '(0xC2)(0xEF 0xBF)' === invalid_utf8_as_ascii( "\xC2\xEF\xBF" );
+	 *
+	 * @param string $text Text which may contain spans of invalid UTF-8 bytes.
+	 * @return string Input text with each invalid span replaced by its parenthesized hex bytes.
+	 */
+	private static function invalid_utf8_as_ascii( string $text ): string {
+		$output        = '';
+		$at            = 0;
+		$was_at        = 0;
+		$end           = strlen( $text );
+		$invalid_bytes = 0;
+
+		while ( $at < $end ) {
+			if ( 0 === _wp_scan_utf8( $text, $at, $invalid_bytes ) && 0 === $invalid_bytes ) {
+				break;
+			}
+
+			if ( $at > $was_at ) {
+				$output .= substr( $text, $was_at, $at - $was_at );
+			}
+
+			if ( $invalid_bytes > 0 ) {
+				$output .= '(';
+
+				for ( $i = 0; $i < $invalid_bytes; $i++ ) {
+					$space   = $i > 0 ? ' ' : '';
+					$as_hex  = strtoupper( bin2hex( $text[ $at + $i ] ) );
+					$output .= "{$space}0x{$as_hex}";
+				}
+
+				$output .= ')';
+			}
+
+			$at    += $invalid_bytes;
+			$was_at = $at;
+		}
+
+		return $output;
+	}
 }