Tests: Print invalid UTF-8 as ASCII to fix hosts test reporting failures.

dmsnell · dmsnell · commit 4440667d8060 · 2026-04-21T14:03:56.000Z
When serializing test output into XML, invalid UTF-8 bytes lead to a failure to load those test results when they are read. This patch adds code to remap those invalid bytes into an ASCII-readable form, in which each invalid byte sequence is wrapped in parentheses and its bytes are written in hex. This ensures that a proper XML file is generated from the testing results. Developed in: #11620 Discussed in: https://core.trac.wordpress.org/ticket/31992 Reported in: WordPress/phpunit-test-runner#310 Follow-up to: [62225]. Props agulbra, amykamala, codexdemon, dmsnell, mywp459, rolle. See #31992. git-svn-id: https://develop.svn.wordpress.org/trunk@62249 602fd350-edb4-49c9-b593-d223f7449a82
diff --git a/tests/phpunit/tests/formatting/isEmail.php b/tests/phpunit/tests/formatting/isEmail.php
@@ -122,7 +122,60 @@ public static function data_invalid_email_provider() {
 		);
 
 		foreach ( $invalid_emails as $email ) {
-			yield $email => array( $email );
+			yield self::invalid_utf8_as_ascii( $email ) => array( $email );
+		}
+	}
+
+	/**
+	 * Transforms invalid byte sequences in UTF-8 into representations of
+	 * each byte value, according to the maximal subpart rule.
+	 *
+	 * Example:
+	 *
+	 *     // For valid UTF-8 the output is the input.
+	 *     'test' === invalid_utf8_as_ascii( 'test' );
+	 *
+	 *     // Invalid bytes are represented with their hex value.
+	 *     'a(0x80)b' === invalid_utf8_as_ascii( "a\x80b" );
+	 *
+	 *     // Invalid byte sequences form maximal subparts.
+	 *     '(0xc2)(0xef 0xbf)' === invalid_utf8_as_ascii( "\xC2\xEF\xBF" );
+	 *
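+	 * @param string $text Text which may contain invalid UTF-8 byte sequences.
+	 * @return string Text with each invalid sequence rewritten as parenthesized hex bytes, e.g. "(0xef 0xbf)".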
+	 */
+	private static function invalid_utf8_as_ascii( string $text ): string {
+		$output        = '';
+		$at            = 0;
+		$was_at        = 0;
+		$end           = strlen( $text );
+		$invalid_bytes = 0;
+
+		while ( $at < $end ) {
+			if ( 0 === _wp_scan_utf8( $text, $at, $invalid_bytes ) && 0 === $invalid_bytes ) {
+				break;
+			}
+
+			if ( $at > $was_at ) {
+				$output .= substr( $text, $was_at, $at - $was_at );
+			}
+
+			if ( $invalid_bytes > 0 ) {
+				$output .= '(';
+
+				for ( $i = 0; $i < $invalid_bytes; $i++ ) {
+					$space   = $i > 0 ? ' ' : '';
+					$as_hex  = bin2hex( $text[ $at + $i ] );
+					$output .= "{$space}0x{$as_hex}";
+				}
+
+				$output .= ')';
+			}
+
+			$at    += $invalid_bytes;
+			$was_at = $at;
 		}
+
+		return $output;
 	}
 }
diff --git a/tests/phpunit/tests/formatting/sanitizeEmail.php b/tests/phpunit/tests/formatting/sanitizeEmail.php
@@ -17,11 +17,21 @@ class Tests_Formatting_SanitizeEmail extends WP_UnitTestCase {
 	 * @param string $expected The expected sanitized email address.
 	 */
 	public function test_returns_stripped_email_address( $address, $expected ) {
-		$this->assertSame(
-			$expected,
-			sanitize_email( $address ),
-			'Should have produced the known sanitized form of the email.'
-		);
+		$sanitized = sanitize_email( $address );
+
+		if ( $expected === $sanitized ) {
+			$this->assertSame(
+				$expected,
+				$sanitized,
+				'Should have produced the known sanitized form of the email.'
+			);
+		} else {
+			$this->assertSame(
+				$expected,
+				self::invalid_utf8_as_ascii( $sanitized ),
+				'Should have produced the known sanitized form of the email.'
+			);
+		}
 	}
 
 	/**
@@ -39,4 +49,57 @@ public function data_sanitized_email_pairs() {
 			'all subdomains invalid utf8'    => array( "abc@\x80.org", '' ),
 		);
 	}
+
+	/**
+	 * Transforms invalid byte sequences in UTF-8 into representations of
+	 * each byte value, according to the maximal subpart rule.
+	 *
+	 * Example:
+	 *
+	 *     // For valid UTF-8 the output is the input.
+	 *     'test' === invalid_utf8_as_ascii( 'test' );
+	 *
+	 *     // Invalid bytes are represented with their hex value.
+	 *     'a(0x80)b' === invalid_utf8_as_ascii( "a\x80b" );
+	 *
+	 *     // Invalid byte sequences form maximal subparts.
+	 *     '(0xc2)(0xef 0xbf)' === invalid_utf8_as_ascii( "\xC2\xEF\xBF" );
+	 *
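+	 * @param string $text Text which may contain invalid UTF-8 byte sequences.
+	 * @return string Text with each invalid sequence rewritten as parenthesized hex bytes, e.g. "(0xef 0xbf)".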
+	 */
+	private static function invalid_utf8_as_ascii( string $text ): string {
+		$output        = '';
+		$at            = 0;
+		$was_at        = 0;
+		$end           = strlen( $text );
+		$invalid_bytes = 0;
+
+		while ( $at < $end ) {
+			if ( 0 === _wp_scan_utf8( $text, $at, $invalid_bytes ) && 0 === $invalid_bytes ) {
+				break;
+			}
+
+			if ( $at > $was_at ) {
+				$output .= substr( $text, $was_at, $at - $was_at );
+			}
+
+			if ( $invalid_bytes > 0 ) {
+				$output .= '(';
+
+				for ( $i = 0; $i < $invalid_bytes; $i++ ) {
+					$space   = $i > 0 ? ' ' : '';
+					$as_hex  = bin2hex( $text[ $at + $i ] );
+					$output .= "{$space}0x{$as_hex}";
+				}
+
+				$output .= ')';
+			}
+
+			$at    += $invalid_bytes;
+			$was_at = $at;
+		}
+
+		return $output;
+	}
 }