Skip to content

Commit 4440667

Browse files
committed
Tests: Print invalid UTF-8 as ASCII to fix hosts test reporting failures.
When test output is serialized into XML, invalid UTF-8 bytes cause those test results to fail to load when they are read back. This patch adds code to remap those invalid bytes into an ASCII-readable form, in which each invalid byte sequence is wrapped in parentheses and its bytes are written as hex values. This ensures that a proper XML file is generated from the test results. Developed in: #11620 Discussed in: https://core.trac.wordpress.org/ticket/31992 Reported in: WordPress/phpunit-test-runner#310 Follow-up to: [62225]. Props agulbra, amykamala, codexdemon, dmsnell, mywp459, rolle. See #31992. git-svn-id: https://develop.svn.wordpress.org/trunk@62249 602fd350-edb4-49c9-b593-d223f7449a82
1 parent fc65d67 commit 4440667

File tree

2 files changed

+122
-6
lines changed

2 files changed

+122
-6
lines changed

tests/phpunit/tests/formatting/isEmail.php

Lines changed: 54 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,60 @@ public static function data_invalid_email_provider() {
122122
);
123123

124124
foreach ( $invalid_emails as $email ) {
125-
yield $email => array( $email );
125+
yield self::invalid_utf8_as_ascii( $email ) => array( $email );
126+
}
127+
}
128+
129+
/**
130+
* Transforms invalid byte sequences in UTF-8 into representations of
131+
* each byte value, according to the maximal subpart rule.
132+
*
133+
* Example:
134+
*
135+
* // For valid UTF-8 the output is the input.
136+
* 'test' === invalid_utf8_as_ascii( 'test' );
137+
*
138+
* // Invalid bytes are represented with their lowercase hex value.
139+
* 'a(0x80)b' === invalid_utf8_as_ascii( "a\x80b" );
140+
*
141+
* // Invalid byte sequences form maximal subparts.
142+
* '(0xc2)(0xef 0xbf)' === invalid_utf8_as_ascii( "\xC2\xEF\xBF" );
143+
*
144+
* @param string $text Text which may contain invalid UTF-8 byte sequences.
145+
* @return string The text with each invalid byte sequence replaced by a parenthesized, space-separated list of its bytes as `0x`-prefixed lowercase hex.
146+
*/
147+
private static function invalid_utf8_as_ascii( string $text ): string {
148+
$output = '';
149+
$at = 0;
150+
$was_at = 0;
151+
$end = strlen( $text );
152+
$invalid_bytes = 0;
153+
154+
while ( $at < $end ) {
155+
if ( 0 === _wp_scan_utf8( $text, $at, $invalid_bytes ) && 0 === $invalid_bytes ) {
156+
break; // Scanner returned 0 and reported no invalid bytes: stop. NOTE(review): presumably the return value counts scanned code points - confirm against _wp_scan_utf8().
157+
}
158+
159+
if ( $at > $was_at ) {
160+
$output .= substr( $text, $was_at, $at - $was_at ); // Copy the span the scanner advanced over, unchanged.
161+
}
162+
163+
if ( $invalid_bytes > 0 ) {
164+
$output .= '(';
165+
166+
for ( $i = 0; $i < $invalid_bytes; $i++ ) {
167+
$space = $i > 0 ? ' ' : '';
168+
$as_hex = bin2hex( $text[ $at + $i ] ); // bin2hex() yields lowercase hex digits.
169+
$output .= "{$space}0x{$as_hex}";
170+
}
171+
172+
$output .= ')';
173+
}
174+
175+
$at += $invalid_bytes; // Step past the bytes just rendered as hex.
176+
$was_at = $at;
126177
}
178+
179+
return $output;
127180
}
128181
}

tests/phpunit/tests/formatting/sanitizeEmail.php

Lines changed: 68 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,21 @@ class Tests_Formatting_SanitizeEmail extends WP_UnitTestCase {
1717
* @param string $expected The expected sanitized email address.
1818
*/
1919
public function test_returns_stripped_email_address( $address, $expected ) {
20-
$this->assertSame(
21-
$expected,
22-
sanitize_email( $address ),
23-
'Should have produced the known sanitized form of the email.'
24-
);
20+
$sanitized = sanitize_email( $address );
21+
22+
if ( $expected === $sanitized ) { // Exact match: assert on the raw values.
23+
$this->assertSame(
24+
$expected,
25+
$sanitized,
26+
'Should have produced the known sanitized form of the email.'
27+
);
28+
} else { // Otherwise compare against the ASCII rendering of the result, so the actual value carries no raw invalid UTF-8 bytes.
29+
$this->assertSame(
30+
$expected,
31+
self::invalid_utf8_as_ascii( $sanitized ),
32+
'Should have produced the known sanitized form of the email.'
33+
);
34+
}
2535
}
2636

2737
/**
@@ -39,4 +49,57 @@ public function data_sanitized_email_pairs() {
3949
'all subdomains invalid utf8' => array( "abc@\x80.org", '' ),
4050
);
4151
}
52+
53+
/**
54+
* Transforms invalid byte sequences in UTF-8 into representations of
55+
* each byte value, according to the maximal subpart rule.
56+
*
57+
* Example:
58+
*
59+
* // For valid UTF-8 the output is the input.
60+
* 'test' === invalid_utf8_as_ascii( 'test' );
61+
*
62+
* // Invalid bytes are represented with their lowercase hex value.
63+
* 'a(0x80)b' === invalid_utf8_as_ascii( "a\x80b" );
64+
*
65+
* // Invalid byte sequences form maximal subparts.
66+
* '(0xc2)(0xef 0xbf)' === invalid_utf8_as_ascii( "\xC2\xEF\xBF" );
67+
*
68+
* @param string $text Text which may contain invalid UTF-8 byte sequences.
69+
* @return string The text with each invalid byte sequence replaced by a parenthesized, space-separated list of its bytes as `0x`-prefixed lowercase hex.
70+
*/
71+
private static function invalid_utf8_as_ascii( string $text ): string {
72+
$output = '';
73+
$at = 0;
74+
$was_at = 0;
75+
$end = strlen( $text );
76+
$invalid_bytes = 0;
77+
78+
while ( $at < $end ) {
79+
if ( 0 === _wp_scan_utf8( $text, $at, $invalid_bytes ) && 0 === $invalid_bytes ) {
80+
break; // Scanner returned 0 and reported no invalid bytes: stop. NOTE(review): presumably the return value counts scanned code points - confirm against _wp_scan_utf8().
81+
}
82+
83+
if ( $at > $was_at ) {
84+
$output .= substr( $text, $was_at, $at - $was_at ); // Copy the span the scanner advanced over, unchanged.
85+
}
86+
87+
if ( $invalid_bytes > 0 ) {
88+
$output .= '(';
89+
90+
for ( $i = 0; $i < $invalid_bytes; $i++ ) {
91+
$space = $i > 0 ? ' ' : '';
92+
$as_hex = bin2hex( $text[ $at + $i ] ); // bin2hex() yields lowercase hex digits.
93+
$output .= "{$space}0x{$as_hex}";
94+
}
95+
96+
$output .= ')';
97+
}
98+
99+
$at += $invalid_bytes; // Step past the bytes just rendered as hex.
100+
$was_at = $at;
101+
}
102+
103+
return $output;
104+
}
42105
}

0 commit comments

Comments
 (0)