Skip to content

Commit 6782f0e

Browse files
committed
Charset: Polyfill mb_ord() and mb_chr().
These functions are useful primitives but missing when the mbstring extension isn’t available. This patch adds polyfills for those few environments where this is the case so that WordPress code can unconditionally call them. Developed in: #11965 Discussed in: https://core.trac.wordpress.org/ticket/65342 Fixes #65342. git-svn-id: https://develop.svn.wordpress.org/trunk@62424 602fd350-edb4-49c9-b593-d223f7449a82
1 parent f2754db commit 6782f0e

4 files changed

Lines changed: 212 additions & 34 deletions

File tree

src/wp-includes/compat.php

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,147 @@ function _is_utf8_charset( $charset_slug ) {
110110
);
111111
}
112112

113+
if ( ! function_exists( 'mb_chr' ) ) :
	/**
	 * Compat function to mimic mb_chr().
	 *
	 * Only declared when no `mb_chr()` function already exists; all of the
	 * work is delegated to the internal `_mb_chr()` implementation.
	 *
	 * @ignore
	 * @since 7.1.0
	 *
	 * @see _mb_chr()
	 *
	 * @param int          $codepoint A Unicode codepoint value, e.g. 128024 for U+1F418 ELEPHANT
	 * @param "UTF-8"|null $encoding  Must be 'UTF-8' or null.
	 * @return string|false A string containing the requested character, if it can be represented in the specified encoding or false on failure.
	 */
	function mb_chr( $codepoint, $encoding = null ) {
		return _mb_chr( $codepoint, $encoding );
	}
endif;
130+
131+
/**
 * Internal compat function to mimic mb_chr().
 *
 * Encodes a single Unicode code point as a UTF-8 byte sequence of one to
 * four bytes. Only integer code points and the UTF-8 encoding are handled;
 * everything else is reported as a failure.
 *
 * @ignore
 * @since 7.1.0
 *
 * @param int          $codepoint A Unicode codepoint value, e.g. 128024 for U+1F418 ELEPHANT
 * @param "UTF-8"|null $encoding  Must be 'UTF-8' or null.
 * @return string|false A string containing the requested character, if it can be represented in the specified encoding or false on failure.
 */
function _mb_chr( $codepoint, $encoding = null ) {
	// Only integers can be encoded.
	if ( ! is_int( $codepoint ) ) {
		return false;
	}

	// The encoding may be omitted, but when given it has to be exactly 'UTF-8'.
	if ( null !== $encoding && 'UTF-8' !== $encoding ) {
		return false;
	}

	// Reject anything outside of the Unicode range as well as the surrogate halves.
	$is_surrogate = $codepoint >= 0xD800 && $codepoint <= 0xDFFF;
	if ( $is_surrogate || $codepoint < 0 || $codepoint > 0x10FFFF ) {
		return false;
	}

	// One byte: 0xxxxxxx.
	if ( $codepoint < 0x80 ) {
		return chr( $codepoint );
	}

	// Two bytes: 110xxxxx 10xxxxxx.
	if ( $codepoint < 0x800 ) {
		return chr( 0xC0 | ( $codepoint >> 6 ) )
			. chr( 0x80 | ( $codepoint & 0x3F ) );
	}

	// Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
	if ( $codepoint < 0x10000 ) {
		return chr( 0xE0 | ( $codepoint >> 12 ) )
			. chr( 0x80 | ( ( $codepoint >> 6 ) & 0x3F ) )
			. chr( 0x80 | ( $codepoint & 0x3F ) );
	}

	// Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx. The range check above caps this at U+10FFFF.
	return chr( 0xF0 | ( $codepoint >> 18 ) )
		. chr( 0x80 | ( ( $codepoint >> 12 ) & 0x3F ) )
		. chr( 0x80 | ( ( $codepoint >> 6 ) & 0x3F ) )
		. chr( 0x80 | ( $codepoint & 0x3F ) );
}
182+
183+
if ( ! function_exists( 'mb_ord' ) ) :
	/**
	 * Compat function to mimic mb_ord().
	 *
	 * Declared only when no `mb_ord()` function is available yet. It is a
	 * thin pass-through to the internal `_mb_ord()` implementation.
	 *
	 * @ignore
	 * @since 7.1.0
	 *
	 * @see _mb_ord()
	 *
	 * @param string       $string   Return the code point at the start of this string.
	 * @param "UTF-8"|null $encoding Must be 'UTF-8' or null.
	 * @return int|false The Unicode code point for the first character of string or false on failure.
	 */
	function mb_ord( $string, $encoding = null ) {
		return _mb_ord( $string, $encoding );
	}
endif;
200+
201+
/**
 * Internal compat function to mimic mb_ord().
 *
 * Decodes the UTF-8 character at the start of the given string and returns
 * its code point. Only the UTF-8 encoding is handled.
 *
 * @ignore
 * @since 7.1.0
 *
 * @param string       $string   Return the code point at the start of this string.
 * @param "UTF-8"|null $encoding Must be 'UTF-8' or null.
 * @return int|false The Unicode code point for the first character of string or false on failure.
 */
function _mb_ord( $string, $encoding = null ) {
	if ( ! is_string( $string ) || '' === $string || ( isset( $encoding ) && 'UTF-8' !== $encoding ) ) {
		return false;
	}

	/*
	 * Scan at most one code point from the start of the string. Anything
	 * other than exactly one found character is treated as a failure.
	 *
	 * NOTE(review): this relies on _wp_scan_utf8() leaving the number of bytes
	 * it consumed in $byte_length; confirm against that function's definition.
	 */
	$byte_length    = 0;
	$invalid_length = 0;
	$found_count    = _wp_scan_utf8( $string, $byte_length, $invalid_length, null, 1 );

	if ( 1 !== $found_count ) {
		return false;
	}

	// These are valid code points, so no further validation is required.
	$b0 = ord( $string[0] );

	switch ( $byte_length ) {
		case 1:
			return $b0;

		case 2:
			return (
				( ( $b0 & 0x1F ) << 6 ) |
				( ( ord( $string[1] ) & 0x3F ) )
			);

		case 3:
			return (
				( ( $b0 & 0x0F ) << 12 ) |
				( ( ord( $string[1] ) & 0x3F ) << 6 ) |
				( ( ord( $string[2] ) & 0x3F ) )
			);

		case 4:
			return (
				( ( $b0 & 0x07 ) << 18 ) |
				( ( ord( $string[1] ) & 0x3F ) << 12 ) |
				( ( ord( $string[2] ) & 0x3F ) << 6 ) |
				( ( ord( $string[3] ) & 0x3F ) )
			);

		default:
			/*
			 * A UTF-8 character is one to four bytes long, so this should be
			 * unreachable. Returning false keeps the documented int|false
			 * contract instead of implicitly returning null.
			 */
			return false;
	}
}
253+
113254
if ( ! function_exists( 'mb_substr' ) ) :
114255
/**
115256
* Compat function to mimic mb_substr().

src/wp-includes/html-api/class-wp-html-decoder.php

Lines changed: 2 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -424,40 +424,8 @@ public static function read_character_reference( $context, $text, $at = 0, &$mat
424424
* @return string Converted code point, or `�` if invalid.
425425
*/
426426
public static function code_point_to_utf8_bytes( $code_point ): string {
	/*
	 * Code points at or below zero are never converted. Without this guard
	 * mb_chr( 0 ) would produce a NUL byte instead of the empty string that
	 * this method returns for input it cannot convert.
	 */
	if ( $code_point <= 0 ) {
		return '';
	}

	// mb_chr() reports code points it cannot encode by returning false.
	$string = mb_chr( $code_point );

	return false !== $string ? $string : '';
}
463431
}
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
<?php
2+
3+
/**
 * @group compat
 *
 * @covers ::mb_chr
 */
class Tests_Compat_mbChr extends WP_UnitTestCase {
	/**
	 * Ensures that the mb_chr() polyfill matches the behavior of mb_chr()
	 * for the supported UTF-8 encoding.
	 *
	 * Every value from U+0000 through U+10FFFF is compared against mb_chr(),
	 * followed by spot-checks of the `$encoding` argument handling.
	 *
	 * @ticket 65342
	 */
	public function test_mb_chr_polyfill_matches_spec() {
		for ( $code_point = 0; $code_point <= 0x10FFFF; $code_point++ ) {
			$this->assertSame(
				mb_chr( $code_point ),
				_mb_chr( $code_point ),
				'Failed to properly encode the code point into a string.'
			);
		}

		$this->assertFalse( _mb_chr( ord( 'A' ), 'latin1' ), 'Should have rejected non-UTF-8 encoding.' );
		$this->assertFalse( _mb_chr( ord( 'A' ), 'utf8' ), 'Should have rejected non-UTF-8 encoding.' );
		$this->assertSame( 'A', _mb_chr( ord( 'A' ), 'UTF-8' ), 'Should have accepted UTF-8 encoding.' );
	}
}
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
<?php
2+
3+
/**
 * @group compat
 *
 * @covers ::mb_ord
 */
class Tests_Compat_mbOrd extends WP_UnitTestCase {
	/**
	 * Ensures that the mb_ord() polyfill matches the behavior of mb_ord()
	 * for the supported UTF-8 encoding.
	 *
	 * @ticket 65342
	 */
	public function test_mb_ord_polyfill_matches_spec() {
		for ( $code_point = 0; $code_point <= 0x10FFFF; $code_point++ ) {
			/*
			 * Some code points cannot be constructed in UTF-8 because they
			 * are invalid; notably the surrogate halves. While they could be
			 * manually constructed here using the direct UTF-8 encoder without
			 * its constraints, it’s sufficient to test the positive cases here
			 * and spot-check an unpaired and incorrectly-converted surrogate
			 * half below.
			 */
			if ( false !== mb_chr( $code_point ) ) {
				$this->assertSame(
					$code_point,
					_mb_ord( mb_chr( $code_point ) ),
					'Failed to properly decode the code point from the string.'
				);
			}
		}

		$this->assertFalse( _mb_ord( '' ), 'Should have failed on empty string.' );
		$this->assertFalse( _mb_ord( 'hi', 'latin1' ), 'Should have rejected non-UTF-8 encoding.' );
		$this->assertFalse( _mb_ord( 'hi', 'utf8' ), 'Should have rejected non-UTF-8 encoding.' );
		$this->assertSame( ord( 'A' ), _mb_ord( 'A', 'UTF-8' ), 'Should have accepted UTF-8 encoding.' );
		$this->assertFalse( _mb_ord( "\xC0" ), 'Should have rejected invalid UTF-8 code point.' );

		// U+D800 pushed through the UTF-8 encoding scheme: first the whole three bytes, then a truncated prefix.
		$this->assertFalse( _mb_ord( "\xED\xA0\x80" ), 'Should have rejected unpaired surrogate half.' );
		$this->assertFalse( _mb_ord( substr( "\xED\xA0\x80", 0, 2 ) ), 'Should have rejected truncated surrogate half.' );
	}
}

0 commit comments

Comments
 (0)