Charset: Polyfill mb_ord() and mb_chr().

dmsnell · dmsnell · commit 6782f0eecc80 · 2026-05-28T01:54:58.000Z
These functions are useful primitives but missing when the mbstring extension isn’t available. This patch adds polyfills for those few environments where this is the case so that WordPress code can unconditionally call them. Developed in: #11965 Discussed in: https://core.trac.wordpress.org/ticket/65342 Fixes #65342. git-svn-id: https://develop.svn.wordpress.org/trunk@62424 602fd350-edb4-49c9-b593-d223f7449a82
diff --git a/src/wp-includes/compat.php b/src/wp-includes/compat.php
@@ -110,6 +110,147 @@ function _is_utf8_charset( $charset_slug ) {
 	);
 }
 
+if ( ! function_exists( 'mb_chr' ) ) :
+	/**
+	 * Compat function to mimic mb_chr() when no such function is already defined.
+	 *
+	 * @ignore
+	 * @since 7.1.0
+	 *
+	 * @see _mb_chr()
+	 *
+	 * @param int          $codepoint A Unicode codepoint value, e.g. 128024 for U+1F418 ELEPHANT.
+	 * @param "UTF-8"|null $encoding  Must be 'UTF-8' or null.
+	 * @return string|false A string containing the requested character, if it can be represented in the specified encoding, or false on failure.
+	 */
+	function mb_chr( $codepoint, $encoding = null ) {
+		return _mb_chr( $codepoint, $encoding );
+	}
+endif;
+
+/**
+ * Encodes one Unicode code point as UTF-8 bytes; internal stand-in for mb_chr().
+ *
+ * @ignore
+ * @since 7.1.0
+ *
+ * @param int          $codepoint Unicode code point to encode, e.g. 128024 for U+1F418 ELEPHANT.
+ * @param "UTF-8"|null $encoding  Only 'UTF-8' or null is accepted.
+ * @return string|false The UTF-8 bytes, or false for a non-integer, an unsupported encoding, a surrogate, or an out-of-range value.
+ */
+function _mb_chr( $codepoint, $encoding = null ) {
+	if ( ! is_int( $codepoint ) ) {
+		return false;
+	}
+
+	// Any encoding other than UTF-8 is refused; null is treated as UTF-8.
+	if ( null !== $encoding && 'UTF-8' !== $encoding ) {
+		return false;
+	}
+
+	// Negative values, surrogate halves, and values past U+10FFFF cannot be encoded.
+	$is_surrogate = $codepoint >= 0xD800 && $codepoint <= 0xDFFF;
+	if ( $codepoint < 0 || $is_surrogate || $codepoint > 0x10FFFF ) {
+		return false;
+	}
+
+	// One byte: U+0000 through U+007F.
+	if ( $codepoint < 0x80 ) {
+		return chr( $codepoint );
+	}
+
+	// Every multi-byte form ends with the lowest six bits in a continuation byte.
+	$tail = chr( 0x80 | ( $codepoint & 0x3F ) );
+
+	// Two bytes: U+0080 through U+07FF.
+	if ( $codepoint < 0x800 ) {
+		return chr( 0xC0 | ( $codepoint >> 6 ) ) . $tail;
+	}
+
+	$mid = chr( 0x80 | ( ( $codepoint >> 6 ) & 0x3F ) );
+
+	// Three bytes: U+0800 through U+FFFF.
+	if ( $codepoint < 0x10000 ) {
+		return chr( 0xE0 | ( $codepoint >> 12 ) ) . $mid . $tail;
+	}
+
+	// Four bytes: everything left, since the range check above caps values at U+10FFFF.
+	$second = chr( 0x80 | ( ( $codepoint >> 12 ) & 0x3F ) );
+
+	return chr( 0xF0 | ( $codepoint >> 18 ) ) . $second . $mid . $tail;
+}
+
+if ( ! function_exists( 'mb_ord' ) ) :
+	/**
+	 * Compat function to mimic mb_ord() when no such function is already defined.
+	 *
+	 * @ignore
+	 * @since 7.1.0
+	 *
+	 * @see _mb_ord()
+	 *
+	 * @param string       $string   String whose first character's code point is returned.
+	 * @param "UTF-8"|null $encoding Must be 'UTF-8' or null.
+	 * @return int|false The Unicode code point for the first character of `$string`, or false on failure.
+	 */
+	function mb_ord( $string, $encoding = null ) {
+		return _mb_ord( $string, $encoding );
+	}
+endif;
+
+/**
+ * Decodes the code point at the start of a UTF-8 string; internal stand-in for mb_ord().
+ *
+ * @ignore
+ * @since 7.1.0
+ *
+ * @param string       $string   String whose first character's code point is returned.
+ * @param "UTF-8"|null $encoding Only 'UTF-8' or null is accepted.
+ * @return int|false The code point of the first character, or false for a non-string, an empty string, an unsupported encoding, or a failed scan.
+ */
+function _mb_ord( $string, $encoding = null ) {
+	if ( ! is_string( $string ) || '' === $string ) {
+		return false;
+	}
+
+	if ( null !== $encoding && 'UTF-8' !== $encoding ) {
+		return false;
+	}
+
+	// NOTE(review): presumably scans at most one code point and leaves its byte length in $width — confirm in _wp_scan_utf8().
+	$width   = 0;
+	$invalid = 0;
+	if ( 1 !== _wp_scan_utf8( $string, $width, $invalid, null, 1 ) ) {
+		return false;
+	}
+
+	// The scan accepted the leading character, so its payload bits are read without re-validating.
+	$lead = ord( $string[0] );
+
+	switch ( $width ) {
+		case 1:
+			return $lead;
+
+		case 2:
+			// 110xxxxx 10xxxxxx
+			return ( ( $lead & 0x1F ) << 6 ) | ( ord( $string[1] ) & 0x3F );
+
+		case 3:
+			// 1110xxxx 10xxxxxx 10xxxxxx
+			$high = ( $lead & 0x0F ) << 12;
+			$mid  = ( ord( $string[1] ) & 0x3F ) << 6;
+			return $high | $mid | ( ord( $string[2] ) & 0x3F );
+
+		case 4:
+			// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+			$high = ( $lead & 0x07 ) << 18;
+			$mid  = ( ord( $string[1] ) & 0x3F ) << 12;
+			$low  = ( ord( $string[2] ) & 0x3F ) << 6;
+			return $high | $mid | $low | ( ord( $string[3] ) & 0x3F );
+	}
+}
+
 if ( ! function_exists( 'mb_substr' ) ) :
 	/**
 	 * Compat function to mimic mb_substr().
diff --git a/src/wp-includes/html-api/class-wp-html-decoder.php b/src/wp-includes/html-api/class-wp-html-decoder.php
@@ -424,40 +424,8 @@ public static function read_character_reference( $context, $text, $at = 0, &$mat
 	 * @return string Converted code point, or `�` if invalid.
 	 */
 	public static function code_point_to_utf8_bytes( $code_point ): string {
-		// Pre-check to ensure a valid code point.
-		if (
-			$code_point <= 0 ||
-			( $code_point >= 0xD800 && $code_point <= 0xDFFF ) ||
-			$code_point > 0x10FFFF
-		) {
-			return '�';
-		}
-
-		if ( $code_point <= 0x7F ) {
-			return chr( $code_point );
-		}
-
-		if ( $code_point <= 0x7FF ) {
-			$byte1 = chr( ( $code_point >> 6 ) | 0xC0 );
-			$byte2 = chr( $code_point & 0x3F | 0x80 );
-
-			return "{$byte1}{$byte2}";
-		}
-
-		if ( $code_point <= 0xFFFF ) {
-			$byte1 = chr( ( $code_point >> 12 ) | 0xE0 );
-			$byte2 = chr( ( $code_point >> 6 ) & 0x3F | 0x80 );
-			$byte3 = chr( $code_point & 0x3F | 0x80 );
-
-			return "{$byte1}{$byte2}{$byte3}";
-		}
-
-		// Any values above U+10FFFF are eliminated above in the pre-check.
-		$byte1 = chr( ( $code_point >> 18 ) | 0xF0 );
-		$byte2 = chr( ( $code_point >> 12 ) & 0x3F | 0x80 );
-		$byte3 = chr( ( $code_point >> 6 ) & 0x3F | 0x80 );
-		$byte4 = chr( $code_point & 0x3F | 0x80 );
+		$string = $code_point > 0 ? mb_chr( $code_point, 'UTF-8' ) : false; // Zero stays invalid, as before; UTF-8 is explicit so mb_internal_encoding() cannot change the output.
 
-		return "{$byte1}{$byte2}{$byte3}{$byte4}";
+		return false !== $string ? $string : '�';
 	}
 }
diff --git a/tests/phpunit/tests/compat/mbChr.php b/tests/phpunit/tests/compat/mbChr.php
@@ -0,0 +1,28 @@
+<?php
+
+/**
+ * @group compat
+ *
+ * @covers ::mb_chr
+ */
+class Tests_Compat_mbChr extends WP_UnitTestCase {
+	/**
+	 * Ensures that the _mb_chr() polyfill matches the behavior of mb_chr()
+	 * for the supported UTF-8 encoding and rejects every other encoding name.
+	 *
+	 * @ticket 65342
+	 */
+	public function test_mb_chr_polyfill_matches_spec() {
+		for ( $code_point = 0; $code_point <= 0x10FFFF; $code_point++ ) {
+			$this->assertSame(
+				mb_chr( $code_point, 'UTF-8' ),
+				_mb_chr( $code_point ),
+				'Failed to properly encode the code point as a string.'
+			);
+		}
+
+		$this->assertFalse( _mb_chr( ord( 'A' ), 'latin1' ), 'Should have rejected non-UTF-8 encoding.' );
+		$this->assertFalse( _mb_chr( ord( 'A' ), 'utf8' ), 'Should have rejected non-UTF-8 encoding.' );
+		$this->assertSame( 'A', _mb_chr( ord( 'A' ), 'UTF-8' ), 'Should have accepted UTF-8 encoding.' );
+	}
+}
diff --git a/tests/phpunit/tests/compat/mbOrd.php b/tests/phpunit/tests/compat/mbOrd.php
@@ -0,0 +1,41 @@
+<?php
+
+/**
+ * @group compat
+ *
+ * @covers ::mb_ord
+ */
+class Tests_Compat_mbOrd extends WP_UnitTestCase {
+	/**
+	 * Ensures that the _mb_ord() polyfill decodes every character that
+	 * mb_chr() can produce in UTF-8, and rejects unsupported or invalid input.
+	 *
+	 * @ticket 65342
+	 */
+	public function test_mb_ord_polyfill_matches_spec() {
+		for ( $code_point = 0; $code_point <= 0x10FFFF; $code_point++ ) {
+			/*
+			 * Some code points cannot be constructed in UTF-8 because they
+			 * are invalid; notably the surrogate halves. While they could be
+			 * manually constructed here using the direct UTF-8 encoder without
+			 * its constraints, it’s sufficient to test the positive cases here
+			 * and spot-check an unpaired and incorrectly-converted surrogate
+			 * half below. UTF-8 is named so the internal encoding is irrelevant.
+			 */
+			$character = mb_chr( $code_point, 'UTF-8' );
+			if ( false === $character ) {
+				continue;
+			}
+
+			$decoded = _mb_ord( $character );
+			$this->assertSame( $code_point, $decoded, 'Failed to properly decode the code point from the string.' );
+		}
+
+		$this->assertFalse( _mb_ord( '' ), 'Should have failed on empty string.' );
+		$this->assertFalse( _mb_ord( 'hi', 'latin1' ), 'Should have rejected non-UTF-8 encoding.' );
+		$this->assertFalse( _mb_ord( 'hi', 'utf8' ), 'Should have rejected non-UTF-8 encoding.' );
+		$this->assertSame( ord( 'A' ), _mb_ord( 'A', 'UTF-8' ), 'Should have accepted UTF-8 encoding.' );
+		$this->assertFalse( _mb_ord( "\xC0" ), 'Should have rejected invalid UTF-8 code point.' );
+		$this->assertFalse( _mb_ord( substr( "\xED\xA0\x80", 0, 2 ) ), 'Should have rejected unpaired surrogate half.' );
+	}
+}