WordPress · dmsnell · Apr 11, 2026 · Apr 13, 2026 · Apr 14, 2026 · Apr 14, 2026
diff --git a/src/wp-includes/compat-utf8.php b/src/wp-includes/compat-utf8.php
@@ -425,6 +425,66 @@
 	return $has_noncharacters;
 }
 
+/**
+ * Gets the Unicode code point of the first character in a string.
+ *
+ * This is a polyfill for {@see \mb_ord()}.
+ *
+ * Only the first character is decoded; any bytes following it are ignored.
+ *
+ * @since {WP_VERSION}
+ *
+ * @ignore
+ *
+ * @param string  $string   Return the Unicode code point number for the first UTF-8 character in this string.
+ * @param ?string $encoding Must be "UTF-8" if provided, else omitted.
+ * @return int|false Code point if able to decode the first character from the string, else false.
+ */
+function _wp_mb_ord( $string, $encoding = null ) {
+	if ( isset( $encoding ) && ! is_utf8_charset( $encoding ) ) {
+		return false;
+	}
+
+	$at             = 0;
+	$invalid_length = 0;
+	$count          = _wp_scan_utf8( $string, $at, $invalid_length, null, 1 );
+
+	// Beyond this check, all relevant bytes are well-formed.
+	if ( 1 !== $count ) {
+		return false;
+	}
+
+	// The scan began at offset 0, so `$at` is now the byte length of the first character.
+	switch ( $at ) {
+		case 1:
+			return ord( $string[0] );
+
+		case 2:
+			// 110xxxxx 10xxxxxx: shifts are parenthesized because `+` binds tighter than `<<`.
+			return ( ( ord( $string[0] ) & 0x1F ) << 6 ) | ( ord( $string[1] ) & 0x3F );
+
+		case 3:
+			// 1110xxxx 10xxxxxx 10xxxxxx: the lead byte carries four payload bits.
+			return (
+				( ( ord( $string[0] ) & 0x0F ) << 12 ) |
+				( ( ord( $string[1] ) & 0x3F ) << 6 ) |
+				( ord( $string[2] ) & 0x3F )
+			);
+
+		case 4:
+			// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx: the lead byte carries three payload bits.
+			return (
+				( ( ord( $string[0] ) & 0x07 ) << 18 ) |
+				( ( ord( $string[1] ) & 0x3F ) << 12 ) |
+				( ( ord( $string[2] ) & 0x3F ) << 6 ) |
+				( ord( $string[3] ) & 0x3F )
+			);
+	}
+
+	// Not expected for well-formed UTF-8, but never fall off the end and return null.
+	return false;
+}
+
 /**
  * Converts a string from ISO-8859-1 to UTF-8, maintaining backwards compatibility
  * with the deprecated function from the PHP standard library.

diff --git a/src/wp-includes/formatting.php b/src/wp-includes/formatting.php
@@ -2904,24 +2904,61 @@ function urldecode_deep( $value ) {
  * Converts email addresses characters to HTML entities to block spam bots.
  *
  * @since 0.71
+ * @since {WP_VERSION} Obfuscates multi-byte UTF-8 characters as whole characters.
  *
  * @param string $email_address Email address.
  * @param int    $hex_encoding  Optional. Set to 1 to enable hex encoding.
  * @return string Converted email address.
  */
 function antispambot( $email_address, $hex_encoding = 0 ) {
+	/*
+	 * Email addresses passed into this function should not contain invalid UTF-8, but if they do,
+	 * enforce the constraint by refusing to print any email address. The result is compared with
+	 * the empty string because a truthiness test would also reject the valid input "0".
+	 */
+	if ( '' === wp_check_invalid_utf8( $email_address ) ) {
+		return '';
+	}
+
 	$email_no_spam_address = '';
 
-	for ( $i = 0, $len = strlen( $email_address ); $i < $len; $i++ ) {
-		$j = rand( 0, 1 + $hex_encoding );
+	/*
+	 * Step through the address one code point at a time. A grapheme cluster may span several
+	 * code points, but a numeric character reference can only name a single one.
+	 */
+	$at             = 0;
+	$next_at        = 0;
+	$end            = strlen( $email_address );
+	$invalid_length = 0;
+	while ( $at < $end ) {
+		// Advances `$next_at` past the code point which starts at `$at`.
+		if ( 1 !== _wp_scan_utf8( $email_address, $next_at, $invalid_length, null, 1 ) ) {
+			break;
+		}
+
+		$character = substr( $email_address, $at, $next_at - $at );
 
-		if ( 0 === $j ) {
-			$email_no_spam_address .= '&#' . ord( $email_address[ $i ] ) . ';';
-		} elseif ( 1 === $j ) {
-			$email_no_spam_address .= $email_address[ $i ];
-		} elseif ( 2 === $j ) {
-			$email_no_spam_address .= '%' . zeroise( dechex( ord( $email_address[ $i ] ) ), 2 );
+		switch ( rand( 0, 1 + $hex_encoding ) ) {
+			case 0:
+				// Name the encoding: without it `mb_ord()` follows `mb_internal_encoding()`.
+				$code_point             = mb_ord( $character, 'UTF-8' );
+				$email_no_spam_address .= "&#{$code_point};";
+				break;
+
+			case 1:
+				$email_no_spam_address .= $character;
+				break;
+
+			case 2:
+				// Percent-encode every byte of the character on its own, as `%XX`.
+				for ( $i = 0, $byte_count = strlen( $character ); $i < $byte_count; $i++ ) {
+					$hex_value              = bin2hex( $character[ $i ] );
+					$email_no_spam_address .= "%{$hex_value}";
+				}
+				break;
 		}
+
+		$at = $next_at;
 	}
 
 	return str_replace( '@', '&#64;', $email_no_spam_address );

diff --git a/src/wp-includes/utf8.php b/src/wp-includes/utf8.php
@@ -53,6 +53,20 @@ function wp_is_valid_utf8( string $string ): bool {
 	}
 endif;
 
+if ( ! function_exists( 'mb_ord' ) ) :
+	/**
+	 * Fallback for {@see \mb_ord()} which delegates to {@see _wp_mb_ord()}.
+	 *
+	 * @ignore
+	 * @private
+	 *
+	 * @since {WP_VERSION}
+	 */
+	function mb_ord( $string, $encoding = null ) {
+		return _wp_mb_ord( $string, $encoding );
+	}
+endif;
+
 if (
 	extension_loaded( 'mbstring' ) &&
 	// Maximal subpart substitution introduced by php/php-src@04e59c916f12b322ac55f22314e31bd0176d01cb.

diff --git a/tests/phpunit/tests/formatting/antispambot.php b/tests/phpunit/tests/formatting/antispambot.php
@@ -35,6 +35,8 @@ public function data_returns_valid_utf8() {
 			'deep subdomain'       => array( 'kevin@many.subdomains.make.a.happy.man.edu' ),
 			'short address'        => array( 'a@b.co' ),
 			'weird but legal dots' => array( '..@example.com' ),
+			'umlauts'              => array( 'bücher@gmx.de' ),
+			'three-byte UTF-8'     => array( "\u{FFFD}@who.knows.com" ),
 		);
 	}
 
@@ -62,12 +64,17 @@ public function test_antispambot_obfuscates( $provided ) {
 	/**
 	 * Data provider.
 	 *
-	 * @return array[]
+	 * @return Generator Yields one single-argument dataset per address, keyed by that address.
 	 */
 	public function data_antispambot_obfuscates() {
-		return array(
-			array( 'example@example.com' ),
-			array( '#@example.com' ),
+		$addresses = array(
+			'example@example.com',
+			'#@example.com',
+			'πετρος@example.com',
 		);
+		// The address doubles as the dataset key so each dataset is identified by its input.
+		foreach ( $addresses as $address ) {
+			yield $address => array( $address );
+		}
 	}
 }