Skip to content
Draft
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 60 additions & 0 deletions src/wp-includes/compat-utf8.php
Original file line number Diff line number Diff line change
Expand Up @@ -425,6 +425,66 @@
return $has_noncharacters;
}

/**
 * Gets the Unicode code point of the first character in a string.
 *
 * This is a polyfill for {@see \mb_ord()}.
 *
 * @since {WP_VERSION}
 *
 * @ignore
 *
 * @param string  $string   Return the Unicode code point number for the first UTF-8 character in this string.
 * @param ?string $encoding Must be "UTF-8" if provided, else omitted.
 * @return int|false Code point if able to decode the first character from the string, else false.
 */
function _wp_mb_ord( $string, $encoding = null ) {
	if ( isset( $encoding ) && ! is_utf8_charset( $encoding ) ) {
		return false;
	}

	$at             = 0;
	$invalid_length = 0;
	$count          = _wp_scan_utf8( $string, $at, $invalid_length, null, 1 );

	// Beyond this check, all relevant bytes are well-formed.
	if ( 1 !== $count ) {
		return false;
	}

	/*
	 * The scan started at offset zero and was limited to a single code point, so
	 * `$at` is treated here as the byte length of that first character (this
	 * assumes `_wp_scan_utf8()` advances `$at` past what it scanned). The bytes
	 * of the character therefore live at offsets 0 through `$at - 1`.
	 *
	 * Each shift is explicitly parenthesized: in PHP `+` binds more tightly than
	 * `<<`, so `$a << 6 + $b` would otherwise evaluate as `$a << ( 6 + $b )`.
	 */
	switch ( $at ) {
		case 1:
			// 0xxxxxxx: US-ASCII, the byte is the code point.
			return ord( $string[0] );

		case 2:
			// 110xxxxx 10xxxxxx: five payload bits in the lead byte.
			$byte1 = ord( $string[0] );
			$byte2 = ord( $string[1] );
			return ( ( $byte1 & 0x1F ) << 6 ) | ( $byte2 & 0x3F );

		case 3:
			// 1110xxxx 10xxxxxx 10xxxxxx: four payload bits in the lead byte.
			$byte1 = ord( $string[0] );
			$byte2 = ord( $string[1] );
			$byte3 = ord( $string[2] );
			return (
				( ( $byte1 & 0x0F ) << 12 ) |
				( ( $byte2 & 0x3F ) << 6 ) |
				( $byte3 & 0x3F )
			);

		case 4:
			// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx: three payload bits in the lead byte.
			$byte1 = ord( $string[0] );
			$byte2 = ord( $string[1] );
			$byte3 = ord( $string[2] );
			$byte4 = ord( $string[3] );
			return (
				( ( $byte1 & 0x07 ) << 18 ) |
				( ( $byte2 & 0x3F ) << 12 ) |
				( ( $byte3 & 0x3F ) << 6 ) |
				( $byte4 & 0x3F )
			);
	}

	// A well-formed UTF-8 character is never longer than four bytes; fail safely otherwise.
	return false;
}

/**
* Converts a string from ISO-8859-1 to UTF-8, maintaining backwards compatibility
* with the deprecated function from the PHP standard library.
Expand Down
53 changes: 45 additions & 8 deletions src/wp-includes/formatting.php
Original file line number Diff line number Diff line change
Expand Up @@ -2904,24 +2904,61 @@ function urldecode_deep( $value ) {
* Converts email address characters to HTML entities to block spam bots.
*
* @since 0.71
* @since {WP_VERSION} Masquerades multi-byte characters.
*
* @param string $email_address Email address.
* @param int $hex_encoding Optional. Set to 1 to enable hex encoding.
* @return string Converted email address.
*/
function antispambot( $email_address, $hex_encoding = 0 ) {
/*
* Email addresses passed into this function should not contain invalid UTF-8, but if they do,
* enforce the constraint by refusing to print any email address.
*/
if ( ! wp_check_invalid_utf8( $email_address ) ) {
return '';
}

$email_no_spam_address = '';

for ( $i = 0, $len = strlen( $email_address ); $i < $len; $i++ ) {
$j = rand( 0, 1 + $hex_encoding );
$has_mb_support = function_exists( 'grapheme_extract' );
$at = 0;
$next_at = 0;
$end = strlen( $email_address );
$invalid_length = 0;
while ( $at < $end ) {
if ( $has_mb_support ) {
$character = grapheme_extract( $email_address, 1, GRAPHEME_EXTR_MAXCHARS, $at, $next_at );
if ( false === $character ) {
break;
}
} else {
if ( 0 === _wp_scan_utf8( $email_address, $next_at, $invalid_length, null, 1 ) ) {
break;
}

if ( 0 === $j ) {
$email_no_spam_address .= '&#' . ord( $email_address[ $i ] ) . ';';
} elseif ( 1 === $j ) {
$email_no_spam_address .= $email_address[ $i ];
} elseif ( 2 === $j ) {
$email_no_spam_address .= '%' . zeroise( dechex( ord( $email_address[ $i ] ) ), 2 );
$character = substr( $email_address, $at, $next_at - $at );
}

switch ( rand( 0, 1 + $hex_encoding ) ) {
case 0:
$code_point = mb_ord( $character );
$email_no_spam_address .= "&#{$code_point};";
break;

case 1:
$email_no_spam_address .= $character;
break;

case 2:
for ( $i = 0, $byte_count = strlen( $character ); $i < $byte_count; $i++ ) {
$hex_value = bin2hex( $character );
$email_no_spam_address .= "%{$hex_value}";
}
break;
}

$at = $next_at;
}

return str_replace( '@', '&#64;', $email_no_spam_address );
Expand Down
14 changes: 14 additions & 0 deletions src/wp-includes/utf8.php
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,20 @@ function wp_is_valid_utf8( string $string ): bool {
}
endif;

if ( ! function_exists( 'mb_ord' ) ) :
	/**
	 * Fallback function for getting the Unicode code point of character.
	 *
	 * Only defined when no `mb_ord()` is available yet. Guarding on the function
	 * itself, rather than on the mbstring extension, avoids redeclaring a function
	 * that something else already provided and covers environments where mbstring
	 * is present without `mb_ord()`. All work is delegated to {@see _wp_mb_ord()}.
	 *
	 * @ignore
	 * @private
	 *
	 * @since {WP_VERSION}
	 *
	 * @param string      $string   String whose first UTF-8 character should be decoded.
	 * @param string|null $encoding Optional. Must be "UTF-8" if provided. Default null.
	 * @return int|false Code point of the first character, or false if it could not be decoded.
	 */
	function mb_ord( $string, $encoding = null ) {
		return _wp_mb_ord( $string, $encoding );
	}
endif;

if (
extension_loaded( 'mbstring' ) &&
// Maximal subpart substitution introduced by php/php-src@04e59c916f12b322ac55f22314e31bd0176d01cb.
Expand Down
15 changes: 11 additions & 4 deletions tests/phpunit/tests/formatting/antispambot.php
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ public function data_returns_valid_utf8() {
'deep subdomain' => array( 'kevin@many.subdomains.make.a.happy.man.edu' ),
'short address' => array( 'a@b.co' ),
'weird but legal dots' => array( '..@example.com' ),
'umlauts' => array( 'bücher@gmx.de' ),
'three-byte UTF-8' => array( "\u{FFFD}@who.knows.com" ),
);
}

Expand Down Expand Up @@ -62,12 +64,17 @@ public function test_antispambot_obfuscates( $provided ) {
/**
 * Data provider.
 *
 * Yields one data set per email address, keyed by the address itself so that
 * a failing case is identifiable in the test output.
 *
 * @return Generator
 */
public function data_antispambot_obfuscates() {
	$addresses = array(
		'example@example.com',
		'#@example.com',
		'πετρος@example.com',
	);

	foreach ( $addresses as $address ) {
		yield $address => array( $address );
	}
}
}
Loading