Skip to content

Commit cc27792

Browse files
committed
Charset: Update antispambot to handle multibyte characters.
In preparation for handling Unicode email addresses (non-US-ASCII characters in the mailbox name), the `antispambot()` function needs to be multi-byte aware so that it creates proper HTML numeric character references and percent-encoded strings. Previously it scanned the input email address byte-by-byte, which for multibyte characters produces invalid output because each byte of a multi-byte sequence is encoded as if it were a whole character on its own. This patch relies on the newly-polyfilled `mb_ord()` function and the `_wp_scan_utf8()` function to crawl through an input email by code point, assuming UTF-8 encoding. This ensures proper transformation. Developed in: WordPress#11567 Discussed in: https://core.trac.wordpress.org/ticket/31992 Props agulbra, akirk, benniledl, dmsnell, siliconforks. See #65342. git-svn-id: https://develop.svn.wordpress.org/trunk@62425 602fd350-edb4-49c9-b593-d223f7449a82
1 parent 6782f0e commit cc27792

2 files changed

Lines changed: 97 additions & 21 deletions

File tree

src/wp-includes/formatting.php

Lines changed: 69 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2901,30 +2901,88 @@ function urldecode_deep( $value ) {
29012901
}
29022902

29032903
/**
2904-
* Converts email addresses characters to HTML entities to block spam bots.
2904+
* Obscures email addresses in HTML to prevent spam bots from harvesting them.
2905+
*
2906+
* Typically this will randomly replace characters from the email address with
2907+
* HTML character references; however, when the hex encoding parameter is set,
2908+
* some characters will also be represented in their percent-encoded form.
2909+
*
2910+
* Because this function is randomized, the outputs for any given input may
2911+
* differ between calls. This helps diversify the ways the email addresses
2912+
* are obscured.
2913+
*
2914+
* When non-UTF-8 inputs are provided, any spans of invalid UTF-8 bytes will
2915+
* be passed through without any obfuscation.
2916+
*
2917+
* Example:
2918+
*
2919+
* $email = 'noreply@example.com';
2920+
* $obscured = antispambot( $email );
2921+
* $obscured === 'n&#111;re&#112;ly&#64;ex&#97;mple&#46;com';
2922+
*
2923+
* // Hex-encoding also obscures characters with percent-encoding.
2924+
* $obscured = antispambot( $email, 1 );
2925+
* $obscured === '%6eore%70l%79&#64;%65x%61mple%2e%63%6fm';
2926+
*
2927+
* // Non-UTF-8 characters are not obfuscated. "\xFC" is Latin1 "ü".
2928+
* $obscured = antispambot( "b\xFCcher@library.de" );
2929+
* $obscured === 'b�cher&#64;library.de';
2930+
* $obscured === "b\xFCcher&#64;library.de"
29052931
*
29062932
* @since 0.71
2933+
* @since 7.1.0 Masquerades multibyte characters.
29072934
*
29082935
* @param string $email_address Email address.
29092936
* @param int $hex_encoding Optional. Set to 1 to enable hex encoding.
29102937
* @return string Converted email address.
29112938
*/
29122939
function antispambot( $email_address, $hex_encoding = 0 ) {
2913-
$email_no_spam_address = '';
2940+
$obfuscated = '';
2941+
$at = 0;
2942+
$end = strlen( $email_address );
2943+
$invalid_length = 0;
2944+
2945+
while ( $at < $end ) {
2946+
$was_at = $at;
2947+
if (
2948+
0 === _wp_scan_utf8( $email_address, $at, $invalid_length, null, 1 ) &&
2949+
0 === $invalid_length
2950+
) {
2951+
break;
2952+
}
29142953

2915-
for ( $i = 0, $len = strlen( $email_address ); $i < $len; $i++ ) {
2916-
$j = rand( 0, 1 + $hex_encoding );
2954+
$character_length = $at - $was_at;
29172955

2918-
if ( 0 === $j ) {
2919-
$email_no_spam_address .= '&#' . ord( $email_address[ $i ] ) . ';';
2920-
} elseif ( 1 === $j ) {
2921-
$email_no_spam_address .= $email_address[ $i ];
2922-
} elseif ( 2 === $j ) {
2923-
$email_no_spam_address .= '%' . zeroise( dechex( ord( $email_address[ $i ] ) ), 2 );
2956+
if ( $character_length > 0 ) {
2957+
$character = substr( $email_address, $was_at, $character_length );
2958+
2959+
switch ( rand( 0, 1 + $hex_encoding ) ) {
2960+
case 0:
2961+
$code_point = mb_ord( $character );
2962+
$obfuscated .= "&#{$code_point};";
2963+
break;
2964+
2965+
case 1:
2966+
$obfuscated .= $character;
2967+
break;
2968+
2969+
case 2:
2970+
for ( $i = 0; $i < $character_length; $i++ ) {
2971+
$hex_value = bin2hex( $character[ $i ] );
2972+
$obfuscated .= "%{$hex_value}";
2973+
}
2974+
break;
2975+
}
29242976
}
2977+
2978+
if ( 0 !== $invalid_length ) {
2979+
$obfuscated .= substr( $email_address, $at, $invalid_length );
2980+
}
2981+
2982+
$at += $invalid_length;
29252983
}
29262984

2927-
return str_replace( '@', '&#64;', $email_no_spam_address );
2985+
return str_replace( '@', '&#64;', $obfuscated );
29282986
}
29292987

29302988
/**

tests/phpunit/tests/formatting/antispambot.php

Lines changed: 28 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@ public function data_returns_valid_utf8() {
3535
'deep subdomain' => array( 'kevin@many.subdomains.make.a.happy.man.edu' ),
3636
'short address' => array( 'a@b.co' ),
3737
'weird but legal dots' => array( '..@example.com' ),
38+
'umlauts' => array( 'bücher@gmx.de' ),
39+
'three-byte UTF-8' => array( "\u{FFFD}@who.knows.com" ),
3840
);
3941
}
4042

@@ -49,25 +51,41 @@ public function data_returns_valid_utf8() {
4951
* @param string $provided The email address to obfuscate.
5052
*/
5153
public function test_antispambot_obfuscates( $provided ) {
54+
$obfuscated = antispambot( $provided, 1 );
55+
$processor = new WP_HTML_Tag_Processor( $obfuscated );
56+
5257
// The only token should be the email address, so advance once and treat as a text node.
53-
$obfuscated = antispambot( $provided );
54-
$p = new WP_HTML_Tag_Processor( $obfuscated );
55-
$p->next_token();
56-
$decoded = rawurldecode( $p->get_modifiable_text() );
58+
$processor->next_token();
59+
$decoded = rawurldecode( $processor->get_modifiable_text() );
60+
61+
$this->assertNotSame(
62+
$provided,
63+
$obfuscated,
64+
'Should have produced an obfuscated representation.'
65+
);
5766

58-
$this->assertNotSame( $provided, $obfuscated, 'Should have produced an obfuscated representation.' );
59-
$this->assertSame( $provided, $decoded, 'Should have decoded to the original email after restoring.' );
67+
$this->assertSame(
68+
$provided,
69+
$decoded,
70+
'Should have decoded to the original email after restoring.'
71+
);
6072
}
6173

6274
/**
6375
* Data provider.
6476
*
65-
* @return array[]
77+
* @return Generator
6678
*/
6779
public function data_antispambot_obfuscates() {
68-
return array(
69-
array( 'example@example.com' ),
70-
array( '#@example.com' ),
80+
$addresses = array(
81+
'example@example.com',
82+
'#@example.com',
83+
'πετρος@example.com',
84+
"\u{FFFD}@mad.mail.com",
7185
);
86+
87+
foreach ( $addresses as $address ) {
88+
yield $address => array( $address );
89+
}
7290
}
7391
}

0 commit comments

Comments
 (0)