Skip to content

Commit 1514f31

Browse files
committed
General: Add support for unicode email addresses in is_email and sanitize_email
This adds support for the unicode address extensions in RFC 6530-3, adds unit tests for that, extends the documentation to explain the relationship between this code and the various specifications, and finally adds unit tests to ensure that the documentation's description of the code remains correct. During testing, it became clear that antispambot() worked only for strings using a single-byte encoding, while this uses UTF8. Fixed. Fixes #31992. Props SirLouen, dmsnell, tusharbharti, mukeshpanchal27.
1 parent 9d9a0f1 commit 1514f31

File tree

6 files changed

+225
-9
lines changed

6 files changed

+225
-9
lines changed

src/wp-includes/default-filters.php

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,12 @@
8787
add_filter( $filter, 'wp_filter_kses' );
8888
}
8989

90+
// Email addresses: Allow so long as the database can store them. This
91+
// affects all addresses, including those entered into contact forms.
92+
if ( 'utf8mb4' !== $wpdb->charset ) {
93+
add_filter( 'sanitize_email', 'wp_ascii_without_controls' );
94+
}
95+
9096
// Display URL.
9197
foreach ( array( 'user_url', 'link_url', 'link_image', 'link_rss', 'comment_url', 'post_guid' ) as $filter ) {
9298
if ( is_admin() ) {

src/wp-includes/formatting.php

Lines changed: 38 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2173,6 +2173,20 @@ function sanitize_user( $username, $strict = false ) {
21732173
return apply_filters( 'sanitize_user', $username, $raw_username, $strict );
21742174
}
21752175

2176+
2177+
/**
 * Returns a string with all control characters and all non-ASCII bytes removed.
 *
 * The replacement works on bytes, not characters: every C0 control byte
 * (0x00-0x1F), DEL (0x7F) and every byte with the high bit set (0x80-0xFF)
 * is dropped. Multi-byte UTF-8 sequences consist solely of high-bit bytes,
 * so they are removed in their entirety. Only printable ASCII (0x20-0x7E)
 * remains.
 *
 * @since 7.0.0
 *
 * @param string $input The string to be sanitized.
 * @return string The modified string.
 */
function wp_ascii_without_controls( $input ) {
	/*
	 * The first range has to end at 0x1F (decimal 31), not 0x19, so that
	 * SUB, ESC, FS, GS, RS and US are stripped along with the other controls.
	 */
	return preg_replace( '/[\x00-\x1F\x7F-\xFF]/', '', $input );
}
2188+
2189+
21762190
/**
21772191
* Sanitizes a string key.
21782192
*
@@ -2912,7 +2926,9 @@ function antispambot( $email_address, $hex_encoding = 0 ) {
29122926
for ( $i = 0, $len = strlen( $email_address ); $i < $len; $i++ ) {
29132927
$j = rand( 0, 1 + $hex_encoding );
29142928

2915-
if ( 0 === $j ) {
2929+
if ( ord( $email_address[ $i ] ) > 127 ) {
2930+
$email_no_spam_address .= $email_address[ $i ];
2931+
} elseif ( 0 === $j ) {
29162932
$email_no_spam_address .= '&#' . ord( $email_address[ $i ] ) . ';';
29172933
} elseif ( 1 === $j ) {
29182934
$email_no_spam_address .= $email_address[ $i ];
@@ -3528,7 +3544,21 @@ function convert_smilies( $text ) {
35283544
/**
35293545
* Verifies that an email is valid.
35303546
*
3531-
* Does not grok i18n domains. Not RFC compliant.
3547+
* This mostly matches what people think is the format of email
3548+
* addresses, and is close to all three current specifications.
3549+
*
3550+
* Email address syntax is specified in RFC 5322 for ASCII-only email
3551+
* and in RFC 6532 for unicode email (both unicode domains and
3552+
* localparts). In addition, the HTML WHATWG specification contains a
3553+
* third syntax which is used for HTML form input (except that major
3554+
* browsers deviate a little from the WHATWG specification).
3555+
*
3556+
* This function matches the WHATWG and RFC 6532 specifications fairly
3557+
* well, although there are some differences. " "@example.com (quote
3558+
* space quote at ...) is allowed by the RFCs and rejected by this
3559+
* code, while ..@example.com is allowed by this code and prohibited
3560+
* by the RFCs. info@grå.org is allowed by this code and major
3561+
* browsers, but prohibited by WHATWG's regex (as of April 2023).
35323562
*
35333563
* @since 0.71
35343564
*
@@ -3572,7 +3602,7 @@ function is_email( $email, $deprecated = false ) {
35723602
* LOCAL PART
35733603
* Test for invalid characters.
35743604
*/
3575-
if ( ! preg_match( '/^[a-zA-Z0-9!#$%&\'*+\/=?^_`{|}~\.-]+$/', $local ) ) {
3605+
if ( ! ( wp_is_valid_utf8( $local ) && preg_match( '/^[a-zA-Z0-9\x80-\xff!#$%&\'*+\/=?^_`{|}~\.-]+$/', $local ) && preg_match( '/^\X+$/', $local ) ) ) {
35763606
/** This filter is documented in wp-includes/formatting.php */
35773607
return apply_filters( 'is_email', false, $email, 'local_invalid_chars' );
35783608
}
@@ -3610,7 +3640,7 @@ function is_email( $email, $deprecated = false ) {
36103640
}
36113641

36123642
// Test for invalid characters.
3613-
if ( ! preg_match( '/^[a-z0-9-]+$/i', $sub ) ) {
3643+
if ( ! ( wp_is_valid_utf8( $sub ) && preg_match( '/^[a-z0-9\x80-\xff-]+$/i', $sub ) && preg_match( '/^\X+$/', $sub ) ) ) {
36143644
/** This filter is documented in wp-includes/formatting.php */
36153645
return apply_filters( 'is_email', false, $email, 'sub_invalid_chars' );
36163646
}
@@ -3786,8 +3816,8 @@ function sanitize_email( $email ) {
37863816
* LOCAL PART
37873817
* Test for invalid characters.
37883818
*/
3789-
$local = preg_replace( '/[^a-zA-Z0-9!#$%&\'*+\/=?^_`{|}~\.-]/', '', $local );
3790-
if ( '' === $local ) {
3819+
$local = preg_replace( '/[^a-zA-Z0-9!#$%&\'*+\/=?^_`{|}~\.\x80-\xff-]/', '', $local );
3820+
if ( '' === $local || ! wp_is_valid_utf8( $local ) ) {
37913821
/** This filter is documented in wp-includes/formatting.php */
37923822
return apply_filters( 'sanitize_email', '', $email, 'local_invalid_chars' );
37933823
}
@@ -3827,10 +3857,10 @@ function sanitize_email( $email ) {
38273857
$sub = trim( $sub, " \t\n\r\0\x0B-" );
38283858

38293859
// Test for invalid characters.
3830-
$sub = preg_replace( '/[^a-z0-9-]+/i', '', $sub );
3860+
$sub = preg_replace( '/[^a-z0-9\x80-\xff-]+/i', '', $sub );
38313861

38323862
// If there's anything left, add it to the valid subs.
3833-
if ( '' !== $sub ) {
3863+
if ( '' !== $sub && wp_is_valid_utf8( $sub ) ) {
38343864
$new_subs[] = $sub;
38353865
}
38363866
}
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
<?php
2+
/**
3+
* Tests for the antispambot() function.
4+
*
5+
* @group formatting
6+
* @covers ::antispambot
7+
*/
8+
class Tests_Formatting_Antispambot extends WP_UnitTestCase {

	/**
	 * Ensures that the output of antispambot() is well-formed UTF-8.
	 *
	 * antispambot() had no unit tests before ticket 31992; this adds basic
	 * coverage for both ASCII-only and Unicode addresses.
	 *
	 * @ticket 31992
	 *
	 * @dataProvider data_returns_valid_utf8
	 *
	 * @param string $address  The email address to obfuscate.
	 * @param bool   $validity Whether the obfuscated address should be valid UTF-8.
	 */
	public function test_returns_valid_utf8( $address, $validity ) {
		// Expected value first, so that failure messages label the values correctly.
		$this->assertSame( $validity, wp_is_valid_utf8( antispambot( $address ) ) );
	}

	/**
	 * Data provider for test_returns_valid_utf8.
	 *
	 * @return array[] Data sets of the form array( address, expected validity ).
	 */
	public static function data_returns_valid_utf8() {
		return array(
			'plain'                => array( 'bob@example.com', true ),
			'plain with ip'        => array( 'ace@204.32.222.14', true ),
			'deep subdomain'       => array( 'kevin@many.subdomains.make.a.happy.man.edu', true ),
			'short address'        => array( 'a@b.co', true ),
			'ascii@nonascii'       => array( 'info@grå.org', true ),
			'nonascii@nonascii'    => array( 'grå@grå.org', true ),

			/*
			 * Double quotes are required here: PHP only expands \u{...}
			 * escapes in double-quoted strings. With single quotes the
			 * address would contain literal backslashes instead of
			 * "a" followed by U+030A COMBINING RING ABOVE.
			 */
			'decomposed unicode'   => array( "gr\u{0061}\u{030a}blå@grå.org", true ),
			'weird but legal dots' => array( '..@example.com', true ),
		);
	}
}

tests/phpunit/tests/formatting/isEmail.php

Lines changed: 49 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
<?php
2-
32
/**
3+
* Tests for the is_email() function.
4+
*
45
* @group formatting
56
*
67
* @covers ::is_email
@@ -23,10 +24,15 @@ public static function valid_email_provider() {
2324
$valid_emails = array(
2425
'bob@example.com',
2526
'phil@example.info',
27+
'phil@TLA.example',
2628
'ace@204.32.222.14',
2729
'kevin@many.subdomains.make.a.happy.man.edu',
2830
'a@b.co',
2931
'bill+ted@example.com',
32+
'info@grå.org',
33+
'grå@grå.org',
34+
"gr\u{0061}\u{030a}blå@grå.org",
35+
'..@example.com',
3036
);
3137

3238
foreach ( $valid_emails as $email ) {
@@ -54,6 +60,48 @@ public static function invalid_email_provider() {
5460
'com.exampleNOSPAMbob',
5561
'bob@your mom',
5662
'a@b.c',
63+
'" "@b.c',
64+
'h(aj@couc.ou', // bad comment.
65+
'hi@',
66+
'hi@hi@couc.ou', // double @.
67+
68+
/*
69+
* The next address is not deliverable as described,
70+
* SMTP servers should strip the (ab), so it is very
71+
* likely a source of confusion or a typo.
72+
* Best rejected.
73+
*/
74+
'(ab)cd@couc.ou',
75+
76+
/*
77+
* The next address is not globally deliverable,
78+
* so it may work with PHPMailer and break with
79+
* mail sending services. Best not allow users
80+
* to paint themselves into that corner. This also
81+
* avoids security problems like those that were
82+
* used to probe the WordPress server's local
83+
* network.
84+
*/
85+
'toto@to',
86+
87+
/*
88+
* Several addresses are best rejected because
89+
* we don't want to allow sending to fe80::, 192.168
90+
* and other special addresses; that too might
91+
* be used to probe the WordPress server's local
92+
* network.
93+
*/
94+
'to@[2001:db8::1]',
95+
'to@[IPv6:2001:db8::1]',
96+
'to@[192.168.1.1]',
97+
98+
/*
99+
* Ill-formed UTF-8 byte sequences must be rejected.
100+
* A lone continuation byte (0x80) is not valid UTF-8
101+
* whether it appears in the local part or the domain.
102+
*/
103+
"a\x80b@example.com", // invalid UTF-8 in local part.
104+
"abc@\x80.org", // invalid UTF-8 in domain subdomain.
57105
);
58106

59107
foreach ( $invalid_emails as $email ) {
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
<?php
2+
/**
3+
* Tests for the sanitize_email() function.
4+
*
5+
* @group formatting
6+
* @covers ::sanitize_email
7+
*/
8+
class Tests_Formatting_SanitizeEmail extends WP_UnitTestCase {

	/**
	 * Checks that email addresses are properly sanitized.
	 *
	 * NOTE(review): the Unicode expectations presumably hold only when the
	 * database charset is utf8mb4, because default-filters.php otherwise hooks
	 * wp_ascii_without_controls() onto 'sanitize_email', which strips
	 * non-ASCII bytes. Confirm for the test environment.
	 *
	 * @ticket 31992
	 *
	 * @dataProvider data_for_sanitation
	 *
	 * @param string $address  The email address to sanitize.
	 * @param string $expected The expected sanitized email address.
	 */
	public function test_returns_stripped_email_address( $address, $expected ) {
		// Expected value first, so that failure messages label the values correctly.
		$this->assertSame( $expected, sanitize_email( $address ) );
	}

	/**
	 * Data provider for test_returns_stripped_email_address.
	 *
	 * @return array[] Data sets of the form array( raw address, expected result ).
	 */
	public static function data_for_sanitation() {
		return array(
			'shorter than 6 characters'      => array( 'a@b', '' ),
			'contains no @'                  => array( 'ab', '' ),
			'just a TLD'                     => array( 'abc@com', '' ),
			'plain'                          => array( 'abc@example.com', 'abc@example.com' ),
			'unicode domain'                 => array( 'abc@grå.org', 'abc@grå.org' ),
			'unicode local part'             => array( 'grå@example.com', 'grå@example.com' ),
			'unicode local and domain'       => array( 'grå@grå.org', 'grå@grå.org' ),

			// A lone continuation byte (0x80) is ill-formed UTF-8.
			'invalid utf8 in local'          => array( "a\x80b@example.com", '' ),
			'invalid utf8 subdomain dropped' => array( "abc@sub.\x80.org", 'abc@sub.org' ),
			'all subdomains invalid utf8'    => array( "abc@\x80.org", '' ),
		);
	}
}
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
<?php
2+
/**
3+
* Tests for the wp_ascii_without_controls() function.
4+
*
5+
* @group formatting
6+
*
7+
* @covers ::wp_ascii_without_controls
8+
*/
9+
class Tests_Formatting_WpAsciiWithoutControls extends WP_UnitTestCase {

	/**
	 * Checks that only printable ASCII survives wp_ascii_without_controls().
	 *
	 * The expectations follow the function's documented contract: all control
	 * characters and all non-ASCII bytes are removed.
	 *
	 * @dataProvider data_provider
	 *
	 * @param string $input    The raw input string.
	 * @param string $expected The expected output.
	 */
	public function test_output( $input, $expected ) {
		$this->assertSame( $expected, wp_ascii_without_controls( $input ) );
	}

	/**
	 * Data provider for test_output.
	 *
	 * @return array[] Data sets of the form array( input, expected output ).
	 */
	public static function data_provider() {
		return array(
			'empty string'             => array( '', '' ),
			'plain ASCII'              => array( 'hello world', 'hello world' ),
			'printable ASCII symbols'  => array( '!@#$%^&*()', '!@#$%^&*()' ),

			// C0 control characters (0x00-0x1F) are removed.
			'NUL (0x00)'               => array( "\x00", '' ),
			'SOH (0x01)'               => array( "\x01", '' ),
			'BEL (0x07)'               => array( "\x07", '' ),
			'TAB (0x09)'               => array( "\x09", '' ),
			'LF (0x0A)'                => array( "\x0A", '' ),
			'CR (0x0D)'                => array( "\x0D", '' ),
			'EM (0x19)'                => array( "\x19", '' ),

			// The upper end of the C0 range (0x1A-0x1F) counts as controls too.
			'SUB (0x1A)'               => array( "\x1A", '' ),
			'ESC (0x1B)'               => array( "\x1B", '' ),
			'US (0x1F)'                => array( "\x1F", '' ),

			// Space (0x20) is the first printable character and is kept.
			'space (0x20) kept'        => array( "\x20", ' ' ),

			// DEL (0x7F) is removed.
			'DEL (0x7F)'               => array( "\x7F", '' ),

			// High bytes (0x80-0xFF) are removed. This strips UTF-8 multi-byte sequences.
			'lone continuation (0x80)' => array( "\x80", '' ),
			'high byte (0xFF)'         => array( "\xFF", '' ),
			'utf8 for å (0xC3 0xA5)'   => array( "\xC3\xA5", '' ),

			// Mixed input: controls and high bytes are stripped, ASCII printable is kept.
			'controls embedded'        => array( "a\x01b\x02c", 'abc' ),
			'unicode email stripped'   => array( 'grå@example.com', 'gr@example.com' ),
			'unicode both sides'       => array( 'grå@grå.org', 'gr@gr.org' ),
		);
	}
}

0 commit comments

Comments
 (0)