Skip to content

Commit 2001ef1

Browse files
committed
General: Add support for unicode email addresses in is_email and sanitize_email
This adds support for the unicode address extensions in RFCs 6530–6533 and refactors the code so there are fewer long regexes and less duplication between sanitize_email and is_email. A new class, WP_Email_Address, provides the shared parts. Opting out of unicode support is easy: default-filters.php adds unicode support by adding filters, which can be removed. `sanitize_email` no longer makes major changes like removing an entire subdomain from someone's address; it only cleans up things like soft hyphens and whitespace — changes that happen when copying an email address from text. Developed in: WordPress#5237 Discussed in: https://core.trac.wordpress.org/ticket/31992 Props agulbra, akirk, benniledl, dmsnell, ironprogrammer, justlevine, mdawaffe, mukeshpanchal27, SirLouen, tusharbharti. Fixes #31992. git-svn-id: https://develop.svn.wordpress.org/trunk@62482 602fd350-edb4-49c9-b593-d223f7449a82
1 parent 6939aa7 commit 2001ef1

11 files changed

Lines changed: 836 additions & 175 deletions

File tree

src/wp-includes/class-wp-email-address.php

Lines changed: 405 additions & 0 deletions
Large diffs are not rendered by default.

src/wp-includes/default-filters.php

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,17 @@
8787
add_filter( $filter, 'wp_filter_kses' );
8888
}
8989

90+
// Email addresses: Allow unicode if and only if the database can
91+
// store them. This affects all addresses, including those entered
92+
// into contact forms.
93+
if ( 'utf8mb4' === $wpdb->charset ) {
94+
add_filter( 'is_email', 'wp_is_unicode_email', 10, 3 );
95+
add_filter( 'sanitize_email', 'wp_sanitize_unicode_email', 10, 3 );
96+
} else {
97+
add_filter( 'is_email', 'wp_is_ascii_email', 10, 3 );
98+
add_filter( 'sanitize_email', 'wp_sanitize_ascii_email', 10, 3 );
99+
}
100+
90101
// Display URL.
91102
foreach ( array( 'user_url', 'link_url', 'link_image', 'link_rss', 'comment_url', 'post_guid' ) as $filter ) {
92103
if ( is_admin() ) {

src/wp-includes/formatting.php

Lines changed: 136 additions & 160 deletions
Original file line numberDiff line numberDiff line change
@@ -2176,6 +2176,7 @@ function sanitize_user( $username, $strict = false ) {
21762176
return apply_filters( 'sanitize_user', $username, $raw_username, $strict );
21772177
}
21782178

2179+
21792180
/**
21802181
* Sanitizes a string key.
21812182
*
@@ -3589,7 +3590,14 @@ function convert_smilies( $text ) {
35893590
/**
35903591
* Verifies that an email is valid.
35913592
*
3592-
* Does not grok i18n domains. Not RFC compliant.
3593+
* This accepts the addresses that match the WHATWG specification,
3594+
* i.e. what browsers use for `<input type=email>`. It also accepts some
3595+
* additional addresses.
3596+
*
3597+
* By default this accepts addresses like info@grå.org (also accepted
3598+
* by Firefox in `<input type=email>`). You can disable Unicode support by
3599+
* using the wp_is_ascii_email filter instead of wp_is_unicode_email,
3600+
* which is the default.
35933601
*
35943602
* @since 0.71
35953603
*
@@ -3602,84 +3610,65 @@ function is_email( $email, $deprecated = false ) {
36023610
_deprecated_argument( __FUNCTION__, '3.0.0' );
36033611
}
36043612

3605-
// Test for the minimum length the email can be.
3606-
if ( strlen( $email ) < 6 ) {
3607-
/**
3608-
* Filters whether an email address is valid.
3609-
*
3610-
* This filter is evaluated under several different contexts, such as 'email_too_short',
3611-
* 'email_no_at', 'local_invalid_chars', 'domain_period_sequence', 'domain_period_limits',
3612-
* 'domain_no_periods', 'sub_hyphen_limits', 'sub_invalid_chars', or no specific context.
3613-
*
3614-
* @since 2.8.0
3615-
*
3616-
* @param string|false $is_email The email address if successfully passed the is_email() checks, false otherwise.
3617-
* @param string $email The email address being checked.
3618-
* @param string $context Context under which the email was tested.
3619-
*/
3620-
return apply_filters( 'is_email', false, $email, 'email_too_short' );
3621-
}
3622-
3623-
// Test for an @ character after the first position.
3624-
if ( false === strpos( $email, '@', 1 ) ) {
3625-
/** This filter is documented in wp-includes/formatting.php */
3626-
return apply_filters( 'is_email', false, $email, 'email_no_at' );
3627-
}
3628-
3629-
// Split out the local and domain parts.
3630-
list( $local, $domain ) = explode( '@', $email, 2 );
3631-
3632-
/*
3633-
* LOCAL PART
3634-
* Test for invalid characters.
3635-
*/
3636-
if ( ! preg_match( '/^[a-zA-Z0-9!#$%&\'*+\/=?^_`{|}~\.-]+$/', $local ) ) {
3637-
/** This filter is documented in wp-includes/formatting.php */
3638-
return apply_filters( 'is_email', false, $email, 'local_invalid_chars' );
3639-
}
3640-
3641-
/*
3642-
* DOMAIN PART
3643-
* Test for sequences of periods.
3613+
/**
3614+
* Filters whether an email address is valid.
3615+
*
3616+
* This filter is evaluated under several different contexts, such as
3617+
* 'local_invalid_chars', 'domain_no_periods', or no specific context.
3618+
* Filters registered on this hook perform the actual validation; the
3619+
* default filter is registered in default-filters.php.
3620+
*
3621+
* @since 2.8.0
3622+
*
3623+
* @param string|false $is_email The email address if successfully passed the is_email() checks, false otherwise.
3624+
* @param string $email The email address being checked.
3625+
* @param string|null $context Context under which the email was tested, or null for the initial call.
36443626
*/
3645-
if ( preg_match( '/\.{2,}/', $domain ) ) {
3646-
/** This filter is documented in wp-includes/formatting.php */
3647-
return apply_filters( 'is_email', false, $email, 'domain_period_sequence' );
3648-
}
3649-
3650-
// Test for leading and trailing periods and whitespace.
3651-
if ( trim( $domain, " \t\n\r\0\x0B." ) !== $domain ) {
3652-
/** This filter is documented in wp-includes/formatting.php */
3653-
return apply_filters( 'is_email', false, $email, 'domain_period_limits' );
3654-
}
3655-
3656-
// Split the domain into subs.
3657-
$subs = explode( '.', $domain );
3627+
return apply_filters( 'is_email', false, $email, null );
3628+
}
36583629

3659-
// Assume the domain will have at least two subs.
3660-
if ( 2 > count( $subs ) ) {
3661-
/** This filter is documented in wp-includes/formatting.php */
3662-
return apply_filters( 'is_email', false, $email, 'domain_no_periods' );
3630+
/**
3631+
* Default is_email filter for databases that support Unicode (db charset is utf8mb4).
3632+
*
3633+
* Validates the email address using {@see WP_Email_Address::from_string()} with Unicode enabled.
3634+
* Only acts when $context is null (which it is in the initial validation call); later rescue-context calls are passed through.
3635+
*
3636+
* @since 7.1.0
3637+
*
3638+
* @param string|false $value The current filter value.
3639+
* @param string $email The email address being checked.
3640+
* @param string|null $context Validation context, or null for the initial call.
3641+
* @return string|false The email address if valid, false otherwise.
3642+
*/
3643+
function wp_is_unicode_email( $value, $email, $context ) {
3644+
if ( null !== $context ) {
3645+
return $value;
36633646
}
36643647

3665-
// Loop through each sub.
3666-
foreach ( $subs as $sub ) {
3667-
// Test for leading and trailing hyphens and whitespace.
3668-
if ( trim( $sub, " \t\n\r\0\x0B-" ) !== $sub ) {
3669-
/** This filter is documented in wp-includes/formatting.php */
3670-
return apply_filters( 'is_email', false, $email, 'sub_hyphen_limits' );
3671-
}
3648+
$result = WP_Email_Address::from_string( $email, 'unicode' );
3649+
return $result ? $result->get_unicode_address() : false;
3650+
}
36723651

3673-
// Test for invalid characters.
3674-
if ( ! preg_match( '/^[a-z0-9-]+$/i', $sub ) ) {
3675-
/** This filter is documented in wp-includes/formatting.php */
3676-
return apply_filters( 'is_email', false, $email, 'sub_invalid_chars' );
3677-
}
3652+
/**
3653+
* Default is_email filter for databases that do not support Unicode (db charset is not utf8mb4).
3654+
*
3655+
* Validates the email address using {@see WP_Email_Address::from_string()} with Unicode disabled.
3656+
* Only acts when $context is null (which it is in the initial validation call); later rescue-context calls are passed through.
3657+
*
3658+
* @since 7.1.0
3659+
*
3660+
* @param string|false $value The current filter value.
3661+
* @param string $email The email address being checked.
3662+
* @param string|null $context Validation context, or null for the initial call.
3663+
* @return string|false The email address if valid, false otherwise.
3664+
*/
3665+
function wp_is_ascii_email( $value, $email, $context ) {
3666+
if ( null !== $context ) {
3667+
return $value;
36783668
}
36793669

3680-
// Congratulations, your email made it!
3681-
/** This filter is documented in wp-includes/formatting.php */
3682-
return apply_filters( 'is_email', $email, $email, null );
3670+
$result = WP_Email_Address::from_string( $email, 'ascii' );
3671+
return $result ? $result->get_unicode_address() : false;
36833672
}
36843673

36853674
/**
@@ -3808,109 +3797,96 @@ function iso8601_to_datetime( $date_string, $timezone = 'user' ) {
38083797
}
38093798

38103799
/**
3811-
* Strips out all characters that are not allowable in an email.
3800+
* Sanitizes an email address.
3801+
*
3802+
* Strips stray whitespace from the input, then strips trailing dots from the domain.
3803+
* This is designed to recover from cut/paste mistakes without any risk of transforming
3804+
* the input into a different address than the user intended.
3805+
*
3806+
* Validation and final form are determined by the 'sanitize_email' filter; the default
3807+
* filter is registered in default-filters.php and delegates to {@see WP_Email_Address::from_string()}.
38123808
*
38133809
* @since 1.5.0
3810+
* @since 7.1.0 Accepts Unicode email addresses on supporting platforms.
38143811
*
3815-
* @param string $email Email address to filter.
3816-
* @return string Filtered email address.
3812+
* @param string $email Email address to sanitize.
3813+
* @return string The sanitized email address, or an empty string if invalid.
38173814
*/
38183815
function sanitize_email( $email ) {
3819-
// Test for the minimum length the email can be.
3820-
if ( strlen( $email ) < 6 ) {
3821-
/**
3822-
* Filters a sanitized email address.
3823-
*
3824-
* This filter is evaluated under several contexts, including 'email_too_short',
3825-
* 'email_no_at', 'local_invalid_chars', 'domain_period_sequence', 'domain_period_limits',
3826-
* 'domain_no_periods', 'domain_no_valid_subs', or no context.
3827-
*
3828-
* @since 2.8.0
3829-
*
3830-
* @param string $sanitized_email The sanitized email address.
3831-
* @param string $email The email address, as provided to sanitize_email().
3832-
* @param string|null $message A message to pass to the user. null if email is sanitized.
3833-
*/
3834-
return apply_filters( 'sanitize_email', '', $email, 'email_too_short' );
3835-
}
3836-
3837-
// Test for an @ character after the first position.
3838-
if ( false === strpos( $email, '@', 1 ) ) {
3839-
/** This filter is documented in wp-includes/formatting.php */
3840-
return apply_filters( 'sanitize_email', '', $email, 'email_no_at' );
3841-
}
3842-
3843-
// Split out the local and domain parts.
3844-
list( $local, $domain ) = explode( '@', $email, 2 );
3816+
// Strip surrounding whitespace.
3817+
$email = trim( $email );
38453818

3846-
/*
3847-
* LOCAL PART
3848-
* Test for invalid characters.
3849-
*/
3850-
$local = preg_replace( '/[^a-zA-Z0-9!#$%&\'*+\/=?^_`{|}~\.-]/', '', $local );
3851-
if ( '' === $local ) {
3852-
/** This filter is documented in wp-includes/formatting.php */
3853-
return apply_filters( 'sanitize_email', '', $email, 'local_invalid_chars' );
3819+
// Extract the address from "Display Name <username@domain>" format.
3820+
if ( 1 === preg_match( '/<([^>]+)>$/', $email, $matches ) ) {
3821+
$email = $matches[1];
38543822
}
38553823

38563824
/*
3857-
* DOMAIN PART
3858-
* Test for sequences of periods.
3825+
* Strip soft hyphens and whitespace adjacent to structural separators (dots and @),
3826+
* e.g. copy-paste artifacts like "info@example\u{00AD}.com" or "info@example .com".
3827+
*
3828+
* In some cases, e.g. autocorrect, some older software has been seen to add the
3829+
* space for unrecognized TLDs. This re-joins the parts for proper examination.
38593830
*/
3860-
$domain = preg_replace( '/\.{2,}/', '', $domain );
3861-
if ( '' === $domain ) {
3862-
/** This filter is documented in wp-includes/formatting.php */
3863-
return apply_filters( 'sanitize_email', '', $email, 'domain_period_sequence' );
3864-
}
3865-
3866-
// Test for leading and trailing periods and whitespace.
3867-
$domain = trim( $domain, " \t\n\r\0\x0B." );
3868-
if ( '' === $domain ) {
3869-
/** This filter is documented in wp-includes/formatting.php */
3870-
return apply_filters( 'sanitize_email', '', $email, 'domain_period_limits' );
3871-
}
3872-
3873-
// Split the domain into subs.
3874-
$subs = explode( '.', $domain );
3875-
3876-
// Assume the domain will have at least two subs.
3877-
if ( 2 > count( $subs ) ) {
3878-
/** This filter is documented in wp-includes/formatting.php */
3879-
return apply_filters( 'sanitize_email', '', $email, 'domain_no_periods' );
3880-
}
3881-
3882-
// Create an array that will contain valid subs.
3883-
$new_subs = array();
3884-
3885-
// Loop through each sub.
3886-
foreach ( $subs as $sub ) {
3887-
// Test for leading and trailing hyphens.
3888-
$sub = trim( $sub, " \t\n\r\0\x0B-" );
3831+
$email = preg_replace( '/[\x{00AD}\s]*([.@])[\x{00AD}\s]*/u', '$1', $email ) ?? $email;
38893832

3890-
// Test for invalid characters.
3891-
$sub = preg_replace( '/[^a-z0-9-]+/i', '', $sub );
3892-
3893-
// If there's anything left, add it to the valid subs.
3894-
if ( '' !== $sub ) {
3895-
$new_subs[] = $sub;
3896-
}
3897-
}
3898-
3899-
// If there aren't 2 or more valid subs.
3900-
if ( 2 > count( $new_subs ) ) {
3901-
/** This filter is documented in wp-includes/formatting.php */
3902-
return apply_filters( 'sanitize_email', '', $email, 'domain_no_valid_subs' );
3833+
// Strip trailing dots from the domain (e.g. if pasted from the end of a sentence).
3834+
if ( str_contains( $email, '@' ) ) {
3835+
list( $local, $domain ) = explode( '@', $email, 2 );
3836+
$domain = rtrim( $domain, '.' );
3837+
$email = $local . '@' . $domain;
39033838
}
39043839

3905-
// Join valid subs into the new domain.
3906-
$domain = implode( '.', $new_subs );
3840+
/**
3841+
* Filters a sanitized email address.
3842+
*
3843+
* Filters registered on this hook perform the actual validation and return
3844+
* the canonical email string on success or an empty string on failure.
3845+
* The default filter is registered in default-filters.php.
3846+
*
3847+
* @since 2.8.0
3848+
*
3849+
* @param string $sanitized_email The sanitized email address, or empty string.
3850+
* @param string $email The email address as provided to sanitize_email().
3851+
* @param string|null $context Validation context, or null for the initial call.
3852+
*/
3853+
return apply_filters( 'sanitize_email', '', $email, null );
3854+
}
39073855

3908-
// Put the email back together.
3909-
$sanitized_email = $local . '@' . $domain;
3856+
/**
3857+
* Default sanitize_email filter for databases that support Unicode (db charset is utf8mb4).
3858+
*
3859+
* Returns the canonical address from {@see WP_Email_Address::from_string()} with Unicode
3860+
* enabled, or an empty string if the address is invalid.
3861+
*
3862+
* @since 7.1.0
3863+
*
3864+
* @param string $value The current filter value.
3865+
* @param string $email The email address being sanitized.
3866+
* @param string|null $context Sanitization context, always null.
3867+
* @return string The canonical email address if valid, empty string otherwise.
3868+
*/
3869+
function wp_sanitize_unicode_email( $value, $email, $context ) {
3870+
$result = WP_Email_Address::from_string( $email, 'unicode' );
3871+
return $result ? $result->get_unicode_address() : '';
3872+
}
39103873

3911-
// Congratulations, your email made it!
3912-
/** This filter is documented in wp-includes/formatting.php */
3913-
return apply_filters( 'sanitize_email', $sanitized_email, $email, null );
3874+
/**
3875+
* Default sanitize_email filter for databases that do not support Unicode (db charset is not utf8mb4).
3876+
*
3877+
* Returns the canonical address from {@see WP_Email_Address::from_string()} with Unicode
3878+
* disabled, or an empty string if the address is invalid.
3879+
*
3880+
* @since 7.1.0
3881+
*
3882+
* @param string $value The current filter value.
3883+
* @param string $email The email address being sanitized.
3884+
* @param string|null $context Sanitization context, always null.
3885+
* @return string The canonical email address if valid, empty string otherwise.
3886+
*/
3887+
function wp_sanitize_ascii_email( $value, $email, $context ) {
3888+
$result = WP_Email_Address::from_string( $email, 'ascii' );
3889+
return $result ? $result->get_unicode_address() : '';
39143890
}
39153891

39163892
/**

src/wp-settings.php

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,7 @@
112112
require ABSPATH . WPINC . '/class-wp-list-util.php';
113113
require ABSPATH . WPINC . '/class-wp-token-map.php';
114114
require ABSPATH . WPINC . '/utf8.php';
115+
require ABSPATH . WPINC . '/class-wp-email-address.php';
115116
require ABSPATH . WPINC . '/formatting.php';
116117
require ABSPATH . WPINC . '/meta.php';
117118
require ABSPATH . WPINC . '/functions.php';

tests/phpunit/tests/auth.php

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1520,7 +1520,7 @@ public function test_wp_authenticate_cookie_with_invalid_cookie() {
15201520
*/
15211521
public function test_wp_signon_using_email_with_an_apostrophe() {
15221522
$user_args = array(
1523-
'user_email' => "mail\'@example.com",
1523+
'user_email' => "mail'@example.com",
15241524
'user_pass' => 'password',
15251525
);
15261526
self::factory()->user->create( $user_args );
@@ -1833,7 +1833,7 @@ static function ( $available, WP_User $user ) {
18331833
*/
18341834
public function test_reset_password_with_apostrophe_in_email() {
18351835
$user_args = array(
1836-
'user_email' => "jo'hn@example.com",
1836+
'user_email' => "jo\'hn@example.com",
18371837
'user_pass' => 'password',
18381838
);
18391839

tests/phpunit/tests/formatting/antispambot.php

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,9 @@ public function data_returns_valid_utf8() {
3434
'plain with ip' => array( 'ace@204.32.222.14' ),
3535
'deep subdomain' => array( 'kevin@many.subdomains.make.a.happy.man.edu' ),
3636
'short address' => array( 'a@b.co' ),
37+
'ascii@nonascii' => array( 'info@grå.org' ),
38+
'nonascii@nonascii' => array( 'grå@grå.org' ),
39+
'decomposed unicode' => array( "gr\u{0061}\u{030a}blå@grå.org" ),
3740
'weird but legal dots' => array( '..@example.com' ),
3841
'umlauts' => array( 'bücher@gmx.de' ),
3942
'three-byte UTF-8' => array( "\u{FFFD}@who.knows.com" ),

0 commit comments

Comments
 (0)