Skip to content

Commit d4c66f3

Browse files
committed
Formatting: Refactor sanitize_title_with_dashes() with PCRE Unicode and WP_HTML_Decoder
1 parent 609f25f commit d4c66f3

2 files changed

Lines changed: 93 additions & 29 deletions

File tree

src/wp-includes/formatting.php

Lines changed: 52 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -2274,12 +2274,31 @@ function sanitize_title_for_query( $title ) {
22742274
* @param string $title The title to be sanitized.
22752275
* @param string $raw_title Optional. Not used. Default empty.
22762276
* @param string $context Optional. The operation for which the string is sanitized.
2277-
* When set to 'save', additional entities are converted to hyphens
2278-
* or stripped entirely. Default 'display'.
2277+
* When set to 'save', HTML entities are decoded to raw UTF-8 and
2278+
* Unicode dash punctuation and separators are converted to hyphens.
2279+
* Default 'display'.
22792280
* @return string The sanitized title.
22802281
*/
22812282
function sanitize_title_with_dashes( $title, $raw_title = '', $context = 'display' ) {
22822283
$title = strip_tags( $title );
2284+
2285+
if ( 'save' === $context ) {
2286+
/*
2287+
* Decode HTML entities to raw UTF-8, ensuring all representations of the same
2288+
* character are treated identically.
2289+
*/
2290+
$title = WP_HTML_Decoder::decode_text_node( $title );
2291+
2292+
$title = str_replace( '&', '', $title );
2293+
2294+
if ( _wp_can_use_pcre_u() ) {
2295+
$title = preg_replace( '~[\p{Pd}\p{Z}]~u', '-', $title );
2296+
}
2297+
2298+
// Convert forward slash to hyphen.
2299+
$title = str_replace( '/', '-', $title );
2300+
}
2301+
22832302
// Preserve escaped octets.
22842303
$title = preg_replace( '|%([a-fA-F0-9][a-fA-F0-9])|', '---$1---', $title );
22852304
// Remove percent signs that are not part of an octet.
@@ -2297,12 +2316,38 @@ function sanitize_title_with_dashes( $title, $raw_title = '', $context = 'displa
22972316
$title = strtolower( $title );
22982317

22992318
if ( 'save' === $context ) {
2300-
// Convert &nbsp, non-breaking hyphen, &ndash, and &mdash to hyphens.
2319+
/*
2320+
* Convert known dash punctuation and space separator variants to hyphens.
2321+
*
2322+
* These are the percent-encoded UTF-8 forms produced by utf8_uri_encode().
2323+
* When _wp_can_use_pcre_u() is true, raw UTF-8 dash/space chars were already
2324+
* replaced by PCRE above, so these str_replace() calls become no-ops for those.
2325+
* They remain necessary to handle inputs that arrived as pre-encoded percent
2326+
* sequences.
2327+
*/
23012328
$title = str_replace( array( '%c2%a0', '%e2%80%91', '%e2%80%93', '%e2%80%94' ), '-', $title );
2302-
// Convert &nbsp, non-breaking hyphen, &ndash, and &mdash HTML entities to hyphens.
2303-
$title = str_replace( array( '&nbsp;', '&#8209;', '&#160;', '&ndash;', '&#8211;', '&mdash;', '&#8212;' ), '-', $title );
2304-
// Convert forward slash to hyphen.
2305-
$title = str_replace( '/', '-', $title );
2329+
2330+
// Convert space separator variants (percent-encoded) to hyphen.
2331+
$title = str_replace(
2332+
array(
2333+
'%e2%80%80', // En quad.
2334+
'%e2%80%81', // Em quad.
2335+
'%e2%80%82', // En space.
2336+
'%e2%80%83', // Em space.
2337+
'%e2%80%84', // Three-per-em space.
2338+
'%e2%80%85', // Four-per-em space.
2339+
'%e2%80%86', // Six-per-em space.
2340+
'%e2%80%87', // Figure space.
2341+
'%e2%80%88', // Punctuation space.
2342+
'%e2%80%89', // Thin space.
2343+
'%e2%80%8a', // Hair space.
2344+
'%e2%80%a8', // Line separator.
2345+
'%e2%80%a9', // Paragraph separator.
2346+
'%e2%80%af', // Narrow no-break space.
2347+
),
2348+
'-',
2349+
$title
2350+
);
23062351

23072352
// Strip these characters entirely.
23082353
$title = str_replace(
@@ -2361,28 +2406,6 @@ function sanitize_title_with_dashes( $title, $raw_title = '', $context = 'displa
23612406
$title
23622407
);
23632408

2364-
// Convert non-visible characters that display with a width to hyphen.
2365-
$title = str_replace(
2366-
array(
2367-
'%e2%80%80', // En quad.
2368-
'%e2%80%81', // Em quad.
2369-
'%e2%80%82', // En space.
2370-
'%e2%80%83', // Em space.
2371-
'%e2%80%84', // Three-per-em space.
2372-
'%e2%80%85', // Four-per-em space.
2373-
'%e2%80%86', // Six-per-em space.
2374-
'%e2%80%87', // Figure space.
2375-
'%e2%80%88', // Punctuation space.
2376-
'%e2%80%89', // Thin space.
2377-
'%e2%80%8a', // Hair space.
2378-
'%e2%80%a8', // Line separator.
2379-
'%e2%80%a9', // Paragraph separator.
2380-
'%e2%80%af', // Narrow no-break space.
2381-
),
2382-
'-',
2383-
$title
2384-
);
2385-
23862409
// Convert &times to 'x'.
23872410
$title = str_replace( '%c3%97', 'x', $title );
23882411
}

tests/phpunit/tests/formatting/sanitizeTitleWithDashes.php

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -327,6 +327,47 @@ public function data_converts_non_visible_characters_with_width_to_hyphen() {
327327
);
328328
}
329329

330+
/**
331+
* @ticket 64151
332+
*/
333+
public function test_replaces_hex_nbsp_entity() {
334+
$this->assertSame( 'dont-break-the-space', sanitize_title_with_dashes( "don\u{2019}t break&#xA0;the space", '', 'save' ) );
335+
}
336+
337+
/**
338+
* @ticket 64151
339+
*/
340+
public function test_replaces_hex_ndash_mdash_entities() {
341+
$this->assertSame( 'do-the-dash', sanitize_title_with_dashes( 'Do &#x2013; the Dash', '', 'save' ) );
342+
$this->assertSame( 'do-the-dash', sanitize_title_with_dashes( 'Do &#x2014; the Dash', '', 'save' ) );
343+
}
344+
345+
/**
346+
* @ticket 64151
347+
*/
348+
public function test_replaces_hex_non_breaking_hyphen_entity() {
349+
$this->assertSame( 'do-the-dash', sanitize_title_with_dashes( 'Do &#x2011; the Dash', '', 'save' ) );
350+
}
351+
352+
/**
353+
* @ticket 64151
354+
*/
355+
public function test_replaces_additional_dash_punctuation() {
356+
$this->assertSame( 'do-the-dash', sanitize_title_with_dashes( "Do \u{2012} the Dash", '', 'save' ) );
357+
$this->assertSame( 'do-the-dash', sanitize_title_with_dashes( "Do \u{2015} the Dash", '', 'save' ) );
358+
$this->assertSame( 'do-the-dash', sanitize_title_with_dashes( "Do \u{2010} the Dash", '', 'save' ) );
359+
}
360+
361+
/**
362+
* @ticket 64151
363+
*/
364+
public function test_replaces_additional_space_separators() {
365+
$this->assertSame( 'do-the-space', sanitize_title_with_dashes( "Do \u{1680} the Space", '', 'save' ) );
366+
$this->assertSame( 'do-the-space', sanitize_title_with_dashes( "Do \u{205F} the Space", '', 'save' ) );
367+
$this->assertSame( 'do-the-space', sanitize_title_with_dashes( "Do \u{202F} the Space", '', 'save' ) );
368+
$this->assertSame( 'do-the-space', sanitize_title_with_dashes( "Do \u{3000} the Space", '', 'save' ) );
369+
}
370+
330371
/**
331372
* @ticket 47912
332373
* @dataProvider data_non_visible_characters_with_width_to_hyphen_when_not_save

0 commit comments

Comments
 (0)