Skip to content

Commit 0351b78

Browse files
committed
Try some optimizations
1 parent b80e784 commit 0351b78

1 file changed

Lines changed: 107 additions & 75 deletions

File tree

src/wp-includes/html-api/class-wp-html-decoder.php

Lines changed: 107 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -243,10 +243,8 @@ public static function decode( $context, $text, $at = 0, $length = null ) {
243243
* @return string|null Decoded character reference if found, otherwise `null`.
244244
*/
245245
public static function read_character_reference( $context, $text, $at, &$skip_bytes = null ) {
246-
global $html5_named_character_entity_set;
247-
248246
$length = strlen( $text );
249-
if ( $at + 1 >= $length ) {
247+
if ( $at + 2 >= $length ) {
250248
return null;
251249
}
252250

@@ -283,18 +281,18 @@ public static function read_character_reference( $context, $text, $at, &$skip_by
283281
$max_digits = 7; // &#1114111;
284282
}
285283

286-
// Cannot encode invalid Unicode code points. Max is to U+10FFFF.
287-
$zero_count = strspn( $text, '0', $digits_at );
288-
$digit_count = strspn( $text, $numeric_digits, $digits_at + $zero_count );
289-
$after_digits = $digits_at + $zero_count + $digit_count;
290-
$has_semicolon = $after_digits < $length && ';' === $text[ $after_digits ];
291-
$end_of_span = $has_semicolon ? $after_digits + 1 : $after_digits;
284+
$zero_count = strspn( $text, '0', $digits_at );
285+
$digit_count = strspn( $text, $numeric_digits, $digits_at + $zero_count );
292286

293287
// `&#` or `&#x` without digits returns into plaintext.
294288
if ( 0 === $digit_count && 0 === $zero_count ) {
295289
return null;
296290
}
297291

292+
$after_digits = $digits_at + $zero_count + $digit_count;
293+
$has_semicolon = $after_digits < $length && ';' === $text[ $after_digits ];
294+
$end_of_span = $has_semicolon ? $after_digits + 1 : $after_digits;
295+
298296
if ( 0 === $digit_count ) {
299297
$skip_bytes = $end_of_span - $at;
300298
return '';
@@ -328,60 +326,12 @@ public static function read_character_reference( $context, $text, $at, &$skip_by
328326
* @see https://infra.spec.whatwg.org/#noncharacter
329327
*/
330328

331-
/*
332-
* Code points in the C1 controls area need to be remapped as if they
333-
* were stored in Windows-1252. Note! This transformation only happens
334-
* for numeric character references. The raw code points in the byte
335-
* stream are not translated.
336-
*
337-
* > If the number is one of the numbers in the first column of
338-
* > the following table, then find the row with that number in
339-
* > the first column, and set the character reference code to
340-
* > the number in the second column of that row.
341-
*/
342-
if ( $code_point >= 0x80 && $code_point <= 0x9F ) {
343-
$windows_1252_mapping = array(
344-
0x20AC, // 0x80 -> EURO SIGN (€).
345-
0x81, // 0x81 -> (no change).
346-
0x201A, // 0x82 -> SINGLE LOW-9 QUOTATION MARK (‚).
347-
0x0192, // 0x83 -> LATIN SMALL LETTER F WITH HOOK (ƒ).
348-
0x201E, // 0x84 -> DOUBLE LOW-9 QUOTATION MARK („).
349-
0x2026, // 0x85 -> HORIZONTAL ELLIPSIS (…).
350-
0x2020, // 0x86 -> DAGGER (†).
351-
0x2021, // 0x87 -> DOUBLE DAGGER (‡).
352-
0x02C6, // 0x88 -> MODIFIER LETTER CIRCUMFLEX ACCENT (ˆ).
353-
0x2030, // 0x89 -> PER MILLE SIGN (‰).
354-
0x0160, // 0x8A -> LATIN CAPITAL LETTER S WITH CARON (Š).
355-
0x2039, // 0x8B -> SINGLE LEFT-POINTING ANGLE QUOTATION MARK (‹).
356-
0x0152, // 0x8C -> LATIN CAPITAL LIGATURE OE (Œ).
357-
0x8D, // 0x8D -> (no change).
358-
0x017D, // 0x8E -> LATIN CAPITAL LETTER Z WITH CARON (Ž).
359-
0x8F, // 0x8F -> (no change).
360-
0x90, // 0x90 -> (no change).
361-
0x2018, // 0x91 -> LEFT SINGLE QUOTATION MARK (‘).
362-
0x2019, // 0x92 -> RIGHT SINGLE QUOTATION MARK (’).
363-
0x201C, // 0x93 -> LEFT DOUBLE QUOTATION MARK (“).
364-
0x201D, // 0x94 -> RIGHT DOUBLE QUOTATION MARK (”).
365-
0x2022, // 0x95 -> BULLET (•).
366-
0x2013, // 0x96 -> EN DASH (–).
367-
0x2014, // 0x97 -> EM DASH (—).
368-
0x02DC, // 0x98 -> SMALL TILDE (˜).
369-
0x2122, // 0x99 -> TRADE MARK SIGN (™).
370-
0x0161, // 0x9A -> LATIN SMALL LETTER S WITH CARON (š).
371-
0x203A, // 0x9B -> SINGLE RIGHT-POINTING ANGLE QUOTATION MARK (›).
372-
0x0153, // 0x9C -> LATIN SMALL LIGATURE OE (œ).
373-
0x9D, // 0x9D -> (no change).
374-
0x017E, // 0x9E -> LATIN SMALL LETTER Z WITH CARON (ž).
375-
0x0178, // 0x9F -> LATIN CAPITAL LETTER Y WITH DIAERESIS (Ÿ).
376-
);
377-
378-
$code_point = $windows_1252_mapping[ $code_point - 0x80 ];
379-
}
380-
381329
$skip_bytes = $end_of_span - $at;
382330
return self::code_point_to_utf8_bytes( $code_point );
383331
}
384332

333+
global $html5_named_character_entity_set;
334+
385335
/** Tracks inner parsing within the named character reference. */
386336
$name_at = $at + 1;
387337
// Minimum named character reference is two characters. E.g. `GT`.
@@ -460,32 +410,114 @@ public static function code_point_to_utf8_bytes( $code_point ) {
460410
return '';
461411
}
462412

463-
if ( $code_point <= 0x7F ) {
464-
return chr( $code_point );
465-
}
466-
467-
if ( $code_point <= 0x7FF ) {
468-
$byte1 = ( $code_point >> 6 ) | 0xC0;
469-
$byte2 = $code_point & 0x3F | 0x80;
413+
if ( $code_point > 0xFFFF ) {
414+
$byte1 = ( $code_point >> 18 ) | 0xF0;
415+
$byte2 = ( $code_point >> 12 ) & 0x3F | 0x80;
416+
$byte3 = ( $code_point >> 6 ) & 0x3F | 0x80;
417+
$byte4 = $code_point & 0x3F | 0x80;
470418

471-
return pack( 'CC', $byte1, $byte2 );
419+
return chr( $byte1 ) . chr( $byte2 ) . chr( $byte3 ) . chr( $byte4 );
472420
}
473421

474-
if ( $code_point <= 0xFFFF ) {
422+
if ( $code_point > 0x7FF ) {
475423
$byte1 = ( $code_point >> 12 ) | 0xE0;
476424
$byte2 = ( $code_point >> 6 ) & 0x3F | 0x80;
477425
$byte3 = $code_point & 0x3F | 0x80;
478426

479-
return pack( 'CCC', $byte1, $byte2, $byte3 );
427+
return chr( $byte1 ) . chr( $byte2 ) . chr( $byte3 );
480428
}
481429

482-
if ( $code_point <= 0x10FFFF ) {
483-
$byte1 = ( $code_point >> 18 ) | 0xF0;
484-
$byte2 = ( $code_point >> 12 ) & 0x3F | 0x80;
485-
$byte3 = ( $code_point >> 6 ) & 0x3F | 0x80;
486-
$byte4 = $code_point & 0x3F | 0x80;
430+
if ( $code_point <= 0x7F ) {
431+
return chr( $code_point );
432+
}
433+
434+
if ( $code_point <= 0x7FF ) {
435+
/*
436+
* Code points in the C1 controls area need to be remapped as if they
437+
* were stored in Windows-1252. Note! This transformation only happens
438+
* for numeric character references. The raw code points in the byte
439+
* stream are not translated.
440+
*
441+
* > If the number is one of the numbers in the first column of
442+
* > the following table, then find the row with that number in
443+
* > the first column, and set the character reference code to
444+
* > the number in the second column of that row.
445+
*/
446+
if ( $code_point <= 0x9F ) {
447+
$windows_1252_mapping = array(
448+
// 0x20AC, // 0x80 -> EURO SIGN (€).
449+
// 0x81, // 0x81 -> (no change).
450+
// 0x201A, // 0x82 -> SINGLE LOW-9 QUOTATION MARK (‚).
451+
// 0x0192, // 0x83 -> LATIN SMALL LETTER F WITH HOOK (ƒ).
452+
// 0x201E, // 0x84 -> DOUBLE LOW-9 QUOTATION MARK („).
453+
// 0x2026, // 0x85 -> HORIZONTAL ELLIPSIS (…).
454+
// 0x2020, // 0x86 -> DAGGER (†).
455+
// 0x2021, // 0x87 -> DOUBLE DAGGER (‡).
456+
// 0x02C6, // 0x88 -> MODIFIER LETTER CIRCUMFLEX ACCENT (ˆ).
457+
// 0x2030, // 0x89 -> PER MILLE SIGN (‰).
458+
// 0x0160, // 0x8A -> LATIN CAPITAL LETTER S WITH CARON (Š).
459+
// 0x2039, // 0x8B -> SINGLE LEFT-POINTING ANGLE QUOTATION MARK (‹).
460+
// 0x0152, // 0x8C -> LATIN CAPITAL LIGATURE OE (Œ).
461+
// 0x8D, // 0x8D -> (no change).
462+
// 0x017D, // 0x8E -> LATIN CAPITAL LETTER Z WITH CARON (Ž).
463+
// 0x8F, // 0x8F -> (no change).
464+
// 0x90, // 0x90 -> (no change).
465+
// 0x2018, // 0x91 -> LEFT SINGLE QUOTATION MARK (‘).
466+
// 0x2019, // 0x92 -> RIGHT SINGLE QUOTATION MARK (’).
467+
// 0x201C, // 0x93 -> LEFT DOUBLE QUOTATION MARK (“).
468+
// 0x201D, // 0x94 -> RIGHT DOUBLE QUOTATION MARK (”).
469+
// 0x2022, // 0x95 -> BULLET (•).
470+
// 0x2013, // 0x96 -> EN DASH (–).
471+
// 0x2014, // 0x97 -> EM DASH (—).
472+
// 0x02DC, // 0x98 -> SMALL TILDE (˜).
473+
// 0x2122, // 0x99 -> TRADE MARK SIGN (™).
474+
// 0x0161, // 0x9A -> LATIN SMALL LETTER S WITH CARON (š).
475+
// 0x203A, // 0x9B -> SINGLE RIGHT-POINTING ANGLE QUOTATION MARK (›).
476+
// 0x0153, // 0x9C -> LATIN SMALL LIGATURE OE (œ).
477+
// 0x9D, // 0x9D -> (no change).
478+
// 0x017E, // 0x9E -> LATIN SMALL LETTER Z WITH CARON (ž).
479+
// 0x0178, // 0x9F -> LATIN CAPITAL LETTER Y WITH DIAERESIS (Ÿ).
480+
'€',
481+
"\xC2\x81",
482+
'‚',
483+
'ƒ',
484+
'„',
485+
'…',
486+
'†',
487+
'‡',
488+
'ˆ',
489+
'‰',
490+
'Š',
491+
'‹',
492+
'Œ',
493+
"\xC2\x8D",
494+
'Ž',
495+
"\xC2\x8F",
496+
"\xC2\x90",
497+
'‘',
498+
'’',
499+
'“',
500+
'”',
501+
'•',
502+
'–',
503+
'—',
504+
'˜',
505+
'™',
506+
'š',
507+
'›',
508+
'œ',
509+
"\xC2\x9D",
510+
'ž',
511+
'Ÿ',
512+
);
513+
514+
return $windows_1252_mapping[ $code_point - 0x80 ];
515+
}
516+
517+
$byte1 = ( $code_point >> 6 ) | 0xC0;
518+
$byte2 = $code_point & 0x3F | 0x80;
487519

488-
return pack( 'CCCC', $byte1, $byte2, $byte3, $byte4 );
520+
return chr( $byte1 ) . chr( $byte2 );
489521
}
490522
}
491523
}

0 commit comments

Comments
 (0)