@@ -243,10 +243,8 @@ public static function decode( $context, $text, $at = 0, $length = null ) {
243243 * @return string|null Decoded character reference if found, otherwise `null`.
244244 */
245245 public static function read_character_reference ( $ context , $ text , $ at , &$ skip_bytes = null ) {
246- global $ html5_named_character_entity_set ;
247-
248246 $ length = strlen ( $ text );
249- if ( $ at + 1 >= $ length ) {
247+ if ( $ at + 2 >= $ length ) {
250248 return null ;
251249 }
252250
@@ -283,18 +281,18 @@ public static function read_character_reference( $context, $text, $at, &$skip_by
283281 $ max_digits = 7 ; // 
284282 }
285283
286- // Cannot encode invalid Unicode code points. Max is to U+10FFFF.
287- $ zero_count = strspn ( $ text , '0 ' , $ digits_at );
288- $ digit_count = strspn ( $ text , $ numeric_digits , $ digits_at + $ zero_count );
289- $ after_digits = $ digits_at + $ zero_count + $ digit_count ;
290- $ has_semicolon = $ after_digits < $ length && '; ' === $ text [ $ after_digits ];
291- $ end_of_span = $ has_semicolon ? $ after_digits + 1 : $ after_digits ;
284+ $ zero_count = strspn ( $ text , '0 ' , $ digits_at );
285+ $ digit_count = strspn ( $ text , $ numeric_digits , $ digits_at + $ zero_count );
292286
293287 // `&#` or `&#x` without digits returns into plaintext.
294288 if ( 0 === $ digit_count && 0 === $ zero_count ) {
295289 return null ;
296290 }
297291
292+ $ after_digits = $ digits_at + $ zero_count + $ digit_count ;
293+ $ has_semicolon = $ after_digits < $ length && '; ' === $ text [ $ after_digits ];
294+ $ end_of_span = $ has_semicolon ? $ after_digits + 1 : $ after_digits ;
295+
298296 if ( 0 === $ digit_count ) {
299297 $ skip_bytes = $ end_of_span - $ at ;
300298 return '� ' ;
@@ -328,60 +326,12 @@ public static function read_character_reference( $context, $text, $at, &$skip_by
328326 * @see https://infra.spec.whatwg.org/#noncharacter
329327 */
330328
331- /*
332- * Code points in the C1 controls area need to be remapped as if they
333- * were stored in Windows-1252. Note! This transformation only happens
334- * for numeric character references. The raw code points in the byte
335- * stream are not translated.
336- *
337- * > If the number is one of the numbers in the first column of
338- * > the following table, then find the row with that number in
339- * > the first column, and set the character reference code to
340- * > the number in the second column of that row.
341- */
342- if ( $ code_point >= 0x80 && $ code_point <= 0x9F ) {
343- $ windows_1252_mapping = array (
344- 0x20AC , // 0x80 -> EURO SIGN (€).
345- 0x81 , // 0x81 -> (no change).
346- 0x201A , // 0x82 -> SINGLE LOW-9 QUOTATION MARK (‚).
347- 0x0192 , // 0x83 -> LATIN SMALL LETTER F WITH HOOK (ƒ).
348- 0x201E , // 0x84 -> DOUBLE LOW-9 QUOTATION MARK („).
349- 0x2026 , // 0x85 -> HORIZONTAL ELLIPSIS (…).
350- 0x2020 , // 0x86 -> DAGGER (†).
351- 0x2021 , // 0x87 -> DOUBLE DAGGER (‡).
352- 0x02C6 , // 0x88 -> MODIFIER LETTER CIRCUMFLEX ACCENT (ˆ).
353- 0x2030 , // 0x89 -> PER MILLE SIGN (‰).
354- 0x0160 , // 0x8A -> LATIN CAPITAL LETTER S WITH CARON (Š).
355- 0x2039 , // 0x8B -> SINGLE LEFT-POINTING ANGLE QUOTATION MARK (‹).
356- 0x0152 , // 0x8C -> LATIN CAPITAL LIGATURE OE (Œ).
357- 0x8D , // 0x8D -> (no change).
358- 0x017D , // 0x8E -> LATIN CAPITAL LETTER Z WITH CARON (Ž).
359- 0x8F , // 0x8F -> (no change).
360- 0x90 , // 0x90 -> (no change).
361- 0x2018 , // 0x91 -> LEFT SINGLE QUOTATION MARK (‘).
362- 0x2019 , // 0x92 -> RIGHT SINGLE QUOTATION MARK (’).
363- 0x201C , // 0x93 -> LEFT DOUBLE QUOTATION MARK (“).
364- 0x201D , // 0x94 -> RIGHT DOUBLE QUOTATION MARK (”).
365- 0x2022 , // 0x95 -> BULLET (•).
366- 0x2013 , // 0x96 -> EN DASH (–).
367- 0x2014 , // 0x97 -> EM DASH (—).
368- 0x02DC , // 0x98 -> SMALL TILDE (˜).
369- 0x2122 , // 0x99 -> TRADE MARK SIGN (™).
370- 0x0161 , // 0x9A -> LATIN SMALL LETTER S WITH CARON (š).
371- 0x203A , // 0x9B -> SINGLE RIGHT-POINTING ANGLE QUOTATION MARK (›).
372- 0x0153 , // 0x9C -> LATIN SMALL LIGATURE OE (œ).
373- 0x9D , // 0x9D -> (no change).
374- 0x017E , // 0x9E -> LATIN SMALL LETTER Z WITH CARON (ž).
375- 0x0178 , // 0x9F -> LATIN CAPITAL LETTER Y WITH DIAERESIS (Ÿ).
376- );
377-
378- $ code_point = $ windows_1252_mapping [ $ code_point - 0x80 ];
379- }
380-
381329 $ skip_bytes = $ end_of_span - $ at ;
382330 return self ::code_point_to_utf8_bytes ( $ code_point );
383331 }
384332
333+ global $ html5_named_character_entity_set ;
334+
385335 /** Tracks inner parsing within the named character reference. */
386336 $ name_at = $ at + 1 ;
387337 // Minimum named character reference is two characters. E.g. `GT`.
@@ -460,32 +410,114 @@ public static function code_point_to_utf8_bytes( $code_point ) {
460410 return '� ' ;
461411 }
462412
463- if ( $ code_point <= 0x7F ) {
464- return chr ( $ code_point );
465- }
466-
467- if ( $ code_point <= 0x7FF ) {
468- $ byte1 = ( $ code_point >> 6 ) | 0xC0 ;
469- $ byte2 = $ code_point & 0x3F | 0x80 ;
413+ if ( $ code_point > 0xFFFF ) {
414+ $ byte1 = ( $ code_point >> 18 ) | 0xF0 ;
415+ $ byte2 = ( $ code_point >> 12 ) & 0x3F | 0x80 ;
416+ $ byte3 = ( $ code_point >> 6 ) & 0x3F | 0x80 ;
417+ $ byte4 = $ code_point & 0x3F | 0x80 ;
470418
471- return pack ( ' CC ' , $ byte1, $ byte2 );
419+ return chr ( $ byte1 ) . chr ( $ byte2 ) . chr ( $ byte3 ) . chr ( $ byte4 );
472420 }
473421
474- if ( $ code_point <= 0xFFFF ) {
422+ if ( $ code_point > 0x7FF ) {
475423 $ byte1 = ( $ code_point >> 12 ) | 0xE0 ;
476424 $ byte2 = ( $ code_point >> 6 ) & 0x3F | 0x80 ;
477425 $ byte3 = $ code_point & 0x3F | 0x80 ;
478426
479- return pack ( ' CCC ' , $ byte1, $ byte2, $ byte3 );
427+ return chr ( $ byte1 ) . chr ( $ byte2 ) . chr ( $ byte3 );
480428 }
481429
482- if ( $ code_point <= 0x10FFFF ) {
483- $ byte1 = ( $ code_point >> 18 ) | 0xF0 ;
484- $ byte2 = ( $ code_point >> 12 ) & 0x3F | 0x80 ;
485- $ byte3 = ( $ code_point >> 6 ) & 0x3F | 0x80 ;
486- $ byte4 = $ code_point & 0x3F | 0x80 ;
430+ if ( $ code_point <= 0x7F ) {
431+ return chr ( $ code_point );
432+ }
433+
434+ if ( $ code_point <= 0x7FF ) {
435+ /*
436+ * Code points in the C1 controls area need to be remapped as if they
437+ * were stored in Windows-1252. Note! This transformation only happens
438+ * for numeric character references. The raw code points in the byte
439+ * stream are not translated.
440+ *
441+ * > If the number is one of the numbers in the first column of
442+ * > the following table, then find the row with that number in
443+ * > the first column, and set the character reference code to
444+ * > the number in the second column of that row.
445+ */
446+ if ( $ code_point <= 0x9F ) {
447+ $ windows_1252_mapping = array (
448+ // 0x20AC, // 0x80 -> EURO SIGN (€).
449+ // 0x81, // 0x81 -> (no change).
450+ // 0x201A, // 0x82 -> SINGLE LOW-9 QUOTATION MARK (‚).
451+ // 0x0192, // 0x83 -> LATIN SMALL LETTER F WITH HOOK (ƒ).
452+ // 0x201E, // 0x84 -> DOUBLE LOW-9 QUOTATION MARK („).
453+ // 0x2026, // 0x85 -> HORIZONTAL ELLIPSIS (…).
454+ // 0x2020, // 0x86 -> DAGGER (†).
455+ // 0x2021, // 0x87 -> DOUBLE DAGGER (‡).
456+ // 0x02C6, // 0x88 -> MODIFIER LETTER CIRCUMFLEX ACCENT (ˆ).
457+ // 0x2030, // 0x89 -> PER MILLE SIGN (‰).
458+ // 0x0160, // 0x8A -> LATIN CAPITAL LETTER S WITH CARON (Š).
459+ // 0x2039, // 0x8B -> SINGLE LEFT-POINTING ANGLE QUOTATION MARK (‹).
460+ // 0x0152, // 0x8C -> LATIN CAPITAL LIGATURE OE (Œ).
461+ // 0x8D, // 0x8D -> (no change).
462+ // 0x017D, // 0x8E -> LATIN CAPITAL LETTER Z WITH CARON (Ž).
463+ // 0x8F, // 0x8F -> (no change).
464+ // 0x90, // 0x90 -> (no change).
465+ // 0x2018, // 0x91 -> LEFT SINGLE QUOTATION MARK (‘).
466+ // 0x2019, // 0x92 -> RIGHT SINGLE QUOTATION MARK (’).
467+ // 0x201C, // 0x93 -> LEFT DOUBLE QUOTATION MARK (“).
468+ // 0x201D, // 0x94 -> RIGHT DOUBLE QUOTATION MARK (”).
469+ // 0x2022, // 0x95 -> BULLET (•).
470+ // 0x2013, // 0x96 -> EN DASH (–).
471+ // 0x2014, // 0x97 -> EM DASH (—).
472+ // 0x02DC, // 0x98 -> SMALL TILDE (˜).
473+ // 0x2122, // 0x99 -> TRADE MARK SIGN (™).
474+ // 0x0161, // 0x9A -> LATIN SMALL LETTER S WITH CARON (š).
475+ // 0x203A, // 0x9B -> SINGLE RIGHT-POINTING ANGLE QUOTATION MARK (›).
476+ // 0x0153, // 0x9C -> LATIN SMALL LIGATURE OE (œ).
477+ // 0x9D, // 0x9D -> (no change).
478+ // 0x017E, // 0x9E -> LATIN SMALL LETTER Z WITH CARON (ž).
479+ // 0x0178, // 0x9F -> LATIN CAPITAL LETTER Y WITH DIAERESIS (Ÿ).
480+ '€ ' ,
481+ ' ' ,
482+ '‚ ' ,
483+ 'ƒ ' ,
484+ '„ ' ,
485+ '… ' ,
486+ '† ' ,
487+ '‡ ' ,
488+ 'ˆ ' ,
489+ '‰ ' ,
490+ 'Š ' ,
491+ '‹ ' ,
492+ 'Œ ' ,
493+ ' ' ,
494+ 'Ž ' ,
495+ ' ' ,
496+ ' ' ,
497+ '‘ ' ,
498+ '’ ' ,
499+ '“ ' ,
500+ '” ' ,
501+ '• ' ,
502+ '– ' ,
503+ '— ' ,
504+ '˜ ' ,
505+ '™ ' ,
506+ 'š ' ,
507+ '› ' ,
508+ 'œ ' ,
509+ ' ' ,
510+ 'ž ' ,
511+ 'Ÿ ' ,
512+ );
513+
514+ return $ windows_1252_mapping [ $ code_point - 0x80 ];
515+ }
516+
517+ $ byte1 = ( $ code_point >> 6 ) | 0xC0 ;
518+ $ byte2 = $ code_point & 0x3F | 0x80 ;
487519
488- return pack ( ' CCCC ' , $ byte1, $ byte2 , $ byte3 , $ byte4 );
520+ return chr ( $ byte1 ) . chr ( $ byte2 );
489521 }
490522 }
491523}
0 commit comments