@@ -276,7 +276,7 @@ public static function read_character_reference( $context, $text, $at, &$skip_by
276276 $ numeric_base = 16 ;
277277 $ numeric_digits = '0123456789abcdefABCDEF ' ;
278278 $ max_digits = 6 ; // Six hex digits cover the largest code point, U+10FFFF.
279- $ digits_at += 1 ;
279+ ++ $ digits_at ;
280280 } else {
281281 $ numeric_base = 10 ;
282282 $ numeric_digits = '0123456789 ' ;
@@ -308,49 +308,25 @@ public static function read_character_reference( $context, $text, $at, &$skip_by
308308 $ digits = substr ( $ text , $ digits_at + $ zero_count , $ digit_count );
309309 $ code_point = intval ( $ digits , $ numeric_base );
310310
311- if (
312- // Null character.
313- 0 === $ code_point ||
314-
315- // Outside Unicode range.
316- $ code_point > 0x10FFFF ||
317-
318- // Surrogate.
319- ( $ code_point >= 0xD800 && $ code_point <= 0xDFFF )
320- ) {
321- $ skip_bytes = $ end_of_span - $ at ;
322- return '� ' ;
323- }
324-
325- if (
326- /*
327- * Noncharacters.
328- *
329- * > A noncharacter is a code point that is in the range U+FDD0 to U+FDEF,
330- * > inclusive, or U+FFFE, U+FFFF, U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF,
331- * > U+3FFFE, U+3FFFF, U+4FFFE, U+4FFFF, U+5FFFE, U+5FFFF, U+6FFFE,
332- * > U+6FFFF, U+7FFFE, U+7FFFF, U+8FFFE, U+8FFFF, U+9FFFE, U+9FFFF,
333- * > U+AFFFE, U+AFFFF, U+BFFFE, U+BFFFF, U+CFFFE, U+CFFFF, U+DFFFE,
334- * > U+DFFFF, U+EFFFE, U+EFFFF, U+FFFFE, U+FFFFF, U+10FFFE, or U+10FFFF.
335- *
336- * @see https://infra.spec.whatwg.org/#noncharacter
337- */
338- ( $ code_point >= 0xFDD0 && $ code_point <= 0xFDEF ) ||
339- ( 0xFFFE === ( $ code_point & 0xFFFE ) ) ||
340-
341- // 0x0D or non-ASCII-whitespace control
342- 0x0D === $ code_point ||
343- (
344- $ code_point >= 0 &&
345- $ code_point <= 0x1F &&
346- 0x9 !== $ code_point &&
347- 0xA !== $ code_point &&
348- 0xC !== $ code_point &&
349- 0xD !== $ code_point
350- )
351- ) {
352- // @todo This is an error but the code point passes through.
353- }
311+ /*
312+ * Noncharacters, 0x0D, and non-ASCII-whitespace control characters.
313+ *
314+ * > A noncharacter is a code point that is in the range U+FDD0 to U+FDEF,
315+ * > inclusive, or U+FFFE, U+FFFF, U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF,
316+ * > U+3FFFE, U+3FFFF, U+4FFFE, U+4FFFF, U+5FFFE, U+5FFFF, U+6FFFE,
317+ * > U+6FFFF, U+7FFFE, U+7FFFF, U+8FFFE, U+8FFFF, U+9FFFE, U+9FFFF,
318+ * > U+AFFFE, U+AFFFF, U+BFFFE, U+BFFFF, U+CFFFE, U+CFFFF, U+DFFFE,
319+ * > U+DFFFF, U+EFFFE, U+EFFFF, U+FFFFE, U+FFFFF, U+10FFFE, or U+10FFFF.
320+ *
321+ * A C0 control is a code point in the range U+00 to U+1F; of these,
322+ * U+09, U+0A, U+0C, and U+0D are also ASCII whitespace.
323+ *
324+ * These characters are invalid but still decode as any valid character.
325+ * This comment is here to note and explain why there's no check to
326+ * remove these characters or replace them.
327+ *
328+ * @see https://infra.spec.whatwg.org/#noncharacter
329+ */
354330
355331 /*
356332 * > If the number is one of the numbers in the first column of
@@ -449,7 +425,36 @@ public static function read_character_reference( $context, $text, $at, &$skip_by
449425 return $ name ;
450426 }
451427
428+ /**
429+ * Encode a code point number into the UTF-8 encoding.
430+ *
431+ * This encoder implements the encoding algorithm for converting a number
432+ * into a byte sequence, but if it receives an invalid code point it will
433+ * return the Unicode Replacement Character U+FFFD `�`.
434+ *
435+ * Example:
436+ *
437+ * '🅰' === WP_HTML_Decoder::code_point_to_utf8_bytes( 0x1f170 );
438+ *
439+ * // Half of a surrogate pair is an invalid code point.
440+ * '�' === WP_HTML_Decoder::code_point_to_utf8_bytes( 0xd83c );
441+ *
442+ * @since 6.6.0
443+ *
444+ * @see https://www.rfc-editor.org/rfc/rfc3629 UTF-8
445+ *
446+ * @param int $code_point Which code point to convert.
447+ * @return string Converted code point, or `�` if invalid.
448+ */
452449 public static function code_point_to_utf8_bytes ( $ code_point ) {
450+ if (
451+ $ code_point <= 0 ||
452+ ( $ code_point >= 0xD800 && $ code_point <= 0xDFFF ) ||
453+ $ code_point > 0x10FFFF
454+ ) {
455+ return '� ' ;
456+ }
457+
453458 if ( $ code_point < 0x80 ) {
454459 return chr ( $ code_point );
455460 }
0 commit comments