@@ -5,7 +5,6 @@ use crate::panic::const_panic;
55use crate :: slice;
66use crate :: str:: from_utf8_unchecked_mut;
77use crate :: ub_checks:: assert_unsafe_precondition;
8- use crate :: unicode:: printable:: is_printable;
98use crate :: unicode:: { self , conversions} ;
109
1110impl char {
@@ -478,18 +477,29 @@ impl char {
478477 #[ inline]
479478 pub ( crate ) fn escape_debug_ext ( self , args : EscapeDebugExtArgs ) -> EscapeDebug {
480479 match self { // arms are tried top to bottom, so the dedicated escapes below win over the ASCII fast path
481- '\0' => EscapeDebug :: backslash ( ascii:: Char :: Digit0 ) ,
482- '\t' => EscapeDebug :: backslash ( ascii:: Char :: SmallT ) ,
483- '\r' => EscapeDebug :: backslash ( ascii:: Char :: SmallR ) ,
484- '\n' => EscapeDebug :: backslash ( ascii:: Char :: SmallN ) ,
485- '\\' => EscapeDebug :: backslash ( ascii:: Char :: ReverseSolidus ) ,
480+ // Special escapes: characters with a dedicated backslash form (the two quote characters only when `args` asks for them)
486481 '\"' if args. escape_double_quote => EscapeDebug :: backslash ( ascii:: Char :: QuotationMark ) ,
487482 '\'' if args. escape_single_quote => EscapeDebug :: backslash ( ascii:: Char :: Apostrophe ) ,
488- _ if args. escape_grapheme_extender && self . is_grapheme_extender ( ) => {
483+ '\\' => EscapeDebug :: backslash ( ascii:: Char :: ReverseSolidus ) ,
484+ '\n' => EscapeDebug :: backslash ( ascii:: Char :: SmallN ) ,
485+ '\t' => EscapeDebug :: backslash ( ascii:: Char :: SmallT ) ,
486+ '\r' => EscapeDebug :: backslash ( ascii:: Char :: SmallR ) ,
487+ '\0' => EscapeDebug :: backslash ( ascii:: Char :: Digit0 ) ,
488+
489+ // ASCII fast path: space through `~` is passed through as-is, without evaluating any of the predicates below
490+ '\x20' ..='\x7E' => EscapeDebug :: printable ( self ) ,
491+
492+ _ if self . is_control ( ) // any one of these predicates routes the character to `EscapeDebug::unicode`
493+ || self . is_private_use ( )
494+ || self . is_whitespace ( )
495+ || args. escape_grapheme_extender && self . is_grapheme_extender ( ) // `&&` binds tighter than `||`: only checked when requested
496+ || self . is_format_control ( )
497+ || self . is_unassigned ( ) =>
498+ {
489499 EscapeDebug :: unicode ( self )
490500 }
491- _ if is_printable ( self ) => EscapeDebug :: printable ( self ) ,
492- _ => EscapeDebug :: unicode ( self ) ,
501+
502+ _ => EscapeDebug :: printable ( self ) , // nothing above matched: emit the character unchanged
493503 }
494504 }
495505
@@ -1110,6 +1120,111 @@ impl char {
11101120 matches ! ( self , '\0' ..='\x1f' | '\x7f' ..='\u{9f}' )
11111121 }
11121122
1123+ /// Returns `true` if this `char` has the general category for [private-use characters].
1124+ /// These characters do not have an interpretation specified by Unicode; individual programs
1125+ /// and users are free to assign them whatever meaning they like.
1126+ ///
1127+ /// [private-use characters]: https://www.unicode.org/faq/private_use#private_use
1128+ ///
1129+ /// Private-use characters (code points with the general category of `Co`) are [described] in Chapter 23
1130+ /// (Special Areas and Format Characters) of the Unicode Standard, and [specified] in the
1131+ /// Unicode Character Database [`UnicodeData.txt`]. The full set of private-use characters is
1132+ /// `'\u{E000}'..='\u{F8FF}' | '\u{F0000}'..='\u{FFFFD}' | '\u{100000}'..='\u{10FFFD}'`,
1133+ /// and will never change.
1134+ ///
1135+ /// [described]: https://www.unicode.org/versions/latest/core-spec/chapter-23/#G19184
1136+ /// [specified]: https://www.unicode.org/reports/tr44/#GC_Values_Table
1137+ /// [`UnicodeData.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
1138+ ///
1139+ #[ must_use]
1140+ #[ inline]
1141+ const fn is_private_use ( self ) -> bool {
1142+ // According to the Unicode stability policy
1143+ // (https://www.unicode.org/policies/stability_policy.html#Property_Value),
1144+ // the set of code points in `Co` will never change.
1145+ // So the three ranges are hard-coded in the pattern below instead of being looked up in a table.
1146+ matches ! ( self , '\u{E000}' ..='\u{F8FF}' | '\u{F0000}' ..='\u{FFFFD}' | '\u{100000}' ..='\u{10FFFD}' ) // same three ranges as listed in the doc comment above
1147+ }
1148+
1149+ /// Returns `true` if this `char` has the general category for format control characters.
1150+ ///
1151+ /// Format controls (code points with the general category of `Cf`) are [described] in Chapter 4
1152+ /// (Character Properties) of the Unicode Standard, and [specified] in the Unicode Character
1153+ /// Database [`UnicodeData.txt`].
1154+ ///
1155+ /// [described]: https://www.unicode.org/versions/latest/core-spec/chapter-4/#G134153
1156+ /// [specified]: https://www.unicode.org/reports/tr44/#GC_Values_Table
1157+ /// [`UnicodeData.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
1158+ ///
1159+ /// # Examples
1160+ ///
1161+ /// Basic usage:
1162+ ///
1163+ /// ```ignore(private)
1164+ /// assert!('\u{AD}'.is_format_control()); // SOFT HYPHEN
1165+ /// assert!('\u{200B}'.is_format_control()); // ZERO WIDTH SPACE
1166+ /// assert!('\u{E0041}'.is_format_control()); // TAG LATIN CAPITAL LETTER A
1167+ /// assert!('\u{6DD}'.is_format_control()); // ARABIC END OF AYAH
1168+ /// assert!('\u{13432}'.is_format_control()); // EGYPTIAN HIEROGLYPH INSERT AT TOP START
1169+ /// assert!(!'q'.is_format_control());
1170+ /// ```
1171+ #[ must_use]
1172+ #[ inline]
1173+ fn is_format_control ( self ) -> bool {
1174+ self > '\u{AC}' && unicode:: Cf ( self ) // cheap comparison first: code points up to U+00AC short-circuit to `false` and never reach the `unicode::Cf` lookup
1175+ }
1176+
1177+ /// Returns `true` if this `char` has not yet been assigned a meaning by Unicode, as of
1178+ /// [`UNICODE_VERSION`].
1179+ ///
1180+ /// [`UNICODE_VERSION`]: Self::UNICODE_VERSION
1181+ ///
1182+ /// These characters may have a meaning assigned in the future,
1183+ /// except for the 66 [noncharacters] which will never be assigned a meaning.
1184+ ///
1185+ /// [noncharacters]: https://www.unicode.org/faq/private_use#noncharacters
1186+ ///
1187+ /// Many of Unicode's [stability policies] apply only to assigned characters.
1188+ ///
1189+ /// [stability policies]: https://www.unicode.org/policies/stability_policy.html
1190+ ///
1191+ /// Unassigned characters (code points with the general category of `Cn`) are [described] in Chapter 4
1192+ /// (Character Properties) of the Unicode Standard, and [specified] in the Unicode Character Database
1193+ /// by their exclusion from [`UnicodeData.txt`].
1194+ ///
1195+ /// [described]: https://www.unicode.org/versions/latest/core-spec/chapter-4/#G134153
1196+ /// [specified]: https://www.unicode.org/reports/tr44/#GC_Values_Table
1197+ /// [`UnicodeData.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
1198+ ///
1199+ /// # Examples
1200+ ///
1201+ /// Basic usage:
1202+ ///
1203+ /// ```ignore(private)
1204+ /// assert!('\u{FFFE}'.is_unassigned()); // noncharacter, will never be assigned
1205+ ///
1206+ /// //assert!('\u{7AAAA}'.is_unassigned()); // not currently assigned, but may be in the future,
1207+ /// // so we shouldn't rely on the current status
1208+ ///
1209+ /// assert!(!'γ'.is_unassigned()); // once a character is assigned, it stays assigned forever
1210+ /// ```
1211+ #[ must_use]
1212+ #[ inline]
1213+ fn is_unassigned ( self ) -> bool {
1214+ match self {
1215+ '\0' ..='\u{377}' => false , // fast path: this low range is answered without touching the table
1216+ '\u{378}' ..='\u{3FFFD}' => unicode:: Cn_planes_0_3 ( self ) , // the rest of this range is decided by the `unicode::Cn_planes_0_3` lookup
1217+ // Assigned character ranges above U+3FFFF (i.e. in planes 4 and above), hard-coded instead of tabulated.
1218+ // `src/tools/unicode-table-generator/src/main.rs` asserts that this list is correct.
1219+ '\u{E0001}'
1220+ | '\u{E0020}' ..='\u{E007F}'
1221+ | '\u{E0100}' ..='\u{E01EF}'
1222+ | '\u{F0000}' ..='\u{FFFFD}'
1223+ | '\u{100000}' ..='\u{10FFFD}' => false ,
1224+ _ => true , // everything else is unassigned, including U+3FFFE and U+3FFFF, which no arm above covers
1225+ }
1226+ }
1227+
11131228 /// Returns `true` if this `char` has the `Grapheme_Extend` property.
11141229 ///
11151230 /// `Grapheme_Extend` is [described] in Chapter 3 (Conformance) of the Unicode Standard,
0 commit comments