Skip to content

Commit 35eb40a

Browse files
Replace printables table with unicode_data.rs tables
This gets rid of the `printable.py` script, ensuring that `unicode-table-generator` handles all our Unicode data table generation needs. I've elected to give each Unicode property its own table, instead of merging them all into one. This is slightly less efficient in terms of space, but should allow us to expose these tables in the future with public methods on `char`.
1 parent da089a7 commit 35eb40a

9 files changed

Lines changed: 614 additions & 912 deletions

File tree

library/core/src/char/methods.rs

Lines changed: 124 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@ use crate::panic::const_panic;
55
use crate::slice;
66
use crate::str::from_utf8_unchecked_mut;
77
use crate::ub_checks::assert_unsafe_precondition;
8-
use crate::unicode::printable::is_printable;
98
use crate::unicode::{self, conversions};
109

1110
impl char {
@@ -478,18 +477,29 @@ impl char {
478477
#[inline]
479478
pub(crate) fn escape_debug_ext(self, args: EscapeDebugExtArgs) -> EscapeDebug {
480479
match self {
481-
'\0' => EscapeDebug::backslash(ascii::Char::Digit0),
482-
'\t' => EscapeDebug::backslash(ascii::Char::SmallT),
483-
'\r' => EscapeDebug::backslash(ascii::Char::SmallR),
484-
'\n' => EscapeDebug::backslash(ascii::Char::SmallN),
485-
'\\' => EscapeDebug::backslash(ascii::Char::ReverseSolidus),
480+
// Special escapes
486481
'\"' if args.escape_double_quote => EscapeDebug::backslash(ascii::Char::QuotationMark),
487482
'\'' if args.escape_single_quote => EscapeDebug::backslash(ascii::Char::Apostrophe),
488-
_ if args.escape_grapheme_extender && self.is_grapheme_extender() => {
483+
'\\' => EscapeDebug::backslash(ascii::Char::ReverseSolidus),
484+
'\n' => EscapeDebug::backslash(ascii::Char::SmallN),
485+
'\t' => EscapeDebug::backslash(ascii::Char::SmallT),
486+
'\r' => EscapeDebug::backslash(ascii::Char::SmallR),
487+
'\0' => EscapeDebug::backslash(ascii::Char::Digit0),
488+
489+
// ASCII fast path
490+
'\x20'..='\x7E' => EscapeDebug::printable(self),
491+
492+
_ if self.is_control()
493+
|| self.is_private_use()
494+
|| self.is_whitespace()
495+
|| args.escape_grapheme_extender && self.is_grapheme_extender()
496+
|| self.is_format_control()
497+
|| self.is_unassigned() =>
498+
{
489499
EscapeDebug::unicode(self)
490500
}
491-
_ if is_printable(self) => EscapeDebug::printable(self),
492-
_ => EscapeDebug::unicode(self),
501+
502+
_ => EscapeDebug::printable(self),
493503
}
494504
}
495505

@@ -1110,6 +1120,111 @@ impl char {
11101120
matches!(self, '\0'..='\x1f' | '\x7f'..='\u{9f}')
11111121
}
11121122

1123+
/// Returns `true` if this `char` has the general category for [private-use characters].
1124+
/// These characters do not have an interpretation specified by Unicode; individual programs
1125+
/// and users are free to assign them whatever meaning they like.
1126+
///
1127+
/// [private-use characters]: https://www.unicode.org/faq/private_use#private_use
1128+
///
1129+
/// Private-use characters (code points with the general category of `Co`) are [described] in Chapter 23
1130+
/// (Special Areas and Format Characters) of the Unicode Standard, and [specified] in the
1131+
/// Unicode Character Database [`UnicodeData.txt`]. The full set of private-use characters is
1132+
/// `'\u{E000}'..='\u{F8FF}' | '\u{F0000}'..='\u{FFFFD}' | '\u{100000}'..='\u{10FFFD}'`,
1133+
/// and will never change.
1134+
///
1135+
/// [described]: https://www.unicode.org/versions/latest/core-spec/chapter-23/#G19184
1136+
/// [specified]: https://www.unicode.org/reports/tr44/#GC_Values_Table
1137+
/// [`UnicodeData.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
1138+
///
1139+
#[must_use]
1140+
#[inline]
1141+
const fn is_private_use(self) -> bool {
1142+
// According to
1143+
// https://www.unicode.org/policies/stability_policy.html#Property_Value,
1144+
// the set of codepoints in `Co` will never change.
1145+
// So we can just hard-code the patterns to match against instead of using a table.
1146+
matches!(self, '\u{E000}'..='\u{F8FF}' | '\u{F0000}'..='\u{FFFFD}' | '\u{100000}'..='\u{10FFFD}')
1147+
}
1148+
1149+
/// Returns `true` if this `char` has the general category for format control characters.
1150+
///
1151+
/// Format controls (code points with the general category of `Cf`) are [described] in Chapter 4
1152+
/// (Character Properties) of the Unicode Standard, and [specified] in the Unicode Character
1153+
/// Database [`UnicodeData.txt`].
1154+
///
1155+
/// [described]: https://www.unicode.org/versions/latest/core-spec/chapter-4/#G134153
1156+
/// [specified]: https://www.unicode.org/reports/tr44/#GC_Values_Table
1157+
/// [`UnicodeData.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
1158+
///
1159+
/// # Examples
1160+
///
1161+
/// Basic usage:
1162+
///
1163+
/// ```ignore(private)
1164+
/// assert!('\u{AD}'.is_format_control()); // SOFT HYPHEN
1165+
/// assert!('\u{200B}'.is_format_control()); // ZERO WIDTH SPACE
1166+
/// assert!('\u{E0041}'.is_format_control()); // TAG LATIN CAPITAL LETTER A
1167+
/// assert!('۝'.is_format_control()); // ARABIC END OF AYAH
1168+
/// assert!('𓐲'.is_format_control()); // EGYPTIAN HIEROGLYPH INSERT AT TOP START
1169+
/// assert!(!'q'.is_format_control());
1170+
/// ```
1171+
#[must_use]
1172+
#[inline]
1173+
fn is_format_control(self) -> bool {
1174+
self > '\u{AC}' && unicode::Cf(self)
1175+
}
1176+
1177+
/// Returns `true` if this `char` has not yet been assigned a meaning by Unicode, as of
1178+
/// [`UNICODE_VERSION`].
1179+
///
1180+
/// [`UNICODE_VERSION`]: Self::UNICODE_VERSION
1181+
///
1182+
/// These characters may have a meaning assigned in the future,
1183+
/// except for the 66 [noncharacters], which will never be assigned a meaning.
1184+
///
1185+
/// [noncharacters]: https://www.unicode.org/faq/private_use#noncharacters
1186+
///
1187+
/// Many of Unicode's [stability policies] apply only to assigned characters.
1188+
///
1189+
/// [stability policies]: https://www.unicode.org/policies/stability_policy.html
1190+
///
1191+
/// Unassigned characters (code points with the general category of `Cn`) are [described] in Chapter 4
1192+
/// (Character Properties) of the Unicode Standard, and [specified] in the Unicode Character Database
1193+
/// by their exclusion from [`UnicodeData.txt`].
1194+
///
1195+
/// [described]: https://www.unicode.org/versions/latest/core-spec/chapter-4/#G134153
1196+
/// [specified]: https://www.unicode.org/reports/tr44/#GC_Values_Table
1197+
/// [`UnicodeData.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
1198+
///
1199+
/// # Examples
1200+
///
1201+
/// Basic usage:
1202+
///
1203+
/// ```ignore(private)
1204+
/// assert!('\u{FFFE}'.is_unassigned()); // noncharacter, will never be assigned
1205+
///
1206+
/// //assert!('\u{7AAAA}'.is_unassigned()); // not currently assigned, but may be in the future,
1207+
/// // so we shouldn't rely on the current status
1208+
///
1209+
/// assert!(!'γ'.is_unassigned()); // once a character is assigned, it stays assigned forever
1210+
/// ```
1211+
#[must_use]
1212+
#[inline]
1213+
fn is_unassigned(self) -> bool {
1214+
match self {
1215+
'\0'..='\u{377}' => false,
1216+
'\u{378}'..='\u{3FFFD}' => unicode::Cn_planes_0_3(self),
1217+
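// Assigned character ranges in planes 4 and above. (The plane 3 noncharacters U+3FFFE and U+3FFFF also reach this point and fall through to the catch-all arm.)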
1218+
// `src/tools/unicode-table-generator/src/main.rs` asserts that this is correct
1219+
'\u{E0001}'
1220+
| '\u{E0020}'..='\u{E007F}'
1221+
| '\u{E0100}'..='\u{E01EF}'
1222+
| '\u{F0000}'..='\u{FFFFD}'
1223+
| '\u{100000}'..='\u{10FFFD}' => false,
1224+
_ => true,
1225+
}
1226+
}
1227+
11131228
/// Returns `true` if this `char` has the `Grapheme_Extend` property.
11141229
///
11151230
/// `Grapheme_Extend` is [described] in Chapter 3 (Conformance) of the Unicode Standard,

library/core/src/unicode/mod.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,15 +9,15 @@ pub use unicode_data::conversions;
99
#[rustfmt::skip]
1010
pub(crate) use unicode_data::alphabetic::lookup as Alphabetic;
1111
pub(crate) use unicode_data::case_ignorable::lookup as Case_Ignorable;
12+
pub(crate) use unicode_data::cf::lookup as Cf;
13+
pub(crate) use unicode_data::cn_planes_0_3::lookup as Cn_planes_0_3;
1214
pub(crate) use unicode_data::grapheme_extend::lookup as Grapheme_Extend;
1315
pub(crate) use unicode_data::lowercase::lookup as Lowercase;
1416
pub(crate) use unicode_data::lt::lookup as Lt;
1517
pub(crate) use unicode_data::n::lookup as N;
1618
pub(crate) use unicode_data::uppercase::lookup as Uppercase;
1719
pub(crate) use unicode_data::white_space::lookup as White_Space;
1820

19-
pub(crate) mod printable;
20-
2121
#[allow(unreachable_pub)]
2222
pub mod unicode_data;
2323

0 commit comments

Comments
 (0)