diff --git a/library/core/src/char/methods.rs b/library/core/src/char/methods.rs index 00b735e91a377..c275ae2f621d9 100644 --- a/library/core/src/char/methods.rs +++ b/library/core/src/char/methods.rs @@ -5,7 +5,6 @@ use crate::panic::const_panic; use crate::slice; use crate::str::from_utf8_unchecked_mut; use crate::ub_checks::assert_unsafe_precondition; -use crate::unicode::printable::is_printable; use crate::unicode::{self, conversions}; impl char { @@ -93,13 +92,18 @@ impl char { /// The version of [Unicode](https://www.unicode.org/) that the Unicode parts of /// `char` and `str` methods are based on. /// - /// New versions of Unicode are released regularly and subsequently all methods - /// in the standard library depending on Unicode are updated. Therefore the - /// behavior of some `char` and `str` methods and the value of this constant - /// changes over time. This is *not* considered to be a breaking change. + /// New versions of Unicode are released regularly, and subsequently all methods + /// in the standard library depending on Unicode are updated. Therefore, the + /// behavior of some `char` and `str` methods, and the value of this constant, + /// change over time (within the boundaries of Unicode's [stability policies]). + /// This is *not* considered to be a breaking change. + /// + /// [stability policies]: https://www.unicode.org/policies/stability_policy.html /// /// The version numbering scheme is explained in - /// [Unicode 11.0 or later, Section 3.1 Versions of the Unicode Standard](https://www.unicode.org/versions/Unicode11.0.0/ch03.pdf#page=4). + /// [Section 3.1 (Version Numbering)] of the Unicode Standard. 
+ /// + /// [Section 3.1 (Version Numbering)]: https://www.unicode.org/versions/latest/core-spec/chapter-3/#G49512 #[stable(feature = "assoc_char_consts", since = "1.52.0")] pub const UNICODE_VERSION: (u8, u8, u8) = crate::unicode::UNICODE_VERSION; @@ -473,18 +477,30 @@ impl char { #[inline] pub(crate) fn escape_debug_ext(self, args: EscapeDebugExtArgs) -> EscapeDebug { match self { - '\0' => EscapeDebug::backslash(ascii::Char::Digit0), - '\t' => EscapeDebug::backslash(ascii::Char::SmallT), - '\r' => EscapeDebug::backslash(ascii::Char::SmallR), - '\n' => EscapeDebug::backslash(ascii::Char::SmallN), - '\\' => EscapeDebug::backslash(ascii::Char::ReverseSolidus), + // Special escapes '\"' if args.escape_double_quote => EscapeDebug::backslash(ascii::Char::QuotationMark), '\'' if args.escape_single_quote => EscapeDebug::backslash(ascii::Char::Apostrophe), - _ if args.escape_grapheme_extended && self.is_grapheme_extended() => { + '\\' => EscapeDebug::backslash(ascii::Char::ReverseSolidus), + '\n' => EscapeDebug::backslash(ascii::Char::SmallN), + '\t' => EscapeDebug::backslash(ascii::Char::SmallT), + '\r' => EscapeDebug::backslash(ascii::Char::SmallR), + '\0' => EscapeDebug::backslash(ascii::Char::Digit0), + + // ASCII fast path + '\x20'..='\x7E' => EscapeDebug::printable(self), + + _ if self.is_control() + || self.is_private_use() + || self.is_whitespace() + || args.escape_grapheme_extender && self.is_grapheme_extender() + || self.is_default_ignorable() + || self.is_format_control() + || self.is_unassigned() => + { EscapeDebug::unicode(self) } - _ if is_printable(self) => EscapeDebug::printable(self), - _ => EscapeDebug::unicode(self), + + _ => EscapeDebug::printable(self), } } @@ -753,11 +769,11 @@ impl char { /// Returns `true` if this `char` has the `Alphabetic` property. /// - /// `Alphabetic` is described in Chapter 4 (Character Properties) of the [Unicode Standard] and - /// specified in the [Unicode Character Database][ucd] [`DerivedCoreProperties.txt`]. 
+ /// `Alphabetic` is [described] in Chapter 4 (Character Properties) of the Unicode Standard, and + /// [specified] in the Unicode Character Database [`DerivedCoreProperties.txt`]. /// - /// [Unicode Standard]: https://www.unicode.org/versions/latest/ - /// [ucd]: https://www.unicode.org/reports/tr44/ + /// [described]: https://www.unicode.org/versions/latest/core-spec/chapter-4/#G32524 + /// [specified]: https://www.unicode.org/reports/tr44/#Alphabetic /// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt /// /// # Examples @@ -786,11 +802,11 @@ impl char { /// Returns `true` if this `char` has the `Cased` property. /// A character is cased if and only if it is uppercase, lowercase, or titlecase. /// - /// `Cased` is described in Chapter 4 (Character Properties) of the [Unicode Standard] and - /// specified in the [Unicode Character Database][ucd] [`DerivedCoreProperties.txt`]. + /// `Cased` is [described] in Chapter 3 (Conformance) of the Unicode Standard, and + /// [specified] in the Unicode Character Database [`DerivedCoreProperties.txt`]. /// - /// [Unicode Standard]: https://www.unicode.org/versions/latest/ - /// [ucd]: https://www.unicode.org/reports/tr44/ + /// [described]: https://www.unicode.org/versions/latest/core-spec/chapter-3/#G44595 + /// [specified]: https://www.unicode.org/reports/tr44/#Cased /// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt /// /// # Examples @@ -849,11 +865,11 @@ impl char { /// Returns `true` if this `char` has the `Lowercase` property. /// - /// `Lowercase` is described in Chapter 4 (Character Properties) of the [Unicode Standard] and - /// specified in the [Unicode Character Database][ucd] [`DerivedCoreProperties.txt`]. + /// `Lowercase` is [described] in Chapter 4 (Character Properties) of the Unicode Standard, and + /// [specified] in the Unicode Character Database [`DerivedCoreProperties.txt`]. 
/// - /// [Unicode Standard]: https://www.unicode.org/versions/latest/ - /// [ucd]: https://www.unicode.org/reports/tr44/ + /// [described]: https://www.unicode.org/versions/latest/core-spec/chapter-4/#G136255 + /// [specified]: https://www.unicode.org/reports/tr44/#Lowercase /// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt /// /// # Examples @@ -889,15 +905,15 @@ impl char { } } - /// Returns `true` if this `char` has the general category for titlecase letters. + /// Returns `true` if this `char` is in the general category for titlecase letters. /// Conceptually, these characters consist of an uppercase portion followed by a lowercase portion. /// - /// Titlecase letters (code points with the general category of `Lt`) are described in Chapter 4 - /// (Character Properties) of the [Unicode Standard] and specified in the [Unicode Character - /// Database][ucd] [`UnicodeData.txt`]. + /// Titlecase letters (code points with the general category of `Lt`) are [described] in Chapter 4 + /// (Character Properties) of the Unicode Standard, and [specified] in the Unicode Character + /// Database [`UnicodeData.txt`]. /// - /// [Unicode Standard]: https://www.unicode.org/versions/latest/ - /// [ucd]: https://www.unicode.org/reports/tr44/ + /// [described]: https://www.unicode.org/versions/latest/core-spec/chapter-4/#G124722 + /// [specified]: https://www.unicode.org/reports/tr44/#GC_Values_Table /// [`UnicodeData.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt /// /// # Examples @@ -925,11 +941,11 @@ impl char { /// Returns `true` if this `char` has the `Uppercase` property. /// - /// `Uppercase` is described in Chapter 4 (Character Properties) of the [Unicode Standard] and - /// specified in the [Unicode Character Database][ucd] [`DerivedCoreProperties.txt`]. 
+ /// `Uppercase` is [described] in Chapter 4 (Character Properties) of the Unicode Standard, and + /// [specified] in the Unicode Character Database [`DerivedCoreProperties.txt`]. /// - /// [Unicode Standard]: https://www.unicode.org/versions/latest/ - /// [ucd]: https://www.unicode.org/reports/tr44/ + /// [described]: https://www.unicode.org/versions/latest/core-spec/chapter-4/#G136255 + /// [specified]: https://www.unicode.org/reports/tr44/#Uppercase /// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt /// /// # Examples @@ -965,44 +981,54 @@ impl char { } } - /// Returns `true` if this `char` has the `White_Space` property. + /// Returns `true` if this `char` has one of the general categories for numbers. /// - /// `White_Space` is specified in the [Unicode Character Database][ucd] [`PropList.txt`]. + /// The general categories for numbers (`Nd` for decimal digits, `Nl` for letter-like numeric + /// characters, and `No` for other numeric characters) are [specified] in the Unicode Character + /// Database [`UnicodeData.txt`]. /// - /// [ucd]: https://www.unicode.org/reports/tr44/ - /// [`PropList.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt + /// This method doesn't cover everything that could be considered a number, e.g. ideographic numbers like '三'. + /// If you want everything including characters with overlapping purposes, then you might want to use + /// a Unicode or language-processing library that exposes the appropriate character properties + /// (e.g. [`Numeric_Type`]) instead of looking at the Unicode categories. + /// + /// If you want to parse ASCII decimal digits (0-9) or ASCII base-N, use + /// `is_ascii_digit` or `is_digit` instead. 
+ /// + /// [specified]: https://www.unicode.org/reports/tr44/#GC_Values_Table + /// [`UnicodeData.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt + /// [`Numeric_Type`]: https://www.unicode.org/reports/tr44/#Numeric_Type /// /// # Examples /// /// Basic usage: /// /// ``` - /// assert!(' '.is_whitespace()); - /// - /// // line break - /// assert!('\n'.is_whitespace()); - /// - /// // a non-breaking space - /// assert!('\u{A0}'.is_whitespace()); - /// - /// assert!(!'越'.is_whitespace()); + /// assert!('٣'.is_numeric()); + /// assert!('7'.is_numeric()); + /// assert!('৬'.is_numeric()); + /// assert!('¾'.is_numeric()); + /// assert!('①'.is_numeric()); + /// assert!(!'K'.is_numeric()); + /// assert!(!'و'.is_numeric()); + /// assert!(!'藏'.is_numeric()); + /// assert!(!'三'.is_numeric()); /// ``` #[must_use] #[stable(feature = "rust1", since = "1.0.0")] - #[rustc_const_stable(feature = "const_char_classify", since = "1.87.0")] #[inline] - pub const fn is_whitespace(self) -> bool { + pub fn is_numeric(self) -> bool { match self { - ' ' | '\x09'..='\x0d' => true, - '\0'..='\u{84}' => false, - _ => unicode::White_Space(self), + '0'..='9' => true, + '\0'..='\u{B1}' => false, + _ => unicode::N(self), } } /// Returns `true` if this `char` satisfies either [`is_alphabetic()`] or [`is_numeric()`]. /// - /// [`is_alphabetic()`]: #method.is_alphabetic - /// [`is_numeric()`]: #method.is_numeric + /// [`is_alphabetic()`]: Self::is_alphabetic + /// [`is_numeric()`]: Self::is_numeric /// /// # Examples /// @@ -1029,14 +1055,49 @@ impl char { } } + /// Returns `true` if this `char` has the `White_Space` property. + /// + /// `White_Space` is [specified] in the Unicode Character Database [`PropList.txt`]. 
+ /// + /// [specified]: https://www.unicode.org/reports/tr44/#White_Space + /// [`PropList.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// assert!(' '.is_whitespace()); + /// + /// // line break + /// assert!('\n'.is_whitespace()); + /// + /// // a non-breaking space + /// assert!('\u{A0}'.is_whitespace()); + /// + /// assert!(!'越'.is_whitespace()); + /// ``` + #[must_use] + #[stable(feature = "rust1", since = "1.0.0")] + #[rustc_const_stable(feature = "const_char_classify", since = "1.87.0")] + #[inline] + pub const fn is_whitespace(self) -> bool { + match self { + ' ' | '\x09'..='\x0d' => true, + '\0'..='\u{84}' => false, + _ => unicode::White_Space(self), + } + } + /// Returns `true` if this `char` has the general category for control codes. /// - /// Control codes (code points with the general category of `Cc`) are described in Chapter 4 - /// (Character Properties) of the [Unicode Standard] and specified in the [Unicode Character - /// Database][ucd] [`UnicodeData.txt`]. + /// Control codes (code points with the general category of `Cc`) are [described] in Chapter 23 + /// (Special Areas and Format Characters) of the Unicode Standard, and [specified] in the Unicode Character + /// Database [`UnicodeData.txt`]. The full set of Unicode control codes is + /// `'\0'..='\x1f' | '\x7f'..='\u{9f}'`, and will never change. 
/// - /// [Unicode Standard]: https://www.unicode.org/versions/latest/ - /// [ucd]: https://www.unicode.org/reports/tr44/ + /// [described]: https://www.unicode.org/versions/latest/core-spec/chapter-23/#G20365 + /// [specified]: https://www.unicode.org/reports/tr44/#GC_Values_Table /// [`UnicodeData.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt /// /// # Examples @@ -1044,8 +1105,9 @@ impl char { /// Basic usage: /// /// ``` - /// // U+009C, STRING TERMINATOR - /// assert!('œ'.is_control()); + /// assert!('\t'.is_control()); + /// assert!('\n'.is_control()); + /// assert!('\u{9C}'.is_control()); // STRING TERMINATOR /// assert!(!'q'.is_control()); /// ``` #[must_use] @@ -1059,84 +1121,175 @@ impl char { matches!(self, '\0'..='\x1f' | '\x7f'..='\u{9f}') } - /// Returns `true` if this `char` has the `Grapheme_Extend` property. + /// Returns `true` if this `char` has the general category for [private-use characters]. + /// These characters do not have an interpretation specified by Unicode; individual programs + /// and users are free to assign them whatever meaning they like. /// - /// `Grapheme_Extend` is described in [Unicode Standard Annex #29 (Unicode Text - /// Segmentation)][uax29] and specified in the [Unicode Character Database][ucd] - /// [`DerivedCoreProperties.txt`]. + /// [private-use characters]: https://www.unicode.org/faq/private_use#private_use + /// + /// Private-use characters (code points with the general category of `Co`) are [described] in Chapter 23 + /// (Special Areas and Format Characters) of the Unicode Standard, and [specified] in the + /// Unicode Character Database [`UnicodeData.txt`]. The full set of private-use characters is + /// `'\u{E000}'..='\u{F8FF}' | '\u{F0000}'..='\u{FFFFD}' | '\u{100000}'..='\u{10FFFD}'`, + /// and will never change. 
+ /// + /// [described]: https://www.unicode.org/versions/latest/core-spec/chapter-23/#G19184 + /// [specified]: https://www.unicode.org/reports/tr44/#GC_Values_Table + /// [`UnicodeData.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt /// - /// [uax29]: https://www.unicode.org/reports/tr29/ - /// [ucd]: https://www.unicode.org/reports/tr44/ - /// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt #[must_use] #[inline] - pub(crate) fn is_grapheme_extended(self) -> bool { - self > '\u{02FF}' && unicode::Grapheme_Extend(self) + const fn is_private_use(self) -> bool { + // According to + // https://www.unicode.org/policies/stability_policy.html#Property_Value, + // the set of codepoints in `Co` will never change. + // So we can just hard-code the patterns to match against instead of using a table. + matches!(self, '\u{E000}'..='\u{F8FF}' | '\u{F0000}'..='\u{FFFFD}' | '\u{100000}'..='\u{10FFFD}') } - /// Returns `true` if this `char` has the `Case_Ignorable` property. This narrow-use property - /// is used to implement context-dependent casing for the Greek letter sigma (uppercase Σ), - /// which has two lowercase forms. + /// Returns `true` if this `char` has the general category for format control characters. /// - /// `Case_Ignorable` is [described][D136] in Chapter 3 (Conformance) of the Unicode Core Specification, - /// and specified in the [Unicode Character Database][ucd] [`DerivedCoreProperties.txt`]; - /// see those resources for more information. + /// Format controls (code points with the general category of `Cf`) are [described] in Chapter 4 + /// (Character Properties) of the Unicode Standard, and [specified] in the Unicode Character + /// Database [`UnicodeData.txt`]. 
/// - /// [D136]: https://www.unicode.org/versions/latest/core-spec/chapter-3/#G63116 - /// [ucd]: https://www.unicode.org/reports/tr44/ - /// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt + /// [described]: https://www.unicode.org/versions/latest/core-spec/chapter-4/#G134153 + /// [specified]: https://www.unicode.org/reports/tr44/#GC_Values_Table + /// [`UnicodeData.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt + /// + /// # Examples + /// + /// Basic usage: + /// + /// ```ignore(private) + /// assert!('\u{AD}'.is_format_control()); // SOFT HYPHEN + /// assert!('\u{200B}'.is_format_control()); // ZERO WIDTH SPACE + /// assert!('\u{E0041}'.is_format_control()); // TAG LATIN CAPITAL LETTER A + /// assert!('۝'.is_format_control()); // ARABIC END OF AYAH + /// assert!('𓐲'.is_format_control()); // EGYPTIAN HIEROGLYPH INSERT AT TOP START + /// assert!(!'q'.is_format_control()); + /// ``` #[must_use] #[inline] - #[unstable(feature = "case_ignorable", issue = "154848")] - pub fn is_case_ignorable(self) -> bool { - if self.is_ascii() { - matches!(self, '\'' | '.' | ':' | '^' | '`') - } else { - unicode::Case_Ignorable(self) - } + fn is_format_control(self) -> bool { + self > '\u{AC}' && unicode::Cf(self) } - /// Returns `true` if this `char` has one of the general categories for numbers. + /// Returns `true` if this `char` has not yet been assigned a meaning by Unicode, as of + /// [`UNICODE_VERSION`]. /// - /// The general categories for numbers (`Nd` for decimal digits, `Nl` for letter-like numeric - /// characters, and `No` for other numeric characters) are specified in the [Unicode Character - /// Database][ucd] [`UnicodeData.txt`]. + /// [`UNICODE_VERSION`]: Self::UNICODE_VERSION /// - /// This method doesn't cover everything that could be considered a number, e.g. ideographic numbers like '三'. 
- /// If you want everything including characters with overlapping purposes then you might want to use - /// a unicode or language-processing library that exposes the appropriate character properties instead - /// of looking at the unicode categories. + /// These characters may have a meaning assigned in the future, + /// except for the 66 [noncharacters] which will never be assigned a meaning. /// - /// If you want to parse ASCII decimal digits (0-9) or ASCII base-N, use - /// `is_ascii_digit` or `is_digit` instead. + /// [noncharacters]: https://www.unicode.org/faq/private_use#noncharacters /// - /// [Unicode Standard]: https://www.unicode.org/versions/latest/ - /// [ucd]: https://www.unicode.org/reports/tr44/ + /// Many of Unicode's [stability policies] apply only to assigned characters. + /// + /// [stability policies]: https://www.unicode.org/policies/stability_policy.html + /// + /// Unassigned characters (code points with the general category of `Cn`) are [described] in Chapter 4 + /// (Character Properties) of the Unicode Standard, and [specified] in the Unicode Character Database + /// by their exclusion from [`UnicodeData.txt`]. 
+ /// + /// [described]: https://www.unicode.org/versions/latest/core-spec/chapter-4/#G134153 + /// [specified]: https://www.unicode.org/reports/tr44/#GC_Values_Table /// [`UnicodeData.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt /// /// # Examples /// /// Basic usage: /// - /// ``` - /// assert!('٣'.is_numeric()); - /// assert!('7'.is_numeric()); - /// assert!('৬'.is_numeric()); - /// assert!('¾'.is_numeric()); - /// assert!('①'.is_numeric()); - /// assert!(!'K'.is_numeric()); - /// assert!(!'و'.is_numeric()); - /// assert!(!'藏'.is_numeric()); - /// assert!(!'三'.is_numeric()); + /// ```ignore(private) + /// assert!('\u{FFFE}'.is_unassigned()); // noncharacter, will never be assigned + /// + /// //assert!('\u{7AAAA}'.is_unassigned()); // not currently assigned, but may be in the future, + /// // so we shouldn't rely on the current status + /// + /// assert!(!'γ'.is_unassigned()); // once a character is assigned, it stays assigned forever /// ``` #[must_use] - #[stable(feature = "rust1", since = "1.0.0")] #[inline] - pub fn is_numeric(self) -> bool { + fn is_unassigned(self) -> bool { match self { - '0'..='9' => true, - '\0'..='\u{B1}' => false, - _ => unicode::N(self), + '\0'..='\u{377}' => false, + '\u{378}'..='\u{3FFFD}' => unicode::Cn_planes_0_3(self), + // Assigned character ranges in planes 4 and above. + // `src/tools/unicode-table-generator/src/main.rs` asserts that this is correct + '\u{E0001}' + | '\u{E0020}'..='\u{E007F}' + | '\u{E0100}'..='\u{E01EF}' + | '\u{F0000}'..='\u{FFFFD}' + | '\u{100000}'..='\u{10FFFD}' => false, + _ => true, + } + } + + /// Returns `true` if this `char` has the `Default_Ignorable_Code_Point` property. + /// These characters [should be displayed as invisible in fallback rendering](https://www.unicode.org/faq/unsup_char#3). 
+ /// + /// `Default_Ignorable_Code_Point` is [described] in Chapter 5 (Implementation Guidelines) of the Unicode Standard, + /// and [specified] in the Unicode Character Database [`DerivedCoreProperties.txt`]. + /// + /// [described]: https://www.unicode.org/versions/latest/core-spec/chapter-5/#G40120 + /// [specified]: https://www.unicode.org/reports/tr44/#Default_Ignorable_Code_Point + /// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt + /// + /// # Examples + /// + /// Basic usage: + /// + /// ```ignore(private) + /// assert!('\u{AD}'.is_default_ignorable()); // SOFT HYPHEN + /// assert!('\u{115F}'.is_default_ignorable()); // HANGUL CHOSEONG FILLER + /// assert!('\u{200B}'.is_default_ignorable()); // ZERO WIDTH SPACE + /// assert!('\u{E0041}'.is_default_ignorable()); // TAG LATIN CAPITAL LETTER A + /// assert!(!'۝'.is_default_ignorable()); // ARABIC END OF AYAH + /// assert!(!'𓐲'.is_default_ignorable()); // EGYPTIAN HIEROGLYPH INSERT AT TOP START + /// assert!(!' '.is_default_ignorable()); + /// assert!(!'\n'.is_default_ignorable()); + /// assert!(!'\0'.is_default_ignorable()); + /// assert!(!'q'.is_default_ignorable()); + /// ``` + #[must_use] + #[inline] + fn is_default_ignorable(self) -> bool { + self > '\u{AC}' && unicode::Default_Ignorable_Code_Point(self) + } + + /// Returns `true` if this `char` has the `Grapheme_Extend` property. + /// + /// `Grapheme_Extend` is [described] in Chapter 3 (Conformance) of the Unicode Standard, + /// and [specified] in the Unicode Character Database [`DerivedCoreProperties.txt`]. 
+ /// + /// [described]: https://www.unicode.org/versions/latest/core-spec/chapter-3/#G41165 + /// [specified]: https://www.unicode.org/reports/tr44/#Grapheme_Extend + /// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt + #[must_use] + #[inline] + fn is_grapheme_extender(self) -> bool { + self > '\u{02FF}' && unicode::Grapheme_Extend(self) + } + + /// Returns `true` if this `char` has the `Case_Ignorable` property. This narrow-use property + /// is used to implement context-dependent casing for the Greek letter sigma (uppercase Σ), + /// which has two lowercase forms. + /// + /// `Case_Ignorable` is [described] in Chapter 3 (Conformance) of the Unicode Core Specification, + /// and [specified] in the Unicode Character Database [`DerivedCoreProperties.txt`]; + /// see those resources for more information. + /// + /// [described]: https://www.unicode.org/versions/latest/core-spec/chapter-3/#G63116 + /// [specified]: https://www.unicode.org/reports/tr44/#Case_Ignorable + /// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt + #[must_use] + #[inline] + #[unstable(feature = "case_ignorable", issue = "154848")] + pub fn is_case_ignorable(self) -> bool { + if self.is_ascii() { + matches!(self, '\'' | '.' | ':' | '^' | '`') + } else { + unicode::Case_Ignorable(self) } } @@ -2151,8 +2304,8 @@ impl char { } pub(crate) struct EscapeDebugExtArgs { - /// Escape Extended Grapheme codepoints? - pub(crate) escape_grapheme_extended: bool, + /// Escape Grapheme Extender codepoints? + pub(crate) escape_grapheme_extender: bool, /// Escape single quotes? 
pub(crate) escape_single_quote: bool, @@ -2163,7 +2316,7 @@ pub(crate) struct EscapeDebugExtArgs { impl EscapeDebugExtArgs { pub(crate) const ESCAPE_ALL: Self = Self { - escape_grapheme_extended: true, + escape_grapheme_extender: true, escape_single_quote: true, escape_double_quote: true, }; diff --git a/library/core/src/fmt/mod.rs b/library/core/src/fmt/mod.rs index 9e5f693246f33..00694a653be2d 100644 --- a/library/core/src/fmt/mod.rs +++ b/library/core/src/fmt/mod.rs @@ -2941,7 +2941,7 @@ impl Debug for str { let mut chars = rest.chars(); if let Some(c) = chars.next() { let esc = c.escape_debug_ext(EscapeDebugExtArgs { - escape_grapheme_extended: true, + escape_grapheme_extender: true, escape_single_quote: false, escape_double_quote: true, }); @@ -2973,7 +2973,7 @@ impl Debug for char { fn fmt(&self, f: &mut Formatter<'_>) -> Result { f.write_char('\'')?; let esc = self.escape_debug_ext(EscapeDebugExtArgs { - escape_grapheme_extended: true, + escape_grapheme_extender: true, escape_single_quote: true, escape_double_quote: false, }); diff --git a/library/core/src/str/lossy.rs b/library/core/src/str/lossy.rs index d2dc650910f63..847ff265d3608 100644 --- a/library/core/src/str/lossy.rs +++ b/library/core/src/str/lossy.rs @@ -123,7 +123,7 @@ impl fmt::Debug for Debug<'_> { let mut from = 0; for (i, c) in valid.char_indices() { let esc = c.escape_debug_ext(EscapeDebugExtArgs { - escape_grapheme_extended: true, + escape_grapheme_extender: true, escape_single_quote: false, escape_double_quote: true, }); diff --git a/library/core/src/str/mod.rs b/library/core/src/str/mod.rs index 5af399ab1b34c..1fd3125154322 100644 --- a/library/core/src/str/mod.rs +++ b/library/core/src/str/mod.rs @@ -3195,7 +3195,7 @@ impl_fn_for_zst! 
{ #[derive(Clone)] struct CharEscapeDebugContinue impl Fn = |c: char| -> char::EscapeDebug { c.escape_debug_ext(EscapeDebugExtArgs { - escape_grapheme_extended: false, + escape_grapheme_extender: false, escape_single_quote: true, escape_double_quote: true }) diff --git a/library/core/src/unicode/mod.rs b/library/core/src/unicode/mod.rs index 8b2c526a08878..1648795facd40 100644 --- a/library/core/src/unicode/mod.rs +++ b/library/core/src/unicode/mod.rs @@ -9,6 +9,9 @@ pub use unicode_data::conversions; #[rustfmt::skip] pub(crate) use unicode_data::alphabetic::lookup as Alphabetic; pub(crate) use unicode_data::case_ignorable::lookup as Case_Ignorable; +pub(crate) use unicode_data::cf::lookup as Cf; +pub(crate) use unicode_data::cn_planes_0_3::lookup as Cn_planes_0_3; +pub(crate) use unicode_data::default_ignorable_code_point::lookup as Default_Ignorable_Code_Point; pub(crate) use unicode_data::grapheme_extend::lookup as Grapheme_Extend; pub(crate) use unicode_data::lowercase::lookup as Lowercase; pub(crate) use unicode_data::lt::lookup as Lt; @@ -16,8 +19,6 @@ pub(crate) use unicode_data::n::lookup as N; pub(crate) use unicode_data::uppercase::lookup as Uppercase; pub(crate) use unicode_data::white_space::lookup as White_Space; -pub(crate) mod printable; - #[allow(unreachable_pub)] pub mod unicode_data; @@ -27,8 +28,13 @@ pub mod unicode_data; /// New versions of Unicode are released regularly and subsequently all methods /// in the standard library depending on Unicode are updated. Therefore the /// behavior of some `char` and `str` methods and the value of this constant -/// changes over time. This is *not* considered to be a breaking change. +/// change over time, within the boundaries of Unicode's [stability policies]. +/// This is *not* considered to be a breaking change. 
+/// +/// [stability policies]: https://www.unicode.org/policies/stability_policy.html /// /// The version numbering scheme is explained in -/// [Unicode 11.0 or later, Section 3.1 Versions of the Unicode Standard](https://www.unicode.org/versions/Unicode11.0.0/ch03.pdf#page=4). +/// [Section 3.1 (Version Numbering)] of the Unicode Standard. +/// +/// [Section 3.1 (Version Numbering)]: https://www.unicode.org/versions/latest/core-spec/chapter-3/#G49512 pub const UNICODE_VERSION: (u8, u8, u8) = unicode_data::UNICODE_VERSION; diff --git a/library/core/src/unicode/printable.py b/library/core/src/unicode/printable.py deleted file mode 100755 index 260fa9f9e6ad2..0000000000000 --- a/library/core/src/unicode/printable.py +++ /dev/null @@ -1,258 +0,0 @@ -#!/usr/bin/env python - -# This script uses the following Unicode tables: -# - UnicodeData.txt - - -from collections import namedtuple -import csv -import os -import subprocess - -NUM_CODEPOINTS = 0x110000 - - -def to_ranges(iter): - current = None - for i in iter: - if current is None or i != current[1] or i in (0x10000, 0x20000): - if current is not None: - yield tuple(current) - current = [i, i + 1] - else: - current[1] += 1 - if current is not None: - yield tuple(current) - - -def get_escaped(codepoints): - for c in codepoints: - if (c.class_ or "Cn") in "Cc Cf Cs Co Cn Zl Zp Zs".split() and c.value != ord( - " " - ): - yield c.value - - -def get_file(f): - try: - return open(os.path.basename(f)) - except FileNotFoundError: - subprocess.run(["curl", "-O", f], check=True) - return open(os.path.basename(f)) - - -Codepoint = namedtuple("Codepoint", "value class_") - - -def get_codepoints(f): - r = csv.reader(f, delimiter=";") - prev_codepoint = 0 - class_first = None - for row in r: - codepoint = int(row[0], 16) - name = row[1] - class_ = row[2] - - if class_first is not None: - if not name.endswith("Last>"): - raise ValueError("Missing Last after First") - - for c in range(prev_codepoint + 1, codepoint): - yield 
Codepoint(c, class_first) - - class_first = None - if name.endswith("First>"): - class_first = class_ - - yield Codepoint(codepoint, class_) - prev_codepoint = codepoint - - if class_first is not None: - raise ValueError("Missing Last after First") - - for c in range(prev_codepoint + 1, NUM_CODEPOINTS): - yield Codepoint(c, None) - - -def compress_singletons(singletons): - uppers = [] # (upper, # items in lowers) - lowers = [] - - for i in singletons: - upper = i >> 8 - lower = i & 0xFF - if len(uppers) == 0 or uppers[-1][0] != upper: - uppers.append((upper, 1)) - else: - upper, count = uppers[-1] - uppers[-1] = upper, count + 1 - lowers.append(lower) - - return uppers, lowers - - -def compress_normal(normal): - # lengths 0x00..0x7f are encoded as 00, 01, ..., 7e, 7f - # lengths 0x80..0x7fff are encoded as 80 80, 80 81, ..., ff fe, ff ff - compressed = [] # [truelen, (truelenaux), falselen, (falselenaux)] - - prev_start = 0 - for start, count in normal: - truelen = start - prev_start - falselen = count - prev_start = start + count - - assert truelen < 0x8000 and falselen < 0x8000 - entry = [] - if truelen > 0x7F: - entry.append(0x80 | (truelen >> 8)) - entry.append(truelen & 0xFF) - else: - entry.append(truelen & 0x7F) - if falselen > 0x7F: - entry.append(0x80 | (falselen >> 8)) - entry.append(falselen & 0xFF) - else: - entry.append(falselen & 0x7F) - - compressed.append(entry) - - return compressed - - -def print_singletons(uppers, lowers, uppersname, lowersname): - print("#[rustfmt::skip]") - print("const {}: &[(u8, u8)] = &[".format(uppersname)) - for u, c in uppers: - print(" ({:#04x}, {}),".format(u, c)) - print("];") - print("#[rustfmt::skip]") - print("const {}: &[u8] = &[".format(lowersname)) - for i in range(0, len(lowers), 8): - print( - " {}".format(" ".join("{:#04x},".format(x) for x in lowers[i : i + 8])) - ) - print("];") - - -def print_normal(normal, normalname): - print("#[rustfmt::skip]") - print("const {}: &[u8] = &[".format(normalname)) - for v 
in normal: - print(" {}".format(" ".join("{:#04x},".format(i) for i in v))) - print("];") - - -def main(): - file = get_file("https://www.unicode.org/Public/UNIDATA/UnicodeData.txt") - - codepoints = get_codepoints(file) - - CUTOFF = 0x10000 - singletons0 = [] - singletons1 = [] - normal0 = [] - normal1 = [] - extra = [] - - for a, b in to_ranges(get_escaped(codepoints)): - if a > 2 * CUTOFF: - extra.append((a, b - a)) - elif a == b - 1: - if a & CUTOFF: - singletons1.append(a & ~CUTOFF) - else: - singletons0.append(a) - elif a == b - 2: - if a & CUTOFF: - singletons1.append(a & ~CUTOFF) - singletons1.append((a + 1) & ~CUTOFF) - else: - singletons0.append(a) - singletons0.append(a + 1) - else: - if a >= 2 * CUTOFF: - extra.append((a, b - a)) - elif a & CUTOFF: - normal1.append((a & ~CUTOFF, b - a)) - else: - normal0.append((a, b - a)) - - singletons0u, singletons0l = compress_singletons(singletons0) - singletons1u, singletons1l = compress_singletons(singletons1) - normal0 = compress_normal(normal0) - normal1 = compress_normal(normal1) - - print("""\ -// NOTE: The following code was generated by "library/core/src/unicode/printable.py", -// do not edit directly! 
- -fn check(x: u16, singletonuppers: &[(u8, u8)], singletonlowers: &[u8], normal: &[u8]) -> bool { - let xupper = (x >> 8) as u8; - let mut lowerstart = 0; - for &(upper, lowercount) in singletonuppers { - let lowerend = lowerstart + lowercount as usize; - if xupper == upper { - for &lower in &singletonlowers[lowerstart..lowerend] { - if lower == x as u8 { - return false; - } - } - } else if xupper < upper { - break; - } - lowerstart = lowerend; - } - - let mut x = x as i32; - let mut normal = normal.iter().cloned(); - let mut current = true; - while let Some(v) = normal.next() { - let len = if v & 0x80 != 0 { - ((v & 0x7f) as i32) << 8 | normal.next().unwrap() as i32 - } else { - v as i32 - }; - x -= len; - if x < 0 { - break; - } - current = !current; - } - current -} - -pub(crate) fn is_printable(x: char) -> bool { - let x = x as u32; - let lower = x as u16; - - if x < 32 { - // ASCII fast path - false - } else if x < 127 { - // ASCII fast path - true - } else if x < 0x10000 { - check(lower, SINGLETONS0U, SINGLETONS0L, NORMAL0) - } else if x < 0x20000 { - check(lower, SINGLETONS1U, SINGLETONS1L, NORMAL1) - } else {\ -""") - for a, b in extra: - print(" if 0x{:x} <= x && x < 0x{:x} {{".format(a, a + b)) - print(" return false;") - print(" }") - print("""\ - true - } -}\ -""") - print() - print_singletons(singletons0u, singletons0l, "SINGLETONS0U", "SINGLETONS0L") - print_singletons(singletons1u, singletons1l, "SINGLETONS1U", "SINGLETONS1L") - print_normal(normal0, "NORMAL0") - print_normal(normal1, "NORMAL1") - - -if __name__ == "__main__": - main() diff --git a/library/core/src/unicode/printable.rs b/library/core/src/unicode/printable.rs deleted file mode 100644 index 68e1c8ae31c06..0000000000000 --- a/library/core/src/unicode/printable.rs +++ /dev/null @@ -1,608 +0,0 @@ -// NOTE: The following code was generated by "library/core/src/unicode/printable.py", -// do not edit directly! 
- -fn check(x: u16, singletonuppers: &[(u8, u8)], singletonlowers: &[u8], normal: &[u8]) -> bool { - let xupper = (x >> 8) as u8; - let mut lowerstart = 0; - for &(upper, lowercount) in singletonuppers { - let lowerend = lowerstart + lowercount as usize; - if xupper == upper { - for &lower in &singletonlowers[lowerstart..lowerend] { - if lower == x as u8 { - return false; - } - } - } else if xupper < upper { - break; - } - lowerstart = lowerend; - } - - let mut x = x as i32; - let mut normal = normal.iter().cloned(); - let mut current = true; - while let Some(v) = normal.next() { - let len = if v & 0x80 != 0 { - ((v & 0x7f) as i32) << 8 | normal.next().unwrap() as i32 - } else { - v as i32 - }; - x -= len; - if x < 0 { - break; - } - current = !current; - } - current -} - -pub(crate) fn is_printable(x: char) -> bool { - let x = x as u32; - let lower = x as u16; - - if x < 32 { - // ASCII fast path - false - } else if x < 127 { - // ASCII fast path - true - } else if x < 0x10000 { - check(lower, SINGLETONS0U, SINGLETONS0L, NORMAL0) - } else if x < 0x20000 { - check(lower, SINGLETONS1U, SINGLETONS1L, NORMAL1) - } else { - if 0x2a6e0 <= x && x < 0x2a700 { - return false; - } - if 0x2b81e <= x && x < 0x2b820 { - return false; - } - if 0x2ceae <= x && x < 0x2ceb0 { - return false; - } - if 0x2ebe1 <= x && x < 0x2ebf0 { - return false; - } - if 0x2ee5e <= x && x < 0x2f800 { - return false; - } - if 0x2fa1e <= x && x < 0x30000 { - return false; - } - if 0x3134b <= x && x < 0x31350 { - return false; - } - if 0x3347a <= x && x < 0xe0100 { - return false; - } - if 0xe01f0 <= x && x < 0x110000 { - return false; - } - true - } -} - -#[rustfmt::skip] -const SINGLETONS0U: &[(u8, u8)] = &[ - (0x00, 1), - (0x03, 5), - (0x05, 6), - (0x06, 2), - (0x07, 6), - (0x08, 7), - (0x09, 17), - (0x0a, 28), - (0x0b, 25), - (0x0c, 25), - (0x0d, 16), - (0x0e, 12), - (0x0f, 4), - (0x10, 3), - (0x12, 18), - (0x13, 9), - (0x16, 1), - (0x17, 4), - (0x18, 1), - (0x19, 3), - (0x1a, 9), - (0x1b, 1), - 
(0x1c, 2), - (0x1f, 22), - (0x20, 3), - (0x2b, 2), - (0x2d, 11), - (0x2e, 1), - (0x30, 4), - (0x31, 2), - (0x32, 1), - (0xa9, 2), - (0xaa, 4), - (0xab, 8), - (0xfa, 2), - (0xfb, 5), - (0xfe, 3), - (0xff, 9), -]; -#[rustfmt::skip] -const SINGLETONS0L: &[u8] = &[ - 0xad, 0x78, 0x79, 0x8b, 0x8d, 0xa2, 0x30, 0x57, - 0x58, 0x8b, 0x8c, 0x90, 0x1c, 0xdd, 0x0e, 0x0f, - 0x4b, 0x4c, 0xfb, 0xfc, 0x2e, 0x2f, 0x3f, 0x5c, - 0x5d, 0x5f, 0xe2, 0x84, 0x8d, 0x8e, 0x91, 0x92, - 0xa9, 0xb1, 0xba, 0xbb, 0xc5, 0xc6, 0xc9, 0xca, - 0xde, 0xe4, 0xe5, 0xff, 0x00, 0x04, 0x11, 0x12, - 0x29, 0x31, 0x34, 0x37, 0x3a, 0x3b, 0x3d, 0x49, - 0x4a, 0x5d, 0x84, 0x8e, 0x92, 0xa9, 0xb1, 0xb4, - 0xba, 0xbb, 0xc6, 0xca, 0xce, 0xcf, 0xe4, 0xe5, - 0x00, 0x04, 0x0d, 0x0e, 0x11, 0x12, 0x29, 0x31, - 0x34, 0x3a, 0x3b, 0x45, 0x46, 0x49, 0x4a, 0x5e, - 0x64, 0x65, 0x84, 0x91, 0x9b, 0x9d, 0xc9, 0xce, - 0xcf, 0x0d, 0x11, 0x29, 0x3a, 0x3b, 0x45, 0x49, - 0x57, 0x5b, 0x5e, 0x5f, 0x64, 0x65, 0x8d, 0x91, - 0xa9, 0xb4, 0xba, 0xbb, 0xc5, 0xc9, 0xdf, 0xe4, - 0xe5, 0xf0, 0x0d, 0x11, 0x45, 0x49, 0x64, 0x65, - 0x80, 0x84, 0xb2, 0xbc, 0xbe, 0xbf, 0xd5, 0xd7, - 0xf0, 0xf1, 0x83, 0x85, 0x8b, 0xa4, 0xa6, 0xbe, - 0xbf, 0xc5, 0xc7, 0xcf, 0xda, 0xdb, 0x48, 0x98, - 0xbd, 0xcd, 0xc6, 0xce, 0xcf, 0x49, 0x4e, 0x4f, - 0x57, 0x59, 0x5e, 0x5f, 0x89, 0x8e, 0x8f, 0xb1, - 0xb6, 0xb7, 0xbf, 0xc1, 0xc6, 0xc7, 0xd7, 0x11, - 0x16, 0x17, 0x5b, 0x5c, 0xf6, 0xf7, 0xfe, 0xff, - 0x80, 0x6d, 0x71, 0xde, 0xdf, 0x0e, 0x1f, 0x6e, - 0x6f, 0x1c, 0x1d, 0x5f, 0x7d, 0x7e, 0xae, 0xaf, - 0xde, 0xdf, 0x4d, 0xbb, 0xbc, 0x16, 0x17, 0x1e, - 0x1f, 0x46, 0x47, 0x4e, 0x4f, 0x58, 0x5a, 0x5c, - 0x5e, 0x7e, 0x7f, 0xb5, 0xc5, 0xd4, 0xd5, 0xdc, - 0xf0, 0xf1, 0xf5, 0x72, 0x73, 0x8f, 0x74, 0x75, - 0x26, 0x2e, 0x2f, 0xa7, 0xaf, 0xb7, 0xbf, 0xc7, - 0xcf, 0xd7, 0xdf, 0x9a, 0x00, 0x40, 0x97, 0x98, - 0x30, 0x8f, 0x1f, 0xce, 0xff, 0x4e, 0x4f, 0x5a, - 0x5b, 0x07, 0x08, 0x0f, 0x10, 0x27, 0x2f, 0xee, - 0xef, 0x6e, 0x6f, 0x37, 0x3d, 0x3f, 0x42, 0x45, - 0x53, 0x67, 0x75, 0xc8, 0xc9, 0xd0, 
0xd1, 0xd8, - 0xd9, 0xe7, 0xfe, 0xff, -]; -#[rustfmt::skip] -const SINGLETONS1U: &[(u8, u8)] = &[ - (0x00, 6), - (0x01, 1), - (0x03, 1), - (0x04, 2), - (0x05, 7), - (0x07, 2), - (0x08, 8), - (0x09, 2), - (0x0a, 5), - (0x0b, 2), - (0x0e, 4), - (0x10, 1), - (0x11, 2), - (0x12, 5), - (0x13, 28), - (0x14, 1), - (0x15, 2), - (0x17, 2), - (0x19, 13), - (0x1c, 5), - (0x1d, 8), - (0x1f, 1), - (0x24, 1), - (0x6a, 4), - (0x6b, 2), - (0x6e, 2), - (0xaf, 3), - (0xb1, 2), - (0xbc, 2), - (0xcf, 2), - (0xd1, 2), - (0xd4, 12), - (0xd5, 9), - (0xd6, 2), - (0xd7, 2), - (0xda, 1), - (0xe0, 5), - (0xe1, 2), - (0xe6, 1), - (0xe7, 4), - (0xe8, 2), - (0xee, 32), - (0xf0, 4), - (0xf8, 2), - (0xfa, 5), - (0xfb, 1), -]; -#[rustfmt::skip] -const SINGLETONS1L: &[u8] = &[ - 0x0c, 0x27, 0x3b, 0x3e, 0x4e, 0x4f, 0x8f, 0x9e, - 0x9e, 0x9f, 0x7b, 0x8b, 0x93, 0x96, 0xa2, 0xb2, - 0xba, 0x86, 0xb1, 0x06, 0x07, 0x09, 0x36, 0x3d, - 0x3e, 0x56, 0xf3, 0xd0, 0xd1, 0x04, 0x14, 0x18, - 0x36, 0x37, 0x56, 0x57, 0x7f, 0xaa, 0xae, 0xaf, - 0xbd, 0x35, 0xe0, 0x12, 0x87, 0x89, 0x8e, 0x9e, - 0x04, 0x0d, 0x0e, 0x11, 0x12, 0x29, 0x31, 0x34, - 0x3a, 0x45, 0x46, 0x49, 0x4a, 0x4e, 0x4f, 0x64, - 0x65, 0x8a, 0x8c, 0x8d, 0x8f, 0xb6, 0xc1, 0xc3, - 0xc4, 0xc6, 0xcb, 0xd6, 0x5c, 0xb6, 0xb7, 0x1b, - 0x1c, 0x07, 0x08, 0x0a, 0x0b, 0x14, 0x17, 0x36, - 0x39, 0x3a, 0xa8, 0xa9, 0xd8, 0xd9, 0x09, 0x37, - 0x90, 0x91, 0xa8, 0x07, 0x0a, 0x3b, 0x3e, 0x66, - 0x69, 0x8f, 0x92, 0x11, 0x6f, 0x5f, 0xbf, 0xee, - 0xef, 0x5a, 0x62, 0xb9, 0xba, 0xf4, 0xfc, 0xff, - 0x53, 0x54, 0x9a, 0x9b, 0x2e, 0x2f, 0x27, 0x28, - 0x55, 0x9d, 0xa0, 0xa1, 0xa3, 0xa4, 0xa7, 0xa8, - 0xad, 0xba, 0xbc, 0xc4, 0x06, 0x0b, 0x0c, 0x15, - 0x1d, 0x3a, 0x3f, 0x45, 0x51, 0xa6, 0xa7, 0xcc, - 0xcd, 0xa0, 0x07, 0x19, 0x1a, 0x22, 0x25, 0x3e, - 0x3f, 0xdf, 0xe7, 0xec, 0xef, 0xff, 0xc5, 0xc6, - 0x04, 0x20, 0x23, 0x25, 0x26, 0x28, 0x33, 0x38, - 0x3a, 0x48, 0x4a, 0x4c, 0x50, 0x53, 0x55, 0x56, - 0x58, 0x5a, 0x5c, 0x5e, 0x60, 0x63, 0x65, 0x66, - 0x6b, 0x73, 0x78, 0x7d, 0x7f, 0x8a, 0xa4, 
0xaa, - 0xaf, 0xb0, 0xc0, 0xd0, 0xae, 0xaf, 0x6e, 0x6f, - 0xc7, 0xdd, 0xde, 0x93, -]; -#[rustfmt::skip] -const NORMAL0: &[u8] = &[ - 0x00, 0x20, - 0x5f, 0x22, - 0x82, 0xdf, 0x04, - 0x82, 0x44, 0x08, - 0x1b, 0x04, - 0x06, 0x11, - 0x81, 0xac, 0x0e, - 0x80, 0xab, 0x05, - 0x20, 0x07, - 0x81, 0x1c, 0x03, - 0x19, 0x08, - 0x01, 0x04, - 0x2f, 0x04, - 0x34, 0x04, - 0x07, 0x03, - 0x01, 0x07, - 0x06, 0x07, - 0x11, 0x0a, - 0x50, 0x0f, - 0x12, 0x07, - 0x55, 0x07, - 0x03, 0x04, - 0x1c, 0x0a, - 0x09, 0x03, - 0x08, 0x03, - 0x07, 0x03, - 0x02, 0x03, - 0x03, 0x03, - 0x0c, 0x04, - 0x05, 0x03, - 0x0b, 0x06, - 0x01, 0x0e, - 0x15, 0x05, - 0x4e, 0x07, - 0x1b, 0x07, - 0x57, 0x07, - 0x02, 0x05, - 0x18, 0x0c, - 0x50, 0x04, - 0x43, 0x03, - 0x2d, 0x03, - 0x01, 0x04, - 0x11, 0x06, - 0x0f, 0x0c, - 0x3a, 0x04, - 0x1d, 0x25, - 0x5f, 0x20, - 0x6d, 0x04, - 0x6a, 0x25, - 0x80, 0xc8, 0x05, - 0x82, 0xb0, 0x03, - 0x1a, 0x06, - 0x82, 0xfd, 0x03, - 0x59, 0x07, - 0x16, 0x09, - 0x18, 0x09, - 0x14, 0x0c, - 0x14, 0x0c, - 0x6a, 0x06, - 0x0a, 0x06, - 0x1a, 0x06, - 0x59, 0x07, - 0x2b, 0x05, - 0x46, 0x0a, - 0x2c, 0x04, - 0x0c, 0x04, - 0x01, 0x03, - 0x31, 0x0b, - 0x2c, 0x04, - 0x1a, 0x06, - 0x0b, 0x03, - 0x80, 0xac, 0x06, - 0x0a, 0x06, - 0x4c, 0x14, - 0x80, 0xf4, 0x08, - 0x3c, 0x03, - 0x0f, 0x03, - 0x3e, 0x05, - 0x38, 0x08, - 0x2b, 0x05, - 0x82, 0xff, 0x11, - 0x18, 0x08, - 0x2f, 0x11, - 0x2d, 0x03, - 0x22, 0x0e, - 0x21, 0x0f, - 0x80, 0x8c, 0x04, - 0x82, 0x9a, 0x16, - 0x0b, 0x15, - 0x88, 0x94, 0x05, - 0x2f, 0x05, - 0x3b, 0x07, - 0x02, 0x0e, - 0x18, 0x09, - 0x80, 0xbe, 0x22, - 0x74, 0x0c, - 0x80, 0xd6, 0x1a, - 0x81, 0x10, 0x05, - 0x80, 0xe1, 0x09, - 0xf2, 0x9e, 0x03, - 0x37, 0x09, - 0x81, 0x5c, 0x14, - 0x80, 0xb8, 0x08, - 0x80, 0xdd, 0x14, - 0x3c, 0x03, - 0x0a, 0x06, - 0x38, 0x08, - 0x46, 0x08, - 0x0c, 0x06, - 0x74, 0x0b, - 0x1e, 0x03, - 0x5a, 0x04, - 0x59, 0x09, - 0x80, 0x83, 0x18, - 0x1c, 0x0a, - 0x16, 0x09, - 0x4c, 0x04, - 0x80, 0x8a, 0x06, - 0xab, 0xa4, 0x0c, - 0x17, 0x04, - 0x31, 0xa1, 0x04, - 0x81, 0xda, 
0x26, - 0x07, 0x0c, - 0x05, 0x05, - 0x82, 0xb3, 0x20, - 0x2a, 0x06, - 0x4c, 0x04, - 0x80, 0x8d, 0x04, - 0x80, 0xbe, 0x03, - 0x1b, 0x03, - 0x0f, 0x0d, -]; -#[rustfmt::skip] -const NORMAL1: &[u8] = &[ - 0x5e, 0x22, - 0x7b, 0x05, - 0x03, 0x04, - 0x2d, 0x03, - 0x66, 0x03, - 0x01, 0x2f, - 0x2e, 0x80, 0x82, - 0x1d, 0x03, - 0x31, 0x0f, - 0x1c, 0x04, - 0x24, 0x09, - 0x1e, 0x05, - 0x2b, 0x05, - 0x44, 0x04, - 0x0e, 0x2a, - 0x80, 0xaa, 0x06, - 0x24, 0x04, - 0x24, 0x04, - 0x28, 0x08, - 0x34, 0x0b, - 0x4e, 0x03, - 0x34, 0x0c, - 0x81, 0x37, 0x09, - 0x16, 0x0a, - 0x08, 0x18, - 0x3b, 0x45, - 0x39, 0x03, - 0x63, 0x08, - 0x09, 0x30, - 0x16, 0x05, - 0x21, 0x03, - 0x1b, 0x05, - 0x1b, 0x26, - 0x38, 0x04, - 0x4b, 0x05, - 0x2f, 0x04, - 0x0a, 0x07, - 0x09, 0x07, - 0x40, 0x20, - 0x27, 0x04, - 0x0c, 0x09, - 0x36, 0x03, - 0x3a, 0x05, - 0x1a, 0x07, - 0x04, 0x0c, - 0x07, 0x50, - 0x49, 0x37, - 0x33, 0x0d, - 0x33, 0x07, - 0x2e, 0x08, - 0x0a, 0x06, - 0x26, 0x03, - 0x1d, 0x08, - 0x02, 0x80, 0xd0, - 0x52, 0x10, - 0x06, 0x08, - 0x09, 0x21, - 0x2e, 0x08, - 0x2a, 0x16, - 0x1a, 0x26, - 0x1c, 0x14, - 0x17, 0x09, - 0x4e, 0x04, - 0x24, 0x09, - 0x44, 0x0d, - 0x19, 0x07, - 0x0a, 0x06, - 0x48, 0x08, - 0x27, 0x09, - 0x75, 0x0b, - 0x42, 0x3e, - 0x2a, 0x06, - 0x3b, 0x05, - 0x0a, 0x06, - 0x51, 0x06, - 0x01, 0x05, - 0x10, 0x03, - 0x05, 0x0b, - 0x59, 0x08, - 0x02, 0x1d, - 0x62, 0x1e, - 0x48, 0x08, - 0x0a, 0x80, 0xa6, - 0x5e, 0x22, - 0x45, 0x0b, - 0x0a, 0x06, - 0x0d, 0x13, - 0x3a, 0x06, - 0x0a, 0x06, - 0x14, 0x1c, - 0x2c, 0x04, - 0x17, 0x80, 0xb9, - 0x3c, 0x64, - 0x53, 0x0c, - 0x48, 0x09, - 0x0a, 0x46, - 0x45, 0x1b, - 0x48, 0x08, - 0x53, 0x0d, - 0x49, 0x07, - 0x0a, 0x56, - 0x08, 0x58, - 0x22, 0x0e, - 0x0a, 0x06, - 0x46, 0x0a, - 0x1d, 0x03, - 0x47, 0x49, - 0x37, 0x03, - 0x0e, 0x08, - 0x0a, 0x06, - 0x39, 0x07, - 0x0a, 0x06, - 0x2c, 0x04, - 0x0a, 0x80, 0xf6, - 0x19, 0x07, - 0x3b, 0x03, - 0x1d, 0x55, - 0x01, 0x0f, - 0x32, 0x0d, - 0x83, 0x9b, 0x66, - 0x75, 0x0b, - 0x80, 0xc4, 0x8a, 0x4c, - 0x63, 0x0d, - 0x84, 0x30, 
0x10, - 0x16, 0x0a, - 0x8f, 0x9b, 0x05, - 0x82, 0x47, 0x9a, 0xb9, - 0x3a, 0x86, 0xc6, - 0x82, 0x39, 0x07, - 0x2a, 0x04, - 0x5c, 0x06, - 0x26, 0x0a, - 0x46, 0x0a, - 0x28, 0x05, - 0x13, 0x81, 0xb0, - 0x3a, 0x80, 0xc6, - 0x5b, 0x05, - 0x34, 0x2c, - 0x4b, 0x04, - 0x39, 0x07, - 0x11, 0x40, - 0x05, 0x0b, - 0x07, 0x09, - 0x9c, 0xd6, 0x29, - 0x20, 0x61, - 0x73, 0xa1, 0xfd, - 0x81, 0x33, 0x0f, - 0x01, 0x1d, - 0x06, 0x0e, - 0x04, 0x08, - 0x81, 0x8c, 0x89, 0x04, - 0x6b, 0x05, - 0x0d, 0x03, - 0x09, 0x07, - 0x10, 0x8f, 0x60, - 0x80, 0xfd, 0x03, - 0x81, 0xb4, 0x06, - 0x17, 0x0f, - 0x11, 0x0f, - 0x47, 0x09, - 0x74, 0x3c, - 0x80, 0xf6, 0x0a, - 0x73, 0x08, - 0x70, 0x15, - 0x46, 0x7a, - 0x14, 0x0c, - 0x14, 0x0c, - 0x57, 0x09, - 0x19, 0x80, 0x87, - 0x81, 0x47, 0x03, - 0x85, 0x42, 0x0f, - 0x15, 0x84, 0x50, - 0x1f, 0x06, - 0x06, 0x80, 0xd5, - 0x2b, 0x05, - 0x3e, 0x21, - 0x01, 0x70, - 0x2d, 0x03, - 0x1a, 0x04, - 0x02, 0x81, 0x40, - 0x1f, 0x11, - 0x3a, 0x05, - 0x01, 0x81, 0xd0, - 0x2a, 0x80, 0xd6, - 0x2b, 0x04, - 0x01, 0x80, 0xc0, - 0x36, 0x08, - 0x02, 0x80, 0xe0, - 0x80, 0xf7, 0x29, - 0x4c, 0x04, - 0x0a, 0x04, - 0x02, 0x83, 0x11, - 0x44, 0x4c, - 0x3d, 0x80, 0xc2, - 0x3c, 0x06, - 0x01, 0x04, - 0x55, 0x05, - 0x1b, 0x34, - 0x02, 0x81, 0x0e, - 0x2c, 0x04, - 0x64, 0x0c, - 0x56, 0x0a, - 0x80, 0xae, 0x38, - 0x1d, 0x0d, - 0x2c, 0x04, - 0x09, 0x07, - 0x02, 0x0e, - 0x06, 0x80, 0x9a, - 0x83, 0xd9, 0x03, - 0x11, 0x03, - 0x0d, 0x03, - 0x80, 0xda, 0x06, - 0x0c, 0x04, - 0x01, 0x0f, - 0x0c, 0x04, - 0x38, 0x08, - 0x0a, 0x06, - 0x28, 0x08, - 0x2c, 0x04, - 0x02, 0x0e, - 0x09, 0x27, - 0x81, 0x58, 0x08, - 0x1d, 0x03, - 0x0b, 0x03, - 0x3b, 0x04, - 0x1e, 0x04, - 0x0a, 0x07, - 0x80, 0xfb, 0x84, 0x05, -]; diff --git a/library/core/src/unicode/unicode_data.rs b/library/core/src/unicode/unicode_data.rs index 83d3808051840..7154da4d23181 100644 --- a/library/core/src/unicode/unicode_data.rs +++ b/library/core/src/unicode/unicode_data.rs @@ -1,16 +1,19 @@ //! 
This file is generated by `./x run src/tools/unicode-table-generator`; do not edit manually! -// Alphabetic : 1723 bytes, 147369 codepoints in 759 ranges (U+0000AA - U+03347A) using skiplist -// Case_Ignorable : 1063 bytes, 2789 codepoints in 459 ranges (U+0000A8 - U+0E01F0) using skiplist -// Grapheme_Extend : 899 bytes, 2232 codepoints in 383 ranges (U+000300 - U+0E01F0) using skiplist -// Lowercase : 943 bytes, 2569 codepoints in 676 ranges (U+0000AA - U+01E944) using bitset -// Lt : 33 bytes, 31 codepoints in 10 ranges (U+0001C5 - U+001FFD) using skiplist -// N : 463 bytes, 1914 codepoints in 145 ranges (U+0000B2 - U+01FBFA) using skiplist -// Uppercase : 799 bytes, 1980 codepoints in 659 ranges (U+0000C0 - U+01F18A) using bitset -// White_Space : 256 bytes, 19 codepoints in 8 ranges (U+000085 - U+003001) using cascading -// to_lower : 1112 bytes, 1462 codepoints in 185 ranges (U+0000C0 - U+01E921) using 2-level LUT -// to_upper : 1998 bytes, 1554 codepoints in 299 ranges (U+0000B5 - U+01E943) using 2-level LUT -// to_title : 340 bytes, 135 codepoints in 49 ranges (U+0000DF - U+00FB17) using 2-level LUT -// Total : 9629 bytes +// Alphabetic : 1723 bytes, 147369 codepoints in 759 ranges (U+0000AA - U+03347A) using skiplist +// Case_Ignorable : 1063 bytes, 2789 codepoints in 459 ranges (U+0000A8 - U+0E01F0) using skiplist +// Cf : 87 bytes, 170 codepoints in 21 ranges (U+0000AD - U+0E0080) using skiplist +// Cn_Planes_0_3 : 1677 bytes, 94165 codepoints in 730 ranges (U+000378 - U+03FFFE) using skiplist +// Default_Ignorable_Code_Point: 83 bytes, 4174 codepoints in 17 ranges (U+0000AD - U+0E1000) using skiplist +// Grapheme_Extend : 899 bytes, 2232 codepoints in 383 ranges (U+000300 - U+0E01F0) using skiplist +// Lowercase : 943 bytes, 2569 codepoints in 676 ranges (U+0000AA - U+01E944) using bitset +// Lt : 33 bytes, 31 codepoints in 10 ranges (U+0001C5 - U+001FFD) using skiplist +// N : 463 bytes, 1914 codepoints in 145 ranges (U+0000B2 - U+01FBFA) using 
skiplist +// Uppercase : 799 bytes, 1980 codepoints in 659 ranges (U+0000C0 - U+01F18A) using bitset +// White_Space : 256 bytes, 19 codepoints in 8 ranges (U+000085 - U+003001) using cascading +// to_lower : 1112 bytes, 1462 codepoints in 185 ranges (U+0000C0 - U+01E921) using 2-level LUT +// to_upper : 1998 bytes, 1554 codepoints in 299 ranges (U+0000B5 - U+01E943) using 2-level LUT +// to_title : 340 bytes, 135 codepoints in 49 ranges (U+0000DF - U+00FB17) using 2-level LUT +// Total : 11476 bytes #[inline(always)] const fn bitset_search< @@ -337,6 +340,193 @@ pub mod case_ignorable { } } +#[rustfmt::skip] +pub mod cf { + use super::ShortOffsetRunHeader; + + static SHORT_OFFSET_RUNS: [ShortOffsetRunHeader; 11] = [ + ShortOffsetRunHeader::new(0, 1536), ShortOffsetRunHeader::new(3, 2192), + ShortOffsetRunHeader::new(11, 6158), ShortOffsetRunHeader::new(15, 8203), + ShortOffsetRunHeader::new(17, 65279), ShortOffsetRunHeader::new(25, 69821), + ShortOffsetRunHeader::new(29, 78896), ShortOffsetRunHeader::new(33, 113824), + ShortOffsetRunHeader::new(35, 119155), ShortOffsetRunHeader::new(37, 917505), + ShortOffsetRunHeader::new(39, 2031744), + ]; + static OFFSETS: [u8; 43] = [ + 173, 1, 0, 6, 22, 1, 192, 1, 49, 1, 0, 2, 80, 1, 0, 1, 0, 5, 26, 5, 49, 5, 1, 10, 0, 1, + 249, 3, 0, 1, 15, 1, 0, 16, 0, 4, 0, 8, 0, 1, 30, 96, 0, + ]; + #[inline] + pub fn lookup(c: char) -> bool { + debug_assert!(!c.is_ascii()); + (c as u32) >= 0xad && lookup_slow(c) + } + + #[inline(never)] + fn lookup_slow(c: char) -> bool { + const { + assert!(SHORT_OFFSET_RUNS.last().unwrap().0 > char::MAX as u32); + let mut i = 0; + while i < SHORT_OFFSET_RUNS.len() { + assert!(SHORT_OFFSET_RUNS[i].start_index() < OFFSETS.len()); + i += 1; + } + } + // SAFETY: We just ensured the last element of `SHORT_OFFSET_RUNS` is greater than `std::char::MAX` + // and the start indices of all elements in `SHORT_OFFSET_RUNS` are smaller than `OFFSETS.len()`. 
+ unsafe { super::skip_search(c, &SHORT_OFFSET_RUNS, &OFFSETS) } + } +} + +#[rustfmt::skip] +pub mod cn_planes_0_3 { + use super::ShortOffsetRunHeader; + + static SHORT_OFFSET_RUNS: [ShortOffsetRunHeader; 54] = [ + ShortOffsetRunHeader::new(0, 888), ShortOffsetRunHeader::new(1, 1328), + ShortOffsetRunHeader::new(11, 1806), ShortOffsetRunHeader::new(25, 4681), + ShortOffsetRunHeader::new(325, 5789), ShortOffsetRunHeader::new(365, 7958), + ShortOffsetRunHeader::new(445, 9258), ShortOffsetRunHeader::new(491, 11124), + ShortOffsetRunHeader::new(495, 11508), ShortOffsetRunHeader::new(497, 42125), + ShortOffsetRunHeader::new(549, 42540), ShortOffsetRunHeader::new(553, 55204), + ShortOffsetRunHeader::new(605, 64110), ShortOffsetRunHeader::new(611, 64976), + ShortOffsetRunHeader::new(629, 67383), ShortOffsetRunHeader::new(735, 74650), + ShortOffsetRunHeader::new(1067, 77712), ShortOffsetRunHeader::new(1074, 78934), + ShortOffsetRunHeader::new(1077, 82939), ShortOffsetRunHeader::new(1079, 83527), + ShortOffsetRunHeader::new(1081, 90368), ShortOffsetRunHeader::new(1082, 92160), + ShortOffsetRunHeader::new(1084, 92729), ShortOffsetRunHeader::new(1085, 93504), + ShortOffsetRunHeader::new(1108, 101590), ShortOffsetRunHeader::new(1127, 110576), + ShortOffsetRunHeader::new(1132, 110883), ShortOffsetRunHeader::new(1139, 111356), + ShortOffsetRunHeader::new(1149, 113664), ShortOffsetRunHeader::new(1150, 117760), + ShortOffsetRunHeader::new(1160, 118452), ShortOffsetRunHeader::new(1163, 120486), + ShortOffsetRunHeader::new(1227, 120780), ShortOffsetRunHeader::new(1229, 121484), + ShortOffsetRunHeader::new(1231, 122624), ShortOffsetRunHeader::new(1236, 123536), + ShortOffsetRunHeader::new(1262, 124112), ShortOffsetRunHeader::new(1268, 126065), + ShortOffsetRunHeader::new(1298, 126976), ShortOffsetRunHeader::new(1370, 128729), + ShortOffsetRunHeader::new(1395, 129624), ShortOffsetRunHeader::new(1423, 131072), + ShortOffsetRunHeader::new(1444, 173792), ShortOffsetRunHeader::new(1445, 
178206), + ShortOffsetRunHeader::new(1447, 183982), ShortOffsetRunHeader::new(1449, 191457), + ShortOffsetRunHeader::new(1451, 192094), ShortOffsetRunHeader::new(1453, 194560), + ShortOffsetRunHeader::new(1454, 195102), ShortOffsetRunHeader::new(1455, 196608), + ShortOffsetRunHeader::new(1456, 201547), ShortOffsetRunHeader::new(1457, 210042), + ShortOffsetRunHeader::new(1459, 262142), ShortOffsetRunHeader::new(1460, 1376254), + ]; + static OFFSETS: [u8; 1461] = [ + 0, 2, 6, 4, 7, 1, 1, 1, 20, 1, 0, 1, 38, 2, 50, 2, 3, 1, 55, 8, 27, 4, 6, 11, 0, 1, 60, 2, + 101, 14, 59, 2, 49, 2, 15, 1, 28, 2, 1, 1, 11, 5, 34, 5, 237, 1, 8, 2, 2, 2, 22, 1, 7, 1, 1, + 3, 4, 2, 9, 2, 2, 2, 4, 8, 1, 4, 2, 1, 5, 2, 25, 2, 3, 1, 6, 4, 2, 2, 22, 1, 7, 1, 2, 1, 2, + 1, 2, 2, 1, 1, 5, 4, 2, 2, 3, 3, 1, 7, 4, 1, 1, 7, 17, 10, 3, 1, 9, 1, 3, 1, 22, 1, 7, 1, 2, + 1, 5, 2, 10, 1, 3, 1, 3, 2, 1, 15, 4, 2, 12, 7, 7, 1, 3, 1, 8, 2, 2, 2, 22, 1, 7, 1, 2, 1, + 5, 2, 9, 2, 2, 2, 3, 7, 3, 4, 2, 1, 5, 2, 18, 10, 2, 1, 6, 3, 3, 1, 4, 3, 2, 1, 1, 1, 2, 3, + 2, 3, 3, 3, 12, 4, 5, 3, 3, 1, 4, 2, 1, 6, 1, 14, 21, 5, 13, 1, 3, 1, 23, 1, 16, 2, 9, 1, 3, + 1, 4, 7, 2, 1, 3, 1, 2, 2, 4, 2, 10, 7, 22, 1, 3, 1, 23, 1, 10, 1, 5, 2, 9, 1, 3, 1, 4, 7, + 2, 5, 3, 1, 4, 2, 10, 1, 3, 12, 13, 1, 3, 1, 51, 1, 3, 1, 6, 4, 16, 2, 26, 1, 3, 1, 18, 3, + 24, 1, 9, 1, 1, 2, 7, 3, 1, 4, 6, 1, 1, 1, 8, 6, 10, 2, 3, 12, 58, 4, 29, 37, 2, 1, 1, 1, 5, + 1, 24, 1, 1, 1, 23, 2, 5, 1, 1, 1, 7, 1, 10, 2, 4, 32, 72, 1, 36, 4, 39, 1, 36, 1, 15, 1, + 13, 37, 198, 1, 1, 5, 1, 2, 0, 1, 4, 2, 7, 1, 1, 1, 4, 2, 41, 1, 4, 2, 33, 1, 4, 2, 7, 1, 1, + 1, 4, 2, 15, 1, 57, 1, 4, 2, 67, 2, 32, 3, 26, 6, 86, 2, 6, 2, 0, 3, 89, 7, 22, 9, 24, 9, + 20, 12, 13, 1, 3, 1, 2, 12, 94, 2, 10, 6, 10, 6, 26, 6, 89, 7, 43, 5, 70, 10, 31, 1, 12, 4, + 12, 4, 1, 3, 42, 2, 5, 11, 44, 4, 26, 6, 11, 3, 62, 2, 65, 1, 29, 2, 11, 6, 10, 6, 14, 2, + 46, 2, 12, 20, 77, 1, 166, 8, 60, 3, 15, 3, 62, 5, 43, 2, 11, 8, 43, 5, 0, 2, 6, 2, 38, 2, + 6, 2, 8, 1, 1, 1, 1, 1, 1, 1, 
31, 2, 53, 1, 15, 1, 14, 2, 6, 1, 19, 2, 3, 1, 9, 1, 101, 1, + 12, 2, 27, 1, 13, 3, 34, 14, 33, 15, 140, 4, 0, 22, 11, 21, 0, 2, 0, 5, 45, 1, 1, 5, 1, 2, + 56, 7, 2, 14, 24, 9, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 126, 34, 26, 1, 89, 12, + 214, 26, 80, 1, 86, 2, 103, 5, 43, 1, 94, 1, 86, 9, 48, 1, 0, 3, 55, 9, 0, 20, 184, 8, 221, + 20, 60, 3, 10, 6, 56, 8, 70, 8, 12, 6, 116, 11, 30, 3, 78, 1, 11, 4, 33, 1, 55, 9, 14, 2, + 10, 2, 103, 24, 28, 10, 6, 2, 6, 2, 6, 9, 7, 1, 7, 1, 60, 4, 126, 2, 10, 6, 0, 12, 23, 4, + 49, 4, 0, 2, 106, 38, 7, 12, 5, 5, 26, 1, 5, 1, 1, 1, 2, 1, 2, 1, 0, 32, 42, 6, 51, 1, 19, + 1, 4, 4, 5, 1, 135, 2, 1, 1, 190, 3, 6, 2, 6, 2, 6, 2, 3, 3, 7, 1, 7, 10, 5, 2, 12, 1, 26, + 1, 19, 1, 2, 1, 15, 2, 14, 34, 123, 5, 3, 4, 45, 3, 88, 1, 13, 3, 1, 47, 46, 130, 29, 3, 49, + 15, 28, 4, 36, 9, 30, 5, 43, 5, 30, 1, 37, 4, 14, 42, 158, 2, 10, 6, 36, 4, 36, 4, 40, 8, + 52, 11, 12, 1, 15, 1, 7, 1, 2, 1, 11, 1, 15, 1, 7, 1, 2, 3, 52, 12, 0, 9, 22, 10, 8, 24, 6, + 1, 42, 1, 9, 69, 6, 2, 1, 1, 44, 1, 2, 3, 1, 2, 23, 1, 72, 8, 9, 48, 19, 1, 2, 5, 33, 3, 27, + 5, 27, 38, 56, 4, 20, 2, 50, 1, 2, 5, 8, 1, 3, 1, 29, 2, 3, 4, 10, 7, 9, 7, 64, 32, 39, 4, + 12, 9, 54, 3, 29, 2, 27, 5, 26, 7, 4, 12, 7, 80, 73, 55, 51, 13, 51, 7, 46, 8, 10, 6, 38, 3, + 29, 8, 2, 208, 31, 1, 42, 1, 3, 2, 2, 16, 6, 8, 9, 33, 46, 8, 42, 22, 26, 38, 28, 20, 23, 9, + 78, 4, 36, 9, 68, 10, 1, 2, 25, 7, 10, 6, 53, 1, 18, 8, 39, 9, 96, 1, 20, 11, 18, 1, 47, 62, + 7, 1, 1, 1, 4, 1, 15, 1, 11, 6, 59, 5, 10, 6, 4, 1, 8, 2, 2, 2, 22, 1, 7, 1, 2, 1, 5, 1, 10, + 2, 2, 2, 3, 2, 1, 6, 1, 5, 7, 2, 7, 3, 5, 11, 10, 1, 1, 2, 1, 1, 38, 1, 10, 1, 1, 2, 1, 1, + 4, 1, 10, 1, 2, 8, 2, 29, 92, 1, 5, 30, 72, 8, 10, 166, 54, 2, 38, 34, 69, 11, 10, 6, 13, + 19, 58, 6, 10, 6, 20, 28, 27, 2, 15, 4, 23, 185, 60, 100, 83, 12, 8, 2, 1, 2, 8, 1, 2, 1, + 30, 1, 2, 2, 12, 9, 10, 70, 8, 2, 46, 2, 11, 27, 72, 8, 83, 13, 73, 7, 10, 86, 8, 88, 34, + 14, 10, 6, 9, 1, 45, 1, 14, 10, 29, 3, 32, 2, 22, 1, 14, 73, 7, 1, 
2, 1, 44, 3, 1, 1, 2, 1, + 9, 8, 10, 6, 6, 1, 2, 1, 37, 1, 2, 1, 6, 7, 10, 6, 44, 4, 10, 246, 25, 7, 17, 1, 41, 3, 29, + 85, 1, 15, 50, 13, 0, 102, 111, 1, 5, 11, 196, 0, 99, 13, 0, 10, 0, 5, 0, 0, 58, 0, 0, 7, + 31, 1, 10, 4, 81, 1, 10, 6, 30, 2, 6, 10, 70, 10, 10, 1, 7, 1, 21, 5, 19, 0, 58, 198, 91, 5, + 25, 2, 25, 44, 75, 4, 57, 7, 17, 64, 5, 11, 7, 9, 0, 41, 32, 97, 115, 0, 4, 1, 7, 1, 2, 1, + 0, 15, 1, 29, 3, 2, 1, 14, 4, 8, 0, 0, 107, 5, 13, 3, 9, 7, 10, 2, 8, 0, 253, 3, 0, 6, 23, + 15, 17, 15, 46, 2, 23, 9, 116, 60, 246, 10, 39, 2, 194, 21, 70, 122, 20, 12, 20, 12, 87, 9, + 25, 135, 85, 1, 71, 1, 2, 2, 1, 2, 2, 2, 4, 1, 12, 1, 1, 1, 7, 1, 65, 1, 4, 2, 8, 1, 7, 1, + 28, 1, 4, 1, 5, 1, 1, 3, 7, 1, 0, 2, 0, 2, 0, 15, 5, 1, 15, 0, 31, 6, 6, 213, 7, 1, 17, 2, + 7, 1, 2, 1, 5, 5, 62, 33, 1, 112, 45, 3, 14, 2, 10, 4, 2, 0, 31, 17, 58, 5, 1, 0, 42, 214, + 43, 4, 1, 192, 31, 1, 22, 8, 2, 224, 7, 1, 4, 1, 2, 1, 15, 1, 197, 2, 16, 41, 76, 4, 10, 4, + 2, 0, 68, 76, 61, 194, 4, 1, 27, 1, 2, 1, 1, 2, 1, 1, 10, 1, 4, 1, 1, 1, 1, 6, 1, 4, 1, 1, + 1, 1, 1, 1, 3, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 4, 1, 7, 1, 4, 1, 4, + 1, 1, 1, 10, 1, 17, 5, 3, 1, 5, 1, 17, 52, 2, 0, 44, 4, 100, 12, 15, 2, 15, 1, 15, 1, 37, + 10, 174, 56, 29, 13, 44, 4, 9, 7, 2, 14, 6, 154, 0, 3, 17, 3, 13, 3, 218, 6, 12, 4, 1, 15, + 12, 4, 56, 8, 10, 6, 40, 8, 30, 2, 12, 4, 2, 14, 9, 39, 0, 8, 14, 2, 13, 3, 11, 3, 57, 1, 1, + 4, 16, 2, 12, 4, 10, 7, 147, 1, 103, 0, 0, 32, 0, 2, 0, 2, 0, 15, 0, 0, 0, 0, 0, 5, 0, 0, 0, + ]; + #[inline] + pub fn lookup(c: char) -> bool { + debug_assert!(!c.is_ascii()); + (c as u32) >= 0x378 && lookup_slow(c) + } + + #[inline(never)] + fn lookup_slow(c: char) -> bool { + const { + assert!(SHORT_OFFSET_RUNS.last().unwrap().0 > char::MAX as u32); + let mut i = 0; + while i < SHORT_OFFSET_RUNS.len() { + assert!(SHORT_OFFSET_RUNS[i].start_index() < OFFSETS.len()); + i += 1; + } + } + // SAFETY: We just ensured the last element of `SHORT_OFFSET_RUNS` is greater 
than `std::char::MAX` + // and the start indices of all elements in `SHORT_OFFSET_RUNS` are smaller than `OFFSETS.len()`. + unsafe { super::skip_search(c, &SHORT_OFFSET_RUNS, &OFFSETS) } + } +} + +#[rustfmt::skip] +pub mod default_ignorable_code_point { + use super::ShortOffsetRunHeader; + + static SHORT_OFFSET_RUNS: [ShortOffsetRunHeader; 12] = [ + ShortOffsetRunHeader::new(0, 847), ShortOffsetRunHeader::new(3, 1564), + ShortOffsetRunHeader::new(5, 4447), ShortOffsetRunHeader::new(7, 6068), + ShortOffsetRunHeader::new(9, 8203), ShortOffsetRunHeader::new(13, 12644), + ShortOffsetRunHeader::new(19, 65024), ShortOffsetRunHeader::new(21, 113824), + ShortOffsetRunHeader::new(29, 119155), ShortOffsetRunHeader::new(31, 917504), + ShortOffsetRunHeader::new(33, 921600), ShortOffsetRunHeader::new(34, 2035712), + ]; + static OFFSETS: [u8; 35] = [ + 173, 1, 0, 1, 0, 1, 0, 2, 0, 2, 85, 5, 0, 5, 26, 5, 49, 16, 0, 1, 0, 16, 239, 1, 160, 1, + 79, 9, 0, 4, 0, 8, 0, 0, 0, + ]; + #[inline] + pub fn lookup(c: char) -> bool { + debug_assert!(!c.is_ascii()); + (c as u32) >= 0xad && lookup_slow(c) + } + + #[inline(never)] + fn lookup_slow(c: char) -> bool { + const { + assert!(SHORT_OFFSET_RUNS.last().unwrap().0 > char::MAX as u32); + let mut i = 0; + while i < SHORT_OFFSET_RUNS.len() { + assert!(SHORT_OFFSET_RUNS[i].start_index() < OFFSETS.len()); + i += 1; + } + } + // SAFETY: We just ensured the last element of `SHORT_OFFSET_RUNS` is greater than `std::char::MAX` + // and the start indices of all elements in `SHORT_OFFSET_RUNS` are smaller than `OFFSETS.len()`. 
+ unsafe { super::skip_search(c, &SHORT_OFFSET_RUNS, &OFFSETS) } + } +} + #[rustfmt::skip] pub mod grapheme_extend { use super::ShortOffsetRunHeader; diff --git a/library/core/src/wtf8.rs b/library/core/src/wtf8.rs index a0978c3dafb48..effb7a37d970c 100644 --- a/library/core/src/wtf8.rs +++ b/library/core/src/wtf8.rs @@ -147,7 +147,7 @@ impl fmt::Debug for Wtf8 { use crate::fmt::Write as _; for c in s.chars().flat_map(|c| { c.escape_debug_ext(EscapeDebugExtArgs { - escape_grapheme_extended: true, + escape_grapheme_extender: true, escape_single_quote: false, escape_double_quote: true, }) diff --git a/library/coretests/tests/unicode.rs b/library/coretests/tests/unicode.rs index 12eed25a1feae..bd9e1ac2ced38 100644 --- a/library/coretests/tests/unicode.rs +++ b/library/coretests/tests/unicode.rs @@ -1,3 +1,4 @@ +use core::iter::Step; use core::unicode::unicode_data; use std::ops::RangeInclusive; @@ -19,7 +20,7 @@ fn test_boolean_property(ranges: &[RangeInclusive], lookup: fn(char) -> bo for c in range.clone() { assert!(lookup(c), "{c:?}"); } - start = char::from_u32(*range.end() as u32 + 1).unwrap(); + start = Step::forward(*range.end(), 1); } for c in start..=char::MAX { assert!(!lookup(c), "{c:?}"); @@ -60,9 +61,23 @@ fn case_ignorable() { #[test] #[cfg_attr(miri, ignore)] // Miri is too slow -fn lt() { - test_boolean_property(test_data::LT, unicode_data::lt::lookup); - test_boolean_property(test_data::LT, char::is_titlecase); +fn cf() { + test_boolean_property(test_data::CF, unicode_data::cf::lookup); +} + +#[test] +#[cfg_attr(miri, ignore)] // Miri is too slow +fn cn_planes_0_3() { + test_boolean_property(test_data::CN_PLANES_0_3, unicode_data::cn_planes_0_3::lookup); +} + +#[test] +#[cfg_attr(miri, ignore)] // Miri is too slow +fn default_ignorable_code_point() { + test_boolean_property( + test_data::DEFAULT_IGNORABLE_CODE_POINT, + unicode_data::default_ignorable_code_point::lookup, + ); } #[test] @@ -78,6 +93,13 @@ fn lowercase() { 
test_boolean_property(test_data::LOWERCASE, char::is_lowercase); } +#[test] +#[cfg_attr(miri, ignore)] // Miri is too slow +fn lt() { + test_boolean_property(test_data::LT, unicode_data::lt::lookup); + test_boolean_property(test_data::LT, char::is_titlecase); +} + #[test] #[cfg_attr(miri, ignore)] // Miri is too slow fn n() { diff --git a/library/coretests/tests/unicode/test_data.rs b/library/coretests/tests/unicode/test_data.rs index 962770a0ff830..a246716d54fdf 100644 --- a/library/coretests/tests/unicode/test_data.rs +++ b/library/coretests/tests/unicode/test_data.rs @@ -392,6 +392,262 @@ pub(super) static CASE_IGNORABLE: &[RangeInclusive; 459] = &[ '\u{e0100}'..='\u{e01ef}', ]; +#[rustfmt::skip] +pub(super) static CF: &[RangeInclusive; 21] = &[ + '\u{ad}'..='\u{ad}', '\u{600}'..='\u{605}', '\u{61c}'..='\u{61c}', '\u{6dd}'..='\u{6dd}', + '\u{70f}'..='\u{70f}', '\u{890}'..='\u{891}', '\u{8e2}'..='\u{8e2}', + '\u{180e}'..='\u{180e}', '\u{200b}'..='\u{200f}', '\u{202a}'..='\u{202e}', + '\u{2060}'..='\u{2064}', '\u{2066}'..='\u{206f}', '\u{feff}'..='\u{feff}', + '\u{fff9}'..='\u{fffb}', '\u{110bd}'..='\u{110bd}', '\u{110cd}'..='\u{110cd}', + '\u{13430}'..='\u{1343f}', '\u{1bca0}'..='\u{1bca3}', '\u{1d173}'..='\u{1d17a}', + '\u{e0001}'..='\u{e0001}', '\u{e0020}'..='\u{e007f}', +]; + +#[rustfmt::skip] +pub(super) static CN_PLANES_0_3: &[RangeInclusive; 730] = &[ + '\u{378}'..='\u{379}', '\u{380}'..='\u{383}', '\u{38b}'..='\u{38b}', '\u{38d}'..='\u{38d}', + '\u{3a2}'..='\u{3a2}', '\u{530}'..='\u{530}', '\u{557}'..='\u{558}', '\u{58b}'..='\u{58c}', + '\u{590}'..='\u{590}', '\u{5c8}'..='\u{5cf}', '\u{5eb}'..='\u{5ee}', '\u{5f5}'..='\u{5ff}', + '\u{70e}'..='\u{70e}', '\u{74b}'..='\u{74c}', '\u{7b2}'..='\u{7bf}', '\u{7fb}'..='\u{7fc}', + '\u{82e}'..='\u{82f}', '\u{83f}'..='\u{83f}', '\u{85c}'..='\u{85d}', '\u{85f}'..='\u{85f}', + '\u{86b}'..='\u{86f}', '\u{892}'..='\u{896}', '\u{984}'..='\u{984}', '\u{98d}'..='\u{98e}', + '\u{991}'..='\u{992}', '\u{9a9}'..='\u{9a9}', 
'\u{9b1}'..='\u{9b1}', '\u{9b3}'..='\u{9b5}', + '\u{9ba}'..='\u{9bb}', '\u{9c5}'..='\u{9c6}', '\u{9c9}'..='\u{9ca}', '\u{9cf}'..='\u{9d6}', + '\u{9d8}'..='\u{9db}', '\u{9de}'..='\u{9de}', '\u{9e4}'..='\u{9e5}', '\u{9ff}'..='\u{a00}', + '\u{a04}'..='\u{a04}', '\u{a0b}'..='\u{a0e}', '\u{a11}'..='\u{a12}', '\u{a29}'..='\u{a29}', + '\u{a31}'..='\u{a31}', '\u{a34}'..='\u{a34}', '\u{a37}'..='\u{a37}', '\u{a3a}'..='\u{a3b}', + '\u{a3d}'..='\u{a3d}', '\u{a43}'..='\u{a46}', '\u{a49}'..='\u{a4a}', '\u{a4e}'..='\u{a50}', + '\u{a52}'..='\u{a58}', '\u{a5d}'..='\u{a5d}', '\u{a5f}'..='\u{a65}', '\u{a77}'..='\u{a80}', + '\u{a84}'..='\u{a84}', '\u{a8e}'..='\u{a8e}', '\u{a92}'..='\u{a92}', '\u{aa9}'..='\u{aa9}', + '\u{ab1}'..='\u{ab1}', '\u{ab4}'..='\u{ab4}', '\u{aba}'..='\u{abb}', '\u{ac6}'..='\u{ac6}', + '\u{aca}'..='\u{aca}', '\u{ace}'..='\u{acf}', '\u{ad1}'..='\u{adf}', '\u{ae4}'..='\u{ae5}', + '\u{af2}'..='\u{af8}', '\u{b00}'..='\u{b00}', '\u{b04}'..='\u{b04}', '\u{b0d}'..='\u{b0e}', + '\u{b11}'..='\u{b12}', '\u{b29}'..='\u{b29}', '\u{b31}'..='\u{b31}', '\u{b34}'..='\u{b34}', + '\u{b3a}'..='\u{b3b}', '\u{b45}'..='\u{b46}', '\u{b49}'..='\u{b4a}', '\u{b4e}'..='\u{b54}', + '\u{b58}'..='\u{b5b}', '\u{b5e}'..='\u{b5e}', '\u{b64}'..='\u{b65}', '\u{b78}'..='\u{b81}', + '\u{b84}'..='\u{b84}', '\u{b8b}'..='\u{b8d}', '\u{b91}'..='\u{b91}', '\u{b96}'..='\u{b98}', + '\u{b9b}'..='\u{b9b}', '\u{b9d}'..='\u{b9d}', '\u{ba0}'..='\u{ba2}', '\u{ba5}'..='\u{ba7}', + '\u{bab}'..='\u{bad}', '\u{bba}'..='\u{bbd}', '\u{bc3}'..='\u{bc5}', '\u{bc9}'..='\u{bc9}', + '\u{bce}'..='\u{bcf}', '\u{bd1}'..='\u{bd6}', '\u{bd8}'..='\u{be5}', '\u{bfb}'..='\u{bff}', + '\u{c0d}'..='\u{c0d}', '\u{c11}'..='\u{c11}', '\u{c29}'..='\u{c29}', '\u{c3a}'..='\u{c3b}', + '\u{c45}'..='\u{c45}', '\u{c49}'..='\u{c49}', '\u{c4e}'..='\u{c54}', '\u{c57}'..='\u{c57}', + '\u{c5b}'..='\u{c5b}', '\u{c5e}'..='\u{c5f}', '\u{c64}'..='\u{c65}', '\u{c70}'..='\u{c76}', + '\u{c8d}'..='\u{c8d}', '\u{c91}'..='\u{c91}', '\u{ca9}'..='\u{ca9}', 
'\u{cb4}'..='\u{cb4}', + '\u{cba}'..='\u{cbb}', '\u{cc5}'..='\u{cc5}', '\u{cc9}'..='\u{cc9}', '\u{cce}'..='\u{cd4}', + '\u{cd7}'..='\u{cdb}', '\u{cdf}'..='\u{cdf}', '\u{ce4}'..='\u{ce5}', '\u{cf0}'..='\u{cf0}', + '\u{cf4}'..='\u{cff}', '\u{d0d}'..='\u{d0d}', '\u{d11}'..='\u{d11}', '\u{d45}'..='\u{d45}', + '\u{d49}'..='\u{d49}', '\u{d50}'..='\u{d53}', '\u{d64}'..='\u{d65}', '\u{d80}'..='\u{d80}', + '\u{d84}'..='\u{d84}', '\u{d97}'..='\u{d99}', '\u{db2}'..='\u{db2}', '\u{dbc}'..='\u{dbc}', + '\u{dbe}'..='\u{dbf}', '\u{dc7}'..='\u{dc9}', '\u{dcb}'..='\u{dce}', '\u{dd5}'..='\u{dd5}', + '\u{dd7}'..='\u{dd7}', '\u{de0}'..='\u{de5}', '\u{df0}'..='\u{df1}', '\u{df5}'..='\u{e00}', + '\u{e3b}'..='\u{e3e}', '\u{e5c}'..='\u{e80}', '\u{e83}'..='\u{e83}', '\u{e85}'..='\u{e85}', + '\u{e8b}'..='\u{e8b}', '\u{ea4}'..='\u{ea4}', '\u{ea6}'..='\u{ea6}', '\u{ebe}'..='\u{ebf}', + '\u{ec5}'..='\u{ec5}', '\u{ec7}'..='\u{ec7}', '\u{ecf}'..='\u{ecf}', '\u{eda}'..='\u{edb}', + '\u{ee0}'..='\u{eff}', '\u{f48}'..='\u{f48}', '\u{f6d}'..='\u{f70}', '\u{f98}'..='\u{f98}', + '\u{fbd}'..='\u{fbd}', '\u{fcd}'..='\u{fcd}', '\u{fdb}'..='\u{fff}', + '\u{10c6}'..='\u{10c6}', '\u{10c8}'..='\u{10cc}', '\u{10ce}'..='\u{10cf}', + '\u{1249}'..='\u{1249}', '\u{124e}'..='\u{124f}', '\u{1257}'..='\u{1257}', + '\u{1259}'..='\u{1259}', '\u{125e}'..='\u{125f}', '\u{1289}'..='\u{1289}', + '\u{128e}'..='\u{128f}', '\u{12b1}'..='\u{12b1}', '\u{12b6}'..='\u{12b7}', + '\u{12bf}'..='\u{12bf}', '\u{12c1}'..='\u{12c1}', '\u{12c6}'..='\u{12c7}', + '\u{12d7}'..='\u{12d7}', '\u{1311}'..='\u{1311}', '\u{1316}'..='\u{1317}', + '\u{135b}'..='\u{135c}', '\u{137d}'..='\u{137f}', '\u{139a}'..='\u{139f}', + '\u{13f6}'..='\u{13f7}', '\u{13fe}'..='\u{13ff}', '\u{169d}'..='\u{169f}', + '\u{16f9}'..='\u{16ff}', '\u{1716}'..='\u{171e}', '\u{1737}'..='\u{173f}', + '\u{1754}'..='\u{175f}', '\u{176d}'..='\u{176d}', '\u{1771}'..='\u{1771}', + '\u{1774}'..='\u{177f}', '\u{17de}'..='\u{17df}', '\u{17ea}'..='\u{17ef}', + 
'\u{17fa}'..='\u{17ff}', '\u{181a}'..='\u{181f}', '\u{1879}'..='\u{187f}', + '\u{18ab}'..='\u{18af}', '\u{18f6}'..='\u{18ff}', '\u{191f}'..='\u{191f}', + '\u{192c}'..='\u{192f}', '\u{193c}'..='\u{193f}', '\u{1941}'..='\u{1943}', + '\u{196e}'..='\u{196f}', '\u{1975}'..='\u{197f}', '\u{19ac}'..='\u{19af}', + '\u{19ca}'..='\u{19cf}', '\u{19db}'..='\u{19dd}', '\u{1a1c}'..='\u{1a1d}', + '\u{1a5f}'..='\u{1a5f}', '\u{1a7d}'..='\u{1a7e}', '\u{1a8a}'..='\u{1a8f}', + '\u{1a9a}'..='\u{1a9f}', '\u{1aae}'..='\u{1aaf}', '\u{1ade}'..='\u{1adf}', + '\u{1aec}'..='\u{1aff}', '\u{1b4d}'..='\u{1b4d}', '\u{1bf4}'..='\u{1bfb}', + '\u{1c38}'..='\u{1c3a}', '\u{1c4a}'..='\u{1c4c}', '\u{1c8b}'..='\u{1c8f}', + '\u{1cbb}'..='\u{1cbc}', '\u{1cc8}'..='\u{1ccf}', '\u{1cfb}'..='\u{1cff}', + '\u{1f16}'..='\u{1f17}', '\u{1f1e}'..='\u{1f1f}', '\u{1f46}'..='\u{1f47}', + '\u{1f4e}'..='\u{1f4f}', '\u{1f58}'..='\u{1f58}', '\u{1f5a}'..='\u{1f5a}', + '\u{1f5c}'..='\u{1f5c}', '\u{1f5e}'..='\u{1f5e}', '\u{1f7e}'..='\u{1f7f}', + '\u{1fb5}'..='\u{1fb5}', '\u{1fc5}'..='\u{1fc5}', '\u{1fd4}'..='\u{1fd5}', + '\u{1fdc}'..='\u{1fdc}', '\u{1ff0}'..='\u{1ff1}', '\u{1ff5}'..='\u{1ff5}', + '\u{1fff}'..='\u{1fff}', '\u{2065}'..='\u{2065}', '\u{2072}'..='\u{2073}', + '\u{208f}'..='\u{208f}', '\u{209d}'..='\u{209f}', '\u{20c2}'..='\u{20cf}', + '\u{20f1}'..='\u{20ff}', '\u{218c}'..='\u{218f}', '\u{242a}'..='\u{243f}', + '\u{244b}'..='\u{245f}', '\u{2b74}'..='\u{2b75}', '\u{2cf4}'..='\u{2cf8}', + '\u{2d26}'..='\u{2d26}', '\u{2d28}'..='\u{2d2c}', '\u{2d2e}'..='\u{2d2f}', + '\u{2d68}'..='\u{2d6e}', '\u{2d71}'..='\u{2d7e}', '\u{2d97}'..='\u{2d9f}', + '\u{2da7}'..='\u{2da7}', '\u{2daf}'..='\u{2daf}', '\u{2db7}'..='\u{2db7}', + '\u{2dbf}'..='\u{2dbf}', '\u{2dc7}'..='\u{2dc7}', '\u{2dcf}'..='\u{2dcf}', + '\u{2dd7}'..='\u{2dd7}', '\u{2ddf}'..='\u{2ddf}', '\u{2e5e}'..='\u{2e7f}', + '\u{2e9a}'..='\u{2e9a}', '\u{2ef4}'..='\u{2eff}', '\u{2fd6}'..='\u{2fef}', + '\u{3040}'..='\u{3040}', '\u{3097}'..='\u{3098}', '\u{3100}'..='\u{3104}', 
+ '\u{3130}'..='\u{3130}', '\u{318f}'..='\u{318f}', '\u{31e6}'..='\u{31ee}', + '\u{321f}'..='\u{321f}', '\u{a48d}'..='\u{a48f}', '\u{a4c7}'..='\u{a4cf}', + '\u{a62c}'..='\u{a63f}', '\u{a6f8}'..='\u{a6ff}', '\u{a7dd}'..='\u{a7f0}', + '\u{a82d}'..='\u{a82f}', '\u{a83a}'..='\u{a83f}', '\u{a878}'..='\u{a87f}', + '\u{a8c6}'..='\u{a8cd}', '\u{a8da}'..='\u{a8df}', '\u{a954}'..='\u{a95e}', + '\u{a97d}'..='\u{a97f}', '\u{a9ce}'..='\u{a9ce}', '\u{a9da}'..='\u{a9dd}', + '\u{a9ff}'..='\u{a9ff}', '\u{aa37}'..='\u{aa3f}', '\u{aa4e}'..='\u{aa4f}', + '\u{aa5a}'..='\u{aa5b}', '\u{aac3}'..='\u{aada}', '\u{aaf7}'..='\u{ab00}', + '\u{ab07}'..='\u{ab08}', '\u{ab0f}'..='\u{ab10}', '\u{ab17}'..='\u{ab1f}', + '\u{ab27}'..='\u{ab27}', '\u{ab2f}'..='\u{ab2f}', '\u{ab6c}'..='\u{ab6f}', + '\u{abee}'..='\u{abef}', '\u{abfa}'..='\u{abff}', '\u{d7a4}'..='\u{d7af}', + '\u{d7c7}'..='\u{d7ca}', '\u{d7fc}'..='\u{d7ff}', '\u{fa6e}'..='\u{fa6f}', + '\u{fada}'..='\u{faff}', '\u{fb07}'..='\u{fb12}', '\u{fb18}'..='\u{fb1c}', + '\u{fb37}'..='\u{fb37}', '\u{fb3d}'..='\u{fb3d}', '\u{fb3f}'..='\u{fb3f}', + '\u{fb42}'..='\u{fb42}', '\u{fb45}'..='\u{fb45}', '\u{fdd0}'..='\u{fdef}', + '\u{fe1a}'..='\u{fe1f}', '\u{fe53}'..='\u{fe53}', '\u{fe67}'..='\u{fe67}', + '\u{fe6c}'..='\u{fe6f}', '\u{fe75}'..='\u{fe75}', '\u{fefd}'..='\u{fefe}', + '\u{ff00}'..='\u{ff00}', '\u{ffbf}'..='\u{ffc1}', '\u{ffc8}'..='\u{ffc9}', + '\u{ffd0}'..='\u{ffd1}', '\u{ffd8}'..='\u{ffd9}', '\u{ffdd}'..='\u{ffdf}', + '\u{ffe7}'..='\u{ffe7}', '\u{ffef}'..='\u{fff8}', '\u{fffe}'..='\u{ffff}', + '\u{1000c}'..='\u{1000c}', '\u{10027}'..='\u{10027}', '\u{1003b}'..='\u{1003b}', + '\u{1003e}'..='\u{1003e}', '\u{1004e}'..='\u{1004f}', '\u{1005e}'..='\u{1007f}', + '\u{100fb}'..='\u{100ff}', '\u{10103}'..='\u{10106}', '\u{10134}'..='\u{10136}', + '\u{1018f}'..='\u{1018f}', '\u{1019d}'..='\u{1019f}', '\u{101a1}'..='\u{101cf}', + '\u{101fe}'..='\u{1027f}', '\u{1029d}'..='\u{1029f}', '\u{102d1}'..='\u{102df}', + '\u{102fc}'..='\u{102ff}', 
'\u{10324}'..='\u{1032c}', '\u{1034b}'..='\u{1034f}', + '\u{1037b}'..='\u{1037f}', '\u{1039e}'..='\u{1039e}', '\u{103c4}'..='\u{103c7}', + '\u{103d6}'..='\u{103ff}', '\u{1049e}'..='\u{1049f}', '\u{104aa}'..='\u{104af}', + '\u{104d4}'..='\u{104d7}', '\u{104fc}'..='\u{104ff}', '\u{10528}'..='\u{1052f}', + '\u{10564}'..='\u{1056e}', '\u{1057b}'..='\u{1057b}', '\u{1058b}'..='\u{1058b}', + '\u{10593}'..='\u{10593}', '\u{10596}'..='\u{10596}', '\u{105a2}'..='\u{105a2}', + '\u{105b2}'..='\u{105b2}', '\u{105ba}'..='\u{105ba}', '\u{105bd}'..='\u{105bf}', + '\u{105f4}'..='\u{105ff}', '\u{10737}'..='\u{1073f}', '\u{10756}'..='\u{1075f}', + '\u{10768}'..='\u{1077f}', '\u{10786}'..='\u{10786}', '\u{107b1}'..='\u{107b1}', + '\u{107bb}'..='\u{107ff}', '\u{10806}'..='\u{10807}', '\u{10809}'..='\u{10809}', + '\u{10836}'..='\u{10836}', '\u{10839}'..='\u{1083b}', '\u{1083d}'..='\u{1083e}', + '\u{10856}'..='\u{10856}', '\u{1089f}'..='\u{108a6}', '\u{108b0}'..='\u{108df}', + '\u{108f3}'..='\u{108f3}', '\u{108f6}'..='\u{108fa}', '\u{1091c}'..='\u{1091e}', + '\u{1093a}'..='\u{1093e}', '\u{1095a}'..='\u{1097f}', '\u{109b8}'..='\u{109bb}', + '\u{109d0}'..='\u{109d1}', '\u{10a04}'..='\u{10a04}', '\u{10a07}'..='\u{10a0b}', + '\u{10a14}'..='\u{10a14}', '\u{10a18}'..='\u{10a18}', '\u{10a36}'..='\u{10a37}', + '\u{10a3b}'..='\u{10a3e}', '\u{10a49}'..='\u{10a4f}', '\u{10a59}'..='\u{10a5f}', + '\u{10aa0}'..='\u{10abf}', '\u{10ae7}'..='\u{10aea}', '\u{10af7}'..='\u{10aff}', + '\u{10b36}'..='\u{10b38}', '\u{10b56}'..='\u{10b57}', '\u{10b73}'..='\u{10b77}', + '\u{10b92}'..='\u{10b98}', '\u{10b9d}'..='\u{10ba8}', '\u{10bb0}'..='\u{10bff}', + '\u{10c49}'..='\u{10c7f}', '\u{10cb3}'..='\u{10cbf}', '\u{10cf3}'..='\u{10cf9}', + '\u{10d28}'..='\u{10d2f}', '\u{10d3a}'..='\u{10d3f}', '\u{10d66}'..='\u{10d68}', + '\u{10d86}'..='\u{10d8d}', '\u{10d90}'..='\u{10e5f}', '\u{10e7f}'..='\u{10e7f}', + '\u{10eaa}'..='\u{10eaa}', '\u{10eae}'..='\u{10eaf}', '\u{10eb2}'..='\u{10ec1}', + '\u{10ec8}'..='\u{10ecf}', 
'\u{10ed9}'..='\u{10ef9}', '\u{10f28}'..='\u{10f2f}', + '\u{10f5a}'..='\u{10f6f}', '\u{10f8a}'..='\u{10faf}', '\u{10fcc}'..='\u{10fdf}', + '\u{10ff7}'..='\u{10fff}', '\u{1104e}'..='\u{11051}', '\u{11076}'..='\u{1107e}', + '\u{110c3}'..='\u{110cc}', '\u{110ce}'..='\u{110cf}', '\u{110e9}'..='\u{110ef}', + '\u{110fa}'..='\u{110ff}', '\u{11135}'..='\u{11135}', '\u{11148}'..='\u{1114f}', + '\u{11177}'..='\u{1117f}', '\u{111e0}'..='\u{111e0}', '\u{111f5}'..='\u{111ff}', + '\u{11212}'..='\u{11212}', '\u{11242}'..='\u{1127f}', '\u{11287}'..='\u{11287}', + '\u{11289}'..='\u{11289}', '\u{1128e}'..='\u{1128e}', '\u{1129e}'..='\u{1129e}', + '\u{112aa}'..='\u{112af}', '\u{112eb}'..='\u{112ef}', '\u{112fa}'..='\u{112ff}', + '\u{11304}'..='\u{11304}', '\u{1130d}'..='\u{1130e}', '\u{11311}'..='\u{11312}', + '\u{11329}'..='\u{11329}', '\u{11331}'..='\u{11331}', '\u{11334}'..='\u{11334}', + '\u{1133a}'..='\u{1133a}', '\u{11345}'..='\u{11346}', '\u{11349}'..='\u{1134a}', + '\u{1134e}'..='\u{1134f}', '\u{11351}'..='\u{11356}', '\u{11358}'..='\u{1135c}', + '\u{11364}'..='\u{11365}', '\u{1136d}'..='\u{1136f}', '\u{11375}'..='\u{1137f}', + '\u{1138a}'..='\u{1138a}', '\u{1138c}'..='\u{1138d}', '\u{1138f}'..='\u{1138f}', + '\u{113b6}'..='\u{113b6}', '\u{113c1}'..='\u{113c1}', '\u{113c3}'..='\u{113c4}', + '\u{113c6}'..='\u{113c6}', '\u{113cb}'..='\u{113cb}', '\u{113d6}'..='\u{113d6}', + '\u{113d9}'..='\u{113e0}', '\u{113e3}'..='\u{113ff}', '\u{1145c}'..='\u{1145c}', + '\u{11462}'..='\u{1147f}', '\u{114c8}'..='\u{114cf}', '\u{114da}'..='\u{1157f}', + '\u{115b6}'..='\u{115b7}', '\u{115de}'..='\u{115ff}', '\u{11645}'..='\u{1164f}', + '\u{1165a}'..='\u{1165f}', '\u{1166d}'..='\u{1167f}', '\u{116ba}'..='\u{116bf}', + '\u{116ca}'..='\u{116cf}', '\u{116e4}'..='\u{116ff}', '\u{1171b}'..='\u{1171c}', + '\u{1172c}'..='\u{1172f}', '\u{11747}'..='\u{117ff}', '\u{1183c}'..='\u{1189f}', + '\u{118f3}'..='\u{118fe}', '\u{11907}'..='\u{11908}', '\u{1190a}'..='\u{1190b}', + '\u{11914}'..='\u{11914}', 
'\u{11917}'..='\u{11917}', '\u{11936}'..='\u{11936}', + '\u{11939}'..='\u{1193a}', '\u{11947}'..='\u{1194f}', '\u{1195a}'..='\u{1199f}', + '\u{119a8}'..='\u{119a9}', '\u{119d8}'..='\u{119d9}', '\u{119e5}'..='\u{119ff}', + '\u{11a48}'..='\u{11a4f}', '\u{11aa3}'..='\u{11aaf}', '\u{11af9}'..='\u{11aff}', + '\u{11b0a}'..='\u{11b5f}', '\u{11b68}'..='\u{11bbf}', '\u{11be2}'..='\u{11bef}', + '\u{11bfa}'..='\u{11bff}', '\u{11c09}'..='\u{11c09}', '\u{11c37}'..='\u{11c37}', + '\u{11c46}'..='\u{11c4f}', '\u{11c6d}'..='\u{11c6f}', '\u{11c90}'..='\u{11c91}', + '\u{11ca8}'..='\u{11ca8}', '\u{11cb7}'..='\u{11cff}', '\u{11d07}'..='\u{11d07}', + '\u{11d0a}'..='\u{11d0a}', '\u{11d37}'..='\u{11d39}', '\u{11d3b}'..='\u{11d3b}', + '\u{11d3e}'..='\u{11d3e}', '\u{11d48}'..='\u{11d4f}', '\u{11d5a}'..='\u{11d5f}', + '\u{11d66}'..='\u{11d66}', '\u{11d69}'..='\u{11d69}', '\u{11d8f}'..='\u{11d8f}', + '\u{11d92}'..='\u{11d92}', '\u{11d99}'..='\u{11d9f}', '\u{11daa}'..='\u{11daf}', + '\u{11ddc}'..='\u{11ddf}', '\u{11dea}'..='\u{11edf}', '\u{11ef9}'..='\u{11eff}', + '\u{11f11}'..='\u{11f11}', '\u{11f3b}'..='\u{11f3d}', '\u{11f5b}'..='\u{11faf}', + '\u{11fb1}'..='\u{11fbf}', '\u{11ff2}'..='\u{11ffe}', '\u{1239a}'..='\u{123ff}', + '\u{1246f}'..='\u{1246f}', '\u{12475}'..='\u{1247f}', '\u{12544}'..='\u{12f8f}', + '\u{12ff3}'..='\u{12fff}', '\u{13456}'..='\u{1345f}', '\u{143fb}'..='\u{143ff}', + '\u{14647}'..='\u{160ff}', '\u{1613a}'..='\u{167ff}', '\u{16a39}'..='\u{16a3f}', + '\u{16a5f}'..='\u{16a5f}', '\u{16a6a}'..='\u{16a6d}', '\u{16abf}'..='\u{16abf}', + '\u{16aca}'..='\u{16acf}', '\u{16aee}'..='\u{16aef}', '\u{16af6}'..='\u{16aff}', + '\u{16b46}'..='\u{16b4f}', '\u{16b5a}'..='\u{16b5a}', '\u{16b62}'..='\u{16b62}', + '\u{16b78}'..='\u{16b7c}', '\u{16b90}'..='\u{16d3f}', '\u{16d7a}'..='\u{16e3f}', + '\u{16e9b}'..='\u{16e9f}', '\u{16eb9}'..='\u{16eba}', '\u{16ed4}'..='\u{16eff}', + '\u{16f4b}'..='\u{16f4e}', '\u{16f88}'..='\u{16f8e}', '\u{16fa0}'..='\u{16fdf}', + '\u{16fe5}'..='\u{16fef}', 
'\u{16ff7}'..='\u{16fff}', '\u{18cd6}'..='\u{18cfe}', + '\u{18d1f}'..='\u{18d7f}', '\u{18df3}'..='\u{1afef}', '\u{1aff4}'..='\u{1aff4}', + '\u{1affc}'..='\u{1affc}', '\u{1afff}'..='\u{1afff}', '\u{1b123}'..='\u{1b131}', + '\u{1b133}'..='\u{1b14f}', '\u{1b153}'..='\u{1b154}', '\u{1b156}'..='\u{1b163}', + '\u{1b168}'..='\u{1b16f}', '\u{1b2fc}'..='\u{1bbff}', '\u{1bc6b}'..='\u{1bc6f}', + '\u{1bc7d}'..='\u{1bc7f}', '\u{1bc89}'..='\u{1bc8f}', '\u{1bc9a}'..='\u{1bc9b}', + '\u{1bca4}'..='\u{1cbff}', '\u{1ccfd}'..='\u{1ccff}', '\u{1ceb4}'..='\u{1ceb9}', + '\u{1ced1}'..='\u{1cedf}', '\u{1cef1}'..='\u{1ceff}', '\u{1cf2e}'..='\u{1cf2f}', + '\u{1cf47}'..='\u{1cf4f}', '\u{1cfc4}'..='\u{1cfff}', '\u{1d0f6}'..='\u{1d0ff}', + '\u{1d127}'..='\u{1d128}', '\u{1d1eb}'..='\u{1d1ff}', '\u{1d246}'..='\u{1d2bf}', + '\u{1d2d4}'..='\u{1d2df}', '\u{1d2f4}'..='\u{1d2ff}', '\u{1d357}'..='\u{1d35f}', + '\u{1d379}'..='\u{1d3ff}', '\u{1d455}'..='\u{1d455}', '\u{1d49d}'..='\u{1d49d}', + '\u{1d4a0}'..='\u{1d4a1}', '\u{1d4a3}'..='\u{1d4a4}', '\u{1d4a7}'..='\u{1d4a8}', + '\u{1d4ad}'..='\u{1d4ad}', '\u{1d4ba}'..='\u{1d4ba}', '\u{1d4bc}'..='\u{1d4bc}', + '\u{1d4c4}'..='\u{1d4c4}', '\u{1d506}'..='\u{1d506}', '\u{1d50b}'..='\u{1d50c}', + '\u{1d515}'..='\u{1d515}', '\u{1d51d}'..='\u{1d51d}', '\u{1d53a}'..='\u{1d53a}', + '\u{1d53f}'..='\u{1d53f}', '\u{1d545}'..='\u{1d545}', '\u{1d547}'..='\u{1d549}', + '\u{1d551}'..='\u{1d551}', '\u{1d6a6}'..='\u{1d6a7}', '\u{1d7cc}'..='\u{1d7cd}', + '\u{1da8c}'..='\u{1da9a}', '\u{1daa0}'..='\u{1daa0}', '\u{1dab0}'..='\u{1deff}', + '\u{1df1f}'..='\u{1df24}', '\u{1df2b}'..='\u{1dfff}', '\u{1e007}'..='\u{1e007}', + '\u{1e019}'..='\u{1e01a}', '\u{1e022}'..='\u{1e022}', '\u{1e025}'..='\u{1e025}', + '\u{1e02b}'..='\u{1e02f}', '\u{1e06e}'..='\u{1e08e}', '\u{1e090}'..='\u{1e0ff}', + '\u{1e12d}'..='\u{1e12f}', '\u{1e13e}'..='\u{1e13f}', '\u{1e14a}'..='\u{1e14d}', + '\u{1e150}'..='\u{1e28f}', '\u{1e2af}'..='\u{1e2bf}', '\u{1e2fa}'..='\u{1e2fe}', + '\u{1e300}'..='\u{1e4cf}', 
'\u{1e4fa}'..='\u{1e5cf}', '\u{1e5fb}'..='\u{1e5fe}', + '\u{1e600}'..='\u{1e6bf}', '\u{1e6df}'..='\u{1e6df}', '\u{1e6f6}'..='\u{1e6fd}', + '\u{1e700}'..='\u{1e7df}', '\u{1e7e7}'..='\u{1e7e7}', '\u{1e7ec}'..='\u{1e7ec}', + '\u{1e7ef}'..='\u{1e7ef}', '\u{1e7ff}'..='\u{1e7ff}', '\u{1e8c5}'..='\u{1e8c6}', + '\u{1e8d7}'..='\u{1e8ff}', '\u{1e94c}'..='\u{1e94f}', '\u{1e95a}'..='\u{1e95d}', + '\u{1e960}'..='\u{1ec70}', '\u{1ecb5}'..='\u{1ed00}', '\u{1ed3e}'..='\u{1edff}', + '\u{1ee04}'..='\u{1ee04}', '\u{1ee20}'..='\u{1ee20}', '\u{1ee23}'..='\u{1ee23}', + '\u{1ee25}'..='\u{1ee26}', '\u{1ee28}'..='\u{1ee28}', '\u{1ee33}'..='\u{1ee33}', + '\u{1ee38}'..='\u{1ee38}', '\u{1ee3a}'..='\u{1ee3a}', '\u{1ee3c}'..='\u{1ee41}', + '\u{1ee43}'..='\u{1ee46}', '\u{1ee48}'..='\u{1ee48}', '\u{1ee4a}'..='\u{1ee4a}', + '\u{1ee4c}'..='\u{1ee4c}', '\u{1ee50}'..='\u{1ee50}', '\u{1ee53}'..='\u{1ee53}', + '\u{1ee55}'..='\u{1ee56}', '\u{1ee58}'..='\u{1ee58}', '\u{1ee5a}'..='\u{1ee5a}', + '\u{1ee5c}'..='\u{1ee5c}', '\u{1ee5e}'..='\u{1ee5e}', '\u{1ee60}'..='\u{1ee60}', + '\u{1ee63}'..='\u{1ee63}', '\u{1ee65}'..='\u{1ee66}', '\u{1ee6b}'..='\u{1ee6b}', + '\u{1ee73}'..='\u{1ee73}', '\u{1ee78}'..='\u{1ee78}', '\u{1ee7d}'..='\u{1ee7d}', + '\u{1ee7f}'..='\u{1ee7f}', '\u{1ee8a}'..='\u{1ee8a}', '\u{1ee9c}'..='\u{1eea0}', + '\u{1eea4}'..='\u{1eea4}', '\u{1eeaa}'..='\u{1eeaa}', '\u{1eebc}'..='\u{1eeef}', + '\u{1eef2}'..='\u{1efff}', '\u{1f02c}'..='\u{1f02f}', '\u{1f094}'..='\u{1f09f}', + '\u{1f0af}'..='\u{1f0b0}', '\u{1f0c0}'..='\u{1f0c0}', '\u{1f0d0}'..='\u{1f0d0}', + '\u{1f0f6}'..='\u{1f0ff}', '\u{1f1ae}'..='\u{1f1e5}', '\u{1f203}'..='\u{1f20f}', + '\u{1f23c}'..='\u{1f23f}', '\u{1f249}'..='\u{1f24f}', '\u{1f252}'..='\u{1f25f}', + '\u{1f266}'..='\u{1f2ff}', '\u{1f6d9}'..='\u{1f6db}', '\u{1f6ed}'..='\u{1f6ef}', + '\u{1f6fd}'..='\u{1f6ff}', '\u{1f7da}'..='\u{1f7df}', '\u{1f7ec}'..='\u{1f7ef}', + '\u{1f7f1}'..='\u{1f7ff}', '\u{1f80c}'..='\u{1f80f}', '\u{1f848}'..='\u{1f84f}', + '\u{1f85a}'..='\u{1f85f}', 
'\u{1f888}'..='\u{1f88f}', '\u{1f8ae}'..='\u{1f8af}', + '\u{1f8bc}'..='\u{1f8bf}', '\u{1f8c2}'..='\u{1f8cf}', '\u{1f8d9}'..='\u{1f8ff}', + '\u{1fa58}'..='\u{1fa5f}', '\u{1fa6e}'..='\u{1fa6f}', '\u{1fa7d}'..='\u{1fa7f}', + '\u{1fa8b}'..='\u{1fa8d}', '\u{1fac7}'..='\u{1fac7}', '\u{1fac9}'..='\u{1facc}', + '\u{1fadd}'..='\u{1fade}', '\u{1faeb}'..='\u{1faee}', '\u{1faf9}'..='\u{1faff}', + '\u{1fb93}'..='\u{1fb93}', '\u{1fbfb}'..='\u{1ffff}', '\u{2a6e0}'..='\u{2a6ff}', + '\u{2b81e}'..='\u{2b81f}', '\u{2ceae}'..='\u{2ceaf}', '\u{2ebe1}'..='\u{2ebef}', + '\u{2ee5e}'..='\u{2f7ff}', '\u{2fa1e}'..='\u{2ffff}', '\u{3134b}'..='\u{3134f}', + '\u{3347a}'..='\u{3fffd}', +]; + +#[rustfmt::skip] +pub(super) static DEFAULT_IGNORABLE_CODE_POINT: &[RangeInclusive; 17] = &[ + '\u{ad}'..='\u{ad}', '\u{34f}'..='\u{34f}', '\u{61c}'..='\u{61c}', '\u{115f}'..='\u{1160}', + '\u{17b4}'..='\u{17b5}', '\u{180b}'..='\u{180f}', '\u{200b}'..='\u{200f}', + '\u{202a}'..='\u{202e}', '\u{2060}'..='\u{206f}', '\u{3164}'..='\u{3164}', + '\u{fe00}'..='\u{fe0f}', '\u{feff}'..='\u{feff}', '\u{ffa0}'..='\u{ffa0}', + '\u{fff0}'..='\u{fff8}', '\u{1bca0}'..='\u{1bca3}', '\u{1d173}'..='\u{1d17a}', + '\u{e0000}'..='\u{e0fff}', +]; + #[rustfmt::skip] pub(super) static GRAPHEME_EXTEND: &[RangeInclusive; 383] = &[ '\u{300}'..='\u{36f}', '\u{483}'..='\u{489}', '\u{591}'..='\u{5bd}', '\u{5bf}'..='\u{5bf}', diff --git a/license-metadata.json b/license-metadata.json index e8e13fa8d859c..b58329b2ccc19 100644 --- a/license-metadata.json +++ b/license-metadata.json @@ -3,6 +3,28 @@ "children": [ { "children": [ + { + "children": [ + { + "license": { + "copyright": [ + "The Rust Project Developers (see https://thanks.rust-lang.org)" + ], + "spdx": "Apache-2.0 OR MIT" + }, + "name": "mod.rs", + "type": "file" + } + ], + "license": { + "copyright": [ + "1991-2024 Unicode, Inc" + ], + "spdx": "Unicode-3.0" + }, + "name": "library/core/src/unicode", + "type": "directory" + }, { "children": [ { @@ -178,16 +200,6 @@ "name": 
"library/backtrace", "type": "directory" }, - { - "license": { - "copyright": [ - "1991-2024 Unicode, Inc" - ], - "spdx": "Unicode-3.0" - }, - "name": "library/core/src/unicode/unicode_data.rs", - "type": "file" - }, { "children": [], "license": { diff --git a/src/tools/unicode-table-generator/src/main.rs b/src/tools/unicode-table-generator/src/main.rs index 398b4c7b7ec5a..aedab398e9313 100644 --- a/src/tools/unicode-table-generator/src/main.rs +++ b/src/tools/unicode-table-generator/src/main.rs @@ -71,11 +71,11 @@ //! index of that offset is utilized as the answer to whether we're in the set //! or not. -use std::collections::{BTreeMap, HashMap}; +use std::collections::{BTreeMap, BTreeSet, HashMap}; use std::fmt::Write; use std::ops::Range; -use ucd_parse::Codepoints; +use ucd_parse::{Codepoint, Codepoints}; mod cascading_map; mod case_mapping; @@ -88,14 +88,19 @@ use fmt_helpers::CharEscape; use raw_emitter::{RawEmitter, emit_codepoints, emit_whitespace}; static PROPERTIES: &[&str] = &[ + // tidy-alphabetical-start "Alphabetic", - "Lowercase", - "Uppercase", "Case_Ignorable", + "Cf", + "Cn_Planes_0_3", + "Default_Ignorable_Code_Point", "Grapheme_Extend", - "White_Space", - "N", + "Lowercase", "Lt", + "N", + "Uppercase", + "White_Space", + // tidy-alphabetical-end ]; struct UnicodeData { @@ -138,6 +143,9 @@ fn load_data() -> UnicodeData { } } + // Unassigned characters are not listed in `UnicodeData.txt`, + // so get a list of all the assigned ones + let mut assigned_chars = BTreeSet::new(); let [mut to_lower, mut to_upper, mut to_title] = [const { BTreeMap::new() }; 3]; for row in ucd_parse::UnicodeDataExpander::new( ucd_parse::parse::<_, ucd_parse::UnicodeData>(&UNICODE_DIRECTORY).unwrap(), @@ -147,6 +155,11 @@ fn load_data() -> UnicodeData { } else { row.general_category.as_str() }; + + if !matches!(general_category, "Cs" | "Cn") { + assigned_chars.insert(row.codepoint.value()); + } + if let Some(name) = PROPERTIES.iter().find(|prop| **prop == general_category) 
{ properties .entry(*name) @@ -171,6 +184,25 @@ fn load_data() -> UnicodeData { } } + // Find all unassigned chars in the first 4 planes + for c in '\0'..='\u{3FFFD}' { + let cp = Codepoint::from_u32(c.into()).unwrap(); + if !assigned_chars.contains(&cp.value()) { + properties.entry("Cn_Planes_0_3").or_insert_with(Vec::new).push(Codepoints::Single(cp)); + } + } + + // For now, we hardcode the assigned/unassigned status of characters + // U+3FFFE and above. The assertion below must be kept in sync + // with the `is_unassigned()` method in `library/core/src/char/methods.rs`. + for c in '\u{3FFFE}'..=char::MAX { + assert_eq!( + assigned_chars.contains(&u32::from(c)), + matches!(c, '\u{E0001}' | '\u{E0020}'..='\u{E007F}' | '\u{E0100}'..='\u{E01EF}' | '\u{F0000}'..='\u{FFFFD}' | '\u{100000}'..='\u{10FFFD}'), + "{c:?}", + ); + } + for row in ucd_parse::parse::<_, ucd_parse::SpecialCaseMapping>(&UNICODE_DIRECTORY).unwrap() { if !row.conditions.is_empty() { // Skip conditional case mappings @@ -247,7 +279,7 @@ fn main() { modules.push((property.to_lowercase().to_string(), emitter.file)); table_file.push_str(&format!( - "// {:16}: {:5} bytes, {:6} codepoints in {:3} ranges (U+{:06X} - U+{:06X}) using {}\n", + "// {:28}: {:5} bytes, {:6} codepoints in {:3} ranges (U+{:06X} - U+{:06X}) using {}\n", property, emitter.bytes_used, datapoints, @@ -260,10 +292,10 @@ fn main() { } let (conversions, sizes) = case_mapping::generate_case_mapping(&unicode_data); for (name, (desc, size)) in ["to_lower", "to_upper", "to_title"].iter().zip(sizes) { - table_file.push_str(&format!("// {:16}: {:5} bytes, {desc}\n", name, size,)); + table_file.push_str(&format!("// {:28}: {:5} bytes, {desc}\n", name, size,)); total_bytes += size; } - table_file.push_str(&format!("// {:16}: {:5} bytes\n", "Total", total_bytes)); // Include the range search function table_file.push('\n');