From ffa84369f8bf3ce2ac8567dfd95d6b613d9fe2cc Mon Sep 17 00:00:00 2001 From: Jules Bertholet Date: Sun, 26 Apr 2026 18:11:27 -0400 Subject: [PATCH 1/4] Improve `core::char::methods.rs` docs And rename a struct field. --- library/core/src/char/methods.rs | 180 ++++++++++++++++--------------- library/core/src/fmt/mod.rs | 4 +- library/core/src/str/lossy.rs | 2 +- library/core/src/str/mod.rs | 2 +- library/core/src/unicode/mod.rs | 9 +- library/core/src/wtf8.rs | 2 +- 6 files changed, 105 insertions(+), 94 deletions(-) diff --git a/library/core/src/char/methods.rs b/library/core/src/char/methods.rs index 00b735e91a377..21575fc4bca74 100644 --- a/library/core/src/char/methods.rs +++ b/library/core/src/char/methods.rs @@ -93,13 +93,18 @@ impl char { /// The version of [Unicode](https://www.unicode.org/) that the Unicode parts of /// `char` and `str` methods are based on. /// - /// New versions of Unicode are released regularly and subsequently all methods - /// in the standard library depending on Unicode are updated. Therefore the - /// behavior of some `char` and `str` methods and the value of this constant - /// changes over time. This is *not* considered to be a breaking change. + /// New versions of Unicode are released regularly, and subsequently all methods + /// in the standard library depending on Unicode are updated. Therefore, the + /// behavior of some `char` and `str` methods, and the value of this constant, + /// change over time (within the boundaries of Unicode's [stability policies]). + /// This is *not* considered to be a breaking change. + /// + /// [stability policies]: https://www.unicode.org/policies/stability_policy.html /// /// The version numbering scheme is explained in - /// [Unicode 11.0 or later, Section 3.1 Versions of the Unicode Standard](https://www.unicode.org/versions/Unicode11.0.0/ch03.pdf#page=4). + /// [Section 3.1 (Version Numbering)] of the Unicode Standard.
+ /// + /// [Section 3.1 (Version Numbering)]: https://www.unicode.org/versions/latest/core-spec/chapter-3/#G49512 #[stable(feature = "assoc_char_consts", since = "1.52.0")] pub const UNICODE_VERSION: (u8, u8, u8) = crate::unicode::UNICODE_VERSION; @@ -480,7 +485,7 @@ impl char { '\\' => EscapeDebug::backslash(ascii::Char::ReverseSolidus), '\"' if args.escape_double_quote => EscapeDebug::backslash(ascii::Char::QuotationMark), '\'' if args.escape_single_quote => EscapeDebug::backslash(ascii::Char::Apostrophe), - _ if args.escape_grapheme_extended && self.is_grapheme_extended() => { + _ if args.escape_grapheme_extender && self.is_grapheme_extender() => { EscapeDebug::unicode(self) } _ if is_printable(self) => EscapeDebug::printable(self), @@ -753,11 +758,11 @@ impl char { /// Returns `true` if this `char` has the `Alphabetic` property. /// - /// `Alphabetic` is described in Chapter 4 (Character Properties) of the [Unicode Standard] and - /// specified in the [Unicode Character Database][ucd] [`DerivedCoreProperties.txt`]. + /// `Alphabetic` is [described] in Chapter 4 (Character Properties) of the Unicode Standard, and + /// [specified] in the Unicode Character Database [`DerivedCoreProperties.txt`]. /// - /// [Unicode Standard]: https://www.unicode.org/versions/latest/ - /// [ucd]: https://www.unicode.org/reports/tr44/ + /// [described]: https://www.unicode.org/versions/latest/core-spec/chapter-4/#G32524 + /// [specified]: https://www.unicode.org/reports/tr44/#Alphabetic /// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt /// /// # Examples @@ -786,11 +791,11 @@ impl char { /// Returns `true` if this `char` has the `Cased` property. /// A character is cased if and only if it is uppercase, lowercase, or titlecase. /// - /// `Cased` is described in Chapter 4 (Character Properties) of the [Unicode Standard] and - /// specified in the [Unicode Character Database][ucd] [`DerivedCoreProperties.txt`]. 
+ /// `Cased` is [described] in Chapter 3 (Conformance) of the Unicode Standard, and + /// [specified] in the Unicode Character Database [`DerivedCoreProperties.txt`]. /// - /// [Unicode Standard]: https://www.unicode.org/versions/latest/ - /// [ucd]: https://www.unicode.org/reports/tr44/ + /// [described]: https://www.unicode.org/versions/latest/core-spec/chapter-3/#G44595 + /// [specified]: https://www.unicode.org/reports/tr44/#Cased /// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt /// /// # Examples @@ -849,11 +854,11 @@ impl char { /// Returns `true` if this `char` has the `Lowercase` property. /// - /// `Lowercase` is described in Chapter 4 (Character Properties) of the [Unicode Standard] and - /// specified in the [Unicode Character Database][ucd] [`DerivedCoreProperties.txt`]. + /// `Lowercase` is [described] in Chapter 4 (Character Properties) of the Unicode Standard, and + /// [specified] in the Unicode Character Database [`DerivedCoreProperties.txt`]. /// - /// [Unicode Standard]: https://www.unicode.org/versions/latest/ - /// [ucd]: https://www.unicode.org/reports/tr44/ + /// [described]: https://www.unicode.org/versions/latest/core-spec/chapter-4/#G136255 + /// [specified]: https://www.unicode.org/reports/tr44/#Lowercase /// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt /// /// # Examples @@ -889,15 +894,15 @@ impl char { } } - /// Returns `true` if this `char` has the general category for titlecase letters. + /// Returns `true` if this `char` is in the general category for titlecase letters. /// Conceptually, these characters consist of an uppercase portion followed by a lowercase portion. /// - /// Titlecase letters (code points with the general category of `Lt`) are described in Chapter 4 - /// (Character Properties) of the [Unicode Standard] and specified in the [Unicode Character - /// Database][ucd] [`UnicodeData.txt`].
+ /// Titlecase letters (code points with the general category of `Lt`) are [described] in Chapter 4 + /// (Character Properties) of the Unicode Standard, and [specified] in the Unicode Character + /// Database [`UnicodeData.txt`]. /// - /// [Unicode Standard]: https://www.unicode.org/versions/latest/ - /// [ucd]: https://www.unicode.org/reports/tr44/ + /// [described]: https://www.unicode.org/versions/latest/core-spec/chapter-4/#G124722 + /// [specified]: https://www.unicode.org/reports/tr44/#GC_Values_Table /// [`UnicodeData.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt /// /// # Examples @@ -925,11 +930,11 @@ impl char { /// Returns `true` if this `char` has the `Uppercase` property. /// - /// `Uppercase` is described in Chapter 4 (Character Properties) of the [Unicode Standard] and - /// specified in the [Unicode Character Database][ucd] [`DerivedCoreProperties.txt`]. + /// `Uppercase` is [described] in Chapter 4 (Character Properties) of the Unicode Standard, and + /// [specified] in the Unicode Character Database [`DerivedCoreProperties.txt`]. /// - /// [Unicode Standard]: https://www.unicode.org/versions/latest/ - /// [ucd]: https://www.unicode.org/reports/tr44/ + /// [described]: https://www.unicode.org/versions/latest/core-spec/chapter-4/#G136255 + /// [specified]: https://www.unicode.org/reports/tr44/#Uppercase /// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt /// /// # Examples @@ -965,11 +970,41 @@ impl char { } } + /// Returns `true` if this `char` satisfies either [`is_alphabetic()`] or [`is_numeric()`]. 
+ /// + /// [`is_alphabetic()`]: Self::is_alphabetic + /// [`is_numeric()`]: Self::is_numeric + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// assert!('٣'.is_alphanumeric()); + /// assert!('7'.is_alphanumeric()); + /// assert!('৬'.is_alphanumeric()); + /// assert!('¾'.is_alphanumeric()); + /// assert!('①'.is_alphanumeric()); + /// assert!('K'.is_alphanumeric()); + /// assert!('و'.is_alphanumeric()); + /// assert!('藏'.is_alphanumeric()); + /// ``` + #[must_use] + #[stable(feature = "rust1", since = "1.0.0")] + #[inline] + pub fn is_alphanumeric(self) -> bool { + match self { + 'a'..='z' | 'A'..='Z' | '0'..='9' => true, + '\0'..='\u{A9}' => false, + _ => unicode::Alphabetic(self) || unicode::N(self), + } + } + /// Returns `true` if this `char` has the `White_Space` property. /// - /// `White_Space` is specified in the [Unicode Character Database][ucd] [`PropList.txt`]. + /// `White_Space` is [specified] in the Unicode Character Database [`PropList.txt`]. /// - /// [ucd]: https://www.unicode.org/reports/tr44/ + /// [specified]: https://www.unicode.org/reports/tr44/#White_Space /// [`PropList.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt /// /// # Examples @@ -999,44 +1034,15 @@ impl char { } } - /// Returns `true` if this `char` satisfies either [`is_alphabetic()`] or [`is_numeric()`]. 
- /// - /// [`is_alphabetic()`]: #method.is_alphabetic - /// [`is_numeric()`]: #method.is_numeric - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// assert!('٣'.is_alphanumeric()); - /// assert!('7'.is_alphanumeric()); - /// assert!('৬'.is_alphanumeric()); - /// assert!('¾'.is_alphanumeric()); - /// assert!('①'.is_alphanumeric()); - /// assert!('K'.is_alphanumeric()); - /// assert!('و'.is_alphanumeric()); - /// assert!('藏'.is_alphanumeric()); - /// ``` - #[must_use] - #[stable(feature = "rust1", since = "1.0.0")] - #[inline] - pub fn is_alphanumeric(self) -> bool { - match self { - 'a'..='z' | 'A'..='Z' | '0'..='9' => true, - '\0'..='\u{A9}' => false, - _ => unicode::Alphabetic(self) || unicode::N(self), - } - } - /// Returns `true` if this `char` has the general category for control codes. /// - /// Control codes (code points with the general category of `Cc`) are described in Chapter 4 - /// (Character Properties) of the [Unicode Standard] and specified in the [Unicode Character - /// Database][ucd] [`UnicodeData.txt`]. + /// Control codes (code points with the general category of `Cc`) are [described] in Chapter 23 + /// (Special Areas and Format Characters) of the Unicode Standard, and [specified] in the Unicode Character + /// Database [`UnicodeData.txt`]. The full set of Unicode control codes is + /// `'\0'..='\x1f' | '\x7f'..='\u{9f}'`, and will never change. 
/// - /// [Unicode Standard]: https://www.unicode.org/versions/latest/ - /// [ucd]: https://www.unicode.org/reports/tr44/ + /// [described]: https://www.unicode.org/versions/latest/core-spec/chapter-23/#G20365 + /// [specified]: https://www.unicode.org/reports/tr44/#GC_Values_Table /// [`UnicodeData.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt /// /// # Examples @@ -1044,8 +1050,9 @@ impl char { /// Basic usage: /// /// ``` - /// // U+009C, STRING TERMINATOR - /// assert!('œ'.is_control()); + /// assert!('\t'.is_control()); + /// assert!('\n'.is_control()); + /// assert!('\u{9C}'.is_control()); // STRING TERMINATOR /// assert!(!'q'.is_control()); /// ``` #[must_use] @@ -1061,16 +1068,15 @@ impl char { /// Returns `true` if this `char` has the `Grapheme_Extend` property. /// - /// `Grapheme_Extend` is described in [Unicode Standard Annex #29 (Unicode Text - /// Segmentation)][uax29] and specified in the [Unicode Character Database][ucd] - /// [`DerivedCoreProperties.txt`]. + /// `Grapheme_Extend` is [described] in Chapter 3 (Conformance) of the Unicode Standard, + /// and [specified] in the Unicode Character Database [`DerivedCoreProperties.txt`]. /// - /// [uax29]: https://www.unicode.org/reports/tr29/ - /// [ucd]: https://www.unicode.org/reports/tr44/ + /// [described]: https://www.unicode.org/versions/latest/core-spec/chapter-3/#G41165 + /// [specified]: https://www.unicode.org/reports/tr44/#Grapheme_Extend /// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt #[must_use] #[inline] - pub(crate) fn is_grapheme_extended(self) -> bool { + fn is_grapheme_extender(self) -> bool { self > '\u{02FF}' && unicode::Grapheme_Extend(self) } @@ -1078,12 +1084,12 @@ impl char { /// is used to implement context-dependent casing for the Greek letter sigma (uppercase Σ), /// which has two lowercase forms. 
/// - /// `Case_Ignorable` is [described][D136] in Chapter 3 (Conformance) of the Unicode Core Specification, - /// and specified in the [Unicode Character Database][ucd] [`DerivedCoreProperties.txt`]; + /// `Case_Ignorable` is [described] in Chapter 3 (Conformance) of the Unicode Core Specification, + /// and [specified] in the Unicode Character Database [`DerivedCoreProperties.txt`]; /// see those resources for more information. /// - /// [D136]: https://www.unicode.org/versions/latest/core-spec/chapter-3/#G63116 - /// [ucd]: https://www.unicode.org/reports/tr44/ + /// [described]: https://www.unicode.org/versions/latest/core-spec/chapter-3/#G63116 + /// [specified]: https://www.unicode.org/reports/tr44/#Case_Ignorable /// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt #[must_use] #[inline] @@ -1099,20 +1105,20 @@ impl char { /// Returns `true` if this `char` has one of the general categories for numbers. /// /// The general categories for numbers (`Nd` for decimal digits, `Nl` for letter-like numeric - /// characters, and `No` for other numeric characters) are specified in the [Unicode Character - /// Database][ucd] [`UnicodeData.txt`]. + /// characters, and `No` for other numeric characters) are [specified] in the Unicode Character + /// Database [`UnicodeData.txt`]. /// /// This method doesn't cover everything that could be considered a number, e.g. ideographic numbers like '三'. - /// If you want everything including characters with overlapping purposes then you might want to use - /// a unicode or language-processing library that exposes the appropriate character properties instead - /// of looking at the unicode categories. + /// If you want everything including characters with overlapping purposes, then you might want to use + /// a Unicode or language-processing library that exposes the appropriate character properties + /// (e.g. [`Numeric_Type`]) instead of looking at the Unicode categories. 
/// /// If you want to parse ASCII decimal digits (0-9) or ASCII base-N, use /// `is_ascii_digit` or `is_digit` instead. /// - /// [Unicode Standard]: https://www.unicode.org/versions/latest/ - /// [ucd]: https://www.unicode.org/reports/tr44/ + /// [specified]: https://www.unicode.org/reports/tr44/#GC_Values_Table /// [`UnicodeData.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt + /// [`Numeric_Type`]: https://www.unicode.org/reports/tr44/#Numeric_Type /// /// # Examples /// @@ -2151,8 +2157,8 @@ impl char { } pub(crate) struct EscapeDebugExtArgs { - /// Escape Extended Grapheme codepoints? - pub(crate) escape_grapheme_extended: bool, + /// Escape Grapheme Extender codepoints? + pub(crate) escape_grapheme_extender: bool, /// Escape single quotes? pub(crate) escape_single_quote: bool, @@ -2163,7 +2169,7 @@ pub(crate) struct EscapeDebugExtArgs { impl EscapeDebugExtArgs { pub(crate) const ESCAPE_ALL: Self = Self { - escape_grapheme_extended: true, + escape_grapheme_extender: true, escape_single_quote: true, escape_double_quote: true, }; diff --git a/library/core/src/fmt/mod.rs b/library/core/src/fmt/mod.rs index 9e5f693246f33..00694a653be2d 100644 --- a/library/core/src/fmt/mod.rs +++ b/library/core/src/fmt/mod.rs @@ -2941,7 +2941,7 @@ impl Debug for str { let mut chars = rest.chars(); if let Some(c) = chars.next() { let esc = c.escape_debug_ext(EscapeDebugExtArgs { - escape_grapheme_extended: true, + escape_grapheme_extender: true, escape_single_quote: false, escape_double_quote: true, }); @@ -2973,7 +2973,7 @@ impl Debug for char { fn fmt(&self, f: &mut Formatter<'_>) -> Result { f.write_char('\'')?; let esc = self.escape_debug_ext(EscapeDebugExtArgs { - escape_grapheme_extended: true, + escape_grapheme_extender: true, escape_single_quote: true, escape_double_quote: false, }); diff --git a/library/core/src/str/lossy.rs b/library/core/src/str/lossy.rs index d2dc650910f63..847ff265d3608 100644 --- a/library/core/src/str/lossy.rs +++ 
b/library/core/src/str/lossy.rs @@ -123,7 +123,7 @@ impl fmt::Debug for Debug<'_> { let mut from = 0; for (i, c) in valid.char_indices() { let esc = c.escape_debug_ext(EscapeDebugExtArgs { - escape_grapheme_extended: true, + escape_grapheme_extender: true, escape_single_quote: false, escape_double_quote: true, }); diff --git a/library/core/src/str/mod.rs b/library/core/src/str/mod.rs index 5af399ab1b34c..1fd3125154322 100644 --- a/library/core/src/str/mod.rs +++ b/library/core/src/str/mod.rs @@ -3195,7 +3195,7 @@ impl_fn_for_zst! { #[derive(Clone)] struct CharEscapeDebugContinue impl Fn = |c: char| -> char::EscapeDebug { c.escape_debug_ext(EscapeDebugExtArgs { - escape_grapheme_extended: false, + escape_grapheme_extender: false, escape_single_quote: true, escape_double_quote: true }) diff --git a/library/core/src/unicode/mod.rs b/library/core/src/unicode/mod.rs index 8b2c526a08878..fb4e5c65d0ce1 100644 --- a/library/core/src/unicode/mod.rs +++ b/library/core/src/unicode/mod.rs @@ -27,8 +27,13 @@ pub mod unicode_data; /// New versions of Unicode are released regularly and subsequently all methods /// in the standard library depending on Unicode are updated. Therefore the /// behavior of some `char` and `str` methods and the value of this constant -/// changes over time. This is *not* considered to be a breaking change. +/// change over time, within the boundaries of Unicode's [stability policies]. +/// This is *not* considered to be a breaking change. +/// +/// [stability policies]: https://www.unicode.org/policies/stability_policy.html /// /// The version numbering scheme is explained in -/// [Unicode 11.0 or later, Section 3.1 Versions of the Unicode Standard](https://www.unicode.org/versions/Unicode11.0.0/ch03.pdf#page=4). + /// [Section 3.1 (Version Numbering)] of the Unicode Standard.
+/// +/// [Section 3.1 (Version Numbering)]: https://www.unicode.org/versions/latest/core-spec/chapter-3/#G49512 pub const UNICODE_VERSION: (u8, u8, u8) = unicode_data::UNICODE_VERSION; diff --git a/library/core/src/wtf8.rs b/library/core/src/wtf8.rs index a0978c3dafb48..effb7a37d970c 100644 --- a/library/core/src/wtf8.rs +++ b/library/core/src/wtf8.rs @@ -147,7 +147,7 @@ impl fmt::Debug for Wtf8 { use crate::fmt::Write as _; for c in s.chars().flat_map(|c| { c.escape_debug_ext(EscapeDebugExtArgs { - escape_grapheme_extended: true, + escape_grapheme_extender: true, escape_single_quote: false, escape_double_quote: true, }) From da089a7d0027fab73f72d94c94df67acf34f418c Mon Sep 17 00:00:00 2001 From: Jules Bertholet Date: Sun, 26 Apr 2026 18:19:55 -0400 Subject: [PATCH 2/4] `char`: move `is_numeric` next to `is_alphanumeric` --- library/core/src/char/methods.rs | 88 ++++++++++++++++---------------- 1 file changed, 44 insertions(+), 44 deletions(-) diff --git a/library/core/src/char/methods.rs b/library/core/src/char/methods.rs index 21575fc4bca74..a73fd4d6ead01 100644 --- a/library/core/src/char/methods.rs +++ b/library/core/src/char/methods.rs @@ -970,6 +970,50 @@ impl char { } } + /// Returns `true` if this `char` has one of the general categories for numbers. + /// + /// The general categories for numbers (`Nd` for decimal digits, `Nl` for letter-like numeric + /// characters, and `No` for other numeric characters) are [specified] in the Unicode Character + /// Database [`UnicodeData.txt`]. + /// + /// This method doesn't cover everything that could be considered a number, e.g. ideographic numbers like '三'. + /// If you want everything including characters with overlapping purposes, then you might want to use + /// a Unicode or language-processing library that exposes the appropriate character properties + /// (e.g. [`Numeric_Type`]) instead of looking at the Unicode categories. 
+ /// + /// If you want to parse ASCII decimal digits (0-9) or ASCII base-N, use + /// `is_ascii_digit` or `is_digit` instead. + /// + /// [specified]: https://www.unicode.org/reports/tr44/#GC_Values_Table + /// [`UnicodeData.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt + /// [`Numeric_Type`]: https://www.unicode.org/reports/tr44/#Numeric_Type + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// assert!('٣'.is_numeric()); + /// assert!('7'.is_numeric()); + /// assert!('৬'.is_numeric()); + /// assert!('¾'.is_numeric()); + /// assert!('①'.is_numeric()); + /// assert!(!'K'.is_numeric()); + /// assert!(!'و'.is_numeric()); + /// assert!(!'藏'.is_numeric()); + /// assert!(!'三'.is_numeric()); + /// ``` + #[must_use] + #[stable(feature = "rust1", since = "1.0.0")] + #[inline] + pub fn is_numeric(self) -> bool { + match self { + '0'..='9' => true, + '\0'..='\u{B1}' => false, + _ => unicode::N(self), + } + } + /// Returns `true` if this `char` satisfies either [`is_alphabetic()`] or [`is_numeric()`]. /// /// [`is_alphabetic()`]: Self::is_alphabetic @@ -1102,50 +1146,6 @@ impl char { } } - /// Returns `true` if this `char` has one of the general categories for numbers. - /// - /// The general categories for numbers (`Nd` for decimal digits, `Nl` for letter-like numeric - /// characters, and `No` for other numeric characters) are [specified] in the Unicode Character - /// Database [`UnicodeData.txt`]. - /// - /// This method doesn't cover everything that could be considered a number, e.g. ideographic numbers like '三'. - /// If you want everything including characters with overlapping purposes, then you might want to use - /// a Unicode or language-processing library that exposes the appropriate character properties - /// (e.g. [`Numeric_Type`]) instead of looking at the Unicode categories. - /// - /// If you want to parse ASCII decimal digits (0-9) or ASCII base-N, use - /// `is_ascii_digit` or `is_digit` instead. 
- /// - /// [specified]: https://www.unicode.org/reports/tr44/#GC_Values_Table - /// [`UnicodeData.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt - /// [`Numeric_Type`]: https://www.unicode.org/reports/tr44/#Numeric_Type - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// assert!('٣'.is_numeric()); - /// assert!('7'.is_numeric()); - /// assert!('৬'.is_numeric()); - /// assert!('¾'.is_numeric()); - /// assert!('①'.is_numeric()); - /// assert!(!'K'.is_numeric()); - /// assert!(!'و'.is_numeric()); - /// assert!(!'藏'.is_numeric()); - /// assert!(!'三'.is_numeric()); - /// ``` - #[must_use] - #[stable(feature = "rust1", since = "1.0.0")] - #[inline] - pub fn is_numeric(self) -> bool { - match self { - '0'..='9' => true, - '\0'..='\u{B1}' => false, - _ => unicode::N(self), - } - } - /// Returns an iterator that yields the lowercase mapping of this `char` as one or more /// `char`s. /// From 35eb40a28c431f25d3c13e6fb02053b30553737b Mon Sep 17 00:00:00 2001 From: Jules Bertholet Date: Sun, 26 Apr 2026 18:36:06 -0400 Subject: [PATCH 3/4] Replace printables table with `unicode_data.rs` tables This gets rid of the `printable.py` script, ensuring that `unicode-table-generator` handles all our Unicode data table generation needs. I've elected to give each Unicode property its own table, instead of merging them all into one. This is slightly less efficient in terms of space, but should allow us to expose these tables in the future with public methods on `char`. 
--- library/core/src/char/methods.rs | 133 +++- library/core/src/unicode/mod.rs | 4 +- library/core/src/unicode/printable.py | 258 -------- library/core/src/unicode/printable.rs | 608 ------------------ library/core/src/unicode/unicode_data.rs | 175 ++++- library/coretests/tests/unicode.rs | 21 +- library/coretests/tests/unicode/test_data.rs | 246 +++++++ license-metadata.json | 32 +- src/tools/unicode-table-generator/src/main.rs | 49 +- 9 files changed, 614 insertions(+), 912 deletions(-) delete mode 100755 library/core/src/unicode/printable.py delete mode 100644 library/core/src/unicode/printable.rs diff --git a/library/core/src/char/methods.rs b/library/core/src/char/methods.rs index a73fd4d6ead01..785f781c1860c 100644 --- a/library/core/src/char/methods.rs +++ b/library/core/src/char/methods.rs @@ -5,7 +5,6 @@ use crate::panic::const_panic; use crate::slice; use crate::str::from_utf8_unchecked_mut; use crate::ub_checks::assert_unsafe_precondition; -use crate::unicode::printable::is_printable; use crate::unicode::{self, conversions}; impl char { @@ -478,18 +477,29 @@ impl char { #[inline] pub(crate) fn escape_debug_ext(self, args: EscapeDebugExtArgs) -> EscapeDebug { match self { - '\0' => EscapeDebug::backslash(ascii::Char::Digit0), - '\t' => EscapeDebug::backslash(ascii::Char::SmallT), - '\r' => EscapeDebug::backslash(ascii::Char::SmallR), - '\n' => EscapeDebug::backslash(ascii::Char::SmallN), - '\\' => EscapeDebug::backslash(ascii::Char::ReverseSolidus), + // Special escapes '\"' if args.escape_double_quote => EscapeDebug::backslash(ascii::Char::QuotationMark), '\'' if args.escape_single_quote => EscapeDebug::backslash(ascii::Char::Apostrophe), - _ if args.escape_grapheme_extender && self.is_grapheme_extender() => { + '\\' => EscapeDebug::backslash(ascii::Char::ReverseSolidus), + '\n' => EscapeDebug::backslash(ascii::Char::SmallN), + '\t' => EscapeDebug::backslash(ascii::Char::SmallT), + '\r' => EscapeDebug::backslash(ascii::Char::SmallR), + '\0' => 
EscapeDebug::backslash(ascii::Char::Digit0), + + // ASCII fast path + '\x20'..='\x7E' => EscapeDebug::printable(self), + + _ if self.is_control() + || self.is_private_use() + || self.is_whitespace() + || args.escape_grapheme_extender && self.is_grapheme_extender() + || self.is_format_control() + || self.is_unassigned() => + { EscapeDebug::unicode(self) } - _ if is_printable(self) => EscapeDebug::printable(self), - _ => EscapeDebug::unicode(self), + + _ => EscapeDebug::printable(self), } } @@ -1110,6 +1120,111 @@ impl char { matches!(self, '\0'..='\x1f' | '\x7f'..='\u{9f}') } + /// Returns `true` if this `char` has the general category for [private-use characters]. + /// These characters do not have an interpretation specified by Unicode; individual programs + /// and users are free to assign them whatever meaning they like. + /// + /// [private-use characters]: https://www.unicode.org/faq/private_use#private_use + /// + /// Private-use characters (code points with the general category of `Co`) are [described] in Chapter 23 + /// (Special Areas and Format Characters) of the Unicode Standard, and [specified] in the + /// Unicode Character Database [`UnicodeData.txt`]. The full set of private-use characters is + /// `'\u{E000}'..='\u{F8FF}' | '\u{F0000}'..='\u{FFFFD}' | '\u{100000}'..='\u{10FFFD}'`, + /// and will never change. + /// + /// [described]: https://www.unicode.org/versions/latest/core-spec/chapter-23/#G19184 + /// [specified]: https://www.unicode.org/reports/tr44/#GC_Values_Table + /// [`UnicodeData.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt + /// + #[must_use] + #[inline] + const fn is_private_use(self) -> bool { + // According to + // https://www.unicode.org/policies/stability_policy.html#Property_Value, + // the set of codepoints in `Co` will never change. + // So we can just hard-code the patterns to match against instead of using a table. 
+ matches!(self, '\u{E000}'..='\u{F8FF}' | '\u{F0000}'..='\u{FFFFD}' | '\u{100000}'..='\u{10FFFD}') + } + + /// Returns `true` if this `char` has the general category for format control characters. + /// + /// Format controls (code points with the general category of `Cf`) are [described] in Chapter 4 + /// (Character Properties) of the Unicode Standard, and [specified] in the Unicode Character + /// Database [`UnicodeData.txt`]. + /// + /// [described]: https://www.unicode.org/versions/latest/core-spec/chapter-4/#G134153 + /// [specified]: https://www.unicode.org/reports/tr44/#GC_Values_Table + /// [`UnicodeData.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt + /// + /// # Examples + /// + /// Basic usage: + /// + /// ```ignore(private) + /// assert!('\u{AD}'.is_format_control()); // SOFT HYPHEN + /// assert!('\u{200B}'.is_format_control()); // ZERO WIDTH SPACE + /// assert!('\u{E0041}'.is_format_control()); // TAG LATIN CAPITAL LETTER A + /// assert!('۝'.is_format_control()); // ARABIC END OF AYAH + /// assert!('𓐲'.is_format_control()); // EGYPTIAN HIEROGLYPH INSERT AT TOP START + /// assert!(!'q'.is_format_control()); + /// ``` + #[must_use] + #[inline] + fn is_format_control(self) -> bool { + self > '\u{AC}' && unicode::Cf(self) + } + + /// Returns `true` if this `char` has not yet been assigned a meaning by Unicode, as of + /// [`UNICODE_VERSION`]. + /// + /// [`UNICODE_VERSION`]: Self::UNICODE_VERSION + /// + /// These characters may have a meaning assigned in the future, + /// except for the 66 [noncharacters] which will never be assigned a meaning. + /// + /// [noncharacters]: https://www.unicode.org/faq/private_use#noncharacters + /// + /// Many of Unicode's [stability policies] apply only to assigned characters. 
+ /// + /// [stability policies]: https://www.unicode.org/policies/stability_policy.html + /// + /// Unassigned characters (code points with the general category of `Cn`) are [described] in Chapter 4 + /// (Character Properties) of the Unicode Standard, and [specified] in the Unicode Character Database + /// by their exclusion from [`UnicodeData.txt`]. + /// + /// [described]: https://www.unicode.org/versions/latest/core-spec/chapter-4/#G134153 + /// [specified]: https://www.unicode.org/reports/tr44/#GC_Values_Table + /// [`UnicodeData.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt + /// + /// # Examples + /// + /// Basic usage: + /// + /// ```ignore(private) + /// assert!('\u{FFFE}'.is_unassigned()); // noncharacter, will never be assigned + /// + /// //assert!('\u{7AAAA}'.is_unassigned()); // not currently assigned, but may be in the future, + /// // so we shouldn't rely on the current status + /// + /// assert!(!'γ'.is_unassigned()); // once a character is assigned, it stays assigned forever + /// ``` + #[must_use] + #[inline] + fn is_unassigned(self) -> bool { + match self { + '\0'..='\u{377}' => false, + '\u{378}'..='\u{3FFFD}' => unicode::Cn_planes_0_3(self), + // Assigned character ranges in planes 4 and above. + // `src/tools/unicode-table-generator/src/main.rs` asserts that this is correct + '\u{E0001}' + | '\u{E0020}'..='\u{E007F}' + | '\u{E0100}'..='\u{E01EF}' + | '\u{F0000}'..='\u{FFFFD}' + | '\u{100000}'..='\u{10FFFD}' => false, + _ => true, + } + } + /// Returns `true` if this `char` has the `Grapheme_Extend` property. 
/// /// `Grapheme_Extend` is [described] in Chapter 3 (Conformance) of the Unicode Standard, diff --git a/library/core/src/unicode/mod.rs b/library/core/src/unicode/mod.rs index fb4e5c65d0ce1..61acb08487057 100644 --- a/library/core/src/unicode/mod.rs +++ b/library/core/src/unicode/mod.rs @@ -9,6 +9,8 @@ pub use unicode_data::conversions; #[rustfmt::skip] pub(crate) use unicode_data::alphabetic::lookup as Alphabetic; pub(crate) use unicode_data::case_ignorable::lookup as Case_Ignorable; +pub(crate) use unicode_data::cf::lookup as Cf; +pub(crate) use unicode_data::cn_planes_0_3::lookup as Cn_planes_0_3; pub(crate) use unicode_data::grapheme_extend::lookup as Grapheme_Extend; pub(crate) use unicode_data::lowercase::lookup as Lowercase; pub(crate) use unicode_data::lt::lookup as Lt; @@ -16,8 +18,6 @@ pub(crate) use unicode_data::n::lookup as N; pub(crate) use unicode_data::uppercase::lookup as Uppercase; pub(crate) use unicode_data::white_space::lookup as White_Space; -pub(crate) mod printable; - #[allow(unreachable_pub)] pub mod unicode_data; diff --git a/library/core/src/unicode/printable.py b/library/core/src/unicode/printable.py deleted file mode 100755 index 260fa9f9e6ad2..0000000000000 --- a/library/core/src/unicode/printable.py +++ /dev/null @@ -1,258 +0,0 @@ -#!/usr/bin/env python - -# This script uses the following Unicode tables: -# - UnicodeData.txt - - -from collections import namedtuple -import csv -import os -import subprocess - -NUM_CODEPOINTS = 0x110000 - - -def to_ranges(iter): - current = None - for i in iter: - if current is None or i != current[1] or i in (0x10000, 0x20000): - if current is not None: - yield tuple(current) - current = [i, i + 1] - else: - current[1] += 1 - if current is not None: - yield tuple(current) - - -def get_escaped(codepoints): - for c in codepoints: - if (c.class_ or "Cn") in "Cc Cf Cs Co Cn Zl Zp Zs".split() and c.value != ord( - " " - ): - yield c.value - - -def get_file(f): - try: - return open(os.path.basename(f)) - 
except FileNotFoundError: - subprocess.run(["curl", "-O", f], check=True) - return open(os.path.basename(f)) - - -Codepoint = namedtuple("Codepoint", "value class_") - - -def get_codepoints(f): - r = csv.reader(f, delimiter=";") - prev_codepoint = 0 - class_first = None - for row in r: - codepoint = int(row[0], 16) - name = row[1] - class_ = row[2] - - if class_first is not None: - if not name.endswith("Last>"): - raise ValueError("Missing Last after First") - - for c in range(prev_codepoint + 1, codepoint): - yield Codepoint(c, class_first) - - class_first = None - if name.endswith("First>"): - class_first = class_ - - yield Codepoint(codepoint, class_) - prev_codepoint = codepoint - - if class_first is not None: - raise ValueError("Missing Last after First") - - for c in range(prev_codepoint + 1, NUM_CODEPOINTS): - yield Codepoint(c, None) - - -def compress_singletons(singletons): - uppers = [] # (upper, # items in lowers) - lowers = [] - - for i in singletons: - upper = i >> 8 - lower = i & 0xFF - if len(uppers) == 0 or uppers[-1][0] != upper: - uppers.append((upper, 1)) - else: - upper, count = uppers[-1] - uppers[-1] = upper, count + 1 - lowers.append(lower) - - return uppers, lowers - - -def compress_normal(normal): - # lengths 0x00..0x7f are encoded as 00, 01, ..., 7e, 7f - # lengths 0x80..0x7fff are encoded as 80 80, 80 81, ..., ff fe, ff ff - compressed = [] # [truelen, (truelenaux), falselen, (falselenaux)] - - prev_start = 0 - for start, count in normal: - truelen = start - prev_start - falselen = count - prev_start = start + count - - assert truelen < 0x8000 and falselen < 0x8000 - entry = [] - if truelen > 0x7F: - entry.append(0x80 | (truelen >> 8)) - entry.append(truelen & 0xFF) - else: - entry.append(truelen & 0x7F) - if falselen > 0x7F: - entry.append(0x80 | (falselen >> 8)) - entry.append(falselen & 0xFF) - else: - entry.append(falselen & 0x7F) - - compressed.append(entry) - - return compressed - - -def print_singletons(uppers, lowers, uppersname, 
lowersname): - print("#[rustfmt::skip]") - print("const {}: &[(u8, u8)] = &[".format(uppersname)) - for u, c in uppers: - print(" ({:#04x}, {}),".format(u, c)) - print("];") - print("#[rustfmt::skip]") - print("const {}: &[u8] = &[".format(lowersname)) - for i in range(0, len(lowers), 8): - print( - " {}".format(" ".join("{:#04x},".format(x) for x in lowers[i : i + 8])) - ) - print("];") - - -def print_normal(normal, normalname): - print("#[rustfmt::skip]") - print("const {}: &[u8] = &[".format(normalname)) - for v in normal: - print(" {}".format(" ".join("{:#04x},".format(i) for i in v))) - print("];") - - -def main(): - file = get_file("https://www.unicode.org/Public/UNIDATA/UnicodeData.txt") - - codepoints = get_codepoints(file) - - CUTOFF = 0x10000 - singletons0 = [] - singletons1 = [] - normal0 = [] - normal1 = [] - extra = [] - - for a, b in to_ranges(get_escaped(codepoints)): - if a > 2 * CUTOFF: - extra.append((a, b - a)) - elif a == b - 1: - if a & CUTOFF: - singletons1.append(a & ~CUTOFF) - else: - singletons0.append(a) - elif a == b - 2: - if a & CUTOFF: - singletons1.append(a & ~CUTOFF) - singletons1.append((a + 1) & ~CUTOFF) - else: - singletons0.append(a) - singletons0.append(a + 1) - else: - if a >= 2 * CUTOFF: - extra.append((a, b - a)) - elif a & CUTOFF: - normal1.append((a & ~CUTOFF, b - a)) - else: - normal0.append((a, b - a)) - - singletons0u, singletons0l = compress_singletons(singletons0) - singletons1u, singletons1l = compress_singletons(singletons1) - normal0 = compress_normal(normal0) - normal1 = compress_normal(normal1) - - print("""\ -// NOTE: The following code was generated by "library/core/src/unicode/printable.py", -// do not edit directly! 
- -fn check(x: u16, singletonuppers: &[(u8, u8)], singletonlowers: &[u8], normal: &[u8]) -> bool { - let xupper = (x >> 8) as u8; - let mut lowerstart = 0; - for &(upper, lowercount) in singletonuppers { - let lowerend = lowerstart + lowercount as usize; - if xupper == upper { - for &lower in &singletonlowers[lowerstart..lowerend] { - if lower == x as u8 { - return false; - } - } - } else if xupper < upper { - break; - } - lowerstart = lowerend; - } - - let mut x = x as i32; - let mut normal = normal.iter().cloned(); - let mut current = true; - while let Some(v) = normal.next() { - let len = if v & 0x80 != 0 { - ((v & 0x7f) as i32) << 8 | normal.next().unwrap() as i32 - } else { - v as i32 - }; - x -= len; - if x < 0 { - break; - } - current = !current; - } - current -} - -pub(crate) fn is_printable(x: char) -> bool { - let x = x as u32; - let lower = x as u16; - - if x < 32 { - // ASCII fast path - false - } else if x < 127 { - // ASCII fast path - true - } else if x < 0x10000 { - check(lower, SINGLETONS0U, SINGLETONS0L, NORMAL0) - } else if x < 0x20000 { - check(lower, SINGLETONS1U, SINGLETONS1L, NORMAL1) - } else {\ -""") - for a, b in extra: - print(" if 0x{:x} <= x && x < 0x{:x} {{".format(a, a + b)) - print(" return false;") - print(" }") - print("""\ - true - } -}\ -""") - print() - print_singletons(singletons0u, singletons0l, "SINGLETONS0U", "SINGLETONS0L") - print_singletons(singletons1u, singletons1l, "SINGLETONS1U", "SINGLETONS1L") - print_normal(normal0, "NORMAL0") - print_normal(normal1, "NORMAL1") - - -if __name__ == "__main__": - main() diff --git a/library/core/src/unicode/printable.rs b/library/core/src/unicode/printable.rs deleted file mode 100644 index 68e1c8ae31c06..0000000000000 --- a/library/core/src/unicode/printable.rs +++ /dev/null @@ -1,608 +0,0 @@ -// NOTE: The following code was generated by "library/core/src/unicode/printable.py", -// do not edit directly! 
- -fn check(x: u16, singletonuppers: &[(u8, u8)], singletonlowers: &[u8], normal: &[u8]) -> bool { - let xupper = (x >> 8) as u8; - let mut lowerstart = 0; - for &(upper, lowercount) in singletonuppers { - let lowerend = lowerstart + lowercount as usize; - if xupper == upper { - for &lower in &singletonlowers[lowerstart..lowerend] { - if lower == x as u8 { - return false; - } - } - } else if xupper < upper { - break; - } - lowerstart = lowerend; - } - - let mut x = x as i32; - let mut normal = normal.iter().cloned(); - let mut current = true; - while let Some(v) = normal.next() { - let len = if v & 0x80 != 0 { - ((v & 0x7f) as i32) << 8 | normal.next().unwrap() as i32 - } else { - v as i32 - }; - x -= len; - if x < 0 { - break; - } - current = !current; - } - current -} - -pub(crate) fn is_printable(x: char) -> bool { - let x = x as u32; - let lower = x as u16; - - if x < 32 { - // ASCII fast path - false - } else if x < 127 { - // ASCII fast path - true - } else if x < 0x10000 { - check(lower, SINGLETONS0U, SINGLETONS0L, NORMAL0) - } else if x < 0x20000 { - check(lower, SINGLETONS1U, SINGLETONS1L, NORMAL1) - } else { - if 0x2a6e0 <= x && x < 0x2a700 { - return false; - } - if 0x2b81e <= x && x < 0x2b820 { - return false; - } - if 0x2ceae <= x && x < 0x2ceb0 { - return false; - } - if 0x2ebe1 <= x && x < 0x2ebf0 { - return false; - } - if 0x2ee5e <= x && x < 0x2f800 { - return false; - } - if 0x2fa1e <= x && x < 0x30000 { - return false; - } - if 0x3134b <= x && x < 0x31350 { - return false; - } - if 0x3347a <= x && x < 0xe0100 { - return false; - } - if 0xe01f0 <= x && x < 0x110000 { - return false; - } - true - } -} - -#[rustfmt::skip] -const SINGLETONS0U: &[(u8, u8)] = &[ - (0x00, 1), - (0x03, 5), - (0x05, 6), - (0x06, 2), - (0x07, 6), - (0x08, 7), - (0x09, 17), - (0x0a, 28), - (0x0b, 25), - (0x0c, 25), - (0x0d, 16), - (0x0e, 12), - (0x0f, 4), - (0x10, 3), - (0x12, 18), - (0x13, 9), - (0x16, 1), - (0x17, 4), - (0x18, 1), - (0x19, 3), - (0x1a, 9), - (0x1b, 1), - 
(0x1c, 2), - (0x1f, 22), - (0x20, 3), - (0x2b, 2), - (0x2d, 11), - (0x2e, 1), - (0x30, 4), - (0x31, 2), - (0x32, 1), - (0xa9, 2), - (0xaa, 4), - (0xab, 8), - (0xfa, 2), - (0xfb, 5), - (0xfe, 3), - (0xff, 9), -]; -#[rustfmt::skip] -const SINGLETONS0L: &[u8] = &[ - 0xad, 0x78, 0x79, 0x8b, 0x8d, 0xa2, 0x30, 0x57, - 0x58, 0x8b, 0x8c, 0x90, 0x1c, 0xdd, 0x0e, 0x0f, - 0x4b, 0x4c, 0xfb, 0xfc, 0x2e, 0x2f, 0x3f, 0x5c, - 0x5d, 0x5f, 0xe2, 0x84, 0x8d, 0x8e, 0x91, 0x92, - 0xa9, 0xb1, 0xba, 0xbb, 0xc5, 0xc6, 0xc9, 0xca, - 0xde, 0xe4, 0xe5, 0xff, 0x00, 0x04, 0x11, 0x12, - 0x29, 0x31, 0x34, 0x37, 0x3a, 0x3b, 0x3d, 0x49, - 0x4a, 0x5d, 0x84, 0x8e, 0x92, 0xa9, 0xb1, 0xb4, - 0xba, 0xbb, 0xc6, 0xca, 0xce, 0xcf, 0xe4, 0xe5, - 0x00, 0x04, 0x0d, 0x0e, 0x11, 0x12, 0x29, 0x31, - 0x34, 0x3a, 0x3b, 0x45, 0x46, 0x49, 0x4a, 0x5e, - 0x64, 0x65, 0x84, 0x91, 0x9b, 0x9d, 0xc9, 0xce, - 0xcf, 0x0d, 0x11, 0x29, 0x3a, 0x3b, 0x45, 0x49, - 0x57, 0x5b, 0x5e, 0x5f, 0x64, 0x65, 0x8d, 0x91, - 0xa9, 0xb4, 0xba, 0xbb, 0xc5, 0xc9, 0xdf, 0xe4, - 0xe5, 0xf0, 0x0d, 0x11, 0x45, 0x49, 0x64, 0x65, - 0x80, 0x84, 0xb2, 0xbc, 0xbe, 0xbf, 0xd5, 0xd7, - 0xf0, 0xf1, 0x83, 0x85, 0x8b, 0xa4, 0xa6, 0xbe, - 0xbf, 0xc5, 0xc7, 0xcf, 0xda, 0xdb, 0x48, 0x98, - 0xbd, 0xcd, 0xc6, 0xce, 0xcf, 0x49, 0x4e, 0x4f, - 0x57, 0x59, 0x5e, 0x5f, 0x89, 0x8e, 0x8f, 0xb1, - 0xb6, 0xb7, 0xbf, 0xc1, 0xc6, 0xc7, 0xd7, 0x11, - 0x16, 0x17, 0x5b, 0x5c, 0xf6, 0xf7, 0xfe, 0xff, - 0x80, 0x6d, 0x71, 0xde, 0xdf, 0x0e, 0x1f, 0x6e, - 0x6f, 0x1c, 0x1d, 0x5f, 0x7d, 0x7e, 0xae, 0xaf, - 0xde, 0xdf, 0x4d, 0xbb, 0xbc, 0x16, 0x17, 0x1e, - 0x1f, 0x46, 0x47, 0x4e, 0x4f, 0x58, 0x5a, 0x5c, - 0x5e, 0x7e, 0x7f, 0xb5, 0xc5, 0xd4, 0xd5, 0xdc, - 0xf0, 0xf1, 0xf5, 0x72, 0x73, 0x8f, 0x74, 0x75, - 0x26, 0x2e, 0x2f, 0xa7, 0xaf, 0xb7, 0xbf, 0xc7, - 0xcf, 0xd7, 0xdf, 0x9a, 0x00, 0x40, 0x97, 0x98, - 0x30, 0x8f, 0x1f, 0xce, 0xff, 0x4e, 0x4f, 0x5a, - 0x5b, 0x07, 0x08, 0x0f, 0x10, 0x27, 0x2f, 0xee, - 0xef, 0x6e, 0x6f, 0x37, 0x3d, 0x3f, 0x42, 0x45, - 0x53, 0x67, 0x75, 0xc8, 0xc9, 0xd0, 
0xd1, 0xd8, - 0xd9, 0xe7, 0xfe, 0xff, -]; -#[rustfmt::skip] -const SINGLETONS1U: &[(u8, u8)] = &[ - (0x00, 6), - (0x01, 1), - (0x03, 1), - (0x04, 2), - (0x05, 7), - (0x07, 2), - (0x08, 8), - (0x09, 2), - (0x0a, 5), - (0x0b, 2), - (0x0e, 4), - (0x10, 1), - (0x11, 2), - (0x12, 5), - (0x13, 28), - (0x14, 1), - (0x15, 2), - (0x17, 2), - (0x19, 13), - (0x1c, 5), - (0x1d, 8), - (0x1f, 1), - (0x24, 1), - (0x6a, 4), - (0x6b, 2), - (0x6e, 2), - (0xaf, 3), - (0xb1, 2), - (0xbc, 2), - (0xcf, 2), - (0xd1, 2), - (0xd4, 12), - (0xd5, 9), - (0xd6, 2), - (0xd7, 2), - (0xda, 1), - (0xe0, 5), - (0xe1, 2), - (0xe6, 1), - (0xe7, 4), - (0xe8, 2), - (0xee, 32), - (0xf0, 4), - (0xf8, 2), - (0xfa, 5), - (0xfb, 1), -]; -#[rustfmt::skip] -const SINGLETONS1L: &[u8] = &[ - 0x0c, 0x27, 0x3b, 0x3e, 0x4e, 0x4f, 0x8f, 0x9e, - 0x9e, 0x9f, 0x7b, 0x8b, 0x93, 0x96, 0xa2, 0xb2, - 0xba, 0x86, 0xb1, 0x06, 0x07, 0x09, 0x36, 0x3d, - 0x3e, 0x56, 0xf3, 0xd0, 0xd1, 0x04, 0x14, 0x18, - 0x36, 0x37, 0x56, 0x57, 0x7f, 0xaa, 0xae, 0xaf, - 0xbd, 0x35, 0xe0, 0x12, 0x87, 0x89, 0x8e, 0x9e, - 0x04, 0x0d, 0x0e, 0x11, 0x12, 0x29, 0x31, 0x34, - 0x3a, 0x45, 0x46, 0x49, 0x4a, 0x4e, 0x4f, 0x64, - 0x65, 0x8a, 0x8c, 0x8d, 0x8f, 0xb6, 0xc1, 0xc3, - 0xc4, 0xc6, 0xcb, 0xd6, 0x5c, 0xb6, 0xb7, 0x1b, - 0x1c, 0x07, 0x08, 0x0a, 0x0b, 0x14, 0x17, 0x36, - 0x39, 0x3a, 0xa8, 0xa9, 0xd8, 0xd9, 0x09, 0x37, - 0x90, 0x91, 0xa8, 0x07, 0x0a, 0x3b, 0x3e, 0x66, - 0x69, 0x8f, 0x92, 0x11, 0x6f, 0x5f, 0xbf, 0xee, - 0xef, 0x5a, 0x62, 0xb9, 0xba, 0xf4, 0xfc, 0xff, - 0x53, 0x54, 0x9a, 0x9b, 0x2e, 0x2f, 0x27, 0x28, - 0x55, 0x9d, 0xa0, 0xa1, 0xa3, 0xa4, 0xa7, 0xa8, - 0xad, 0xba, 0xbc, 0xc4, 0x06, 0x0b, 0x0c, 0x15, - 0x1d, 0x3a, 0x3f, 0x45, 0x51, 0xa6, 0xa7, 0xcc, - 0xcd, 0xa0, 0x07, 0x19, 0x1a, 0x22, 0x25, 0x3e, - 0x3f, 0xdf, 0xe7, 0xec, 0xef, 0xff, 0xc5, 0xc6, - 0x04, 0x20, 0x23, 0x25, 0x26, 0x28, 0x33, 0x38, - 0x3a, 0x48, 0x4a, 0x4c, 0x50, 0x53, 0x55, 0x56, - 0x58, 0x5a, 0x5c, 0x5e, 0x60, 0x63, 0x65, 0x66, - 0x6b, 0x73, 0x78, 0x7d, 0x7f, 0x8a, 0xa4, 
0xaa, - 0xaf, 0xb0, 0xc0, 0xd0, 0xae, 0xaf, 0x6e, 0x6f, - 0xc7, 0xdd, 0xde, 0x93, -]; -#[rustfmt::skip] -const NORMAL0: &[u8] = &[ - 0x00, 0x20, - 0x5f, 0x22, - 0x82, 0xdf, 0x04, - 0x82, 0x44, 0x08, - 0x1b, 0x04, - 0x06, 0x11, - 0x81, 0xac, 0x0e, - 0x80, 0xab, 0x05, - 0x20, 0x07, - 0x81, 0x1c, 0x03, - 0x19, 0x08, - 0x01, 0x04, - 0x2f, 0x04, - 0x34, 0x04, - 0x07, 0x03, - 0x01, 0x07, - 0x06, 0x07, - 0x11, 0x0a, - 0x50, 0x0f, - 0x12, 0x07, - 0x55, 0x07, - 0x03, 0x04, - 0x1c, 0x0a, - 0x09, 0x03, - 0x08, 0x03, - 0x07, 0x03, - 0x02, 0x03, - 0x03, 0x03, - 0x0c, 0x04, - 0x05, 0x03, - 0x0b, 0x06, - 0x01, 0x0e, - 0x15, 0x05, - 0x4e, 0x07, - 0x1b, 0x07, - 0x57, 0x07, - 0x02, 0x05, - 0x18, 0x0c, - 0x50, 0x04, - 0x43, 0x03, - 0x2d, 0x03, - 0x01, 0x04, - 0x11, 0x06, - 0x0f, 0x0c, - 0x3a, 0x04, - 0x1d, 0x25, - 0x5f, 0x20, - 0x6d, 0x04, - 0x6a, 0x25, - 0x80, 0xc8, 0x05, - 0x82, 0xb0, 0x03, - 0x1a, 0x06, - 0x82, 0xfd, 0x03, - 0x59, 0x07, - 0x16, 0x09, - 0x18, 0x09, - 0x14, 0x0c, - 0x14, 0x0c, - 0x6a, 0x06, - 0x0a, 0x06, - 0x1a, 0x06, - 0x59, 0x07, - 0x2b, 0x05, - 0x46, 0x0a, - 0x2c, 0x04, - 0x0c, 0x04, - 0x01, 0x03, - 0x31, 0x0b, - 0x2c, 0x04, - 0x1a, 0x06, - 0x0b, 0x03, - 0x80, 0xac, 0x06, - 0x0a, 0x06, - 0x4c, 0x14, - 0x80, 0xf4, 0x08, - 0x3c, 0x03, - 0x0f, 0x03, - 0x3e, 0x05, - 0x38, 0x08, - 0x2b, 0x05, - 0x82, 0xff, 0x11, - 0x18, 0x08, - 0x2f, 0x11, - 0x2d, 0x03, - 0x22, 0x0e, - 0x21, 0x0f, - 0x80, 0x8c, 0x04, - 0x82, 0x9a, 0x16, - 0x0b, 0x15, - 0x88, 0x94, 0x05, - 0x2f, 0x05, - 0x3b, 0x07, - 0x02, 0x0e, - 0x18, 0x09, - 0x80, 0xbe, 0x22, - 0x74, 0x0c, - 0x80, 0xd6, 0x1a, - 0x81, 0x10, 0x05, - 0x80, 0xe1, 0x09, - 0xf2, 0x9e, 0x03, - 0x37, 0x09, - 0x81, 0x5c, 0x14, - 0x80, 0xb8, 0x08, - 0x80, 0xdd, 0x14, - 0x3c, 0x03, - 0x0a, 0x06, - 0x38, 0x08, - 0x46, 0x08, - 0x0c, 0x06, - 0x74, 0x0b, - 0x1e, 0x03, - 0x5a, 0x04, - 0x59, 0x09, - 0x80, 0x83, 0x18, - 0x1c, 0x0a, - 0x16, 0x09, - 0x4c, 0x04, - 0x80, 0x8a, 0x06, - 0xab, 0xa4, 0x0c, - 0x17, 0x04, - 0x31, 0xa1, 0x04, - 0x81, 0xda, 
0x26, - 0x07, 0x0c, - 0x05, 0x05, - 0x82, 0xb3, 0x20, - 0x2a, 0x06, - 0x4c, 0x04, - 0x80, 0x8d, 0x04, - 0x80, 0xbe, 0x03, - 0x1b, 0x03, - 0x0f, 0x0d, -]; -#[rustfmt::skip] -const NORMAL1: &[u8] = &[ - 0x5e, 0x22, - 0x7b, 0x05, - 0x03, 0x04, - 0x2d, 0x03, - 0x66, 0x03, - 0x01, 0x2f, - 0x2e, 0x80, 0x82, - 0x1d, 0x03, - 0x31, 0x0f, - 0x1c, 0x04, - 0x24, 0x09, - 0x1e, 0x05, - 0x2b, 0x05, - 0x44, 0x04, - 0x0e, 0x2a, - 0x80, 0xaa, 0x06, - 0x24, 0x04, - 0x24, 0x04, - 0x28, 0x08, - 0x34, 0x0b, - 0x4e, 0x03, - 0x34, 0x0c, - 0x81, 0x37, 0x09, - 0x16, 0x0a, - 0x08, 0x18, - 0x3b, 0x45, - 0x39, 0x03, - 0x63, 0x08, - 0x09, 0x30, - 0x16, 0x05, - 0x21, 0x03, - 0x1b, 0x05, - 0x1b, 0x26, - 0x38, 0x04, - 0x4b, 0x05, - 0x2f, 0x04, - 0x0a, 0x07, - 0x09, 0x07, - 0x40, 0x20, - 0x27, 0x04, - 0x0c, 0x09, - 0x36, 0x03, - 0x3a, 0x05, - 0x1a, 0x07, - 0x04, 0x0c, - 0x07, 0x50, - 0x49, 0x37, - 0x33, 0x0d, - 0x33, 0x07, - 0x2e, 0x08, - 0x0a, 0x06, - 0x26, 0x03, - 0x1d, 0x08, - 0x02, 0x80, 0xd0, - 0x52, 0x10, - 0x06, 0x08, - 0x09, 0x21, - 0x2e, 0x08, - 0x2a, 0x16, - 0x1a, 0x26, - 0x1c, 0x14, - 0x17, 0x09, - 0x4e, 0x04, - 0x24, 0x09, - 0x44, 0x0d, - 0x19, 0x07, - 0x0a, 0x06, - 0x48, 0x08, - 0x27, 0x09, - 0x75, 0x0b, - 0x42, 0x3e, - 0x2a, 0x06, - 0x3b, 0x05, - 0x0a, 0x06, - 0x51, 0x06, - 0x01, 0x05, - 0x10, 0x03, - 0x05, 0x0b, - 0x59, 0x08, - 0x02, 0x1d, - 0x62, 0x1e, - 0x48, 0x08, - 0x0a, 0x80, 0xa6, - 0x5e, 0x22, - 0x45, 0x0b, - 0x0a, 0x06, - 0x0d, 0x13, - 0x3a, 0x06, - 0x0a, 0x06, - 0x14, 0x1c, - 0x2c, 0x04, - 0x17, 0x80, 0xb9, - 0x3c, 0x64, - 0x53, 0x0c, - 0x48, 0x09, - 0x0a, 0x46, - 0x45, 0x1b, - 0x48, 0x08, - 0x53, 0x0d, - 0x49, 0x07, - 0x0a, 0x56, - 0x08, 0x58, - 0x22, 0x0e, - 0x0a, 0x06, - 0x46, 0x0a, - 0x1d, 0x03, - 0x47, 0x49, - 0x37, 0x03, - 0x0e, 0x08, - 0x0a, 0x06, - 0x39, 0x07, - 0x0a, 0x06, - 0x2c, 0x04, - 0x0a, 0x80, 0xf6, - 0x19, 0x07, - 0x3b, 0x03, - 0x1d, 0x55, - 0x01, 0x0f, - 0x32, 0x0d, - 0x83, 0x9b, 0x66, - 0x75, 0x0b, - 0x80, 0xc4, 0x8a, 0x4c, - 0x63, 0x0d, - 0x84, 0x30, 
0x10, - 0x16, 0x0a, - 0x8f, 0x9b, 0x05, - 0x82, 0x47, 0x9a, 0xb9, - 0x3a, 0x86, 0xc6, - 0x82, 0x39, 0x07, - 0x2a, 0x04, - 0x5c, 0x06, - 0x26, 0x0a, - 0x46, 0x0a, - 0x28, 0x05, - 0x13, 0x81, 0xb0, - 0x3a, 0x80, 0xc6, - 0x5b, 0x05, - 0x34, 0x2c, - 0x4b, 0x04, - 0x39, 0x07, - 0x11, 0x40, - 0x05, 0x0b, - 0x07, 0x09, - 0x9c, 0xd6, 0x29, - 0x20, 0x61, - 0x73, 0xa1, 0xfd, - 0x81, 0x33, 0x0f, - 0x01, 0x1d, - 0x06, 0x0e, - 0x04, 0x08, - 0x81, 0x8c, 0x89, 0x04, - 0x6b, 0x05, - 0x0d, 0x03, - 0x09, 0x07, - 0x10, 0x8f, 0x60, - 0x80, 0xfd, 0x03, - 0x81, 0xb4, 0x06, - 0x17, 0x0f, - 0x11, 0x0f, - 0x47, 0x09, - 0x74, 0x3c, - 0x80, 0xf6, 0x0a, - 0x73, 0x08, - 0x70, 0x15, - 0x46, 0x7a, - 0x14, 0x0c, - 0x14, 0x0c, - 0x57, 0x09, - 0x19, 0x80, 0x87, - 0x81, 0x47, 0x03, - 0x85, 0x42, 0x0f, - 0x15, 0x84, 0x50, - 0x1f, 0x06, - 0x06, 0x80, 0xd5, - 0x2b, 0x05, - 0x3e, 0x21, - 0x01, 0x70, - 0x2d, 0x03, - 0x1a, 0x04, - 0x02, 0x81, 0x40, - 0x1f, 0x11, - 0x3a, 0x05, - 0x01, 0x81, 0xd0, - 0x2a, 0x80, 0xd6, - 0x2b, 0x04, - 0x01, 0x80, 0xc0, - 0x36, 0x08, - 0x02, 0x80, 0xe0, - 0x80, 0xf7, 0x29, - 0x4c, 0x04, - 0x0a, 0x04, - 0x02, 0x83, 0x11, - 0x44, 0x4c, - 0x3d, 0x80, 0xc2, - 0x3c, 0x06, - 0x01, 0x04, - 0x55, 0x05, - 0x1b, 0x34, - 0x02, 0x81, 0x0e, - 0x2c, 0x04, - 0x64, 0x0c, - 0x56, 0x0a, - 0x80, 0xae, 0x38, - 0x1d, 0x0d, - 0x2c, 0x04, - 0x09, 0x07, - 0x02, 0x0e, - 0x06, 0x80, 0x9a, - 0x83, 0xd9, 0x03, - 0x11, 0x03, - 0x0d, 0x03, - 0x80, 0xda, 0x06, - 0x0c, 0x04, - 0x01, 0x0f, - 0x0c, 0x04, - 0x38, 0x08, - 0x0a, 0x06, - 0x28, 0x08, - 0x2c, 0x04, - 0x02, 0x0e, - 0x09, 0x27, - 0x81, 0x58, 0x08, - 0x1d, 0x03, - 0x0b, 0x03, - 0x3b, 0x04, - 0x1e, 0x04, - 0x0a, 0x07, - 0x80, 0xfb, 0x84, 0x05, -]; diff --git a/library/core/src/unicode/unicode_data.rs b/library/core/src/unicode/unicode_data.rs index 83d3808051840..e5bcdc270ccfb 100644 --- a/library/core/src/unicode/unicode_data.rs +++ b/library/core/src/unicode/unicode_data.rs @@ -1,16 +1,18 @@ //! 
This file is generated by `./x run src/tools/unicode-table-generator`; do not edit manually! -// Alphabetic : 1723 bytes, 147369 codepoints in 759 ranges (U+0000AA - U+03347A) using skiplist -// Case_Ignorable : 1063 bytes, 2789 codepoints in 459 ranges (U+0000A8 - U+0E01F0) using skiplist -// Grapheme_Extend : 899 bytes, 2232 codepoints in 383 ranges (U+000300 - U+0E01F0) using skiplist -// Lowercase : 943 bytes, 2569 codepoints in 676 ranges (U+0000AA - U+01E944) using bitset -// Lt : 33 bytes, 31 codepoints in 10 ranges (U+0001C5 - U+001FFD) using skiplist -// N : 463 bytes, 1914 codepoints in 145 ranges (U+0000B2 - U+01FBFA) using skiplist -// Uppercase : 799 bytes, 1980 codepoints in 659 ranges (U+0000C0 - U+01F18A) using bitset -// White_Space : 256 bytes, 19 codepoints in 8 ranges (U+000085 - U+003001) using cascading -// to_lower : 1112 bytes, 1462 codepoints in 185 ranges (U+0000C0 - U+01E921) using 2-level LUT -// to_upper : 1998 bytes, 1554 codepoints in 299 ranges (U+0000B5 - U+01E943) using 2-level LUT -// to_title : 340 bytes, 135 codepoints in 49 ranges (U+0000DF - U+00FB17) using 2-level LUT -// Total : 9629 bytes +// Alphabetic : 1723 bytes, 147369 codepoints in 759 ranges (U+0000AA - U+03347A) using skiplist +// Case_Ignorable : 1063 bytes, 2789 codepoints in 459 ranges (U+0000A8 - U+0E01F0) using skiplist +// Cf : 87 bytes, 170 codepoints in 21 ranges (U+0000AD - U+0E0080) using skiplist +// Cn_Planes_0_3 : 1677 bytes, 94165 codepoints in 730 ranges (U+000378 - U+03FFFE) using skiplist +// Grapheme_Extend : 899 bytes, 2232 codepoints in 383 ranges (U+000300 - U+0E01F0) using skiplist +// Lowercase : 943 bytes, 2569 codepoints in 676 ranges (U+0000AA - U+01E944) using bitset +// Lt : 33 bytes, 31 codepoints in 10 ranges (U+0001C5 - U+001FFD) using skiplist +// N : 463 bytes, 1914 codepoints in 145 ranges (U+0000B2 - U+01FBFA) using skiplist +// Uppercase : 799 bytes, 1980 codepoints in 659 ranges (U+0000C0 - U+01F18A) using bitset +// White_Space 
: 256 bytes, 19 codepoints in 8 ranges (U+000085 - U+003001) using cascading +// to_lower : 1112 bytes, 1462 codepoints in 185 ranges (U+0000C0 - U+01E921) using 2-level LUT +// to_upper : 1998 bytes, 1554 codepoints in 299 ranges (U+0000B5 - U+01E943) using 2-level LUT +// to_title : 340 bytes, 135 codepoints in 49 ranges (U+0000DF - U+00FB17) using 2-level LUT +// Total : 11393 bytes #[inline(always)] const fn bitset_search< @@ -337,6 +339,155 @@ pub mod case_ignorable { } } +#[rustfmt::skip] +pub mod cf { + use super::ShortOffsetRunHeader; + + static SHORT_OFFSET_RUNS: [ShortOffsetRunHeader; 11] = [ + ShortOffsetRunHeader::new(0, 1536), ShortOffsetRunHeader::new(3, 2192), + ShortOffsetRunHeader::new(11, 6158), ShortOffsetRunHeader::new(15, 8203), + ShortOffsetRunHeader::new(17, 65279), ShortOffsetRunHeader::new(25, 69821), + ShortOffsetRunHeader::new(29, 78896), ShortOffsetRunHeader::new(33, 113824), + ShortOffsetRunHeader::new(35, 119155), ShortOffsetRunHeader::new(37, 917505), + ShortOffsetRunHeader::new(39, 2031744), + ]; + static OFFSETS: [u8; 43] = [ + 173, 1, 0, 6, 22, 1, 192, 1, 49, 1, 0, 2, 80, 1, 0, 1, 0, 5, 26, 5, 49, 5, 1, 10, 0, 1, + 249, 3, 0, 1, 15, 1, 0, 16, 0, 4, 0, 8, 0, 1, 30, 96, 0, + ]; + #[inline] + pub fn lookup(c: char) -> bool { + debug_assert!(!c.is_ascii()); + (c as u32) >= 0xad && lookup_slow(c) + } + + #[inline(never)] + fn lookup_slow(c: char) -> bool { + const { + assert!(SHORT_OFFSET_RUNS.last().unwrap().0 > char::MAX as u32); + let mut i = 0; + while i < SHORT_OFFSET_RUNS.len() { + assert!(SHORT_OFFSET_RUNS[i].start_index() < OFFSETS.len()); + i += 1; + } + } + // SAFETY: We just ensured the last element of `SHORT_OFFSET_RUNS` is greater than `std::char::MAX` + // and the start indices of all elements in `SHORT_OFFSET_RUNS` are smaller than `OFFSETS.len()`. 
+ unsafe { super::skip_search(c, &SHORT_OFFSET_RUNS, &OFFSETS) } + } +} + +#[rustfmt::skip] +pub mod cn_planes_0_3 { + use super::ShortOffsetRunHeader; + + static SHORT_OFFSET_RUNS: [ShortOffsetRunHeader; 54] = [ + ShortOffsetRunHeader::new(0, 888), ShortOffsetRunHeader::new(1, 1328), + ShortOffsetRunHeader::new(11, 1806), ShortOffsetRunHeader::new(25, 4681), + ShortOffsetRunHeader::new(325, 5789), ShortOffsetRunHeader::new(365, 7958), + ShortOffsetRunHeader::new(445, 9258), ShortOffsetRunHeader::new(491, 11124), + ShortOffsetRunHeader::new(495, 11508), ShortOffsetRunHeader::new(497, 42125), + ShortOffsetRunHeader::new(549, 42540), ShortOffsetRunHeader::new(553, 55204), + ShortOffsetRunHeader::new(605, 64110), ShortOffsetRunHeader::new(611, 64976), + ShortOffsetRunHeader::new(629, 67383), ShortOffsetRunHeader::new(735, 74650), + ShortOffsetRunHeader::new(1067, 77712), ShortOffsetRunHeader::new(1074, 78934), + ShortOffsetRunHeader::new(1077, 82939), ShortOffsetRunHeader::new(1079, 83527), + ShortOffsetRunHeader::new(1081, 90368), ShortOffsetRunHeader::new(1082, 92160), + ShortOffsetRunHeader::new(1084, 92729), ShortOffsetRunHeader::new(1085, 93504), + ShortOffsetRunHeader::new(1108, 101590), ShortOffsetRunHeader::new(1127, 110576), + ShortOffsetRunHeader::new(1132, 110883), ShortOffsetRunHeader::new(1139, 111356), + ShortOffsetRunHeader::new(1149, 113664), ShortOffsetRunHeader::new(1150, 117760), + ShortOffsetRunHeader::new(1160, 118452), ShortOffsetRunHeader::new(1163, 120486), + ShortOffsetRunHeader::new(1227, 120780), ShortOffsetRunHeader::new(1229, 121484), + ShortOffsetRunHeader::new(1231, 122624), ShortOffsetRunHeader::new(1236, 123536), + ShortOffsetRunHeader::new(1262, 124112), ShortOffsetRunHeader::new(1268, 126065), + ShortOffsetRunHeader::new(1298, 126976), ShortOffsetRunHeader::new(1370, 128729), + ShortOffsetRunHeader::new(1395, 129624), ShortOffsetRunHeader::new(1423, 131072), + ShortOffsetRunHeader::new(1444, 173792), ShortOffsetRunHeader::new(1445, 
178206), + ShortOffsetRunHeader::new(1447, 183982), ShortOffsetRunHeader::new(1449, 191457), + ShortOffsetRunHeader::new(1451, 192094), ShortOffsetRunHeader::new(1453, 194560), + ShortOffsetRunHeader::new(1454, 195102), ShortOffsetRunHeader::new(1455, 196608), + ShortOffsetRunHeader::new(1456, 201547), ShortOffsetRunHeader::new(1457, 210042), + ShortOffsetRunHeader::new(1459, 262142), ShortOffsetRunHeader::new(1460, 1376254), + ]; + static OFFSETS: [u8; 1461] = [ + 0, 2, 6, 4, 7, 1, 1, 1, 20, 1, 0, 1, 38, 2, 50, 2, 3, 1, 55, 8, 27, 4, 6, 11, 0, 1, 60, 2, + 101, 14, 59, 2, 49, 2, 15, 1, 28, 2, 1, 1, 11, 5, 34, 5, 237, 1, 8, 2, 2, 2, 22, 1, 7, 1, 1, + 3, 4, 2, 9, 2, 2, 2, 4, 8, 1, 4, 2, 1, 5, 2, 25, 2, 3, 1, 6, 4, 2, 2, 22, 1, 7, 1, 2, 1, 2, + 1, 2, 2, 1, 1, 5, 4, 2, 2, 3, 3, 1, 7, 4, 1, 1, 7, 17, 10, 3, 1, 9, 1, 3, 1, 22, 1, 7, 1, 2, + 1, 5, 2, 10, 1, 3, 1, 3, 2, 1, 15, 4, 2, 12, 7, 7, 1, 3, 1, 8, 2, 2, 2, 22, 1, 7, 1, 2, 1, + 5, 2, 9, 2, 2, 2, 3, 7, 3, 4, 2, 1, 5, 2, 18, 10, 2, 1, 6, 3, 3, 1, 4, 3, 2, 1, 1, 1, 2, 3, + 2, 3, 3, 3, 12, 4, 5, 3, 3, 1, 4, 2, 1, 6, 1, 14, 21, 5, 13, 1, 3, 1, 23, 1, 16, 2, 9, 1, 3, + 1, 4, 7, 2, 1, 3, 1, 2, 2, 4, 2, 10, 7, 22, 1, 3, 1, 23, 1, 10, 1, 5, 2, 9, 1, 3, 1, 4, 7, + 2, 5, 3, 1, 4, 2, 10, 1, 3, 12, 13, 1, 3, 1, 51, 1, 3, 1, 6, 4, 16, 2, 26, 1, 3, 1, 18, 3, + 24, 1, 9, 1, 1, 2, 7, 3, 1, 4, 6, 1, 1, 1, 8, 6, 10, 2, 3, 12, 58, 4, 29, 37, 2, 1, 1, 1, 5, + 1, 24, 1, 1, 1, 23, 2, 5, 1, 1, 1, 7, 1, 10, 2, 4, 32, 72, 1, 36, 4, 39, 1, 36, 1, 15, 1, + 13, 37, 198, 1, 1, 5, 1, 2, 0, 1, 4, 2, 7, 1, 1, 1, 4, 2, 41, 1, 4, 2, 33, 1, 4, 2, 7, 1, 1, + 1, 4, 2, 15, 1, 57, 1, 4, 2, 67, 2, 32, 3, 26, 6, 86, 2, 6, 2, 0, 3, 89, 7, 22, 9, 24, 9, + 20, 12, 13, 1, 3, 1, 2, 12, 94, 2, 10, 6, 10, 6, 26, 6, 89, 7, 43, 5, 70, 10, 31, 1, 12, 4, + 12, 4, 1, 3, 42, 2, 5, 11, 44, 4, 26, 6, 11, 3, 62, 2, 65, 1, 29, 2, 11, 6, 10, 6, 14, 2, + 46, 2, 12, 20, 77, 1, 166, 8, 60, 3, 15, 3, 62, 5, 43, 2, 11, 8, 43, 5, 0, 2, 6, 2, 38, 2, + 6, 2, 8, 1, 1, 1, 1, 1, 1, 1, 
31, 2, 53, 1, 15, 1, 14, 2, 6, 1, 19, 2, 3, 1, 9, 1, 101, 1, + 12, 2, 27, 1, 13, 3, 34, 14, 33, 15, 140, 4, 0, 22, 11, 21, 0, 2, 0, 5, 45, 1, 1, 5, 1, 2, + 56, 7, 2, 14, 24, 9, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 126, 34, 26, 1, 89, 12, + 214, 26, 80, 1, 86, 2, 103, 5, 43, 1, 94, 1, 86, 9, 48, 1, 0, 3, 55, 9, 0, 20, 184, 8, 221, + 20, 60, 3, 10, 6, 56, 8, 70, 8, 12, 6, 116, 11, 30, 3, 78, 1, 11, 4, 33, 1, 55, 9, 14, 2, + 10, 2, 103, 24, 28, 10, 6, 2, 6, 2, 6, 9, 7, 1, 7, 1, 60, 4, 126, 2, 10, 6, 0, 12, 23, 4, + 49, 4, 0, 2, 106, 38, 7, 12, 5, 5, 26, 1, 5, 1, 1, 1, 2, 1, 2, 1, 0, 32, 42, 6, 51, 1, 19, + 1, 4, 4, 5, 1, 135, 2, 1, 1, 190, 3, 6, 2, 6, 2, 6, 2, 3, 3, 7, 1, 7, 10, 5, 2, 12, 1, 26, + 1, 19, 1, 2, 1, 15, 2, 14, 34, 123, 5, 3, 4, 45, 3, 88, 1, 13, 3, 1, 47, 46, 130, 29, 3, 49, + 15, 28, 4, 36, 9, 30, 5, 43, 5, 30, 1, 37, 4, 14, 42, 158, 2, 10, 6, 36, 4, 36, 4, 40, 8, + 52, 11, 12, 1, 15, 1, 7, 1, 2, 1, 11, 1, 15, 1, 7, 1, 2, 3, 52, 12, 0, 9, 22, 10, 8, 24, 6, + 1, 42, 1, 9, 69, 6, 2, 1, 1, 44, 1, 2, 3, 1, 2, 23, 1, 72, 8, 9, 48, 19, 1, 2, 5, 33, 3, 27, + 5, 27, 38, 56, 4, 20, 2, 50, 1, 2, 5, 8, 1, 3, 1, 29, 2, 3, 4, 10, 7, 9, 7, 64, 32, 39, 4, + 12, 9, 54, 3, 29, 2, 27, 5, 26, 7, 4, 12, 7, 80, 73, 55, 51, 13, 51, 7, 46, 8, 10, 6, 38, 3, + 29, 8, 2, 208, 31, 1, 42, 1, 3, 2, 2, 16, 6, 8, 9, 33, 46, 8, 42, 22, 26, 38, 28, 20, 23, 9, + 78, 4, 36, 9, 68, 10, 1, 2, 25, 7, 10, 6, 53, 1, 18, 8, 39, 9, 96, 1, 20, 11, 18, 1, 47, 62, + 7, 1, 1, 1, 4, 1, 15, 1, 11, 6, 59, 5, 10, 6, 4, 1, 8, 2, 2, 2, 22, 1, 7, 1, 2, 1, 5, 1, 10, + 2, 2, 2, 3, 2, 1, 6, 1, 5, 7, 2, 7, 3, 5, 11, 10, 1, 1, 2, 1, 1, 38, 1, 10, 1, 1, 2, 1, 1, + 4, 1, 10, 1, 2, 8, 2, 29, 92, 1, 5, 30, 72, 8, 10, 166, 54, 2, 38, 34, 69, 11, 10, 6, 13, + 19, 58, 6, 10, 6, 20, 28, 27, 2, 15, 4, 23, 185, 60, 100, 83, 12, 8, 2, 1, 2, 8, 1, 2, 1, + 30, 1, 2, 2, 12, 9, 10, 70, 8, 2, 46, 2, 11, 27, 72, 8, 83, 13, 73, 7, 10, 86, 8, 88, 34, + 14, 10, 6, 9, 1, 45, 1, 14, 10, 29, 3, 32, 2, 22, 1, 14, 73, 7, 1, 
2, 1, 44, 3, 1, 1, 2, 1, + 9, 8, 10, 6, 6, 1, 2, 1, 37, 1, 2, 1, 6, 7, 10, 6, 44, 4, 10, 246, 25, 7, 17, 1, 41, 3, 29, + 85, 1, 15, 50, 13, 0, 102, 111, 1, 5, 11, 196, 0, 99, 13, 0, 10, 0, 5, 0, 0, 58, 0, 0, 7, + 31, 1, 10, 4, 81, 1, 10, 6, 30, 2, 6, 10, 70, 10, 10, 1, 7, 1, 21, 5, 19, 0, 58, 198, 91, 5, + 25, 2, 25, 44, 75, 4, 57, 7, 17, 64, 5, 11, 7, 9, 0, 41, 32, 97, 115, 0, 4, 1, 7, 1, 2, 1, + 0, 15, 1, 29, 3, 2, 1, 14, 4, 8, 0, 0, 107, 5, 13, 3, 9, 7, 10, 2, 8, 0, 253, 3, 0, 6, 23, + 15, 17, 15, 46, 2, 23, 9, 116, 60, 246, 10, 39, 2, 194, 21, 70, 122, 20, 12, 20, 12, 87, 9, + 25, 135, 85, 1, 71, 1, 2, 2, 1, 2, 2, 2, 4, 1, 12, 1, 1, 1, 7, 1, 65, 1, 4, 2, 8, 1, 7, 1, + 28, 1, 4, 1, 5, 1, 1, 3, 7, 1, 0, 2, 0, 2, 0, 15, 5, 1, 15, 0, 31, 6, 6, 213, 7, 1, 17, 2, + 7, 1, 2, 1, 5, 5, 62, 33, 1, 112, 45, 3, 14, 2, 10, 4, 2, 0, 31, 17, 58, 5, 1, 0, 42, 214, + 43, 4, 1, 192, 31, 1, 22, 8, 2, 224, 7, 1, 4, 1, 2, 1, 15, 1, 197, 2, 16, 41, 76, 4, 10, 4, + 2, 0, 68, 76, 61, 194, 4, 1, 27, 1, 2, 1, 1, 2, 1, 1, 10, 1, 4, 1, 1, 1, 1, 6, 1, 4, 1, 1, + 1, 1, 1, 1, 3, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 4, 1, 7, 1, 4, 1, 4, + 1, 1, 1, 10, 1, 17, 5, 3, 1, 5, 1, 17, 52, 2, 0, 44, 4, 100, 12, 15, 2, 15, 1, 15, 1, 37, + 10, 174, 56, 29, 13, 44, 4, 9, 7, 2, 14, 6, 154, 0, 3, 17, 3, 13, 3, 218, 6, 12, 4, 1, 15, + 12, 4, 56, 8, 10, 6, 40, 8, 30, 2, 12, 4, 2, 14, 9, 39, 0, 8, 14, 2, 13, 3, 11, 3, 57, 1, 1, + 4, 16, 2, 12, 4, 10, 7, 147, 1, 103, 0, 0, 32, 0, 2, 0, 2, 0, 15, 0, 0, 0, 0, 0, 5, 0, 0, 0, + ]; + #[inline] + pub fn lookup(c: char) -> bool { + debug_assert!(!c.is_ascii()); + (c as u32) >= 0x378 && lookup_slow(c) + } + + #[inline(never)] + fn lookup_slow(c: char) -> bool { + const { + assert!(SHORT_OFFSET_RUNS.last().unwrap().0 > char::MAX as u32); + let mut i = 0; + while i < SHORT_OFFSET_RUNS.len() { + assert!(SHORT_OFFSET_RUNS[i].start_index() < OFFSETS.len()); + i += 1; + } + } + // SAFETY: We just ensured the last element of `SHORT_OFFSET_RUNS` is greater 
than `std::char::MAX` + // and the start indices of all elements in `SHORT_OFFSET_RUNS` are smaller than `OFFSETS.len()`. + unsafe { super::skip_search(c, &SHORT_OFFSET_RUNS, &OFFSETS) } + } +} + #[rustfmt::skip] pub mod grapheme_extend { use super::ShortOffsetRunHeader; diff --git a/library/coretests/tests/unicode.rs b/library/coretests/tests/unicode.rs index 12eed25a1feae..793255563aaae 100644 --- a/library/coretests/tests/unicode.rs +++ b/library/coretests/tests/unicode.rs @@ -1,3 +1,4 @@ +use core::iter::Step; use core::unicode::unicode_data; use std::ops::RangeInclusive; @@ -19,7 +20,7 @@ fn test_boolean_property(ranges: &[RangeInclusive], lookup: fn(char) -> bo for c in range.clone() { assert!(lookup(c), "{c:?}"); } - start = char::from_u32(*range.end() as u32 + 1).unwrap(); + start = Step::forward(*range.end(), 1); } for c in start..=char::MAX { assert!(!lookup(c), "{c:?}"); @@ -60,9 +61,14 @@ fn case_ignorable() { #[test] #[cfg_attr(miri, ignore)] // Miri is too slow -fn lt() { - test_boolean_property(test_data::LT, unicode_data::lt::lookup); - test_boolean_property(test_data::LT, char::is_titlecase); +fn cf() { + test_boolean_property(test_data::CF, unicode_data::cf::lookup); +} + +#[test] +#[cfg_attr(miri, ignore)] // Miri is too slow +fn cn_planes_0_3() { + test_boolean_property(test_data::CN_PLANES_0_3, unicode_data::cn_planes_0_3::lookup); } #[test] @@ -78,6 +84,13 @@ fn lowercase() { test_boolean_property(test_data::LOWERCASE, char::is_lowercase); } +#[test] +#[cfg_attr(miri, ignore)] // Miri is too slow +fn lt() { + test_boolean_property(test_data::LT, unicode_data::lt::lookup); + test_boolean_property(test_data::LT, char::is_titlecase); +} + #[test] #[cfg_attr(miri, ignore)] // Miri is too slow fn n() { diff --git a/library/coretests/tests/unicode/test_data.rs b/library/coretests/tests/unicode/test_data.rs index 962770a0ff830..cb72682b2e00e 100644 --- a/library/coretests/tests/unicode/test_data.rs +++ b/library/coretests/tests/unicode/test_data.rs 
@@ -392,6 +392,252 @@ pub(super) static CASE_IGNORABLE: &[RangeInclusive; 459] = &[ '\u{e0100}'..='\u{e01ef}', ]; +#[rustfmt::skip] +pub(super) static CF: &[RangeInclusive; 21] = &[ + '\u{ad}'..='\u{ad}', '\u{600}'..='\u{605}', '\u{61c}'..='\u{61c}', '\u{6dd}'..='\u{6dd}', + '\u{70f}'..='\u{70f}', '\u{890}'..='\u{891}', '\u{8e2}'..='\u{8e2}', + '\u{180e}'..='\u{180e}', '\u{200b}'..='\u{200f}', '\u{202a}'..='\u{202e}', + '\u{2060}'..='\u{2064}', '\u{2066}'..='\u{206f}', '\u{feff}'..='\u{feff}', + '\u{fff9}'..='\u{fffb}', '\u{110bd}'..='\u{110bd}', '\u{110cd}'..='\u{110cd}', + '\u{13430}'..='\u{1343f}', '\u{1bca0}'..='\u{1bca3}', '\u{1d173}'..='\u{1d17a}', + '\u{e0001}'..='\u{e0001}', '\u{e0020}'..='\u{e007f}', +]; + +#[rustfmt::skip] +pub(super) static CN_PLANES_0_3: &[RangeInclusive; 730] = &[ + '\u{378}'..='\u{379}', '\u{380}'..='\u{383}', '\u{38b}'..='\u{38b}', '\u{38d}'..='\u{38d}', + '\u{3a2}'..='\u{3a2}', '\u{530}'..='\u{530}', '\u{557}'..='\u{558}', '\u{58b}'..='\u{58c}', + '\u{590}'..='\u{590}', '\u{5c8}'..='\u{5cf}', '\u{5eb}'..='\u{5ee}', '\u{5f5}'..='\u{5ff}', + '\u{70e}'..='\u{70e}', '\u{74b}'..='\u{74c}', '\u{7b2}'..='\u{7bf}', '\u{7fb}'..='\u{7fc}', + '\u{82e}'..='\u{82f}', '\u{83f}'..='\u{83f}', '\u{85c}'..='\u{85d}', '\u{85f}'..='\u{85f}', + '\u{86b}'..='\u{86f}', '\u{892}'..='\u{896}', '\u{984}'..='\u{984}', '\u{98d}'..='\u{98e}', + '\u{991}'..='\u{992}', '\u{9a9}'..='\u{9a9}', '\u{9b1}'..='\u{9b1}', '\u{9b3}'..='\u{9b5}', + '\u{9ba}'..='\u{9bb}', '\u{9c5}'..='\u{9c6}', '\u{9c9}'..='\u{9ca}', '\u{9cf}'..='\u{9d6}', + '\u{9d8}'..='\u{9db}', '\u{9de}'..='\u{9de}', '\u{9e4}'..='\u{9e5}', '\u{9ff}'..='\u{a00}', + '\u{a04}'..='\u{a04}', '\u{a0b}'..='\u{a0e}', '\u{a11}'..='\u{a12}', '\u{a29}'..='\u{a29}', + '\u{a31}'..='\u{a31}', '\u{a34}'..='\u{a34}', '\u{a37}'..='\u{a37}', '\u{a3a}'..='\u{a3b}', + '\u{a3d}'..='\u{a3d}', '\u{a43}'..='\u{a46}', '\u{a49}'..='\u{a4a}', '\u{a4e}'..='\u{a50}', + '\u{a52}'..='\u{a58}', '\u{a5d}'..='\u{a5d}', 
'\u{a5f}'..='\u{a65}', '\u{a77}'..='\u{a80}', + '\u{a84}'..='\u{a84}', '\u{a8e}'..='\u{a8e}', '\u{a92}'..='\u{a92}', '\u{aa9}'..='\u{aa9}', + '\u{ab1}'..='\u{ab1}', '\u{ab4}'..='\u{ab4}', '\u{aba}'..='\u{abb}', '\u{ac6}'..='\u{ac6}', + '\u{aca}'..='\u{aca}', '\u{ace}'..='\u{acf}', '\u{ad1}'..='\u{adf}', '\u{ae4}'..='\u{ae5}', + '\u{af2}'..='\u{af8}', '\u{b00}'..='\u{b00}', '\u{b04}'..='\u{b04}', '\u{b0d}'..='\u{b0e}', + '\u{b11}'..='\u{b12}', '\u{b29}'..='\u{b29}', '\u{b31}'..='\u{b31}', '\u{b34}'..='\u{b34}', + '\u{b3a}'..='\u{b3b}', '\u{b45}'..='\u{b46}', '\u{b49}'..='\u{b4a}', '\u{b4e}'..='\u{b54}', + '\u{b58}'..='\u{b5b}', '\u{b5e}'..='\u{b5e}', '\u{b64}'..='\u{b65}', '\u{b78}'..='\u{b81}', + '\u{b84}'..='\u{b84}', '\u{b8b}'..='\u{b8d}', '\u{b91}'..='\u{b91}', '\u{b96}'..='\u{b98}', + '\u{b9b}'..='\u{b9b}', '\u{b9d}'..='\u{b9d}', '\u{ba0}'..='\u{ba2}', '\u{ba5}'..='\u{ba7}', + '\u{bab}'..='\u{bad}', '\u{bba}'..='\u{bbd}', '\u{bc3}'..='\u{bc5}', '\u{bc9}'..='\u{bc9}', + '\u{bce}'..='\u{bcf}', '\u{bd1}'..='\u{bd6}', '\u{bd8}'..='\u{be5}', '\u{bfb}'..='\u{bff}', + '\u{c0d}'..='\u{c0d}', '\u{c11}'..='\u{c11}', '\u{c29}'..='\u{c29}', '\u{c3a}'..='\u{c3b}', + '\u{c45}'..='\u{c45}', '\u{c49}'..='\u{c49}', '\u{c4e}'..='\u{c54}', '\u{c57}'..='\u{c57}', + '\u{c5b}'..='\u{c5b}', '\u{c5e}'..='\u{c5f}', '\u{c64}'..='\u{c65}', '\u{c70}'..='\u{c76}', + '\u{c8d}'..='\u{c8d}', '\u{c91}'..='\u{c91}', '\u{ca9}'..='\u{ca9}', '\u{cb4}'..='\u{cb4}', + '\u{cba}'..='\u{cbb}', '\u{cc5}'..='\u{cc5}', '\u{cc9}'..='\u{cc9}', '\u{cce}'..='\u{cd4}', + '\u{cd7}'..='\u{cdb}', '\u{cdf}'..='\u{cdf}', '\u{ce4}'..='\u{ce5}', '\u{cf0}'..='\u{cf0}', + '\u{cf4}'..='\u{cff}', '\u{d0d}'..='\u{d0d}', '\u{d11}'..='\u{d11}', '\u{d45}'..='\u{d45}', + '\u{d49}'..='\u{d49}', '\u{d50}'..='\u{d53}', '\u{d64}'..='\u{d65}', '\u{d80}'..='\u{d80}', + '\u{d84}'..='\u{d84}', '\u{d97}'..='\u{d99}', '\u{db2}'..='\u{db2}', '\u{dbc}'..='\u{dbc}', + '\u{dbe}'..='\u{dbf}', '\u{dc7}'..='\u{dc9}', '\u{dcb}'..='\u{dce}', 
'\u{dd5}'..='\u{dd5}', + '\u{dd7}'..='\u{dd7}', '\u{de0}'..='\u{de5}', '\u{df0}'..='\u{df1}', '\u{df5}'..='\u{e00}', + '\u{e3b}'..='\u{e3e}', '\u{e5c}'..='\u{e80}', '\u{e83}'..='\u{e83}', '\u{e85}'..='\u{e85}', + '\u{e8b}'..='\u{e8b}', '\u{ea4}'..='\u{ea4}', '\u{ea6}'..='\u{ea6}', '\u{ebe}'..='\u{ebf}', + '\u{ec5}'..='\u{ec5}', '\u{ec7}'..='\u{ec7}', '\u{ecf}'..='\u{ecf}', '\u{eda}'..='\u{edb}', + '\u{ee0}'..='\u{eff}', '\u{f48}'..='\u{f48}', '\u{f6d}'..='\u{f70}', '\u{f98}'..='\u{f98}', + '\u{fbd}'..='\u{fbd}', '\u{fcd}'..='\u{fcd}', '\u{fdb}'..='\u{fff}', + '\u{10c6}'..='\u{10c6}', '\u{10c8}'..='\u{10cc}', '\u{10ce}'..='\u{10cf}', + '\u{1249}'..='\u{1249}', '\u{124e}'..='\u{124f}', '\u{1257}'..='\u{1257}', + '\u{1259}'..='\u{1259}', '\u{125e}'..='\u{125f}', '\u{1289}'..='\u{1289}', + '\u{128e}'..='\u{128f}', '\u{12b1}'..='\u{12b1}', '\u{12b6}'..='\u{12b7}', + '\u{12bf}'..='\u{12bf}', '\u{12c1}'..='\u{12c1}', '\u{12c6}'..='\u{12c7}', + '\u{12d7}'..='\u{12d7}', '\u{1311}'..='\u{1311}', '\u{1316}'..='\u{1317}', + '\u{135b}'..='\u{135c}', '\u{137d}'..='\u{137f}', '\u{139a}'..='\u{139f}', + '\u{13f6}'..='\u{13f7}', '\u{13fe}'..='\u{13ff}', '\u{169d}'..='\u{169f}', + '\u{16f9}'..='\u{16ff}', '\u{1716}'..='\u{171e}', '\u{1737}'..='\u{173f}', + '\u{1754}'..='\u{175f}', '\u{176d}'..='\u{176d}', '\u{1771}'..='\u{1771}', + '\u{1774}'..='\u{177f}', '\u{17de}'..='\u{17df}', '\u{17ea}'..='\u{17ef}', + '\u{17fa}'..='\u{17ff}', '\u{181a}'..='\u{181f}', '\u{1879}'..='\u{187f}', + '\u{18ab}'..='\u{18af}', '\u{18f6}'..='\u{18ff}', '\u{191f}'..='\u{191f}', + '\u{192c}'..='\u{192f}', '\u{193c}'..='\u{193f}', '\u{1941}'..='\u{1943}', + '\u{196e}'..='\u{196f}', '\u{1975}'..='\u{197f}', '\u{19ac}'..='\u{19af}', + '\u{19ca}'..='\u{19cf}', '\u{19db}'..='\u{19dd}', '\u{1a1c}'..='\u{1a1d}', + '\u{1a5f}'..='\u{1a5f}', '\u{1a7d}'..='\u{1a7e}', '\u{1a8a}'..='\u{1a8f}', + '\u{1a9a}'..='\u{1a9f}', '\u{1aae}'..='\u{1aaf}', '\u{1ade}'..='\u{1adf}', + '\u{1aec}'..='\u{1aff}', 
'\u{1b4d}'..='\u{1b4d}', '\u{1bf4}'..='\u{1bfb}', + '\u{1c38}'..='\u{1c3a}', '\u{1c4a}'..='\u{1c4c}', '\u{1c8b}'..='\u{1c8f}', + '\u{1cbb}'..='\u{1cbc}', '\u{1cc8}'..='\u{1ccf}', '\u{1cfb}'..='\u{1cff}', + '\u{1f16}'..='\u{1f17}', '\u{1f1e}'..='\u{1f1f}', '\u{1f46}'..='\u{1f47}', + '\u{1f4e}'..='\u{1f4f}', '\u{1f58}'..='\u{1f58}', '\u{1f5a}'..='\u{1f5a}', + '\u{1f5c}'..='\u{1f5c}', '\u{1f5e}'..='\u{1f5e}', '\u{1f7e}'..='\u{1f7f}', + '\u{1fb5}'..='\u{1fb5}', '\u{1fc5}'..='\u{1fc5}', '\u{1fd4}'..='\u{1fd5}', + '\u{1fdc}'..='\u{1fdc}', '\u{1ff0}'..='\u{1ff1}', '\u{1ff5}'..='\u{1ff5}', + '\u{1fff}'..='\u{1fff}', '\u{2065}'..='\u{2065}', '\u{2072}'..='\u{2073}', + '\u{208f}'..='\u{208f}', '\u{209d}'..='\u{209f}', '\u{20c2}'..='\u{20cf}', + '\u{20f1}'..='\u{20ff}', '\u{218c}'..='\u{218f}', '\u{242a}'..='\u{243f}', + '\u{244b}'..='\u{245f}', '\u{2b74}'..='\u{2b75}', '\u{2cf4}'..='\u{2cf8}', + '\u{2d26}'..='\u{2d26}', '\u{2d28}'..='\u{2d2c}', '\u{2d2e}'..='\u{2d2f}', + '\u{2d68}'..='\u{2d6e}', '\u{2d71}'..='\u{2d7e}', '\u{2d97}'..='\u{2d9f}', + '\u{2da7}'..='\u{2da7}', '\u{2daf}'..='\u{2daf}', '\u{2db7}'..='\u{2db7}', + '\u{2dbf}'..='\u{2dbf}', '\u{2dc7}'..='\u{2dc7}', '\u{2dcf}'..='\u{2dcf}', + '\u{2dd7}'..='\u{2dd7}', '\u{2ddf}'..='\u{2ddf}', '\u{2e5e}'..='\u{2e7f}', + '\u{2e9a}'..='\u{2e9a}', '\u{2ef4}'..='\u{2eff}', '\u{2fd6}'..='\u{2fef}', + '\u{3040}'..='\u{3040}', '\u{3097}'..='\u{3098}', '\u{3100}'..='\u{3104}', + '\u{3130}'..='\u{3130}', '\u{318f}'..='\u{318f}', '\u{31e6}'..='\u{31ee}', + '\u{321f}'..='\u{321f}', '\u{a48d}'..='\u{a48f}', '\u{a4c7}'..='\u{a4cf}', + '\u{a62c}'..='\u{a63f}', '\u{a6f8}'..='\u{a6ff}', '\u{a7dd}'..='\u{a7f0}', + '\u{a82d}'..='\u{a82f}', '\u{a83a}'..='\u{a83f}', '\u{a878}'..='\u{a87f}', + '\u{a8c6}'..='\u{a8cd}', '\u{a8da}'..='\u{a8df}', '\u{a954}'..='\u{a95e}', + '\u{a97d}'..='\u{a97f}', '\u{a9ce}'..='\u{a9ce}', '\u{a9da}'..='\u{a9dd}', + '\u{a9ff}'..='\u{a9ff}', '\u{aa37}'..='\u{aa3f}', '\u{aa4e}'..='\u{aa4f}', + 
'\u{aa5a}'..='\u{aa5b}', '\u{aac3}'..='\u{aada}', '\u{aaf7}'..='\u{ab00}', + '\u{ab07}'..='\u{ab08}', '\u{ab0f}'..='\u{ab10}', '\u{ab17}'..='\u{ab1f}', + '\u{ab27}'..='\u{ab27}', '\u{ab2f}'..='\u{ab2f}', '\u{ab6c}'..='\u{ab6f}', + '\u{abee}'..='\u{abef}', '\u{abfa}'..='\u{abff}', '\u{d7a4}'..='\u{d7af}', + '\u{d7c7}'..='\u{d7ca}', '\u{d7fc}'..='\u{d7ff}', '\u{fa6e}'..='\u{fa6f}', + '\u{fada}'..='\u{faff}', '\u{fb07}'..='\u{fb12}', '\u{fb18}'..='\u{fb1c}', + '\u{fb37}'..='\u{fb37}', '\u{fb3d}'..='\u{fb3d}', '\u{fb3f}'..='\u{fb3f}', + '\u{fb42}'..='\u{fb42}', '\u{fb45}'..='\u{fb45}', '\u{fdd0}'..='\u{fdef}', + '\u{fe1a}'..='\u{fe1f}', '\u{fe53}'..='\u{fe53}', '\u{fe67}'..='\u{fe67}', + '\u{fe6c}'..='\u{fe6f}', '\u{fe75}'..='\u{fe75}', '\u{fefd}'..='\u{fefe}', + '\u{ff00}'..='\u{ff00}', '\u{ffbf}'..='\u{ffc1}', '\u{ffc8}'..='\u{ffc9}', + '\u{ffd0}'..='\u{ffd1}', '\u{ffd8}'..='\u{ffd9}', '\u{ffdd}'..='\u{ffdf}', + '\u{ffe7}'..='\u{ffe7}', '\u{ffef}'..='\u{fff8}', '\u{fffe}'..='\u{ffff}', + '\u{1000c}'..='\u{1000c}', '\u{10027}'..='\u{10027}', '\u{1003b}'..='\u{1003b}', + '\u{1003e}'..='\u{1003e}', '\u{1004e}'..='\u{1004f}', '\u{1005e}'..='\u{1007f}', + '\u{100fb}'..='\u{100ff}', '\u{10103}'..='\u{10106}', '\u{10134}'..='\u{10136}', + '\u{1018f}'..='\u{1018f}', '\u{1019d}'..='\u{1019f}', '\u{101a1}'..='\u{101cf}', + '\u{101fe}'..='\u{1027f}', '\u{1029d}'..='\u{1029f}', '\u{102d1}'..='\u{102df}', + '\u{102fc}'..='\u{102ff}', '\u{10324}'..='\u{1032c}', '\u{1034b}'..='\u{1034f}', + '\u{1037b}'..='\u{1037f}', '\u{1039e}'..='\u{1039e}', '\u{103c4}'..='\u{103c7}', + '\u{103d6}'..='\u{103ff}', '\u{1049e}'..='\u{1049f}', '\u{104aa}'..='\u{104af}', + '\u{104d4}'..='\u{104d7}', '\u{104fc}'..='\u{104ff}', '\u{10528}'..='\u{1052f}', + '\u{10564}'..='\u{1056e}', '\u{1057b}'..='\u{1057b}', '\u{1058b}'..='\u{1058b}', + '\u{10593}'..='\u{10593}', '\u{10596}'..='\u{10596}', '\u{105a2}'..='\u{105a2}', + '\u{105b2}'..='\u{105b2}', '\u{105ba}'..='\u{105ba}', '\u{105bd}'..='\u{105bf}', + 
'\u{105f4}'..='\u{105ff}', '\u{10737}'..='\u{1073f}', '\u{10756}'..='\u{1075f}', + '\u{10768}'..='\u{1077f}', '\u{10786}'..='\u{10786}', '\u{107b1}'..='\u{107b1}', + '\u{107bb}'..='\u{107ff}', '\u{10806}'..='\u{10807}', '\u{10809}'..='\u{10809}', + '\u{10836}'..='\u{10836}', '\u{10839}'..='\u{1083b}', '\u{1083d}'..='\u{1083e}', + '\u{10856}'..='\u{10856}', '\u{1089f}'..='\u{108a6}', '\u{108b0}'..='\u{108df}', + '\u{108f3}'..='\u{108f3}', '\u{108f6}'..='\u{108fa}', '\u{1091c}'..='\u{1091e}', + '\u{1093a}'..='\u{1093e}', '\u{1095a}'..='\u{1097f}', '\u{109b8}'..='\u{109bb}', + '\u{109d0}'..='\u{109d1}', '\u{10a04}'..='\u{10a04}', '\u{10a07}'..='\u{10a0b}', + '\u{10a14}'..='\u{10a14}', '\u{10a18}'..='\u{10a18}', '\u{10a36}'..='\u{10a37}', + '\u{10a3b}'..='\u{10a3e}', '\u{10a49}'..='\u{10a4f}', '\u{10a59}'..='\u{10a5f}', + '\u{10aa0}'..='\u{10abf}', '\u{10ae7}'..='\u{10aea}', '\u{10af7}'..='\u{10aff}', + '\u{10b36}'..='\u{10b38}', '\u{10b56}'..='\u{10b57}', '\u{10b73}'..='\u{10b77}', + '\u{10b92}'..='\u{10b98}', '\u{10b9d}'..='\u{10ba8}', '\u{10bb0}'..='\u{10bff}', + '\u{10c49}'..='\u{10c7f}', '\u{10cb3}'..='\u{10cbf}', '\u{10cf3}'..='\u{10cf9}', + '\u{10d28}'..='\u{10d2f}', '\u{10d3a}'..='\u{10d3f}', '\u{10d66}'..='\u{10d68}', + '\u{10d86}'..='\u{10d8d}', '\u{10d90}'..='\u{10e5f}', '\u{10e7f}'..='\u{10e7f}', + '\u{10eaa}'..='\u{10eaa}', '\u{10eae}'..='\u{10eaf}', '\u{10eb2}'..='\u{10ec1}', + '\u{10ec8}'..='\u{10ecf}', '\u{10ed9}'..='\u{10ef9}', '\u{10f28}'..='\u{10f2f}', + '\u{10f5a}'..='\u{10f6f}', '\u{10f8a}'..='\u{10faf}', '\u{10fcc}'..='\u{10fdf}', + '\u{10ff7}'..='\u{10fff}', '\u{1104e}'..='\u{11051}', '\u{11076}'..='\u{1107e}', + '\u{110c3}'..='\u{110cc}', '\u{110ce}'..='\u{110cf}', '\u{110e9}'..='\u{110ef}', + '\u{110fa}'..='\u{110ff}', '\u{11135}'..='\u{11135}', '\u{11148}'..='\u{1114f}', + '\u{11177}'..='\u{1117f}', '\u{111e0}'..='\u{111e0}', '\u{111f5}'..='\u{111ff}', + '\u{11212}'..='\u{11212}', '\u{11242}'..='\u{1127f}', '\u{11287}'..='\u{11287}', + 
'\u{11289}'..='\u{11289}', '\u{1128e}'..='\u{1128e}', '\u{1129e}'..='\u{1129e}', + '\u{112aa}'..='\u{112af}', '\u{112eb}'..='\u{112ef}', '\u{112fa}'..='\u{112ff}', + '\u{11304}'..='\u{11304}', '\u{1130d}'..='\u{1130e}', '\u{11311}'..='\u{11312}', + '\u{11329}'..='\u{11329}', '\u{11331}'..='\u{11331}', '\u{11334}'..='\u{11334}', + '\u{1133a}'..='\u{1133a}', '\u{11345}'..='\u{11346}', '\u{11349}'..='\u{1134a}', + '\u{1134e}'..='\u{1134f}', '\u{11351}'..='\u{11356}', '\u{11358}'..='\u{1135c}', + '\u{11364}'..='\u{11365}', '\u{1136d}'..='\u{1136f}', '\u{11375}'..='\u{1137f}', + '\u{1138a}'..='\u{1138a}', '\u{1138c}'..='\u{1138d}', '\u{1138f}'..='\u{1138f}', + '\u{113b6}'..='\u{113b6}', '\u{113c1}'..='\u{113c1}', '\u{113c3}'..='\u{113c4}', + '\u{113c6}'..='\u{113c6}', '\u{113cb}'..='\u{113cb}', '\u{113d6}'..='\u{113d6}', + '\u{113d9}'..='\u{113e0}', '\u{113e3}'..='\u{113ff}', '\u{1145c}'..='\u{1145c}', + '\u{11462}'..='\u{1147f}', '\u{114c8}'..='\u{114cf}', '\u{114da}'..='\u{1157f}', + '\u{115b6}'..='\u{115b7}', '\u{115de}'..='\u{115ff}', '\u{11645}'..='\u{1164f}', + '\u{1165a}'..='\u{1165f}', '\u{1166d}'..='\u{1167f}', '\u{116ba}'..='\u{116bf}', + '\u{116ca}'..='\u{116cf}', '\u{116e4}'..='\u{116ff}', '\u{1171b}'..='\u{1171c}', + '\u{1172c}'..='\u{1172f}', '\u{11747}'..='\u{117ff}', '\u{1183c}'..='\u{1189f}', + '\u{118f3}'..='\u{118fe}', '\u{11907}'..='\u{11908}', '\u{1190a}'..='\u{1190b}', + '\u{11914}'..='\u{11914}', '\u{11917}'..='\u{11917}', '\u{11936}'..='\u{11936}', + '\u{11939}'..='\u{1193a}', '\u{11947}'..='\u{1194f}', '\u{1195a}'..='\u{1199f}', + '\u{119a8}'..='\u{119a9}', '\u{119d8}'..='\u{119d9}', '\u{119e5}'..='\u{119ff}', + '\u{11a48}'..='\u{11a4f}', '\u{11aa3}'..='\u{11aaf}', '\u{11af9}'..='\u{11aff}', + '\u{11b0a}'..='\u{11b5f}', '\u{11b68}'..='\u{11bbf}', '\u{11be2}'..='\u{11bef}', + '\u{11bfa}'..='\u{11bff}', '\u{11c09}'..='\u{11c09}', '\u{11c37}'..='\u{11c37}', + '\u{11c46}'..='\u{11c4f}', '\u{11c6d}'..='\u{11c6f}', '\u{11c90}'..='\u{11c91}', + 
'\u{11ca8}'..='\u{11ca8}', '\u{11cb7}'..='\u{11cff}', '\u{11d07}'..='\u{11d07}', + '\u{11d0a}'..='\u{11d0a}', '\u{11d37}'..='\u{11d39}', '\u{11d3b}'..='\u{11d3b}', + '\u{11d3e}'..='\u{11d3e}', '\u{11d48}'..='\u{11d4f}', '\u{11d5a}'..='\u{11d5f}', + '\u{11d66}'..='\u{11d66}', '\u{11d69}'..='\u{11d69}', '\u{11d8f}'..='\u{11d8f}', + '\u{11d92}'..='\u{11d92}', '\u{11d99}'..='\u{11d9f}', '\u{11daa}'..='\u{11daf}', + '\u{11ddc}'..='\u{11ddf}', '\u{11dea}'..='\u{11edf}', '\u{11ef9}'..='\u{11eff}', + '\u{11f11}'..='\u{11f11}', '\u{11f3b}'..='\u{11f3d}', '\u{11f5b}'..='\u{11faf}', + '\u{11fb1}'..='\u{11fbf}', '\u{11ff2}'..='\u{11ffe}', '\u{1239a}'..='\u{123ff}', + '\u{1246f}'..='\u{1246f}', '\u{12475}'..='\u{1247f}', '\u{12544}'..='\u{12f8f}', + '\u{12ff3}'..='\u{12fff}', '\u{13456}'..='\u{1345f}', '\u{143fb}'..='\u{143ff}', + '\u{14647}'..='\u{160ff}', '\u{1613a}'..='\u{167ff}', '\u{16a39}'..='\u{16a3f}', + '\u{16a5f}'..='\u{16a5f}', '\u{16a6a}'..='\u{16a6d}', '\u{16abf}'..='\u{16abf}', + '\u{16aca}'..='\u{16acf}', '\u{16aee}'..='\u{16aef}', '\u{16af6}'..='\u{16aff}', + '\u{16b46}'..='\u{16b4f}', '\u{16b5a}'..='\u{16b5a}', '\u{16b62}'..='\u{16b62}', + '\u{16b78}'..='\u{16b7c}', '\u{16b90}'..='\u{16d3f}', '\u{16d7a}'..='\u{16e3f}', + '\u{16e9b}'..='\u{16e9f}', '\u{16eb9}'..='\u{16eba}', '\u{16ed4}'..='\u{16eff}', + '\u{16f4b}'..='\u{16f4e}', '\u{16f88}'..='\u{16f8e}', '\u{16fa0}'..='\u{16fdf}', + '\u{16fe5}'..='\u{16fef}', '\u{16ff7}'..='\u{16fff}', '\u{18cd6}'..='\u{18cfe}', + '\u{18d1f}'..='\u{18d7f}', '\u{18df3}'..='\u{1afef}', '\u{1aff4}'..='\u{1aff4}', + '\u{1affc}'..='\u{1affc}', '\u{1afff}'..='\u{1afff}', '\u{1b123}'..='\u{1b131}', + '\u{1b133}'..='\u{1b14f}', '\u{1b153}'..='\u{1b154}', '\u{1b156}'..='\u{1b163}', + '\u{1b168}'..='\u{1b16f}', '\u{1b2fc}'..='\u{1bbff}', '\u{1bc6b}'..='\u{1bc6f}', + '\u{1bc7d}'..='\u{1bc7f}', '\u{1bc89}'..='\u{1bc8f}', '\u{1bc9a}'..='\u{1bc9b}', + '\u{1bca4}'..='\u{1cbff}', '\u{1ccfd}'..='\u{1ccff}', '\u{1ceb4}'..='\u{1ceb9}', + 
'\u{1ced1}'..='\u{1cedf}', '\u{1cef1}'..='\u{1ceff}', '\u{1cf2e}'..='\u{1cf2f}', + '\u{1cf47}'..='\u{1cf4f}', '\u{1cfc4}'..='\u{1cfff}', '\u{1d0f6}'..='\u{1d0ff}', + '\u{1d127}'..='\u{1d128}', '\u{1d1eb}'..='\u{1d1ff}', '\u{1d246}'..='\u{1d2bf}', + '\u{1d2d4}'..='\u{1d2df}', '\u{1d2f4}'..='\u{1d2ff}', '\u{1d357}'..='\u{1d35f}', + '\u{1d379}'..='\u{1d3ff}', '\u{1d455}'..='\u{1d455}', '\u{1d49d}'..='\u{1d49d}', + '\u{1d4a0}'..='\u{1d4a1}', '\u{1d4a3}'..='\u{1d4a4}', '\u{1d4a7}'..='\u{1d4a8}', + '\u{1d4ad}'..='\u{1d4ad}', '\u{1d4ba}'..='\u{1d4ba}', '\u{1d4bc}'..='\u{1d4bc}', + '\u{1d4c4}'..='\u{1d4c4}', '\u{1d506}'..='\u{1d506}', '\u{1d50b}'..='\u{1d50c}', + '\u{1d515}'..='\u{1d515}', '\u{1d51d}'..='\u{1d51d}', '\u{1d53a}'..='\u{1d53a}', + '\u{1d53f}'..='\u{1d53f}', '\u{1d545}'..='\u{1d545}', '\u{1d547}'..='\u{1d549}', + '\u{1d551}'..='\u{1d551}', '\u{1d6a6}'..='\u{1d6a7}', '\u{1d7cc}'..='\u{1d7cd}', + '\u{1da8c}'..='\u{1da9a}', '\u{1daa0}'..='\u{1daa0}', '\u{1dab0}'..='\u{1deff}', + '\u{1df1f}'..='\u{1df24}', '\u{1df2b}'..='\u{1dfff}', '\u{1e007}'..='\u{1e007}', + '\u{1e019}'..='\u{1e01a}', '\u{1e022}'..='\u{1e022}', '\u{1e025}'..='\u{1e025}', + '\u{1e02b}'..='\u{1e02f}', '\u{1e06e}'..='\u{1e08e}', '\u{1e090}'..='\u{1e0ff}', + '\u{1e12d}'..='\u{1e12f}', '\u{1e13e}'..='\u{1e13f}', '\u{1e14a}'..='\u{1e14d}', + '\u{1e150}'..='\u{1e28f}', '\u{1e2af}'..='\u{1e2bf}', '\u{1e2fa}'..='\u{1e2fe}', + '\u{1e300}'..='\u{1e4cf}', '\u{1e4fa}'..='\u{1e5cf}', '\u{1e5fb}'..='\u{1e5fe}', + '\u{1e600}'..='\u{1e6bf}', '\u{1e6df}'..='\u{1e6df}', '\u{1e6f6}'..='\u{1e6fd}', + '\u{1e700}'..='\u{1e7df}', '\u{1e7e7}'..='\u{1e7e7}', '\u{1e7ec}'..='\u{1e7ec}', + '\u{1e7ef}'..='\u{1e7ef}', '\u{1e7ff}'..='\u{1e7ff}', '\u{1e8c5}'..='\u{1e8c6}', + '\u{1e8d7}'..='\u{1e8ff}', '\u{1e94c}'..='\u{1e94f}', '\u{1e95a}'..='\u{1e95d}', + '\u{1e960}'..='\u{1ec70}', '\u{1ecb5}'..='\u{1ed00}', '\u{1ed3e}'..='\u{1edff}', + '\u{1ee04}'..='\u{1ee04}', '\u{1ee20}'..='\u{1ee20}', '\u{1ee23}'..='\u{1ee23}', + 
'\u{1ee25}'..='\u{1ee26}', '\u{1ee28}'..='\u{1ee28}', '\u{1ee33}'..='\u{1ee33}', + '\u{1ee38}'..='\u{1ee38}', '\u{1ee3a}'..='\u{1ee3a}', '\u{1ee3c}'..='\u{1ee41}', + '\u{1ee43}'..='\u{1ee46}', '\u{1ee48}'..='\u{1ee48}', '\u{1ee4a}'..='\u{1ee4a}', + '\u{1ee4c}'..='\u{1ee4c}', '\u{1ee50}'..='\u{1ee50}', '\u{1ee53}'..='\u{1ee53}', + '\u{1ee55}'..='\u{1ee56}', '\u{1ee58}'..='\u{1ee58}', '\u{1ee5a}'..='\u{1ee5a}', + '\u{1ee5c}'..='\u{1ee5c}', '\u{1ee5e}'..='\u{1ee5e}', '\u{1ee60}'..='\u{1ee60}', + '\u{1ee63}'..='\u{1ee63}', '\u{1ee65}'..='\u{1ee66}', '\u{1ee6b}'..='\u{1ee6b}', + '\u{1ee73}'..='\u{1ee73}', '\u{1ee78}'..='\u{1ee78}', '\u{1ee7d}'..='\u{1ee7d}', + '\u{1ee7f}'..='\u{1ee7f}', '\u{1ee8a}'..='\u{1ee8a}', '\u{1ee9c}'..='\u{1eea0}', + '\u{1eea4}'..='\u{1eea4}', '\u{1eeaa}'..='\u{1eeaa}', '\u{1eebc}'..='\u{1eeef}', + '\u{1eef2}'..='\u{1efff}', '\u{1f02c}'..='\u{1f02f}', '\u{1f094}'..='\u{1f09f}', + '\u{1f0af}'..='\u{1f0b0}', '\u{1f0c0}'..='\u{1f0c0}', '\u{1f0d0}'..='\u{1f0d0}', + '\u{1f0f6}'..='\u{1f0ff}', '\u{1f1ae}'..='\u{1f1e5}', '\u{1f203}'..='\u{1f20f}', + '\u{1f23c}'..='\u{1f23f}', '\u{1f249}'..='\u{1f24f}', '\u{1f252}'..='\u{1f25f}', + '\u{1f266}'..='\u{1f2ff}', '\u{1f6d9}'..='\u{1f6db}', '\u{1f6ed}'..='\u{1f6ef}', + '\u{1f6fd}'..='\u{1f6ff}', '\u{1f7da}'..='\u{1f7df}', '\u{1f7ec}'..='\u{1f7ef}', + '\u{1f7f1}'..='\u{1f7ff}', '\u{1f80c}'..='\u{1f80f}', '\u{1f848}'..='\u{1f84f}', + '\u{1f85a}'..='\u{1f85f}', '\u{1f888}'..='\u{1f88f}', '\u{1f8ae}'..='\u{1f8af}', + '\u{1f8bc}'..='\u{1f8bf}', '\u{1f8c2}'..='\u{1f8cf}', '\u{1f8d9}'..='\u{1f8ff}', + '\u{1fa58}'..='\u{1fa5f}', '\u{1fa6e}'..='\u{1fa6f}', '\u{1fa7d}'..='\u{1fa7f}', + '\u{1fa8b}'..='\u{1fa8d}', '\u{1fac7}'..='\u{1fac7}', '\u{1fac9}'..='\u{1facc}', + '\u{1fadd}'..='\u{1fade}', '\u{1faeb}'..='\u{1faee}', '\u{1faf9}'..='\u{1faff}', + '\u{1fb93}'..='\u{1fb93}', '\u{1fbfb}'..='\u{1ffff}', '\u{2a6e0}'..='\u{2a6ff}', + '\u{2b81e}'..='\u{2b81f}', '\u{2ceae}'..='\u{2ceaf}', '\u{2ebe1}'..='\u{2ebef}', + 
'\u{2ee5e}'..='\u{2f7ff}', '\u{2fa1e}'..='\u{2ffff}', '\u{3134b}'..='\u{3134f}', + '\u{3347a}'..='\u{3fffd}', +]; + #[rustfmt::skip] pub(super) static GRAPHEME_EXTEND: &[RangeInclusive; 383] = &[ '\u{300}'..='\u{36f}', '\u{483}'..='\u{489}', '\u{591}'..='\u{5bd}', '\u{5bf}'..='\u{5bf}', diff --git a/license-metadata.json b/license-metadata.json index e8e13fa8d859c..b58329b2ccc19 100644 --- a/license-metadata.json +++ b/license-metadata.json @@ -3,6 +3,28 @@ "children": [ { "children": [ + { + "children": [ + { + "license": { + "copyright": [ + "The Rust Project Developers (see https://thanks.rust-lang.org)" + ], + "spdx": "Apache-2.0 OR MIT" + }, + "name": "mod.rs", + "type": "file" + } + ], + "license": { + "copyright": [ + "1991-2024 Unicode, Inc" + ], + "spdx": "Unicode-3.0" + }, + "name": "library/core/src/unicode", + "type": "directory" + }, { "children": [ { @@ -178,16 +200,6 @@ "name": "library/backtrace", "type": "directory" }, - { - "license": { - "copyright": [ - "1991-2024 Unicode, Inc" - ], - "spdx": "Unicode-3.0" - }, - "name": "library/core/src/unicode/unicode_data.rs", - "type": "file" - }, { "children": [], "license": { diff --git a/src/tools/unicode-table-generator/src/main.rs b/src/tools/unicode-table-generator/src/main.rs index 398b4c7b7ec5a..6c95f9173eda8 100644 --- a/src/tools/unicode-table-generator/src/main.rs +++ b/src/tools/unicode-table-generator/src/main.rs @@ -71,11 +71,11 @@ //! index of that offset is utilized as the answer to whether we're in the set //! or not. 
-use std::collections::{BTreeMap, HashMap}; +use std::collections::{BTreeMap, BTreeSet, HashMap}; use std::fmt::Write; use std::ops::Range; -use ucd_parse::Codepoints; +use ucd_parse::{Codepoint, Codepoints}; mod cascading_map; mod case_mapping; @@ -88,14 +88,18 @@ use fmt_helpers::CharEscape; use raw_emitter::{RawEmitter, emit_codepoints, emit_whitespace}; static PROPERTIES: &[&str] = &[ + // tidy-alphabetical-start "Alphabetic", - "Lowercase", - "Uppercase", "Case_Ignorable", + "Cf", + "Cn_Planes_0_3", "Grapheme_Extend", - "White_Space", - "N", + "Lowercase", "Lt", + "N", + "Uppercase", + "White_Space", + // tidy-alphabetical-end ]; struct UnicodeData { @@ -138,6 +142,9 @@ fn load_data() -> UnicodeData { } } + // Unassigned characters are not listed in `UnicodeData.txt`, + // so get a list of all the assigned ones + let mut assigned_chars = BTreeSet::new(); let [mut to_lower, mut to_upper, mut to_title] = [const { BTreeMap::new() }; 3]; for row in ucd_parse::UnicodeDataExpander::new( ucd_parse::parse::<_, ucd_parse::UnicodeData>(&UNICODE_DIRECTORY).unwrap(), @@ -147,6 +154,11 @@ fn load_data() -> UnicodeData { } else { row.general_category.as_str() }; + + if !matches!(general_category, "Cs" | "Cn") { + assigned_chars.insert(row.codepoint.value()); + } + if let Some(name) = PROPERTIES.iter().find(|prop| **prop == general_category) { properties .entry(*name) @@ -171,6 +183,25 @@ fn load_data() -> UnicodeData { } } + // Find all unassigned chars in the first 4 planes + for c in '\0'..='\u{3FFFD}' { + let cp = Codepoint::from_u32(c.into()).unwrap(); + if !assigned_chars.contains(&cp.value()) { + properties.entry("Cn_Planes_0_3").or_insert_with(Vec::new).push(Codepoints::Single(cp)); + } + } + + // For now, we hardcode the assigned/unassigned status of characters + // U+3FFFE and above. The assertion below must be kept in sync + // with the `is_unassigned()` method in `library/core/src/char/methods.rs`.
+ for c in '\u{3FFFE}'..=char::MAX { + assert_eq!( + assigned_chars.contains(&u32::from(c)), + matches!(c, '\u{E0001}' | '\u{E0020}'..='\u{E007F}' | '\u{E0100}'..='\u{E01EF}' | '\u{F0000}'..='\u{FFFFD}' | '\u{100000}'..='\u{10FFFD}'), + "{c:?}", + ); + } + for row in ucd_parse::parse::<_, ucd_parse::SpecialCaseMapping>(&UNICODE_DIRECTORY).unwrap() { if !row.conditions.is_empty() { // Skip conditional case mappings @@ -247,7 +278,7 @@ fn main() { modules.push((property.to_lowercase().to_string(), emitter.file)); table_file.push_str(&format!( - "// {:16}: {:5} bytes, {:6} codepoints in {:3} ranges (U+{:06X} - U+{:06X}) using {}\n", + "// {:28}: {:5} bytes, {:6} codepoints in {:3} ranges (U+{:06X} - U+{:06X}) using {}\n", property, emitter.bytes_used, datapoints, @@ -260,10 +291,10 @@ fn main() { } let (conversions, sizes) = case_mapping::generate_case_mapping(&unicode_data); for (name, (desc, size)) in ["to_lower", "to_upper", "to_title"].iter().zip(sizes) { - table_file.push_str(&format!("// {:16}: {:5} bytes, {desc}\n", name, size,)); + table_file.push_str(&format!("// {:28}: {:5} bytes, {desc}\n", name, size,)); total_bytes += size; } - table_file.push_str(&format!("// {:16}: {:5} bytes\n", "Total", total_bytes)); + table_file.push_str(&format!("// {:28}: {:5} bytes\n", "Total", total_bytes)); // Include the range search function table_file.push('\n'); From 6079a98c2162c8a40f3278fd16b1ca76d664a6f3 Mon Sep 17 00:00:00 2001 From: Jules Bertholet Date: Sun, 26 Apr 2026 18:38:40 -0400 Subject: [PATCH 4/4] Consider all `Default_Ignorable_Code_Point`s unprintable These characters may be hidden/invisible otherwise. 
--- library/core/src/char/methods.rs | 33 +++++++++++++++ library/core/src/unicode/mod.rs | 1 + library/core/src/unicode/unicode_data.rs | 41 ++++++++++++++++++- library/coretests/tests/unicode.rs | 9 ++++ library/coretests/tests/unicode/test_data.rs | 10 +++++ src/tools/unicode-table-generator/src/main.rs | 1 + 6 files changed, 94 insertions(+), 1 deletion(-) diff --git a/library/core/src/char/methods.rs b/library/core/src/char/methods.rs index 785f781c1860c..c275ae2f621d9 100644 --- a/library/core/src/char/methods.rs +++ b/library/core/src/char/methods.rs @@ -493,6 +493,7 @@ impl char { || self.is_private_use() || self.is_whitespace() || args.escape_grapheme_extender && self.is_grapheme_extender() + || self.is_default_ignorable() || self.is_format_control() || self.is_unassigned() => { @@ -1225,6 +1226,38 @@ impl char { } } + /// Returns `true` if this `char` has the `Default_Ignorable_Code_Point` property. + /// These characters [should be displayed as invisible in fallback rendering](https://www.unicode.org/faq/unsup_char#3). + /// + /// `Default_Ignorable_Code_Point` is [described] in Chapter 5 (Implementation Guidelines) of the Unicode Standard, + /// and [specified] in the Unicode Character Database [`DerivedCoreProperties.txt`].
+ /// + /// [described]: https://www.unicode.org/versions/latest/core-spec/chapter-5/#G40120 + /// [specified]: https://www.unicode.org/reports/tr44/#Default_Ignorable_Code_Point + /// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt + /// + /// # Examples + /// + /// Basic usage: + /// + /// ```ignore(private) + /// assert!('\u{AD}'.is_default_ignorable()); // SOFT HYPHEN + /// assert!('\u{115F}'.is_default_ignorable()); // HANGUL CHOSEONG FILLER + /// assert!('\u{200B}'.is_default_ignorable()); // ZERO WIDTH SPACE + /// assert!('\u{E0041}'.is_default_ignorable()); // TAG LATIN CAPITAL LETTER A + /// assert!(!'۝'.is_default_ignorable()); // ARABIC END OF AYAH + /// assert!(!'𓐲'.is_default_ignorable()); // EGYPTIAN HIEROGLYPH INSERT AT TOP START + /// assert!(!' '.is_default_ignorable()); + /// assert!(!'\n'.is_default_ignorable()); + /// assert!(!'\0'.is_default_ignorable()); + /// assert!(!'q'.is_default_ignorable()); + /// ``` + #[must_use] + #[inline] + fn is_default_ignorable(self) -> bool { + self > '\u{AC}' && unicode::Default_Ignorable_Code_Point(self) + } + /// Returns `true` if this `char` has the `Grapheme_Extend` property.
/// /// `Grapheme_Extend` is [described] in Chapter 3 (Conformance) of the Unicode Standard, diff --git a/library/core/src/unicode/mod.rs b/library/core/src/unicode/mod.rs index 61acb08487057..1648795facd40 100644 --- a/library/core/src/unicode/mod.rs +++ b/library/core/src/unicode/mod.rs @@ -11,6 +11,7 @@ pub(crate) use unicode_data::alphabetic::lookup as Alphabetic; pub(crate) use unicode_data::case_ignorable::lookup as Case_Ignorable; pub(crate) use unicode_data::cf::lookup as Cf; pub(crate) use unicode_data::cn_planes_0_3::lookup as Cn_planes_0_3; +pub(crate) use unicode_data::default_ignorable_code_point::lookup as Default_Ignorable_Code_Point; pub(crate) use unicode_data::grapheme_extend::lookup as Grapheme_Extend; pub(crate) use unicode_data::lowercase::lookup as Lowercase; pub(crate) use unicode_data::lt::lookup as Lt; diff --git a/library/core/src/unicode/unicode_data.rs b/library/core/src/unicode/unicode_data.rs index e5bcdc270ccfb..7154da4d23181 100644 --- a/library/core/src/unicode/unicode_data.rs +++ b/library/core/src/unicode/unicode_data.rs @@ -3,6 +3,7 @@ // Case_Ignorable : 1063 bytes, 2789 codepoints in 459 ranges (U+0000A8 - U+0E01F0) using skiplist // Cf : 87 bytes, 170 codepoints in 21 ranges (U+0000AD - U+0E0080) using skiplist // Cn_Planes_0_3 : 1677 bytes, 94165 codepoints in 730 ranges (U+000378 - U+03FFFE) using skiplist +// Default_Ignorable_Code_Point: 83 bytes, 4174 codepoints in 17 ranges (U+0000AD - U+0E1000) using skiplist // Grapheme_Extend : 899 bytes, 2232 codepoints in 383 ranges (U+000300 - U+0E01F0) using skiplist // Lowercase : 943 bytes, 2569 codepoints in 676 ranges (U+0000AA - U+01E944) using bitset // Lt : 33 bytes, 31 codepoints in 10 ranges (U+0001C5 - U+001FFD) using skiplist @@ -12,7 +13,7 @@ // to_lower : 1112 bytes, 1462 codepoints in 185 ranges (U+0000C0 - U+01E921) using 2-level LUT // to_upper : 1998 bytes, 1554 codepoints in 299 ranges (U+0000B5 - U+01E943) using 2-level LUT // to_title : 340 bytes, 135 
codepoints in 49 ranges (U+0000DF - U+00FB17) using 2-level LUT -// Total : 11393 bytes +// Total : 11476 bytes #[inline(always)] const fn bitset_search< @@ -488,6 +489,44 @@ pub mod cn_planes_0_3 { } } +#[rustfmt::skip] +pub mod default_ignorable_code_point { + use super::ShortOffsetRunHeader; + + static SHORT_OFFSET_RUNS: [ShortOffsetRunHeader; 12] = [ + ShortOffsetRunHeader::new(0, 847), ShortOffsetRunHeader::new(3, 1564), + ShortOffsetRunHeader::new(5, 4447), ShortOffsetRunHeader::new(7, 6068), + ShortOffsetRunHeader::new(9, 8203), ShortOffsetRunHeader::new(13, 12644), + ShortOffsetRunHeader::new(19, 65024), ShortOffsetRunHeader::new(21, 113824), + ShortOffsetRunHeader::new(29, 119155), ShortOffsetRunHeader::new(31, 917504), + ShortOffsetRunHeader::new(33, 921600), ShortOffsetRunHeader::new(34, 2035712), + ]; + static OFFSETS: [u8; 35] = [ + 173, 1, 0, 1, 0, 1, 0, 2, 0, 2, 85, 5, 0, 5, 26, 5, 49, 16, 0, 1, 0, 16, 239, 1, 160, 1, + 79, 9, 0, 4, 0, 8, 0, 0, 0, + ]; + #[inline] + pub fn lookup(c: char) -> bool { + debug_assert!(!c.is_ascii()); + (c as u32) >= 0xad && lookup_slow(c) + } + + #[inline(never)] + fn lookup_slow(c: char) -> bool { + const { + assert!(SHORT_OFFSET_RUNS.last().unwrap().0 > char::MAX as u32); + let mut i = 0; + while i < SHORT_OFFSET_RUNS.len() { + assert!(SHORT_OFFSET_RUNS[i].start_index() < OFFSETS.len()); + i += 1; + } + } + // SAFETY: We just ensured the last element of `SHORT_OFFSET_RUNS` is greater than `std::char::MAX` + // and the start indices of all elements in `SHORT_OFFSET_RUNS` are smaller than `OFFSETS.len()`. 
+ unsafe { super::skip_search(c, &SHORT_OFFSET_RUNS, &OFFSETS) } + } +} + #[rustfmt::skip] pub mod grapheme_extend { use super::ShortOffsetRunHeader; diff --git a/library/coretests/tests/unicode.rs b/library/coretests/tests/unicode.rs index 793255563aaae..bd9e1ac2ced38 100644 --- a/library/coretests/tests/unicode.rs +++ b/library/coretests/tests/unicode.rs @@ -71,6 +71,15 @@ fn cn_planes_0_3() { test_boolean_property(test_data::CN_PLANES_0_3, unicode_data::cn_planes_0_3::lookup); } +#[test] +#[cfg_attr(miri, ignore)] // Miri is too slow +fn default_ignorable_code_point() { + test_boolean_property( + test_data::DEFAULT_IGNORABLE_CODE_POINT, + unicode_data::default_ignorable_code_point::lookup, + ); +} + #[test] #[cfg_attr(miri, ignore)] // Miri is too slow fn grapheme_extend() { diff --git a/library/coretests/tests/unicode/test_data.rs b/library/coretests/tests/unicode/test_data.rs index cb72682b2e00e..a246716d54fdf 100644 --- a/library/coretests/tests/unicode/test_data.rs +++ b/library/coretests/tests/unicode/test_data.rs @@ -638,6 +638,16 @@ pub(super) static CN_PLANES_0_3: &[RangeInclusive<char>; 730] = &[ '\u{3347a}'..='\u{3fffd}', ]; +#[rustfmt::skip] +pub(super) static DEFAULT_IGNORABLE_CODE_POINT: &[RangeInclusive<char>; 17] = &[ + '\u{ad}'..='\u{ad}', '\u{34f}'..='\u{34f}', '\u{61c}'..='\u{61c}', '\u{115f}'..='\u{1160}', + '\u{17b4}'..='\u{17b5}', '\u{180b}'..='\u{180f}', '\u{200b}'..='\u{200f}', + '\u{202a}'..='\u{202e}', '\u{2060}'..='\u{206f}', '\u{3164}'..='\u{3164}', + '\u{fe00}'..='\u{fe0f}', '\u{feff}'..='\u{feff}', '\u{ffa0}'..='\u{ffa0}', + '\u{fff0}'..='\u{fff8}', '\u{1bca0}'..='\u{1bca3}', '\u{1d173}'..='\u{1d17a}', + '\u{e0000}'..='\u{e0fff}', +]; + #[rustfmt::skip] pub(super) static GRAPHEME_EXTEND: &[RangeInclusive<char>; 383] = &[ '\u{300}'..='\u{36f}', '\u{483}'..='\u{489}', '\u{591}'..='\u{5bd}', '\u{5bf}'..='\u{5bf}', diff --git a/src/tools/unicode-table-generator/src/main.rs b/src/tools/unicode-table-generator/src/main.rs index 6c95f9173eda8..aedab398e9313 
100644 --- a/src/tools/unicode-table-generator/src/main.rs +++ b/src/tools/unicode-table-generator/src/main.rs @@ -93,6 +93,7 @@ static PROPERTIES: &[&str] = &[ "Case_Ignorable", "Cf", "Cn_Planes_0_3", + "Default_Ignorable_Code_Point", "Grapheme_Extend", "Lowercase", "Lt",