Skip to content

Commit 6079a98

Browse files
Consider all Default_Ignorable_Code_Points unprintable
These characters may be hidden/invisible otherwise.
1 parent 35eb40a commit 6079a98

6 files changed

Lines changed: 93 additions & 1 deletion

File tree

library/core/src/char/methods.rs

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -493,6 +493,7 @@ impl char {
493493
|| self.is_private_use()
494494
|| self.is_whitespace()
495495
|| args.escape_grapheme_extender && self.is_grapheme_extender()
496+
|| self.is_default_ignorable()
496497
|| self.is_format_control()
497498
|| self.is_unassigned() =>
498499
{
@@ -1225,6 +1226,37 @@ impl char {
12251226
}
12261227
}
12271228

1229+
/// Returns `true` if this `char` has the `Default_Ignorable_Code_Point` property.
1230+
/// These characters [should be displayed as invisible in fallback rendering](https://www.unicode.org/faq/unsup_char#3).
1231+
///
1232+
/// `Default_Ignorable_Code_Point` is [described] in Chapter 5 (Implementation Guidelines) of the Unicode Standard,
1233+
/// and [specified] in the Unicode Character Database [`DerivedCoreProperties.txt`].
1234+
///
1235+
/// [described]: https://www.unicode.org/versions/latest/core-spec/chapter-5/#G40120
1236+
/// [specified]: https://www.unicode.org/reports/tr44/#Default_Ignorable_Code_Point
1237+
/// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
1238+
///
1239+
/// # Examples
1240+
///
1241+
/// Basic usage:
1242+
///
1243+
/// ```ignore(private)
1244+
/// assert!('\u{AD}'.is_default_ignorable()); // SOFT HYPHEN
1245+
/// assert!('\u{115F}'.is_default_ignorable()); // HANGUL CHOSEONG FILLER
1246+
/// assert!('\u{200B}'.is_default_ignorable()); // ZERO WIDTH SPACE
1247+
/// assert!('\u{E0041}'.is_default_ignorable()); // TAG LATIN CAPITAL LETTER A
1248+
/// assert!(!'۝'.is_default_ignorable()); // ARABIC END OF AYAH
1249+
/// assert!(!'𓐲'.is_default_ignorable()); // EGYPTIAN HIEROGLYPH INSERT AT TOP START
1250+
/// assert!(!' '.is_default_ignorable());
1251+
/// assert!(!'\n'.is_default_ignorable());
1252+
/// assert!(!'\0'.is_default_ignorable());
1253+
/// assert!(!'q'.is_default_ignorable());
/// ```
1254+
#[must_use]
1255+
#[inline]
1256+
fn is_default_ignorable(self) -> bool {
1257+
self > '\u{AC}' && unicode::Default_Ignorable_Code_Point(self)
1258+
}
1259+
12281260
/// Returns `true` if this `char` has the `Grapheme_Extend` property.
12291261
///
12301262
/// `Grapheme_Extend` is [described] in Chapter 3 (Conformance) of the Unicode Standard,

library/core/src/unicode/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ pub(crate) use unicode_data::alphabetic::lookup as Alphabetic;
1111
pub(crate) use unicode_data::case_ignorable::lookup as Case_Ignorable;
1212
pub(crate) use unicode_data::cf::lookup as Cf;
1313
pub(crate) use unicode_data::cn_planes_0_3::lookup as Cn_planes_0_3;
14+
pub(crate) use unicode_data::default_ignorable_code_point::lookup as Default_Ignorable_Code_Point;
1415
pub(crate) use unicode_data::grapheme_extend::lookup as Grapheme_Extend;
1516
pub(crate) use unicode_data::lowercase::lookup as Lowercase;
1617
pub(crate) use unicode_data::lt::lookup as Lt;

library/core/src/unicode/unicode_data.rs

Lines changed: 40 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
// Case_Ignorable : 1063 bytes, 2789 codepoints in 459 ranges (U+0000A8 - U+0E01F0) using skiplist
44
// Cf : 87 bytes, 170 codepoints in 21 ranges (U+0000AD - U+0E0080) using skiplist
55
// Cn_Planes_0_3 : 1677 bytes, 94165 codepoints in 730 ranges (U+000378 - U+03FFFE) using skiplist
6+
// Default_Ignorable_Code_Point: 83 bytes, 4174 codepoints in 17 ranges (U+0000AD - U+0E1000) using skiplist
67
// Grapheme_Extend : 899 bytes, 2232 codepoints in 383 ranges (U+000300 - U+0E01F0) using skiplist
78
// Lowercase : 943 bytes, 2569 codepoints in 676 ranges (U+0000AA - U+01E944) using bitset
89
// Lt : 33 bytes, 31 codepoints in 10 ranges (U+0001C5 - U+001FFD) using skiplist
@@ -12,7 +13,7 @@
1213
// to_lower : 1112 bytes, 1462 codepoints in 185 ranges (U+0000C0 - U+01E921) using 2-level LUT
1314
// to_upper : 1998 bytes, 1554 codepoints in 299 ranges (U+0000B5 - U+01E943) using 2-level LUT
1415
// to_title : 340 bytes, 135 codepoints in 49 ranges (U+0000DF - U+00FB17) using 2-level LUT
15-
// Total : 11393 bytes
16+
// Total : 11476 bytes
1617

1718
#[inline(always)]
1819
const fn bitset_search<
@@ -488,6 +489,44 @@ pub mod cn_planes_0_3 {
488489
}
489490
}
490491

492+
#[rustfmt::skip]
493+
pub mod default_ignorable_code_point {
494+
use super::ShortOffsetRunHeader;
495+
496+
static SHORT_OFFSET_RUNS: [ShortOffsetRunHeader; 12] = [
497+
ShortOffsetRunHeader::new(0, 847), ShortOffsetRunHeader::new(3, 1564),
498+
ShortOffsetRunHeader::new(5, 4447), ShortOffsetRunHeader::new(7, 6068),
499+
ShortOffsetRunHeader::new(9, 8203), ShortOffsetRunHeader::new(13, 12644),
500+
ShortOffsetRunHeader::new(19, 65024), ShortOffsetRunHeader::new(21, 113824),
501+
ShortOffsetRunHeader::new(29, 119155), ShortOffsetRunHeader::new(31, 917504),
502+
ShortOffsetRunHeader::new(33, 921600), ShortOffsetRunHeader::new(34, 2035712),
503+
];
504+
static OFFSETS: [u8; 35] = [
505+
173, 1, 0, 1, 0, 1, 0, 2, 0, 2, 85, 5, 0, 5, 26, 5, 49, 16, 0, 1, 0, 16, 239, 1, 160, 1,
506+
79, 9, 0, 4, 0, 8, 0, 0, 0,
507+
];
508+
#[inline]
509+
pub fn lookup(c: char) -> bool {
510+
debug_assert!(!c.is_ascii());
511+
(c as u32) >= 0xad && lookup_slow(c)
512+
}
513+
514+
#[inline(never)]
515+
fn lookup_slow(c: char) -> bool {
516+
const {
517+
assert!(SHORT_OFFSET_RUNS.last().unwrap().0 > char::MAX as u32);
518+
let mut i = 0;
519+
while i < SHORT_OFFSET_RUNS.len() {
520+
assert!(SHORT_OFFSET_RUNS[i].start_index() < OFFSETS.len());
521+
i += 1;
522+
}
523+
}
524+
// SAFETY: We just ensured the last element of `SHORT_OFFSET_RUNS` is greater than `std::char::MAX`
525+
// and the start indices of all elements in `SHORT_OFFSET_RUNS` are smaller than `OFFSETS.len()`.
526+
unsafe { super::skip_search(c, &SHORT_OFFSET_RUNS, &OFFSETS) }
527+
}
528+
}
529+
491530
#[rustfmt::skip]
492531
pub mod grapheme_extend {
493532
use super::ShortOffsetRunHeader;

library/coretests/tests/unicode.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,15 @@ fn cn_planes_0_3() {
7171
test_boolean_property(test_data::CN_PLANES_0_3, unicode_data::cn_planes_0_3::lookup);
7272
}
7373

74+
#[test]
75+
#[cfg_attr(miri, ignore)] // Miri is too slow
76+
fn default_ignorable_code_point() {
77+
test_boolean_property(
78+
test_data::DEFAULT_IGNORABLE_CODE_POINT,
79+
unicode_data::default_ignorable_code_point::lookup,
80+
);
81+
}
82+
7483
#[test]
7584
#[cfg_attr(miri, ignore)] // Miri is too slow
7685
fn grapheme_extend() {

library/coretests/tests/unicode/test_data.rs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -638,6 +638,16 @@ pub(super) static CN_PLANES_0_3: &[RangeInclusive<char>; 730] = &[
638638
'\u{3347a}'..='\u{3fffd}',
639639
];
640640

641+
#[rustfmt::skip]
642+
pub(super) static DEFAULT_IGNORABLE_CODE_POINT: &[RangeInclusive<char>; 17] = &[
643+
'\u{ad}'..='\u{ad}', '\u{34f}'..='\u{34f}', '\u{61c}'..='\u{61c}', '\u{115f}'..='\u{1160}',
644+
'\u{17b4}'..='\u{17b5}', '\u{180b}'..='\u{180f}', '\u{200b}'..='\u{200f}',
645+
'\u{202a}'..='\u{202e}', '\u{2060}'..='\u{206f}', '\u{3164}'..='\u{3164}',
646+
'\u{fe00}'..='\u{fe0f}', '\u{feff}'..='\u{feff}', '\u{ffa0}'..='\u{ffa0}',
647+
'\u{fff0}'..='\u{fff8}', '\u{1bca0}'..='\u{1bca3}', '\u{1d173}'..='\u{1d17a}',
648+
'\u{e0000}'..='\u{e0fff}',
649+
];
650+
641651
#[rustfmt::skip]
642652
pub(super) static GRAPHEME_EXTEND: &[RangeInclusive<char>; 383] = &[
643653
'\u{300}'..='\u{36f}', '\u{483}'..='\u{489}', '\u{591}'..='\u{5bd}', '\u{5bf}'..='\u{5bf}',

src/tools/unicode-table-generator/src/main.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,7 @@ static PROPERTIES: &[&str] = &[
9393
"Case_Ignorable",
9494
"Cf",
9595
"Cn_Planes_0_3",
96+
"Default_Ignorable_Code_Point",
9697
"Grapheme_Extend",
9798
"Lowercase",
9899
"Lt",

0 commit comments

Comments
 (0)