Skip to content

Commit 435f983

Browse files
authored
Rollup merge of rust-lang#154742 - Jules-Bertholet:casefold, r=Mark-Simulacrum
Add APIs for case folding to the standard library

[Libs-api requested these](rust-lang#154287 (comment)), so here they are.

New public API (gated behind `#[feature(casefold)]`):

```rust
impl char {
    pub fn to_casefold(self) -> ToCasefold;
}

impl str {
    pub fn to_casefold(&self) -> String;
    pub fn eq_ignore_case(&self) -> bool;
}

pub struct ToCasefold { ... }
impl Iterator for ToCasefold { type Item = char; ... }
impl DoubleEndedIterator for ToCasefold { ... }
impl FusedIterator for ToCasefold { }
impl ExactSizeIterator for ToCasefold { ... }
impl fmt::Display for ToCasefold { ... }
```

## Notes

- This only adds a negligible amount of static data to `core::unicode`. To accomplish that, we compute the case-folding for most characters as the lowercase of their uppercase; this double mapping adds some complexity to the implementation.
- No normalization (e.g. NFC) is performed, so visually and semantically equivalent strings can compare unequal.
- I have not put any effort into optimizing `eq_ignore_case()`; there may be a more performant implementation.
- `char::eq_ignore_case()` is left to future work—it's a potential footgun, so we may want to think more deeply about how to expose and document that API.

@rustbot label T-libs-api A-unicode
2 parents 6cd4f49 + ee7f3dd commit 435f983

19 files changed

Lines changed: 803 additions & 64 deletions

File tree

Cargo.lock

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6069,6 +6069,7 @@ checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493"
60696069
name = "unicode-table-generator"
60706070
version = "0.1.0"
60716071
dependencies = [
6072+
"rustc-hash 2.1.1",
60726073
"ucd-parse",
60736074
]
60746075

library/alloc/src/str.rs

Lines changed: 118 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
// It's cleaner to just turn off the unused_imports warning than to fix them.
88
#![allow(unused_imports)]
99

10+
#[cfg(not(no_global_oom_handling))]
11+
use core::ascii;
1012
use core::borrow::{Borrow, BorrowMut};
1113
use core::iter::FusedIterator;
1214
use core::mem::MaybeUninit;
@@ -431,9 +433,7 @@ impl str {
431433
without modifying the original"]
432434
#[stable(feature = "unicode_case_mapping", since = "1.2.0")]
433435
pub fn to_lowercase(&self) -> String {
434-
// SAFETY: `to_ascii_lowercase` preserves ASCII bytes, so the converted
435-
// prefix remains valid UTF-8.
436-
let (mut s, rest) = unsafe { convert_while_ascii(self, u8::to_ascii_lowercase) };
436+
let (mut s, rest) = convert_while_ascii(self, ascii::Char::to_lowercase);
437437

438438
let prefix_len = s.len();
439439

@@ -638,9 +638,7 @@ impl str {
638638
without modifying the original"]
639639
#[stable(feature = "unicode_case_mapping", since = "1.2.0")]
640640
pub fn to_uppercase(&self) -> String {
641-
// SAFETY: `to_ascii_uppercase` preserves ASCII bytes, so the converted
642-
// prefix remains valid UTF-8.
643-
let (mut s, rest) = unsafe { convert_while_ascii(self, u8::to_ascii_uppercase) };
641+
let (mut s, rest) = convert_while_ascii(self, ascii::Char::to_uppercase);
644642

645643
for c in rest.chars() {
646644
match conversions::to_upper(c) {
@@ -659,6 +657,106 @@ impl str {
659657
s
660658
}
661659

660+
/// Returns the case-folded equivalent of this string slice, as a new [`String`].
661+
///
662+
/// Case folding is a transformation, mostly matching lowercase, that is meant to be used
663+
/// for case-insensitive string comparisons. Case-folded strings should not usually
664+
/// be exposed directly to users.
665+
///
666+
/// For the precise specification of case folding, see
667+
/// [Chapter 3 (Conformance)](https://www.unicode.org/versions/latest/core-spec/chapter-3/#G63737)
668+
/// of the Unicode standard.
669+
///
670+
/// Since some characters can expand into multiple characters when case folding,
671+
/// this function returns a [`String`] instead of modifying the parameter in-place.
672+
///
673+
/// No [normalization] (e.g. NFC) is performed, so visually and semantically identical strings
674+
/// might still casefold differently. For example, `"Å"` (U+00C5 LATIN CAPITAL LETTER A WITH RING ABOVE)
675+
/// is considered distinct from `"Å"` (A followed by U+030A COMBINING RING ABOVE),
676+
/// even though Unicode considers them canonically equivalent.
677+
///
678+
/// Like [`char::to_casefold_unnormalized()`], this method does not handle language-specific
679+
/// casings like Turkish and Azeri I/ı/İ/i. See that method's documentation
680+
/// for more information.
681+
///
682+
/// # Examples
683+
///
684+
/// Basic usage:
685+
///
686+
/// ```
687+
/// #![feature(casefold)]
688+
/// let s0 = "HELLO";
689+
/// let s1 = "Hello";
690+
///
691+
/// assert_eq!(s0.to_casefold_unnormalized(), s1.to_casefold_unnormalized());
692+
/// assert_eq!(s0.to_casefold_unnormalized(), "hello")
693+
/// ```
694+
///
695+
/// Scripts without case are not changed:
696+
///
697+
/// ```
698+
/// #![feature(casefold)]
699+
/// let new_year = "农历新年";
700+
///
701+
/// assert_eq!(new_year, new_year.to_casefold_unnormalized());
702+
/// ```
703+
///
704+
/// One character can become multiple:
705+
///
706+
/// ```
707+
/// #![feature(casefold)]
708+
/// let s0 = "TSCHÜẞ";
709+
/// let s1 = "TSCHÜSS";
710+
/// let s2 = "tschüß";
711+
///
712+
/// assert_eq!(s0.to_casefold_unnormalized(), s1.to_casefold_unnormalized());
713+
/// assert_eq!(s0.to_casefold_unnormalized(), s2.to_casefold_unnormalized());
714+
/// assert_eq!(s0.to_casefold_unnormalized(), "tschüss");
715+
/// ```
716+
///
717+
/// No NFC [normalization] is performed:
718+
///
719+
/// ```rust
720+
/// #![feature(casefold)]
721+
/// // These two strings are visually and semantically identical...
722+
/// let comp = "Å";
723+
/// let decomp = "Å";
724+
///
725+
/// // ... but not codepoint-for-codepoint equal.
726+
/// assert_eq!(comp, "\u{C5}");
727+
/// assert_eq!(decomp, "A\u{030A}");
728+
///
729+
/// // Their case-foldings are likewise unequal:
730+
/// assert_eq!(comp.to_casefold_unnormalized(), "\u{E5}");
731+
/// assert_eq!(decomp.to_casefold_unnormalized(), "a\u{030A}");
732+
/// ```
733+
///
734+
/// [normalization]: https://www.unicode.org/faq/normalization
735+
#[cfg(not(no_global_oom_handling))]
736+
#[rustc_allow_incoherent_impl]
737+
#[must_use = "this returns the case-folded string as a new String, \
738+
without modifying the original"]
739+
#[unstable(feature = "casefold", issue = "154742")]
740+
pub fn to_casefold_unnormalized(&self) -> String {
741+
let (mut s, rest) = convert_while_ascii(self, ascii::Char::to_lowercase);
742+
743+
for c in rest.chars() {
744+
match conversions::to_casefold(c) {
745+
[a, '\0', _] => s.push(a),
746+
[a, b, '\0'] => {
747+
s.push(a);
748+
s.push(b);
749+
}
750+
[a, b, c] => {
751+
s.push(a);
752+
s.push(b);
753+
s.push(c);
754+
}
755+
}
756+
}
757+
s
758+
}
759+
662760
/// Converts a [`Box<str>`] into a [`String`] without copying or allocating.
663761
///
664762
/// # Examples
@@ -803,23 +901,19 @@ pub unsafe fn from_boxed_utf8_unchecked(v: Box<[u8]>) -> Box<str> {
803901
///
804902
/// This function is only public so that it can be verified in a codegen test,
805903
/// see `issue-123712-str-to-lower-autovectorization.rs`.
806-
///
807-
/// # Safety
808-
///
809-
/// `convert` must return an ASCII byte for every ASCII input byte.
810904
#[unstable(feature = "str_internals", issue = "none")]
811905
#[doc(hidden)]
812906
#[inline]
813907
#[cfg(not(no_global_oom_handling))]
814-
pub unsafe fn convert_while_ascii(s: &str, convert: fn(&u8) -> u8) -> (String, &str) {
908+
pub fn convert_while_ascii(s: &str, convert: fn(ascii::Char) -> ascii::Char) -> (String, &str) {
815909
// Process the input in chunks of 16 bytes to enable auto-vectorization.
816910
// Previously the chunk size depended on the size of `usize`,
817911
// but on 32-bit platforms with sse or neon is also the better choice.
818912
// The only downside on other platforms would be a bit more loop-unrolling.
819913
const N: usize = 16;
820914

821915
let mut slice = s.as_bytes();
822-
let mut out = Vec::with_capacity(slice.len());
916+
let mut out: Vec<u8> = Vec::with_capacity(slice.len());
823917
let mut out_slice = out.spare_capacity_mut();
824918

825919
let mut ascii_prefix_len = 0_usize;
@@ -844,7 +938,10 @@ pub unsafe fn convert_while_ascii(s: &str, convert: fn(&u8) -> u8) -> (String, &
844938
}
845939

846940
for j in 0..N {
847-
out_chunk[j] = MaybeUninit::new(convert(&chunk[j]));
941+
out_chunk[j] = MaybeUninit::new(
942+
// SAFETY: we checked that this byte is valid ASCII above
943+
convert(unsafe { ascii::Char::from_u8_unchecked(chunk[j]) }).to_u8(),
944+
);
848945
}
849946

850947
ascii_prefix_len += N;
@@ -858,10 +955,17 @@ pub unsafe fn convert_while_ascii(s: &str, convert: fn(&u8) -> u8) -> (String, &
858955
if byte > 127 {
859956
break;
860957
}
958+
959+
let converted_byte = MaybeUninit::new(
960+
// SAFETY: we checked that this byte is valid ASCII above
961+
convert(unsafe { ascii::Char::from_u8_unchecked(byte) }).to_u8(),
962+
);
963+
861964
// SAFETY: out_slice has at least same length as input slice
862965
unsafe {
863-
*out_slice.get_unchecked_mut(0) = MaybeUninit::new(convert(&byte));
966+
*out_slice.get_unchecked_mut(0) = converted_byte;
864967
}
968+
865969
ascii_prefix_len += 1;
866970
slice = unsafe { slice.get_unchecked(1..) };
867971
out_slice = unsafe { out_slice.get_unchecked_mut(1..) };

library/alloctests/tests/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
#![feature(const_heap)]
44
#![feature(deque_extend_front)]
55
#![feature(iter_array_chunks)]
6+
#![feature(casefold)]
67
#![feature(cow_is_borrowed)]
78
#![feature(core_intrinsics)]
89
#![feature(downcast_unchecked)]

library/alloctests/tests/str.rs

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1886,7 +1886,13 @@ fn to_lowercase() {
18861886
#[test]
18871887
fn to_uppercase() {
18881888
assert_eq!("".to_uppercase(), "");
1889-
assert_eq!("aéDžßfiᾀ".to_uppercase(), "AÉDŽSSFIἈΙ");
1889+
assert_eq!("aéDžßẞfiᾀ".to_uppercase(), "AÉDŽSSẞFIἈΙ");
1890+
}
1891+
1892+
#[test]
fn to_casefold_unnormalized() {
    // The empty string folds to the empty string.
    let empty = "";
    assert_eq!(empty.to_casefold_unnormalized(), "");

    // Non-ASCII input mixing one-to-one and one-to-many foldings
    // (for example, `ß` expands to `ss`).
    let mixed = "ꮿfiῲὼ\u{0345}ßẞΣς";
    let expected = "Ꮿfiὼιὼιssssσσ";
    assert_eq!(mixed.to_casefold_unnormalized(), expected);
}
18911897

18921898
#[test]

library/core/src/ascii/ascii_char.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -476,6 +476,11 @@ impl AsciiChar {
476476
#[unstable(feature = "ascii_char", issue = "110998")]
477477
#[inline]
478478
pub const unsafe fn from_u8_unchecked(b: u8) -> Self {
479+
assert_unsafe_precondition!(
480+
check_library_ub,
481+
"`ascii::Char::from_u8_unchecked` input cannot exceed 127.",
482+
(b: u8 = b) => b <= 127,
483+
);
479484
// SAFETY: Our safety precondition is that `b` is in-range.
480485
unsafe { transmute(b) }
481486
}

0 commit comments

Comments
 (0)