77// It's cleaner to just turn off the unused_imports warning than to fix them.
88#![ allow( unused_imports) ]
99
10+ #[ cfg( not( no_global_oom_handling) ) ]
11+ use core:: ascii;
1012use core:: borrow:: { Borrow , BorrowMut } ;
1113use core:: iter:: FusedIterator ;
1214use core:: mem:: MaybeUninit ;
@@ -431,9 +433,7 @@ impl str {
431433 without modifying the original"]
432434 #[ stable( feature = "unicode_case_mapping" , since = "1.2.0" ) ]
433435 pub fn to_lowercase ( & self ) -> String {
434- // SAFETY: `to_ascii_lowercase` preserves ASCII bytes, so the converted
435- // prefix remains valid UTF-8.
436- let ( mut s, rest) = unsafe { convert_while_ascii ( self , u8:: to_ascii_lowercase) } ;
436+ let ( mut s, rest) = convert_while_ascii ( self , ascii:: Char :: to_lowercase) ;
437437
438438 let prefix_len = s. len ( ) ;
439439
@@ -638,9 +638,7 @@ impl str {
638638 without modifying the original"]
639639 #[ stable( feature = "unicode_case_mapping" , since = "1.2.0" ) ]
640640 pub fn to_uppercase ( & self ) -> String {
641- // SAFETY: `to_ascii_uppercase` preserves ASCII bytes, so the converted
642- // prefix remains valid UTF-8.
643- let ( mut s, rest) = unsafe { convert_while_ascii ( self , u8:: to_ascii_uppercase) } ;
641+ let ( mut s, rest) = convert_while_ascii ( self , ascii:: Char :: to_uppercase) ;
644642
645643 for c in rest. chars ( ) {
646644 match conversions:: to_upper ( c) {
@@ -659,6 +657,106 @@ impl str {
659657 s
660658 }
661659
660+ /// Returns the case-folded equivalent of this string slice, as a new [`String`].
661+ ///
662+ /// Case folding is a transformation, mostly matching lowercase, that is meant to be used
663+ /// for case-insensitive string comparisons. Case-folded strings should not usually
664+ /// be exposed directly to users.
665+ ///
666+ /// For the precise specification of case folding, see
667+ /// [Chapter 3 (Conformance)](https://www.unicode.org/versions/latest/core-spec/chapter-3/#G63737)
668+ /// of the Unicode standard.
669+ ///
670+ /// Since some characters can expand into multiple characters when case folding,
671+ /// this function returns a [`String`] instead of modifying the parameter in-place.
672+ ///
673+ /// No [normalization] (e.g. NFC) is performed, so visually and semantically identical strings
674+ /// might still casefold differently. For example, `"Å"` (U+00C5 LATIN CAPITAL LETTER A WITH RING ABOVE)
675+ /// is considered distinct from `"Å"` (A followed by U+030A COMBINING RING ABOVE),
676+ /// even though Unicode considers them canonically equivalent.
677+ ///
678+ /// Like [`char::to_casefold_unnormalized()`], this method does not handle language-specific
679+ /// casings like Turkish and Azeri I/ı/İ/i. See that method's documentation
680+ /// for more information.
681+ ///
682+ /// # Examples
683+ ///
684+ /// Basic usage:
685+ ///
686+ /// ```
687+ /// #![feature(casefold)]
688+ /// let s0 = "HELLO";
689+ /// let s1 = "Hello";
690+ ///
691+ /// assert_eq!(s0.to_casefold_unnormalized(), s1.to_casefold_unnormalized());
692+ /// assert_eq!(s0.to_casefold_unnormalized(), "hello")
693+ /// ```
694+ ///
695+ /// Scripts without case are not changed:
696+ ///
697+ /// ```
698+ /// #![feature(casefold)]
699+ /// let new_year = "农历新年";
700+ ///
701+ /// assert_eq!(new_year, new_year.to_casefold_unnormalized());
702+ /// ```
703+ ///
704+ /// One character can become multiple:
705+ ///
706+ /// ```
707+ /// #![feature(casefold)]
708+ /// let s0 = "TSCHÜẞ";
709+ /// let s1 = "TSCHÜSS";
710+ /// let s2 = "tschüß";
711+ ///
712+ /// assert_eq!(s0.to_casefold_unnormalized(), s1.to_casefold_unnormalized());
713+ /// assert_eq!(s0.to_casefold_unnormalized(), s2.to_casefold_unnormalized());
714+ /// assert_eq!(s0.to_casefold_unnormalized(), "tschüss");
715+ /// ```
716+ ///
717+ /// No NFC [normalization] is performed:
718+ ///
719+ /// ```rust
720+ /// #![feature(casefold)]
721+ /// // These two strings are visually and semantically identical...
722+ /// let comp = "Å";
723+ /// let decomp = "Å";
724+ ///
725+ /// // ... but not codepoint-for-codepoint equal.
726+ /// assert_eq!(comp, "\u{C5}");
727+ /// assert_eq!(decomp, "A\u{030A}");
728+ ///
729+ /// // Their case-foldings are likewise unequal:
730+ /// assert_eq!(comp.to_casefold_unnormalized(), "\u{E5}");
731+ /// assert_eq!(decomp.to_casefold_unnormalized(), "a\u{030A}");
732+ /// ```
733+ ///
734+ /// [normalization]: https://www.unicode.org/faq/normalization
735+ #[ cfg( not( no_global_oom_handling) ) ]
736+ #[ rustc_allow_incoherent_impl]
737+ #[ must_use = "this returns the case-folded string as a new String, \
738+ without modifying the original"]
739+ #[ unstable( feature = "casefold" , issue = "154742" ) ]
740+ pub fn to_casefold_unnormalized ( & self ) -> String {
741+ let ( mut s, rest) = convert_while_ascii ( self , ascii:: Char :: to_lowercase) ;
742+
743+ for c in rest. chars ( ) {
744+ match conversions:: to_casefold ( c) {
745+ [ a, '\0' , _] => s. push ( a) ,
746+ [ a, b, '\0' ] => {
747+ s. push ( a) ;
748+ s. push ( b) ;
749+ }
750+ [ a, b, c] => {
751+ s. push ( a) ;
752+ s. push ( b) ;
753+ s. push ( c) ;
754+ }
755+ }
756+ }
757+ s
758+ }
759+
662760 /// Converts a [`Box<str>`] into a [`String`] without copying or allocating.
663761 ///
664762 /// # Examples
@@ -803,23 +901,19 @@ pub unsafe fn from_boxed_utf8_unchecked(v: Box<[u8]>) -> Box<str> {
803901///
804902/// This function is only public so that it can be verified in a codegen test,
805903/// see `issue-123712-str-to-lower-autovectorization.rs`.
806- ///
807- /// # Safety
808- ///
809- /// `convert` must return an ASCII byte for every ASCII input byte.
810904#[ unstable( feature = "str_internals" , issue = "none" ) ]
811905#[ doc( hidden) ]
812906#[ inline]
813907#[ cfg( not( no_global_oom_handling) ) ]
814- pub unsafe fn convert_while_ascii ( s : & str , convert : fn ( & u8 ) -> u8 ) -> ( String , & str ) {
908+ pub fn convert_while_ascii ( s : & str , convert : fn ( ascii :: Char ) -> ascii :: Char ) -> ( String , & str ) {
815909 // Process the input in chunks of 16 bytes to enable auto-vectorization.
816910 // Previously the chunk size depended on the size of `usize`,
817911 // but a 16-byte chunk is also the better choice on 32-bit platforms with SSE or NEON.
818912 // The only downside on other platforms would be a bit more loop-unrolling.
819913 const N : usize = 16 ;
820914
821915 let mut slice = s. as_bytes ( ) ;
822- let mut out = Vec :: with_capacity ( slice. len ( ) ) ;
916+ let mut out: Vec < u8 > = Vec :: with_capacity ( slice. len ( ) ) ;
823917 let mut out_slice = out. spare_capacity_mut ( ) ;
824918
825919 let mut ascii_prefix_len = 0_usize ;
@@ -844,7 +938,10 @@ pub unsafe fn convert_while_ascii(s: &str, convert: fn(&u8) -> u8) -> (String, &
844938 }
845939
846940 for j in 0 ..N {
847- out_chunk[ j] = MaybeUninit :: new ( convert ( & chunk[ j] ) ) ;
941+ out_chunk[ j] = MaybeUninit :: new (
942+ // SAFETY: we checked that this byte is valid ASCII above
943+ convert ( unsafe { ascii:: Char :: from_u8_unchecked ( chunk[ j] ) } ) . to_u8 ( ) ,
944+ ) ;
848945 }
849946
850947 ascii_prefix_len += N ;
@@ -858,10 +955,17 @@ pub unsafe fn convert_while_ascii(s: &str, convert: fn(&u8) -> u8) -> (String, &
858955 if byte > 127 {
859956 break ;
860957 }
958+
959+ let converted_byte = MaybeUninit :: new (
960+ // SAFETY: we checked that this byte is valid ASCII above
961+ convert ( unsafe { ascii:: Char :: from_u8_unchecked ( byte) } ) . to_u8 ( ) ,
962+ ) ;
963+
861964 // SAFETY: out_slice has at least same length as input slice
862965 unsafe {
863- * out_slice. get_unchecked_mut ( 0 ) = MaybeUninit :: new ( convert ( & byte ) ) ;
966+ * out_slice. get_unchecked_mut ( 0 ) = converted_byte ;
864967 }
968+
865969 ascii_prefix_len += 1 ;
866970 slice = unsafe { slice. get_unchecked ( 1 ..) } ;
867971 out_slice = unsafe { out_slice. get_unchecked_mut ( 1 ..) } ;
0 commit comments