Add String::make_(lower|upper)case APIs

krtab · krtab · commit a449f25d89e4 · 2026-03-28T03:30:56.000+01:00
diff --git a/library/alloc/src/lib.rs b/library/alloc/src/lib.rs
@@ -147,6 +147,7 @@
 #![feature(slice_ptr_get)]
 #![feature(slice_range)]
 #![feature(std_internals)]
+#![feature(str_internals)]
 #![feature(temporary_niche_types)]
 #![feature(titlecase)]
 #![feature(transmutability)]
diff --git a/library/alloc/src/str.rs b/library/alloc/src/str.rs
@@ -415,13 +415,6 @@ impl str {
                 && !case_ignorable_then_cased(from[i + const { 'Σ'.len_utf8() }..].chars());
             if is_word_final { 'ς' } else { 'σ' }
         }
-
-        fn case_ignorable_then_cased<I: Iterator<Item = char>>(iter: I) -> bool {
-            match iter.skip_while(|&c| c.is_case_ignorable()).next() {
-                Some(c) => c.is_cased(),
-                None => false,
-            }
-        }
     }
 
     /// Returns the uppercase equivalent of this string slice, as a new [`String`].
@@ -481,7 +474,16 @@ impl str {
         }
         s
     }
+}
+
+pub(crate) fn case_ignorable_then_cased<I: Iterator<Item = char>>(iter: I) -> bool {
+    match iter.skip_while(|&c| c.is_case_ignorable()).next() {
+        Some(c) => c.is_cased(),
+        None => false,
+    }
+}
 
+impl str {
     /// Converts a [`Box<str>`] into a [`String`] without copying or allocating.
     ///
     /// # Examples
diff --git a/library/alloc/src/string.rs b/library/alloc/src/string.rs
@@ -61,7 +61,7 @@ use crate::alloc::Allocator;
 #[cfg(not(no_global_oom_handling))]
 use crate::borrow::{Cow, ToOwned};
 use crate::boxed::Box;
-use crate::collections::TryReserveError;
+use crate::collections::{TryReserveError, VecDeque};
 use crate::str::{self, CharIndices, Chars, Utf8Error, from_utf8_unchecked_mut};
 #[cfg(not(no_global_oom_handling))]
 use crate::str::{FromStr, from_boxed_utf8_unchecked};
@@ -3604,3 +3604,203 @@ impl From<char> for String {
         c.to_string()
     }
 }
+
+// In place case changes
+
+impl String {
+    /// Converts this string to its uppercase equivalent in-place.
+    ///
+    /// 'Uppercase' is defined according to the terms of the Unicode Derived Core Property
+    /// `Uppercase`.
+    ///
+    /// Since some characters can expand into multiple characters when changing
+    /// the case, this method may change the length of the string. If the string
+    /// shrinks, the excess capacity is not reclaimed.
+    ///
+    /// # Examples
+    ///
+    /// Basic usage:
+    ///
+    /// ```
+    /// #![feature(string_make_uplowercase)]
+    ///
+    /// let mut s = String::from("hello");
+    /// s.make_uppercase();
+    ///
+    /// assert_eq!("HELLO", s);
+    /// ```
+    ///
+    /// Scripts without case are not changed:
+    ///
+    /// ```
+    /// #![feature(string_make_uplowercase)]
+    ///
+    /// let mut new_year = String::from("农历新年");
+    /// new_year.make_uppercase();
+    ///
+    /// assert_eq!("农历新年", new_year);
+    /// ```
+    ///
+    /// One character can become multiple:
+    ///
+    /// ```
+    /// #![feature(string_make_uplowercase)]
+    ///
+    /// let mut s = String::from("tschüß");
+    /// s.make_uppercase();
+    ///
+    /// assert_eq!("TSCHÜSS", s);
+    /// ```
+    #[cfg(not(no_global_oom_handling))]
+    #[unstable(feature = "string_make_uplowercase", issue = "135885")]
+    pub fn make_uppercase(&mut self) {
+        let mut wc = WriteChars::new(self);
+        while let Some(l_c) = wc.pop() {
+            l_c.to_uppercase().for_each(|u_c| wc.write(u_c));
+        }
+    }
+
+    /// Converts this string to its lowercase equivalent in-place.
+    ///
+    /// 'Lowercase' is defined according to the terms of the Unicode Derived Core Property
+    /// `Lowercase`.
+    ///
+    /// Since some characters can expand into multiple characters when changing
+    /// the case, this method may change the length of the string. If the string
+    /// shrinks, the excess capacity is not reclaimed.
+    ///
+    /// # Examples
+    ///
+    /// Basic usage:
+    ///
+    /// ```
+    /// #![feature(string_make_uplowercase)]
+    ///
+    /// let mut s = String::from("HELLO");
+    /// s.make_lowercase();
+    ///
+    /// assert_eq!("hello", s);
+    /// ```
+    ///
+    /// ```
+    /// #![feature(string_make_uplowercase)]
+    ///
+    /// let mut odysseus = String::from("ὈΔΥΣΣΕΎΣ");
+    /// odysseus.make_lowercase();
+    ///
+    /// assert_eq!("ὀδυσσεύς", odysseus);
+    /// ```
+    ///
+    /// Languages without case are not changed:
+    ///
+    /// ```
+    /// #![feature(string_make_uplowercase)]
+    ///
+    /// let mut new_year = String::from("农历新年");
+    /// new_year.make_lowercase();
+    ///
+    /// assert_eq!("农历新年", new_year);
+    /// ```
+    #[cfg(not(no_global_oom_handling))]
+    #[unstable(feature = "string_make_uplowercase", issue = "135885")]
+    pub fn make_lowercase(&mut self) {
+        let mut wc = WriteChars::new(self);
+        // Tracking whether a sigma could be word-final costs a little even when the
+        // string contains no sigma, but it is unavoidable: the source bytes behind the
+        // cursor are being overwritten, so this information must be computed as we go.
+        let mut word_final_so_far = false;
+        while let Some(u_c) = wc.pop() {
+            if u_c == 'Σ' {
+                if word_final_so_far && !crate::str::case_ignorable_then_cased(wc.rest().chars()) {
+                    // actually word final
+                    wc.write('ς');
+                } else {
+                    wc.write('σ');
+                }
+            } else {
+                u_c.to_lowercase().for_each(|l_c| wc.write(l_c));
+            }
+            word_final_so_far = u_c.is_cased() || (word_final_so_far && u_c.is_case_ignorable());
+        }
+    }
+}
+
+/// A helper for in-place modification of strings, where we gradually "pop" characters,
+/// thereby making room to write back to the string buffer.
+#[unstable(issue = "none", feature = "std_internals")]
+struct WriteChars<'a> {
+    // This is the internal buffer of the string, temporarily changed to Vec<u8> because
+    // it will contain non-UTF-8 bytes.
+    // invariant: self.v.len() == length of the original string until drop is run
+    v: Vec<u8>,
+    // A reference kept to restore the string at the end
+    // (ie drop time)
+    s: &'a mut String,
+    // invariant: write_offset <= read_offset
+    write_offset: usize,
+    // invariant: self.read_offset <= self.v.len()
+    // before the Drop
+    read_offset: usize,
+    buffer: VecDeque<u8>,
+}
+
+impl<'a> Drop for WriteChars<'a> {
+    // Set the proper length of the string's storage
+    // or grow it to add what is still in the buffer.
+    fn drop(&mut self) {
+        if self.buffer.is_empty() {
+            // SAFETY: if the queue is empty, then the output has
+            // no more bytes than the original, so we can simply shrink
+            unsafe {
+                self.v.set_len(self.write_offset);
+            }
+        } else {
+            let (q1, q2) = self.buffer.as_slices();
+            self.v.extend_from_slice(q1);
+            self.v.extend_from_slice(q2);
+        };
+        // SAFETY: this is valid utf8
+        *self.s = unsafe { String::from_utf8_unchecked(core::mem::take(&mut self.v)) }
+    }
+}
+
+#[unstable(issue = "none", feature = "std_internals")]
+impl<'a> WriteChars<'a> {
+    fn new(s: &'a mut String) -> Self {
+        let v = core::mem::take(s).into_bytes();
+        WriteChars { s, v, write_offset: 0, read_offset: 0, buffer: VecDeque::new() }
+    }
+
+    fn rest(&self) -> &str {
+        // SAFETY: bytes from read_offset onwards are untouched source bytes on a char boundary
+        unsafe { str::from_utf8_unchecked(&self.v[self.read_offset..]) }
+    }
+
+    fn pop(&mut self) -> Option<char> {
+        // SAFETY: The bytes from read_offset are valid UTF8
+        let (code_point, width) = unsafe {
+            core::str::next_code_point_with_width(&mut self.v[self.read_offset..].iter())?
+        };
+        self.read_offset += width;
+        // Dump what is buffered in the newly freed space
+        while self.write_offset < self.read_offset
+            && let Some(b) = self.buffer.pop_front()
+        {
+            self.v[self.write_offset] = b;
+            self.write_offset += 1;
+        }
+        // SAFETY: The code point is valid
+        let c = unsafe { char::from_u32_unchecked(code_point) };
+        Some(c)
+    }
+
+    fn write(&mut self, c: char) {
+        let writable_slice = &mut self.v[self.write_offset..self.read_offset];
+        let mut buffer = [0u8; 4];
+        let len = c.encode_utf8(&mut buffer).len();
+        let direct_copy_length = core::cmp::min(len, writable_slice.len());
+        writable_slice[..direct_copy_length].copy_from_slice(&buffer[..direct_copy_length]);
+        self.write_offset += direct_copy_length;
+        self.buffer.extend(&buffer[direct_copy_length..len]);
+    }
+}
diff --git a/library/alloctests/tests/lib.rs b/library/alloctests/tests/lib.rs
@@ -44,6 +44,7 @@
 #![allow(internal_features)]
 #![deny(fuzzy_provenance_casts)]
 #![deny(unsafe_op_in_unsafe_fn)]
+#![feature(string_make_uplowercase)]
 
 extern crate alloc;
 
diff --git a/library/alloctests/tests/string.rs b/library/alloctests/tests/string.rs
@@ -956,3 +956,83 @@ fn test_str_concat() {
     let s: String = format!("{a}{b}");
     assert_eq!(s.as_bytes()[9], 'd' as u8);
 }
+
+#[test]
+fn make_uppercase() {
+    fn test(s: &str) {
+        let ground_truth = s.to_uppercase();
+        let mut tested = s.to_owned();
+        tested.make_uppercase();
+        assert!(
+            tested == ground_truth,
+            r#"When uppercased "{s}" gave "{tested}" while "{ground_truth}" was expected"#
+        );
+    }
+    test("");
+    test("abcde");
+    // 4 to 9 bytes
+    test("ǰΐ");
+    // 10*3 to 10*2 bytes
+    test("ⱥⱥⱥⱥⱥⱥⱥⱥⱥⱥ");
+    test("aéǅßﬁᾀ");
+}
+
+#[test]
+fn make_lowercase() {
+    fn test(s: &str) {
+        let ground_truth = s.to_lowercase();
+        let mut tested = s.to_owned();
+        tested.make_lowercase();
+        assert!(
+            tested == ground_truth,
+            r#"When lowercased "{s}" gave "{tested}" while "{ground_truth}" was expected"#
+        );
+    }
+    test("");
+    test("AÉǅaé ");
+
+    // https://github.com/rust-lang/rust/issues/26035
+    test("ΑΣ");
+    test("Α'Σ");
+    test("Α''Σ");
+
+    test("ΑΣ Α");
+    test("Α'Σ Α");
+    test("Α''Σ Α");
+
+    test("ΑΣ' Α");
+    test("ΑΣ'' Α");
+
+    test("Α'Σ' Α");
+    test("Α''Σ'' Α");
+
+    test("Α Σ");
+    test("Α 'Σ");
+    test("Α ''Σ");
+
+    test("Σ");
+    test("'Σ");
+    test("''Σ");
+
+    test("ΑΣΑ");
+    test("ΑΣ'Α");
+    test("ΑΣ''Α");
+
+    // https://github.com/rust-lang/rust/issues/124714
+    // input lengths around the boundary of the chunk size used by the ascii prefix optimization
+    test("abcdefghijklmnoΣ");
+    test("abcdefghijklmnopΣ");
+    test("abcdefghijklmnopqΣ");
+
+    // a really long string whose lowercase form is
+    // even longer. this tests that implementations don't assume
+    // an incorrect upper bound on allocations
+    let upper = str::repeat("İ", 512);
+    test(&upper);
+
+    // a really long ascii-only string.
+    // This tests that the ascii hot-path
+    // functions correctly
+    let upper = str::repeat("A", 511);
+    test(&upper);
+}
diff --git a/library/core/src/str/mod.rs b/library/core/src/str/mod.rs
@@ -58,7 +58,7 @@ pub use lossy::{Utf8Chunk, Utf8Chunks};
 #[stable(feature = "rust1", since = "1.0.0")]
 pub use traits::FromStr;
 #[unstable(feature = "str_internals", issue = "none")]
-pub use validations::{next_code_point, utf8_char_width};
+pub use validations::{next_code_point, next_code_point_with_width, utf8_char_width};
 
 #[inline(never)]
 #[cold]
diff --git a/library/core/src/str/validations.rs b/library/core/src/str/validations.rs

Original file line number	Diff line number	Diff line change
`@@ -415,13 +415,6 @@ impl str {`
`415`	`415`	`&& !case_ignorable_then_cased(from[i + const { 'Σ'.len_utf8() }..].chars());`
`416`	`416`	`if is_word_final { 'ς' } else { 'σ' }`
`417`	`417`	`}`
`418`		`-`
`419`		`- fn case_ignorable_then_cased<I: Iterator<Item = char>>(iter: I) -> bool {`
`420`		`- match iter.skip_while(\|&c\| c.is_case_ignorable()).next() {`
`421`		`- Some(c) => c.is_cased(),`
`422`		`- None => false,`
`423`		`- }`
`424`		`- }`
`425`	`418`	`}`
`426`	`419`
`427`	`420`	/// Returns the uppercase equivalent of this string slice, as a new [`String`].
`@@ -481,7 +474,16 @@ impl str {`
`481`	`474`	`}`
`482`	`475`	`s`
`483`	`476`	`}`
	`477`	`+}`
	`478`	`+`
	`479`	`+pub(crate) fn case_ignorable_then_cased<I: Iterator<Item = char>>(iter: I) -> bool {`
	`480`	`+ match iter.skip_while(\|&c\| c.is_case_ignorable()).next() {`
	`481`	`+ Some(c) => c.is_cased(),`
	`482`	`+ None => false,`
	`483`	`+ }`
	`484`	`+}`
`484`	`485`
	`486`	`+impl str {`
`485`	`487`	/// Converts a [`Box<str>`] into a [`String`] without copying or allocating.
`486`	`488`	`///`
`487`	`489`	`/// # Examples`