Skip to content

Commit a449f25

Browse files
committed
Add String::make_(lower|upper)case APIs
1 parent 362211d commit a449f25

7 files changed

Lines changed: 315 additions & 13 deletions

File tree

library/alloc/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,7 @@
147147
#![feature(slice_ptr_get)]
148148
#![feature(slice_range)]
149149
#![feature(std_internals)]
150+
#![feature(str_internals)]
150151
#![feature(temporary_niche_types)]
151152
#![feature(titlecase)]
152153
#![feature(transmutability)]

library/alloc/src/str.rs

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -415,13 +415,6 @@ impl str {
415415
&& !case_ignorable_then_cased(from[i + const { 'Σ'.len_utf8() }..].chars());
416416
if is_word_final { 'ς' } else { 'σ' }
417417
}
418-
419-
fn case_ignorable_then_cased<I: Iterator<Item = char>>(iter: I) -> bool {
420-
match iter.skip_while(|&c| c.is_case_ignorable()).next() {
421-
Some(c) => c.is_cased(),
422-
None => false,
423-
}
424-
}
425418
}
426419

427420
/// Returns the uppercase equivalent of this string slice, as a new [`String`].
@@ -481,7 +474,16 @@ impl str {
481474
}
482475
s
483476
}
477+
}
478+
479+
pub(crate) fn case_ignorable_then_cased<I: Iterator<Item = char>>(iter: I) -> bool {
480+
match iter.skip_while(|&c| c.is_case_ignorable()).next() {
481+
Some(c) => c.is_cased(),
482+
None => false,
483+
}
484+
}
484485

486+
impl str {
485487
/// Converts a [`Box<str>`] into a [`String`] without copying or allocating.
486488
///
487489
/// # Examples

library/alloc/src/string.rs

Lines changed: 201 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ use crate::alloc::Allocator;
6161
#[cfg(not(no_global_oom_handling))]
6262
use crate::borrow::{Cow, ToOwned};
6363
use crate::boxed::Box;
64-
use crate::collections::TryReserveError;
64+
use crate::collections::TryReserveError;
#[cfg(not(no_global_oom_handling))]
use crate::collections::VecDeque;
6565
use crate::str::{self, CharIndices, Chars, Utf8Error, from_utf8_unchecked_mut};
6666
#[cfg(not(no_global_oom_handling))]
6767
use crate::str::{FromStr, from_boxed_utf8_unchecked};
@@ -3604,3 +3604,203 @@ impl From<char> for String {
36043604
c.to_string()
36053605
}
36063606
}
3607+
3608+
// In-place case changes
3609+
3610+
impl String {
3611+
/// Converts this string to its uppercase equivalent in-place.
3612+
///
3613+
/// 'Uppercase' is defined according to the terms of the Unicode Derived Core Property
3614+
/// `Uppercase`.
3615+
///
3616+
/// Since some characters can expand into multiple characters when changing
3617+
/// the case, this method may change the length of the string. If the string
3618+
/// shrinks, the excess capacity is not reclaimed.
3619+
///
3620+
/// # Examples
3621+
///
3622+
/// Basic usage:
3623+
///
3624+
/// ```
3625+
/// #![feature(string_make_uplowercase)]
3626+
///
3627+
/// let mut s = String::from("hello");
3628+
/// s.make_uppercase();
3629+
///
3630+
/// assert_eq!("HELLO", s);
3631+
/// ```
3632+
///
3633+
/// Scripts without case are not changed:
3634+
///
3635+
/// ```
3636+
/// #![feature(string_make_uplowercase)]
3637+
///
3638+
/// let mut new_year = String::from("农历新年");
3639+
/// new_year.make_uppercase();
3640+
///
3641+
/// assert_eq!("农历新年", new_year);
3642+
/// ```
3643+
///
3644+
/// One character can become multiple:
3645+
///
3646+
/// ```
3647+
/// #![feature(string_make_uplowercase)]
3648+
///
3649+
/// let mut s = String::from("tschüß");
3650+
/// s.make_uppercase();
3651+
///
3652+
/// assert_eq!("TSCHÜSS", s);
3653+
/// ```
3654+
#[cfg(not(no_global_oom_handling))]
3655+
#[unstable(feature = "string_make_uplowercase", issue = "135885")]
3656+
pub fn make_uppercase(&mut self) {
3657+
let mut wc = WriteChars::new(self);
3658+
while let Some(l_c) = wc.pop() {
3659+
l_c.to_uppercase().for_each(|u_c| wc.write(u_c));
3660+
}
3661+
}
3662+
3663+
/// Converts this string to its lowercase equivalent in-place.
3664+
///
3665+
/// 'Lowercase' is defined according to the terms of the Unicode Derived Core Property
3666+
/// `Lowercase`.
3667+
///
3668+
/// Since some characters can expand into multiple characters when changing
3669+
/// the case, this method may change the length of the string. If the string
3670+
/// shrinks, the excess capacity is not reclaimed.
3671+
///
3672+
/// # Examples
3673+
///
3674+
/// Basic usage:
3675+
///
3676+
/// ```
3677+
/// #![feature(string_make_uplowercase)]
3678+
///
3679+
/// let mut s = String::from("HELLO");
3680+
/// s.make_lowercase();
3681+
///
3682+
/// assert_eq!("hello", s);
3683+
/// ```
3684+
///
3685+
/// ```
3686+
/// #![feature(string_make_uplowercase)]
3687+
///
3688+
/// let mut odysseus = String::from("ὈΔΥΣΣΕΎΣ");
3689+
/// odysseus.make_lowercase();
3690+
///
3691+
/// assert_eq!("ὀδυσσεύς", odysseus);
3692+
/// ```
3693+
///
3694+
/// Languages without case are not changed:
3695+
///
3696+
/// ```
3697+
/// #![feature(string_make_uplowercase)]
3698+
///
3699+
/// let mut new_year = String::from("农历新年");
3700+
/// new_year.make_lowercase();
3701+
///
3702+
/// assert_eq!("农历新年", new_year);
3703+
/// ```
3704+
#[cfg(not(no_global_oom_handling))]
3705+
#[unstable(feature = "string_make_uplowercase", issue = "135885")]
3706+
pub fn make_lowercase(&mut self) {
3707+
let mut wc = WriteChars::new(self);
3708+
// This is unfortunately paid whether or not you have sigmas in the str
3709+
// but it is kind of mandatory because as we are overwriting the source bytes
3710+
// we have to compute this information as we go.
3711+
let mut word_final_so_far = false;
3712+
while let Some(u_c) = wc.pop() {
3713+
if u_c == 'Σ' {
3714+
if word_final_so_far && !crate::str::case_ignorable_then_cased(wc.rest().chars()) {
3715+
// actually word final
3716+
wc.write('ς');
3717+
} else {
3718+
wc.write('σ');
3719+
}
3720+
} else {
3721+
u_c.to_lowercase().for_each(|l_c| wc.write(l_c));
3722+
}
3723+
word_final_so_far = u_c.is_cased() || (word_final_so_far && u_c.is_case_ignorable());
3724+
}
3725+
}
3726+
}
3727+
3728+
/// A helper for in place modification of strings, where we gradually "pop" characters,
3729+
/// hereby making room to write back to the string buffer
3730+
#[unstable(issue = "none", feature = "std_internals")]
3731+
struct WriteChars<'a> {
3732+
// This is the internal buffer of the string temporarily changed to Vec<u8> because
3733+
// it will contain non utf8 bytes.
3734+
// invariant: self.v.len() == original string until drop is run
3735+
v: Vec<u8>,
3736+
// A reference kept to restore the string at the end
3737+
// (ie drop time)
3738+
s: &'a mut String,
3739+
// invariant: write_offset <= read_offset
3740+
write_offset: usize,
3741+
// invariant: self.read_offset <= self.v.len()
3742+
// before the Drop
3743+
read_offset: usize,
3744+
buffer: VecDeque<u8>,
3745+
}
3746+
3747+
impl<'a> Drop for WriteChars<'a> {
3748+
// Set the proper length of the strings storage
3749+
// or grow it to add what is still in the buffer.
3750+
fn drop(&mut self) {
3751+
if self.buffer.is_empty() {
3752+
// SAFETY: if the queue is empty, then
3753+
// there were less bytes than in the original so we can simply shrink
3754+
unsafe {
3755+
self.v.set_len(self.write_offset);
3756+
}
3757+
} else {
3758+
let (q1, q2) = self.buffer.as_slices();
3759+
self.v.extend_from_slice(q1);
3760+
self.v.extend_from_slice(q2);
3761+
};
3762+
// SAFETY: this is valid utf8
3763+
*self.s = unsafe { String::from_utf8_unchecked(core::mem::take(&mut self.v)) }
3764+
}
3765+
}
3766+
3767+
#[unstable(issue = "none", feature = "std_internals")]
3768+
impl<'a> WriteChars<'a> {
3769+
fn new(s: &'a mut String) -> Self {
3770+
let v = core::mem::take(s).into_bytes();
3771+
WriteChars { s, v, write_offset: 0, read_offset: 0, buffer: VecDeque::new() }
3772+
}
3773+
3774+
fn rest(&self) -> &str {
3775+
// SAFETY: read_offset is always ok to read from
3776+
unsafe { str::from_utf8_unchecked(&self.v[self.read_offset..]) }
3777+
}
3778+
3779+
fn pop(&mut self) -> Option<char> {
3780+
// SAFETY: The bytes from read_offset are valid UTF8
3781+
let (code_point, width) = unsafe {
3782+
core::str::next_code_point_with_width(&mut self.v[self.read_offset..].iter())?
3783+
};
3784+
self.read_offset += width;
3785+
// Dump what is buffered in the newly freed space
3786+
while self.write_offset < self.read_offset
3787+
&& let Some(b) = self.buffer.pop_front()
3788+
{
3789+
self.v[self.write_offset] = b;
3790+
self.write_offset += 1;
3791+
}
3792+
// SAFETY: The code point is valid
3793+
let c = unsafe { char::from_u32_unchecked(code_point) };
3794+
Some(c)
3795+
}
3796+
3797+
fn write(&mut self, c: char) {
3798+
let writable_slice = &mut self.v[self.write_offset..self.read_offset];
3799+
let mut buffer = [0u8; 4];
3800+
let len = c.encode_utf8(&mut buffer).len();
3801+
let direct_copy_length = core::cmp::min(len, writable_slice.len());
3802+
writable_slice[..direct_copy_length].copy_from_slice(&buffer[..direct_copy_length]);
3803+
self.write_offset += direct_copy_length;
3804+
self.buffer.extend(&buffer[direct_copy_length..len]);
3805+
}
3806+
}

library/alloctests/tests/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
#![allow(internal_features)]
4545
#![deny(fuzzy_provenance_casts)]
4646
#![deny(unsafe_op_in_unsafe_fn)]
47+
#![feature(string_make_uplowercase)]
4748

4849
extern crate alloc;
4950

library/alloctests/tests/string.rs

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -956,3 +956,83 @@ fn test_str_concat() {
956956
let s: String = format!("{a}{b}");
957957
assert_eq!(s.as_bytes()[9], 'd' as u8);
958958
}
959+
960+
#[test]
fn make_uppercase() {
    // The allocating `str::to_uppercase` is the reference for the in-place version.
    fn test(s: &str) {
        let mut tested = String::from(s);
        tested.make_uppercase();
        let ground_truth = s.to_uppercase();
        assert!(
            tested == ground_truth,
            r#"When uppercased "{s}" gave "{tested}" while "{ground_truth}" was expected"#
        );
    }

    let cases = [
        "",
        "abcde",
        // grows: 4 to 9 bytes
        "ǰΐ",
        // shrinks: 10*3 to 10*2 bytes
        "ⱥⱥⱥⱥⱥⱥⱥⱥⱥⱥ",
        "aéDžßfiᾀ",
    ];
    for case in cases {
        test(case);
    }
}
979+
980+
#[test]
fn make_lowercase() {
    // The allocating `str::to_lowercase` is the reference for the in-place version.
    fn test(s: &str) {
        let mut tested = String::from(s);
        tested.make_lowercase();
        let ground_truth = s.to_lowercase();
        assert!(
            tested == ground_truth,
            r#"When lowercased "{s}" gave "{tested}" while "{ground_truth}" was expected"#
        );
    }

    let cases = [
        "",
        "AÉDžaé ",
        // Word-final sigma handling:
        // https://github.com/rust-lang/rust/issues/26035
        "ΑΣ",
        "Α'Σ",
        "Α''Σ",
        "ΑΣ Α",
        "Α'Σ Α",
        "Α''Σ Α",
        "ΑΣ' Α",
        "ΑΣ'' Α",
        "Α'Σ' Α",
        "Α''Σ'' Α",
        "Α Σ",
        "Α 'Σ",
        "Α ''Σ",
        "Σ",
        "'Σ",
        "''Σ",
        "ΑΣΑ",
        "ΑΣ'Α",
        "ΑΣ''Α",
        // https://github.com/rust-lang/rust/issues/124714
        // input lengths around the boundary of the chunk size used by the ascii prefix
        // optimization
        "abcdefghijklmnoΣ",
        "abcdefghijklmnopΣ",
        "abcdefghijklmnopqΣ",
    ];
    for case in cases {
        test(case);
    }

    // A really long string whose lowercase form is even longer. This tests that
    // implementations don't assume an incorrect upper bound on allocations.
    let upper = "İ".repeat(512);
    test(&upper);

    // A really long ASCII-only string. This tests that the ASCII hot path
    // functions correctly.
    let upper = "A".repeat(511);
    test(&upper);
}

library/core/src/str/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ pub use lossy::{Utf8Chunk, Utf8Chunks};
5858
#[stable(feature = "rust1", since = "1.0.0")]
5959
pub use traits::FromStr;
6060
#[unstable(feature = "str_internals", issue = "none")]
61-
pub use validations::{next_code_point, utf8_char_width};
61+
pub use validations::{next_code_point, next_code_point_with_width, utf8_char_width};
6262

6363
#[inline(never)]
6464
#[cold]

0 commit comments

Comments
 (0)