Skip to content

Commit 8d07261

Browse files
Add APIs for dealing with titlecase
- `char::is_cased` - `char::is_titlecase` - `char::case` - `char::to_titlecase`
1 parent 4eb9e66 commit 8d07261

11 files changed

Lines changed: 581 additions & 102 deletions

File tree

library/alloc/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,7 @@
148148
#![feature(slice_range)]
149149
#![feature(std_internals)]
150150
#![feature(temporary_niche_types)]
151+
#![feature(titlecase)]
151152
#![feature(transmutability)]
152153
#![feature(trivial_clone)]
153154
#![feature(trusted_fused)]

library/core/src/char/methods.rs

Lines changed: 213 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -777,12 +777,76 @@ impl char {
777777
#[inline]
778778
pub fn is_alphabetic(self) -> bool {
779779
match self {
780-
'A'..='Z' | 'a'..='z' => true,
780+
'a'..='z' | 'A'..='Z' => true,
781781
'\0'..='\u{A9}' => false,
782782
_ => unicode::Alphabetic(self),
783783
}
784784
}
785785

786+
/// Returns `true` if this `char` has the `Cased` property.
787+
/// A character is cased if and only if it is uppercase, lowercase, or titlecase.
788+
///
789+
/// `Cased` is described in Chapter 4 (Character Properties) of the [Unicode Standard] and
790+
/// specified in the [Unicode Character Database][ucd] [`DerivedCoreProperties.txt`].
791+
///
792+
/// [Unicode Standard]: https://www.unicode.org/versions/latest/
793+
/// [ucd]: https://www.unicode.org/reports/tr44/
794+
/// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
795+
///
796+
/// # Examples
797+
///
798+
/// Basic usage:
799+
///
800+
/// ```
801+
/// #![feature(titlecase)]
802+
/// assert!('A'.is_cased());
803+
/// assert!('a'.is_cased());
804+
/// assert!(!'京'.is_cased());
805+
/// ```
806+
#[must_use]
807+
#[unstable(feature = "titlecase", issue = "153892")]
808+
#[inline]
809+
pub fn is_cased(self) -> bool {
810+
match self {
811+
'a'..='z' | 'A'..='Z' => true,
812+
'\0'..='\u{A9}' => false,
813+
_ => unicode::Cased(self),
814+
}
815+
}
816+
817+
/// Returns the case of this character:
818+
/// [`Some(CharCase::Upper)`][`CharCase::Upper`] if [`self.is_uppercase()`][`char::is_uppercase`],
819+
/// [`Some(CharCase::Lower)`][`CharCase::Lower`] if [`self.is_lowercase()`][`char::is_lowercase`],
820+
/// [`Some(CharCase::Title)`][`CharCase::Title`] if [`self.is_titlecase()`][`char::is_titlecase`], and
821+
/// `None` if [`!self.is_cased()`][`char::is_cased`].
822+
///
823+
/// # Examples
824+
///
825+
/// ```
826+
/// #![feature(titlecase)]
827+
/// use core::char::CharCase;
828+
/// assert_eq!('a'.case(), Some(CharCase::Lower));
829+
/// assert_eq!('δ'.case(), Some(CharCase::Lower));
830+
/// assert_eq!('A'.case(), Some(CharCase::Upper));
831+
/// assert_eq!('Δ'.case(), Some(CharCase::Upper));
832+
/// assert_eq!('ǅ'.case(), Some(CharCase::Title));
833+
/// assert_eq!('中'.case(), None);
834+
/// ```
835+
#[must_use]
836+
#[unstable(feature = "titlecase", issue = "153892")]
837+
#[inline]
838+
pub fn case(self) -> Option<CharCase> {
839+
match self {
840+
'a'..='z' => Some(CharCase::Lower),
841+
'A'..='Z' => Some(CharCase::Upper),
842+
'\0'..='\u{A9}' => None,
843+
_ if !unicode::Cased(self) => None,
844+
_ if unicode::Lowercase(self) => Some(CharCase::Lower),
845+
_ if unicode::Uppercase(self) => Some(CharCase::Upper),
846+
_ => Some(CharCase::Title),
847+
}
848+
}
849+
786850
/// Returns `true` if this `char` has the `Lowercase` property.
787851
///
788852
/// `Lowercase` is described in Chapter 4 (Character Properties) of the [Unicode Standard] and
@@ -825,6 +889,40 @@ impl char {
825889
}
826890
}
827891

892+
/// Returns `true` if this `char` has the general category for titlecase letters.
893+
/// Conceptually, these characters consist of an uppercase portion followed by a lowercase portion.
894+
///
895+
/// Titlecase letters (code points with the general category of `Lt`) are described in Chapter 4
896+
/// (Character Properties) of the [Unicode Standard] and specified in the [Unicode Character
897+
/// Database][ucd] [`UnicodeData.txt`].
898+
///
899+
/// [Unicode Standard]: https://www.unicode.org/versions/latest/
900+
/// [ucd]: https://www.unicode.org/reports/tr44/
901+
/// [`UnicodeData.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
902+
///
903+
/// # Examples
904+
///
905+
/// Basic usage:
906+
///
907+
/// ```
908+
/// #![feature(titlecase)]
909+
/// assert!('ǅ'.is_titlecase());
910+
/// assert!('ῼ'.is_titlecase());
911+
/// assert!(!'D'.is_titlecase());
912+
/// assert!(!'z'.is_titlecase());
913+
/// assert!(!'中'.is_titlecase());
914+
/// assert!(!' '.is_titlecase());
915+
/// ```
916+
#[must_use]
917+
#[unstable(feature = "titlecase", issue = "153892")]
918+
#[inline]
919+
pub fn is_titlecase(self) -> bool {
920+
match self {
921+
'\0'..='\u{01C4}' => false,
922+
_ => self.is_cased() && !self.is_lowercase() && !self.is_uppercase(),
923+
}
924+
}
925+
828926
/// Returns `true` if this `char` has the `Uppercase` property.
829927
///
830928
/// `Uppercase` is described in Chapter 4 (Character Properties) of the [Unicode Standard] and
@@ -925,7 +1023,7 @@ impl char {
9251023
#[inline]
9261024
pub fn is_alphanumeric(self) -> bool {
9271025
match self {
928-
'0'..='9' | 'A'..='Z' | 'a'..='z' => true,
1026+
'a'..='z' | 'A'..='Z' | '0'..='9' => true,
9291027
'\0'..='\u{A9}' => false,
9301028
_ => unicode::Alphabetic(self) || unicode::N(self),
9311029
}
@@ -976,26 +1074,6 @@ impl char {
9761074
self > '\u{02FF}' && unicode::Grapheme_Extend(self)
9771075
}
9781076

979-
/// Returns `true` if this `char` has the `Cased` property.
980-
///
981-
/// `Cased` is described in Chapter 4 (Character Properties) of the [Unicode Standard] and
982-
/// specified in the [Unicode Character Database][ucd] [`DerivedCoreProperties.txt`].
983-
///
984-
/// [Unicode Standard]: https://www.unicode.org/versions/latest/
985-
/// [ucd]: https://www.unicode.org/reports/tr44/
986-
/// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
987-
#[must_use]
988-
#[inline]
989-
#[doc(hidden)]
990-
#[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
991-
pub fn is_cased(self) -> bool {
992-
match self {
993-
'A'..='Z' | 'a'..='z' => true,
994-
'\0'..='\u{A9}' => false,
995-
_ => unicode::Cased(self),
996-
}
997-
}
998-
9991077
/// Returns `true` if this `char` has the `Case_Ignorable` property.
10001078
///
10011079
/// `Case_Ignorable` is described in Chapter 4 (Character Properties) of the [Unicode Standard] and
@@ -1119,17 +1197,123 @@ impl char {
11191197
/// // convert into themselves.
11201198
/// assert_eq!('山'.to_lowercase().to_string(), "山");
11211199
/// ```
1122-
#[must_use = "this returns the lowercase character as a new iterator, \
1200+
#[must_use = "this returns the lowercased character as a new iterator, \
11231201
without modifying the original"]
11241202
#[stable(feature = "rust1", since = "1.0.0")]
11251203
#[inline]
11261204
pub fn to_lowercase(self) -> ToLowercase {
11271205
ToLowercase(CaseMappingIter::new(conversions::to_lower(self)))
11281206
}
11291207

1208+
/// Returns an iterator that yields the titlecase mapping of this `char` as one or more
1209+
/// `char`s.
1210+
///
1211+
/// This is usually, but not always, equivalent to the uppercase mapping
1212+
/// returned by [`Self::to_uppercase`]. Prefer this method when seeking to capitalize
1213+
/// Only The First Letter of a word, but use [`Self::to_uppercase`] for ALL CAPS.
1214+
///
1215+
/// If this `char` does not have a titlecase mapping, the iterator yields the same `char`.
1216+
///
1217+
/// If this `char` has a one-to-one titlecase mapping given by the [Unicode Character
1218+
/// Database][ucd] [`UnicodeData.txt`], the iterator yields that `char`.
1219+
///
1220+
/// [ucd]: https://www.unicode.org/reports/tr44/
1221+
/// [`UnicodeData.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
1222+
///
1223+
/// If this `char` requires special considerations (e.g. multiple `char`s) the iterator yields
1224+
/// the `char`(s) given by [`SpecialCasing.txt`].
1225+
///
1226+
/// [`SpecialCasing.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/SpecialCasing.txt
1227+
///
1228+
/// This operation performs an unconditional mapping without tailoring. That is, the conversion
1229+
/// is independent of context and language.
1230+
///
1231+
/// In the [Unicode Standard], Chapter 4 (Character Properties) discusses case mapping in
1232+
/// general and Chapter 3 (Conformance) discusses the default algorithm for case conversion.
1233+
///
1234+
/// [Unicode Standard]: https://www.unicode.org/versions/latest/
1235+
///
1236+
/// # Examples
1237+
///
1238+
/// As an iterator:
1239+
///
1240+
/// ```
1241+
/// #![feature(titlecase)]
1242+
/// for c in 'ß'.to_titlecase() {
1243+
/// print!("{c}");
1244+
/// }
1245+
/// println!();
1246+
/// ```
1247+
///
1248+
/// Using `println!` directly:
1249+
///
1250+
/// ```
1251+
/// #![feature(titlecase)]
1252+
/// println!("{}", 'ß'.to_titlecase());
1253+
/// ```
1254+
///
1255+
/// Both are equivalent to:
1256+
///
1257+
/// ```
1258+
/// println!("Ss");
1259+
/// ```
1260+
///
1261+
/// Using [`to_string`](../std/string/trait.ToString.html#tymethod.to_string):
1262+
///
1263+
/// ```
1264+
/// #![feature(titlecase)]
1265+
/// assert_eq!('c'.to_titlecase().to_string(), "C");
1266+
/// assert_eq!('ǆ'.to_titlecase().to_string(), "ǅ");
1267+
/// assert_eq!('ῼ'.to_titlecase().to_string(), "ῼ");
1268+
///
1269+
/// // Sometimes the result is more than one character:
1270+
/// assert_eq!('ß'.to_titlecase().to_string(), "Ss");
1271+
///
1272+
/// // Characters that do not have separate cased forms
1273+
/// // convert into themselves.
1274+
/// assert_eq!('山'.to_titlecase().to_string(), "山");
1275+
/// ```
1276+
///
1277+
/// # Note on locale
1278+
///
1279+
/// In Turkish and Azeri, the equivalent of 'i' in Latin has five forms instead of two:
1280+
///
1281+
/// * 'Dotless': I / ı, sometimes written ï
1282+
/// * 'Dotted': İ / i
1283+
///
1284+
/// Note that the lowercase dotted 'i' is the same as the Latin. Therefore:
1285+
///
1286+
/// ```
1287+
/// #![feature(titlecase)]
1288+
/// let upper_i = 'i'.to_titlecase().to_string();
1289+
/// ```
1290+
///
1291+
/// The value of `upper_i` here relies on the language of the text: if we're
1292+
/// in `en-US`, it should be `"I"`, but if we're in `tr-TR` or `az-AZ`, it should
1293+
/// be `"İ"`. `to_titlecase()` does not take this into account, and so:
1294+
///
1295+
/// ```
1296+
/// #![feature(titlecase)]
1297+
/// let upper_i = 'i'.to_titlecase().to_string();
1298+
///
1299+
/// assert_eq!(upper_i, "I");
1300+
/// ```
1301+
///
1302+
/// holds across languages.
1303+
#[must_use = "this returns the titlecased character as a new iterator, \
1304+
without modifying the original"]
1305+
#[unstable(feature = "titlecase", issue = "153892")]
1306+
#[inline]
1307+
pub fn to_titlecase(self) -> ToTitlecase {
1308+
ToTitlecase(CaseMappingIter::new(conversions::to_title(self)))
1309+
}
1310+
11301311
/// Returns an iterator that yields the uppercase mapping of this `char` as one or more
11311312
/// `char`s.
11321313
///
1314+
/// Prefer this method when converting a word into ALL CAPS, but consider [`Self::to_titlecase`]
1315+
/// instead if you seek to capitalize Only The First Letter.
1316+
///
11331317
/// If this `char` does not have an uppercase mapping, the iterator yields the same `char`.
11341318
///
11351319
/// If this `char` has a one-to-one uppercase mapping given by the [Unicode Character
@@ -1179,9 +1363,11 @@ impl char {
11791363
///
11801364
/// ```
11811365
/// assert_eq!('c'.to_uppercase().to_string(), "C");
1366+
/// assert_eq!('ǆ'.to_uppercase().to_string(), "Ǆ");
11821367
///
11831368
/// // Sometimes the result is more than one character:
11841369
/// assert_eq!('ﬅ'.to_uppercase().to_string(), "ST");
1370+
/// assert_eq!('ῼ'.to_uppercase().to_string(), "ΩΙ");
11851371
///
11861372
/// // Characters that do not have both uppercase and lowercase
11871373
/// // convert into themselves.
@@ -1190,7 +1376,7 @@ impl char {
11901376
///
11911377
/// # Note on locale
11921378
///
1193-
/// In Turkish, the equivalent of 'i' in Latin has five forms instead of two:
1379+
/// In Turkish and Azeri, the equivalent of 'i' in Latin has five forms instead of two:
11941380
///
11951381
/// * 'Dotless': I / ı, sometimes written ï
11961382
/// * 'Dotted': İ / i
@@ -1202,7 +1388,7 @@ impl char {
12021388
/// ```
12031389
///
12041390
/// The value of `upper_i` here relies on the language of the text: if we're
1205-
/// in `en-US`, it should be `"I"`, but if we're in `tr_TR`, it should
1391+
/// in `en-US`, it should be `"I"`, but if we're in `tr-TR` or `az-AZ`, it should
12061392
/// be `"İ"`. `to_uppercase()` does not take this into account, and so:
12071393
///
12081394
/// ```
@@ -1212,7 +1398,7 @@ impl char {
12121398
/// ```
12131399
///
12141400
/// holds across languages.
1215-
#[must_use = "this returns the uppercase character as a new iterator, \
1401+
#[must_use = "this returns the uppercased character as a new iterator, \
12161402
without modifying the original"]
12171403
#[stable(feature = "rust1", since = "1.0.0")]
12181404
#[inline]
@@ -1455,7 +1641,7 @@ impl char {
14551641
#[rustc_const_stable(feature = "const_ascii_ctype_on_intrinsics", since = "1.47.0")]
14561642
#[inline]
14571643
pub const fn is_ascii_alphabetic(&self) -> bool {
1458-
matches!(*self, 'A'..='Z' | 'a'..='z')
1644+
matches!(*self, 'a'..='z' | 'A'..='Z')
14591645
}
14601646

14611647
/// Checks if the value is an ASCII uppercase character:

0 commit comments

Comments
 (0)