@@ -1151,13 +1151,14 @@ impl char {
11511151 /// [ucd]: https://www.unicode.org/reports/tr44/
11521152 /// [`UnicodeData.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
11531153 ///
1154- /// If this `char` requires special considerations (e.g. multiple `char`s) the iterator yields
1155- /// the `char`(s) given by [`SpecialCasing.txt`] .
1154+ /// If this `char` expands to multiple `char`s, the iterator yields the `char`s given by
1155+ /// [`SpecialCasing.txt`]. The maximum number of `char`s in a case mapping is 3.
11561156 ///
11571157 /// [`SpecialCasing.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/SpecialCasing.txt
11581158 ///
11591159 /// This operation performs an unconditional mapping without tailoring. That is, the conversion
1160- /// is independent of context and language.
1160+ /// is independent of context and language. See [below](#notes-on-context-and-locale)
1161+ /// for more information.
11611162 ///
11621163 /// In the [Unicode Standard], Chapter 4 (Character Properties) discusses case mapping in
11631164 /// general and Chapter 3 (Conformance) discusses the default algorithm for case conversion.
@@ -1199,6 +1200,48 @@ impl char {
11991200 /// // convert into themselves.
12001201 /// assert_eq!('山'.to_lowercase().to_string(), "山");
12011202 /// ```
1203+ /// # Notes on context and locale
1204+ ///
1205+ /// As stated earlier, this method does not take into account language or context.
1206+ /// Below is a non-exhaustive list of situations where this can be relevant.
1207+ /// If you need to handle locale-dependent casing in your code, consider using
1208+ /// an external crate, like [`icu_casemap`](https://crates.io/crates/icu_casemap)
1209+ /// which is developed by Unicode.
1210+ ///
1211+ /// ## Greek sigma
1212+ ///
1213+ /// In Greek, the letter sigma (uppercase Σ) has two lowercase forms:
1214+ /// ς which is used only at the end of a word, and σ which is used everywhere else.
1215+ /// `to_lowercase()` always uses the second form:
1216+ ///
1217+ /// ```
1218+ /// assert_eq!('Σ'.to_lowercase().to_string(), "σ");
1219+ /// ```
1220+ ///
1221+ /// ## Turkish and Azeri I/ı/İ/i
1222+ ///
1223+ /// In Turkish and Azeri, the equivalent of 'i' in Latin has five forms instead of two:
1224+ ///
1225+ /// * 'Dotless': I / ı, sometimes written ï
1226+ /// * 'Dotted': İ / i
1227+ ///
1228+ /// Note that the uppercase undotted 'I' is the same as the Latin. Therefore:
1229+ ///
1230+ /// ```
1231+ /// let lower_i = 'I'.to_lowercase().to_string();
1232+ /// ```
1233+ ///
1234+ /// The value of `lower_i` here relies on the language of the text: if we're
1235+ /// in `en-US`, it should be `"i"`, but if we're in `tr-TR` or `az-AZ`, it should
1236+ /// be `"ı"`. `to_lowercase()` does not take this into account, and so:
1237+ ///
1238+ /// ```
1239+ /// let lower_i = 'I'.to_lowercase().to_string();
1240+ ///
1241+ /// assert_eq!(lower_i, "i");
1242+ /// ```
1243+ ///
1244+ /// holds across languages.
12021245 #[ must_use = "this returns the lowercased character as a new iterator, \
12031246 without modifying the original"]
12041247 #[ stable( feature = "rust1" , since = "1.0.0" ) ]
@@ -1211,8 +1254,10 @@ impl char {
12111254 /// `char`s.
12121255 ///
12131256 /// This is usually, but not always, equivalent to the uppercase mapping
1214- /// returned by [`Self::to_uppercase`]. Prefer this method when seeking to capitalize
1215- /// Only The First Letter of a word, but use [`Self::to_uppercase`] for ALL CAPS.
1257+ /// returned by [`to_uppercase()`]. Prefer this method when seeking to capitalize
1258+ /// Only The First Letter of a word, but use [`to_uppercase()`] for ALL CAPS.
1259+ /// See [below](#difference-from-uppercase) for a thorough explanation
1260+ /// of the difference between the two methods.
12161261 ///
12171262 /// If this `char` does not have a titlecase mapping, the iterator yields the same `char`.
12181263 ///
@@ -1222,13 +1267,14 @@ impl char {
12221267 /// [ucd]: https://www.unicode.org/reports/tr44/
12231268 /// [`UnicodeData.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
12241269 ///
1225- /// If this `char` requires special considerations (e.g. multiple `char`s) the iterator yields
1226- /// the `char`(s) given by [`SpecialCasing.txt`] .
1270+ /// If this `char` expands to multiple `char`s, the iterator yields the `char`s given by
1271+ /// [`SpecialCasing.txt`]. The maximum number of `char`s in a case mapping is 3.
12271272 ///
12281273 /// [`SpecialCasing.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/SpecialCasing.txt
12291274 ///
12301275 /// This operation performs an unconditional mapping without tailoring. That is, the conversion
1231- /// is independent of context and language.
1276+ /// is independent of context and language. See [below](#note-on-locale)
1277+ /// for more information.
12321278 ///
12331279 /// In the [Unicode Standard], Chapter 4 (Character Properties) discusses case mapping in
12341280 /// general and Chapter 3 (Conformance) discusses the default algorithm for case conversion.
@@ -1265,8 +1311,9 @@ impl char {
12651311 /// ```
12661312 /// #![feature(titlecase)]
12671313 /// assert_eq!('c'.to_titlecase().to_string(), "C");
1314+ /// assert_eq!('ა'.to_titlecase().to_string(), "ა");
12681315 /// assert_eq!('dž'.to_titlecase().to_string(), "Dž");
1269- /// assert_eq!('ῼ '.to_titlecase().to_string(), "ῼ ");
1316+ /// assert_eq!('ᾨ'.to_titlecase().to_string(), "ᾨ");
12701317 ///
12711318 /// // Sometimes the result is more than one character:
12721319 /// assert_eq!('ß'.to_titlecase().to_string(), "Ss");
@@ -1276,8 +1323,78 @@ impl char {
12761323 /// assert_eq!('山'.to_titlecase().to_string(), "山");
12771324 /// ```
12781325 ///
1326+ /// # Difference from uppercase
1327+ ///
1328+ /// Currently, there are three classes of characters where [`to_uppercase()`]
1329+ /// and `to_titlecase()` give different results:
1330+ ///
1331+ /// ## Georgian script
1332+ ///
1333+ /// Each letter in the modern Georgian alphabet can be written in one of two forms:
1334+ /// the typical lowercase-like "mkhedruli" form, and a variant uppercase-like "mtavruli"
1335+ /// form. However, unlike uppercase in most cased scripts, mtavruli is not typically used
1336+ /// to start sentences, denote proper nouns, or for any other purpose
1337+ /// in running text. It is instead confined to titles and headings, which are written entirely
1338+ /// in mtavruli. For this reason, [`to_uppercase()`] applied to a Georgian letter
1339+ /// will return the mtavruli form, but `to_titlecase()` will return the mkhedruli form.
1340+ ///
1341+ /// ```
1342+ /// #![feature(titlecase)]
1343+ /// let ani = 'ა'; // First letter of the Georgian alphabet, in mkhedruli form
1344+ ///
1345+ /// // Titlecasing mkhedruli maps it to itself...
1346+ /// assert_eq!(ani.to_titlecase().to_string(), ani.to_string());
1347+ ///
1348+ /// // but uppercasing it maps it to mtavruli
1349+ /// assert_eq!(ani.to_uppercase().to_string(), "Ა");
1350+ /// ```
1351+ ///
1352+ /// ## Compatibility digraphs for Latin-alphabet Serbo-Croatian
1353+ ///
1354+ /// The standard Latin alphabet for the Serbo-Croatian language
1355+ /// (Bosnian, Croatian, Montenegrin, and Serbian) contains
1356+ /// three digraphs: Dž, Lj, and Nj. These are usually represented as
1357+ /// two characters. However, for compatibility with older character sets,
1358+ /// Unicode includes single-character versions of these digraphs.
1359+ /// Each has an uppercase, titlecase, and lowercase version:
1360+ ///
1361+ /// - `'DŽ'`, `'Dž'`, `'dž'`
1362+ /// - `'LJ'`, `'Lj'`, `'lj'`
1363+ /// - `'NJ'`, `'Nj'`, `'nj'`
1364+ ///
1365+ /// Unicode additionally encodes a casing triad for the Dz digraph
1366+ /// without the caron: `'DZ'`, `'Dz'`, `'dz'`.
1367+ ///
1368+ /// ## Iota-subscripted Greek vowels
1369+ ///
1370+ /// In ancient Greek, the long vowels alpha (α), eta (η), and omega (ω)
1371+ /// were sometimes followed by an iota (ι), forming a diphthong. Over time,
1372+ /// the diphthong pronunciation was slowly lost, with the iota becoming mute.
1373+ /// Eventually, the ι disappeared from the spelling as well.
1374+ /// However, there remains a need to represent ancient texts faithfully.
1375+ ///
1376+ /// Modern editions of ancient Greek texts commonly use a reduced-sized
1377+ /// ι symbol to denote mute iotas, while distinguishing them from ιs
1378+ /// which continued to affect pronunciation. The exact standard differs
1379+ /// between different publications. Some render the mute ι below its associated
1380+ /// vowel (subscript), while others place it to the right of said vowel (adscript).
1381+ /// The interaction of mute ι symbols with casing also varies.
1382+ ///
1383+ /// The Unicode Standard, for its default casing rules, chose to make lowercase
1384+ /// Greek vowels with iota subscript (e.g. `'ᾠ'`) titlecase to the uppercase vowel
1385+ /// with iota subscript (`'ᾨ'`) but uppercase to the uppercase vowel followed by
1386+ /// full-size uppercase iota (`"ὨΙ"`). This is just one convention among many
1387+ /// in common use, but it is the one Unicode settled on,
1388+ /// so it is what this method does also.
1389+ ///
12791390 /// # Note on locale
12801391 ///
1392+ /// As stated above, this method is locale-insensitive.
1393+ /// If you need locale support, consider using an external crate,
1394+ /// like [`icu_casemap`](https://crates.io/crates/icu_casemap)
1395+ /// which is developed by Unicode. A description of a common
1396+ /// locale-dependent casing issue follows:
1397+ ///
12811398 /// In Turkish and Azeri, the equivalent of 'i' in Latin has five forms instead of two:
12821399 ///
12831400 /// * 'Dotless': I / ı, sometimes written ï
@@ -1302,6 +1419,8 @@ impl char {
13021419 /// ```
13031420 ///
13041421 /// holds across languages.
1422+ ///
1423+ /// [`to_uppercase()`]: Self::to_uppercase()
13051424 #[ must_use = "this returns the titlecased character as a new iterator, \
13061425 without modifying the original"]
13071426 #[ unstable( feature = "titlecase" , issue = "153892" ) ]
@@ -1313,8 +1432,9 @@ impl char {
13131432 /// Returns an iterator that yields the uppercase mapping of this `char` as one or more
13141433 /// `char`s.
13151434 ///
1316- /// Prefer this method when converting a word into ALL CAPS, but consider [`Self::to_titlecase`]
1317- /// instead if you seek to capitalize Only The First Letter.
1435+ /// Prefer this method when converting a word into ALL CAPS, but consider [`to_titlecase()`]
1436+ /// instead if you seek to capitalize Only The First Letter. See that method's documentation
1437+ /// for more information on the difference between the two.
13181438 ///
13191439 /// If this `char` does not have an uppercase mapping, the iterator yields the same `char`.
13201440 ///
@@ -1324,20 +1444,22 @@ impl char {
13241444 /// [ucd]: https://www.unicode.org/reports/tr44/
13251445 /// [`UnicodeData.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
13261446 ///
1327- /// If this `char` requires special considerations (e.g. multiple `char`s) the iterator yields
1328- /// the `char`(s) given by [`SpecialCasing.txt`] .
1447+ /// If this `char` expands to multiple `char`s, the iterator yields the `char`s given by
1448+ /// [`SpecialCasing.txt`]. The maximum number of `char`s in a case mapping is 3.
13291449 ///
13301450 /// [`SpecialCasing.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/SpecialCasing.txt
13311451 ///
13321452 /// This operation performs an unconditional mapping without tailoring. That is, the conversion
1333- /// is independent of context and language.
1453+ /// is independent of context and language. See [below](#note-on-locale)
1454+ /// for more information.
13341455 ///
13351456 /// In the [Unicode Standard], Chapter 4 (Character Properties) discusses case mapping in
13361457 /// general and Chapter 3 (Conformance) discusses the default algorithm for case conversion.
13371458 ///
13381459 /// [Unicode Standard]: https://www.unicode.org/versions/latest/
13391460 ///
13401461 /// # Examples
1462+ ///
13411463 /// `'ſt'` (U+FB05) is a single Unicode code point (a ligature) that maps to "ST" in uppercase.
13421464 ///
13431465 /// As an iterator:
@@ -1365,11 +1487,12 @@ impl char {
13651487 ///
13661488 /// ```
13671489 /// assert_eq!('c'.to_uppercase().to_string(), "C");
1490+ /// assert_eq!('ა'.to_uppercase().to_string(), "Ა");
13681491 /// assert_eq!('dž'.to_uppercase().to_string(), "DŽ");
13691492 ///
13701493 /// // Sometimes the result is more than one character:
13711494 /// assert_eq!('ſt'.to_uppercase().to_string(), "ST");
1372- /// assert_eq!('ῼ '.to_uppercase().to_string(), "ΩΙ ");
1495+ /// assert_eq!('ᾨ'.to_uppercase().to_string(), "ὨΙ");
13731496 ///
13741497 /// // Characters that do not have both uppercase and lowercase
13751498 /// // convert into themselves.
@@ -1378,6 +1501,12 @@ impl char {
13781501 ///
13791502 /// # Note on locale
13801503 ///
1504+ /// As stated above, this method is locale-insensitive.
1505+ /// If you need locale support, consider using an external crate,
1506+ /// like [`icu_casemap`](https://crates.io/crates/icu_casemap)
1507+ /// which is developed by Unicode. A description of a common
1508+ /// locale-dependent casing issue follows:
1509+ ///
13811510 /// In Turkish and Azeri, the equivalent of 'i' in Latin has five forms instead of two:
13821511 ///
13831512 /// * 'Dotless': I / ı, sometimes written ï
@@ -1400,6 +1529,8 @@ impl char {
14001529 /// ```
14011530 ///
14021531 /// holds across languages.
1532+ ///
1533+ /// [`to_titlecase()`]: Self::to_titlecase()
14031534 #[ must_use = "this returns the uppercased character as a new iterator, \
14041535 without modifying the original"]
14051536 #[ stable( feature = "rust1" , since = "1.0.0" ) ]
0 commit comments