Skip to content

Commit 37a2be8

Browse files
authored
Handle supplementary code points in splitByCharacterType (#1734)
1 parent 261392f commit 37a2be8

2 files changed

Lines changed: 30 additions & 4 deletions

File tree

src/main/java/org/apache/commons/lang3/StringUtils.java

Lines changed: 8 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -7259,14 +7259,17 @@ private static String[] splitByCharacterType(final String str, final boolean cam
72597259
final char[] c = str.toCharArray();
72607260
final List<String> list = new ArrayList<>();
72617261
int tokenStart = 0;
7262-
int currentType = Character.getType(c[tokenStart]);
7263-
for (int pos = tokenStart + 1; pos < c.length; pos++) {
7264-
final int type = Character.getType(c[pos]);
7262+
int currentType = Character.getType(Character.codePointAt(c, tokenStart));
7263+
for (int pos = tokenStart + Character.charCount(Character.codePointAt(c, tokenStart)); pos < c.length;) {
7264+
final int codePoint = Character.codePointAt(c, pos);
7265+
final int type = Character.getType(codePoint);
7266+
final int count = Character.charCount(codePoint);
72657267
if (type == currentType) {
7268+
pos += count;
72667269
continue;
72677270
}
72687271
if (camelCase && type == Character.LOWERCASE_LETTER && currentType == Character.UPPERCASE_LETTER) {
7269-
final int newTokenStart = pos - 1;
7272+
final int newTokenStart = pos - Character.charCount(Character.codePointBefore(c, pos));
72707273
if (newTokenStart != tokenStart) {
72717274
list.add(new String(c, tokenStart, newTokenStart - tokenStart));
72727275
tokenStart = newTokenStart;
@@ -7276,6 +7279,7 @@ private static String[] splitByCharacterType(final String str, final boolean cam
72767279
tokenStart = pos;
72777280
}
72787281
currentType = type;
7282+
pos += count;
72797283
}
72807284
list.add(new String(c, tokenStart, c.length - tokenStart));
72817285
return list.toArray(ArrayUtils.EMPTY_STRING_ARRAY);

src/test/java/org/apache/commons/lang3/StringUtilsTest.java

Lines changed: 22 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2355,6 +2355,18 @@ void testSplitByCharacterType() {
23552355

23562356
assertTrue(Objects.deepEquals(new String[]{"ASFR", "ules"},
23572357
StringUtils.splitByCharacterType("ASFRules")));
2358+
2359+
// Supplementary code points are classified by their own type, not split apart as surrogates.
2360+
// U+1D400 MATHEMATICAL BOLD CAPITAL A is an upper-case letter, like ASCII 'A'.
2361+
final String boldA = new String(Character.toChars(0x1D400));
2362+
// U+1D7D3 MATHEMATICAL BOLD DIGIT FIVE is a decimal digit, like ASCII '5'.
2363+
final String boldFive = new String(Character.toChars(0x1D7D3));
2364+
assertTrue(Objects.deepEquals(new String[]{"A" + boldA},
2365+
StringUtils.splitByCharacterType("A" + boldA)));
2366+
assertTrue(Objects.deepEquals(new String[]{"5" + boldFive},
2367+
StringUtils.splitByCharacterType("5" + boldFive)));
2368+
assertTrue(Objects.deepEquals(new String[]{boldA, "5" + boldFive, "z"},
2369+
StringUtils.splitByCharacterType(boldA + "5" + boldFive + "z")));
23582370
}
23592371

23602372
@Test
@@ -2382,6 +2394,16 @@ void testSplitByCharacterTypeCamelCase() {
23822394

23832395
assertTrue(Objects.deepEquals(new String[]{"ASF", "Rules"},
23842396
StringUtils.splitByCharacterTypeCamelCase("ASFRules")));
2397+
2398+
// A supplementary upper-case letter immediately before a lower-case run joins the following token,
2399+
// exactly as a BMP upper-case letter does. U+1D400 MATHEMATICAL BOLD CAPITAL A is an upper-case letter.
2400+
final String boldA = new String(Character.toChars(0x1D400));
2401+
assertTrue(Objects.deepEquals(new String[]{boldA + "bc"},
2402+
StringUtils.splitByCharacterTypeCamelCase(boldA + "bc")));
2403+
assertTrue(Objects.deepEquals(new String[]{"AB", boldA + "cd"},
2404+
StringUtils.splitByCharacterTypeCamelCase("AB" + boldA + "cd")));
2405+
assertTrue(Objects.deepEquals(new String[]{"foo", boldA + "bar"},
2406+
StringUtils.splitByCharacterTypeCamelCase("foo" + boldA + "bar")));
23852407
}
23862408

23872409
@Test

0 commit comments

Comments
 (0)