Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions src/main/java/org/apache/commons/lang3/StringUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -7259,14 +7259,17 @@ private static String[] splitByCharacterType(final String str, final boolean cam
final char[] c = str.toCharArray();
final List<String> list = new ArrayList<>();
int tokenStart = 0;
int currentType = Character.getType(c[tokenStart]);
for (int pos = tokenStart + 1; pos < c.length; pos++) {
final int type = Character.getType(c[pos]);
int currentType = Character.getType(Character.codePointAt(c, tokenStart));
for (int pos = tokenStart + Character.charCount(Character.codePointAt(c, tokenStart)); pos < c.length;) {
final int codePoint = Character.codePointAt(c, pos);
final int type = Character.getType(codePoint);
final int count = Character.charCount(codePoint);
if (type == currentType) {
pos += count;
continue;
}
if (camelCase && type == Character.LOWERCASE_LETTER && currentType == Character.UPPERCASE_LETTER) {
final int newTokenStart = pos - 1;
final int newTokenStart = pos - Character.charCount(Character.codePointBefore(c, pos));
if (newTokenStart != tokenStart) {
list.add(new String(c, tokenStart, newTokenStart - tokenStart));
tokenStart = newTokenStart;
Expand All @@ -7276,6 +7279,7 @@ private static String[] splitByCharacterType(final String str, final boolean cam
tokenStart = pos;
}
currentType = type;
pos += count;
}
list.add(new String(c, tokenStart, c.length - tokenStart));
return list.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
Expand Down
22 changes: 22 additions & 0 deletions src/test/java/org/apache/commons/lang3/StringUtilsTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -2355,6 +2355,18 @@ void testSplitByCharacterType() {

assertTrue(Objects.deepEquals(new String[]{"ASFR", "ules"},
StringUtils.splitByCharacterType("ASFRules")));

// Supplementary code points are classified by their own type, not split apart as surrogates.
// U+1D400 MATHEMATICAL BOLD CAPITAL A is an upper-case letter, like ASCII 'A'.
final String boldA = new String(Character.toChars(0x1D400));
// U+1D7D3 MATHEMATICAL BOLD DIGIT FIVE is a decimal digit, like ASCII '5'.
final String boldFive = new String(Character.toChars(0x1D7D3));
assertTrue(Objects.deepEquals(new String[]{"A" + boldA},
StringUtils.splitByCharacterType("A" + boldA)));
assertTrue(Objects.deepEquals(new String[]{"5" + boldFive},
StringUtils.splitByCharacterType("5" + boldFive)));
assertTrue(Objects.deepEquals(new String[]{boldA, "5" + boldFive, "z"},
StringUtils.splitByCharacterType(boldA + "5" + boldFive + "z")));
}

@Test
Expand Down Expand Up @@ -2382,6 +2394,16 @@ void testSplitByCharacterTypeCamelCase() {

assertTrue(Objects.deepEquals(new String[]{"ASF", "Rules"},
StringUtils.splitByCharacterTypeCamelCase("ASFRules")));

// A supplementary upper-case letter immediately before a lower-case run joins the following token,
// exactly as a BMP upper-case letter does. U+1D400 MATHEMATICAL BOLD CAPITAL A is an upper-case letter.
final String boldA = new String(Character.toChars(0x1D400));
assertTrue(Objects.deepEquals(new String[]{boldA + "bc"},
StringUtils.splitByCharacterTypeCamelCase(boldA + "bc")));
assertTrue(Objects.deepEquals(new String[]{"AB", boldA + "cd"},
StringUtils.splitByCharacterTypeCamelCase("AB" + boldA + "cd")));
assertTrue(Objects.deepEquals(new String[]{"foo", boldA + "bar"},
StringUtils.splitByCharacterTypeCamelCase("foo" + boldA + "bar")));
}

@Test
Expand Down
Loading