diff --git a/src/main/java/org/apache/commons/lang3/StringUtils.java b/src/main/java/org/apache/commons/lang3/StringUtils.java index 4d7150fca79..c8ef44dbd48 100644 --- a/src/main/java/org/apache/commons/lang3/StringUtils.java +++ b/src/main/java/org/apache/commons/lang3/StringUtils.java @@ -7259,14 +7259,17 @@ private static String[] splitByCharacterType(final String str, final boolean cam final char[] c = str.toCharArray(); final List list = new ArrayList<>(); int tokenStart = 0; - int currentType = Character.getType(c[tokenStart]); - for (int pos = tokenStart + 1; pos < c.length; pos++) { - final int type = Character.getType(c[pos]); + int currentType = Character.getType(Character.codePointAt(c, tokenStart)); + for (int pos = tokenStart + Character.charCount(Character.codePointAt(c, tokenStart)); pos < c.length;) { + final int codePoint = Character.codePointAt(c, pos); + final int type = Character.getType(codePoint); + final int count = Character.charCount(codePoint); if (type == currentType) { + pos += count; continue; } if (camelCase && type == Character.LOWERCASE_LETTER && currentType == Character.UPPERCASE_LETTER) { - final int newTokenStart = pos - 1; + final int newTokenStart = pos - Character.charCount(Character.codePointBefore(c, pos)); if (newTokenStart != tokenStart) { list.add(new String(c, tokenStart, newTokenStart - tokenStart)); tokenStart = newTokenStart; @@ -7276,6 +7279,7 @@ private static String[] splitByCharacterType(final String str, final boolean cam tokenStart = pos; } currentType = type; + pos += count; } list.add(new String(c, tokenStart, c.length - tokenStart)); return list.toArray(ArrayUtils.EMPTY_STRING_ARRAY); diff --git a/src/test/java/org/apache/commons/lang3/StringUtilsTest.java b/src/test/java/org/apache/commons/lang3/StringUtilsTest.java index fa34c839df5..e90b276fbc7 100644 --- a/src/test/java/org/apache/commons/lang3/StringUtilsTest.java +++ b/src/test/java/org/apache/commons/lang3/StringUtilsTest.java @@ -2355,6 +2355,18 @@ void testSplitByCharacterType() { assertTrue(Objects.deepEquals(new String[]{"ASFR", "ules"}, StringUtils.splitByCharacterType("ASFRules"))); + + // Supplementary code points are classified by their own type, not split apart as surrogates. + // U+1D400 MATHEMATICAL BOLD CAPITAL A is an upper-case letter, like ASCII 'A'. + final String boldA = new String(Character.toChars(0x1D400)); + // U+1D7D3 MATHEMATICAL BOLD DIGIT FIVE is a decimal digit, like ASCII '5'. + final String boldFive = new String(Character.toChars(0x1D7D3)); + assertTrue(Objects.deepEquals(new String[]{"A" + boldA}, + StringUtils.splitByCharacterType("A" + boldA))); + assertTrue(Objects.deepEquals(new String[]{"5" + boldFive}, + StringUtils.splitByCharacterType("5" + boldFive))); + assertTrue(Objects.deepEquals(new String[]{boldA, "5" + boldFive, "z"}, + StringUtils.splitByCharacterType(boldA + "5" + boldFive + "z"))); } @Test @@ -2382,6 +2394,16 @@ void testSplitByCharacterTypeCamelCase() { assertTrue(Objects.deepEquals(new String[]{"ASF", "Rules"}, StringUtils.splitByCharacterTypeCamelCase("ASFRules"))); + + // A supplementary upper-case letter immediately before a lower-case run joins the following token, + // exactly as a BMP upper-case letter does. U+1D400 MATHEMATICAL BOLD CAPITAL A is an upper-case letter. + final String boldA = new String(Character.toChars(0x1D400)); + assertTrue(Objects.deepEquals(new String[]{boldA + "bc"}, + StringUtils.splitByCharacterTypeCamelCase(boldA + "bc"))); + assertTrue(Objects.deepEquals(new String[]{"AB", boldA + "cd"}, + StringUtils.splitByCharacterTypeCamelCase("AB" + boldA + "cd"))); + assertTrue(Objects.deepEquals(new String[]{"foo", boldA + "bar"}, + StringUtils.splitByCharacterTypeCamelCase("foo" + boldA + "bar"))); } @Test