Skip to content

Commit 47e2f90

Browse files
committed
Support for any language special characters
1 parent 03099b4 commit 47e2f90

File tree

2 files changed

+129
-11
lines changed

2 files changed

+129
-11
lines changed

core/src/main/kotlin/org/evomaster/core/utils/StringUtils.kt

Lines changed: 67 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -82,21 +82,77 @@ object StringUtils {
8282
}
8383

8484
/**
85-
* Replaces non-ASCII characters in a name to make it a valid SMT-LIB identifier.
86-
* SMT-LIB unquoted symbols are restricted to ASCII, so characters like Æ, Ø, Å must be transliterated.
85+
* Converts a string to a valid ASCII identifier for use in SMT-LIB.
86+
* SMT-LIB unquoted symbols are restricted to ASCII.
8787
*
88-
* This is needed because our test suite includes Norwegian APIs whose database schemas
89-
* contain column and table names with Norwegian characters (Æ, Ø, Å).
88+
* The conversion uses two complementary steps:
89+
* 1. An explicit folding map for characters that have no canonical decomposition under NFD
90+
* (e.g., Ø→O, Æ→AE, ß→ss, ð→d, þ→th, Ł→L, Œ→OE, ŋ→n, ħ→h, ı→i, …),
91+
* covering non-decomposable characters from the Unicode Latin Extended blocks.
92+
* 2. NFD normalization followed by stripping of non-ASCII combining marks, which handles
93+
* all accented characters that do decompose (e.g., é→e, ü→u, ñ→n, Ä→A, ö→o, å→a).
9094
*
91-
* Characters that do not decompose under NFD (Ø, Æ) are replaced explicitly.
92-
* Characters that decompose under NFD (Å→A, and other accented letters like é, ü, ñ)
93-
* are handled by normalizing to NFD form and stripping the remaining non-ASCII combining marks.
95+
* Any remaining non-ASCII characters (e.g., from non-Latin scripts) are dropped.
9496
*/
9597
fun convertToAscii(name: String): String {
96-
val replaced = name
97-
.replace('Ø', 'O').replace('ø', 'o')
98-
.replace("Æ", "AE").replace("æ", "ae")
99-
return java.text.Normalizer.normalize(replaced, java.text.Normalizer.Form.NFD)
98+
val sb = StringBuilder(name.length * 2)
99+
for (ch in name) {
100+
sb.append(ASCII_FOLD_MAP[ch] ?: ch.toString())
101+
}
102+
return java.text.Normalizer.normalize(sb.toString(), java.text.Normalizer.Form.NFD)
100103
.replace(Regex("[^\\x00-\\x7F]"), "")
101104
}
105+
106+
/**
107+
* Explicit ASCII replacements for Unicode characters that do not decompose under NFD normalization.
108+
* Covers non-decomposable characters from the Unicode Latin-1 Supplement and Latin Extended-A/B blocks.
109+
* Characters that DO decompose under NFD (e.g., Ä, ö, å, é, ü, ñ) are handled by the NFD step in
110+
* [convertToAscii] and need no entry here.
111+
*/
112+
private val ASCII_FOLD_MAP: Map<Char, String> = mapOf(
113+
// Latin-1 Supplement
114+
'Æ' to "AE", 'æ' to "ae", // AE ligature (Danish, Norwegian, Old English)
115+
'Ð' to "D", 'ð' to "d", // Eth (Icelandic, Old English)
116+
'Ø' to "O", 'ø' to "o", // O with stroke (Danish, Norwegian)
117+
'Þ' to "TH", 'þ' to "th", // Thorn (Icelandic, Old English)
118+
'ß' to "ss", // Sharp S (German)
119+
// Latin Extended-A
120+
'Ħ' to "H", 'ħ' to "h", // H with stroke (Maltese)
121+
'ı' to "i", // Dotless i (Turkish, Azerbaijani)
122+
'\u0132' to "IJ", '\u0133' to "ij", // IJ ligature, U+0132/U+0133 (Dutch)
123+
'ĸ' to "k", // Kra (Greenlandic)
124+
'Ł' to "L", 'ł' to "l", // L with stroke (Polish, Croatian, Sorbian)
125+
'Ŋ' to "N", 'ŋ' to "n", // Eng (Sami, African languages)
126+
'Œ' to "OE", 'œ' to "oe", // OE ligature (French)
127+
'Ŧ' to "T", 'ŧ' to "t", // T with stroke (Sami)
128+
// Latin Extended-B
129+
'ƀ' to "b", 'Ƀ' to "B", // B with stroke
130+
'Ɓ' to "B", // B with hook
131+
'Ƈ' to "C", 'ƈ' to "c", // C with hook
132+
'Ɗ' to "D", // D with hook
133+
'ƌ' to "d", // D with topbar
134+
'Ƒ' to "F", 'ƒ' to "f", // F with hook
135+
'Ɠ' to "G", // G with hook
136+
'Ɨ' to "I", // I with stroke
137+
'Ƙ' to "K", 'ƙ' to "k", // K with hook
138+
'ƚ' to "l", // L with bar
139+
'Ɲ' to "N", 'ƞ' to "n", // N with hook / N with long right leg
140+
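'Œ' to "OE", 'œ' to "oe", // NOTE(review): duplicate of the OE ligature entry in the Latin Extended-A section (U+0152/U+0153 are not in Latin Extended-B); harmless in mapOf since both pairs carry the same value, but likely unintended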
141+
'Ƥ' to "P", 'ƥ' to "p", // P with hook
142+
'ƫ' to "t", // T with palatal hook
143+
'Ƭ' to "T", 'ƭ' to "t", // T with hook
144+
'Ʈ' to "T", // T with retroflex hook
145+
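'Ư' to "U", 'ư' to "u", // U with horn (Vietnamese); note: decomposes under NFD (U + combining horn U+031B), so this entry is redundant with the NFD step but harmless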
146+
'Ʋ' to "V", // V with hook
147+
'Ƴ' to "Y", 'ƴ' to "y", // Y with hook
148+
'Ƶ' to "Z", 'ƶ' to "z", // Z with stroke
149+
'Ǝ' to "E", 'ǝ' to "e", // Reversed E (U+018E) / turned e (U+01DD)
150+
'Ɵ' to "O", // O with middle tilde
151+
'Ȼ' to "C", 'ȼ' to "c", // C with stroke
152+
'Ɇ' to "E", 'ɇ' to "e", // E with stroke
153+
'Ɉ' to "J", 'ɉ' to "j", // J with stroke
154+
'Ɋ' to "Q", 'ɋ' to "q", // Q with hook tail
155+
'Ɍ' to "R", 'ɍ' to "r", // R with stroke
156+
'Ɏ' to "Y", 'ɏ' to "y", // Y with stroke
157+
)
102158
}

core/src/test/kotlin/org/evomaster/core/utils/StringUtilsTest.kt

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,4 +19,66 @@ class StringUtilsTest{
1919
assertEquals(", Hello",lines[2])
2020
assertEquals(", C, D",lines[3])
2121
}
22+
23+
@Test
24+
fun testConvertToAsciiPlainAsciiUnchanged() {
25+
assertEquals("hello_world", StringUtils.convertToAscii("hello_world"))
26+
assertEquals("FooBar123", StringUtils.convertToAscii("FooBar123"))
27+
}
28+
29+
@Test
30+
fun testConvertToAsciiNorwegianDanish() {
31+
// Ø/ø and Æ/æ do not decompose under NFD — handled by explicit map
32+
assertEquals("O", StringUtils.convertToAscii("Ø"))
33+
assertEquals("o", StringUtils.convertToAscii("ø"))
34+
assertEquals("AE", StringUtils.convertToAscii("Æ"))
35+
assertEquals("ae", StringUtils.convertToAscii("æ"))
36+
// Å/å decomposes under NFD
37+
assertEquals("A", StringUtils.convertToAscii("Å"))
38+
assertEquals("a", StringUtils.convertToAscii("å"))
39+
}
40+
41+
@Test
42+
fun testConvertToAsciiSwedishGerman() {
43+
// These all decompose under NFD (base letter + combining diacritic)
44+
assertEquals("A", StringUtils.convertToAscii("Ä"))
45+
assertEquals("a", StringUtils.convertToAscii("ä"))
46+
assertEquals("O", StringUtils.convertToAscii("Ö"))
47+
assertEquals("o", StringUtils.convertToAscii("ö"))
48+
assertEquals("U", StringUtils.convertToAscii("Ü"))
49+
assertEquals("u", StringUtils.convertToAscii("ü"))
50+
// ß does not decompose under NFD — handled by explicit map
51+
assertEquals("ss", StringUtils.convertToAscii("ß"))
52+
}
53+
54+
@Test
55+
fun testConvertToAsciiIcelandic() {
56+
assertEquals("D", StringUtils.convertToAscii("Ð"))
57+
assertEquals("d", StringUtils.convertToAscii("ð"))
58+
assertEquals("TH", StringUtils.convertToAscii("Þ"))
59+
assertEquals("th", StringUtils.convertToAscii("þ"))
60+
}
61+
62+
@Test
63+
fun testConvertToAsciiPolishFrench() {
64+
assertEquals("L", StringUtils.convertToAscii("Ł"))
65+
assertEquals("l", StringUtils.convertToAscii("ł"))
66+
assertEquals("OE", StringUtils.convertToAscii("Œ"))
67+
assertEquals("oe", StringUtils.convertToAscii("œ"))
68+
}
69+
70+
@Test
71+
fun testConvertToAsciiOtherAccented() {
72+
// Common accented characters that decompose under NFD
73+
assertEquals("e", StringUtils.convertToAscii("é"))
74+
assertEquals("e", StringUtils.convertToAscii("è"))
75+
assertEquals("n", StringUtils.convertToAscii("ñ"))
76+
assertEquals("c", StringUtils.convertToAscii("ç"))
77+
}
78+
79+
@Test
80+
fun testConvertToAsciiMixedString() {
81+
assertEquals("StromsAElv", StringUtils.convertToAscii("StrømsÆlv"))
82+
assertEquals("Malostranke_namesti", StringUtils.convertToAscii("Malostranké_náměstí"))
83+
}
2284
}

0 commit comments

Comments
 (0)