Support special characters from any Latin-script language when converting names to ASCII

agusaldasoro · agusaldasoro · commit 47e2f90df345 · 2026-04-06T19:00:14.000-03:00
diff --git a/core/src/main/kotlin/org/evomaster/core/utils/StringUtils.kt b/core/src/main/kotlin/org/evomaster/core/utils/StringUtils.kt
@@ -82,21 +82,77 @@ object StringUtils {
     }
 
     /**
-     * Replaces non-ASCII characters in a name to make it a valid SMT-LIB identifier.
-     * SMT-LIB unquoted symbols are restricted to ASCII, so characters like Æ, Ø, Å must be transliterated.
+     * Converts [name] to an ASCII-only string so that it can be used as an SMT-LIB identifier.
+     * SMT-LIB unquoted symbols are restricted to ASCII.
      *
-     * This is needed because our test suite includes Norwegian APIs whose database schemas
-     * contain column and table names with Norwegian characters (Æ, Ø, Å).
+     * The conversion uses two complementary steps:
+     * 1. An explicit folding map ([ASCII_FOLD_MAP]) for characters that have no canonical decomposition
+     *    under NFD (e.g., Ø→O, Æ→AE, ß→ss, ð→d, þ→th, Ł→L, Œ→OE, ŋ→n, ħ→h, ı→i, …).
+     * 2. NFD normalization followed by removal of every remaining non-ASCII character, which handles
+     *    all accented characters that do decompose (e.g., é→e, ü→u, ñ→n, Ä→A, ö→o, å→a).
      *
-     * Characters that do not decompose under NFD (Ø, Æ) are replaced explicitly.
-     * Characters that decompose under NFD (Å→A, and other accented letters like é, ü, ñ)
-     * are handled by normalizing to NFD form and stripping the remaining non-ASCII combining marks.
+     * Characters that neither fold nor decompose (e.g., from non-Latin scripts) are dropped.
+     * @return the ASCII-only form of [name]; it can be shorter than the input, or even empty.
      */
     fun convertToAscii(name: String): String {
-        val replaced = name
-            .replace('Ø', 'O').replace('ø', 'o')
-            .replace("Æ", "AE").replace("æ", "ae")
-        return java.text.Normalizer.normalize(replaced, java.text.Normalizer.Form.NFD)
-            .replace(Regex("[^\\x00-\\x7F]"), "")
+        val folded = buildString(name.length * 2) { for (ch in name) append(ASCII_FOLD_MAP[ch] ?: ch.toString()) }
+        return java.text.Normalizer.normalize(folded, java.text.Normalizer.Form.NFD)
+            .replace(NON_ASCII_REGEX, "")
     }
+
+    /** Matches every character outside the 7-bit ASCII range; compiled once instead of on each call. */
+    private val NON_ASCII_REGEX = Regex("[^\\x00-\\x7F]")
+
+    /**
+     * Explicit ASCII replacements, keyed by character, for letters that NFD normalization cannot reduce to ASCII.
+     * Covers selected letters from the Unicode Latin-1 Supplement and Latin Extended-A/B blocks; a replacement
+     * may be longer than one character (e.g., Æ→AE). Characters that DO decompose under NFD (e.g., Ä, ö, å, é,
+     * ü, ñ) are handled by the NFD step in [convertToAscii] and need no entry here.
+     */
+    private val ASCII_FOLD_MAP: Map<Char, String> = mapOf(
+        // Latin-1 Supplement
+        'Æ' to "AE", 'æ' to "ae",   // AE ligature (Danish, Norwegian, Old English)
+        'Ð' to "D",  'ð' to "d",    // Eth (Icelandic, Old English)
+        'Ø' to "O",  'ø' to "o",    // O with stroke (Danish, Norwegian)
+        'Þ' to "TH", 'þ' to "th",   // Thorn (Icelandic, Old English)
+        'ß' to "ss",                  // Sharp S (German)
+        // Latin Extended-A
+        'Đ' to "D",  'đ' to "d",    // D with stroke (Croatian, Vietnamese, Sami)
+        'Ħ' to "H",  'ħ' to "h",    // H with stroke (Maltese)
+        'ı' to "i",                   // Dotless i (Turkish, Azerbaijani)
+        'Ĳ' to "IJ", 'ĳ' to "ij",   // IJ digraph (Dutch)
+        'ĸ' to "k",                   // Kra (Greenlandic)
+        'Ł' to "L",  'ł' to "l",    // L with stroke (Polish, Croatian, Sorbian)
+        'Ŋ' to "N",  'ŋ' to "n",    // Eng (Sami, African languages)
+        'Œ' to "OE", 'œ' to "oe",   // OE ligature (French)
+        'Ŧ' to "T",  'ŧ' to "t",    // T with stroke (Sami)
+        // Latin Extended-B
+        'ƀ' to "b",  'Ƀ' to "B",    // B with stroke
+        'Ɓ' to "B",                   // B with hook
+        'Ƈ' to "C",  'ƈ' to "c",    // C with hook
+        'Ɗ' to "D",                   // D with hook
+        'ƌ' to "d",                   // D with topbar
+        'Ƒ' to "F",  'ƒ' to "f",    // F with hook
+        'Ɠ' to "G",                   // G with hook
+        'Ɨ' to "I",                   // I with stroke
+        'Ƙ' to "K",  'ƙ' to "k",    // K with hook
+        'ƚ' to "l",                   // L with bar
+        'Ɲ' to "N",  'ƞ' to "n",    // N with hook / N with long right leg
+        'Ƥ' to "P",  'ƥ' to "p",    // P with hook
+        'ƫ' to "t",                   // T with palatal hook
+        'Ƭ' to "T",  'ƭ' to "t",    // T with hook
+        'Ʈ' to "T",                   // T with retroflex hook
+        'Ư' to "U",  'ư' to "u",    // U with horn (Vietnamese)
+        'Ʋ' to "V",                   // V with hook
+        'Ƴ' to "Y",  'ƴ' to "y",    // Y with hook
+        'Ƶ' to "Z",  'ƶ' to "z",    // Z with stroke
+        'Ǝ' to "E",  'ǝ' to "e",    // Reversed E / Schwa
+        'Ɵ' to "O",                   // O with middle tilde
+        'Ȼ' to "C",  'ȼ' to "c",    // C with stroke
+        'Ɇ' to "E",  'ɇ' to "e",    // E with stroke
+        'Ɉ' to "J",  'ɉ' to "j",    // J with stroke
+        'Ɋ' to "Q",  'ɋ' to "q",    // Q with hook tail
+        'Ɍ' to "R",  'ɍ' to "r",    // R with stroke
+        'Ɏ' to "Y",  'ɏ' to "y",    // Y with stroke
+    )
 }
diff --git a/core/src/test/kotlin/org/evomaster/core/utils/StringUtilsTest.kt b/core/src/test/kotlin/org/evomaster/core/utils/StringUtilsTest.kt
@@ -19,4 +19,66 @@ class StringUtilsTest{
         assertEquals(", Hello",lines[2])
         assertEquals(", C, D",lines[3])
     }
+
+    @Test
+    fun testConvertToAsciiPlainAsciiUnchanged() {
+        // Input that is already pure ASCII must come back unchanged.
+        listOf("hello_world", "FooBar123").forEach { assertEquals(it, StringUtils.convertToAscii(it)) }
+    }
+
+    @Test
+    fun testConvertToAsciiNorwegianDanish() {
+        val cases = listOf(
+            // Ø/ø and Æ/æ do not decompose under NFD (explicit map); Å/å decomposes under NFD
+            "Ø" to "O", "ø" to "o", "Æ" to "AE", "æ" to "ae",
+            "Å" to "A", "å" to "a",
+        )
+        for ((input, expected) in cases) {
+            assertEquals(expected, StringUtils.convertToAscii(input))
+        }
+    }
+
+    @Test
+    fun testConvertToAsciiSwedishGerman() {
+        val cases = listOf(
+            // These all decompose under NFD (base letter + combining diacritic)
+            "Ä" to "A", "ä" to "a", "Ö" to "O", "ö" to "o", "Ü" to "U", "ü" to "u",
+            // ß does not decompose under NFD, so the explicit map handles it
+            "ß" to "ss",
+        )
+        for ((input, expected) in cases) {
+            assertEquals(expected, StringUtils.convertToAscii(input))
+        }
+    }
+
+    @Test
+    fun testConvertToAsciiIcelandic() {
+        // Eth and thorn, upper and lower case, are folded through the explicit map
+        listOf("Ð" to "D", "ð" to "d", "Þ" to "TH", "þ" to "th").forEach { (input, expected) ->
+            assertEquals(expected, StringUtils.convertToAscii(input))
+        }
+    }
+
+    @Test
+    fun testConvertToAsciiPolishFrench() {
+        // L with stroke and the OE ligature are folded through the explicit map
+        listOf("Ł" to "L", "ł" to "l", "Œ" to "OE", "œ" to "oe").forEach { (input, expected) ->
+            assertEquals(expected, StringUtils.convertToAscii(input))
+        }
+    }
+
+    @Test
+    fun testConvertToAsciiOtherAccented() {
+        // Common accented characters that decompose under NFD
+        val cases = listOf("é" to "e", "è" to "e", "ñ" to "n", "ç" to "c")
+        cases.forEach { (input, expected) ->
+            assertEquals(expected, StringUtils.convertToAscii(input))
+        }
+    }
+
+    @Test
+    fun testConvertToAsciiMixedString() {
+        val cases = listOf("StrømsÆlv" to "StromsAElv", "Malostranké_náměstí" to "Malostranke_namesti")
+        cases.forEach { (input, expected) -> assertEquals(expected, StringUtils.convertToAscii(input)) }
+    }
 }