@@ -82,21 +82,77 @@ object StringUtils {
8282 }
8383
/**
 * Converts [name] to an ASCII-only string intended for use as an SMT-LIB symbol.
 * SMT-LIB unquoted symbols are restricted to ASCII.
 *
 * The conversion is performed in a single pass over the NFD-normalized input:
 *  1. The input is normalized to NFD, so accented letters are split into a base letter
 *     followed by combining marks (e.g., é → e + U+0301, Å → A + U+030A).
 *  2. Every character that has an entry in [ASCII_FOLD_MAP] is replaced by its ASCII
 *     spelling (e.g., Ø→O, Æ→AE, ß→ss, þ→th, Ł→L).
 *  3. Every other character is kept only if it is ASCII; combining marks and characters
 *     from non-Latin scripts are dropped.
 *
 * Normalizing before folding matters for precomposed letters whose decomposition yields a
 * non-decomposable base, such as Ǿ (Ø + acute) or Ǽ (Æ + acute): they become "O" and "AE"
 * instead of being dropped entirely.
 *
 * Note that the result may be empty (e.g., when [name] contains only non-Latin characters);
 * this function does not otherwise validate the result as an identifier.
 *
 * @param name the string to transliterate
 * @return [name] with all non-ASCII characters folded to ASCII or removed
 */
fun convertToAscii(name: String): String {
    // Decompose first so that bases hidden inside precomposed characters (e.g. the Ø in Ǿ)
    // become visible to the fold map.
    val decomposed = java.text.Normalizer.normalize(name, java.text.Normalizer.Form.NFD)
    return buildString(decomposed.length) {
        for (ch in decomposed) {
            val folded = ASCII_FOLD_MAP[ch]
            when {
                folded != null -> append(folded)
                ch <= '\u007F' -> append(ch)
                // Anything else (combining marks, non-Latin scripts, surrogates) is dropped.
            }
        }
    }
}
105+
/**
 * Explicit ASCII replacements for Latin-script characters that NFD normalization cannot
 * reduce to an ASCII base letter.
 *
 * Holds a selection of characters from the Unicode Latin-1 Supplement and
 * Latin Extended-A/B blocks. Characters that DO decompose under NFD (e.g., Ä, ö, å, é, ü, ñ)
 * are handled by the NFD step in [convertToAscii] and generally need no entry here.
 *
 * Every replacement value is pure ASCII.
 */
private val ASCII_FOLD_MAP: Map<Char, String> = mapOf(
    // Latin-1 Supplement
    'Æ' to "AE", 'æ' to "ae",           // AE ligature (Danish, Norwegian, Old English)
    'Ð' to "D", 'ð' to "d",             // Eth (Icelandic, Old English)
    'Ø' to "O", 'ø' to "o",             // O with stroke (Danish, Norwegian)
    'Þ' to "TH", 'þ' to "th",           // Thorn (Icelandic, Old English)
    'ß' to "ss",                        // Sharp S (German)
    // Latin Extended-A
    'Ħ' to "H", 'ħ' to "h",             // H with stroke (Maltese)
    'ı' to "i",                         // Dotless i (Turkish, Azerbaijani)
    // Written as escapes because the ligature glyphs look identical to the two-letter sequences.
    '\u0132' to "IJ", '\u0133' to "ij", // IJ ligature U+0132 / U+0133 (Dutch)
    'ĸ' to "k",                         // Kra (Greenlandic)
    'Ł' to "L", 'ł' to "l",             // L with stroke (Polish, Croatian, Sorbian)
    'Ŋ' to "N", 'ŋ' to "n",             // Eng (Sami, African languages)
    'Œ' to "OE", 'œ' to "oe",           // OE ligature (French)
    'Ŧ' to "T", 'ŧ' to "t",             // T with stroke (Sami)
    // Latin Extended-B
    'ƀ' to "b", 'Ƀ' to "B",             // B with stroke
    'Ɓ' to "B",                         // B with hook
    'Ƈ' to "C", 'ƈ' to "c",             // C with hook
    'Ɗ' to "D",                         // D with hook
    'ƌ' to "d",                         // D with topbar
    'Ƒ' to "F", 'ƒ' to "f",             // F with hook
    'Ɠ' to "G",                         // G with hook
    'Ɨ' to "I",                         // I with stroke
    'Ƙ' to "K", 'ƙ' to "k",             // K with hook
    'ƚ' to "l",                         // L with bar
    'Ɲ' to "N", 'ƞ' to "n",             // N with hook / N with long right leg
    'Ƥ' to "P", 'ƥ' to "p",             // P with hook
    'ƫ' to "t",                         // T with palatal hook
    'Ƭ' to "T", 'ƭ' to "t",             // T with hook
    'Ʈ' to "T",                         // T with retroflex hook
    // Ư/ư decompose under NFD (U/u + combining horn), so these two entries are redundant
    // with the NFD step; they are kept because they are harmless.
    'Ư' to "U", 'ư' to "u",             // U with horn (Vietnamese)
    'Ʋ' to "V",                         // V with hook
    'Ƴ' to "Y", 'ƴ' to "y",             // Y with hook
    'Ƶ' to "Z", 'ƶ' to "z",             // Z with stroke
    'Ǝ' to "E", 'ǝ' to "e",             // Reversed E / turned e
    'Ɵ' to "O",                         // O with middle tilde
    'Ȼ' to "C", 'ȼ' to "c",             // C with stroke
    'Ɇ' to "E", 'ɇ' to "e",             // E with stroke
    'Ɉ' to "J", 'ɉ' to "j",             // J with stroke
    'Ɋ' to "Q", 'ɋ' to "q",             // Q with hook tail
    'Ɍ' to "R", 'ɍ' to "r",             // R with stroke
    'Ɏ' to "Y", 'ɏ' to "y",             // Y with stroke
)
102158}
0 commit comments