Skip to content

Commit 02e165c

Browse files
committed
Strings::toAscii - simplified process when Transliterate is available
1 parent d6cd63d commit 02e165c

1 file changed

Lines changed: 38 additions & 25 deletions

File tree

src/Utils/Strings.php

Lines changed: 38 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -150,34 +150,47 @@ public static function toAscii(string $s): string
150150
['"', '"', '"', "'", "'", "'", '^', 'Ya', 'ya', 'Yu', 'yu'],
151151
$s
152152
);
153-
// temporarily hide these characters to distinguish them from the garbage that iconv creates
154-
$s = strtr($s, '`\'"^~?', "\x01\x02\x03\x04\x05\x06");
155-
if ($transliterator !== null) {
153+
154+
if ($transliterator) {
156155
$s = $transliterator->transliterate($s);
157-
}
158-
if (ICONV_IMPL === 'glibc') {
159-
// glibc implementation is very limited. replace some characters directly
160-
$s = str_replace(
161-
["\u{BB}", "\u{AB}", "\u{2026}", "\u{2122}", "\u{A9}", "\u{AE}"], // » « … ™ © ®
162-
['>>', '<<', '...', 'TM', '(c)', '(R)'],
163-
$s
164-
);
165-
// transliterate the rest into Windows-1250 and then into ASCII, so most Eastern European characters are preserved
166-
$s = iconv('UTF-8', 'WINDOWS-1250//TRANSLIT//IGNORE', $s);
167-
$s = strtr($s, "\xa5\xa3\xbc\x8c\xa7\x8a\xaa\x8d\x8f\x8e\xaf\xb9\xb3\xbe\x9c\x9a\xba\x9d\x9f\x9e"
168-
. "\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3"
169-
. "\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8"
170-
. "\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf8\xf9\xfa\xfb\xfc\xfd\xfe"
171-
. "\x96\xa0\x8b\x97\x9b\xa6\xad\xb7",
172-
'ALLSSSSTZZZallssstzzzRAAAALCCCEEEEIIDDNNOOOOxRUUUUYTsraaaalccceeeeiiddnnooooruuuuyt- <->|-.');
173-
$s = self::pcre('preg_replace', ['#[^\x00-\x7F]++#', '', $s]);
174-
} else {
156+
if (ICONV_IMPL === 'glibc') {
157+
// temporarily hide ? characters to distinguish them from the garbage that iconv creates
158+
$s = strtr($s, '?', "\x01");
159+
}
160+
// use iconv because the transliterator leaves some characters as non-ASCII, e.g. → ʾ
175161
$s = iconv('UTF-8', 'ASCII//TRANSLIT//IGNORE', $s);
162+
if (ICONV_IMPL === 'glibc') {
163+
// remove garbage and restore ? characters
164+
$s = str_replace(['?', "\x01"], ['', '?'], $s);
165+
}
166+
} else {
167+
// temporarily hide these characters to distinguish them from the garbage that iconv creates
168+
$s = strtr($s, '`\'"^~?', "\x01\x02\x03\x04\x05\x06");
169+
if (ICONV_IMPL === 'glibc') {
170+
// glibc implementation is very limited. replace some characters directly
171+
$s = str_replace(
172+
["\u{BB}", "\u{AB}", "\u{2026}", "\u{2122}", "\u{A9}", "\u{AE}"], // » « … ™ © ®
173+
['>>', '<<', '...', 'TM', '(c)', '(R)'],
174+
$s
175+
);
176+
// transliterate the rest into Windows-1250 and then into ASCII, so most Eastern European characters are preserved
177+
$s = iconv('UTF-8', 'WINDOWS-1250//TRANSLIT//IGNORE', $s);
178+
$s = strtr($s, "\xa5\xa3\xbc\x8c\xa7\x8a\xaa\x8d\x8f\x8e\xaf\xb9\xb3\xbe\x9c\x9a\xba\x9d\x9f\x9e"
179+
. "\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3"
180+
. "\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8"
181+
. "\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf8\xf9\xfa\xfb\xfc\xfd\xfe"
182+
. "\x96\xa0\x8b\x97\x9b\xa6\xad\xb7",
183+
'ALLSSSSTZZZallssstzzzRAAAALCCCEEEEIIDDNNOOOOxRUUUUYTsraaaalccceeeeiiddnnooooruuuuyt- <->|-.');
184+
$s = self::pcre('preg_replace', ['#[^\x00-\x7F]++#', '', $s]);
185+
} else {
186+
$s = iconv('UTF-8', 'ASCII//TRANSLIT//IGNORE', $s);
187+
}
188+
// remove garbage that iconv creates during transliteration (eg Ý -> Y')
189+
$s = str_replace(['`', "'", '"', '^', '~', '?'], '', $s);
190+
// restore temporarily hidden characters
191+
$s = strtr($s, "\x01\x02\x03\x04\x05\x06", '`\'"^~?');
176192
}
177-
// remove garbage that iconv creates during transliteration (eg Ý -> Y')
178-
$s = str_replace(['`', "'", '"', '^', '~', '?'], '', $s);
179-
// restore temporarily hidden characters
180-
$s = strtr($s, "\x01\x02\x03\x04\x05\x06", '`\'"^~?');
193+
181194
return $s;
182195
}
183196

0 commit comments

Comments
 (0)