1212 LANGUAGE_SUPPORTED_COUNT ,
1313 TOO_SMALL_SEQUENCE ,
1414 ZH_NAMES ,
15+ _FREQUENCIES_SET ,
16+ _FREQUENCIES_RANK ,
1517)
1618from .md import is_suspiciously_successive_range
1719from .models import CoherenceMatches
@@ -142,6 +144,7 @@ def alphabet_languages(
142144 """
143145 languages : list [tuple [str , float ]] = []
144146
147+ characters_set : frozenset [str ] = frozenset (characters )
145148 source_have_accents = any (is_accentuated (character ) for character in characters )
146149
147150 for language , language_characters in FREQUENCIES .items ():
@@ -155,9 +158,7 @@ def alphabet_languages(
155158
156159 character_count : int = len (language_characters )
157160
158- character_match_count : int = len (
159- [c for c in language_characters if c in characters ]
160- )
161+ character_match_count : int = len (_FREQUENCIES_SET [language ] & characters_set )
161162
162163 ratio : float = character_match_count / character_count
163164
@@ -181,23 +182,36 @@ def characters_popularity_compare(
181182 raise ValueError (f"{ language } not available" )
182183
183184 character_approved_count : int = 0
184- FREQUENCIES_language_set = set (FREQUENCIES [language ])
185+ frequencies_language_set : frozenset [str ] = _FREQUENCIES_SET [language ]
186+ lang_rank : dict [str , int ] = _FREQUENCIES_RANK [language ]
185187
186188 ordered_characters_count : int = len (ordered_characters )
187189 target_language_characters_count : int = len (FREQUENCIES [language ])
188190
189191 large_alphabet : bool = target_language_characters_count > 26
190192
193+ expected_projection_ratio : float = (
194+ target_language_characters_count / ordered_characters_count
195+ )
196+
197+ # Pre-built rank dict for ordered_characters (avoids repeated list slicing).
198+ ordered_rank : dict [str , int ] = {
199+ char : rank for rank , char in enumerate (ordered_characters )
200+ }
201+
202+ # Pre-compute characters common to both orderings.
203+ # Avoids repeated `c in ordered_rank` dict lookups in the inner counts.
204+ common_chars : list [tuple [int , int ]] = [
205+ (lr , ordered_rank [c ]) for c , lr in lang_rank .items () if c in ordered_rank
206+ ]
207+
191208 for character , character_rank in zip (
192209 ordered_characters , range (0 , ordered_characters_count )
193210 ):
194- if character not in FREQUENCIES_language_set :
211+ if character not in frequencies_language_set :
195212 continue
196213
197- character_rank_in_language : int = FREQUENCIES [language ].index (character )
198- expected_projection_ratio : float = (
199- target_language_characters_count / ordered_characters_count
200- )
214+ character_rank_in_language : int = lang_rank [character ]
201215 character_rank_projection : int = int (character_rank * expected_projection_ratio )
202216
203217 if (
@@ -214,35 +228,33 @@ def characters_popularity_compare(
214228 character_approved_count += 1
215229 continue
216230
217- characters_before_source : list [str ] = FREQUENCIES [language ][
218- 0 :character_rank_in_language
219- ]
220- characters_after_source : list [str ] = FREQUENCIES [language ][
221- character_rank_in_language :
222- ]
223- characters_before : list [str ] = ordered_characters [0 :character_rank ]
224- characters_after : list [str ] = ordered_characters [character_rank :]
225-
226- before_match_count : int = len (
227- set (characters_before ) & set (characters_before_source )
231+ # Count how many characters appear "before" in both orderings,
232+ # and how many appear "at or after" in both orderings.
233+ before_match_count : int = sum (
234+ 1
235+ for lr , orr in common_chars
236+ if lr < character_rank_in_language and orr < character_rank
228237 )
229238
230- after_match_count : int = len (
231- set (characters_after ) & set (characters_after_source )
239+ after_len : int = target_language_characters_count - character_rank_in_language
240+ after_match_count : int = sum (
241+ 1
242+ for lr , orr in common_chars
243+ if lr >= character_rank_in_language and orr >= character_rank
232244 )
233245
234- if len ( characters_before_source ) == 0 and before_match_count <= 4 :
246+ if character_rank_in_language == 0 and before_match_count <= 4 :
235247 character_approved_count += 1
236248 continue
237249
238- if len ( characters_after_source ) == 0 and after_match_count <= 4 :
250+ if after_len == 0 and after_match_count <= 4 :
239251 character_approved_count += 1
240252 continue
241253
242254 if (
243- before_match_count / len ( characters_before_source ) >= 0.4
244- or after_match_count / len ( characters_after_source ) >= 0.4
245- ):
255+ character_rank_in_language > 0
256+ and before_match_count / character_rank_in_language >= 0.4
257+ ) or ( after_len > 0 and after_match_count / after_len >= 0.4 ) :
246258 character_approved_count += 1
247259 continue
248260
@@ -255,7 +267,11 @@ def alpha_unicode_split(decoded_sequence: str) -> list[str]:
255267 Ex. a text containing English/Latin with a bit of Hebrew will return two items in the resulting list;
256268 one containing the Latin letters and the other the Hebrew letters.
257269 """
258- layers : dict [str , str ] = {}
270+ layers : dict [str , list [str ]] = {}
271+
272+ # Fast path: track single-layer key to skip dict iteration for single-script text.
273+ single_layer_key : str | None = None
274+ multi_layer : bool = False
259275
260276 for character in decoded_sequence :
261277 if character .isalpha () is False :
@@ -268,24 +284,34 @@ def alpha_unicode_split(decoded_sequence: str) -> list[str]:
268284
269285 layer_target_range : str | None = None
270286
271- for discovered_range in layers :
287+ if multi_layer :
288+ for discovered_range in layers :
289+ if (
290+ is_suspiciously_successive_range (discovered_range , character_range )
291+ is False
292+ ):
293+ layer_target_range = discovered_range
294+ break
295+ elif single_layer_key is not None :
272296 if (
273- is_suspiciously_successive_range (discovered_range , character_range )
297+ is_suspiciously_successive_range (single_layer_key , character_range )
274298 is False
275299 ):
276- layer_target_range = discovered_range
277- break
300+ layer_target_range = single_layer_key
278301
279302 if layer_target_range is None :
280303 layer_target_range = character_range
281304
282305 if layer_target_range not in layers :
283- layers [layer_target_range ] = character .lower ()
284- continue
306+ layers [layer_target_range ] = []
307+ if single_layer_key is None :
308+ single_layer_key = layer_target_range
309+ else :
310+ multi_layer = True
285311
286- layers [layer_target_range ] += character . lower ( )
312+ layers [layer_target_range ]. append ( character )
287313
288- return list ( layers .values ())
314+ return [ "" . join ( chars ). lower () for chars in layers .values ()]
289315
290316
291317def merge_coherence_ratios (results : list [CoherenceMatches ]) -> CoherenceMatches :
@@ -366,7 +392,7 @@ def coherence_ratio(
366392 sequence_frequencies : TypeCounter [str ] = Counter (layer )
367393 most_common = sequence_frequencies .most_common ()
368394
369- character_count : int = sum ( o for c , o in most_common )
395+ character_count : int = len ( layer )
370396
371397 if character_count <= TOO_SMALL_SEQUENCE :
372398 continue
0 commit comments