Skip to content

Commit 9cb5dcf

Browse files
authored
Merge branch 'main' into dependabot/pip/certifi-2026.2.25
2 parents 57459e9 + b4b2ac1 commit 9cb5dcf

File tree

9 files changed

+486
-346
lines changed

9 files changed

+486
-346
lines changed

lib/charset_normalizer/api.py

Lines changed: 21 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -10,15 +10,20 @@
1010
mb_encoding_languages,
1111
merge_coherence_ratios,
1212
)
13-
from .constant import IANA_SUPPORTED, TOO_BIG_SEQUENCE, TOO_SMALL_SEQUENCE, TRACE
13+
from .constant import (
14+
IANA_SUPPORTED,
15+
IANA_SUPPORTED_SIMILAR,
16+
TOO_BIG_SEQUENCE,
17+
TOO_SMALL_SEQUENCE,
18+
TRACE,
19+
)
1420
from .md import mess_ratio
1521
from .models import CharsetMatch, CharsetMatches
1622
from .utils import (
1723
any_specified_encoding,
1824
cut_sequence_chunks,
1925
iana_name,
2026
identify_sig_or_bom,
21-
is_cp_similar,
2227
is_multi_byte_encoding,
2328
should_strip_sig_or_bom,
2429
)
@@ -78,7 +83,7 @@ def from_bytes(
7883
logger.debug("Encoding detection on empty bytes, assuming utf_8 intention.")
7984
if explain: # Defensive: ensure exit path clean handler
8085
logger.removeHandler(explain_handler)
81-
logger.setLevel(previous_logger_level or logging.WARNING)
86+
logger.setLevel(previous_logger_level)
8287
return CharsetMatches([CharsetMatch(sequences, "utf_8", 0.0, False, [], "")])
8388

8489
if cp_isolation is not None:
@@ -152,6 +157,7 @@ def from_bytes(
152157
tested: set[str] = set()
153158
tested_but_hard_failure: list[str] = []
154159
tested_but_soft_failure: list[str] = []
160+
soft_failure_skip: set[str] = set()
155161

156162
fallback_ascii: CharsetMatch | None = None
157163
fallback_u8: CharsetMatch | None = None
@@ -210,6 +216,16 @@ def from_bytes(
210216
)
211217
continue
212218

219+
# Skip encodings similar to ones that already soft-failed (high mess ratio).
220+
# Checked BEFORE the expensive decode attempt.
221+
if encoding_iana in soft_failure_skip:
222+
logger.log(
223+
TRACE,
224+
"%s is deemed too similar to a code page that was already considered unsuited. Continuing!",
225+
encoding_iana,
226+
)
227+
continue
228+
213229
try:
214230
is_multi_byte_decoder: bool = is_multi_byte_encoding(encoding_iana)
215231
except (ModuleNotFoundError, ImportError):
@@ -250,22 +266,6 @@ def from_bytes(
250266
tested_but_hard_failure.append(encoding_iana)
251267
continue
252268

253-
similar_soft_failure_test: bool = False
254-
255-
for encoding_soft_failed in tested_but_soft_failure:
256-
if is_cp_similar(encoding_iana, encoding_soft_failed):
257-
similar_soft_failure_test = True
258-
break
259-
260-
if similar_soft_failure_test:
261-
logger.log(
262-
TRACE,
263-
"%s is deemed too similar to code page %s and was consider unsuited already. Continuing!",
264-
encoding_iana,
265-
encoding_soft_failed,
266-
)
267-
continue
268-
269269
r_ = range(
270270
0 if not bom_or_sig_available else len(sig_payload),
271271
length,
@@ -358,6 +358,8 @@ def from_bytes(
358358
mean_mess_ratio: float = sum(md_ratios) / len(md_ratios) if md_ratios else 0.0
359359
if mean_mess_ratio >= threshold or early_stop_count >= max_chunk_gave_up:
360360
tested_but_soft_failure.append(encoding_iana)
361+
if encoding_iana in IANA_SUPPORTED_SIMILAR:
362+
soft_failure_skip.update(IANA_SUPPORTED_SIMILAR[encoding_iana])
361363
logger.log(
362364
TRACE,
363365
"%s was excluded because of initial chaos probing. Gave up %i time(s). "

lib/charset_normalizer/cd.py

Lines changed: 63 additions & 37 deletions
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,8 @@
1212
LANGUAGE_SUPPORTED_COUNT,
1313
TOO_SMALL_SEQUENCE,
1414
ZH_NAMES,
15+
_FREQUENCIES_SET,
16+
_FREQUENCIES_RANK,
1517
)
1618
from .md import is_suspiciously_successive_range
1719
from .models import CoherenceMatches
@@ -142,6 +144,7 @@ def alphabet_languages(
142144
"""
143145
languages: list[tuple[str, float]] = []
144146

147+
characters_set: frozenset[str] = frozenset(characters)
145148
source_have_accents = any(is_accentuated(character) for character in characters)
146149

147150
for language, language_characters in FREQUENCIES.items():
@@ -155,9 +158,7 @@ def alphabet_languages(
155158

156159
character_count: int = len(language_characters)
157160

158-
character_match_count: int = len(
159-
[c for c in language_characters if c in characters]
160-
)
161+
character_match_count: int = len(_FREQUENCIES_SET[language] & characters_set)
161162

162163
ratio: float = character_match_count / character_count
163164

@@ -181,23 +182,36 @@ def characters_popularity_compare(
181182
raise ValueError(f"{language} not available")
182183

183184
character_approved_count: int = 0
184-
FREQUENCIES_language_set = set(FREQUENCIES[language])
185+
frequencies_language_set: frozenset[str] = _FREQUENCIES_SET[language]
186+
lang_rank: dict[str, int] = _FREQUENCIES_RANK[language]
185187

186188
ordered_characters_count: int = len(ordered_characters)
187189
target_language_characters_count: int = len(FREQUENCIES[language])
188190

189191
large_alphabet: bool = target_language_characters_count > 26
190192

193+
expected_projection_ratio: float = (
194+
target_language_characters_count / ordered_characters_count
195+
)
196+
197+
# Pre-built rank dict for ordered_characters (avoids repeated list slicing).
198+
ordered_rank: dict[str, int] = {
199+
char: rank for rank, char in enumerate(ordered_characters)
200+
}
201+
202+
# Pre-compute characters common to both orderings.
203+
# Avoids repeated `c in ordered_rank` dict lookups in the inner counts.
204+
common_chars: list[tuple[int, int]] = [
205+
(lr, ordered_rank[c]) for c, lr in lang_rank.items() if c in ordered_rank
206+
]
207+
191208
for character, character_rank in zip(
192209
ordered_characters, range(0, ordered_characters_count)
193210
):
194-
if character not in FREQUENCIES_language_set:
211+
if character not in frequencies_language_set:
195212
continue
196213

197-
character_rank_in_language: int = FREQUENCIES[language].index(character)
198-
expected_projection_ratio: float = (
199-
target_language_characters_count / ordered_characters_count
200-
)
214+
character_rank_in_language: int = lang_rank[character]
201215
character_rank_projection: int = int(character_rank * expected_projection_ratio)
202216

203217
if (
@@ -214,35 +228,33 @@ def characters_popularity_compare(
214228
character_approved_count += 1
215229
continue
216230

217-
characters_before_source: list[str] = FREQUENCIES[language][
218-
0:character_rank_in_language
219-
]
220-
characters_after_source: list[str] = FREQUENCIES[language][
221-
character_rank_in_language:
222-
]
223-
characters_before: list[str] = ordered_characters[0:character_rank]
224-
characters_after: list[str] = ordered_characters[character_rank:]
225-
226-
before_match_count: int = len(
227-
set(characters_before) & set(characters_before_source)
231+
# Count how many characters appear "before" in both orderings,
232+
# and how many appear "at or after" in both orderings.
233+
before_match_count: int = sum(
234+
1
235+
for lr, orr in common_chars
236+
if lr < character_rank_in_language and orr < character_rank
228237
)
229238

230-
after_match_count: int = len(
231-
set(characters_after) & set(characters_after_source)
239+
after_len: int = target_language_characters_count - character_rank_in_language
240+
after_match_count: int = sum(
241+
1
242+
for lr, orr in common_chars
243+
if lr >= character_rank_in_language and orr >= character_rank
232244
)
233245

234-
if len(characters_before_source) == 0 and before_match_count <= 4:
246+
if character_rank_in_language == 0 and before_match_count <= 4:
235247
character_approved_count += 1
236248
continue
237249

238-
if len(characters_after_source) == 0 and after_match_count <= 4:
250+
if after_len == 0 and after_match_count <= 4:
239251
character_approved_count += 1
240252
continue
241253

242254
if (
243-
before_match_count / len(characters_before_source) >= 0.4
244-
or after_match_count / len(characters_after_source) >= 0.4
245-
):
255+
character_rank_in_language > 0
256+
and before_match_count / character_rank_in_language >= 0.4
257+
) or (after_len > 0 and after_match_count / after_len >= 0.4):
246258
character_approved_count += 1
247259
continue
248260

@@ -255,7 +267,11 @@ def alpha_unicode_split(decoded_sequence: str) -> list[str]:
255267
Ex. a text containing English/Latin with a bit of Hebrew will return two items in the resulting list;
256268
One containing the latin letters and the other hebrew.
257269
"""
258-
layers: dict[str, str] = {}
270+
layers: dict[str, list[str]] = {}
271+
272+
# Fast path: track single-layer key to skip dict iteration for single-script text.
273+
single_layer_key: str | None = None
274+
multi_layer: bool = False
259275

260276
for character in decoded_sequence:
261277
if character.isalpha() is False:
@@ -268,24 +284,34 @@ def alpha_unicode_split(decoded_sequence: str) -> list[str]:
268284

269285
layer_target_range: str | None = None
270286

271-
for discovered_range in layers:
287+
if multi_layer:
288+
for discovered_range in layers:
289+
if (
290+
is_suspiciously_successive_range(discovered_range, character_range)
291+
is False
292+
):
293+
layer_target_range = discovered_range
294+
break
295+
elif single_layer_key is not None:
272296
if (
273-
is_suspiciously_successive_range(discovered_range, character_range)
297+
is_suspiciously_successive_range(single_layer_key, character_range)
274298
is False
275299
):
276-
layer_target_range = discovered_range
277-
break
300+
layer_target_range = single_layer_key
278301

279302
if layer_target_range is None:
280303
layer_target_range = character_range
281304

282305
if layer_target_range not in layers:
283-
layers[layer_target_range] = character.lower()
284-
continue
306+
layers[layer_target_range] = []
307+
if single_layer_key is None:
308+
single_layer_key = layer_target_range
309+
else:
310+
multi_layer = True
285311

286-
layers[layer_target_range] += character.lower()
312+
layers[layer_target_range].append(character)
287313

288-
return list(layers.values())
314+
return ["".join(chars).lower() for chars in layers.values()]
289315

290316

291317
def merge_coherence_ratios(results: list[CoherenceMatches]) -> CoherenceMatches:
@@ -366,7 +392,7 @@ def coherence_ratio(
366392
sequence_frequencies: TypeCounter[str] = Counter(layer)
367393
most_common = sequence_frequencies.most_common()
368394

369-
character_count: int = sum(o for c, o in most_common)
395+
character_count: int = len(layer)
370396

371397
if character_count <= TOO_SMALL_SEQUENCE:
372398
continue

lib/charset_normalizer/cli/__main__.py

Lines changed: 10 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -15,36 +15,18 @@
1515

1616

1717
def query_yes_no(question: str, default: str = "yes") -> bool:
18-
"""Ask a yes/no question via input() and return their answer.
19-
20-
"question" is a string that is presented to the user.
21-
"default" is the presumed answer if the user just hits <Enter>.
22-
It must be "yes" (the default), "no" or None (meaning
23-
an answer is required of the user).
24-
25-
The "answer" return value is True for "yes" or False for "no".
26-
27-
Credit goes to (c) https://stackoverflow.com/questions/3041986/apt-command-line-interface-like-yes-no-input
28-
"""
29-
valid = {"yes": True, "y": True, "ye": True, "no": False, "n": False}
30-
if default is None:
31-
prompt = " [y/n] "
32-
elif default == "yes":
33-
prompt = " [Y/n] "
34-
elif default == "no":
35-
prompt = " [y/N] "
36-
else:
37-
raise ValueError("invalid default answer: '%s'" % default)
18+
"""Ask a yes/no question via input() and return the answer as a bool."""
19+
prompt = " [Y/n] " if default == "yes" else " [y/N] "
3820

3921
while True:
40-
sys.stdout.write(question + prompt)
41-
choice = input().lower()
42-
if default is not None and choice == "":
43-
return valid[default]
44-
elif choice in valid:
45-
return valid[choice]
46-
else:
47-
sys.stdout.write("Please respond with 'yes' or 'no' (or 'y' or 'n').\n")
22+
choice = input(question + prompt).strip().lower()
23+
if not choice:
24+
return default == "yes"
25+
if choice in ("y", "yes"):
26+
return True
27+
if choice in ("n", "no"):
28+
return False
29+
print("Please respond with 'y' or 'n'.")
4830

4931

5032
class FileType:

0 commit comments

Comments
 (0)