# Corrupted control character -> intended Unicode character.
#
# The Content Understanding (CU) analyzeBinary API (v2025-11-01) intermittently
# strips the high byte from Unicode characters (e.g. U+2019 -> U+0019), leaving
# C0 control characters where typographic punctuation was intended. Each key is
# the control character that results from stripping the high byte of its value.
#
# U+0013 and U+0018 are included because the same stripping applied to the en
# dash (U+2013) and the left single quotation mark (U+2018) produces them; the
# remaining entries are the characters the original mapping already covered.
_CU_REPLACEMENTS = {
    '\u0013': '\u2013',  # en dash
    '\u0014': '\u2014',  # em dash
    '\u0018': '\u2018',  # left single quotation mark
    '\u0019': '\u2019',  # right single quotation mark
    '\u001a': '\u201a',  # single low-9 quotation mark
    '\u001c': '\u201c',  # left double quotation mark
    '\u001d': '\u201d',  # right double quotation mark
    '\u001e': '\u201e',  # double low-9 quotation mark
}
# Immutable set of the corrupted characters, used for the fast "nothing to
# fix" check in sanitize_cu_output().
_CU_BAD_CHARS = frozenset(_CU_REPLACEMENTS)
# Translation table built once at import time so every call is a single
# str.translate pass instead of one str.replace scan per mapping entry.
_CU_TRANSLATION = str.maketrans(_CU_REPLACEMENTS)


def sanitize_cu_output(text):
    """Replace corrupted control characters that CU may emit.

    Args:
        text: Text returned by Content Understanding. Falsy values
            (``None``, ``""``) are accepted and returned as-is.

    Returns:
        *text* itself when it is falsy or contains none of the characters
        in ``_CU_REPLACEMENTS``; otherwise a new string in which every such
        character has been replaced by its intended Unicode equivalent.
        All other characters (including ordinary whitespace controls such
        as tab and newline) are left untouched.
    """
    # Guard first so clean input is returned as the same object, with no
    # new string allocated.
    if not text or _CU_BAD_CHARS.isdisjoint(text):
        return text
    return text.translate(_CU_TRANSLATION)