Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 21 additions & 18 deletions infra/scripts/index_scripts/content_understanding_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,28 +6,31 @@
from pathlib import Path


# Module-level constant: corrupted control-char -> intended Unicode mapping.
# The CU analyzeBinary API (v2025-11-01) intermittently strips the high byte
# from Unicode characters (e.g. U+2019 -> U+0019). This dict maps each known
# corrupted control character back to its intended equivalent.
# NOTE: every entry follows the same rule (U+00xx -> U+20xx); in particular
# U+001E maps to U+201E, and the em dash is recovered from U+0014.
_CU_REPLACEMENTS = {
    '\u0014': '\u2014',  # em dash
    '\u0019': '\u2019',  # right single quotation mark
    '\u001a': '\u201a',  # single low-9 quotation mark
    '\u001c': '\u201c',  # left double quotation mark
    '\u001d': '\u201d',  # right double quotation mark
    '\u001e': '\u201e',  # double low-9 quotation mark
}
# Immutable set of the corrupted characters, used for a cheap presence check.
_CU_BAD_CHARS = frozenset(_CU_REPLACEMENTS)
# Precomputed table so all substitutions happen in a single pass over the text.
_CU_TRANSLATION = str.maketrans(_CU_REPLACEMENTS)


def sanitize_cu_output(text):
    """Replace corrupted control characters that CU may emit.

    Each control character listed in ``_CU_REPLACEMENTS`` is swapped for the
    Unicode punctuation character it is mapped to. All other characters,
    including ordinary whitespace such as tabs and newlines, are untouched.

    Args:
        text: The string to sanitize. Falsy values (``None``, ``""``) are
            accepted and returned as-is.

    Returns:
        *text* itself (no copy) when it is falsy or contains none of the
        known corrupted characters; otherwise a new string with every
        corrupted character replaced.
    """
    # Fast path: nothing to fix, so hand back the very same object.
    if not text or _CU_BAD_CHARS.isdisjoint(text):
        return text
    # No replacement value is itself a key, so a single-pass translate gives
    # the same result as replacing each bad character in turn.
    return text.translate(_CU_TRANSLATION)

Expand Down