fix: hoist sanitize mapping to module constant, fix U+001E em-dash mismap

Avijit-Microsoft · web-flow · commit e68f85f02ce5 · 2026-05-22T17:11:07.000+05:30
diff --git a/infra/scripts/index_scripts/content_understanding_client.py b/infra/scripts/index_scripts/content_understanding_client.py
@@ -6,28 +6,31 @@
 from pathlib import Path
 
 
-def sanitize_cu_output(text):
-    """Replace non-printable control characters that may appear in CU output.
+# Module-level constant: corrupted control-char → intended Unicode mapping.
+# The CU analyzeBinary API (v2025-11-01) intermittently strips the high byte
+# from U+20xx punctuation (e.g. U+2019 → U+0019), leaving a control character.
+# This dict maps each known corrupted character back to its intended one.
+_CU_REPLACEMENTS = {
+    '\u0014': '\u2014',  # em dash
+    '\u0019': '\u2019',  # right single quotation mark
+    '\u001a': '\u201a',  # single low-9 quotation mark
+    '\u001c': '\u201c',  # left double quotation mark
+    '\u001d': '\u201d',  # right double quotation mark
+    '\u001e': '\u201e',  # double low-9 quotation mark
+}
+_CU_BAD_CHARS = set(_CU_REPLACEMENTS.keys())
+
 
-    The Content Understanding analyzeBinary API (v2025-11-01) intermittently
-    corrupts Unicode characters by stripping the high byte (e.g. U+2019 becomes
-    U+0019).  This function maps each known corrupted control character back to
-    its intended Unicode equivalent.  The mapping is based on empirical
-    observation of characters corrupted in a single high-byte-stripping pass
-    over the U+201x range.
+def sanitize_cu_output(text):
+    """Replace corrupted control characters that CU may emit.
 
-    The fix is zero-cost when CU output is already correct.
+    Returns *text* unchanged (no-op) when it is empty or None, or when none
+    of the known corrupted characters are present.  The replacement mapping
+    is allocated once at module level to avoid per-call overhead.
     """
-    if not text:
+    if not text or _CU_BAD_CHARS.isdisjoint(text):
         return text
-    replacements = {
-        '\u0019': '\u2019',  # right single quotation mark
-        '\u001a': '\u201a',  # single low-9 quotation mark
-        '\u001c': '\u201c',  # left double quotation mark
-        '\u001d': '\u201d',  # right double quotation mark
-        '\u001e': '\u2014',  # em dash (empirically observed)
-    }
-    for bad, good in replacements.items():
+    for bad, good in _CU_REPLACEMENTS.items():
         text = text.replace(bad, good)
     return text