refactor: move sanitize_cu_output to shared module and fix \u001a mapping

Copilot · Yamini-Microsoft · web-flow · commit f73b6ede149f · 2026-05-20T09:43:54.000Z
Agent-Logs-Url: https://github.com/microsoft/Conversation-Knowledge-Mining-Solution-Accelerator/sessions/50f2a621-4990-4aca-8dd8-cfa5ea31ce6f Co-authored-by: Yamini-Microsoft <191316559+Yamini-Microsoft@users.noreply.github.com>
diff --git a/infra/scripts/index_scripts/03_cu_process_data_text.py b/infra/scripts/index_scripts/03_cu_process_data_text.py
@@ -31,7 +31,7 @@
 
 from agent_framework.azure import AzureAIProjectAgentProvider
 
-from content_understanding_client import AzureContentUnderstandingClient
+from content_understanding_client import AzureContentUnderstandingClient, sanitize_cu_output
 
 # Get parameters from command line
 p = argparse.ArgumentParser()
@@ -314,35 +314,14 @@ def create_tables():
     conn.commit()
 
 
-create_tables()
-
-def _sanitize_cu_output(text):
-    """Replace non-printable control characters that may appear in CU output.
 
-    When JSON files containing Unicode escape sequences (e.g. \\u2019) are
-    sent as raw binary via the analyzeBinary endpoint, the returned text
-    occasionally contains unexpected control characters instead of the
-    intended Unicode characters. This is a defensive fix that maps those
-    control characters back to their likely intended values.
-    """
-    if not text:
-        return text
-    replacements = {
-        '\u0019': '\u2019',  # right single quotation mark
-        '\u001a': '\u2019',  # right single quotation mark
-        '\u001c': '\u201c',  # left double quotation mark
-        '\u001d': '\u201d',  # right double quotation mark
-        '\u001e': '\u2014',  # em dash
-    }
-    for bad, good in replacements.items():
-        text = text.replace(bad, good)
-    return text
+create_tables()
 
 
 def get_field_value(fields, field_name, default=""):
     field = fields.get(field_name, {})
     value = field.get('valueString', default)
-    return _sanitize_cu_output(value)
+    return sanitize_cu_output(value)
 
 # Process files and insert into DB and Search
 async def process_files():
diff --git a/infra/scripts/index_scripts/04_cu_process_custom_data.py b/infra/scripts/index_scripts/04_cu_process_custom_data.py
@@ -45,7 +45,7 @@
 
 from agent_framework.azure import AzureAIProjectAgentProvider
 
-from content_understanding_client import AzureContentUnderstandingClient
+from content_understanding_client import AzureContentUnderstandingClient, sanitize_cu_output
 
 # Constants and configuration
 FILE_SYSTEM_CLIENT_NAME = "data"
@@ -368,35 +368,14 @@ def create_tables():
     conn.commit()
 
 
-create_tables()
-
-def _sanitize_cu_output(text):
-    """Replace non-printable control characters that may appear in CU output.
 
-    When JSON files containing Unicode escape sequences (e.g. \\u2019) are
-    sent as raw binary via the analyzeBinary endpoint, the returned text
-    occasionally contains unexpected control characters instead of the
-    intended Unicode characters. This is a defensive fix that maps those
-    control characters back to their likely intended values.
-    """
-    if not text:
-        return text
-    replacements = {
-        '\u0019': '\u2019',  # right single quotation mark
-        '\u001a': '\u2019',  # right single quotation mark
-        '\u001c': '\u201c',  # left double quotation mark
-        '\u001d': '\u201d',  # right double quotation mark
-        '\u001e': '\u2014',  # em dash
-    }
-    for bad, good in replacements.items():
-        text = text.replace(bad, good)
-    return text
+create_tables()
 
 
 def get_field_value(fields, field_name, default=""):
     field = fields.get(field_name, {})
     value = field.get('valueString', default)
-    return _sanitize_cu_output(value)
+    return sanitize_cu_output(value)
 
 # Process files and insert into DB and Search
 async def process_files():
diff --git a/infra/scripts/index_scripts/content_understanding_client.py b/infra/scripts/index_scripts/content_understanding_client.py
@@ -6,6 +6,32 @@
 from pathlib import Path
 
 
+def sanitize_cu_output(text):
+    """Replace non-printable control characters that may appear in CU output.
+
+    The Content Understanding analyzeBinary API (v2025-11-01) intermittently
+    corrupts Unicode characters by stripping the high byte (e.g. U+2019 becomes
+    U+0019).  This function maps each known corrupted control character back to
+    its intended Unicode equivalent.  The mapping was observed empirically: it
+    restores the stripped high byte (U+001x -> U+201x) in every case except
+    U+001E, which is mapped to U+2014 (em dash) rather than U+201E.
+
+    When CU output contains none of these characters, it is returned unchanged.
+    """
+    if not text:
+        return text
+    replacements = {
+        '\u0019': '\u2019',  # right single quotation mark
+        '\u001a': '\u201a',  # single low-9 quotation mark
+        '\u001c': '\u201c',  # left double quotation mark
+        '\u001d': '\u201d',  # right double quotation mark
+        '\u001e': '\u2014',  # em dash (empirically observed)
+    }
+    for bad, good in replacements.items():
+        text = text.replace(bad, good)
+    return text
+
+
 class AzureContentUnderstandingClient:
     def __init__(
         self,