From 88b1868ae5e1a7af6b00bf7d470f6dd2785a2e89 Mon Sep 17 00:00:00 2001
From: Yamini-Microsoft <v-yamini3@microsoft.com>
Date: Tue, 19 May 2026 20:11:11 +0530
Subject: [PATCH 1/2] fix: sanitize CU output to prevent Unicode corruption in
 citations

The Content Understanding analyzeBinary API (v2025-11-01) intermittently
corrupts Unicode characters by stripping the high byte (e.g. U+2019 -> U+0019).
This causes apostrophes and quotes to render as box characters in the Citation Panel.

Added _sanitize_cu_output() to map known corrupted control characters back to
their intended Unicode equivalents after CU processing, before saving to Search/SQL.

Fixes AB#43310

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../index_scripts/03_cu_process_data_text.py  | 26 ++++++++++++++++++-
 .../04_cu_process_custom_data.py              | 26 ++++++++++++++++++-
 2 files changed, 50 insertions(+), 2 deletions(-)

diff --git a/infra/scripts/index_scripts/03_cu_process_data_text.py b/infra/scripts/index_scripts/03_cu_process_data_text.py
index 2ba6cb9b4..c8a4884ff 100644
--- a/infra/scripts/index_scripts/03_cu_process_data_text.py
+++ b/infra/scripts/index_scripts/03_cu_process_data_text.py
@@ -316,9 +316,33 @@ def create_tables():
 
 create_tables()
 
+def _sanitize_cu_output(text):
+    """Replace non-printable control characters that may appear in CU output.
+
+    When JSON files containing Unicode escape sequences (e.g. \\u2019) are
+    sent as raw binary via the analyzeBinary endpoint, the returned text
+    occasionally contains unexpected control characters instead of the
+    intended Unicode characters. This is a defensive fix that maps those
+    control characters back to their likely intended values.
+    """
+    if not text:
+        return text
+    replacements = {
+        '\u0019': '\u2019',  # right single quotation mark
+        '\u001a': '\u2019',  # right single quotation mark
+        '\u001c': '\u201c',  # left double quotation mark
+        '\u001d': '\u201d',  # right double quotation mark
+        '\u001e': '\u2014',  # em dash
+    }
+    for bad, good in replacements.items():
+        text = text.replace(bad, good)
+    return text
+
+
 def get_field_value(fields, field_name, default=""):
     field = fields.get(field_name, {})
-    return field.get('valueString', default)
+    value = field.get('valueString', default)
+    return _sanitize_cu_output(value)
 
 # Process files and insert into DB and Search
 async def process_files():
diff --git a/infra/scripts/index_scripts/04_cu_process_custom_data.py b/infra/scripts/index_scripts/04_cu_process_custom_data.py
index c7bc81a5e..6ce32f68f 100644
--- a/infra/scripts/index_scripts/04_cu_process_custom_data.py
+++ b/infra/scripts/index_scripts/04_cu_process_custom_data.py
@@ -370,9 +370,33 @@ def create_tables():
 
 create_tables()
 
+def _sanitize_cu_output(text):
+    """Replace non-printable control characters that may appear in CU output.
+
+    When JSON files containing Unicode escape sequences (e.g. \\u2019) are
+    sent as raw binary via the analyzeBinary endpoint, the returned text
+    occasionally contains unexpected control characters instead of the
+    intended Unicode characters. This is a defensive fix that maps those
+    control characters back to their likely intended values.
+    """
+    if not text:
+        return text
+    replacements = {
+        '\u0019': '\u2019',  # right single quotation mark
+        '\u001a': '\u2019',  # right single quotation mark
+        '\u001c': '\u201c',  # left double quotation mark
+        '\u001d': '\u201d',  # right double quotation mark
+        '\u001e': '\u2014',  # em dash
+    }
+    for bad, good in replacements.items():
+        text = text.replace(bad, good)
+    return text
+
+
 def get_field_value(fields, field_name, default=""):
     field = fields.get(field_name, {})
-    return field.get('valueString', default)
+    value = field.get('valueString', default)
+    return _sanitize_cu_output(value)
 
 # Process files and insert into DB and Search
 async def process_files():

From f73b6ede149fe35f548ac5a392c9e8879b79e9c2 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 20 May 2026 09:43:54 +0000
Subject: [PATCH 2/2] refactor: move sanitize_cu_output to shared module and
 fix \u001a mapping

Agent-Logs-Url: https://github.com/microsoft/Conversation-Knowledge-Mining-Solution-Accelerator/sessions/50f2a621-4990-4aca-8dd8-cfa5ea31ce6f

Co-authored-by: Yamini-Microsoft <191316559+Yamini-Microsoft@users.noreply.github.com>
---
 .../index_scripts/03_cu_process_data_text.py  | 27 +++----------------
 .../04_cu_process_custom_data.py              | 27 +++----------------
 .../content_understanding_client.py           | 26 ++++++++++++++++++
 3 files changed, 32 insertions(+), 48 deletions(-)

diff --git a/infra/scripts/index_scripts/03_cu_process_data_text.py b/infra/scripts/index_scripts/03_cu_process_data_text.py
index c8a4884ff..4cfaa5e40 100644
--- a/infra/scripts/index_scripts/03_cu_process_data_text.py
+++ b/infra/scripts/index_scripts/03_cu_process_data_text.py
@@ -31,7 +31,7 @@
 
 from agent_framework.azure import AzureAIProjectAgentProvider
 
-from content_understanding_client import AzureContentUnderstandingClient
+from content_understanding_client import AzureContentUnderstandingClient, sanitize_cu_output
 
 # Get parameters from command line
 p = argparse.ArgumentParser()
@@ -314,35 +314,14 @@ def create_tables():
     conn.commit()
 
 
-create_tables()
-
-def _sanitize_cu_output(text):
-    """Replace non-printable control characters that may appear in CU output.
 
-    When JSON files containing Unicode escape sequences (e.g. \\u2019) are
-    sent as raw binary via the analyzeBinary endpoint, the returned text
-    occasionally contains unexpected control characters instead of the
-    intended Unicode characters. This is a defensive fix that maps those
-    control characters back to their likely intended values.
-    """
-    if not text:
-        return text
-    replacements = {
-        '\u0019': '\u2019',  # right single quotation mark
-        '\u001a': '\u2019',  # right single quotation mark
-        '\u001c': '\u201c',  # left double quotation mark
-        '\u001d': '\u201d',  # right double quotation mark
-        '\u001e': '\u2014',  # em dash
-    }
-    for bad, good in replacements.items():
-        text = text.replace(bad, good)
-    return text
+create_tables()
 
 
 def get_field_value(fields, field_name, default=""):
     field = fields.get(field_name, {})
     value = field.get('valueString', default)
-    return _sanitize_cu_output(value)
+    return sanitize_cu_output(value)
 
 # Process files and insert into DB and Search
 async def process_files():
diff --git a/infra/scripts/index_scripts/04_cu_process_custom_data.py b/infra/scripts/index_scripts/04_cu_process_custom_data.py
index 6ce32f68f..3ac1166b8 100644
--- a/infra/scripts/index_scripts/04_cu_process_custom_data.py
+++ b/infra/scripts/index_scripts/04_cu_process_custom_data.py
@@ -45,7 +45,7 @@
 
 from agent_framework.azure import AzureAIProjectAgentProvider
 
-from content_understanding_client import AzureContentUnderstandingClient
+from content_understanding_client import AzureContentUnderstandingClient, sanitize_cu_output
 
 # Constants and configuration
 FILE_SYSTEM_CLIENT_NAME = "data"
@@ -368,35 +368,14 @@ def create_tables():
     conn.commit()
 
 
-create_tables()
-
-def _sanitize_cu_output(text):
-    """Replace non-printable control characters that may appear in CU output.
 
-    When JSON files containing Unicode escape sequences (e.g. \\u2019) are
-    sent as raw binary via the analyzeBinary endpoint, the returned text
-    occasionally contains unexpected control characters instead of the
-    intended Unicode characters. This is a defensive fix that maps those
-    control characters back to their likely intended values.
-    """
-    if not text:
-        return text
-    replacements = {
-        '\u0019': '\u2019',  # right single quotation mark
-        '\u001a': '\u2019',  # right single quotation mark
-        '\u001c': '\u201c',  # left double quotation mark
-        '\u001d': '\u201d',  # right double quotation mark
-        '\u001e': '\u2014',  # em dash
-    }
-    for bad, good in replacements.items():
-        text = text.replace(bad, good)
-    return text
+create_tables()
 
 
 def get_field_value(fields, field_name, default=""):
     field = fields.get(field_name, {})
     value = field.get('valueString', default)
-    return _sanitize_cu_output(value)
+    return sanitize_cu_output(value)
 
 # Process files and insert into DB and Search
 async def process_files():
diff --git a/infra/scripts/index_scripts/content_understanding_client.py b/infra/scripts/index_scripts/content_understanding_client.py
index 2585341da..bdd4d6d37 100644
--- a/infra/scripts/index_scripts/content_understanding_client.py
+++ b/infra/scripts/index_scripts/content_understanding_client.py
@@ -6,6 +6,32 @@
 from pathlib import Path
 
 
+def sanitize_cu_output(text):
+    """Replace non-printable control characters that may appear in CU output.
+
+    The Content Understanding analyzeBinary API (v2025-11-01) intermittently
+    corrupts Unicode characters by stripping the high byte (e.g. U+2019 becomes
+    U+0019).  This function maps each known corrupted control character back to
+    its intended Unicode equivalent.  Most entries restore the stripped high
+    byte within the U+201x range; the U+001E -> U+2014 (em dash) entry does
+    not follow that pattern and is based on empirical observation instead.
+
+    Text that contains none of these control characters is returned unchanged.
+    """
+    if not text:
+        return text
+    replacements = {
+        '\u0019': '\u2019',  # right single quotation mark
+        '\u001a': '\u201a',  # single low-9 quotation mark
+        '\u001c': '\u201c',  # left double quotation mark
+        '\u001d': '\u201d',  # right double quotation mark
+        '\u001e': '\u2014',  # em dash (empirically observed)
+    }
+    for bad, good in replacements.items():
+        text = text.replace(bad, good)
+    return text
+
+
 class AzureContentUnderstandingClient:
     def __init__(
         self,