From 88b1868ae5e1a7af6b00bf7d470f6dd2785a2e89 Mon Sep 17 00:00:00 2001 From: Yamini-Microsoft Date: Tue, 19 May 2026 20:11:11 +0530 Subject: [PATCH 1/2] fix: sanitize CU output to prevent Unicode corruption in citations The Content Understanding analyzeBinary API (v2025-11-01) intermittently corrupts Unicode characters by stripping the high byte (e.g. U+2019 -> U+0019). This causes apostrophes and quotes to render as box characters in the Citation Panel. Added _sanitize_cu_output() to map known corrupted control characters back to their intended Unicode equivalents after CU processing, before saving to Search/SQL. Fixes AB#43310 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../index_scripts/03_cu_process_data_text.py | 26 ++++++++++++++++++- .../04_cu_process_custom_data.py | 26 ++++++++++++++++++- 2 files changed, 50 insertions(+), 2 deletions(-) diff --git a/infra/scripts/index_scripts/03_cu_process_data_text.py b/infra/scripts/index_scripts/03_cu_process_data_text.py index 2ba6cb9b4..c8a4884ff 100644 --- a/infra/scripts/index_scripts/03_cu_process_data_text.py +++ b/infra/scripts/index_scripts/03_cu_process_data_text.py @@ -316,9 +316,33 @@ def create_tables(): create_tables() +def _sanitize_cu_output(text): + """Replace non-printable control characters that may appear in CU output. + + When JSON files containing Unicode escape sequences (e.g. \\u2019) are + sent as raw binary via the analyzeBinary endpoint, the returned text + occasionally contains unexpected control characters instead of the + intended Unicode characters. This is a defensive fix that maps those + control characters back to their likely intended values. + """ + if not text: + return text + replacements = { + '\u0019': '\u2019', # right single quotation mark + '\u001a': '\u2019', # right single quotation mark + '\u001c': '\u201c', # left double quotation mark + '\u001d': '\u201d', # right double quotation mark + '\u001e': '\u2014', # em dash + } + for bad, good in replacements.items(): + text = text.replace(bad, good) + return text + + def get_field_value(fields, field_name, default=""): field = fields.get(field_name, {}) - return field.get('valueString', default) + value = field.get('valueString', default) + return _sanitize_cu_output(value) # Process files and insert into DB and Search async def process_files(): diff --git a/infra/scripts/index_scripts/04_cu_process_custom_data.py b/infra/scripts/index_scripts/04_cu_process_custom_data.py index c7bc81a5e..6ce32f68f 100644 --- a/infra/scripts/index_scripts/04_cu_process_custom_data.py +++ b/infra/scripts/index_scripts/04_cu_process_custom_data.py @@ -370,9 +370,33 @@ def create_tables(): create_tables() +def _sanitize_cu_output(text): + """Replace non-printable control characters that may appear in CU output. + + When JSON files containing Unicode escape sequences (e.g. \\u2019) are + sent as raw binary via the analyzeBinary endpoint, the returned text + occasionally contains unexpected control characters instead of the + intended Unicode characters. This is a defensive fix that maps those + control characters back to their likely intended values. + """ + if not text: + return text + replacements = { + '\u0019': '\u2019', # right single quotation mark + '\u001a': '\u2019', # right single quotation mark + '\u001c': '\u201c', # left double quotation mark + '\u001d': '\u201d', # right double quotation mark + '\u001e': '\u2014', # em dash + } + for bad, good in replacements.items(): + text = text.replace(bad, good) + return text + + def get_field_value(fields, field_name, default=""): field = fields.get(field_name, {}) - return field.get('valueString', default) + value = field.get('valueString', default) + return _sanitize_cu_output(value) # Process files and insert into DB and Search async def process_files(): From f73b6ede149fe35f548ac5a392c9e8879b79e9c2 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 20 May 2026 09:43:54 +0000 Subject: [PATCH 2/2] refactor: move sanitize_cu_output to shared module and fix \u001a mapping Agent-Logs-Url: https://github.com/microsoft/Conversation-Knowledge-Mining-Solution-Accelerator/sessions/50f2a621-4990-4aca-8dd8-cfa5ea31ce6f Co-authored-by: Yamini-Microsoft <191316559+Yamini-Microsoft@users.noreply.github.com> --- .../index_scripts/03_cu_process_data_text.py | 27 +++---------------- .../04_cu_process_custom_data.py | 27 +++---------------- .../content_understanding_client.py | 26 ++++++++++++++++++ 3 files changed, 32 insertions(+), 48 deletions(-) diff --git a/infra/scripts/index_scripts/03_cu_process_data_text.py b/infra/scripts/index_scripts/03_cu_process_data_text.py index c8a4884ff..4cfaa5e40 100644 --- a/infra/scripts/index_scripts/03_cu_process_data_text.py +++ b/infra/scripts/index_scripts/03_cu_process_data_text.py @@ -31,7 +31,7 @@ from agent_framework.azure import AzureAIProjectAgentProvider -from content_understanding_client import AzureContentUnderstandingClient +from content_understanding_client import AzureContentUnderstandingClient, sanitize_cu_output # Get parameters from command line p = argparse.ArgumentParser() @@ -314,35 +314,14 @@ def create_tables(): conn.commit() -create_tables() - -def _sanitize_cu_output(text): - """Replace non-printable control characters that may appear in CU output. - When JSON files containing Unicode escape sequences (e.g. \\u2019) are - sent as raw binary via the analyzeBinary endpoint, the returned text - occasionally contains unexpected control characters instead of the - intended Unicode characters. This is a defensive fix that maps those - control characters back to their likely intended values. - """ - if not text: - return text - replacements = { - '\u0019': '\u2019', # right single quotation mark - '\u001a': '\u2019', # right single quotation mark - '\u001c': '\u201c', # left double quotation mark - '\u001d': '\u201d', # right double quotation mark - '\u001e': '\u2014', # em dash - } - for bad, good in replacements.items(): - text = text.replace(bad, good) - return text +create_tables() def get_field_value(fields, field_name, default=""): field = fields.get(field_name, {}) value = field.get('valueString', default) - return _sanitize_cu_output(value) + return sanitize_cu_output(value) # Process files and insert into DB and Search async def process_files(): diff --git a/infra/scripts/index_scripts/04_cu_process_custom_data.py b/infra/scripts/index_scripts/04_cu_process_custom_data.py index 6ce32f68f..3ac1166b8 100644 --- a/infra/scripts/index_scripts/04_cu_process_custom_data.py +++ b/infra/scripts/index_scripts/04_cu_process_custom_data.py @@ -45,7 +45,7 @@ from agent_framework.azure import AzureAIProjectAgentProvider -from content_understanding_client import AzureContentUnderstandingClient +from content_understanding_client import AzureContentUnderstandingClient, sanitize_cu_output # Constants and configuration FILE_SYSTEM_CLIENT_NAME = "data" @@ -368,35 +368,14 @@ def create_tables(): conn.commit() -create_tables() - -def _sanitize_cu_output(text): - """Replace non-printable control characters that may appear in CU output. - When JSON files containing Unicode escape sequences (e.g. \\u2019) are - sent as raw binary via the analyzeBinary endpoint, the returned text - occasionally contains unexpected control characters instead of the - intended Unicode characters. This is a defensive fix that maps those - control characters back to their likely intended values. - """ - if not text: - return text - replacements = { - '\u0019': '\u2019', # right single quotation mark - '\u001a': '\u2019', # right single quotation mark - '\u001c': '\u201c', # left double quotation mark - '\u001d': '\u201d', # right double quotation mark - '\u001e': '\u2014', # em dash - } - for bad, good in replacements.items(): - text = text.replace(bad, good) - return text +create_tables() def get_field_value(fields, field_name, default=""): field = fields.get(field_name, {}) value = field.get('valueString', default) - return _sanitize_cu_output(value) + return sanitize_cu_output(value) # Process files and insert into DB and Search async def process_files(): diff --git a/infra/scripts/index_scripts/content_understanding_client.py b/infra/scripts/index_scripts/content_understanding_client.py index 2585341da..bdd4d6d37 100644 --- a/infra/scripts/index_scripts/content_understanding_client.py +++ b/infra/scripts/index_scripts/content_understanding_client.py @@ -6,6 +6,32 @@ from pathlib import Path +def sanitize_cu_output(text): + """Replace non-printable control characters that may appear in CU output. + + The Content Understanding analyzeBinary API (v2025-11-01) intermittently + corrupts Unicode characters by stripping the high byte (e.g. U+2019 becomes + U+0019). This function maps each known corrupted control character back to + its intended Unicode equivalent. The mapping is based on empirical + observation of characters corrupted in a single high-byte-stripping pass + over the U+201x range. + + The fix is zero-cost when CU output is already correct. + """ + if not text: + return text + replacements = { + '\u0019': '\u2019', # right single quotation mark + '\u001a': '\u201a', # single low-9 quotation mark + '\u001c': '\u201c', # left double quotation mark + '\u001d': '\u201d', # right double quotation mark + '\u001e': '\u2014', # em dash (empirically observed) + } + for bad, good in replacements.items(): + text = text.replace(bad, good) + return text + + class AzureContentUnderstandingClient: def __init__( self,