def get_field_value(fields, field_name, default=""):
    """Return the sanitized ``valueString`` of one extracted field.

    Args:
        fields: Mapping of field name to that field's result mapping.
        field_name: Key of the field to read from ``fields``.
        default: Value used when the field has no ``'valueString'`` entry
            (or when the field itself is absent from ``fields``).

    Returns:
        Whatever ``sanitize_cu_output`` returns for the looked-up value. The
        fallback ``default`` is passed through ``sanitize_cu_output`` as well.
    """
    # A missing field falls back to an empty mapping so the second lookup
    # yields ``default`` instead of raising.
    raw_value = fields.get(field_name, {}).get('valueString', default)
    return sanitize_cu_output(raw_value)
# Translation table built once at import time: corrupted control character ->
# intended punctuation. No replacement character is itself a key, so a single
# translate() pass gives the same result as applying the replacements in turn.
_CU_CONTROL_CHAR_FIXES = str.maketrans({
    '\u0019': '\u2019',  # right single quotation mark
    '\u001a': '\u201a',  # single low-9 quotation mark
    '\u001c': '\u201c',  # left double quotation mark
    '\u001d': '\u201d',  # right double quotation mark
    '\u001e': '\u2014',  # em dash (empirically observed)
})


def sanitize_cu_output(text):
    """Replace non-printable control characters that may appear in CU output.

    The Content Understanding analyzeBinary API (v2025-11-01) intermittently
    corrupts Unicode characters by stripping the high byte (e.g. U+2019 becomes
    U+0019). This function maps each known corrupted control character back to
    its intended Unicode equivalent. The mapping is based on empirical
    observation of characters corrupted in the U+201x range; note that U+001E
    is mapped to the em dash (U+2014) because that is what was observed, not
    to U+201E as the high-byte pattern alone would suggest.

    NOTE(review): U+0018 (which the same pattern would produce from U+2018,
    the left single quotation mark) is not mapped -- confirm whether it occurs
    in real CU output before adding it.

    Args:
        text: Value extracted from a CU result, normally a ``str``.

    Returns:
        The text with each known corrupted character replaced. Falsy values
        (``None``, ``""``) and non-string values are returned unchanged, so
        callers can pass field values through without type checks. Text that
        contains none of the mapped characters comes back equal to the input.
    """
    # Nothing to repair in empty or non-text values; returning them as-is
    # avoids an AttributeError/TypeError on values that lack str.replace.
    if not text or not isinstance(text, str):
        return text
    return text.translate(_CU_CONTROL_CHAR_FIXES)