Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions infra/scripts/index_scripts/03_cu_process_data_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@

from agent_framework.azure import AzureAIProjectAgentProvider

from content_understanding_client import AzureContentUnderstandingClient
from content_understanding_client import AzureContentUnderstandingClient, sanitize_cu_output

# Get parameters from command line
p = argparse.ArgumentParser()
Expand Down Expand Up @@ -314,11 +314,14 @@ def create_tables():
conn.commit()



create_tables()


def get_field_value(fields, field_name, default=""):
    """Return the sanitized ``valueString`` of one analyzer field.

    Args:
        fields: Mapping of field name to field dict; looked up with ``.get``.
        field_name: Key of the field to read.
        default: Value used when the field has no ``'valueString'`` key
            (a missing field is treated as an empty dict).

    Returns:
        The field's ``'valueString'`` (or *default*) after being passed
        through ``sanitize_cu_output``.
    """
    field = fields.get(field_name, {})
    value = field.get('valueString', default)
    # Always route the value through the sanitizer; returning the raw
    # value first would leave the sanitizing step unreachable.
    return sanitize_cu_output(value)

# Process files and insert into DB and Search
async def process_files():
Expand Down
7 changes: 5 additions & 2 deletions infra/scripts/index_scripts/04_cu_process_custom_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@

from agent_framework.azure import AzureAIProjectAgentProvider

from content_understanding_client import AzureContentUnderstandingClient
from content_understanding_client import AzureContentUnderstandingClient, sanitize_cu_output

# Constants and configuration
FILE_SYSTEM_CLIENT_NAME = "data"
Expand Down Expand Up @@ -368,11 +368,14 @@ def create_tables():
conn.commit()



create_tables()


def get_field_value(fields, field_name, default=""):
    """Return the sanitized ``valueString`` of one analyzer field.

    Args:
        fields: Mapping of field name to field dict; looked up with ``.get``.
        field_name: Key of the field to read.
        default: Value used when the field has no ``'valueString'`` key
            (a missing field is treated as an empty dict).

    Returns:
        The field's ``'valueString'`` (or *default*) after being passed
        through ``sanitize_cu_output``.
    """
    field = fields.get(field_name, {})
    value = field.get('valueString', default)
    # Always route the value through the sanitizer; returning the raw
    # value first would leave the sanitizing step unreachable.
    return sanitize_cu_output(value)

# Process files and insert into DB and Search
async def process_files():
Expand Down
26 changes: 26 additions & 0 deletions infra/scripts/index_scripts/content_understanding_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,32 @@
from pathlib import Path


# Translation table built once at import time: corrupted control character
# (code point) -> intended punctuation. None of the targets is itself a
# source, so a single translate pass equals the chained replacements.
_CU_CONTROL_CHAR_FIXES = str.maketrans({
    '\u0019': '\u2019',  # right single quotation mark
    '\u001a': '\u201a',  # single low-9 quotation mark
    '\u001c': '\u201c',  # left double quotation mark
    '\u001d': '\u201d',  # right double quotation mark
    '\u001e': '\u2014',  # em dash (empirically observed)
})


def sanitize_cu_output(text):
    """Replace non-printable control characters that may appear in CU output.

    The Content Understanding analyzeBinary API (v2025-11-01) intermittently
    corrupts Unicode characters by stripping the high byte (e.g. U+2019
    becomes U+0019). This function maps each known corrupted control
    character back to its intended Unicode equivalent. The mapping is based
    on empirical observation of characters corrupted in a single
    high-byte-stripping pass over the U+201x range.

    Args:
        text: The value to clean. Expected to be a ``str``; any falsy value
            (``None``, ``""``) or non-string value is returned unchanged.

    Returns:
        ``text`` with every mapped control character replaced, or ``text``
        itself when there is nothing to sanitize.
    """
    # Non-string values (e.g. a caller-supplied non-str default) have no
    # characters to repair; hand them back instead of raising.
    if not text or not isinstance(text, str):
        return text
    # One pass over the string regardless of how many characters are mapped.
    return text.translate(_CU_CONTROL_CHAR_FIXES)


class AzureContentUnderstandingClient:
def __init__(
self,
Expand Down