Skip to content

Commit e081526

Browse files
fix: Sanitize CU output to prevent Unicode corruption in Citation Panel
2 parents 0861906 + f73b6ed commit e081526

3 files changed

Lines changed: 36 additions & 4 deletions

File tree

infra/scripts/index_scripts/03_cu_process_data_text.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131

3232
from agent_framework.azure import AzureAIProjectAgentProvider
3333

34-
from content_understanding_client import AzureContentUnderstandingClient
34+
from content_understanding_client import AzureContentUnderstandingClient, sanitize_cu_output
3535

3636
# Get parameters from command line
3737
p = argparse.ArgumentParser()
@@ -314,11 +314,14 @@ def create_tables():
314314
conn.commit()
315315

316316

317+
317318
create_tables()
318319

320+
319321
def get_field_value(fields, field_name, default=""):
    """Return the sanitized ``valueString`` of one entry in *fields*.

    Args:
        fields: Mapping of field name to a per-field mapping.
        field_name: Key to look up in *fields*.
        default: Value used when the field has no ``'valueString'`` key
            (or when *field_name* is absent from *fields*).

    Returns:
        The looked-up value (or *default*) after being passed through
        ``sanitize_cu_output``.
    """
    # A missing field falls back to an empty mapping so the inner lookup
    # yields *default* instead of raising.
    entry = fields.get(field_name, {})
    return sanitize_cu_output(entry.get('valueString', default))
322325

323326
# Process files and insert into DB and Search
324327
async def process_files():

infra/scripts/index_scripts/04_cu_process_custom_data.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@
4545

4646
from agent_framework.azure import AzureAIProjectAgentProvider
4747

48-
from content_understanding_client import AzureContentUnderstandingClient
48+
from content_understanding_client import AzureContentUnderstandingClient, sanitize_cu_output
4949

5050
# Constants and configuration
5151
FILE_SYSTEM_CLIENT_NAME = "data"
@@ -368,11 +368,14 @@ def create_tables():
368368
conn.commit()
369369

370370

371+
371372
create_tables()
372373

374+
373375
def get_field_value(fields, field_name, default=""):
    """Look up a field's ``valueString`` and return it sanitized.

    Args:
        fields: Mapping of field name to a per-field mapping.
        field_name: Key to look up in *fields*.
        default: Value used when *field_name* is absent or its entry has no
            ``'valueString'`` key.

    Returns:
        The result of ``sanitize_cu_output`` applied to the looked-up value
        (or to *default*).
    """
    # An absent field is treated as an empty mapping, so the second lookup
    # falls through to *default*.
    entry = fields.get(field_name, {})
    return sanitize_cu_output(entry.get('valueString', default))
376379

377380
# Process files and insert into DB and Search
378381
async def process_files():

infra/scripts/index_scripts/content_understanding_client.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,32 @@
66
from pathlib import Path
77

88

9+
# Translation table built once at import time. Each key is a control character
# that has been seen in CU output in place of the punctuation mark it maps to.
_CU_CONTROL_CHAR_FIXES = str.maketrans({
    '\u0019': '\u2019',  # right single quotation mark
    '\u001a': '\u201a',  # single low-9 quotation mark
    '\u001c': '\u201c',  # left double quotation mark
    '\u001d': '\u201d',  # right double quotation mark
    # NOTE: restoring the high byte would give U+201E (double low-9 quotation
    # mark); the em dash target is kept because it is what was empirically
    # observed.
    '\u001e': '\u2014',  # em dash (empirically observed)
})


def sanitize_cu_output(text):
    """Replace known corrupted control characters in CU output.

    The Content Understanding analyzeBinary API (v2025-11-01) intermittently
    corrupts Unicode characters by stripping the high byte (e.g. U+2019
    becomes U+0019). This function maps each known corrupted control
    character back to its intended Unicode equivalent. The mapping is based
    on empirical observation, which is why U+001E is mapped to the em dash
    (U+2014) rather than to U+201E.

    Only the five characters in the table are rewritten; every other
    character, including other control characters, is left untouched. The
    replacement is a single pass over the string.

    Args:
        text: The string to clean. Falsy values (``None``, ``""``) and
            non-string values are returned unchanged.

    Returns:
        The text with the known corrupted characters replaced, or the
        original value when it is falsy or not a string.
    """
    # Pass through None/empty and any non-string value instead of failing on
    # a missing string method.
    if not text or not isinstance(text, str):
        return text
    return text.translate(_CU_CONTROL_CHAR_FIXES)
33+
34+
935
class AzureContentUnderstandingClient:
1036
def __init__(
1137
self,

0 commit comments

Comments
 (0)