File tree Expand file tree Collapse file tree
infra/scripts/index_scripts Expand file tree Collapse file tree Original file line number Diff line number Diff line change 3131
3232from agent_framework .azure import AzureAIProjectAgentProvider
3333
34- from content_understanding_client import AzureContentUnderstandingClient
34+ from content_understanding_client import AzureContentUnderstandingClient , sanitize_cu_output
3535
3636# Get parameters from command line
3737p = argparse .ArgumentParser ()
@@ -314,11 +314,14 @@ def create_tables():
314314 conn .commit ()
315315
316316
317+
317318create_tables ()
318319
320+
def get_field_value(fields, field_name, default=""):
    """Return the sanitized ``valueString`` of one field.

    Args:
        fields: Mapping of field name to a field dict (anything with ``.get``).
        field_name: Key to look up in ``fields``.
        default: Value used when the field or its ``valueString`` is missing.

    Returns:
        The field's ``valueString`` (or ``default``) after being passed
        through ``sanitize_cu_output``.
    """
    # ``or {}`` also covers a key that is present but mapped to None;
    # ``fields.get(field_name, {})`` would return that None and the next
    # ``.get`` call would raise AttributeError.
    field = fields.get(field_name) or {}
    value = field.get('valueString', default)
    return sanitize_cu_output(value)
322325
323326# Process files and insert into DB and Search
324327async def process_files ():
Original file line number Diff line number Diff line change 4545
4646from agent_framework .azure import AzureAIProjectAgentProvider
4747
48- from content_understanding_client import AzureContentUnderstandingClient
48+ from content_understanding_client import AzureContentUnderstandingClient , sanitize_cu_output
4949
5050# Constants and configuration
5151FILE_SYSTEM_CLIENT_NAME = "data"
@@ -368,11 +368,14 @@ def create_tables():
368368 conn .commit ()
369369
370370
371+
371372create_tables ()
372373
374+
def get_field_value(fields, field_name, default=""):
    """Return the sanitized ``valueString`` of one field.

    Args:
        fields: Mapping of field name to a field dict (anything with ``.get``).
        field_name: Key to look up in ``fields``.
        default: Value used when the field or its ``valueString`` is missing.

    Returns:
        The field's ``valueString`` (or ``default``) after being passed
        through ``sanitize_cu_output``.
    """
    # ``or {}`` also covers a key that is present but mapped to None;
    # ``fields.get(field_name, {})`` would return that None and the next
    # ``.get`` call would raise AttributeError.
    field = fields.get(field_name) or {}
    value = field.get('valueString', default)
    return sanitize_cu_output(value)
376379
377380# Process files and insert into DB and Search
378381async def process_files ():
Original file line number Diff line number Diff line change 66from pathlib import Path
77
88
# Control characters seen in corrupted CU output, mapped to the character
# each one stands in for. Built once at import time so every call is a
# single str.translate pass instead of one str.replace pass per entry.
_CU_CONTROL_CHAR_FIXES = str.maketrans({
    '\u0019': '\u2019',  # right single quotation mark
    '\u001a': '\u201a',  # single low-9 quotation mark
    '\u001c': '\u201c',  # left double quotation mark
    '\u001d': '\u201d',  # right double quotation mark
    '\u001e': '\u2014',  # em dash (empirically observed)
})


def sanitize_cu_output(text):
    """Replace non-printable control characters that may appear in CU output.

    The Content Understanding analyzeBinary API (v2025-11-01) intermittently
    corrupts Unicode characters by stripping the high byte (e.g. U+2019
    becomes U+0019). This function maps each known corrupted control
    character back to its intended Unicode equivalent. The mapping is based
    on empirical observation of corrupted output, not derived mechanically.

    NOTE(review): a pure high-byte strip would pair U+001E with U+201E; the
    em dash mapping is kept exactly as originally observed -- confirm.

    Args:
        text: Value to clean. Expected to be a string; falsy values and
            non-string values are returned unchanged so callers can pass
            through whatever default they were given.

    Returns:
        ``text`` with the known control characters replaced. Text that
        contains none of them comes back with identical content.
    """
    # Non-string input has nothing to translate; returning it as-is avoids
    # an AttributeError when a caller supplies a non-string default.
    if not text or not isinstance(text, str):
        return text
    return text.translate(_CU_CONTROL_CHAR_FIXES)
33+
34+
935class AzureContentUnderstandingClient :
1036 def __init__ (
1137 self ,
You can’t perform that action at this time.
0 commit comments