Skip to content

Commit f73b6ed

Browse files
refactor: move sanitize_cu_output to shared module and fix \u001a mapping
Agent-Logs-Url: https://github.com/microsoft/Conversation-Knowledge-Mining-Solution-Accelerator/sessions/50f2a621-4990-4aca-8dd8-cfa5ea31ce6f
Co-authored-by: Yamini-Microsoft <191316559+Yamini-Microsoft@users.noreply.github.com>
1 parent 88b1868 commit f73b6ed

3 files changed

Lines changed: 32 additions & 48 deletions

File tree

infra/scripts/index_scripts/03_cu_process_data_text.py

Lines changed: 3 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131

3232
from agent_framework.azure import AzureAIProjectAgentProvider
3333

34-
from content_understanding_client import AzureContentUnderstandingClient
34+
from content_understanding_client import AzureContentUnderstandingClient, sanitize_cu_output
3535

3636
# Get parameters from command line
3737
p = argparse.ArgumentParser()
@@ -314,35 +314,14 @@ def create_tables():
314314
conn.commit()
315315

316316

317-
create_tables()
318-
319-
def _sanitize_cu_output(text):
320-
"""Replace non-printable control characters that may appear in CU output.
321317

322-
When JSON files containing Unicode escape sequences (e.g. \\u2019) are
323-
sent as raw binary via the analyzeBinary endpoint, the returned text
324-
occasionally contains unexpected control characters instead of the
325-
intended Unicode characters. This is a defensive fix that maps those
326-
control characters back to their likely intended values.
327-
"""
328-
if not text:
329-
return text
330-
replacements = {
331-
'\u0019': '\u2019', # right single quotation mark
332-
'\u001a': '\u2019', # right single quotation mark
333-
'\u001c': '\u201c', # left double quotation mark
334-
'\u001d': '\u201d', # right double quotation mark
335-
'\u001e': '\u2014', # em dash
336-
}
337-
for bad, good in replacements.items():
338-
text = text.replace(bad, good)
339-
return text
318+
create_tables()
340319

341320

342321
def get_field_value(fields, field_name, default=""):
343322
field = fields.get(field_name, {})
344323
value = field.get('valueString', default)
345-
return _sanitize_cu_output(value)
324+
return sanitize_cu_output(value)
346325

347326
# Process files and insert into DB and Search
348327
async def process_files():

infra/scripts/index_scripts/04_cu_process_custom_data.py

Lines changed: 3 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@
4545

4646
from agent_framework.azure import AzureAIProjectAgentProvider
4747

48-
from content_understanding_client import AzureContentUnderstandingClient
48+
from content_understanding_client import AzureContentUnderstandingClient, sanitize_cu_output
4949

5050
# Constants and configuration
5151
FILE_SYSTEM_CLIENT_NAME = "data"
@@ -368,35 +368,14 @@ def create_tables():
368368
conn.commit()
369369

370370

371-
create_tables()
372-
373-
def _sanitize_cu_output(text):
374-
"""Replace non-printable control characters that may appear in CU output.
375371

376-
When JSON files containing Unicode escape sequences (e.g. \\u2019) are
377-
sent as raw binary via the analyzeBinary endpoint, the returned text
378-
occasionally contains unexpected control characters instead of the
379-
intended Unicode characters. This is a defensive fix that maps those
380-
control characters back to their likely intended values.
381-
"""
382-
if not text:
383-
return text
384-
replacements = {
385-
'\u0019': '\u2019', # right single quotation mark
386-
'\u001a': '\u2019', # right single quotation mark
387-
'\u001c': '\u201c', # left double quotation mark
388-
'\u001d': '\u201d', # right double quotation mark
389-
'\u001e': '\u2014', # em dash
390-
}
391-
for bad, good in replacements.items():
392-
text = text.replace(bad, good)
393-
return text
372+
create_tables()
394373

395374

396375
def get_field_value(fields, field_name, default=""):
397376
field = fields.get(field_name, {})
398377
value = field.get('valueString', default)
399-
return _sanitize_cu_output(value)
378+
return sanitize_cu_output(value)
400379

401380
# Process files and insert into DB and Search
402381
async def process_files():

infra/scripts/index_scripts/content_understanding_client.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,32 @@
66
from pathlib import Path
77

88

9+
# Translation table from the control characters seen in corrupted Content
# Understanding (CU) output to the punctuation they are mapped back to.
# Built once at import time so every call is a single pass over the text
# instead of one full scan per entry.
_CU_CONTROL_CHAR_FIXES = str.maketrans({
    '\u0019': '\u2019',  # right single quotation mark
    '\u001a': '\u201a',  # single low-9 quotation mark
    '\u001c': '\u201c',  # left double quotation mark
    '\u001d': '\u201d',  # right double quotation mark
    # Exception to the high-byte pattern: U+001E is mapped to the em dash
    # (U+2014), not to U+201E, because that is what was observed in practice.
    '\u001e': '\u2014',  # em dash (empirically observed)
    # NOTE(review): U+0018 (would come from U+2018) and U+0013/U+0014 (would
    # come from en/em dash) are not mapped -- confirm whether they occur.
})


def sanitize_cu_output(text):
    """Replace known stray control characters in CU output with punctuation.

    The Content Understanding analyzeBinary API (v2025-11-01) has been
    observed to intermittently return text in which a Unicode punctuation
    character has lost its high byte (e.g. U+2019 arrives as U+0019). This
    function maps each known corrupted control character back to its
    intended character. Most entries follow the high-byte pattern over the
    U+201x range; the U+001E -> U+2014 (em dash) entry is purely empirical.

    Args:
        text: The string returned by CU. Falsy values (None, "") are
            returned unchanged.

    Returns:
        ``text`` with every mapped control character replaced. Text that
        contains none of them comes back with equal content; the cost is
        one linear scan.
    """
    if not text:
        return text
    # All substitutions are single-character, so one translate() pass is
    # equivalent to applying the replacements one after another.
    return text.translate(_CU_CONTROL_CHAR_FIXES)
33+
34+
935
class AzureContentUnderstandingClient:
1036
def __init__(
1137
self,

0 commit comments

Comments
 (0)