|
6 | 6 | from pathlib import Path |
7 | 7 |
|
8 | 8 |
|
9 | | -def sanitize_cu_output(text): |
10 | | - """Replace non-printable control characters that may appear in CU output. |
| 9 | +# Module-level constant: corrupted control-char → intended Unicode mapping. |
| 10 | +# The CU analyzeBinary API (v2025-11-01) intermittently strips the high byte |
| 11 | +# from Unicode characters (e.g. U+2019 → U+0019). This dict maps each known |
| 12 | +# corrupted control character back to its intended equivalent. |
| 13 | +_CU_REPLACEMENTS = { |
| 14 | + '\u0014': '\u2014', # em dash |
| 15 | + '\u0019': '\u2019', # right single quotation mark |
| 16 | + '\u001a': '\u201a', # single low-9 quotation mark |
| 17 | + '\u001c': '\u201c', # left double quotation mark |
| 18 | + '\u001d': '\u201d', # right double quotation mark |
| 19 | + '\u001e': '\u201e', # double low-9 quotation mark |
| 20 | +} |
| 21 | +_CU_BAD_CHARS = set(_CU_REPLACEMENTS.keys()) |
| 22 | + |
11 | 23 |
|
12 | | - The Content Understanding analyzeBinary API (v2025-11-01) intermittently |
13 | | - corrupts Unicode characters by stripping the high byte (e.g. U+2019 becomes |
14 | | - U+0019). This function maps each known corrupted control character back to |
15 | | - its intended Unicode equivalent. The mapping is based on empirical |
16 | | - observation of characters corrupted in a single high-byte-stripping pass |
17 | | - over the U+201x range. |
| 24 | +def sanitize_cu_output(text): |
| 25 | + """Replace corrupted control characters that CU may emit. |
18 | 26 |
|
19 | | - The fix is zero-cost when CU output is already correct. |
| 27 | + Returns *text* unchanged (no-op) when it is falsy (empty or None) or |
| 28 | + contains none of the known corrupted characters. The replacement |
| 29 | + mapping is allocated once at module level to avoid per-call overhead. |
20 | 30 | """ |
21 | | - if not text: |
| 31 | + if not text or _CU_BAD_CHARS.isdisjoint(text): |
22 | 32 | return text |
23 | | - replacements = { |
24 | | - '\u0019': '\u2019', # right single quotation mark |
25 | | - '\u001a': '\u201a', # single low-9 quotation mark |
26 | | - '\u001c': '\u201c', # left double quotation mark |
27 | | - '\u001d': '\u201d', # right double quotation mark |
28 | | - '\u001e': '\u2014', # em dash (empirically observed) |
29 | | - } |
30 | | - for bad, good in replacements.items(): |
| 33 | + for bad, good in _CU_REPLACEMENTS.items(): |
31 | 34 | text = text.replace(bad, good) |
32 | 35 | return text |
33 | 36 |
|
|
0 commit comments