Skip to content

Commit 9c3b27c

Browse files
committed
Fix pre-existing bug: left smart quotes not normalized due to duplicate dict keys
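The quote-mapping dicts used literal quote characters as keys, but in the source file the three double-quote keys '"'/'"'/'"' were all the same ASCII character U+0022 (byte 0x22) and the three single-quote keys '''/'''/''' were all U+0027 (byte 0x27). In a Python dict literal a repeated key keeps only its last value, so only the U+201D and U+2019 entries survived; the entries for U+201C (left double) and U+2018 (left single) were silently dropped before the translation table was built (the U+0022 and U+0027 entries were lost too, but those characters already map to themselves). Restructure the mappings as tuples of \uXXXX escape sequences so every codepoint is spelled out unambiguously and cannot collapse into another entry.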
1 parent aaf93c4 commit 9c3b27c

1 file changed

Lines changed: 46 additions & 47 deletions

File tree

unstructured/metrics/text_extraction.py

Lines changed: 46 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -4,55 +4,54 @@
44

55
from unstructured.cleaners.core import clean_bullets, remove_sentence_punctuation
66

7-
_DOUBLE_QUOTES = {
8-
'"': "U+0022", # noqa 601 # Standard typewriter/programmer's quote
9-
'"': "U+201C", # noqa 601 # Left double quotation mark
10-
'"': "U+201D", # noqa 601 # Right double quotation mark
11-
"„": "U+201E", # Double low-9 quotation mark
12-
"‟": "U+201F", # Double high-reversed-9 quotation mark
13-
"«": "U+00AB", # Left-pointing double angle quotation mark
14-
"»": "U+00BB", # Right-pointing double angle quotation mark
15-
"❝": "U+275D", # Heavy double turned comma quotation mark ornament
16-
"❞": "U+275E", # Heavy double comma quotation mark ornament
17-
"⹂": "U+2E42", # Double low-reversed-9 quotation mark
18-
"🙶": "U+1F676", # SANS-SERIF HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT
19-
"🙷": "U+1F677", # SANS-SERIF HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT
20-
"🙸": "U+1F678", # SANS-SERIF HEAVY LOW DOUBLE COMMA QUOTATION MARK ORNAMENT
21-
"⠦": "U+2826", # Braille double closing quotation mark
22-
"⠴": "U+2834", # Braille double opening quotation mark
23-
"〝": "U+301D", # REVERSED DOUBLE PRIME QUOTATION MARK
24-
"〞": "U+301E", # DOUBLE PRIME QUOTATION MARK
25-
"〟": "U+301F", # LOW DOUBLE PRIME QUOTATION MARK
26-
""": "U+FF02", # FULLWIDTH QUOTATION MARK
27-
",,": "U+275E", # LOW HEAVY DOUBLE COMMA ORNAMENT
28-
}
29-
30-
_SINGLE_QUOTES = {
31-
"'": "U+0027", # noqa 601 # Standard typewriter/programmer's quote
32-
"'": "U+2018", # noqa 601 # Left single quotation mark
33-
"'": "U+2019", # noqa 601 # Right single quotation mark # noqa: W605
34-
"‚": "U+201A", # Single low-9 quotation mark
35-
"‛": "U+201B", # Single high-reversed-9 quotation mark
36-
"‹": "U+2039", # Single left-pointing angle quotation mark
37-
"›": "U+203A", # Single right-pointing angle quotation mark
38-
"❛": "U+275B", # Heavy single turned comma quotation mark ornament
39-
"❜": "U+275C", # Heavy single comma quotation mark ornament
40-
"「": "U+300C", # Left corner bracket
41-
"」": "U+300D", # Right corner bracket
42-
"『": "U+300E", # Left white corner bracket
43-
"』": "U+300F", # Right white corner bracket
44-
"﹁": "U+FE41", # PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET
45-
"﹂": "U+FE42", # PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET
46-
"﹃": "U+FE43", # PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET
47-
"﹄": "U+FE44", # PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET
48-
"'": "U+FF07", # FULLWIDTH APOSTROPHE
49-
"「": "U+FF62", # HALFWIDTH LEFT CORNER BRACKET
50-
"」": "U+FF63", # HALFWIDTH RIGHT CORNER BRACKET
51-
}
7+
_DOUBLE_QUOTE_CODEPOINTS = (
8+
"\u0022", # U+0022 Standard typewriter/programmer's quote
9+
"\u201C", # U+201C Left double quotation mark
10+
"\u201D", # U+201D Right double quotation mark
11+
"\u201E", # U+201E Double low-9 quotation mark
12+
"\u201F", # U+201F Double high-reversed-9 quotation mark
13+
"\u00AB", # U+00AB Left-pointing double angle quotation mark
14+
"\u00BB", # U+00BB Right-pointing double angle quotation mark
15+
"\u275D", # U+275D Heavy double turned comma quotation mark ornament
16+
"\u275E", # U+275E Heavy double comma quotation mark ornament
17+
"\u2E42", # U+2E42 Double low-reversed-9 quotation mark
18+
"\U0001F676", # U+1F676 SANS-SERIF HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT
19+
"\U0001F677", # U+1F677 SANS-SERIF HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT
20+
"\U0001F678", # U+1F678 SANS-SERIF HEAVY LOW DOUBLE COMMA QUOTATION MARK ORNAMENT
21+
"\u2826", # U+2826 Braille double closing quotation mark
22+
"\u2834", # U+2834 Braille double opening quotation mark
23+
"\u301D", # U+301D REVERSED DOUBLE PRIME QUOTATION MARK
24+
"\u301E", # U+301E DOUBLE PRIME QUOTATION MARK
25+
"\u301F", # U+301F LOW DOUBLE PRIME QUOTATION MARK
26+
"\uFF02", # U+FF02 FULLWIDTH QUOTATION MARK
27+
)
28+
29+
_SINGLE_QUOTE_CODEPOINTS = (
30+
"\u0027", # U+0027 Standard typewriter/programmer's quote
31+
"\u2018", # U+2018 Left single quotation mark
32+
"\u2019", # U+2019 Right single quotation mark
33+
"\u201A", # U+201A Single low-9 quotation mark
34+
"\u201B", # U+201B Single high-reversed-9 quotation mark
35+
"\u2039", # U+2039 Single left-pointing angle quotation mark
36+
"\u203A", # U+203A Single right-pointing angle quotation mark
37+
"\u275B", # U+275B Heavy single turned comma quotation mark ornament
38+
"\u275C", # U+275C Heavy single comma quotation mark ornament
39+
"\u300C", # U+300C Left corner bracket
40+
"\u300D", # U+300D Right corner bracket
41+
"\u300E", # U+300E Left white corner bracket
42+
"\u300F", # U+300F Right white corner bracket
43+
"\uFE41", # U+FE41 PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET
44+
"\uFE42", # U+FE42 PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET
45+
"\uFE43", # U+FE43 PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET
46+
"\uFE44", # U+FE44 PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET
47+
"\uFF07", # U+FF07 FULLWIDTH APOSTROPHE
48+
"\uFF62", # U+FF62 HALFWIDTH LEFT CORNER BRACKET
49+
"\uFF63", # U+FF63 HALFWIDTH RIGHT CORNER BRACKET
50+
)
5251

5352
_TRANSLATION_TABLE = str.maketrans(
54-
{chr(int(v.replace("U+", ""), 16)): '"' for v in _DOUBLE_QUOTES.values()}
55-
| {chr(int(v.replace("U+", ""), 16)): "'" for v in _SINGLE_QUOTES.values()}
53+
{c: '"' for c in _DOUBLE_QUOTE_CODEPOINTS}
54+
| {c: "'" for c in _SINGLE_QUOTE_CODEPOINTS}
5655
)
5756

5857

0 commit comments

Comments
 (0)