|
4 | 4 |
|
5 | 5 | from unstructured.cleaners.core import clean_bullets, remove_sentence_punctuation |
6 | 6 |
|
7 | | -_DOUBLE_QUOTES = { |
8 | | - '"': "U+0022", # noqa 601 # Standard typewriter/programmer's quote |
9 | | - '"': "U+201C", # noqa 601 # Left double quotation mark |
10 | | - '"': "U+201D", # noqa 601 # Right double quotation mark |
11 | | - "„": "U+201E", # Double low-9 quotation mark |
12 | | - "‟": "U+201F", # Double high-reversed-9 quotation mark |
13 | | - "«": "U+00AB", # Left-pointing double angle quotation mark |
14 | | - "»": "U+00BB", # Right-pointing double angle quotation mark |
15 | | - "❝": "U+275D", # Heavy double turned comma quotation mark ornament |
16 | | - "❞": "U+275E", # Heavy double comma quotation mark ornament |
17 | | - "⹂": "U+2E42", # Double low-reversed-9 quotation mark |
18 | | - "🙶": "U+1F676", # SANS-SERIF HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT |
19 | | - "🙷": "U+1F677", # SANS-SERIF HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT |
20 | | - "🙸": "U+1F678", # SANS-SERIF HEAVY LOW DOUBLE COMMA QUOTATION MARK ORNAMENT |
21 | | - "⠦": "U+2826", # Braille double closing quotation mark |
22 | | - "⠴": "U+2834", # Braille double opening quotation mark |
23 | | - "〝": "U+301D", # REVERSED DOUBLE PRIME QUOTATION MARK |
24 | | - "〞": "U+301E", # DOUBLE PRIME QUOTATION MARK |
25 | | - "〟": "U+301F", # LOW DOUBLE PRIME QUOTATION MARK |
26 | | - """: "U+FF02", # FULLWIDTH QUOTATION MARK |
27 | | - ",,": "U+275E", # LOW HEAVY DOUBLE COMMA ORNAMENT |
28 | | -} |
29 | | - |
30 | | -_SINGLE_QUOTES = { |
31 | | - "'": "U+0027", # noqa 601 # Standard typewriter/programmer's quote |
32 | | - "'": "U+2018", # noqa 601 # Left single quotation mark |
33 | | - "'": "U+2019", # noqa 601 # Right single quotation mark # noqa: W605 |
34 | | - "‚": "U+201A", # Single low-9 quotation mark |
35 | | - "‛": "U+201B", # Single high-reversed-9 quotation mark |
36 | | - "‹": "U+2039", # Single left-pointing angle quotation mark |
37 | | - "›": "U+203A", # Single right-pointing angle quotation mark |
38 | | - "❛": "U+275B", # Heavy single turned comma quotation mark ornament |
39 | | - "❜": "U+275C", # Heavy single comma quotation mark ornament |
40 | | - "「": "U+300C", # Left corner bracket |
41 | | - "」": "U+300D", # Right corner bracket |
42 | | - "『": "U+300E", # Left white corner bracket |
43 | | - "』": "U+300F", # Right white corner bracket |
44 | | - "﹁": "U+FE41", # PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET |
45 | | - "﹂": "U+FE42", # PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET |
46 | | - "﹃": "U+FE43", # PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET |
47 | | - "﹄": "U+FE44", # PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET |
48 | | - "'": "U+FF07", # FULLWIDTH APOSTROPHE |
49 | | - "「": "U+FF62", # HALFWIDTH LEFT CORNER BRACKET |
50 | | - "」": "U+FF63", # HALFWIDTH RIGHT CORNER BRACKET |
51 | | -} |
| 7 | +_DOUBLE_QUOTE_CODEPOINTS = ( |
| 8 | + "\u0022", # U+0022 Standard typewriter/programmer's quote |
| 9 | + "\u201C", # U+201C Left double quotation mark |
| 10 | + "\u201D", # U+201D Right double quotation mark |
| 11 | + "\u201E", # U+201E Double low-9 quotation mark |
| 12 | + "\u201F", # U+201F Double high-reversed-9 quotation mark |
| 13 | + "\u00AB", # U+00AB Left-pointing double angle quotation mark |
| 14 | + "\u00BB", # U+00BB Right-pointing double angle quotation mark |
| 15 | + "\u275D", # U+275D Heavy double turned comma quotation mark ornament |
| 16 | + "\u275E", # U+275E Heavy double comma quotation mark ornament |
| 17 | + "\u2E42", # U+2E42 Double low-reversed-9 quotation mark |
| 18 | + "\U0001F676", # U+1F676 SANS-SERIF HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT |
| 19 | + "\U0001F677", # U+1F677 SANS-SERIF HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT |
| 20 | + "\U0001F678", # U+1F678 SANS-SERIF HEAVY LOW DOUBLE COMMA QUOTATION MARK ORNAMENT |
| 21 | + "\u2826", # U+2826 Braille double opening quotation mark (dots 2-3-6) |
| 22 | + "\u2834", # U+2834 Braille double closing quotation mark (dots 3-5-6) |
| 23 | + "\u301D", # U+301D REVERSED DOUBLE PRIME QUOTATION MARK |
| 24 | + "\u301E", # U+301E DOUBLE PRIME QUOTATION MARK |
| 25 | + "\u301F", # U+301F LOW DOUBLE PRIME QUOTATION MARK |
| 26 | + "\uFF02", # U+FF02 FULLWIDTH QUOTATION MARK |
| 27 | +) |
| 28 | + |
| 29 | +_SINGLE_QUOTE_CODEPOINTS = ( |
| 30 | + "\u0027", # U+0027 Standard typewriter/programmer's quote |
| 31 | + "\u2018", # U+2018 Left single quotation mark |
| 32 | + "\u2019", # U+2019 Right single quotation mark |
| 33 | + "\u201A", # U+201A Single low-9 quotation mark |
| 34 | + "\u201B", # U+201B Single high-reversed-9 quotation mark |
| 35 | + "\u2039", # U+2039 Single left-pointing angle quotation mark |
| 36 | + "\u203A", # U+203A Single right-pointing angle quotation mark |
| 37 | + "\u275B", # U+275B Heavy single turned comma quotation mark ornament |
| 38 | + "\u275C", # U+275C Heavy single comma quotation mark ornament |
| 39 | + "\u300C", # U+300C Left corner bracket |
| 40 | + "\u300D", # U+300D Right corner bracket |
| 41 | + "\u300E", # U+300E Left white corner bracket |
| 42 | + "\u300F", # U+300F Right white corner bracket |
| 43 | + "\uFE41", # U+FE41 PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET |
| 44 | + "\uFE42", # U+FE42 PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET |
| 45 | + "\uFE43", # U+FE43 PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET |
| 46 | + "\uFE44", # U+FE44 PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET |
| 47 | + "\uFF07", # U+FF07 FULLWIDTH APOSTROPHE |
| 48 | + "\uFF62", # U+FF62 HALFWIDTH LEFT CORNER BRACKET |
| 49 | + "\uFF63", # U+FF63 HALFWIDTH RIGHT CORNER BRACKET |
| 50 | +) |
52 | 51 |
|
53 | 52 | _TRANSLATION_TABLE = str.maketrans( |
54 | | - {chr(int(v.replace("U+", ""), 16)): '"' for v in _DOUBLE_QUOTES.values()} |
55 | | - | {chr(int(v.replace("U+", ""), 16)): "'" for v in _SINGLE_QUOTES.values()} |
| 53 | + {c: '"' for c in _DOUBLE_QUOTE_CODEPOINTS} |
| 54 | + | {c: "'" for c in _SINGLE_QUOTE_CODEPOINTS} |
56 | 55 | ) |
57 | 56 |
|
58 | 57 |
|
|
0 commit comments