Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
## 0.22.13

### Enhancements
- **Speed up `standardize_quotes`**: Replace loop-based character replacement with a single `str.translate()` call using a pre-computed translation table. Also fixes a pre-existing bug where left smart quotes were never normalized due to duplicate dictionary keys.

## 0.22.12

### Fixes
Expand Down
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,7 @@ test = [
"types-tabulate>=0.9.0.20241207, <1.0.0",
"unstructured-pytesseract>=0.3.15, <1.0.0",
"weaviate-client>=4.20.1, <5.0.0",
"pytest-benchmark>=5.2.3",
]
dev = [
"pre-commit>=4.5.1, <5.0.0",
Expand Down Expand Up @@ -265,3 +266,4 @@ tests-root = "test_unstructured"
test-framework = "pytest"
ignore-paths = []
formatter-cmds = ["ruff check --exit-zero --fix-only $file", "ruff format $file"]
benchmarks-root = "test_unstructured/benchmarks"
Empty file.
26 changes: 26 additions & 0 deletions test_unstructured/benchmarks/test_benchmark_standardize_quotes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
from unstructured.metrics.text_extraction import standardize_quotes

# Inputs fed to standardize_quotes by the benchmark below. Every string contains
# at least one non-ASCII quote-like character (curly, low-9, angle, ornament,
# double-prime, or CJK corner-bracket forms) embedded in ordinary text; several
# also contain plain ASCII quotes, which are already in their normalized form.
SAMPLE_TEXTS = [
    "She said \u201cHello\u201d and then whispered \u2018Goodbye\u2019 before leaving.",
    "\u201eTo be, or not to be, that is the question\u201d - Shakespeare\u2019s famous quote.",
    "\u00abWhen he said \u201clife is beautiful,\u201d I believed him\u00bb wrote Maria.",
    "\u275dDo you remember when we first met?\u275e she asked with a smile.",
    "\u301dThe meeting starts at 10:00, don\u2019t be late!\u301f announced the manager.",
    '\u300cHe told me "This is important" yesterday\u300d, she explained.',
    "\u300eThe sun was setting. The birds were singing. It was peaceful.\u300f",
    "\ufe42Meeting #123 @ 15:00 - Don\u2019t forget!\ufe41",
    "\u300cHello\u300d, \u275dWorld\u275e, \"Test\", 'Example', \u201eQuote\u201d, \u00abFinal\u00bb",  # noqa: E501
    "It\u2019s John\u2019s book, isn\u2019t it?",
    '\u2039Testing the system\u2019s capability for "quoted" text\u203a',
    "\u275bFirst sentence. Second sentence. Third sentence.\u275c",
    "\u300cChapter 1\u300d: \u275dThe Beginning\u275e - \u201eA new story\u201d begins \u00abtoday\u00bb.",  # noqa: E501
]


def run_standardize_quotes():
    """Run standardize_quotes once over every entry in SAMPLE_TEXTS.

    This is the unit of work timed by the benchmark; the normalized strings
    are discarded because only the call cost is of interest here.
    """
    for sample in SAMPLE_TEXTS:
        standardize_quotes(sample)


def test_benchmark_standardize_quotes(benchmark):
    """Time one full pass of standardize_quotes over the sample texts.

    ``benchmark`` is the callable fixture supplied by pytest-benchmark; it is
    handed the workload function and takes care of invoking and timing it.
    """
    benchmark(run_standardize_quotes)
32 changes: 32 additions & 0 deletions test_unstructured/metrics/test_text_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -402,12 +402,44 @@ def test_prepare_string(text, expected):
'「Chapter 1」: ❝The Beginning❞ - „A new story" begins «today».',
'\'Chapter 1\': "The Beginning" - "A new story" begins "today".',
),
# --- Regression: U+201C / U+2018 were silently dropped by duplicate dict keys ---
# U+201C left double quotation mark (isolated)
("\u201c", '"'),
# U+2018 left single quotation mark (isolated)
("\u2018", "'"),
# Left + right double smart quotes wrapping a word
("\u201cHello\u201d", '"Hello"'),
# Left + right single smart quotes wrapping a word
("\u2018world\u2019", "'world'"),
# Mixed left/right smart quotes in a sentence
(
"She said \u201cHello\u201d and then whispered \u2018Goodbye\u2019",
"She said \"Hello\" and then whispered 'Goodbye'",
),
# Word-initial elision apostrophe ('tis) written with a left single smart quote
("\u2018tis the season", "'tis the season"),
],
)
def test_standardize_quotes(input_text, expected_output):
assert text_extraction.standardize_quotes(input_text) == expected_output


@pytest.mark.parametrize(
    ("codepoint", "expected"),
    [
        # Double-quote variants first, then single-quote variants; each case id
        # spells out the codepoint so a failure names the exact character.
        *(
            pytest.param(char, '"', id=f"U+{ord(char):04X}->double")
            for char in text_extraction._DOUBLE_QUOTE_CODEPOINTS
        ),
        *(
            pytest.param(char, "'", id=f"U+{ord(char):04X}->single")
            for char in text_extraction._SINGLE_QUOTE_CODEPOINTS
        ),
    ],
)
def test_standardize_quotes_every_codepoint(codepoint, expected):
    """Each individual codepoint in the quote tables normalizes to its ASCII quote."""
    normalized = text_extraction.standardize_quotes(codepoint)
    assert normalized == expected


@pytest.mark.parametrize(
("output_text", "source_text", "expected_percentage"),
[
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.22.12" # pragma: no cover
__version__ = "0.22.13" # pragma: no cover
127 changes: 50 additions & 77 deletions unstructured/metrics/text_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,55 @@

from unstructured.cleaners.core import clean_bullets, remove_sentence_punctuation

# Characters that standardize_quotes rewrites to the ASCII double quote (").
# Each entry is a one-character string. U+0022 itself is listed, so the ASCII
# double quote simply maps to itself.
_DOUBLE_QUOTE_CODEPOINTS = (
    "\u0022",  # U+0022 Standard typewriter/programmer's quote
    "\u201c",  # U+201C Left double quotation mark
    "\u201d",  # U+201D Right double quotation mark
    "\u201e",  # U+201E Double low-9 quotation mark
    "\u201f",  # U+201F Double high-reversed-9 quotation mark
    "\u00ab",  # U+00AB Left-pointing double angle quotation mark
    "\u00bb",  # U+00BB Right-pointing double angle quotation mark
    "\u275d",  # U+275D Heavy double turned comma quotation mark ornament
    "\u275e",  # U+275E Heavy double comma quotation mark ornament
    "\u2e42",  # U+2E42 Double low-reversed-9 quotation mark
    "\U0001f676",  # U+1F676 SANS-SERIF HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT
    "\U0001f677",  # U+1F677 SANS-SERIF HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT
    "\U0001f678",  # U+1F678 SANS-SERIF HEAVY LOW DOUBLE COMMA QUOTATION MARK ORNAMENT
    "\u2826",  # U+2826 Braille double closing quotation mark
    "\u2834",  # U+2834 Braille double opening quotation mark
    "\u301d",  # U+301D REVERSED DOUBLE PRIME QUOTATION MARK
    "\u301e",  # U+301E DOUBLE PRIME QUOTATION MARK
    "\u301f",  # U+301F LOW DOUBLE PRIME QUOTATION MARK
    "\uff02",  # U+FF02 FULLWIDTH QUOTATION MARK
)

# Characters that standardize_quotes rewrites to the ASCII single quote (').
# Each entry is a one-character string. U+0027 itself is listed, so the ASCII
# apostrophe simply maps to itself.
_SINGLE_QUOTE_CODEPOINTS = (
    "\u0027",  # U+0027 Standard typewriter/programmer's quote
    "\u2018",  # U+2018 Left single quotation mark
    "\u2019",  # U+2019 Right single quotation mark
    "\u201a",  # U+201A Single low-9 quotation mark
    "\u201b",  # U+201B Single high-reversed-9 quotation mark
    "\u2039",  # U+2039 Single left-pointing angle quotation mark
    "\u203a",  # U+203A Single right-pointing angle quotation mark
    "\u275b",  # U+275B Heavy single turned comma quotation mark ornament
    "\u275c",  # U+275C Heavy single comma quotation mark ornament
    "\u300c",  # U+300C Left corner bracket
    "\u300d",  # U+300D Right corner bracket
    "\u300e",  # U+300E Left white corner bracket
    "\u300f",  # U+300F Right white corner bracket
    "\ufe41",  # U+FE41 PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET
    "\ufe42",  # U+FE42 PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET
    "\ufe43",  # U+FE43 PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET
    "\ufe44",  # U+FE44 PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET
    "\uff07",  # U+FF07 FULLWIDTH APOSTROPHE
    "\uff62",  # U+FF62 HALFWIDTH LEFT CORNER BRACKET
    "\uff63",  # U+FF63 HALFWIDTH RIGHT CORNER BRACKET
)

# Lookup table handed to str.translate by standardize_quotes. It is built once
# at import time rather than on every call. Every double-quote codepoint maps
# to '"' and every single-quote codepoint maps to "'"; str.maketrans converts
# the one-character string keys into the integer ordinals str.translate expects.
_TRANSLATION_TABLE = str.maketrans(
    dict.fromkeys(_DOUBLE_QUOTE_CODEPOINTS, '"') | dict.fromkeys(_SINGLE_QUOTE_CODEPOINTS, "'")
)


def calculate_accuracy(
output: Optional[str],
Expand Down Expand Up @@ -172,80 +221,4 @@ def standardize_quotes(text: str) -> str:
Returns:
str: The text with standardized quotes.
"""
# Double Quotes Dictionary
double_quotes = {
'"': "U+0022", # noqa 601 # Standard typewriter/programmer's quote
'"': "U+201C", # noqa 601 # Left double quotation mark
'"': "U+201D", # noqa 601 # Right double quotation mark
"„": "U+201E", # Double low-9 quotation mark
"‟": "U+201F", # Double high-reversed-9 quotation mark
"«": "U+00AB", # Left-pointing double angle quotation mark
"»": "U+00BB", # Right-pointing double angle quotation mark
"❝": "U+275D", # Heavy double turned comma quotation mark ornament
"❞": "U+275E", # Heavy double comma quotation mark ornament
"⹂": "U+2E42", # Double low-reversed-9 quotation mark
"🙶": "U+1F676", # SANS-SERIF HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT
"🙷": "U+1F677", # SANS-SERIF HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT
"🙸": "U+1F678", # SANS-SERIF HEAVY LOW DOUBLE COMMA QUOTATION MARK ORNAMENT
"⠦": "U+2826", # Braille double closing quotation mark
"⠴": "U+2834", # Braille double opening quotation mark
"〝": "U+301D", # REVERSED DOUBLE PRIME QUOTATION MARK
"〞": "U+301E", # DOUBLE PRIME QUOTATION MARK
"〟": "U+301F", # LOW DOUBLE PRIME QUOTATION MARK
""": "U+FF02", # FULLWIDTH QUOTATION MARK
",,": "U+275E", # LOW HEAVY DOUBLE COMMA ORNAMENT
}

# Single Quotes Dictionary
single_quotes = {
"'": "U+0027", # noqa 601 # Standard typewriter/programmer's quote
"'": "U+2018", # noqa 601 # Left single quotation mark
"'": "U+2019", # noqa 601 # Right single quotation mark # noqa: W605
"‚": "U+201A", # Single low-9 quotation mark
"‛": "U+201B", # Single high-reversed-9 quotation mark
"‹": "U+2039", # Single left-pointing angle quotation mark
"›": "U+203A", # Single right-pointing angle quotation mark
"❛": "U+275B", # Heavy single turned comma quotation mark ornament
"❜": "U+275C", # Heavy single comma quotation mark ornament
"「": "U+300C", # Left corner bracket
"」": "U+300D", # Right corner bracket
"『": "U+300E", # Left white corner bracket
"』": "U+300F", # Right white corner bracket
"﹁": "U+FE41", # PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET
"﹂": "U+FE42", # PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET
"﹃": "U+FE43", # PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET
"﹄": "U+FE44", # PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET
"'": "U+FF07", # FULLWIDTH APOSTROPHE
"「": "U+FF62", # HALFWIDTH LEFT CORNER BRACKET
"」": "U+FF63", # HALFWIDTH RIGHT CORNER BRACKET
}

double_quote_standard = '"'
single_quote_standard = "'"

# Apply double quote replacements
for unicode_val in double_quotes.values():
unicode_char = unicode_to_char(unicode_val)
if unicode_char in text:
text = text.replace(unicode_char, double_quote_standard)

# Apply single quote replacements
for unicode_val in single_quotes.values():
unicode_char = unicode_to_char(unicode_val)
if unicode_char in text:
text = text.replace(unicode_char, single_quote_standard)

return text


def unicode_to_char(unicode_val: str) -> str:
"""
Converts a Unicode value to a character.

Args:
unicode_val (str): The Unicode value to convert.

Returns:
str: The character corresponding to the Unicode value.
"""
return chr(int(unicode_val.replace("U+", ""), 16))
return text.translate(_TRANSLATION_TABLE)
24 changes: 24 additions & 0 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading