Unstructured-IO · cragwolfe · Apr 3, 2026 · Jan 19, 2026 · Jan 19, 2026 · Apr 1, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,8 @@
+## 0.22.13
+
+### Enhancements
+- **Speed up `standardize_quotes`**: Replace loop-based character replacement with a single `str.translate()` call using a pre-computed translation table. Also fixes a pre-existing bug where left smart quotes were never normalized due to duplicate dictionary keys.
+
 ## 0.22.12
 
 ### Fixes

diff --git a/pyproject.toml b/pyproject.toml
@@ -175,6 +175,7 @@ test = [
     "types-tabulate>=0.9.0.20241207, <1.0.0",
     "unstructured-pytesseract>=0.3.15, <1.0.0",
     "weaviate-client>=4.20.1, <5.0.0",
+    "pytest-benchmark>=5.2.3, <6.0.0",
 ]
 dev = [
     "pre-commit>=4.5.1, <5.0.0",
@@ -265,3 +266,4 @@ tests-root = "test_unstructured"
 test-framework = "pytest"
 ignore-paths = []
 formatter-cmds = ["ruff check --exit-zero --fix-only $file", "ruff format $file"]
+benchmarks-root = "test_unstructured/benchmarks"
diff --git a/test_unstructured/benchmarks/__init__.py b/test_unstructured/benchmarks/__init__.py
diff --git a/test_unstructured/benchmarks/test_benchmark_standardize_quotes.py b/test_unstructured/benchmarks/test_benchmark_standardize_quotes.py
@@ -0,0 +1,26 @@
+from unstructured.metrics.text_extraction import standardize_quotes
+
+# Benchmark corpus: sentences mixing ASCII quotes with typographic quote and
+# corner-bracket characters, written as escapes so each codepoint stays explicit.
+SAMPLE_TEXTS = [
+    "She said \u201cHello\u201d and then whispered \u2018Goodbye\u2019 before leaving.",
+    "\u201eTo be, or not to be, that is the question\u201d - Shakespeare\u2019s famous quote.",
+    "\u00abWhen he said \u201clife is beautiful,\u201d I believed him\u00bb wrote Maria.",
+    "\u275dDo you remember when we first met?\u275e she asked with a smile.",
+    "\u301dThe meeting starts at 10:00, don\u2019t be late!\u301f announced the manager.",
+    '\u300cHe told me "This is important" yesterday\u300d, she explained.',
+    "\u300eThe sun was setting. The birds were singing. It was peaceful.\u300f",
+    "\ufe42Meeting #123 @ 15:00 - Don\u2019t forget!\ufe41",
+    "\u300cHello\u300d, \u275dWorld\u275e, \"Test\", 'Example', \u201eQuote\u201d, \u00abFinal\u00bb",  # noqa: E501
+    "It\u2019s John\u2019s book, isn\u2019t it?",
+    '\u2039Testing the system\u2019s capability for "quoted" text\u203a',
+    "\u275bFirst sentence. Second sentence. Third sentence.\u275c",
+    "\u300cChapter 1\u300d: \u275dThe Beginning\u275e - \u201eA new story\u201d begins \u00abtoday\u00bb.",  # noqa: E501
+]
+
+
+def run_standardize_quotes():
+    """Call ``standardize_quotes`` once on every entry of ``SAMPLE_TEXTS``.
+
+    Return values are discarded; this function only serves as the callable
+    that is handed to the benchmark fixture.
+    """
+    for text in SAMPLE_TEXTS:
+        standardize_quotes(text)
+
+
+def test_benchmark_standardize_quotes(benchmark):
+    """Time one full pass over ``SAMPLE_TEXTS`` through the ``benchmark`` fixture.
+
+    No output is asserted here; this test only measures ``run_standardize_quotes``.
+    NOTE(review): ``benchmark`` is presumably the pytest-benchmark fixture - confirm.
+    """
+    benchmark(run_standardize_quotes)
diff --git a/test_unstructured/metrics/test_text_extraction.py b/test_unstructured/metrics/test_text_extraction.py
@@ -402,12 +402,44 @@ def test_prepare_string(text, expected):
             '「Chapter 1」: ❝The Beginning❞ - „A new story" begins «today».',
             '\'Chapter 1\': "The Beginning" - "A new story" begins "today".',
         ),
+        # --- Regression: U+201C / U+2018 were silently dropped by duplicate dict keys ---
+        # U+201C left double quotation mark (isolated)
+        ("\u201c", '"'),
+        # U+2018 left single quotation mark (isolated)
+        ("\u2018", "'"),
+        # Left + right double smart quotes wrapping a word
+        ("\u201cHello\u201d", '"Hello"'),
+        # Left + right single smart quotes wrapping a word
+        ("\u2018world\u2019", "'world'"),
+        # Mixed left/right smart quotes in a sentence
+        (
+            "She said \u201cHello\u201d and then whispered \u2018Goodbye\u2019",
+            "She said \"Hello\" and then whispered 'Goodbye'",
+        ),
+        # Leading apostrophe of an elision ('tis) typed as a left single smart quote
+        ("\u2018tis the season", "'tis the season"),
     ],
 )
 def test_standardize_quotes(input_text, expected_output):
     assert text_extraction.standardize_quotes(input_text) == expected_output
 
 
+# One generated case per codepoint; case ids look like "U+201C->double".
+@pytest.mark.parametrize(
+    ("codepoint", "expected"),
+    [
+        pytest.param(cp, '"', id=f"U+{ord(cp):04X}->double")
+        for cp in text_extraction._DOUBLE_QUOTE_CODEPOINTS
+    ]
+    + [
+        pytest.param(cp, "'", id=f"U+{ord(cp):04X}->single")
+        for cp in text_extraction._SINGLE_QUOTE_CODEPOINTS
+    ],
+)
+def test_standardize_quotes_every_codepoint(codepoint, expected):
+    """Every codepoint in the translation table must map to its ASCII equivalent.
+
+    Cases are generated from ``_DOUBLE_QUOTE_CODEPOINTS`` and
+    ``_SINGLE_QUOTE_CODEPOINTS`` themselves, so an entry added to either tuple
+    is covered automatically. The ASCII quotes are members of those tuples, so
+    they are checked as mapping to themselves.
+    """
+    assert text_extraction.standardize_quotes(codepoint) == expected
+
+
 @pytest.mark.parametrize(
     ("output_text", "source_text", "expected_percentage"),
     [

diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.22.12"  # pragma: no cover
+__version__ = "0.22.13"  # pragma: no cover
diff --git a/unstructured/metrics/text_extraction.py b/unstructured/metrics/text_extraction.py
@@ -4,6 +4,55 @@
 
 from unstructured.cleaners.core import clean_bullets, remove_sentence_punctuation
 
+# Characters that the module's translation table maps to the ASCII double quote (").
+# The ASCII quote itself is listed as well, so it maps to itself.
+_DOUBLE_QUOTE_CODEPOINTS = (
+    "\u0022",  # U+0022 Standard typewriter/programmer's quote
+    "\u201c",  # U+201C Left double quotation mark
+    "\u201d",  # U+201D Right double quotation mark
+    "\u201e",  # U+201E Double low-9 quotation mark
+    "\u201f",  # U+201F Double high-reversed-9 quotation mark
+    "\u00ab",  # U+00AB Left-pointing double angle quotation mark
+    "\u00bb",  # U+00BB Right-pointing double angle quotation mark
+    "\u275d",  # U+275D Heavy double turned comma quotation mark ornament
+    "\u275e",  # U+275E Heavy double comma quotation mark ornament
+    "\u2e42",  # U+2E42 Double low-reversed-9 quotation mark
+    "\U0001f676",  # U+1F676 SANS-SERIF HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT
+    "\U0001f677",  # U+1F677 SANS-SERIF HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT
+    "\U0001f678",  # U+1F678 SANS-SERIF HEAVY LOW DOUBLE COMMA QUOTATION MARK ORNAMENT
+    "\u2826",  # U+2826 Braille double closing quotation mark
+    "\u2834",  # U+2834 Braille double opening quotation mark
+    "\u301d",  # U+301D REVERSED DOUBLE PRIME QUOTATION MARK
+    "\u301e",  # U+301E DOUBLE PRIME QUOTATION MARK
+    "\u301f",  # U+301F LOW DOUBLE PRIME QUOTATION MARK
+    "\uff02",  # U+FF02 FULLWIDTH QUOTATION MARK
+)
+
+# Characters that the module's translation table maps to the ASCII single quote (').
+# The ASCII apostrophe itself is listed as well, so it maps to itself.
+_SINGLE_QUOTE_CODEPOINTS = (
+    "\u0027",  # U+0027 Standard typewriter/programmer's quote
+    "\u2018",  # U+2018 Left single quotation mark
+    "\u2019",  # U+2019 Right single quotation mark
+    "\u201a",  # U+201A Single low-9 quotation mark
+    "\u201b",  # U+201B Single high-reversed-9 quotation mark
+    "\u2039",  # U+2039 Single left-pointing angle quotation mark
+    "\u203a",  # U+203A Single right-pointing angle quotation mark
+    "\u275b",  # U+275B Heavy single turned comma quotation mark ornament
+    "\u275c",  # U+275C Heavy single comma quotation mark ornament
+    "\u300c",  # U+300C Left corner bracket
+    "\u300d",  # U+300D Right corner bracket
+    "\u300e",  # U+300E Left white corner bracket
+    "\u300f",  # U+300F Right white corner bracket
+    "\ufe41",  # U+FE41 PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET
+    "\ufe42",  # U+FE42 PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET
+    "\ufe43",  # U+FE43 PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET
+    "\ufe44",  # U+FE44 PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET
+    "\uff07",  # U+FF07 FULLWIDTH APOSTROPHE
+    "\uff62",  # U+FF62 HALFWIDTH LEFT CORNER BRACKET
+    "\uff63",  # U+FF63 HALFWIDTH RIGHT CORNER BRACKET
+)
+
+# Built once at import time so callers can normalize quotes with a single
+# str.translate() pass. The single-quote mapping is the right-hand operand of
+# `|`, so it would take precedence if a character were ever listed in both tuples.
+_TRANSLATION_TABLE = str.maketrans(
+    dict.fromkeys(_DOUBLE_QUOTE_CODEPOINTS, '"') | dict.fromkeys(_SINGLE_QUOTE_CODEPOINTS, "'")
+)
+
 
 def calculate_accuracy(
     output: Optional[str],
@@ -172,80 +221,4 @@ def standardize_quotes(text: str) -> str:
     Returns:
         str: The text with standardized quotes.
     """
-    # Double Quotes Dictionary
-    double_quotes = {
-        '"': "U+0022",  # noqa 601 # Standard typewriter/programmer's quote
-        '"': "U+201C",  # noqa 601 # Left double quotation mark
-        '"': "U+201D",  # noqa 601 # Right double quotation mark
-        "„": "U+201E",  # Double low-9 quotation mark
-        "‟": "U+201F",  # Double high-reversed-9 quotation mark
-        "«": "U+00AB",  # Left-pointing double angle quotation mark
-        "»": "U+00BB",  # Right-pointing double angle quotation mark
-        "❝": "U+275D",  # Heavy double turned comma quotation mark ornament
-        "❞": "U+275E",  # Heavy double comma quotation mark ornament
-        "⹂": "U+2E42",  # Double low-reversed-9 quotation mark
-        "🙶": "U+1F676",  # SANS-SERIF HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT
-        "🙷": "U+1F677",  # SANS-SERIF HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT
-        "🙸": "U+1F678",  # SANS-SERIF HEAVY LOW DOUBLE COMMA QUOTATION MARK ORNAMENT
-        "⠦": "U+2826",  # Braille double closing quotation mark
-        "⠴": "U+2834",  # Braille double opening quotation mark
-        "〝": "U+301D",  # REVERSED DOUBLE PRIME QUOTATION MARK
-        "〞": "U+301E",  # DOUBLE PRIME QUOTATION MARK
-        "〟": "U+301F",  # LOW DOUBLE PRIME QUOTATION MARK
-        "＂": "U+FF02",  # FULLWIDTH QUOTATION MARK
-        ",,": "U+275E",  # LOW HEAVY DOUBLE COMMA ORNAMENT
-    }
-
-    # Single Quotes Dictionary
-    single_quotes = {
-        "'": "U+0027",  # noqa 601 # Standard typewriter/programmer's quote
-        "'": "U+2018",  # noqa 601 # Left single quotation mark
-        "'": "U+2019",  # noqa 601 # Right single quotation mark # noqa: W605
-        "‚": "U+201A",  # Single low-9 quotation mark
-        "‛": "U+201B",  # Single high-reversed-9 quotation mark
-        "‹": "U+2039",  # Single left-pointing angle quotation mark
-        "›": "U+203A",  # Single right-pointing angle quotation mark
-        "❛": "U+275B",  # Heavy single turned comma quotation mark ornament
-        "❜": "U+275C",  # Heavy single comma quotation mark ornament
-        "「": "U+300C",  # Left corner bracket
-        "」": "U+300D",  # Right corner bracket
-        "『": "U+300E",  # Left white corner bracket
-        "』": "U+300F",  # Right white corner bracket
-        "﹁": "U+FE41",  # PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET
-        "﹂": "U+FE42",  # PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET
-        "﹃": "U+FE43",  # PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET
-        "﹄": "U+FE44",  # PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET
-        "＇": "U+FF07",  # FULLWIDTH APOSTROPHE
-        "｢": "U+FF62",  # HALFWIDTH LEFT CORNER BRACKET
-        "｣": "U+FF63",  # HALFWIDTH RIGHT CORNER BRACKET
-    }
-
-    double_quote_standard = '"'
-    single_quote_standard = "'"
-
-    # Apply double quote replacements
-    for unicode_val in double_quotes.values():
-        unicode_char = unicode_to_char(unicode_val)
-        if unicode_char in text:
-            text = text.replace(unicode_char, double_quote_standard)
-
-    # Apply single quote replacements
-    for unicode_val in single_quotes.values():
-        unicode_char = unicode_to_char(unicode_val)
-        if unicode_char in text:
-            text = text.replace(unicode_char, single_quote_standard)
-
-    return text
-
-
-def unicode_to_char(unicode_val: str) -> str:
-    """
-    Converts a Unicode value to a character.
-
-    Args:
-        unicode_val (str): The Unicode value to convert.
-
-    Returns:
-        str: The character corresponding to the Unicode value.
-    """
-    return chr(int(unicode_val.replace("U+", ""), 16))
+    return text.translate(_TRANSLATION_TABLE)
diff --git a/uv.lock b/uv.lock
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		__version__ = "0.22.12" # pragma: no cover
		__version__ = "0.22.13" # pragma: no cover