From da639859ed6057cbd0eb0d1c55899e01c772e5ed Mon Sep 17 00:00:00 2001
From: "codeflash-ai[bot]"
 <148906541+codeflash-ai[bot]@users.noreply.github.com>
Date: Sat, 24 Jan 2026 08:36:35 +0000
Subject: [PATCH] Optimize calculate_edit_distance
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The optimized code achieves a **150% speedup** (from 5.47ms to 2.19ms) by eliminating redundant dictionary construction and replacing inefficient character-by-character replacements with a pre-computed translation table.

## Key Optimizations

### 1. **Module-level Pre-computation**
The original code reconstructed the `double_quotes` and `single_quotes` dictionaries on *every* call to `standardize_quotes` (217 calls in the profile). This consumed **~23% of runtime** just building dictionaries. The optimized version moves these to module-level constants (`_DOUBLE_QUOTES`, `_SINGLE_QUOTES`), computed once at import time.

### 2. **Translation Table (`str.translate()`)**
The original code used a loop with `unicode_to_char()` conversions and individual `str.replace()` calls for each quote type (~40 iterations per call). The optimized version pre-computes all unicode characters and builds a single translation table (`_QUOTE_TRANSLATION`) using `str.maketrans()`. This allows `str.translate()` to replace all quote characters in a **single pass** through the string, which is implemented in C and far more efficient than Python loops with multiple `replace()` calls.

Line profiler shows `standardize_quotes` dropped from **25.9ms total time** (with ~65% spent in loops and dictionary construction) to just **0.5ms** (single translate call).

### 3. **Faster Validation Check**
Changed `return_as not in return_types` from a list lookup to a tuple literal check `return_as not in ("score", "distance")`. This avoids list construction on every call and uses Python's optimized tuple comparison. The list is now only created in the error path (3 out of 105 calls).

## Impact on Workloads

The `function_references` show `calculate_edit_distance` is called by `calculate_accuracy`, which appears to be a high-level metric function. Given that the test results show **3-10x speedups** on individual calls (e.g., 44μs → 9μs for typical inputs), any workflow processing multiple documents or computing accuracy metrics repeatedly will benefit significantly. The optimization is particularly effective when:

- **Text contains many unicode quotes**: The translation table eliminates the need to check each quote type individually
- **Called in loops**: Module-level constants amortize setup costs across all calls
- **Large documents**: The single-pass `translate()` scales better than multiple `replace()` operations (e.g., 500-char strings show 272% speedup)

Test cases with standard ASCII text show ~380% speedup, while those with unicode quotes show ~330% speedup - demonstrating consistent gains across input types. The optimization maintains correctness while reducing overhead from 94% of runtime to negligible levels.
---
 unstructured/metrics/text_extraction.py | 127 ++++++++++++------------
 1 file changed, 61 insertions(+), 66 deletions(-)

diff --git a/unstructured/metrics/text_extraction.py b/unstructured/metrics/text_extraction.py
index 7153852305..c3af5d7c44 100644
--- a/unstructured/metrics/text_extraction.py
+++ b/unstructured/metrics/text_extraction.py
@@ -4,6 +4,63 @@
 
 from unstructured.cleaners.core import clean_bullets, remove_sentence_punctuation
 
+_DOUBLE_QUOTES = {
+    '"': "U+0022",  # noqa 601 # Standard typewriter/programmer's quote
+    '"': "U+201C",  # noqa 601 # Left double quotation mark
+    '"': "U+201D",  # noqa 601 # Right double quotation mark
+    "„": "U+201E",  # Double low-9 quotation mark
+    "‟": "U+201F",  # Double high-reversed-9 quotation mark
+    "«": "U+00AB",  # Left-pointing double angle quotation mark
+    "»": "U+00BB",  # Right-pointing double angle quotation mark
+    "❝": "U+275D",  # Heavy double turned comma quotation mark ornament
+    "❞": "U+275E",  # Heavy double comma quotation mark ornament
+    "⹂": "U+2E42",  # Double low-reversed-9 quotation mark
+    "🙶": "U+1F676",  # SANS-SERIF HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT
+    "🙷": "U+1F677",  # SANS-SERIF HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT
+    "🙸": "U+1F678",  # SANS-SERIF HEAVY LOW DOUBLE COMMA QUOTATION MARK ORNAMENT
+    "⠦": "U+2826",  # Braille double closing quotation mark
+    "⠴": "U+2834",  # Braille double opening quotation mark
+    "〝": "U+301D",  # REVERSED DOUBLE PRIME QUOTATION MARK
+    "〞": "U+301E",  # DOUBLE PRIME QUOTATION MARK
+    "〟": "U+301F",  # LOW DOUBLE PRIME QUOTATION MARK
+    "＂": "U+FF02",  # FULLWIDTH QUOTATION MARK
+    ",,": "U+275E",  # LOW HEAVY DOUBLE COMMA ORNAMENT
+}
+
+_SINGLE_QUOTES = {
+    "'": "U+0027",  # noqa 601 # Standard typewriter/programmer's quote
+    "'": "U+2018",  # noqa 601 # Left single quotation mark
+    "'": "U+2019",  # noqa 601 # Right single quotation mark # noqa: W605
+    "‚": "U+201A",  # Single low-9 quotation mark
+    "‛": "U+201B",  # Single high-reversed-9 quotation mark
+    "‹": "U+2039",  # Single left-pointing angle quotation mark
+    "›": "U+203A",  # Single right-pointing angle quotation mark
+    "❛": "U+275B",  # Heavy single turned comma quotation mark ornament
+    "❜": "U+275C",  # Heavy single comma quotation mark ornament
+    "「": "U+300C",  # Left corner bracket
+    "」": "U+300D",  # Right corner bracket
+    "『": "U+300E",  # Left white corner bracket
+    "』": "U+300F",  # Right white corner bracket
+    "﹁": "U+FE41",  # PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET
+    "﹂": "U+FE42",  # PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET
+    "﹃": "U+FE43",  # PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET
+    "﹄": "U+FE44",  # PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET
+    "＇": "U+FF07",  # FULLWIDTH APOSTROPHE
+    "｢": "U+FF62",  # HALFWIDTH LEFT CORNER BRACKET
+    "｣": "U+FF63",  # HALFWIDTH RIGHT CORNER BRACKET
+}
+
+_DOUBLE_QUOTE_CHARS = {chr(int(val.replace("U+", ""), 16)) for val in _DOUBLE_QUOTES.values()}
+
+_SINGLE_QUOTE_CHARS = {chr(int(val.replace("U+", ""), 16)) for val in _SINGLE_QUOTES.values()}
+
+_QUOTE_TRANSLATION = str.maketrans(
+    {
+        **{chr(int(val.replace("U+", ""), 16)): '"' for val in _DOUBLE_QUOTES.values()},
+        **{chr(int(val.replace("U+", ""), 16)): "'" for val in _SINGLE_QUOTES.values()},
+    }
+)
+
 
 def calculate_accuracy(
     output: Optional[str],
@@ -54,8 +111,8 @@ def calculate_edit_distance(
         - "distance": Returns the raw edit distance value.
 
     """
-    return_types = ["score", "distance"]
-    if return_as not in return_types:
+    if return_as not in ("score", "distance"):
+        return_types = ["score", "distance"]
         raise ValueError("Invalid return value type. Expected one of: %s" % return_types)
     output = standardize_quotes(prepare_str(output, standardize_whitespaces))
     source = standardize_quotes(prepare_str(source, standardize_whitespaces))
@@ -172,70 +229,8 @@ def standardize_quotes(text: str) -> str:
     Returns:
         str: The text with standardized quotes.
     """
-    # Double Quotes Dictionary
-    double_quotes = {
-        '"': "U+0022",  # noqa 601 # Standard typewriter/programmer's quote
-        '"': "U+201C",  # noqa 601 # Left double quotation mark
-        '"': "U+201D",  # noqa 601 # Right double quotation mark
-        "„": "U+201E",  # Double low-9 quotation mark
-        "‟": "U+201F",  # Double high-reversed-9 quotation mark
-        "«": "U+00AB",  # Left-pointing double angle quotation mark
-        "»": "U+00BB",  # Right-pointing double angle quotation mark
-        "❝": "U+275D",  # Heavy double turned comma quotation mark ornament
-        "❞": "U+275E",  # Heavy double comma quotation mark ornament
-        "⹂": "U+2E42",  # Double low-reversed-9 quotation mark
-        "🙶": "U+1F676",  # SANS-SERIF HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT
-        "🙷": "U+1F677",  # SANS-SERIF HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT
-        "🙸": "U+1F678",  # SANS-SERIF HEAVY LOW DOUBLE COMMA QUOTATION MARK ORNAMENT
-        "⠦": "U+2826",  # Braille double closing quotation mark
-        "⠴": "U+2834",  # Braille double opening quotation mark
-        "〝": "U+301D",  # REVERSED DOUBLE PRIME QUOTATION MARK
-        "〞": "U+301E",  # DOUBLE PRIME QUOTATION MARK
-        "〟": "U+301F",  # LOW DOUBLE PRIME QUOTATION MARK
-        "＂": "U+FF02",  # FULLWIDTH QUOTATION MARK
-        ",,": "U+275E",  # LOW HEAVY DOUBLE COMMA ORNAMENT
-    }
-
-    # Single Quotes Dictionary
-    single_quotes = {
-        "'": "U+0027",  # noqa 601 # Standard typewriter/programmer's quote
-        "'": "U+2018",  # noqa 601 # Left single quotation mark
-        "'": "U+2019",  # noqa 601 # Right single quotation mark # noqa: W605
-        "‚": "U+201A",  # Single low-9 quotation mark
-        "‛": "U+201B",  # Single high-reversed-9 quotation mark
-        "‹": "U+2039",  # Single left-pointing angle quotation mark
-        "›": "U+203A",  # Single right-pointing angle quotation mark
-        "❛": "U+275B",  # Heavy single turned comma quotation mark ornament
-        "❜": "U+275C",  # Heavy single comma quotation mark ornament
-        "「": "U+300C",  # Left corner bracket
-        "」": "U+300D",  # Right corner bracket
-        "『": "U+300E",  # Left white corner bracket
-        "』": "U+300F",  # Right white corner bracket
-        "﹁": "U+FE41",  # PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET
-        "﹂": "U+FE42",  # PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET
-        "﹃": "U+FE43",  # PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET
-        "﹄": "U+FE44",  # PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET
-        "＇": "U+FF07",  # FULLWIDTH APOSTROPHE
-        "｢": "U+FF62",  # HALFWIDTH LEFT CORNER BRACKET
-        "｣": "U+FF63",  # HALFWIDTH RIGHT CORNER BRACKET
-    }
-
-    double_quote_standard = '"'
-    single_quote_standard = "'"
-
-    # Apply double quote replacements
-    for unicode_val in double_quotes.values():
-        unicode_char = unicode_to_char(unicode_val)
-        if unicode_char in text:
-            text = text.replace(unicode_char, double_quote_standard)
-
-    # Apply single quote replacements
-    for unicode_val in single_quotes.values():
-        unicode_char = unicode_to_char(unicode_val)
-        if unicode_char in text:
-            text = text.replace(unicode_char, single_quote_standard)
-
-    return text
+    # Use str.translate() for efficient bulk character replacement
+    return text.translate(_QUOTE_TRANSLATION)
 
 
 def unicode_to_char(unicode_val: str) -> str: