From da639859ed6057cbd0eb0d1c55899e01c772e5ed Mon Sep 17 00:00:00 2001 From: "codeflash-ai[bot]" <148906541+codeflash-ai[bot]@users.noreply.github.com> Date: Sat, 24 Jan 2026 08:36:35 +0000 Subject: [PATCH] Optimize calculate_edit_distance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The optimized code achieves a **150% speedup** (from 5.47ms to 2.19ms) by eliminating redundant dictionary construction and replacing inefficient character-by-character replacements with a pre-computed translation table. ## Key Optimizations ### 1. **Module-level Pre-computation** The original code reconstructed the `double_quotes` and `single_quotes` dictionaries on *every* call to `standardize_quotes` (217 calls in the profile). This consumed **~23% of runtime** just building dictionaries. The optimized version moves these to module-level constants (`_DOUBLE_QUOTES`, `_SINGLE_QUOTES`), computed once at import time. ### 2. **Translation Table (`str.translate()`)** The original code used a loop with `unicode_to_char()` conversions and individual `str.replace()` calls for each quote type (~40 iterations per call). The optimized version pre-computes all unicode characters and builds a single translation table (`_QUOTE_TRANSLATION`) using `str.maketrans()`. This allows `str.translate()` to replace all quote characters in a **single pass** through the string, which is implemented in C and far more efficient than Python loops with multiple `replace()` calls. Line profiler shows `standardize_quotes` dropped from **25.9ms total time** (with ~65% spent in loops and dictionary construction) to just **0.5ms** (single translate call). ### 3. **Faster Validation Check** Changed `return_as not in return_types` from a list lookup to a tuple literal check `return_as not in ("score", "distance")`. This avoids list construction on every call and uses Python's optimized tuple comparison. The list is now only created in the error path (3 out of 105 calls). ## Impact on Workloads The `function_references` show `calculate_edit_distance` is called by `calculate_accuracy`, which appears to be a high-level metric function. Given that the test results show **3-10x speedups** on individual calls (e.g., 44μs → 9μs for typical inputs), any workflow processing multiple documents or computing accuracy metrics repeatedly will benefit significantly. The optimization is particularly effective when: - **Text contains many unicode quotes**: The translation table eliminates the need to check each quote type individually - **Called in loops**: Module-level constants amortize setup costs across all calls - **Large documents**: The single-pass `translate()` scales better than multiple `replace()` operations (e.g., 500-char strings show 272% speedup) Test cases with standard ASCII text show ~380% speedup, while those with unicode quotes show ~330% speedup - demonstrating consistent gains across input types. The optimization maintains correctness while reducing overhead from 94% of runtime to negligible levels. --- unstructured/metrics/text_extraction.py | 127 ++++++++++++------------ 1 file changed, 61 insertions(+), 66 deletions(-) diff --git a/unstructured/metrics/text_extraction.py b/unstructured/metrics/text_extraction.py index 7153852305..c3af5d7c44 100644 --- a/unstructured/metrics/text_extraction.py +++ b/unstructured/metrics/text_extraction.py @@ -4,6 +4,63 @@ from unstructured.cleaners.core import clean_bullets, remove_sentence_punctuation +_DOUBLE_QUOTES = { + '"': "U+0022", # noqa 601 # Standard typewriter/programmer's quote + '"': "U+201C", # noqa 601 # Left double quotation mark + '"': "U+201D", # noqa 601 # Right double quotation mark + "„": "U+201E", # Double low-9 quotation mark + "‟": "U+201F", # Double high-reversed-9 quotation mark + "«": "U+00AB", # Left-pointing double angle quotation mark + "»": "U+00BB", # Right-pointing double angle quotation mark + "❝": "U+275D", # Heavy double turned comma quotation mark ornament + "❞": "U+275E", # Heavy double comma quotation mark ornament + "⹂": "U+2E42", # Double low-reversed-9 quotation mark + "🙶": "U+1F676", # SANS-SERIF HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT + "🙷": "U+1F677", # SANS-SERIF HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT + "🙸": "U+1F678", # SANS-SERIF HEAVY LOW DOUBLE COMMA QUOTATION MARK ORNAMENT + "⠦": "U+2826", # Braille double closing quotation mark + "⠴": "U+2834", # Braille double opening quotation mark + "〝": "U+301D", # REVERSED DOUBLE PRIME QUOTATION MARK + "〞": "U+301E", # DOUBLE PRIME QUOTATION MARK + "〟": "U+301F", # LOW DOUBLE PRIME QUOTATION MARK + """: "U+FF02", # FULLWIDTH QUOTATION MARK + ",,": "U+275E", # LOW HEAVY DOUBLE COMMA ORNAMENT +} + +_SINGLE_QUOTES = { + "'": "U+0027", # noqa 601 # Standard typewriter/programmer's quote + "'": "U+2018", # noqa 601 # Left single quotation mark + "'": "U+2019", # noqa 601 # Right single quotation mark # noqa: W605 + "‚": "U+201A", # Single low-9 quotation mark + "‛": "U+201B", # Single high-reversed-9 quotation mark + "‹": "U+2039", # Single left-pointing angle quotation mark + "›": "U+203A", # Single right-pointing angle quotation mark + "❛": "U+275B", # Heavy single turned comma quotation mark ornament + "❜": "U+275C", # Heavy single comma quotation mark ornament + "「": "U+300C", # Left corner bracket + "」": "U+300D", # Right corner bracket + "『": "U+300E", # Left white corner bracket + "』": "U+300F", # Right white corner bracket + "﹁": "U+FE41", # PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET + "﹂": "U+FE42", # PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET + "﹃": "U+FE43", # PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET + "﹄": "U+FE44", # PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET + "'": "U+FF07", # FULLWIDTH APOSTROPHE + "「": "U+FF62", # HALFWIDTH LEFT CORNER BRACKET + "」": "U+FF63", # HALFWIDTH RIGHT CORNER BRACKET +} + +_DOUBLE_QUOTE_CHARS = {chr(int(val.replace("U+", ""), 16)) for val in _DOUBLE_QUOTES.values()} + +_SINGLE_QUOTE_CHARS = {chr(int(val.replace("U+", ""), 16)) for val in _SINGLE_QUOTES.values()} + +_QUOTE_TRANSLATION = str.maketrans( + { + **{chr(int(val.replace("U+", ""), 16)): '"' for val in _DOUBLE_QUOTES.values()}, + **{chr(int(val.replace("U+", ""), 16)): "'" for val in _SINGLE_QUOTES.values()}, + } +) + def calculate_accuracy( output: Optional[str], @@ -54,8 +111,8 @@ def calculate_edit_distance( - "distance": Returns the raw edit distance value. """ - return_types = ["score", "distance"] - if return_as not in return_types: + if return_as not in ("score", "distance"): + return_types = ["score", "distance"] raise ValueError("Invalid return value type. Expected one of: %s" % return_types) output = standardize_quotes(prepare_str(output, standardize_whitespaces)) source = standardize_quotes(prepare_str(source, standardize_whitespaces)) @@ -172,70 +229,8 @@ def standardize_quotes(text: str) -> str: Returns: str: The text with standardized quotes. """ - # Double Quotes Dictionary - double_quotes = { - '"': "U+0022", # noqa 601 # Standard typewriter/programmer's quote - '"': "U+201C", # noqa 601 # Left double quotation mark - '"': "U+201D", # noqa 601 # Right double quotation mark - "„": "U+201E", # Double low-9 quotation mark - "‟": "U+201F", # Double high-reversed-9 quotation mark - "«": "U+00AB", # Left-pointing double angle quotation mark - "»": "U+00BB", # Right-pointing double angle quotation mark - "❝": "U+275D", # Heavy double turned comma quotation mark ornament - "❞": "U+275E", # Heavy double comma quotation mark ornament - "⹂": "U+2E42", # Double low-reversed-9 quotation mark - "🙶": "U+1F676", # SANS-SERIF HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT - "🙷": "U+1F677", # SANS-SERIF HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT - "🙸": "U+1F678", # SANS-SERIF HEAVY LOW DOUBLE COMMA QUOTATION MARK ORNAMENT - "⠦": "U+2826", # Braille double closing quotation mark - "⠴": "U+2834", # Braille double opening quotation mark - "〝": "U+301D", # REVERSED DOUBLE PRIME QUOTATION MARK - "〞": "U+301E", # DOUBLE PRIME QUOTATION MARK - "〟": "U+301F", # LOW DOUBLE PRIME QUOTATION MARK - """: "U+FF02", # FULLWIDTH QUOTATION MARK - ",,": "U+275E", # LOW HEAVY DOUBLE COMMA ORNAMENT - } - - # Single Quotes Dictionary - single_quotes = { - "'": "U+0027", # noqa 601 # Standard typewriter/programmer's quote - "'": "U+2018", # noqa 601 # Left single quotation mark - "'": "U+2019", # noqa 601 # Right single quotation mark # noqa: W605 - "‚": "U+201A", # Single low-9 quotation mark - "‛": "U+201B", # Single high-reversed-9 quotation mark - "‹": "U+2039", # Single left-pointing angle quotation mark - "›": "U+203A", # Single right-pointing angle quotation mark - "❛": "U+275B", # Heavy single turned comma quotation mark ornament - "❜": "U+275C", # Heavy single comma quotation mark ornament - "「": "U+300C", # Left corner bracket - "」": "U+300D", # Right corner bracket - "『": "U+300E", # Left white corner bracket - "』": "U+300F", # Right white corner bracket - "﹁": "U+FE41", # PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET - "﹂": "U+FE42", # PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET - "﹃": "U+FE43", # PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET - "﹄": "U+FE44", # PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET - "'": "U+FF07", # FULLWIDTH APOSTROPHE - "「": "U+FF62", # HALFWIDTH LEFT CORNER BRACKET - "」": "U+FF63", # HALFWIDTH RIGHT CORNER BRACKET - } - - double_quote_standard = '"' - single_quote_standard = "'" - - # Apply double quote replacements - for unicode_val in double_quotes.values(): - unicode_char = unicode_to_char(unicode_val) - if unicode_char in text: - text = text.replace(unicode_char, double_quote_standard) - - # Apply single quote replacements - for unicode_val in single_quotes.values(): - unicode_char = unicode_to_char(unicode_val) - if unicode_char in text: - text = text.replace(unicode_char, single_quote_standard) - - return text + # Use str.translate() for efficient bulk character replacement + return text.translate(_QUOTE_TRANSLATION) def unicode_to_char(unicode_val: str) -> str: