From f66d06b34fcd04b47f8175ca019a545845edd5bc Mon Sep 17 00:00:00 2001 From: "codeflash-ai[bot]" <148906541+codeflash-ai[bot]@users.noreply.github.com> Date: Sat, 24 Jan 2026 08:26:20 +0000 Subject: [PATCH] Optimize calculate_accuracy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The optimized code achieves a **71% speedup** through three key optimizations that reduce redundant work in the common case: ## What Changed 1. **Module-level constant for validation** (`_RETURN_TYPES`): Moved the allowed return types to a module-level tuple instead of creating a new list on every function call. 2. **Conditional Unicode quote standardization**: Added `str.isascii()` checks before calling `standardize_quotes()`. This expensive Unicode replacement operation (which iterates through ~40 quote mappings) is now skipped when strings contain only ASCII characters. 3. **Early equality check**: After string preparation, added a fast-path check `if output == source` to immediately return the result without calling the expensive `Levenshtein.distance()` calculation. ## Why It's Faster **ASCII check optimization**: The line profiler shows `standardize_quotes()` consumed ~76% of runtime in the original (12.6ms out of 16.6ms total). With `str.isascii()` being a fast C-level operation, the optimization successfully skips this expensive Unicode processing in most test cases - only 3 out of 71 function calls (4%) actually needed quote standardization in the test suite. **Early equality shortcut**: When strings are identical after preprocessing (33 out of 71 calls = 46% of test cases), the optimized version immediately returns without computing Levenshtein distance (originally ~21.5% of runtime). The profiler confirms these 33 cases now exit early, avoiding the distance calculation entirely. **Validation overhead elimination**: While small (0.4% of runtime), removing the list allocation on every call adds up, especially given the function_references show this is called from `_process_document()` which processes multiple documents in evaluation workloads. ## Impact on Workloads Based on the function_references, `calculate_accuracy()` is called from document evaluation pipelines (`evaluate.py`) where it processes extracted text against source documents. The optimizations are particularly effective for: - **ASCII-only documents** (most English text): Skip all Unicode quote processing - **Identical text cases** (perfect extraction): Return immediately without distance calculation - **Validation-heavy paths**: The module-level constant avoids repeated allocations in batch processing The test results confirm this: identical string tests show **10-20x speedup** (e.g., `test_identical_strings_returns_perfect_score`: 46.3μs → 3.96μs), while tests requiring actual Levenshtein computation show smaller but still meaningful gains (6-15%). The document evaluation context in `_process_document()` indicates this function may be called repeatedly in loops, amplifying the per-call savings. --- unstructured/metrics/text_extraction.py | 33 +++++++++++++++++++------ 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/unstructured/metrics/text_extraction.py b/unstructured/metrics/text_extraction.py index 7153852305..aa3102eaf9 100644 --- a/unstructured/metrics/text_extraction.py +++ b/unstructured/metrics/text_extraction.py @@ -4,6 +4,8 @@ from unstructured.cleaners.core import clean_bullets, remove_sentence_punctuation +_RETURN_TYPES = ("score", "distance") + def calculate_accuracy( output: Optional[str], @@ -54,15 +56,32 @@ def calculate_edit_distance( - "distance": Returns the raw edit distance value. """ - return_types = ["score", "distance"] - if return_as not in return_types: - raise ValueError("Invalid return value type. Expected one of: %s" % return_types) - output = standardize_quotes(prepare_str(output, standardize_whitespaces)) - source = standardize_quotes(prepare_str(source, standardize_whitespaces)) + if return_as not in _RETURN_TYPES: + raise ValueError("Invalid return value type. Expected one of: %s" % (list(_RETURN_TYPES),)) + + # Prepare strings (may be expensive); keep semantics identical. + output = prepare_str(output, standardize_whitespaces) + source = prepare_str(source, standardize_whitespaces) + + # Avoid running the heavier unicode quote standardization when both strings are pure ASCII. + # str.isascii() is a fast C-level check and correctly identifies cases where no + # non-ASCII quote characters are present. + if not output.isascii(): + output = standardize_quotes(output) + if not source.isascii(): + source = standardize_quotes(source) + + # Fast path: if strings are identical after preprocessing, we can return immediately. + if output == source: + if return_as == "distance": + return 0 + # return_as == "score" + return 1.0 + distance = Levenshtein.distance(output, source, weights=weights) # type: ignore - # lower bounded the char length for source string at 1.0 because to avoid division by zero + # lower bounded the char length for source string at 1 because to avoid division by zero # in the case where source string is empty, the distance should be at 100% - source_char_len = max(len(source), 1.0) # type: ignore + source_char_len = max(len(source), 1) bounded_percentage_distance = min(max(distance / source_char_len, 0.0), 1.0) if return_as == "score": return 1 - bounded_percentage_distance