diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5fa1caca22..360af8ad4b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,8 @@
+## 0.22.13
+
+### Enhancements
+- **Speed up `standardize_quotes`**: Replace loop-based character replacement with a single `str.translate()` call using a pre-computed translation table. Also fixes a pre-existing bug where left smart quotes were never normalized due to duplicate dictionary keys. Removes the `unicode_to_char` helper that the old implementation used.
+
 ## 0.22.12
 
 ### Fixes
diff --git a/pyproject.toml b/pyproject.toml
index c625c3f62e..96239205fc 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -175,6 +175,7 @@ test = [
     "types-tabulate>=0.9.0.20241207, <1.0.0",
     "unstructured-pytesseract>=0.3.15, <1.0.0",
     "weaviate-client>=4.20.1, <5.0.0",
+    "pytest-benchmark>=5.2.3, <6.0.0",
 ]
 dev = [
     "pre-commit>=4.5.1, <5.0.0",
@@ -265,3 +266,4 @@ tests-root = "test_unstructured"
 test-framework = "pytest"
 ignore-paths = []
 formatter-cmds = ["ruff check --exit-zero --fix-only $file", "ruff format $file"]
+benchmarks-root = "test_unstructured/benchmarks"
diff --git a/test_unstructured/benchmarks/__init__.py b/test_unstructured/benchmarks/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/test_unstructured/benchmarks/test_benchmark_standardize_quotes.py b/test_unstructured/benchmarks/test_benchmark_standardize_quotes.py
new file mode 100644
index 0000000000..3c33db220e
--- /dev/null
+++ b/test_unstructured/benchmarks/test_benchmark_standardize_quotes.py
@@ -0,0 +1,26 @@
+from unstructured.metrics.text_extraction import standardize_quotes
+
+SAMPLE_TEXTS = [
+    "She said \u201cHello\u201d and then whispered \u2018Goodbye\u2019 before leaving.",
+    "\u201eTo be, or not to be, that is the question\u201d - Shakespeare\u2019s famous quote.",
+    "\u00abWhen he said \u201clife is beautiful,\u201d I believed him\u00bb wrote Maria.",
+    "\u275dDo you remember when we first met?\u275e she asked with a smile.",
+    "\u301dThe meeting starts at 10:00, don\u2019t be late!\u301f announced the manager.",
+    '\u300cHe told me "This is important" yesterday\u300d, she explained.',
+    "\u300eThe sun was setting. The birds were singing. It was peaceful.\u300f",
+    "\ufe42Meeting #123 @ 15:00 - Don\u2019t forget!\ufe41",
+    "\u300cHello\u300d, \u275dWorld\u275e, \"Test\", 'Example', \u201eQuote\u201d, \u00abFinal\u00bb",  # noqa: E501
+    "It\u2019s John\u2019s book, isn\u2019t it?",
+    '\u2039Testing the system\u2019s capability for "quoted" text\u203a',
+    "\u275bFirst sentence. Second sentence. Third sentence.\u275c",
+    "\u300cChapter 1\u300d: \u275dThe Beginning\u275e - \u201eA new story\u201d begins \u00abtoday\u00bb.",  # noqa: E501
+]
+
+
+def run_standardize_quotes():
+    for text in SAMPLE_TEXTS:
+        standardize_quotes(text)
+
+
+def test_benchmark_standardize_quotes(benchmark):
+    benchmark(run_standardize_quotes)
diff --git a/test_unstructured/metrics/test_text_extraction.py b/test_unstructured/metrics/test_text_extraction.py
index 18cf7f97c3..c965d4e764 100644
--- a/test_unstructured/metrics/test_text_extraction.py
+++ b/test_unstructured/metrics/test_text_extraction.py
@@ -402,12 +402,44 @@ def test_prepare_string(text, expected):
             '「Chapter 1」: ❝The Beginning❞ - „A new story" begins «today».',
             '\'Chapter 1\': "The Beginning" - "A new story" begins "today".',
         ),
+        # --- Regression: U+201C / U+2018 were silently dropped by duplicate dict keys ---
+        # U+201C left double quotation mark (isolated)
+        ("\u201c", '"'),
+        # U+2018 left single quotation mark (isolated)
+        ("\u2018", "'"),
+        # Left + right double smart quotes wrapping a word
+        ("\u201cHello\u201d", '"Hello"'),
+        # Left + right single smart quotes wrapping a word
+        ("\u2018world\u2019", "'world'"),
+        # Mixed left/right smart quotes in a sentence
+        (
+            "She said \u201cHello\u201d and then whispered \u2018Goodbye\u2019",
+            "She said \"Hello\" and then whispered 'Goodbye'",
+        ),
+        # Possessive with left single smart quote
+        ("\u2018tis the season", "'tis the season"),
     ],
 )
 def test_standardize_quotes(input_text, expected_output):
     assert text_extraction.standardize_quotes(input_text) == expected_output
 
 
+@pytest.mark.parametrize(
+    ("codepoint", "expected"),
+    [
+        pytest.param(cp, '"', id=f"U+{ord(cp):04X}->double")
+        for cp in text_extraction._DOUBLE_QUOTE_CODEPOINTS
+    ]
+    + [
+        pytest.param(cp, "'", id=f"U+{ord(cp):04X}->single")
+        for cp in text_extraction._SINGLE_QUOTE_CODEPOINTS
+    ],
+)
+def test_standardize_quotes_every_codepoint(codepoint, expected):
+    """Every codepoint in the translation table must map to its ASCII equivalent."""
+    assert text_extraction.standardize_quotes(codepoint) == expected
+
+
 @pytest.mark.parametrize(
     ("output_text", "source_text", "expected_percentage"),
     [
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index 02903488b7..733ab138e3 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.22.12"  # pragma: no cover
+__version__ = "0.22.13"  # pragma: no cover
diff --git a/unstructured/metrics/text_extraction.py b/unstructured/metrics/text_extraction.py
index 7153852305..a6a29dfdef 100644
--- a/unstructured/metrics/text_extraction.py
+++ b/unstructured/metrics/text_extraction.py
@@ -4,6 +4,55 @@
 
 from unstructured.cleaners.core import clean_bullets, remove_sentence_punctuation
 
+_DOUBLE_QUOTE_CODEPOINTS = (
+    "\u0022",  # U+0022 Standard typewriter/programmer's quote
+    "\u201c",  # U+201C Left double quotation mark
+    "\u201d",  # U+201D Right double quotation mark
+    "\u201e",  # U+201E Double low-9 quotation mark
+    "\u201f",  # U+201F Double high-reversed-9 quotation mark
+    "\u00ab",  # U+00AB Left-pointing double angle quotation mark
+    "\u00bb",  # U+00BB Right-pointing double angle quotation mark
+    "\u275d",  # U+275D Heavy double turned comma quotation mark ornament
+    "\u275e",  # U+275E Heavy double comma quotation mark ornament
+    "\u2e42",  # U+2E42 Double low-reversed-9 quotation mark
+    "\U0001f676",  # U+1F676 SANS-SERIF HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT
+    "\U0001f677",  # U+1F677 SANS-SERIF HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT
+    "\U0001f678",  # U+1F678 SANS-SERIF HEAVY LOW DOUBLE COMMA QUOTATION MARK ORNAMENT
+    "\u2826",  # U+2826 Braille double closing quotation mark
+    "\u2834",  # U+2834 Braille double opening quotation mark
+    "\u301d",  # U+301D REVERSED DOUBLE PRIME QUOTATION MARK
+    "\u301e",  # U+301E DOUBLE PRIME QUOTATION MARK
+    "\u301f",  # U+301F LOW DOUBLE PRIME QUOTATION MARK
+    "\uff02",  # U+FF02 FULLWIDTH QUOTATION MARK
+)
+
+_SINGLE_QUOTE_CODEPOINTS = (
+    "\u0027",  # U+0027 Standard typewriter/programmer's quote
+    "\u2018",  # U+2018 Left single quotation mark
+    "\u2019",  # U+2019 Right single quotation mark
+    "\u201a",  # U+201A Single low-9 quotation mark
+    "\u201b",  # U+201B Single high-reversed-9 quotation mark
+    "\u2039",  # U+2039 Single left-pointing angle quotation mark
+    "\u203a",  # U+203A Single right-pointing angle quotation mark
+    "\u275b",  # U+275B Heavy single turned comma quotation mark ornament
+    "\u275c",  # U+275C Heavy single comma quotation mark ornament
+    "\u300c",  # U+300C Left corner bracket
+    "\u300d",  # U+300D Right corner bracket
+    "\u300e",  # U+300E Left white corner bracket
+    "\u300f",  # U+300F Right white corner bracket
+    "\ufe41",  # U+FE41 PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET
+    "\ufe42",  # U+FE42 PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET
+    "\ufe43",  # U+FE43 PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET
+    "\ufe44",  # U+FE44 PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET
+    "\uff07",  # U+FF07 FULLWIDTH APOSTROPHE
+    "\uff62",  # U+FF62 HALFWIDTH LEFT CORNER BRACKET
+    "\uff63",  # U+FF63 HALFWIDTH RIGHT CORNER BRACKET
+)
+
+_TRANSLATION_TABLE = str.maketrans(
+    dict.fromkeys(_DOUBLE_QUOTE_CODEPOINTS, '"') | dict.fromkeys(_SINGLE_QUOTE_CODEPOINTS, "'")
+)
+
 
 def calculate_accuracy(
     output: Optional[str],
@@ -172,80 +221,4 @@ def standardize_quotes(text: str) -> str:
     Returns:
         str: The text with standardized quotes.
     """
-    # Double Quotes Dictionary
-    double_quotes = {
-        '"': "U+0022",  # noqa 601 # Standard typewriter/programmer's quote
-        '"': "U+201C",  # noqa 601 # Left double quotation mark
-        '"': "U+201D",  # noqa 601 # Right double quotation mark
-        "„": "U+201E",  # Double low-9 quotation mark
-        "‟": "U+201F",  # Double high-reversed-9 quotation mark
-        "«": "U+00AB",  # Left-pointing double angle quotation mark
-        "»": "U+00BB",  # Right-pointing double angle quotation mark
-        "❝": "U+275D",  # Heavy double turned comma quotation mark ornament
-        "❞": "U+275E",  # Heavy double comma quotation mark ornament
-        "⹂": "U+2E42",  # Double low-reversed-9 quotation mark
-        "🙶": "U+1F676",  # SANS-SERIF HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT
-        "🙷": "U+1F677",  # SANS-SERIF HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT
-        "🙸": "U+1F678",  # SANS-SERIF HEAVY LOW DOUBLE COMMA QUOTATION MARK ORNAMENT
-        "⠦": "U+2826",  # Braille double closing quotation mark
-        "⠴": "U+2834",  # Braille double opening quotation mark
-        "〝": "U+301D",  # REVERSED DOUBLE PRIME QUOTATION MARK
-        "〞": "U+301E",  # DOUBLE PRIME QUOTATION MARK
-        "〟": "U+301F",  # LOW DOUBLE PRIME QUOTATION MARK
-        """: "U+FF02",  # FULLWIDTH QUOTATION MARK
-        ",,": "U+275E",  # LOW HEAVY DOUBLE COMMA ORNAMENT
-    }
-
-    # Single Quotes Dictionary
-    single_quotes = {
-        "'": "U+0027",  # noqa 601 # Standard typewriter/programmer's quote
-        "'": "U+2018",  # noqa 601 # Left single quotation mark
-        "'": "U+2019",  # noqa 601 # Right single quotation mark # noqa: W605
-        "‚": "U+201A",  # Single low-9 quotation mark
-        "‛": "U+201B",  # Single high-reversed-9 quotation mark
-        "‹": "U+2039",  # Single left-pointing angle quotation mark
-        "›": "U+203A",  # Single right-pointing angle quotation mark
-        "❛": "U+275B",  # Heavy single turned comma quotation mark ornament
-        "❜": "U+275C",  # Heavy single comma quotation mark ornament
-        "「": "U+300C",  # Left corner bracket
-        "」": "U+300D",  # Right corner bracket
-        "『": "U+300E",  # Left white corner bracket
-        "』": "U+300F",  # Right white corner bracket
-        "﹁": "U+FE41",  # PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET
-        "﹂": "U+FE42",  # PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET
-        "﹃": "U+FE43",  # PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET
-        "﹄": "U+FE44",  # PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET
-        "'": "U+FF07",  # FULLWIDTH APOSTROPHE
-        "「": "U+FF62",  # HALFWIDTH LEFT CORNER BRACKET
-        "」": "U+FF63",  # HALFWIDTH RIGHT CORNER BRACKET
-    }
-
-    double_quote_standard = '"'
-    single_quote_standard = "'"
-
-    # Apply double quote replacements
-    for unicode_val in double_quotes.values():
-        unicode_char = unicode_to_char(unicode_val)
-        if unicode_char in text:
-            text = text.replace(unicode_char, double_quote_standard)
-
-    # Apply single quote replacements
-    for unicode_val in single_quotes.values():
-        unicode_char = unicode_to_char(unicode_val)
-        if unicode_char in text:
-            text = text.replace(unicode_char, single_quote_standard)
-
-    return text
-
-
-def unicode_to_char(unicode_val: str) -> str:
-    """
-    Converts a Unicode value to a character.
-
-    Args:
-        unicode_val (str): The Unicode value to convert.
-
-    Returns:
-        str: The character corresponding to the Unicode value.
-    """
-    return chr(int(unicode_val.replace("U+", ""), 16))
+    return text.translate(_TRANSLATION_TABLE)
diff --git a/uv.lock b/uv.lock
index 0dfe70903a..60ac7d75bf 100644
--- a/uv.lock
+++ b/uv.lock
@@ -4720,6 +4720,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/80/2d/1bb683f64737bbb1f86c82b7359db1eb2be4e2c0c13b947f80efefa7d3e5/psycopg2_binary-2.9.11-cp313-cp313-win_amd64.whl", hash = "sha256:efff12b432179443f54e230fdf60de1f6cc726b6c832db8701227d089310e8aa", size = 2714215, upload-time = "2025-10-10T11:13:07.14Z" },
 ]
 
+[[package]]
+name = "py-cpuinfo"
+version = "9.0.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/37/a8/d832f7293ebb21690860d2e01d8115e5ff6f2ae8bbdc953f0eb0fa4bd2c7/py-cpuinfo-9.0.0.tar.gz", hash = "sha256:3cdbbf3fac90dc6f118bfd64384f309edeadd902d7c8fb17f02ffa1fc3f49690", size = 104716, upload-time = "2022-10-25T20:38:06.303Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/e0/a9/023730ba63db1e494a271cb018dcd361bd2c917ba7004c3e49d5daf795a2/py_cpuinfo-9.0.0-py3-none-any.whl", hash = "sha256:859625bc251f64e21f077d099d4162689c762b5d6a4c3c97553d56241c9674d5", size = 22335, upload-time = "2022-10-25T20:38:27.636Z" },
+]
+
 [[package]]
 name = "pyairtable"
 version = "3.3.0"
@@ -5313,6 +5322,19 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/3b/ab/b3226f0bd7cdcf710fbede2b3548584366da3b19b5021e74f5bde2a8fa3f/pytest-9.0.2-py3-none-any.whl", hash = "sha256:711ffd45bf766d5264d487b917733b453d917afd2b0ad65223959f59089f875b", size = 374801, upload-time = "2025-12-06T21:30:49.154Z" },
 ]
 
+[[package]]
+name = "pytest-benchmark"
+version = "5.2.3"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "py-cpuinfo" },
+    { name = "pytest" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/24/34/9f732b76456d64faffbef6232f1f9dbec7a7c4999ff46282fa418bd1af66/pytest_benchmark-5.2.3.tar.gz", hash = "sha256:deb7317998a23c650fd4ff76e1230066a76cb45dcece0aca5607143c619e7779", size = 341340, upload-time = "2025-11-09T18:48:43.215Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/33/29/e756e715a48959f1c0045342088d7ca9762a2f509b945f362a316e9412b7/pytest_benchmark-5.2.3-py3-none-any.whl", hash = "sha256:bc839726ad20e99aaa0d11a127445457b4219bdb9e80a1afc4b51da7f96b0803", size = 45255, upload-time = "2025-11-09T18:48:39.765Z" },
+]
+
 [[package]]
 name = "pytest-cov"
 version = "7.0.0"
@@ -7241,6 +7263,7 @@ test = [
     { name = "pip-licenses" },
     { name = "pydantic" },
     { name = "pytest" },
+    { name = "pytest-benchmark" },
     { name = "pytest-cov" },
     { name = "pytest-mock" },
     { name = "pytest-xdist" },
@@ -7384,6 +7407,7 @@ test = [
     { name = "pip-licenses", specifier = ">=5.0.0,<6.0.0" },
     { name = "pydantic", specifier = ">=2.12.5,<3.0.0" },
     { name = "pytest", specifier = ">=9.0.2,<10.0.0" },
+    { name = "pytest-benchmark", specifier = ">=5.2.3,<6.0.0" },
     { name = "pytest-cov", specifier = ">=7.0.0,<8.0.0" },
     { name = "pytest-mock", specifier = ">=3.15.1,<4.0.0" },
     { name = "pytest-xdist", specifier = ">=3.8.0,<4.0.0" },