fix: handle text too long for spacy issue (#4353)

badGarnet · cubic-dev-ai[bot] · web-flow · commit bfd78b267859 · 2026-05-17T13:28:24.000Z
This PR addresses an issue where very long text can fail to partition because its length exceeds `spacy`'s character limit. `spacy` is used to classify text content. For text too long to fit under the limit, we now truncate the text and use the truncated text to represent the full text for classification purposes.  --- ## Summary by cubic Prevents tokenization failures on very long inputs by truncating text that exceeds `spacy`’s `max_length`, keeping partition/classification stable for large documents without affecting normal cases. - **Bug Fixes** - Guard `_process` for inputs over `nlp.max_length`; truncate at the last space within the budget, log a warning, and avoid `spacy` ValueError E088. - Add tests for truncation behavior and for normal processing within the limit. - Update version to `0.22.29` and changelog. <sup>Written for commit 1bfdd91. Summary will update on new commits. <a href="https://cubic.dev/pr/Unstructured-IO/unstructured/pull/4353?utm_source=github">Review in cubic</a></sup>  --------- Co-authored-by: cubic-dev-ai[bot] <191113872+cubic-dev-ai[bot]@users.noreply.github.com>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,9 @@
+## 0.22.29
+
+### Fixes
+
+- **Truncate text if it exceeds the `spacy` limit**: add a guard against calling the `spacy` tokenizer with very long text. Text longer than the limit is now truncated to fit within the character limit.
+
 ## 0.22.28
 
 ### Fixes
diff --git a/test_unstructured/nlp/test_tokenize.py b/test_unstructured/nlp/test_tokenize.py
@@ -41,3 +41,26 @@ def test_tokenizers_functions_run():
     tokenize.sent_tokenize(sentence)
     tokenize.word_tokenize(sentence)
     tokenize.pos_tag(sentence)
+
+
+def test_process_truncates_text_exceeding_spacy_max_length(caplog):
+    # Build text well above spaCy's default 1,000,000-char limit, like the prod trace.
+    nlp = tokenize._get_nlp()
+    long_text = "This is a sentence. " * ((nlp.max_length // 20) + 10_000)
+    assert len(long_text) > nlp.max_length
+
+    with caplog.at_level("WARNING", logger=tokenize.logger.name):
+        # Must not raise spacy ValueError E088.
+        sents = tokenize.sent_tokenize(long_text)
+
+    assert len(sents) > 0
+    assert any("exceeds spaCy max_length" in rec.message for rec in caplog.records)
+
+
+def test_process_does_not_truncate_text_within_limit():
+    nlp = tokenize._get_nlp()
+    text = "Greetings! I am from outer space."
+    assert len(text) <= nlp.max_length
+    doc = tokenize._process(text)
+    # When no truncation occurs the full text round-trips through spaCy.
+    assert doc.text == text
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.22.28"  # pragma: no cover
+__version__ = "0.22.29"  # pragma: no cover
diff --git a/unstructured/nlp/tokenize.py b/unstructured/nlp/tokenize.py
@@ -151,7 +151,20 @@ def _get_nlp() -> spacy.language.Language:
 def _process(text: str) -> spacy.tokens.Doc:
     """Run the spaCy pipeline once. All public functions extract what they need from the Doc."""
     # -- str() handles numpy.str_ from OCR pipelines --
-    return _get_nlp()(str(text))
+    text = str(text)
+    nlp = _get_nlp()
+    if len(text) > nlp.max_length:
+        logger.warning(
+            "Input text of length %d exceeds spaCy max_length=%d; "
+            "truncating for partition heuristics.",
+            len(text),
+            nlp.max_length,
+        )
+        # Prefer to cut at the last whitespace within the budget so we don't split a token.
+        cut = text.rfind(" ", max(0, nlp.max_length - 256), nlp.max_length)
+        truncated = text[: cut if cut != -1 else nlp.max_length]
+        return nlp(truncated)
+    return nlp(text)
 
 
 def sent_tokenize(text: str) -> List[str]:

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.22.28" # pragma: no cover`
	`1`	`+__version__ = "0.22.29" # pragma: no cover`