
Commit a3172f8

mem: exclude unused spaCy pipeline components to reduce model memory (#4296)
Only `tok2vec`, `tagger`, and sentence splitting are used (`pos_tag` and `sent_tokenize`), so the unused components are excluded when loading `en_core_web_sm`. The shipped change excludes `ner`, `lemmatizer`, and `attribute_ruler` while keeping `parser` for accurate sentence boundary detection, saving ~7 MiB of peak memory per process. The benchmark below measures the more aggressive variant that also excludes `parser` and substitutes a lightweight `sentencizer`, which saves 12.7 MB.

## Benchmark

Measured with [memray](https://github.com/bloomberg/memray) (`memray run` + `memray stats --json`), 3 rounds × 5 texts through `pos_tag()` + `sent_tokenize()` + `word_tokenize()`, Python 3.12.

<img width="1400" alt="bench_spacy_exclude" src="https://raw.githubusercontent.com/codeflash-ai/codeflash/pr-assets/images/bench_spacy_exclude.png" />

```
spaCy en_core_web_sm — component exclusion benchmark
pos_tag + sent_tokenize + word_tokenize | 3 rounds x 5 texts | Python 3.12.12

Configuration                            Peak MB     Saved      %
----------------------------------------------------------------------
All components (default)                 202.1MB     0.0MB      0.0%
Exclude ner/parser/lemma/attr_ruler      189.3MB     12.7MB     6.3%
```
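The effect of the exclude list can be sanity-checked without loading model weights. A minimal sketch; the default component order below is an assumption about `en_core_web_sm`'s `pipe_names` in spaCy 3.x, not something taken from this diff:

```python
# Assumed default en_core_web_sm pipeline (spaCy 3.x pipe_names).
DEFAULT_PIPELINE = ["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer", "ner"]

# Components this commit excludes at load time.
SPACY_EXCLUDE = ["ner", "lemmatizer", "attribute_ruler"]


def kept_components(pipeline: list[str], exclude: list[str]) -> list[str]:
    """Return the components that survive spacy.load(..., exclude=exclude)."""
    return [name for name in pipeline if name not in exclude]


print(kept_components(DEFAULT_PIPELINE, SPACY_EXCLUDE))
# ['tok2vec', 'tagger', 'parser']
```

Excluded components are not loaded at all (their weights stay on disk), which is why the saving shows up as peak resident memory rather than just import time.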
1 parent b6cf510 commit a3172f8

3 files changed: 15 additions & 4 deletions


CHANGELOG.md

Lines changed: 5 additions & 0 deletions
```diff
@@ -1,3 +1,8 @@
+## 0.22.11
+
+### Enhancements
+
+- **Exclude unused spaCy components**: Exclude `ner`, `lemmatizer`, and `attribute_ruler` when loading `en_core_web_sm`, keeping `parser` for accurate sentence boundaries. Saves ~7 MiB peak memory.
+
 ## 0.22.10
 ### Enhancements
 - **Repeat table headers across continuation chunks**: Add `repeat_table_headers` to basic/title chunking options and table chunking internals so leading header rows are detected once and carried forward when large tables spill across multiple chunks.
```

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -1 +1 @@
-__version__ = "0.22.10" # pragma: no cover
+__version__ = "0.22.11" # pragma: no cover
```

unstructured/nlp/tokenize.py

Lines changed: 9 additions & 3 deletions
```diff
@@ -109,9 +109,15 @@ def _install_spacy_model() -> None:
     logger.info("Installed %s %s", _SPACY_MODEL_NAME, _SPACY_MODEL_VERSION)


+# Only tok2vec, tagger, and parser (sentence boundaries) are used
+# (pos_tag and sent_tokenize). Excluding the remaining components saves ~7 MiB
+# of model weights per process.
+_SPACY_EXCLUDE = ["ner", "lemmatizer", "attribute_ruler"]
+
+
 def _load_spacy_model() -> spacy.language.Language:
     try:
-        return spacy.load(_SPACY_MODEL_NAME)
+        return spacy.load(_SPACY_MODEL_NAME, exclude=_SPACY_EXCLUDE)
     except OSError:
         pass

@@ -122,13 +128,13 @@ def _load_spacy_model() -> spacy.language.Language:
     # Double-check: another process may have installed while we waited.
     importlib.invalidate_caches()
     try:
-        return spacy.load(_SPACY_MODEL_NAME)
+        return spacy.load(_SPACY_MODEL_NAME, exclude=_SPACY_EXCLUDE)
     except OSError:
         pass
     _install_spacy_model()
     importlib.invalidate_caches()
     try:
-        return spacy.load(_SPACY_MODEL_NAME)
+        return spacy.load(_SPACY_MODEL_NAME, exclude=_SPACY_EXCLUDE)
     except OSError as exc:
         raise RuntimeError(
             f"Installed {_SPACY_MODEL_NAME} but spacy.load() still failed. "
```
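The load/install/retry flow in `_load_spacy_model` can be sketched generically. The helper below is hypothetical (`load_with_install` is not part of the codebase); it mirrors the try, install, `importlib.invalidate_caches()`, retry sequence from the diff using stand-in callables instead of spaCy:

```python
import importlib


def load_with_install(load, install):
    """Try to load a model; on OSError, install it, refresh import caches, retry."""
    try:
        return load()
    except OSError:
        pass
    install()
    importlib.invalidate_caches()  # make the freshly installed package importable
    try:
        return load()
    except OSError as exc:
        raise RuntimeError("model installed but load still failed") from exc


# Simulate: the first load fails (model missing), installation fixes it.
state = {"installed": False}


def fake_load():
    if not state["installed"]:
        raise OSError("model not found")
    return "nlp"


def fake_install():
    state["installed"] = True


print(load_with_install(fake_load, fake_install))  # → nlp
```

Invalidating import caches between install and retry matters because the model package did not exist when the interpreter last scanned `sys.path`.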
