Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
## 0.22.11

### Enhancements
- **Exclude unused spaCy components**: Exclude `ner`, `lemmatizer`, and `attribute_ruler` when loading `en_core_web_sm`, keeping `parser` for accurate sentence boundaries. Saves ~7 MiB peak memory.

## 0.22.10
### Enhancements
- **Repeat table headers across continuation chunks**: Add `repeat_table_headers` to basic/title chunking options and table chunking internals so leading header rows are detected once and carried forward when large tables spill across multiple chunks.
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.22.10" # pragma: no cover
__version__ = "0.22.11" # pragma: no cover
12 changes: 9 additions & 3 deletions unstructured/nlp/tokenize.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,9 +109,15 @@ def _install_spacy_model() -> None:
logger.info("Installed %s %s", _SPACY_MODEL_NAME, _SPACY_MODEL_VERSION)


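# pos_tag and sent_tokenize only rely on tok2vec, tagger, parser (for sentence
# boundaries), and sentencizer. Excluding the remaining pipeline components at
# load time saves ~7 MiB of model weights per process.
# NOTE(review): in spaCy's trained English pipelines attribute_ruler maps
# token.tag_ to token.pos_; this exclusion presumes pos_tag reads the
# fine-grained token.tag_ only -- confirm against pos_tag.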
_SPACY_EXCLUDE = ["ner", "lemmatizer", "attribute_ruler"]


def _load_spacy_model() -> spacy.language.Language:
try:
return spacy.load(_SPACY_MODEL_NAME)
return spacy.load(_SPACY_MODEL_NAME, exclude=_SPACY_EXCLUDE)
except OSError:
pass

Expand All @@ -122,13 +128,13 @@ def _load_spacy_model() -> spacy.language.Language:
# Double-check: another process may have installed while we waited.
importlib.invalidate_caches()
try:
return spacy.load(_SPACY_MODEL_NAME)
return spacy.load(_SPACY_MODEL_NAME, exclude=_SPACY_EXCLUDE)
except OSError:
pass
_install_spacy_model()
importlib.invalidate_caches()
try:
return spacy.load(_SPACY_MODEL_NAME)
return spacy.load(_SPACY_MODEL_NAME, exclude=_SPACY_EXCLUDE)
except OSError as exc:
raise RuntimeError(
f"Installed {_SPACY_MODEL_NAME} but spacy.load() still failed. "
Expand Down
Loading