Skip to content

Commit 23c4fff

Browse files
committed
fix: keep parser for accurate sentence boundaries
Per review feedback, removing parser and using sentencizer causes sentence splitting regressions. Keep parser loaded, only exclude ner, lemmatizer, and attribute_ruler.
1 parent b565a50 commit 23c4fff

2 files changed

Lines changed: 7 additions & 16 deletions

File tree

CHANGELOG.md

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,16 +29,13 @@
2929
### Enhancements
3030
- **`partition_md` Markdown `extensions`**: Optional `extensions` list is passed to `markdown.markdown()`; entries may be registered names (`str`) or `markdown.extensions.Extension` instances. Defaults to `["tables", "fenced_code"]`. Invalid values raise `ValueError`.
3131

32-
<<<<<<< HEAD
3332
## 0.22.2
3433

3534
### Enhancements
3635
- Store routing in ElementMetadata
3736

3837
## 0.22.1
3938

40-
=======
41-
>>>>>>> 2291c234 (Bump version to 0.22.3 for changelog CI check)
4239
### Fixes
4340
- **Security update**: Bumped dependencies to address security vulnerabilities
4441

unstructured/nlp/tokenize.py

Lines changed: 7 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -109,21 +109,15 @@ def _install_spacy_model() -> None:
109109
logger.info("Installed %s %s", _SPACY_MODEL_NAME, _SPACY_MODEL_VERSION)
110110

111111

112-
# Only tok2vec, tagger, and sentence splitting are used (pos_tag and sent_tokenize).
113-
# Excluding the remaining components saves ~12 MiB of model weights per process.
114-
_SPACY_EXCLUDE = ["ner", "parser", "lemmatizer", "attribute_ruler"]
115-
116-
117-
def _load_and_configure() -> spacy.language.Language:
118-
"""Load the model excluding unused components, add a lightweight sentencizer."""
119-
nlp = spacy.load(_SPACY_MODEL_NAME, exclude=_SPACY_EXCLUDE)
120-
nlp.add_pipe("sentencizer")
121-
return nlp
112+
# Only tok2vec, tagger, and parser (sentence boundaries) are used
113+
# (pos_tag and sent_tokenize). Excluding the remaining components saves ~7 MiB
114+
# of model weights per process.
115+
_SPACY_EXCLUDE = ["ner", "lemmatizer", "attribute_ruler"]
122116

123117

124118
def _load_spacy_model() -> spacy.language.Language:
125119
try:
126-
return _load_and_configure()
120+
return spacy.load(_SPACY_MODEL_NAME, exclude=_SPACY_EXCLUDE)
127121
except OSError:
128122
pass
129123

@@ -134,13 +128,13 @@ def _load_spacy_model() -> spacy.language.Language:
134128
# Double-check: another process may have installed while we waited.
135129
importlib.invalidate_caches()
136130
try:
137-
return _load_and_configure()
131+
return spacy.load(_SPACY_MODEL_NAME, exclude=_SPACY_EXCLUDE)
138132
except OSError:
139133
pass
140134
_install_spacy_model()
141135
importlib.invalidate_caches()
142136
try:
143-
return _load_and_configure()
137+
return spacy.load(_SPACY_MODEL_NAME, exclude=_SPACY_EXCLUDE)
144138
except OSError as exc:
145139
raise RuntimeError(
146140
f"Installed {_SPACY_MODEL_NAME} but spacy.load() still failed. "

0 commit comments

Comments
 (0)