From d132279ca92c47294c20b246aba29aac27721d7f Mon Sep 17 00:00:00 2001 From: Lawrence Elitzer Date: Sun, 22 Feb 2026 15:52:54 -0600 Subject: [PATCH 1/9] fix: self-install pinned spaCy model at runtime with SHA256 verification Replace the en-core-web-sm direct URL dependency in pyproject.toml with the installer library. The spaCy model is now downloaded and installed on first use with hash verification, removing the need for [tool.uv.sources] and making the install more portable. Co-Authored-By: Claude Opus 4.6 --- CHANGELOG.md | 5 +++ pyproject.toml | 5 +-- unstructured/__version__.py | 2 +- unstructured/nlp/tokenize.py | 68 ++++++++++++++++++++++++++++++++---- 4 files changed, 68 insertions(+), 12 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bc3897b7e0..0e1d035f67 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +## 0.21.2 + +### Fixes +- **Self-install pinned spaCy model at runtime with SHA256 verification**: Replace the `en-core-web-sm` direct URL dependency in `pyproject.toml` with the `installer` library. The spaCy model is now downloaded and installed on first use with hash verification, removing the need for `[tool.uv.sources]` and making the install more portable. 
+ ## 0.21.1 - Bump version to create a new release diff --git a/pyproject.toml b/pyproject.toml index 6173be18c2..70fde4f335 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,7 +29,7 @@ dependencies = [ "langdetect>=1.0.9, <2.0.0", "lxml>=5.0.0, <7.0.0", "spacy>=3.7.0, <4.0.0", - "en-core-web-sm>=3.8.0, <4.0.0", + "installer>=0.7.0, <1.0.0", "numba>=0.60.0, <1.0.0", "numpy>=1.26.0, <3.0.0", "psutil>=7.2.2, <8.0.0", @@ -181,9 +181,6 @@ release = [ "twine>=6.0.0, <7.0.0", ] -[tool.uv.sources] -en-core-web-sm = { url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl" } - [tool.uv] required-environments = [ "sys_platform == 'linux' and platform_machine == 'x86_64'", diff --git a/unstructured/__version__.py b/unstructured/__version__.py index e2fa5aba8d..94f60f0c05 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.21.1" # pragma: no cover +__version__ = "0.21.2" # pragma: no cover diff --git a/unstructured/nlp/tokenize.py b/unstructured/nlp/tokenize.py index 610f0b9503..2e5e3a1481 100644 --- a/unstructured/nlp/tokenize.py +++ b/unstructured/nlp/tokenize.py @@ -1,19 +1,73 @@ from __future__ import annotations +import hashlib +import logging +import os +import sys +import sysconfig +import tempfile +import urllib.request from functools import lru_cache from typing import Final, List, Tuple import spacy +logger = logging.getLogger(__name__) + CACHE_MAX_SIZE: Final[int] = 128 -try: - _nlp = spacy.load("en_core_web_sm") -except OSError: - raise OSError( - "The spacy model 'en_core_web_sm' is required but not installed. 
" - "Install it with: python -m spacy download en_core_web_sm" - ) +_SPACY_MODEL_NAME: Final[str] = "en_core_web_sm" +_SPACY_MODEL_VERSION: Final[str] = "3.8.0" +_SPACY_MODEL_URL: Final[str] = ( + f"https://github.com/explosion/spacy-models/releases/download/" + f"{_SPACY_MODEL_NAME}-{_SPACY_MODEL_VERSION}/" + f"{_SPACY_MODEL_NAME}-{_SPACY_MODEL_VERSION}-py3-none-any.whl" +) +_SPACY_MODEL_SHA256: Final[str] = ( + "1932429db727d4bff3deed6b34cfc05df17794f4a52eeb26cf8928f7c1a0fb85" +) + + +def _install_spacy_model() -> None: + """Download and install the pinned spaCy model wheel using the `installer` library.""" + from installer import install + from installer.destinations import SchemeDictionaryDestination + from installer.sources import WheelFile + + with tempfile.TemporaryDirectory() as tmp: + whl_path = os.path.join( + tmp, f"{_SPACY_MODEL_NAME}-{_SPACY_MODEL_VERSION}-py3-none-any.whl" + ) + logger.info("Downloading spaCy model %s %s …", _SPACY_MODEL_NAME, _SPACY_MODEL_VERSION) + urllib.request.urlretrieve(_SPACY_MODEL_URL, whl_path) + + sha256 = hashlib.sha256(open(whl_path, "rb").read()).hexdigest() + if sha256 != _SPACY_MODEL_SHA256: + raise RuntimeError( + f"Hash mismatch for {_SPACY_MODEL_NAME}: " + f"expected {_SPACY_MODEL_SHA256}, got {sha256}" + ) + + destination = SchemeDictionaryDestination( + sysconfig.get_paths(), + interpreter=sys.executable, + script_kind="win-ia32" if sys.platform == "win32" else "posix", + ) + with WheelFile.open(whl_path) as source: + install(source=source, destination=destination) + + logger.info("Installed %s %s", _SPACY_MODEL_NAME, _SPACY_MODEL_VERSION) + + +def _load_spacy_model() -> spacy.language.Language: + try: + return spacy.load(_SPACY_MODEL_NAME) + except OSError: + _install_spacy_model() + return spacy.load(_SPACY_MODEL_NAME) + + +_nlp = _load_spacy_model() def _process(text: str) -> spacy.tokens.Doc: From 0606623ec991807bc3fb74596f6721c752b986ea Mon Sep 17 00:00:00 2001 From: Lawrence Elitzer Date: Sun, 22 Feb 2026 
15:59:17 -0600 Subject: [PATCH 2/9] chore: update uv.lock Co-Authored-By: Claude Opus 4.6 --- uv.lock | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/uv.lock b/uv.lock index d13e30f118..c18b3d6bb6 100644 --- a/uv.lock +++ b/uv.lock @@ -1488,14 +1488,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e1/5e/4b5aaaabddfacfe36ba7768817bd1f71a7a810a43705e531f3ae4c690767/emoji-2.15.0-py3-none-any.whl", hash = "sha256:205296793d66a89d88af4688fa57fd6496732eb48917a87175a023c8138995eb", size = 608433, upload-time = "2025-09-21T12:13:01.197Z" }, ] -[[package]] -name = "en-core-web-sm" -version = "3.8.0" -source = { url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl" } -wheels = [ - { url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl", hash = "sha256:1932429db727d4bff3deed6b34cfc05df17794f4a52eeb26cf8928f7c1a0fb85" }, -] - [[package]] name = "et-xmlfile" version = "2.0.0" @@ -2415,6 +2407,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484, upload-time = "2025-10-18T21:55:41.639Z" }, ] +[[package]] +name = "installer" +version = "0.7.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/05/18/ceeb4e3ab3aa54495775775b38ae42b10a92f42ce42dfa44da684289b8c8/installer-0.7.0.tar.gz", hash = "sha256:a26d3e3116289bb08216e0d0f7d925fcef0b0194eedfa0c944bcaaa106c4b631", size = 474349, upload-time = "2023-03-17T20:39:38.871Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e5/ca/1172b6638d52f2d6caa2dd262ec4c811ba59eee96d54a7701930726bce18/installer-0.7.0-py3-none-any.whl", hash = 
"sha256:05d1933f0a5ba7d8d6296bb6d5018e7c94fa473ceb10cf198a92ccea19c27b53", size = 453838, upload-time = "2023-03-17T20:39:36.219Z" }, +] + [[package]] name = "invoke" version = "2.2.1" @@ -7036,9 +7037,9 @@ dependencies = [ { name = "beautifulsoup4" }, { name = "charset-normalizer" }, { name = "emoji" }, - { name = "en-core-web-sm" }, { name = "filetype" }, { name = "html5lib" }, + { name = "installer" }, { name = "langdetect" }, { name = "lxml" }, { name = "numba" }, @@ -7215,13 +7216,13 @@ requires-dist = [ { name = "beautifulsoup4", specifier = ">=4.14.3,<5.0.0" }, { name = "charset-normalizer", specifier = ">=3.4.4,<4.0.0" }, { name = "emoji", specifier = ">=2.15.0,<3.0.0" }, - { name = "en-core-web-sm", url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl" }, { name = "filetype", specifier = ">=1.2.0,<2.0.0" }, { name = "google-cloud-vision", marker = "extra == 'all-docs'", specifier = ">=3.12.1,<4.0.0" }, { name = "google-cloud-vision", marker = "extra == 'image'", specifier = ">=3.12.1,<4.0.0" }, { name = "google-cloud-vision", marker = "extra == 'local-inference'", specifier = ">=3.12.1,<4.0.0" }, { name = "google-cloud-vision", marker = "extra == 'pdf'", specifier = ">=3.12.1,<4.0.0" }, { name = "html5lib", specifier = ">=1.1,<2.0.0" }, + { name = "installer", specifier = ">=0.7.0,<1.0.0" }, { name = "langdetect", specifier = ">=1.0.9,<2.0.0" }, { name = "lxml", specifier = ">=5.0.0,<7.0.0" }, { name = "markdown", marker = "extra == 'all-docs'", specifier = ">=3.10.1,<4.0.0" }, From 54d689f2ace08dfb6534be34b6d60105770557d3 Mon Sep 17 00:00:00 2001 From: Lawrence Elitzer Date: Sun, 22 Feb 2026 16:01:46 -0600 Subject: [PATCH 3/9] fix: address Cursor Bugbot findings in tokenize.py - Use installer.utils.get_launcher_kind() instead of hardcoded "win-ia32" for correct 64-bit Windows support - Wrap file handle in with statement for SHA256 computation - Call importlib.invalidate_caches() 
after runtime model install so Python's FileFinder discovers the new package Co-Authored-By: Claude Opus 4.6 --- unstructured/nlp/tokenize.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/unstructured/nlp/tokenize.py b/unstructured/nlp/tokenize.py index 2e5e3a1481..11c4b914f4 100644 --- a/unstructured/nlp/tokenize.py +++ b/unstructured/nlp/tokenize.py @@ -1,6 +1,7 @@ from __future__ import annotations import hashlib +import importlib import logging import os import sys @@ -33,6 +34,7 @@ def _install_spacy_model() -> None: from installer import install from installer.destinations import SchemeDictionaryDestination from installer.sources import WheelFile + from installer.utils import get_launcher_kind with tempfile.TemporaryDirectory() as tmp: whl_path = os.path.join( @@ -41,7 +43,8 @@ def _install_spacy_model() -> None: logger.info("Downloading spaCy model %s %s …", _SPACY_MODEL_NAME, _SPACY_MODEL_VERSION) urllib.request.urlretrieve(_SPACY_MODEL_URL, whl_path) - sha256 = hashlib.sha256(open(whl_path, "rb").read()).hexdigest() + with open(whl_path, "rb") as f: + sha256 = hashlib.sha256(f.read()).hexdigest() if sha256 != _SPACY_MODEL_SHA256: raise RuntimeError( f"Hash mismatch for {_SPACY_MODEL_NAME}: " @@ -51,7 +54,7 @@ def _install_spacy_model() -> None: destination = SchemeDictionaryDestination( sysconfig.get_paths(), interpreter=sys.executable, - script_kind="win-ia32" if sys.platform == "win32" else "posix", + script_kind=get_launcher_kind(), ) with WheelFile.open(whl_path) as source: install(source=source, destination=destination) @@ -64,6 +67,7 @@ def _load_spacy_model() -> spacy.language.Language: return spacy.load(_SPACY_MODEL_NAME) except OSError: _install_spacy_model() + importlib.invalidate_caches() return spacy.load(_SPACY_MODEL_NAME) From 49bcd754132e21dcf5f3bd26c1936d2e6e66489c Mon Sep 17 00:00:00 2001 From: Yao You Date: Sun, 22 Feb 2026 16:06:07 -0600 Subject: [PATCH 4/9] fix call to install --- unstructured/nlp/tokenize.py 
| 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/unstructured/nlp/tokenize.py b/unstructured/nlp/tokenize.py index 11c4b914f4..d6561b8f3a 100644 --- a/unstructured/nlp/tokenize.py +++ b/unstructured/nlp/tokenize.py @@ -24,9 +24,7 @@ f"{_SPACY_MODEL_NAME}-{_SPACY_MODEL_VERSION}/" f"{_SPACY_MODEL_NAME}-{_SPACY_MODEL_VERSION}-py3-none-any.whl" ) -_SPACY_MODEL_SHA256: Final[str] = ( - "1932429db727d4bff3deed6b34cfc05df17794f4a52eeb26cf8928f7c1a0fb85" -) +_SPACY_MODEL_SHA256: Final[str] = "1932429db727d4bff3deed6b34cfc05df17794f4a52eeb26cf8928f7c1a0fb85" def _install_spacy_model() -> None: @@ -37,9 +35,7 @@ def _install_spacy_model() -> None: from installer.utils import get_launcher_kind with tempfile.TemporaryDirectory() as tmp: - whl_path = os.path.join( - tmp, f"{_SPACY_MODEL_NAME}-{_SPACY_MODEL_VERSION}-py3-none-any.whl" - ) + whl_path = os.path.join(tmp, f"{_SPACY_MODEL_NAME}-{_SPACY_MODEL_VERSION}-py3-none-any.whl") logger.info("Downloading spaCy model %s %s …", _SPACY_MODEL_NAME, _SPACY_MODEL_VERSION) urllib.request.urlretrieve(_SPACY_MODEL_URL, whl_path) @@ -57,7 +53,7 @@ def _install_spacy_model() -> None: script_kind=get_launcher_kind(), ) with WheelFile.open(whl_path) as source: - install(source=source, destination=destination) + install(source=source, destination=destination, additional_metadata={}) logger.info("Installed %s %s", _SPACY_MODEL_NAME, _SPACY_MODEL_VERSION) From 14552ab26eed43ee899243bc1b869f1f75005c48 Mon Sep 17 00:00:00 2001 From: Lawrence Elitzer Date: Sun, 22 Feb 2026 16:20:01 -0600 Subject: [PATCH 5/9] fix: handle concurrent spaCy model install and add download timeout - Catch FileExistsError during wheel install so parallel processes (e.g. 
pytest-xdist workers) don't crash when another process already installed the model - Replace urlretrieve with urlopen + 120s socket timeout to prevent indefinite hangs in network-restricted environments - Trigger spaCy model self-install during Docker build so offline deployments (HF_HUB_OFFLINE=1) have the model available at runtime Co-Authored-By: Claude Opus 4.6 --- Dockerfile | 3 ++- unstructured/nlp/tokenize.py | 22 +++++++++++++++++++--- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index c4025304a3..5187fcfd52 100644 --- a/Dockerfile +++ b/Dockerfile @@ -71,8 +71,9 @@ ENV TESSDATA_PREFIX=/usr/local/share/tessdata ENV UV_COMPILE_BYTECODE=1 ENV UV_PYTHON_DOWNLOADS=never -# Install Python dependencies via uv (en-core-web-sm is declared in pyproject.toml) +# Install Python dependencies via uv, then trigger spaCy model self-install while network is available RUN uv sync --locked --all-extras --no-group dev --no-group lint --no-group test --no-group release && \ + uv run --no-sync $PYTHON -c "from unstructured.nlp.tokenize import _nlp; print('spaCy model loaded:', _nlp.meta['name'])" && \ uv run --no-sync $PYTHON -c "from unstructured.partition.model_init import initialize; initialize()" && \ uv run --no-sync $PYTHON -c "from unstructured_inference.models.tables import UnstructuredTableTransformerModel; model = UnstructuredTableTransformerModel(); model.initialize('microsoft/table-transformer-structure-recognition')" diff --git a/unstructured/nlp/tokenize.py b/unstructured/nlp/tokenize.py index d6561b8f3a..7f9d67ecae 100644 --- a/unstructured/nlp/tokenize.py +++ b/unstructured/nlp/tokenize.py @@ -4,6 +4,7 @@ import importlib import logging import os +import shutil import sys import sysconfig import tempfile @@ -27,6 +28,16 @@ _SPACY_MODEL_SHA256: Final[str] = "1932429db727d4bff3deed6b34cfc05df17794f4a52eeb26cf8928f7c1a0fb85" +_DOWNLOAD_TIMEOUT_SECONDS: Final[int] = 120 + + +def _download_with_timeout(url: str, dest: str) -> 
None: + """Download a URL to a local file with a socket-level timeout.""" + with urllib.request.urlopen(url, timeout=_DOWNLOAD_TIMEOUT_SECONDS) as resp: + with open(dest, "wb") as out: + shutil.copyfileobj(resp, out) + + def _install_spacy_model() -> None: """Download and install the pinned spaCy model wheel using the `installer` library.""" from installer import install @@ -37,7 +48,7 @@ def _install_spacy_model() -> None: with tempfile.TemporaryDirectory() as tmp: whl_path = os.path.join(tmp, f"{_SPACY_MODEL_NAME}-{_SPACY_MODEL_VERSION}-py3-none-any.whl") logger.info("Downloading spaCy model %s %s …", _SPACY_MODEL_NAME, _SPACY_MODEL_VERSION) - urllib.request.urlretrieve(_SPACY_MODEL_URL, whl_path) + _download_with_timeout(_SPACY_MODEL_URL, whl_path) with open(whl_path, "rb") as f: sha256 = hashlib.sha256(f.read()).hexdigest() @@ -52,8 +63,13 @@ def _install_spacy_model() -> None: interpreter=sys.executable, script_kind=get_launcher_kind(), ) - with WheelFile.open(whl_path) as source: - install(source=source, destination=destination, additional_metadata={}) + try: + with WheelFile.open(whl_path) as source: + install(source=source, destination=destination, additional_metadata={}) + except FileExistsError: + # Another process (e.g. pytest-xdist worker) installed concurrently + logger.info("Model files already exist, assuming concurrent install by another process") + return logger.info("Installed %s %s", _SPACY_MODEL_NAME, _SPACY_MODEL_VERSION) From 230f5f1959043b7e54821cfbf0d97bd78f4f6201 Mon Sep 17 00:00:00 2001 From: Lawrence Elitzer Date: Sun, 22 Feb 2026 16:42:32 -0600 Subject: [PATCH 6/9] fix: use atomic staging dir for spaCy model install to prevent race condition Install the wheel into a temp staging directory first, then move items into site-packages. This prevents the incomplete-install race where one process sees a partial model (e.g. __init__.py exists but model data is missing) and fails with OSError: [E050] Can't find model. 
Co-Authored-By: Claude Opus 4.6 --- unstructured/nlp/tokenize.py | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/unstructured/nlp/tokenize.py b/unstructured/nlp/tokenize.py index 7f9d67ecae..b96648e4b0 100644 --- a/unstructured/nlp/tokenize.py +++ b/unstructured/nlp/tokenize.py @@ -58,18 +58,30 @@ def _install_spacy_model() -> None: f"expected {_SPACY_MODEL_SHA256}, got {sha256}" ) + # Install into a staging directory to avoid races with other processes + staging = os.path.join(tmp, "staging") + paths = sysconfig.get_paths() + staged_paths = paths.copy() + staged_paths["purelib"] = staging + staged_paths["platlib"] = staging + destination = SchemeDictionaryDestination( - sysconfig.get_paths(), + staged_paths, interpreter=sys.executable, script_kind=get_launcher_kind(), ) - try: - with WheelFile.open(whl_path) as source: - install(source=source, destination=destination, additional_metadata={}) - except FileExistsError: - # Another process (e.g. 
pytest-xdist worker) installed concurrently - logger.info("Model files already exist, assuming concurrent install by another process") - return + with WheelFile.open(whl_path) as source: + install(source=source, destination=destination, additional_metadata={}) + + # Move installed packages from staging into real site-packages + site_packages = paths["purelib"] + for item in os.listdir(staging): + src = os.path.join(staging, item) + dst = os.path.join(site_packages, item) + if os.path.exists(dst): + logger.info("Skipping %s, already exists (concurrent install)", item) + continue + shutil.move(src, dst) logger.info("Installed %s %s", _SPACY_MODEL_NAME, _SPACY_MODEL_VERSION) From 6ef9188c9cccc3c9fa6b5c028484ed6be4fdfb02 Mon Sep 17 00:00:00 2001 From: Yao You Date: Sun, 22 Feb 2026 17:09:13 -0600 Subject: [PATCH 7/9] use filelock to prevent race condition --- pyproject.toml | 1 + unstructured/nlp/tokenize.py | 22 ++++++++++++++++++---- uv.lock | 2 ++ 3 files changed, 21 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 70fde4f335..bfb82dfe96 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,6 +43,7 @@ dependencies = [ "typing-extensions>=4.15.0, <5.0.0", "unstructured-client>=0.25.9, <1.0.0", "wrapt>=1.0.0, <2.0.0", + "filelock>=3.12.0,<4.0.0", ] [project.optional-dependencies] diff --git a/unstructured/nlp/tokenize.py b/unstructured/nlp/tokenize.py index b96648e4b0..9074163824 100644 --- a/unstructured/nlp/tokenize.py +++ b/unstructured/nlp/tokenize.py @@ -13,6 +13,7 @@ from typing import Final, List, Tuple import spacy +from filelock import FileLock logger = logging.getLogger(__name__) @@ -29,6 +30,9 @@ _DOWNLOAD_TIMEOUT_SECONDS: Final[int] = 120 +_INSTALL_LOCK_PATH: Final[str] = os.path.join( + tempfile.gettempdir(), f"{_SPACY_MODEL_NAME}.install.lock" +) def _download_with_timeout(url: str, dest: str) -> None: @@ -73,14 +77,12 @@ def _install_spacy_model() -> None: with WheelFile.open(whl_path) as source: 
install(source=source, destination=destination, additional_metadata={}) - # Move installed packages from staging into real site-packages + # Move installed packages from staging into real site-packages. + # The caller holds _INSTALL_LOCK_PATH so no other process races here. site_packages = paths["purelib"] for item in os.listdir(staging): src = os.path.join(staging, item) dst = os.path.join(site_packages, item) - if os.path.exists(dst): - logger.info("Skipping %s, already exists (concurrent install)", item) - continue shutil.move(src, dst) logger.info("Installed %s %s", _SPACY_MODEL_NAME, _SPACY_MODEL_VERSION) @@ -90,6 +92,18 @@ def _load_spacy_model() -> spacy.language.Language: try: return spacy.load(_SPACY_MODEL_NAME) except OSError: + pass + + # Serialize model installation across processes with an exclusive file lock. + # A well-known path in the system temp dir is visible to all processes + # regardless of their working directory. + with FileLock(_INSTALL_LOCK_PATH, timeout=-1): + # Double-check: another process may have installed while we waited. 
+ importlib.invalidate_caches() + try: + return spacy.load(_SPACY_MODEL_NAME) + except OSError: + pass _install_spacy_model() importlib.invalidate_caches() return spacy.load(_SPACY_MODEL_NAME) diff --git a/uv.lock b/uv.lock index c18b3d6bb6..e282f04b04 100644 --- a/uv.lock +++ b/uv.lock @@ -7037,6 +7037,7 @@ dependencies = [ { name = "beautifulsoup4" }, { name = "charset-normalizer" }, { name = "emoji" }, + { name = "filelock" }, { name = "filetype" }, { name = "html5lib" }, { name = "installer" }, @@ -7216,6 +7217,7 @@ requires-dist = [ { name = "beautifulsoup4", specifier = ">=4.14.3,<5.0.0" }, { name = "charset-normalizer", specifier = ">=3.4.4,<4.0.0" }, { name = "emoji", specifier = ">=2.15.0,<3.0.0" }, + { name = "filelock", specifier = ">=3.12.0,<4.0.0" }, { name = "filetype", specifier = ">=1.2.0,<2.0.0" }, { name = "google-cloud-vision", marker = "extra == 'all-docs'", specifier = ">=3.12.1,<4.0.0" }, { name = "google-cloud-vision", marker = "extra == 'image'", specifier = ">=3.12.1,<4.0.0" }, From 02800561afebd92df0c88c8e29104b128b822211 Mon Sep 17 00:00:00 2001 From: Yao You Date: Sun, 22 Feb 2026 17:56:17 -0600 Subject: [PATCH 8/9] add better error message and use lazy load --- unstructured/nlp/tokenize.py | 44 ++++++++++++++++++++++++++++++------ 1 file changed, 37 insertions(+), 7 deletions(-) diff --git a/unstructured/nlp/tokenize.py b/unstructured/nlp/tokenize.py index 9074163824..cdc21cc13f 100644 --- a/unstructured/nlp/tokenize.py +++ b/unstructured/nlp/tokenize.py @@ -8,6 +8,7 @@ import sys import sysconfig import tempfile +import urllib.error import urllib.request from functools import lru_cache from typing import Final, List, Tuple @@ -37,9 +38,15 @@ def _download_with_timeout(url: str, dest: str) -> None: """Download a URL to a local file with a socket-level timeout.""" - with urllib.request.urlopen(url, timeout=_DOWNLOAD_TIMEOUT_SECONDS) as resp: - with open(dest, "wb") as out: - shutil.copyfileobj(resp, out) + try: + with 
urllib.request.urlopen(url, timeout=_DOWNLOAD_TIMEOUT_SECONDS) as resp: + with open(dest, "wb") as out: + shutil.copyfileobj(resp, out) + except urllib.error.URLError as exc: + raise RuntimeError( + f"Failed to download spaCy model from {url}: {exc}. " + "Check your network connection and try again." + ) from exc def _install_spacy_model() -> None: @@ -79,11 +86,25 @@ def _install_spacy_model() -> None: # Move installed packages from staging into real site-packages. # The caller holds _INSTALL_LOCK_PATH so no other process races here. + # Any dst that already exists is a remnant of a previous failed install + # (spacy.load() just failed), so remove it before moving to avoid + # shutil.move placing src *inside* an existing directory. site_packages = paths["purelib"] for item in os.listdir(staging): src = os.path.join(staging, item) dst = os.path.join(site_packages, item) - shutil.move(src, dst) + try: + if os.path.isdir(dst): + shutil.rmtree(dst) + elif os.path.exists(dst): + os.remove(dst) + shutil.move(src, dst) + except OSError as exc: + raise RuntimeError( + f"Failed to install {_SPACY_MODEL_NAME} to {site_packages}: {exc}. " + "Ensure the site-packages directory is writable, or pre-install the model " + f"with: python -m spacy download {_SPACY_MODEL_NAME}" + ) from exc logger.info("Installed %s %s", _SPACY_MODEL_NAME, _SPACY_MODEL_VERSION) @@ -106,16 +127,25 @@ def _load_spacy_model() -> spacy.language.Language: pass _install_spacy_model() importlib.invalidate_caches() - return spacy.load(_SPACY_MODEL_NAME) + try: + return spacy.load(_SPACY_MODEL_NAME) + except OSError as exc: + raise RuntimeError( + f"Installed {_SPACY_MODEL_NAME} but spacy.load() still failed. " + "Check site-packages permissions and installation integrity." 
+ ) from exc -_nlp = _load_spacy_model() +@lru_cache(maxsize=1) +def _get_nlp() -> spacy.language.Language: + """Load the spaCy model on first use and cache it for the lifetime of the process.""" + return _load_spacy_model() def _process(text: str) -> spacy.tokens.Doc: """Run the spaCy pipeline once. All public functions extract what they need from the Doc.""" # -- str() handles numpy.str_ from OCR pipelines -- - return _nlp(str(text)) + return _get_nlp()(str(text)) def sent_tokenize(text: str) -> List[str]: From df62a9cf08b07f7292104ea69bd0b9725948b4a8 Mon Sep 17 00:00:00 2001 From: Yao You Date: Sun, 22 Feb 2026 18:07:33 -0600 Subject: [PATCH 9/9] update Dockerfile to call new _get_nlp() --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 5187fcfd52..fcee8757db 100644 --- a/Dockerfile +++ b/Dockerfile @@ -73,7 +73,7 @@ ENV UV_PYTHON_DOWNLOADS=never # Install Python dependencies via uv, then trigger spaCy model self-install while network is available RUN uv sync --locked --all-extras --no-group dev --no-group lint --no-group test --no-group release && \ - uv run --no-sync $PYTHON -c "from unstructured.nlp.tokenize import _nlp; print('spaCy model loaded:', _nlp.meta['name'])" && \ + uv run --no-sync $PYTHON -c "from unstructured.nlp.tokenize import _get_nlp; print('spaCy model loaded:', _get_nlp().meta['name'])" && \ uv run --no-sync $PYTHON -c "from unstructured.partition.model_init import initialize; initialize()" && \ uv run --no-sync $PYTHON -c "from unstructured_inference.models.tables import UnstructuredTableTransformerModel; model = UnstructuredTableTransformerModel(); model.initialize('microsoft/table-transformer-structure-recognition')"