diff --git a/CHANGELOG.md b/CHANGELOG.md index bc3897b7e0..0e1d035f67 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +## 0.21.2 + +### Fixes +- **Self-install pinned spaCy model at runtime with SHA256 verification**: Replace the `en-core-web-sm` direct URL dependency in `pyproject.toml` with the `installer` library. The spaCy model is now downloaded and installed on first use with hash verification, removing the need for `[tool.uv.sources]` and making the install more portable. + ## 0.21.1 - Bump version to create a new release diff --git a/Dockerfile b/Dockerfile index c4025304a3..fcee8757db 100644 --- a/Dockerfile +++ b/Dockerfile @@ -71,8 +71,9 @@ ENV TESSDATA_PREFIX=/usr/local/share/tessdata ENV UV_COMPILE_BYTECODE=1 ENV UV_PYTHON_DOWNLOADS=never -# Install Python dependencies via uv (en-core-web-sm is declared in pyproject.toml) +# Install Python dependencies via uv, then trigger spaCy model self-install while network is available RUN uv sync --locked --all-extras --no-group dev --no-group lint --no-group test --no-group release && \ + uv run --no-sync $PYTHON -c "from unstructured.nlp.tokenize import _get_nlp; print('spaCy model loaded:', _get_nlp().meta['name'])" && \ uv run --no-sync $PYTHON -c "from unstructured.partition.model_init import initialize; initialize()" && \ uv run --no-sync $PYTHON -c "from unstructured_inference.models.tables import UnstructuredTableTransformerModel; model = UnstructuredTableTransformerModel(); model.initialize('microsoft/table-transformer-structure-recognition')" diff --git a/pyproject.toml b/pyproject.toml index 6173be18c2..bfb82dfe96 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,7 +29,7 @@ dependencies = [ "langdetect>=1.0.9, <2.0.0", "lxml>=5.0.0, <7.0.0", "spacy>=3.7.0, <4.0.0", - "en-core-web-sm>=3.8.0, <4.0.0", + "installer>=0.7.0, <1.0.0", "numba>=0.60.0, <1.0.0", "numpy>=1.26.0, <3.0.0", "psutil>=7.2.2, <8.0.0", @@ -43,6 +43,7 @@ dependencies = [ "typing-extensions>=4.15.0, <5.0.0", "unstructured-client>=0.25.9, <1.0.0", "wrapt>=1.0.0, <2.0.0", + "filelock>=3.12.0,<4.0.0", ] [project.optional-dependencies] @@ -181,9 +182,6 @@ release = [ "twine>=6.0.0, <7.0.0", ] -[tool.uv.sources] -en-core-web-sm = { url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl" } - [tool.uv] required-environments = [ "sys_platform == 'linux' and platform_machine == 'x86_64'", diff --git a/unstructured/__version__.py b/unstructured/__version__.py index e2fa5aba8d..94f60f0c05 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.21.1" # pragma: no cover +__version__ = "0.21.2" # pragma: no cover diff --git a/unstructured/nlp/tokenize.py b/unstructured/nlp/tokenize.py index 610f0b9503..cdc21cc13f 100644 --- a/unstructured/nlp/tokenize.py +++ b/unstructured/nlp/tokenize.py @@ -1,25 +1,151 @@ from __future__ import annotations +import hashlib +import importlib +import logging +import os +import shutil +import sys +import sysconfig +import tempfile +import urllib.error +import urllib.request from functools import lru_cache from typing import Final, List, Tuple import spacy +from filelock import FileLock + +logger = logging.getLogger(__name__) CACHE_MAX_SIZE: Final[int] = 128 -try: - _nlp = spacy.load("en_core_web_sm") -except OSError: - raise OSError( - "The spacy model 'en_core_web_sm' is required but not installed. " - "Install it with: python -m spacy download en_core_web_sm" - ) +_SPACY_MODEL_NAME: Final[str] = "en_core_web_sm" +_SPACY_MODEL_VERSION: Final[str] = "3.8.0" +_SPACY_MODEL_URL: Final[str] = ( + f"https://github.com/explosion/spacy-models/releases/download/" + f"{_SPACY_MODEL_NAME}-{_SPACY_MODEL_VERSION}/" + f"{_SPACY_MODEL_NAME}-{_SPACY_MODEL_VERSION}-py3-none-any.whl" +) +_SPACY_MODEL_SHA256: Final[str] = "1932429db727d4bff3deed6b34cfc05df17794f4a52eeb26cf8928f7c1a0fb85" + + +_DOWNLOAD_TIMEOUT_SECONDS: Final[int] = 120 +_INSTALL_LOCK_PATH: Final[str] = os.path.join( + tempfile.gettempdir(), f"{_SPACY_MODEL_NAME}.install.lock" +) + + +def _download_with_timeout(url: str, dest: str) -> None: + """Download a URL to a local file with a socket-level timeout.""" + try: + with urllib.request.urlopen(url, timeout=_DOWNLOAD_TIMEOUT_SECONDS) as resp: + with open(dest, "wb") as out: + shutil.copyfileobj(resp, out) + except urllib.error.URLError as exc: + raise RuntimeError( + f"Failed to download spaCy model from {url}: {exc}. " + "Check your network connection and try again." + ) from exc + + +def _install_spacy_model() -> None: + """Download and install the pinned spaCy model wheel using the `installer` library.""" + from installer import install + from installer.destinations import SchemeDictionaryDestination + from installer.sources import WheelFile + from installer.utils import get_launcher_kind + + with tempfile.TemporaryDirectory() as tmp: + whl_path = os.path.join(tmp, f"{_SPACY_MODEL_NAME}-{_SPACY_MODEL_VERSION}-py3-none-any.whl") + logger.info("Downloading spaCy model %s %s …", _SPACY_MODEL_NAME, _SPACY_MODEL_VERSION) + _download_with_timeout(_SPACY_MODEL_URL, whl_path) + + with open(whl_path, "rb") as f: + sha256 = hashlib.sha256(f.read()).hexdigest() + if sha256 != _SPACY_MODEL_SHA256: + raise RuntimeError( + f"Hash mismatch for {_SPACY_MODEL_NAME}: " + f"expected {_SPACY_MODEL_SHA256}, got {sha256}" + ) + + # Install into a staging directory to avoid races with other processes + staging = os.path.join(tmp, "staging") + paths = sysconfig.get_paths() + staged_paths = paths.copy() + staged_paths["purelib"] = staging + staged_paths["platlib"] = staging + + destination = SchemeDictionaryDestination( + staged_paths, + interpreter=sys.executable, + script_kind=get_launcher_kind(), + ) + with WheelFile.open(whl_path) as source: + install(source=source, destination=destination, additional_metadata={}) + + # Move installed packages from staging into real site-packages. + # The caller holds _INSTALL_LOCK_PATH so no other process races here. + # Any dst that already exists is a remnant of a previous failed install + # (spacy.load() just failed), so remove it before moving to avoid + # shutil.move placing src *inside* an existing directory. + site_packages = paths["purelib"] + for item in os.listdir(staging): + src = os.path.join(staging, item) + dst = os.path.join(site_packages, item) + try: + if os.path.isdir(dst): + shutil.rmtree(dst) + elif os.path.exists(dst): + os.remove(dst) + shutil.move(src, dst) + except OSError as exc: + raise RuntimeError( + f"Failed to install {_SPACY_MODEL_NAME} to {site_packages}: {exc}. " + "Ensure the site-packages directory is writable, or pre-install the model " + f"with: python -m spacy download {_SPACY_MODEL_NAME}" + ) from exc + + logger.info("Installed %s %s", _SPACY_MODEL_NAME, _SPACY_MODEL_VERSION) + + +def _load_spacy_model() -> spacy.language.Language: + try: + return spacy.load(_SPACY_MODEL_NAME) + except OSError: + pass + + # Serialize model installation across processes with an exclusive file lock. + # A well-known path in the system temp dir is visible to all processes + # regardless of their working directory. + with FileLock(_INSTALL_LOCK_PATH, timeout=-1): + # Double-check: another process may have installed while we waited. + importlib.invalidate_caches() + try: + return spacy.load(_SPACY_MODEL_NAME) + except OSError: + pass + _install_spacy_model() + importlib.invalidate_caches() + try: + return spacy.load(_SPACY_MODEL_NAME) + except OSError as exc: + raise RuntimeError( + f"Installed {_SPACY_MODEL_NAME} but spacy.load() still failed. " + "Check site-packages permissions and installation integrity." + ) from exc + + +@lru_cache(maxsize=1) +def _get_nlp() -> spacy.language.Language: + """Load the spaCy model on first use and cache it for the lifetime of the process.""" + return _load_spacy_model() def _process(text: str) -> spacy.tokens.Doc: """Run the spaCy pipeline once. All public functions extract what they need from the Doc.""" # -- str() handles numpy.str_ from OCR pipelines -- - return _nlp(str(text)) + return _get_nlp()(str(text)) def sent_tokenize(text: str) -> List[str]: diff --git a/uv.lock b/uv.lock index d13e30f118..e282f04b04 100644 --- a/uv.lock +++ b/uv.lock @@ -1488,14 +1488,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e1/5e/4b5aaaabddfacfe36ba7768817bd1f71a7a810a43705e531f3ae4c690767/emoji-2.15.0-py3-none-any.whl", hash = "sha256:205296793d66a89d88af4688fa57fd6496732eb48917a87175a023c8138995eb", size = 608433, upload-time = "2025-09-21T12:13:01.197Z" }, ] -[[package]] -name = "en-core-web-sm" -version = "3.8.0" -source = { url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl" } -wheels = [ - { url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl", hash = "sha256:1932429db727d4bff3deed6b34cfc05df17794f4a52eeb26cf8928f7c1a0fb85" }, -] - [[package]] name = "et-xmlfile" version = "2.0.0" @@ -2415,6 +2407,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484, upload-time = "2025-10-18T21:55:41.639Z" }, ] +[[package]] +name = "installer" +version = "0.7.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/05/18/ceeb4e3ab3aa54495775775b38ae42b10a92f42ce42dfa44da684289b8c8/installer-0.7.0.tar.gz", hash = "sha256:a26d3e3116289bb08216e0d0f7d925fcef0b0194eedfa0c944bcaaa106c4b631", size = 474349, upload-time = "2023-03-17T20:39:38.871Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e5/ca/1172b6638d52f2d6caa2dd262ec4c811ba59eee96d54a7701930726bce18/installer-0.7.0-py3-none-any.whl", hash = "sha256:05d1933f0a5ba7d8d6296bb6d5018e7c94fa473ceb10cf198a92ccea19c27b53", size = 453838, upload-time = "2023-03-17T20:39:36.219Z" }, +] + [[package]] name = "invoke" version = "2.2.1" @@ -7036,9 +7037,10 @@ dependencies = [ { name = "beautifulsoup4" }, { name = "charset-normalizer" }, { name = "emoji" }, - { name = "en-core-web-sm" }, + { name = "filelock" }, { name = "filetype" }, { name = "html5lib" }, + { name = "installer" }, { name = "langdetect" }, { name = "lxml" }, { name = "numba" }, @@ -7215,13 +7217,14 @@ requires-dist = [ { name = "beautifulsoup4", specifier = ">=4.14.3,<5.0.0" }, { name = "charset-normalizer", specifier = ">=3.4.4,<4.0.0" }, { name = "emoji", specifier = ">=2.15.0,<3.0.0" }, - { name = "en-core-web-sm", url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl" }, + { name = "filelock", specifier = ">=3.12.0,<4.0.0" }, { name = "filetype", specifier = ">=1.2.0,<2.0.0" }, { name = "google-cloud-vision", marker = "extra == 'all-docs'", specifier = ">=3.12.1,<4.0.0" }, { name = "google-cloud-vision", marker = "extra == 'image'", specifier = ">=3.12.1,<4.0.0" }, { name = "google-cloud-vision", marker = "extra == 'local-inference'", specifier = ">=3.12.1,<4.0.0" }, { name = "google-cloud-vision", marker = "extra == 'pdf'", specifier = ">=3.12.1,<4.0.0" }, { name = "html5lib", specifier = ">=1.1,<2.0.0" }, + { name = "installer", specifier = ">=0.7.0,<1.0.0" }, { name = "langdetect", specifier = ">=1.0.9,<2.0.0" }, { name = "lxml", specifier = ">=5.0.0,<7.0.0" }, { name = "markdown", marker = "extra == 'all-docs'", specifier = ">=3.10.1,<4.0.0" },