Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
## 0.21.2

### Fixes
- **Self-install pinned spaCy model at runtime with SHA256 verification**: Replace the `en-core-web-sm` direct URL dependency in `pyproject.toml` with the `installer` library. The spaCy model is now downloaded and installed on first use with hash verification, removing the need for `[tool.uv.sources]` and making the install more portable.

## 0.21.1

- Bump version to create a new release
Expand Down
3 changes: 2 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,9 @@ ENV TESSDATA_PREFIX=/usr/local/share/tessdata
ENV UV_COMPILE_BYTECODE=1
ENV UV_PYTHON_DOWNLOADS=never

# Install Python dependencies via uv (en-core-web-sm is declared in pyproject.toml)
# Install Python dependencies via uv, then trigger spaCy model self-install while network is available
RUN uv sync --locked --all-extras --no-group dev --no-group lint --no-group test --no-group release && \
uv run --no-sync $PYTHON -c "from unstructured.nlp.tokenize import _get_nlp; print('spaCy model loaded:', _get_nlp().meta['name'])" && \
uv run --no-sync $PYTHON -c "from unstructured.partition.model_init import initialize; initialize()" && \
uv run --no-sync $PYTHON -c "from unstructured_inference.models.tables import UnstructuredTableTransformerModel; model = UnstructuredTableTransformerModel(); model.initialize('microsoft/table-transformer-structure-recognition')"

Expand Down
6 changes: 2 additions & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ dependencies = [
"langdetect>=1.0.9, <2.0.0",
"lxml>=5.0.0, <7.0.0",
"spacy>=3.7.0, <4.0.0",
"en-core-web-sm>=3.8.0, <4.0.0",
"installer>=0.7.0, <1.0.0",
Comment thread
cursor[bot] marked this conversation as resolved.
"numba>=0.60.0, <1.0.0",
"numpy>=1.26.0, <3.0.0",
"psutil>=7.2.2, <8.0.0",
Expand All @@ -43,6 +43,7 @@ dependencies = [
"typing-extensions>=4.15.0, <5.0.0",
"unstructured-client>=0.25.9, <1.0.0",
"wrapt>=1.0.0, <2.0.0",
"filelock>=3.12.0,<4.0.0",
]

[project.optional-dependencies]
Expand Down Expand Up @@ -181,9 +182,6 @@ release = [
"twine>=6.0.0, <7.0.0",
]

[tool.uv.sources]
en-core-web-sm = { url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl" }

[tool.uv]
required-environments = [
"sys_platform == 'linux' and platform_machine == 'x86_64'",
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.21.1" # pragma: no cover
__version__ = "0.21.2" # pragma: no cover
142 changes: 134 additions & 8 deletions unstructured/nlp/tokenize.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,151 @@
from __future__ import annotations

import hashlib
import importlib
import logging
import os
import shutil
import sys
import sysconfig
import tempfile
import urllib.error
import urllib.request
from functools import lru_cache
from typing import Final, List, Tuple

import spacy
from filelock import FileLock

logger = logging.getLogger(__name__)

CACHE_MAX_SIZE: Final[int] = 128

# -- Pinned spaCy model, downloaded and installed on first use (see
# -- _load_spacy_model). Bump the version and the SHA256 together.
_SPACY_MODEL_NAME: Final[str] = "en_core_web_sm"
_SPACY_MODEL_VERSION: Final[str] = "3.8.0"
_SPACY_MODEL_URL: Final[str] = (
    f"https://github.com/explosion/spacy-models/releases/download/"
    f"{_SPACY_MODEL_NAME}-{_SPACY_MODEL_VERSION}/"
    f"{_SPACY_MODEL_NAME}-{_SPACY_MODEL_VERSION}-py3-none-any.whl"
)
# SHA256 of the wheel above; verified after download, before installing.
_SPACY_MODEL_SHA256: Final[str] = "1932429db727d4bff3deed6b34cfc05df17794f4a52eeb26cf8928f7c1a0fb85"


# Socket-level timeout for the model wheel download.
_DOWNLOAD_TIMEOUT_SECONDS: Final[int] = 120
# Well-known lock path in the system temp dir so every process serializes on
# the same lock regardless of its working directory.
_INSTALL_LOCK_PATH: Final[str] = os.path.join(
    tempfile.gettempdir(), f"{_SPACY_MODEL_NAME}.install.lock"
)


def _download_with_timeout(url: str, dest: str) -> None:
"""Download a URL to a local file with a socket-level timeout."""
try:
with urllib.request.urlopen(url, timeout=_DOWNLOAD_TIMEOUT_SECONDS) as resp:
with open(dest, "wb") as out:
shutil.copyfileobj(resp, out)
except urllib.error.URLError as exc:
raise RuntimeError(
f"Failed to download spaCy model from {url}: {exc}. "
"Check your network connection and try again."
) from exc


def _install_spacy_model() -> None:
    """Fetch the pinned spaCy model wheel, verify its SHA256, and install it.

    The wheel is unpacked with the ``installer`` library into a staging
    directory and then promoted into the interpreter's site-packages, so
    other processes never observe a half-written package tree. The caller is
    expected to hold ``_INSTALL_LOCK_PATH`` while this runs.

    Raises:
        RuntimeError: On download failure, hash mismatch, or when the final
            move into site-packages fails (e.g. read-only install).
    """
    from installer import install
    from installer.destinations import SchemeDictionaryDestination
    from installer.sources import WheelFile
    from installer.utils import get_launcher_kind

    wheel_name = f"{_SPACY_MODEL_NAME}-{_SPACY_MODEL_VERSION}-py3-none-any.whl"
    with tempfile.TemporaryDirectory() as workdir:
        wheel_path = os.path.join(workdir, wheel_name)
        logger.info("Downloading spaCy model %s %s …", _SPACY_MODEL_NAME, _SPACY_MODEL_VERSION)
        _download_with_timeout(_SPACY_MODEL_URL, wheel_path)

        # -- Verify the pinned hash before any install step runs. --
        with open(wheel_path, "rb") as fh:
            digest = hashlib.sha256(fh.read()).hexdigest()
        if digest != _SPACY_MODEL_SHA256:
            raise RuntimeError(
                f"Hash mismatch for {_SPACY_MODEL_NAME}: "
                f"expected {_SPACY_MODEL_SHA256}, got {digest}"
            )

        # -- Unpack into a staging directory to avoid races with other
        # -- processes reading site-packages mid-install. NOTE(review): only
        # -- purelib/platlib are redirected; a wheel shipping scripts/data
        # -- would still write those schemes directly — fine for this
        # -- pure-python model wheel, but confirm if the wheel ever changes.
        staging_dir = os.path.join(workdir, "staging")
        scheme_paths = sysconfig.get_paths()
        staged_scheme = dict(scheme_paths, purelib=staging_dir, platlib=staging_dir)

        destination = SchemeDictionaryDestination(
            staged_scheme,
            interpreter=sys.executable,
            script_kind=get_launcher_kind(),
        )
        with WheelFile.open(wheel_path) as wheel:
            install(source=wheel, destination=destination, additional_metadata={})

        # -- Promote the staged packages into the real site-packages. The
        # -- caller holds _INSTALL_LOCK_PATH, so nothing races here. Any
        # -- pre-existing destination entry is debris from an earlier failed
        # -- install (spacy.load() just failed), so clear it first; otherwise
        # -- shutil.move would nest the source *inside* the existing dir. --
        site_packages = scheme_paths["purelib"]
        for entry in os.listdir(staging_dir):
            staged_entry = os.path.join(staging_dir, entry)
            target = os.path.join(site_packages, entry)
            try:
                if os.path.isdir(target):
                    shutil.rmtree(target)
                elif os.path.exists(target):
                    os.remove(target)
                shutil.move(staged_entry, target)
            except OSError as exc:
                raise RuntimeError(
                    f"Failed to install {_SPACY_MODEL_NAME} to {site_packages}: {exc}. "
                    "Ensure the site-packages directory is writable, or pre-install the model "
                    f"with: python -m spacy download {_SPACY_MODEL_NAME}"
                ) from exc

        logger.info("Installed %s %s", _SPACY_MODEL_NAME, _SPACY_MODEL_VERSION)


def _load_spacy_model() -> spacy.language.Language:
    """Return the pinned spaCy model, self-installing it on first use.

    Fast path: the model package is already importable and loads directly.
    Slow path: take a process-wide file lock, re-check (another process may
    have installed the model while we waited), run the install, and load
    again.

    Raises:
        RuntimeError: If loading still fails after a successful install.
    """

    def _try_load() -> spacy.language.Language | None:
        # -- spaCy raises OSError when the model package is not installed --
        try:
            return spacy.load(_SPACY_MODEL_NAME)
        except OSError:
            return None

    nlp = _try_load()
    if nlp is not None:
        return nlp

    # -- Serialize installation across processes with an exclusive file lock
    # -- at a well-known temp-dir path, visible to all processes regardless
    # -- of their working directory. timeout=-1 blocks indefinitely. --
    with FileLock(_INSTALL_LOCK_PATH, timeout=-1):
        # -- Double-check: another process may have installed while we waited --
        importlib.invalidate_caches()
        nlp = _try_load()
        if nlp is not None:
            return nlp

        _install_spacy_model()
        importlib.invalidate_caches()
        try:
            return spacy.load(_SPACY_MODEL_NAME)
        except OSError as exc:
            raise RuntimeError(
                f"Installed {_SPACY_MODEL_NAME} but spacy.load() still failed. "
                "Check site-packages permissions and installation integrity."
            ) from exc


@lru_cache(maxsize=1)
def _get_nlp() -> spacy.language.Language:
    """Return the process-wide spaCy pipeline, loading it on first call.

    ``lru_cache(maxsize=1)`` memoizes the loaded model so the (potentially
    slow, self-installing) load happens at most once per process.
    """
    nlp = _load_spacy_model()
    return nlp


def _process(text: str) -> spacy.tokens.Doc:
    """Run the spaCy pipeline once over *text* and return the resulting Doc.

    All public tokenize functions extract what they need from this Doc.
    """
    # -- str() handles numpy.str_ from OCR pipelines --
    return _get_nlp()(str(text))


def sent_tokenize(text: str) -> List[str]:
Expand Down
23 changes: 13 additions & 10 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading