diff --git a/backend/pyproject.toml b/backend/pyproject.toml
index af0d10ef4..65e27107a 100644
--- a/backend/pyproject.toml
+++ b/backend/pyproject.toml
@@ -25,7 +25,8 @@ data-process = [
     "celery>=5.3.6",
     "flower>=2.0.1",
     "nest_asyncio>=1.5.6",
-    "unstructured[csv,docx,pdf,pptx,xlsx,md]==0.18.14"
+    "unstructured[csv,docx,pdf,pptx,xlsx,md]==0.18.14",
+    "huggingface_hub>=0.19.0,<0.21.0"
 ]
 test = [
     "pytest",
diff --git a/make/data_process/Dockerfile b/make/data_process/Dockerfile
index e7550fe2d..35d7a6c48 100644
--- a/make/data_process/Dockerfile
+++ b/make/data_process/Dockerfile
@@ -42,6 +42,11 @@ RUN uv sync --no-cache-dir --extra data-process $(test -n "$MIRROR" && echo "-i
 COPY sdk /opt/sdk
 RUN uv pip install --no-cache-dir /opt/sdk $(test -n "$MIRROR" && echo "-i $MIRROR") && \
     uv cache clean
+
+# Pre-download the tiktoken cl100k_base encoding at build time so the container does not need network access for it at runtime.
+# Pin the cache directory: without TIKTOKEN_CACHE_DIR tiktoken caches under the system temp dir, which can differ or be wiped at runtime.
+ENV TIKTOKEN_CACHE_DIR=/opt/tiktoken_cache
+RUN uv run python -c "import tiktoken; tiktoken.get_encoding('cl100k_base')"
 
 # Layer 3: copy backend code
 COPY backend /opt/backend