Skip to content

Commit 18bfb97

Browse files
authored
New torch version (2.8), new flash attention, remove hf_transfer (#636)
* hf-xet * update torch deps * update docker file * update docker templates * hf token no permissions * remove pg bar
1 parent 7606e06 commit 18bfb97

12 files changed

Lines changed: 173 additions & 234 deletions

.github/workflows/test.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,8 @@ jobs:
7777
run: |
7878
poetry run coverage run -m --source ./infinity_emb pytest tests/${{ matrix.coverage_tests }}
7979
poetry run coverage xml
80+
env:
81+
HF_TOKEN: ${{ secrets.HFTOKEN_NOPERMISSIONS }}
8082

8183
- name: Run Pytest Coverage w/o infinity
8284
if: ${{ inputs.working-directory != 'libs/infinity_emb' }}

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ Infinity is a high-throughput, low-latency REST API for serving text-embeddings,
5050
</p>
5151

5252
### Latest News 🔥
53-
53+
- [2025/07] Blackwell support
5454
- [2024/11] AMD, CPU, ONNX docker images
5555
- [2024/10] `pip install infinity_client`
5656
- [2024/07] Inference deployment example via [Modal](./infra/modal/README.md) and a [free GPU deployment](https://infinity.modal.michaelfeil.eu/)

libs/infinity_emb/Docker.template.yaml

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,15 @@
33
# 1. Guide: pip install jinja2 jinja2-cli
44
nvidia:
55
# 2. command: jinja2 Dockerfile.jinja2 Docker.template.yaml --format=yaml -s nvidia > Dockerfile.nvidia_auto
6-
base_image: "nvidia/cuda:12.4.1-base-ubuntu22.04"
6+
base_image: "nvidia/cuda:12.9.0-base-ubuntu22.04"
77
main_install: 'RUN poetry install --no-interaction --no-ansi --no-root --extras "${EXTRAS}" --without lint,test && poetry cache clear pypi --all'
88
python_version: python3.10
99
extra_installs_main: |
1010
# nvcc is not installed -> the following might break if the torch version or python version changes.
11-
RUN poetry run $PYTHON -m pip install --no-cache-dir https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
11+
RUN poetry run $PYTHON -m pip install --no-cache-dir https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu12torch2.8cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
12+
extra_env_variables: |
13+
RUN rm -rf /usr/local/cuda-12.9/compat
14+
ENV NVIDIA_DISABLE_REQUIRE=true
1215
cpu:
1316
# 2. command: jinja2 Dockerfile.jinja2 Docker.template.yaml --format=yaml -s cpu > Dockerfile.cpu_auto
1417
base_image: "ubuntu:22.04"
@@ -81,7 +84,7 @@ amd:
8184
ENV INFINITY_BETTERTRANSFORMER="0"
8285
8386
trt:
84-
base_image: nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04
87+
base_image: nvidia/cuda:12.9.0-cudnn-devel-ubuntu22.04
8588
poetry_extras: "all onnxruntime-gpu"
8689
extra_installs_main: |
8790
# Install utils for tensorrt
@@ -92,5 +95,7 @@ trt:
9295
# Set default to tensorrt
9396
ENV LD_LIBRARY_PATH=/app/.venv/lib/${PYTHON}/site-packages/tensorrt:/usr/lib/x86_64-linux-gnu:/app/.venv/lib/${PYTHON}/site-packages/tensorrt_libs:${LD_LIBRARY_PATH}
9497
ENV PATH=/app/.venv/lib/${PYTHON}/site-packages/tensorrt/bin:${PATH}
98+
RUN rm -rf /usr/local/cuda-12.9/compat
99+
ENV NVIDIA_DISABLE_REQUIRE=true
95100
python_version: python3.10
96-
main_install: 'RUN poetry install --no-interaction --no-ansi --no-root --extras "${EXTRAS}" --without lint,test && poetry cache clear pypi --all'
101+
main_install: 'RUN poetry install --no-interaction --no-ansi --no-root --extras "${EXTRAS}" --without lint,test && poetry cache clear pypi --all'

libs/infinity_emb/Dockerfile.amd_auto

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,6 @@ ENV PYTHONUNBUFFERED=1 \
1414
POETRY_VIRTUALENVS_CREATE="true" \
1515
POETRY_VIRTUALENVS_IN_PROJECT="true" \
1616
POETRY_NO_INTERACTION=1 \
17-
# huggingface
18-
HF_HUB_ENABLE_HF_TRANSFER=1 \
1917
# extras
2018
EXTRAS="all" \
2119
PYTHON="python3"

libs/infinity_emb/Dockerfile.cpu_auto

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,6 @@ ENV PYTHONUNBUFFERED=1 \
1414
POETRY_VIRTUALENVS_CREATE="true" \
1515
POETRY_VIRTUALENVS_IN_PROJECT="true" \
1616
POETRY_NO_INTERACTION=1 \
17-
# huggingface
18-
HF_HUB_ENABLE_HF_TRANSFER=1 \
1917
# extras
2018
EXTRAS="all" \
2119
PYTHON="python3"

libs/infinity_emb/Dockerfile.jinja2

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,6 @@ ENV PYTHONUNBUFFERED=1 \
1414
POETRY_VIRTUALENVS_CREATE="{{poetry_virtualenvs_create | default('true')}}" \
1515
POETRY_VIRTUALENVS_IN_PROJECT="{{poetry_virtualenvs_in_project | default('true')}}" \
1616
POETRY_NO_INTERACTION=1 \
17-
# huggingface
18-
HF_HUB_ENABLE_HF_TRANSFER=1 \
1917
# extras
2018
EXTRAS="{{poetry_extras | default('all')}}" \
2119
PYTHON="python3"

libs/infinity_emb/Dockerfile.nvidia_auto

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
# This file is generated from Dockerfile.jinja2. Do not edit the Dockerfile.cuda|cpu|amd file directly.
33
# Only contribute to the Dockerfile.jinja2 and dockerfile_template.yaml and regenerate the Dockerfile.cuda|cpu|amd
44

5-
FROM nvidia/cuda:12.4.1-base-ubuntu22.04 AS base
5+
FROM nvidia/cuda:12.9.0-base-ubuntu22.04 AS base
66

77
ENV PYTHONUNBUFFERED=1 \
88
# pip
@@ -14,13 +14,13 @@ ENV PYTHONUNBUFFERED=1 \
1414
POETRY_VIRTUALENVS_CREATE="true" \
1515
POETRY_VIRTUALENVS_IN_PROJECT="true" \
1616
POETRY_NO_INTERACTION=1 \
17-
# huggingface
18-
HF_HUB_ENABLE_HF_TRANSFER=1 \
1917
# extras
2018
EXTRAS="all" \
2119
PYTHON="python3"
2220
# "python3.10"
2321
RUN apt-get update && apt-get install --no-install-recommends -y build-essential python3-dev libsndfile1 $PYTHON-venv $PYTHON-pip $PYTHON curl
22+
RUN rm -rf /usr/local/cuda-12.9/compat
23+
ENV NVIDIA_DISABLE_REQUIRE=true
2424

2525
WORKDIR /app
2626

@@ -44,7 +44,7 @@ COPY infinity_emb infinity_emb
4444
# Install dependency with infinity_emb package
4545
RUN poetry install --no-interaction --no-ansi --extras "${EXTRAS}" --without lint,test && poetry cache clear pypi --all
4646
# nvcc is not installed -> the following might break if the torch version or python version changes.
47-
RUN poetry run $PYTHON -m pip install --no-cache-dir https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
47+
RUN poetry run $PYTHON -m pip install --no-cache-dir https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu12torch2.8cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
4848

4949
# TODO: remove this line
5050
RUN apt-get install --no-install-recommends -y git && poetry run python -m pip install git+https://github.com/huggingface/transformers.git@7547f55e5d93245c0a013b50df976924f2d9e8b0 && rm -rf ~/.cache/ /tmp/*

libs/infinity_emb/Dockerfile.trt_onnx_auto

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
# This file is generated from Dockerfile.jinja2. Do not edit the Dockerfile.cuda|cpu|amd file directly.
33
# Only contribute to the Dockerfile.jinja2 and dockerfile_template.yaml and regenerate the Dockerfile.cuda|cpu|amd
44

5-
FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04 AS base
5+
FROM nvidia/cuda:12.9.0-cudnn-devel-ubuntu22.04 AS base
66

77
ENV PYTHONUNBUFFERED=1 \
88
# pip
@@ -14,8 +14,6 @@ ENV PYTHONUNBUFFERED=1 \
1414
POETRY_VIRTUALENVS_CREATE="true" \
1515
POETRY_VIRTUALENVS_IN_PROJECT="true" \
1616
POETRY_NO_INTERACTION=1 \
17-
# huggingface
18-
HF_HUB_ENABLE_HF_TRANSFER=1 \
1917
# extras
2018
EXTRAS="all onnxruntime-gpu" \
2119
PYTHON="python3"
@@ -24,6 +22,8 @@ RUN apt-get update && apt-get install --no-install-recommends -y build-essential
2422
# Set default to tensorrt
2523
ENV LD_LIBRARY_PATH=/app/.venv/lib/${PYTHON}/site-packages/tensorrt:/usr/lib/x86_64-linux-gnu:/app/.venv/lib/${PYTHON}/site-packages/tensorrt_libs:${LD_LIBRARY_PATH}
2624
ENV PATH=/app/.venv/lib/${PYTHON}/site-packages/tensorrt/bin:${PATH}
25+
RUN rm -rf /usr/local/cuda-12.9/compat
26+
ENV NVIDIA_DISABLE_REQUIRE=true
2727

2828
WORKDIR /app
2929

libs/infinity_emb/infinity_emb/__init__.py

Lines changed: 0 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -2,26 +2,6 @@
22
# Copyright (c) 2023-now michaelfeil
33

44
import importlib.metadata
5-
import os
6-
7-
### Check if HF_HUB_ENABLE_HF_TRANSFER is set, if not try to enable it
8-
if "HF_HUB_ENABLE_HF_TRANSFER" not in os.environ:
9-
try:
10-
# enable hf hub transfer if available
11-
import hf_transfer # type: ignore # noqa
12-
13-
# Needs to be at the top of the file / before other
14-
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
15-
os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
16-
import huggingface_hub.constants # type: ignore
17-
18-
huggingface_hub.constants.HF_HUB_ENABLE_HF_TRANSFER = True
19-
except ImportError:
20-
pass
21-
import huggingface_hub.constants # type: ignore
22-
23-
huggingface_hub.constants.HF_HUB_DISABLE_PROGRESS_BARS = True
24-
255

266
from infinity_emb.args import EngineArgs # noqa: E402
277
from infinity_emb.engine import AsyncEmbeddingEngine, AsyncEngineArray # noqa: E402

libs/infinity_emb/infinity_emb/infinity_server.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ def create_server(
8282
async def lifespan(app: FastAPI):
8383
instrumentator.expose(app) # type: ignore
8484
logger.info(
85-
f"Creating {len(engine_args_list)}engines: engines={[e.served_model_name for e in engine_args_list]}"
85+
f"Creating {len(engine_args_list)} engines: {[e.served_model_name for e in engine_args_list]}"
8686
)
8787
telemetry_log_info()
8888
app.engine_array = AsyncEngineArray.from_args(engine_args_list) # type: ignore

0 commit comments

Comments
 (0)