New torch version (2.8), new flash attention, remove hf_transfer (#636)

michaelfeil · web-flow · commit 18bfb975801d · 2025-08-22T15:24:32.000-07:00
* hf-xet

* update torch deps

* update docker file

* update docker templates

* hf token no permissions

* remove pg bar
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
@@ -77,6 +77,8 @@ jobs:
         run: |
           poetry run coverage run -m --source ./infinity_emb pytest tests/${{ matrix.coverage_tests }} 
           poetry run coverage xml
+        env:
+          HF_TOKEN: ${{ secrets.HFTOKEN_NOPERMISSIONS }}
       
       - name: Run Pytest Coverage w/o infinity
         if: ${{ inputs.working-directory != 'libs/infinity_emb' }}
diff --git a/README.md b/README.md
@@ -50,7 +50,7 @@ Infinity is a high-throughput, low-latency REST API for serving text-embeddings,
 </p> 
 
 ### Latest News 🔥
-
+- [2025/07] Blackwell support
 - [2024/11] AMD, CPU, ONNX docker images
 - [2024/10] `pip install infinity_client`
 - [2024/07] Inference deployment example via [Modal](./infra/modal/README.md) and a [free GPU deployment](https://infinity.modal.michaelfeil.eu/)
diff --git a/libs/infinity_emb/Docker.template.yaml b/libs/infinity_emb/Docker.template.yaml
@@ -3,12 +3,15 @@
 # 1. Guide: pip install jinja2 jinja2-cli
 nvidia:
   # 2 .command: jinja2 Dockerfile.jinja2 Docker.template.yaml --format=yaml -s nvidia > Dockerfile.nvidia_auto
-  base_image: "nvidia/cuda:12.4.1-base-ubuntu22.04"
+  base_image: "nvidia/cuda:12.9.0-base-ubuntu22.04"
   main_install: 'RUN poetry install --no-interaction --no-ansi --no-root --extras "${EXTRAS}" --without lint,test && poetry cache clear pypi --all'
   python_version: python3.10
   extra_installs_main: |
     # nvcc is not installed -> the following might break if the torch version or python version changes.
-    RUN poetry run $PYTHON -m pip install --no-cache-dir https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
+    RUN poetry run $PYTHON -m pip install --no-cache-dir https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu12torch2.8cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
+  extra_env_variables: |
+    RUN rm -rf /usr/local/cuda-12.9/compat
+    ENV NVIDIA_DISABLE_REQUIRE=true
 cpu:
   # 2. command: jinja2 Dockerfile.jinja2 Docker.template.yaml --format=yaml -s cpu > Dockerfile.cpu_auto
   base_image: "ubuntu:22.04"
@@ -81,7 +84,7 @@ amd:
     ENV INFINITY_BETTERTRANSFORMER="0"
 
 trt:
-  base_image: nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04
+  base_image: nvidia/cuda:12.9.0-cudnn-devel-ubuntu22.04
   poetry_extras: "all onnxruntime-gpu"
   extra_installs_main: |
     # Install utils for tensorrt
@@ -92,5 +95,7 @@ trt:
     # Set default to tensorrt
     ENV LD_LIBRARY_PATH=/app/.venv/lib/${PYTHON}/site-packages/tensorrt:/usr/lib/x86_64-linux-gnu:/app/.venv/lib/${PYTHON}/site-packages/tensorrt_libs:${LD_LIBRARY_PATH}
     ENV PATH=/app/.venv/lib/${PYTHON}/site-packages/tensorrt/bin:${PATH}
+    RUN rm -rf /usr/local/cuda-12.9/compat
+    ENV NVIDIA_DISABLE_REQUIRE=true
   python_version: python3.10
-  main_install: 'RUN poetry install --no-interaction --no-ansi --no-root --extras "${EXTRAS}" --without lint,test && poetry cache clear pypi --all'
+  main_install: 'RUN poetry install --no-interaction --no-ansi --no-root --extras "${EXTRAS}" --without lint,test && poetry cache clear pypi --all'    
diff --git a/libs/infinity_emb/Dockerfile.amd_auto b/libs/infinity_emb/Dockerfile.amd_auto
@@ -14,8 +14,6 @@ ENV PYTHONUNBUFFERED=1 \
     POETRY_VIRTUALENVS_CREATE="true" \
     POETRY_VIRTUALENVS_IN_PROJECT="true" \
     POETRY_NO_INTERACTION=1 \
-    # huggingface     
-    HF_HUB_ENABLE_HF_TRANSFER=1 \
     # extras
     EXTRAS="all" \
     PYTHON="python3" 
diff --git a/libs/infinity_emb/Dockerfile.cpu_auto b/libs/infinity_emb/Dockerfile.cpu_auto
@@ -14,8 +14,6 @@ ENV PYTHONUNBUFFERED=1 \
     POETRY_VIRTUALENVS_CREATE="true" \
     POETRY_VIRTUALENVS_IN_PROJECT="true" \
     POETRY_NO_INTERACTION=1 \
-    # huggingface     
-    HF_HUB_ENABLE_HF_TRANSFER=1 \
     # extras
     EXTRAS="all" \
     PYTHON="python3" 
diff --git a/libs/infinity_emb/Dockerfile.jinja2 b/libs/infinity_emb/Dockerfile.jinja2
@@ -14,8 +14,6 @@ ENV PYTHONUNBUFFERED=1 \
     POETRY_VIRTUALENVS_CREATE="{{poetry_virtualenvs_create | default('true')}}" \
     POETRY_VIRTUALENVS_IN_PROJECT="{{poetry_virtualenvs_in_project | default('true')}}" \
     POETRY_NO_INTERACTION=1 \
-    # huggingface     
-    HF_HUB_ENABLE_HF_TRANSFER=1 \
     # extras
     EXTRAS="{{poetry_extras | default('all')}}" \
     PYTHON="python3" 
diff --git a/libs/infinity_emb/Dockerfile.nvidia_auto b/libs/infinity_emb/Dockerfile.nvidia_auto
@@ -2,7 +2,7 @@
 # This file is generated from Dockerfile.jinja2. Do not edit the Dockerfile.cuda|cpu|amd file directly.
 # Only contribute to the Dockerfile.jinja2 and dockerfile_template.yaml and regenerate the Dockerfile.cuda|cpu|amd
 
-FROM nvidia/cuda:12.4.1-base-ubuntu22.04 AS base
+FROM nvidia/cuda:12.9.0-base-ubuntu22.04 AS base
 
 ENV PYTHONUNBUFFERED=1 \
     # pip
@@ -14,13 +14,13 @@ ENV PYTHONUNBUFFERED=1 \
     POETRY_VIRTUALENVS_CREATE="true" \
     POETRY_VIRTUALENVS_IN_PROJECT="true" \
     POETRY_NO_INTERACTION=1 \
-    # huggingface     
-    HF_HUB_ENABLE_HF_TRANSFER=1 \
     # extras
     EXTRAS="all" \
     PYTHON="python3" 
     # "python3.10"
 RUN apt-get update && apt-get install --no-install-recommends -y build-essential python3-dev libsndfile1 $PYTHON-venv $PYTHON-pip $PYTHON curl
+RUN rm -rf /usr/local/cuda-12.9/compat
+ENV NVIDIA_DISABLE_REQUIRE=true
 
 WORKDIR /app
 
@@ -44,7 +44,7 @@ COPY infinity_emb infinity_emb
 # Install dependency with infinity_emb package
 RUN poetry install --no-interaction --no-ansi  --extras "${EXTRAS}" --without lint,test && poetry cache clear pypi --all
 # nvcc is not installed -> the following might break if the torch version or python version changes.
-RUN poetry run $PYTHON -m pip install --no-cache-dir https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
+RUN poetry run $PYTHON -m pip install --no-cache-dir https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu12torch2.8cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
 
 # TODO: remove this line
 RUN apt-get install --no-install-recommends -y git && poetry run python -m pip install git+https://github.com/huggingface/transformers.git@7547f55e5d93245c0a013b50df976924f2d9e8b0 && rm -rf ~/.cache/ /tmp/*
diff --git a/libs/infinity_emb/Dockerfile.trt_onnx_auto b/libs/infinity_emb/Dockerfile.trt_onnx_auto
@@ -2,7 +2,7 @@
 # This file is generated from Dockerfile.jinja2. Do not edit the Dockerfile.cuda|cpu|amd file directly.
 # Only contribute to the Dockerfile.jinja2 and dockerfile_template.yaml and regenerate the Dockerfile.cuda|cpu|amd
 
-FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04 AS base
+FROM nvidia/cuda:12.9.0-cudnn-devel-ubuntu22.04 AS base
 
 ENV PYTHONUNBUFFERED=1 \
     # pip
@@ -14,8 +14,6 @@ ENV PYTHONUNBUFFERED=1 \
     POETRY_VIRTUALENVS_CREATE="true" \
     POETRY_VIRTUALENVS_IN_PROJECT="true" \
     POETRY_NO_INTERACTION=1 \
-    # huggingface     
-    HF_HUB_ENABLE_HF_TRANSFER=1 \
     # extras
     EXTRAS="all onnxruntime-gpu" \
     PYTHON="python3" 
@@ -24,6 +22,8 @@ RUN apt-get update && apt-get install --no-install-recommends -y build-essential
 # Set default to tensorrt
 ENV LD_LIBRARY_PATH=/app/.venv/lib/${PYTHON}/site-packages/tensorrt:/usr/lib/x86_64-linux-gnu:/app/.venv/lib/${PYTHON}/site-packages/tensorrt_libs:${LD_LIBRARY_PATH}
 ENV PATH=/app/.venv/lib/${PYTHON}/site-packages/tensorrt/bin:${PATH}
+RUN rm -rf /usr/local/cuda-12.9/compat
+ENV NVIDIA_DISABLE_REQUIRE=true
 
 WORKDIR /app
 
diff --git a/libs/infinity_emb/infinity_emb/__init__.py b/libs/infinity_emb/infinity_emb/__init__.py
@@ -2,26 +2,6 @@
 # Copyright (c) 2023-now michaelfeil
 
 import importlib.metadata
-import os
-
-### Check if HF_HUB_ENABLE_HF_TRANSFER is set, if not try to enable it
-if "HF_HUB_ENABLE_HF_TRANSFER" not in os.environ:
-    try:
-        # enable hf hub transfer if available
-        import hf_transfer  # type: ignore # noqa
-
-        # Needs to be at the top of the file / before other
-        os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
-        os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
-        import huggingface_hub.constants  # type: ignore
-
-        huggingface_hub.constants.HF_HUB_ENABLE_HF_TRANSFER = True
-    except ImportError:
-        pass
-import huggingface_hub.constants  # type: ignore
-
-huggingface_hub.constants.HF_HUB_DISABLE_PROGRESS_BARS = True
-
 
 from infinity_emb.args import EngineArgs  # noqa: E402
 from infinity_emb.engine import AsyncEmbeddingEngine, AsyncEngineArray  # noqa: E402
diff --git a/libs/infinity_emb/infinity_emb/infinity_server.py b/libs/infinity_emb/infinity_emb/infinity_server.py
@@ -82,7 +82,7 @@ def create_server(
     async def lifespan(app: FastAPI):
         instrumentator.expose(app)  # type: ignore
         logger.info(
-            f"Creating {len(engine_args_list)}engines: engines={[e.served_model_name for e in engine_args_list]}"
+            f"Creating {len(engine_args_list)} engines: {[e.served_model_name for e in engine_args_list]}"
         )
         telemetry_log_info()
         app.engine_array = AsyncEngineArray.from_args(engine_args_list)  # type: ignore
diff --git a/libs/infinity_emb/poetry.lock b/libs/infinity_emb/poetry.lock
diff --git a/libs/infinity_emb/pyproject.toml b/libs/infinity_emb/pyproject.toml

Original file line number	Diff line number	Diff line change
`@@ -82,7 +82,7 @@ def create_server(`
`82`	`82`	`async def lifespan(app: FastAPI):`
`83`	`83`	`instrumentator.expose(app) # type: ignore`
`84`	`84`	`logger.info(`
`85`		`- f"Creating {len(engine_args_list)}engines: engines={[e.served_model_name for e in engine_args_list]}"`
	`85`	`+ f"Creating {len(engine_args_list)} engines: {[e.served_model_name for e in engine_args_list]}"`
`86`	`86`	`)`
`87`	`87`	`telemetry_log_info()`
`88`	`88`	`app.engine_array = AsyncEngineArray.from_args(engine_args_list) # type: ignore`