refactor(dockerfile-lb): native per-version GPU LB base on nvidia/cuda

deanq · deanq · commit 0abb07d7d2d7 · 2026-06-18T00:54:42.000-07:00
Mirror the GPU worker rewrite for the load-balanced GPU image. Same
nvidia/cuda + deadsnakes pattern, same native-per-version layout, just
with EXPOSE 80 and the uvicorn entrypoint instead of the QB handler.

Refs AE-2827.
diff --git a/Dockerfile-lb b/Dockerfile-lb
@@ -10,12 +10,14 @@
 FROM runpod/pytorch:1.0.3-cu1281-torch291-ubuntu2204
 
 # Target Python version for the worker runtime.
+# Native per-version GPU LB base. Same shape as Dockerfile, with the
+# uvicorn entrypoint for load-balanced endpoints. See Dockerfile for the
+# full rationale on the nvidia/cuda + deadsnakes approach.
 ARG PYTHON_VERSION=3.12
 ARG TORCH_VERSION=2.9.1+cu128
 ARG TORCH_INDEX_URL=https://download.pytorch.org/whl/cu128
 
-# Expose the target version to the running worker for startup validation.
-ENV FLASH_PYTHON_VERSION=${PYTHON_VERSION}
+FROM nvidia/cuda:12.8.1-cudnn-runtime-ubuntu22.04
 
 # Validate the base image provides the requested interpreter and activate it.
 # For non-3.12 targets, install torch for the selected Python and repoint
@@ -42,56 +44,71 @@ RUN python${PYTHON_VERSION} --version \
       && ln -sf "$(which python${PYTHON_VERSION})" /usr/local/bin/python \
       && ln -sf "$(which python${PYTHON_VERSION})" /usr/local/bin/python3; \
     fi
+# ARG values are stage-scoped: re-declare them, with defaults, for this stage.
+ARG PYTHON_VERSION=3.12
+ARG TORCH_VERSION=2.9.1+cu128
+ARG TORCH_INDEX_URL=https://download.pytorch.org/whl/cu128
 
-WORKDIR /app
-
-# Prevent interactive prompts during package installation
+ENV FLASH_PYTHON_VERSION=${PYTHON_VERSION}
 ENV DEBIAN_FRONTEND=noninteractive
-# Set timezone to avoid tzdata prompts
 ENV TZ=Etc/UTC
-
-# Enable HuggingFace transfer acceleration
 ENV HF_HUB_ENABLE_HF_TRANSFER=1
-# Relocate HuggingFace cache outside /root/.cache to exclude from volume sync
 ENV HF_HOME=/hf-cache
 
-# Configure APT cache to persist under /root/.cache for volume sync
+# Provision exactly one interpreter. Ubuntu 22.04 (jammy) carries 3.10 in its
+# own archive; 3.11, 3.12 and 3.13 are pulled from the deadsnakes PPA.
+RUN apt-get update \
+ && apt-get install -y --no-install-recommends \
+      software-properties-common ca-certificates curl gnupg \
+ && add-apt-repository -y ppa:deadsnakes/ppa \
+ && apt-get update \
+ && apt-get install -y --no-install-recommends \
+      python${PYTHON_VERSION} python${PYTHON_VERSION}-venv \
+      python${PYTHON_VERSION}-dev git \
+ && py_bin="$(which python${PYTHON_VERSION})" \
+ && ln -sf "${py_bin}" /usr/local/bin/python \
+ && ln -sf "${py_bin}" /usr/local/bin/python3 \
+ && apt-get clean \
+ && rm -rf /var/lib/apt/lists/*
+
+# Fetch get-pip.py with the selected interpreter, run it to install pip, then delete it.
+RUN python -c "from urllib.request import urlretrieve; urlretrieve('https://bootstrap.pypa.io/get-pip.py', '/tmp/get-pip.py')" \
+ && python /tmp/get-pip.py --no-cache-dir \
+ && rm -f /tmp/get-pip.py
+
+# Install torch==TORCH_VERSION for the active interpreter (index URL quoted against word splitting).
+RUN python -m pip install --no-cache-dir \
+      --index-url "${TORCH_INDEX_URL}" \
+      "torch==${TORCH_VERSION}"
+
+WORKDIR /app
+
+# Configure APT cache to persist under /root/.cache for volume sync.
 RUN mkdir -p /root/.cache/apt/archives/partial \
  && echo 'Dir::Cache "/root/.cache/apt";' > /etc/apt/apt.conf.d/01cache
 
-# Install system dependencies and uv
-# Note: build-essential not pre-installed to reduce image size (400MB savings)
-# Automatic detection will install it when needed (no manual action required)
-# Advanced: Users can pre-install via system_dependencies=["build-essential"]
-RUN DEBIAN_FRONTEND=noninteractive apt-get update && apt-get install -y --no-install-recommends \
-    curl ca-certificates git \
- && curl -LsSf https://astral.sh/uv/install.sh | sh \
+# Install uv for downstream dependency installation (download first, then run: no unguarded pipe).
+RUN curl -LsSf https://astral.sh/uv/install.sh -o /tmp/uv-install.sh && sh /tmp/uv-install.sh && rm -f /tmp/uv-install.sh \
  && cp ~/.local/bin/uv /usr/local/bin/uv \
- && chmod +x /usr/local/bin/uv \
- && apt-get clean \
- && rm -rf /var/lib/apt/lists/*
+ && chmod +x /usr/local/bin/uv
 
-# Copy app code and install dependencies
-# Use --python to target the active interpreter (preserves torch in its site-packages)
+# Copy app code and install worker dependencies into the active interpreter.
 COPY README.md pyproject.toml uv.lock ./
 COPY src/ ./
 RUN uv export --format requirements-txt --no-dev --no-hashes > requirements.txt \
  && uv pip install --python $(which python) --break-system-packages -r requirements.txt
 
-# Install numpy for the active Python version.
-# The runpod/pytorch image ships torch but not numpy. Flash build excludes numpy
-# from tarballs (BASE_IMAGE_PACKAGES) to save tarball space (~30 MB), so numpy
-# must be provided here in the base image.
+# Install numpy for the active Python (excluded from flash tarballs).
 RUN python -m pip install --no-cache-dir numpy
 
-# Verify torch, numpy, and the expected Python version are available.
+# Verify torch, numpy, and the expected interpreter are wired correctly.
 RUN python -c "import sys; actual = f'{sys.version_info.major}.{sys.version_info.minor}'; expected = '${PYTHON_VERSION}'; assert actual == expected, f'Expected Python {expected}, got {actual}'; print(f'Python {actual} OK')" \
  && python -c "import torch; print(f'torch {torch.__version__} CUDA {torch.cuda.is_available()}')" \
  && python -c "import numpy; print(f'numpy {numpy.__version__}')"
 
 EXPOSE 80
 
-# CMD will be overridden by RunPod at runtime to run the specific generated handler
-# The handler factory generates handler_{resource_name}.py files
+# CMD will be overridden by RunPod at runtime to run the specific generated handler.
+# The handler factory generates handler_{resource_name}.py files.
 # RunPod will invoke: uvicorn handler_{resource_name}:app --host 0.0.0.0 --port 80
 CMD ["uvicorn", "lb_handler:app", "--host", "0.0.0.0", "--port", "80", "--timeout-keep-alive", "600"]