Add AMD ROCm Docker support (RDNA3/RDNA4)

Kaihui-AMD · Kaihui-AMD · commit fea78b2efda3 · 2026-06-01T17:34:15.000+08:00
diff --git a/compose.rocm.yml b/compose.rocm.yml
@@ -0,0 +1,58 @@
+name: fish-speech-rocm
+
+# AMD ROCm compose for Fish Speech (RDNA3 / RDNA4).
+# Put the model files in ./checkpoints/s2-pro before running (./checkpoints is bind-mounted).
+#
+#   docker compose -f compose.rocm.yml --profile webui up --build
+#   docker compose -f compose.rocm.yml --profile server up --build
+
+services:
+  webui:
+    profiles: ["webui"]
+    image: fish-speech-webui:rocm
+    build:
+      context: .
+      dockerfile: docker/Dockerfile.rocm
+      target: webui
+    devices:  # AMD GPU device nodes passed through to the container
+      - /dev/kfd
+      - /dev/dri
+    group_add:
+      - video
+      - render
+    shm_size: "16g"
+    environment:
+      ROCBLAS_USE_HIPBLASLT: "0"  # kept off for RDNA4 (gfx1201) stability
+      COMPILE: "${COMPILE:-1}"  # 1 enables torch.compile; COMPILE=0 disables it
+    ports:
+      - "${GRADIO_PORT:-7860}:7860"
+    volumes:
+      - ./checkpoints:/app/checkpoints
+      - ./references:/app/references
+    stdin_open: true
+    tty: true
+
+  server:
+    profiles: ["server"]
+    image: fish-speech-server:rocm
+    build:
+      context: .
+      dockerfile: docker/Dockerfile.rocm
+      target: server
+    devices:  # AMD GPU device nodes passed through to the container
+      - /dev/kfd
+      - /dev/dri
+    group_add:
+      - video
+      - render
+    shm_size: "16g"
+    environment:
+      ROCBLAS_USE_HIPBLASLT: "0"  # kept off for RDNA4 (gfx1201) stability
+      COMPILE: "${COMPILE:-1}"  # 1 enables torch.compile; COMPILE=0 disables it
+    ports:
+      - "${API_PORT:-8080}:8080"
+    volumes:
+      - ./checkpoints:/app/checkpoints
+      - ./references:/app/references
+    stdin_open: true
+    tty: true
diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
@@ -0,0 +1,106 @@
+# docker/Dockerfile.rocm
+#
+# Fish Speech on AMD ROCm (RDNA3 / RDNA4).
+# The checkpoints are NOT bundled — mount them at /app/checkpoints.
+#
+# Build:
+#   docker build -f docker/Dockerfile.rocm --target webui -t fish-speech-webui:rocm .
+#   docker build -f docker/Dockerfile.rocm --target server -t fish-speech-server:rocm .
+#
+# Run (webui):
+#   docker run --device=/dev/kfd --device=/dev/dri \
+#       --group-add video --group-add render \
+#       -e ROCBLAS_USE_HIPBLASLT=0 \
+#       -v ./checkpoints:/app/checkpoints -v ./references:/app/references \
+#       -p 7860:7860 fish-speech-webui:rocm
+
+ARG ROCM_VERSION=7.2.3
+ARG BASE_IMAGE=rocm/pytorch:rocm${ROCM_VERSION}_ubuntu24.04_py3.12_pytorch_release_2.9.1
+
+FROM ${BASE_IMAGE} AS app-base
+
+ENV DEBIAN_FRONTEND=noninteractive \
+    PYTHONDONTWRITEBYTECODE=1 \
+    PYTHONUNBUFFERED=1 \
+    ROCBLAS_USE_HIPBLASLT=0
+
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+        git ffmpeg libsox-dev build-essential cmake \
+        libasound-dev portaudio19-dev libportaudio2 libportaudiocpp0 \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+
+COPY . /app
+
+# Install runtime deps WITHOUT naming torch/torchaudio: the default BASE_IMAGE
+# already ships a gfx-tuned torch (2.9.1+rocm7.2.3). The package itself is then
+# installed with --no-deps so pip does not try to pull a CUDA/CPU torch.
+RUN pip install --no-cache-dir --upgrade pip setuptools wheel \
+    && pip install --no-cache-dir \
+        numpy "transformers<=4.57.3" datasets lightning pytorch_lightning \
+        hydra-core natsort einops librosa rich "gradio>5.0.0" wandb grpcio kui \
+        uvicorn loguru loralib pyrootutils resampy "einx[torch]==0.2.2" zstandard \
+        pydub "modelscope==1.17.1" "opencc-python-reimplemented==0.1.7" \
+        silero-vad ormsgpack tiktoken "pydantic==2.9.2" cachetools \
+        descript-audio-codec safetensors soundfile vector_quantize_pytorch \
+    && pip install --no-cache-dir --no-build-isolation pyaudio \
+    && pip install --no-cache-dir --no-deps -e . \
+    # descript-audiotools pins protobuf<3.20, but fish-speech's generated proto
+    # code needs >=3.20, so this override must stay the LAST pip step (mirrors pyproject's uv override).
+    && pip install --no-cache-dir --no-deps --upgrade "protobuf>=4.25,<6.0"
+
+EXPOSE 7860 8080
+
+# torch.compile is on by default (verified working on gfx1201/RDNA4): the start
+# scripts pass --compile when COMPILE is "1" or "true". Set COMPILE=0 to disable.
+ENV COMPILE=1
+
+##############################################################
+# Gradio WebUI: start_webui.sh runs tools/run_webui.py (+ --compile if COMPILE is 1/true)
+##############################################################
+FROM app-base AS webui
+
+ARG GRADIO_SERVER_NAME="0.0.0.0"
+ARG GRADIO_SERVER_PORT=7860
+ENV GRADIO_SERVER_NAME=${GRADIO_SERVER_NAME} \
+    GRADIO_SERVER_PORT=${GRADIO_SERVER_PORT}
+
+RUN printf '%s\n' \
+    '#!/bin/bash' \
+    'set -e' \
+    'ARGS=()' \
+    'if [ "${COMPILE:-0}" = "1" ] || [ "${COMPILE:-}" = "true" ]; then ARGS+=(--compile); fi' \
+    'exec python tools/run_webui.py \' \
+    '  --llama-checkpoint-path checkpoints/s2-pro \' \
+    '  --decoder-checkpoint-path checkpoints/s2-pro/codec.pth \' \
+    '  --decoder-config-name modded_dac_vq "${ARGS[@]}"' \
+    > /app/start_webui.sh && chmod +x /app/start_webui.sh
+
+ENTRYPOINT ["/app/start_webui.sh"]
+
+##############################################################
+# API Server: start_server.sh listens on API_SERVER_NAME:API_SERVER_PORT (default 0.0.0.0:8080)
+##############################################################
+FROM app-base AS server
+
+ARG API_SERVER_NAME="0.0.0.0"
+ARG API_SERVER_PORT=8080
+ENV API_SERVER_NAME=${API_SERVER_NAME} \
+    API_SERVER_PORT=${API_SERVER_PORT}
+
+RUN printf '%s\n' \
+    '#!/bin/bash' \
+    'set -e' \
+    'ARGS=()' \
+    'if [ "${COMPILE:-0}" = "1" ] || [ "${COMPILE:-}" = "true" ]; then ARGS+=(--compile); fi' \
+    'exec python tools/api_server.py \' \
+    '  --listen "${API_SERVER_NAME:-0.0.0.0}:${API_SERVER_PORT:-8080}" \' \
+    '  --llama-checkpoint-path checkpoints/s2-pro \' \
+    '  --decoder-checkpoint-path checkpoints/s2-pro/codec.pth \' \
+    '  --decoder-config-name modded_dac_vq "${ARGS[@]}"' \
+    > /app/start_server.sh && chmod +x /app/start_server.sh
+
+ENTRYPOINT ["/app/start_server.sh"]
diff --git a/docs/en/install.md b/docs/en/install.md
@@ -189,3 +189,40 @@ Both methods require mounting these directories:
 
 !!! warning
     GPU support requires NVIDIA Docker runtime. For CPU-only deployment, remove the `--gpus all` flag and use CPU images.
+
+### AMD ROCm support
+
+Fish Speech runs on AMD GPUs via ROCm. The ROCm image is based on the official `rocm/pytorch` image, which already ships a gfx-tuned PyTorch, so no separate torch install is needed. Verified on RDNA4 (Radeon AI PRO R9700 / gfx1201) with ROCm 7.2.3; RDNA3 (gfx1100/gfx1101) should also work.
+
+**Prerequisites:**
+
+- AMD GPU with ROCm support (RDNA3 / RDNA4)
+- ROCm drivers installed on the host
+- Docker with GPU passthrough (`/dev/kfd` and `/dev/dri`)
+
+**Using Docker Compose:**
+
+```bash
+# WebUI
+docker compose -f compose.rocm.yml --profile webui up --build
+
+# API server
+docker compose -f compose.rocm.yml --profile server up --build
+```
+
+**Manual build and run:**
+
+```bash
+docker build -f docker/Dockerfile.rocm --target webui -t fish-speech-webui:rocm .
+
+docker run \
+    --device=/dev/kfd --device=/dev/dri \
+    --group-add video --group-add render \
+    -e ROCBLAS_USE_HIPBLASLT=0 \
+    -v ./checkpoints:/app/checkpoints -v ./references:/app/references \
+    -p 7860:7860 \
+    fish-speech-webui:rocm
+```
+
+!!! note
+    `ROCBLAS_USE_HIPBLASLT=0` is baked into the image (and repeated in `compose.rocm.yml`) for RDNA4 (gfx1201) stability, so passing it with `-e` is optional; RDNA3 users may not need it. Fish Speech uses `scaled_dot_product_attention`, which dispatches to ROCm's AOTriton flash-attention backend automatically — no custom kernel build is required. The first run is slower while MIOpen auto-tunes kernels. `torch.compile` is enabled by default (`COMPILE=1`); to disable it, set `COMPILE=0` in the environment when using Docker Compose, or pass `-e COMPILE=0` to `docker run`.