Skip to content

Commit 085df53

Browse files
author
Developer
committed
Merge remote-tracking branch 'origin/main' into R3_support
2 parents 6ed1951 + d3397d7 commit 085df53

File tree

96 files changed

+4884
-1277
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

96 files changed

+4884
-1277
lines changed

.github/workflows/docker-publish.yml

Lines changed: 8 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@ env:
2020

2121
jobs:
2222
build:
23-
2423
runs-on: ubuntu-latest
2524
permissions:
2625
contents: read
@@ -65,7 +64,6 @@ jobs:
6564
if: github.event_name != 'pull_request'
6665
uses: sigstore/cosign-installer@v3.5.0
6766

68-
6967
# Workaround: https://github.com/docker/build-push-action/issues/461
7068
- name: Setup Docker buildx
7169
uses: docker/setup-buildx-action@79abd3f86f79a9d68a23c75a09a9a85889262adf
@@ -88,9 +86,8 @@ jobs:
8886
with:
8987
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
9088

91-
# Build and push Docker image with Buildx (don't push on PR)
92-
# https://github.com/docker/build-push-action
93-
- name: Build and push Docker image
89+
# Build and push default image (cuda12.8.0)
90+
- name: Build and push Docker image (default cuda12.8.0)
9491
id: build-and-push
9592
uses: docker/build-push-action@ac9327eae2b366085ac7f6a2d02df8aa8ead720a
9693
with:
@@ -99,19 +96,11 @@ jobs:
9996
push: ${{ github.event_name != 'pull_request' }}
10097
tags: ${{ steps.meta.outputs.tags }}
10198
labels: ${{ steps.meta.outputs.labels }}
102-
cache-from: type=gha
103-
cache-to: type=gha,mode=max
104-
105-
# Build and push specific Docker image for deepep
106-
# https://github.com/docker/build-push-action
107-
- name: Build and push deepep Docker image
108-
id: build-and-push-deepep
109-
uses: docker/build-push-action@ac9327eae2b366085ac7f6a2d02df8aa8ead720a
110-
with:
111-
context: .
112-
file: ./docker/Dockerfile.deepep
113-
push: ${{ github.event_name != 'pull_request' }}
114-
tags: ghcr.io/modeltc/lightllm:main-deepep
99+
build-args: |
100+
CUDA_VERSION=12.8.0
101+
ENABLE_DEEPEP=1
102+
ENABLE_NIXL=1
103+
ENABLE_CACHE=1
115104
cache-from: type=gha
116105
cache-to: type=gha,mode=max
117106

@@ -128,4 +117,4 @@ jobs:
128117
DIGEST: ${{ steps.build-and-push.outputs.digest }}
129118
# This step uses the identity token to provision an ephemeral certificate
130119
# against the sigstore community Fulcio instance.
131-
run: echo "${TAGS}" | xargs -I {} cosign sign --yes {}@${DIGEST}
120+
run: echo "${TAGS}" | xargs -I {} cosign sign --yes {}@${DIGEST}

build_and_upload_docker.sh

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,4 @@ IMAGE_TAG=$2
1818
ACCOUNT=$1
1919
aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin $ACCOUNT.dkr.ecr.us-west-2.amazonaws.com
2020
DOCKER_BUILDKIT=1 docker build -f docker/Dockerfile $ACCOUNT.dkr.ecr.us-west-2.amazonaws.com/lightllm:$IMAGE_TAG .
21-
docker push $ACCOUNT.dkr.ecr.us-west-2.amazonaws.com/lightllm:$IMAGE_TAG
22-
23-
#deepep
24-
DOCKER_BUILDKIT=1 docker build -f docker/Dockerfile.deepep -t $ACCOUNT.dkr.ecr.us-west-2.amazonaws.com/lightllm:$IMAGE_TAG-deepep .
25-
docker push $ACCOUNT.dkr.ecr.us-west-2.amazonaws.com/lightllm:$IMAGE_TAG-deepep
21+
docker push $ACCOUNT.dkr.ecr.us-west-2.amazonaws.com/lightllm:$IMAGE_TAG

docker/Dockerfile

Lines changed: 102 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,26 @@
1-
ARG CUDA_VERSION=12.6.1
1+
ARG CUDA_VERSION=12.8.0
22
FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu22.04
3+
34
ARG PYTHON_VERSION=3.10
45
ARG MAMBA_VERSION=24.7.1-0
6+
ARG VLLM_VERSION=0.11.0
57
ARG TARGETPLATFORM
8+
ARG ENABLE_DEEPEP=1
9+
ARG ENABLE_NIXL=1
10+
ARG ENABLE_CACHE=1
11+
612
ENV PATH=/opt/conda/bin:$PATH \
713
CONDA_PREFIX=/opt/conda
814

9-
RUN chmod 777 -R /tmp && apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
10-
ca-certificates \
11-
libssl-dev \
12-
curl \
13-
g++ \
14-
make \
15-
git && \
15+
RUN chmod 777 -R /tmp && \
16+
apt-get update && \
17+
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
18+
ca-certificates \
19+
libssl-dev \
20+
curl \
21+
g++ \
22+
make \
23+
git && \
1624
rm -rf /var/lib/apt/lists/*
1725

1826
RUN case ${TARGETPLATFORM} in \
@@ -25,24 +33,103 @@ RUN case ${TARGETPLATFORM} in \
2533

2634
RUN case ${TARGETPLATFORM} in \
2735
"linux/arm64") exit 1 ;; \
28-
*) /opt/conda/bin/conda update -y conda && \
29-
/opt/conda/bin/conda install -y "python=${PYTHON_VERSION}" ;; \
36+
*) /opt/conda/bin/conda update -y conda && \
37+
/opt/conda/bin/conda install -y "python=${PYTHON_VERSION}" ;; \
3038
esac && \
3139
/opt/conda/bin/conda clean -ya
3240

33-
3441
WORKDIR /root
3542

3643
COPY ./requirements.txt /lightllm/requirements.txt
3744
RUN pip install -U pip
3845
RUN pip install -r /lightllm/requirements.txt --no-cache-dir
46+
RUN pip install --no-cache-dir vllm==${VLLM_VERSION}
47+
RUN pip install https://github.com/ModelTC/LightKernel/releases/download/v1.0.1/lightllm_kernel-0.1.0-cp310-cp310-linux_x86_64.whl
48+
49+
RUN apt-get update && apt-get install -y libnuma-dev && rm -rf /var/lib/apt/lists/*
50+
51+
ENV CUDA_HOME=/usr/local/cuda \
52+
GDRCOPY_HOME=/usr/src/gdrdrv-2.4.4/
53+
54+
RUN if [ "${ENABLE_CACHE}" = "1" ]; then \
55+
apt-get update && apt-get install -y libboost-dev && rm -rf /var/lib/apt/lists/*; \
56+
LIGHTMEM_REF=5900baf92d85ef4dbda6124093506b0af906011a; \
57+
pip install --no-deps -v "git+https://github.com/ModelTC/LightMem.git@${LIGHTMEM_REF}#egg=light_mem"; \
58+
fi
3959

40-
RUN pip install --no-cache-dir vllm --pre --extra-index-url https://wheels.vllm.ai/nightly
60+
RUN if [ "${ENABLE_NIXL}" = "1" ] || [ "${ENABLE_DEEPEP}" = "1" ]; then \
61+
apt-get update && apt-get install -y wget devscripts debhelper dh-make build-essential dkms && \
62+
apt-get install -y ibverbs-providers infiniband-diags perftest rdma-core libibverbs-dev librdmacm-dev && \
63+
rm -rf /var/lib/apt/lists/*; \
64+
mkdir -p /tmp/gdrcopy && cd /tmp \
65+
&& git clone https://github.com/NVIDIA/gdrcopy.git -b v2.4.4 \
66+
&& cd gdrcopy/packages \
67+
&& CUDA=/usr/local/cuda ./build-deb-packages.sh \
68+
&& dpkg -i gdrdrv-dkms_*.deb libgdrapi_*.deb gdrcopy-tests_*.deb gdrcopy_*.deb \
69+
&& cd / && rm -rf /tmp/gdrcopy; \
70+
fi
4171

42-
# TODO: offline compile
43-
# RUN git clone https://github.com/ModelTC/LightKernel.git && cd LightKernel && pip install --no-deps -v .
72+
RUN if [ "${ENABLE_DEEPEP}" = "1" ]; then \
73+
set -e; \
74+
ln -sf /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so; \
75+
NVSHMEM_VERSION=3.3.9; \
76+
CUDA_ARCHS=90; \
77+
wget https://developer.download.nvidia.com/compute/redist/nvshmem/${NVSHMEM_VERSION}/source/nvshmem_src_cuda12-all-all-${NVSHMEM_VERSION}.tar.gz \
78+
&& tar -xf nvshmem_src_cuda12-all-all-${NVSHMEM_VERSION}.tar.gz && mv nvshmem_src nvshmem \
79+
&& cd nvshmem \
80+
&& rm -f /root/nvshmem_src_cuda12-all-all-${NVSHMEM_VERSION}.tar.gz \
81+
&& NVSHMEM_SHMEM_SUPPORT=0 \
82+
NVSHMEM_UCX_SUPPORT=0 \
83+
NVSHMEM_USE_NCCL=0 \
84+
NVSHMEM_MPI_SUPPORT=0 \
85+
NVSHMEM_IBGDA_SUPPORT=1 \
86+
NVSHMEM_PMIX_SUPPORT=0 \
87+
NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
88+
NVSHMEM_USE_GDRCOPY=1 \
89+
cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=/root/nvshmem/install -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCHS} \
90+
&& cmake --build build --target install -j64; \
91+
DEEPEP_COMMIT=b6ce310bb0b75079682d09bc2ebc063a074fbd58; \
92+
cd /root && git clone https://github.com/deepseek-ai/DeepEP.git && cd DeepEP && git checkout ${DEEPEP_COMMIT} && cd ..; \
93+
cd /root/DeepEP && NVSHMEM_DIR=/root/nvshmem/install python setup.py install; \
94+
fi
4495

45-
RUN apt-get update && apt-get install -y libnuma-dev # for sgl_kernel
96+
RUN if [ "${ENABLE_NIXL}" = "1" ]; then \
97+
apt-get update && apt-get install -y cmake automake autotools-dev libtool libz-dev && \
98+
DEBIAN_FRONTEND=noninteractive apt-get -y install --reinstall libibverbs-dev rdma-core ibverbs-utils libibumad-dev; \
99+
rm -rf /usr/lib/ucx && rm -rf /opt/hpcx/ucx && \
100+
cd /usr/local/src && \
101+
git clone https://github.com/openucx/ucx.git && \
102+
cd ucx && \
103+
git checkout v1.19.x && \
104+
./autogen.sh && ./configure \
105+
--enable-shared \
106+
--disable-static \
107+
--disable-doxygen-doc \
108+
--enable-optimizations \
109+
--enable-cma \
110+
--enable-devel-headers \
111+
--with-cuda=/usr/local/cuda \
112+
--with-verbs=yes \
113+
--with-dm \
114+
--with-gdrcopy=/usr/local \
115+
--with-efa \
116+
--enable-mt && \
117+
make -j && \
118+
make -j install-strip && \
119+
ldconfig; \
120+
apt-get update && apt-get install -y pkg-config tmux net-tools && \
121+
cd /usr/local/src; \
122+
pip install --upgrade meson pybind11 patchelf; \
123+
git clone https://github.com/ai-dynamo/nixl.git -b main && \
124+
cd nixl && \
125+
rm -rf build && \
126+
mkdir build && \
127+
meson setup build/ --prefix=/usr/local/nixl --buildtype=release && \
128+
cd build && \
129+
ninja && \
130+
ninja install && \
131+
cd .. && pip install . --no-deps; \
132+
fi
46133

47134
COPY . /lightllm
48135
RUN pip install -e /lightllm --no-cache-dir

docker/Dockerfile.deepep

Lines changed: 0 additions & 84 deletions
This file was deleted.

docker/Dockerfile.nixl

Lines changed: 0 additions & 94 deletions
This file was deleted.

0 commit comments

Comments
 (0)