# CUDA toolkit version; selects the tag of the NVIDIA base image below.
ARG CUDA_VERSION=12.8.0

# cuDNN "devel" flavour on Ubuntu 22.04.
FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu22.04
3+
# Target platform string, tested by the `case ${TARGETPLATFORM}` steps below.
ARG TARGETPLATFORM

# Pinned tool / library versions.
ARG PYTHON_VERSION=3.10
ARG MAMBA_VERSION=24.7.1-0
ARG VLLM_VERSION=0.11.0

# Optional components: set to anything other than "1" to skip the matching
# build step further down.
ARG ENABLE_CACHE=1
ARG ENABLE_DEEPEP=1
ARG ENABLE_NIXL=1
11+
# Put the conda installation first on PATH so `python`, `pip` and `conda`
# resolve to /opt/conda for every later instruction.
ENV CONDA_PREFIX=/opt/conda \
    PATH=/opt/conda/bin:$PATH
814
# Base build toolchain.
# /tmp is made world-writable recursively before apt runs, as in the original
# step. A bare numeric `chmod 777` also drops the sticky bit from /tmp, so the
# conventional 1777 mode is restored on the directory itself afterwards.
# The apt lists are removed in the same layer to keep the image small.
RUN chmod -R 777 /tmp && \
    chmod 1777 /tmp && \
    apt-get update && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        ca-certificates \
        curl \
        g++ \
        git \
        libssl-dev \
        make && \
    rm -rf /var/lib/apt/lists/*
1725
1826RUN case ${TARGETPLATFORM} in \
@@ -25,24 +33,103 @@ RUN case ${TARGETPLATFORM} in \
2533
# Install the requested Python into the conda prefix.
# linux/arm64 is rejected outright (the build fails); every other platform
# updates conda, installs python=${PYTHON_VERSION}, then clears conda's caches.
RUN if [ "${TARGETPLATFORM}" = "linux/arm64" ]; then \
        exit 1; \
    fi && \
    /opt/conda/bin/conda update -y conda && \
    /opt/conda/bin/conda install -y "python=${PYTHON_VERSION}" && \
    /opt/conda/bin/conda clean -ya
3240
33-
WORKDIR /root

# Copy only the requirements file first so the dependency layers stay cached
# until requirements.txt itself changes.
COPY ./requirements.txt /lightllm/requirements.txt

# Every pip step uses --no-cache-dir so downloaded wheels are not stored in
# the image layers.
RUN pip install --no-cache-dir -U pip
RUN pip install --no-cache-dir -r /lightllm/requirements.txt
RUN pip install --no-cache-dir "vllm==${VLLM_VERSION}"

# Prebuilt LightKernel wheel. The file name is tagged cp310 / linux_x86_64,
# so it only matches PYTHON_VERSION=3.10 on x86_64.
RUN pip install --no-cache-dir https://github.com/ModelTC/LightKernel/releases/download/v1.0.1/lightllm_kernel-0.1.0-cp310-cp310-linux_x86_64.whl
48+
# NUMA development library (the previous revision of this file noted it is
# needed "for sgl_kernel"). The apt lists are dropped in the same layer.
RUN apt-get update && \
    apt-get install -y libnuma-dev && \
    rm -rf /var/lib/apt/lists/*
50+
# Locations exported to later build steps and to the runtime environment.
# The version in GDRCOPY_HOME (2.4.4) matches the gdrcopy tag (v2.4.4) that
# the gdrcopy build step clones; keep the two in sync.
ENV GDRCOPY_HOME=/usr/src/gdrdrv-2.4.4/ \
    CUDA_HOME=/usr/local/cuda
53+
# Optional: LightMem (enabled with ENABLE_CACHE=1).
# All steps form a single `&&` chain so that a failing apt-get aborts the
# build instead of being skipped over by a `;` separator.
# LightMem is pinned to an exact commit for reproducibility.
RUN if [ "${ENABLE_CACHE}" = "1" ]; then \
        LIGHTMEM_REF=5900baf92d85ef4dbda6124093506b0af906011a && \
        apt-get update && \
        apt-get install -y libboost-dev && \
        rm -rf /var/lib/apt/lists/* && \
        pip install --no-cache-dir --no-deps -v "git+https://github.com/ModelTC/LightMem.git@${LIGHTMEM_REF}#egg=light_mem"; \
    fi
3959
40- RUN pip install --no-cache-dir vllm --pre --extra-index-url https://wheels.vllm.ai/nightly
# Optional: RDMA userspace packages plus GDRCopy v2.4.4, built as .deb
# packages. Needed when either NIXL or DeepEP is enabled.
# The whole step is one `&&` chain so a failed package install stops the
# build rather than falling through to the gdrcopy compile. apt runs
# non-interactively so no package can block on a configuration prompt.
# `git clone` creates /tmp/gdrcopy itself, so no prior mkdir is required.
RUN if [ "${ENABLE_NIXL}" = "1" ] || [ "${ENABLE_DEEPEP}" = "1" ]; then \
        apt-get update && \
        DEBIAN_FRONTEND=noninteractive apt-get install -y \
            wget devscripts debhelper dh-make build-essential dkms \
            ibverbs-providers infiniband-diags perftest rdma-core libibverbs-dev librdmacm-dev && \
        rm -rf /var/lib/apt/lists/* && \
        cd /tmp && \
        git clone https://github.com/NVIDIA/gdrcopy.git -b v2.4.4 && \
        cd gdrcopy/packages && \
        CUDA=/usr/local/cuda ./build-deb-packages.sh && \
        dpkg -i gdrdrv-dkms_*.deb libgdrapi_*.deb gdrcopy-tests_*.deb gdrcopy_*.deb && \
        cd / && \
        rm -rf /tmp/gdrcopy; \
    fi
4171
42- # TODO: offline compile
43- # RUN git clone https://github.com/ModelTC/LightKernel.git && cd LightKernel && pip install --no-deps -v .
# Optional: NVSHMEM (built from source with IBGDA + GDRCopy support) and
# DeepEP at a pinned commit (enabled with ENABLE_DEEPEP=1).
#
# Everything is a single `&&` chain. `set -e` alone does not stop the shell
# when a command inside a `&&` list fails (only the last command of the list
# counts), so with `;` separators a failed download or NVSHMEM build would be
# ignored and the step would carry on.
#
# CUDA_ARCHS=90 restricts the NVSHMEM build to that one CUDA architecture.
# The libmlx5.so symlink gives the linker an unversioned name for libmlx5.so.1.
# NOTE(review): the symlink path is x86_64-specific.
RUN if [ "${ENABLE_DEEPEP}" = "1" ]; then \
        NVSHMEM_VERSION=3.3.9 && \
        NVSHMEM_TARBALL="nvshmem_src_cuda12-all-all-${NVSHMEM_VERSION}.tar.gz" && \
        CUDA_ARCHS=90 && \
        DEEPEP_COMMIT=b6ce310bb0b75079682d09bc2ebc063a074fbd58 && \
        ln -sf /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so && \
        cd /root && \
        wget "https://developer.download.nvidia.com/compute/redist/nvshmem/${NVSHMEM_VERSION}/source/${NVSHMEM_TARBALL}" && \
        tar -xf "${NVSHMEM_TARBALL}" && \
        rm -f "${NVSHMEM_TARBALL}" && \
        mv nvshmem_src nvshmem && \
        cd /root/nvshmem && \
        NVSHMEM_SHMEM_SUPPORT=0 \
        NVSHMEM_UCX_SUPPORT=0 \
        NVSHMEM_USE_NCCL=0 \
        NVSHMEM_MPI_SUPPORT=0 \
        NVSHMEM_IBGDA_SUPPORT=1 \
        NVSHMEM_PMIX_SUPPORT=0 \
        NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
        NVSHMEM_USE_GDRCOPY=1 \
        cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=/root/nvshmem/install -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCHS} && \
        cmake --build build --target install -j64 && \
        cd /root && \
        git clone https://github.com/deepseek-ai/DeepEP.git && \
        cd /root/DeepEP && \
        git checkout "${DEEPEP_COMMIT}" && \
        NVSHMEM_DIR=/root/nvshmem/install python setup.py install; \
    fi
4495
45- RUN apt-get update && apt-get install -y libnuma-dev # for sgl_kernel
# Optional: UCX (v1.19.x branch) built against CUDA, verbs and GDRCopy,
# followed by NIXL built with meson/ninja (enabled with ENABLE_NIXL=1).
#
# The step is a single `&&` chain so any failing command fails the build.
# With `;` separators, a broken UCX build or a failed apt install would be
# ignored and NIXL would be built against whatever happened to be present.
# `make` is bounded to the number of CPUs instead of a bare `-j` (unlimited
# jobs), and the apt lists are removed before the layer is committed.
# The second package install reuses the lists fetched by the first
# `apt-get update` in this same layer.
#
# NOTE(review): `v1.19.x` and `main` are moving branches, so this layer is not
# reproducible; consider pinning both to a tag or commit.
RUN if [ "${ENABLE_NIXL}" = "1" ]; then \
        apt-get update && \
        apt-get install -y cmake automake autotools-dev libtool libz-dev && \
        DEBIAN_FRONTEND=noninteractive apt-get -y install --reinstall libibverbs-dev rdma-core ibverbs-utils libibumad-dev && \
        rm -rf /usr/lib/ucx /opt/hpcx/ucx && \
        cd /usr/local/src && \
        git clone https://github.com/openucx/ucx.git && \
        cd ucx && \
        git checkout v1.19.x && \
        ./autogen.sh && \
        ./configure \
            --enable-shared \
            --disable-static \
            --disable-doxygen-doc \
            --enable-optimizations \
            --enable-cma \
            --enable-devel-headers \
            --with-cuda=/usr/local/cuda \
            --with-verbs=yes \
            --with-dm \
            --with-gdrcopy=/usr/local \
            --with-efa \
            --enable-mt && \
        make -j"$(nproc)" && \
        make -j"$(nproc)" install-strip && \
        ldconfig && \
        apt-get install -y pkg-config tmux net-tools && \
        rm -rf /var/lib/apt/lists/* && \
        pip install --no-cache-dir --upgrade meson pybind11 patchelf && \
        cd /usr/local/src && \
        git clone https://github.com/ai-dynamo/nixl.git -b main && \
        cd nixl && \
        rm -rf build && \
        mkdir build && \
        meson setup build/ --prefix=/usr/local/nixl --buildtype=release && \
        cd build && \
        ninja && \
        ninja install && \
        cd .. && \
        pip install --no-cache-dir . --no-deps; \
    fi
46133
# Copy the full build context last, so source edits only invalidate the final
# two layers, then install the project in editable mode.
COPY . /lightllm
RUN pip install --no-cache-dir -e /lightllm
0 commit comments