Skip to content

Commit 9dd656f

Browse files
[XPU][NIXL] Add GPUDirect RDMA support for XPU (#35270)
Signed-off-by: zhenwei-intel <zhenwei.liu@intel.com> Co-authored-by: Kunshang Ji <kunshang.ji@intel.com>
1 parent c8b678e commit 9dd656f

3 files changed

Lines changed: 62 additions & 5 deletions

File tree

docker/Dockerfile.xpu

Lines changed: 51 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -115,9 +115,57 @@ RUN --mount=type=cache,target=/root/.cache/uv \
115115
# install development dependencies (for testing)
116116
RUN uv pip install -e tests/vllm_test_utils
117117

118-
# install nixl from source code
119-
ENV NIXL_VERSION=0.7.0
120-
RUN python /workspace/vllm/tools/install_nixl_from_source_ubuntu.py
118+
# install NIXL and UCX from source code
119+
ARG UCX_VERSION=e5d98879705239d254ede40b4a52891850cb5349
120+
ARG NIXL_VERSION=0.7.0
121+
122+
RUN apt-get update && apt-get install -y \
123+
pciutils \
124+
net-tools \
125+
iproute2 \
126+
hwloc \
127+
numactl \
128+
wget \
129+
curl \
130+
git \
131+
build-essential \
132+
autoconf \
133+
automake \
134+
libtool \
135+
pkg-config \
136+
rdma-core \
137+
libibverbs-dev \
138+
ibverbs-utils \
139+
libibverbs1 \
140+
librdmacm-dev \
141+
librdmacm1 \
142+
libibumad-dev \
143+
libibumad3 \
144+
libibmad-dev \
145+
libibmad5 \
146+
infiniband-diags \
147+
perftest \
148+
ibutils \
149+
libmlx5-1 \
150+
libmlx4-1 \
151+
ibverbs-providers \
152+
librdmacm1t64
153+
154+
ENV PKG_CONFIG_PATH=/tmp/ucx_install/lib/pkgconfig:${PKG_CONFIG_PATH}
155+
ENV LD_LIBRARY_PATH=/tmp/ucx_install/lib:${LD_LIBRARY_PATH}
156+
RUN --mount=type=cache,target=/root/.cache/uv \
157+
git clone https://github.com/openucx/ucx /tmp/ucx_source && \
158+
cd /tmp/ucx_source && git checkout "${UCX_VERSION}" && \
159+
bash autogen.sh && \
160+
./configure --prefix=/tmp/ucx_install --with-ze=yes --enable-examples --enable-mt && \
161+
make CFLAGS="-Wno-error=incompatible-pointer-types" -j8 && make install && \
162+
git clone https://github.com/ai-dynamo/nixl /tmp/nixl_source && \
163+
cd /tmp/nixl_source && git checkout "${NIXL_VERSION}" && \
164+
cd /tmp/nixl_source && \
165+
uv pip install --upgrade meson pybind11 patchelf && \
166+
uv pip install -r requirements.txt && \
167+
uv pip install . && \
168+
rm -rf /tmp/ucx_source /tmp/nixl_source
121169

122170
# FIX triton
123171
RUN --mount=type=cache,target=/root/.cache/uv \

vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,10 @@
135135
"cpu",
136136
),
137137
"tpu": ("cpu",),
138-
"xpu": ("cpu",),
138+
"xpu": (
139+
"cpu",
140+
"xpu",
141+
),
139142
"cpu": ("cpu",),
140143
}
141144
# support for oot platform by providing mapping in current_platform
@@ -945,7 +948,7 @@ def __init__(self, vllm_config: VllmConfig, engine_id: str):
945948
# type based on kv_buffer_device
946949
nixl_memory_type = current_platform.get_nixl_memory_type()
947950
if nixl_memory_type is None:
948-
if self.kv_buffer_device == "cuda":
951+
if self.kv_buffer_device in ["cuda", "xpu"]:
949952
nixl_memory_type = "VRAM"
950953
elif self.kv_buffer_device == "cpu":
951954
nixl_memory_type = "DRAM"

vllm/platforms/xpu.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,12 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
221221
vllm_config.scheduler_config.DEFAULT_MAX_NUM_BATCHED_TOKENS,
222222
)
223223

224+
# In some cases, the internal memory type cache can misdetect GPU
225+
# memory as host memory, leading to invalid memory access.
226+
# This cache can be disabled by setting UCX_MEMTYPE_CACHE=n.
227+
# ref. https://openucx.readthedocs.io/en/master/faq.html
228+
os.environ["UCX_MEMTYPE_CACHE"] = "n"
229+
224230
@classmethod
225231
def support_hybrid_kv_cache(cls) -> bool:
226232
return True

0 commit comments

Comments
 (0)