Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 23 additions & 9 deletions .github/workflows/ut.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,9 @@ jobs:
- name: build docker image & push to local
id: build-image
run: |
docker build -t xpu-kernel-ci-image:latest -f Dockerfile.xpu .
docker tag xpu-kernel-ci-image:latest ${{ env.REGISTRY }}/xpu-kernel-ci-image:latest
docker push ${{ env.REGISTRY }}/xpu-kernel-ci-image:latest
docker build -t xpu-kernel-ci-image:test-213 -f Dockerfile.xpu .
docker tag xpu-kernel-ci-image:test-213 ${{ env.REGISTRY }}/xpu-kernel-ci-image:test-213
docker push ${{ env.REGISTRY }}/xpu-kernel-ci-image:test-213
Comment on lines +46 to +48

build-docker-image-latest-bmg:
runs-on: self-hosted-bmg
Expand All @@ -65,9 +65,9 @@ jobs:
- name: build docker image & push to local
id: build-image
run: |
docker build -t xpu-kernel-ci-image:latest -f Dockerfile.xpu .
docker tag xpu-kernel-ci-image:latest ${{ env.REGISTRY }}/xpu-kernel-ci-image:latest
docker push ${{ env.REGISTRY }}/xpu-kernel-ci-image:latest
docker build -t xpu-kernel-ci-image:test-213 -f Dockerfile.xpu .
docker tag xpu-kernel-ci-image:test-213 ${{ env.REGISTRY }}/xpu-kernel-ci-image:test-213
docker push ${{ env.REGISTRY }}/xpu-kernel-ci-image:test-213

# Build wheel only once on PVC, then share via GitHub Actions artifact.
# BMG runner cannot reach PVC directly (different network segment), so the
Expand All @@ -76,7 +76,7 @@ jobs:
runs-on: self-hosted-pvc
needs: build-docker-image-latest-pvc
container:
image: localhost:5000/xpu-kernel-ci-image:latest
image: localhost:5000/xpu-kernel-ci-image:test-213
options: --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged -v ccache:/root/.ccache -e CCACHE_DIR=/root/.ccache
steps:
- name: Checkout
Expand Down Expand Up @@ -125,8 +125,11 @@ jobs:
runs-on: self-hosted-pvc
needs: [build-docker-image-latest-pvc, build-wheel]
timeout-minutes: 50
defaults:
run:
shell: bash
container:
image: localhost:5000/xpu-kernel-ci-image:latest
image: localhost:5000/xpu-kernel-ci-image:test-213
options: --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged -v ccache:/root/.ccache -e CCACHE_DIR=/root/.ccache
steps:
- name: Checkout
Expand All @@ -143,11 +146,15 @@ jobs:
- name: install wheel
run: |
git config --global --add safe.directory "${GITHUB_WORKSPACE}"
source /opt/intel/oneapi/setvars.sh --force || true
source /opt/venv/bin/activate
uv pip install -r requirements.txt
VLLM_USE_PRECOMPILED=1 VLLM_PRECOMPILED_WHEEL_LOCATION=$(find dist -name '*.whl' -print -quit) uv pip install --no-build-isolation -e . -v

- name: test
run: |
source /opt/intel/oneapi/setvars.sh --force || true
source /opt/venv/bin/activate
echo "Running tests with XPU_KERNEL_TEST_SCOPE=${{ env.XPU_KERNEL_TEST_SCOPE }}"
XPU_KERNEL_TEST_SCOPE=${{ env.XPU_KERNEL_TEST_SCOPE }} ZE_AFFINITY_MASK=0,1 SKIP_ACC_ERROR_KERNEL=1 pytest -v -s tests/ --ignore=tests/test_fp8_gemm_onednn.py
VLLM_XPU_FORCE_XE_DEFAULT_KERNEL=1 XPU_KERNEL_TEST_SCOPE=${{ env.XPU_KERNEL_TEST_SCOPE }} ZE_AFFINITY_MASK=0,1 pytest -v -s tests/fused_moe/test_grouped_gemm.py::test_grouped_gemm
Expand All @@ -167,8 +174,11 @@ jobs:
runs-on: self-hosted-bmg
needs: [build-docker-image-latest-bmg, build-wheel]
timeout-minutes: 50
defaults:
run:
shell: bash
container:
image: localhost:5000/xpu-kernel-ci-image:latest
image: localhost:5000/xpu-kernel-ci-image:test-213
options: --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged -v ccache:/root/.ccache -e CCACHE_DIR=/root/.ccache
steps:
- name: Checkout
Expand All @@ -185,11 +195,15 @@ jobs:
- name: install wheel
run: |
git config --global --add safe.directory "${GITHUB_WORKSPACE}"
source /opt/intel/oneapi/setvars.sh --force || true
source /opt/venv/bin/activate
uv pip install -r requirements.txt
VLLM_USE_PRECOMPILED=1 VLLM_PRECOMPILED_WHEEL_LOCATION=$(find dist -name '*.whl' -print -quit) uv pip install --no-build-isolation -e . -v

- name: test
run: |
source /opt/intel/oneapi/setvars.sh --force || true
source /opt/venv/bin/activate
echo "Running tests with XPU_KERNEL_TEST_SCOPE=${{ env.XPU_KERNEL_TEST_SCOPE }}"
# tests/test_moe_align_block_size.py, tests/test_moe_lora_align_sum.py takes much time than expected. ignore it for now.
XPU_KERNEL_TEST_SCOPE=${{ env.XPU_KERNEL_TEST_SCOPE }} ZE_AFFINITY_MASK=0,1 pytest -v -s tests/ --ignore=tests/test_lora_ops.py --ignore=tests/test_fp8_quant.py --ignore=tests/test_moe_align_block_size.py --ignore=tests/test_moe_lora_align_sum.py --ignore=tests/test_cache.py::test_swap_blocks --ignore=tests/test_topk_per_row.py --ignore=tests/test_lora_ops.py --ignore=tests/test_fp8_gemm_onednn.py
Expand Down
15 changes: 13 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ set(BUILD_SYCL_TLA_KERNELS
CACHE BOOL "Build SYCL-TLA based kernels for XPU")
# ARCHITECTURE OPTIONS
option(VLLM_XPU_ENABLE_XE2 "Enable XE2 architecture kernels" ON)
option(VLLM_XPU_ENABLE_XE3 "Enable XE3 architecture kernels" OFF)
option(VLLM_XPU_ENABLE_XE_DEFAULT "Enable XE Default architecture kernels" ON)

# KERNEL OPTIONS — each controls whether the corresponding Python extension is
Expand All @@ -71,6 +72,7 @@ message(STATUS "")
message(STATUS "Kernel build configuration:")
message(STATUS " BUILD_SYCL_TLA_KERNELS = ${BUILD_SYCL_TLA_KERNELS}")
message(STATUS " VLLM_XPU_ENABLE_XE2 = ${VLLM_XPU_ENABLE_XE2}")
message(STATUS " VLLM_XPU_ENABLE_XE3 = ${VLLM_XPU_ENABLE_XE3}")
message(STATUS " VLLM_XPU_ENABLE_XE_DEFAULT = ${VLLM_XPU_ENABLE_XE_DEFAULT}")
message(STATUS " BASIC_KERNELS_ENABLED = ${BASIC_KERNELS_ENABLED}")
message(STATUS " FA2_KERNELS_ENABLED = ${FA2_KERNELS_ENABLED}")
Expand Down Expand Up @@ -178,8 +180,9 @@ if(VLLM_GPU_LANG STREQUAL "SYCL")
# VLLM_XPU_AOT_DEVICES and VLLM_XPU_XE2_AOT_DEVICES Example: export
# VLLM_XPU_AOT_DEVICES="pvc,bmg-g21-a0" export
# VLLM_XPU_XE2_AOT_DEVICES="pvc,bmg-g31-a0"
set(AOT_DEVICES "pvc,bmg,bmg-g21-a0,bmg-g31-a0")
set(AOT_DEVICES "pvc,bmg,bmg-g21-a0,bmg-g31-a0,xe3p,nvl-s")
set(XE2_AOT_DEVICES "pvc,bmg,bmg-g21-a0,bmg-g31-a0")
set(XE3_AOT_DEVICES "xe3p,nvl-s")

# Allow overriding via env, including explicitly disabling AOT by setting an
# empty env var (e.g. export VLLM_XPU_AOT_DEVICES="").
Expand Down Expand Up @@ -312,7 +315,7 @@ if(VLLM_GPU_LANG STREQUAL "SYCL")
# header only library
list(APPEND VLLM_CUTLASS_FLAGS "-DCUTLASS_ENABLE_HEADERS_ONLY")
list(APPEND VLLM_CUTLASS_FLAGS "-DCUTLASS_ENABLE_SYCL")
list(APPEND VLLM_CUTLASS_FLAGS "-DSYCL_INTEL_TARGET")
# list(APPEND VLLM_CUTLASS_FLAGS "-DSYCL_INTEL_TARGET")
list(APPEND VLLM_CUTLASS_FLAGS "-DCUTLASS_VERSIONS_GENERATED")
list(APPEND VLLM_CUTLASS_FLAGS "-ftemplate-backtrace-limit=0")
list(APPEND VLLM_CUTLASS_FLAGS "-fdiagnostics-color=always")
Expand Down Expand Up @@ -363,6 +366,14 @@ if(BUILD_SYCL_TLA_KERNELS)
endif()
list(APPEND SYCL_TLA_COMPILE_OPTIONS -DVLLM_XPU_ENABLE_XE2)
endif()
if(VLLM_XPU_ENABLE_XE3)
message("BUILDING XE3 ATTN!!!!!!")
# add_subdirectory(csrc/xpu/grouped_gemm/xe_3)
add_subdirectory(csrc/xpu/attn/xe_3)
# list(APPEND GROUPED_GEMM_LIB_NAME "grouped_gemm_xe_3")
list(APPEND ATTN_KERNEL_LIB_NAME "attn_kernels_xe_3")
list(APPEND SYCL_TLA_COMPILE_OPTIONS -DVLLM_XPU_ENABLE_XE3)
endif()
list(APPEND VLLM_GPU_COMPILE_FLAGS ${SYCL_TLA_COMPILE_OPTIONS})

endif()
Expand Down
19 changes: 10 additions & 9 deletions Dockerfile.xpu
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
FROM intel/deep-learning-essentials:2025.3.2-0-devel-ubuntu24.04 AS vllm-base
FROM intel/deep-learning-essentials:2026.0.0-devel-ubuntu24.04 AS vllm-base

WORKDIR /workspace/

ARG PYTHON_VERSION=3.12
ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/xpu"
ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/test/xpu"

Comment on lines 5 to 7
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \
echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list && \
Expand All @@ -26,15 +26,16 @@ RUN apt clean && apt-get update -y && \
python3.12 \
python3.12-dev

RUN apt install -y libze1 libze-dev
RUN apt remove libze1 -y

RUN mkdir neo && cd neo && \
wget https://github.com/intel/intel-graphics-compiler/releases/download/v2.22.2/intel-igc-core-2_2.22.2+20121_amd64.deb && \
wget https://github.com/intel/intel-graphics-compiler/releases/download/v2.22.2/intel-igc-opencl-2_2.22.2+20121_amd64.deb && \
wget https://github.com/intel/compute-runtime/releases/download/25.44.36015.8/intel-ocloc_25.44.36015.8-0_amd64.deb && \
wget https://github.com/intel/compute-runtime/releases/download/25.44.36015.8/intel-opencl-icd_25.44.36015.8-0_amd64.deb && \
wget https://github.com/intel/compute-runtime/releases/download/25.44.36015.8/libigdgmm12_22.8.2_amd64.deb && \
wget https://github.com/intel/compute-runtime/releases/download/25.44.36015.8/libze-intel-gpu1_25.44.36015.8-0_amd64.deb && \
wget https://github.com/intel/intel-graphics-compiler/releases/download/v2.32.7/intel-igc-core-2_2.32.7+21184_amd64.deb && \
wget https://github.com/intel/intel-graphics-compiler/releases/download/v2.32.7/intel-igc-opencl-2_2.32.7+21184_amd64.deb && \
wget https://github.com/intel/compute-runtime/releases/download/26.14.37833.4/intel-ocloc_26.14.37833.4-0_amd64.deb && \
wget https://github.com/intel/compute-runtime/releases/download/26.14.37833.4/intel-opencl-icd_26.14.37833.4-0_amd64.deb && \
wget https://github.com/intel/compute-runtime/releases/download/26.14.37833.4/libigdgmm12_22.9.0_amd64.deb && \
wget https://github.com/intel/compute-runtime/releases/download/26.14.37833.4/libze-intel-gpu1_26.14.37833.4-0_amd64.deb && \
wget https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero_1.28.2+u24.04_amd64.deb && \
dpkg -i *.deb && cd .. && rm -rf neo
Comment on lines 31 to 39

ENV PATH="/root/.local/bin:$PATH"
Expand Down
72 changes: 72 additions & 0 deletions cmake/utils.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -583,6 +583,7 @@ function(add_xe2_kernel_library LIBRARY_NAME)
target_compile_options(${LIBRARY_NAME}
PRIVATE ${SYCL_TLA_KERNELS_COMPILE_FLAGS} -fPIC)
target_compile_definitions(${LIBRARY_NAME} PRIVATE -DVLLM_XPU_ENABLE_XE2)
target_compile_definitions(${LIBRARY_NAME} PRIVATE -DSYCL_INTEL_TARGET=20)
target_include_directories(${LIBRARY_NAME} PRIVATE ${SYCL_TLA_INCLUDE_DIRS})

# Link torch libraries
Expand Down Expand Up @@ -610,6 +611,76 @@ function(add_xe2_kernel_library LIBRARY_NAME)
target_link_options(${LIBRARY_NAME} PRIVATE ${XE2_GPU_LINK_FLAGS})
endfunction()

#
# Create a shared library for XE3 kernels with common configuration.
#
# Arguments: LIBRARY_NAME: Name of the library to create (e.g.,
# attn_kernels_xe_3) DESTINATION: Installation destination directory (optional,
# defaults to vllm_xpu_kernels) INCLUDE_CMAKE_SOURCE_DIR: Optional flag to
# include ${CMAKE_SOURCE_DIR} in include directories
#
function(add_xe3_kernel_library LIBRARY_NAME)
cmake_parse_arguments(
PARSE_ARGV 1 ARG "INCLUDE_CMAKE_SOURCE_DIR" # Boolean options
"DESTINATION" # Single value keywords
"" # Multi-value keywords
)

# Set default destination if not provided
if(NOT ARG_DESTINATION)
set(ARG_DESTINATION "vllm_xpu_kernels")
endif()

# Set C++ standard
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Find all source files
file(GLOB_RECURSE KERNEL_SOURCES "*.cpp" ${ATTN_KERNEL_SRCS_GEN})

# Create shared library
add_library(${LIBRARY_NAME} SHARED ${KERNEL_SOURCES})

# Set include directories
target_include_directories(
${LIBRARY_NAME} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}
${CMAKE_CURRENT_SOURCE_DIR}/..)

# Optionally add CMAKE_SOURCE_DIR
if(ARG_INCLUDE_CMAKE_SOURCE_DIR)
target_include_directories(${LIBRARY_NAME} PUBLIC ${CMAKE_SOURCE_DIR})
endif()

# Set compile options and definitions
target_compile_options(
${LIBRARY_NAME}
PRIVATE ${SYCL_TLA_KERNELS_COMPILE_FLAGS} -fPIC -Wno-c++20-extensions
-Wno-intel-compat -Wno-pragma-once-outside-header)
target_compile_definitions(${LIBRARY_NAME} PRIVATE -DSYCL_INTEL_TARGET=35)
target_compile_definitions(${LIBRARY_NAME} PRIVATE -DVLLM_GRF_SIZE=512)
target_include_directories(${LIBRARY_NAME} PRIVATE ${SYCL_TLA_INCLUDE_DIRS})

# Link torch libraries
target_link_libraries(${LIBRARY_NAME} PRIVATE torch)
target_link_libraries(${LIBRARY_NAME} PRIVATE ${TORCH_LIBRARIES})

message(
STATUS
"Setting library output directory for target '${LIBRARY_NAME}' to '${CMAKE_BINARY_DIR}/'.'"
)
set_target_properties(${LIBRARY_NAME} PROPERTIES LIBRARY_OUTPUT_DIRECTORY
"${CMAKE_BINARY_DIR}/")
install(TARGETS ${LIBRARY_NAME} LIBRARY DESTINATION ${ARG_DESTINATION}
COMPONENT ${LIBRARY_NAME})

# Set link options for XE3 devices
set(XE3_GPU_LINK_FLAGS ${SYCL_DEVICE_LINK_FLAGS})
list(
APPEND XE3_GPU_LINK_FLAGS -Xsycl-target-backend=spir64_gen
"-device ${XE3_AOT_DEVICES} -internal_options -cl-intel-512-GRF-per-thread")
target_link_options(${LIBRARY_NAME} PRIVATE ${XE3_GPU_LINK_FLAGS})
endfunction()

#
# Create a static library for XE default kernels with common configuration.
#
Expand Down Expand Up @@ -655,6 +726,7 @@ function(add_xe_default_kernel_library LIBRARY_NAME)
PRIVATE ${SYCL_TLA_KERNELS_COMPILE_FLAGS} -fPIC)
target_compile_definitions(${LIBRARY_NAME}
PRIVATE -DVLLM_XPU_ENABLE_XE_DEFAULT)
target_compile_definitions(${LIBRARY_NAME} PRIVATE -DSYCL_INTEL_TARGET=20)
target_include_directories(${LIBRARY_NAME} PRIVATE ${SYCL_TLA_INCLUDE_DIRS})

# Link torch libraries
Expand Down
7 changes: 7 additions & 0 deletions csrc/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,13 @@ static inline bool is_xe3_arch(at::DeviceIndex device_index = -1) {
arch == syclex::architecture::intel_gpu_wcl;
}

#ifdef VLLM_XPU_ENABLE_XE3
static inline bool is_xe3p_arch(at::DeviceIndex device_index = -1) {
auto arch = get_device_architecture(device_index);
return arch == syclex::architecture::intel_gpu_nvl_s;
}
#endif

static inline std::optional<std::string> getEnv(const char* name) {
if (const char* val = std::getenv(name)) return val;
return std::nullopt;
Expand Down
10 changes: 10 additions & 0 deletions csrc/xpu/attn/xe_3/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
cmake_minimum_required(VERSION 3.18)

set(ATTN_KERNEL_SRCS_GEN) # output
include("chunk_prefill_configure.cmake")
fmha_forward_configure(chunk_prefill_kernel_template)

include("paged_decode_configure.cmake")
paged_decode_configure(paged_decode_kernel_template)

add_xe3_kernel_library(attn_kernels_xe_3 INCLUDE_CMAKE_SOURCE_DIR)
54 changes: 54 additions & 0 deletions csrc/xpu/attn/xe_3/chunk_prefill_configure.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
function(fmha_forward_configure FILENAME_SUFFIX)
set(GEN_KERNEL_SRCS) # output
set(L_TYPES "fp16" "bf16")
set(L_BOOLS "false" "true")
set(BOOL_FLAG_false "f")
set(BOOL_FLAG_true "t")
set(policy_list
"chunk_policy_head64" "chunk_policy_head96" "chunk_policy_head128"
"chunk_policy_head192" "chunk_policy_head256")

set(IMPL_KV_T "fp16")

foreach(IMPL_POLICY ${policy_list})
# foreach(IMPL_T ${L_TYPES})
foreach(IMPL_KISPAGED ${L_BOOLS})
foreach(IMPL_KISCAUSAL ${L_BOOLS})
foreach(IMPL_KISLOCAL ${L_BOOLS})
foreach(IMPL_KISSINK ${L_BOOLS})
set(FILE_SUFFIX "${IMPL_POLICY}_")
set(FILE_SUFFIX "${FILE_SUFFIX}${BOOL_FLAG_${IMPL_KISPAGED}}")
set(FILE_SUFFIX "${FILE_SUFFIX}${BOOL_FLAG_${IMPL_KISCAUSAL}}")
set(FILE_SUFFIX "${FILE_SUFFIX}${BOOL_FLAG_${IMPL_KISSINK}}")
set(FILE_SUFFIX "${FILE_SUFFIX}${BOOL_FLAG_${IMPL_KISLOCAL}}")
configure_file(${FILENAME_SUFFIX}.cpp.in
"${FILENAME_SUFFIX}_${FILE_SUFFIX}.cpp")
list(
APPEND
GEN_KERNEL_SRCS
"${CMAKE_CURRENT_BINARY_DIR}/${FILENAME_SUFFIX}_${FILE_SUFFIX}.cpp"
)
endforeach()
endforeach()
endforeach()
endforeach()
endforeach()

list(REMOVE_DUPLICATES GEN_KERNEL_SRCS)
list(LENGTH GEN_KERNEL_SRCS GEN_KERNEL_SRCS_LENGTH)
message(
STATUS
"Generated ${FILENAME_SUFFIX} kernel sources: ${GEN_KERNEL_SRCS_LENGTH}")
set(GEN_KERNEL_SRCS
${GEN_KERNEL_SRCS}
PARENT_SCOPE)
set(GEN_KERNEL_SRCS_LENGTH
${GEN_KERNEL_SRCS_LENGTH}
PARENT_SCOPE)

list(APPEND ATTN_KERNEL_SRCS_GEN ${GEN_KERNEL_SRCS})
set(ATTN_KERNEL_SRCS_GEN
${ATTN_KERNEL_SRCS_GEN}
PARENT_SCOPE)

endfunction()
27 changes: 27 additions & 0 deletions csrc/xpu/attn/xe_3/chunk_prefill_kernel_template.cpp.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#include "csrc/xpu/attn/xe_2/chunk_prefill.hpp"

using namespace cute;

// clang-format off
// macros to be filled in CMake
#define IMPL_T ${IMPL_T}
#define IMPL_KV_T ${IMPL_KV_T}
#define IMPL_POLICY ${IMPL_POLICY}
#cmakedefine01 IMPL_KISPAGED
#cmakedefine01 IMPL_KISCAUSAL
#cmakedefine01 IMPL_KISSINK
#cmakedefine01 IMPL_KISLOCAL
// clang-format on

#define INSTANTIATE_KERNEL() \
template void policy_dispatch_impl< \
IMPL_POLICY, \
static_cast<bool>(IMPL_KISPAGED), \
static_cast<bool>(IMPL_KISCAUSAL), \
static_cast<bool>(IMPL_KISLOCAL), \
static_cast<bool>(IMPL_KISSINK)>( \
sycl::queue & queue, \
CutlassQKType& cuQKType, \
const chunk_prefill_args_t& args);

INSTANTIATE_KERNEL()
Loading
Loading