Update stack to PT2.10

DamianSzwichtenberg · DamianSzwichtenberg · commit 54366c7afd25 · 2026-03-19T21:11:21.000-07:00
diff --git a/README.md b/README.md
@@ -75,9 +75,8 @@ conda activate forge
 
 Notes:
 - Requires Intel oneAPI toolkit installed at `$ONEAPI_ROOT`, `/opt/intel/oneapi`, or loadable via `module load intel/oneapi`.
-- Python version must match `IPEX_PYTHON_VERSION` in `assets/versions.sh`.
-- The script installs PyTorch + IPEX via vLLM's XPU requirements, then locks their versions with pip constraints.
-- XPU builds install Monarch with `USE_TENSOR_ENGINE=0`, so RDMA and distributed tensor features are disabled for now.
+- Python version must match `XPU_PYTHON_VERSION` in `assets/versions.sh`.
+- The XPU build installs Monarch with `USE_TENSOR_ENGINE=0`, so RDMA and distributed tensor features are disabled for now.
 - Optional flag: `--use-sudo` (system packages via `apt`/`dnf` instead of conda).
 - Re-activate your conda environment after install to pick up the oneAPI activation hook.
 
diff --git a/assets/versions.sh b/assets/versions.sh
@@ -11,11 +11,12 @@
 PYTORCH_VERSION="2.9.0"
 # ROCm/XPU builds vLLM from source (no prebuilt ROCm/XPU wheels available)
 VLLM_ROCM_VERSION="v0.10.0"
-VLLM_XPU_VERSION="v0.13.0"
-# IPEX wheels shipped with vLLM has hard python version requirement
-IPEX_PYTHON_VERSION="3.12"
+VLLM_XPU_VERSION="v0.17.0"
+# PyTorch XPU version (vLLM v0.17+ dropped IPEX in favour of native XPU support)
+PYTORCH_XPU_VERSION="2.10.0"
+# vllm-xpu-kernels wheels only ship for Python 3.12
+XPU_PYTHON_VERSION="3.12"
 TORCHSTORE_BRANCH="no-monarch-2026.01.05"
 # ROCm/XPU builds these from source (no ROCm/XPU wheels); CUDA uses pyproject pins.
 TORCHTITAN_VERSION="v0.2.0"
-TORCHTITAN_XPU_COMMIT="e61f2cce4fd9c54d314ff0a2dabe035b80a5d49c"
 MONARCH_VERSION="v0.2.0"
diff --git a/scripts/install_xpu.sh b/scripts/install_xpu.sh
@@ -39,14 +39,18 @@ if [ -z "${TORCHSTORE_BRANCH:-}" ]; then
     log_error "TORCHSTORE_BRANCH not set in $VERSIONS_FILE"
     exit 1
 fi
-if [ -z "${TORCHTITAN_XPU_COMMIT:-}" ]; then
-    log_error "TORCHTITAN_XPU_COMMIT not set in $VERSIONS_FILE"
+if [ -z "${TORCHTITAN_VERSION:-}" ]; then
+    log_error "TORCHTITAN_VERSION not set in $VERSIONS_FILE"
     exit 1
 fi
 if [ -z "${MONARCH_VERSION:-}" ]; then
     log_error "MONARCH_VERSION not set in $VERSIONS_FILE"
     exit 1
 fi
+if [ -z "${PYTORCH_XPU_VERSION:-}" ]; then
+    log_error "PYTORCH_XPU_VERSION not set in $VERSIONS_FILE"
+    exit 1
+fi
 
 # Defaults (override via environment variables)
 FORGE_DEPS_DIR="${FORGE_DEPS_DIR:-$HOME/.cache/torchforge}"
@@ -64,17 +68,17 @@ check_conda_env() {
 }
 
 check_python_version() {
-    local required="$IPEX_PYTHON_VERSION"
+    local required="$XPU_PYTHON_VERSION"
     local actual
     actual=$(python -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')")
 
     if [ "$actual" != "$required" ]; then
-        log_error "Python ${actual} detected, but vLLM for XPU requires Python ${required}"
+        log_error "Python ${actual} detected, but vllm-xpu-kernels requires Python ${required}"
         log_info "Recreate your conda env with the correct version:"
         log_info "  conda create -n forge python=${required} -y"
         exit 1
     fi
-    log_info "Python version ${actual} matches IPEX requirement"
+    log_info "Python version ${actual} matches XPU requirement"
 }
 
 # Check required command
@@ -260,16 +264,13 @@ ensure_rust() {
 create_constraints_file() {
     local torch_version
     torch_version=$(python -c "import torch; print(torch.__version__)")
-    local ipex_version
-    ipex_version=$(python -c "import intel_extension_for_pytorch; print(intel_extension_for_pytorch.__version__)")
 
     local constraints_file="${FORGE_DEPS_DIR}/constraints.txt"
     cat > "$constraints_file" <<EOF
 torch==${torch_version}
-intel-extension-for-pytorch==${ipex_version}
 EOF
     export PIP_CONSTRAINT="$constraints_file"
-    log_info "Pip constraints locked: torch==${torch_version}, IPEX==${ipex_version}"
+    log_info "Pip constraints locked: torch==${torch_version}"
 }
 
 install_vllm_xpu() {
@@ -278,24 +279,52 @@ install_vllm_xpu() {
     log_info "Installing vLLM ${VLLM_XPU_VERSION} from source (XPU)"
     ensure_repo "https://github.com/vllm-project/vllm.git" "$vllm_dir" "$VLLM_XPU_VERSION"
 
-    # Installs PyTorch + IPEX + all XPU deps
+    # Let vLLM's xpu requirements drive the PyTorch + triton-xpu install.
     python -m pip install -r "${vllm_dir}/requirements/xpu.txt"
 
-    # Lock torch + IPEX so later installs can't clobber them
+    # triton-xpu (required by torch 2.10+xpu) and vanilla triton (required by
+    # xgrammar) both install into the same `triton/` namespace directory.
+    # In PyTorch <=2.9 the XPU package was called pytorch-triton-xpu and used a
+    # separate namespace, so the two coexisted.  After the rename to triton-xpu
+    # pip installs both, and vanilla triton's libtriton.so overwrites the XPU
+    # one — stripping the 'intel' backend symbol.
+    #
+    # Fix: force-reinstall triton-xpu so its libtriton.so (with 'intel') wins.
+    # We keep vanilla triton installed so xgrammar's pip dependency stays
+    # satisfied (triton-xpu does not declare Provides: triton).
+    local triton_xpu_version
+    triton_xpu_version=$(python -c "import importlib.metadata; print(importlib.metadata.version('triton-xpu'))")
+    log_info "Fixing triton namespace conflict: reinstalling triton-xpu ${triton_xpu_version}"
+    python -m pip install "triton-xpu==${triton_xpu_version}" --force-reinstall --no-deps \
+        --extra-index-url https://download.pytorch.org/whl/xpu
+
+    # Lock torch so later installs can't clobber it
     create_constraints_file
 
     VLLM_TARGET_DEVICE=xpu \
         python -m pip install -e "$vllm_dir" --no-build-isolation
 }
 
+verify_pytorch_xpu() {
+    local actual_version
+    actual_version=$(python -c "import torch; print(torch.__version__.split('+')[0])")
+
+    if [ "$actual_version" != "${PYTORCH_XPU_VERSION}" ]; then
+        log_error "Expected PyTorch ${PYTORCH_XPU_VERSION} but got ${actual_version}"
+        log_info "vLLM's requirements may have installed an incompatible version"
+        exit 1
+    fi
+    log_info "PyTorch ${actual_version}+xpu verified"
+}
+
 install_torchstore() {
     log_info "Installing torchstore from branch ${TORCHSTORE_BRANCH}"
     python -m pip install "git+https://github.com/meta-pytorch/torchstore.git@${TORCHSTORE_BRANCH}"
 }
 
 install_torchtitan() {
-    log_info "Installing torchtitan from tag ${TORCHTITAN_XPU_COMMIT}"
-    python -m pip install "git+https://github.com/pytorch/torchtitan.git@${TORCHTITAN_XPU_COMMIT}"
+    log_info "Installing torchtitan from tag ${TORCHTITAN_VERSION}"
+    python -m pip install "git+https://github.com/pytorch/torchtitan.git@${TORCHTITAN_VERSION}"
 }
 
 install_monarch() {
@@ -471,8 +500,9 @@ main() {
     install_system_packages "$USE_SUDO"
     setup_xpu_env
 
-    # vLLM install PyTorch + IPEX + creates constraints
+    # install_vllm_xpu installs PyTorch + triton-xpu, fixes the triton conflict, and creates constraints
     install_vllm_xpu
+    verify_pytorch_xpu
 
     # Everything below is protected by PIP_CONSTRAINT
     install_torchstore
@@ -504,4 +534,4 @@ main() {
     log_info "  conda deactivate && conda activate $CONDA_DEFAULT_ENV"
 }
 
-main "$@"
+main "$@"
diff --git a/src/forge/actors/vllm/v1/generator.py b/src/forge/actors/vllm/v1/generator.py
@@ -242,6 +242,21 @@ async def setup(self, host_mesh, worker_registry, gpu_ids: list[str]):
             "forge.actors.vllm.v1.forge_executor.ForgeMonarchExecutor"
         )
 
+        # Disable vLLM's async scheduling for our custom executor backend.
+        # vLLM's __post_init__ is called twice: once at VllmConfig construction
+        # and again after EngineCore handshake (_perform_handshakes). In vLLM
+        # >= 0.14, async_scheduling defaults to None (auto-detect), which the
+        # first __post_init__ auto-enables to True since executor is still "mp".
+        # After we override the executor backend above, the second __post_init__
+        # sees async_scheduling=True with an unrecognized backend and raises
+        # ValueError. Setting False explicitly is safe for all vLLM versions:
+        # in <= 0.13 it was already the default, and our MonarchExecutor does
+        # not use vLLM's async scheduling mechanism.
+        if hasattr(self.vllm_config, "scheduler_config") and hasattr(
+            self.vllm_config.scheduler_config, "async_scheduling"
+        ):
+            self.vllm_config.scheduler_config.async_scheduling = False
+
         # Set up prefetching configuration via additional_config
         # There does not seem to be a real difference between passing this via env var or via self.vllm_config
         if self.prefetch_weights_to_shm:
diff --git a/src/forge/actors/vllm/v1/monarch_executor.py b/src/forge/actors/vllm/v1/monarch_executor.py
@@ -9,8 +9,10 @@
 from __future__ import annotations
 
 import base64
+import inspect
 import logging
 import os
+from functools import partial
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
 import cloudpickle
@@ -211,11 +213,24 @@ class WorkerWrapper(WorkerWrapperBase, Actor):
     stores).
     """
 
+    # Detect whether WorkerWrapperBase accepts vllm_config (vLLM <= 0.13)
+    # or only rpc_rank/global_rank (vLLM >= 0.14).
+    _wrapper_accepts_vllm_config: bool = (
+        "vllm_config" in inspect.signature(WorkerWrapperBase.__init__).parameters
+    )
+
     def __init__(self, vllm_config):
         rank = context().actor_instance.rank.rank
         # rpc_rank: rank within this executor (0 to num_workers-1)
         # global_rank: rank in distributed group (same as rpc_rank for single executor)
-        WorkerWrapperBase.__init__(self, vllm_config, rpc_rank=rank, global_rank=rank)
+        if self._wrapper_accepts_vllm_config:
+            # vLLM <= 0.13: vllm_config passed at wrapper init time
+            WorkerWrapperBase.__init__(
+                self, vllm_config, rpc_rank=rank, global_rank=rank
+            )
+        else:
+            # vLLM >= 0.14: vllm_config flows through init_worker(all_kwargs)
+            WorkerWrapperBase.__init__(self, rpc_rank=rank, global_rank=rank)
         Actor.__init__(self)
 
     def init_worker(self, all_kwargs):
@@ -234,9 +249,15 @@ def init_worker(self, all_kwargs):
         super().init_worker(all_kwargs)
 
     @endpoint
-    def execute_method(self, method: str, *args, **kwargs):
-        # For simplicity, we only support string method names for now
-        fn = getattr(self, method)
+    def execute_method(self, method, *args, **kwargs):
+        # Accept a string method name, bytes (a cloudpickle'd callable, used by
+        # vLLM >= 0.17 for lambda-based collective_rpc calls), or a plain callable.
+        if isinstance(method, bytes):
+            fn = partial(cloudpickle.loads(method), self)
+        elif isinstance(method, str):
+            fn = getattr(self, method)
+        else:
+            fn = partial(method, self)
         return fn(*args, **kwargs)
 
     @endpoint