fix parallelism case failure in xpu (#13270)

sywangyi · sayakpaul · web-flow · commit 9677859ebf95 · 2026-03-17T08:52:15.000+05:30
* fix parallelism case failure in xpu

Signed-off-by: Wang, Yi &lt;yi.a.wang@intel.com&gt;

* updated

Signed-off-by: Wang, Yi &lt;yi.a.wang@intel.com&gt;

---------

Signed-off-by: Wang, Yi &lt;yi.a.wang@intel.com&gt;
Co-authored-by: Sayak Paul &lt;spsayakpaul@gmail.com&gt;
diff --git a/tests/models/testing_utils/parallelism.py b/tests/models/testing_utils/parallelism.py
@@ -26,9 +26,17 @@
 from ...testing_utils import (
     is_context_parallel,
     require_torch_multi_accelerator,
+    torch_device,
 )
 
 
+# Device configuration mapping
+DEVICE_CONFIG = {
+    "cuda": {"backend": "nccl", "module": torch.cuda},
+    "xpu": {"backend": "xccl", "module": torch.xpu},
+}
+
+
 def _find_free_port():
     """Find a free port on localhost."""
     with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
@@ -47,12 +55,17 @@ def _context_parallel_worker(rank, world_size, master_port, model_class, init_di
         os.environ["RANK"] = str(rank)
         os.environ["WORLD_SIZE"] = str(world_size)
 
+        # Get device configuration
+        device_config = DEVICE_CONFIG.get(torch_device, DEVICE_CONFIG["cuda"])
+        backend = device_config["backend"]
+        device_module = device_config["module"]
+
         # Initialize process group
-        dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)
+        dist.init_process_group(backend=backend, rank=rank, world_size=world_size)
 
         # Set device for this process
-        torch.cuda.set_device(rank)
-        device = torch.device(f"cuda:{rank}")
+        device_module.set_device(rank)
+        device = torch.device(f"{torch_device}:{rank}")
 
         # Create model
         model = model_class(**init_dict)
@@ -103,10 +116,16 @@ def _custom_mesh_worker(
         os.environ["RANK"] = str(rank)
         os.environ["WORLD_SIZE"] = str(world_size)
 
-        dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)
+        # Get device configuration
+        device_config = DEVICE_CONFIG.get(torch_device, DEVICE_CONFIG["cuda"])
+        backend = device_config["backend"]
+        device_module = device_config["module"]
 
-        torch.cuda.set_device(rank)
-        device = torch.device(f"cuda:{rank}")
+        dist.init_process_group(backend=backend, rank=rank, world_size=world_size)
+
+        # Set device for this process
+        device_module.set_device(rank)
+        device = torch.device(f"{torch_device}:{rank}")
 
         model = model_class(**init_dict)
         model.to(device)
@@ -116,7 +135,7 @@ def _custom_mesh_worker(
 
         # DeviceMesh must be created after init_process_group, inside each worker process.
         mesh = torch.distributed.device_mesh.init_device_mesh(
-            "cuda", mesh_shape=mesh_shape, mesh_dim_names=mesh_dim_names
+            torch_device, mesh_shape=mesh_shape, mesh_dim_names=mesh_dim_names
         )
         cp_config = ContextParallelConfig(**cp_dict, mesh=mesh)
         model.enable_parallelism(config=cp_config)