[ExecuTorch][WebGPU] Enable backend test suite + x86 CI

JulianCloudNTH · JulianCloudNTH · commit ea8037c04aaa · 2026-06-03T18:38:11.000-07:00
Pull Request resolved: pytorch#19964 Wires the WebGPU backend into the standard ExecuTorch backend test suite and adds an x86 Linux CI job, mirroring the Vulkan delegate: `backends/test/suite/flows/webgpu.py` plus a `WebGPUTester`, run by `oss/.github/workflows/test-backend-webgpu.yml` on SwiftShader (a software Vulkan adapter, via `wgpu-native`, minimal dependencies, no GPU). Two fixes were needed for SwiftShader's downlevel limits: request the adapter's full `requiredLimits` at device creation (software adapters default storage-buffer limits to 0), and make the `add` op's workgroup size dynamic instead of a hardcoded constant. The WGSL now declares a pipeline-overridable `override wg_size: u32 = 256` and the host clamps it to the device's `maxComputeInvocationsPerWorkgroup` (256 on real GPUs and lavapipe, 128 on SwiftShader), so SwiftShader's 128-invocation cap no longer forces a smaller workgroup size on real hardware. This mirrors the dynamic-workgroup-sizing approach in D107259348 and opens the door to selecting device/algorithm-optimal sizes later. The `add` op also validates its 1D dispatch count before allocating any GPU objects, against the device's queried `maxComputeWorkgroupsPerDimension` (falling back to the WebGPU spec-default floor of 65535 only when the limit query fails). Per Stephen's review, the workgroup-size clamp and the dispatch-count computation are factored into reusable `inline` helpers in `runtime/WebGPUUtils.h` (`clamp_workgroup_size` and `compute_1d_workgroup_count`, mirroring the Vulkan delegate's `utils::div_up`) so the other ops can share them rather than re-inlining the logic. The editable CMake build additionally marks the `vulkan_schema` subdirectory `EXCLUDE_FROM_ALL` so the WebGPU `ALL` build does not pull in targets that need glslc. ghstack-source-id: 389636486 @exported-using-ghexport Differential Revision: [D107288999](https://our.internmc.facebook.com/intern/diff/D107288999/)
diff --git a/.ci/scripts/setup-webgpu-linux-deps.sh b/.ci/scripts/setup-webgpu-linux-deps.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+set -ex
+
+# SwiftShader: software Vulkan adapter for GPU-less CI (LunarG SDK not needed).
+install_swiftshader() {
+  _https_amazon_aws=https://ossci-android.s3.amazonaws.com
+  _swiftshader_archive=swiftshader-abe07b943-prebuilt.tar.gz
+  _swiftshader_dir=/tmp/swiftshader
+  mkdir -p $_swiftshader_dir
+
+  _tmp_archive="/tmp/${_swiftshader_archive}"
+
+  curl --silent --show-error --location --fail --retry 3 --retry-all-errors \
+    --output "${_tmp_archive}" "$_https_amazon_aws/${_swiftshader_archive}"
+
+  tar -C "${_swiftshader_dir}" -xzf "${_tmp_archive}"
+
+  export VK_ICD_FILENAMES="${_swiftshader_dir}/swiftshader/build/Linux/vk_swiftshader_icd.json"
+  export LD_LIBRARY_PATH="${_swiftshader_dir}/swiftshader/build/Linux/:${LD_LIBRARY_PATH}"
+  export ETVK_USING_SWIFTSHADER=1
+}
+
+install_swiftshader
+bash backends/webgpu/scripts/setup-wgpu-native.sh
diff --git a/.ci/scripts/test_backend.sh b/.ci/scripts/test_backend.sh
@@ -57,6 +57,14 @@ if [[ "$FLOW" == *vulkan* ]]; then
     EXTRA_BUILD_ARGS+=" -DEXECUTORCH_BUILD_VULKAN=ON"
 fi
 
+if [[ "$FLOW" == *webgpu* ]]; then
+    # Setup swiftshader (software Vulkan adapter for GPU-less runners) and wgpu-native,
+    # which are required to build and run the WebGPU delegate.
+    source .ci/scripts/setup-webgpu-linux-deps.sh
+
+    EXTRA_BUILD_ARGS+=" -DEXECUTORCH_BUILD_WEBGPU=ON"
+fi
+
 if [[ "$FLOW" == *arm* ]]; then
     if [[ "$SUITE" == "operators" ]]; then
         PYTEST_RETRY_ARGS=(--reruns 2 --reruns-delay 1)
diff --git a/.github/workflows/test-backend-webgpu.yml b/.github/workflows/test-backend-webgpu.yml
@@ -0,0 +1,27 @@
+name: Test WebGPU Backend
+
+on:
+  schedule:
+    - cron: 0 2 * * *
+  push:
+    branches:
+      - main
+      - release/*
+    tags:
+      - ciflow/nightly/*
+  pull_request:
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}--${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
+  cancel-in-progress: true
+
+jobs:
+  test-webgpu:
+    uses: ./.github/workflows/_test_backend.yml
+    with:
+      backend: webgpu
+      flows: '["webgpu"]'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 120
+      run-linux: true
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -1061,6 +1061,10 @@ if(EXECUTORCH_BUILD_PYBIND)
     list(APPEND _dep_libs vulkan_backend)
   endif()
 
+  if(EXECUTORCH_BUILD_WEBGPU)
+    list(APPEND _dep_libs webgpu_backend)
+  endif()
+
   # compile options for pybind
   set(_pybind_compile_options
       $<$<CXX_COMPILER_ID:MSVC>:/EHsc
diff --git a/backends/test/suite/flow.py b/backends/test/suite/flow.py
@@ -117,6 +117,12 @@ def _load_vulkan() -> list[TestFlow]:
     return [VULKAN_TEST_FLOW, VULKAN_STATIC_INT8_PER_CHANNEL_TEST_FLOW]
 
 
+def _load_webgpu() -> list[TestFlow]:
+    from executorch.backends.test.suite.flows.webgpu import WEBGPU_TEST_FLOW
+
+    return [WEBGPU_TEST_FLOW]
+
+
 def _load_openvino() -> list[TestFlow]:
     from executorch.backends.test.suite.flows.openvino import (
         OPENVINO_INT8_TEST_FLOW,
@@ -178,6 +184,7 @@ def all_flows() -> dict[str, TestFlow]:
         + _register_flow(_load_xnnpack, "XNNPACK")
         + _register_flow(_load_coreml, "Core ML")
         + _register_flow(_load_vulkan, "Vulkan")
+        + _register_flow(_load_webgpu, "WebGPU")
         + _register_flow(_load_openvino, "OpenVINO")
         + _register_flow(_load_qnn, "QNN")
         + _register_flow(_load_arm, "ARM")
diff --git a/backends/test/suite/flows/webgpu.py b/backends/test/suite/flows/webgpu.py
@@ -0,0 +1,20 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from executorch.backends.test.suite.flow import TestFlow
+from executorch.backends.webgpu.test.tester import WebGPUTester
+
+
+def _create_webgpu_flow() -> TestFlow:
+    return TestFlow(
+        "webgpu",
+        backend="webgpu",
+        tester_factory=WebGPUTester,
+        skip_patterns=["float16", "float64"],  # Not supported in swiftshader
+    )
+
+
+WEBGPU_TEST_FLOW = _create_webgpu_flow()
diff --git a/backends/webgpu/CMakeLists.txt b/backends/webgpu/CMakeLists.txt
@@ -21,7 +21,7 @@ if(NOT TARGET vulkan_schema)
   # target), but vulkan_schema is unconditionally defined.
   add_subdirectory(
     ${CMAKE_CURRENT_SOURCE_DIR}/../vulkan
-    ${CMAKE_CURRENT_BINARY_DIR}/_vulkan_schema
+    ${CMAKE_CURRENT_BINARY_DIR}/_vulkan_schema EXCLUDE_FROM_ALL
   )
 endif()
 
diff --git a/backends/webgpu/__init__.py b/backends/webgpu/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
diff --git a/backends/webgpu/runtime/WebGPUDevice.cpp b/backends/webgpu/runtime/WebGPUDevice.cpp
@@ -121,7 +121,13 @@ WebGPUContext create_webgpu_context() {
   device_cb.callback = on_device_request;
   device_cb.userdata1 = &device_result;
 
+  // Request the adapter's full limits; software adapters default many to 0.
+  WGPULimits supported_limits = {};
   WGPUDeviceDescriptor device_desc = {};
+  if (wgpuAdapterGetLimits(ctx.adapter, &supported_limits) ==
+      WGPUStatus_Success) {
+    device_desc.requiredLimits = &supported_limits;
+  }
   device_desc.uncapturedErrorCallbackInfo.callback = on_device_error;
 
   wgpuAdapterRequestDevice(ctx.adapter, &device_desc, device_cb);
diff --git a/backends/webgpu/runtime/WebGPUUtils.h b/backends/webgpu/runtime/WebGPUUtils.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <webgpu/webgpu.h>
+
+#include <algorithm>
+#include <cstdint>
+#include <stdexcept>
+#include <string>
+
+namespace executorch::backends::webgpu::utils {
+
+// Clamp workgroup size to device limit (SwiftShader caps at 128).
+inline uint32_t clamp_workgroup_size(WGPUDevice device, uint32_t desired) {
+  WGPULimits limits = {};
+  if (wgpuDeviceGetLimits(device, &limits) == WGPUStatus_Success &&
+      limits.maxComputeInvocationsPerWorkgroup > 0) {
+    return std::min(desired, limits.maxComputeInvocationsPerWorkgroup);
+  }
+  return desired;
+}
+
+// 1D dispatch count (mirrors Vulkan div_up); throws if > device limit.
+inline uint32_t compute_1d_workgroup_count(
+    WGPUDevice device,
+    uint32_t num_threads,
+    uint32_t workgroup_size,
+    const char* op_name) {
+  uint32_t count = (num_threads + workgroup_size - 1) / workgroup_size;
+  WGPULimits limits = {};
+  uint32_t max_count =
+      wgpuDeviceGetLimits(device, &limits) == WGPUStatus_Success &&
+          limits.maxComputeWorkgroupsPerDimension > 0
+      ? limits.maxComputeWorkgroupsPerDimension
+      : 65535u; // WebGPU spec-default floor
+  if (count > max_count) {
+    throw std::runtime_error(
+        std::string("WebGPU ") + op_name +
+        ": workgroup count exceeds the 1D dispatch limit");
+  }
+  return count;
+}
+
+} // namespace executorch::backends::webgpu::utils
diff --git a/backends/webgpu/runtime/ops/add/BinaryOp.cpp b/backends/webgpu/runtime/ops/add/BinaryOp.cpp
@@ -7,6 +7,7 @@
  */
 
 #include <executorch/backends/webgpu/runtime/WebGPUGraph.h>
+#include <executorch/backends/webgpu/runtime/WebGPUUtils.h>
 #include <executorch/backends/webgpu/runtime/ops/OperatorRegistry.h>
 #include <executorch/backends/webgpu/runtime/ops/add/binary_add_wgsl.h>
 
@@ -50,6 +51,15 @@ void add_impl(WebGPUGraph& graph, const std::vector<int>& args) {
   uint32_t num_elements =
       static_cast<uint32_t>(out_tensor.nbytes / sizeof(float));
 
+  uint32_t wg_size =
+      utils::clamp_workgroup_size(device, kBinaryAddWorkgroupSize);
+  uint32_t workgroup_count =
+      utils::compute_1d_workgroup_count(device, num_elements, wg_size, "add");
+
+  WGPUConstantEntry wg_size_constant = {};
+  wg_size_constant.key = {"wg_size", WGPU_STRLEN};
+  wg_size_constant.value = static_cast<double>(wg_size);
+
   // Create uniform buffer for params
   AddParams params = {};
   params.num_elements = num_elements;
@@ -115,6 +125,8 @@ void add_impl(WebGPUGraph& graph, const std::vector<int>& args) {
   pipeline_desc.layout = pipeline_layout;
   pipeline_desc.compute.module = shader;
   pipeline_desc.compute.entryPoint = {"main", WGPU_STRLEN};
+  pipeline_desc.compute.constantCount = 1;
+  pipeline_desc.compute.constants = &wg_size_constant;
   WGPUComputePipeline pipeline =
       wgpuDeviceCreateComputePipeline(device, &pipeline_desc);
 
@@ -146,16 +158,14 @@ void add_impl(WebGPUGraph& graph, const std::vector<int>& args) {
   bg_desc.entries = bg_entries;
   WGPUBindGroup bind_group = wgpuDeviceCreateBindGroup(device, &bg_desc);
 
-  uint32_t workgroup_count =
-      (num_elements + kBinaryAddWorkgroupSize - 1) / kBinaryAddWorkgroupSize;
-
   graph.add_dispatch({pipeline, bind_group, workgroup_count});
 
   // Release intermediate objects (pipeline and bind_group are kept by dispatch)
   wgpuShaderModuleRelease(shader);
   wgpuBindGroupLayoutRelease(bgl);
   wgpuPipelineLayoutRelease(pipeline_layout);
-  // uniform_buffer is kept alive by the bind group
+  // Drop our ref; the bind group keeps the uniform buffer alive until release.
+  wgpuBufferRelease(uniform_buffer);
 }
 
 } // namespace
diff --git a/backends/webgpu/runtime/ops/add/binary_add.wgsl b/backends/webgpu/runtime/ops/add/binary_add.wgsl
@@ -8,7 +8,9 @@ struct Params {
 }
 @group(0) @binding(3) var<uniform> params: Params;
 
-@compute @workgroup_size(256)
+override wg_size: u32 = 256;
+
+@compute @workgroup_size(wg_size)
 fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
     let idx = gid.x;
     if (idx >= params.num_elements) {
diff --git a/backends/webgpu/runtime/ops/add/binary_add_wgsl.h b/backends/webgpu/runtime/ops/add/binary_add_wgsl.h
@@ -24,7 +24,9 @@ struct Params {
 }
 @group(0) @binding(3) var<uniform> params: Params;
 
-@compute @workgroup_size(256)
+override wg_size: u32 = 256;
+
+@compute @workgroup_size(wg_size)
 fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
     let idx = gid.x;
     if (idx >= params.num_elements) {
diff --git a/backends/webgpu/test/TARGETS b/backends/webgpu/test/TARGETS
@@ -0,0 +1,27 @@
+load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest")
+load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
+
+oncall("executorch")
+
+# AOT export coverage only (lowers via VulkanPartitioner, asserts a VulkanBackend delegate); no GPU runtime.
+python_unittest(
+    name = "test_add",
+    srcs = [
+        "ops/add/test_add.py",
+    ],
+    deps = [
+        "//caffe2:torch",
+        "//executorch/backends/vulkan/partitioner:vulkan_partitioner",
+        "//executorch/backends/vulkan:vulkan_preprocess",
+        "//executorch/exir:lib",
+    ],
+)
+
+runtime.python_library(
+    name = "tester",
+    srcs = ["tester.py"],
+    deps = [
+        "//executorch/backends/vulkan/partitioner:vulkan_partitioner",
+        "//executorch/backends/vulkan:vulkan_preprocess",
+    ],
+)
diff --git a/backends/webgpu/test/ops/add/test_add.py b/backends/webgpu/test/ops/add/test_add.py
@@ -7,7 +7,7 @@
 import unittest
 
 import torch
-from executorch.backends.vulkan import VulkanPartitioner
+from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner
 from executorch.exir import to_edge_transform_and_lower
 
 
diff --git a/backends/webgpu/test/tester.py b/backends/webgpu/test/tester.py
@@ -0,0 +1,65 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Any, List, Optional, Tuple
+
+import executorch
+import executorch.backends.test.harness.stages as BaseStages
+
+import torch
+from executorch.backends.test.harness import Tester as TesterBase
+from executorch.backends.test.harness.stages import StageType
+from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner
+from executorch.exir import EdgeCompileConfig
+from executorch.exir.backend.partitioner import Partitioner
+
+
+# Lowers via VulkanPartitioner (WebGPU consumes the Vulkan VK00 serialization).
+class Partition(BaseStages.Partition):
+    def __init__(self, partitioner: Optional[Partitioner] = None):
+        super().__init__(
+            partitioner=partitioner or VulkanPartitioner({"skip_bool_tensors": True}),
+        )
+
+
+class ToEdgeTransformAndLower(BaseStages.ToEdgeTransformAndLower):
+    def __init__(
+        self,
+        partitioners: Optional[List[Partitioner]] = None,
+        edge_compile_config: Optional[EdgeCompileConfig] = None,
+    ):
+        if partitioners is None:
+            partitioners = [VulkanPartitioner({"skip_bool_tensors": True})]
+
+        super().__init__(
+            default_partitioner_cls=VulkanPartitioner,
+            partitioners=partitioners,
+            edge_compile_config=edge_compile_config
+            or EdgeCompileConfig(_check_ir_validity=False),
+        )
+
+
+class WebGPUTester(TesterBase):
+    def __init__(
+        self,
+        module: torch.nn.Module,
+        example_inputs: Tuple[torch.Tensor],
+        dynamic_shapes: Optional[Tuple[Any]] = None,
+    ):
+        stage_classes = (
+            executorch.backends.test.harness.Tester.default_stage_classes()
+            | {
+                StageType.PARTITION: Partition,
+                StageType.TO_EDGE_TRANSFORM_AND_LOWER: ToEdgeTransformAndLower,
+            }
+        )
+
+        super().__init__(
+            module=module,
+            stage_classes=stage_classes,
+            example_inputs=example_inputs,
+            dynamic_shapes=dynamic_shapes,
+        )

Original file line number	Diff line number	Diff line change
`@@ -21,7 +21,7 @@ if(NOT TARGET vulkan_schema)`
`21`	`21`	`# target), but vulkan_schema is unconditionally defined.`
`22`	`22`	`add_subdirectory(`
`23`	`23`	`${CMAKE_CURRENT_SOURCE_DIR}/../vulkan`
`24`		`- ${CMAKE_CURRENT_BINARY_DIR}/_vulkan_schema`
	`24`	`+ ${CMAKE_CURRENT_BINARY_DIR}/_vulkan_schema EXCLUDE_FROM_ALL`
`25`	`25`	`)`
`26`	`26`	`endif()`
`27`	`27`