Skip to content

Commit 2f90d51

Browse files
committed
[WebGPU] Add native CMake build and runtime integration
Wire wgpu-native into the CMake build and integrate WebGPUDevice into the compute graph for native Metal/Vulkan execution.
1 parent b5dbe09 commit 2f90d51

4 files changed

Lines changed: 144 additions & 25 deletions

File tree

backends/webgpu/CMakeLists.txt

Lines changed: 74 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -15,33 +15,62 @@ include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
1515
# Ensure vulkan_schema is available even when EXECUTORCH_BUILD_VULKAN is OFF.
1616
# The WebGPU backend reuses the Vulkan FlatBuffer serialization format.
1717
if(NOT TARGET vulkan_schema)
18-
# We need the schema generation from the Vulkan backend. Build only the
19-
# schema target by including the Vulkan CMakeLists.txt. The full Vulkan
20-
# backend will only build if EXECUTORCH_BUILD_VULKAN is ON (which gates the
21-
# vulkan_backend target), but vulkan_schema is unconditionally defined.
18+
# We need the schema generation from the Vulkan backend. Build only the schema
19+
# target by including the Vulkan CMakeLists.txt. The full Vulkan backend will
20+
# only build if EXECUTORCH_BUILD_VULKAN is ON (which gates the vulkan_backend
21+
# target), but vulkan_schema is unconditionally defined.
2222
add_subdirectory(
2323
${CMAKE_CURRENT_SOURCE_DIR}/../vulkan
2424
${CMAKE_CURRENT_BINARY_DIR}/_vulkan_schema
2525
)
2626
endif()
2727

2828
set(WEBGPU_SRCS
29-
runtime/WebGPUBackend.cpp
30-
runtime/WebGPUGraph.cpp
31-
runtime/WebGPUDelegateHeader.cpp
32-
runtime/ops/OperatorRegistry.cpp
33-
runtime/ops/add/BinaryOp.cpp
29+
runtime/WebGPUBackend.cpp runtime/WebGPUGraph.cpp
30+
runtime/WebGPUDelegateHeader.cpp runtime/WebGPUDevice.cpp
31+
runtime/ops/OperatorRegistry.cpp runtime/ops/add/BinaryOp.cpp
3432
)
3533

3634
add_library(webgpu_backend ${WEBGPU_SRCS})
3735

3836
target_include_directories(
39-
webgpu_backend
40-
PRIVATE $<BUILD_INTERFACE:${EXECUTORCH_ROOT}/..>
37+
webgpu_backend PRIVATE $<BUILD_INTERFACE:${EXECUTORCH_ROOT}/..>
4138
)
4239

4340
target_link_libraries(webgpu_backend PRIVATE vulkan_schema executorch_core)
4441

42+
# Native build: link against wgpu-native
43+
set(WGPU_NATIVE_DIR
44+
"${CMAKE_CURRENT_SOURCE_DIR}/third-party/wgpu-native"
45+
CACHE PATH "Path to wgpu-native installation"
46+
)
47+
48+
if(NOT EXISTS "${WGPU_NATIVE_DIR}/lib/libwgpu_native.a")
49+
message(FATAL_ERROR "wgpu-native not found at ${WGPU_NATIVE_DIR}. "
50+
"Run: bash backends/webgpu/scripts/setup-wgpu-native.sh"
51+
)
52+
endif()
53+
54+
add_library(wgpu_native STATIC IMPORTED)
55+
set_target_properties(
56+
wgpu_native PROPERTIES IMPORTED_LOCATION
57+
"${WGPU_NATIVE_DIR}/lib/libwgpu_native.a"
58+
)
59+
60+
target_include_directories(
61+
webgpu_backend PUBLIC $<BUILD_INTERFACE:${WGPU_NATIVE_DIR}/include>
62+
)
63+
target_link_libraries(webgpu_backend PRIVATE wgpu_native)
64+
65+
if(APPLE)
66+
target_link_libraries(
67+
webgpu_backend PRIVATE "-framework Metal" "-framework QuartzCore"
68+
"-framework CoreGraphics" "-framework Foundation"
69+
)
70+
else()
71+
target_link_libraries(webgpu_backend PRIVATE dl m pthread)
72+
endif()
73+
4574
target_compile_options(webgpu_backend PRIVATE -fexceptions)
4675

4776
# Link with --whole-archive for static registration of backend + ops
@@ -54,3 +83,37 @@ install(
5483
EXPORT ExecuTorchTargets
5584
DESTINATION ${CMAKE_INSTALL_LIBDIR}
5685
)
86+
87+
# Native test target
88+
if(EXECUTORCH_BUILD_WEBGPU_TEST)
89+
add_executable(webgpu_native_test test/test_webgpu_native.cpp)
90+
91+
target_include_directories(
92+
webgpu_native_test PRIVATE $<BUILD_INTERFACE:${EXECUTORCH_ROOT}/..>
93+
"${WGPU_NATIVE_DIR}/include"
94+
)
95+
96+
target_link_libraries(
97+
webgpu_native_test
98+
PRIVATE webgpu_backend
99+
wgpu_native
100+
executorch_core
101+
extension_module_static
102+
extension_data_loader
103+
extension_tensor
104+
portable_kernels
105+
portable_ops_lib
106+
)
107+
108+
if(APPLE)
109+
target_link_libraries(
110+
webgpu_native_test PRIVATE "-framework Metal" "-framework QuartzCore"
111+
"-framework CoreGraphics"
112+
)
113+
else()
114+
target_link_libraries(webgpu_native_test PRIVATE dl m pthread)
115+
endif()
116+
117+
target_compile_options(webgpu_native_test PRIVATE -fexceptions)
118+
set_property(TARGET webgpu_native_test PROPERTY CXX_STANDARD 17)
119+
endif()

backends/webgpu/runtime/WebGPUGraph.cpp

Lines changed: 39 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -11,14 +11,18 @@
1111

1212
#include <executorch/backends/vulkan/serialization/schema_generated.h>
1313

14+
#include <executorch/backends/webgpu/runtime/WebGPUDevice.h>
15+
#include <webgpu/wgpu.h>
16+
1417
#include <cstring>
1518
#include <stdexcept>
1619

1720
namespace executorch {
1821
namespace backends {
1922
namespace webgpu {
2023

21-
// vkgraph namespace is declared at global scope in the generated FlatBuffer header
24+
// vkgraph namespace is declared at global scope in the generated FlatBuffer
25+
// header
2226

2327
namespace {
2428

@@ -69,6 +73,13 @@ WebGPUGraph::~WebGPUGraph() {
6973
void WebGPUGraph::build(
7074
const void* flatbuffer_data,
7175
const uint8_t* constant_data) {
76+
if (!device_) {
77+
auto* ctx = get_default_webgpu_context();
78+
if (ctx) {
79+
device_ = ctx->device;
80+
instance_ = ctx->instance;
81+
}
82+
}
7283
if (!device_) {
7384
throw std::runtime_error(
7485
"WebGPU device not available. "
@@ -113,8 +124,7 @@ void WebGPUGraph::build(
113124
// Create GPU buffer
114125
WGPUBufferDescriptor buf_desc = {};
115126
buf_desc.size = tensor.nbytes > 0 ? tensor.nbytes : 4;
116-
buf_desc.usage =
117-
WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst |
127+
buf_desc.usage = WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst |
118128
WGPUBufferUsage_CopySrc;
119129
buf_desc.mappedAtCreation = false;
120130
tensor.buffer = wgpuDeviceCreateBuffer(device_, &buf_desc);
@@ -123,8 +133,7 @@ void WebGPUGraph::build(
123133
int constant_id = vk_tensor->constant_id();
124134
if (constant_id >= 0 && constant_data) {
125135
const auto* constants = graph->constants();
126-
if (constants &&
127-
constant_id < static_cast<int>(constants->size())) {
136+
if (constants && constant_id < static_cast<int>(constants->size())) {
128137
const auto* vk_bytes = constants->Get(constant_id);
129138
// Only upload from embedded bytes (not named data map)
130139
if (vk_bytes->offset() != UINT64_MAX) {
@@ -188,8 +197,7 @@ void WebGPUGraph::build(
188197
std::string op_name = op_call->name()->str();
189198

190199
if (!webgpu_operator_registry().has_op(op_name)) {
191-
throw std::runtime_error(
192-
"WebGPU backend: unsupported op: " + op_name);
200+
throw std::runtime_error("WebGPU backend: unsupported op: " + op_name);
193201
}
194202

195203
const auto* fb_args = op_call->args();
@@ -226,7 +234,8 @@ void WebGPUGraph::execute() {
226234

227235
for (const auto& dispatch : dispatches_) {
228236
wgpuComputePassEncoderSetPipeline(pass, dispatch.pipeline);
229-
wgpuComputePassEncoderSetBindGroup(pass, 0, dispatch.bind_group, 0, nullptr);
237+
wgpuComputePassEncoderSetBindGroup(
238+
pass, 0, dispatch.bind_group, 0, nullptr);
230239
wgpuComputePassEncoderDispatchWorkgroups(
231240
pass, dispatch.workgroup_count_x, 1, 1);
232241
}
@@ -273,8 +282,7 @@ void buffer_map_callback(
273282

274283
} // namespace
275284

276-
void WebGPUGraph::copy_outputs(
277-
std::vector<std::pair<void*, size_t>>& outputs) {
285+
void WebGPUGraph::copy_outputs(std::vector<std::pair<void*, size_t>>& outputs) {
278286
for (size_t i = 0; i < outputs.size() && i < output_staging_buffers_.size();
279287
i++) {
280288
MapCallbackData cb_data;
@@ -289,9 +297,12 @@ void WebGPUGraph::copy_outputs(
289297
outputs[i].second,
290298
cb_info);
291299

300+
// Poll until the map callback fires.
301+
wgpuDevicePoll(device_, true, nullptr);
302+
292303
if (cb_data.status == WGPUMapAsyncStatus_Success) {
293-
const void* mapped =
294-
wgpuBufferGetConstMappedRange(output_staging_buffers_[i], 0, outputs[i].second);
304+
const void* mapped = wgpuBufferGetConstMappedRange(
305+
output_staging_buffers_[i], 0, outputs[i].second);
295306
std::memcpy(outputs[i].first, mapped, outputs[i].second);
296307
wgpuBufferUnmap(output_staging_buffers_[i]);
297308
} else {
@@ -300,6 +311,22 @@ void WebGPUGraph::copy_outputs(
300311
}
301312
}
302313

314+
WebGPUMemoryStats WebGPUGraph::memory_stats() const {
315+
WebGPUMemoryStats stats;
316+
for (size_t i = 0; i < value_types_.size(); i++) {
317+
if (value_types_[i] == ValueType::Tensor && tensors_[i].nbytes > 0) {
318+
stats.tensor_buffer_bytes += tensors_[i].nbytes;
319+
stats.num_tensors++;
320+
}
321+
}
322+
for (size_t i = 0; i < output_ids_.size(); i++) {
323+
stats.staging_buffer_bytes += tensors_[output_ids_[i]].nbytes;
324+
}
325+
stats.uniform_buffer_bytes = uniform_buffer_bytes_;
326+
stats.num_dispatches = static_cast<int>(dispatches_.size());
327+
return stats;
328+
}
329+
303330
} // namespace webgpu
304331
} // namespace backends
305332
} // namespace executorch

backends/webgpu/runtime/WebGPUGraph.h

Lines changed: 28 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -30,6 +30,18 @@ struct WebGPUDispatch {
3030
uint32_t workgroup_count_x = 1;
3131
};
3232

33+
struct WebGPUMemoryStats {
34+
size_t tensor_buffer_bytes = 0;
35+
size_t staging_buffer_bytes = 0;
36+
size_t uniform_buffer_bytes = 0;
37+
int num_tensors = 0;
38+
int num_dispatches = 0;
39+
40+
size_t total_bytes() const {
41+
return tensor_buffer_bytes + staging_buffer_bytes + uniform_buffer_bytes;
42+
}
43+
};
44+
3345
class WebGPUGraph {
3446
public:
3547
WebGPUGraph();
@@ -83,6 +95,19 @@ class WebGPUGraph {
8395
dispatches_.push_back(dispatch);
8496
}
8597

98+
void add_uniform_buffer_bytes(size_t bytes) {
99+
uniform_buffer_bytes_ += bytes;
100+
}
101+
102+
void set_instance(WGPUInstance instance) {
103+
instance_ = instance;
104+
}
105+
void set_device(WGPUDevice device) {
106+
device_ = device;
107+
}
108+
109+
WebGPUMemoryStats memory_stats() const;
110+
86111
int num_values() const {
87112
return static_cast<int>(value_types_.size());
88113
}
@@ -94,6 +119,7 @@ class WebGPUGraph {
94119
}
95120

96121
private:
122+
WGPUInstance instance_ = nullptr;
97123
WGPUDevice device_ = nullptr;
98124
WGPUQueue queue_ = nullptr;
99125

@@ -112,6 +138,8 @@ class WebGPUGraph {
112138
std::vector<WGPUBuffer> output_staging_buffers_;
113139

114140
std::vector<WebGPUDispatch> dispatches_;
141+
142+
size_t uniform_buffer_bytes_ = 0;
115143
};
116144

117145
} // namespace webgpu

backends/webgpu/runtime/ops/add/BinaryOp.cpp

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -64,6 +64,8 @@ void add_impl(WebGPUGraph& graph, const std::vector<int>& args) {
6464
std::memcpy(mapped, &params, sizeof(AddParams));
6565
wgpuBufferUnmap(uniform_buffer);
6666

67+
graph.add_uniform_buffer_bytes(sizeof(AddParams));
68+
6769
// Create shader module from built-in WGSL source
6870
WGPUShaderSourceWGSL wgsl_desc = {};
6971
wgsl_desc.chain.sType = WGPUSType_ShaderSourceWGSL;
@@ -99,8 +101,7 @@ void add_impl(WebGPUGraph& graph, const std::vector<int>& args) {
99101
WGPUBindGroupLayoutDescriptor bgl_desc = {};
100102
bgl_desc.entryCount = 4;
101103
bgl_desc.entries = entries;
102-
WGPUBindGroupLayout bgl =
103-
wgpuDeviceCreateBindGroupLayout(device, &bgl_desc);
104+
WGPUBindGroupLayout bgl = wgpuDeviceCreateBindGroupLayout(device, &bgl_desc);
104105

105106
// Create pipeline layout
106107
WGPUPipelineLayoutDescriptor pl_desc = {};

0 commit comments

Comments (0)