Skip to content

Commit aeac757

Browse files
authored
[webgpu] Fix the incorrect shapes when profiling (microsoft#26928)
This pull request refactors how input and output tensor shape information is stored and accessed in the WebGPU context. Instead of keeping references to the full input and output tensors, only their shapes are now stored, which avoids accessing already-released tensors during profiling. Before: `"inputs[0] = {1,1,768} inputs[1] = {200064,96,1} inputs[2] = {} outputs[0] = {} "`. After: `"inputs[0] = {1,1,768} inputs[1] = {200064,96,1} inputs[2] = {19206144} outputs[0] = {1,1,200064} "`.
1 parent cde7ed4 commit aeac757

2 files changed

Lines changed: 19 additions & 11 deletions

File tree

onnxruntime/core/providers/webgpu/webgpu_context.cc

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -628,17 +628,15 @@ void WebGpuContext::CollectProfilingData(profiling::Events& events) {
628628

629629
for (size_t i = 0; i < pending_kernels.size(); i++) {
630630
const PendingKernelInfo& pending_kernel_info = pending_kernels[i];
631-
const auto& inputs = pending_kernel_info.inputs;
632-
const auto& outputs = pending_kernel_info.outputs;
631+
const auto& input_shapes = pending_kernel_info.input_shapes;
632+
const auto& output_shapes = pending_kernel_info.output_shapes;
633633

634634
SS(shapes, 128);
635-
for (size_t s = 0; s < inputs.size(); s++) {
636-
const auto& input = inputs[s];
637-
shapes << "inputs[" << s << "] = " << input.override_shape.ToString() << " ";
635+
for (size_t s = 0; s < input_shapes.size(); s++) {
636+
shapes << "inputs[" << s << "] = " << input_shapes[s].ToString() << " ";
638637
}
639-
for (size_t s = 0; s < outputs.size(); s++) {
640-
const auto& output = outputs[s];
641-
shapes << "outputs[" << s << "] = " << output.override_shape.ToString() << " ";
638+
for (size_t s = 0; s < output_shapes.size(); s++) {
639+
shapes << "outputs[" << s << "] = " << output_shapes[s].ToString() << " ";
642640
}
643641

644642
if (gpu_timestamp_offset_ == 0) {

onnxruntime/core/providers/webgpu/webgpu_context.h

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -268,16 +268,26 @@ class WebGpuContext final {
268268
std::string_view cache_key,
269269
const std::vector<ProgramInput>& inputs,
270270
const std::vector<ProgramOutput>& outputs)
271-
: name{absl::StrJoin({kernel_name, kernel_type, program_name}, "&")}, cache_key{cache_key}, inputs{inputs}, outputs{outputs} {}
271+
: name{absl::StrJoin({kernel_name, kernel_type, program_name}, "&")}, cache_key{cache_key} {
272+
// Store shape information instead of tensor pointers to avoid accessing released tensors
273+
input_shapes.reserve(inputs.size());
274+
for (const auto& input : inputs) {
275+
input_shapes.emplace_back(input.use_override_shape ? input.override_shape : input.tensor->Shape());
276+
}
277+
output_shapes.reserve(outputs.size());
278+
for (const auto& output : outputs) {
279+
output_shapes.emplace_back(output.use_override_shape ? output.override_shape : output.tensor->Shape());
280+
}
281+
}
272282

273283
PendingKernelInfo(PendingKernelInfo&&) = default;
274284
PendingKernelInfo& operator=(PendingKernelInfo&&) = default;
275285
ORT_DISALLOW_COPY_AND_ASSIGNMENT(PendingKernelInfo);
276286

277287
std::string name;
278288
std::string cache_key;
279-
std::vector<ProgramInput> inputs;
280-
std::vector<ProgramOutput> outputs;
289+
std::vector<TensorShape> input_shapes;
290+
std::vector<TensorShape> output_shapes;
281291
};
282292

283293
struct PendingQueryInfo {

0 commit comments

Comments (0)