[webgpu] Fix opset-12 softmax nhwc issue (microsoft#24227)

xhcao · web-flow · commit ad2e56524cc6 · 2025-03-31T09:06:25.000-07:00
### Description
<!-- Describe your changes. -->



### Motivation and Context
<!-- - Why is this change required? What problem does it solve?
- If it fixes an open issue, please link to the issue here. -->
diff --git a/onnxruntime/core/providers/webgpu/math/softmax.cc b/onnxruntime/core/providers/webgpu/math/softmax.cc
@@ -156,7 +156,9 @@ Status Softmax::ComputeInternal(ComputeContext& context) const {
 
   // normalize axis
   size_t axis = static_cast<size_t>(HandleNegativeAxis(axis_, input_rank));
-  bool is_transpose_required = axis < input_rank - 1;
+  // For opsets below 13, `axis` marks where the input is coerced to 2D: the dimensions before `axis` are
+  // treated as the batch and the rest are flattened together, so no transpose is needed for those opsets.
+  bool is_transpose_required = axis < input_rank - 1 && opset_ >= 13;
 
   TensorShape transposed_input_shape;
   Tensor transposed_input_tensor;
@@ -179,7 +181,9 @@ Status Softmax::ComputeInternal(ComputeContext& context) const {
     intermediate_output = context.CreateGPUTensor(output_tensor->DataType(), transposed_input_shape);
   }
 
-  const int64_t cols = is_transpose_required ? transposed_input_shape[input_rank - 1] : input_shape[input_rank - 1];
+  // For opsets below 13, `axis` splits the input dimensions into two parts: the dimensions before `axis`
+  // are treated as the batch, and Softmax is computed over the flattened dimensions from `axis` onward.
+  const int64_t cols = is_transpose_required ? transposed_input_shape[input_rank - 1] : (opset_ >= 13 ? input_shape[input_rank - 1] : input_shape.SizeFromDimension(axis));
   const int64_t rows = input_shape.Size() / cols;
   const int64_t components = GetMaxComponents(cols);
   const auto packed_cols = cols / components;
diff --git a/onnxruntime/core/providers/webgpu/math/softmax.h b/onnxruntime/core/providers/webgpu/math/softmax.h
@@ -14,7 +14,7 @@ namespace webgpu {
 class Softmax final : public WebGpuKernel {
  public:
   Softmax(const OpKernelInfo& info) : WebGpuKernel{info} {
-    int opset_ = info.node().SinceVersion();
+    opset_ = info.node().SinceVersion();
     int64_t axis;
     Status status = info.GetAttr<int64_t>("axis", &axis);
 
@@ -33,6 +33,7 @@ class Softmax final : public WebGpuKernel {
 
  private:
   int64_t axis_;
+  int opset_;
 };
 
 class SoftmaxProgram final : public Program<SoftmaxProgram> {
diff --git a/onnxruntime/test/providers/cpu/math/softmax_test.cc b/onnxruntime/test/providers/cpu/math/softmax_test.cc
@@ -422,8 +422,7 @@ TEST(SoftmaxOperator, GH15949_regression_test) {
                           {0.00032932f, 0.01798029f, 0.9816904f});
 
   // disable TRT as it does not support axis=0 as used by the model
-  // TODO: Fix the Softmax operator of WebGPU EP.
-  tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kWebGpuExecutionProvider});
+  tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
 }
 
 }  // namespace test

Original file line number	Diff line number	Diff line change
`@@ -422,8 +422,7 @@ TEST(SoftmaxOperator, GH15949_regression_test) {`
`422`	`422`	`{0.00032932f, 0.01798029f, 0.9816904f});`
`423`	`423`
`424`	`424`	`// disable TRT as it does not support axis=0 as used by the model`
`425`		`- // TODO: Fix the Softmax operator of WebGPU EP.`
`426`		`- tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kWebGpuExecutionProvider});`
	`425`	`+ tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});`
`427`	`426`	`}`
`428`	`427`
`429`	`428`	`} // namespace test`