
Commit 6989cf8

yraparti authored and assistant-librarian[bot] committed
[rocm-libraries] ROCm/rocm-libraries#6327 (commit 1e7a12e)
[CK][CK TILE] Dispatcher kernel selection heuristic for grouped conv (#6327)

## Motivation

The ML heuristic in the dispatcher does not yet support the grouped-conv operator. This PR adds support for fwd, bwd-data, and bwd-weight grouped-conv kernels. A tile_engine utility has also been added to compile and run any selected kernel configuration through the dispatcher infrastructure.

## Technical Details

1. A tile_engine utility is added to benchmark each shape with all possible kernel + tile_size combinations:
   https://github.com/ROCm/rocm-libraries/blob/users/yraparti/ck/dispatcher-grouped-conv-heuristics/projects/composablekernel/tile_engine/ops/grouped_conv/grouped_conv_full_benchmark.py
2. New LGBM regressor models for grouped conv are added to the models directory, with three separate models for fwd, bwd-data, and bwd-weight:
   https://github.com/ROCm/rocm-libraries/tree/users/yraparti/ck/dispatcher-grouped-conv-heuristics/projects/composablekernel/dispatcher/heuristics/models
3. Implemented lazy GPU initialization (dispatcher/python); see the sketch after this description.
   - **Issue**: ProcessPoolExecutor fork() + GPU context caused memory access faults
   - **Solution**: Mirror the FMHA pattern and defer GPU initialization until the first run()
   - **Changes**:
     - setup_multiple_grouped_conv_dispatchers() returns List[Path], not loaded libs
     - GpuGroupedConvRunner.__init__() no longer calls ctypes.CDLL
     - Added an _ensure_initialized() method for lazy GPU loading
     - GPU context is created only on the first run() call
   - **Benefit**: Parallel compilation now works without GPU conflicts
4. Addressed a few miscellaneous issues:
   - Fixed a BF16->FP16 naming bug in the dispatcher wrapper
   - Added new tile sizes and the comp_v5 pipeline to the arch spec to expand kernel selection
   - Added automatic padding support for unsupported shapes in the dispatcher runner
   - Created a single source of truth between tile_engine and the dispatcher for architecture and tile_size details
   - Built a validation script to compare oracle_best vs. ml_heuristic results

## Test Plan

1. Validated fwd, bwd-data, and bwd-weight kernels with both known and unseen data sets of up to 300 problems.
2. Ensured that test cases are added in both the dispatcher and tile_engine to validate the heuristic.

## Test Results

Results on unseen shapes, validated on gfx950.

#### Forward Pass Model

- **Training Data**: 48,845 measurements across 1,372 unique problem shapes
- **Validation Set**: 300 unseen problems from the model crawler
- **Validation Performance** (vs. oracle):
  - Mean Efficiency: **93.05%**
  - Median Efficiency: **96.8%**
  - P10 Efficiency: **79.9%**

#### Backward Data Gradient (bwd_data) Model

- **Training Data**: 18,773 measurements across 891 unique problem shapes
- **Validation Set**: 300 unseen problems from the model crawler
- **Validation Performance** (vs. oracle):
  - Mean Efficiency: **93.8%**
  - Median Efficiency: **96.5%**
  - P10 Efficiency: **82.9%**

#### Backward Weight Gradient (bwd_weight) Model

- **Training Data**: 34,900 measurements across 1,508 unique problem shapes
- **Validation Set**: 300 unseen problems from the model crawler
- **Validation Performance** (vs. oracle):
  - Mean Efficiency: **96.1%**
  - Median Efficiency: **99.2%**
  - P10 Efficiency: **89.4%**

## Submission Checklist

- [x] Look over the contributing guidelines at https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests.
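The lazy-initialization change in item 3 above is the most structural one. Below is a minimal sketch of the pattern, assuming a simplified layout; only setup_multiple_grouped_conv_dispatchers(), GpuGroupedConvRunner.__init__(), _ensure_initialized(), and run() are named in this PR, and the bodies here are illustrative, not the real implementation.

```python
# Hedged sketch of the lazy GPU initialization described above; the real
# code lives in dispatcher/python and its exact fields/signatures may differ.
import ctypes
from pathlib import Path
from typing import List, Optional


def setup_multiple_grouped_conv_dispatchers(lib_dirs: List[Path]) -> List[Path]:
    """Collect dispatcher shared-library paths WITHOUT loading them.

    Returning plain paths keeps ProcessPoolExecutor fork() workers free of
    any GPU context, avoiding the memory access faults noted above.
    """
    return [p for d in lib_dirs for p in sorted(d.glob("*.so"))]


class GpuGroupedConvRunner:
    def __init__(self, lib_path: Path):
        # No ctypes.CDLL here: __init__ stays GPU-free so the runner can be
        # constructed (and forked) before any GPU work happens.
        self._lib_path = lib_path
        self._lib: Optional[ctypes.CDLL] = None

    def _ensure_initialized(self) -> None:
        # Lazy load: the GPU context is created only on the first run() call.
        if self._lib is None:
            self._lib = ctypes.CDLL(str(self._lib_path))

    def run(self, *args) -> float:
        self._ensure_initialized()
        fn = self._lib.grouped_conv_run  # hypothetical C entry point
        fn.restype = ctypes.c_float
        return float(fn(*args))
```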
1 parent b05040b commit 6989cf8

65 files changed

Lines changed: 13204 additions & 387 deletions


dispatcher/bindings/ctypes/conv_bwdw_ctypes_lib.cpp

Lines changed: 16 additions & 1 deletion
@@ -129,7 +129,22 @@ float conv_bwdw_run(const void* input_ptr,
         return -1.0f;
     if(!input_ptr || !grad_output_ptr || !grad_weight_ptr)
         return -1.0f; // Null data pointer would cause kernel crash
-    return run_bwd_weight_impl(input_ptr, grad_output_ptr, grad_weight_ptr, prob, stream);
+
+    try
+    {
+        return run_bwd_weight_impl(input_ptr, grad_output_ptr, grad_weight_ptr, prob, stream);
+    }
+    catch(const std::exception&)
+    {
+        // Kernel rejected args (e.g. unsupported tile/channel combo)
+        // -3.0f matches conv_ctypes_lib.cpp:316 convention
+        // -2.0f is reserved for "no kernel / not compiled for this direction"
+        return -3.0f;
+    }
+    catch(...)
+    {
+        return -3.0f;
+    }
 #else
     return -1.0f;
 #endif
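The wrapper's negative return values form a small error protocol: -1.0f for invalid or null arguments, -2.0f reserved for "no kernel / not compiled for this direction", and -3.0f for arguments the kernel rejected. A hedged sketch of how a ctypes caller might surface these codes; conv_bwdw_run is the real entry point from the diff above, while the loading and argument handling are illustrative:

```python
# Illustrative caller-side handling of the return-code convention above.
# Only conv_bwdw_run and the -1/-2/-3 codes come from the diff; the rest
# (argtypes being configured elsewhere, timing semantics) is assumed.
import ctypes

_ERRORS = {
    -1.0: "invalid or null arguments",
    -2.0: "no kernel / not compiled for this direction",
    -3.0: "kernel rejected args (e.g. unsupported tile/channel combo)",
}


def run_bwd_weight(lib: ctypes.CDLL, *args) -> float:
    """Run the bwd-weight kernel; return the (assumed) timing on success."""
    lib.conv_bwdw_run.restype = ctypes.c_float
    result = float(lib.conv_bwdw_run(*args))
    if result < 0.0:
        raise RuntimeError(_ERRORS.get(result, f"unknown error code {result}"))
    return result
```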

dispatcher/codegen/arch_specs.json

Lines changed: 5 additions & 3 deletions
@@ -81,7 +81,9 @@
     "warp_configs": [
       [1, 4, 1],
       [2, 2, 1],
-      [4, 1, 1]
+      [4, 1, 1],
+      [8, 2, 1],
+      [4, 4, 1]
     ],
     "warp_tile_combos": {
       "fp32_fp32_fp32": [[16, 16, 4], [16, 16, 16]],
@@ -256,8 +258,8 @@
       "int8_int8_int32": [[16, 16, 32], [32, 32, 16]]
     },
     "gfx950": {
-      "fp16_fp16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [64, 4, 16]],
-      "bf16_bf16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [64, 4, 16]],
+      "fp16_fp16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [64, 4, 16], [32, 32, 32], [16, 16, 64]],
+      "bf16_bf16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [64, 4, 16], [32, 32, 32], [16, 16, 64]],
       "fp8_fp8_fp32": [[32, 32, 16], [32, 32, 32], [16, 16, 32], [16, 16, 64], [16, 16, 128], [32, 32, 64]],
       "bf8_bf8_fp32": [[32, 32, 16], [32, 32, 32], [16, 16, 64], [16, 16, 32], [16, 16, 128], [32, 32, 64]]
     }
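One way to read the new gfx950 entries: if a block tile is the elementwise product of a warp config [m_warps, n_warps, k_warps] and a warp tile [m, n, k] — the usual CK tile composition, though this diff does not spell it out — the additions enlarge the candidate space roughly as follows:

```python
# Hedged illustration of how the new gfx950 warp configs and fp16/bf16 warp
# tiles above expand the block-tile candidate space. The elementwise-product
# composition is an assumption, not something stated in the diff.
from itertools import product

warp_configs = [[1, 4, 1], [2, 2, 1], [4, 1, 1], [8, 2, 1], [4, 4, 1]]  # incl. new [8,2,1], [4,4,1]
new_warp_tiles = [[32, 32, 32], [16, 16, 64]]  # new fp16/bf16 entries

for (wm, wn, wk), (tm, tn, tk) in product(warp_configs, new_warp_tiles):
    print(f"candidate block tile: {wm * tm} x {wn * tn} x {wk * tk}")
```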

dispatcher/codegen/arch_specs_generated.py

Lines changed: 6 additions & 3 deletions
@@ -1,11 +1,10 @@
-# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
 # SPDX-License-Identifier: MIT
 
 """
 AUTO-GENERATED FILE - DO NOT EDIT DIRECTLY!
 
 Generated from: arch_specs.json
-Generated at: 2026-01-05T19:34:01.224422
+Generated at: 2026-04-10T20:07:11.665064
 
 To update this file:
     1. Edit arch_specs.json
@@ -50,7 +49,7 @@
     "gfx908": [[1, 4, 1], [2, 2, 1], [4, 1, 1]],
     "gfx90a": [[1, 4, 1], [2, 2, 1], [4, 1, 1]],
     "gfx942": [[1, 4, 1], [2, 2, 1], [4, 1, 1]],
-    "gfx950": [[1, 4, 1], [2, 2, 1], [4, 1, 1]],
+    "gfx950": [[1, 4, 1], [2, 2, 1], [4, 1, 1], [8, 2, 1], [4, 4, 1]],
     "gfx1100": [[2, 4, 1], [1, 8, 1], [8, 1, 1], [4, 2, 1]],
     "gfx1200": [[2, 4, 1], [1, 8, 1], [8, 1, 1], [4, 2, 1]],
     "gfx1201": [[2, 4, 1], [1, 8, 1], [8, 1, 1], [4, 2, 1]],
@@ -226,13 +225,17 @@
         [32, 32, 16],
         [16, 16, 32],
         [64, 4, 16],
+        [32, 32, 32],
+        [16, 16, 64],
     ],
     "bf16_bf16_fp32": [
         [32, 32, 8],
         [16, 16, 16],
         [32, 32, 16],
         [16, 16, 32],
         [64, 4, 16],
+        [32, 32, 32],
+        [16, 16, 64],
     ],
     "fp8_fp8_fp32": [
         [32, 32, 16],

dispatcher/codegen/generate_arch_specs.py

Lines changed: 4 additions & 4 deletions
@@ -230,7 +230,7 @@ def generate_cpp_header(specs: Dict[str, Any], output_path: Path):
 
     for arch, data in archs.items():
         enum_name = arch.upper().replace("GFX", "GFX_")
-        arch_enums.append(f"    {enum_name},  // {data['description']}")
+        arch_enums.append(f"    {enum_name},")
         arch_to_string_cases.append(
             f'    case GpuArch::{enum_name}: return "{arch}";'
         )
@@ -288,12 +288,12 @@ def generate_cpp_header(specs: Dict[str, Any], output_path: Path):
             f"    if (pipeline == Pipeline::{pipeline_enum_map[pipeline]}) return {limit};"
         )
 
-    content = f"""// SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+    content = f"""// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
 
 /**
  * AUTO-GENERATED FILE - DO NOT EDIT DIRECTLY!
- *
+ *
 * Generated from: arch_specs.json
 * Generated at: {timestamp}
 *
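For context, here is a self-contained sketch of the enum-generation step touched above; the JSON layout is invented for illustration, while the enum_name derivation, the dropped description comments, and the Copyright-then-SPDX header order match the diff:

```python
# Minimal sketch of the codegen step changed above: arch entries from
# arch_specs.json become a C++ GpuArch enum plus a to_string() switch.
import json

specs_json = '{"archs": {"gfx942": {}, "gfx950": {}}}'  # assumed layout
archs = json.loads(specs_json)["archs"]

arch_enums, arch_to_string_cases = [], []
for arch in archs:
    enum_name = arch.upper().replace("GFX", "GFX_")  # gfx950 -> GFX_950
    arch_enums.append(f"    {enum_name},")
    arch_to_string_cases.append(f'        case GpuArch::{enum_name}: return "{arch}";')

header = (
    "// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.\n"
    "// SPDX-License-Identifier: MIT\n\n"
    "enum class GpuArch {\n" + "\n".join(arch_enums) + "\n};\n\n"
    "inline const char* to_string(GpuArch arch) {\n    switch (arch) {\n"
    + "\n".join(arch_to_string_cases)
    + '\n        default: return "unknown";\n    }\n}\n'
)
print(header)
```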
