@@ -95,16 +95,7 @@ using device_grouped_conv_bwd_weight_v3_xdl_c_shuffle_f16_instances = std::tuple
9595 DeviceGroupedConvBwdWeight_Xdl_CShuffleV3< NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64 , 32 , 64 , 32 , 8 , 32 , 32 , 1 , 2 , S<4 , 8 , 1 >, S<2 , 0 , 1 >, S<1 , 0 , 2 >, 1 , 4 , 4 , false , S<4 , 16 , 1 >, S<2 , 0 , 1 >, S<1 , 0 , 2 >, 1 , 4 , 4 , false , 1 , 1 , S<1 , 8 , 1 , 8 >, 2 , Scheduler, PipelineVersion>,
9696 DeviceGroupedConvBwdWeight_Xdl_CShuffleV3< NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64 , 32 , 128 , 32 , 8 , 32 , 32 , 1 , 4 , S<4 , 4 , 1 >, S<2 , 0 , 1 >, S<1 , 0 , 2 >, 1 , 8 , 8 , false , S<4 , 16 , 1 >, S<2 , 0 , 1 >, S<1 , 0 , 2 >, 1 , 8 , 8 , false , 1 , 1 , S<1 , 8 , 1 , 8 >, 2 , Scheduler, PipelineVersion>,
9797 DeviceGroupedConvBwdWeight_Xdl_CShuffleV3< NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64 , 64 , 32 , 32 , 8 , 32 , 32 , 2 , 1 , S<4 , 16 , 1 >, S<2 , 0 , 1 >, S<1 , 0 , 2 >, 1 , 4 , 4 , false , S<4 , 8 , 1 >, S<2 , 0 , 1 >, S<1 , 0 , 2 >, 1 , 4 , 4 , false , 1 , 1 , S<1 , 8 , 1 , 8 >, 2 , Scheduler, PipelineVersion>,
98-
99- // Problematic instance on gfx90a - accuracy tests fail for 3D bwd weight conv as the instance produces incorrect results.
100- // The problem occurs at least for compiler version
101- // 22.0.0git (https://github.com/ROCm/llvm-project.git 2de9eb6063dd56b109cf139a75550b7b06808273+PATCHED:9a6ac45c97a1e511db838c5b46257324d2de1780)
102- // Older compilers from the 20.0 family produce correct results.
103- #if defined(CK_USE_GFX90A)
104- #else
10598 DeviceGroupedConvBwdWeight_Xdl_CShuffleV3< NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64 , 128 , 32 , 32 , 8 , 32 , 32 , 4 , 1 , S<4 , 16 , 1 >, S<2 , 0 , 1 >, S<1 , 0 , 2 >, 1 , 8 , 8 , false , S<4 , 4 , 1 >, S<2 , 0 , 1 >, S<1 , 0 , 2 >, 1 , 8 , 8 , false , 1 , 1 , S<1 , 8 , 1 , 8 >, 2 , Scheduler, PipelineVersion>,
106- #endif
107-
10899 DeviceGroupedConvBwdWeight_Xdl_CShuffleV3< NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64 , 64 , 80 , 32 , 8 , 16 , 16 , 4 , 5 , S<4 , 16 , 1 >, S<2 , 0 , 1 >, S<2 , 0 , 1 >, 1 , 4 , 4 , false , S<4 , 16 , 1 >, S<2 , 0 , 1 >, S<2 , 0 , 1 >, 1 , 5 , 4 , false , 1 , 1 , S<1 , 8 , 1 , 8 >, 2 , Scheduler, PipelineVersion>,
109100 DeviceGroupedConvBwdWeight_Xdl_CShuffleV3< NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64 , 64 , 112 , 32 , 8 , 16 , 16 , 4 , 7 , S<4 , 16 , 1 >, S<2 , 0 , 1 >, S<2 , 0 , 1 >, 1 , 4 , 4 , false , S<4 , 16 , 1 >, S<2 , 0 , 1 >, S<2 , 0 , 1 >, 1 , 7 , 4 , false , 1 , 1 , S<1 , 8 , 1 , 8 >, 2 , Scheduler, PipelineVersion>
110101 // clang-format on
@@ -177,16 +168,7 @@ using device_grouped_conv_bwd_weight_v3_xdl_c_shuffle_bf16_instances = std::tupl
177168 DeviceGroupedConvBwdWeight_Xdl_CShuffleV3< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64 , 32 , 64 , 32 , 8 , 32 , 32 , 1 , 2 , S<4 , 8 , 1 >, S<2 , 0 , 1 >, S<1 , 0 , 2 >, 1 , 4 , 4 , false , S<4 , 16 , 1 >, S<2 , 0 , 1 >, S<1 , 0 , 2 >, 1 , 4 , 4 , false , 1 , 1 , S<1 , 8 , 1 , 8 >, 2 , Scheduler, PipelineVersion>,
178169 DeviceGroupedConvBwdWeight_Xdl_CShuffleV3< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64 , 32 , 128 , 32 , 8 , 32 , 32 , 1 , 4 , S<4 , 4 , 1 >, S<2 , 0 , 1 >, S<1 , 0 , 2 >, 1 , 8 , 8 , false , S<4 , 16 , 1 >, S<2 , 0 , 1 >, S<1 , 0 , 2 >, 1 , 8 , 8 , false , 1 , 1 , S<1 , 8 , 1 , 8 >, 2 , Scheduler, PipelineVersion>,
179170 DeviceGroupedConvBwdWeight_Xdl_CShuffleV3< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64 , 64 , 32 , 32 , 8 , 32 , 32 , 2 , 1 , S<4 , 16 , 1 >, S<2 , 0 , 1 >, S<1 , 0 , 2 >, 1 , 4 , 4 , false , S<4 , 8 , 1 >, S<2 , 0 , 1 >, S<1 , 0 , 2 >, 1 , 4 , 4 , false , 1 , 1 , S<1 , 8 , 1 , 8 >, 2 , Scheduler, PipelineVersion>,
180-
181- // Problematic instance on gfx90a - accuracy tests fail for 3D bwd weight conv as the instance produces incorrect results.
182- // The problem occurs at least for compiler version
183- // 22.0.0git (https://github.com/ROCm/llvm-project.git 2de9eb6063dd56b109cf139a75550b7b06808273+PATCHED:9a6ac45c97a1e511db838c5b46257324d2de1780)
184- // Older compilers from the 20.0 family produce correct results.
185- #if defined(CK_USE_GFX90A)
186- #else
187171 DeviceGroupedConvBwdWeight_Xdl_CShuffleV3< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64 , 128 , 32 , 32 , 8 , 32 , 32 , 4 , 1 , S<4 , 16 , 1 >, S<2 , 0 , 1 >, S<1 , 0 , 2 >, 1 , 8 , 8 , false , S<4 , 4 , 1 >, S<2 , 0 , 1 >, S<1 , 0 , 2 >, 1 , 8 , 8 , false , 1 , 1 , S<1 , 8 , 1 , 8 >, 2 , Scheduler, PipelineVersion>,
188- #endif
189-
190172 DeviceGroupedConvBwdWeight_Xdl_CShuffleV3< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64 , 64 , 80 , 32 , 8 , 16 , 16 , 4 , 5 , S<4 , 16 , 1 >, S<2 , 0 , 1 >, S<2 , 0 , 1 >, 1 , 4 , 4 , false , S<4 , 16 , 1 >, S<2 , 0 , 1 >, S<2 , 0 , 1 >, 1 , 5 , 4 , false , 1 , 1 , S<1 , 8 , 1 , 8 >, 2 , Scheduler, PipelineVersion>,
191173 DeviceGroupedConvBwdWeight_Xdl_CShuffleV3< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64 , 64 , 112 , 32 , 8 , 16 , 16 , 4 , 7 , S<4 , 16 , 1 >, S<2 , 0 , 1 >, S<2 , 0 , 1 >, 1 , 4 , 4 , false , S<4 , 16 , 1 >, S<2 , 0 , 1 >, S<2 , 0 , 1 >, 1 , 7 , 4 , false , 1 , 1 , S<1 , 8 , 1 , 8 >, 2 , Scheduler, PipelineVersion>
192174 // clang-format on
0 commit comments