From d49a1b586d02b01731e56fa302ace90896326080 Mon Sep 17 00:00:00 2001 From: Kasper Nielsen Date: Sat, 18 Apr 2026 20:33:30 +0200 Subject: [PATCH] dot/schedule_bench: fix A_stride_m for transpose_a kernels For transpose_a kernels the 6th kernel argument is the k1/tile_k stride of the packed A, not the m stride, so pass a_k_strides[0] instead of a_stride_m when pack_a is set (as subgraph/dot.cc does). --- ynnpack/kernels/dot/schedule_bench.cc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/ynnpack/kernels/dot/schedule_bench.cc b/ynnpack/kernels/dot/schedule_bench.cc index f86ff96591c..4654485f9af 100644 --- a/ynnpack/kernels/dot/schedule_bench.cc +++ b/ynnpack/kernels/dot/schedule_bench.cc @@ -152,7 +152,12 @@ double run_benchmark(TA, TB, TC, const kernel_info& kernel, size_t m, size_t n, size_t a_stride_m, span a_k_strides, const void* b_ptr, span b_k_strides, size_t init_c_stride_m, const void* init_c, void* c_ptr) { - kernel.kernel(m, n, k[2], k[1], k[0], a_stride_m, + // For dot_flag::transpose_a kernels, the 6th kernel arg is the + // stride of the k1/tile_k dimension of the packed A (see dot.h), + // not the m stride. subgraph/dot.cc does the same swap — mirror + // it here. + kernel.kernel(m, n, k[2], k[1], k[0], + pack_a ? a_k_strides[0] : a_stride_m, a_k_strides[2], a_k_strides[1], a_ptr, b_k_strides[2], b_k_strides[1], b_k_strides[0], b_ptr, init_c_stride_m, init_c, c.stride(0) * sizeof(TC), c_ptr);