@@ -85,17 +85,20 @@ static HWY_NOINLINE void DoMatMul_BRGeMM(
8585 ke.K_blk = cfg.K_blk ;
8686 ke.N_blk = cfg.N_blk ;
8787 ke.M_blk = std::min (cfg.M_blk , M);
88+ ke.div_M_blk = hwy::Divisor (ke.M_blk );
89+ ke.div_N_blk = hwy::Divisor (ke.N_blk );
90+ ke.div_K_blk = hwy::Divisor (ke.K_blk );
8891
89- ke.M_tail = M % ke.M_blk ;
90- ke.N_tail = N % ke.N_blk ;
91- ke.K_tail = K % ke.K_blk ;
92+ ke.M_tail = ke.div_M_blk . Remainder (M) ;
93+ ke.N_tail = ke.div_N_blk . Remainder (N) ;
94+ ke.K_tail = ke.div_K_blk . Remainder (K) ;
9295
9396 // Floor division: K_tail remainder is handled by a dedicated brg_ktail
9497 // kernel rather than padding K, avoiding extra memory writes to zero-pad
9598 // A and B along the K dimension.
96- ke.K_chunks = K / ke.K_blk ;
97- ke.N_full_tiles = N / ke.N_blk ;
98- ke.M_full_tiles = M / ke.M_blk ;
99+ ke.K_chunks = ke.div_K_blk . Divide (K) ;
100+ ke.N_full_tiles = ke.div_N_blk . Divide (N) ;
101+ ke.M_full_tiles = ke.div_M_blk . Divide (M) ;
99102 ke.N_total_tiles = ke.N_full_tiles + (ke.N_tail ? 1 : 0 );
100103 ke.M_total_tiles = ke.M_full_tiles + (ke.M_tail ? 1 : 0 );
101104 ke.N_padded = ke.N_total_tiles * ke.N_blk ;
@@ -367,8 +370,8 @@ static HWY_NOINLINE void DoMatMul_BRGeMM(
367370 const auto execute_tile = [&](size_t m_start, size_t n_start,
368371 size_t k_super, float * temp_C,
369372 uint8_t * scratch) HWY_ATTR {
370- const size_t m_tile_idx = m_start / ke.M_blk ;
371- const size_t n_tile_idx = n_start / ke.N_blk ;
373+ const size_t m_tile_idx = ke.div_M_blk . Divide (m_start) ;
374+ const size_t n_tile_idx = ke.div_N_blk . Divide (n_start) ;
372375 const int mi = (m_tile_idx < ke.M_full_tiles ) ? 0 : 1 ;
373376 const int ni = (n_tile_idx < ke.N_full_tiles ) ? 0 : 1 ;
374377 const size_t cur_m = ke.m_sizes [mi];
0 commit comments