Skip to content

Commit 4ba6625

Browse files
authored
fix bug (#8043)
1 parent 02a0042 commit 4ba6625

1 file changed

Lines changed: 2 additions & 1 deletion

File tree

fastdeploy/worker/gpu_model_runner.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1749,7 +1749,8 @@ def _initialize_attn_backend(self) -> None:
 decoder_block_shape_q=decoder_block_shape_q,
 decoder_step_token_num=self.speculative_config.num_speculative_tokens + 1,
 num_heads=num_heads,
-kv_num_heads=max(kv_num_heads_per_layer),
+# This requires the largest possible group size, corresponding to the smallest kv-num-heads.
+kv_num_heads=min(kv_num_heads_per_layer),
 block_size=self.fd_config.cache_config.block_size,
 head_dim=head_dim,
 dtype=self.model_config.dtype,

0 commit comments

Comments
 (0)