Skip to content

Commit c16f370

Browse files
authored
fix: gemma-4 with exllamav3 (#1672)
* fix: gemma-4 with exllamav3
* temporarily remove the ascii art logo
* Revert "temporarily remove the ascii art logo"

This reverts commit f327076.
1 parent acd0661 commit c16f370

1 file changed

Lines changed: 7 additions & 1 deletion

File tree

  • aphrodite/model_executor/layers/quantization

aphrodite/model_executor/layers/quantization/exl3.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -461,7 +461,13 @@ def _linear_prefix_is_exl3(self, prefix: str) -> bool:
461461

462462
if prefix.endswith(".qkv_proj"):
463463
base = prefix.removesuffix(".qkv_proj")
464-
return all(self._is_exl3_prefix(f"{base}.{proj}") for proj in ("q_proj", "k_proj", "v_proj"))
464+
has_q = self._is_exl3_prefix(f"{base}.q_proj")
465+
has_k = self._is_exl3_prefix(f"{base}.k_proj")
466+
has_v = self._is_exl3_prefix(f"{base}.v_proj")
467+
# Gemma 4 full-attention layers can use K=V attention and store
468+
# only q_proj/k_proj tensors. The model loader duplicates K into
469+
# V, so the fused qkv_proj still needs EXL3 parameters.
470+
return has_q and has_k and (has_v or self._storage_entry(f"{base}.v_proj") is None)
465471

466472
if prefix.endswith(".gate_up_proj"):
467473
base = prefix.removesuffix(".gate_up_proj")

0 commit comments

Comments
 (0)