fix: gemma-4 with exllamav3 (#1672)

AlpinDale · web-flow · commit c16f370d517b · 2026-05-07T06:42:02.000+04:30
* fix: gemma-4 with exllamav3
* temporarily remove the ascii art logo
* Revert "temporarily remove the ascii art logo"
  This reverts commit f327076.
diff --git a/aphrodite/model_executor/layers/quantization/exl3.py b/aphrodite/model_executor/layers/quantization/exl3.py
@@ -461,7 +461,13 @@ def _linear_prefix_is_exl3(self, prefix: str) -> bool:
 
         if prefix.endswith(".qkv_proj"):
             base = prefix.removesuffix(".qkv_proj")
-            return all(self._is_exl3_prefix(f"{base}.{proj}") for proj in ("q_proj", "k_proj", "v_proj"))
+            has_q = self._is_exl3_prefix(f"{base}.q_proj")
+            has_k = self._is_exl3_prefix(f"{base}.k_proj")
+            has_v = self._is_exl3_prefix(f"{base}.v_proj")
+            # Gemma 4 full-attention layers can use K=V attention and store
+            # only q_proj/k_proj tensors. The model loader duplicates K into
+            # V, so the fused qkv_proj still needs EXL3 parameters.
+            return has_q and has_k and (has_v or self._storage_entry(f"{base}.v_proj") is None)
 
         if prefix.endswith(".gate_up_proj"):
             base = prefix.removesuffix(".gate_up_proj")