File tree Expand file tree Collapse file tree
aphrodite/model_executor/layers/quantization Expand file tree Collapse file tree Original file line number Diff line number Diff line change @@ -461,7 +461,13 @@ def _linear_prefix_is_exl3(self, prefix: str) -> bool:
461461
462462 if prefix .endswith (".qkv_proj" ):
463463 base = prefix .removesuffix (".qkv_proj" )
464- return all (self ._is_exl3_prefix (f"{ base } .{ proj } " ) for proj in ("q_proj" , "k_proj" , "v_proj" ))
464+ has_q = self ._is_exl3_prefix (f"{ base } .q_proj" )
465+ has_k = self ._is_exl3_prefix (f"{ base } .k_proj" )
466+ has_v = self ._is_exl3_prefix (f"{ base } .v_proj" )
467+ # Gemma 4 full-attention layers can use K=V attention and store
468+ # only q_proj/k_proj tensors. The model loader duplicates K into
469+ # V, so the fused qkv_proj still needs EXL3 parameters.
470+ return has_q and has_k and (has_v or self ._storage_entry (f"{ base } .v_proj" ) is None )
465471
466472 if prefix .endswith (".gate_up_proj" ):
467473 base = prefix .removesuffix (".gate_up_proj" )
You can’t perform that action at this time.
0 commit comments