[#14679][fix] Fix fused-QKV TP sharding for Phi-3/Phi-4

guan404ming · guan404ming · commit 6f022cab51b7 · 2026-06-18T18:22:42.000+08:00
Signed-off-by: Guan-Ming (Wesley) Chiu <105915352+guan404ming@users.noreply.github.com>
diff --git a/examples/auto_deploy/model_registry/models.yaml b/examples/auto_deploy/model_registry/models.yaml
@@ -67,18 +67,15 @@ models:
 - name: Qwen/Qwen3-8B
   config_id: default_ws_2
   yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml', 'enable_sharder_ir.yaml']
-# RuntimeError: a and b must have same reduction dim, but got [s44*s70, 5120] X [2560, 5120]. See https://github.com/NVIDIA/TensorRT-LLM/issues/14679
-# - name: microsoft/phi-4
-#   config_id: default_ws_2
-#   yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml']
-# RuntimeError: a and b must have same reduction dim, but got [s44*s70, 5120] X [2560, 5120]. See https://github.com/NVIDIA/TensorRT-LLM/issues/14679
-# - name: microsoft/Phi-4-reasoning
-#   config_id: default_ws_2
-#   yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml']
-# RuntimeError: a and b must have same reduction dim, but got [s44*s70, 5120] X [2560, 5120]. See https://github.com/NVIDIA/TensorRT-LLM/issues/14679
-# - name: microsoft/Phi-4-reasoning-plus
-#   config_id: default_ws_2
-#   yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml']
+- name: microsoft/phi-4
+  config_id: default_ws_2
+  yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml']
+- name: microsoft/Phi-4-reasoning
+  config_id: default_ws_2
+  yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml']
+- name: microsoft/Phi-4-reasoning-plus
+  config_id: default_ws_2
+  yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml']
 # IndexError: list index out of range in AutoDeploy sharding path. See https://github.com/NVIDIA/TensorRT-LLM/issues/14681
 # - name: google/gemma-1.1-7b-it
 #   config_id: default_ws_2
diff --git a/tensorrt_llm/_torch/auto_deploy/transform/library/sharding.py b/tensorrt_llm/_torch/auto_deploy/transform/library/sharding.py
@@ -2855,7 +2855,7 @@ def _process_mla_sharding(
 
 def _determine_fused_weight_dims(
     linear_nodes: List[Node],
-) -> None:
+) -> Optional[List[int]]:
     """
     Determine the fused weight dims for the given linear nodes and subgraph nodes.
     """
@@ -2900,6 +2900,8 @@ def _determine_fused_weight_dims(
             weight_dim = linear_node.meta["val"].shape[2]
             fused_weight_dims = [weight_dim // num_chunks] * num_chunks
 
+    return fused_weight_dims
+
 
 def _find_upstream_qk_proj(node: Node, gm: GraphModule) -> Optional[str]:
     """
diff --git a/tests/unittest/auto_deploy/multigpu/transformations/library/test_tp_sharding.py b/tests/unittest/auto_deploy/multigpu/transformations/library/test_tp_sharding.py
@@ -40,6 +40,7 @@
     ShardingTransformConfig,
     SplitDimension,
     WeightShardingInfo,
+    _determine_fused_weight_dims,
     _update_node_args,
 )
 from tensorrt_llm._torch.auto_deploy.transform.optimizer import InferenceOptimizer
@@ -502,6 +503,44 @@ def test_update_node_args_preserves_nested_symbolic_shape_nodes():
     assert placeholder_targets == ["x"]
 
 
+class _FusedQKVProj(nn.Module):
+    """Single fused qkv_proj sliced into q/k/v (Phi-3/Phi-4 layout)."""
+
+    def __init__(self, hidden_size: int, n_heads: int, n_kv_heads: int, head_dim: int):
+        super().__init__()
+        self.q_dim = n_heads * head_dim
+        self.kv_dim = n_kv_heads * head_dim
+        self.qkv_proj = nn.Linear(hidden_size, self.q_dim + 2 * self.kv_dim, bias=False)
+
+    def forward(self, x):
+        qkv = self.qkv_proj(x)
+        q = qkv[..., : self.q_dim]
+        k = qkv[..., self.q_dim : self.q_dim + self.kv_dim]
+        v = qkv[..., self.q_dim + self.kv_dim :]
+        return q.sum() + k.sum() + v.sum()
+
+
+def test_determine_fused_weight_dims_qkv():
+    """Regression for NVIDIA/TensorRT-LLM#14679: fused qkv_proj column sharding.
+
+    `_determine_fused_weight_dims` must return the [q, k, v] split sizes so the
+    slice boundaries get divided by world_size during column sharding. A missing
+    return made it yield None, leaving the slices at full width and breaking TP
+    for fused-qkv models like Phi-3/Phi-4.
+    """
+    hidden_size, n_heads, n_kv_heads, head_dim = 32, 4, 2, 8
+    model = _FusedQKVProj(hidden_size, n_heads, n_kv_heads, head_dim)
+    x = torch.randn(2, 3, hidden_size)
+    gm = torch_export_to_gm(model, args=(x,), clone=True)
+
+    slice_nodes = [n for n in gm.graph.nodes if is_op(n, torch.ops.aten.slice)]
+    assert len(slice_nodes) == 3, "Expected 3 slice nodes for fused QKV"
+    qkv_node = slice_nodes[0].args[0]
+
+    kv_dim = n_kv_heads * head_dim
+    assert _determine_fused_weight_dims([qkv_node]) == [n_heads * head_dim, kv_dim, kv_dim]
+
+
 def _run_sharding_execution_job(
     model_cls: nn.Module,
     dist_op_expected: str,