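Update forward in deepseek_v3.py: return early on idle passes (no prefill and no decode), skipping attention computation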

Linboyan-trc · Linboyan-trc · commit 15a153a248c4 · 2026-05-15T17:30:14.000+08:00
diff --git a/fastdeploy/model_executor/models/deepseek_v3.py b/fastdeploy/model_executor/models/deepseek_v3.py
@@ -362,6 +362,18 @@ def forward(
             fused_read_cache_and_interleave,
         )
 
+        need_do_prefill = forward_meta.max_len_tensor_cpu[1] > 0
+        need_do_decode = forward_meta.max_len_tensor_cpu[2] > 0
+
+        # Idle pass (e.g. CUDAGraph padding): skip all attention computation
+        if not need_do_prefill and not need_do_decode:
+            return self.o_proj(
+                paddle.zeros(
+                    [hidden_states.shape[0], self.num_attention_heads_tp * self.v_head_dim],
+                    dtype=hidden_states.dtype,
+                )
+            )
+
         attn_out = None
         if self.use_gated_attn:
             gate_out = self.gate(hidden_states)
@@ -489,7 +501,6 @@ def forward(
                 attn_out = attn_out * ((F.softsign(gate_out) + 1.0) / 2.0)
             else:
                 raise NotImplementedError(f"{gated_attn_act} not implemented")
-
         output = self.o_proj(attn_out)
         return output