[Models] Update SWA RoPE theta for MLA/GQA attention (#8077)

chang-wenbin · web-flow · commit f4eda5aa9388 · 2026-06-26T12:04:08.000+08:00
* update mla_gqa_swa_rope_theta

* update mla_gqa_swa_rope_theta

* update mla_gqa_swa_rope_theta1
diff --git a/fastdeploy/model_executor/forward_meta.py b/fastdeploy/model_executor/forward_meta.py
@@ -69,6 +69,7 @@ class ForwardMeta:
     ids_remove_padding: paddle.Tensor
     # Rotation position embedding
     rotary_embs: Optional[paddle.Tensor] = None
+    swa_rotary_embs: Optional[paddle.Tensor] = None
 
     # Use cuda graph in this step or not. Used to avoid run cuda graph when in dummy run or prefill stage.
     step_use_cudagraph: bool = False
diff --git a/fastdeploy/model_executor/layers/attention/append_attn_backend.py b/fastdeploy/model_executor/layers/attention/append_attn_backend.py
@@ -185,6 +185,7 @@ def __init__(
         self.sink_size: int = getattr(fd_config.model_config, "sink_size", 0)
         self.window_attn_skip_freq: list = getattr(fd_config.model_config, "window_attn_skip_freq", [0])
         self.head_wise_swa_ratio: float = getattr(fd_config.model_config, "head_wise_swa_ratio", 0.0)
+        self.swa_rope_theta = getattr(fd_config.model_config, "swa_rope_theta", None)
 
         self.head_wise_full_hidden = 0
         if self.head_wise_swa_ratio > 0.0:
@@ -320,8 +321,11 @@ def forward_mixed(
             forward_meta.rotary_embs = self._get_identity_rotary_embs(forward_meta.rotary_embs)
 
         sliding_window = 0
+        rotary_embs = forward_meta.rotary_embs
         if len(self.window_attn_skip_freq) > 1 and self.window_attn_skip_freq[layer.layer_id] == 1:
             sliding_window = self.sliding_window if self.sliding_window > 0 else layer.sliding_window
+            if self.swa_rope_theta is not None:
+                rotary_embs = forward_meta.swa_rotary_embs
 
         norm_after_rope_in_kernel = not getattr(layer, "qk_norm_before_rope", False)
         q_norm_weight = getattr(layer, "q_norm_weight", None) if norm_after_rope_in_kernel else None
@@ -401,8 +405,8 @@ def forward_mixed(
             assert forward_meta.rotary_embs.shape[0] == 2
             do_rope(
                 qkv,
-                forward_meta.rotary_embs[0],
-                forward_meta.rotary_embs[1],
+                rotary_embs[0],
+                rotary_embs[1],
                 forward_meta.cu_seqlens_q,
                 forward_meta.seq_lens_decoder,
                 forward_meta.batch_id_per_token,
@@ -476,7 +480,7 @@ def forward_mixed(
                 forward_meta.decoder_num_blocks_cpu,
                 forward_meta.max_len_tensor_cpu,
                 res,
-                forward_meta.rotary_embs,
+                rotary_embs,
                 forward_meta.attn_mask,
                 layer.qkv_bias,
                 layer.qkv_scale,
@@ -532,7 +536,7 @@ def forward_mixed(
                 forward_meta.decoder_tile_ids_per_batch,
                 forward_meta.decoder_num_blocks_cpu,
                 forward_meta.max_len_tensor_cpu,
-                forward_meta.rotary_embs,
+                rotary_embs,
                 forward_meta.attn_mask,
                 layer.qkv_bias,
                 layer.qkv_scale,
diff --git a/fastdeploy/model_executor/models/deepseek_v3.py b/fastdeploy/model_executor/models/deepseek_v3.py
@@ -299,13 +299,21 @@ def __init__(self, fd_config: FDConfig, layer_id: int, prefix: str = "") -> None
         self.kv_lora_rank = fd_config.model_config.kv_lora_rank
 
         # swa
-        self.swa_layer_list = getattr(fd_config.model_config, "window_attn_skip_freq", None)
+        self.window_attn_skip_freq = getattr(fd_config.model_config, "window_attn_skip_freq", None)
         self.sliding_window = getattr(fd_config.model_config, "sliding_window", 0)
+        self.swa_rope_theta = getattr(fd_config.model_config, "swa_rope_theta", None)
 
         self.attn_softmax_scale = self.qk_head_dim**-0.5
 
         if fd_config.model_config.model_type == "glm_moe_dsa":
             self.rope_theta = fd_config.model_config.rope_parameters["rope_theta"]
         else:
             self.rope_theta = fd_config.model_config.rope_theta
+
+        # Layers flagged with 1 in window_attn_skip_freq take the SWA path in forward();
+        # when swa_rope_theta is configured it replaces the base theta chosen above.
+        # Applied as a separate override so the glm_moe_dsa base value is not clobbered.
+        is_swa_layer = self.window_attn_skip_freq is not None and self.window_attn_skip_freq[self.layer_id] == 1
+        if is_swa_layer and self.swa_rope_theta is not None:
+            self.rope_theta = self.swa_rope_theta
 
@@ -525,9 +533,7 @@ def forward(
         need_do_prefill = forward_meta.max_len_tensor_cpu[1] > 0
         need_do_decode = forward_meta.max_len_tensor_cpu[2] > 0
 
-        window_attn_skip_freq = getattr(self.fd_config.model_config, "window_attn_skip_freq", None)
-
-        if window_attn_skip_freq is not None and window_attn_skip_freq[self.layer_id] == 1:
+        if self.window_attn_skip_freq is not None and self.window_attn_skip_freq[self.layer_id] == 1:
             attn_out = self.forward_swa_static(
                 forward_meta=forward_meta,
                 query_nope=query_nope,
diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py
@@ -1466,6 +1466,7 @@ def initialize_forward_meta(self, is_dummy_or_profile_run=False):
         self.forward_meta = ForwardMeta(
             ids_remove_padding=self.share_inputs["ids_remove_padding"],
             rotary_embs=self.share_inputs["rope_emb"],
+            swa_rotary_embs=self.share_inputs["swa_rope_emb"],
             attn_backend=self.attn_backends[0],
             attn_backends=self.attn_backends,
             decoder_batch_ids=self.share_inputs["decoder_batch_ids"],
diff --git a/fastdeploy/worker/input_batch.py b/fastdeploy/worker/input_batch.py
@@ -108,6 +108,9 @@ def __init__(self, fd_config: FDConfig) -> None:
         else:
             self.max_chunk_tokens = self.fd_config.get_max_chunk_tokens(self.model_config.mm_max_tokens_per_item)
 
+        self.swa_rope_theta = getattr(self.fd_config.model_config, "swa_rope_theta", None)
+        self.swa_rope_emb = None
+
     def init_share_inputs(self):
         max_num_seqs = self.scheduler_config.max_num_seqs
 
@@ -245,6 +248,14 @@ def init_share_inputs(self):
                 model_config=self.model_config,
                 partial_rotary_factor=self.model_config.partial_rotary_factor,
             )
+            if self.swa_rope_theta is not None:
+                self.swa_rope_emb = get_rope(
+                    rotary_dim=self.rotary_dim if rotary_percent < 1.0 else self.model_config.head_dim,
+                    position_ids=paddle.arange(self.model_config.max_model_len).reshape((1, -1)),
+                    base=self.swa_rope_theta,
+                    model_config=self.model_config,
+                    partial_rotary_factor=self.model_config.partial_rotary_factor,
+                )
             if self.is_mm_model:
                 self.image_features = None
                 self.image_grid_thws = None
@@ -727,6 +738,14 @@ def reset_share_inputs(self):
                     model_config=self.model_config,
                     partial_rotary_factor=self.model_config.partial_rotary_factor,
                 )
+                if self.swa_rope_theta is not None:
+                    self.swa_rope_emb = get_rope(
+                        rotary_dim=self.rotary_dim if rotary_percent < 1.0 else self.model_config.head_dim,
+                        position_ids=paddle.arange(self.model_config.max_model_len).reshape((1, -1)),
+                        base=self.swa_rope_theta,
+                        model_config=self.model_config,
+                        partial_rotary_factor=self.model_config.partial_rotary_factor,
+                    )
                 if self.is_mm_model:
                     self.image_features = None
                     self.image_grid_thws = None
@@ -764,6 +783,8 @@ def __init__(self, fd_config: FDConfig, target_model_input_batch: InputBatch) ->
         self.cache_config: CacheConfig = fd_config.cache_config
         self.speculative_config: SpeculativeConfig = fd_config.speculative_config
         self.enable_pd_reorder: bool = False
+        self.swa_rope_theta = getattr(self.fd_config.model_config, "swa_rope_theta", None)
+        self.swa_rope_emb = None
 
     def init_share_inputs(self):
         # share with targe model
@@ -829,6 +850,14 @@ def init_share_inputs(self):
             model_config=self.model_config,
             partial_rotary_factor=self.model_config.partial_rotary_factor,
         )
+        if self.swa_rope_theta is not None:
+            self.swa_rope_emb = get_rope(
+                rotary_dim=self.rotary_dim if rotary_percent < 1.0 else self.model_config.head_dim,
+                position_ids=tmp_position_ids,
+                base=self.swa_rope_theta,
+                model_config=self.model_config,
+                partial_rotary_factor=self.model_config.partial_rotary_factor,
+            )
 
         # self.caches = self.cache_kvs
         # Inherit generation hyperparameters from the main model for consistency
@@ -1059,6 +1088,14 @@ def reset_model_inputs(self) -> None:
                 model_config=self.model_config,
                 partial_rotary_factor=self.model_config.partial_rotary_factor,
             )
+            if self.swa_rope_theta is not None:
+                self.swa_rope_emb = get_rope(
+                    rotary_dim=self.rotary_dim if rotary_percent < 1.0 else self.model_config.head_dim,
+                    position_ids=tmp_position_ids,
+                    base=self.swa_rope_theta,
+                    model_config=self.model_config,
+                    partial_rotary_factor=self.model_config.partial_rotary_factor,
+                )
 
             # Reset generation hyperparameters from the main model
             self.top_p = self.target_model_input_batch["top_p"]