Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions fastdeploy/model_executor/forward_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ class ForwardMeta:
ids_remove_padding: paddle.Tensor
# Rotary position embedding
rotary_embs: Optional[paddle.Tensor] = None
swa_rotary_embs: Optional[paddle.Tensor] = None

# Whether to use CUDA graph in this step. Used to avoid running CUDA graph during a dummy run or the prefill stage.
step_use_cudagraph: bool = False
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,7 @@ def __init__(
self.sink_size: int = getattr(fd_config.model_config, "sink_size", 0)
self.window_attn_skip_freq: list = getattr(fd_config.model_config, "window_attn_skip_freq", [0])
self.head_wise_swa_ratio: float = getattr(fd_config.model_config, "head_wise_swa_ratio", 0.0)
self.swa_rope_theta = getattr(fd_config.model_config, "swa_rope_theta", None)

self.head_wise_full_hidden = 0
if self.head_wise_swa_ratio > 0.0:
Expand Down Expand Up @@ -320,8 +321,11 @@ def forward_mixed(
forward_meta.rotary_embs = self._get_identity_rotary_embs(forward_meta.rotary_embs)

sliding_window = 0
rotary_embs = forward_meta.rotary_embs
if len(self.window_attn_skip_freq) > 1 and self.window_attn_skip_freq[layer.layer_id] == 1:
sliding_window = self.sliding_window if self.sliding_window > 0 else layer.sliding_window
if self.swa_rope_theta is not None:
rotary_embs = forward_meta.swa_rotary_embs

This comment was marked as outdated.


norm_after_rope_in_kernel = not getattr(layer, "qk_norm_before_rope", False)
q_norm_weight = getattr(layer, "q_norm_weight", None) if norm_after_rope_in_kernel else None
Expand Down Expand Up @@ -401,8 +405,8 @@ def forward_mixed(
assert forward_meta.rotary_embs.shape[0] == 2
do_rope(
qkv,
forward_meta.rotary_embs[0],
forward_meta.rotary_embs[1],
rotary_embs[0],
rotary_embs[1],
forward_meta.cu_seqlens_q,
forward_meta.seq_lens_decoder,
forward_meta.batch_id_per_token,
Expand Down Expand Up @@ -476,7 +480,7 @@ def forward_mixed(
forward_meta.decoder_num_blocks_cpu,
forward_meta.max_len_tensor_cpu,
res,
forward_meta.rotary_embs,
rotary_embs,
forward_meta.attn_mask,
layer.qkv_bias,
layer.qkv_scale,
Expand Down Expand Up @@ -532,7 +536,7 @@ def forward_mixed(
forward_meta.decoder_tile_ids_per_batch,
forward_meta.decoder_num_blocks_cpu,
forward_meta.max_len_tensor_cpu,
forward_meta.rotary_embs,
rotary_embs,
forward_meta.attn_mask,
layer.qkv_bias,
layer.qkv_scale,
Expand Down
14 changes: 10 additions & 4 deletions fastdeploy/model_executor/models/deepseek_v3.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,13 +299,21 @@ def __init__(self, fd_config: FDConfig, layer_id: int, prefix: str = "") -> None
self.kv_lora_rank = fd_config.model_config.kv_lora_rank

# swa
self.swa_layer_list = getattr(fd_config.model_config, "window_attn_skip_freq", None)
self.window_attn_skip_freq = getattr(fd_config.model_config, "window_attn_skip_freq", None)
self.sliding_window = getattr(fd_config.model_config, "sliding_window", 0)
self.swa_rope_theta = getattr(fd_config.model_config, "swa_rope_theta", None)

self.attn_softmax_scale = self.qk_head_dim**-0.5

if fd_config.model_config.model_type == "glm_moe_dsa":
self.rope_theta = fd_config.model_config.rope_parameters["rope_theta"]

if (

This comment was marked as outdated.

self.window_attn_skip_freq is not None
and self.window_attn_skip_freq[self.layer_id] == 1
and self.swa_rope_theta is not None
):
self.rope_theta = self.swa_rope_theta
else:
self.rope_theta = fd_config.model_config.rope_theta

Expand Down Expand Up @@ -525,9 +533,7 @@ def forward(
need_do_prefill = forward_meta.max_len_tensor_cpu[1] > 0
need_do_decode = forward_meta.max_len_tensor_cpu[2] > 0

window_attn_skip_freq = getattr(self.fd_config.model_config, "window_attn_skip_freq", None)

if window_attn_skip_freq is not None and window_attn_skip_freq[self.layer_id] == 1:
if self.window_attn_skip_freq is not None and self.window_attn_skip_freq[self.layer_id] == 1:
attn_out = self.forward_swa_static(
forward_meta=forward_meta,
query_nope=query_nope,
Expand Down
1 change: 1 addition & 0 deletions fastdeploy/worker/gpu_model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -1466,6 +1466,7 @@ def initialize_forward_meta(self, is_dummy_or_profile_run=False):
self.forward_meta = ForwardMeta(
ids_remove_padding=self.share_inputs["ids_remove_padding"],
rotary_embs=self.share_inputs["rope_emb"],
swa_rotary_embs=self.share_inputs["swa_rope_emb"],
attn_backend=self.attn_backends[0],
attn_backends=self.attn_backends,
decoder_batch_ids=self.share_inputs["decoder_batch_ids"],
Expand Down
37 changes: 37 additions & 0 deletions fastdeploy/worker/input_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,9 @@ def __init__(self, fd_config: FDConfig) -> None:
else:
self.max_chunk_tokens = self.fd_config.get_max_chunk_tokens(self.model_config.mm_max_tokens_per_item)

self.swa_rope_theta = getattr(self.fd_config.model_config, "swa_rope_theta", None)
self.swa_rope_emb = None

def init_share_inputs(self):
max_num_seqs = self.scheduler_config.max_num_seqs

Expand Down Expand Up @@ -245,6 +248,14 @@ def init_share_inputs(self):
model_config=self.model_config,
partial_rotary_factor=self.model_config.partial_rotary_factor,
)
if self.swa_rope_theta is not None:
self.swa_rope_emb = get_rope(
rotary_dim=self.rotary_dim if rotary_percent < 1.0 else self.model_config.head_dim,
position_ids=paddle.arange(self.model_config.max_model_len).reshape((1, -1)),
base=self.swa_rope_theta,
model_config=self.model_config,
partial_rotary_factor=self.model_config.partial_rotary_factor,
)
if self.is_mm_model:
self.image_features = None
self.image_grid_thws = None
Expand Down Expand Up @@ -727,6 +738,14 @@ def reset_share_inputs(self):
model_config=self.model_config,
partial_rotary_factor=self.model_config.partial_rotary_factor,
)
if self.swa_rope_theta is not None:
self.swa_rope_emb = get_rope(
rotary_dim=self.rotary_dim if rotary_percent < 1.0 else self.model_config.head_dim,
position_ids=paddle.arange(self.model_config.max_model_len).reshape((1, -1)),
base=self.swa_rope_theta,
model_config=self.model_config,
partial_rotary_factor=self.model_config.partial_rotary_factor,
)
if self.is_mm_model:
self.image_features = None
self.image_grid_thws = None
Expand Down Expand Up @@ -764,6 +783,8 @@ def __init__(self, fd_config: FDConfig, target_model_input_batch: InputBatch) ->
self.cache_config: CacheConfig = fd_config.cache_config
self.speculative_config: SpeculativeConfig = fd_config.speculative_config
self.enable_pd_reorder: bool = False
self.swa_rope_theta = getattr(self.fd_config.model_config, "swa_rope_theta", None)
self.swa_rope_emb = None

def init_share_inputs(self):
# share with target model
Expand Down Expand Up @@ -829,6 +850,14 @@ def init_share_inputs(self):
model_config=self.model_config,
partial_rotary_factor=self.model_config.partial_rotary_factor,
)
if self.swa_rope_theta is not None:
self.swa_rope_emb = get_rope(
rotary_dim=self.rotary_dim if rotary_percent < 1.0 else self.model_config.head_dim,
position_ids=tmp_position_ids,
base=self.swa_rope_theta,
model_config=self.model_config,
partial_rotary_factor=self.model_config.partial_rotary_factor,
)

# self.caches = self.cache_kvs
# Inherit generation hyperparameters from the main model for consistency
Expand Down Expand Up @@ -1059,6 +1088,14 @@ def reset_model_inputs(self) -> None:
model_config=self.model_config,
partial_rotary_factor=self.model_config.partial_rotary_factor,
)
if self.swa_rope_theta is not None:
self.swa_rope_emb = get_rope(
rotary_dim=self.rotary_dim if rotary_percent < 1.0 else self.model_config.head_dim,
position_ids=tmp_position_ids,
base=self.swa_rope_theta,
model_config=self.model_config,
partial_rotary_factor=self.model_config.partial_rotary_factor,
)

# Reset generation hyperparameters from the main model
self.top_p = self.target_model_input_batch["top_p"]
Expand Down
Loading