Skip to content

Commit ccca007

Browse files
authored
Merge branch 'develop' into opt_mtp_logprob
2 parents f7a1c67 + b336db7 commit ccca007

6 files changed

Lines changed: 36 additions & 17 deletions

File tree

.github/workflows/_xpu_4cards_case_test.yml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -230,6 +230,7 @@ jobs:
230230
231231
- name: Upload case logs
232232
if: always()
233+
continue-on-error: true
233234
uses: actions/upload-artifact@v6
234235
with:
235236
name: xpu-4cards-case-logs

.github/workflows/_xpu_8cards_case_test.yml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -218,6 +218,7 @@ jobs:
218218
219219
- name: Upload case logs
220220
if: always()
221+
continue-on-error: true
221222
uses: actions/upload-artifact@v6
222223
with:
223224
name: xpu-8cards-case-logs

.github/workflows/ci_metax.yml

Lines changed: 1 addition & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,6 @@ on:
77
- synchronize
88
branches:
99
- develop
10-
- release/**
1110

1211
permissions:
1312
contents: read
@@ -19,8 +18,7 @@ concurrency:
1918
jobs:
2019
trigger-jenkins:
2120
name: Trigger Jenkins for PR
22-
runs-on:
23-
group: APPROVAL
21+
runs-on: [self-hosted, XPU-P800]
2422
environment: Metax_ci
2523

2624
steps:

fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py

Lines changed: 7 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -446,7 +446,7 @@ def apply_tp(
446446
gate_out = gate_out.cast("float32")
447447
if fc1_latent_proj is not None:
448448
x = fc1_latent_proj(x)
449-
gate_out, topk_weights, topk_idx = get_moe_scores(
449+
gate_out, _, __ = get_moe_scores(
450450
gate_out,
451451
layer.n_group,
452452
layer.topk_group,
@@ -458,11 +458,6 @@ def apply_tp(
458458
use_fused_cast=use_fused,
459459
)
460460

461-
if layer.routed_scaling_factor_learnable:
462-
safe_topk_indices = paddle.clip(topk_idx, min=0)
463-
gathered_scales = F.embedding(safe_topk_indices, layer.per_expert_scale.unsqueeze(1)).squeeze(-1)
464-
topk_weights = topk_weights * gathered_scales
465-
466461
(
467462
permute_input,
468463
token_nums_per_expert,
@@ -484,6 +479,12 @@ def apply_tp(
484479
self.moe_quant_type,
485480
topk_only_mode=True,
486481
)
482+
483+
if layer.routed_scaling_factor_learnable:
484+
safe_topk_indices = paddle.clip(topk_idx, min=0)
485+
gathered_scales = F.embedding(safe_topk_indices, layer.per_expert_scale.unsqueeze(1)).squeeze(-1)
486+
topk_weights = topk_weights * gathered_scales
487+
487488
else:
488489
gate_out = gate_out.cast("float32")
489490
if fc1_latent_proj is not None:

fastdeploy/model_executor/layers/rotary_embedding.py

Lines changed: 22 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -268,7 +268,7 @@ def forward(
268268
return query, key
269269

270270

271-
class GptOssScalingRotaryEmbedding:
271+
class YarnScalingRotaryEmbedding:
272272
def __init__(
273273
self,
274274
rotary_dim,
@@ -345,10 +345,29 @@ def get_rope_impl(
345345
rotary_emb_layer = QwenRotaryEmbedding(rotary_dim, base, partial_rotary_factor)
346346
rotary_emb = rotary_emb_layer(position_ids)
347347
elif architecture.startswith("Glm"):
348-
rotary_emb_layer = GlmRotaryEmbedding(rotary_dim, base, partial_rotary_factor)
348+
rope_scaling = getattr(model_config, "rope_scaling", None)
349+
if (
350+
rope_scaling is not None
351+
and isinstance(rope_scaling, dict)
352+
and rope_scaling.get("rope_type", rope_scaling.get("type", "")) == "yarn"
353+
and "factor" in rope_scaling
354+
):
355+
yarn_rotary_dim = int(rotary_dim * partial_rotary_factor) if partial_rotary_factor < 1.0 else rotary_dim
356+
rotary_emb_layer = YarnScalingRotaryEmbedding(
357+
rotary_dim=yarn_rotary_dim,
358+
base=base,
359+
original_max_position_embeddings=rope_scaling["original_max_position_embeddings"],
360+
scale=rope_scaling["factor"],
361+
mscale=rope_scaling.get("mscale", 1.0),
362+
beta_fast=rope_scaling.get("beta_fast", 32),
363+
beta_slow=rope_scaling.get("beta_slow", 1),
364+
use_neox_rotary_style=False,
365+
)
366+
else:
367+
rotary_emb_layer = GlmRotaryEmbedding(rotary_dim, base, partial_rotary_factor)
349368
rotary_emb = rotary_emb_layer(position_ids)
350369
elif architecture.startswith("GptOss"):
351-
rotary_emb_layer = GptOssScalingRotaryEmbedding(
370+
rotary_emb_layer = YarnScalingRotaryEmbedding(
352371
rotary_dim=model_config.head_dim,
353372
base=model_config.rope_theta,
354373
original_max_position_embeddings=model_config.rope_scaling["original_max_position_embeddings"],

scripts/run_pre_ce.sh

Lines changed: 4 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -8,11 +8,10 @@ python -m pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/p
88
python -m pip install -r requirements.txt
99
python -m pip install jsonschema aistudio_sdk==0.3.5
1010
# Use prebuilt wheel files to install xgrammar==0.1.19 and torch==2.8.0 specifically for the CI environment
11-
python -m pip install xgrammar==0.1.19 torch==2.8.0
12-
# python -m pip install \
13-
# https://paddle-qa.bj.bcebos.com/FastDeploy/torch-2.6.0-cp310-cp310-manylinux1_x86_64.whl \
14-
# https://paddle-qa.bj.bcebos.com/FastDeploy/triton-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl \
15-
# https://paddle-qa.bj.bcebos.com/FastDeploy/xgrammar-0.1.19-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
11+
python -m pip install \
12+
https://paddle-qa.bj.bcebos.com/FastDeploy/torch-2.8.0-cp310-cp310-manylinux_2_28_x86_64.whl \
13+
https://paddle-qa.bj.bcebos.com/FastDeploy/triton-3.4.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl \
14+
https://paddle-qa.bj.bcebos.com/FastDeploy/xgrammar-0.1.19-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
1615

1716
failed_files=()
1817
run_path="$DIR/../tests/ci_use/"

0 commit comments

Comments
 (0)