Merge branch 'develop' into opt_mtp_logprob

Sunny-bot1 · web-flow · commit ccca0076add9 · 2026-05-25T18:25:12.000+08:00
diff --git a/.github/workflows/_xpu_4cards_case_test.yml b/.github/workflows/_xpu_4cards_case_test.yml
@@ -230,6 +230,7 @@ jobs:
 
       - name: Upload case logs
         if: always()
+        continue-on-error: true
         uses: actions/upload-artifact@v6
         with:
           name: xpu-4cards-case-logs
diff --git a/.github/workflows/_xpu_8cards_case_test.yml b/.github/workflows/_xpu_8cards_case_test.yml
@@ -218,6 +218,7 @@ jobs:
 
       - name: Upload case logs
         if: always()
+        continue-on-error: true
         uses: actions/upload-artifact@v6
         with:
           name: xpu-8cards-case-logs
diff --git a/.github/workflows/ci_metax.yml b/.github/workflows/ci_metax.yml
@@ -7,7 +7,6 @@ on:
       - synchronize
     branches:
       - develop
-      - release/**
 
 permissions:
   contents: read
@@ -19,8 +18,7 @@ concurrency:
 jobs:
   trigger-jenkins:
     name: Trigger Jenkins for PR
-    runs-on:
-      group: APPROVAL
+    runs-on: [self-hosted, XPU-P800]
     environment: Metax_ci
 
     steps:
diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py
@@ -446,7 +446,7 @@ def apply_tp(
                 gate_out = gate_out.cast("float32")
             if fc1_latent_proj is not None:
                 x = fc1_latent_proj(x)
-            gate_out, topk_weights, topk_idx = get_moe_scores(
+            gate_out, _, __ = get_moe_scores(
                 gate_out,
                 layer.n_group,
                 layer.topk_group,
@@ -458,11 +458,6 @@ def apply_tp(
                 use_fused_cast=use_fused,
             )
 
-            if layer.routed_scaling_factor_learnable:
-                safe_topk_indices = paddle.clip(topk_idx, min=0)
-                gathered_scales = F.embedding(safe_topk_indices, layer.per_expert_scale.unsqueeze(1)).squeeze(-1)
-                topk_weights = topk_weights * gathered_scales
-
             (
                 permute_input,
                 token_nums_per_expert,
@@ -484,6 +479,12 @@ def apply_tp(
                 self.moe_quant_type,
                 topk_only_mode=True,
             )
+
+            if layer.routed_scaling_factor_learnable:
+                safe_topk_indices = paddle.clip(topk_idx, min=0)
+                gathered_scales = F.embedding(safe_topk_indices, layer.per_expert_scale.unsqueeze(1)).squeeze(-1)
+                topk_weights = topk_weights * gathered_scales
+
         else:
             gate_out = gate_out.cast("float32")
             if fc1_latent_proj is not None:
diff --git a/fastdeploy/model_executor/layers/rotary_embedding.py b/fastdeploy/model_executor/layers/rotary_embedding.py
@@ -268,7 +268,7 @@ def forward(
         return query, key
 
 
-class GptOssScalingRotaryEmbedding:
+class YarnScalingRotaryEmbedding:
     def __init__(
         self,
         rotary_dim,
@@ -345,10 +345,29 @@ def get_rope_impl(
         rotary_emb_layer = QwenRotaryEmbedding(rotary_dim, base, partial_rotary_factor)
         rotary_emb = rotary_emb_layer(position_ids)
     elif architecture.startswith("Glm"):
-        rotary_emb_layer = GlmRotaryEmbedding(rotary_dim, base, partial_rotary_factor)
+        rope_scaling = getattr(model_config, "rope_scaling", None)
+        if (
+            rope_scaling is not None
+            and isinstance(rope_scaling, dict)
+            and rope_scaling.get("rope_type", rope_scaling.get("type", "")) == "yarn"
+            and "factor" in rope_scaling
+        ):
+            yarn_rotary_dim = int(rotary_dim * partial_rotary_factor) if partial_rotary_factor < 1.0 else rotary_dim
+            rotary_emb_layer = YarnScalingRotaryEmbedding(
+                rotary_dim=yarn_rotary_dim,
+                base=base,
+                original_max_position_embeddings=rope_scaling["original_max_position_embeddings"],
+                scale=rope_scaling["factor"],
+                mscale=rope_scaling.get("mscale", 1.0),
+                beta_fast=rope_scaling.get("beta_fast", 32),
+                beta_slow=rope_scaling.get("beta_slow", 1),
+                use_neox_rotary_style=False,
+            )
+        else:
+            rotary_emb_layer = GlmRotaryEmbedding(rotary_dim, base, partial_rotary_factor)
         rotary_emb = rotary_emb_layer(position_ids)
     elif architecture.startswith("GptOss"):
-        rotary_emb_layer = GptOssScalingRotaryEmbedding(
+        rotary_emb_layer = YarnScalingRotaryEmbedding(
             rotary_dim=model_config.head_dim,
             base=model_config.rope_theta,
             original_max_position_embeddings=model_config.rope_scaling["original_max_position_embeddings"],
diff --git a/scripts/run_pre_ce.sh b/scripts/run_pre_ce.sh
@@ -8,11 +8,10 @@ python -m pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/p
 python -m pip install -r requirements.txt
 python -m pip install jsonschema aistudio_sdk==0.3.5
 # Use prebuilt wheel files to install xgrammar==0.1.19 and torch==2.8.0 specifically for the CI environment
-python -m pip install xgrammar==0.1.19 torch==2.8.0
-# python -m pip install  \
-#   https://paddle-qa.bj.bcebos.com/FastDeploy/torch-2.6.0-cp310-cp310-manylinux1_x86_64.whl \
-#   https://paddle-qa.bj.bcebos.com/FastDeploy/triton-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl \
-#   https://paddle-qa.bj.bcebos.com/FastDeploy/xgrammar-0.1.19-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
+ python -m pip install  \
+   https://paddle-qa.bj.bcebos.com/FastDeploy/torch-2.8.0-cp310-cp310-manylinux_2_28_x86_64.whl \
+   https://paddle-qa.bj.bcebos.com/FastDeploy/triton-3.4.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl \
+   https://paddle-qa.bj.bcebos.com/FastDeploy/xgrammar-0.1.19-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
 
 failed_files=()
 run_path="$DIR/../tests/ci_use/"