【CI】update resume cases (#1687)

kkscilife · web-flow · commit b00e9e0d8375 · 2026-04-23T19:19:35.000+08:00
* add new validation on resume cases

* fix f-string error

* update priority

* add qwen3.5 case about 8nums vs 16nums

* install tilelang

* limit version

* optimize code
diff --git a/autotest/cluster/clusterx.py b/autotest/cluster/clusterx.py
@@ -45,7 +45,7 @@ def execute_task(self, task_config: Dict[str, Any]):
                 gpus_per_task=resource.get("gpus_per_task", 8),
                 cpus_per_task=resource.get("cpus_per_task", 32),
                 memory_per_task=resource.get("memory_per_task", 512),
-                priority=resource.get("priority", 4),
+                priority=resource.get("priority", 9),
                 priority_preemptible=resource.get("preemptible", False),
                 num_nodes=resource.get("num_nodes", 1),
                 image=resource.get("image", None),
diff --git a/autotest/config-npu.yaml b/autotest/config-npu.yaml
@@ -80,7 +80,7 @@ case:
         -
             type: sft
             pre_action:
-                command: 'python ./autotest/utils/update_meta.py /mnt/hwfile/llmrazor/qa-llm-cicd/test_output'
+                command: 'python ./autotest/utils/update_meta.py /mnt/hwfile/llmrazor/qa-llm-cicd/test_output npu-qwen3-sft-ep8 sft'
             parameters:
                 config: autotest/config/npu_qwen3_moe_30BA3_ep8_resume.py
                 output_path: /mnt/hwfile/llmrazor/qa-llm-cicd/test_output
diff --git a/autotest/config.yaml b/autotest/config.yaml
@@ -79,7 +79,7 @@ case:
         -
             type: sft
             pre_action:
-                command: 'python ./autotest/utils/update_meta.py /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output'
+                command: 'python ./autotest/utils/update_meta.py /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output qwen3-sft-ep8 sft'
             parameters:
                 config: autotest/config/qwen3_moe_30BA3_ep8_resume.py
                 output_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output
@@ -101,6 +101,8 @@ case:
                     lr: 0
                     memory/max_memory_GB: 0.2
                     runtime_info/text_tokens: 0
+            post_action:
+                command: 'python ./autotest/utils/resume_validation.py /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output qwen3-sft-ep8 sft 10 "runtime_info/text_tokens,runtime_info/efficient_attn_ratio,loss/reduced_balancing_loss,loss/reduced_llm_loss,loss/local_loss"'
             timeout: 10800
 
     qwen3-sft-tp2:
@@ -476,8 +478,9 @@ case:
             resource:
                 envs:
                     - QWEN3_MOE_PATH=/mnt/shared-storage-user/llmrazor-share/model/Qwen3.5-35B-A3B
-                    - ALPACA_PATH=/mnt/shared-storage-user/llmrazor-share/data/ci_resume
+                    - ALPACA_PATH=/mnt/shared-storage-user/llmrazor-share/data/alpaca
                     - XTUNER_DETERMINISTIC=true
+                    - XTUNER_GC_ENABLE=1
             assert_info:
                 base_metric: qwen3-5-sft-sp4-resume/625c0018/tracker.jsonl
                 check_metrics:
@@ -493,6 +496,8 @@ case:
 
         -
             type: sft
+            pre_action:
+                command: 'python ./autotest/utils/update_meta.py /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output qwen3-5-sft-sp4-resume sft'
             parameters:
                 config: autotest/config/qwen3_5_moe_30BA3_sp4.py
                 output_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output
@@ -502,6 +507,7 @@ case:
                     - QWEN3_MOE_PATH=/mnt/shared-storage-user/llmrazor-share/model/Qwen3.5-35B-A3B
                     - ALPACA_PATH=/mnt/shared-storage-user/llmrazor-share/data/alpaca
                     - XTUNER_DETERMINISTIC=true
+                    - XTUNER_GC_ENABLE=1
             assert_info:
                 base_metric: qwen3-5-sft-sp4-resume/625c0018_resume/tracker.jsonl
                 check_metrics:
@@ -513,6 +519,31 @@ case:
                     memory/max_memory_GB: 0.2
                     runtime_info/tgs: 0.05
                     runtime_info/text_tokens: 0
+            post_action:
+                command: 'python ./autotest/utils/resume_validation.py /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output qwen3-5-sft-sp4-resume sft 10 "runtime_info/text_tokens,runtime_info/efficient_attn_ratio,loss/reduced_balancing_loss,loss/reduced_llm_loss,loss/local_loss"'
+            timeout: 10800
+
+    qwen3-5-sft-8nums-vs-16nums:
+        -
+            type: sft
+            parameters:
+                config: autotest/config/qwen3_5_sft_8nums.py
+                output_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output
+            resource:
+                envs:
+                    - QWEN3_MOE_PATH=/mnt/shared-storage-user/llmrazor-share/model/Qwen3.5-35B-A3B
+                    - ALPACA_PATH=/mnt/shared-storage-user/llmrazor-share/data/alpaca
+                    - CACHE_DIR=/mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/.cache_qwen3_5
+                    - XTUNER_DETERMINISTIC=true
+            assert_info:
+                base_metric: qwen3-5-sft-8nums-vs-16nums/147cb2ee/tracker.jsonl
+                check_metrics:
+                    grad_norm: 0.02
+                    loss/maxvio: 0.05
+                    loss/local_loss: 0.02
+                    loss/reduced_balancing_loss: 0.02
+                    loss/reduced_llm_loss: 0.02
+                    lr: 0
             timeout: 10800
 
     qwen3-rl-lmdeploy:
diff --git a/autotest/config/qwen3_5_moe_30BA3_sp4.py b/autotest/config/qwen3_5_moe_30BA3_sp4.py
@@ -53,4 +53,6 @@
     work_dir=f"{os.environ['WORK_DIR']}",
     seed=0,
     resume_cfg=ResumeConfig(auto_resume=True),
+    checkpoint_interval=10,
+    checkpoint_maxkeep=2,
 )
diff --git a/autotest/config/qwen3_5_sft_8nums.py b/autotest/config/qwen3_5_sft_8nums.py
@@ -0,0 +1,56 @@
+import os
+
+from xtuner.v1.config import (
+    AdamWConfig,
+    FSDPConfig,
+    LRConfig,
+)
+from xtuner.v1.datasets.config import DataloaderConfig, DatasetConfig
+from xtuner.v1.datasets.sft_tokenize_fn import OpenaiTokenizeFunctionConfig
+from xtuner.v1.loss.ce_loss import CELossConfig
+from xtuner.v1.model import Qwen3_5_VLMoE35BA3Config
+from xtuner.v1.train import ResumeConfig, TrainerConfig
+
+
+QWEN3_MOE_PATH = os.environ["QWEN3_MOE_PATH"]
+ALPACA_PATH = os.environ["ALPACA_PATH"]
+CACHE_DIR = os.environ["CACHE_DIR"]
+
+
+moe_cfg = Qwen3_5_VLMoE35BA3Config(compile_cfg=False)
+optim_cfg = AdamWConfig(lr=6e-05)
+lr_cfg = LRConfig(lr_type="cosine", lr_min=1e-6)
+fsdp_cfg = FSDPConfig(
+    torch_compile=True,
+    cpu_offload=False,
+)
+
+dataset_config = [
+    {
+        "dataset": DatasetConfig(name="alpaca", anno_path=ALPACA_PATH, sample_ratio=1.0, cache_dir=CACHE_DIR),
+        "tokenize_fn": OpenaiTokenizeFunctionConfig(chat_template='qwen3', max_length=16384),
+    },
+]
+
+dataloader_config = DataloaderConfig(pack_max_length=16384)
+
+loss_cfg = CELossConfig(mode="chunk", chunk_size=1024, loss_reduction="square")
+
+
+trainer = TrainerConfig(
+    load_from=QWEN3_MOE_PATH,
+    model_cfg=moe_cfg,
+    optim_cfg=optim_cfg,
+    fsdp_cfg=fsdp_cfg,
+    sp_size=8,
+    dataset_cfg=dataset_config,
+    dataloader_cfg=dataloader_config,
+    lr_cfg=lr_cfg,
+    loss_cfg=loss_cfg,
+    tokenizer_path=QWEN3_MOE_PATH,
+    global_batch_size=16,
+    total_epoch=1,
+    work_dir=f"{os.environ['WORK_DIR']}",
+    seed=0,
+    resume_cfg=ResumeConfig(auto_resume=True),
+)
diff --git a/autotest/module/train.py b/autotest/module/train.py
@@ -84,7 +84,11 @@ def pre_action(config=None):
                 run_cmd(action_cmd)
 
     def post_action(config=None):
-        return True, config
+        action_info = config.get("post_action", None)
+        if action_info:
+            action_cmd = action_info.get("command", None)
+            if action_cmd:
+                run_cmd(action_cmd)
 
 
 def get_latest_subdir(work_dir):
diff --git a/autotest/test_all.py b/autotest/test_all.py
@@ -71,5 +71,4 @@ def exec_step_test(step_config, task_executor, context):
     assert result, info
 
     # post action
-    result, info = handler.post_action(step_config.get("type"), step_config)
-    assert result, info
+    handler.post_action(step_config.get("type"), step_config)
diff --git a/autotest/utils/resume_validation.py b/autotest/utils/resume_validation.py
@@ -0,0 +1,89 @@
+import json
+import math
+import os
+import sys
+
+
+def get_latest_subdir(work_dir):
+    dirs = [
+        d
+        for d in os.listdir(work_dir)
+        if os.path.isdir(os.path.join(work_dir, d)) and len(d) == 14 and d.isdigit()
+    ]
+    if not dirs:
+        return None
+    latest = max(dirs, key=lambda d: os.path.getmtime(os.path.join(work_dir, d)))
+    return os.path.join(work_dir, latest)
+
+
+def extract_value(file, center_step, metrics):
+    window_steps = list(range(center_step + 1, center_step + 4))
+    want = frozenset(window_steps)
+    by_step = {s: {m: [] for m in metrics} for s in window_steps}
+    with open(file, encoding="utf-8") as f:
+        for line in f:
+            obj = json.loads(line)
+            s = obj.get("step")
+            if s not in want:
+                continue
+            row = by_step[s]
+            for m in metrics:
+                row[m].append(obj[m] if m in obj else None)
+    return window_steps, by_step
+
+
+def verify_window(path, center_step, metrics):
+    window_steps, by_step = extract_value(path, center_step, metrics)
+    missing_steps = {}
+    missing_keys = []
+    not_equal = {}
+
+    for m in metrics:
+        miss = [s for s in window_steps if not by_step[s][m]]
+        if miss:
+            missing_steps[m] = miss
+            continue
+
+        bad_step = None
+        for s in window_steps:
+            vals = by_step[s][m]
+            if any(v is None for v in vals):
+                bad_step = s
+                break
+        if bad_step is not None:
+            missing_keys.append((m, bad_step))
+            continue
+
+        for s in window_steps:
+            vals = by_step[s][m]
+            if len(vals) > 1:
+                first = vals[0]
+                if any(not math.isclose(v, first, rel_tol=1e-6, abs_tol=0.0) for v in vals[1:]):
+                    not_equal.setdefault(m, []).append((s, list(vals)))
+
+    check_result = not (missing_steps or missing_keys or not_equal)
+    if not check_result:
+        if missing_steps:
+            print("Missing step data (no records for this step):", file=sys.stderr)
+            for m, steps in missing_steps.items():
+                print(f"  {m}: step {steps}", file=sys.stderr)
+        if missing_keys:
+            print("Missing key (metric absent in tracker line, value is None):", file=sys.stderr)
+            for m, s in missing_keys:
+                print(f"  {m}: step {s}", file=sys.stderr)
+        if not_equal:
+            print("Inconsistent metric values across duplicate records at the same step:", file=sys.stderr)
+            for m, pairs in not_equal.items():
+                parts = []
+                for s, vals in pairs:
+                    parts.append(f"step {s}: {vals}")
+                print(f"  {m}: " + "; ".join(parts), file=sys.stderr)
+    return check_result
+
+if __name__ == "__main__":
+    base_dir = f"{sys.argv[1]}/{os.environ['GITHUB_RUN_ID']}/{sys.argv[2]}/{sys.argv[3]}"
+    real_dir = get_latest_subdir(base_dir)
+    tracker = os.path.join(real_dir, "logs/exp_tracking/rank0/tracker.jsonl")
+    center_step = int(sys.argv[4])
+    metrics = sys.argv[5].split(',')
+    assert verify_window(tracker, center_step, metrics), "Resume validation failed, see the printed output for details"
diff --git a/autotest/utils/run_cmd.py b/autotest/utils/run_cmd.py
@@ -1,10 +1,11 @@
+import pytest
 import subprocess
 
 
 def run_cmd(command):
     try:
         result = subprocess.run(command, shell=True, capture_output=True, text=True)
-        if result.returncode != 0:
-            print(f"run command error:{result.stderr}")
     except Exception as e:
-        print(f"Unknown error: {e}")
+        pytest.fail(f"Unknown error: {e}")
+    if result.returncode != 0:
+        pytest.fail(f"run command error:{result.stderr}")
diff --git a/autotest/utils/update_meta.py b/autotest/utils/update_meta.py
@@ -29,15 +29,7 @@ def get_latest_subdir(work_dir):
     return os.path.join(work_dir, latest)
 
 
-device = os.environ.get("DEVICE", "")
-if device == "npu":
-    base_dir = (
-        f"{sys.argv[1]}/{os.environ['GITHUB_RUN_ID']}/npu-qwen3-sft-ep8/sft"
-    )
-else:
-    base_dir = (
-        f"{sys.argv[1]}/{os.environ['GITHUB_RUN_ID']}/qwen3-sft-ep8/sft"
-    )
+base_dir = f"{sys.argv[1]}/{os.environ['GITHUB_RUN_ID']}/{sys.argv[2]}/{sys.argv[3]}"
 real_dir = get_latest_subdir(base_dir)
 new_meta = {"end": 10, "exp_dir": real_dir, "checkpoint_list": [f"{real_dir}/checkpoints/ckpt-step-10"]}
 update_meta(f"{base_dir}/.xtuner", new_meta)

Original file line number	Diff line number	Diff line change
`@@ -80,7 +80,7 @@ case:`
`80`	`80`	`-`
`81`	`81`	`type: sft`
`82`	`82`	`pre_action:`
`83`		`- command: 'python ./autotest/utils/update_meta.py /mnt/hwfile/llmrazor/qa-llm-cicd/test_output'`
	`83`	`+ command: 'python ./autotest/utils/update_meta.py /mnt/hwfile/llmrazor/qa-llm-cicd/test_output npu-qwen3-sft-ep8 sft'`
`84`	`84`	`parameters:`
`85`	`85`	`config: autotest/config/npu_qwen3_moe_30BA3_ep8_resume.py`
`86`	`86`	`output_path: /mnt/hwfile/llmrazor/qa-llm-cicd/test_output`
Original file line number	Diff line number	Diff line change
`@@ -53,4 +53,6 @@`
`53`	`53`	`work_dir=f"{os.environ['WORK_DIR']}",`
`54`	`54`	`seed=0,`
`55`	`55`	`resume_cfg=ResumeConfig(auto_resume=True),`
	`56`	`+ checkpoint_interval=10,`
	`57`	`+ checkpoint_maxkeep=2,`
`56`	`58`	`)`