Skip to content

Commit b00e9e0

Browse files
authored
【CI】update resume cases (#1687)
* add new validation on resume cases * fix f-string error * update priority * add qwen3.5 case about 8nums vs 16nums * install tilelang * limit version * optimizer code
1 parent 99a9602 commit b00e9e0

10 files changed

Lines changed: 193 additions & 19 deletions

File tree

autotest/cluster/clusterx.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ def execute_task(self, task_config: Dict[str, Any]):
4545
gpus_per_task=resource.get("gpus_per_task", 8),
4646
cpus_per_task=resource.get("cpus_per_task", 32),
4747
memory_per_task=resource.get("memory_per_task", 512),
48-
priority=resource.get("priority", 4),
48+
priority=resource.get("priority", 9),
4949
priority_preemptible=resource.get("preemptible", False),
5050
num_nodes=resource.get("num_nodes", 1),
5151
image=resource.get("image", None),

autotest/config-npu.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ case:
8080
-
8181
type: sft
8282
pre_action:
83-
command: 'python ./autotest/utils/update_meta.py /mnt/hwfile/llmrazor/qa-llm-cicd/test_output'
83+
command: 'python ./autotest/utils/update_meta.py /mnt/hwfile/llmrazor/qa-llm-cicd/test_output npu-qwen3-sft-ep8 sft'
8484
parameters:
8585
config: autotest/config/npu_qwen3_moe_30BA3_ep8_resume.py
8686
output_path: /mnt/hwfile/llmrazor/qa-llm-cicd/test_output

autotest/config.yaml

Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ case:
7979
-
8080
type: sft
8181
pre_action:
82-
command: 'python ./autotest/utils/update_meta.py /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output'
82+
command: 'python ./autotest/utils/update_meta.py /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output qwen3-sft-ep8 sft'
8383
parameters:
8484
config: autotest/config/qwen3_moe_30BA3_ep8_resume.py
8585
output_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output
@@ -101,6 +101,8 @@ case:
101101
lr: 0
102102
memory/max_memory_GB: 0.2
103103
runtime_info/text_tokens: 0
104+
post_action:
105+
command: 'python ./autotest/utils/resume_validation.py /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output qwen3-sft-ep8 sft 10 "runtime_info/text_tokens,runtime_info/efficient_attn_ratio,loss/reduced_balancing_loss,loss/reduced_llm_loss,loss/local_loss"'
104106
timeout: 10800
105107

106108
qwen3-sft-tp2:
@@ -476,8 +478,9 @@ case:
476478
resource:
477479
envs:
478480
- QWEN3_MOE_PATH=/mnt/shared-storage-user/llmrazor-share/model/Qwen3.5-35B-A3B
479-
- ALPACA_PATH=/mnt/shared-storage-user/llmrazor-share/data/ci_resume
481+
- ALPACA_PATH=/mnt/shared-storage-user/llmrazor-share/data/alpaca
480482
- XTUNER_DETERMINISTIC=true
483+
- XTUNER_GC_ENABLE=1
481484
assert_info:
482485
base_metric: qwen3-5-sft-sp4-resume/625c0018/tracker.jsonl
483486
check_metrics:
@@ -493,6 +496,8 @@ case:
493496

494497
-
495498
type: sft
499+
pre_action:
500+
command: 'python ./autotest/utils/update_meta.py /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output qwen3-5-sft-sp4-resume sft'
496501
parameters:
497502
config: autotest/config/qwen3_5_moe_30BA3_sp4.py
498503
output_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output
@@ -502,6 +507,7 @@ case:
502507
- QWEN3_MOE_PATH=/mnt/shared-storage-user/llmrazor-share/model/Qwen3.5-35B-A3B
503508
- ALPACA_PATH=/mnt/shared-storage-user/llmrazor-share/data/alpaca
504509
- XTUNER_DETERMINISTIC=true
510+
- XTUNER_GC_ENABLE=1
505511
assert_info:
506512
base_metric: qwen3-5-sft-sp4-resume/625c0018_resume/tracker.jsonl
507513
check_metrics:
@@ -513,6 +519,31 @@ case:
513519
memory/max_memory_GB: 0.2
514520
runtime_info/tgs: 0.05
515521
runtime_info/text_tokens: 0
522+
post_action:
523+
command: 'python ./autotest/utils/resume_validation.py /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output qwen3-5-sft-sp4-resume sft 10 "runtime_info/text_tokens,runtime_info/efficient_attn_ratio,loss/reduced_balancing_loss,loss/reduced_llm_loss,loss/local_loss"'
524+
timeout: 10800
525+
526+
qwen3-5-sft-8nums-vs-16nums:
527+
-
528+
type: sft
529+
parameters:
530+
config: autotest/config/qwen3_5_sft_8nums.py
531+
output_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output
532+
resource:
533+
envs:
534+
- QWEN3_MOE_PATH=/mnt/shared-storage-user/llmrazor-share/model/Qwen3.5-35B-A3B
535+
- ALPACA_PATH=/mnt/shared-storage-user/llmrazor-share/data/alpaca
536+
- CACHE_DIR=/mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/.cache_qwen3_5
537+
- XTUNER_DETERMINISTIC=true
538+
assert_info:
539+
base_metric: qwen3-5-sft-8nums-vs-16nums/147cb2ee/tracker.jsonl
540+
check_metrics:
541+
grad_norm: 0.02
542+
loss/maxvio: 0.05
543+
loss/local_loss: 0.02
544+
loss/reduced_balancing_loss: 0.02
545+
loss/reduced_llm_loss: 0.02
546+
lr: 0
516547
timeout: 10800
517548

518549
qwen3-rl-lmdeploy:

autotest/config/qwen3_5_moe_30BA3_sp4.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,4 +53,6 @@
5353
work_dir=f"{os.environ['WORK_DIR']}",
5454
seed=0,
5555
resume_cfg=ResumeConfig(auto_resume=True),
56+
checkpoint_interval=10,
57+
checkpoint_maxkeep=2,
5658
)
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
import os
2+
3+
from xtuner.v1.config import (
4+
AdamWConfig,
5+
FSDPConfig,
6+
LRConfig,
7+
)
8+
from xtuner.v1.datasets.config import DataloaderConfig, DatasetConfig
9+
from xtuner.v1.datasets.sft_tokenize_fn import OpenaiTokenizeFunctionConfig
10+
from xtuner.v1.loss.ce_loss import CELossConfig
11+
from xtuner.v1.model import Qwen3_5_VLMoE35BA3Config
12+
from xtuner.v1.train import ResumeConfig, TrainerConfig
13+
14+
15+
# Model, dataset and cache locations are read from the environment; a missing
# variable raises KeyError at import time.
QWEN3_MOE_PATH = os.environ["QWEN3_MOE_PATH"]
ALPACA_PATH = os.environ["ALPACA_PATH"]
CACHE_DIR = os.environ["CACHE_DIR"]


# Model / optimizer / schedule / sharding settings.
moe_cfg = Qwen3_5_VLMoE35BA3Config(compile_cfg=False)
optim_cfg = AdamWConfig(lr=6e-05)
lr_cfg = LRConfig(lr_type="cosine", lr_min=1e-6)
fsdp_cfg = FSDPConfig(torch_compile=True, cpu_offload=False)

# A single dataset entry: the annotation file at ALPACA_PATH, tokenized with
# the "qwen3" chat template.
dataset_config = [
    dict(
        dataset=DatasetConfig(
            name="alpaca",
            anno_path=ALPACA_PATH,
            sample_ratio=1.0,
            cache_dir=CACHE_DIR,
        ),
        tokenize_fn=OpenaiTokenizeFunctionConfig(
            chat_template="qwen3",
            max_length=16384,
        ),
    ),
]

dataloader_config = DataloaderConfig(pack_max_length=16384)

loss_cfg = CELossConfig(
    mode="chunk",
    chunk_size=1024,
    loss_reduction="square",
)


# Top-level trainer configuration assembled from the pieces above.
trainer = TrainerConfig(
    load_from=QWEN3_MOE_PATH,
    tokenizer_path=QWEN3_MOE_PATH,
    model_cfg=moe_cfg,
    optim_cfg=optim_cfg,
    lr_cfg=lr_cfg,
    loss_cfg=loss_cfg,
    fsdp_cfg=fsdp_cfg,
    sp_size=8,
    dataset_cfg=dataset_config,
    dataloader_cfg=dataloader_config,
    global_batch_size=16,
    total_epoch=1,
    work_dir=os.environ["WORK_DIR"],
    seed=0,
    resume_cfg=ResumeConfig(auto_resume=True),
)

autotest/module/train.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,11 @@ def pre_action(config=None):
8484
run_cmd(action_cmd)
8585

8686
def post_action(config=None):
    """Run the optional ``post_action`` shell command declared on a step config.

    Args:
        config: Step configuration mapping. When it holds a ``post_action``
            mapping with a non-empty ``command`` entry, that command is passed
            to ``run_cmd``. ``None``, an empty mapping, or a missing/empty
            ``post_action`` section means there is nothing to run.

    Returns:
        None.
    """
    # The signature defaults ``config`` to None, so guard before calling .get().
    if not config:
        return
    # ``post_action: null`` in the config yields None here; treat it as absent.
    action_info = config.get("post_action") or {}
    action_cmd = action_info.get("command")
    if action_cmd:
        run_cmd(action_cmd)
8892

8993

9094
def get_latest_subdir(work_dir):

autotest/test_all.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -71,5 +71,4 @@ def exec_step_test(step_config, task_executor, context):
7171
assert result, info
7272

7373
# post action
74-
result, info = handler.post_action(step_config.get("type"), step_config)
75-
assert result, info
74+
handler.post_action(step_config.get("type"), step_config)
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
import json
2+
import math
3+
import os
4+
import sys
5+
6+
7+
def get_latest_subdir(work_dir):
    """Return the most recently modified 14-digit-named subdirectory of ``work_dir``.

    Only entries that are directories and whose name consists of exactly
    14 digits are considered. The winner is chosen by filesystem modification
    time, not by the name itself.

    Args:
        work_dir: Directory whose immediate children are scanned.

    Returns:
        The path ``work_dir/<name>`` of the newest matching directory, or
        ``None`` when no entry matches.
    """
    newest_name = None
    newest_mtime = None
    for entry in os.listdir(work_dir):
        entry_path = os.path.join(work_dir, entry)
        if not os.path.isdir(entry_path):
            continue
        if len(entry) != 14 or not entry.isdigit():
            continue
        entry_mtime = os.path.getmtime(entry_path)
        # Strict comparison keeps the first entry seen when mtimes tie.
        if newest_mtime is None or entry_mtime > newest_mtime:
            newest_name = entry
            newest_mtime = entry_mtime

    if newest_name is None:
        return None
    return os.path.join(work_dir, newest_name)
17+
18+
19+
def extract_value(file, center_step, metrics):
    """Collect metric values recorded for the three steps after ``center_step``.

    The tracker file is read as JSON Lines. Every record whose ``step`` falls
    in ``center_step + 1 .. center_step + 3`` contributes one value per
    requested metric; a metric missing from a record contributes ``None``.
    A step may appear in several records, so each metric maps to a list.

    Args:
        file: Path to the tracker ``.jsonl`` file.
        center_step: Step number that precedes the inspected window.
        metrics: Iterable of metric keys to extract from each record.

    Returns:
        A tuple ``(window_steps, by_step)`` where ``window_steps`` is the list
        of the three inspected step numbers and ``by_step`` maps each of those
        steps to ``{metric: [values...]}``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    window_steps = list(range(center_step + 1, center_step + 4))
    by_step = {s: {m: [] for m in metrics} for s in window_steps}
    with open(file, encoding="utf-8") as f:
        for line in f:
            # Blank lines carry no record; json.loads would raise on them and
            # abort the whole validation.
            if not line.strip():
                continue
            obj = json.loads(line)
            row = by_step.get(obj.get("step"))
            if row is None:
                continue
            for m in metrics:
                row[m].append(obj.get(m))
    return window_steps, by_step
33+
34+
35+
def verify_window(path, center_step, metrics):
    """Check that the tracker has consistent metric data right after ``center_step``.

    For every metric, in order, the three steps returned by ``extract_value``
    are checked for:

    1. at least one record per step (otherwise reported as a missing step);
    2. the metric being present in every record (otherwise the first
       offending step is reported as a missing key);
    3. when a step has several records, all values matching the first one
       within a relative tolerance of 1e-6.

    A metric that fails one check is not examined by the later ones. Any
    findings are printed to stderr.

    Args:
        path: Path to the tracker ``.jsonl`` file.
        center_step: Step number that precedes the inspected window.
        metrics: Metric keys to validate.

    Returns:
        ``True`` when no problem was found, ``False`` otherwise.
    """
    window_steps, by_step = extract_value(path, center_step, metrics)

    missing_steps = {}
    missing_keys = []
    not_equal = {}

    for metric in metrics:
        # Check 1: every step in the window needs at least one record.
        absent = [step for step in window_steps if not by_step[step][metric]]
        if absent:
            missing_steps[metric] = absent
            continue

        # Check 2: find the first step where some record lacked the metric.
        first_gap = next(
            (
                step
                for step in window_steps
                if any(value is None for value in by_step[step][metric])
            ),
            None,
        )
        if first_gap is not None:
            missing_keys.append((metric, first_gap))
            continue

        # Check 3: duplicate records for a step must agree with the first one.
        for step in window_steps:
            values = by_step[step][metric]
            if len(values) < 2:
                continue
            reference = values[0]
            if all(
                math.isclose(value, reference, rel_tol=1e-6, abs_tol=0.0)
                for value in values[1:]
            ):
                continue
            not_equal.setdefault(metric, []).append((step, list(values)))

    if not (missing_steps or missing_keys or not_equal):
        return True

    if missing_steps:
        print("Missing step data (no records for this step):", file=sys.stderr)
        for metric, steps in missing_steps.items():
            print(f" {metric}: step {steps}", file=sys.stderr)
    if missing_keys:
        print("Missing key (metric absent in tracker line, value is None):", file=sys.stderr)
        for metric, step in missing_keys:
            print(f" {metric}: step {step}", file=sys.stderr)
    if not_equal:
        print("Inconsistent metric values across duplicate records at the same step:", file=sys.stderr)
        for metric, pairs in not_equal.items():
            details = "; ".join(f"step {step}: {values}" for step, values in pairs)
            print(f" {metric}: " + details, file=sys.stderr)
    return False
82+
83+
if __name__ == "__main__":
    # Usage:
    #   resume_validation.py <root> <case> <type> <center_step> <metric[,metric...]>
    # The run directory is looked up under <root>/$GITHUB_RUN_ID/<case>/<type>.
    if len(sys.argv) < 6:
        sys.exit(
            "usage: resume_validation.py <root> <case> <type> <center_step> <metric[,metric...]>"
        )

    base_dir = f"{sys.argv[1]}/{os.environ['GITHUB_RUN_ID']}/{sys.argv[2]}/{sys.argv[3]}"
    real_dir = get_latest_subdir(base_dir)
    # get_latest_subdir returns None when nothing matches; fail with a clear
    # message instead of a TypeError from os.path.join.
    if real_dir is None:
        sys.exit(f"Resume validation failed: no run directory found under {base_dir}")

    tracker = os.path.join(real_dir, "logs/exp_tracking/rank0/tracker.jsonl")
    center_step = int(sys.argv[4])
    metrics = [name.strip() for name in sys.argv[5].split(",") if name.strip()]
    # An empty metric list would pass vacuously, so reject it explicitly.
    if not metrics:
        sys.exit("Resume validation failed: no metrics were given to check")

    # Exit explicitly rather than via ``assert`` so the check still runs
    # under ``python -O``; the exit status stays non-zero on failure.
    if not verify_window(tracker, center_step, metrics):
        sys.exit("Resume validation failed, see the printed output for details")

autotest/utils/run_cmd.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
1+
import pytest
12
import subprocess
23

34

45
def run_cmd(command):
    """Run ``command`` through the shell and fail the running test on error.

    Args:
        command: Shell command line, executed with ``shell=True``; stdout and
            stderr are captured as text.

    Returns:
        None when the command exits with status 0. Otherwise ``pytest.fail``
        is called with the captured stderr, or with the exception text when
        the command could not be run.
    """
    try:
        completed = subprocess.run(command, shell=True, capture_output=True, text=True)
    except Exception as exc:
        pytest.fail(f"Unknown error: {exc}")
    else:
        if completed.returncode != 0:
            pytest.fail(f"run command error:{completed.stderr}")

autotest/utils/update_meta.py

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -29,15 +29,7 @@ def get_latest_subdir(work_dir):
2929
return os.path.join(work_dir, latest)
3030

3131

32-
device = os.environ.get("DEVICE", "")
33-
if device == "npu":
34-
base_dir = (
35-
f"{sys.argv[1]}/{os.environ['GITHUB_RUN_ID']}/npu-qwen3-sft-ep8/sft"
36-
)
37-
else:
38-
base_dir = (
39-
f"{sys.argv[1]}/{os.environ['GITHUB_RUN_ID']}/qwen3-sft-ep8/sft"
40-
)
32+
base_dir = f"{sys.argv[1]}/{os.environ['GITHUB_RUN_ID']}/{sys.argv[2]}/{sys.argv[3]}"
4133
real_dir = get_latest_subdir(base_dir)
4234
new_meta = {"end": 10, "exp_dir": real_dir, "checkpoint_list": [f"{real_dir}/checkpoints/ckpt-step-10"]}
4335
update_meta(f"{base_dir}/.xtuner", new_meta)

0 commit comments

Comments
 (0)