Skip to content

Commit 8b33a95

Browse files
authored
【CI】add qwen3.5 cases (#1593)
* add case config
* run case
* run case
* run rl case
* update run info
* fix config
* baseline
* ready to PR
* optimize format
* add qwen3_5 fp8 case
* add recompute case
* add tp case
* add qwen3 dense vl case
* add qwen3 dense vl case
* fix config and set threshold
* add resume case
* bigger cpu
* increase memory
* adjust threshold
* memory
* add rl-vl case
* fix config path
* fix config name
1 parent 9c3d549 commit 8b33a95

11 files changed

Lines changed: 805 additions & 2 deletions

autotest/cluster/clusterx.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ def execute_task(self, task_config: Dict[str, Any]):
5050
num_nodes=resource.get("num_nodes", 1),
5151
image=resource.get("image", None),
5252
no_env=resource.get("no_env", True),
53+
image_pull_policy=resource.get("image_pull_policy","Always"),
5354
)
5455

5556
job_schema = self.cluster.run(params)

autotest/config.yaml

Lines changed: 231 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -277,6 +277,193 @@ case:
277277
runtime_info/text_tokens: 0
278278
timeout: 1080
279279

280+
qwen3-sft-cache:
281+
-
282+
type: sft
283+
parameters:
284+
config: autotest/config/qwen3_sft_cache.py
285+
output_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output
286+
resource:
287+
cpus_per_task: 80
288+
envs:
289+
- QWEN3_MOE_PATH=/mnt/shared-storage-user/llmrazor-share/model/Qwen3-30B-A3B
290+
- ALPACA_PATH=/mnt/shared-storage-user/llmrazor-share/data/alpaca
291+
- CACHE_DIR=/mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/.cache
292+
- XTUNER_DETERMINISTIC=true
293+
assert_info:
294+
base_metric: qwen3-sft-cache/e968368a/tracker.jsonl
295+
check_metrics:
296+
grad_norm: 0.000001
297+
loss/reduced_llm_loss: 0.000001
298+
lr: 0
299+
memory/max_memory_GB: 0.2
300+
runtime_info/tgs: 0.05
301+
runtime_info/text_tokens: 0
302+
timeout: 10800
303+
304+
qwen3-sft-vl-dense:
305+
-
306+
type: sft
307+
parameters:
308+
config: autotest/config/qwen3_vl_8B_dense.py
309+
output_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output
310+
resource:
311+
envs:
312+
- MODEL_PATH=/mnt/shared-storage-user/llmrazor-share/model/Qwen3-VL-8B-Instruct
313+
- DATA_PATH=/mnt/shared-storage-user/llmrazor-share/data/ci_vl
314+
- MEDIA_ROOT=/mnt/shared-storage-user/llmrazor-share/data/ci_vl
315+
- XTUNER_DETERMINISTIC=true
316+
assert_info:
317+
base_metric: qwen3-sft-vl-dense/812c1021/tracker.jsonl
318+
check_metrics:
319+
grad_norm: 0.000001
320+
loss/reduced_llm_loss: 0.000001
321+
lr: 0
322+
memory/max_memory_GB: 0.2
323+
runtime_info/tgs: 0.05
324+
runtime_info/text_tokens: 0
325+
timeout: 10800
326+
327+
qwen3-5-sft-vl-moe:
328+
-
329+
type: sft
330+
parameters:
331+
config: autotest/config/qwen3_5_35B_sft_vl.py
332+
output_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output
333+
resource:
334+
cpus_per_task: 80
335+
envs:
336+
- MODEL_PATH=/mnt/shared-storage-user/llmrazor-share/model/Qwen3.5-35B-A3B
337+
- DATA_PATH=/mnt/shared-storage-user/llmrazor-share/data/ci_vl
338+
- MEDIA_ROOT=/mnt/shared-storage-user/llmrazor-share/data/ci_vl
339+
- XTUNER_DETERMINISTIC=true
340+
assert_info:
341+
base_metric: qwen3-5-sft-vl-moe/e968368a/tracker.jsonl
342+
check_metrics:
343+
grad_norm: 5
344+
loss/reduced_llm_loss: 5
345+
lr: 0
346+
memory/max_memory_GB: 0.2
347+
runtime_info/tgs: 0.05
348+
runtime_info/text_tokens: 0
349+
timeout: 10800
350+
351+
qwen3-5-sft-fp8:
352+
-
353+
type: sft
354+
parameters:
355+
config: autotest/config/qwen3_5_fp8.py
356+
output_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output
357+
resource:
358+
num_nodes: 1
359+
cpus_per_task: 80
360+
envs:
361+
- QWEN3_MOE_PATH=/mnt/shared-storage-user/llmrazor-share/model/Qwen3.5-35B-A3B
362+
- ALPACA_PATH=/mnt/shared-storage-user/llmrazor-share/data/alpaca
363+
- XTUNER_DETERMINISTIC=true
364+
- XTUNER_ACTIVATION_OFFLOAD=1
365+
- XTUNER_USE_FA3=1
366+
assert_info:
367+
base_metric: qwen3-5-sft-fp8/625c0018/tracker.jsonl
368+
check_metrics:
369+
grad_norm: 0.1
370+
loss/reduced_llm_loss: 0.000001
371+
lr: 0
372+
memory/max_memory_GB: 0.2
373+
runtime_info/tgs: 0.05
374+
runtime_info/text_tokens: 0
375+
timeout: 10800
376+
377+
qwen3-5-sft-recompute:
378+
-
379+
type: sft
380+
parameters:
381+
config: autotest/config/qwen3_5_recompute.py
382+
output_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output
383+
resource:
384+
num_nodes: 2
385+
cpus_per_task: 80
386+
envs:
387+
- QWEN3_MOE_PATH=/mnt/shared-storage-user/llmrazor-share/model/Qwen3.5-35B-A3B
388+
- ALPACA_PATH=/mnt/shared-storage-user/llmrazor-share/data/alpaca
389+
- XTUNER_DETERMINISTIC=true
390+
assert_info:
391+
base_metric: qwen3-5-sft-recompute/625c0018/tracker.jsonl
392+
check_metrics:
393+
grad_norm: 0.000001
394+
loss/reduced_llm_loss: 0.000001
395+
lr: 0
396+
memory/max_memory_GB: 0.2
397+
runtime_info/tgs: 0.05
398+
runtime_info/text_tokens: 0
399+
timeout: 10800
400+
401+
qwen3-5-sft-tp2:
402+
-
403+
type: sft
404+
parameters:
405+
config: autotest/config/qwen3_5_moe_30BA3_tp2.py
406+
output_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output
407+
resource:
408+
envs:
409+
- QWEN3_MOE_PATH=/mnt/shared-storage-user/llmrazor-share/model/Qwen3.5-35B-A3B
410+
- ALPACA_PATH=/mnt/shared-storage-user/llmrazor-share/data/alpaca
411+
- XTUNER_DETERMINISTIC=true
412+
assert_info:
413+
base_metric: qwen3-5-sft-tp2/625c0018/tracker.jsonl
414+
check_metrics:
415+
grad_norm: 0.05
416+
loss/reduced_llm_loss: 0.000001
417+
lr: 0
418+
memory/max_memory_GB: 0.2
419+
runtime_info/tgs: 0.05
420+
runtime_info/text_tokens: 0
421+
timeout: 10800
422+
423+
qwen3-5-sft-sp4-resume:
424+
-
425+
type: sft
426+
parameters:
427+
config: autotest/config/qwen3_5_moe_30BA3_sp4.py
428+
output_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output
429+
resource:
430+
envs:
431+
- QWEN3_MOE_PATH=/mnt/shared-storage-user/llmrazor-share/model/Qwen3.5-35B-A3B
432+
- ALPACA_PATH=/mnt/shared-storage-user/llmrazor-share/data/ci_resume
433+
- XTUNER_DETERMINISTIC=true
434+
assert_info:
435+
base_metric: qwen3-5-sft-sp4-resume/625c0018/tracker.jsonl
436+
check_metrics:
437+
grad_norm: 0.02
438+
loss/reduced_llm_loss: 0.000001
439+
lr: 0
440+
memory/max_memory_GB: 0.2
441+
runtime_info/tgs: 0.05
442+
runtime_info/text_tokens: 0
443+
timeout: 10800
444+
445+
-
446+
type: sft
447+
parameters:
448+
config: autotest/config/qwen3_5_moe_30BA3_sp4.py
449+
output_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output
450+
resource:
451+
memory_per_task: 1200
452+
envs:
453+
- QWEN3_MOE_PATH=/mnt/shared-storage-user/llmrazor-share/model/Qwen3.5-35B-A3B
454+
- ALPACA_PATH=/mnt/shared-storage-user/llmrazor-share/data/alpaca
455+
- XTUNER_DETERMINISTIC=true
456+
assert_info:
457+
base_metric: qwen3-5-sft-sp4-resume/625c0018_resume/tracker.jsonl
458+
check_metrics:
459+
grad_norm: 0.02
460+
loss/reduced_llm_loss: 0.000001
461+
lr: 0
462+
memory/max_memory_GB: 0.2
463+
runtime_info/tgs: 0.05
464+
runtime_info/text_tokens: 0
465+
timeout: 10800
466+
280467
qwen3-rl-lmdeploy:
281468
-
282469
type: rl
@@ -319,3 +506,47 @@ case:
319506
method: absolute
320507
operator: <
321508
timeout: 2460
509+
510+
qwen3-rl-vl-lmdeploy:
511+
-
512+
type: rl
513+
parameters:
514+
config: autotest/config/rl_qwen3_vl_geometry3k_grpo.py
515+
infer_backend: lmdeploy
516+
output_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output
517+
resource:
518+
envs:
519+
- MODEL_PATH=/mnt/shared-storage-user/llmrazor-share/model/Qwen3-VL-8B-Instruct
520+
- DATA_PATH=/mnt/shared-storage-user/llmrazor-share/data/less_geometry3k/train.jsonl
521+
- EVAL_DATA_PATH=/mnt/shared-storage-user/llmrazor-share/data/less_geometry3k/test.jsonl
522+
- MEDIA_ROOT=/mnt/shared-storage-user/llmrazor-share/data/less_geometry3k
523+
- XTUNER_DETERMINISTIC=true
524+
assert_info:
525+
base_metric: qwen3-rl-vl-lmdeploy/fb28789f/tracker.jsonl
526+
check_metrics:
527+
-
528+
metric: eval/accuracy
529+
threshold: 0.1
530+
method: absolute
531+
operator: <
532+
-
533+
metric: response/rewards/mean
534+
threshold: 0.1
535+
method: absolute
536+
operator: <
537+
-
538+
metric: mismatch/mismatch_k3_kl
539+
threshold: 0.0001
540+
method: absolute
541+
operator: <=
542+
-
543+
metric: response/response_len/mean
544+
threshold: 0.12
545+
method: relative
546+
operator: <
547+
-
548+
metric: time/step
549+
threshold: 10
550+
method: absolute
551+
operator: <
552+
timeout: 4200

autotest/config/gptoss.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,8 @@
55
FSDPConfig,
66
LRConfig,
77
)
8-
from xtuner.v1.datasets import FTDPTokenizeFnConfig
98
from xtuner.v1.datasets.config import DataloaderConfig, DatasetConfig
9+
from xtuner.v1.datasets.sft_tokenize_fn import OpenaiTokenizeFunctionConfig
1010
from xtuner.v1.loss.ce_loss import CELossConfig
1111
from xtuner.v1.model.moe.gpt_oss import GptOss21BA3P6Config
1212
from xtuner.v1.module.rope import RopeScalingConfig
@@ -38,7 +38,7 @@
3838
dataset_config = [
3939
{
4040
"dataset": DatasetConfig(name="alpaca", anno_path=ALPACA_PATH, sample_ratio=1.0),
41-
"tokenize_fn": FTDPTokenizeFnConfig(max_length=16384),
41+
"tokenize_fn": OpenaiTokenizeFunctionConfig(chat_template='gpt-oss', max_length=16384),
4242
},
4343
]
4444

autotest/config/qwen3_5_35B_sft_vl.py

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
import os
2+
3+
from xtuner.v1.config import AdamWConfig, FSDPConfig, LRConfig
4+
from xtuner.v1.datasets import Qwen3VLTokenizeFnConfig
5+
from xtuner.v1.datasets.config import DataloaderConfig, DatasetConfig
6+
from xtuner.v1.loss.ce_loss import CELossConfig
7+
from xtuner.v1.model import Qwen3_5_VLMoE35BA3Config
8+
from xtuner.v1.train import TrainerConfig
9+
10+
11+
MEDIA_ROOT = os.environ["MEDIA_ROOT"]
12+
MODEL_PATH = os.environ["MODEL_PATH"]
13+
DATA_PATH = os.environ["DATA_PATH"]
14+
15+
16+
moe_cfg = Qwen3_5_VLMoE35BA3Config()
17+
18+
optim_cfg = AdamWConfig(lr=6e-05)
19+
lr_cfg = LRConfig(lr_type="cosine", lr_min=1e-6)
20+
fsdp_cfg = FSDPConfig(
21+
torch_compile=True,
22+
cpu_offload=False,
23+
)
24+
25+
dataset_config = [
26+
{
27+
"dataset": DatasetConfig(
28+
name="sft",
29+
anno_path=DATA_PATH,
30+
class_name="VLMJsonlDataset",
31+
media_root=MEDIA_ROOT,
32+
sample_ratio=1.0,
33+
),
34+
"tokenize_fn": Qwen3VLTokenizeFnConfig(
35+
processor_path=MODEL_PATH,
36+
max_length=16384,
37+
add_vision_id=True,
38+
),
39+
},
40+
]
41+
42+
dataloader_config = DataloaderConfig(
43+
dataset_config_list=dataset_config,
44+
pack_max_length=16384,
45+
collator="qwen3_vl_sft_collator",
46+
)
47+
48+
loss_cfg = CELossConfig(mode="chunk", chunk_size=1024)
49+
50+
trainer = TrainerConfig(
51+
load_from=MODEL_PATH,
52+
model_cfg=moe_cfg,
53+
optim_cfg=optim_cfg,
54+
fsdp_cfg=fsdp_cfg,
55+
dataloader_cfg=dataloader_config,
56+
lr_cfg=lr_cfg,
57+
loss_cfg=loss_cfg,
58+
tokenizer_path=MODEL_PATH,
59+
global_batch_size=16,
60+
total_epoch=1,
61+
work_dir=f"{os.environ['WORK_DIR']}",
62+
seed=0,
63+
)

autotest/config/qwen3_5_fp8.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
import os
2+
3+
from xtuner.v1.config import (
4+
AdamWConfig,
5+
FSDPConfig,
6+
LRConfig,
7+
)
8+
from xtuner.v1.datasets.config import DataloaderConfig, DatasetConfig
9+
from xtuner.v1.datasets.sft_tokenize_fn import OpenaiTokenizeFunctionConfig
10+
from xtuner.v1.float8.config import Float8Config, ScalingGranularity
11+
from xtuner.v1.loss.ce_loss import CELossConfig
12+
from xtuner.v1.model import Qwen3_5_VLMoE35BA3Config
13+
from xtuner.v1.train import TrainerConfig
14+
15+
16+
QWEN3_MOE_PATH = os.environ["QWEN3_MOE_PATH"]
17+
ALPACA_PATH = os.environ["ALPACA_PATH"]
18+
19+
float8_cfg = Float8Config(
20+
scaling_granularity_gemm=ScalingGranularity.TILEWISE,
21+
scaling_granularity_grouped_gemm=ScalingGranularity.TILEWISE,
22+
)
23+
24+
moe_cfg = Qwen3_5_VLMoE35BA3Config(float8_cfg=float8_cfg)
25+
optim_cfg = AdamWConfig(lr=6e-05)
26+
lr_cfg = LRConfig(lr_type="cosine", lr_min=1e-6)
27+
fsdp_cfg = FSDPConfig(
28+
torch_compile=False,
29+
cpu_offload=False,
30+
)
31+
32+
dataset_config = [
33+
{
34+
"dataset": DatasetConfig(name="alpaca", anno_path=ALPACA_PATH, sample_ratio=1.0),
35+
"tokenize_fn": OpenaiTokenizeFunctionConfig(chat_template='qwen3', max_length=16384),
36+
},
37+
]
38+
39+
dataloader_config = DataloaderConfig(pack_max_length=16384)
40+
41+
loss_cfg = CELossConfig(mode="chunk", chunk_size=1024)
42+
43+
44+
trainer = TrainerConfig(
45+
load_from=QWEN3_MOE_PATH,
46+
model_cfg=moe_cfg,
47+
optim_cfg=optim_cfg,
48+
fsdp_cfg=fsdp_cfg,
49+
dataset_cfg=dataset_config,
50+
dataloader_cfg=dataloader_config,
51+
lr_cfg=lr_cfg,
52+
loss_cfg=loss_cfg,
53+
tokenizer_path=QWEN3_MOE_PATH,
54+
global_batch_size=16,
55+
intra_layer_micro_batch=1,
56+
total_epoch=1,
57+
work_dir=f"{os.environ['WORK_DIR']}",
58+
seed=0,
59+
)

0 commit comments

Comments
 (0)