[CE] Fix CE benchmark YAML configs

xiegegege · xiegegege · commit ea59c872ec03 · 2026-06-18T16:56:42.000+08:00
diff --git a/benchmarks/yaml/GLM45-air-32k-bf16-mtp-updatemodel.yaml b/benchmarks/yaml/GLM45-air-32k-bf16-mtp-updatemodel.yaml
diff --git a/benchmarks/yaml/GLM45-air-32k-bf16-mtp.yaml b/benchmarks/yaml/GLM45-air-32k-bf16-mtp.yaml
@@ -5,3 +5,5 @@ graph_optimization_config:
   use_cudagraph: True
   draft_model_use_cudagraph: True
 load_choices: "default_v1"
+max_num_batched_tokens: 4096
+speculative_config: '{"method":"mtp","num_speculative_tokens":1,"num_model_steps":1,"model":"/root/paddlejob/tmpspace/GLM-4.5-Air"}'
diff --git a/benchmarks/yaml/GLM45-air-fa2-mtp-32k-bf16.yaml b/benchmarks/yaml/GLM45-air-fa2-mtp-32k-bf16.yaml
@@ -0,0 +1,10 @@
+max_num_seqs: 128
+max_model_len: 32768
+enable_prefix_caching: True
+disable_custom_all_reduce: True
+graph_optimization_config: '{"use_cudagraph":true,"use_unique_memory_pool":true,"draft_model_use_cudagraph": true}'
+speculative_config: '{"method": "mtp", "num_speculative_tokens": 3, "num_model_steps": 3, "model": "/root/paddlejob/tmpspace/glm_mtp_multi_step", "verify_strategy": "target_match"}'
+tensor_parallel_size: 4
+enable_logprob: True
+moe_gate_fp32: True
+swap_space: 300
diff --git a/benchmarks/yaml/deepseek-32k-tp8-wint4.yaml b/benchmarks/yaml/deepseek-32k-tp8-wint4.yaml
@@ -3,7 +3,7 @@ load_choices: "default_v1"
 graph_optimization_config:
   use_cudagraph: True
   use_unique_memory_pool: True
-enable_prefix_caching: False
 max_num_seqs: 256
 max_model_len: 32768
 tensor_parallel_size: 8
+enable_prefix_caching: False
diff --git a/benchmarks/yaml/eb45-32k-blockwise-fp8-h800-tp8.yaml b/benchmarks/yaml/eb45-32k-blockwise-fp8-h800-tp8.yaml
@@ -5,7 +5,6 @@ quantization: block_wise_fp8
 gpu_memory_utilization: 0.9
 kv_cache_ratio: 0.8
 enable_chunked_prefill: True
-max_num_batched_tokens: 1024
 max_num_partial_prefills: 3
 max_long_partial_prefills: 3
 enable_prefix_caching: True
diff --git a/benchmarks/yaml/eb45-32k-wint4-mtp-h800-tp4.yaml b/benchmarks/yaml/eb45-32k-wint4-mtp-h800-tp4.yaml
@@ -4,3 +4,4 @@ gpu_memory_utilization: 0.8
 kv_cache_ratio: 0.71
 tensor_parallel_size: 4
 quantization: wint4
+speculative_config: '{"method": "mtp", "num_speculative_tokens": 1, "model": "/root/paddlejob/ERNIE-45-Turbo/mtp/"}'
diff --git a/benchmarks/yaml/eb45-32k-wint4-tp1-dp4_ep.yaml b/benchmarks/yaml/eb45-32k-wint4-tp1-dp4_ep.yaml
@@ -5,3 +5,4 @@ data_parallel_size: 4
 tensor_parallel_size: 1
 enable_expert_parallel: True
 quantization: wint4
+max_num_batched_tokens: 4096
diff --git a/benchmarks/yaml/eb45-vl-28b-thinking-32k-wint8.yaml b/benchmarks/yaml/eb45-vl-28b-thinking-32k-wint8.yaml
@@ -6,3 +6,4 @@ reasoning_parser: ernie-45-vl-thinking
 tool_call_parser: ernie-45-vl-thinking
 load_choices: "default_v1"
 mm-processor-kwargs: '{"image_max_pixels": 12845056 }'
+limit_mm_per_prompt: '{"image": 100, "video": 100}'
diff --git a/benchmarks/yaml/eb45-vl-lite-32k-bf16-a800-tp1.yaml b/benchmarks/yaml/eb45-vl-lite-32k-bf16-a800-tp1.yaml
@@ -7,3 +7,4 @@ tensor_parallel_size: 1
 enable_chunked_prefill: True
 max_num_batched_tokens: 384
 reasoning_parser: ernie-45-vl
+limit_mm_per_prompt: '{"image": 100, "video": 100}'
diff --git a/benchmarks/yaml/eb45-vl-lite-32k-wint4-a800-tp1.yaml b/benchmarks/yaml/eb45-vl-lite-32k-wint4-a800-tp1.yaml
@@ -8,3 +8,4 @@ enable_chunked_prefill: True
 max_num_batched_tokens: 384
 quantization: wint4
 reasoning_parser: ernie-45-vl
+limit_mm_per_prompt: '{"image": 100, "video": 100}'
diff --git a/benchmarks/yaml/request_yaml/GLM-32k-rl.yaml b/benchmarks/yaml/request_yaml/GLM-32k-rl.yaml
@@ -0,0 +1 @@
+max_tokens: 32768