diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 3d6a17ff7..afd061718 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1801,7 +1801,7 @@ dsv4-fp4-b200-vllm-agentic: - { tp: 8, ep: 8, dp-attn: true, offloading: cpu, conc-list: [64, 128, 256] } dsv4-fp4-b200-trt: - image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-9aa3715 + image: nvcr.io#nvidia/tensorrt-llm/release:1.3.0rc15.post1 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b200-dsv4 @@ -1814,15 +1814,15 @@ dsv4-fp4-b200-trt: osl: 1024 search-space: - { tp: 8, conc-start: 1, conc-end: 32 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 2048 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 128 } - isl: 8192 osl: 1024 search-space: - { tp: 8, conc-start: 1, conc-end: 32 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 1024 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 256 } dsv4-fp4-b200-trt-mtp: - image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-9aa3715 + image: nvcr.io#nvidia/tensorrt-llm/release:1.3.0rc15.post1 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b200-dsv4 @@ -1835,7 +1835,7 @@ dsv4-fp4-b200-trt-mtp: osl: 1024 search-space: - { tp: 8, conc-start: 1, conc-end: 32, spec-decoding: mtp } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 512, spec-decoding: mtp } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 128, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: diff --git a/perf-changelog.yaml b/perf-changelog.yaml index d69f528a8..cb57b605f 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3459,6 +3459,13 @@ - "Add 1k1k/8k1k FP8 recipe set under benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1648 +- config-keys: + - dsv4-fp4-b200-trt + - dsv4-fp4-b200-trt-mtp + description: 
+ - "Update the B200 DeepSeek-V4-Pro TRT image (non-MTP and MTP) to the official nvcr.io#nvidia/tensorrt-llm/release:1.3.0rc15.post1, replacing the older ghcr.io semianalysisai 9aa3715 build. The official release uses the V1 KV-cache manager (use_kv_cache_manager_v2=False), which avoids the V2 manager's doubling of max_num_requests in the custom feat/deepseek_v4 build that caused OOM at conc-256 with dp-attn: true on B200, and it runs the overlap scheduler natively (mirrors the B300 setup in PR #1636 / run 26999118817). Also lowers the dp-attn conc-end caps in the search space (non-MTP: 2048 to 128, and 1024 to 256 for isl 8192; MTP: 512 to 128)."