@@ -332,7 +332,8 @@ gptoss-fp4-h200-vllm:
332332
333333dsr1-fp4-gb200-dynamo-trt :
334334 image : nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3
335- model : deepseek-r1-fp4
335+ # Models are pre-downloaded to this path on GB200 runner to avoid repeated downloading
336+ model : /mnt/lustre01/models/deepseek-r1-0528-fp4-v2
336337 model-prefix : dsr1
337338 runner : gb200
338339 precision : fp4
@@ -773,8 +774,10 @@ dsr1-fp4-gb200-dynamo-trt:
773774 - " DECODE_MTP_SIZE=0"
774775
775776dsr1-fp8-gb200-dynamo-sglang :
776- image : nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1
777- model : deepseek-ai/DeepSeek-R1-0528
777+ image : lmsysorg/sglang:v0.5.5.post2
778+ # model: deepseek-ai/DeepSeek-R1-0528
779+ # Models are pre-downloaded to this path on GB200 runner to avoid repeated downloading
780+ model : /mnt/lustre01/models/deepseek-r1-0528
778781 model-prefix : dsr1
779782 runner : gb200
780783 precision : fp8
@@ -798,6 +801,7 @@ dsr1-fp8-gb200-dynamo-sglang:
798801 additional-settings :
799802 - " PREFILL_NODES=4"
800803 - " N_ADDITIONAL_FRONTENDS=9"
804+ - " SCRIPT_MODE=1k1k-max-tpt"
801805 decode :
802806 num-worker : 1
803807 tp : 1
@@ -819,7 +823,7 @@ dsr1-fp8-gb200-dynamo-sglang:
819823 additional-settings :
820824 - " PREFILL_NODES=1"
821825 - " N_ADDITIONAL_FRONTENDS=9"
822- - " SCRIPT_MODE=1p_4d "
826+ - " SCRIPT_MODE=1k1k-low-latency "
823827 decode :
824828 num-worker : 4
825829 tp : 1
@@ -841,6 +845,7 @@ dsr1-fp8-gb200-dynamo-sglang:
841845 additional-settings :
842846 - " PREFILL_NODES=6"
843847 - " N_ADDITIONAL_FRONTENDS=9"
848+ - " SCRIPT_MODE=1k1k-max-tpt"
844849 decode :
845850 num-worker : 1
846851 tp : 1
@@ -852,22 +857,193 @@ dsr1-fp8-gb200-dynamo-sglang:
852857 - isl : 8192
853858 osl : 1024
854859 search-space :
860+ # Low latency (1 prefill worker at DEP4 and 1 decode worker at DEP4)
861+ - spec-decoding : " none"
862+ conc-list : [ 4, 8, 16, 32, 64, 128, 256, 512 ]
863+ prefill :
864+ num-worker : 1
865+ tp : 1
866+ ep : 1
867+ dp-attn : true
868+ additional-settings :
869+ - " PREFILL_NODES=1"
870+ - " N_ADDITIONAL_FRONTENDS=8"
871+ - " SCRIPT_MODE=8k1k-low-latency"
872+ decode :
873+ num-worker : 1
874+ tp : 1
875+ ep : 1
876+ dp-attn : true
877+ additional-settings :
878+ - " DECODE_NODES=1"
879+
880+ # Middle and top of curve (5 prefill workers each at DEP8 and 1 decode worker at DEP32)
881+ - spec-decoding : " none"
882+ conc-list : [ 512, 1024, 2048, 6144 ]
883+ prefill :
884+ num-worker : 5
885+ tp : 1
886+ ep : 1
887+ dp-attn : true
888+ additional-settings :
889+ - " PREFILL_NODES=10"
890+ - " N_ADDITIONAL_FRONTENDS=8"
891+ - " SCRIPT_MODE=8k1k-max-tpt"
892+ decode :
893+ num-worker : 1
894+ tp : 1
895+ ep : 1
896+ dp-attn : true
897+ additional-settings :
898+ - " DECODE_NODES=8"
899+
900+ dsr1-fp4-gb200-dynamo-sglang :
901+ image : lmsysorg/sglang:v0.5.5.post2
902+ # TODO: confirm the correct upstream model name; the ID on the next line is unverified
903+ # model: deepseek-ai/DeepSeek-R1-0528-fp4-v2
904+ # Models are pre-downloaded to this path on GB200 runner to avoid repeated downloading
905+ model : /mnt/lustre01/models/deepseek-r1-0528-fp4-v2
906+ model-prefix : dsr1
907+ runner : gb200
908+ precision : fp4
909+ framework : dynamo-sglang
910+ multinode : true
911+ disagg : true
912+ seq-len-configs :
913+ - isl : 1024
914+ osl : 1024
915+ search-space :
916+ # Low latency (1 prefill worker at DEP4 and 2 decode workers at DEP4)
917+ - spec-decoding : " none"
918+ conc-list : [ 4, 8, 32, 64 ]
919+ prefill :
920+ num-worker : 1
921+ tp : 1
922+ ep : 1
923+ dp-attn : true
924+ additional-settings :
925+ - " PREFILL_NODES=1"
926+ - " N_ADDITIONAL_FRONTENDS=8"
927+ - " SCRIPT_MODE=1k1k-low-latency"
928+ decode :
929+ num-worker : 2
930+ tp : 1
931+ ep : 1
932+ dp-attn : true
933+ additional-settings :
934+ - " DECODE_NODES=2"
935+
936+ # Mid curve (4 prefill workers each at DEP4 and 1 decode worker at DEP48)
855937 - spec-decoding : " none"
856- conc-list : [ 128, 256, 384, 448, 512, 576, 1024, 2048, 4096 ]
938+ conc-list : [ 512, 1024, 2048, 4096, 8192 ]
939+ prefill :
940+ num-worker : 4
941+ tp : 1
942+ ep : 1
943+ dp-attn : true
944+ additional-settings :
945+ - " PREFILL_NODES=4"
946+ - " N_ADDITIONAL_FRONTENDS=8"
947+ - " SCRIPT_MODE=1k1k-middle-curve"
948+ decode :
949+ num-worker : 1
950+ tp : 1
951+ ep : 1
952+ dp-attn : true
953+ additional-settings :
954+ - " DECODE_NODES=12"
955+
956+ # Top of curve (4 prefill workers each at DEP4 and 1 decode worker at DEP32)
957+ - spec-decoding : " none"
958+ conc-list : [ 8192, 12000, 15000 ]
959+ prefill :
960+ num-worker : 4
961+ tp : 1
962+ ep : 1
963+ dp-attn : true
964+ additional-settings :
965+ - " PREFILL_NODES=4"
966+ - " N_ADDITIONAL_FRONTENDS=8"
967+ - " SCRIPT_MODE=1k1k-max-tpt"
968+ decode :
969+ num-worker : 1
970+ tp : 1
971+ ep : 1
972+ dp-attn : true
973+ additional-settings :
974+ - " DECODE_NODES=8"
975+ - isl : 8192
976+ osl : 1024
977+ search-space :
978+ - spec-decoding : " none"
979+ conc-list : [ 4, 8, 32, 64 ]
980+ prefill :
981+ num-worker : 1
982+ tp : 1
983+ ep : 1
984+ dp-attn : false
985+ additional-settings :
986+ - " PREFILL_NODES=1"
987+ - " N_ADDITIONAL_FRONTENDS=8"
988+ - " SCRIPT_MODE=8k1k-low-latency"
989+ decode :
990+ num-worker : 4
991+ tp : 1
992+ ep : 1
993+ dp-attn : true
994+ additional-settings :
995+ - " DECODE_NODES=4"
996+ - spec-decoding : " none"
997+ conc-list : [ 512, 1024, 2048, 4096 ]
857998 prefill :
858999 num-worker : 6
859- # tp, ep, and dp-attn do nothing because they are hardcoded in the following file:
860- # https://github.com/Elnifio/dynamo/blob/update-result-file-name/components/backends/sglang/slurm_jobs/scripts/gb200-fp8.sh
1000+ tp : 1
1001+ ep : 1
1002+ dp-attn : false
1003+ additional-settings :
1004+ - " PREFILL_NODES=6"
1005+ - " N_ADDITIONAL_FRONTENDS=9"
1006+ - " SCRIPT_MODE=8k1k-middle-curve"
1007+ decode :
1008+ num-worker : 1
8611009 tp : 1
8621010 ep : 1
8631011 dp-attn : true
8641012 additional-settings :
865- - " PREFILL_NODES=12"
1013+ - " DECODE_NODES=12"
1014+ - spec-decoding : " none"
1015+ conc-list : [ 1024, 2048, ]
1016+ prefill :
1017+ num-worker : 10
1018+ tp : 1
1019+ ep : 1
1020+ dp-attn : true
1021+ additional-settings :
1022+ - " PREFILL_NODES=10"
8661023 - " N_ADDITIONAL_FRONTENDS=8"
1024+ - " SCRIPT_MODE=8k1k-max-tpt"
8671025 decode :
8681026 num-worker : 1
8691027 tp : 1
8701028 ep : 1
8711029 dp-attn : true
8721030 additional-settings :
873- - " DECODE_NODES=6"
1031+ - " DECODE_NODES=8"
1032+ - spec-decoding : " none"
1033+ conc-list : [ 8192 ]
1034+ prefill :
1035+ num-worker : 10
1036+ tp : 1
1037+ ep : 1
1038+ dp-attn : true
1039+ additional-settings :
1040+ - " PREFILL_NODES=10"
1041+ - " N_ADDITIONAL_FRONTENDS=8"
1042+ - " SCRIPT_MODE=8k1k-max-tpt"
1043+ decode :
1044+ num-worker : 1
1045+ tp : 1
1046+ ep : 1
1047+ dp-attn : true
1048+ additional-settings :
1049+ - " DECODE_NODES=8"
0 commit comments