@@ -798,6 +798,7 @@ dsr1-fp8-gb200-dynamo-sglang:
798798 additional-settings :
799799 - " PREFILL_NODES=4"
800800 - " N_ADDITIONAL_FRONTENDS=9"
801+ - " SCRIPT_MODE=max-tpt"
801802 decode :
802803 num-worker : 1
803804 tp : 1
@@ -852,22 +853,112 @@ dsr1-fp8-gb200-dynamo-sglang:
852853 - isl : 8192
853854 osl : 1024
854855 search-space :
856+ # Low latency (1 prefill worker at DEP4 and 1 decode worker at DEP4)
855857 - spec-decoding : " none"
856- conc-list : [ 128, 256, 384, 448, 512, 576, 1024, 2048, 4096 ]
858+ conc-list : [ 4, 8, 16, 32, 64, 128, 256, 512 ]
857859 prefill :
858- num-worker : 6
859- # tp, ep, and dp-attn do nothing because they are hardcoded in the following file:
860- # https://github.com/Elnifio/dynamo/blob/update-result-file-name/components/backends/sglang/slurm_jobs/scripts/gb200-fp8.sh
860+ num-worker : 1
861861 tp : 1
862862 ep : 1
863863 dp-attn : true
864864 additional-settings :
865- - " PREFILL_NODES=12 "
865+ - " PREFILL_NODES=1 "
866866 - " N_ADDITIONAL_FRONTENDS=8"
867867 decode :
868868 num-worker : 1
869869 tp : 1
870870 ep : 1
871871 dp-attn : true
872872 additional-settings :
873- - " DECODE_NODES=6"
873+ - " DECODE_NODES=1"
874+
875+ # Middle and top of curve (5 prefill workers each at DEP8 and 1 decode worker at DEP32)
876+ - spec-decoding : " none"
877+ conc-list : [ 512, 1024, 2048, 6144 ]
878+ prefill :
879+ num-worker : 5
880+ tp : 1
881+ ep : 1
882+ dp-attn : true
883+ additional-settings :
884+ - " PREFILL_NODES=2"
885+ - " N_ADDITIONAL_FRONTENDS=8"
886+ decode :
887+ num-worker : 1
888+ tp : 1
889+ ep : 1
890+ dp-attn : true
891+ additional-settings :
892+ - " DECODE_NODES=8"
893+
894+ dsr1-fp4-gb200-dynamo-sglang :
895+ # TODO: swap this image
896+ image : nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1
897+ # TODO: confirm the correct model name
898+ model : deepseek-ai/DeepSeek-R1-0528-fp4-v2
899+ model-prefix : dsr1
900+ runner : gb200
901+ precision : fp4
902+ framework : dynamo-sglang
903+ multinode : true
904+ disagg : true
905+ seq-len-configs :
906+ - isl : 1024
907+ osl : 1024
908+ search-space :
909+ # Low latency (1 prefill worker at DEP4 and 2 decode workers at DEP4)
910+ - spec-decoding : " none"
911+ conc-list : [ 4, 8, 32, 64, 112, 128, 256 ]
912+ prefill :
913+ num-worker : 1
914+ tp : 1
915+ ep : 1
916+ dp-attn : true
917+ additional-settings :
918+ - " PREFILL_NODES=1"
919+ - " N_ADDITIONAL_FRONTENDS=8"
920+ decode :
921+ num-worker : 2
922+ tp : 1
923+ ep : 1
924+ dp-attn : true
925+ additional-settings :
926+ - " DECODE_NODES=2"
927+
928+ # Mid curve (1 prefill worker at DEP4 and 1 decode worker at DEP48) -- NOTE(review): decode num-worker below is 2, not 1; confirm which is intended
929+ - spec-decoding : " none"
930+ conc-list : [ 512, 1024, 2048, 4096, 8192 ]
931+ prefill :
932+ num-worker : 1
933+ tp : 1
934+ ep : 1
935+ dp-attn : true
936+ additional-settings :
937+ - " PREFILL_NODES=1"
938+ - " N_ADDITIONAL_FRONTENDS=8"
939+ decode :
940+ num-worker : 2
941+ tp : 1
942+ ep : 1
943+ dp-attn : true
944+ additional-settings :
945+ - " DECODE_NODES=12"
946+
947+ # Top of curve (1 prefill worker at DEP4 and 1 decode worker at DEP32) -- NOTE(review): decode num-worker below is 2, not 1; confirm which is intended
948+ - spec-decoding : " none"
949+ conc-list : [ 8192, 12000, 15000 ]
950+ prefill :
951+ num-worker : 1
952+ tp : 1
953+ ep : 1
954+ dp-attn : true
955+ additional-settings :
956+ - " PREFILL_NODES=1"
957+ - " N_ADDITIONAL_FRONTENDS=8"
958+ decode :
959+ num-worker : 2
960+ tp : 1
961+ ep : 1
962+ dp-attn : true
963+ additional-settings :
964+ - " DECODE_NODES=8"
0 commit comments