-
Notifications
You must be signed in to change notification settings - Fork 196
[NV] Update B300 DSV4 SGLang Pareto sweep #1575
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
a947d18
c183bc8
deee4cc
999175d
fbeb15a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -12,135 +12,104 @@ check_env_vars \ | |
| RANDOM_RANGE_RATIO \ | ||
| RESULT_FILENAME | ||
|
|
||
| # `hf download` creates the target dir if missing and is itself idempotent. | ||
| # When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE: run a plain `hf download` and set MODEL_PATH to MODEL. | ||
| # Either way, MODEL_PATH is what the server is launched with. | ||
| if [[ -n "${MODEL_PATH:-}" ]]; then | ||
| if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then | ||
| hf download "$MODEL" --local-dir "$MODEL_PATH" | ||
| fi | ||
| else | ||
| hf download "$MODEL" | ||
| export MODEL_PATH="$MODEL" | ||
| fi | ||
|
|
||
| if [[ -n "$SLURM_JOB_ID" ]]; then | ||
| echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" | ||
| fi | ||
|
|
||
| # The B300 runner overrides MODEL to a pre-staged /data/models path, so skip | ||
| # `hf download`. Only fetch when MODEL looks like a HF repo ID. | ||
| if [[ "$MODEL" != /* ]]; then | ||
| hf download "$MODEL" | ||
| fi | ||
|
|
||
| nvidia-smi | ||
|
|
||
| # ─── Common env vars (all profiles) ─────────────────────────────────────────── | ||
| export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 | ||
| # Common SGLANG env vars (apply to every config). | ||
| export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1 | ||
| export SGLANG_OPT_USE_JIT_NORM=1 | ||
| export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1 | ||
| export SGLANG_OPT_USE_TOPK_V2=1 | ||
| export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 | ||
|
|
||
| # This config uses a standard sglang dev image (lmsysorg/sglang:nightly-dev-cu13-20260604-14ed9b44), which | ||
| # installs sglang under /sgl-workspace, so launch_b300-nv.sh bind-mounts our repo | ||
| # at /workspace (it only switches to /ix for the deepseek-v4 editable images). | ||
| # Paths in this script are $PWD-relative so they work under either mount dir. | ||
|
|
||
| SERVER_LOG="$PWD/server.log" | ||
| PORT=${PORT:-8888} | ||
|
|
||
| echo "TP: $TP, DP_ATTENTION: $DP_ATTENTION, CONC: $CONC, ISL: $ISL, OSL: $OSL" | ||
|
|
||
| EVAL_CONTEXT_ARGS="" | ||
| if [ "${EVAL_ONLY}" = "true" ]; then | ||
| setup_eval_context | ||
| EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" | ||
| else | ||
| EVAL_CONTEXT_ARGS="--context-length 16384" | ||
| fi | ||
|
|
||
| start_gpu_monitor --output "$PWD/gpu_metrics.csv" | ||
|
|
||
| # ─── Per-concurrency launch profile ────────────────────────────────────────── | ||
| # Each block sets: PARALLEL_ARGS, MEM_FRACTION_STATIC, SWA_FULL_TOKENS_RATIO, | ||
| # and optionally MAX_RUNNING_REQUESTS plus profile-specific env vars. | ||
| # | ||
| # SWA ratio: 1k inputs need more SWA cache headroom than 8k inputs; 0.5 was | ||
| # 1k inputs need more SWA cache headroom on B300 than 8k inputs do; 0.5 was | ||
| # tuned empirically for the 1k1k recipe, while 0.1 is the cookbook default. | ||
| if [[ "$ISL" == "1024" ]]; then | ||
| SWA_FULL_TOKENS_RATIO=0.5 | ||
| else | ||
| SWA_FULL_TOKENS_RATIO=0.1 | ||
| fi | ||
|
|
||
| if [ "$CONC" = "1" ] || [ "$CONC" = "32" ]; then | ||
| # TP-only, no DP attention | ||
| MEM_FRACTION_STATIC=0.90 | ||
| SWA_FULL_TOKENS_RATIO=$([[ "$ISL" == "1024" ]] && echo 0.5 || echo 0.1) | ||
| PARALLEL_ARGS=( | ||
| --moe-runner-backend flashinfer_mxfp4 | ||
| --chunked-prefill-size 8192 | ||
| --disable-flashinfer-autotune | ||
| ) | ||
|
|
||
| elif [ "$CONC" = "512" ]; then | ||
| # DP attention, flashinfer_mxfp4 | ||
| export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1 | ||
| MEM_FRACTION_STATIC=0.94 | ||
| SWA_FULL_TOKENS_RATIO=$([[ "$ISL" == "1024" ]] && echo 0.5 || echo 0.1) | ||
| PARALLEL_ARGS=( | ||
| --dp-size "$TP" | ||
| --enable-dp-attention | ||
| --moe-runner-backend flashinfer_mxfp4 | ||
| --disable-flashinfer-autotune | ||
| --chunked-prefill-size 16384 | ||
| --enable-prefill-delayer | ||
| ) | ||
| # Pick the launch recipe based on the two-line submission frontier: | ||
| # TP8/no-DP-attn for low latency and DEP8/DP-attn for throughput. | ||
|
|
||
| elif [ "$CONC" = "2048" ]; then | ||
| # DP attention, megamoe | ||
| if [ "${DP_ATTENTION}" = "true" ]; then | ||
| export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 | ||
| export SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION=8 | ||
| export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1 | ||
| export SGLANG_OPT_USE_FAST_MASK_EP=1 | ||
| export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1 | ||
| export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1 | ||
| export NVSHMEM_DISABLE_IB=1 | ||
| export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1 | ||
| export SGLANG_LOG_FORWARD_ITERS=1 | ||
| export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8320 | ||
| MEM_FRACTION_STATIC=0.87 | ||
| SWA_FULL_TOKENS_RATIO=0.06 | ||
| MAX_RUNNING_REQUESTS=2560 | ||
| PARALLEL_ARGS=( | ||
| --dp-size "$TP" | ||
| --enable-dp-attention | ||
| --moe-a2a-backend megamoe | ||
| --cuda-graph-max-bs 288 | ||
| --chunked-prefill-size 65536 | ||
| --tokenizer-worker-num 4 | ||
| --enable-prefill-delayer | ||
| ) | ||
| export SGLANG_OPT_USE_ONLINE_COMPRESS=1 | ||
| export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=2048 | ||
| export SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_FP4_ACTS=1 | ||
| export SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_MXF4_KIND=1 | ||
| export SGLANG_EXPERIMENTAL_ENABLE_PIECEWISE_CUDA_GRAPH_MOE_A2A=1 | ||
| export NCCL_MNNVL_ENABLE=1 | ||
| export NCCL_CUMEM_ENABLE=1 | ||
| export MC_FORCE_MNNVL=1 | ||
| export SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True | ||
|
|
||
| elif [ "$CONC" = "4096" ]; then | ||
| # DP attention, megamoe | ||
| export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1 | ||
| export NVSHMEM_DISABLE_IB=1 | ||
| export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1 | ||
| export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8320 | ||
| MEM_FRACTION_STATIC=0.835 | ||
| SWA_FULL_TOKENS_RATIO=0.075 | ||
| MAX_RUNNING_REQUESTS=4352 | ||
| SWA_FULL_TOKENS_RATIO=0.075 | ||
| PARALLEL_ARGS=( | ||
| --dp-size "$TP" | ||
| --enable-dp-attention | ||
| --moe-a2a-backend megamoe | ||
| --cuda-graph-max-bs 544 | ||
| --chunked-prefill-size 65536 | ||
| --enable-mixed-chunk | ||
| --chunked-prefill-size 16384 | ||
| --max-prefill-tokens 16384 | ||
| --tokenizer-worker-num 8 | ||
| --enable-prefill-delayer | ||
| --decode-log-interval 5 | ||
| --stream-interval 30 | ||
| ) | ||
|
|
||
| elif [ "$CONC" = "8192" ]; then | ||
| # DP attention, megamoe | ||
| export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1 | ||
| export NVSHMEM_DISABLE_IB=1 | ||
| export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1 | ||
| export SGLANG_OPT_USE_ONLINE_COMPRESS=1 | ||
| export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8256 | ||
| MEM_FRACTION_STATIC=0.80 | ||
| SWA_FULL_TOKENS_RATIO=0.3 | ||
| MAX_RUNNING_REQUESTS=8192 | ||
| else | ||
| export SGLANG_JIT_DEEPGEMM_PRECOMPILE=1 | ||
| MEM_FRACTION_STATIC=0.90 | ||
| MAX_RUNNING_REQUESTS=512 | ||
| PARALLEL_ARGS=( | ||
| --dp-size "$TP" | ||
| --enable-dp-attention | ||
| --moe-a2a-backend megamoe | ||
| --cuda-graph-max-bs 1088 | ||
| --chunked-prefill-size 65536 | ||
| --tokenizer-worker-num 16 | ||
| --enable-prefill-delayer | ||
| --moe-runner-backend flashinfer_mxfp4 | ||
| --chunked-prefill-size 8192 | ||
| --disable-flashinfer-autotune | ||
| --cuda-graph-max-bs 512 | ||
| --tokenizer-worker-num 8 | ||
| --decode-log-interval 60 | ||
| --stream-interval 30 | ||
| --scheduler-recv-interval 30 | ||
| ) | ||
|
|
||
| else | ||
| echo "ERROR: unsupported CONC=$CONC" >&2 | ||
| exit 1 | ||
| fi | ||
|
|
||
| # Print all SGLANG_* env vars to both the CI step log and server.log so the | ||
|
|
@@ -153,12 +122,12 @@ fi | |
|
|
||
| set -x | ||
| PYTHONNOUSERSITE=1 sglang serve \ | ||
| --model-path $MODEL_PATH --served-model-name $MODEL \ | ||
| --model-path $MODEL \ | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. **Server ignores staged MODEL_PATH** (High Severity): On B300, […] Reviewed by Cursor Bugbot for commit fbeb15a. Configure here. |
||
| --host 0.0.0.0 \ | ||
| --port $PORT \ | ||
| --trust-remote-code \ | ||
| --tp $TP \ | ||
| --max-running-requests "${MAX_RUNNING_REQUESTS:-$(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))}" \ | ||
| --max-running-requests "$MAX_RUNNING_REQUESTS" \ | ||
| --mem-fraction-static "$MEM_FRACTION_STATIC" \ | ||
| --swa-full-tokens-ratio "$SWA_FULL_TOKENS_RATIO" \ | ||
| "${PARALLEL_ARGS[@]}" $EVAL_CONTEXT_ARGS >> $SERVER_LOG 2>&1 & | ||
|
|
@@ -168,7 +137,6 @@ SERVER_PID=$! | |
| wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" | ||
|
|
||
| pip install -q datasets pandas | ||
| pip install -q --upgrade transformers | ||
|
|
||
| run_benchmark_serving \ | ||
| --model "$MODEL" \ | ||
|
|
||


Uh oh!
There was an error while loading. Please reload this page.