Merge remote-tracking branch 'origin/main' into feat/m3-mi300x-blockfp8-clean

Oseltamivir · Oseltamivir · commit 6c29d32d52c4 · 2026-06-18T09:04:43.000+08:00
# Conflicts:
#	perf-changelog.yaml
diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
@@ -2261,15 +2261,8 @@ dsv4-fp4-mi355x-vllm-mtp:
       search-space:
       - { tp: 8, conc-start: 4, conc-end: 512, spec-decoding: mtp }
 
-# Day-0 single-sequence marker for DeepSeek-V4 on ATOM (ROCm/ATOM#650).
-# PR1 of the ATOM DSv4 series still uses torch sparse-attention fallbacks
-# that OOM once warmup/prefill batches multiple requests; keep CONC=1 until
-# the AITER sparse-attention kernel / multi-request path lands upstream.
-# --enforce-eager and ATOM_USE_TRITON_MOE=1 are required on gfx950. Image is
-# the standard atom0.1.2.post MI355X base (matching qwen3.5-fp8-mi355x-atom);
-# the DSv4 PR is overlaid at runtime by dsv4_fp4_mi355x_atom.sh at a pinned SHA.
 dsv4-fp4-mi355x-atom:
-  image: rocm/atom:rocm7.2.4_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.3
+  image: rocm/atom-dev:nightly_202606161823
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: mi355x
@@ -2281,13 +2274,20 @@ dsv4-fp4-mi355x-atom:
     - isl: 1024
       osl: 1024
       search-space:
+      # conc1-64, TP8
+      # conc64-512, DPA
+      # conc1024-2048, DPA TBO
       - { tp: 8, ep: 1, conc-start: 1, conc-end: 64 }
-      - { tp: 8, ep: 1, dp-attn: true, conc-start: 64, conc-end: 1024 }
+      - { tp: 8, ep: 1, dp-attn: true, conc-start: 64, conc-end: 2048 }
     - isl: 8192
       osl: 1024
       search-space:
-      - { tp: 8, ep: 1, conc-start: 1, conc-end: 64 }
-      - { tp: 8, ep: 1, dp-attn: true, conc-start: 64, conc-end: 512 }
+      # conc8-64, TP4; conc1-64, TP8
+      # conc128, DPA
+      # conc256-2048, DPA TBO
+      - { tp: 4, ep: 1, conc-list: [8, 16, 32, 64] }
+      - { tp: 8, ep: 1, conc-list: [1, 2, 4, 8, 16, 32, 64] }
+      - { tp: 8, ep: 1, dp-attn: true, conc-start: 128, conc-end: 2048 }
 
 dsv4-fp4-mi355x-atom-mtp:
   image: rocm/atom:rocm7.2.4_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.3
diff --git a/.github/workflows/recover-pr-1767-ingest.yml b/.github/workflows/recover-pr-1767-ingest.yml
@@ -0,0 +1,172 @@
+name: Recover PR 1767 ingest
+run-name: "Recover PR #1767 ingest from run 27595478969"
+
+on:
+    workflow_dispatch:
+        inputs:
+            confirm:
+                description: "Enter recover-pr-1767 to run the artifact-only recovery"
+                required: true
+                type: string
+
+permissions:
+    actions: read
+    contents: read
+
+jobs:
+    recover-ingest:
+        if: ${{ inputs.confirm == 'recover-pr-1767' }}
+        runs-on: ubuntu-latest
+        env:
+            SOURCE_REPO: SemiAnalysisAI/InferenceX
+            SOURCE_RUN_ID: "27595478969"
+            SOURCE_PR_NUMBER: "1767"
+            SOURCE_HEAD_SHA: 728eb321dd4b1decd81b2d460cb39aa369a0c9c8
+            ORIGINAL_BASE_SHA: d99c824b1c4f0b1b007631191657e458ef2a332c
+            ORIGINAL_MERGE_SHA: 7b9843d3a6e1fe7a2d92d327e25aae57ed3506c5
+        steps:
+            - name: Checkout recovery code
+              uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+              with:
+                  fetch-depth: 0
+
+            # Fail fast unless the source run and its required artifacts are reusable.
+            - name: Validate reusable source run
+              env:
+                  GH_TOKEN: ${{ secrets.REPO_PAT || github.token }}
+              run: |
+                  run_json=$(gh api "repos/${SOURCE_REPO}/actions/runs/${SOURCE_RUN_ID}")
+                  jq -e --arg expected_head "$SOURCE_HEAD_SHA" \
+                    '.event == "pull_request" and
+                     .status == "completed" and .conclusion == "success" and
+                     .path == ".github/workflows/run-sweep.yml" and
+                     .head_sha == $expected_head' \
+                    <<<"$run_json" >/dev/null
+
+                  # The expected head SHA must be one of the PR's own commits.
+                  gh api "repos/${SOURCE_REPO}/pulls/${SOURCE_PR_NUMBER}/commits" \
+                    --paginate --jq '.[].sha' \
+                    | grep -Fxq "$SOURCE_HEAD_SHA"
+
+                  # Paginate: a single page holds at most 100 artifacts and the run may have more.
+                  artifact_names=$(gh api --paginate \
+                    "repos/${SOURCE_REPO}/actions/runs/${SOURCE_RUN_ID}/artifacts?per_page=100" \
+                    --jq '.artifacts[] | select(.expired | not) | .name')
+                  for required in results_bmk eval_results_all run-stats; do
+                      grep -Fxq "$required" <<<"$artifact_names" || { echo "Missing or expired artifact: $required" >&2; exit 1; }
+                  done
+
+            - name: Reconstruct corrected merge configuration
+              run: |
+                  git checkout --detach "$ORIGINAL_MERGE_SHA"
+                  test "$(git rev-parse "${ORIGINAL_MERGE_SHA}^")" = "$ORIGINAL_BASE_SHA"
+
+                  perl -0pi -e \
+                    's/^  - config-keys:\n(    - dsr1-fp8-gb300-dynamo-trt\n)/- config-keys:\n$1/m' \
+                    perf-changelog.yaml
+                  grep -A1 '^- config-keys:$' perf-changelog.yaml \
+                    | grep -q 'dsr1-fp8-gb300-dynamo-trt'
+                  if grep -A1 '^  - config-keys:$' perf-changelog.yaml \
+                    | grep -q 'dsr1-fp8-gb300-dynamo-trt'; then
+                      echo "PR #1767 changelog indentation is still malformed" >&2
+                      exit 1
+                  fi
+
+                  git add perf-changelog.yaml
+                  fixed_tree=$(git write-tree)
+                  fixed_sha=$(printf '%s\n' 'Synthetic corrected PR #1767 merge tree' \
+                    | git -c user.name='InferenceX Recovery' \
+                          -c user.email='actions@users.noreply.github.com' \
+                          commit-tree "$fixed_tree" -p "$ORIGINAL_BASE_SHA")
+
+                  pip install pydantic
+                  python3 utils/process_changelog.py \
+                    --changelog-file perf-changelog.yaml \
+                    --base-ref "$ORIGINAL_BASE_SHA" \
+                    --head-ref "$fixed_sha" \
+                    > "$RUNNER_TEMP/full-config.json"
+                  jq empty "$RUNNER_TEMP/full-config.json"
+
+                  mkdir -p "$RUNNER_TEMP/changelog-metadata"
+                  jq \
+                    --arg base "$ORIGINAL_BASE_SHA" \
+                    --arg head "$ORIGINAL_MERGE_SHA" \
+                    '.changelog_metadata | .base_ref = $base | .head_ref = $head' \
+                    "$RUNNER_TEMP/full-config.json" \
+                    > "$RUNNER_TEMP/changelog-metadata/changelog_metadata.json"
+
+            - name: Download reusable benchmark artifacts
+              env:
+                  GH_TOKEN: ${{ secrets.REPO_PAT || github.token }}
+              run: |
+                  artifacts_dir="$RUNNER_TEMP/source-artifacts"
+                  gh run download "$SOURCE_RUN_ID" \
+                    --repo "$SOURCE_REPO" \
+                    -D "$artifacts_dir"
+
+                  rm -rf "$artifacts_dir/changelog-metadata"
+                  for artifact_dir in "$artifacts_dir"/*; do
+                      [ -e "$artifact_dir" ] || continue
+                      name=$(basename "$artifact_dir")
+                      case "$name" in
+                          results_bmk|eval_results_all|run-stats|bmk_*|eval_*|server_logs_*|multinode_server_logs_*|agentic_aggregated)
+                              ;;
+                          *)
+                              rm -rf "$artifact_dir"
+                              ;;
+                      esac
+                  done
+
+                  mkdir -p "$artifacts_dir/reused-ingest-metadata"
+                  jq -n \
+                    --arg source_run_id "$SOURCE_RUN_ID" \
+                    --arg source_run_attempt "1" \
+                    --arg source_run_url "https://github.com/${SOURCE_REPO}/actions/runs/${SOURCE_RUN_ID}" \
+                    --arg source_pr_number "$SOURCE_PR_NUMBER" \
+                    --arg source_head_sha "$SOURCE_HEAD_SHA" \
+                    --arg ingest_run_id "$GITHUB_RUN_ID" \
+                    --arg ingest_run_attempt "$GITHUB_RUN_ATTEMPT" \
+                    --arg ingest_run_url "$GITHUB_SERVER_URL/$GITHUB_REPOSITORY/actions/runs/$GITHUB_RUN_ID" \
+                    '{
+                      source_run_id: $source_run_id,
+                      source_run_attempt: $source_run_attempt,
+                      source_run_url: $source_run_url,
+                      source_pr_number: $source_pr_number,
+                      source_head_sha: $source_head_sha,
+                      ingest_run_id: $ingest_run_id,
+                      ingest_run_attempt: $ingest_run_attempt,
+                      ingest_run_url: $ingest_run_url
+                    }' \
+                    > "$artifacts_dir/reused-ingest-metadata/reuse_source_run.json"
+
+            - name: Validate reusable artifacts
+              run: |
+                  python3 utils/validate_reusable_sweep_artifacts.py \
+                    --config-json "$RUNNER_TEMP/full-config.json" \
+                    --artifacts-dir "$RUNNER_TEMP/source-artifacts"
+
+            - name: Upload reusable ingest artifacts
+              uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
+              with:
+                  name: reused-ingest-artifacts
+                  path: ${{ runner.temp }}/source-artifacts/*
+
+            - name: Upload corrected changelog metadata
+              uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
+              with:
+                  name: changelog-metadata
+                  path: ${{ runner.temp }}/changelog-metadata/changelog_metadata.json
+
+            - name: Trigger database ingest
+              env:
+                  # Pass the PAT via the environment rather than templating it into the script text.
+                  INFX_FRONTEND_PAT: ${{ secrets.INFX_FRONTEND_PAT }}
+              run: |
+                  curl -sSf -X POST \
+                    -H "Authorization: Bearer ${INFX_FRONTEND_PAT}" \
+                    -H "Accept: application/vnd.github+json" \
+                    https://api.github.com/repos/SemiAnalysisAI/InferenceX-app/dispatches \
+                    -d '{
+                      "event_type": "ingest-results",
+                      "client_payload": {"run-id": "${{ github.run_id }}", "run-attempt": "${{ github.run_attempt }}"}
+                    }'
diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh
@@ -22,31 +22,51 @@ echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTIO
 SERVER_LOG=/workspace/server.log
 
 PARALLEL_ARGS=(-tp "$TP") #TP
+CUDAGRAPH_SIZES='[1, 2, 4, 8, 16, 32, 48, 64, 128, 256, 512]'
 if [ "$DP_ATTENTION" = "true" ]; then
     if [ "$EP_SIZE" -gt 1 ]; then #DP+EP
         PARALLEL_ARGS=(-tp "$TP" --enable-expert-parallel --enable-dp-attention )
-    else #DP+TP
-        PARALLEL_ARGS=(-tp "$TP" --enable-dp-attention )
+    else #DPA+TP
+        # TBO (--enable-tbo) is switched on only at the high-concurrency points:
+        # CONC>=1024 for ISL/OSL 1024/1024 and CONC>=256 for ISL/OSL 8192/1024.
+        if { [ "$ISL" -eq 1024 ] && [ "$OSL" -eq 1024 ] && [ "$CONC" -ge 1024 ]; } ||
+           { [ "$ISL" -eq 8192 ] && [ "$OSL" -eq 1024 ] && [ "$CONC" -ge 256 ]; }; then
+            #DPA+TP+TBO
+            PARALLEL_ARGS=(-tp "$TP" --enable-dp-attention --enable-tbo)
+            export GPU_MAX_HW_QUEUES=5
+        else
+            PARALLEL_ARGS=(-tp "$TP" --enable-dp-attention )
+        fi
     fi
 fi 
 
+BENCHMARK_MAX_MODEL_LEN="$MAX_MODEL_LEN"
+
+if [ "${EVAL_ONLY}" = "true" ]; then
+    EVAL_MAX_MODEL_LEN=$(compute_eval_context_length "$MODEL" "$BENCHMARK_MAX_MODEL_LEN")
+    export EVAL_MAX_MODEL_LEN
+fi
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
 
 set -x
 export ATOM_DISABLE_MMAP=true
 export AITER_BF16_FP8_MOE_BOUND=0
 export ATOM_MOE_GU_ITLV=1
-# TODO: add --no-enable_chunked_prefill, when dsv4 prefix caching is supported 
-#https://github.com/ROCm/ATOM/commit/7df93a181da4d3c3250c2441c7d5e2745a03d0cd#diff-61b1ba0b8b74523530d2d5cdc739d4f3a23a43bedf69015a5235844d46e9373bL1127
+MEM_FRAC_STATIC=0.9
+OPT_ARGS=(--hf-overrides '{"use_index_cache": true, "index_topk_freq": 4}')
+
 python3 -m atom.entrypoints.openai_server \
     --model $MODEL \
     --server-port $PORT \
     "${PARALLEL_ARGS[@]}" \
     --kv_cache_dtype fp8 \
     --trust-remote-code \
-    --gpu-memory-utilization 0.85 \
-    > $SERVER_LOG 2>&1 &
+    --gpu-memory-utilization $MEM_FRAC_STATIC \
+    --no-enable_prefix_caching \
+    --cudagraph-capture-sizes "${CUDAGRAPH_SIZES}" \
+    "${OPT_ARGS[@]}" \
+    > "$SERVER_LOG" 2>&1 &
 
 SERVER_PID=$!
 
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
@@ -3928,6 +3928,14 @@
     - "Runner script launch_gb300-nv.sh: added dynamo-trt-specific glm5-fp4 case with SERVED_MODEL_NAME and SRT_SLURM_MODEL_PREFIX=nvidia/GLM-5-NVFP4"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1798
   
+- config-keys:
+    - dsv4-fp4-mi355x-atom
+  description:
+    - "Update image to rocm/atom:rocm7.2.4_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.4_20260612"
+    - "Update ISL=8192 search-space: TP8-only from conc=4-64, DPA from conc=128-1024 (previously conc=1-64 and DPA conc=64-512)"
+    - "Enable TBO (--enable-tbo) with DP attention at high concurrency (conc>=1024 for ISL=1024, conc>=256 for ISL=8192)"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1717
+
 - config-keys:
     - minimaxm3-fp8-mi300x-vllm
   description: