SemiAnalysisAI
diff --git a/‎.github/configs/amd-master.yaml‎
Lines changed: 6 additions & 6 deletions b/‎.github/configs/amd-master.yaml‎
Lines changed: 6 additions & 6 deletions
diff --git a/‎.github/configs/nvidia-master.yaml‎
Lines changed: 3 additions & 3 deletions b/‎.github/configs/nvidia-master.yaml‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎.github/workflows/claude.yml‎
Lines changed: 7 additions & 27 deletions b/‎.github/workflows/claude.yml‎
Lines changed: 7 additions & 27 deletions
diff --git a/‎.github/workflows/docker-tag-monitor.yml‎
Lines changed: 93 additions & 3 deletions b/‎.github/workflows/docker-tag-monitor.yml‎
Lines changed: 93 additions & 3 deletions
@@ -368,7 +368,7 @@ qwen3.5-fp8-mi300x-sglang:
       - { tp: 8, conc-start: 4, conc-end: 64 }
 
 glm5-fp8-mi355x-sglang:
-  image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260413
+  image: lmsysorg/sglang-rocm:v0.5.11-rocm720-mi35x-20260513
   model: zai-org/GLM-5-FP8
   model-prefix: glm5
   runner: mi355x
@@ -380,11 +380,11 @@ glm5-fp8-mi355x-sglang:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 8, conc-start: 4, conc-end: 64 }
+      - { tp: 4, conc-start: 4, conc-end: 256 }
     - isl: 8192
       osl: 1024
       search-space:
-      - { tp: 8, conc-start: 4, conc-end: 64 }
+      - { tp: 4, conc-start: 4, conc-end: 256 }
 
 glm5-fp8-mi355x-sglang-mtp:
   image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415
@@ -1644,7 +1644,7 @@ dsv4-fp8-mi355x-vllm:
 # the standard atom0.1.2.post MI355X base (matching qwen3.5-fp8-mi355x-atom);
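 # the DSv4 PR is overlaid at runtime by dsv4_fp4_mi355x_atom.sh at a pinned SHA.
 # NOTE(review): the image below is now a rocm/atom-dev nightly, not the
 # atom0.1.2.post base described in this comment — confirm and refresh it.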
 dsv4-fp4-mi355x-atom:
-  image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post
+  image: rocm/atom-dev:nightly_202605130853
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: mi355x
@@ -1656,8 +1656,8 @@ dsv4-fp4-mi355x-atom:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 }
+      - { tp: 8, ep: 1, conc-start: 1, conc-end: 512 }
     - isl: 8192
       osl: 1024
       search-space:
-      - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 }
+      - { tp: 8, ep: 1, conc-start: 1, conc-end: 512 }
@@ -2228,7 +2228,7 @@ glm5-fp4-b200-sglang:
       - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 }
 
 glm5-fp4-b200-sglang-mtp:
-  image: lmsysorg/sglang:v0.5.10.post1-cu130
+  image: lmsysorg/sglang:v0.5.11-cu130
   model: nvidia/GLM-5-NVFP4
   model-prefix: glm5
   runner: b200
@@ -2438,7 +2438,7 @@ qwen3.5-bf16-b300-sglang-mtp:
       - { tp: 4, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp }
 
 kimik2.5-int4-b200-vllm:
-  image: vllm/vllm-openai:v0.15.1
+  image: vllm/vllm-openai:v0.20.2
   model: moonshotai/Kimi-K2.5
   model-prefix: kimik2.5
   runner: b200
@@ -2481,7 +2481,7 @@ kimik2.5-int4-b300-vllm:
       - { tp: 4, ep: 1, conc-start: 4, conc-end: 64 }
 
 kimik2.5-int4-h200-vllm:
-  image: vllm/vllm-openai:v0.16.0
+  image: vllm/vllm-openai:v0.20.2
   model: moonshotai/Kimi-K2.5
   model-prefix: kimik2.5
   runner: h200
 
@@ -24,7 +24,7 @@ jobs:
         uses: actions/checkout@v6.0.2
         with:
           fetch-depth: 0
-          token: ${{ secrets.PAT_WITH_WORKFLOW_SCOPE }}
+          token: ${{ secrets.CLAUDE_PAT }}
 
       - name: Setup MCP Server
         run: |
@@ -35,16 +35,17 @@ jobs:
         id: claude
         uses: anthropics/claude-code-action@v1
         env:
-          GH_TOKEN: ${{ secrets.PAT_WITH_WORKFLOW_SCOPE }}
+          GH_TOKEN: ${{ secrets.CLAUDE_PAT }}
+          GITHUB_TOKEN: ${{ secrets.CLAUDE_PAT }}
           INFERENCEMAX_ROOT: ${{ github.workspace }}
           BASH_DEFAULT_TIMEOUT_MS: "1800000"
           BASH_MAX_TIMEOUT_MS: "3600000"
         with:
           anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }}
-          github_token: ${{ secrets.PAT_WITH_WORKFLOW_SCOPE }}
+          github_token: ${{ secrets.CLAUDE_PAT }}
           trigger_phrase: "${{ contains(github.event.comment.body || github.event.issue.body || github.event.issue.title || '', '@Klaud-Cold') && '@Klaud-Cold' || '@claude' }}"
           track_progress: true
-          allowed_bots: 'Klaud-Cold'
+          allowed_bots: '*'
           additional_permissions: |
             actions: read
           settings: |
@@ -220,27 +221,7 @@ jobs:
 
             ## Updating perf-changelog.yaml
 
-            When making changes to benchmark scripts or master config files that affect image tags, environment variables, or configuration parameters, you MUST add an entry to `perf-changelog.yaml`.
-
-            **When to update perf-changelog.yaml:**
-            - Updating image tags in `.github/configs/*-master.yaml` or `benchmarks/*.sh` scripts
-            - Adding or modifying environment variables in benchmark configurations
-            - Changing configuration parameters that affect performance
-
-            **Entry format:**
-            ```yaml
-            - config-keys:
-                - dsr1-fp8-*-vllm  # Use wildcards to match multiple configs
-              description:
-                - "Update vLLM image from v0.11.2 to v0.13.0"
-                - "Add VLLM_MXFP4_USE_MARLIN=1 environment variable"
-              pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX
-            ```
-
-            **Guidelines:**
-            - Use wildcards (`*`) in config-keys to match multiple related configurations
-            - Each description item should be a concise change summary
-            - The pr-link should reference the PR number (use XXX as placeholder until PR is created)
+            See `AGENTS.md` → "Updating Docker Images" for entry format and rules. Required whenever you change image tags, env vars, or perf-affecting params in `.github/configs/*-master.yaml` or `benchmarks/*.sh`. Use `XXX` as the PR-link placeholder until the PR exists.
 
             ## Spawning Additional Workers:
             You CAN spawn additional Claude workers by commenting "@claude" with a specific task.
@@ -271,8 +252,7 @@ jobs:
 
             ### Additional Knowledge
             - MI355 is gfx950 not gfx1201
-            - **STP (Single Token Prediction)**: Standard autoregressive decoding — one token per forward pass. No speculative decoding or MTP. Benchmarks labeled "STP only" use vanilla decoding.
-            - **MTP (Multi-Token Prediction)**: Predicts multiple tokens per forward pass using speculative decoding (e.g., EAGLE, NEXTN).
+            - STP/MTP terminology: see `AGENTS.md` → "Terminology"
 
             ### Expert Parallelism in Benchmark Scripts
             vLLM and SGLang handle expert parallelism differently. When writing or reviewing benchmark scripts for MoE models:
 
@@ -1,5 +1,15 @@
 name: Docker Tag Monitor
 
+# Downstream merge note (human-only — intentionally NOT in the @claude prompt):
+#   Once the per-config-key PRs this workflow asks Claude to open have a
+#   green run-sweep.yml run and the `full-sweep-enabled` label, merge them
+#   with `utils/merge_with_reuse.sh <pr-number>` instead of the GitHub UI.
+#   That script posts /reuse-sweep-run, auto-resolves perf-changelog.yaml
+#   conflicts, cancels the merge-triggered sweep, and squash-merges with
+#   --admin so the post-merge run-sweep run reuses the PR's prior sweep.
+#   Claude doesn't have admin merge rights and shouldn't be told about this
+#   path — it's a maintainer-only finalization step.
+
 on:
   schedule:
     - cron: '0 7 * * 6'
@@ -137,6 +147,38 @@ jobs:
 
           echo "has_updates=true" >> "$GITHUB_OUTPUT"
 
+          # Snapshot self-hosted runner availability from the CI tracker.
+          # Used to constrain which SKUs Claude is allowed to open PRs against.
+          # Produces two values for the issue body:
+          #   RUNNERS_TABLE - file holding a markdown table of per-SKU runner counts
+          #   ALLOWED_SKUS  - comma-separated labels of SKUs eligible for PRs
+          #                   (empty when the snapshot is unavailable)
+          RUNNERS_TABLE=$(mktemp)
+          ALLOWED_SKUS_FILE=$(mktemp)
+          # The fetch AND a shape check both sit in the `if` condition, so an
+          # unreachable tracker, a non-JSON response, or a payload without a
+          # `.skus` array all take the fallback branch. Without the shape check a
+          # bad payload would make the table-rendering jq fail inside the `then`
+          # branch, which aborts the step when the shell runs with `-e`
+          # (GitHub Actions' default for bash `run` steps).
+          if curl -sf --max-time 10 "https://inferencex-ci-tracker.vercel.app/api/ci" -o /tmp/ci.json \
+            && jq -e '(.skus | type) == "array"' /tmp/ci.json > /dev/null 2>&1; then
+            # "Available" column = at least one idle runner and not every runner
+            # offline. It deliberately does not look at pressureLevel, which has
+            # its own column.
+            jq -r '
+              [
+                "| SKU | Total | Busy | Idle | Offline | Pressure | Available |",
+                "|-----|-------|------|------|---------|----------|-----------|"
+              ] + (
+                .skus
+                | sort_by(.label)
+                | map(
+                    .label as $l
+                    | .summary as $s
+                    | (($s.idleRunners > 0) and ($s.offlineRunners < $s.totalRunners)) as $ok
+                    | "| `\($l)` | \($s.totalRunners) | \($s.busyRunners) | \($s.idleRunners) | \($s.offlineRunners) | \($s.pressureLevel) | \(if $ok then "yes" else "no" end) |"
+                  )
+              )
+              | .[]
+            ' /tmp/ci.json > "$RUNNERS_TABLE"
+            # Allowed list = the "Available" condition AND pressureLevel == "clear",
+            # so it can be stricter than the table's "Available" column.
+            jq -r '
+              .skus[]
+              | select((.summary.pressureLevel == "clear") and (.summary.idleRunners > 0) and (.summary.offlineRunners < .summary.totalRunners))
+              | .label
+            ' /tmp/ci.json | paste -sd, - > "$ALLOWED_SKUS_FILE"
+          else
+            # No usable snapshot: publish a notice instead of a table and leave the
+            # allowed list empty.
+            echo "_Could not fetch a usable runner snapshot from https://inferencex-ci-tracker.vercel.app/api/ci — proceed cautiously._" > "$RUNNERS_TABLE"
+            : > "$ALLOWED_SKUS_FILE"
+          fi
+          ALLOWED_SKUS=$(cat "$ALLOWED_SKUS_FILE")
+
+
           # Build issue body and write to file
           BODY_FILE=$(mktemp)
           {
@@ -153,19 +195,67 @@ jobs:
               echo "| \`$repo\` | \`$tag\` | $tag_ver | $cur_ver | $pub_date |"
             done <<< "$UPDATES"
             echo ""
+            echo "### Self-Hosted Runner Snapshot"
+            echo ""
+            echo "_Source: https://inferencex-ci-tracker.vercel.app/api/ci at $(date -u +%Y-%m-%dT%H:%M:%SZ)_"
+            echo ""
+            cat "$RUNNERS_TABLE"
+            echo ""
+            if [[ -n "$ALLOWED_SKUS" ]]; then
+              echo "**Allowed SKUs for this run:** \`$ALLOWED_SKUS\`"
+            else
+              echo "**Allowed SKUs for this run:** _none — skip PR creation and post a comment explaining the runner shortage._"
+            fi
+            echo ""
             echo "---"
             echo ""
             echo "@claude Please update the configurations:"
             echo ""
             echo "1. Update image tags in \`.github/configs/nvidia-master.yaml\` and/or \`.github/configs/amd-master.yaml\`"
             echo "2. Add entries to \`perf-changelog.yaml\` documenting the version changes"
-            echo "3. Create separate PRs grouped by framework and image family, and link each PR."
+            echo "3. For each eligible config-key, push a branch and actually open a PR — do not stop at the \"Create a pull request for ...\" remote hint that \`git push\` prints. Run \`gh pr create\` (or the equivalent MCP tool) and verify the returned PR URL. Link every PR back to this issue in a comment."
+            echo ""
+            echo "**Required PR label:** Every PR you open from this issue MUST carry the \`full-sweep-enabled\` label. Apply it at creation time via \`gh pr create --label full-sweep-enabled\` (or add it immediately after with \`gh pr edit <num> --add-label full-sweep-enabled\`). Do not skip this — downstream automation keys off the label."
+            echo ""
+            echo "**PR title / commit message formatting:** Multi-line titles and bodies MUST use a heredoc, not \`\\n\` escapes and not \`\$'...'\` ANSI-C quoting. A prior run produced commits literally starting with \`\$\` and containing \`\\n\\n\` as text because of mis-quoted ANSI-C strings. Use this pattern instead:"
             echo ""
-            echo "Group by framework plus CUDA/ROCm image family (for example sglang-cuda, sglang-rocm, vllm-cuda, vllm-rocm, atom, trt), not by individual GPU. Split by GPU only when an update is genuinely hardware-specific."
+            echo "\`\`\`bash"
+            echo "git commit -m \"\$(cat <<'EOF'"
+            echo "Update qwen3.5-bf16-b300-sglang-mtp SGLang image to v0.5.11-cu130"
             echo ""
-            echo "Focus on updating single-node configurations. For each framework/image family, check if there are multiple CUDA/ROCm versions available and choose appropriately based on current usage patterns in the configs."
+            echo "Ref #<this issue's number>"
+            echo "EOF"
+            echo ")\""
+            echo ""
+            echo "gh pr create --title \"Update qwen3.5-bf16-b300-sglang-mtp SGLang image to v0.5.11-cu130\" \\"
+            echo "  --label full-sweep-enabled \\"
+            echo "  --body \"\$(cat <<'EOF'"
+            echo "Updates the SGLang image tag for \`qwen3.5-bf16-b300-sglang-mtp\` to v0.5.11-cu130."
+            echo ""
+            echo "Ref #<this issue's number>"
+            echo "EOF"
+            echo ")\""
+            echo "\`\`\`"
+            echo ""
+            echo "PR titles must be a single line (no newlines). Bodies should contain real newlines (use a heredoc), not the literal characters \`\\n\`. Never put \`\$\` in front of a quoted message string."
+            echo ""
+            # Fall back to "none" so an empty allowed list never renders as empty backticks.
+            echo "**Runner gating:** Only open PRs for config-keys whose runner SKU is in the allowed list (\`${ALLOWED_SKUS:-none}\`). The runner SKU is the hardware segment in the config-key (e.g. \`dsr1-fp4-b200-sglang\` → \`b200\`). For any config-key whose SKU is not in the allowed list, skip it and list the skipped keys plus the reason (not clear / all-offline / no idle capacity) in a single comment on this issue."
+            echo ""
+            echo "**Single-node only:** Skip any config-key whose master-config entry has \`multinode: true\` or otherwise targets a multinode runner. Only update single-node configurations."
+            echo ""
+            echo "**Per-SKU cap (max 5):** For each allowed SKU, work on at most 5 config-keys. If a SKU has more than 5 eligible config-keys, pick 5 in alphabetical order and list the deferred remainder in your wrap-up comment on this issue so a future run can pick them up."
+            echo ""
+            echo "**Sequential execution per SKU:** Within a single SKU, process config-keys one at a time. For each config-key: open the PR, dispatch its e2e test (\`mcp__github__run_workflow\` against \`e2e-tests.yml\` on the PR branch), poll with exponential backoff until the run reaches a terminal state (success/failure/cancelled), then move on to the next config-key for that SKU. Do not dispatch the next e2e run for the same SKU until the previous one has finished. Different SKUs may be processed in parallel since they target disjoint hardware, but each SKU's queue must stay serial."
+            echo ""
+            echo "One PR per config-key — do not bundle multiple config-keys into one PR even when they share a framework or image family."
+            echo ""
+            echo "**Exception — MTP pairs:** When a config-key and its \`-mtp\` sibling exist for the same model/precision/runner/framework (e.g. \`qwen3.5-fp4-b300-sglang\` and \`qwen3.5-fp4-b300-sglang-mtp\`), bundle both into one PR. Treat the pair as a single unit for the per-SKU cap (counts as 1, not 2) and the sequential e2e queue. If only one side of the pair is present in the updates, open a PR for just that one."
+            echo ""
+            echo "If Docker Hub lists multiple variants for the same base version (e.g. \`cu128\` vs \`cu130\`, \`rocm70\` vs \`rocm72\`), pick the variant whose suffix matches what the config-key's current image entry already uses — don't switch CUDA/ROCm minor versions in this update."
           } > "$BODY_FILE"
 
+          rm -f "$RUNNERS_TABLE" "$ALLOWED_SKUS_FILE" /tmp/ci.json
+
           echo "body_file=$BODY_FILE" >> "$GITHUB_OUTPUT"
 
           echo "=== Issue body ==="