Skip to content

Commit 6f5a399

Browse files
committed
Merge remote-tracking branch 'origin/main' into feat/m3-mi300x-blockfp8-clean
# Conflicts: # perf-changelog.yaml
2 parents 6c29d32 + 529a500 commit 6f5a399

2 files changed

Lines changed: 211 additions & 0 deletions

File tree

Lines changed: 203 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,203 @@
1+
# One-off, manually dispatched workflow that recovers the ingest for PR #1798.
name: Recover PR 1798 ingest
2+
run-name: "Recover PR #1798 ingest from run 27622347964"
3+
4+
# Manual trigger only; the operator must type a confirmation string, which the
# job-level `if` compares against 'recover-pr-1798'.
on:
5+
workflow_dispatch:
6+
inputs:
7+
confirm:
8+
description: "Enter recover-pr-1798 to run the artifact-only recovery"
9+
required: true
10+
type: string
11+
12+
# The default GITHUB_TOKEN is limited to read access on actions and contents.
permissions:
13+
actions: read
14+
contents: read
15+
16+
# Single job; every step below shares the pinned identifiers declared in `env`.
jobs:
17+
recover-ingest:
18+
# Guard: the job runs only when the dispatch input matches the expected phrase.
if: ${{ inputs.confirm == 'recover-pr-1798' }}
19+
runs-on: ubuntu-latest
20+
# SOURCE_*   : the pull_request run (and PR head commit) whose artifacts are reused.
# TARGET_*   : the push run and its "setup" job that are expected to have failed.
# ORIGINAL_* : the merge commit of the target run and the parent it must have.
# The validation and reconstruction steps check these values before anything is reused.
env:
21+
SOURCE_REPO: SemiAnalysisAI/InferenceX
22+
SOURCE_RUN_ID: "27622347964"
23+
SOURCE_RUN_ATTEMPT: "2"
24+
SOURCE_PR_NUMBER: "1798"
25+
SOURCE_HEAD_SHA: ffe21af32e2d7b3fbd568e2f7fc066659d9b16c9
26+
TARGET_RUN_ID: "27712344914"
27+
TARGET_JOB_ID: "81976315082"
28+
ORIGINAL_BASE_SHA: 7b9843d3a6e1fe7a2d92d327e25aae57ed3506c5
29+
ORIGINAL_MERGE_SHA: ba5879b126cd6e976518641e5bc88b11976af890
30+
steps:
31+
# Check out the repository with full history (fetch-depth: 0); a later step
# checks out ORIGINAL_MERGE_SHA and resolves its parent commit.
- name: Checkout recovery code
32+
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
33+
with:
34+
fetch-depth: 0
35+
36+
# Fail fast unless the pinned run/job/PR identifiers still describe the
# situation this recovery was written for:
#   * the target push run completed with a failure in its "setup" job,
#   * the source pull_request run succeeded on the expected head SHA/attempt,
#   * that head SHA is one of the PR's commits, and
#   * the aggregated artifacts to be reused exist and have not expired.
- name: Validate failed target and reusable source run
  env:
    GH_TOKEN: ${{ secrets.REPO_PAT || github.token }}
  run: |
    # Emit a workflow error annotation and abort the step.
    fail() {
      echo "::error::$*" >&2
      exit 1
    }

    target_json=$(gh api "repos/${SOURCE_REPO}/actions/runs/${TARGET_RUN_ID}")
    jq -e \
      --arg expected_head "$ORIGINAL_MERGE_SHA" \
      '.event == "push" and
       .status == "completed" and
       .conclusion == "failure" and
       .path == ".github/workflows/run-sweep.yml" and
       .head_sha == $expected_head' \
      <<<"$target_json" >/dev/null \
      || fail "Run ${TARGET_RUN_ID} is not the expected failed push run of run-sweep.yml"

    # --paginate returns one JSON document per page; --slurp (-s) gathers all
    # pages into an array so a job beyond the first 100 is still found.
    target_jobs=$(gh api --paginate \
      "repos/${SOURCE_REPO}/actions/runs/${TARGET_RUN_ID}/jobs?per_page=100")
    jq -es \
      --argjson expected_job "$TARGET_JOB_ID" \
      'any(.[].jobs[];
           .id == $expected_job and
           .name == "setup" and
           .conclusion == "failure")' \
      <<<"$target_jobs" >/dev/null \
      || fail "Job ${TARGET_JOB_ID} is not a failed \"setup\" job of run ${TARGET_RUN_ID}"

    run_json=$(gh api "repos/${SOURCE_REPO}/actions/runs/${SOURCE_RUN_ID}")
    jq -e \
      --arg expected_head "$SOURCE_HEAD_SHA" \
      --argjson expected_attempt "$SOURCE_RUN_ATTEMPT" \
      '.event == "pull_request" and
       .status == "completed" and
       .conclusion == "success" and
       .path == ".github/workflows/run-sweep.yml" and
       .head_sha == $expected_head and
       .run_attempt == $expected_attempt' \
      <<<"$run_json" >/dev/null \
      || fail "Run ${SOURCE_RUN_ID} is not the expected successful pull_request run"

    # Capture first so that a failing API call aborts the step on its own,
    # then require an exact, whole-line match of the head SHA.
    pr_commits=$(gh api "repos/${SOURCE_REPO}/pulls/${SOURCE_PR_NUMBER}/commits" \
      --paginate --jq '.[].sha')
    grep -Fxq "$SOURCE_HEAD_SHA" <<<"$pr_commits" \
      || fail "Commit ${SOURCE_HEAD_SHA} is not part of PR #${SOURCE_PR_NUMBER}"

    # A run can publish more than 100 artifacts, so walk every page before
    # deciding that a required artifact is missing.
    artifacts_json=$(gh api --paginate \
      "repos/${SOURCE_REPO}/actions/runs/${SOURCE_RUN_ID}/artifacts?per_page=100")
    for required in results_bmk eval_results_all run-stats; do
      jq -es --arg name "$required" \
        'any(.[].artifacts[]; .name == $name and (.expired | not))' \
        <<<"$artifacts_json" >/dev/null \
        || fail "Artifact ${required} is missing or expired in run ${SOURCE_RUN_ID}"
    done
83+
84+
# Rebuild the sweep configuration for the original PR #1798 merge: edit
# perf-changelog.yaml on top of ORIGINAL_MERGE_SHA, wrap the result in a
# synthetic commit parented on ORIGINAL_BASE_SHA, and run the changelog
# processor over that base..synthetic range.
- name: Reconstruct PR 1798 merge configuration
85+
run: |
86+
# Work on the original merge commit and confirm its first parent is the recorded base.
git checkout --detach "$ORIGINAL_MERGE_SHA"
87+
test "$(git rev-parse "${ORIGINAL_MERGE_SHA}^")" = "$ORIGINAL_BASE_SHA"
88+
89+
# Rewrite the single "- config-keys:" line that precedes the
# dsr1-fp8-gb300-dynamo-trt entry; the END block aborts unless exactly one
# substitution happened.
# NOTE(review): whitespace inside the pattern and replacement is significant and
# may not be rendered faithfully in this view — confirm against the committed file.
perl -0pi -e '
90+
$count = s/^- config-keys:\n(?= - dsr1-fp8-gb300-dynamo-trt\n description:\n - "Fix gsm8k)/ - config-keys:\n/m;
91+
END {
92+
die "Expected exactly one PR #1767 indentation repair\n"
93+
unless $count == 1;
94+
}
95+
' perf-changelog.yaml
96+
# Sanity checks on the edited file: the dsr1 entry now follows the rewritten
# line, and the glm5-fp4-gb300-dynamo-trt entry still follows an unindented one.
grep -A2 '^ - config-keys:$' perf-changelog.yaml \
97+
| grep -q 'dsr1-fp8-gb300-dynamo-trt'
98+
grep -A1 '^- config-keys:$' perf-changelog.yaml \
99+
| grep -q 'glm5-fp4-gb300-dynamo-trt'
100+
101+
# Stage the edit, write the index as a tree, and create a commit object for it
# with ORIGINAL_BASE_SHA as its only parent (the commit message is read from stdin).
git add perf-changelog.yaml
102+
fixed_tree=$(git write-tree)
103+
fixed_sha=$(printf '%s\n' \
104+
'Synthetic PR #1798 merge tree without the unrelated PR #1767 indentation repair' \
105+
| git -c user.name='InferenceX Recovery' \
106+
-c user.email='actions@users.noreply.github.com' \
107+
commit-tree "$fixed_tree" -p "$ORIGINAL_BASE_SHA")
108+
109+
# presumably a dependency of utils/process_changelog.py — TODO confirm; the version is unpinned.
pip install pydantic
110+
# Generate the configuration for base..synthetic-head and verify the output parses as JSON.
python3 utils/process_changelog.py \
111+
--changelog-file perf-changelog.yaml \
112+
--base-ref "$ORIGINAL_BASE_SHA" \
113+
--head-ref "$fixed_sha" \
114+
> "$RUNNER_TEMP/full-config.json"
115+
jq empty "$RUNNER_TEMP/full-config.json"
116+
117+
# Extract .changelog_metadata and set base_ref/head_ref to the original base and
# merge SHAs, so the synthetic commit id does not appear in the published metadata.
mkdir -p "$RUNNER_TEMP/changelog-metadata"
118+
jq \
119+
--arg base "$ORIGINAL_BASE_SHA" \
120+
--arg head "$ORIGINAL_MERGE_SHA" \
121+
'.changelog_metadata | .base_ref = $base | .head_ref = $head' \
122+
"$RUNNER_TEMP/full-config.json" \
123+
> "$RUNNER_TEMP/changelog-metadata/changelog_metadata.json"
124+
125+
# Download the source run's artifacts, keep only the allowlisted ones, and add
# a provenance file that links the source run, the failed target run, and this run.
- name: Download reusable benchmark artifacts
126+
env:
127+
GH_TOKEN: ${{ secrets.REPO_PAT || github.token }}
128+
run: |
129+
# No artifact name is given, so every artifact of the run is downloaded into
# its own sub-directory of $artifacts_dir.
artifacts_dir="$RUNNER_TEMP/source-artifacts"
130+
gh run download "$SOURCE_RUN_ID" \
131+
--repo "$SOURCE_REPO" \
132+
-D "$artifacts_dir"
133+
134+
# Remove the source run's changelog-metadata directory; this workflow uploads
# its own changelog-metadata artifact in a later step.
rm -rf "$artifacts_dir/changelog-metadata"
135+
# Delete every downloaded entry whose name is not on the allowlist below.
for artifact_dir in "$artifacts_dir"/*; do
136+
# Skip the literal, unexpanded pattern when the directory is empty.
[ -e "$artifact_dir" ] || continue
137+
name=$(basename "$artifact_dir")
138+
case "$name" in
139+
results_bmk|eval_results_all|run-stats|bmk_*|eval_*|server_logs_*|multinode_server_logs_*|agentic_aggregated)
140+
;;
141+
*)
142+
rm -rf "$artifact_dir"
143+
;;
144+
esac
145+
done
146+
147+
# Write reuse_source_run.json; all values are stored as JSON strings (--arg).
mkdir -p "$artifacts_dir/reused-ingest-metadata"
148+
jq -n \
149+
--arg source_run_id "$SOURCE_RUN_ID" \
150+
--arg source_run_attempt "$SOURCE_RUN_ATTEMPT" \
151+
--arg source_run_url "https://github.com/${SOURCE_REPO}/actions/runs/${SOURCE_RUN_ID}" \
152+
--arg source_pr_number "$SOURCE_PR_NUMBER" \
153+
--arg source_head_sha "$SOURCE_HEAD_SHA" \
154+
--arg target_run_id "$TARGET_RUN_ID" \
155+
--arg target_job_id "$TARGET_JOB_ID" \
156+
--arg ingest_run_id "$GITHUB_RUN_ID" \
157+
--arg ingest_run_attempt "$GITHUB_RUN_ATTEMPT" \
158+
--arg ingest_run_url "$GITHUB_SERVER_URL/$GITHUB_REPOSITORY/actions/runs/$GITHUB_RUN_ID" \
159+
'{
160+
source_run_id: $source_run_id,
161+
source_run_attempt: $source_run_attempt,
162+
source_run_url: $source_run_url,
163+
source_pr_number: $source_pr_number,
164+
source_head_sha: $source_head_sha,
165+
target_run_id: $target_run_id,
166+
target_job_id: $target_job_id,
167+
ingest_run_id: $ingest_run_id,
168+
ingest_run_attempt: $ingest_run_attempt,
169+
ingest_run_url: $ingest_run_url
170+
}' \
171+
> "$artifacts_dir/reused-ingest-metadata/reuse_source_run.json"
172+
173+
# Run the repository's validation script on the filtered artifacts, passing it
# the configuration generated by the reconstruction step.
- name: Validate reusable artifacts
174+
run: |
175+
python3 utils/validate_reusable_sweep_artifacts.py \
176+
--config-json "$RUNNER_TEMP/full-config.json" \
177+
--artifacts-dir "$RUNNER_TEMP/source-artifacts"
178+
179+
# Publish everything left under source-artifacts/ (including the
# reused-ingest-metadata directory) as one artifact of this run.
- name: Upload reusable ingest artifacts
180+
uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
181+
with:
182+
name: reused-ingest-artifacts
183+
path: ${{ runner.temp }}/source-artifacts/*
184+
185+
# Publish the regenerated changelog_metadata.json as a separate artifact named
# changelog-metadata.
- name: Upload PR 1798 changelog metadata
186+
uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
187+
with:
188+
name: changelog-metadata
189+
path: ${{ runner.temp }}/changelog-metadata/changelog_metadata.json
190+
191+
# Send a repository_dispatch event ("ingest-results") to InferenceX-app whose
# payload carries this run's id and attempt number.
- name: Trigger database ingest
  env:
    # Hand the token to the script through the environment instead of
    # expanding the secret into the script text.
    INFX_FRONTEND_PAT: ${{ secrets.INFX_FRONTEND_PAT }}
  run: |
    # -f makes curl exit non-zero on an HTTP error so the step fails visibly.
    curl -sSf -X POST \
      -H "Authorization: Bearer ${INFX_FRONTEND_PAT}" \
      -H "Accept: application/vnd.github+json" \
      https://api.github.com/repos/SemiAnalysisAI/InferenceX-app/dispatches \
      -d '{
        "event_type": "ingest-results",
        "client_payload": {
          "run-id": "${{ github.run_id }}",
          "run-attempt": "${{ github.run_attempt }}"
        }
      }'

perf-changelog.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3928,6 +3928,14 @@
39283928
- "Runner script launch_gb300-nv.sh: added dynamo-trt-specific glm5-fp4 case with SERVED_MODEL_NAME and SRT_SLURM_MODEL_PREFIX=nvidia/GLM-5-NVFP4"
39293929
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1798
39303930

3931+
# Changelog entry for PR #1717 (config key dsv4-fp4-mi355x-atom).
# NOTE(review): the entry that follows also starts with config key
# dsv4-fp4-mi355x-atom — confirm this one is not a duplicate introduced by the merge.
- config-keys:
3932+
- dsv4-fp4-mi355x-atom
3933+
description:
3934+
- "Update image to rocm/atom:rocm7.2.4_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.4_20260612"
3935+
- "Update ISL=8192 search-space: TP8-only from conc=4-64, DPA from conc=128-1024 (previously conc=1-64 and DPA conc=64-512)"
3936+
- "Update Applied TBO on high concurrencies"
3937+
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1717
3938+
39313939
- config-keys:
39323940
- dsv4-fp4-mi355x-atom
39333941
description:

0 commit comments

Comments
 (0)