Skip to content

Commit 84835be

Browse files
sbryngelson and claude committed
Make benchmark pipeline robust to transient GPU failures
Add three layers of defense against transient failures (e.g. ROCm HSA_STATUS_ERROR_INVALID_ARGUMENT) tanking the entire benchmark:

1. Retry failed cases once (5s delay) before marking as failed
2. Always write partial results YAML before raising on failure
3. CI scripts warn on non-zero exit instead of aborting, and bench.yml runs diff() via `if: always()` so partial results are still compared

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent d4e7c8f commit 84835be

4 files changed

Lines changed: 100 additions & 75 deletions

File tree

.github/scripts/run_parallel_benchmarks.sh

Lines changed: 7 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -52,16 +52,16 @@ else
5252
echo "Master job completed successfully"
5353
fi
5454

55-
# Check if either job failed
55+
# Warn if either job failed (partial results may still be usable)
5656
if [ "${pr_exit}" -ne 0 ] || [ "${master_exit}" -ne 0 ]; then
57-
echo "ERROR: One or both benchmark jobs failed: pr_exit=${pr_exit}, master_exit=${master_exit}"
58-
exit 1
57+
echo "WARNING: Benchmark jobs had failures: pr=${pr_exit}, master=${master_exit}"
58+
echo "Checking for partial results..."
59+
else
60+
echo "=========================================="
61+
echo "Both benchmark jobs completed successfully!"
62+
echo "=========================================="
5963
fi
6064

61-
echo "=========================================="
62-
echo "Both benchmark jobs completed successfully!"
63-
echo "=========================================="
64-
6565
# Final verification that output files exist before proceeding
6666
pr_yaml="pr/bench-${device}-${interface}.yaml"
6767
master_yaml="master/bench-${device}-${interface}.yaml"

.github/scripts/submit_and_monitor_bench.sh

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -37,9 +37,13 @@ fi
3737
echo "[$dir] Job ID: $job_id, monitoring output file: $output_file"
3838

3939
# Use the monitoring script from PR (where this script lives)
40-
bash "${SCRIPT_DIR}/monitor_slurm_job.sh" "$job_id" "$output_file"
41-
42-
echo "[$dir] Monitoring complete for job $job_id"
40+
monitor_exit=0
41+
bash "${SCRIPT_DIR}/monitor_slurm_job.sh" "$job_id" "$output_file" || monitor_exit=$?
42+
if [ "$monitor_exit" -ne 0 ]; then
43+
echo "[$dir] WARNING: SLURM job exited with code $monitor_exit"
44+
else
45+
echo "[$dir] Monitoring complete for job $job_id"
46+
fi
4347

4448
# Verify the YAML output file was created
4549
yaml_file="${job_slug}.yaml"

.github/workflows/bench.yml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -176,6 +176,7 @@ jobs:
176176
run: bash pr/.github/scripts/run_parallel_benchmarks.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}
177177

178178
- name: Generate & Post Comment
179+
if: always()
179180
run: |
180181
(cd pr && . ./mfc.sh load -c ${{ matrix.flag }} -m g)
181182
(cd pr && ./mfc.sh bench_diff ../master/bench-${{ matrix.device }}-${{ matrix.interface }}.yaml ../pr/bench-${{ matrix.device }}-${{ matrix.interface }}.yaml)

toolchain/mfc/bench.py

Lines changed: 85 additions & 65 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
import os, sys, uuid, subprocess, dataclasses, typing, math, traceback
1+
import os, sys, uuid, subprocess, dataclasses, typing, math, traceback, time
22

33
import rich.table
44

@@ -53,6 +53,8 @@ def bench(targets = None):
5353

5454
failed_cases = []
5555

56+
max_attempts = 2
57+
5658
for i, case in enumerate(CASES):
5759
summary_filepath = os.path.join(bench_dirpath, f"{case.slug}.yaml")
5860
log_filepath = os.path.join(bench_dirpath, f"{case.slug}.out")
@@ -64,71 +66,94 @@ def bench(targets = None):
6466
cons.print(f"> Summary: [bold]{os.path.relpath(summary_filepath)}[/bold]")
6567

6668
try:
67-
with open(log_filepath, "w") as log_file:
68-
result = system(
69-
["./mfc.sh", "run", case.path, "--case-optimization"] +
70-
["--targets"] + [t.name for t in targets] +
71-
["--output-summary", summary_filepath] +
72-
case.args +
73-
["--", "--gbpp", str(ARG('mem'))],
74-
stdout=log_file,
75-
stderr=subprocess.STDOUT)
76-
77-
# Check return code (handle CompletedProcess or int defensively)
78-
rc = result.returncode if hasattr(result, "returncode") else result
79-
if rc != 0:
80-
cons.print(f"[bold red]ERROR[/bold red]: Case {case.slug} failed with exit code {rc}")
81-
cons.print(f"[bold red] Check log at: {log_filepath}[/bold red]")
82-
failed_cases.append(case.slug)
83-
continue
84-
85-
# Validate summary file exists
86-
if not os.path.exists(summary_filepath):
87-
cons.print(f"[bold red]ERROR[/bold red]: Summary file not created for {case.slug}")
88-
cons.print(f"[bold red] Expected: {summary_filepath}[/bold red]")
89-
failed_cases.append(case.slug)
90-
continue
91-
92-
# Load summary
93-
summary = file_load_yaml(summary_filepath)
94-
95-
# Validate all targets have required data
96-
validation_failed = False
97-
for target in targets:
98-
if target.name not in summary:
99-
cons.print(f"[bold red]ERROR[/bold red]: Target {target.name} missing from summary for {case.slug}")
100-
validation_failed = True
101-
break
102-
103-
if "exec" not in summary[target.name]:
104-
cons.print(f"[bold red]ERROR[/bold red]: 'exec' time missing for {target.name} in {case.slug}")
105-
validation_failed = True
69+
for attempt in range(1, max_attempts + 1):
70+
try:
71+
with open(log_filepath, "w") as log_file:
72+
result = system(
73+
["./mfc.sh", "run", case.path, "--case-optimization"] +
74+
["--targets"] + [t.name for t in targets] +
75+
["--output-summary", summary_filepath] +
76+
case.args +
77+
["--", "--gbpp", str(ARG('mem'))],
78+
stdout=log_file,
79+
stderr=subprocess.STDOUT)
80+
81+
# Check return code (handle CompletedProcess or int defensively)
82+
rc = result.returncode if hasattr(result, "returncode") else result
83+
if rc != 0:
84+
if attempt < max_attempts:
85+
cons.print(f"[bold yellow]WARNING[/bold yellow]: Case {case.slug} failed with exit code {rc} (attempt {attempt}/{max_attempts})")
86+
cons.print(f"Retrying in 5s...")
87+
time.sleep(5)
88+
continue
89+
cons.print(f"[bold red]ERROR[/bold red]: Case {case.slug} failed with exit code {rc}")
90+
cons.print(f"[bold red] Check log at: {log_filepath}[/bold red]")
91+
failed_cases.append(case.slug)
92+
break
93+
94+
# Validate summary file exists
95+
if not os.path.exists(summary_filepath):
96+
if attempt < max_attempts:
97+
cons.print(f"[bold yellow]WARNING[/bold yellow]: Summary file not created for {case.slug} (attempt {attempt}/{max_attempts})")
98+
cons.print(f"Retrying in 5s...")
99+
time.sleep(5)
100+
continue
101+
cons.print(f"[bold red]ERROR[/bold red]: Summary file not created for {case.slug}")
102+
cons.print(f"[bold red] Expected: {summary_filepath}[/bold red]")
103+
failed_cases.append(case.slug)
104+
break
105+
106+
# Load summary
107+
summary = file_load_yaml(summary_filepath)
108+
109+
# Validate all targets have required data
110+
validation_failed = False
111+
for target in targets:
112+
if target.name not in summary:
113+
cons.print(f"[bold red]ERROR[/bold red]: Target {target.name} missing from summary for {case.slug}")
114+
validation_failed = True
115+
break
116+
117+
if "exec" not in summary[target.name]:
118+
cons.print(f"[bold red]ERROR[/bold red]: 'exec' time missing for {target.name} in {case.slug}")
119+
validation_failed = True
120+
break
121+
122+
if target.name == "simulation" and "grind" not in summary[target.name]:
123+
cons.print(f"[bold red]ERROR[/bold red]: 'grind' time missing for simulation in {case.slug}")
124+
validation_failed = True
125+
break
126+
127+
if validation_failed:
128+
failed_cases.append(case.slug)
129+
break
130+
131+
# Add to results
132+
results["cases"][case.slug] = {
133+
"description": dataclasses.asdict(case),
134+
"output_summary": summary,
135+
}
136+
cons.print(f"[bold green]✓[/bold green] Case {case.slug} completed successfully")
106137
break
107138

108-
if target.name == "simulation" and "grind" not in summary[target.name]:
109-
cons.print(f"[bold red]ERROR[/bold red]: 'grind' time missing for simulation in {case.slug}")
110-
validation_failed = True
111-
break
139+
except Exception as e:
140+
if attempt < max_attempts:
141+
cons.print(f"[bold yellow]WARNING[/bold yellow]: Unexpected error running {case.slug} (attempt {attempt}/{max_attempts}): {e}")
142+
cons.print(f"Retrying in 5s...")
143+
time.sleep(5)
144+
continue
145+
cons.print(f"[bold red]ERROR[/bold red]: Unexpected error running {case.slug}: {e}")
146+
cons.print(f"[dim]{traceback.format_exc()}[/dim]")
147+
failed_cases.append(case.slug)
112148

113-
if validation_failed:
114-
failed_cases.append(case.slug)
115-
continue
116-
117-
# Add to results
118-
results["cases"][case.slug] = {
119-
"description": dataclasses.asdict(case),
120-
"output_summary": summary,
121-
}
122-
cons.print(f"[bold green]✓[/bold green] Case {case.slug} completed successfully")
123-
124-
except Exception as e:
125-
cons.print(f"[bold red]ERROR[/bold red]: Unexpected error running {case.slug}: {e}")
126-
cons.print(f"[dim]{traceback.format_exc()}[/dim]")
127-
failed_cases.append(case.slug)
128149
finally:
129150
cons.unindent()
130151

131-
# Report results
152+
# Always write results (even partial) so diff() can compare the intersection
153+
file_dump_yaml(ARG("output"), results)
154+
cons.print(f"Wrote results to [bold magenta]{os.path.relpath(ARG('output'))}[/bold magenta].")
155+
156+
# Report failures after writing results
132157
if failed_cases:
133158
cons.print()
134159
cons.print(f"[bold red]Failed cases ({len(failed_cases)}):[/bold red]")
@@ -137,11 +162,6 @@ def bench(targets = None):
137162
cons.print()
138163
raise MFCException(f"Benchmarking failed: {len(failed_cases)}/{len(CASES)} cases failed")
139164

140-
# Write output
141-
file_dump_yaml(ARG("output"), results)
142-
143-
cons.print(f"Wrote results to [bold magenta]{os.path.relpath(ARG('output'))}[/bold magenta].")
144-
145165
finally:
146166
cons.unindent()
147167

0 commit comments

Comments
 (0)