Skip to content

Commit e5ce527

Browse files
sbryngelson and claude committed
Limit parallel builds to 2 concurrent on login node
Running all 5-6 builds simultaneously on a Frontier login node causes memory pressure that crashes the Cray compiler (optcg segfault). Cap concurrent builds at 2 using a polling semaphore to stay within login node resource limits. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 553ff35 commit e5ce527

2 files changed

Lines changed: 36 additions & 10 deletions

File tree

.github/scripts/run_frontier_all_benchmarks.sh

Lines changed: 18 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -46,22 +46,35 @@ for cfg in "${configs[@]}"; do
4646
done
4747

4848
# --- Phase 2: Build all configs on login node in parallel ---
# Running every build at once on a Frontier login node causes memory pressure
# that crashes the Cray compiler (optcg segfault), so at most MAX_PARALLEL
# builds run concurrently. The cap defaults to 2 and may be overridden from
# the environment.
MAX_PARALLEL="${MAX_PARALLEL:-2}"
if ! [[ "$MAX_PARALLEL" =~ ^[1-9][0-9]*$ ]]; then
    # A cap of 0 (or garbage) would make the slot-wait loop below spin forever.
    echo "ERROR: MAX_PARALLEL must be a positive integer (got '$MAX_PARALLEL')" >&2
    exit 1
fi

echo ""
echo "=========================================="
echo "Starting parallel builds (${num_nodes} configs, max $MAX_PARALLEL concurrent)..."
echo "=========================================="

# build_pids[i] is the PID of the build for configs[i] (same index);
# running holds the PIDs of builds believed to still be alive.
build_pids=()
running=()
for i in "${!configs[@]}"; do
    # Wait until a build slot is available: poll every 2 s and drop PIDs
    # that no longer exist. kill -0 only probes the PID; it sends no signal.
    while (( ${#running[@]} >= MAX_PARALLEL )); do
        sleep 2
        still_running=()
        for pid in "${running[@]}"; do
            if kill -0 "$pid" 2>/dev/null; then
                still_running+=("$pid")
            fi
        done
        # The ${arr[@]+...} form keeps an empty array from tripping `set -u`
        # on bash < 4.4.
        running=(${still_running[@]+"${still_running[@]}"})
    done

    # Each config entry is "version cluster device interface".
    read -r version cluster device interface <<< "${configs[$i]}"
    dir="${version}-${cluster}-${device}-${interface}"
    log="build-${version}-${cluster}-${device}-${interface}.log"
    echo " Starting: $version $cluster $device $interface"
    (
        # Fail this build (message lands in its log) rather than running
        # build.sh from the wrong directory when the checkout is missing.
        cd "$dir" || { echo "ERROR: cannot cd to $dir" >&2; exit 1; }
        bash ".github/workflows/${cluster}/build.sh" "$device" "$interface" bench
    ) > "$log" 2>&1 &
    build_pids[$i]=$!
    running+=("$!")
done
6679

6780
# Periodic heartbeat while builds run

.github/scripts/run_frontier_all_tests.sh

Lines changed: 18 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -51,22 +51,35 @@ for cfg in "${configs[@]}"; do
5151
done
5252

5353
# --- Phase 2: Build all configs on login node in parallel ---
# Running every build at once on a Frontier login node causes memory pressure
# that crashes the Cray compiler (optcg segfault), so at most MAX_PARALLEL
# builds run concurrently. The cap defaults to 2 and may be overridden from
# the environment.
MAX_PARALLEL="${MAX_PARALLEL:-2}"
if ! [[ "$MAX_PARALLEL" =~ ^[1-9][0-9]*$ ]]; then
    # A cap of 0 (or garbage) would make the slot-wait loop below spin forever.
    echo "ERROR: MAX_PARALLEL must be a positive integer (got '$MAX_PARALLEL')" >&2
    exit 1
fi

echo ""
echo "=========================================="
echo "Starting parallel builds (${num_nodes} configs, max $MAX_PARALLEL concurrent)..."
echo "=========================================="

# build_pids[i] is the PID of the build for configs[i] (same index);
# running holds the PIDs of builds believed to still be alive.
build_pids=()
running=()
for i in "${!configs[@]}"; do
    # Wait until a build slot is available: poll every 2 s and drop PIDs
    # that no longer exist. kill -0 only probes the PID; it sends no signal.
    while (( ${#running[@]} >= MAX_PARALLEL )); do
        sleep 2
        still_running=()
        for pid in "${running[@]}"; do
            if kill -0 "$pid" 2>/dev/null; then
                still_running+=("$pid")
            fi
        done
        # The ${arr[@]+...} form keeps an empty array from tripping `set -u`
        # on bash < 4.4.
        running=(${still_running[@]+"${still_running[@]}"})
    done

    # Each config entry is "cluster device interface".
    read -r cluster device interface <<< "${configs[$i]}"
    dir="test-${cluster}-${device}-${interface}"
    log="build-${cluster}-${device}-${interface}.log"
    echo " Starting: $cluster $device $interface"
    (
        # Fail this build (message lands in its log) rather than running
        # build.sh from the wrong directory when the checkout is missing.
        cd "$dir" || { echo "ERROR: cannot cd to $dir" >&2; exit 1; }
        bash ".github/workflows/${cluster}/build.sh" "$device" "$interface"
    ) > "$log" 2>&1 &
    build_pids[$i]=$!
    running+=("$!")
done
7184

7285
# Periodic heartbeat while builds run

0 commit comments

Comments
 (0)