-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsubmit_simulations.sh
More file actions
executable file
·79 lines (71 loc) · 3.44 KB
/
submit_simulations.sh
File metadata and controls
executable file
·79 lines (71 loc) · 3.44 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
#!/bin/bash
# ─────────────────────────────────────────────────────────────────────────────
# Launcher for the CMRES RQMC sharded simulation array.
#
# This script is *not* a SLURM job — invoke it directly from a login shell:
#
# bash submit_simulations.sh
# N_SHARDS=16 bash submit_simulations.sh
# SUBMIT_MERGE=0 bash submit_simulations.sh # don't auto-submit merge
#
# It submits two SLURM jobs:
# 1. the RUN array (N_GRIDS × N_SHARDS tasks, each runs one shard)
# 2. the MERGE array (N_GRIDS tasks, each merges one grid's shards)
# with --dependency=afterok:<RUN_JOBID>
#
# All actual workload runs in slurm_run_simulations.sh, which contains no
# nested sbatch calls and so passes through site-wide submit filters that
# reject scripts containing the ``sbatch`` keyword.
# ─────────────────────────────────────────────────────────────────────────────
# ── Configuration ─────────────────────────────────────────────────────────────
# Abort on any unhandled command failure.
set -e

# Tunables, each overridable from the environment (defaults shown):
#   N_GRIDS      – number of grids; the MERGE array has one task per grid
#   N_SHARDS     – shards per grid; the RUN array has N_GRIDS × N_SHARDS tasks
#   SUBMIT_MERGE – "1" submits the dependent MERGE array, anything else skips it
N_GRIDS=${N_GRIDS:-11}
N_SHARDS=${N_SHARDS:-48}
SUBMIT_MERGE=${SUBMIT_MERGE:-1}

# Validate the counts *before* they reach $(( … )). Bash arithmetic treats a
# non-numeric word as a variable name (usually evaluating to 0), which would
# silently yield an invalid "--array=1-0" range instead of a clear error.
for _cfg_name in N_GRIDS N_SHARDS; do
  if [[ ! "${!_cfg_name}" =~ ^[1-9][0-9]*$ ]]; then
    echo "ERROR: ${_cfg_name} must be a positive integer (got '${!_cfg_name}')" >&2
    exit 1
  fi
done
unset _cfg_name

TOTAL_TASKS=$(( N_GRIDS * N_SHARDS ))

# The worker script is expected to sit in the same directory as this launcher;
# resolve that directory to an absolute path so the check below and the later
# submission do not depend on the caller's working directory.
WORKER="$(cd "$(dirname "$0")" && pwd)/slurm_run_simulations.sh"
if [[ ! -r "${WORKER}" ]]; then
  echo "ERROR: cannot read worker script at ${WORKER}" >&2
  exit 1
fi

# Created relative to the current working directory (not the script directory).
# NOTE(review): presumably the worker writes its logs/results here — confirm.
mkdir -p logs data/res
# Summarise the submission parameters on stdout before anything is submitted.
banner_rule="============================================================"
printf '%s\n' \
  "${banner_rule}" \
  "Submitting CMRES RQMC array job" \
  " Worker : ${WORKER}" \
  " N_GRIDS : ${N_GRIDS}" \
  " N_SHARDS : ${N_SHARDS}" \
  " Total tasks : ${TOTAL_TASKS}" \
  " Auto-merge : ${SUBMIT_MERGE}" \
  "${banner_rule}"
# ── RUN phase ─────────────────────────────────────────────────────────────────
# Submit the RUN array (tasks 1..TOTAL_TASKS) on partition rosa_express.p.
# Any arguments given to this launcher are forwarded to the worker script.
#
# The submission is evaluated as an `if` condition on purpose: with `set -e`
# active, a bare `RUN_OUT=$(sbatch …)` would terminate the script at the
# assignment when sbatch exits non-zero, and the diagnostic below would never
# be printed.
RUN_JOBID=""
if RUN_OUT=$(sbatch -p rosa_express.p --parsable --array="1-${TOTAL_TASKS}" "${WORKER}" "$@"); then
  # Drop any newlines from the captured output.
  RUN_JOBID=${RUN_OUT//$'\n'/}
  # --parsable prints "<jobid>" or "<jobid>;<cluster>"; only the job id part
  # is valid inside a later --dependency=afterok:<jobid> specification.
  RUN_JOBID=${RUN_JOBID%%;*}
fi
if [[ -z "${RUN_JOBID}" ]]; then
  echo "ERROR: RUN-phase sbatch failed. See SLURM error above." >&2
  exit 1
fi
echo " RUN job id : ${RUN_JOBID}"
# ── MERGE phase ───────────────────────────────────────────────────────────────

#######################################
# Print one manual merge command per grid (grids 1..N_GRIDS).
# Globals:   N_GRIDS (read)
# Arguments: none
# Outputs:   writes the command lines to stdout (redirect at the call site)
#######################################
print_manual_merge_hint() {
  local grid
  for (( grid = 1; grid <= N_GRIDS; grid++ )); do
    echo " python ./experiments/re/run_simulation.py ${grid} --merge"
  done
}

# Anything other than SUBMIT_MERGE=1 skips the automatic merge submission and
# just tells the user how to run the merge by hand.
if [[ "${SUBMIT_MERGE}" != "1" ]]; then
  echo
  echo "MERGE phase skipped (SUBMIT_MERGE=0)."
  echo "Run manually after the RUN array finishes:"
  print_manual_merge_hint
  exit 0
fi

# Submit the MERGE array (one task per grid), held until the RUN job completes
# successfully (afterok). CMRES_MERGE_PHASE=1 is placed in sbatch's environment.
# NOTE(review): presumably the worker reads CMRES_MERGE_PHASE to switch into
# merge mode — confirm in slurm_run_simulations.sh.
#
# As in the RUN phase, the submission is an `if` condition so that a non-zero
# sbatch exit reaches the fallback instructions below instead of aborting the
# script at the assignment under `set -e`.
MERGE_JOBID=""
if MERGE_OUT=$(CMRES_MERGE_PHASE=1 sbatch -p rosa_express.p --parsable \
    --dependency=afterok:"${RUN_JOBID}" \
    --time=00:30:00 \
    --array="1-${N_GRIDS}" \
    "${WORKER}" "$@"); then
  # Drop newlines and any ";<cluster>" suffix that --parsable may append.
  MERGE_JOBID=${MERGE_OUT//$'\n'/}
  MERGE_JOBID=${MERGE_JOBID%%;*}
fi
if [[ -z "${MERGE_JOBID}" ]]; then
  # The RUN array is already queued at this point; only the merge is missing.
  echo "ERROR: MERGE-phase sbatch failed. RUN job ${RUN_JOBID} continues." >&2
  echo " Run merge manually after RUN finishes:" >&2
  print_manual_merge_hint >&2
  exit 1
fi
echo " MERGE job id : ${MERGE_JOBID} (depends on afterok:${RUN_JOBID})"