-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsample_schism_hbv3.yml
More file actions
62 lines (57 loc) · 3.56 KB
/
sample_schism_hbv3.yml
File metadata and controls
62 lines (57 loc) · 3.56 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# --- Azure resources -------------------------------------------------------
resource_group: schism_rg  # the resource group containing the batch account
job_name: simulation_scenario  # job name, will be used to name the pool and the job
batch_account_name: mybatchaccount  # batch account name
storage_account_name: mybatchassociatedstore  # storage account name
# This container is mounted to $AZ_BATCH_MOUNTS_DIR/<<storage_container_name>>,
# in addition to the batch container, which is mounted to $AZ_BATCH_MOUNTS_DIR/batch.
storage_container_name: test

# --- Study inputs ----------------------------------------------------------
study_copy_flags: --recursive --preserve-symlinks --exclude-regex ".*out.*nc"
study_dir: simulations/my_scenario  # path to simulation directory within storage_container_name
setup_dirs:  # supplementary directories copied alongside study_dir
  - hrrr  # meteorological forcing data directory

# --- Pool sizing -----------------------------------------------------------
# HBv3 (Standard_HB120rs_v3): 120 cores AMD EPYC Milan-X, 448 GiB, 3D V-Cache, NDR200 InfiniBand
# Milan-X has significantly larger L3 cache than Rome/Milan which benefits dense-mesh simulations.
# This template is specifically tuned for HBv3 with MVAPICH2; for HBv2 use sample_schism_mvapich2_2024apr.yml.
# vm_size must be declared exactly once: a repeated mapping key is invalid YAML
# and most parsers silently keep only the last occurrence.
# Commented-out alternatives: Standard_D16s_v3, Standard_D8as_v6, standard_hc44rs, Standard_HB120rs_v2
vm_size: "Standard_HB120rs_v3"  # the VM size for the pool
num_hosts: 2  # number of compute nodes in the pool
# num_cores: 240 # optional; defaults to num_hosts * cores_per_node from vm_core_map
# HBv3 has 120 physical cores, so num_cores = 2 * 120 = 240 by default
num_scribes: 10  # number of SCHISM scribes; use fewer for GEN_TVD-VL output variant
node_type: 'TargetDedicatedNodes'  # 'TargetDedicatedNodes' or 'TargetLowPriorityNodes' (spot/preemptible)
task_slots_per_node: 1

# --- Job lifecycle ---------------------------------------------------------
max_retry_count: 3  # retry a task up to this many times if it fails
delete_after_mins: 1000  # job and nodes auto-deleted after this many idle minutes
# Shell script run for the MPI task, stored as a literal block scalar so that
# newlines are preserved. The body must be indented deeper than the key,
# otherwise the scalar is empty and the script lines are a YAML syntax error.
# Tokens in braces ({mpi_opts}, {num_scribes}, {storage_account_name},
# {storage_container_name}, {study_dir}, {sas}) are presumably substituted by
# the submitting tool before execution — TODO confirm.
# NOTE(review): the first script line already enters sflux, rebuilds the links
# and returns with `cd ../`. The lines after it then run `rm -f *.nc`,
# `python make_links.py` and a second `cd ../` from the starting directory,
# which leaves the script at least one level above where it started before
# param.nml is checked and SCHISM is launched. This looks like two versions of
# the sflux link step merged together — confirm which one is intended.
mpi_command: |
  cd sflux; rm -f *.nc; python3 make_links_full.py; cd ../;
  rm -f *.nc;
  echo "------running make_links";
  python make_links.py;
  echo "------sflux links made";
  ls | head ;
  cd ../;
  echo "------checking param.nml link";
  ls -l param.nml;
  echo "------checking param.nml content";
  head -10 param.nml
  current_date=$(date);
  if [ -f $AZ_BATCH_TASK_ID.state.txt ]; then
  echo $current_date >> $AZ_BATCH_TASK_ID.state.txt;
  echo "Restarting from previous run: $current_date";
  iterations=$($SCHISM_SCRIPTS_HOME/batch/get_iterations_from_fluxout.sh);
  echo "flux.out @ $iterations. Copying hotstart_\d+_$iterations\.nc files";
  azcopy copy --include-regex="hotstart_\d+_$iterations\.nc" --recursive "https://{storage_account_name}.blob.core.windows.net/{storage_container_name}/{study_dir}/outputs/?{sas}" . || true;
  echo "Generating and linking hotstart for $iterations";
  $SCHISM_SCRIPTS_HOME/batch/generate_and_link_hotstart.sh $iterations;
  $SCHISM_SCRIPTS_HOME/batch/update_param_for_restart.sh param.nml;
  else
  echo $current_date > $AZ_BATCH_TASK_ID.state.txt;
  fi
  echo "------checking files with ls";
  ls;
  echo "------starting SCHISM";
  mpirun {mpi_opts} pschism_PREC_EVAP_GOTM_TVD-VL {num_scribes}
# The active executable above is pschism_PREC_EVAP_GOTM_TVD-VL (no GEN tracers).
# The GEN_TVD-VL variant includes GEN tracer transport (e.g. for salmon/nutrient simulations).
# If GEN tracers are needed, replace the mpirun line above with:
#   mpirun {mpi_opts} pschism_PREC_EVAP_GOTM_GEN_TVD-VL {num_scribes}
# template — Alma Linux 8.7 HPC image with MVAPICH2 optimized for HBv3 (Standard_HB120rs_v3)
# Use this when targeting HBv3 nodes with their larger on-chip L3 cache.
# NOTE(review): the comment above says "Alma Linux 8.7" while the template name
# below starts with "alma810" — confirm which image version this template uses.
template_name: "alma810_mvapich2_202505290_hbv3" # this is the template name for the pool. Found in azure_dms_batch/dmsbatch/templates