File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change @@ -1090,7 +1090,13 @@ def get_container_flags(
10901090 sbatch_script = fill_template ("slurm.sh.j2" , vars_to_fill )
10911091
10921092 # For non-container mode, substitute /{RUNDIR_NAME} paths with actual job directory
1093- if self .executor .container_image is None :
1093+ # Check both top-level container_image and resource_group container images
1094+ has_container = self .executor .container_image is not None
1095+ if self .executor .run_as_group and self .executor .resource_group :
1096+ has_container = has_container or any (
1097+ rg .container_image is not None for rg in self .executor .resource_group
1098+ )
1099+ if not has_container :
10941100 actual_job_dir = os .path .join (slurm_job_dir , job_directory_name )
10951101 sbatch_script = sbatch_script .replace (f"/{ RUNDIR_NAME } " , actual_job_dir )
10961102
Original file line number Diff line number Diff line change @@ -33,7 +33,7 @@ export ENV_VAR=value
3333
3434# Command 1
3535
36- srun --output /root/sample_job/log-account-account.sample_job_%j_${SLURM_RESTART_COUNT:-0}.out --container-mounts /root/sample_job:/nemo_run --container-workdir /nemo_run/code --wait=60 --kill-on-bad-exit=1 cmd3 cmd4
36+ srun --output /root/sample_job/log-account-account.sample_job_%j_${SLURM_RESTART_COUNT:-0}.out --container-image test_image --container-mounts /root/sample_job:/nemo_run --container-workdir /nemo_run/code --wait=60 --kill-on-bad-exit=1 cmd3 cmd4
3737
3838exitcode=$?
3939
Original file line number Diff line number Diff line change @@ -62,7 +62,7 @@ echo "$SLURM_JOB_ID ${SLURM_RESTART_COUNT:-0} X" >> "$JOB_RESULTS_FILE"
6262
6363# Command 1
6464
65- srun --output /root/sample_job/log-account-account.sample_job_%j_${SLURM_RESTART_COUNT:-0}.out --container-mounts /root/sample_job:/nemo_run --container-workdir /nemo_run/code --wait=60 --kill-on-bad-exit=1 ft_launcher --ft-param-workload_check_interval 10 --ft-param-rank_heartbeat_timeout 10 --rdzv-backend c10d --rdzv-endpoint localhost:0 --rdzv-id 7680 --nnodes 1 --nproc-per-node 1 --node-rank 0 --tee 3 --no-python test_ft.sh
65+ srun --output /root/sample_job/log-account-account.sample_job_%j_${SLURM_RESTART_COUNT:-0}.out --container-image test_image --container-mounts /root/sample_job:/nemo_run --container-workdir /nemo_run/code --wait=60 --kill-on-bad-exit=1 ft_launcher --ft-param-workload_check_interval 10 --ft-param-rank_heartbeat_timeout 10 --rdzv-backend c10d --rdzv-endpoint localhost:0 --rdzv-id 7680 --nnodes 1 --nproc-per-node 1 --node-rank 0 --tee 3 --no-python test_ft.sh
6666
6767exitcode=$?
6868
Original file line number Diff line number Diff line change @@ -54,6 +54,7 @@ def dummy_slurm_request_with_artifact(
5454 account = "account" ,
5555 job_dir = "/root/sample_job" ,
5656 tunnel = LocalTunnel (job_dir = "/root" ),
57+ container_image = "test_image" ,
5758 )
5859 slurm_config .job_name = "sample_job"
5960 max_retries = 3
@@ -79,6 +80,7 @@ def ft_slurm_request_with_artifact(
7980 account = "account" ,
8081 job_dir = "/root/sample_job" ,
8182 tunnel = LocalTunnel (job_dir = "/root/" ),
83+ container_image = "test_image" ,
8284 )
8385 slurm_config .job_name = "sample_job"
8486 slurm_config .launcher = FaultTolerance (
You can’t perform that action at this time.
0 commit comments