Skip to content

Commit 3fa9c34

Browse files
committed
fix
Signed-off-by: Hemil Desai <hemild@nvidia.com>
1 parent afe7f04 commit 3fa9c34

4 files changed

Lines changed: 11 additions & 3 deletions

File tree

nemo_run/core/execution/slurm.py

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1090,7 +1090,13 @@ def get_container_flags(
1090 | 1090 |   sbatch_script = fill_template("slurm.sh.j2", vars_to_fill)
1091 | 1091 |
1092 | 1092 |   # For non-container mode, substitute /{RUNDIR_NAME} paths with actual job directory
1093 |      | - if self.executor.container_image is None:
     | 1093 | + # Check both top-level container_image and resource_group container images
     | 1094 | + has_container = self.executor.container_image is not None
     | 1095 | + if self.executor.run_as_group and self.executor.resource_group:
     | 1096 | +     has_container = has_container or any(
     | 1097 | +         rg.container_image is not None for rg in self.executor.resource_group
     | 1098 | +     )
     | 1099 | + if not has_container:
1094 | 1100 |       actual_job_dir = os.path.join(slurm_job_dir, job_directory_name)
1095 | 1101 |       sbatch_script = sbatch_script.replace(f"/{RUNDIR_NAME}", actual_job_dir)
1096 | 1102 |
test/core/execution/artifacts/dummy_slurm.sh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -33,7 +33,7 @@ export ENV_VAR=value
33 | 33 |
34 | 34 |   # Command 1
35 | 35 |
36 |    | - srun --output /root/sample_job/log-account-account.sample_job_%j_${SLURM_RESTART_COUNT:-0}.out --container-mounts /root/sample_job:/nemo_run --container-workdir /nemo_run/code --wait=60 --kill-on-bad-exit=1 cmd3 cmd4
   | 36 | + srun --output /root/sample_job/log-account-account.sample_job_%j_${SLURM_RESTART_COUNT:-0}.out --container-image test_image --container-mounts /root/sample_job:/nemo_run --container-workdir /nemo_run/code --wait=60 --kill-on-bad-exit=1 cmd3 cmd4
37 | 37 |
38 | 38 |   exitcode=$?
39 | 39 |
test/core/execution/artifacts/ft_slurm.sh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -62,7 +62,7 @@ echo "$SLURM_JOB_ID ${SLURM_RESTART_COUNT:-0} X" >> "$JOB_RESULTS_FILE"
62 | 62 |
63 | 63 |   # Command 1
64 | 64 |
65 |    | - srun --output /root/sample_job/log-account-account.sample_job_%j_${SLURM_RESTART_COUNT:-0}.out --container-mounts /root/sample_job:/nemo_run --container-workdir /nemo_run/code --wait=60 --kill-on-bad-exit=1 ft_launcher --ft-param-workload_check_interval 10 --ft-param-rank_heartbeat_timeout 10 --rdzv-backend c10d --rdzv-endpoint localhost:0 --rdzv-id 7680 --nnodes 1 --nproc-per-node 1 --node-rank 0 --tee 3 --no-python test_ft.sh
   | 65 | + srun --output /root/sample_job/log-account-account.sample_job_%j_${SLURM_RESTART_COUNT:-0}.out --container-image test_image --container-mounts /root/sample_job:/nemo_run --container-workdir /nemo_run/code --wait=60 --kill-on-bad-exit=1 ft_launcher --ft-param-workload_check_interval 10 --ft-param-rank_heartbeat_timeout 10 --rdzv-backend c10d --rdzv-endpoint localhost:0 --rdzv-id 7680 --nnodes 1 --nproc-per-node 1 --node-rank 0 --tee 3 --no-python test_ft.sh
66 | 66 |
67 | 67 |   exitcode=$?
68 | 68 |
test/core/execution/test_slurm_templates.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -54,6 +54,7 @@ def dummy_slurm_request_with_artifact(
54 | 54 |       account="account",
55 | 55 |       job_dir="/root/sample_job",
56 | 56 |       tunnel=LocalTunnel(job_dir="/root"),
   | 57 | +     container_image="test_image",
57 | 58 |   )
58 | 59 |   slurm_config.job_name = "sample_job"
59 | 60 |   max_retries = 3
@@ -79,6 +80,7 @@ def ft_slurm_request_with_artifact(
79 | 80 |       account="account",
80 | 81 |       job_dir="/root/sample_job",
81 | 82 |       tunnel=LocalTunnel(job_dir="/root/"),
   | 83 | +     container_image="test_image",
82 | 84 |   )
83 | 85 |   slurm_config.job_name = "sample_job"
84 | 86 |   slurm_config.launcher = FaultTolerance(

0 commit comments

Comments
 (0)