@@ -532,7 +532,10 @@ def cleanUpSlurmResources(def pipeline, SlurmCluster cluster, String clusterName
532532 Utils . exec(pipeline, script : " echo Sleeping to allow Slurm job termination; sleep 30" )
533533
534534 def cleanupCommands = [
535- " rm -rf ${ cluster.scratchPath} /users/svc_tensorrt/containers/container-${ slurmJobID} .sqsh || true" ,
535+ // .sqsh is shared across jobs (named by a hash of the image URI), so
536+ // age-prune instead of deleting per job; reused images keep a refreshed mtime.
537+ " find ${ cluster.scratchPath} /users/svc_tensorrt/containers -maxdepth 1 -name 'container-*.sqsh' -mtime +3 -delete 2>/dev/null || true" ,
538+ " find ${ cluster.scratchPath} /users/svc_tensorrt/containers -maxdepth 1 \\ ( -name 'container-*.tmp' -o -name 'container-*.lock' \\ ) -mtime +1 -delete 2>/dev/null || true" ,
536539 " rm -rf ${ jobWorkspace} || true" ,
537540 ]. join(" ; " )
538541 Utils . exec(
@@ -571,7 +574,10 @@ def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String clusterName,
571574 def entrypoint = SlurmConfig . containerRuntimeToEntrypoint[cluster. containerRuntime]
572575 def cleanupCommands = [
573576 " rm -rf /home/svc_tensorrt/bloom/scripts/agent-${ nodeName} .jar /home/svc_tensorrt/bloom/scripts/${ nodeName} -${ entrypoint} || true" ,
574- " rm -rf ${ cluster.scratchPath} /users/svc_tensorrt/containers/container-${ slurmJobID} .sqsh || true" ,
577+ // .sqsh is shared across jobs (named by a hash of the image URI), so
578+ // age-prune instead of deleting per job; reused images keep a refreshed mtime.
579+ " find ${ cluster.scratchPath} /users/svc_tensorrt/containers -maxdepth 1 -name 'container-*.sqsh' -mtime +3 -delete 2>/dev/null || true" ,
580+ " find ${ cluster.scratchPath} /users/svc_tensorrt/containers -maxdepth 1 \\ ( -name 'container-*.tmp' -o -name 'container-*.lock' \\ ) -mtime +1 -delete 2>/dev/null || true" ,
575581 ]. join(" ; " )
576582 Utils . exec(
577583 pipeline,
@@ -1153,37 +1159,69 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
11531159 def containerImageArg = container
11541160 def srunPrologue = " "
11551161 if (cluster. containerRuntime. toString() == " ENROOT" ) {
1156- def enrootImagePath = " ${ cluster.scratchPath} /users/svc_tensorrt/containers/container-\$ {SLURM_JOB_ID}.sqsh"
1157- containerImageArg = enrootImagePath
1162+ def containerDir = " ${ cluster.scratchPath} /users/svc_tensorrt/containers"
1163+ // Name the .sqsh by a SHA-256 of the image URI (not job ID) so jobs
1164+ // sharing a URI reuse one .sqsh instead of re-running `enroot import`
1165+ // per job. Path is resolved at runtime into ${enrootImagePath}.
1166+ containerImageArg = " \$ {enrootImagePath}"
11581167
11591168 srunPrologue = """
11601169 export ENROOT_CACHE_PATH='/home/svc_tensorrt/.cache/enroot'
11611170
1171+ containerDir="$containerDir "
1172+ mkdir -p "\$ containerDir"
1173+ imageDigest=\$ (printf '%s' "$container " | sha256sum | cut -d' ' -f1)
1174+ export enrootImagePath="\$ containerDir/container-\$ {imageDigest}.sqsh"
1175+
importContainerWithRetries() {
    # Make sure the squashfs image at the output path exists, importing it
    # from docker://<uri> with retries when it is missing.
    #   arg 1: docker URI, without the docker:// scheme
    #   arg 2: destination .sqsh path
    #   arg 3: maximum number of import attempts (default 3)
    #   arg 4: seconds to sleep between attempts (default 60)
    # Returns 0 when the image is available at the output path, and 1 when
    # every import attempt fails or the final publish step fails.
    local docker_uri=\$1
    local output_path=\$2
    local max_attempts=\${3:-3}
    local delay=\${4:-60}
    local attempt=1
    local tmp_path
    local rc=0

    # Best-effort lock so racing jobs don't all import the same
    # image. flock may be a no-op on some shared filesystems;
    # correctness still holds since the import publishes atomically.
    exec 9>"\${output_path}.lock" || true
    flock 9 || true

    if [ -f "\$output_path" ]
    then
        echo "Reusing cached container image: \$output_path"
        # Refresh mtime so reused images survive age-based pruning.
        touch "\$output_path" || true
        flock -u 9 || true
        return 0
    fi

    # Import to a temp path, then mv to publish atomically so
    # other jobs never see a partial .sqsh.
    tmp_path="\${output_path}.\${SLURM_JOB_ID}.tmp"
    rm -f "\$tmp_path"

    until enroot import -o "\$tmp_path" -- "docker://\$docker_uri"
    do
        if ((attempt >= max_attempts))
        then
            echo "enroot import failed after \$max_attempts attempts"
            rm -f "\$tmp_path"
            flock -u 9 || true
            return 1
        fi

        echo "enroot import failed (attempt \$attempt of \$max_attempts). Retrying in \${delay}s..."
        rm -f "\$tmp_path"
        sleep \$delay
        attempt=\$((attempt + 1))
    done

    # Capture the publish result: the unlock below always succeeds, so
    # without this a failed mv would be reported to the caller as success
    # even though no .sqsh exists at the output path.
    if ! mv -f "\$tmp_path" "\$output_path"
    then
        echo "Failed to publish container image to \$output_path"
        rm -f "\$tmp_path"
        rc=1
    fi

    flock -u 9 || true
    return \$rc
}
11851223
1186- importContainerWithRetries "$container " "$enrootImagePath "
1224+ importContainerWithRetries "$container " "\ $ enrootImagePath"
11871225 """ . replaceAll(" (?m)^\\ s*" , " " )
11881226 }
11891227
0 commit comments