Skip to content

Commit dbdcc82

Browse files
committed
Reuse image sqsh file
Signed-off-by: EmmaQiaoCh <qqiao@nvidia.com>
1 parent 2148a3e commit dbdcc82

1 file changed

Lines changed: 47 additions & 9 deletions

File tree

jenkins/L0_Test.groovy

Lines changed: 47 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -532,7 +532,10 @@ def cleanUpSlurmResources(def pipeline, SlurmCluster cluster, String clusterName
532532
Utils.exec(pipeline, script: "echo Sleeping to allow Slurm job termination; sleep 30")
533533

534534
def cleanupCommands = [
535-
"rm -rf ${cluster.scratchPath}/users/svc_tensorrt/containers/container-${slurmJobID}.sqsh || true",
535+
// .sqsh is shared across jobs (named by a SHA-256 of the image URI), so age-prune
536+
// instead of deleting per job; reused images keep a refreshed mtime.
537+
"find ${cluster.scratchPath}/users/svc_tensorrt/containers -maxdepth 1 -name 'container-*.sqsh' -mtime +3 -delete 2>/dev/null || true",
538+
"find ${cluster.scratchPath}/users/svc_tensorrt/containers -maxdepth 1 \\( -name 'container-*.tmp' -o -name 'container-*.lock' \\) -mtime +1 -delete 2>/dev/null || true",
536539
"rm -rf ${jobWorkspace} || true",
537540
].join(" ; ")
538541
Utils.exec(
@@ -571,7 +574,10 @@ def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String clusterName,
571574
def entrypoint = SlurmConfig.containerRuntimeToEntrypoint[cluster.containerRuntime]
572575
def cleanupCommands = [
573576
"rm -rf /home/svc_tensorrt/bloom/scripts/agent-${nodeName}.jar /home/svc_tensorrt/bloom/scripts/${nodeName}-${entrypoint} || true",
574-
"rm -rf ${cluster.scratchPath}/users/svc_tensorrt/containers/container-${slurmJobID}.sqsh || true",
577+
// .sqsh is shared across jobs (named by a SHA-256 of the image URI), so age-prune
578+
// instead of deleting per job; reused images keep a refreshed mtime.
579+
"find ${cluster.scratchPath}/users/svc_tensorrt/containers -maxdepth 1 -name 'container-*.sqsh' -mtime +3 -delete 2>/dev/null || true",
580+
"find ${cluster.scratchPath}/users/svc_tensorrt/containers -maxdepth 1 \\( -name 'container-*.tmp' -o -name 'container-*.lock' \\) -mtime +1 -delete 2>/dev/null || true",
575581
].join(" ; ")
576582
Utils.exec(
577583
pipeline,
@@ -1153,37 +1159,69 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
11531159
def containerImageArg = container
11541160
def srunPrologue = ""
11551161
if (cluster.containerRuntime.toString() == "ENROOT") {
1156-
def enrootImagePath = "${cluster.scratchPath}/users/svc_tensorrt/containers/container-\${SLURM_JOB_ID}.sqsh"
1157-
containerImageArg = enrootImagePath
1162+
def containerDir = "${cluster.scratchPath}/users/svc_tensorrt/containers"
1163+
// Name the .sqsh by a SHA-256 of the image URI string (not job ID) so jobs sharing
1164+
// an image reuse one .sqsh instead of re-running `enroot import`
1165+
// per job. Path is resolved at runtime into ${enrootImagePath}.
1166+
containerImageArg = "\${enrootImagePath}"
11581167

11591168
srunPrologue = """
11601169
export ENROOT_CACHE_PATH='/home/svc_tensorrt/.cache/enroot'
11611170
1171+
containerDir="$containerDir"
1172+
mkdir -p "\$containerDir"
1173+
imageDigest=\$(printf '%s' "$container" | sha256sum | cut -d' ' -f1)
1174+
export enrootImagePath="\$containerDir/container-\${imageDigest}.sqsh"
1175+
11621176
importContainerWithRetries() {
11631177
local docker_uri=\$1
11641178
local output_path=\$2
11651179
local max_attempts=\${3:-3}
11661180
local delay=\${4:-60}
11671181
local attempt=1
1182+
local tmp_path
1183+
1184+
# Best-effort lock so racing jobs don't all import the same
1185+
# image. flock may be a no-op on some shared filesystems;
1186+
# correctness still holds since the import publishes atomically.
1187+
exec 9>"\${output_path}.lock" || true
1188+
flock 9 || true
1189+
1190+
if [ -f "\$output_path" ]
1191+
then
1192+
echo "Reusing cached container image: \$output_path"
1193+
# Refresh mtime so reused images survive age-based pruning.
1194+
touch "\$output_path" || true
1195+
flock -u 9 || true
1196+
return 0
1197+
fi
11681198
1169-
rm -f "\$output_path"
1199+
# Import to a temp path, then mv to publish atomically so
1200+
# other jobs never see a partial .sqsh.
1201+
tmp_path="\${output_path}.\${SLURM_JOB_ID}.tmp"
1202+
rm -f "\$tmp_path"
11701203
1171-
until enroot import -o "\$output_path" -- "docker://\$docker_uri"
1204+
until enroot import -o "\$tmp_path" -- "docker://\$docker_uri"
11721205
do
11731206
if ((attempt >= max_attempts))
11741207
then
11751208
echo "enroot import failed after \$max_attempts attempts"
1209+
rm -f "\$tmp_path"
1210+
flock -u 9 || true
11761211
return 1
11771212
fi
11781213
11791214
echo "enroot import failed (attempt \$attempt of \$max_attempts). Retrying in \${delay}s..."
1180-
rm -f "\$output_path"
1215+
rm -f "\$tmp_path"
11811216
sleep \$delay
1182-
((attempt++))
1217+
attempt=\$((attempt + 1))
11831218
done
1219+
1220+
mv -f "\$tmp_path" "\$output_path"
1221+
flock -u 9 || true
11841222
}
11851223
1186-
importContainerWithRetries "$container" "$enrootImagePath"
1224+
importContainerWithRetries "$container" "\$enrootImagePath"
11871225
""".replaceAll("(?m)^\\s*", "")
11881226
}
11891227

0 commit comments

Comments
 (0)