From d4ad982bbd601c1363c3d03e2dce9aeeb4e219f3 Mon Sep 17 00:00:00 2001 From: Ling Yan <54994915+neilyan-msft@users.noreply.github.com> Date: Fri, 15 May 2026 03:40:09 -0700 Subject: [PATCH 1/2] Move slime + Megatron-LM sources to /opt for non-root AML jobs The slime curated image (slime-pytorch-2.9-cuda12.8:2) was unusable by Foundry/Vienna Training Block Slime SFT jobs running on AML/Singularity as uid 9000 (aiscuser). The Dockerfile cloned slime into /root/slime and Megatron-LM into /root/Megatron-LM, then pip installed each editably. Because /root is mode 700, the non-root job user could not: - read /root/slime/train.py or /root/slime/slime/__init__.py - resolve import slime (the editable .pth points into /root/slime) - pick up Megatron-LM via PYTHONPATH=/root/Megatron-LM - run python train.py ... (no readable entrypoint) Passing resources.dockerArgs="--user root" was not a viable workaround because the AML job still launches as aiscuser regardless. Relocate both editable trees to /opt: - Clone slime into /opt/slime and Megatron-LM into /opt/Megatron-LM. - Update the Megatron patch path, both pip install -e invocations, and the int4_qat in-tree install to use /opt/slime. - Update PYTHONPATH and WORKDIR to /opt. - chmod -R a+rX /opt/slime /opt/Megatron-LM so a defensive umask cannot strip world read+traverse from the source trees. Validation additions: - smoke_test.py now asserts that slime.__file__ resolves under /opt/slime, and that /opt/slime, /opt/slime/train.py, /opt/slime/slime/__init__.py, and /opt/Megatron-LM are world-readable and (for directories) world-traversable. - The build now also runs runuser -u nobody -- python against an importlib-based slime import check, giving end-to-end confidence that a non-root user can import slime before the image is published. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../context/Dockerfile | 35 +++++++++++++------ .../context/smoke_test.py | 20 +++++++++++ 2 files changed, 45 insertions(+), 10 deletions(-) diff --git a/assets/training/finetune_acft_hf_nlp/environments/slime-pytorch-2.9-cuda12.8/context/Dockerfile b/assets/training/finetune_acft_hf_nlp/environments/slime-pytorch-2.9-cuda12.8/context/Dockerfile index a226cb27fd..35e49c753f 100644 --- a/assets/training/finetune_acft_hf_nlp/environments/slime-pytorch-2.9-cuda12.8/context/Dockerfile +++ b/assets/training/finetune_acft_hf_nlp/environments/slime-pytorch-2.9-cuda12.8/context/Dockerfile @@ -1,7 +1,11 @@ FROM mcr.microsoft.com/azureml/openmpi5.0-cuda12.8-ubuntu24.04:{{latest-image-tag}} USER root -WORKDIR /root +# Editable sources live under /opt so AML/Singularity jobs (uid 9000) can +# read them. /root is mode 700 and would make `import slime`, the slime +# train.py entrypoint, and the Megatron-LM editable install unreachable +# from a non-root job user. 
+WORKDIR /opt ARG SLIME_COMMIT=9b50665190d70cefcc9cc42e5994ad4de5f0cd88 ARG PATCH_VERSION=latest @@ -14,7 +18,7 @@ ENV DEBIAN_FRONTEND=noninteractive ENV PIP_NO_CACHE_DIR=1 ENV PYTHONUNBUFFERED=1 ENV MAX_JOBS=1 -ENV PYTHONPATH=/root/Megatron-LM:${PYTHONPATH} +ENV PYTHONPATH=/opt/Megatron-LM:${PYTHONPATH} RUN set -eux; \ find /etc/apt -type f \( -name '*.list' -o -name '*.sources' \) \ @@ -47,8 +51,8 @@ RUN python -m pip install --no-cache-dir \ flash-linear-attention==0.4.1 \ tilelang -f https://tile-ai.github.io/whl/nightly/cu128/ -RUN git clone https://github.com/THUDM/slime.git /root/slime && \ - cd /root/slime && \ +RUN git clone https://github.com/THUDM/slime.git /opt/slime && \ + cd /opt/slime && \ git checkout ${SLIME_COMMIT} RUN NVCC_APPEND_FLAGS="--threads 1" \ @@ -57,10 +61,10 @@ RUN NVCC_APPEND_FLAGS="--threads 1" \ --config-settings "--build-option=--cpp_ext --cuda_ext --parallel 1" \ git+https://github.com/NVIDIA/apex.git@10417aceddd7d5d05d7cbf7b0fc2daad1105f8b4 -RUN git clone https://github.com/NVIDIA/Megatron-LM.git --recursive /root/Megatron-LM && \ - cd /root/Megatron-LM && \ +RUN git clone https://github.com/NVIDIA/Megatron-LM.git --recursive /opt/Megatron-LM && \ + cd /opt/Megatron-LM && \ git checkout ${MEGATRON_COMMIT} && \ - git apply /root/slime/docker/patch/${PATCH_VERSION}/megatron.patch --3way && \ + git apply /opt/slime/docker/patch/${PATCH_VERSION}/megatron.patch --3way && \ if grep -R -n '^<<<<<<< ' .; then \ echo "Megatron patch failed to apply cleanly." && \ exit 1; \ @@ -99,12 +103,23 @@ RUN LOG4J_VERSION=${LOG4J_VERSION} \ python /tmp/patch_ray_log4j.py && \ rm /tmp/patch_ray_log4j.py -RUN cd /root/slime && \ +RUN cd /opt/slime && \ python -m pip install --no-cache-dir -e . --no-deps -RUN cd /root/slime/slime/backends/megatron_utils/kernels/int4_qat && \ +RUN cd /opt/slime/slime/backends/megatron_utils/kernels/int4_qat && \ python -m pip install --no-cache-dir . 
--no-build-isolation +# AML/Singularity jobs run as uid 9000 (aiscuser). Explicitly grant +# world read + traverse on the editable slime and Megatron-LM trees so +# `import slime`, `python /opt/slime/train.py`, and the PYTHONPATH-based +# Megatron-LM import all succeed without requiring `--user root`. +RUN chmod -R a+rX /opt/slime /opt/Megatron-LM + COPY smoke_test.py /tmp/smoke_test.py RUN python /tmp/smoke_test.py && \ - rm /tmp/smoke_test.py + echo 'import importlib.util' > /tmp/slime_nonroot_check.py && \ + echo 'import slime' >> /tmp/slime_nonroot_check.py && \ + echo 'assert importlib.util.find_spec("slime") is not None' >> /tmp/slime_nonroot_check.py && \ + chmod a+r /tmp/slime_nonroot_check.py && \ + runuser -u nobody -- python /tmp/slime_nonroot_check.py && \ + rm /tmp/smoke_test.py /tmp/slime_nonroot_check.py diff --git a/assets/training/finetune_acft_hf_nlp/environments/slime-pytorch-2.9-cuda12.8/context/smoke_test.py b/assets/training/finetune_acft_hf_nlp/environments/slime-pytorch-2.9-cuda12.8/context/smoke_test.py index d5da6f87f9..b2d8e81849 100644 --- a/assets/training/finetune_acft_hf_nlp/environments/slime-pytorch-2.9-cuda12.8/context/smoke_test.py +++ b/assets/training/finetune_acft_hf_nlp/environments/slime-pytorch-2.9-cuda12.8/context/smoke_test.py @@ -16,6 +16,7 @@ LOG4J_ARTIFACTS = ("log4j-api", "log4j-core", "log4j-slf4j-impl") RAY_DIST_NAMES = ("ray_dist.jar", "ray__dist.jar") +EXPECTED_SLIME_ROOT = pathlib.Path("/opt/slime") def find_ray_dist() -> pathlib.Path: @@ -37,6 +38,25 @@ def find_ray_dist() -> pathlib.Path: assert sglang assert slime +# AML/Singularity jobs run as uid 9000 (aiscuser); /root is mode 700 so +# slime must be editable-installed from a non-/root location. Pin the +# expected location and verify world read+traverse on key files. 
+slime_path = pathlib.Path(slime.__file__).resolve() +assert EXPECTED_SLIME_ROOT in slime_path.parents, ( + f"slime resolved at {slime_path}, expected under {EXPECTED_SLIME_ROOT}" +) +for path in ( + EXPECTED_SLIME_ROOT, + EXPECTED_SLIME_ROOT / "train.py", + EXPECTED_SLIME_ROOT / "slime" / "__init__.py", + pathlib.Path("/opt/Megatron-LM"), +): + assert path.exists(), f"missing {path}" + mode = path.stat().st_mode & 0o777 + assert mode & 0o004, f"{path} not world-readable (mode={oct(mode)})" + if path.is_dir(): + assert mode & 0o001, f"{path} not world-traversable (mode={oct(mode)})" + ray_dist = find_ray_dist() for artifact in LOG4J_ARTIFACTS: properties_name = f"META-INF/maven/org.apache.logging.log4j/{artifact}/pom.properties" From 1be121dfb8d1fe408ea5ec37101e8ef7d1492261 Mon Sep 17 00:00:00 2001 From: Ling Yan <54994915+neilyan-msft@users.noreply.github.com> Date: Sat, 16 May 2026 10:12:53 -0700 Subject: [PATCH 2/2] Fix slime smoke test namespace path handling Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../context/smoke_test.py | 51 +++++++++++++++---- 1 file changed, 40 insertions(+), 11 deletions(-) diff --git a/assets/training/finetune_acft_hf_nlp/environments/slime-pytorch-2.9-cuda12.8/context/smoke_test.py b/assets/training/finetune_acft_hf_nlp/environments/slime-pytorch-2.9-cuda12.8/context/smoke_test.py index b2d8e81849..abcc69eefe 100644 --- a/assets/training/finetune_acft_hf_nlp/environments/slime-pytorch-2.9-cuda12.8/context/smoke_test.py +++ b/assets/training/finetune_acft_hf_nlp/environments/slime-pytorch-2.9-cuda12.8/context/smoke_test.py @@ -3,9 +3,9 @@ """Smoke-test the slime curated environment image.""" +import importlib.util import pathlib import zipfile - from packaging.version import Version import PIL import ray @@ -32,6 +32,38 @@ def find_ray_dist() -> pathlib.Path: return matches[0] +def assert_slime_resolves_under_expected_root() -> None: + """Verify slime resolves to the readable editable source tree.""" + 
spec = importlib.util.find_spec("slime") + assert spec is not None, "importlib cannot find slime" + + paths: list[pathlib.Path] = [] + slime_file = getattr(slime, "__file__", None) + if slime_file: + paths.append(pathlib.Path(slime_file).resolve()) + if spec.origin and spec.origin != "namespace": + paths.append(pathlib.Path(spec.origin).resolve()) + for location in spec.submodule_search_locations or (): + paths.append(pathlib.Path(location).resolve()) + for location in getattr(slime, "__path__", ()): + paths.append(pathlib.Path(location).resolve()) + + assert paths, "slime imported but no source paths were discoverable" + assert any( + path == EXPECTED_SLIME_ROOT or EXPECTED_SLIME_ROOT in path.parents + for path in paths + ), f"slime resolved at {paths}, expected under {EXPECTED_SLIME_ROOT}" + + +def assert_world_accessible(path: pathlib.Path) -> None: + """Verify a file/tree is readable by the non-root AML job user.""" + assert path.exists(), f"missing {path}" + mode = path.stat().st_mode & 0o777 + assert mode & 0o004, f"{path} not world-readable (mode={oct(mode)})" + if path.is_dir(): + assert mode & 0o001, f"{path} not world-traversable (mode={oct(mode)})" + + assert torch.cuda.is_available() or torch.version.cuda assert torch.__version__.startswith("2.9.1") assert Version(PIL.__version__) >= Version("12.2.0") @@ -41,21 +73,18 @@ def find_ray_dist() -> pathlib.Path: # AML/Singularity jobs run as uid 9000 (aiscuser); /root is mode 700 so # slime must be editable-installed from a non-/root location. Pin the # expected location and verify world read+traverse on key files. 
-slime_path = pathlib.Path(slime.__file__).resolve() -assert EXPECTED_SLIME_ROOT in slime_path.parents, ( - f"slime resolved at {slime_path}, expected under {EXPECTED_SLIME_ROOT}" -) +assert_slime_resolves_under_expected_root() for path in ( EXPECTED_SLIME_ROOT, EXPECTED_SLIME_ROOT / "train.py", - EXPECTED_SLIME_ROOT / "slime" / "__init__.py", + EXPECTED_SLIME_ROOT / "slime", pathlib.Path("/opt/Megatron-LM"), ): - assert path.exists(), f"missing {path}" - mode = path.stat().st_mode & 0o777 - assert mode & 0o004, f"{path} not world-readable (mode={oct(mode)})" - if path.is_dir(): - assert mode & 0o001, f"{path} not world-traversable (mode={oct(mode)})" + assert_world_accessible(path) + +slime_init = EXPECTED_SLIME_ROOT / "slime" / "__init__.py" +if slime_init.exists(): + assert_world_accessible(slime_init) ray_dist = find_ray_dist() for artifact in LOG4J_ARTIFACTS: