# Patch summary (descriptive header; text ahead of the first "diff --git"
# line is not part of any hunk).
#
# Files touched, both under
# assets/training/finetune_acft_hf_nlp/environments/slime-pytorch-2.9-cuda12.8/context/:
#
#   Dockerfile
#     - WORKDIR changes from /root to /opt.
#     - PYTHONPATH entry changes from /root/Megatron-LM to /opt/Megatron-LM.
#     - The slime and Megatron-LM clones, the megatron.patch path passed to
#       `git apply`, and the two `pip install` steps that `cd` into the slime
#       tree all move from /root/... to /opt/....
#     - Adds `RUN chmod -R a+rX /opt/slime /opt/Megatron-LM`.
#     - The smoke-test RUN step now also writes /tmp/slime_nonroot_check.py
#       (imports slime, asserts importlib can find it), runs it with
#       `runuser -u nobody`, and removes both temporary scripts afterwards.
#
#   smoke_test.py
#     - Adds `import importlib.util` and the constant
#       EXPECTED_SLIME_ROOT = pathlib.Path("/opt/slime").
#     - Adds assert_slime_resolves_under_expected_root(), which collects the
#       resolved paths from slime.__file__, the import spec origin, the spec's
#       submodule search locations and slime.__path__, and asserts at least
#       one of them is EXPECTED_SLIME_ROOT or lies beneath it.
#     - Adds assert_world_accessible(path), which asserts the path exists, has
#       the 0o004 (other-read) bit set, and for directories also the 0o001
#       (other-execute) bit.
#     - Adds top-level calls applying those checks to /opt/slime,
#       /opt/slime/train.py, /opt/slime/slime, /opt/Megatron-LM and, when it
#       exists, /opt/slime/slime/__init__.py.
#
# Rationale as stated in the patch's own comments: AML/Singularity jobs run
# as uid 9000 and /root is mode 700, so sources kept under /root are
# unreachable for a non-root job user.
#
# NOTE(review): this copy appears to have had its newlines collapsed (hunk
# headers sit mid-line); the original line structure must be restored
# before `git apply` / `patch` can consume it -- confirm against the source PR.
# NOTE(review): the first smoke_test.py hunk appears to drop the blank line
# between `import zipfile` and `from packaging.version import Version`, which
# merges the stdlib and third-party import groups -- confirm this is intended.
diff --git a/assets/training/finetune_acft_hf_nlp/environments/slime-pytorch-2.9-cuda12.8/context/Dockerfile b/assets/training/finetune_acft_hf_nlp/environments/slime-pytorch-2.9-cuda12.8/context/Dockerfile index a226cb27fd..35e49c753f 100644 --- a/assets/training/finetune_acft_hf_nlp/environments/slime-pytorch-2.9-cuda12.8/context/Dockerfile +++ b/assets/training/finetune_acft_hf_nlp/environments/slime-pytorch-2.9-cuda12.8/context/Dockerfile @@ -1,7 +1,11 @@ FROM mcr.microsoft.com/azureml/openmpi5.0-cuda12.8-ubuntu24.04:{{latest-image-tag}} USER root -WORKDIR /root +# Editable sources live under /opt so AML/Singularity jobs (uid 9000) can +# read them. /root is mode 700 and would make `import slime`, the slime +# train.py entrypoint, and the Megatron-LM editable install unreachable +# from a non-root job user. +WORKDIR /opt ARG SLIME_COMMIT=9b50665190d70cefcc9cc42e5994ad4de5f0cd88 ARG PATCH_VERSION=latest @@ -14,7 +18,7 @@ ENV DEBIAN_FRONTEND=noninteractive ENV PIP_NO_CACHE_DIR=1 ENV PYTHONUNBUFFERED=1 ENV MAX_JOBS=1 -ENV PYTHONPATH=/root/Megatron-LM:${PYTHONPATH} +ENV PYTHONPATH=/opt/Megatron-LM:${PYTHONPATH} RUN set -eux; \ find /etc/apt -type f \( -name '*.list' -o -name '*.sources' \) \ @@ -47,8 +51,8 @@ RUN python -m pip install --no-cache-dir \ flash-linear-attention==0.4.1 \ tilelang -f https://tile-ai.github.io/whl/nightly/cu128/ -RUN git clone https://github.com/THUDM/slime.git /root/slime && \ - cd /root/slime && \ +RUN git clone https://github.com/THUDM/slime.git /opt/slime && \ + cd /opt/slime && \ git checkout ${SLIME_COMMIT} RUN NVCC_APPEND_FLAGS="--threads 1" \ @@ -57,10 +61,10 @@ RUN NVCC_APPEND_FLAGS="--threads 1" \ --config-settings "--build-option=--cpp_ext --cuda_ext --parallel 1" \ git+https://github.com/NVIDIA/apex.git@10417aceddd7d5d05d7cbf7b0fc2daad1105f8b4 -RUN git clone https://github.com/NVIDIA/Megatron-LM.git --recursive /root/Megatron-LM && \ - cd /root/Megatron-LM && \ +RUN git clone https://github.com/NVIDIA/Megatron-LM.git 
--recursive /opt/Megatron-LM && \ + cd /opt/Megatron-LM && \ git checkout ${MEGATRON_COMMIT} && \ - git apply /root/slime/docker/patch/${PATCH_VERSION}/megatron.patch --3way && \ + git apply /opt/slime/docker/patch/${PATCH_VERSION}/megatron.patch --3way && \ if grep -R -n '^<<<<<<< ' .; then \ echo "Megatron patch failed to apply cleanly." && \ exit 1; \ @@ -99,12 +103,23 @@ RUN LOG4J_VERSION=${LOG4J_VERSION} \ python /tmp/patch_ray_log4j.py && \ rm /tmp/patch_ray_log4j.py -RUN cd /root/slime && \ +RUN cd /opt/slime && \ python -m pip install --no-cache-dir -e . --no-deps -RUN cd /root/slime/slime/backends/megatron_utils/kernels/int4_qat && \ +RUN cd /opt/slime/slime/backends/megatron_utils/kernels/int4_qat && \ python -m pip install --no-cache-dir . --no-build-isolation +# AML/Singularity jobs run as uid 9000 (aiscuser). Explicitly grant +# world read + traverse on the editable slime and Megatron-LM trees so +# `import slime`, `python /opt/slime/train.py`, and the PYTHONPATH-based +# Megatron-LM import all succeed without requiring `--user root`. 
+RUN chmod -R a+rX /opt/slime /opt/Megatron-LM + COPY smoke_test.py /tmp/smoke_test.py RUN python /tmp/smoke_test.py && \ - rm /tmp/smoke_test.py + echo 'import importlib.util' > /tmp/slime_nonroot_check.py && \ + echo 'import slime' >> /tmp/slime_nonroot_check.py && \ + echo 'assert importlib.util.find_spec("slime") is not None' >> /tmp/slime_nonroot_check.py && \ + chmod a+r /tmp/slime_nonroot_check.py && \ + runuser -u nobody -- python /tmp/slime_nonroot_check.py && \ + rm /tmp/smoke_test.py /tmp/slime_nonroot_check.py diff --git a/assets/training/finetune_acft_hf_nlp/environments/slime-pytorch-2.9-cuda12.8/context/smoke_test.py b/assets/training/finetune_acft_hf_nlp/environments/slime-pytorch-2.9-cuda12.8/context/smoke_test.py index d5da6f87f9..abcc69eefe 100644 --- a/assets/training/finetune_acft_hf_nlp/environments/slime-pytorch-2.9-cuda12.8/context/smoke_test.py +++ b/assets/training/finetune_acft_hf_nlp/environments/slime-pytorch-2.9-cuda12.8/context/smoke_test.py @@ -3,9 +3,9 @@ """Smoke-test the slime curated environment image.""" +import importlib.util import pathlib import zipfile - from packaging.version import Version import PIL import ray @@ -16,6 +16,7 @@ LOG4J_ARTIFACTS = ("log4j-api", "log4j-core", "log4j-slf4j-impl") RAY_DIST_NAMES = ("ray_dist.jar", "ray__dist.jar") +EXPECTED_SLIME_ROOT = pathlib.Path("/opt/slime") def find_ray_dist() -> pathlib.Path: @@ -31,12 +32,60 @@ def find_ray_dist() -> pathlib.Path: return matches[0] +def assert_slime_resolves_under_expected_root() -> None: + """Verify slime resolves to the readable editable source tree.""" + spec = importlib.util.find_spec("slime") + assert spec is not None, "importlib cannot find slime" + + paths: list[pathlib.Path] = [] + slime_file = getattr(slime, "__file__", None) + if slime_file: + paths.append(pathlib.Path(slime_file).resolve()) + if spec.origin and spec.origin != "namespace": + paths.append(pathlib.Path(spec.origin).resolve()) + for location in spec.submodule_search_locations or 
(): + paths.append(pathlib.Path(location).resolve()) + for location in getattr(slime, "__path__", ()): + paths.append(pathlib.Path(location).resolve()) + + assert paths, "slime imported but no source paths were discoverable" + assert any( + path == EXPECTED_SLIME_ROOT or EXPECTED_SLIME_ROOT in path.parents + for path in paths + ), f"slime resolved at {paths}, expected under {EXPECTED_SLIME_ROOT}" + + +def assert_world_accessible(path: pathlib.Path) -> None: + """Verify a file/tree is readable by the non-root AML job user.""" + assert path.exists(), f"missing {path}" + mode = path.stat().st_mode & 0o777 + assert mode & 0o004, f"{path} not world-readable (mode={oct(mode)})" + if path.is_dir(): + assert mode & 0o001, f"{path} not world-traversable (mode={oct(mode)})" + + assert torch.cuda.is_available() or torch.version.cuda assert torch.__version__.startswith("2.9.1") assert Version(PIL.__version__) >= Version("12.2.0") assert sglang assert slime +# AML/Singularity jobs run as uid 9000 (aiscuser); /root is mode 700 so +# slime must be editable-installed from a non-/root location. Pin the +# expected location and verify world read+traverse on key files. +assert_slime_resolves_under_expected_root() +for path in ( + EXPECTED_SLIME_ROOT, + EXPECTED_SLIME_ROOT / "train.py", + EXPECTED_SLIME_ROOT / "slime", + pathlib.Path("/opt/Megatron-LM"), +): + assert_world_accessible(path) + +slime_init = EXPECTED_SLIME_ROOT / "slime" / "__init__.py" +if slime_init.exists(): + assert_world_accessible(slime_init) + ray_dist = find_ray_dist() for artifact in LOG4J_ARTIFACTS: properties_name = f"META-INF/maven/org.apache.logging.log4j/{artifact}/pom.properties"