Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
FROM mcr.microsoft.com/azureml/openmpi5.0-cuda12.8-ubuntu24.04:{{latest-image-tag}}

USER root
WORKDIR /root
# Editable sources live under /opt so AML/Singularity jobs (uid 9000) can
# read them. /root is mode 700 and would make `import slime`, the slime
# train.py entrypoint, and the Megatron-LM editable install unreachable
# from a non-root job user.
WORKDIR /opt

ARG SLIME_COMMIT=9b50665190d70cefcc9cc42e5994ad4de5f0cd88
ARG PATCH_VERSION=latest
Expand All @@ -14,7 +18,7 @@ ENV DEBIAN_FRONTEND=noninteractive
ENV PIP_NO_CACHE_DIR=1
ENV PYTHONUNBUFFERED=1
ENV MAX_JOBS=1
ENV PYTHONPATH=/root/Megatron-LM:${PYTHONPATH}
ENV PYTHONPATH=/opt/Megatron-LM:${PYTHONPATH}

RUN set -eux; \
find /etc/apt -type f \( -name '*.list' -o -name '*.sources' \) \
Expand Down Expand Up @@ -47,8 +51,8 @@ RUN python -m pip install --no-cache-dir \
flash-linear-attention==0.4.1 \
tilelang -f https://tile-ai.github.io/whl/nightly/cu128/

RUN git clone https://github.com/THUDM/slime.git /root/slime && \
cd /root/slime && \
RUN git clone https://github.com/THUDM/slime.git /opt/slime && \
cd /opt/slime && \
git checkout ${SLIME_COMMIT}

RUN NVCC_APPEND_FLAGS="--threads 1" \
Expand All @@ -57,10 +61,10 @@ RUN NVCC_APPEND_FLAGS="--threads 1" \
--config-settings "--build-option=--cpp_ext --cuda_ext --parallel 1" \
git+https://github.com/NVIDIA/apex.git@10417aceddd7d5d05d7cbf7b0fc2daad1105f8b4

RUN git clone https://github.com/NVIDIA/Megatron-LM.git --recursive /root/Megatron-LM && \
cd /root/Megatron-LM && \
RUN git clone https://github.com/NVIDIA/Megatron-LM.git --recursive /opt/Megatron-LM && \
cd /opt/Megatron-LM && \
git checkout ${MEGATRON_COMMIT} && \
git apply /root/slime/docker/patch/${PATCH_VERSION}/megatron.patch --3way && \
git apply /opt/slime/docker/patch/${PATCH_VERSION}/megatron.patch --3way && \
if grep -R -n '^<<<<<<< ' .; then \
echo "Megatron patch failed to apply cleanly." && \
exit 1; \
Expand Down Expand Up @@ -99,12 +103,23 @@ RUN LOG4J_VERSION=${LOG4J_VERSION} \
python /tmp/patch_ray_log4j.py && \
rm /tmp/patch_ray_log4j.py

RUN cd /root/slime && \
RUN cd /opt/slime && \
python -m pip install --no-cache-dir -e . --no-deps

RUN cd /root/slime/slime/backends/megatron_utils/kernels/int4_qat && \
RUN cd /opt/slime/slime/backends/megatron_utils/kernels/int4_qat && \
python -m pip install --no-cache-dir . --no-build-isolation

# AML/Singularity jobs run as uid 9000 (aiscuser). Explicitly grant
# world read + traverse on the editable slime and Megatron-LM trees so
# `import slime`, `python /opt/slime/train.py`, and the PYTHONPATH-based
# Megatron-LM import all succeed without requiring `--user root`.
# The capital `X` adds execute/search permission only to directories and
# to files that already have an execute bit set, so plain source files
# become readable without being marked executable.
RUN chmod -R a+rX /opt/slime /opt/Megatron-LM

COPY smoke_test.py /tmp/smoke_test.py
RUN python /tmp/smoke_test.py && \
rm /tmp/smoke_test.py
echo 'import importlib.util' > /tmp/slime_nonroot_check.py && \
echo 'import slime' >> /tmp/slime_nonroot_check.py && \
echo 'assert importlib.util.find_spec("slime") is not None' >> /tmp/slime_nonroot_check.py && \
chmod a+r /tmp/slime_nonroot_check.py && \
runuser -u nobody -- python /tmp/slime_nonroot_check.py && \
rm /tmp/smoke_test.py /tmp/slime_nonroot_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@

"""Smoke-test the slime curated environment image."""

import importlib.util
import pathlib
import zipfile

from packaging.version import Version
import PIL
import ray
Expand All @@ -16,6 +16,7 @@

LOG4J_ARTIFACTS = ("log4j-api", "log4j-core", "log4j-slf4j-impl")
RAY_DIST_NAMES = ("ray_dist.jar", "ray__dist.jar")
EXPECTED_SLIME_ROOT = pathlib.Path("/opt/slime")


def find_ray_dist() -> pathlib.Path:
Expand All @@ -31,12 +32,60 @@ def find_ray_dist() -> pathlib.Path:
return matches[0]


def assert_slime_resolves_under_expected_root() -> None:
    """Check that the imported ``slime`` package lives under EXPECTED_SLIME_ROOT.

    Candidate locations are gathered from the already-imported module
    (``__file__`` and ``__path__``) and from its import spec (``origin`` and
    ``submodule_search_locations``). Each candidate is resolved to an absolute
    path, and the check passes when at least one of them is
    EXPECTED_SLIME_ROOT itself or sits somewhere beneath it.

    Raises:
        AssertionError: if the spec cannot be found, if no candidate location
            could be collected, or if none of them is under the expected root.
    """
    module_spec = importlib.util.find_spec("slime")
    assert module_spec is not None, "importlib cannot find slime"

    # Collect the raw location strings first, in a fixed order, and resolve
    # them all in one pass afterwards.
    candidates = []
    module_file = getattr(slime, "__file__", None)
    if module_file:
        candidates.append(module_file)
    origin = module_spec.origin
    # The literal origin "namespace" is not a filesystem path, so skip it.
    if origin and origin != "namespace":
        candidates.append(origin)
    candidates.extend(module_spec.submodule_search_locations or ())
    candidates.extend(getattr(slime, "__path__", ()))

    paths = [pathlib.Path(candidate).resolve() for candidate in candidates]

    assert paths, "slime imported but no source paths were discoverable"
    # A path counts as "under" the root when the root equals the path or is
    # one of its ancestors.
    assert any(
        EXPECTED_SLIME_ROOT in (path, *path.parents) for path in paths
    ), f"slime resolved at {paths}, expected under {EXPECTED_SLIME_ROOT}"


def assert_world_accessible(path: pathlib.Path) -> None:
    """Verify that ``path`` is reachable and readable by an unprivileged user.

    Only the "other" permission bits are inspected; ACLs and ownership are
    not considered, and directories are not walked recursively.

    The check covers:
      * ``path`` exists;
      * ``path`` has the world-read bit set;
      * if ``path`` is a directory, it also has the world-execute (traverse)
        bit set;
      * every ancestor directory of the resolved path has the world-execute
        bit set. Without this, a world-readable file underneath a mode-700
        directory would pass even though a non-root user cannot reach it.

    Args:
        path: File or directory to check.

    Raises:
        AssertionError: if the path is missing or any of the permission
            requirements above is not met.
    """
    assert path.exists(), f"missing {path}"
    mode = path.stat().st_mode & 0o777
    assert mode & 0o004, f"{path} not world-readable (mode={oct(mode)})"
    if path.is_dir():
        assert mode & 0o001, f"{path} not world-traversable (mode={oct(mode)})"

    # Walk from the nearest parent up to the filesystem root: a single
    # ancestor without o+x makes everything beneath it unreachable.
    for ancestor in path.resolve().parents:
        ancestor_mode = ancestor.stat().st_mode & 0o777
        assert ancestor_mode & 0o001, (
            f"{ancestor} not world-traversable (mode={oct(ancestor_mode)}); "
            f"blocks access to {path}"
        )


# Baseline checks on the core packages shipped in the image.
assert torch.cuda.is_available() or torch.version.cuda
assert torch.__version__.startswith("2.9.1")
assert Version(PIL.__version__) >= Version("12.2.0")
assert sglang
assert slime

# AML/Singularity jobs execute as uid 9000 (aiscuser) and /root is mode 700,
# so the editable slime install has to live outside /root. Confirm the
# package resolves to the pinned location, then confirm the key entries
# carry world read (and, for directories, traverse) permission.
assert_slime_resolves_under_expected_root()
for path in [
    EXPECTED_SLIME_ROOT,
    EXPECTED_SLIME_ROOT.joinpath("train.py"),
    EXPECTED_SLIME_ROOT.joinpath("slime"),
    pathlib.Path("/opt/Megatron-LM"),
]:
    assert_world_accessible(path)

# The package __init__.py is only checked when it is present.
slime_init = EXPECTED_SLIME_ROOT.joinpath("slime", "__init__.py")
if slime_init.exists():
    assert_world_accessible(slime_init)

ray_dist = find_ray_dist()
for artifact in LOG4J_ARTIFACTS:
properties_name = f"META-INF/maven/org.apache.logging.log4j/{artifact}/pom.properties"
Expand Down
Loading