diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 2b215cecad91..1eb0fb2bc737 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -37,16 +37,16 @@ deep_canary_mode = false [build] # Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image. # available frameworks - ["base", "vllm", "sglang", "autogluon", "huggingface_vllm", "huggingface_sglang", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"] -build_frameworks = [] +build_frameworks = ["pytorch"] # By default we build both training and inference containers. Set true/false values to determine which to build. build_training = true -build_inference = true +build_inference = false # Set do_build to "false" to skip builds and test the latest image built by this PR # Note: at least one build is required to set do_build to "false" -do_build = true +do_build = false [notify] ### Notify on test failures @@ -61,13 +61,13 @@ notify_test_failures = false use_new_test_structure = false ### On by default -sanity_tests = true -security_tests = true +sanity_tests = false +security_tests = false safety_check_test = false ecr_scan_allowlist_feature = false -ecs_tests = true -eks_tests = true -ec2_tests = true +ecs_tests = false +eks_tests = false +ec2_tests = false # Set it to true if you are preparing a Benchmark related PR ec2_benchmark_tests = false @@ -98,11 +98,11 @@ ipv6_vpc_name = "" # run standard sagemaker remote tests from test/sagemaker_tests sagemaker_remote_tests = true # run efa sagemaker tests -sagemaker_efa_tests = false +sagemaker_efa_tests = true # run release_candidate_integration tests -sagemaker_rc_tests = false +sagemaker_rc_tests = true # run sagemaker benchmark tests -sagemaker_benchmark_tests = false +sagemaker_benchmark_tests = true # SM remote EFA test instance type 
sagemaker_remote_efa_instance_type = "" @@ -124,7 +124,7 @@ nightly_pr_test_mode = false dlc-pr-base = "" # Standard Framework Training -dlc-pr-pytorch-training = "" +dlc-pr-pytorch-training = "pytorch/training/buildspec-2-10-sm.yml" dlc-pr-tensorflow-2-training = "" dlc-pr-autogluon-training = "" diff --git a/pytorch/training/buildspec-2-10-ec2.yml b/pytorch/training/buildspec-2-10-ec2.yml new file mode 100644 index 000000000000..292b7e686334 --- /dev/null +++ b/pytorch/training/buildspec-2-10-ec2.yml @@ -0,0 +1,75 @@ +account_id: &ACCOUNT_ID +prod_account_id: &PROD_ACCOUNT_ID 763104351884 +region: ®ION +framework: &FRAMEWORK pytorch +version: &VERSION 2.10.0 +short_version: &SHORT_VERSION "2.10" +arch_type: x86 +# autopatch_build: "True" + +repository_info: + training_repository: &TRAINING_REPOSITORY + image_type: &TRAINING_IMAGE_TYPE training + root: !join [ *FRAMEWORK, "/", *TRAINING_IMAGE_TYPE ] + repository_name: &REPOSITORY_NAME !join [ pr, "-", *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ] + repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ] + release_repository_name: &RELEASE_REPOSITORY_NAME !join [ *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ] + release_repository: &RELEASE_REPOSITORY !join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *RELEASE_REPOSITORY_NAME ] + +context: + training_context: &TRAINING_CONTEXT + start_cuda_compat: + source: docker/build_artifacts/start_cuda_compat.sh + target: start_cuda_compat.sh + dockerd_entrypoint: + source: docker/build_artifacts/dockerd_entrypoint.sh + target: dockerd_entrypoint.sh + changehostname: + source: docker/build_artifacts/changehostname.c + target: changehostname.c + start_with_right_hostname: + source: docker/build_artifacts/start_with_right_hostname.sh + target: start_with_right_hostname.sh + example_mnist_file: + source: docker/build_artifacts/mnist.py + target: mnist.py + deep_learning_container: + source: ../../src/deep_learning_container.py + target: 
deep_learning_container.py + setup_oss_compliance: + source: ../../scripts/setup_oss_compliance.sh + target: setup_oss_compliance.sh + +images: + BuildEC2CPUPTTrainPy3DockerImage: + <<: *TRAINING_REPOSITORY + build: &PYTORCH_CPU_TRAINING_PY3 false + image_size_baseline: 7200 + device_type: &DEVICE_TYPE cpu + python_version: &DOCKER_PYTHON_VERSION py3 + tag_python_version: &TAG_PYTHON_VERSION py313 + os_version: &OS_VERSION ubuntu22.04 + tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ] + latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ] + # skip_build: "False" + docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ] + target: ec2 + context: + <<: *TRAINING_CONTEXT + BuildEC2GPUPTTrainPy3cu130DockerImage: + <<: *TRAINING_REPOSITORY + build: &PYTORCH_GPU_TRAINING_PY3 false + image_size_baseline: 28000 + device_type: &DEVICE_TYPE gpu + python_version: &DOCKER_PYTHON_VERSION py3 + tag_python_version: &TAG_PYTHON_VERSION py313 + cuda_version: &CUDA_VERSION cu130 + os_version: &OS_VERSION ubuntu22.04 + tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ] + latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ] + # skip_build: "False" + docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., + *DEVICE_TYPE ] + target: ec2 + context: + <<: *TRAINING_CONTEXT diff --git a/pytorch/training/buildspec-2-10-sm.yml b/pytorch/training/buildspec-2-10-sm.yml new file mode 100644 index 000000000000..233ef153d7b1 --- /dev/null +++ b/pytorch/training/buildspec-2-10-sm.yml @@ -0,0 +1,75 @@ +account_id: &ACCOUNT_ID +prod_account_id: &PROD_ACCOUNT_ID 763104351884 +region: ®ION +framework: &FRAMEWORK pytorch +version: &VERSION 2.10.0 +short_version: 
&SHORT_VERSION "2.10" +arch_type: x86 +# autopatch_build: "True" + +repository_info: + training_repository: &TRAINING_REPOSITORY + image_type: &TRAINING_IMAGE_TYPE training + root: !join [ *FRAMEWORK, "/", *TRAINING_IMAGE_TYPE ] + repository_name: &REPOSITORY_NAME !join [ pr, "-", *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ] + repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ] + release_repository_name: &RELEASE_REPOSITORY_NAME !join [ *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ] + release_repository: &RELEASE_REPOSITORY !join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *RELEASE_REPOSITORY_NAME ] + +context: + training_context: &TRAINING_CONTEXT + start_cuda_compat: + source: docker/build_artifacts/start_cuda_compat.sh + target: start_cuda_compat.sh + dockerd_entrypoint: + source: docker/build_artifacts/dockerd_entrypoint.sh + target: dockerd_entrypoint.sh + changehostname: + source: docker/build_artifacts/changehostname.c + target: changehostname.c + start_with_right_hostname: + source: docker/build_artifacts/start_with_right_hostname.sh + target: start_with_right_hostname.sh + example_mnist_file: + source: docker/build_artifacts/mnist.py + target: mnist.py + deep_learning_container: + source: ../../src/deep_learning_container.py + target: deep_learning_container.py + setup_oss_compliance: + source: ../../scripts/setup_oss_compliance.sh + target: setup_oss_compliance.sh + +images: + BuildSageMakerCPUPTTrainPy3DockerImage: + <<: *TRAINING_REPOSITORY + build: &PYTORCH_CPU_TRAINING_PY3 false + image_size_baseline: 7200 + device_type: &DEVICE_TYPE cpu + python_version: &DOCKER_PYTHON_VERSION py3 + tag_python_version: &TAG_PYTHON_VERSION py313 + os_version: &OS_VERSION ubuntu22.04 + tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ] + latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ] + # skip_build: 
"False" + docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ] + target: sagemaker + context: + <<: *TRAINING_CONTEXT + BuildSageMakerGPUPTTrainPy3DockerImage: + <<: *TRAINING_REPOSITORY + build: &PYTORCH_GPU_TRAINING_PY3 false + image_size_baseline: 28000 + device_type: &DEVICE_TYPE gpu + python_version: &DOCKER_PYTHON_VERSION py3 + tag_python_version: &TAG_PYTHON_VERSION py313 + cuda_version: &CUDA_VERSION cu130 + os_version: &OS_VERSION ubuntu22.04 + tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ] + latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ] + # skip_build: "False" + docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., + *DEVICE_TYPE ] + target: sagemaker + context: + <<: *TRAINING_CONTEXT diff --git a/pytorch/training/docker/2.10/py3/Dockerfile.cpu b/pytorch/training/docker/2.10/py3/Dockerfile.cpu new file mode 100644 index 000000000000..5a2a921f698b --- /dev/null +++ b/pytorch/training/docker/2.10/py3/Dockerfile.cpu @@ -0,0 +1,319 @@ +ARG PYTHON=python3 +ARG PYTHON_VERSION=3.13.11 +ARG PYTHON_SHORT_VERSION=3.13 +ARG PYTORCH_VERSION=2.10.0 + +ARG OPEN_MPI_VERSION=4.1.7 + +ARG TORCHTNT_VERSION=0.2.4 +ARG TORCHDATA_VERSION=0.11.0 +ARG TORCHAUDIO_VERSION=2.10.0 +ARG TORCHVISION_VERSION=0.25.0 + +FROM ubuntu:22.04 AS base_image + +# This arg required to stop docker build waiting for region configuration while installing tz data from ubuntu 20 +ENV DEBIAN_FRONTEND=noninteractive +ENV LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" + +RUN apt-get update \ + && apt-get upgrade -y \ + && apt-get autoremove -y \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +################################################################# +# ____ +# / ___| ___ _ __ ___ _ __ ___ ___ _ __ +# | | / _ \| '_ ` _ \| 
'_ ` _ \ / _ \| '_ \ +# | |___ (_) | | | | | | | | | | | (_) | | | | +# \____|\___/|_| |_| |_|_| |_| |_|\___/|_| |_| +# ___ ____ _ +# |_ _|_ __ ___ __ _ __ _ ___ | _ \ ___ ___(_)_ __ ___ +# | || '_ ` _ \ / _` |/ _` |/ _ \ | |_) / _ \/ __| | '_ \ / _ \ +# | || | | | | | (_| | (_| | __/ | _ < __/ (__| | |_) | __/ +# |___|_| |_| |_|\__,_|\__, |\___| |_| \_\___|\___|_| .__/ \___| +# |___/ |_| +################################################################# + +FROM base_image AS common + +LABEL maintainer="Amazon AI" +LABEL dlc_major_version="1" + +ARG PYTHON +ARG PYTHON_VERSION +ARG PYTHON_SHORT_VERSION +ARG PYTORCH_VERSION +ARG TORCHTNT_VERSION +ARG TORCHDATA_VERSION +ARG TORCHAUDIO_VERSION +ARG TORCHVISION_VERSION + +ARG OPEN_MPI_VERSION + +ENV LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" +ENV LD_LIBRARY_PATH="/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}" + +# Python won't try to write .pyc or .pyo files on the import of source modules +# Force stdin, stdout and stderr to be totally unbuffered. 
Good for logging +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONUNBUFFERED=1 +ENV PYTHONIOENCODING=UTF-8 +ENV LANG=C.UTF-8 +ENV LC_ALL=C.UTF-8 + +ENV DLC_CONTAINER_TYPE=training +WORKDIR / + +RUN apt-get update \ + && apt-get -y upgrade --only-upgrade systemd \ + && apt-get install -y --no-install-recommends \ + automake \ + build-essential \ + ca-certificates \ + cmake \ + curl \ + emacs \ + git \ + jq \ + libcurl4-openssl-dev \ + libglib2.0-0 \ + libgl1-mesa-glx \ + libsm6 \ + libssl-dev \ + libxext6 \ + libxrender-dev \ + zlib1g-dev \ + unzip \ + vim \ + wget \ + libbz2-dev \ + libreadline-dev \ + libsqlite3-dev \ + llvm \ + libncurses5-dev \ + libncursesw5-dev \ + xz-utils \ + tk-dev \ + liblzma-dev \ + libffi-dev \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +# Install Open MPI +RUN wget https://www.open-mpi.org/software/ompi/v4.1/downloads/openmpi-${OPEN_MPI_VERSION}.tar.gz \ + && gunzip -c openmpi-${OPEN_MPI_VERSION}.tar.gz | tar xf - \ + && cd openmpi-${OPEN_MPI_VERSION} \ + && ./configure --prefix=/home/.openmpi \ + && make all install \ + && cd .. \ + && rm openmpi-${OPEN_MPI_VERSION}.tar.gz \ + && rm -rf openmpi-${OPEN_MPI_VERSION} + +# The ENV variables declared below are changed in the previous section +# Grouping these ENV variables in the first section causes +# ompi_info to fail. 
This is only observed in CPU containers +ENV PATH="/home/.openmpi/bin:${PATH}" +ENV LD_LIBRARY_PATH="/home/.openmpi/lib:${LD_LIBRARY_PATH}" +RUN ompi_info --parsable --all | grep mpi_built_with_cuda_support:value + +# Install OpenSSH for MPI to communicate between containers, allow OpenSSH to talk to containers without asking for confirmation +RUN apt-get update \ + && apt-get install -y --no-install-recommends openssh-client openssh-server \ + && mkdir -p /var/run/sshd \ + && cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new \ + && echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new \ + && mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +# Configure OpenSSH so that nodes can communicate with each other +RUN mkdir -p /var/run/sshd \ + && sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd + +RUN rm -rf /root/.ssh/ \ + && mkdir -p /root/.ssh/ \ + && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \ + && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \ + && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config + +# install python +RUN cd /tmp/ \ +&& wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz \ +&& tar xzf Python-${PYTHON_VERSION}.tgz \ +&& cd Python-${PYTHON_VERSION} \ +&& ./configure --enable-optimizations --with-lto --with-computed-gotos --with-system-ffi \ +&& make -j "$(nproc)" \ +&& make altinstall \ +&& cd .. 
\ +&& rm -rf Python-${PYTHON_VERSION} \ +&& rm Python-${PYTHON_VERSION}.tgz \ +&& ln -s /usr/local/bin/python${PYTHON_SHORT_VERSION} /usr/local/bin/python \ +&& ln -s /usr/local/bin/python${PYTHON_SHORT_VERSION} /usr/local/bin/python3 \ +# This installation generates a .python_history file in the root directory, which leads the sanity check to fail +&& rm -f /root/.python_history + +# Python Path +ENV PATH="/usr/local/bin:${PATH}" + +# this will add pip systemlink to pip${PYTHON_SHORT_VERSION} +RUN python -m pip install --upgrade pip --trusted-host pypi.org --trusted-host files.pythonhosted.org + +# Install common packages +RUN pip install --no-cache-dir \ + cython \ + cryptography \ + pyOpenSSL \ + pybind11 \ + mkl \ + mkl-include \ + parso \ + typing \ + charset-normalizer \ + packaging \ + boto3 \ + PyYAML \ + numpy \ + scipy \ + click \ + psutil \ + ipython \ + ipykernel \ + pillow \ + h5py \ + fsspec \ + "idna>=3.7" \ + "tqdm>=4.66.3" \ + "requests>=2.32.0" \ + "setuptools>=80.10.1" \ + "urllib3>=2.5.0" \ + "awscli" \ + opencv-python==4.11.0.86 \ + mpi4py \ + "jinja2>=3.1.6" \ + "tornado>=6.5.1" \ + "filelock>=3.20.1" \ + pytz \ + tzdata + +# Install PyTorch +RUN pip install --no-cache-dir -U torch==${PYTORCH_VERSION} \ + torchvision==${TORCHVISION_VERSION} \ + torchaudio==${TORCHAUDIO_VERSION} \ + --index-url https://download.pytorch.org/whl/cpu \ + && pip install --no-cache-dir -U torchtnt==${TORCHTNT_VERSION} \ + torchdata==${TORCHDATA_VERSION} \ + s3torchconnector \ + fastai \ + accelerate \ + # pin numpy requirement for fastai dependency + # requires explicit declaration of spacy, thinc, blis + spacy \ + thinc \ + blis \ + numpy \ + && pip uninstall -y dataclasses + +RUN curl -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-2.10/license.txt + +COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py + +RUN chmod +x /usr/local/bin/deep_learning_container.py + +COPY bash_telemetry.sh /usr/local/bin/bash_telemetry.sh +RUN chmod +x
/usr/local/bin/bash_telemetry.sh +RUN echo 'source /usr/local/bin/bash_telemetry.sh' >> /etc/bash.bashrc + +# Removing the cache as it is needed for security verification +RUN rm -rf /root/.cache | true + +######################################################## +# _____ ____ ____ ___ +# | ____/ ___|___ \ |_ _|_ __ ___ __ _ __ _ ___ +# | _|| | __) | | || '_ ` _ \ / _` |/ _` |/ _ \ +# | |__| |___ / __/ | || | | | | | (_| | (_| | __/ +# |_____\____|_____| |___|_| |_| |_|\__,_|\__, |\___| +# |___/ +# ____ _ +# | _ \ ___ ___(_)_ __ ___ +# | |_) / _ \/ __| | '_ \ / _ \ +# | _ < __/ (__| | |_) | __/ +# |_| \_\___|\___|_| .__/ \___| +# |_| +######################################################## + +FROM common AS ec2 + +ARG PYTHON + +WORKDIR / + +COPY setup_oss_compliance.sh setup_oss_compliance.sh +RUN bash setup_oss_compliance.sh ${PYTHON} && rm setup_oss_compliance.sh + +COPY dockerd_entrypoint.sh /usr/local/bin/dockerd_entrypoint.sh +RUN chmod +x /usr/local/bin/dockerd_entrypoint.sh +ENTRYPOINT ["bash", "-m", "dockerd_entrypoint.sh"] + +# Starts framework +CMD ["/bin/bash"] + +################################################################# +# ____ __ __ _ +# / ___| __ _ __ _ ___| \/ | __ _| | _____ _ __ +# \___ \ / _` |/ _` |/ _ \ |\/| |/ _` | |/ / _ \ '__| +# ___) | (_| | (_| | __/ | | | (_| | < __/ | +# |____/ \__,_|\__, |\___|_| |_|\__,_|_|\_\___|_| +# |___/ +# ___ ____ _ +# |_ _|_ __ ___ __ _ __ _ ___ | _ \ ___ ___(_)_ __ ___ +# | || '_ ` _ \ / _` |/ _` |/ _ \ | |_) / _ \/ __| | '_ \ / _ \ +# | || | | | | | (_| | (_| | __/ | _ < __/ (__| | |_) | __/ +# |___|_| |_| |_|\__,_|\__, |\___| |_| \_\___|\___|_| .__/ \___| +# |___/ |_| +################################################################# + +FROM common AS sagemaker + +ARG PYTHON + +LABEL maintainer="Amazon AI" +LABEL dlc_major_version="1" + +ENV SAGEMAKER_TRAINING_MODULE=sagemaker_pytorch_container.training:main + +WORKDIR / + +# Install SM packages +# Updated for SageMaker SDK v3 compatibility +# Note: 
sagemaker-experiments removed as it's deprecated and merged into sagemaker>=3 +RUN pip install --no-cache-dir -U \ + smclarify \ + "sagemaker>=3.0.0" \ + sagemaker-pytorch-training \ + sagemaker-training + +# Install extra packages +RUN pip install --no-cache-dir -U \ + bokeh \ + imageio \ + numba \ + pandas \ + plotly \ + scikit-learn \ + seaborn \ + shap \ + cloudpickle + +# Copy workaround script for incorrect hostname +COPY changehostname.c / +COPY start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh + +RUN chmod +x /usr/local/bin/start_with_right_hostname.sh + +COPY setup_oss_compliance.sh setup_oss_compliance.sh +RUN bash setup_oss_compliance.sh ${PYTHON} && rm setup_oss_compliance.sh + +ENTRYPOINT ["bash", "-m", "start_with_right_hostname.sh"] +CMD ["/bin/bash"] diff --git a/pytorch/training/docker/2.10/py3/Dockerfile.sagemaker.cpu.py_scan_allowlist.json b/pytorch/training/docker/2.10/py3/Dockerfile.sagemaker.cpu.py_scan_allowlist.json new file mode 100644 index 000000000000..16e44d8f5a81 --- /dev/null +++ b/pytorch/training/docker/2.10/py3/Dockerfile.sagemaker.cpu.py_scan_allowlist.json @@ -0,0 +1,12 @@ +{ + "85151": "[pkg: protobuf] advisory='Affected versions of the protobuf package are vulnerable to Denial of Service (DoS) due to missing recursion depth accounting that allows the max_recursion_depth limit to be bypassed. The google.protobuf.json_format.ParseDict() parser fails to increment or enforce max_recursion_depth when traversing nested google.protobuf.Any messages in its internal Any-handling logic, allowing attacker-controlled JSON to recurse far deeper than intended.', reason_to_ignore='N/A', spec='<=6.33.4'", + "71577": "[pkg: mlflow] Required by sagemaker. 
advisory='Deserialization of untrusted data can occur in versions of the MLflow platform running version 1.1.0 or newer, enabling a maliciously uploaded scikit-learn model to run arbitrary code on an end user's system when interacted with.', reason_to_ignore='N/A', spec='>=1.1.0'", + "71578": "[pkg: mlflow] Required by sagemaker. advisory='Deserialization of untrusted data can occur in versions of the MLflow platform running version 1.1.0 or newer, enabling a maliciously uploaded scikit-learn model to run arbitrary code on an end user's system when interacted with.', reason_to_ignore='N/A', spec='>=1.1.0'", + "71579": "[pkg: mlflow] Required by sagemaker. advisory='Deserialization of untrusted data can occur in versions of the MLflow platform running version 1.27.0 or newer, enabling a maliciously crafted Recipe to execute arbitrary code on an end user's system when run.', reason_to_ignore='N/A', spec='>=1.27.0'", + "71584": "[pkg: mlflow] Required by sagemaker. advisory='Deserialization of untrusted data can occur in versions of the MLflow platform affected versions, enabling a maliciously uploaded LightGBM scikit-learn model to run arbitrary code on an end user's system when interacted with.', reason_to_ignore='N/A', spec='>=1.23.0'", + "71587": "[pkg: mlflow] Required by sagemaker. advisory='Deserialization of untrusted data can occur in affected versions of the MLflow platform, enabling a maliciously uploaded PyFunc model to run arbitrary code on an end user's system when interacted with.', reason_to_ignore='N/A', spec='>=0.9.0'", + "71691": "[pkg: mlflow] Required by sagemaker. advisory='Deserialization of untrusted data can occur in versions of the MLflow platform running version 1.27.0 or newer, enabling a maliciously crafted Recipe to execute arbitrary code on an end user's system when run.', reason_to_ignore='N/A', spec='>=1.27.0'", + "71692": "[pkg: mlflow] Required by sagemaker. 
advisory='Deserialization of untrusted data can occur in affected versions of the MLflow platform, enabling a maliciously uploaded Tensorflow model to run arbitrary code on an end user's system when interacted with.', reason_to_ignore='N/A', spec='>=2.0.0rc0'", + "71693": "[pkg: mlflow] Required by sagemaker. advisory='Deserialization of untrusted data can occur in affected versions of the MLflow platform, enabling a maliciously uploaded pmdarima model to run arbitrary code on an end user's system when interacted with.', reason_to_ignore='N/A', spec='>=1.24.0'", + "71782": "[pkg: skops] advisory='Deserialization of untrusted data can occur in versions of the skops library, enabling a maliciously crafted model to run arbitrary code on an end user's system when loaded.', reason_to_ignore='CVE-2024-37065, pre-existing vulnerability in skops dependency', spec='>=0'" +} diff --git a/pytorch/training/docker/2.10/py3/cu130/Dockerfile.gpu b/pytorch/training/docker/2.10/py3/cu130/Dockerfile.gpu new file mode 100644 index 000000000000..d621e237422f --- /dev/null +++ b/pytorch/training/docker/2.10/py3/cu130/Dockerfile.gpu @@ -0,0 +1,290 @@ +ARG PYTHON=python3 +ARG PYTHON_VERSION=3.13.11 +ARG PYTHON_SHORT_VERSION=3.13 +ARG PYTORCH_VERSION=2.10.0 +ARG TORCHTNT_VERSION=0.2.4 +ARG TORCHAUDIO_VERSION=2.10.0 +ARG TORCHVISION_VERSION=0.25.0 +ARG TORCHDATA_VERSION=0.11.0 + +ARG GDRCOPY_VERSION=2.5.1 +ARG TE_VERSION=2.11 +ARG FLASH_ATTN_VERSION=2.8.3 + +################################################################# +# ____ +# / ___| ___ _ __ ___ _ __ ___ ___ _ __ +# | | / _ \| '_ ` _ \| '_ ` _ \ / _ \| '_ \ +# | |___ (_) | | | | | | | | | | | (_) | | | | +# \____|\___/|_| |_| |_|_| |_| |_|\___/|_| |_| +# ___ ____ _ +# |_ _|_ __ ___ __ _ __ _ ___ | _ \ ___ ___(_)_ __ ___ +# | || '_ ` _ \ / _` |/ _` |/ _ \ | |_) / _ \/ __| | '_ \ / _ \ +# | || | | | | | (_| | (_| | __/ | _ < __/ (__| | |_) | __/ +# |___|_| |_| |_|\__,_|\__, |\___| |_| \_\___|\___|_| .__/ \___| +# |___/ |_| 
+################################################################# + +FROM public.ecr.aws/deep-learning-containers/base:13.0.2-gpu-py313-cu130-ubuntu22.04-ec2 AS common +# base has EFA, PYTHON and CUDA 13.0 + +LABEL maintainer="Amazon AI" +LABEL dlc_major_version="1" + +ARG PYTHON +ARG PYTORCH_VERSION +ARG TORCHDATA_VERSION +ARG TORCHAUDIO_VERSION +ARG TORCHVISION_VERSION +ARG TORCHTNT_VERSION +ARG TE_VERSION +ARG FLASH_ATTN_VERSION +ARG GDRCOPY_VERSION + +ENV CUDA_HOME="/usr/local/cuda" +ENV PATH="${CUDA_HOME}/bin:${PATH}" +ENV EFA_PATH="/opt/amazon/efa" +ENV OPEN_MPI_PATH="/opt/amazon/openmpi" + +# Python won't try to write .pyc or .pyo files on the import of source modules +# Force stdin, stdout and stderr to be totally unbuffered. Good for logging +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONUNBUFFERED=1 +ENV PYTHONIOENCODING=UTF-8 +ENV LANG=C.UTF-8 +ENV LC_ALL=C.UTF-8 + +ENV TORCH_NVCC_FLAGS="-Xfatbin -compress-all" + +ENV DLC_CONTAINER_TYPE=training +WORKDIR / + +RUN apt-get update \ + && apt-get -y upgrade --only-upgrade systemd \ + && apt-get install -y --allow-change-held-packages --no-install-recommends \ + libgl1-mesa-glx \ + build-essential \ + ca-certificates \ + zlib1g-dev \ + openssl \ + python3-dev \ + pkg-config \ + check \ + llvm \ + xz-utils \ + && rm -rf /var/lib/apt/lists/* \ + && apt-get clean + +ENV PATH="${OPEN_MPI_PATH}/bin:${EFA_PATH}/bin:${PATH}" +ENV LD_LIBRARY_PATH="/usr/local/lib:/opt/amazon/ofi-nccl/lib:/opt/amazon/openmpi/lib:/opt/amazon/efa/lib:/usr/local/cuda/lib64:${LD_LIBRARY_PATH}" +# Python Path +ENV PATH="/usr/local/bin:${PATH}" + +# Install common conda packages +RUN pip install --no-cache-dir \ + cython \ + cryptography \ + pyOpenSSL \ + pybind11 \ + mkl \ + mkl-include \ + parso \ + typing \ + charset-normalizer \ + packaging \ + PyYAML \ + numpy \ + scipy \ + click \ + psutil \ + ipython \ + ipykernel \ + pillow \ + h5py \ + fsspec \ + "idna>=3.7" \ + "tqdm>=4.66.3" \ + "requests>=2.32.0" \ + "setuptools>=80.10.1" \ + 
"urllib3>=2.5.0" \ + ninja \ + opencv-python==4.11.0.86 \ + mpi4py \ + "jinja2>=3.1.6" \ + "tornado>=6.5.1" \ + "filelock>=3.20.1" \ + pytz \ + tzdata + +# Install PyTorch +RUN pip install --no-cache-dir -U torch==${PYTORCH_VERSION} \ + torchvision==${TORCHVISION_VERSION} \ + torchaudio==${TORCHAUDIO_VERSION} \ + --index-url https://download.pytorch.org/whl/cu130 \ + && pip install --no-cache-dir -U torchtnt==${TORCHTNT_VERSION} \ + torchdata==${TORCHDATA_VERSION} \ + triton \ + s3torchconnector \ + fastai \ + accelerate \ + # pin numpy requirement for fastai dependency + # requires explicit declaration of spacy, thinc, blis + spacy \ + thinc \ + blis \ + numpy \ + && pip uninstall -y dataclasses + +# Install flash attn and NVIDIA transformer engine. +# Optionally set NVTE_FRAMEWORK to avoid bringing in additional frameworks during TE install +ENV NVTE_FRAMEWORK=pytorch + +# Install flash-attn using instructions from https://github.com/Dao-AILab/flash-attention#installation-and-features +# Set MAX_JOBS=4 to avoid OOM issues in installation process +RUN MAX_JOBS=4 pip install --no-cache-dir flash-attn==${FLASH_ATTN_VERSION} --no-build-isolation + +RUN pip install --no-cache-dir nvidia-mathdx + +# Install TE using instructions from https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/installation.html +RUN pip install --no-cache-dir git+https://github.com/NVIDIA/TransformerEngine.git@release_v${TE_VERSION} --no-build-isolation + +RUN curl -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-2.10/license.txt + +COPY start_cuda_compat.sh /usr/local/bin/start_cuda_compat.sh +RUN chmod +x /usr/local/bin/start_cuda_compat.sh + +COPY bash_telemetry.sh /usr/local/bin/bash_telemetry.sh +RUN chmod +x /usr/local/bin/bash_telemetry.sh +RUN echo 'source /usr/local/bin/bash_telemetry.sh' >> /etc/bash.bashrc + +# Install GDRCopy which is a dependency of SM Distributed DataParallel binary +# The test binaries require cuda driver library which could be found
in conda +# So update the linker path to point to it to avoid -Lcuda not found +RUN cd /tmp \ + && git clone https://github.com/NVIDIA/gdrcopy.git -b v${GDRCOPY_VERSION} \ + && cd gdrcopy \ + && sed -ie '13s@$@ -L $(CUDA)/lib64/stubs@' tests/Makefile \ + && CUDA=${CUDA_HOME} make install \ + && rm -rf /tmp/gdrcopy + +# Install common packages used by both EC2 and SageMaker +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + curl \ + wget \ + git \ + jq \ + emacs \ + vim \ + unzip \ + && rm -rf /var/lib/apt/lists/* \ + && apt-get clean + +# Removing the cache as it is needed for security verification +RUN rm -rf /root/.cache | true + +######################################################## +# _____ ____ ____ ___ +# | ____/ ___|___ \ |_ _|_ __ ___ __ _ __ _ ___ +# | _|| | __) | | || '_ ` _ \ / _` |/ _` |/ _ \ +# | |__| |___ / __/ | || | | | | | (_| | (_| | __/ +# |_____\____|_____| |___|_| |_| |_|\__,_|\__, |\___| +# |___/ +# ____ _ +# | _ \ ___ ___(_)_ __ ___ +# | |_) / _ \/ __| | '_ \ / _ \ +# | _ < __/ (__| | |_) | __/ +# |_| \_\___|\___|_| .__/ \___| +# |_| +######################################################## + +FROM common AS ec2 + +ARG PYTHON + +WORKDIR / + +COPY dockerd_entrypoint.sh /usr/local/bin/dockerd_entrypoint.sh +RUN chmod +x /usr/local/bin/dockerd_entrypoint.sh + +COPY setup_oss_compliance.sh setup_oss_compliance.sh +RUN bash setup_oss_compliance.sh ${PYTHON} && rm setup_oss_compliance.sh + +# This arg required to stop docker build waiting for region configuration while installing tz data from ubuntu 20 +ENV DEBIAN_FRONTEND=noninteractive + +RUN apt-get update \ + && apt-get upgrade -y \ + && apt-get autoremove -y \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +ENTRYPOINT ["bash", "-m", "dockerd_entrypoint.sh"] +CMD ["/bin/bash"] + +################################################################# +# ____ __ __ _ +# / ___| __ _ __ _ ___| \/ | __ _| | _____ _ __ +# \___ \ / _` |/ _` |/ _ \ |\/| |/ _` | |/ / _ \ 
'__| +# ___) | (_| | (_| | __/ | | | (_| | < __/ | +# |____/ \__,_|\__, |\___|_| |_|\__,_|_|\_\___|_| +# |___/ +# ___ ____ _ +# |_ _|_ __ ___ __ _ __ _ ___ | _ \ ___ ___(_)_ __ ___ +# | || '_ ` _ \ / _` |/ _` |/ _ \ | |_) / _ \/ __| | '_ \ / _ \ +# | || | | | | | (_| | (_| | __/ | _ < __/ (__| | |_) | __/ +# |___|_| |_| |_|\__,_|\__, |\___| |_| \_\___|\___|_| .__/ \___| +# |___/ |_| +################################################################# + +FROM common AS sagemaker + +LABEL maintainer="Amazon AI" +LABEL dlc_major_version="1" + +ENV SAGEMAKER_TRAINING_MODULE=sagemaker_pytorch_container.training:main + +ARG PYTHON + +WORKDIR / + +# Install SM packages +# Updated for SageMaker SDK v3 compatibility +# Note: sagemaker-experiments removed as it's deprecated and merged into sagemaker>=3 +RUN pip install --no-cache-dir -U \ + smclarify \ + "sagemaker>=3.0.0" \ + sagemaker-pytorch-training \ + sagemaker-training + +# Install extra packages +RUN pip install --no-cache-dir -U \ + bokeh \ + imageio \ + numba \ + pandas \ + plotly \ + shap \ + scikit-learn \ + seaborn \ + cloudpickle + +COPY setup_oss_compliance.sh setup_oss_compliance.sh +RUN bash setup_oss_compliance.sh ${PYTHON} && rm setup_oss_compliance.sh + +# Copy workaround script for incorrect hostname +COPY changehostname.c / +COPY start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh +RUN chmod +x /usr/local/bin/start_with_right_hostname.sh + +# This arg required to stop docker build waiting for region configuration while installing tz data from ubuntu 20 +ENV DEBIAN_FRONTEND=noninteractive + +RUN apt-get update \ + && apt-get upgrade -y \ + && apt-get autoremove -y \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +ENTRYPOINT ["bash", "-m", "start_with_right_hostname.sh"] +CMD ["/bin/bash"] diff --git a/pytorch/training/docker/2.10/py3/cu130/Dockerfile.sagemaker.gpu.py_scan_allowlist.json b/pytorch/training/docker/2.10/py3/cu130/Dockerfile.sagemaker.gpu.py_scan_allowlist.json 
new file mode 100644 index 000000000000..16e44d8f5a81 --- /dev/null +++ b/pytorch/training/docker/2.10/py3/cu130/Dockerfile.sagemaker.gpu.py_scan_allowlist.json @@ -0,0 +1,12 @@ +{ + "85151": "[pkg: protobuf] advisory='Affected versions of the protobuf package are vulnerable to Denial of Service (DoS) due to missing recursion depth accounting that allows the max_recursion_depth limit to be bypassed. The google.protobuf.json_format.ParseDict() parser fails to increment or enforce max_recursion_depth when traversing nested google.protobuf.Any messages in its internal Any-handling logic, allowing attacker-controlled JSON to recurse far deeper than intended.', reason_to_ignore='N/A', spec='<=6.33.4'", + "71577": "[pkg: mlflow] Required by sagemaker. advisory='Deserialization of untrusted data can occur in versions of the MLflow platform running version 1.1.0 or newer, enabling a maliciously uploaded scikit-learn model to run arbitrary code on an end user's system when interacted with.', reason_to_ignore='N/A', spec='>=1.1.0'", + "71578": "[pkg: mlflow] Required by sagemaker. advisory='Deserialization of untrusted data can occur in versions of the MLflow platform running version 1.1.0 or newer, enabling a maliciously uploaded scikit-learn model to run arbitrary code on an end user's system when interacted with.', reason_to_ignore='N/A', spec='>=1.1.0'", + "71579": "[pkg: mlflow] Required by sagemaker. advisory='Deserialization of untrusted data can occur in versions of the MLflow platform running version 1.27.0 or newer, enabling a maliciously crafted Recipe to execute arbitrary code on an end user's system when run.', reason_to_ignore='N/A', spec='>=1.27.0'", + "71584": "[pkg: mlflow] Required by sagemaker. 
advisory='Deserialization of untrusted data can occur in versions of the MLflow platform affected versions, enabling a maliciously uploaded LightGBM scikit-learn model to run arbitrary code on an end user's system when interacted with.', reason_to_ignore='N/A', spec='>=1.23.0'", + "71587": "[pkg: mlflow] Required by sagemaker. advisory='Deserialization of untrusted data can occur in affected versions of the MLflow platform, enabling a maliciously uploaded PyFunc model to run arbitrary code on an end user's system when interacted with.', reason_to_ignore='N/A', spec='>=0.9.0'", + "71691": "[pkg: mlflow] Required by sagemaker. advisory='Deserialization of untrusted data can occur in versions of the MLflow platform running version 1.27.0 or newer, enabling a maliciously crafted Recipe to execute arbitrary code on an end user's system when run.', reason_to_ignore='N/A', spec='>=1.27.0'", + "71692": "[pkg: mlflow] Required by sagemaker. advisory='Deserialization of untrusted data can occur in affected versions of the MLflow platform, enabling a maliciously uploaded Tensorflow model to run arbitrary code on an end user's system when interacted with.', reason_to_ignore='N/A', spec='>=2.0.0rc0'", + "71693": "[pkg: mlflow] Required by sagemaker. 
advisory='Deserialization of untrusted data can occur in affected versions of the MLflow platform, enabling a maliciously uploaded pmdarima model to run arbitrary code on an end user's system when interacted with.', reason_to_ignore='N/A', spec='>=1.24.0'", + "71782": "[pkg: skops] advisory='Deserialization of untrusted data can occur in versions of the skops library, enabling a maliciously crafted model to run arbitrary code on an end user's system when loaded.', reason_to_ignore='CVE-2024-37065, pre-existing vulnerability in skops dependency', spec='>=0'" +} diff --git a/test/dlc_tests/conftest.py b/test/dlc_tests/conftest.py index 9f54a4995d56..e4207a1ff103 100644 --- a/test/dlc_tests/conftest.py +++ b/test/dlc_tests/conftest.py @@ -55,6 +55,7 @@ # ECR repo name fixtures # PyTorch "pytorch_training", + "pytorch_training___2__10", "pytorch_training___2__9", "pytorch_training___2__8", "pytorch_training___2__7", @@ -1686,11 +1687,13 @@ def lookup_condition(lookup, image): # Extract ecr repo name from the image and check if it exactly matches the lookup (fixture name) repo_name = get_ecr_repo_name(image) - # If lookup includes tag, check that we match beginning of string + # If lookup includes tag, check that we match beginning of string. + # Append a non-digit boundary after the version to prevent prefix collisions + # e.g. "pytorch-training:2.1" must not match "pytorch-training:2.10.0-gpu-..." 
if ":" in lookup and ":" in image: _, tag = get_repository_and_tag_from_image_uri(image) generic_repo_tag = f"{repo_name}:{tag}".replace("pr-", "").replace("beta-", "") - if re.match(rf"^{lookup}", generic_repo_tag): + if re.match(rf"^{re.escape(lookup)}(\D|$)", generic_repo_tag): return True job_types = ( diff --git a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_10.py b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_10.py new file mode 100644 index 000000000000..526ce797fd8c --- /dev/null +++ b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_10.py @@ -0,0 +1,137 @@ +import pytest + +import test.test_utils as test_utils + +from test.test_utils import ec2 + +from test.dlc_tests.ec2.pytorch.training import common_cases +from test.dlc_tests.ec2 import smclarify_cases + + +@pytest.mark.usefixtures("sagemaker") +@pytest.mark.integration("pytorch_gpu_tests") +@pytest.mark.model("N/A") +@pytest.mark.team("conda") +@pytest.mark.parametrize( + "ec2_instance_type, region", common_cases.PT_EC2_GPU_INSTANCE_TYPE_AND_REGION, indirect=True +) +def test_pytorch_2_10_gpu( + pytorch_training___2__10, ec2_connection, region, gpu_only, ec2_instance_type +): + pytorch_training = pytorch_training___2__10 + if test_utils.is_image_incompatible_with_instance_type(pytorch_training, ec2_instance_type): + pytest.skip( + f"Image {pytorch_training} is incompatible with instance type {ec2_instance_type}" + ) + + test_cases = [ + (common_cases.pytorch_standalone, (pytorch_training, ec2_connection)), + (common_cases.pytorch_training_mnist, (pytorch_training, ec2_connection)), + (common_cases.pytorch_linear_regression_gpu, (pytorch_training, ec2_connection)), + (common_cases.pytorch_gloo, (pytorch_training, ec2_connection)), + (common_cases.pytorch_nccl, (pytorch_training, ec2_connection)), + (common_cases.pytorch_mpi, (pytorch_training, ec2_connection)), + (common_cases.pytorch_training_torchaudio, (pytorch_training, ec2_connection)), + 
(common_cases.pytorch_training_torchdata, (pytorch_training, ec2_connection)), + (common_cases.pytorch_cudnn_match_gpu, (pytorch_training, ec2_connection, region)), + (common_cases.pytorch_curand_gpu, (pytorch_training, ec2_connection)), + (common_cases.pytorch_telemetry_bashrc_gpu, (pytorch_training, ec2_connection)), + (common_cases.pytorch_telemetry_entrypoint_gpu, (pytorch_training, ec2_connection)), + ] + + if "sagemaker" in pytorch_training: + test_cases.append( + (smclarify_cases.smclarify_metrics_gpu, (pytorch_training, ec2_connection)), + ) + + # AMP must be run on multi_gpu + if ec2.is_instance_multi_gpu(ec2_instance_type): + test_cases.append((common_cases.pytorch_amp, (pytorch_training, ec2_connection))) + + test_utils.execute_serial_test_cases(test_cases, test_description="PT 2.10 GPU") + + +@pytest.mark.usefixtures("sagemaker") +@pytest.mark.integration("pytorch_gpu_heavy_tests") +@pytest.mark.model("N/A") +@pytest.mark.team("conda") +@pytest.mark.parametrize( + "ec2_instance_type, region", + common_cases.PT_EC2_HEAVY_GPU_INSTANCE_TYPE_AND_REGION, + indirect=True, +) +@pytest.mark.skipif( + test_utils.is_pr_context() and not ec2.are_heavy_instance_ec2_tests_enabled(), + reason="Skip GPU Heavy tests in PR context unless explicitly enabled", +) +def test_pytorch_2_10_gpu_heavy( + pytorch_training___2__10, ec2_connection, region, gpu_only, ec2_instance_type +): + pytorch_training = pytorch_training___2__10 + if test_utils.is_image_incompatible_with_instance_type(pytorch_training, ec2_instance_type): + pytest.skip( + f"Image {pytorch_training} is incompatible with instance type {ec2_instance_type}" + ) + + test_cases = [ + (common_cases.pytorch_gdrcopy, (pytorch_training, ec2_connection)), + (common_cases.pytorch_transformer_engine, (pytorch_training, ec2_connection)), + ] + + test_utils.execute_serial_test_cases(test_cases, test_description="PT 2.10 GPU Heavy") + + +@pytest.mark.usefixtures("sagemaker") +@pytest.mark.integration("inductor") 
+@pytest.mark.model("N/A") +@pytest.mark.team("training-compiler") +@pytest.mark.parametrize( + "ec2_instance_type, region", + common_cases.PT_EC2_GPU_INDUCTOR_INSTANCE_TYPE_AND_REGION, + indirect=True, +) +def test_pytorch_2_10_gpu_inductor( + pytorch_training___2__10, ec2_connection, region, gpu_only, ec2_instance_type +): + pytorch_training = pytorch_training___2__10 + if test_utils.is_image_incompatible_with_instance_type(pytorch_training, ec2_instance_type): + pytest.skip( + f"Image {pytorch_training} is incompatible with instance type {ec2_instance_type}" + ) + + test_cases = [ + (common_cases.pytorch_gloo_inductor_gpu, (pytorch_training, ec2_connection)), + (common_cases.pytorch_mpi_inductor_gpu, (pytorch_training, ec2_connection)), + (common_cases.pytorch_nccl_inductor, (pytorch_training, ec2_connection)), + (common_cases.pytorch_amp_inductor, (pytorch_training, ec2_connection)), + ] + + test_utils.execute_serial_test_cases(test_cases, test_description="PT 2.10 GPU Inductor") + + +@pytest.mark.usefixtures("sagemaker") +@pytest.mark.integration("pytorch_cpu_tests") +@pytest.mark.model("N/A") +@pytest.mark.team("conda") +@pytest.mark.parametrize("ec2_instance_type", common_cases.PT_EC2_CPU_INSTANCE_TYPE, indirect=True) +def test_pytorch_2_10_cpu(pytorch_training___2__10, ec2_connection, cpu_only): + pytorch_training = pytorch_training___2__10 + + test_cases = [ + (common_cases.pytorch_standalone, (pytorch_training, ec2_connection)), + (common_cases.pytorch_training_mnist, (pytorch_training, ec2_connection)), + (common_cases.pytorch_linear_regression_cpu, (pytorch_training, ec2_connection)), + (common_cases.pytorch_gloo, (pytorch_training, ec2_connection)), + (common_cases.pytorch_mpi, (pytorch_training, ec2_connection)), + (common_cases.pytorch_training_torchaudio, (pytorch_training, ec2_connection)), + (common_cases.pytorch_training_torchdata, (pytorch_training, ec2_connection)), + (common_cases.pytorch_telemetry_bashrc_cpu, (pytorch_training, 
ec2_connection)), + (common_cases.pytorch_telemetry_entrypoint_cpu, (pytorch_training, ec2_connection)), + ] + + if "sagemaker" in pytorch_training: + test_cases += [ + (smclarify_cases.smclarify_metrics_cpu, (pytorch_training, ec2_connection)), + ] + + test_utils.execute_serial_test_cases(test_cases, test_description="PT 2.10 CPU") diff --git a/test/dlc_tests/sanity/test_pre_release.py b/test/dlc_tests/sanity/test_pre_release.py index b2959cfcdca7..340e93a3d4fa 100644 --- a/test/dlc_tests/sanity/test_pre_release.py +++ b/test/dlc_tests/sanity/test_pre_release.py @@ -689,6 +689,13 @@ def test_pip_check(image): rf"^({'|'.join(exception_strings)}) is not supported on this platform" ) + # mlflow (transitive dep via smclarify/sagemaker) requires pandas<3, but PT 2.10+ SM SDK v3 + # images install pandas>=3. This is an upstream compatibility gap, not a DLC issue. + if "pytorch" in image and Version(framework_version) >= Version("2.10"): + allowed_exceptions.append( + r"^mlflow \d+(\.\d+)* has requirement pandas<3.*but you have pandas \d+(\.\d+)*\.$" + ) + if "pytorch" in image and "trcomp" in image: allowed_exceptions.extend( [ diff --git a/test/dlc_tests/sanity/test_remote_function_compatibility.py b/test/dlc_tests/sanity/test_remote_function_compatibility.py index 2a9e03de4760..82431a63534a 100644 --- a/test/dlc_tests/sanity/test_remote_function_compatibility.py +++ b/test/dlc_tests/sanity/test_remote_function_compatibility.py @@ -1,6 +1,7 @@ import pytest from invoke.context import Context +from packaging.version import Version from test import test_utils @@ -19,6 +20,13 @@ def test_remote_function(training): f"Skipping remote function compatibility test for {training}. 
Test only for training images with Python>3.6" ) + # SageMaker SDK v3 (used in PyTorch >= 2.10) removed sagemaker.remote_function module + framework, framework_version = test_utils.get_framework_and_version_from_tag(training) + if framework == "pytorch" and Version(framework_version) >= Version("2.10"): + pytest.skip( + "Skipping remote function test for SM SDK v3 images (sagemaker.remote_function removed in v3)" + ) + container_name = test_utils.get_container_name("remote-function-test", training) ctx = Context() diff --git a/test/dlc_tests/sanity/test_utility_installation.py b/test/dlc_tests/sanity/test_utility_installation.py index 518cc4c0f4d8..1722b3cc9c9c 100644 --- a/test/dlc_tests/sanity/test_utility_installation.py +++ b/test/dlc_tests/sanity/test_utility_installation.py @@ -102,12 +102,22 @@ def test_utility_packages_using_import(training): packages_to_import = SM_TRAINING_UTILITY_PACKAGES_IMPORT for package in packages_to_import: + # SageMaker SDK v3 removed __version__ attribute; use importlib.metadata as fallback + if package == "sagemaker": + version_cmd = ( + "import sagemaker; " + 'v = getattr(sagemaker, "__version__", None); ' + 'v = v or __import__("importlib.metadata", fromlist=["version"]).version("sagemaker"); ' + "print(v)" + ) + else: + version_cmd = f"import {package}; print({package}.__version__)" version = re.search( r"\d+(\.\d+)+", test_utils.run_cmd_on_container( container_name, ctx, - f"import {package}; print({package}.__version__)", + version_cmd, executable="python", ).stdout, ).group() diff --git a/test/sagemaker_tests/__init__.py b/test/sagemaker_tests/__init__.py index d2fffc66d923..898910216300 100644 --- a/test/sagemaker_tests/__init__.py +++ b/test/sagemaker_tests/__init__.py @@ -19,6 +19,20 @@ import botocore.exceptions import sagemaker +try: + _SageMakerSession = sagemaker.Session +except AttributeError: + # SageMaker SDK v3 moved Session to sagemaker.core.helper.session_helper + from sagemaker.core.helper.session_helper 
import Session as _SageMakerSession + +try: + _UnexpectedStatusException = sagemaker.exceptions.UnexpectedStatusException +except (AttributeError, ModuleNotFoundError): + # SageMaker SDK v3 removed sagemaker.exceptions; define a placeholder that never matches + class _UnexpectedStatusException(Exception): + pass + + from botocore.config import Config from tenacity import retry, retry_if_exception_type, wait_fixed, stop_after_delay @@ -88,7 +102,7 @@ class SMThrottlingError(Exception): def get_sagemaker_session(region, default_bucket=None): - return sagemaker.Session( + return _SageMakerSession( boto_session=boto3.Session(region_name=region), default_bucket=default_bucket ) @@ -188,7 +202,7 @@ def invoke_sm_helper_function(ecr_image, sagemaker_regions, test_function, *test try: test_function(tested_ecr_image, sagemaker_session, *test_function_args) return - except sagemaker.exceptions.UnexpectedStatusException as e: + except _UnexpectedStatusException as e: if "CapacityError" in str(e): error = e continue @@ -235,7 +249,7 @@ def invoke_sm_endpoint_helper_function( sagemaker_client = get_sagemaker_client(region) boto_session = boto3.Session(region_name=region) sagemaker_runtime_client = get_sagemaker_runtime_client(region) - sagemaker_session = sagemaker.Session(boto_session=boto3.Session(region_name=region)) + sagemaker_session = _SageMakerSession(boto_session=boto3.Session(region_name=region)) # Reupload the image to test region if needed tested_ecr_image = ( get_ecr_image(ecr_image, region) if region != ecr_image_region else ecr_image @@ -269,7 +283,7 @@ def invoke_sm_endpoint_helper_function( **test_function_args, ) return return_value - except sagemaker.exceptions.UnexpectedStatusException as e: + except _UnexpectedStatusException as e: if "CapacityError" in str(e): error = e continue diff --git a/test/sagemaker_tests/pytorch/__init__.py b/test/sagemaker_tests/pytorch/__init__.py index 216fdc52bdc6..53fcf1403b44 100644 --- 
a/test/sagemaker_tests/pytorch/__init__.py +++ b/test/sagemaker_tests/pytorch/__init__.py @@ -16,7 +16,16 @@ import botocore.exceptions import sagemaker -import sagemaker.exceptions + +try: + import sagemaker.exceptions + + _UnexpectedStatusException = sagemaker.exceptions.UnexpectedStatusException +except (ImportError, AttributeError): + # SageMaker SDK v3 removed sagemaker.exceptions + class _UnexpectedStatusException(Exception): + pass + from tenacity import retry, retry_if_exception_type, wait_fixed, stop_after_delay @@ -114,7 +123,7 @@ def invoke_pytorch_helper_function( try: helper_function(tested_ecr_image, sagemaker_session, **helper_function_args) return - except sagemaker.exceptions.UnexpectedStatusException as e: + except _UnexpectedStatusException as e: if "CapacityError" in str(e): error = e continue diff --git a/test/sagemaker_tests/pytorch/training/conftest.py b/test/sagemaker_tests/pytorch/training/conftest.py index 196096c79056..18fcf6e28f90 100644 --- a/test/sagemaker_tests/pytorch/training/conftest.py +++ b/test/sagemaker_tests/pytorch/training/conftest.py @@ -24,8 +24,19 @@ import pytest from botocore.exceptions import ClientError -from sagemaker import LocalSession, Session -from sagemaker.pytorch import PyTorch + +try: + from sagemaker import LocalSession, Session +except ImportError: + # SageMaker SDK v3 removed LocalSession and Session from top-level sagemaker namespace + LocalSession = None + Session = None + +try: + from sagemaker.pytorch import PyTorch +except ImportError: + # SageMaker SDK v3 removed sagemaker.pytorch; v3 tests use ModelTrainer instead + PyTorch = None from . 
import get_efa_test_instance_type @@ -267,6 +278,8 @@ def fixture_build_base_image( @pytest.fixture(scope="session", name="sagemaker_session") def fixture_sagemaker_session(region): + if Session is None: + pytest.skip("sagemaker.Session not available in SageMaker SDK v3") return Session(boto_session=boto3.Session(region_name=region)) @@ -280,6 +293,8 @@ def fixture_efa_instance_type(request): @pytest.fixture(scope="session", name="sagemaker_local_session") def fixture_sagemaker_local_session(region): + if LocalSession is None: + pytest.skip("sagemaker.LocalSession not available in SageMaker SDK v3") return LocalSession(boto_session=boto3.Session(region_name=region)) diff --git a/test/sagemaker_tests/pytorch/training/integration/local/test_single_machine_training.py b/test/sagemaker_tests/pytorch/training/integration/local/test_single_machine_training.py index cd72bc1707d6..6e42512355c9 100644 --- a/test/sagemaker_tests/pytorch/training/integration/local/test_single_machine_training.py +++ b/test/sagemaker_tests/pytorch/training/integration/local/test_single_machine_training.py @@ -61,6 +61,8 @@ def test_fastai_mnist(docker_image, instance_type, py_version, sagemaker_local_s pytest.skip("Fast ai is not supported on PyTorch v1.9.x, v1.10.x, v1.11.x, v1.12.x") if Version(image_framework_version) in SpecifierSet("~=2.6.0"): pytest.skip("Fast ai doesn't release for PyTorch v2.6.x") + if Version(image_framework_version) in SpecifierSet(">=2.10"): + pytest.skip("fastai removed from PyTorch 2.10+ images (requires torch<2.10)") estimator = PyTorch( entry_point=fastai_mnist_script, role=ROLE, diff --git a/test/sagemaker_tests/pytorch/training/integration/sagemaker/__init__.py b/test/sagemaker_tests/pytorch/training/integration/sagemaker/__init__.py index 87222ae09833..5906443d99a3 100644 --- a/test/sagemaker_tests/pytorch/training/integration/sagemaker/__init__.py +++ b/test/sagemaker_tests/pytorch/training/integration/sagemaker/__init__.py @@ -16,11 +16,29 @@ import 
botocore.exceptions import pytest -import sagemaker.exceptions import sagemaker -from sagemaker.pytorch import PyTorch -from sagemaker import utils +try: + import sagemaker.exceptions + + _UnexpectedStatusException = sagemaker.exceptions.UnexpectedStatusException +except (ImportError, AttributeError): + # SageMaker SDK v3 removed sagemaker.exceptions + class _UnexpectedStatusException(Exception): + pass + + +try: + from sagemaker.pytorch import PyTorch +except ImportError: + # SageMaker SDK v3 removed sagemaker.pytorch; v3 tests use ModelTrainer instead + PyTorch = None + +try: + from sagemaker import utils +except ImportError: + # SageMaker SDK v3 removed sagemaker.utils from top-level namespace + utils = None from tenacity import retry, retry_if_exception_type, wait_fixed, stop_after_delay from .timeout import timeout @@ -108,7 +126,7 @@ def invoke_pytorch_estimator( pytorch.fit(inputs=inputs, job_name=job_name) return pytorch, sagemaker_session - except sagemaker.exceptions.UnexpectedStatusException as e: + except _UnexpectedStatusException as e: if "CapacityError" in str(e): error = e continue diff --git a/test/sagemaker_tests/pytorch/training/integration/sagemaker_v3/__init__.py b/test/sagemaker_tests/pytorch/training/integration/sagemaker_v3/__init__.py new file mode 100644 index 000000000000..1ae9f10a0c72 --- /dev/null +++ b/test/sagemaker_tests/pytorch/training/integration/sagemaker_v3/__init__.py @@ -0,0 +1,209 @@ +# Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. 
See the License for the specific +# language governing permissions and limitations under the License. +from __future__ import absolute_import + +import time + +import boto3 +import botocore.exceptions +import pytest + +from sagemaker.train import ModelTrainer +from sagemaker.train.configs import SourceCode, InputData, Compute +from sagemaker.train.distributed import Torchrun +from tenacity import retry, retry_if_exception_type, wait_fixed, stop_after_delay + +from packaging.version import Version +from packaging.specifiers import SpecifierSet +from test.test_utils import get_framework_and_version_from_tag + +from .timeout import timeout +from ...integration import training_dir, mnist_script, DEFAULT_TIMEOUT +from ..... import ( + get_ecr_image, + get_ecr_image_region, + get_sagemaker_session, + LOW_AVAILABILITY_INSTANCE_TYPES, + SMInstanceCapacityError, + SMResourceLimitExceededError, + SMThrottlingError, +) + + +def skip_if_not_v3_compatible(ecr_image): + """Skip test if the image is not PyTorch >= 2.10 (v3 SDK only).""" + _, image_framework_version = get_framework_and_version_from_tag(ecr_image) + if Version(image_framework_version) not in SpecifierSet(">=2.10"): + pytest.skip("SageMaker SDK v3 tests only run on PyTorch >= 2.10 images") + + +def upload_s3_data_v3(sagemaker_session, path, key_prefix): + sagemaker_session.default_bucket() + inputs = sagemaker_session.upload_data(path=path, key_prefix=key_prefix) + return inputs + + +@retry( + reraise=True, + retry=retry_if_exception_type( + (SMInstanceCapacityError, SMThrottlingError, SMResourceLimitExceededError) + ), + stop=stop_after_delay(20 * 60), + wait=wait_fixed(60), +) +def invoke_pytorch_model_trainer( + ecr_image, + sagemaker_regions, + source_code, + compute_params, + hyperparameters=None, + distributed_runner=None, + input_data_config=None, + upload_s3_data_args=None, + job_name=None, + environment=None, +): + """ + Used to invoke PyTorch training job using SageMaker SDK v3 ModelTrainer. 
+ The ECR image and the sagemaker session are used depending on the AWS region. + This function will rerun for all SM regions after a defined wait time if + capacity issues occur. + + :param ecr_image: ECR image in us-west-2 region + :param sagemaker_regions: List of SageMaker regions + :param source_code: SourceCode config for ModelTrainer + :param compute_params: dict with instance_type, instance_count + :param hyperparameters: dict of hyperparameters + :param distributed_runner: Torchrun or other distributed config + :param input_data_config: list of InputData objects + :param upload_s3_data_args: Data to be uploaded to S3 for training job + :param job_name: Training job base name + :param environment: dict of environment variables + + :return: (model_trainer, sagemaker_session) + """ + + ecr_image_region = get_ecr_image_region(ecr_image) + error = None + for test_region in sagemaker_regions: + sagemaker_session = get_sagemaker_session(test_region) + # Reupload the image to test region if needed + tested_ecr_image = ( + get_ecr_image(ecr_image, test_region) if test_region != ecr_image_region else ecr_image + ) + + env = environment.copy() if environment else {} + env["AWS_REGION"] = test_region + + try: + compute = Compute( + instance_type=compute_params.get("instance_type", "ml.m5.xlarge"), + instance_count=compute_params.get("instance_count", 1), + ) + + trainer_kwargs = { + "training_image": tested_ecr_image, + "source_code": source_code, + "compute": compute, + } + if hyperparameters: + trainer_kwargs["hyperparameters"] = hyperparameters + if distributed_runner: + trainer_kwargs["distributed_runner"] = distributed_runner + if job_name: + trainer_kwargs["base_job_name"] = job_name + if env: + trainer_kwargs["environment"] = env + + model_trainer = ModelTrainer(**trainer_kwargs) + + if upload_s3_data_args: + training_input = upload_s3_data_v3( + sagemaker_session, + upload_s3_data_args["path"], + upload_s3_data_args["key_prefix"], + ) + input_data_config = 
[InputData(channel_name="training", data_source=training_input)] + + model_trainer.train( + input_data_config=input_data_config, + wait=True, + ) + return model_trainer, sagemaker_session + + except Exception as e: + error_str = str(e) + if "CapacityError" in error_str: + error = e + continue + elif any( + exc_type in error_str + for exc_type in ["ThrottlingException", "ResourceLimitExceeded"] + ): + error = e + continue + else: + raise e + + instance_types = [] + if "instance_type" in compute_params: + instance_types = [compute_params["instance_type"]] + if any(instance_type in LOW_AVAILABILITY_INSTANCE_TYPES for instance_type in instance_types): + pytest.skip(f"Failed to launch job due to low capacity on {instance_types}") + if error and "CapacityError" in str(error): + raise SMInstanceCapacityError from error + elif error and "ResourceLimitExceeded" in str(error): + raise SMResourceLimitExceededError from error + else: + raise SMThrottlingError from error + + +def _test_mnist_distributed_v3( + ecr_image, + sagemaker_regions, + framework_version, + dist_backend, + instance_type=None, + instance_count=2, + use_inductor=False, +): + """v3 equivalent of _test_mnist_distributed using ModelTrainer.""" + from ...integration import mnist_path, mnist_script + + hyperparameters = {"backend": dist_backend, "epochs": 1} + if use_inductor: + hyperparameters["inductor"] = 1 + + source_code = SourceCode( + source_dir=mnist_path, + entry_script="mnist.py", + ) + + compute_params = { + "instance_type": instance_type or "ml.m5.xlarge", + "instance_count": instance_count, + } + + distributed_runner = Torchrun() if dist_backend.lower() in ("nccl", "gloo") else None + + job_name = "test-pt-v3-mnist-distributed" + with timeout(minutes=DEFAULT_TIMEOUT): + invoke_pytorch_model_trainer( + ecr_image, + sagemaker_regions, + source_code=source_code, + compute_params=compute_params, + hyperparameters=hyperparameters, + distributed_runner=distributed_runner, + job_name=job_name, + ) diff 
--git a/test/sagemaker_tests/pytorch/training/integration/sagemaker_v3/requirements.txt b/test/sagemaker_tests/pytorch/training/integration/sagemaker_v3/requirements.txt new file mode 100644 index 000000000000..55c8c61562a3 --- /dev/null +++ b/test/sagemaker_tests/pytorch/training/integration/sagemaker_v3/requirements.txt @@ -0,0 +1,22 @@ +boto3>=1.42.2,<2.0 +mock>=4.0 +protobuf +sagemaker>=3,<4 +coverage +flake8==3.7.7 +Flask==1.1.1 +pytest<8.1 +pytest-cov +pytest-rerunfailures +pytest-xdist +PyYAML +requests +tox +requests_mock +fabric +invoke +retrying +tenacity +gitpython +toml +packaging diff --git a/test/sagemaker_tests/pytorch/training/integration/sagemaker_v3/test_dgl.py b/test/sagemaker_tests/pytorch/training/integration/sagemaker_v3/test_dgl.py new file mode 100644 index 000000000000..d3126b19339c --- /dev/null +++ b/test/sagemaker_tests/pytorch/training/integration/sagemaker_v3/test_dgl.py @@ -0,0 +1,84 @@ +# Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# or in the "license" file accompanying this file. This file is distributed +# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either +# express or implied. See the License for the specific language governing +# permissions and limitations under the License. +from __future__ import absolute_import + +import os + +import pytest +from sagemaker.train.configs import SourceCode + +from ...integration import resources_path, DEFAULT_TIMEOUT +from .timeout import timeout + +from test.test_utils import get_framework_and_version_from_tag, get_cuda_version_from_tag +from packaging.version import Version +from packaging.specifiers import SpecifierSet +from . 
import skip_if_not_v3_compatible, invoke_pytorch_model_trainer + + +DGL_DATA_PATH = os.path.join(resources_path, "dgl-gcn") +DGL_SCRIPT_PATH = os.path.join(DGL_DATA_PATH, "train.py") + + +@pytest.mark.skip("DGL binaries are not installed in DLCs by default") +@pytest.mark.skip_gpu +@pytest.mark.skip_py2_containers +@pytest.mark.integration("dgl") +@pytest.mark.processor("cpu") +@pytest.mark.model("gcn") +@pytest.mark.team("dgl") +def test_dgl_gcn_training_cpu(ecr_image, sagemaker_regions, instance_type): + skip_if_not_v3_compatible(ecr_image) + instance_type = instance_type or "ml.c5.xlarge" + + source_code = SourceCode( + source_dir=DGL_DATA_PATH, + entry_script="train.py", + ) + compute_params = {"instance_type": instance_type, "instance_count": 1} + + with timeout(minutes=DEFAULT_TIMEOUT): + invoke_pytorch_model_trainer( + ecr_image, + sagemaker_regions, + source_code=source_code, + compute_params=compute_params, + job_name="test-pt-v3-dgl-image", + ) + + +@pytest.mark.skip("DGL binaries are not installed in DLCs by default") +@pytest.mark.skip_cpu +@pytest.mark.skip_py2_containers +@pytest.mark.integration("dgl") +@pytest.mark.processor("gpu") +@pytest.mark.model("gcn") +@pytest.mark.team("dgl") +def test_dgl_gcn_training_gpu(ecr_image, sagemaker_regions, instance_type): + skip_if_not_v3_compatible(ecr_image) + instance_type = instance_type or "ml.g5.4xlarge" + + source_code = SourceCode( + source_dir=DGL_DATA_PATH, + entry_script="train.py", + ) + compute_params = {"instance_type": instance_type, "instance_count": 1} + + with timeout(minutes=DEFAULT_TIMEOUT): + invoke_pytorch_model_trainer( + ecr_image, + sagemaker_regions, + source_code=source_code, + compute_params=compute_params, + job_name="test-pt-v3-dgl-image", + ) diff --git a/test/sagemaker_tests/pytorch/training/integration/sagemaker_v3/test_dgl_inductor.py b/test/sagemaker_tests/pytorch/training/integration/sagemaker_v3/test_dgl_inductor.py new file mode 100644 index 000000000000..eda419c6293f --- 
/dev/null +++ b/test/sagemaker_tests/pytorch/training/integration/sagemaker_v3/test_dgl_inductor.py @@ -0,0 +1,90 @@ +# Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# or in the "license" file accompanying this file. This file is distributed +# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either +# express or implied. See the License for the specific language governing +# permissions and limitations under the License. +from __future__ import absolute_import + +import os + +import pytest +from sagemaker.train.configs import SourceCode + +from ...integration import resources_path, DEFAULT_TIMEOUT +from .timeout import timeout + +from test.test_utils import get_framework_and_version_from_tag, get_cuda_version_from_tag +from packaging.version import Version +from packaging.specifiers import SpecifierSet +from . 
DGL_DATA_PATH = os.path.join(resources_path, "dgl-gcn")
DGL_SCRIPT_PATH = os.path.join(DGL_DATA_PATH, "train.py")
# GPU instance types exercised by the parametrized inductor run.
# BUGFIX: the list previously contained "ml.g5.12xlarge" twice, which
# scheduled the identical (expensive) SageMaker training job a second time
# without adding coverage; the duplicate entry has been removed.
inductor_instance_types = ["ml.g5.12xlarge", "ml.g4dn.12xlarge"]


@pytest.mark.skip("DGL binaries are not installed in DLCs by default")
@pytest.mark.skip_gpu
@pytest.mark.skip_py2_containers
@pytest.mark.skip_inductor_test
@pytest.mark.integration("dgl")
@pytest.mark.processor("cpu")
@pytest.mark.model("gcn")
@pytest.mark.team("dgl")
def test_dgl_gcn_training_cpu(ecr_image, sagemaker_regions, instance_type):
    """DGL GCN training on CPU with the inductor hyperparameter enabled."""
    skip_if_not_v3_compatible(ecr_image)
    instance_type = instance_type or "ml.c5.xlarge"

    source_code = SourceCode(
        source_dir=DGL_DATA_PATH,
        entry_script="train.py",
    )
    compute_params = {"instance_type": instance_type, "instance_count": 1}

    with timeout(minutes=DEFAULT_TIMEOUT):
        invoke_pytorch_model_trainer(
            ecr_image,
            sagemaker_regions,
            source_code=source_code,
            compute_params=compute_params,
            # "inductor": 1 asks the training script to compile the model.
            hyperparameters={"inductor": 1},
            job_name="test-pt-v3-dgl-inductor",
        )


@pytest.mark.skip("DGL binaries are not installed in DLCs by default")
@pytest.mark.skip_cpu
@pytest.mark.skip_py2_containers
@pytest.mark.skip_inductor_test
@pytest.mark.integration("dgl")
@pytest.mark.processor("gpu")
@pytest.mark.model("gcn")
@pytest.mark.team("dgl")
@pytest.mark.parametrize("instance_type", inductor_instance_types, indirect=True)
def test_dgl_gcn_training_gpu(ecr_image, sagemaker_regions, instance_type):
    """DGL GCN training on each parametrized GPU type with inductor enabled."""
    skip_if_not_v3_compatible(ecr_image)
    # Fallback only applies if the parametrized fixture yields a falsy value.
    instance_type = instance_type or "ml.g5.8xlarge"

    source_code = SourceCode(
        source_dir=DGL_DATA_PATH,
        entry_script="train.py",
    )
    compute_params = {"instance_type": instance_type, "instance_count": 1}

    with timeout(minutes=DEFAULT_TIMEOUT):
        invoke_pytorch_model_trainer(
            ecr_image,
            sagemaker_regions,
            source_code=source_code,
            compute_params=compute_params,
            hyperparameters={"inductor": 1},
            job_name="test-pt-v3-dgl-inductor",
        )
MULTI_GPU_INSTANCE = "ml.g5.12xlarge"
RESOURCE_PATH = os.path.join(os.path.dirname(__file__), "..", "..", "resources")


def validate_or_skip_smmodelparallel(ecr_image):
    """Skip the running test unless the image can use SM model parallelism."""
    if can_run_smmodelparallel(ecr_image):
        return
    pytest.skip("Model Parallelism is supported on CUDA 11 on PyTorch v1.6 and above")


def can_run_smmodelparallel(ecr_image):
    """True when the image is PyTorch >= 1.6 built against CUDA >= 11.0."""
    _, framework_version = get_framework_and_version_from_tag(ecr_image)
    cuda_version = get_cuda_version_from_tag(ecr_image)
    framework_ok = Version(framework_version) in SpecifierSet(">=1.6")
    # Short-circuit so the CUDA string is only parsed for new-enough frameworks.
    return framework_ok and Version(cuda_version.strip("cu")) >= Version("110")


def validate_or_skip_smmodelparallel_efa(ecr_image):
    """Skip the running test unless the image supports model parallelism over EFA."""
    if can_run_smmodelparallel_efa(ecr_image):
        return
    pytest.skip("EFA is only supported on CUDA 11, and on PyTorch 1.8.1 or higher")


def skip_unsupported_instances_smmodelparallel(instance_type):
    """Skip the running test on instance families smdataparallel cannot use."""
    if instance_type.startswith("ml.p5"):
        pytest.skip(f"{instance_type} is not supported by smdataparallel")


def can_run_smmodelparallel_efa(ecr_image):
    """True when the image is PyTorch >= 1.8.1 built against CUDA >= 11.0."""
    _, framework_version = get_framework_and_version_from_tag(ecr_image)
    cuda_version = get_cuda_version_from_tag(ecr_image)
    framework_ok = Version(framework_version) in SpecifierSet(">=1.8.1")
    return framework_ok and Version(cuda_version.strip("cu")) >= Version("110")


def _run_dist_operations(
    ecr_image, sagemaker_regions, instance_type, instance_count, backend, job_name
):
    # Shared driver for the distributed-collectives smoke tests.
    code = SourceCode(
        source_dir=os.path.dirname(dist_operations_path),
        entry_script=os.path.basename(dist_operations_path),
    )
    with timeout(minutes=DEFAULT_TIMEOUT):
        invoke_pytorch_model_trainer(
            ecr_image,
            sagemaker_regions,
            source_code=code,
            compute_params={"instance_type": instance_type, "instance_count": instance_count},
            hyperparameters={"backend": backend},
            job_name=job_name,
        )


@pytest.mark.processor("cpu")
@pytest.mark.multinode(3)
@pytest.mark.model("unknown_model")
@pytest.mark.skip_gpu
@pytest.mark.deploy_test
@pytest.mark.skip_test_in_region
@pytest.mark.team("conda")
def test_dist_operations_cpu(
    framework_version, ecr_image, sagemaker_regions, instance_type, dist_cpu_backend
):
    """Three-node distributed collectives on CPU instances."""
    skip_if_not_v3_compatible(ecr_image)
    _run_dist_operations(
        ecr_image,
        sagemaker_regions,
        instance_type or "ml.c5.xlarge",
        3,
        dist_cpu_backend,
        "test-pt-v3-dist-operations",
    )


@pytest.mark.processor("gpu")
@pytest.mark.multinode(3)
@pytest.mark.model("unknown_model")
@pytest.mark.skip_cpu
@pytest.mark.deploy_test
@pytest.mark.team("conda")
def test_dist_operations_gpu(
    framework_version, instance_type, ecr_image, sagemaker_regions, dist_gpu_backend
):
    """Three-node distributed collectives on GPU instances."""
    skip_if_not_v3_compatible(ecr_image)
    _run_dist_operations(
        ecr_image,
        sagemaker_regions,
        instance_type or "ml.g5.4xlarge",
        3,
        dist_gpu_backend,
        "test-pt-v3-dist-operations",
    )


@pytest.mark.processor("gpu")
@pytest.mark.model("unknown_model")
@pytest.mark.skip_cpu
@pytest.mark.team("conda")
def test_dist_operations_multi_gpu(
    framework_version, ecr_image, sagemaker_regions, dist_gpu_backend
):
    """Single-node, multi-GPU distributed collectives."""
    skip_if_not_v3_compatible(ecr_image)
    _run_dist_operations(
        ecr_image,
        sagemaker_regions,
        MULTI_GPU_INSTANCE,
        1,
        dist_gpu_backend,
        "test-pt-v3-dist-operations-multigpu",
    )


@pytest.mark.processor("gpu")
@pytest.mark.integration("fastai")
@pytest.mark.model("mnist")
@pytest.mark.skip_cpu
@pytest.mark.skip_py2_containers
@pytest.mark.skip_trcomp_containers
@pytest.mark.team("conda")
def test_dist_operations_fastai_gpu(framework_version, ecr_image, sagemaker_regions):
    """fastai distributed-training example on a multi-GPU instance."""
    skip_if_not_v3_compatible(ecr_image)
    code = SourceCode(
        source_dir=fastai_path,
        entry_script="train_distributed.py",
    )
    with timeout(minutes=DEFAULT_TIMEOUT):
        invoke_pytorch_model_trainer(
            ecr_image,
            sagemaker_regions,
            source_code=code,
            compute_params={"instance_type": MULTI_GPU_INSTANCE, "instance_count": 1},
            job_name="test-pt-v3-fastai",
        )


@pytest.mark.skip("SM Model Parallel team is maintaining their own Docker Container")
@pytest.mark.skip_cpu
@pytest.mark.skip_py2_containers
@pytest.mark.skip_trcomp_containers
@pytest.mark.usefixtures("feature_smmp_present")
@pytest.mark.integration("smmodelparallel")
@pytest.mark.model("gpt2")
@pytest.mark.processor("gpu")
@pytest.mark.team("smmodelparallel")
@pytest.mark.parametrize("test_script, num_processes", [("train_gpt_simple.py", 8)])
def test_smmodelparallel_gpt2_multigpu_singlenode(
    ecr_image, instance_type, sagemaker_regions, test_script, num_processes
):
    """Placeholder: v3 port of the SM model parallel GPT-2 single-node test."""
    skip_if_not_v3_compatible(ecr_image)
    # TODO: Implement v3 equivalent for smmodelparallel tests when needed
    pytest.skip("SM Model Parallel v3 test not yet implemented")


@pytest.mark.skip("SM Model Parallel team is maintaining their own Docker Container")
@pytest.mark.skip_cpu
@pytest.mark.skip_py2_containers
@pytest.mark.skip_trcomp_containers
@pytest.mark.usefixtures("feature_smmp_present")
@pytest.mark.integration("smmodelparallel")
@pytest.mark.model("gpt2")
@pytest.mark.processor("gpu")
@pytest.mark.team("smmodelparallel")
@pytest.mark.parametrize("test_script, num_processes", [("train_gpt_simple.py", 8)])
def test_smmodelparallel_gpt2_multigpu_singlenode_flashattn(
    ecr_image, instance_type, sagemaker_regions, test_script, num_processes
):
    """Placeholder: v3 port of the GPT-2 flash-attention single-node test."""
    skip_if_not_v3_compatible(ecr_image)
    pytest.skip("SM Model Parallel v3 test not yet implemented")


@pytest.mark.skip("SM Model Parallel team is maintaining their own Docker Container")
@pytest.mark.skip_cpu
@pytest.mark.skip_py2_containers
@pytest.mark.skip_trcomp_containers
@pytest.mark.usefixtures("feature_smmp_present")
@pytest.mark.integration("smmodelparallel")
@pytest.mark.model("mnist")
@pytest.mark.processor("gpu")
@pytest.mark.multinode(2)
@pytest.mark.team("smmodelparallel")
@pytest.mark.parametrize("test_script, num_processes", [("smmodelparallel_pt_mnist.py", 8)])
def test_smmodelparallel_mnist_multigpu_multinode(
    ecr_image, instance_type, sagemaker_regions, test_script, num_processes
):
    """Placeholder: v3 port of the multi-node SM model parallel MNIST test."""
    skip_if_not_v3_compatible(ecr_image)
    pytest.skip("SM Model Parallel v3 test not yet implemented")


@pytest.mark.skip("SM Model Parallel team is maintaining their own Docker Container")
@pytest.mark.skip_cpu
@pytest.mark.skip_py2_containers
@pytest.mark.skip_trcomp_containers
@pytest.mark.usefixtures("feature_smmp_present")
@pytest.mark.integration("smmodelparallel")
@pytest.mark.model("mnist")
@pytest.mark.processor("gpu")
@pytest.mark.multinode(2)
@pytest.mark.team("smmodelparallel")
@pytest.mark.parametrize("test_script, num_processes", [("smmodelparallel_pt_mnist.py", 8)])
def test_hc_smmodelparallel_mnist_multigpu_multinode(
    ecr_image, instance_type, sagemaker_regions, test_script, num_processes
):
    """Placeholder: heterogeneous-cluster variant of the MNIST multi-node test."""
    skip_if_not_v3_compatible(ecr_image)
    pytest.skip("SM Model Parallel v3 test not yet implemented")


@pytest.mark.skip("SM Model Parallel team is maintaining their own Docker Container")
@pytest.mark.skip_cpu
@pytest.mark.skip_py2_containers
@pytest.mark.skip_trcomp_containers
@pytest.mark.usefixtures("feature_smmp_present")
@pytest.mark.integration("smmodelparallel")
@pytest.mark.model("mnist")
@pytest.mark.processor("gpu")
@pytest.mark.multinode(2)
@pytest.mark.team("smmodelparallel")
@pytest.mark.parametrize("test_script, num_processes", [("smmodelparallel_pt_mnist.py", 8)])
@pytest.mark.efa()
def test_smmodelparallel_mnist_multigpu_multinode_efa(
    ecr_image, efa_instance_type, sagemaker_regions, test_script, num_processes
):
    """Placeholder: EFA variant of the MNIST multi-node model parallel test."""
    skip_if_not_v3_compatible(ecr_image)
    pytest.skip("SM Model Parallel v3 test not yet implemented")


@pytest.mark.skip("SM Model Parallel team is maintaining their own Docker Container")
@pytest.mark.skip_cpu
@pytest.mark.skip_py2_containers
@pytest.mark.skip_trcomp_containers
@pytest.mark.integration("smmodelparallel")
@pytest.mark.model("gpt2")
@pytest.mark.processor("gpu")
@pytest.mark.multinode(2)
@pytest.mark.team("smmodelparallel")
@pytest.mark.parametrize("test_script, num_processes", [("train_gpt_simple.py", 8)])
@pytest.mark.efa()
def test_smmodelparallel_gpt2_sdp_multinode_efa(
    ecr_image, efa_instance_type, sagemaker_regions, test_script, num_processes
):
    """Placeholder: EFA variant of the GPT-2 sharded-data-parallel test."""
    skip_if_not_v3_compatible(ecr_image)
    pytest.skip("SM Model Parallel v3 test not yet implemented")


@pytest.mark.skip(reason="Sagemaker efa test is a duplicate of ec2 efa test on p4d instances")
@pytest.mark.integration("smmodelparallel")
@pytest.mark.model("mnist")
@pytest.mark.processor("gpu")
@pytest.mark.skip_cpu
@pytest.mark.efa()
@pytest.mark.skip_py2_containers
@pytest.mark.team("conda")
def test_sanity_efa(ecr_image, efa_instance_type, sagemaker_regions):
    """Run the EFA sanity shell script on one EFA-capable instance."""
    skip_if_not_v3_compatible(ecr_image)
    validate_or_skip_smmodelparallel_efa(ecr_image)
    skip_unsupported_instances_smmodelparallel(efa_instance_type)
    script = os.path.join(RESOURCE_PATH, "efa", "test_efa.sh")

    code = SourceCode(
        source_dir=os.path.dirname(script),
        entry_script=os.path.basename(script),
    )
    with timeout(minutes=DEFAULT_TIMEOUT):
        invoke_pytorch_model_trainer(
            ecr_image,
            sagemaker_regions,
            source_code=code,
            compute_params={"instance_type": efa_instance_type, "instance_count": 1},
            job_name="test-pt-v3-efa-sanity",
        )
RESOURCE_PATH = os.path.join(os.path.dirname(__file__), "..", "..", "resources")
GDRCOPY_SANITY_TEST_CMD = os.path.join(RESOURCE_PATH, "gdrcopy", "test_gdrcopy.sh")


def validate_or_skip_gdrcopy(ecr_image):
    """Skip the running test unless the image can exercise GDRCopy."""
    if not can_run_gdrcopy(ecr_image):
        pytest.skip("GDRCopy is only supported on CUDA 11.7+, and on PyTorch 1.13.1 or higher")


def can_run_gdrcopy(ecr_image):
    """True when the image is PyTorch >= 1.13.1 built against CUDA >= 11.7."""
    _, image_framework_version = get_framework_and_version_from_tag(ecr_image)
    image_cuda_version = get_cuda_version_from_tag(ecr_image)
    return Version(image_framework_version) in SpecifierSet(">=1.13.1") and Version(
        image_cuda_version.strip("cu")
    ) >= Version("117")


@pytest.mark.skip(
    reason="gdrcopy sanity test in the sagemaker test job is duplicate test to the gdrcopy test in the ec2 test job"
)
@pytest.mark.integration("smdataparallel")
@pytest.mark.model("N/A")
@pytest.mark.processor("gpu")
@pytest.mark.skip_cpu
@pytest.mark.skip_trcomp_containers
@pytest.mark.gdrcopy()
# BUGFIX: the test previously carried two conflicting team marks
# ("smdataparallel" and, after the parametrize line, a copy-pasted "conda");
# the stray team("conda") mark has been dropped so the test is attributed to
# a single owning team.
@pytest.mark.team("smdataparallel")
@pytest.mark.parametrize(
    "efa_instance_type", get_efa_test_instance_type(default=["ml.p4d.24xlarge"]), indirect=True
)
def test_sanity_gdrcopy(ecr_image, efa_instance_type, sagemaker_regions):
    """Run the GDRCopy sanity shell script on one EFA-capable GPU instance."""
    skip_if_not_v3_compatible(ecr_image)
    validate_or_skip_gdrcopy(ecr_image)

    source_code = SourceCode(
        source_dir=os.path.dirname(GDRCOPY_SANITY_TEST_CMD),
        entry_script=os.path.basename(GDRCOPY_SANITY_TEST_CMD),
    )
    compute_params = {"instance_type": efa_instance_type, "instance_count": 1}

    with timeout(minutes=DEFAULT_TIMEOUT):
        invoke_pytorch_model_trainer(
            ecr_image,
            sagemaker_regions,
            source_code=source_code,
            compute_params=compute_params,
            job_name="test-pt-v3-gdrcopy-sanity",
        )
@pytest.mark.processor("cpu")
@pytest.mark.model("mnist")
@pytest.mark.multinode(2)
@pytest.mark.integration("smexperiments")
@pytest.mark.skip_gpu
@pytest.mark.team("conda")
def test_mnist_distributed_cpu(
    framework_version, ecr_image, sagemaker_regions, instance_type, dist_cpu_backend
):
    """Distributed MNIST training on CPU instances via the v3 helper."""
    skip_if_not_v3_compatible(ecr_image)
    _test_mnist_distributed_v3(
        ecr_image,
        sagemaker_regions,
        framework_version=framework_version,
        dist_backend=dist_cpu_backend,
        instance_type=instance_type or "ml.c5.xlarge",
    )


@pytest.mark.processor("gpu")
@pytest.mark.model("mnist")
@pytest.mark.multinode(2)
@pytest.mark.integration("smexperiments")
@pytest.mark.skip_cpu
@pytest.mark.team("conda")
def test_mnist_distributed_gpu(
    framework_version, ecr_image, sagemaker_regions, instance_type, dist_gpu_backend
):
    """Distributed MNIST training on GPU instances via the v3 helper."""
    skip_if_not_v3_compatible(ecr_image)
    _test_mnist_distributed_v3(
        ecr_image,
        sagemaker_regions,
        framework_version=framework_version,
        dist_backend=dist_gpu_backend,
        instance_type=instance_type or "ml.g4dn.12xlarge",
    )


@pytest.mark.processor("cpu")
@pytest.mark.model("mnist")
@pytest.mark.multinode(2)
@pytest.mark.integration("smexperiments")
@pytest.mark.skip_gpu
@pytest.mark.team("conda")
def test_hc_mnist_distributed_cpu(
    framework_version, ecr_image, sagemaker_regions, instance_type, dist_cpu_backend
):
    """Two-node (explicit instance_count) distributed MNIST training on CPU."""
    skip_if_not_v3_compatible(ecr_image)
    _test_mnist_distributed_v3(
        ecr_image,
        sagemaker_regions,
        framework_version=framework_version,
        dist_backend=dist_cpu_backend,
        instance_type=instance_type or "ml.c5.xlarge",
        instance_count=2,
    )


@pytest.mark.processor("gpu")
@pytest.mark.model("mnist")
@pytest.mark.multinode(2)
@pytest.mark.integration("smexperiments")
@pytest.mark.skip_cpu
@pytest.mark.team("conda")
def test_hc_mnist_distributed_gpu(
    framework_version, ecr_image, sagemaker_regions, instance_type, dist_gpu_backend
):
    """Two-node (explicit instance_count) distributed MNIST training on GPU."""
    skip_if_not_v3_compatible(ecr_image)
    _test_mnist_distributed_v3(
        ecr_image,
        sagemaker_regions,
        framework_version=framework_version,
        dist_backend=dist_gpu_backend,
        instance_type=instance_type or "ml.g4dn.12xlarge",
        instance_count=2,
    )
# GPU instance types exercised by the parametrized inductor tests.
# BUGFIX: the list previously contained "ml.g5.12xlarge" twice, which made the
# parametrized GPU tests run the identical (expensive) SageMaker job a second
# time without adding coverage; the duplicate entry has been removed.
inductor_instance_types = ["ml.g5.12xlarge", "ml.g4dn.12xlarge"]


@pytest.mark.processor("cpu")
@pytest.mark.model("mnist")
@pytest.mark.multinode(2)
@pytest.mark.integration("smexperiments")
@pytest.mark.skip_gpu
@pytest.mark.skip_inductor_test
@pytest.mark.skip(reason="known issue: https://github.com/pytorch/pytorch/issues/98436")
@pytest.mark.team("training-compiler")
def test_mnist_distributed_cpu(
    framework_version, ecr_image, sagemaker_regions, instance_type, dist_cpu_backend
):
    """Distributed MNIST training on CPU with inductor (torch.compile) enabled."""
    skip_if_not_v3_compatible(ecr_image)
    instance_type = instance_type or "ml.c5.xlarge"
    _test_mnist_distributed_v3(
        ecr_image,
        sagemaker_regions,
        framework_version=framework_version,
        dist_backend=dist_cpu_backend,
        instance_type=instance_type,
        use_inductor=True,
    )


@pytest.mark.processor("gpu")
@pytest.mark.model("mnist")
@pytest.mark.multinode(2)
@pytest.mark.integration("smexperiments")
@pytest.mark.skip_cpu
@pytest.mark.skip_inductor_test
@pytest.mark.parametrize("instance_type", inductor_instance_types, indirect=True)
@pytest.mark.skip(reason="known issue: https://github.com/pytorch/pytorch/issues/99067")
@pytest.mark.team("training-compiler")
def test_mnist_distributed_gpu(
    framework_version, ecr_image, sagemaker_regions, instance_type, dist_gpu_backend
):
    """Distributed MNIST training on each GPU type with inductor enabled."""
    skip_if_not_v3_compatible(ecr_image)
    # Fallback only applies if the parametrized fixture yields a falsy value.
    instance_type = instance_type or "ml.g5.8xlarge"
    _test_mnist_distributed_v3(
        ecr_image,
        sagemaker_regions,
        framework_version=framework_version,
        dist_backend=dist_gpu_backend,
        instance_type=instance_type,
        use_inductor=True,
    )


@pytest.mark.processor("cpu")
@pytest.mark.model("mnist")
@pytest.mark.multinode(2)
@pytest.mark.integration("smexperiments")
@pytest.mark.skip_gpu
@pytest.mark.skip_inductor_test
@pytest.mark.skip(reason="known issue: https://github.com/pytorch/pytorch/issues/98436")
@pytest.mark.team("training-compiler")
def test_hc_mnist_distributed_cpu(
    framework_version, ecr_image, sagemaker_regions, instance_type, dist_cpu_backend
):
    """Two-node inductor-enabled MNIST training on CPU."""
    skip_if_not_v3_compatible(ecr_image)
    instance_type = instance_type or "ml.c5.xlarge"
    _test_mnist_distributed_v3(
        ecr_image,
        sagemaker_regions,
        framework_version=framework_version,
        dist_backend=dist_cpu_backend,
        instance_type=instance_type,
        instance_count=2,
        use_inductor=True,
    )


@pytest.mark.processor("gpu")
@pytest.mark.model("mnist")
@pytest.mark.multinode(2)
@pytest.mark.integration("smexperiments")
@pytest.mark.skip_cpu
@pytest.mark.skip_inductor_test
@pytest.mark.parametrize("instance_type", inductor_instance_types, indirect=True)
@pytest.mark.skip(reason="known issue: https://github.com/pytorch/pytorch/issues/99067")
@pytest.mark.team("training-compiler")
def test_hc_mnist_distributed_gpu(
    framework_version, ecr_image, sagemaker_regions, instance_type, dist_gpu_backend
):
    """Two-node inductor-enabled MNIST training on each GPU type."""
    skip_if_not_v3_compatible(ecr_image)
    instance_type = instance_type or "ml.g5.12xlarge"
    _test_mnist_distributed_v3(
        ecr_image,
        sagemaker_regions,
        framework_version=framework_version,
        dist_backend=dist_gpu_backend,
        instance_type=instance_type,
        instance_count=2,
        use_inductor=True,
    )
b/test/sagemaker_tests/pytorch/training/integration/sagemaker_v3/test_neuron.py @@ -0,0 +1,136 @@ +# Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +from __future__ import absolute_import + +import os +import pytest +from sagemaker.train import ModelTrainer +from sagemaker.train.configs import SourceCode, InputData, Compute +from sagemaker.train.distributed import Torchrun +from ...integration import neuron_allreduce_path, neuron_mlp_path, DEFAULT_TIMEOUT +from .timeout import timeout +from . 
@pytest.mark.processor("neuronx")
@pytest.mark.model("unknown_model")
@pytest.mark.parametrize("instance_types", ["ml.trn1.32xlarge"])
@pytest.mark.neuronx_test
@pytest.mark.team("neuron")
def test_neuron_allreduce_distributed(
    framework_version, ecr_image, sagemaker_regions, instance_types
):
    """Two-node Neuron all-reduce launched through the Torchrun runner."""
    skip_if_not_v3_compatible(ecr_image)
    code = SourceCode(
        source_dir=neuron_allreduce_path,
        entry_script="all_reduce.py",
    )
    with timeout(minutes=DEFAULT_TIMEOUT):
        invoke_pytorch_model_trainer(
            ecr_image,
            sagemaker_regions,
            source_code=code,
            compute_params={"instance_type": instance_types, "instance_count": 2},
            distributed_runner=Torchrun(),
            # libfabric fork-safety flag, forwarded into the training container.
            environment={"FI_EFA_FORK_SAFE": "1"},
            job_name="test-pt-v3-neuron-allreduce-dist",
        )


@pytest.mark.processor("neuronx")
@pytest.mark.model("mlp")
@pytest.mark.parametrize("instance_types", ["ml.trn1.32xlarge"])
@pytest.mark.neuronx_test
@pytest.mark.team("neuron")
def test_neuron_mlp_distributed(framework_version, ecr_image, sagemaker_regions, instance_types):
    """Two-node Neuron MLP training launched through the Torchrun runner."""
    skip_if_not_v3_compatible(ecr_image)
    code = SourceCode(
        source_dir=neuron_mlp_path,
        entry_script="train_torchrun.py",
    )
    with timeout(minutes=DEFAULT_TIMEOUT):
        invoke_pytorch_model_trainer(
            ecr_image,
            sagemaker_regions,
            source_code=code,
            compute_params={"instance_type": instance_types, "instance_count": 2},
            distributed_runner=Torchrun(),
            environment={"FI_EFA_FORK_SAFE": "1"},
            job_name="test-pt-v3-neuron-mlp-dist",
        )


@pytest.mark.processor("neuronx")
@pytest.mark.model("unknown_model")
@pytest.mark.neuronx_test
@pytest.mark.team("neuron")
def test_neuron_allreduce_process(framework_version, ecr_image, sagemaker_regions, instance_type):
    """Single-node Neuron all-reduce driven by the entrypoint wrapper."""
    skip_if_not_v3_compatible(ecr_image)
    code = SourceCode(
        source_dir=neuron_allreduce_path,
        entry_script="entrypoint.py",
    )
    launcher_args = {"nproc-per-node": 2, "nnodes": 1}
    with timeout(minutes=DEFAULT_TIMEOUT):
        invoke_pytorch_model_trainer(
            ecr_image,
            sagemaker_regions,
            source_code=code,
            compute_params={"instance_type": instance_type, "instance_count": 1},
            hyperparameters=launcher_args,
            upload_s3_data_args={
                "path": neuron_allreduce_path,
                "key_prefix": "pytorch/neuron_allreduce",
            },
            job_name="test-pt-v3-neuron-allreduce",
        )


@pytest.mark.processor("neuronx")
@pytest.mark.model("mlp")
@pytest.mark.neuronx_test
@pytest.mark.team("neuron")
def test_neuron_mlp_process(framework_version, ecr_image, sagemaker_regions, instance_type):
    """Single-node Neuron MLP training driven by the entrypoint wrapper."""
    skip_if_not_v3_compatible(ecr_image)
    code = SourceCode(
        source_dir=neuron_mlp_path,
        entry_script="entrypoint.py",
    )
    launcher_args = {"nproc-per-node": 2, "nnodes": 1}
    with timeout(minutes=DEFAULT_TIMEOUT):
        invoke_pytorch_model_trainer(
            ecr_image,
            sagemaker_regions,
            source_code=code,
            compute_params={"instance_type": instance_type, "instance_count": 1},
            hyperparameters=launcher_args,
            upload_s3_data_args={
                "path": neuron_mlp_path,
                "key_prefix": "pytorch/neuron_mlp",
            },
            job_name="test-pt-v3-neuron-mlp",
        )
@pytest.mark.skipif(
    os.getenv("SM_EFA_TEST_INSTANCE_TYPE") == "ml.p5.48xlarge",
    reason="Low availability of instance type; Must ensure test works on new instances.",
)
@pytest.mark.skip_pytorchddp_test
@pytest.mark.skip_cpu
@pytest.mark.skip_py2_containers
@pytest.mark.skip_trcomp_containers
@pytest.mark.processor("gpu")
@pytest.mark.model("N/A")
@pytest.mark.multinode(2)
@pytest.mark.integration("pytorchddp")
@pytest.mark.parametrize(
    "efa_instance_type", get_efa_test_instance_type(default=["ml.p4d.24xlarge"]), indirect=True
)
@pytest.mark.efa()
@pytest.mark.team("conda")
def test_pytorchddp_throughput_gpu(
    framework_version, ecr_image, sagemaker_regions, efa_instance_type, tmpdir
):
    """DDP MNIST throughput run across two EFA-enabled nodes via torchrun."""
    skip_if_not_v3_compatible(ecr_image)
    validate_or_skip_distributed_training(ecr_image)

    code = SourceCode(
        source_dir=mnist_path,
        entry_script="pytorchddp_throughput_mnist.py",
    )
    with timeout(minutes=DEFAULT_TIMEOUT):
        invoke_pytorch_model_trainer(
            ecr_image,
            sagemaker_regions,
            source_code=code,
            compute_params={"instance_type": efa_instance_type, "instance_count": 2},
            distributed_runner=Torchrun(),
            job_name="test-pt-v3-pytorchddp-throughput-gpu",
        )
import skip_if_not_v3_compatible, invoke_pytorch_model_trainer +from .test_torch_distributed import validate_or_skip_distributed_training + + +@pytest.mark.skipif( + os.getenv("SM_EFA_TEST_INSTANCE_TYPE") == "ml.p5.48xlarge", + reason="Low availability of instance type; Must ensure test works on new instances.", +) +@pytest.mark.skip_pytorchddp_test +@pytest.mark.skip_cpu +@pytest.mark.skip_py2_containers +@pytest.mark.skip_inductor_test +@pytest.mark.processor("gpu") +@pytest.mark.model("N/A") +@pytest.mark.multinode(2) +@pytest.mark.integration("pytorchddp") +@pytest.mark.parametrize( + "efa_instance_type", get_efa_test_instance_type(default=["ml.p4d.24xlarge"]), indirect=True +) +@pytest.mark.efa() +@pytest.mark.team("training-compiler") +def test_pytorchddp_throughput_gpu( + framework_version, ecr_image, sagemaker_regions, efa_instance_type, tmpdir +): + skip_if_not_v3_compatible(ecr_image) + validate_or_skip_distributed_training(ecr_image) + + source_code = SourceCode( + source_dir=mnist_path, + entry_script="pytorchddp_throughput_mnist.py", + ) + compute_params = {"instance_type": efa_instance_type, "instance_count": 2} + + with timeout(minutes=DEFAULT_TIMEOUT): + invoke_pytorch_model_trainer( + ecr_image, + sagemaker_regions, + source_code=source_code, + compute_params=compute_params, + hyperparameters={"inductor": 1}, + distributed_runner=Torchrun(), + job_name="test-pt-v3-pytorchddp-inductor-throughput-gpu", + ) diff --git a/test/sagemaker_tests/pytorch/training/integration/sagemaker_v3/test_smart_sifting.py b/test/sagemaker_tests/pytorch/training/integration/sagemaker_v3/test_smart_sifting.py new file mode 100644 index 000000000000..5ecfb4dcef99 --- /dev/null +++ b/test/sagemaker_tests/pytorch/training/integration/sagemaker_v3/test_smart_sifting.py @@ -0,0 +1,95 @@ +# Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). 
# You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
from __future__ import absolute_import

import pytest
from packaging.version import Version
from packaging.specifiers import SpecifierSet

from sagemaker.train.configs import SourceCode

from .timeout import timeout
from ...integration import smart_sifting_path, DEFAULT_TIMEOUT
from . import skip_if_not_v3_compatible, invoke_pytorch_model_trainer
from test.test_utils import get_framework_and_version_from_tag, get_cuda_version_from_tag


def validate_or_skip_smart_sifting(ecr_image):
    """Skip the calling test when the image cannot run smart sifting.

    The message mirrors the actual gate in can_run_smart_sifting:
    PT 2.0.x images, either CPU or CUDA 11.8.
    """
    if not can_run_smart_sifting(ecr_image):
        pytest.skip("Smart sifting is only available on PT 2.0.x images (CPU or CUDA 11.8)")


def can_run_smart_sifting(ecr_image):
    """Return True when the image is PT 2.0.x and either has no CUDA tag or is cu118."""
    _, image_framework_version = get_framework_and_version_from_tag(ecr_image)
    image_cuda_version = get_cuda_version_from_tag(image_uri=ecr_image)
    return Version(image_framework_version) in SpecifierSet("==2.0.*") and (
        not image_cuda_version or image_cuda_version == "cu118"
    )


@pytest.mark.usefixtures("feature_smart_sifting_present")
@pytest.mark.processor("cpu")
@pytest.mark.model("bert")
@pytest.mark.integration("smart_sifting")
@pytest.mark.skip_gpu
@pytest.mark.skip_py2_containers
def test_smart_sifting_cpu(framework_version, ecr_image, sagemaker_regions, instance_type):
    """Run one smart-sifting training epoch on a single CPU instance via SM SDK v3."""
    skip_if_not_v3_compatible(ecr_image)
    validate_or_skip_smart_sifting(ecr_image)
    instance_type = instance_type or "ml.c5.xlarge"

    source_code = SourceCode(
        source_dir=smart_sifting_path,
        entry_script="train_plt_smart_sifting.py",
    )
    compute_params = {"instance_type": instance_type, "instance_count": 1}
    hyperparameters = {"epochs": 1}

    with timeout(minutes=DEFAULT_TIMEOUT):
        invoke_pytorch_model_trainer(
            ecr_image,
            sagemaker_regions,
            source_code=source_code,
            compute_params=compute_params,
            hyperparameters=hyperparameters,
            job_name="test-pt-v3-smart-sifting",
        )


@pytest.mark.usefixtures("feature_smart_sifting_present")
@pytest.mark.processor("gpu")
@pytest.mark.model("bert")
@pytest.mark.integration("smart_sifting")
@pytest.mark.skip_cpu
@pytest.mark.skip_py2_containers
def test_smart_sifting_gpu(framework_version, ecr_image, sagemaker_regions, instance_type):
    """Run one smart-sifting training epoch on a single GPU instance via SM SDK v3."""
    skip_if_not_v3_compatible(ecr_image)
    validate_or_skip_smart_sifting(ecr_image)
    instance_type = instance_type or "ml.g4dn.12xlarge"

    source_code = SourceCode(
        source_dir=smart_sifting_path,
        entry_script="train_plt_smart_sifting.py",
    )
    compute_params = {"instance_type": instance_type, "instance_count": 1}
    hyperparameters = {"epochs": 1}

    with timeout(minutes=DEFAULT_TIMEOUT):
        invoke_pytorch_model_trainer(
            ecr_image,
            sagemaker_regions,
            source_code=source_code,
            compute_params=compute_params,
            hyperparameters=hyperparameters,
            # Fixed: the CPU test already uses "test-pt-v3-smart-sifting"; use a
            # distinct suffix here, consistent with the sibling v3 test files
            # which give each test a unique job_name.
            job_name="test-pt-v3-smart-sifting-gpu",
        )


# diff --git a/test/sagemaker_tests/pytorch/training/integration/sagemaker_v3/test_smdataparallel.py b/test/sagemaker_tests/pytorch/training/integration/sagemaker_v3/test_smdataparallel.py
# new file mode 100644
# index 000000000000..a722d71e1b71
# --- /dev/null
# +++ b/test/sagemaker_tests/pytorch/training/integration/sagemaker_v3/test_smdataparallel.py
# @@ -0,0 +1,260 @@
# Copyright 2018-2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file.
This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +from __future__ import absolute_import + +import pytest +import os +from sagemaker.train.configs import SourceCode, Compute + +from packaging.version import Version +from packaging.specifiers import SpecifierSet +from ...integration import DEFAULT_TIMEOUT, mnist_path, throughput_path +from .timeout import timeout +from .test_distributed_operations import can_run_smmodelparallel +from ....training import get_efa_test_instance_type +from test.test_utils import get_framework_and_version_from_tag, get_cuda_version_from_tag +from . import skip_if_not_v3_compatible, invoke_pytorch_model_trainer + + +def validate_or_skip_smdataparallel(ecr_image): + if not can_run_smdataparallel(ecr_image): + pytest.skip("Data Parallelism is supported on CUDA 11 on PyTorch v1.6 and above") + + +def can_run_smdataparallel(ecr_image): + _, image_framework_version = get_framework_and_version_from_tag(ecr_image) + image_cuda_version = get_cuda_version_from_tag(ecr_image) + return Version(image_framework_version) in SpecifierSet(">=1.6") and Version( + image_cuda_version.strip("cu") + ) >= Version("110") + + +def skip_unsupported_instances_smdataparallel(instance_type): + if instance_type.startswith("ml.p5"): + pytest.skip(f"{instance_type} is not supported by smdataparallel") + + +def validate_or_skip_smdataparallel_efa(ecr_image): + if not can_run_smdataparallel_efa(ecr_image): + pytest.skip("EFA is only supported on CUDA 11, and on PyTorch 1.8.1 or higher") + + +def can_run_smdataparallel_efa(ecr_image): + _, image_framework_version = get_framework_and_version_from_tag(ecr_image) + image_cuda_version = get_cuda_version_from_tag(ecr_image) + return Version(image_framework_version) in SpecifierSet(">=1.8.1") and Version( + image_cuda_version.strip("cu") + ) >= 
Version("110") + + +@pytest.mark.skip("SMDDP binary releases are decoupled from DLC releases") +@pytest.mark.skip_cpu +@pytest.mark.skip_trcomp_containers +@pytest.mark.processor("gpu") +@pytest.mark.model("N/A") +@pytest.mark.multinode(2) +@pytest.mark.integration("smdataparallel") +@pytest.mark.team("smdataparallel") +@pytest.mark.parametrize( + "efa_instance_type", get_efa_test_instance_type(default=["ml.p4d.24xlarge"]), indirect=True +) +@pytest.mark.efa() +def test_smdataparallel_throughput( + framework_version, ecr_image, sagemaker_regions, efa_instance_type, tmpdir +): + skip_if_not_v3_compatible(ecr_image) + validate_or_skip_smdataparallel_efa(ecr_image) + skip_unsupported_instances_smdataparallel(efa_instance_type) + + source_code = SourceCode( + source_dir=throughput_path, + entry_script="smdataparallel_throughput.py", + ) + compute_params = {"instance_type": efa_instance_type, "instance_count": 2} + hyperparameters = { + "size": 64, + "num_tensors": 20, + "iterations": 100, + "warmup": 10, + "bucket_size": 25, + "info": f"PT-{efa_instance_type}-N2", + } + + with timeout(minutes=DEFAULT_TIMEOUT): + invoke_pytorch_model_trainer( + ecr_image, + sagemaker_regions, + source_code=source_code, + compute_params=compute_params, + hyperparameters=hyperparameters, + job_name="test-pt-v3-smddp-throughput", + ) + + +@pytest.mark.skip("SMDDP binary releases are decoupled from DLC releases") +@pytest.mark.skip_cpu +@pytest.mark.skip_py2_containers +@pytest.mark.skip_trcomp_containers +@pytest.mark.usefixtures("feature_smddp_present") +@pytest.mark.integration("smdataparallel") +@pytest.mark.model("mnist") +@pytest.mark.processor("gpu") +@pytest.mark.team("smdataparallel") +def test_smdataparallel_mnist_script_mode_multigpu( + ecr_image, sagemaker_regions, instance_type, tmpdir +): + """ + Tests SM Distributed DataParallel single-node via script mode + """ + skip_if_not_v3_compatible(ecr_image) + validate_or_skip_smdataparallel(ecr_image) + instance_type = 
"ml.p4d.24xlarge" + + source_code = SourceCode( + source_dir=mnist_path, + entry_script="smdataparallel_mnist_script_mode.sh", + ) + compute_params = {"instance_type": instance_type, "instance_count": 1} + + with timeout(minutes=DEFAULT_TIMEOUT): + invoke_pytorch_model_trainer( + ecr_image, + sagemaker_regions, + source_code=source_code, + compute_params=compute_params, + job_name="test-pt-v3-smddp-mnist-script-mode", + ) + + +@pytest.mark.skip("SMDDP binary releases are decoupled from DLC releases") +@pytest.mark.skip_py2_containers +@pytest.mark.skip_trcomp_containers +@pytest.mark.processor("gpu") +@pytest.mark.skip_cpu +@pytest.mark.multinode(2) +@pytest.mark.integration("smdataparallel") +@pytest.mark.model("mnist") +@pytest.mark.flaky(reruns=2) +@pytest.mark.efa() +@pytest.mark.team("smdataparallel") +@pytest.mark.parametrize( + "efa_instance_type", + get_efa_test_instance_type(default=["ml.p4d.24xlarge"]), + indirect=True, +) +def test_smdataparallel_mnist(ecr_image, sagemaker_regions, efa_instance_type, tmpdir): + """ + Tests smddprun command via ModelTrainer distribution parameter + """ + skip_if_not_v3_compatible(ecr_image) + validate_or_skip_smdataparallel_efa(ecr_image) + skip_unsupported_instances_smdataparallel(efa_instance_type) + + source_code = SourceCode( + source_dir=mnist_path, + entry_script="smdataparallel_mnist.py", + ) + compute_params = {"instance_type": efa_instance_type, "instance_count": 2} + + with timeout(minutes=DEFAULT_TIMEOUT): + invoke_pytorch_model_trainer( + ecr_image, + sagemaker_regions, + source_code=source_code, + compute_params=compute_params, + job_name="test-pt-v3-smddp-mnist", + ) + + +@pytest.mark.skip("SMDDP binary releases are decoupled from DLC releases") +@pytest.mark.skip_py2_containers +@pytest.mark.skip_trcomp_containers +@pytest.mark.processor("gpu") +@pytest.mark.skip_cpu +@pytest.mark.multinode(2) +@pytest.mark.integration("smdataparallel") +@pytest.mark.model("mnist") +@pytest.mark.flaky(reruns=2) 
+@pytest.mark.efa() +@pytest.mark.parametrize( + "efa_instance_type", get_efa_test_instance_type(default=["ml.p4d.24xlarge"]), indirect=True +) +@pytest.mark.team("smdataparallel") +def test_hc_smdataparallel_mnist(ecr_image, sagemaker_regions, efa_instance_type, tmpdir): + """ + Tests smddprun command via ModelTrainer distribution parameter + """ + skip_if_not_v3_compatible(ecr_image) + validate_or_skip_smdataparallel_efa(ecr_image) + skip_unsupported_instances_smdataparallel(efa_instance_type) + + source_code = SourceCode( + source_dir=mnist_path, + entry_script="smdataparallel_mnist.py", + ) + compute_params = {"instance_type": efa_instance_type, "instance_count": 2} + + with timeout(minutes=DEFAULT_TIMEOUT): + invoke_pytorch_model_trainer( + ecr_image, + sagemaker_regions, + source_code=source_code, + compute_params=compute_params, + job_name="test-pt-v3-hc-smddp-mnist", + ) + + +@pytest.mark.skip( + "SMDDP binary releases are decoupled from DLC releases and SM Model Parallel team is maintaining their own Docker Container" +) +@pytest.mark.skip_cpu +@pytest.mark.skip_trcomp_containers +@pytest.mark.usefixtures("feature_smmp_present") +@pytest.mark.usefixtures("feature_smddp_present") +@pytest.mark.processor("gpu") +@pytest.mark.integration("smdataparallel_smmodelparallel") +@pytest.mark.model("mnist") +@pytest.mark.parametrize("instance_types", ["ml.p4d.24xlarge"]) +@pytest.mark.team("smdataparallel") +def test_smmodelparallel_smdataparallel_mnist( + instance_types, ecr_image, sagemaker_regions, py_version, tmpdir +): + """ + Tests SM Distributed DataParallel and ModelParallel single-node via script mode + """ + skip_if_not_v3_compatible(ecr_image) + can_run_modelparallel = can_run_smmodelparallel(ecr_image) + can_run_dataparallel = can_run_smdataparallel(ecr_image) + if can_run_dataparallel and can_run_modelparallel: + entry_point = "smdataparallel_smmodelparallel_mnist_script_mode.sh" + elif can_run_dataparallel: + entry_point = 
"smdataparallel_mnist_script_mode.sh" + elif can_run_modelparallel: + entry_point = "smmodelparallel_mnist_script_mode.sh" + else: + pytest.skip("Both modelparallel and dataparallel dont support this image, nothing to run") + + source_code = SourceCode( + source_dir=mnist_path, + entry_script=entry_point, + ) + compute_params = {"instance_type": instance_types, "instance_count": 1} + + with timeout(minutes=DEFAULT_TIMEOUT): + invoke_pytorch_model_trainer( + ecr_image, + sagemaker_regions, + source_code=source_code, + compute_params=compute_params, + job_name="test-pt-v3-smdmp-smddp-mnist", + ) diff --git a/test/sagemaker_tests/pytorch/training/integration/sagemaker_v3/test_smppy.py b/test/sagemaker_tests/pytorch/training/integration/sagemaker_v3/test_smppy.py new file mode 100644 index 000000000000..fcac05f06ef3 --- /dev/null +++ b/test/sagemaker_tests/pytorch/training/integration/sagemaker_v3/test_smppy.py @@ -0,0 +1,108 @@ +# Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# or in the "license" file accompanying this file. This file is distributed +# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either +# express or implied. See the License for the specific language governing +# permissions and limitations under the License. 
+from __future__ import absolute_import + +import os +import time + +import boto3 +import pytest +from packaging.specifiers import SpecifierSet +from packaging.version import Version +from sagemaker.train.configs import SourceCode +from sagemaker.train.distributed import Torchrun + +from test.test_utils import get_framework_and_version_from_tag +from ...integration import DEFAULT_TIMEOUT, smppy_mnist_script, training_dir, mnist_path +from .timeout import timeout +from . import skip_if_not_v3_compatible, invoke_pytorch_model_trainer +from .test_torch_distributed import validate_or_skip_distributed_training + +INSTANCE_TYPE = "ml.g4dn.12xlarge" + + +def _skip_if_image_is_not_compatible_with_smppy(image_uri): + _, framework_version = get_framework_and_version_from_tag(image_uri) + compatible_versions = SpecifierSet(">=2.0") + if Version(framework_version) not in compatible_versions: + pytest.skip(f"This test only works for PT versions in {compatible_versions}") + + +@pytest.mark.skip_smppy_test +@pytest.mark.usefixtures("feature_smppy_present") +@pytest.mark.processor("gpu") +@pytest.mark.integration("smppy") +@pytest.mark.model("mnist") +@pytest.mark.skip_cpu +@pytest.mark.skip_py2_containers +def test_training_smppy(framework_version, ecr_image, sagemaker_regions): + skip_if_not_v3_compatible(ecr_image) + _skip_if_image_is_not_compatible_with_smppy(ecr_image) + + source_code = SourceCode( + source_dir=os.path.dirname(smppy_mnist_script), + entry_script=os.path.basename(smppy_mnist_script), + ) + compute_params = {"instance_type": INSTANCE_TYPE, "instance_count": 1} + hyperparameters = {"epochs": 1} + + # TODO: ProfilerConfig/Profiler from SM SDK v2 does not have a direct v3 equivalent yet. + # Profiling configuration is omitted for now. Add v3 profiling support when available. 
+ + with timeout(minutes=DEFAULT_TIMEOUT): + model_trainer, _ = invoke_pytorch_model_trainer( + ecr_image, + sagemaker_regions, + source_code=source_code, + compute_params=compute_params, + hyperparameters=hyperparameters, + upload_s3_data_args={"path": training_dir, "key_prefix": "pytorch/mnist"}, + job_name="test-pt-v3-smppy-training", + ) + + +@pytest.mark.skip_smppy_test +@pytest.mark.usefixtures("feature_smppy_present") +@pytest.mark.processor("gpu") +@pytest.mark.integration("smppy") +@pytest.mark.model("mnist") +@pytest.mark.multinode(2) +@pytest.mark.skip_cpu +@pytest.mark.skip_py2_containers +def test_training_smppy_distributed(framework_version, ecr_image, sagemaker_regions): + skip_if_not_v3_compatible(ecr_image) + _skip_if_image_is_not_compatible_with_smppy(ecr_image) + validate_or_skip_distributed_training(ecr_image) + + source_code = SourceCode( + source_dir=os.path.dirname(smppy_mnist_script), + entry_script=os.path.basename(smppy_mnist_script), + ) + compute_params = {"instance_type": INSTANCE_TYPE, "instance_count": 2} + hyperparameters = {"epochs": 1} + distributed_runner = Torchrun() + + # TODO: ProfilerConfig/Profiler from SM SDK v2 does not have a direct v3 equivalent yet. + # Profiling configuration is omitted for now. Add v3 profiling support when available. 
+ + with timeout(minutes=DEFAULT_TIMEOUT): + model_trainer, _ = invoke_pytorch_model_trainer( + ecr_image, + sagemaker_regions, + source_code=source_code, + compute_params=compute_params, + hyperparameters=hyperparameters, + distributed_runner=distributed_runner, + upload_s3_data_args={"path": training_dir, "key_prefix": "pytorch/mnist"}, + job_name="test-pt-v3-smppy-training-distributed", + ) diff --git a/test/sagemaker_tests/pytorch/training/integration/sagemaker_v3/test_torch_distributed.py b/test/sagemaker_tests/pytorch/training/integration/sagemaker_v3/test_torch_distributed.py new file mode 100644 index 000000000000..06dac14f2b71 --- /dev/null +++ b/test/sagemaker_tests/pytorch/training/integration/sagemaker_v3/test_torch_distributed.py @@ -0,0 +1,77 @@ +# Copyright 2018-2021 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +from __future__ import absolute_import + +import os + +import pytest +from sagemaker.train.configs import SourceCode +from sagemaker.train.distributed import Torchrun + +from packaging.version import Version +from packaging.specifiers import SpecifierSet + +from ...integration import DEFAULT_TIMEOUT, mnist_path +from .timeout import timeout +from ....training import get_efa_test_instance_type +from test.test_utils import get_framework_and_version_from_tag +from . 
import skip_if_not_v3_compatible, invoke_pytorch_model_trainer + + +def validate_or_skip_distributed_training(ecr_image): + if not can_run_distributed_training(ecr_image): + pytest.skip("PyTorch DDP distribution is supported on Python 3 on PyTorch v1.10 and above") + + +def can_run_distributed_training(ecr_image): + _, image_framework_version = get_framework_and_version_from_tag(ecr_image) + return Version(image_framework_version) in SpecifierSet(">=1.10") + + +@pytest.mark.skipif( + os.getenv("SM_EFA_TEST_INSTANCE_TYPE") == "ml.p5.48xlarge", + reason="Low availability of instance type; Must ensure test works on new instances.", +) +@pytest.mark.skip_cpu +@pytest.mark.skip_py2_containers +@pytest.mark.skip_trcomp_containers +@pytest.mark.processor("gpu") +@pytest.mark.model("N/A") +@pytest.mark.multinode(2) +@pytest.mark.integration("torch_distributed") +@pytest.mark.parametrize( + "efa_instance_type", get_efa_test_instance_type(default=["ml.p4d.24xlarge"]), indirect=True +) +@pytest.mark.efa() +@pytest.mark.team("conda") +def test_torch_distributed_throughput_gpu( + framework_version, ecr_image, sagemaker_regions, efa_instance_type, tmpdir +): + skip_if_not_v3_compatible(ecr_image) + validate_or_skip_distributed_training(ecr_image) + + source_code = SourceCode( + source_dir=mnist_path, + entry_script="torch_distributed_throughput_mnist.py", + ) + compute_params = {"instance_type": efa_instance_type, "instance_count": 2} + + with timeout(minutes=DEFAULT_TIMEOUT): + invoke_pytorch_model_trainer( + ecr_image, + sagemaker_regions, + source_code=source_code, + compute_params=compute_params, + distributed_runner=Torchrun(), + job_name="test-pt-v3-torch-distributed-throughput-gpu", + ) diff --git a/test/sagemaker_tests/pytorch/training/integration/sagemaker_v3/test_torch_distributed_inductor.py b/test/sagemaker_tests/pytorch/training/integration/sagemaker_v3/test_torch_distributed_inductor.py new file mode 100644 index 000000000000..bdb5fb3fb947 --- /dev/null +++ 
b/test/sagemaker_tests/pytorch/training/integration/sagemaker_v3/test_torch_distributed_inductor.py @@ -0,0 +1,65 @@ +# Copyright 2018-2021 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +from __future__ import absolute_import + +import os + +import pytest +from sagemaker.train.configs import SourceCode +from sagemaker.train.distributed import Torchrun + +from ...integration import DEFAULT_TIMEOUT, mnist_path +from .timeout import timeout +from ....training import get_efa_test_instance_type +from . 
import skip_if_not_v3_compatible, invoke_pytorch_model_trainer +from .test_torch_distributed import validate_or_skip_distributed_training + + +@pytest.mark.skipif( + os.getenv("SM_EFA_TEST_INSTANCE_TYPE") == "ml.p5.48xlarge", + reason="Low availability of instance type; Must ensure test works on new instances.", +) +@pytest.mark.skip_cpu +@pytest.mark.skip_py2_containers +@pytest.mark.skip_inductor_test +@pytest.mark.processor("gpu") +@pytest.mark.model("N/A") +@pytest.mark.multinode(2) +@pytest.mark.integration("torch_distributed") +@pytest.mark.parametrize( + "efa_instance_type", get_efa_test_instance_type(default=["ml.p4d.24xlarge"]), indirect=True +) +@pytest.mark.efa() +@pytest.mark.team("training-compiler") +def test_torch_distributed_throughput_gpu( + framework_version, ecr_image, sagemaker_regions, efa_instance_type, tmpdir +): + skip_if_not_v3_compatible(ecr_image) + validate_or_skip_distributed_training(ecr_image) + + source_code = SourceCode( + source_dir=mnist_path, + entry_script="torch_distributed_throughput_mnist.py", + ) + compute_params = {"instance_type": efa_instance_type, "instance_count": 2} + + with timeout(minutes=DEFAULT_TIMEOUT): + invoke_pytorch_model_trainer( + ecr_image, + sagemaker_regions, + source_code=source_code, + compute_params=compute_params, + hyperparameters={"inductor": 1}, + distributed_runner=Torchrun(), + job_name="test-pt-v3-torch-distributed-inductor-throughput-gpu", + ) diff --git a/test/sagemaker_tests/pytorch/training/integration/sagemaker_v3/test_training_smdebug.py b/test/sagemaker_tests/pytorch/training/integration/sagemaker_v3/test_training_smdebug.py new file mode 100644 index 000000000000..e3bc713a160f --- /dev/null +++ b/test/sagemaker_tests/pytorch/training/integration/sagemaker_v3/test_training_smdebug.py @@ -0,0 +1,92 @@ +# Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). 
+# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# or in the "license" file accompanying this file. This file is distributed +# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either +# express or implied. See the License for the specific language governing +# permissions and limitations under the License. +from __future__ import absolute_import + +import os + +import pytest +from sagemaker.train.configs import SourceCode + +from ...integration import training_dir, smdebug_mnist_script, DEFAULT_TIMEOUT +from .timeout import timeout +from . import skip_if_not_v3_compatible, invoke_pytorch_model_trainer + + +@pytest.mark.skip("SM Debugger/Profiler v1 deprecated") +@pytest.mark.skip_py2_containers +@pytest.mark.usefixtures("feature_smdebug_present") +@pytest.mark.integration("smdebug") +@pytest.mark.model("mnist") +@pytest.mark.team("smdebug") +def test_training_smdebug(framework_version, ecr_image, sagemaker_regions, instance_type): + skip_if_not_v3_compatible(ecr_image) + + hyperparameters = { + "random_seed": True, + "num_steps": 50, + "smdebug_path": "/tmp/ml/output/tensors", + "epochs": 1, + "data_dir": training_dir, + } + + source_code = SourceCode( + source_dir=os.path.dirname(smdebug_mnist_script), + entry_script=os.path.basename(smdebug_mnist_script), + ) + compute_params = {"instance_type": instance_type, "instance_count": 1} + + with timeout(minutes=DEFAULT_TIMEOUT): + invoke_pytorch_model_trainer( + ecr_image, + sagemaker_regions, + source_code=source_code, + compute_params=compute_params, + hyperparameters=hyperparameters, + upload_s3_data_args={"path": training_dir, "key_prefix": "pytorch/mnist"}, + job_name="test-pt-v3-smdebug-training", + ) + + +@pytest.mark.skip("SM Debugger/Profiler v1 deprecated") +@pytest.mark.skip_py2_containers +@pytest.mark.usefixtures("feature_smdebug_present") +@pytest.mark.integration("smdebug") 
+@pytest.mark.model("mnist") +@pytest.mark.team("smdebug") +def test_hc_training_smdebug(framework_version, ecr_image, sagemaker_regions, instance_type): + skip_if_not_v3_compatible(ecr_image) + + hyperparameters = { + "random_seed": True, + "num_steps": 50, + "smdebug_path": "/tmp/ml/output/tensors", + "epochs": 1, + "data_dir": training_dir, + } + + source_code = SourceCode( + source_dir=os.path.dirname(smdebug_mnist_script), + entry_script=os.path.basename(smdebug_mnist_script), + ) + compute_params = {"instance_type": instance_type, "instance_count": 1} + + with timeout(minutes=DEFAULT_TIMEOUT): + invoke_pytorch_model_trainer( + ecr_image, + sagemaker_regions, + source_code=source_code, + compute_params=compute_params, + hyperparameters=hyperparameters, + upload_s3_data_args={"path": training_dir, "key_prefix": "pytorch/mnist"}, + job_name="test-pt-v3-hc-smdebug-training", + ) diff --git a/test/sagemaker_tests/pytorch/training/integration/sagemaker_v3/timeout.py b/test/sagemaker_tests/pytorch/training/integration/sagemaker_v3/timeout.py new file mode 100644 index 000000000000..b2c54ad09b81 --- /dev/null +++ b/test/sagemaker_tests/pytorch/training/integration/sagemaker_v3/timeout.py @@ -0,0 +1,51 @@ +# Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +# +# Standalone timeout utilities for SM SDK v3 tests. 
+from __future__ import absolute_import +import signal +from contextlib import contextmanager +import logging + +LOGGER = logging.getLogger("timeout") + + +class TimeoutError(Exception): + pass + + +@contextmanager +def timeout(seconds=0, minutes=0, hours=0): + """Add a signal-based timeout to any block of code. + If multiple time units are specified, they will be added together to determine time limit. + Usage: + with timeout(seconds=5): + my_slow_function(...) + Args: + - seconds: The time limit, in seconds. + - minutes: The time limit, in minutes. + - hours: The time limit, in hours. + """ + + limit = seconds + 60 * minutes + 3600 * hours + + def handler(signum, frame): + raise TimeoutError("timed out after {} seconds".format(limit)) + + try: + signal.signal(signal.SIGALRM, handler) + signal.alarm(limit) + + yield + finally: + signal.alarm(0) diff --git a/test/test_utils/sagemaker.py b/test/test_utils/sagemaker.py index c42f219772fe..94521a76f88e 100644 --- a/test/test_utils/sagemaker.py +++ b/test/test_utils/sagemaker.py @@ -211,6 +211,17 @@ def generate_sagemaker_pytest_cmd(image, sagemaker_test_type): else: integration_path = os.path.join("integration", sagemaker_test_type) + # Use SageMaker SDK v3 tests for PyTorch >= 2.10 + if ( + framework == "pytorch" + and job_type == "training" + and sagemaker_test_type == SAGEMAKER_REMOTE_TEST_TYPE + ): + from packaging.version import Version + + if Version(framework_version) >= Version("2.10"): + integration_path = os.path.join("integration", "sagemaker_v3") + # Conditions for modifying tensorflow SageMaker pytest commands if framework == "tensorflow" and sagemaker_test_type == SAGEMAKER_REMOTE_TEST_TYPE: if job_type == "inference": @@ -447,6 +458,14 @@ def execute_sagemaker_remote_tests(process_index, image, global_pytest_cache, py context.run(f"virtualenv {tag}") with context.prefix(f"source {tag}/bin/activate"): context.run("pip install -r requirements.txt", warn=True) + # For PyTorch >= 2.10, install SM SDK v3 
requirements to override v2 + framework, framework_version = get_framework_and_version_from_tag(image) + if framework == "pytorch": + from packaging.version import Version + + if Version(framework_version) >= Version("2.10"): + v3_req = os.path.join("integration", "sagemaker_v3", "requirements.txt") + context.run(f"pip install -r {v3_req}", warn=True) pytest_cache_util.download_pytest_cache_from_s3_to_local( path, **pytest_cache_params, custom_cache_directory=str(process_index) )