From ca2804fd8bf6fc38d3522458032e783130ef5b3e Mon Sep 17 00:00:00 2001 From: Dushyant Behl Date: Tue, 24 Mar 2026 15:48:48 +0530 Subject: [PATCH] fix few errors Signed-off-by: Dushyant Behl --- .github/workflows/pr-command.yaml | 7 ---- build/nvcr.Dockerfile | 54 +++++++------------------------ tests/test_sft_trainer.py | 2 +- 3 files changed, 12 insertions(+), 51 deletions(-) diff --git a/.github/workflows/pr-command.yaml b/.github/workflows/pr-command.yaml index 695f05d11..816ceff23 100644 --- a/.github/workflows/pr-command.yaml +++ b/.github/workflows/pr-command.yaml @@ -64,13 +64,6 @@ jobs: username: ${{ secrets.QUAY_USERNAME }} password: ${{ secrets.QUAY_ROBOT_TOKEN }} - - name: Run basic sanity checks/tests on the new image before pushing - run: | - echo 'check if accelerate is installed and in the PATH' - IMAGE_NAME=${{ vars.QUAY_REPOSITORY }}fms-hf-tuning:pr-${{ github.event.issue.number }}-nvcr - docker run --rm -it --entrypoint which "$IMAGE_NAME" accelerate - echo 'checks done' - - name: Push docker image run: | docker push ${{ vars.QUAY_REPOSITORY }}fms-hf-tuning:pr-${{ github.event.issue.number }}-nvcr diff --git a/build/nvcr.Dockerfile b/build/nvcr.Dockerfile index aa575a2ef..b85c62dd9 100644 --- a/build/nvcr.Dockerfile +++ b/build/nvcr.Dockerfile @@ -15,7 +15,7 @@ ## Global Args ################################################################# ## If the nvcr container is updated, ensure to check the torch and python ## installation version inside the dockerfile before pushing changes. -ARG NVCR_IMAGE_VERSION=25.02-py3 +ARG NVCR_IMAGE_VERSION=25.10-py3 # This is based on what is inside the NVCR image already ARG PYTHON_VERSION=3.12 @@ -28,58 +28,26 @@ ARG USER_UID=0 ARG WORKDIR=/app ARG SOURCE_DIR=${WORKDIR}/fms-hf-tuning -ARG ENABLE_FMS_ACCELERATION=true -ARG ENABLE_AIM=false -ARG ENABLE_MLFLOW=false -ARG ENABLE_SCANNER=false -ARG ENABLE_CLEARML=true ARG ENABLE_TRITON_KERNELS=true -ARG ENABLE_RECOMMENDER=true # Ensures to always build mamba_ssm from source ENV PIP_NO_BINARY=mamba-ssm,mamba_ssm -# upgrade torch as the base layer contains only torch 2.7 -RUN python -m pip install --upgrade pip && \ - pip install --upgrade setuptools && \ - pip install --upgrade --force-reinstall torch torchaudio torchvision --index-url https://download.pytorch.org/whl/cu128 +# install triton kernels +RUN pip install --no-cache-dir "git+https://github.com/triton-lang/triton.git@main#subdirectory=python/triton_kernels" # Install main package + flash attention COPY . ${SOURCE_DIR} RUN cd ${SOURCE_DIR} -RUN pip install --no-cache-dir ${SOURCE_DIR} && \ - pip install --no-cache-dir --no-build-isolation ${SOURCE_DIR}[flash-attn] && \ - pip install --no-cache-dir --no-build-isolation ${SOURCE_DIR}[mamba] - -# Optional extras -RUN if [[ "${ENABLE_FMS_ACCELERATION}" == "true" ]]; then \ - pip install --no-cache-dir ${SOURCE_DIR}[fms-accel] && \ - python -m fms_acceleration.cli install fms_acceleration_peft && \ - python -m fms_acceleration.cli install fms_acceleration_foak && \ - python -m fms_acceleration.cli install fms_acceleration_aadp && \ - python -m fms_acceleration.cli install fms_acceleration_moe && \ - python -m fms_acceleration.cli install fms_acceleration_odm; \ - fi - -RUN if [[ "${ENABLE_TRITON_KERNELS}" == "true" ]]; then \ - pip install --no-cache-dir "git+https://github.com/triton-lang/triton.git@main#subdirectory=python/triton_kernels"; \ - fi -RUN if [[ "${ENABLE_CLEARML}" == "true" ]]; then \ - pip install --no-cache-dir ${SOURCE_DIR}[clearml]; \ - fi -RUN if [[ "${ENABLE_AIM}" == "true" ]]; then \ - pip install --no-cache-dir ${SOURCE_DIR}[aim]; \ - fi -RUN if [[ "${ENABLE_MLFLOW}" == "true" ]]; then \ - pip install --no-cache-dir ${SOURCE_DIR}[mlflow]; \ - fi -RUN if [[ "${ENABLE_SCANNER}" == "true" ]]; then \ - pip install --no-cache-dir ${SOURCE_DIR}[scanner-dev]; \ - fi -RUN if [[ "${ENABLE_RECOMMENDER}" == "true" ]]; then \ - pip install --no-cache-dir ${SOURCE_DIR}[tuning_config_recommender]; \ - fi +RUN pip install --no-cache-dir ${SOURCE_DIR}[flash-attn,mamba,fms-accel,clearml,tuning_config_recommender] + +# install fms-accel packages +RUN python -m fms_acceleration.cli install fms_acceleration_peft && \ + python -m fms_acceleration.cli install fms_acceleration_foak && \ + python -m fms_acceleration.cli install fms_acceleration_aadp && \ + python -m fms_acceleration.cli install fms_acceleration_moe && \ + python -m fms_acceleration.cli install fms_acceleration_odm # cleanup build artifacts and caches RUN rm -rf /root/.cache /tmp/pip-* \ diff --git a/tests/test_sft_trainer.py b/tests/test_sft_trainer.py index df972dfbc..029cf63ad 100644 --- a/tests/test_sft_trainer.py +++ b/tests/test_sft_trainer.py @@ -2192,7 +2192,7 @@ def test_empty_data(): data_args = copy.deepcopy(DATA_ARGS) data_args.training_data_path = EMPTY_DATA - with pytest.raises((DatasetGenerationError, ValueError)): + with pytest.raises((DatasetGenerationError, ValueError, StopIteration)): sft_trainer.train( copy.deepcopy(MODEL_ARGS), data_args,