Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 0 additions & 7 deletions .github/workflows/pr-command.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -64,13 +64,6 @@ jobs:
username: ${{ secrets.QUAY_USERNAME }}
password: ${{ secrets.QUAY_ROBOT_TOKEN }}

- name: Run basic sanity checks/tests on the new image before pushing
run: |
echo 'check if accelerate is installed and in the PATH'
IMAGE_NAME=${{ vars.QUAY_REPOSITORY }}fms-hf-tuning:pr-${{ github.event.issue.number }}-nvcr
docker run --rm -it --entrypoint which "$IMAGE_NAME" accelerate
echo 'checks done'

- name: Push docker image
run: |
docker push ${{ vars.QUAY_REPOSITORY }}fms-hf-tuning:pr-${{ github.event.issue.number }}-nvcr
Expand Down
54 changes: 11 additions & 43 deletions build/nvcr.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
## Global Args #################################################################
## If the NVCR container is updated, be sure to check the torch and Python
## installation versions inside the Dockerfile before pushing changes.
ARG NVCR_IMAGE_VERSION=25.02-py3
ARG NVCR_IMAGE_VERSION=25.10-py3

# This is based on what is inside the NVCR image already
ARG PYTHON_VERSION=3.12
Expand All @@ -28,58 +28,26 @@ ARG USER_UID=0
ARG WORKDIR=/app
ARG SOURCE_DIR=${WORKDIR}/fms-hf-tuning

ARG ENABLE_FMS_ACCELERATION=true
ARG ENABLE_AIM=false
ARG ENABLE_MLFLOW=false
ARG ENABLE_SCANNER=false
ARG ENABLE_CLEARML=true
ARG ENABLE_TRITON_KERNELS=true
ARG ENABLE_RECOMMENDER=true

# Ensure that mamba_ssm is always built from source
ENV PIP_NO_BINARY=mamba-ssm,mamba_ssm

# upgrade torch as the base layer contains only torch 2.7
RUN python -m pip install --upgrade pip && \
pip install --upgrade setuptools && \
pip install --upgrade --force-reinstall torch torchaudio torchvision --index-url https://download.pytorch.org/whl/cu128
# install triton kernels
RUN pip install --no-cache-dir "git+https://github.com/triton-lang/triton.git@main#subdirectory=python/triton_kernels"

# Install main package + flash attention
COPY . ${SOURCE_DIR}
RUN cd ${SOURCE_DIR}

RUN pip install --no-cache-dir ${SOURCE_DIR} && \
pip install --no-cache-dir --no-build-isolation ${SOURCE_DIR}[flash-attn] && \
pip install --no-cache-dir --no-build-isolation ${SOURCE_DIR}[mamba]

# Optional extras
RUN if [[ "${ENABLE_FMS_ACCELERATION}" == "true" ]]; then \
pip install --no-cache-dir ${SOURCE_DIR}[fms-accel] && \
python -m fms_acceleration.cli install fms_acceleration_peft && \
python -m fms_acceleration.cli install fms_acceleration_foak && \
python -m fms_acceleration.cli install fms_acceleration_aadp && \
python -m fms_acceleration.cli install fms_acceleration_moe && \
python -m fms_acceleration.cli install fms_acceleration_odm; \
fi

RUN if [[ "${ENABLE_TRITON_KERNELS}" == "true" ]]; then \
pip install --no-cache-dir "git+https://github.com/triton-lang/triton.git@main#subdirectory=python/triton_kernels"; \
fi
RUN if [[ "${ENABLE_CLEARML}" == "true" ]]; then \
pip install --no-cache-dir ${SOURCE_DIR}[clearml]; \
fi
RUN if [[ "${ENABLE_AIM}" == "true" ]]; then \
pip install --no-cache-dir ${SOURCE_DIR}[aim]; \
fi
RUN if [[ "${ENABLE_MLFLOW}" == "true" ]]; then \
pip install --no-cache-dir ${SOURCE_DIR}[mlflow]; \
fi
RUN if [[ "${ENABLE_SCANNER}" == "true" ]]; then \
pip install --no-cache-dir ${SOURCE_DIR}[scanner-dev]; \
fi
RUN if [[ "${ENABLE_RECOMMENDER}" == "true" ]]; then \
pip install --no-cache-dir ${SOURCE_DIR}[tuning_config_recommender]; \
fi
RUN pip install --no-cache-dir ${SOURCE_DIR}[flash-attn,mamba,fms-accel,clearml,tuning_config_recommender]

# install fms-accel packages
RUN python -m fms_acceleration.cli install fms_acceleration_peft && \
python -m fms_acceleration.cli install fms_acceleration_foak && \
python -m fms_acceleration.cli install fms_acceleration_aadp && \
python -m fms_acceleration.cli install fms_acceleration_moe && \
python -m fms_acceleration.cli install fms_acceleration_odm

# cleanup build artifacts and caches
RUN rm -rf /root/.cache /tmp/pip-* \
Expand Down
2 changes: 1 addition & 1 deletion tests/test_sft_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2192,7 +2192,7 @@ def test_empty_data():
data_args = copy.deepcopy(DATA_ARGS)
data_args.training_data_path = EMPTY_DATA

with pytest.raises((DatasetGenerationError, ValueError)):
with pytest.raises((DatasetGenerationError, ValueError, StopIteration)):
sft_trainer.train(
copy.deepcopy(MODEL_ARGS),
data_args,
Expand Down
Loading