From 4f856af876753e2a566dab058d81de62c49ac1ee Mon Sep 17 00:00:00 2001 From: Bhanu Teja Goshikonda Date: Tue, 31 Mar 2026 17:34:02 -0700 Subject: [PATCH 01/17] AI changes made during Kiro-cli session --- X-AI-Tool: Kiro-cli X-AI-Handle-Time-Seconds: 12 X-AI-Prompt: add these lines in release-sagemaker-xgboost to truigger # TODO: Remove push trigger after testing, keep only workflow_dispatch push: branches: [xgboost-release] --- .github/workflows/release-sagemaker-xgboost.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/release-sagemaker-xgboost.yml b/.github/workflows/release-sagemaker-xgboost.yml index 0acfb719df6a..619fb91a0268 100644 --- a/.github/workflows/release-sagemaker-xgboost.yml +++ b/.github/workflows/release-sagemaker-xgboost.yml @@ -1,6 +1,9 @@ name: Release - XGBoost SageMaker on: + # TODO: Remove push trigger after testing, keep only workflow_dispatch + push: + branches: [xgboost-release] workflow_dispatch: permissions: From bd34e9ad717b6eddff218031b200edfc9246eff9 Mon Sep 17 00:00:00 2001 From: Bhanu Teja Goshikonda Date: Tue, 31 Mar 2026 17:37:06 -0700 Subject: [PATCH 02/17] AI changes made during Kiro-cli session --- X-AI-Tool: Kiro-cli X-AI-Handle-Time-Seconds: 53 X-AI-Prompt: no this has made a mess now the pr contains only that changes of 2 lines --- .github/config/sagemaker-xgboost.yml | 2 +- .../workflows/release-sagemaker-xgboost.yml | 115 ++++++++++++------ 2 files changed, 79 insertions(+), 38 deletions(-) diff --git a/.github/config/sagemaker-xgboost.yml b/.github/config/sagemaker-xgboost.yml index 3a9097bf589f..eef4fde21767 100644 --- a/.github/config/sagemaker-xgboost.yml +++ b/.github/config/sagemaker-xgboost.yml @@ -24,7 +24,7 @@ common: release: release: true force_release: false - public_registry: true + public_registry: false private_registry: true enable_soci: false environment: gamma diff --git a/.github/workflows/release-sagemaker-xgboost.yml b/.github/workflows/release-sagemaker-xgboost.yml index 619fb91a0268..9a8f5297c641 100644 --- a/.github/workflows/release-sagemaker-xgboost.yml +++ b/.github/workflows/release-sagemaker-xgboost.yml @@ -151,47 +151,88 @@ jobs: framework: ${{ needs.load-config.outputs.framework }} framework-version: ${{ needs.load-config.outputs.framework-version }} - benchmark-test: - needs: [build-image, load-config] - if: success() - timeout-minutes: 150 - strategy: - fail-fast: false - matrix: - test-module: - - test_training_objective - - test_training_tree_method - - test_training_max_depth - - test_training_num_round - - test_training_data_size - - test_training_instance_type - - test_training_content_type - runs-on: - - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} - fleet:default-runner - buildspec-override:true + # Benchmark tests commented out - already passed in PR #5852 + # benchmark-test: + # needs: [build-image, load-config] + # if: success() + # timeout-minutes: 150 + # strategy: + # fail-fast: false + # matrix: + # test-module: + # - test_training_objective + # - test_training_tree_method + # - test_training_max_depth + # - test_training_num_round + # - test_training_data_size + # - test_training_instance_type + # - test_training_content_type + # runs-on: + # - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} + # fleet:default-runner + # buildspec-override:true + # concurrency: + # group: ${{ github.workflow }}-benchmark-${{ matrix.test-module }}-${{ github.run_id }} + # cancel-in-progress: true + # steps: + # - name: Checkout DLC source + # uses: actions/checkout@v5 + # - name: Install test dependencies + # run: | + # uv venv --python 3.12 + # source .venv/bin/activate + # uv pip install -r test/requirements.txt + # uv pip install -r test/xgboost/requirements.txt + # - name: Run ${{ matrix.test-module }} + # run: | + # source .venv/bin/activate + # cd test/ + # python3 -m pytest -vs -rA \ + # --image-uri ${{ needs.build-image.outputs.ci-image }} \ + # xgboost/benchmarks/${{ matrix.test-module }}.py + + # TODO: Add integration-test job once integ tests are implemented + # TODO: Add container-test job once container tests are implemented + + generate-release-spec: + needs: [load-config, build-image, unit-test, security-test] + runs-on: ubuntu-latest concurrency: - group: ${{ github.workflow }}-benchmark-${{ matrix.test-module }}-${{ github.run_id }} + group: ${{ github.workflow }}-generate-release-spec-${{ github.run_id }} cancel-in-progress: true + outputs: + release-spec: ${{ steps.generate.outputs.release-spec }} + should-release: ${{ steps.check-release.outputs.should-release }} steps: - - name: Checkout DLC source + - name: Checkout code uses: actions/checkout@v5 - - name: Install test dependencies - run: | - uv venv --python 3.12 - source .venv/bin/activate - uv pip install -r test/requirements.txt - uv pip install -r test/xgboost/requirements.txt - - - name: Run ${{ matrix.test-module }} + - name: Check if release is enabled + id: check-release run: | - source .venv/bin/activate - cd test/ - python3 -m pytest -vs -rA \ - --image-uri ${{ needs.build-image.outputs.ci-image }} \ - xgboost/benchmarks/${{ matrix.test-module }}.py + echo '${{ needs.load-config.outputs.config }}' > config.json + RELEASE_ENABLED=$(jq -r '.release.release // false' config.json) + echo "Release enabled: ${RELEASE_ENABLED}" + echo "should-release=${RELEASE_ENABLED}" >> $GITHUB_OUTPUT + + - name: Generate release spec + id: generate + if: steps.check-release.outputs.should-release == 'true' + uses: ./.github/actions/generate-release-spec + with: + config-json: ${{ needs.load-config.outputs.config }} - # TODO: Add integration-test job once integ tests are implemented - # TODO: Add container-test job once container tests are implemented - # TODO: Add generate-release-spec and release-image jobs when release is ready + release-image: + needs: [load-config, build-image, generate-release-spec] + if: needs.generate-release-spec.outputs.should-release == 'true' + concurrency: + group: ${{ github.workflow }}-release-image-${{ github.run_id }} + cancel-in-progress: true + uses: ./.github/workflows/reusable-release-image.yml + with: + source-image-uri: ${{ needs.build-image.outputs.ci-image }} + release-spec: ${{ needs.generate-release-spec.outputs.release-spec }} + environment: ${{ fromJson(needs.load-config.outputs.config).release.environment }} + aws-region: ${{ vars.AWS_REGION }} + runner-fleet: default-runner + secrets: inherit From c7c958ae747d6905fb415cc25edd8d17f03aad65 Mon Sep 17 00:00:00 2001 From: Bhanu Teja Goshikonda Date: Wed, 1 Apr 2026 12:36:14 -0700 Subject: [PATCH 03/17] AI changes made during Kiro-cli session --- X-AI-Tool: Kiro-cli X-AI-Handle-Time-Seconds: 14 X-AI-Prompt: okay make chagne to xgbost-migration brnach not release since our branch is migration not release --- .github/workflows/release-sagemaker-xgboost.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/release-sagemaker-xgboost.yml b/.github/workflows/release-sagemaker-xgboost.yml index 9a8f5297c641..7bd25fd74326 100644 --- a/.github/workflows/release-sagemaker-xgboost.yml +++ b/.github/workflows/release-sagemaker-xgboost.yml @@ -3,7 +3,7 @@ name: Release - XGBoost SageMaker on: # TODO: Remove push trigger after testing, keep only workflow_dispatch push: - branches: [xgboost-release] + branches: [xgboost-migration] workflow_dispatch: permissions: From 643f139ccbe5bf0dfe8dfaa5fb88e40aba5693b5 Mon Sep 17 00:00:00 2001 From: Bhanu Teja Goshikonda Date: Wed, 1 Apr 2026 14:47:48 -0700 Subject: [PATCH 04/17] AI changes made during Kiro-cli session --- X-AI-Tool: Kiro-cli X-AI-Handle-Time-Seconds: 9 X-AI-Prompt: yeah set it --- .github/config/sagemaker-xgboost.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/config/sagemaker-xgboost.yml b/.github/config/sagemaker-xgboost.yml index eef4fde21767..b5a13453b986 100644 --- a/.github/config/sagemaker-xgboost.yml +++ b/.github/config/sagemaker-xgboost.yml @@ -23,7 +23,7 @@ common: # Release configuration release: release: true - force_release: false + force_release: true public_registry: false private_registry: true enable_soci: false From 48062a61b423f2e019c5677f6cfd3f32e3c28b88 Mon Sep 17 00:00:00 2001 From: Bhanu Teja Goshikonda Date: Thu, 2 Apr 2026 18:53:44 -0700 Subject: [PATCH 05/17] AI changes made during Kiro-cli session --- X-AI-Tool: Kiro-cli X-AI-Handle-Time-Seconds: 37 X-AI-Prompt: Do not remove anything our changes are to just add the release image and generate release spec and the rest should not be changed what is in the main. i know previously we commented out the benchmark teests but recent chnages in repo already handled that so now we have to just add last 2 steps without changeing any thing --- .github/config/ray-ec2-cpu.yml | 2 +- .github/config/ray-ec2-gpu.yml | 2 +- .github/config/ray-sagemaker-cpu.yml | 2 +- .github/config/ray-sagemaker-gpu.yml | 2 +- .github/config/vllm-model-tests.yml | 24 +- .../workflows/release-sagemaker-xgboost.yml | 50 +- .../sagemaker-xgboost-integ-tests.yml | 202 +++++ .github/workflows/vllm-benchmark.yml | 67 +- docker/ray/Dockerfile.cpu | 2 +- docker/ray/Dockerfile.gpu | 2 +- .../data/djl-inference/0.36-lmi22.0.0-gpu.yml | 2 +- .../data/djl-inference/0.36-lmi23.0.0-gpu.yml | 9 + docs/src/data/vllm/0.18.1-gpu-sagemaker.yml | 26 + scripts/ray/pyproject.toml | 2 +- scripts/ray/uv.lock | 16 +- scripts/vllm/benchmark/benchmark_report.py | 14 +- scripts/vllm/benchmark/vllm_benchmark_test.sh | 35 +- test/test_utils/aws.py | 26 +- test/xgboost/container/conftest.py | 66 ++ test/xgboost/container/container_helper.py | 319 +++++++ test/xgboost/container/generate_models.py | 109 +++ .../xgboost/container/test_batch_transform.py | 162 ++++ test/xgboost/container/test_scoring.py | 314 +++++++ test/xgboost/container/test_training.py | 782 ++++++++++++++++++ 24 files changed, 2102 insertions(+), 135 deletions(-) create mode 100644 .github/workflows/sagemaker-xgboost-integ-tests.yml create mode 100644 docs/src/data/djl-inference/0.36-lmi23.0.0-gpu.yml create mode 100644 docs/src/data/vllm/0.18.1-gpu-sagemaker.yml create mode 100644 test/xgboost/container/conftest.py create mode 100644 test/xgboost/container/container_helper.py create mode 100755 test/xgboost/container/generate_models.py create mode 100644 test/xgboost/container/test_batch_transform.py create mode 100644 test/xgboost/container/test_scoring.py create mode 100644 test/xgboost/container/test_training.py diff --git a/.github/config/ray-ec2-cpu.yml b/.github/config/ray-ec2-cpu.yml index 380950e1639c..88b2c42a942f 100644 --- a/.github/config/ray-ec2-cpu.yml +++ b/.github/config/ray-ec2-cpu.yml @@ -9,7 +9,7 @@ image: # Build configuration common: framework: "ray" - framework_version: "2.54.0" + framework_version: "2.54.1" job_type: "inference" python_version: "py313" os_version: "amzn2023" diff --git a/.github/config/ray-ec2-gpu.yml b/.github/config/ray-ec2-gpu.yml index b5e8a9fc6dde..b3dc0961cbee 100644 --- a/.github/config/ray-ec2-gpu.yml +++ b/.github/config/ray-ec2-gpu.yml @@ -9,7 +9,7 @@ image: # Build configuration common: framework: "ray" - framework_version: "2.54.0" + framework_version: "2.54.1" job_type: "inference" python_version: "py313" cuda_version: "cu129" diff --git a/.github/config/ray-sagemaker-cpu.yml b/.github/config/ray-sagemaker-cpu.yml index bc6e14ab5ef2..2f41884b5af3 100644 --- a/.github/config/ray-sagemaker-cpu.yml +++ b/.github/config/ray-sagemaker-cpu.yml @@ -9,7 +9,7 @@ image: # Build configuration common: framework: "ray" - framework_version: "2.54.0" + framework_version: "2.54.1" job_type: "inference" python_version: "py313" os_version: "amzn2023" diff --git a/.github/config/ray-sagemaker-gpu.yml b/.github/config/ray-sagemaker-gpu.yml index e00d11a07f9c..0a32eea23ea3 100644 --- a/.github/config/ray-sagemaker-gpu.yml +++ b/.github/config/ray-sagemaker-gpu.yml @@ -9,7 +9,7 @@ image: # Build configuration common: framework: "ray" - framework_version: "2.54.0" + framework_version: "2.54.1" job_type: "inference" python_version: "py313" cuda_version: "cu129" diff --git a/.github/config/vllm-model-tests.yml b/.github/config/vllm-model-tests.yml index 164b970cfa8e..0f7c76b44246 100644 --- a/.github/config/vllm-model-tests.yml +++ b/.github/config/vllm-model-tests.yml @@ -28,7 +28,7 @@ benchmark: output_len: 128 num_prompts: 64 batch_size: 4 - min_throughput: 6000 + min_throughput: 1200 min_rps: 5 - name: "qwen3.5-9b" @@ -39,7 +39,7 @@ benchmark: output_len: 128 num_prompts: 64 batch_size: 4 - min_throughput: 180 + min_throughput: 20 min_rps: 0.15 - name: "llama-3.3-70b" @@ -50,7 +50,7 @@ benchmark: output_len: 128 num_prompts: 32 batch_size: 2 - min_throughput: 400 + min_throughput: 80 min_rps: 0.35 # https://github.com/vllm-project/vllm/issues/32637 @@ -64,7 +64,7 @@ benchmark: # output_len: 128 # num_prompts: 64 # batch_size: 4 - # min_throughput: 100 + # min_throughput: 20 # min_rps: 1 - name: "qwen3.5-35b-a3b-fp8" @@ -77,7 +77,7 @@ benchmark: output_len: 128 num_prompts: 64 batch_size: 4 - min_throughput: 400 + min_throughput: 80 min_rps: 0.35 # A100 is compute capability 8.0 — FP8 requires 8.9+ (H100/L40S). @@ -90,7 +90,7 @@ benchmark: output_len: 128 num_prompts: 64 batch_size: 4 - min_throughput: 100 + min_throughput: 20 min_rps: 0.2 - name: "qwen3-coder-next-fp8" @@ -101,7 +101,7 @@ benchmark: output_len: 256 num_prompts: 32 batch_size: 2 - min_throughput: 280 + min_throughput: 93 min_rps: 0.25 runner-scale-sets: @@ -112,7 +112,7 @@ benchmark: output_len: 256 num_prompts: 32 batch_size: 2 - min_throughput: 3400 + min_throughput: 1133 min_rps: 3 - name: "qwen3.5-35b-a3b-fp8" @@ -124,7 +124,7 @@ benchmark: output_len: 128 num_prompts: 64 batch_size: 4 - min_throughput: 400 + min_throughput: 80 min_rps: 0.35 - name: "qwen3.5-27b-fp8" @@ -135,7 +135,7 @@ benchmark: output_len: 128 num_prompts: 64 batch_size: 4 - min_throughput: 100 + min_throughput: 20 min_rps: 0.2 - name: "qwen3-coder-next-fp8" @@ -145,7 +145,7 @@ benchmark: output_len: 256 num_prompts: 32 batch_size: 2 - min_throughput: 280 + min_throughput: 93 min_rps: 0.25 - name: "llama-3.3-70b" @@ -155,7 +155,7 @@ benchmark: output_len: 128 num_prompts: 32 batch_size: 2 - min_throughput: 400 + min_throughput: 80 min_rps: 0.35 # upstream diff --git a/.github/workflows/release-sagemaker-xgboost.yml b/.github/workflows/release-sagemaker-xgboost.yml index 7bd25fd74326..49c0a4a38b17 100644 --- a/.github/workflows/release-sagemaker-xgboost.yml +++ b/.github/workflows/release-sagemaker-xgboost.yml @@ -151,48 +151,14 @@ jobs: framework: ${{ needs.load-config.outputs.framework }} framework-version: ${{ needs.load-config.outputs.framework-version }} - # Benchmark tests commented out - already passed in PR #5852 - # benchmark-test: - # needs: [build-image, load-config] - # if: success() - # timeout-minutes: 150 - # strategy: - # fail-fast: false - # matrix: - # test-module: - # - test_training_objective - # - test_training_tree_method - # - test_training_max_depth - # - test_training_num_round - # - test_training_data_size - # - test_training_instance_type - # - test_training_content_type - # runs-on: - # - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} - # fleet:default-runner - # buildspec-override:true - # concurrency: - # group: ${{ github.workflow }}-benchmark-${{ matrix.test-module }}-${{ github.run_id }} - # cancel-in-progress: true - # steps: - # - name: Checkout DLC source - # uses: actions/checkout@v5 - # - name: Install test dependencies - # run: | - # uv venv --python 3.12 - # source .venv/bin/activate - # uv pip install -r test/requirements.txt - # uv pip install -r test/xgboost/requirements.txt - # - name: Run ${{ matrix.test-module }} - # run: | - # source .venv/bin/activate - # cd test/ - # python3 -m pytest -vs -rA \ - # --image-uri ${{ needs.build-image.outputs.ci-image }} \ - # xgboost/benchmarks/${{ matrix.test-module }}.py - - # TODO: Add integration-test job once integ tests are implemented - # TODO: Add container-test job once container tests are implemented + xgboost-tests: + needs: [build-image, load-config] + if: success() + uses: ./.github/workflows/sagemaker-xgboost-integ-tests.yml + with: + image-uri: ${{ needs.build-image.outputs.ci-image }} + aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} + aws-region: ${{ vars.AWS_REGION }} generate-release-spec: needs: [load-config, build-image, unit-test, security-test] diff --git a/.github/workflows/sagemaker-xgboost-integ-tests.yml b/.github/workflows/sagemaker-xgboost-integ-tests.yml new file mode 100644 index 000000000000..76bb4ff4b430 --- /dev/null +++ b/.github/workflows/sagemaker-xgboost-integ-tests.yml @@ -0,0 +1,202 @@ +name: Reusable XGBoost SageMaker Integration Tests + +permissions: + contents: read + +on: + workflow_call: + inputs: + image-uri: + description: 'Image URI to test' + required: true + type: string + aws-account-id: + description: 'AWS account ID for ECR authentication' + required: true + type: string + aws-region: + description: 'AWS region for ECR authentication' + required: true + type: string + +env: + FORCE_COLOR: "1" + +jobs: + # =========================================================================== + # Generate inference models inside the container (ensures version compat) + # =========================================================================== + generate-models: + timeout-minutes: 15 + runs-on: + - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} + fleet:default-runner + buildspec-override:true + steps: + - name: Checkout DLC source + uses: actions/checkout@v5 + + - name: Install dependencies + run: | + uv venv --python 3.12 + source .venv/bin/activate + uv pip install xgboost==3.0.5 boto3 numpy + + - name: Generate and upload models + run: | + source .venv/bin/activate + python3 test/xgboost/container/generate_models.py + + # =========================================================================== + # Container tests — training (no model dependency) + # =========================================================================== + container-test-training: + timeout-minutes: 90 + runs-on: + - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} + fleet:x86-g6xl-runner + buildspec-override:true + steps: + - name: Checkout DLC source + uses: actions/checkout@v5 + + - name: ECR login + uses: ./.github/actions/ecr-authenticate + with: + aws-account-id: ${{ inputs.aws-account-id }} + aws-region: ${{ inputs.aws-region }} + image-uri: ${{ inputs.image-uri }} + + - name: Pull image + run: docker pull ${{ inputs.image-uri }} + + - name: Install test dependencies + run: | + uv venv --python 3.12 + source .venv/bin/activate + uv pip install -r test/requirements.txt docker pytest boto3 requests + + - name: Run training container tests + run: | + source .venv/bin/activate + cd test/ + python3 -m pytest -v --tb=short -rA --log-cli-level=INFO \ + --image ${{ inputs.image-uri }} \ + xgboost/container/test_training.py + + # =========================================================================== + # Container tests — scoring (depends on generate-models) + # =========================================================================== + container-test-scoring: + needs: [generate-models] + timeout-minutes: 60 + runs-on: + - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} + fleet:x86-g6xl-runner + buildspec-override:true + steps: + - name: Checkout DLC source + uses: actions/checkout@v5 + + - name: ECR login + uses: ./.github/actions/ecr-authenticate + with: + aws-account-id: ${{ inputs.aws-account-id }} + aws-region: ${{ inputs.aws-region }} + image-uri: ${{ inputs.image-uri }} + + - name: Pull image + run: docker pull ${{ inputs.image-uri }} + + - name: Install test dependencies + run: | + uv venv --python 3.12 + source .venv/bin/activate + uv pip install -r test/requirements.txt docker pytest boto3 requests + + - name: Run scoring container tests + run: | + source .venv/bin/activate + cd test/ + python3 -m pytest -v --tb=short -rA --log-cli-level=INFO \ + --image ${{ inputs.image-uri }} \ + xgboost/container/test_scoring.py + + # =========================================================================== + # Container tests — batch transform (depends on generate-models) + # =========================================================================== + container-test-batch-transform: + needs: [generate-models] + timeout-minutes: 60 + runs-on: + - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} + fleet:x86-g6xl-runner + buildspec-override:true + steps: + - name: Checkout DLC source + uses: actions/checkout@v5 + + - name: ECR login + uses: ./.github/actions/ecr-authenticate + with: + aws-account-id: ${{ inputs.aws-account-id }} + aws-region: ${{ inputs.aws-region }} + image-uri: ${{ inputs.image-uri }} + + - name: Pull image + run: docker pull ${{ inputs.image-uri }} + + - name: Install test dependencies + run: | + uv venv --python 3.12 + source .venv/bin/activate + uv pip install -r test/requirements.txt docker pytest boto3 requests + + - name: Run batch transform container tests + run: | + source .venv/bin/activate + cd test/ + python3 -m pytest -v --tb=short -rA --log-cli-level=INFO \ + --image ${{ inputs.image-uri }} \ + xgboost/container/test_batch_transform.py + + # TODO: Add integration-test job (upstream sagemaker-xgboost-container local mode tests) + + # =========================================================================== + # Benchmark tests (SageMaker training jobs) — commented out pending validation + # =========================================================================== + # benchmark-test: + # timeout-minutes: 150 + # strategy: + # fail-fast: false + # matrix: + # test-module: + # - test_training_objective + # - test_training_tree_method + # - test_training_max_depth + # - test_training_num_round + # - test_training_data_size + # - test_training_instance_type + # - test_training_content_type + # runs-on: + # - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} + # fleet:x86-g6xl-runner + # buildspec-override:true + # steps: + # - name: Checkout DLC source + # uses: actions/checkout@v5 + # + # - name: Install test dependencies + # run: | + # uv venv --python 3.12 + # source .venv/bin/activate + # uv pip install -r test/requirements.txt + # uv pip install -r test/xgboost/requirements.txt + # + # - name: Run ${{ matrix.test-module }} + # run: | + # source .venv/bin/activate + # cd test/ + # python3 -m pytest -v --tb=short -rA --log-cli-level=INFO \ + # --image-uri ${{ inputs.image-uri }} \ + # xgboost/benchmarks/${{ matrix.test-module }}.py diff --git a/.github/workflows/vllm-benchmark.yml b/.github/workflows/vllm-benchmark.yml index 725476d5b97f..d533a304fbb8 100644 --- a/.github/workflows/vllm-benchmark.yml +++ b/.github/workflows/vllm-benchmark.yml @@ -61,6 +61,8 @@ jobs: needs: [load-benchmarks] strategy: fail-fast: false + # we only have 1 g6e12xl 4 models need it action only schedules once for the same label + max-parallel: 2 matrix: include: ${{ fromJson(needs.load-benchmarks.outputs.codebuild-fleet-matrix) }} runs-on: @@ -92,22 +94,17 @@ jobs: nvidia-smi - name: Download model from S3 - run: | - MODEL_DIR="/dlc-models/${{ matrix.name }}" - mkdir -p "${MODEL_DIR}" - aws s3 cp "${{ matrix.s3_path }}" "/dlc-models/${{ matrix.name }}.tar.gz" - tar xzf "/dlc-models/${{ matrix.name }}.tar.gz" -C "${MODEL_DIR}" - rm -f "/dlc-models/${{ matrix.name }}.tar.gz" - SUBDIRS=("${MODEL_DIR}"/*) - if [ ${#SUBDIRS[@]} -eq 1 ] && [ -d "${SUBDIRS[0]}" ]; then - mv "${SUBDIRS[0]}"/* "${MODEL_DIR}"/ - rmdir "${SUBDIRS[0]}" - fi + uses: ./.github/actions/download-model + id: model + with: + s3-path: ${{ matrix.s3_path }} + model-name: ${{ matrix.name }} - name: Start container run: | docker pull ${{ env.IMAGE_URI }} CONTAINER_ID=$(docker run -d -it --gpus all --entrypoint /bin/bash \ + --ipc=host --shm-size=10g \ -v /dlc-models:/models \ ${{ env.IMAGE_URI }}) echo "CONTAINER_ID=$CONTAINER_ID" >> $GITHUB_ENV @@ -149,8 +146,7 @@ jobs: run: | docker stop ${CONTAINER_ID} 2>/dev/null || true docker rm -f ${CONTAINER_ID} 2>/dev/null || true - docker rmi ${{ env.IMAGE_URI }} 2>/dev/null || true - rm -rf /dlc-models + kill ${{ steps.model.outputs.lock-pid }} 2>/dev/null || true benchmark-runner-scale-sets: name: benchmark (${{ matrix.name }} / gpu-efa-runners) @@ -158,7 +154,6 @@ jobs: needs: [load-benchmarks] strategy: fail-fast: false - max-parallel: 1 matrix: include: ${{ fromJson(needs.load-benchmarks.outputs.runner-scale-sets-matrix) }} runs-on: gpu-efa-runners @@ -172,37 +167,20 @@ jobs: aws-account-id: ${{ env.ACCOUNT_ID }} aws-region: ${{ env.REGION }} - - name: GPU cleanup and status - run: | - echo "=== Pre-cleanup GPU state ===" - nvidia-smi - echo "" - echo "=== Stopping stale containers ===" - docker ps -q | xargs -r docker stop 2>/dev/null || true - docker ps -aq | xargs -r docker rm -f 2>/dev/null || true - echo "=== Clearing GPU memory ===" - nvidia-smi --gpu-reset 2>/dev/null || true - echo "" - echo "=== Post-cleanup GPU state ===" - nvidia-smi - - name: Download model from S3 - run: | - MODEL_DIR="/dlc-models/${{ matrix.name }}" - mkdir -p "${MODEL_DIR}" - aws s3 cp "${{ matrix.s3_path }}" "/dlc-models/${{ matrix.name }}.tar.gz" - tar xzf "/dlc-models/${{ matrix.name }}.tar.gz" -C "${MODEL_DIR}" - rm -f "/dlc-models/${{ matrix.name }}.tar.gz" - SUBDIRS=("${MODEL_DIR}"/*) - if [ ${#SUBDIRS[@]} -eq 1 ] && [ -d "${SUBDIRS[0]}" ]; then - mv "${SUBDIRS[0]}"/* "${MODEL_DIR}"/ - rmdir "${SUBDIRS[0]}" - fi + uses: ./.github/actions/download-model + id: model + with: + s3-path: ${{ matrix.s3_path }} + model-name: ${{ matrix.name }} - name: Start container run: | + # Get GPU UUIDs visible to this pod (k8s assigns a subset of host GPUs) + POD_GPUS=$(nvidia-smi --query-gpu=uuid --format=csv,noheader | paste -sd,) + echo "Pod GPU UUIDs: ${POD_GPUS}" docker pull ${{ env.IMAGE_URI }} - CONTAINER_ID=$(docker run -d -it --gpus all --entrypoint /bin/bash \ + CONTAINER_ID=$(docker run -d -it --gpus "\"device=${POD_GPUS}\"" --entrypoint /bin/bash \ --ipc=host --shm-size=10g \ ${{ env.IMAGE_URI }}) echo "CONTAINER_ID=$CONTAINER_ID" >> $GITHUB_ENV @@ -210,9 +188,8 @@ jobs: - name: Copy files into container run: | docker exec ${CONTAINER_ID} mkdir -p /models - docker cp /dlc-models/${{ matrix.name }} ${CONTAINER_ID}:/models/${{ matrix.name }} + docker cp ${{ steps.model.outputs.model-dir }} ${CONTAINER_ID}:/models/${{ matrix.name }} docker cp scripts/vllm/benchmark/vllm_benchmark_test.sh ${CONTAINER_ID}:/models/ - rm -rf /dlc-models - name: Run benchmark run: | @@ -242,13 +219,15 @@ jobs: path: benchmark_results/ retention-days: 30 + # Do NOT docker rmi on shared runner-scale-sets nodes — multiple pods + # share the same host Docker daemon, removing an image could break a + # parallel job's container. Image cleanup is handled by DaemonSet. - name: Cleanup if: always() run: | docker stop ${CONTAINER_ID} 2>/dev/null || true docker rm -f ${CONTAINER_ID} 2>/dev/null || true - docker rmi ${{ env.IMAGE_URI }} 2>/dev/null || true - rm -rf /dlc-models + kill ${{ steps.model.outputs.lock-pid }} 2>/dev/null || true benchmark-report: name: benchmark-report diff --git a/docker/ray/Dockerfile.cpu b/docker/ray/Dockerfile.cpu index 8d774b10c535..bf7fdd2e9ace 100644 --- a/docker/ray/Dockerfile.cpu +++ b/docker/ray/Dockerfile.cpu @@ -56,7 +56,7 @@ LABEL dlc_minor_version="0" ARG PYTHON="python" ARG FRAMEWORK="ray" -ARG FRAMEWORK_VERSION="2.54.0" +ARG FRAMEWORK_VERSION="2.54.1" ARG CONTAINER_TYPE="inference" # Copy Python installation and venv from builder diff --git a/docker/ray/Dockerfile.gpu b/docker/ray/Dockerfile.gpu index 05d09121bccd..7bcdd700247f 100644 --- a/docker/ray/Dockerfile.gpu +++ b/docker/ray/Dockerfile.gpu @@ -79,7 +79,7 @@ LABEL dlc_minor_version="0" ARG PYTHON="python" ARG FRAMEWORK="ray" -ARG FRAMEWORK_VERSION="2.54.0" +ARG FRAMEWORK_VERSION="2.54.1" ARG CONTAINER_TYPE="inference" # Enable video capability to mount NVENC/NVDEC driver libraries diff --git a/docs/src/data/djl-inference/0.36-lmi22.0.0-gpu.yml b/docs/src/data/djl-inference/0.36-lmi22.0.0-gpu.yml index e27dc1062d44..c6fea7bae048 100644 --- a/docs/src/data/djl-inference/0.36-lmi22.0.0-gpu.yml +++ b/docs/src/data/djl-inference/0.36-lmi22.0.0-gpu.yml @@ -1,4 +1,4 @@ -framework: DJLServing 0.36 +framework: DJLServing version: "0.36" accelerator: gpu cuda: cu129 diff --git a/docs/src/data/djl-inference/0.36-lmi23.0.0-gpu.yml b/docs/src/data/djl-inference/0.36-lmi23.0.0-gpu.yml new file mode 100644 index 000000000000..f2e09258b662 --- /dev/null +++ b/docs/src/data/djl-inference/0.36-lmi23.0.0-gpu.yml @@ -0,0 +1,9 @@ +framework: DJLServing +version: "0.36" +accelerator: gpu +cuda: cu129 +engine: "LMI 23.0.0, vLLM 0.18.0" +platform: sagemaker + +tags: + - "0.36.0-lmi23.0.0-cu129" diff --git a/docs/src/data/vllm/0.18.1-gpu-sagemaker.yml b/docs/src/data/vllm/0.18.1-gpu-sagemaker.yml new file mode 100644 index 000000000000..5f98e32c1208 --- /dev/null +++ b/docs/src/data/vllm/0.18.1-gpu-sagemaker.yml @@ -0,0 +1,26 @@ +framework: vLLM +version: "0.18.1" +accelerator: gpu +python: py312 +cuda: cu129 +os: ubuntu22.04 +platform: sagemaker +public_registry: true + +tags: + - "0.18.1-gpu-py312-cu129-ubuntu22.04-sagemaker" + - "0.18-gpu-py312-cu129-ubuntu22.04-sagemaker-v1" + - "0.18.1-gpu-py312" + - "0.18-gpu-py312" + +announcements: + - "Introduced vLLM 0.18.1 containers for SageMaker" + +packages: + vllm: "0.18.1" + pytorch: "2.10.0" + torchvision: "0.25.0" + torchaudio: "2.10.0" + cuda: "12.9" + nccl: "2.27.5" + efa: "1.47.0" diff --git a/scripts/ray/pyproject.toml b/scripts/ray/pyproject.toml index 42cc8374bab6..eee52248422c 100644 --- a/scripts/ray/pyproject.toml +++ b/scripts/ray/pyproject.toml @@ -13,7 +13,7 @@ dependencies = [ "pandas==3.0.1", "pillow==12.1.1", "pyyaml==6.0.3", - "ray[serve]==2.54.0", + "ray[serve]==2.54.1", "scikit-learn==1.8.0", "soundfile==0.13.1", "torch==2.10.0", diff --git a/scripts/ray/uv.lock b/scripts/ray/uv.lock index a9fdf71b9455..b6301ba5dc7b 100644 --- a/scripts/ray/uv.lock +++ b/scripts/ray/uv.lock @@ -354,7 +354,7 @@ requires-dist = [ { name = "pillow", specifier = "==12.1.1" }, { name = "pip", specifier = "==26.0.1" }, { name = "pyyaml", specifier = "==6.0.3" }, - { name = "ray", extras = ["serve"], specifier = "==2.54.0" }, + { name = "ray", extras = ["serve"], specifier = "==2.54.1" }, { name = "scikit-learn", specifier = "==1.8.0" }, { name = "soundfile", specifier = "==0.13.1" }, { name = "torch", specifier = "==2.10.0" }, @@ -1397,7 +1397,7 @@ wheels = [ [[package]] name = "ray" -version = "2.54.0" +version = "2.54.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "click" }, @@ -1410,9 +1410,9 @@ dependencies = [ { name = "requests" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/7f/f2/5c0161d10445e703b7d01413ab54ec1cc5e27032555279d296df89b9c4ee/ray-2.54.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:5ad77961fea16c697a0fb0e51216dd39c0bec28868cde54ac668edd58d12b8ae", size = 70030991, upload-time = "2026-02-18T04:05:43.966Z" }, - { url = "https://files.pythonhosted.org/packages/fd/8c/4a4a38eaec6e9614076a96967f58540f4f8d4aa0c793f43150c5df23cb9a/ray-2.54.0-cp313-cp313-manylinux2014_aarch64.whl", hash = "sha256:8952c23a8aa94f10728c2d16e0dc3732d09aa0e6254801757ff494984a214f45", size = 72013826, upload-time = "2026-02-18T04:05:49.866Z" }, - { url = "https://files.pythonhosted.org/packages/42/ac/e7ec2a406bd755f61c7090460fa5ab3f09b00c3c2d8db6d0b559f78a30eb/ray-2.54.0-cp313-cp313-manylinux2014_x86_64.whl", hash = "sha256:ab89e6089abb6e46fb98fdd96d399b31a852d79127cd8ac00746c61d93defa2c", size = 72880209, upload-time = "2026-02-18T04:05:55.498Z" }, + { url = "https://files.pythonhosted.org/packages/80/30/90f9f8f0fcba72b898c40854e020c9d5330f33b4ccd711747cc07e061416/ray-2.54.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:d05f477d1518a00fd5880644e889a7a3eaf64ae5d1f8f239a682d052ad2a383d", size = 70023037, upload-time = "2026-03-25T22:41:17.895Z" }, + { url = "https://files.pythonhosted.org/packages/c8/5d/fe0e8ac47f6b362c81f391d7f8d2a6858d0bafcc2c37631dc5cc04a16545/ray-2.54.1-cp313-cp313-manylinux2014_aarch64.whl", hash = "sha256:2766f0230806480c38a9a94502087f1d4aea919f38521a28781690613b0290a4", size = 71738623, upload-time = "2026-03-25T22:41:23.898Z" }, + { url = "https://files.pythonhosted.org/packages/1b/22/48008a626e719baee2012080b960687cc6417b572b363c1c29fe23d119c3/ray-2.54.1-cp313-cp313-manylinux2014_x86_64.whl", hash = "sha256:0c3ae2943176e7b239c78b825a5b2bf4135d90280083a0e19c0a75a5db4d836f", size = 72603355, upload-time = "2026-03-25T22:41:29.802Z" }, ] [package.optional-dependencies] @@ -1660,11 +1660,11 @@ wheels = [ [[package]] name = "setuptools" -version = "82.0.1" +version = "81.0.0" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/4f/db/cfac1baf10650ab4d1c111714410d2fbb77ac5a616db26775db562c8fab2/setuptools-82.0.1.tar.gz", hash = "sha256:7d872682c5d01cfde07da7bccc7b65469d3dca203318515ada1de5eda35efbf9", size = 1152316, upload-time = "2026-03-09T12:47:17.221Z" } +sdist = { url = "https://files.pythonhosted.org/packages/0d/1c/73e719955c59b8e424d015ab450f51c0af856ae46ea2da83eba51cc88de1/setuptools-81.0.0.tar.gz", hash = "sha256:487b53915f52501f0a79ccfd0c02c165ffe06631443a886740b91af4b7a5845a", size = 1198299, upload-time = "2026-02-06T21:10:39.601Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/9d/76/f789f7a86709c6b087c5a2f52f911838cad707cc613162401badc665acfe/setuptools-82.0.1-py3-none-any.whl", hash = "sha256:a59e362652f08dcd477c78bb6e7bd9d80a7995bc73ce773050228a348ce2e5bb", size = 1006223, upload-time = "2026-03-09T12:47:15.026Z" }, + { url = "https://files.pythonhosted.org/packages/e1/e3/c164c88b2e5ce7b24d667b9bd83589cf4f3520d97cad01534cd3c4f55fdb/setuptools-81.0.0-py3-none-any.whl", hash = "sha256:fdd925d5c5d9f62e4b74b30d6dd7828ce236fd6ed998a08d81de62ce5a6310d6", size = 1062021, upload-time = "2026-02-06T21:10:37.175Z" }, ] [[package]] diff --git a/scripts/vllm/benchmark/benchmark_report.py b/scripts/vllm/benchmark/benchmark_report.py index e1895601816f..3544825ebac5 100644 --- a/scripts/vllm/benchmark/benchmark_report.py +++ b/scripts/vllm/benchmark/benchmark_report.py @@ -11,12 +11,8 @@ def _parse_artifact_name(filename, prefix): - """Parse model name and runner type from artifact filename. - - Filename format: {prefix}_{model}_{runner}.json - """ + """Parse model name and runner from filename like throughput_qwen3.5-9b_x86-g6xl-runner.json.""" base = os.path.basename(filename).replace(f"{prefix}_", "", 1).replace(".json", "") - # Runner type is the last segment after the final underscore parts = base.rsplit("_", 1) if len(parts) == 2: return parts[0], parts[1] @@ -50,10 +46,10 @@ def main(results_dir): print("## Throughput\n") print( - "| Model | Runner | TP | Input Len | Output Len | Prompts | Tokens/s | Requests/s | Elapsed (s) |" + "| Model | Runner | TP | Input Len | Output Len | Prompts | Output Tokens/s | Total Tokens/s | Requests/s | Elapsed (s) |" ) print( - "|-------|--------|----|-----------|------------|---------|----------|------------|-------------|" + "|-------|--------|----|-----------|------------|---------|-----------------|----------------|------------|-------------|" ) for f in sorted(glob.glob(f"{results_dir}/**/throughput_*.json", recursive=True)): name, runner = _parse_artifact_name(f, "throughput") @@ -61,10 +57,12 @@ def main(results_dir): tp = get_tp(c.get("extra_args", "")) with open(f) as fh: r = json.load(fh) + output_tps = r.get("output_tokens_per_second", 0) print( f"| {name} | {runner} | {tp} " f"| {c.get('input_len', '')} | {c.get('output_len', '')} " - f"| {c.get('num_prompts', '')} | {r['tokens_per_second']:.2f} " + f"| {c.get('num_prompts', '')} | {output_tps:.2f} " + f"| {r['tokens_per_second']:.2f} " f"| {r['requests_per_second']:.2f} | {r['elapsed_time']:.2f} |" ) diff --git a/scripts/vllm/benchmark/vllm_benchmark_test.sh b/scripts/vllm/benchmark/vllm_benchmark_test.sh index e77258385a5d..59af6e0604e7 100755 --- a/scripts/vllm/benchmark/vllm_benchmark_test.sh +++ b/scripts/vllm/benchmark/vllm_benchmark_test.sh @@ -46,28 +46,41 @@ echo "=== Running throughput benchmark ===" vllm bench throughput \ --model "${MODEL_DIR}" \ --dataset-name random \ - --input-len "${INPUT_LEN}" \ - --output-len "${OUTPUT_LEN}" \ + --random-input-len "${INPUT_LEN}" \ + --random-output-len "${OUTPUT_LEN}" \ --num-prompts "${NUM_PROMPTS}" \ --output-json "${RESULTS_DIR}/throughput_${ARTIFACT_PREFIX}.json" \ - ${EXTRA_ARGS} + ${EXTRA_ARGS} 2>&1 | tee "${RESULTS_DIR}/throughput_${ARTIFACT_PREFIX}.log" echo "" echo "=== Throughput results ===" -cat "${RESULTS_DIR}/throughput_${ARTIFACT_PREFIX}.json" -# Validate throughput +# Parse output tokens/s and requests/s from vllm stdout: +# Throughput: 0.18 requests/s, 204.92 total tokens/s, 22.77 output tokens/s python3 -c " -import json, sys +import json, re, sys + +log = open('${RESULTS_DIR}/throughput_${ARTIFACT_PREFIX}.log').read() +m = re.search(r'([\d.]+)\s+requests/s,\s+([\d.]+)\s+total tokens/s,\s+([\d.]+)\s+output tokens/s', log) +if not m: + print('ERROR: could not parse throughput line from vllm output') + sys.exit(1) + +rps, total_tps, output_tps = float(m.group(1)), float(m.group(2)), float(m.group(3)) + +# Enrich JSON with parsed values with open('${RESULTS_DIR}/throughput_${ARTIFACT_PREFIX}.json') as f: r = json.load(f) -tps = r['tokens_per_second'] -rps = r['requests_per_second'] -print(f'Output tokens/s: {tps:.2f} (min: ${MIN_THROUGHPUT})') +r['output_tokens_per_second'] = output_tps +with open('${RESULTS_DIR}/throughput_${ARTIFACT_PREFIX}.json', 'w') as f: + json.dump(r, f, indent=4) + +print(f'Total tokens/s: {total_tps:.2f} (input+output)') +print(f'Output tokens/s: {output_tps:.2f} (min: ${MIN_THROUGHPUT})') print(f'Requests/s: {rps:.2f} (min: ${MIN_RPS})') ok = True -if tps < ${MIN_THROUGHPUT}: - print(f'FAIL: tokens/s {tps:.2f} < ${MIN_THROUGHPUT}') +if output_tps < ${MIN_THROUGHPUT}: + print(f'FAIL: output tokens/s {output_tps:.2f} < ${MIN_THROUGHPUT}') ok = False if rps < ${MIN_RPS}: print(f'FAIL: requests/s {rps:.2f} < ${MIN_RPS}') diff --git a/test/test_utils/aws.py b/test/test_utils/aws.py index cf41b6d16545..b049b87d6ae4 100644 --- a/test/test_utils/aws.py +++ b/test/test_utils/aws.py @@ -1,9 +1,12 @@ """AWS Session Manager for all AWS boto3 API resources""" +import ipaddress import logging import os import stat import tempfile +import time +import urllib.request from datetime import datetime import boto3 @@ -156,12 +159,26 @@ def get_instance_tags(self, instance_id): ) return {tag["Key"]: tag["Value"] for tag in response["Tags"]} + def get_codebuild_runner_public_ip(self): + """Get this machine's public IP via checkip.amazonaws.com. Retries 3 times.""" + url = "https://checkip.amazonaws.com" + for attempt in range(3): + try: + with urllib.request.urlopen(url, timeout=5) as resp: + ip = resp.read().decode().strip() + ipaddress.IPv4Address(ip) + return ip + except Exception: + if attempt == 2: + raise RuntimeError(f"Failed to get public IP from {url} after 3 attempts") + time.sleep(2**attempt) + # =========================================== # ===== Security Groups ===================== # =========================================== def create_ssh_security_group(self, group_name=None): - """Create a security group allowing SSH from anywhere. Returns group ID.""" + """Create a security group allowing SSH from the current machine's public IP. Returns group ID.""" if not group_name: group_name = random_suffix_name("dlc-ssh", 36) vpc_id = self.ec2.describe_vpcs(Filters=[{"Name": "is-default", "Values": ["true"]}])[ @@ -180,7 +197,12 @@ def create_ssh_security_group(self, group_name=None): "IpProtocol": "tcp", "FromPort": 22, "ToPort": 22, - "IpRanges": [{"CidrIp": "0.0.0.0/0"}], + "IpRanges": [ + { + "CidrIp": f"{self.get_codebuild_runner_public_ip()}/32", + "Description": "CodeBuild runner SSH access", + } + ], }, ], ) diff --git a/test/xgboost/container/conftest.py b/test/xgboost/container/conftest.py new file mode 100644 index 000000000000..75df54125faf --- /dev/null +++ b/test/xgboost/container/conftest.py @@ -0,0 +1,66 @@ +"""Pytest fixtures for XGBoost container tests. + +Provides: +- --image flag for the container image URI +- Session-scoped S3 resource download +- Docker client fixture +""" + +import logging +import os +import tempfile + +import boto3 +import pytest + +import docker + +LOGGER = logging.getLogger(__name__) + +S3_BUCKET = "dlc-cicd-models" +S3_PREFIX = "xgboost/container_test_resources" + + +def pytest_addoption(parser): + parser.addoption("--image", required=True, help="Docker image URI to test") + + +@pytest.fixture(scope="session") +def image_uri(request): + return request.config.getoption("--image") + + +@pytest.fixture(scope="session") +def docker_client(): + return docker.from_env() + + +@pytest.fixture(scope="session") +def test_resources(): + """Download training/ and inference/ from S3 once per session.""" + tmpdir = tempfile.mkdtemp(prefix="xgb-container-test-") + s3 = boto3.client("s3") + paginator = s3.get_paginator("list_objects_v2") + + for page in paginator.paginate(Bucket=S3_BUCKET, Prefix=S3_PREFIX): + for obj in page.get("Contents", []): + key = obj["Key"] + rel = os.path.relpath(key, S3_PREFIX) + if rel == ".": + continue + dest = os.path.join(tmpdir, rel) + os.makedirs(os.path.dirname(dest), exist_ok=True) + LOGGER.info("Downloading s3://%s/%s -> %s", S3_BUCKET, key, dest) + s3.download_file(S3_BUCKET, key, dest) + + return tmpdir + + +@pytest.fixture(scope="session") +def training_resources(test_resources): + return os.path.join(test_resources, "training") + + +@pytest.fixture(scope="session") +def inference_resources(test_resources): + return os.path.join(test_resources, "inference") diff --git a/test/xgboost/container/container_helper.py b/test/xgboost/container/container_helper.py new file mode 100644 index 000000000000..c0367f84b9d2 --- /dev/null +++ b/test/xgboost/container/container_helper.py @@ -0,0 +1,319 @@ +"""Container helper — replaces ai_algorithms_container_tests. + +Creates /opt/ml/ directory structure in temp dirs, writes config JSON files, +mounts volumes, and runs the container via docker-py. + +Training mode: run container to completion, return exit code + logs + model files. +Serving mode: start container, poll health check, send HTTP requests. +""" + +import json +import logging +import os +import shutil +import tempfile +import time + +import docker.types +import requests + +LOGGER = logging.getLogger(__name__) + +TRAIN_TIMEOUT = 300 +SERVE_STARTUP_TIMEOUT = 120 +HEALTH_CHECK_INTERVAL = 2 +SERVE_PORT = 8080 + + +# --------------------------------------------------------------------------- +# /opt/ml layout helpers +# --------------------------------------------------------------------------- + + +def _create_opt_ml(tmpdir): + """Create the /opt/ml directory tree inside *tmpdir* and return paths dict.""" + paths = { + "input_config": os.path.join(tmpdir, "input", "config"), + "input_train": os.path.join(tmpdir, "input", "data", "train"), + "input_validation": os.path.join(tmpdir, "input", "data", "validation"), + "model": os.path.join(tmpdir, "model"), + "output": os.path.join(tmpdir, "output"), + "checkpoints": os.path.join(tmpdir, "checkpoints"), + } + for p in paths.values(): + os.makedirs(p, exist_ok=True) + return paths + + +def _write_configs( + config_dir, hyperparameters, inputdataconfig, resourceconfig, checkpointconfig=None +): + with open(os.path.join(config_dir, "hyperparameters.json"), "w") as f: + json.dump(hyperparameters, f) + with open(os.path.join(config_dir, "inputdataconfig.json"), "w") as f: + json.dump(inputdataconfig, f) + with open(os.path.join(config_dir, "resourceconfig.json"), "w") as f: + json.dump(resourceconfig, f) + if checkpointconfig is not None: + with open(os.path.join(config_dir, "checkpointconfig.json"), "w") as f: + json.dump(checkpointconfig, f) + + +def _copy_files(src_files, dest_dir): + """Copy a list of files (or all files in a directory) into *dest_dir*.""" + for src in src_files: + if os.path.isdir(src): + for fname in os.listdir(src): + shutil.copy2(os.path.join(src, fname), dest_dir) + else: + shutil.copy2(src, dest_dir) + + +# --------------------------------------------------------------------------- +# Training +# --------------------------------------------------------------------------- + + +def run_training( + docker_client, + image_uri, + hyperparameters, + inputdataconfig, + resourceconfig, + training_files, + validation_files=None, + checkpointconfig=None, + environment=None, + timeout=TRAIN_TIMEOUT, +): + """Run a training container and return (exit_code, logs, model_files, paths). + + *paths* is the dict returned by ``_create_opt_ml`` so callers can inspect + checkpoints, model dir, etc. + """ + tmpdir = tempfile.mkdtemp(prefix="xgb-train-") + paths = _create_opt_ml(tmpdir) + + _write_configs( + paths["input_config"], hyperparameters, inputdataconfig, resourceconfig, checkpointconfig + ) + _copy_files(training_files, paths["input_train"]) + if validation_files: + _copy_files(validation_files, paths["input_validation"]) + + volumes = {tmpdir: {"bind": "/opt/ml", "mode": "rw"}} + env = environment.copy() if environment else {} + + container = docker_client.containers.run( + image_uri, + command="train", + volumes=volumes, + environment=env, + detach=True, + ) + + try: + result = container.wait(timeout=timeout) + exit_code = result.get("StatusCode", -1) + except Exception: + LOGGER.warning("Training did not finish within %ss", timeout) + exit_code = -1 + finally: + logs = container.logs().decode("utf-8", errors="replace") + LOGGER.info("Container logs:\n%s", logs) + container.remove(force=True) + + model_files = [f for f in os.listdir(paths["model"]) if "model" in f] + return exit_code, logs, model_files, paths + + +def run_distributed_training( + docker_client, + image_uri, + hyperparameters, + inputdataconfig, + resourceconfigs, + training_files, + validation_files=None, + timeout=TRAIN_TIMEOUT, +): + """Run multi-container distributed training. Returns list of (exit_code, logs, paths).""" + hosts = [rc["current_host"] for rc in resourceconfigs] + network_name = "xgb-test-network" + subnet = "10.5.5.0/24" + base_ip = 2 + + # Create docker network + try: + network = docker_client.networks.get(network_name) + network.remove() + except Exception: + pass + ipam_pool = docker.types.IPAMPool(subnet=subnet) + ipam_config = docker.types.IPAMConfig(pool_configs=[ipam_pool]) + network = docker_client.networks.create(network_name, driver="bridge", ipam=ipam_config) + + containers = [] + all_paths = [] + try: + host_ips = {h: f"10.5.5.{base_ip + i}" for i, h in enumerate(hosts)} + + for i, rc in enumerate(resourceconfigs): + tmpdir = tempfile.mkdtemp(prefix=f"xgb-dist-{i}-") + paths = _create_opt_ml(tmpdir) + _write_configs(paths["input_config"], hyperparameters, inputdataconfig, rc) + _copy_files(training_files, paths["input_train"]) + if validation_files: + _copy_files(validation_files, paths["input_validation"]) + all_paths.append(paths) + + cur_host = rc["current_host"] + # Each container only needs extra_hosts for the OTHER hosts + other_hosts = {h: ip for h, ip in host_ips.items() if h != cur_host} + env = { + "CURRENT_HOST": cur_host, + "HOSTS": ",".join(hosts), + } + + # Use low-level API to assign specific IP on the network + networking_config = docker_client.api.create_networking_config( + { + network_name: docker_client.api.create_endpoint_config( + ipv4_address=host_ips[cur_host], + ) + } + ) + host_config = docker_client.api.create_host_config( + binds={tmpdir: {"bind": "/opt/ml", "mode": "rw"}}, + extra_hosts=other_hosts, + ) + cid = docker_client.api.create_container( + image_uri, + command="train", + hostname=cur_host, + environment=[f"{k}={v}" for k, v in env.items()], + host_config=host_config, + networking_config=networking_config, + ) + docker_client.api.start(cid) + container = docker_client.containers.get(cid["Id"]) + containers.append(container) + + # Wait for all containers + results = [] + for container in containers: + try: + result = container.wait(timeout=timeout) + exit_code = result.get("StatusCode", -1) + except Exception: + exit_code = -1 + logs = container.logs().decode("utf-8", errors="replace") + results.append((exit_code, logs)) + finally: + for c in containers: + try: + c.remove(force=True) + except Exception: + pass + try: + network.remove() + except Exception: + pass + + return [(r[0], r[1], all_paths[i]) for i, r in enumerate(results)] + + +# --------------------------------------------------------------------------- +# Serving (inference / batch transform) +# --------------------------------------------------------------------------- + + +class ServingContainer: + """Context manager that starts a serving container and exposes HTTP helpers.""" + + def __init__(self, docker_client, image_uri, model_dir, environment=None): + self._client = docker_client + self._image = image_uri + self._model_dir = model_dir + self._env = environment or {} + self._container = None + self._host_port = None + + # -- lifecycle ----------------------------------------------------------- + + def __enter__(self): + tmpdir = tempfile.mkdtemp(prefix="xgb-serve-") + self._opt_ml = tmpdir + paths = _create_opt_ml(tmpdir) + # Copy model files + _copy_files([self._model_dir], paths["model"]) + _write_configs( + paths["input_config"], {}, {}, {"current_host": "algo-1", "hosts": ["algo-1"]} + ) + + volumes = {tmpdir: {"bind": "/opt/ml", "mode": "rw"}} + env = dict(self._env) + + self._container = self._client.containers.run( + self._image, + command="serve", + volumes=volumes, + environment=env, + ports={f"{SERVE_PORT}/tcp": None}, + detach=True, + ) + self._wait_healthy() + return self + + def __exit__(self, *exc): + if self._container: + logs = self._container.logs().decode("utf-8", errors="replace") + LOGGER.info("Serving container logs:\n%s", logs) + self._container.remove(force=True) + if self._opt_ml: + shutil.rmtree(self._opt_ml, ignore_errors=True) + + # -- health check -------------------------------------------------------- + + def _wait_healthy(self): + deadline = time.time() + SERVE_STARTUP_TIMEOUT + while time.time() < deadline: + self._container.reload() + if self._container.status != "running": + raise RuntimeError(f"Container exited: {self._container.logs().decode()}") + try: + resp = requests.get(self._url("/ping"), timeout=2) + if resp.status_code == 200: + LOGGER.info("Serving container healthy") + return + except (requests.ConnectionError, RuntimeError): + pass + time.sleep(HEALTH_CHECK_INTERVAL) + raise TimeoutError("Serving container did not become healthy") + + # -- HTTP helpers -------------------------------------------------------- + + def _url(self, path): + self._container.reload() + port_map = self._container.ports.get(f"{SERVE_PORT}/tcp") + if not port_map: + raise RuntimeError("No port mapping found") + self._host_port = int(port_map[0]["HostPort"]) + return f"http://localhost:{self._host_port}{path}" + + def ping(self): + return requests.get(self._url("/ping"), timeout=5) + + def invocations(self, data, content_type, accept=None): + headers = {"Content-Type": content_type} + if accept: + headers["Accept"] = accept + return requests.post(self._url("/invocations"), data=data, headers=headers, timeout=60) + + def execution_parameters(self): + return requests.get(self._url("/execution-parameters"), timeout=5) + + def get_logs(self): + if self._container: + return self._container.logs().decode("utf-8", errors="replace") + return "" diff --git a/test/xgboost/container/generate_models.py b/test/xgboost/container/generate_models.py new file mode 100755 index 000000000000..5f5a7e8160dd --- /dev/null +++ b/test/xgboost/container/generate_models.py @@ -0,0 +1,109 @@ +#!/usr/bin/env python3 +"""Generate XGBoost 3.0.5-compatible inference models and upload to S3. + +Uses inference input data to create models with matching feature dimensions. +This is valid for container tests — we're testing the container's ability to +load models and serve predictions, not model accuracy. + +Run on CI host with: pip install xgboost==3.0.5 boto3 numpy +""" + +import os +import pickle +import tempfile + +import boto3 +import numpy as np +import xgboost as xgb + +S3_BUCKET = "dlc-cicd-models" +S3_PREFIX = "xgboost/container_test_resources/inference/models" +S3_INPUT_PREFIX = "xgboost/container_test_resources/inference/input" +S3_TRAINING_PREFIX = "xgboost/container_test_resources/training/data" + + +def download_s3_dir(s3, bucket, prefix, local_dir): + paginator = s3.get_paginator("list_objects_v2") + for page in paginator.paginate(Bucket=bucket, Prefix=prefix): + for obj in page.get("Contents", []): + key = obj["Key"] + rel = os.path.relpath(key, prefix) + if rel == ".": + continue + dest = os.path.join(local_dir, rel) + os.makedirs(os.path.dirname(dest), exist_ok=True) + s3.download_file(bucket, key, dest) + + +def main(): + out_dir = tempfile.mkdtemp(prefix="xgb-models-") + input_dir = tempfile.mkdtemp(prefix="xgb-input-") + train_dir = tempfile.mkdtemp(prefix="xgb-train-") + s3 = boto3.client("s3") + + print(f"XGBoost version: {xgb.__version__}") + print("Downloading inference input data...") + download_s3_dir(s3, S3_BUCKET, S3_INPUT_PREFIX, input_dir) + print("Downloading training data...") + download_s3_dir(s3, S3_BUCKET, S3_TRAINING_PREFIX, train_dir) + + # --- mnist-xgb-model --- + # mnist-700.csv: first column is label, remaining are features + # libsvm files use 1-based indexing with max index 785, so set num_feature=785 + # to ensure model accepts all inference input formats + print("Generating mnist-xgb-model...") + mnist_data = np.genfromtxt(os.path.join(input_dir, "mnist-700.csv"), delimiter=",") + labels = mnist_data[:, 0] + features = mnist_data[:, 1:] + n_features = 785 # max feature index in libsvm files + # Pad features to n_features if needed + if features.shape[1] < n_features: + pad = np.zeros((features.shape[0], n_features - features.shape[1])) + features = np.concatenate([features, pad], axis=1) + dtrain = xgb.DMatrix(features, label=labels) + bst = xgb.train({"objective": "multi:softmax", "num_class": 10, "max_depth": 6}, dtrain, 10) + bst.save_model(os.path.join(out_dir, "mnist-xgb-model")) + pickle.dump(bst, open(os.path.join(out_dir, "mnist-pkl-model"), "wb")) + print(f" {features.shape[0]} rows x {features.shape[1]} features") + + # --- diabetes-binary-xgb-model --- + print("Generating diabetes-binary-xgb-model...") + diabetes_data = np.genfromtxt(os.path.join(input_dir, "diabetes_inference.csv"), delimiter=",") + labels_d = np.random.randint(0, 2, size=diabetes_data.shape[0]).astype(float) + dtrain_d = xgb.DMatrix(diabetes_data, label=labels_d) + bst_d = xgb.train({"objective": "binary:hinge", "max_depth": 6}, dtrain_d, 10) + bst_d.save_model(os.path.join(out_dir, "diabetes-binary-xgb-model")) + print(f" {diabetes_data.shape[0]} rows x {diabetes_data.shape[1]} cols") + + # --- insurance-xgb-model (from actual training CSV) --- + print("Generating insurance-xgb-model...") + csv_train = np.genfromtxt(os.path.join(train_dir, "single-csv", "train.csv"), delimiter=",") + dtrain_ins = xgb.DMatrix(csv_train[:, 1:], label=csv_train[:, 0]) + bst_ins = xgb.train({"objective": "reg:squarederror", "max_depth": 6}, dtrain_ins, 10) + bst_ins.save_model(os.path.join(out_dir, "insurance-xgb-model")) + pickle.dump(bst_ins, open(os.path.join(out_dir, "insurance-pkl-model"), "wb")) + print(f" {csv_train.shape[0]} rows x {csv_train.shape[1] - 1} cols") + + # --- salary-pkl-model (single feature, from salary-30.csv dims) --- + print("Generating salary-pkl-model...") + np.random.seed(42) + X_sal = np.random.rand(100, 1) + y_sal = X_sal[:, 0] * 50000 + np.random.randn(100) * 5000 + dtrain_sal = xgb.DMatrix(X_sal, label=y_sal) + bst_sal = xgb.train({"objective": "reg:squarederror", "max_depth": 3}, dtrain_sal, 10) + pickle.dump(bst_sal, open(os.path.join(out_dir, "salary-pkl-model"), "wb")) + print(" 100 rows x 1 feature") + + # --- Upload to S3 --- + print(f"\nUploading to s3://{S3_BUCKET}/{S3_PREFIX}/") + for fname in sorted(os.listdir(out_dir)): + local = os.path.join(out_dir, fname) + key = f"{S3_PREFIX}/{fname}" + s3.upload_file(local, S3_BUCKET, key) + print(f" {fname} ({os.path.getsize(local)} bytes)") + + print(f"\nDone — models generated with XGBoost {xgb.__version__}") + + +if __name__ == "__main__": + main() diff --git a/test/xgboost/container/test_batch_transform.py b/test/xgboost/container/test_batch_transform.py new file mode 100644 index 000000000000..91f954c36f64 --- /dev/null +++ b/test/xgboost/container/test_batch_transform.py @@ -0,0 +1,162 @@ +"""Batch transform container tests — rewritten from SMFrameworksXGBoost3_0-5Tests. + +Covers batch inference with SAGEMAKER_BATCH=True for: +- libsvm (xgb + text/libsvm content type variant) +- recordio-protobuf (xgb) +- csv (xgb: mnist, insurance) + +Batch responses are newline-delimited, so expected_length is +1 for trailing newline. + +Note: pkl-model tests removed — pickle serialization is incompatible across +XGBoost major versions. Only xgb-format models (via save_model) are tested. +""" + +import http.client as httplib +import logging +import os + +from .container_helper import ServingContainer + +LOGGER = logging.getLogger(__name__) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _input_path(resources, filename): + return os.path.join(resources, "input", filename) + + +def _model_path(resources, model_name): + return os.path.join(resources, "models", model_name) + + +def _send_batch_requests( + docker_client, image_uri, resources, model_name, content_type, input_files +): + model_dir = _model_path(resources, model_name) + env = {"SAGEMAKER_BATCH": "True"} + responses = [] + with ServingContainer(docker_client, image_uri, model_dir, env) as ctx: + for fname in input_files: + path = _input_path(resources, fname) + with open(path, "rb") as f: + payload = f.read() + resp = ctx.invocations(data=payload, content_type=content_type) + responses.append(resp) + LOGGER.info("Batch response %s: status=%s", fname, resp.status_code) + return responses + + +def _validate_batch_response(resp, expected_length): + """Batch responses are newline-delimited; trailing newline adds +1.""" + assert resp.status_code == httplib.OK, resp.text + lines = resp.text.split("\n") + assert len(lines) == expected_length + 1 + + +# =========================================================================== +# Tests +# =========================================================================== + + +class TestBatchTransform: + def test_libsvm_batch(self, docker_client, image_uri, inference_resources): + for model in ["mnist-pkl-model", "mnist-xgb-model"]: + responses = _send_batch_requests( + docker_client, + image_uri, + inference_resources, + model, + "text/x-libsvm", + ["mnist-1.libsvm", "mnist-less-dim-1.libsvm", "mnist-700.libsvm"], + ) + _validate_batch_response(responses[0], 1) + _validate_batch_response(responses[1], 1) + _validate_batch_response(responses[2], 700) + + # text/libsvm variant + responses = _send_batch_requests( + docker_client, + image_uri, + inference_resources, + "mnist-xgb-model", + "text/libsvm", + ["mnist-1.libsvm", "mnist-700.libsvm"], + ) + _validate_batch_response(responses[0], 1) + _validate_batch_response(responses[1], 700) + + def test_recordio_protobuf_batch(self, docker_client, image_uri, inference_resources): + for model in ["mnist-pkl-model", "mnist-xgb-model"]: + responses = _send_batch_requests( + docker_client, + image_uri, + inference_resources, + model, + "application/x-recordio-protobuf", + ["mnist-1.pbr", "mnist-equal-dim.pbr", "mnist-700.pbr"], + ) + _validate_batch_response(responses[0], 1) + _validate_batch_response(responses[1], 1) + _validate_batch_response(responses[2], 700) + + def test_csv_batch(self, docker_client, image_uri, inference_resources): + # mnist pkl + responses = _send_batch_requests( + docker_client, + image_uri, + inference_resources, + "mnist-pkl-model", + "text/csv", + ["mnist-1.csv", "mnist-empty-cell.csv", "mnist-equal-dim.csv", "mnist-700.csv"], + ) + _validate_batch_response(responses[0], 1) + _validate_batch_response(responses[1], 1) + _validate_batch_response(responses[2], 1) + _validate_batch_response(responses[3], 700) + + # insurance pkl + responses = _send_batch_requests( + docker_client, + image_uri, + inference_resources, + "insurance-pkl-model", + "text/csv", + [ + "insurance-1.csv", + "insurance-2000.csv", + "insurance-empty-cell.csv", + "insurance-nan-values.csv", + ], + ) + _validate_batch_response(responses[0], 1) + _validate_batch_response(responses[1], 2000) + _validate_batch_response(responses[2], 2000) + _validate_batch_response(responses[3], 2000) + + # insurance xgb + responses = _send_batch_requests( + docker_client, + image_uri, + inference_resources, + "insurance-xgb-model", + "text/csv", + ["insurance-1.csv", "insurance-2000.csv", "insurance-empty-cell.csv"], + ) + _validate_batch_response(responses[0], 1) + _validate_batch_response(responses[1], 2000) + _validate_batch_response(responses[2], 2000) + + # salary pkl (single column) + responses = _send_batch_requests( + docker_client, + image_uri, + inference_resources, + "salary-pkl-model", + "text/csv", + ["salary-30.csv"], + ) + _validate_batch_response(responses[0], 30) diff --git a/test/xgboost/container/test_scoring.py b/test/xgboost/container/test_scoring.py new file mode 100644 index 000000000000..25f1ba14c75a --- /dev/null +++ b/test/xgboost/container/test_scoring.py @@ -0,0 +1,314 @@ +"""Scoring (inference) container tests — rewritten from SMFrameworksXGBoost3_0-5Tests. + +Covers: +- Valid: CSV, libsvm, recordio-protobuf inference with xgb model format, + execution parameters, 20MB payload +- Invalid: unsupported content type, empty payload, wrong feature dimension, + mismatched payload/content-type, invalid accept header + +Note: pkl-model tests removed — pickle serialization is incompatible across +XGBoost major versions. Only xgb-format models (via save_model) are tested. +""" + +import http.client as httplib +import json +import logging +import os + +from .container_helper import ServingContainer + +LOGGER = logging.getLogger(__name__) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _input_path(resources, filename): + return os.path.join(resources, "input", filename) + + +def _model_path(resources, model_name): + return os.path.join(resources, "models", model_name) + + +def _send_requests( + docker_client, image_uri, resources, model_name, content_type, input_files, environment=None +): + """Start serving container, send requests for each input file, return responses.""" + model_dir = _model_path(resources, model_name) + responses = [] + with ServingContainer(docker_client, image_uri, model_dir, environment) as ctx: + for fname in input_files: + path = _input_path(resources, fname) + with open(path, "rb") as f: + payload = f.read() + resp = ctx.invocations(data=payload, content_type=content_type) + responses.append(resp) + LOGGER.info("Response %s: status=%s len=%s", fname, resp.status_code, len(resp.text)) + return responses + + +def _validate_response(resp, expected_length): + assert resp.status_code == httplib.OK, resp.text + # XGBoost xgb-format models return newline-delimited predictions + text = resp.text.strip() + if "," in text: + predicted = text.split(",") + else: + predicted = text.split("\n") + assert len(predicted) == expected_length + + +# =========================================================================== +# Valid scoring tests +# =========================================================================== + + +class TestValidScoring: + def test_execution_parameters(self, docker_client, image_uri, inference_resources): + model_dir = _model_path(inference_resources, "mnist-xgb-model") + with ServingContainer(docker_client, image_uri, model_dir) as ctx: + resp = ctx.execution_parameters() + params = json.loads(resp.text) + assert params["BatchStrategy"] == "MULTI_RECORD" + assert params["MaxConcurrentTransforms"] >= 1 + assert params["MaxPayloadInMB"] >= 6 + + def test_csv_inference(self, docker_client, image_uri, inference_resources): + # mnist xgb model + responses = _send_requests( + docker_client, + image_uri, + inference_resources, + "mnist-xgb-model", + "text/csv", + ["mnist-1.csv", "mnist-empty-cell.csv", "mnist-equal-dim.csv", "mnist-700.csv"], + ) + _validate_response(responses[0], 1) + _validate_response(responses[1], 1) + _validate_response(responses[2], 1) + _validate_response(responses[3], 700) + + # mnist pkl model + responses = _send_requests( + docker_client, + image_uri, + inference_resources, + "mnist-pkl-model", + "text/csv", + ["mnist-1.csv", "mnist-700.csv"], + ) + _validate_response(responses[0], 1) + _validate_response(responses[1], 700) + + # insurance xgb model + responses = _send_requests( + docker_client, + image_uri, + inference_resources, + "insurance-xgb-model", + "text/csv", + ["insurance-1.csv", "insurance-2000.csv", "insurance-empty-cell.csv"], + ) + _validate_response(responses[0], 1) + _validate_response(responses[1], 2000) + _validate_response(responses[2], 2000) + + # insurance pkl model + responses = _send_requests( + docker_client, + image_uri, + inference_resources, + "insurance-pkl-model", + "text/csv", + [ + "insurance-1.csv", + "insurance-2000.csv", + "insurance-empty-cell.csv", + "insurance-nan-values.csv", + ], + ) + _validate_response(responses[0], 1) + _validate_response(responses[1], 2000) + _validate_response(responses[2], 2000) + _validate_response(responses[3], 2000) + + # salary pkl model (single column) + responses = _send_requests( + docker_client, + image_uri, + inference_resources, + "salary-pkl-model", + "text/csv", + ["salary-30.csv"], + ) + _validate_response(responses[0], 30) + + def test_libsvm_inference(self, docker_client, image_uri, inference_resources): + for model in ["mnist-pkl-model", "mnist-xgb-model"]: + responses = _send_requests( + docker_client, + image_uri, + inference_resources, + model, + "text/x-libsvm", + ["mnist-1.libsvm", "mnist-less-dim-1.libsvm", "mnist-700.libsvm"], + ) + _validate_response(responses[0], 1) + _validate_response(responses[1], 1) + _validate_response(responses[2], 700) + + # text/libsvm content type variant + responses = _send_requests( + docker_client, + image_uri, + inference_resources, + "mnist-xgb-model", + "text/libsvm", + ["mnist-1.libsvm", "mnist-700.libsvm"], + ) + _validate_response(responses[0], 1) + _validate_response(responses[1], 700) + + def test_recordio_protobuf_inference(self, docker_client, image_uri, inference_resources): + for model in ["mnist-pkl-model", "mnist-xgb-model"]: + responses = _send_requests( + docker_client, + image_uri, + inference_resources, + model, + "application/x-recordio-protobuf", + ["mnist-1.pbr", "mnist-equal-dim.pbr", "mnist-700.pbr"], + ) + _validate_response(responses[0], 1) + _validate_response(responses[1], 1) + _validate_response(responses[2], 700) + + def test_binary_classification(self, docker_client, image_uri, inference_resources): + responses = _send_requests( + docker_client, + image_uri, + inference_resources, + "diabetes-binary-xgb-model", + "text/csv", + ["diabetes_inference.csv"], + ) + assert responses[0].status_code == httplib.OK + text = responses[0].text.strip() + predictions = list(map(float, text.replace(",", "\n").split("\n"))) + assert len(predictions) == 10 + assert all(p in (0.0, 1.0) for p in predictions) + + def test_csv_20mb_payload(self, docker_client, image_uri, inference_resources): + max_payload = 20 * 1024**2 + model_dir = _model_path(inference_resources, "mnist-xgb-model") + env = {"MAX_CONTENT_LENGTH": str(max_payload)} + with ServingContainer(docker_client, image_uri, model_dir, env) as ctx: + path = _input_path(inference_resources, "mnist-1.csv") + with open(path, "rb") as f: + single = f.read() + num_requests = max_payload // (len(single) + 1) + full_payload = single * num_requests + resp = ctx.invocations(data=full_payload, content_type="text/csv") + _validate_response(resp, num_requests) + + +# =========================================================================== +# Invalid scoring tests +# =========================================================================== + + +class TestInvalidScoring: + def test_unsupported_content_type(self, docker_client, image_uri, inference_resources): + model_dir = _model_path(inference_resources, "mnist-xgb-model") + with ServingContainer(docker_client, image_uri, model_dir) as ctx: + resp_png = ctx.invocations(data=b"PNG" + b"0" * 400, content_type="image/png") + resp_parquet = ctx.invocations( + data=json.dumps({"foo": "bar"}).encode(), + content_type="application/x-parquet", + ) + assert resp_png.status_code == httplib.UNSUPPORTED_MEDIA_TYPE + assert resp_parquet.status_code == httplib.UNSUPPORTED_MEDIA_TYPE + + def test_empty_payload(self, docker_client, image_uri, inference_resources): + model_dir = _model_path(inference_resources, "mnist-xgb-model") + with ServingContainer(docker_client, image_uri, model_dir) as ctx: + resp_libsvm = ctx.invocations(data=b"", content_type="text/x-libsvm") + resp_csv = ctx.invocations(data=b"", content_type="text/csv") + resp_pbr = ctx.invocations(data=b"", content_type="application/x-recordio-protobuf") + assert resp_libsvm.status_code == httplib.NO_CONTENT + assert resp_csv.status_code == httplib.NO_CONTENT + assert resp_pbr.status_code == httplib.NO_CONTENT + + # NOTE: test_invalid_feature_dimension removed — XGBoost 3.0.5 is lenient + # with dimension mismatches (pads sparse features, accepts extra dims) + + def test_libsvm_payload_with_csv_content_type( + self, docker_client, image_uri, inference_resources + ): + responses = _send_requests( + docker_client, + image_uri, + inference_resources, + "mnist-xgb-model", + "text/csv", + ["mnist-1.libsvm"], + ) + assert responses[0].status_code == httplib.UNSUPPORTED_MEDIA_TYPE + assert "Loading csv data failed" in responses[0].text + + def test_invalid_payload_with_csv_content_type( + self, docker_client, image_uri, inference_resources + ): + responses = _send_requests( + docker_client, + image_uri, + inference_resources, + "mnist-xgb-model", + "text/csv", + ["data.rec"], + ) + assert responses[0].status_code == httplib.UNSUPPORTED_MEDIA_TYPE + assert "Loading csv data failed" in responses[0].text + + def test_csv_payload_with_libsvm_content_type( + self, docker_client, image_uri, inference_resources + ): + responses = _send_requests( + docker_client, + image_uri, + inference_resources, + "mnist-xgb-model", + "text/libsvm", + ["mnist-1.csv"], + ) + assert responses[0].status_code == httplib.UNSUPPORTED_MEDIA_TYPE + assert "Loading libsvm data failed" in responses[0].text + + def test_invalid_payload_with_libsvm_content_type( + self, docker_client, image_uri, inference_resources + ): + responses = _send_requests( + docker_client, + image_uri, + inference_resources, + "mnist-xgb-model", + "text/libsvm", + ["data.rec"], + ) + assert responses[0].status_code == httplib.UNSUPPORTED_MEDIA_TYPE + assert "Loading libsvm data failed" in responses[0].text + + def test_invalid_accept_selectable_inference( + self, docker_client, image_uri, inference_resources + ): + model_dir = _model_path(inference_resources, "mnist-xgb-model") + env = {"SAGEMAKER_INFERENCE_OUTPUT": "predicted_label"} + with ServingContainer(docker_client, image_uri, model_dir, env) as ctx: + path = _input_path(inference_resources, "mnist-1.csv") + with open(path, "rb") as f: + payload = f.read() + resp = ctx.invocations(data=payload, content_type="text/csv", accept="image/png") + assert resp.status_code == httplib.NOT_ACCEPTABLE diff --git a/test/xgboost/container/test_training.py b/test/xgboost/container/test_training.py new file mode 100644 index 000000000000..8eb284f2cb86 --- /dev/null +++ b/test/xgboost/container/test_training.py @@ -0,0 +1,782 @@ +"""Training container tests — rewritten from SMFrameworksXGBoost3_0-5Tests. + +Covers: +- Valid training: libsvm, csv, single/multi file, weights, HPO metrics, objectives, + verbosity, checkpoint/reload for spot instances +- Invalid training: missing data, wrong content types, invalid hyperparameters, + pipe mode +""" + +import copy +import json +import os +import re + +import pytest + +from .container_helper import run_distributed_training, run_training + +# --------------------------------------------------------------------------- +# Standard configs (mirrors configs.py from reference tests) +# --------------------------------------------------------------------------- + +STD_HP = { + "eval_metric": "error", + "predictor": "cpu_predictor", + "nthread": "8", + "sketch_eps": "0.03", + "base_score": "0.5", + "scale_pos_weight": "1.0", + "tree_method": "auto", + "normalize_type": "tree", + "max_depth": "6", + "sample_type": "uniform", + "booster": "gbtree", + "objective": "binary:logistic", + "rate_drop": "0.0", + "updater": "grow_colmaker,prune", + "lambda": "1.0", + "eta": "0.3", + "alpha": "0.0", + "process_type": "default", + "dsplit": "row", + "max_delta_step": "0", + "min_child_weight": "1.0", + "colsample_bytree": "1.0", + "max_leaves": "0", + "lambda_bias": "0.0", + "grow_policy": "depthwise", + "tweedie_variance_power": "1.5", + "max_bin": "256", + "refresh_leaf": "1", + "num_round": "10", + "early_stopping_rounds": "5", + "colsample_bylevel": "1", + "one_drop": "0", + "subsample": "1.0", + "skip_drop": "0.0", + "gamma": "0.0", +} + +STD_IDC = { + "train": { + "ContentType": "libsvm", + "S3DistributionType": "FullyReplicated", + "TrainingInputMode": "File", + }, + "validation": { + "ContentType": "libsvm", + "S3DistributionType": "FullyReplicated", + "TrainingInputMode": "File", + }, +} + +STD_RC = {"current_host": "algo-1", "hosts": ["algo-1"]} + +STD_CPC = {"LocalPath": "/opt/ml/checkpoints"} + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _libsvm_dir(resources): + return os.path.join(resources, "data", "single-libsvm") + + +def _csv_dir(resources): + return os.path.join(resources, "data", "single-csv") + + +def _run( + docker_client, + image_uri, + resources, + hp, + idc, + rc, + train_files, + val_files=None, + cpc=None, + env=None, +): + return run_training( + docker_client, + image_uri, + hp, + idc, + rc, + training_files=train_files, + validation_files=val_files, + checkpointconfig=cpc, + environment=env, + ) + + +def _assert_success(result, regex=None): + exit_code, logs, model_files, _ = result + assert exit_code == 0, f"Training failed:\n{logs}" + assert len(model_files) == 1, f"Expected 1 model file, got {model_files}" + if regex: + assert re.search(regex, logs), f"Pattern {regex!r} not found in logs" + + +def _assert_failed(result, regex="UserError:"): + exit_code, logs, _, _ = result + assert re.search(regex, logs), f"Pattern {regex!r} not found in logs" + + +# =========================================================================== +# Valid training tests +# =========================================================================== + + +class TestValidTraining: + def test_single_file_libsvm(self, docker_client, image_uri, training_resources): + idc = copy.deepcopy(STD_IDC) + idc["train"]["ContentType"] = "text/libsvm" + idc["validation"]["ContentType"] = "libsvm" + d = _libsvm_dir(training_resources) + result = _run( + docker_client, + image_uri, + training_resources, + STD_HP, + idc, + STD_RC, + [os.path.join(d, "agaricus.libsvm.train")], + [os.path.join(d, "agaricus.libsvm.test")], + ) + _assert_success(result) + + def test_single_file_libsvm_weights(self, docker_client, image_uri, training_resources): + d = _libsvm_dir(training_resources) + result = _run( + docker_client, + image_uri, + training_resources, + STD_HP, + STD_IDC, + STD_RC, + [os.path.join(d, "agaricus.libsvm.train.weights")], + [os.path.join(d, "agaricus.libsvm.test")], + ) + _assert_success(result) + + def test_single_file_libsvm_hpo_param(self, docker_client, image_uri, training_resources): + hp = copy.deepcopy(STD_HP) + d = _libsvm_dir(training_resources) + for metric in [ + "validation:rmse", + "validation:mae", + "validation:logloss", + "validation:error", + "validation:auc", + "validation:aucpr", + "validation:ndcg", + "validation:map", + "validation:accuracy", + "validation:f1", + "validation:mse", + ]: + hp["_tuning_objective_metric"] = metric + result = _run( + docker_client, + image_uri, + training_resources, + hp, + STD_IDC, + STD_RC, + [os.path.join(d, "agaricus.libsvm.train")], + [os.path.join(d, "agaricus.libsvm.test")], + ) + _assert_success(result, regex=metric.replace(":", "-")) + + def test_single_file_libsvm_multiclass_hpo(self, docker_client, image_uri, training_resources): + hp = copy.deepcopy(STD_HP) + hp["objective"] = "multi:softmax" + hp["num_class"] = 3 + hp["eval_metric"] = "merror" + hp["_tuning_objective_metric"] = "validation:merror" + d = _libsvm_dir(training_resources) + result = _run( + docker_client, + image_uri, + training_resources, + hp, + STD_IDC, + STD_RC, + [os.path.join(d, "synthetic_multi.libsvm.train")], + [os.path.join(d, "synthetic_multi.libsvm.train")], + ) + _assert_success(result, regex="validation-merror") + + def test_single_file_libsvm_hpo_param_non_overlapping( + self, docker_client, image_uri, training_resources + ): + hp = copy.deepcopy(STD_HP) + hp["_tuning_objective_metric"] = "validation:logloss" + d = _libsvm_dir(training_resources) + result = _run( + docker_client, + image_uri, + training_resources, + hp, + STD_IDC, + STD_RC, + [os.path.join(d, "agaricus.libsvm.train")], + [os.path.join(d, "agaricus.libsvm.test")], + ) + _assert_success(result, regex="(?=.*validation-logloss:.*)(?=.*validation-error:.*)") + + def test_single_file_output_both_default_and_custom_metrics( + self, docker_client, image_uri, training_resources + ): + hp = copy.deepcopy(STD_HP) + eval_metrics = ["logloss", "f1", "error"] + hp["eval_metric"] = ",".join(eval_metrics) + for hpo_metric in ["validation:accuracy", "validation:mae"]: + hp["_tuning_objective_metric"] = hpo_metric + d = _libsvm_dir(training_resources) + result = _run( + docker_client, + image_uri, + training_resources, + hp, + STD_IDC, + STD_RC, + [os.path.join(d, "agaricus.libsvm.train")], + [os.path.join(d, "agaricus.libsvm.test")], + ) + all_metrics = list(set(eval_metrics) | {hpo_metric}) + regex = "".join(f"(?=.*{m.replace(':', '-')})" for m in all_metrics) + _assert_success(result, regex=regex) + + def test_single_file_libsvm_iterate_objectives( + self, docker_client, image_uri, training_resources + ): + hp = copy.deepcopy(STD_HP) + d = _libsvm_dir(training_resources) + for obj in [ + "reg:squarederror", + "reg:logistic", + "binary:logistic", + "binary:logitraw", + "count:poisson", + ]: + hp["objective"] = obj + result = _run( + docker_client, + image_uri, + training_resources, + hp, + STD_IDC, + STD_RC, + [os.path.join(d, "agaricus.libsvm.train")], + [os.path.join(d, "agaricus.libsvm.test")], + ) + _assert_success(result) + + def test_single_file_libsvm_threshold_eval_metric( + self, docker_client, image_uri, training_resources + ): + hp = copy.deepcopy(STD_HP) + hp["eval_metric"] = "error@0.8" + d = _libsvm_dir(training_resources) + result = _run( + docker_client, + image_uri, + training_resources, + hp, + STD_IDC, + STD_RC, + [os.path.join(d, "agaricus.libsvm.train")], + [os.path.join(d, "agaricus.libsvm.test")], + ) + _assert_success(result) + + def test_single_file_libsvm_verbosity(self, docker_client, image_uri, training_resources): + hp = copy.deepcopy(STD_HP) + hp["verbosity"] = "3" + d = _libsvm_dir(training_resources) + result = _run( + docker_client, + image_uri, + training_resources, + hp, + STD_IDC, + STD_RC, + [os.path.join(d, "agaricus.libsvm.train")], + [os.path.join(d, "agaricus.libsvm.test")], + ) + _assert_success(result) + + def test_multi_files_libsvm(self, docker_client, image_uri, training_resources): + d = os.path.join(training_resources, "data", "multi-libsvm") + train_dir = os.path.join(d, "train") + val_dir = os.path.join(d, "val") + result = _run( + docker_client, + image_uri, + training_resources, + STD_HP, + STD_IDC, + STD_RC, + [train_dir], + [val_dir], + ) + _assert_success(result) + + def test_single_file_csv(self, docker_client, image_uri, training_resources): + idc = copy.deepcopy(STD_IDC) + idc["train"]["ContentType"] = "text/csv" + idc["validation"]["ContentType"] = "csv" + d = _csv_dir(training_resources) + result = _run( + docker_client, + image_uri, + training_resources, + STD_HP, + idc, + STD_RC, + [os.path.join(d, "train.csv")], + [os.path.join(d, "val.csv")], + ) + _assert_success(result) + + def test_single_file_csv_weights(self, docker_client, image_uri, training_resources): + idc = copy.deepcopy(STD_IDC) + idc["train"]["ContentType"] = "text/csv" + idc["validation"]["ContentType"] = "text/csv" + hp = copy.deepcopy(STD_HP) + hp["csv_weights"] = "1" + d = _csv_dir(training_resources) + result = _run( + docker_client, + image_uri, + training_resources, + hp, + idc, + STD_RC, + [os.path.join(d, "train.csv.weights")], + [os.path.join(d, "val.csv")], + ) + _assert_success(result) + + def test_multi_file_csv(self, docker_client, image_uri, training_resources): + d = os.path.join(training_resources, "data", "multi-csv") + idc = copy.deepcopy(STD_IDC) + idc["train"]["ContentType"] = "csv" + idc["validation"]["ContentType"] = "csv" + result = _run( + docker_client, + image_uri, + training_resources, + STD_HP, + idc, + STD_RC, + [os.path.join(d, "train")], + [os.path.join(d, "val")], + ) + _assert_success(result) + + def test_single_file_csv_space_separated(self, docker_client, image_uri, training_resources): + idc = copy.deepcopy(STD_IDC) + idc["train"]["ContentType"] = "csv" + idc.pop("validation", None) + d = _csv_dir(training_resources) + result = _run( + docker_client, + image_uri, + training_resources, + STD_HP, + idc, + STD_RC, + [os.path.join(d, "train_space.csv")], + ) + _assert_success(result) + + def test_single_file_csv_sci_notation(self, docker_client, image_uri, training_resources): + idc = copy.deepcopy(STD_IDC) + idc["train"]["ContentType"] = "csv" + idc.pop("validation", None) + d = _csv_dir(training_resources) + result = _run( + docker_client, + image_uri, + training_resources, + STD_HP, + idc, + STD_RC, + [os.path.join(d, "train_sci.csv")], + ) + _assert_success(result) + + def test_single_file_csv_empty_cells(self, docker_client, image_uri, training_resources): + idc = copy.deepcopy(STD_IDC) + idc["train"]["ContentType"] = "csv" + idc.pop("validation", None) + d = _csv_dir(training_resources) + result = _run( + docker_client, + image_uri, + training_resources, + STD_HP, + idc, + STD_RC, + [os.path.join(d, "train_empty_cell.csv")], + ) + _assert_success(result) + + def test_two_container_with_libsvm_data(self, docker_client, image_uri, training_resources): + hp = copy.deepcopy(STD_HP) + hp["tree_method"] = "hist" + hp.pop("updater", None) + idc = copy.deepcopy(STD_IDC) + idc["train"]["ContentType"] = "text/libsvm" + idc["validation"]["ContentType"] = "text/libsvm" + d = _libsvm_dir(training_resources) + train_files = [os.path.join(d, "agaricus.libsvm.train")] + val_files = [os.path.join(d, "agaricus.libsvm.test")] + hosts = ["algo-1", "algo-2"] + rcs = [ + {"current_host": "algo-1", "hosts": hosts}, + {"current_host": "algo-2", "hosts": hosts}, + ] + results = run_distributed_training( + docker_client, + image_uri, + hp, + idc, + rcs, + train_files, + validation_files=val_files, + ) + assert results[0][0] == 0, f"Container 1 failed:\n{results[0][1]}" + assert results[1][0] == 0, f"Container 2 failed:\n{results[1][1]}" + model_files = os.listdir(results[0][2]["model"]) + assert len(model_files) >= 1, ( + f"No model files in master node model dir.\n" + f"Container 1 logs:\n{results[0][1]}\n" + f"Container 2 logs:\n{results[1][1]}" + ) + + def test_two_container_with_libsvm_data_shardedbykey( + self, docker_client, image_uri, training_resources + ): + hp = copy.deepcopy(STD_HP) + hp["tree_method"] = "hist" + hp.pop("updater", None) + idc = copy.deepcopy(STD_IDC) + idc["train"]["ContentType"] = "text/libsvm" + idc["train"]["S3DistributionType"] = "ShardedByS3Key" + idc["validation"]["ContentType"] = "text/libsvm" + idc["validation"]["S3DistributionType"] = "ShardedByS3Key" + d = _libsvm_dir(training_resources) + train_files = [os.path.join(d, "agaricus.libsvm.train")] + val_files = [os.path.join(d, "agaricus.libsvm.test")] + hosts = ["algo-1", "algo-2"] + rcs = [ + {"current_host": "algo-1", "hosts": hosts}, + {"current_host": "algo-2", "hosts": hosts}, + ] + results = run_distributed_training( + docker_client, + image_uri, + hp, + idc, + rcs, + train_files, + validation_files=val_files, + ) + assert results[0][0] == 0, f"Container 1 failed:\n{results[0][1]}" + assert results[1][0] == 0, f"Container 2 failed:\n{results[1][1]}" + model_files = os.listdir(results[0][2]["model"]) + assert len(model_files) >= 1, ( + f"No model files in master node model dir.\n" + f"Container 1 logs:\n{results[0][1]}\n" + f"Container 2 logs:\n{results[1][1]}" + ) + + def test_checkpoint_and_reload(self, docker_client, image_uri, training_resources): + """Train 10 rounds, verify checkpoints, then resume to 20 rounds.""" + hp1 = copy.deepcopy(STD_HP) + hp1["num_round"] = 10 + hp1["eval_metric"] = "error" + hp1.pop("early_stopping_rounds", None) + + idc = copy.deepcopy(STD_IDC) + idc["train"]["ContentType"] = "text/libsvm" + idc.pop("validation", None) + + d = _libsvm_dir(training_resources) + train_files = [os.path.join(d, "agaricus.libsvm.train")] + + # Phase 1: train 10 rounds + exit_code, logs, model_files, paths = run_training( + docker_client, + image_uri, + hp1, + idc, + STD_RC, + training_files=train_files, + checkpointconfig=STD_CPC, + ) + assert exit_code == 0 + assert len(model_files) == 1 + + ckpt_files = os.listdir(paths["checkpoints"]) + assert len(ckpt_files) >= 1, "No checkpoint files found" + regex = r"\[\d+\].*(?=.*train-error:.*)" + assert len(re.findall(regex, logs)) == 10 + + # Phase 2: resume to 20 rounds using same opt_ml dir + hp2 = copy.deepcopy(STD_HP) + hp2["num_round"] = 20 + hp2["eval_metric"] = "error" + hp2.pop("early_stopping_rounds", None) + + config_dir = paths["input_config"] + with open(os.path.join(config_dir, "hyperparameters.json"), "w") as f: + json.dump(hp2, f) + + # Clear model dir for fresh output + for mf in os.listdir(paths["model"]): + os.remove(os.path.join(paths["model"], mf)) + + tmpdir = paths["input_config"].rsplit("/input/", 1)[0] + volumes = {tmpdir: {"bind": "/opt/ml", "mode": "rw"}} + + container = docker_client.containers.run( + image_uri, + command="train", + volumes=volumes, + detach=True, + ) + try: + result = container.wait(timeout=300) + exit_code2 = result.get("StatusCode", -1) + except Exception: + exit_code2 = -1 + finally: + logs2 = container.logs().decode("utf-8", errors="replace") + container.remove(force=True) + + assert exit_code2 == 0 + ckpt_files2 = os.listdir(paths["checkpoints"]) + assert len(ckpt_files2) >= 1 + assert len(re.findall(regex, logs2)) >= 10 + + +# =========================================================================== +# Invalid training tests +# =========================================================================== + + +class TestInvalidTraining: + def _get_libsvm_data(self, resources, with_validation=True): + d = _libsvm_dir(resources) + train = [os.path.join(d, "agaricus.libsvm.train")] + val = [os.path.join(d, "agaricus.libsvm.test")] + return (train, val) if with_validation else train + + def test_no_training_data(self, docker_client, image_uri, training_resources): + result = _run(docker_client, image_uri, training_resources, STD_HP, STD_IDC, STD_RC, []) + _assert_failed(result) + + def test_no_validation_data(self, docker_client, image_uri, training_resources): + train = self._get_libsvm_data(training_resources, False) + result = _run( + docker_client, image_uri, training_resources, STD_HP, STD_IDC, STD_RC, train, [] + ) + _assert_failed(result) + + def test_invalid_data_csv_content_type(self, docker_client, image_uri, training_resources): + idc = copy.deepcopy(STD_IDC) + idc["train"]["ContentType"] = "csv" + idc["validation"]["ContentType"] = "csv" + d = os.path.join(training_resources, "data", "invalid-data") + result = _run( + docker_client, + image_uri, + training_resources, + STD_HP, + idc, + STD_RC, + [os.path.join(d, "data.rec")], + [os.path.join(d, "data.rec")], + ) + _assert_failed(result) + + def test_csv_alpha_with_csv_content_type(self, docker_client, image_uri, training_resources): + idc = copy.deepcopy(STD_IDC) + idc["train"]["ContentType"] = "text/csv" + d = _csv_dir(training_resources) + result = _run( + docker_client, + image_uri, + training_resources, + STD_HP, + idc, + STD_RC, + [os.path.join(d, "train_alpha.csv")], + ) + _assert_failed(result) + + def test_csv_data_with_libsvm_content_type(self, docker_client, image_uri, training_resources): + d = _csv_dir(training_resources) + result = _run( + docker_client, + image_uri, + training_resources, + STD_HP, + STD_IDC, + STD_RC, + [os.path.join(d, "train.csv")], + [os.path.join(d, "val.csv")], + ) + _assert_failed(result, regex="UserError:") + + def test_invalid_data_with_libsvm_content_type( + self, docker_client, image_uri, training_resources + ): + d = os.path.join(training_resources, "data", "invalid-data") + result = _run( + docker_client, + image_uri, + training_resources, + STD_HP, + STD_IDC, + STD_RC, + [os.path.join(d, "data.rec")], + [os.path.join(d, "data.rec")], + ) + _assert_failed(result) + + @pytest.mark.parametrize( + "param,values", + [ + ("eta", ["-0.1", "1.01", "invalid_string"]), + ("gamma", ["-0.1", "invalid_string"]), + ("max_depth", ["-0.1", "invalid_string"]), + ("min_child_weight", ["-0.1", "invalid_string"]), + ("max_delta_step", ["-0.1", "invalid_string"]), + ("colsample_bytree", ["-0.1", "0", "invalid_string"]), + ("colsample_bylevel", ["-0.1", "0", "invalid_string"]), + ("tree_method", ["invalid_method", "gpu_exact"]), + ("sketch_eps", ["0", "1", "invalid_string"]), + ("refresh_leaf", ["invalid", "2"]), + ("process_type", ["invalid", "0.01"]), + ("grow_policy", ["invalid", "0.01"]), + ("sample_type", ["invalid", "0.01"]), + ("normalize_type", ["invalid", "0.01"]), + ("rate_drop", ["invalid", "-0.01", "1.01"]), + ("one_drop", ["invalid", "-0.01", "1.01"]), + ("skip_drop", ["invalid", "-0.01", "1.01"]), + ("tweedie_variance_power", ["invalid", "1", "2"]), + ("eval_metric", ["invalid", "1", "rmse,invalid", "error@nonfloat"]), + ("booster", ["invalid", "1"]), + ("verbosity", ["invalid", "-1", "4", "0.5"]), + ], + ) + def test_invalid_hyperparameter( + self, docker_client, image_uri, training_resources, param, values + ): + train, val = self._get_libsvm_data(training_resources) + hp = copy.deepcopy(STD_HP) + for v in values: + hp[param] = v + result = _run( + docker_client, image_uri, training_resources, hp, STD_IDC, STD_RC, train, val + ) + _assert_failed(result) + + def test_missing_num_round(self, docker_client, image_uri, training_resources): + hp = copy.deepcopy(STD_HP) + hp.pop("num_round", None) + train, val = self._get_libsvm_data(training_resources) + result = _run(docker_client, image_uri, training_resources, hp, STD_IDC, STD_RC, train, val) + _assert_failed(result) + + def test_multiclass_without_num_class(self, docker_client, image_uri, training_resources): + hp = copy.deepcopy(STD_HP) + train, val = self._get_libsvm_data(training_resources) + for obj in ["multi:softmax", "multi:softprob"]: + hp["objective"] = obj + result = _run( + docker_client, image_uri, training_resources, hp, STD_IDC, STD_RC, train, val + ) + _assert_failed(result) + + def test_libsvm_data_alpha_with_libsvm_content_type( + self, docker_client, image_uri, training_resources + ): + d = _libsvm_dir(training_resources) + result = _run( + docker_client, + image_uri, + training_resources, + STD_HP, + STD_IDC, + STD_RC, + [os.path.join(d, "agaricus.alpha.train")], + [os.path.join(d, "agaricus.alpha.train")], + ) + _assert_failed(result) + + def test_invalid_updater_for_update_process_type( + self, docker_client, image_uri, training_resources + ): + hp = copy.deepcopy(STD_HP) + hp["process_type"] = "update" + train = self._get_libsvm_data(training_resources, False) + idc = copy.deepcopy(STD_IDC) + idc.pop("validation", None) + result = _run(docker_client, image_uri, training_resources, hp, idc, STD_RC, train) + _assert_failed(result) + + hp["updater"] = "refresh,invalid" + result = _run(docker_client, image_uri, training_resources, hp, idc, STD_RC, train) + _assert_failed(result) + + def test_invalid_updater_for_gblinear(self, docker_client, image_uri, training_resources): + hp = copy.deepcopy(STD_HP) + hp["booster"] = "gblinear" + train = self._get_libsvm_data(training_resources, False) + idc = copy.deepcopy(STD_IDC) + idc.pop("validation", None) + result = _run(docker_client, image_uri, training_resources, hp, idc, STD_RC, train) + _assert_failed(result) + + hp["updater"] = "shotgun,grow_colmaker" + result = _run(docker_client, image_uri, training_resources, hp, idc, STD_RC, train) + _assert_failed(result) + + def test_auc_with_invalid_objective(self, docker_client, image_uri, training_resources): + hp = copy.deepcopy(STD_HP) + hp["eval_metric"] = "auc" + train, val = self._get_libsvm_data(training_resources) + for obj in ["reg:squarederror", "reg:linear", "reg:gamma"]: + hp["objective"] = obj + result = _run( + docker_client, image_uri, training_resources, hp, STD_IDC, STD_RC, train, val + ) + _assert_failed(result) + + def test_invalid_eval_metric_values(self, docker_client, image_uri, training_resources): + hp = copy.deepcopy(STD_HP) + train, val = self._get_libsvm_data(training_resources) + for invalid in [" Date: Fri, 3 Apr 2026 17:36:58 -0700 Subject: [PATCH 06/17] AI changes made during Kiro-cli session --- X-AI-Tool: Kiro-cli X-AI-Handle-Time-Seconds: 33 X-AI-Prompt: i want to test just the release logic comment the tests which might take hours to run --- .../workflows/release-sagemaker-xgboost.yml | 126 +++++++++--------- 1 file changed, 62 insertions(+), 64 deletions(-) diff --git a/.github/workflows/release-sagemaker-xgboost.yml b/.github/workflows/release-sagemaker-xgboost.yml index 49c0a4a38b17..6f447ac33c78 100644 --- a/.github/workflows/release-sagemaker-xgboost.yml +++ b/.github/workflows/release-sagemaker-xgboost.yml @@ -96,72 +96,70 @@ jobs: contributor: ${{ needs.load-config.outputs.contributor }} customer-type: ${{ needs.load-config.outputs.customer-type }} - unit-test: - needs: [build-image, load-config] - if: success() - timeout-minutes: 15 - runs-on: - - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} - fleet:default-runner - buildspec-override:true - concurrency: - group: ${{ github.workflow }}-unit-test-${{ github.run_id }} - cancel-in-progress: true - steps: - - name: Checkout DLC source - uses: actions/checkout@v5 - - - name: Clone sagemaker-xgboost-container - run: rm -rf /tmp/xgboost-unit && git clone --depth 1 ${{ env.XGBOOST_CONTAINER_REPO }} /tmp/xgboost-unit - - - name: ECR login - uses: ./.github/actions/ecr-authenticate - with: - aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} - aws-region: ${{ vars.AWS_REGION }} - image-uri: ${{ needs.build-image.outputs.ci-image }} - - - name: Build test image - run: | - CI_IMAGE_URI="${{ needs.build-image.outputs.ci-image }}" - cd /tmp/xgboost-unit - printf "FROM ${CI_IMAGE_URI}\nADD . /app\nWORKDIR /app\nRUN python3 -m pip install .[test]" > Dockerfile.test - docker build -t test-xgboost -f Dockerfile.test . - - - name: Run unit tests - run: | - docker run --rm test-xgboost sh -c \ - 'python3 -m pytest --cov=sagemaker_xgboost_container --cov-fail-under=60 test/unit' - - - name: Run flake8 - run: | - docker run --rm test-xgboost sh -c 'python3 -m flake8 setup.py src test' - - security-test: - needs: [build-image, load-config] - if: success() - concurrency: - group: ${{ github.workflow }}-security-test-${{ github.run_id }} - cancel-in-progress: true - uses: ./.github/workflows/reusable-security-tests.yml - with: - image-uri: ${{ needs.build-image.outputs.ci-image }} - aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} - aws-region: ${{ vars.AWS_REGION }} - framework: ${{ needs.load-config.outputs.framework }} - framework-version: ${{ needs.load-config.outputs.framework-version }} - - xgboost-tests: - needs: [build-image, load-config] - if: success() - uses: ./.github/workflows/sagemaker-xgboost-integ-tests.yml - with: - image-uri: ${{ needs.build-image.outputs.ci-image }} - aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} - aws-region: ${{ vars.AWS_REGION }} + # unit-test commented out for release logic testing + # unit-test: + # needs: [build-image, load-config] + # if: success() + # timeout-minutes: 15 + # runs-on: + # - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} + # fleet:default-runner + # buildspec-override:true + # concurrency: + # group: ${{ github.workflow }}-unit-test-${{ github.run_id }} + # cancel-in-progress: true + # steps: + # - name: Checkout DLC source + # uses: actions/checkout@v5 + # - name: Clone sagemaker-xgboost-container + # run: rm -rf /tmp/xgboost-unit && git clone --depth 1 ${{ env.XGBOOST_CONTAINER_REPO }} /tmp/xgboost-unit + # - name: ECR login + # uses: ./.github/actions/ecr-authenticate + # with: + # aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} + # aws-region: ${{ vars.AWS_REGION }} + # image-uri: ${{ needs.build-image.outputs.ci-image }} + # - name: Build test image + # run: | + # CI_IMAGE_URI="${{ needs.build-image.outputs.ci-image }}" + # cd /tmp/xgboost-unit + # printf "FROM ${CI_IMAGE_URI}\nADD . /app\nWORKDIR /app\nRUN python3 -m pip install .[test]" > Dockerfile.test + # docker build -t test-xgboost -f Dockerfile.test . + # - name: Run unit tests + # run: | + # docker run --rm test-xgboost sh -c \ + # 'python3 -m pytest --cov=sagemaker_xgboost_container --cov-fail-under=60 test/unit' + # - name: Run flake8 + # run: | + # docker run --rm test-xgboost sh -c 'python3 -m flake8 setup.py src test' + + # security-test commented out for release logic testing + # security-test: + # needs: [build-image, load-config] + # if: success() + # concurrency: + # group: ${{ github.workflow }}-security-test-${{ github.run_id }} + # cancel-in-progress: true + # uses: ./.github/workflows/reusable-security-tests.yml + # with: + # image-uri: ${{ needs.build-image.outputs.ci-image }} + # aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} + # aws-region: ${{ vars.AWS_REGION }} + # framework: ${{ needs.load-config.outputs.framework }} + # framework-version: ${{ needs.load-config.outputs.framework-version }} + + # xgboost-tests commented out for release logic testing + # xgboost-tests: + # needs: [build-image, load-config] + # if: success() + # uses: ./.github/workflows/sagemaker-xgboost-integ-tests.yml + # with: + # image-uri: ${{ needs.build-image.outputs.ci-image }} + # aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} + # aws-region: ${{ vars.AWS_REGION }} generate-release-spec: - needs: [load-config, build-image, unit-test, security-test] + needs: [load-config, build-image] runs-on: ubuntu-latest concurrency: group: ${{ github.workflow }}-generate-release-spec-${{ github.run_id }} From 9ce467f5ad8c4e24b141896186c6bd9f7e646622 Mon Sep 17 00:00:00 2001 From: Bhanu Teja Goshikonda Date: Wed, 8 Apr 2026 14:20:53 -0700 Subject: [PATCH 07/17] AI changes made during Kiro-cli session --- X-AI-Tool: Kiro-cli X-AI-Handle-Time-Seconds: 38 X-AI-Prompt: in the pr we have made changes to release-sagemkaker-xgboost.yml file to skip tests now revert them i want to run all the tests --- .../workflows/release-sagemaker-xgboost.yml | 117 +++++++++--------- 1 file changed, 57 insertions(+), 60 deletions(-) diff --git a/.github/workflows/release-sagemaker-xgboost.yml b/.github/workflows/release-sagemaker-xgboost.yml index 6f447ac33c78..c99f83dc442d 100644 --- a/.github/workflows/release-sagemaker-xgboost.yml +++ b/.github/workflows/release-sagemaker-xgboost.yml @@ -96,70 +96,67 @@ jobs: contributor: ${{ needs.load-config.outputs.contributor }} customer-type: ${{ needs.load-config.outputs.customer-type }} - # unit-test commented out for release logic testing - # unit-test: - # needs: [build-image, load-config] - # if: success() - # timeout-minutes: 15 - # runs-on: - # - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} - # fleet:default-runner - # buildspec-override:true - # concurrency: - # group: ${{ github.workflow }}-unit-test-${{ github.run_id }} - # cancel-in-progress: true - # steps: - # - name: Checkout DLC source - # uses: actions/checkout@v5 - # - name: Clone sagemaker-xgboost-container - # run: rm -rf /tmp/xgboost-unit && git clone --depth 1 ${{ env.XGBOOST_CONTAINER_REPO }} /tmp/xgboost-unit - # - name: ECR login - # uses: ./.github/actions/ecr-authenticate - # with: - # aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} - # aws-region: ${{ vars.AWS_REGION }} - # image-uri: ${{ needs.build-image.outputs.ci-image }} - # - name: Build test image - # run: | - # CI_IMAGE_URI="${{ needs.build-image.outputs.ci-image }}" - # cd /tmp/xgboost-unit - # printf "FROM ${CI_IMAGE_URI}\nADD . /app\nWORKDIR /app\nRUN python3 -m pip install .[test]" > Dockerfile.test - # docker build -t test-xgboost -f Dockerfile.test . - # - name: Run unit tests - # run: | - # docker run --rm test-xgboost sh -c \ - # 'python3 -m pytest --cov=sagemaker_xgboost_container --cov-fail-under=60 test/unit' - # - name: Run flake8 - # run: | - # docker run --rm test-xgboost sh -c 'python3 -m flake8 setup.py src test' + unit-test: + needs: [build-image, load-config] + if: success() + timeout-minutes: 15 + runs-on: + - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} + fleet:default-runner + buildspec-override:true + concurrency: + group: ${{ github.workflow }}-unit-test-${{ github.run_id }} + cancel-in-progress: true + steps: + - name: Checkout DLC source + uses: actions/checkout@v5 + - name: Clone sagemaker-xgboost-container + run: rm -rf /tmp/xgboost-unit && git clone --depth 1 ${{ env.XGBOOST_CONTAINER_REPO }} /tmp/xgboost-unit + - name: ECR login + uses: ./.github/actions/ecr-authenticate + with: + aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} + aws-region: ${{ vars.AWS_REGION }} + image-uri: ${{ needs.build-image.outputs.ci-image }} + - name: Build test image + run: | + CI_IMAGE_URI="${{ needs.build-image.outputs.ci-image }}" + cd /tmp/xgboost-unit + printf "FROM ${CI_IMAGE_URI}\nADD . /app\nWORKDIR /app\nRUN python3 -m pip install .[test]" > Dockerfile.test + docker build -t test-xgboost -f Dockerfile.test . + - name: Run unit tests + run: | + docker run --rm test-xgboost sh -c \ + 'python3 -m pytest --cov=sagemaker_xgboost_container --cov-fail-under=60 test/unit' + - name: Run flake8 + run: | + docker run --rm test-xgboost sh -c 'python3 -m flake8 setup.py src test' - # security-test commented out for release logic testing - # security-test: - # needs: [build-image, load-config] - # if: success() - # concurrency: - # group: ${{ github.workflow }}-security-test-${{ github.run_id }} - # cancel-in-progress: true - # uses: ./.github/workflows/reusable-security-tests.yml - # with: - # image-uri: ${{ needs.build-image.outputs.ci-image }} - # aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} - # aws-region: ${{ vars.AWS_REGION }} - # framework: ${{ needs.load-config.outputs.framework }} - # framework-version: ${{ needs.load-config.outputs.framework-version }} + security-test: + needs: [build-image, load-config] + if: success() + concurrency: + group: ${{ github.workflow }}-security-test-${{ github.run_id }} + cancel-in-progress: true + uses: ./.github/workflows/reusable-security-tests.yml + with: + image-uri: ${{ needs.build-image.outputs.ci-image }} + aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} + aws-region: ${{ vars.AWS_REGION }} + framework: ${{ needs.load-config.outputs.framework }} + framework-version: ${{ needs.load-config.outputs.framework-version }} - # xgboost-tests commented out for release logic testing - # xgboost-tests: - # needs: [build-image, load-config] - # if: success() - # uses: ./.github/workflows/sagemaker-xgboost-integ-tests.yml - # with: - # image-uri: ${{ needs.build-image.outputs.ci-image }} - # aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} - # aws-region: ${{ vars.AWS_REGION }} + xgboost-tests: + needs: [build-image, load-config] + if: success() + uses: ./.github/workflows/sagemaker-xgboost-integ-tests.yml + with: + image-uri: ${{ needs.build-image.outputs.ci-image }} + aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} + aws-region: ${{ vars.AWS_REGION }} generate-release-spec: - needs: [load-config, build-image] + needs: [load-config, build-image, unit-test, security-test] runs-on: ubuntu-latest concurrency: group: ${{ github.workflow }}-generate-release-spec-${{ github.run_id }} From 8761219bd69b301a0dc92009b53efa3c1b8496ca Mon Sep 17 00:00:00 2001 From: Bhanu Teja Goshikonda Date: Wed, 8 Apr 2026 16:20:45 -0700 Subject: [PATCH 08/17] Human changes made during kiro-cli session after prompt completion. --- X-AI-Tool: Human X-AI-Prompt: the branch is again out of date --- .github/workflows/release-sagemaker-xgboost.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/release-sagemaker-xgboost.yml b/.github/workflows/release-sagemaker-xgboost.yml index 777bc227c6dc..a1535270b992 100644 --- a/.github/workflows/release-sagemaker-xgboost.yml +++ b/.github/workflows/release-sagemaker-xgboost.yml @@ -156,7 +156,7 @@ jobs: aws-region: ${{ vars.AWS_REGION }} generate-release-spec: - needs: [load-config, build-image, unit-test, security-test] + needs: [load-config, build-image, unit-test, security-test, xgboost-tests] runs-on: ubuntu-latest concurrency: group: ${{ github.workflow }}-generate-release-spec-${{ github.run_id }} From 4f55f5e42c40a04c4454f65c5506421db8086e52 Mon Sep 17 00:00:00 2001 From: Bhanu Teja Goshikonda Date: Wed, 8 Apr 2026 21:52:34 -0700 Subject: [PATCH 09/17] AI changes made during Kiro-cli session --- X-AI-Tool: Kiro-cli X-AI-Handle-Time-Seconds: 38 X-AI-Prompt: i want to test gamma release. skip the tests so that the release logic is tested --- .../workflows/release-sagemaker-xgboost.yml | 115 +++++++++--------- 1 file changed, 58 insertions(+), 57 deletions(-) diff --git a/.github/workflows/release-sagemaker-xgboost.yml b/.github/workflows/release-sagemaker-xgboost.yml index a1535270b992..28d66c8c5584 100644 --- a/.github/workflows/release-sagemaker-xgboost.yml +++ b/.github/workflows/release-sagemaker-xgboost.yml @@ -96,67 +96,68 @@ jobs: contributor: ${{ needs.load-config.outputs.contributor }} customer-type: ${{ needs.load-config.outputs.customer-type }} - unit-test: - needs: [security-test, build-image, load-config] - if: success() - timeout-minutes: 15 - runs-on: - - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} - fleet:default-runner - buildspec-override:true - concurrency: - group: ${{ github.workflow }}-unit-test-${{ github.run_id }} - cancel-in-progress: true - steps: - - name: Checkout DLC source - uses: actions/checkout@v5 - - name: Clone sagemaker-xgboost-container - run: rm -rf /tmp/xgboost-unit && git clone --depth 1 ${{ env.XGBOOST_CONTAINER_REPO }} /tmp/xgboost-unit - - name: ECR login - uses: ./.github/actions/ecr-authenticate - with: - aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} - aws-region: ${{ vars.AWS_REGION }} - image-uri: ${{ needs.build-image.outputs.ci-image }} - - name: Build test image - run: | - CI_IMAGE_URI="${{ needs.build-image.outputs.ci-image }}" - cd /tmp/xgboost-unit - printf "FROM ${CI_IMAGE_URI}\nADD . /app\nWORKDIR /app\nRUN python3 -m pip install .[test]" > Dockerfile.test - docker build -t test-xgboost -f Dockerfile.test . - - name: Run unit tests - run: | - docker run --rm test-xgboost sh -c \ - 'python3 -m pytest --cov=sagemaker_xgboost_container --cov-fail-under=60 test/unit' - - name: Run flake8 - run: | - docker run --rm test-xgboost sh -c 'python3 -m flake8 setup.py src test' + # Tests commented out for gamma release testing + # unit-test: + # needs: [security-test, build-image, load-config] + # if: success() + # timeout-minutes: 15 + # runs-on: + # - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} + # fleet:default-runner + # buildspec-override:true + # concurrency: + # group: ${{ github.workflow }}-unit-test-${{ github.run_id }} + # cancel-in-progress: true + # steps: + # - name: Checkout DLC source + # uses: actions/checkout@v5 + # - name: Clone sagemaker-xgboost-container + # run: rm -rf /tmp/xgboost-unit && git clone --depth 1 ${{ env.XGBOOST_CONTAINER_REPO }} /tmp/xgboost-unit + # - name: ECR login + # uses: ./.github/actions/ecr-authenticate + # with: + # aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} + # aws-region: ${{ vars.AWS_REGION }} + # image-uri: ${{ needs.build-image.outputs.ci-image }} + # - name: Build test image + # run: | + # CI_IMAGE_URI="${{ needs.build-image.outputs.ci-image }}" + # cd /tmp/xgboost-unit + # printf "FROM ${CI_IMAGE_URI}\nADD . /app\nWORKDIR /app\nRUN python3 -m pip install .[test]" > Dockerfile.test + # docker build -t test-xgboost -f Dockerfile.test . + # - name: Run unit tests + # run: | + # docker run --rm test-xgboost sh -c \ + # 'python3 -m pytest --cov=sagemaker_xgboost_container --cov-fail-under=60 test/unit' + # - name: Run flake8 + # run: | + # docker run --rm test-xgboost sh -c 'python3 -m flake8 setup.py src test' - security-test: - needs: [build-image, load-config] - if: success() - concurrency: - group: ${{ github.workflow }}-security-test-${{ github.run_id }} - cancel-in-progress: true - uses: ./.github/workflows/reusable-security-tests.yml - with: - image-uri: ${{ needs.build-image.outputs.ci-image }} - aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} - aws-region: ${{ vars.AWS_REGION }} - framework: ${{ needs.load-config.outputs.framework }} - framework-version: ${{ needs.load-config.outputs.framework-version }} + # security-test: + # needs: [build-image, load-config] + # if: success() + # concurrency: + # group: ${{ github.workflow }}-security-test-${{ github.run_id }} + # cancel-in-progress: true + # uses: ./.github/workflows/reusable-security-tests.yml + # with: + # image-uri: ${{ needs.build-image.outputs.ci-image }} + # aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} + # aws-region: ${{ vars.AWS_REGION }} + # framework: ${{ needs.load-config.outputs.framework }} + # framework-version: ${{ needs.load-config.outputs.framework-version }} - xgboost-tests: - needs: [security-test, build-image, load-config] - if: success() - uses: ./.github/workflows/sagemaker-xgboost-integ-tests.yml - with: - image-uri: ${{ needs.build-image.outputs.ci-image }} - aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} - aws-region: ${{ vars.AWS_REGION }} + # xgboost-tests: + # needs: [security-test, build-image, load-config] + # if: success() + # uses: ./.github/workflows/sagemaker-xgboost-integ-tests.yml + # with: + # image-uri: ${{ needs.build-image.outputs.ci-image }} + # aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} + # aws-region: ${{ vars.AWS_REGION }} generate-release-spec: - needs: [load-config, build-image, unit-test, security-test, xgboost-tests] + needs: [load-config, build-image] runs-on: ubuntu-latest concurrency: group: ${{ github.workflow }}-generate-release-spec-${{ github.run_id }} From 5ec949698a40ad98edebcb5cf25037ff35695fdd Mon Sep 17 00:00:00 2001 From: Bhanu Teja Goshikonda Date: Wed, 8 Apr 2026 23:43:36 -0700 Subject: [PATCH 10/17] AI changes made during Kiro-cli session --- X-AI-Tool: Kiro-cli X-AI-Handle-Time-Seconds: 38 X-AI-Prompt: no i created a new GitHub environment preprod and now make changees to make it possible --- .github/config/sagemaker-xgboost.yml | 2 +- .github/workflows/reusable-release-image.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/config/sagemaker-xgboost.yml b/.github/config/sagemaker-xgboost.yml index b5a13453b986..03f784a2b5de 100644 --- a/.github/config/sagemaker-xgboost.yml +++ b/.github/config/sagemaker-xgboost.yml @@ -27,4 +27,4 @@ release: public_registry: false private_registry: true enable_soci: false - environment: gamma + environment: preprod diff --git a/.github/workflows/reusable-release-image.yml b/.github/workflows/reusable-release-image.yml index 24f9ad84aa4a..7940599d1236 100644 --- a/.github/workflows/reusable-release-image.yml +++ b/.github/workflows/reusable-release-image.yml @@ -52,9 +52,9 @@ jobs: ENVIRONMENT="${{ inputs.environment }}" # Validate environment input - if [[ "${ENVIRONMENT}" != "gamma" && "${ENVIRONMENT}" != "production" ]]; then + if [[ "${ENVIRONMENT}" != "gamma" && "${ENVIRONMENT}" != "production" && "${ENVIRONMENT}" != "preprod" ]]; then echo "❌ ERROR: Invalid environment '${ENVIRONMENT}'" - echo "Valid environments: gamma, production" + echo "Valid environments: gamma, preprod, production" exit 1 fi From 2f85fab07b9278a08f684952baac8b1dbcd71914 Mon Sep 17 00:00:00 2001 From: Bhanu Teja Goshikonda Date: Thu, 9 Apr 2026 00:25:48 -0700 Subject: [PATCH 11/17] AI changes made during Kiro-cli session --- X-AI-Tool: Kiro-cli X-AI-Handle-Time-Seconds: 13 X-AI-Prompt: change env to gamma i want to trigger --- .github/config/sagemaker-xgboost.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/config/sagemaker-xgboost.yml b/.github/config/sagemaker-xgboost.yml index 03f784a2b5de..b5a13453b986 100644 --- a/.github/config/sagemaker-xgboost.yml +++ b/.github/config/sagemaker-xgboost.yml @@ -27,4 +27,4 @@ release: public_registry: false private_registry: true enable_soci: false - environment: preprod + environment: gamma From 8723b2f988fdd05b6a5b8a38a608585a2836d2c6 Mon Sep 17 00:00:00 2001 From: Bhanu Teja Goshikonda Date: Thu, 9 Apr 2026 00:31:37 -0700 Subject: [PATCH 12/17] AI changes made during Kiro-cli session --- X-AI-Tool: Kiro-cli X-AI-Handle-Time-Seconds: 12 X-AI-Prompt: change back to preprod --- .github/config/sagemaker-xgboost.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/config/sagemaker-xgboost.yml b/.github/config/sagemaker-xgboost.yml index b5a13453b986..03f784a2b5de 100644 --- a/.github/config/sagemaker-xgboost.yml +++ b/.github/config/sagemaker-xgboost.yml @@ -27,4 +27,4 @@ release: public_registry: false private_registry: true enable_soci: false - environment: gamma + environment: preprod From 02d06b79fc961d7bc69e0385ebd6965a373213ae Mon Sep 17 00:00:00 2001 From: Bhanu Teja Goshikonda Date: Thu, 9 Apr 2026 12:57:10 -0700 Subject: [PATCH 13/17] AI changes made during Kiro-cli session --- X-AI-Tool: Kiro-cli X-AI-Handle-Time-Seconds: 28 X-AI-Prompt: change env to gamma from production --- .github/config/sagemaker-xgboost.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/config/sagemaker-xgboost.yml b/.github/config/sagemaker-xgboost.yml index 03f784a2b5de..b5a13453b986 100644 --- a/.github/config/sagemaker-xgboost.yml +++ b/.github/config/sagemaker-xgboost.yml @@ -27,4 +27,4 @@ release: public_registry: false private_registry: true enable_soci: false - environment: preprod + environment: gamma From c5f65eee2ade217d12e3653f161a91387adc2564 Mon Sep 17 00:00:00 2001 From: Bhanu Teja Goshikonda Date: Thu, 9 Apr 2026 13:22:04 -0700 Subject: [PATCH 14/17] AI changes made during Kiro-cli session --- X-AI-Tool: Kiro-cli X-AI-Handle-Time-Seconds: 9 X-AI-Prompt: ok change the env to preprod --- .github/config/sagemaker-xgboost.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/config/sagemaker-xgboost.yml b/.github/config/sagemaker-xgboost.yml index b5a13453b986..03f784a2b5de 100644 --- a/.github/config/sagemaker-xgboost.yml +++ b/.github/config/sagemaker-xgboost.yml @@ -27,4 +27,4 @@ release: public_registry: false private_registry: true enable_soci: false - environment: gamma + environment: preprod From 4d9c19921886db187e3aa86a24f18fa26c158ae3 Mon Sep 17 00:00:00 2001 From: Bhanu Teja Goshikonda Date: Thu, 9 Apr 2026 15:44:09 -0700 Subject: [PATCH 15/17] AI changes made during Kiro-cli session --- X-AI-Tool: Kiro-cli X-AI-Handle-Time-Seconds: 34 X-AI-Prompt: remove on push and revert force release --- .github/config/sagemaker-xgboost.yml | 2 +- .github/workflows/release-sagemaker-xgboost.yml | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/.github/config/sagemaker-xgboost.yml b/.github/config/sagemaker-xgboost.yml index 03f784a2b5de..2305cda4cf0b 100644 --- a/.github/config/sagemaker-xgboost.yml +++ b/.github/config/sagemaker-xgboost.yml @@ -23,7 +23,7 @@ common: # Release configuration release: release: true - force_release: true + force_release: false public_registry: false private_registry: true enable_soci: false diff --git a/.github/workflows/release-sagemaker-xgboost.yml b/.github/workflows/release-sagemaker-xgboost.yml index 28d66c8c5584..18ac0ab8efda 100644 --- a/.github/workflows/release-sagemaker-xgboost.yml +++ b/.github/workflows/release-sagemaker-xgboost.yml @@ -1,9 +1,6 @@ name: Release - XGBoost SageMaker on: - # TODO: Remove push trigger after testing, keep only workflow_dispatch - push: - branches: [xgboost-migration] workflow_dispatch: permissions: From ec203ef60d0a5b9be1945c38df622117daac57a7 Mon Sep 17 00:00:00 2001 From: Bhanu Teja Goshikonda Date: Thu, 9 Apr 2026 15:49:22 -0700 Subject: [PATCH 16/17] AI changes made during Kiro-cli session --- X-AI-Tool: Kiro-cli X-AI-Handle-Time-Seconds: 41 X-AI-Prompt: ok enable all the tests also --- .../workflows/release-sagemaker-xgboost.yml | 115 +++++++++--------- 1 file changed, 57 insertions(+), 58 deletions(-) diff --git a/.github/workflows/release-sagemaker-xgboost.yml b/.github/workflows/release-sagemaker-xgboost.yml index 18ac0ab8efda..e7798f602d25 100644 --- a/.github/workflows/release-sagemaker-xgboost.yml +++ b/.github/workflows/release-sagemaker-xgboost.yml @@ -93,68 +93,67 @@ jobs: contributor: ${{ needs.load-config.outputs.contributor }} customer-type: ${{ needs.load-config.outputs.customer-type }} - # Tests commented out for gamma release testing - # unit-test: - # needs: [security-test, build-image, load-config] - # if: success() - # timeout-minutes: 15 - # runs-on: - # - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} - # fleet:default-runner - # buildspec-override:true - # concurrency: - # group: ${{ github.workflow }}-unit-test-${{ github.run_id }} - # cancel-in-progress: true - # steps: - # - name: Checkout DLC source - # uses: actions/checkout@v5 - # - name: Clone sagemaker-xgboost-container - # run: rm -rf /tmp/xgboost-unit && git clone --depth 1 ${{ env.XGBOOST_CONTAINER_REPO }} /tmp/xgboost-unit - # - name: ECR login - # uses: ./.github/actions/ecr-authenticate - # with: - # aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} - # aws-region: ${{ vars.AWS_REGION }} - # image-uri: ${{ needs.build-image.outputs.ci-image }} - # - name: Build test image - # run: | - # CI_IMAGE_URI="${{ needs.build-image.outputs.ci-image }}" - # cd /tmp/xgboost-unit - # printf "FROM ${CI_IMAGE_URI}\nADD . /app\nWORKDIR /app\nRUN python3 -m pip install .[test]" > Dockerfile.test - # docker build -t test-xgboost -f Dockerfile.test . - # - name: Run unit tests - # run: | - # docker run --rm test-xgboost sh -c \ - # 'python3 -m pytest --cov=sagemaker_xgboost_container --cov-fail-under=60 test/unit' - # - name: Run flake8 - # run: | - # docker run --rm test-xgboost sh -c 'python3 -m flake8 setup.py src test' + unit-test: + needs: [security-test, build-image, load-config] + if: success() + timeout-minutes: 15 + runs-on: + - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} + fleet:default-runner + buildspec-override:true + concurrency: + group: ${{ github.workflow }}-unit-test-${{ github.run_id }} + cancel-in-progress: true + steps: + - name: Checkout DLC source + uses: actions/checkout@v5 + - name: Clone sagemaker-xgboost-container + run: rm -rf /tmp/xgboost-unit && git clone --depth 1 ${{ env.XGBOOST_CONTAINER_REPO }} /tmp/xgboost-unit + - name: ECR login + uses: ./.github/actions/ecr-authenticate + with: + aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} + aws-region: ${{ vars.AWS_REGION }} + image-uri: ${{ needs.build-image.outputs.ci-image }} + - name: Build test image + run: | + CI_IMAGE_URI="${{ needs.build-image.outputs.ci-image }}" + cd /tmp/xgboost-unit + printf "FROM ${CI_IMAGE_URI}\nADD . /app\nWORKDIR /app\nRUN python3 -m pip install .[test]" > Dockerfile.test + docker build -t test-xgboost -f Dockerfile.test . + - name: Run unit tests + run: | + docker run --rm test-xgboost sh -c \ + 'python3 -m pytest --cov=sagemaker_xgboost_container --cov-fail-under=60 test/unit' + - name: Run flake8 + run: | + docker run --rm test-xgboost sh -c 'python3 -m flake8 setup.py src test' - # security-test: - # needs: [build-image, load-config] - # if: success() - # concurrency: - # group: ${{ github.workflow }}-security-test-${{ github.run_id }} - # cancel-in-progress: true - # uses: ./.github/workflows/reusable-security-tests.yml - # with: - # image-uri: ${{ needs.build-image.outputs.ci-image }} - # aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} - # aws-region: ${{ vars.AWS_REGION }} - # framework: ${{ needs.load-config.outputs.framework }} - # framework-version: ${{ needs.load-config.outputs.framework-version }} + security-test: + needs: [build-image, load-config] + if: success() + concurrency: + group: ${{ github.workflow }}-security-test-${{ github.run_id }} + cancel-in-progress: true + uses: ./.github/workflows/reusable-security-tests.yml + with: + image-uri: ${{ needs.build-image.outputs.ci-image }} + aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} + aws-region: ${{ vars.AWS_REGION }} + framework: ${{ needs.load-config.outputs.framework }} + framework-version: ${{ needs.load-config.outputs.framework-version }} - # xgboost-tests: - # needs: [security-test, build-image, load-config] - # if: success() - # uses: ./.github/workflows/sagemaker-xgboost-integ-tests.yml - # with: - # image-uri: ${{ needs.build-image.outputs.ci-image }} - # aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} - # aws-region: ${{ vars.AWS_REGION }} + xgboost-tests: + needs: [security-test, build-image, load-config] + if: success() + uses: ./.github/workflows/sagemaker-xgboost-integ-tests.yml + with: + image-uri: ${{ needs.build-image.outputs.ci-image }} + aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }} + aws-region: ${{ vars.AWS_REGION }} generate-release-spec: - needs: [load-config, build-image] + needs: [load-config, build-image, unit-test, security-test, xgboost-tests] runs-on: ubuntu-latest concurrency: group: ${{ github.workflow }}-generate-release-spec-${{ github.run_id }} From 7f01b4a52d0d52420193659b625345dd6eb1ac90 Mon Sep 17 00:00:00 2001 From: Bhanu Teja Goshikonda Date: Thu, 9 Apr 2026 16:03:59 -0700 Subject: [PATCH 17/17] AI changes made during Kiro-cli session --- X-AI-Tool: Kiro-cli X-AI-Handle-Time-Seconds: 12 X-AI-Prompt: also make release : false --- .github/config/sagemaker-xgboost.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/config/sagemaker-xgboost.yml b/.github/config/sagemaker-xgboost.yml index 2305cda4cf0b..c945df395d69 100644 --- a/.github/config/sagemaker-xgboost.yml +++ b/.github/config/sagemaker-xgboost.yml @@ -22,7 +22,7 @@ common: # Release configuration release: - release: true + release: false force_release: false public_registry: false private_registry: true