test(serve): isolate Bedrock model-import integ tests behind import_model marker (#5952)

lucasjia-aws · web-flow · commit f93524bcab55 · 2026-06-16T14:43:37.000-07:00
The "Concurrent model import jobs" Bedrock quota is fixed at 1 and not
raisable. Running these tests in parallel under PR checks (-n auto) makes
them collide on the quota and flake. Register a new `import_model` pytest
marker and tag the four import-job tests so they can be split out of PR
checks into a dedicated serial scheduled run.
diff --git a/.github/workflows/gpu-integ-tests.yml b/.github/workflows/gpu-integ-tests.yml
@@ -79,13 +79,36 @@ jobs:
           project-name: sagemaker-python-sdk-ci-health-gpu-integ-tests
           source-version: refs/heads/master
 
+  # Bedrock model-import integ tests. Run serially (concurrency 1) in their own
+  # CodeBuild project because the "Concurrent model import jobs" Bedrock quota is
+  # fixed at 1 and not raisable; running them in parallel (as PR checks did)
+  # makes them collide and flake. us-west-2 only (no us_east_1-marked tests).
+  # Folded into the same run-level pass/fail metric as the GPU jobs (emitted by
+  # report-result below), so it shares the GpuIntegRunAlarm, not a separate alarm.
+  import-model-integ-tests:
+    needs: check-prior-success
+    if: needs.check-prior-success.outputs.already_succeeded != 'true'
+    runs-on: ubuntu-latest
+    steps:
+      - name: Configure AWS Credentials
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          role-to-assume: ${{ secrets.CI_AWS_ROLE_ARN }}
+          aws-region: us-west-2
+          role-duration-seconds: 10800
+      - name: Run Bedrock Model-Import Integ Tests
+        uses: aws-actions/aws-codebuild-run-build@v1
+        with:
+          project-name: sagemaker-python-sdk-ci-health-import-model-integ-tests
+          source-version: refs/heads/master
+
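-  # Run-level result: a run is successful only if BOTH region jobs succeeded.
+  # Run-level result: a run is successful only if ALL test jobs (both regions + import) succeeded.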
   # Emits GpuIntegRunFailure = 1 (failed) / 0 (succeeded) to CloudWatch in
   # us-west-2. The CDK alarm (GpuIntegRunAlarm) sums this over a UTC day and
   # cuts a daytime sev2 when all of the day's runs failed. Skipped when the gate
   # short-circuited today's run (an earlier run already succeeded).
   report-result:
-    needs: [check-prior-success, gpu-integ-tests, gpu-integ-tests-us-east-1]
+    needs: [check-prior-success, gpu-integ-tests, gpu-integ-tests-us-east-1, import-model-integ-tests]
     # Only emit the daily alarm metric for scheduled runs that actually executed
     # the test jobs:
     #   - check-prior-success.result == 'success': if the gate job itself failed,
@@ -111,12 +134,13 @@ jobs:
             exit 0
           fi
           if [ "${{ needs.gpu-integ-tests.result }}" == "success" ] && \
-             [ "${{ needs.gpu-integ-tests-us-east-1.result }}" == "success" ]; then
+             [ "${{ needs.gpu-integ-tests-us-east-1.result }}" == "success" ] && \
+             [ "${{ needs.import-model-integ-tests.result }}" == "success" ]; then
             value=0
-            echo "Both region jobs succeeded; emitting GpuIntegRunFailure=0"
+            echo "All region/import jobs succeeded; emitting GpuIntegRunFailure=0"
           else
             value=1
-            echo "At least one region job did not succeed; emitting GpuIntegRunFailure=1"
+            echo "At least one region/import job did not succeed; emitting GpuIntegRunFailure=1"
           fi
           aws cloudwatch put-metric-data \
             --namespace GpuIntegRunMetrics \
diff --git a/sagemaker-serve/tests/integ/test_bedrock_provisioned_throughput.py b/sagemaker-serve/tests/integ/test_bedrock_provisioned_throughput.py
@@ -170,6 +170,7 @@ def _setup_model_files(s3_artifacts_uri, s3_client):
 
 
 @pytest.mark.serial
+@pytest.mark.import_model
 class TestBedrockImportJobPolling:
     """Test import job polling for OSS models (Option C: deploy only waits for import)."""
 
@@ -236,6 +237,7 @@ def test_deploy_oss_model_waits_for_import_completion(
 
 
 @pytest.mark.serial
+@pytest.mark.import_model
 class TestBedrockProvisionedThroughput:
     """Test create_provisioned_throughput as a standalone method.
 
diff --git a/sagemaker-serve/tests/integ/test_model_customization_deployment.py b/sagemaker-serve/tests/integ/test_model_customization_deployment.py
@@ -588,6 +588,7 @@ def test_bedrock_model_builder_creation(self, training_job):
                 f"BedrockModelBuilder creation failed: {str(e)}. This might be due to sagemaker-core integration issues.")
 
     @pytest.mark.slow
+    @pytest.mark.import_model
     def test_bedrock_job_created(self, deployed_model_arn):
         """Test that Bedrock import job was created successfully."""
         assert deployed_model_arn is not None
@@ -596,6 +597,7 @@ def test_bedrock_job_created(self, deployed_model_arn):
     # Documentation recommends retries: https://docs.aws.amazon.com/bedrock/latest/userguide/invoke-imported-model.html#handle-model-not-ready-exception.
     # TODO: Fix using provisioned throughput or better wait mechanism
     @pytest.mark.slow
+    @pytest.mark.import_model
     def test_bedrock_model_invoke(self, deployed_model_arn, bedrock_runtime):
         logger.warning(
             "This test is known to be flaky due to 'model not ready' exceptions from Bedrock. "
diff --git a/sagemaker-serve/tox.ini b/sagemaker-serve/tox.ini
@@ -65,6 +65,7 @@ markers =
     timeout: mark a test as a timeout.
     gpu_intensive: mark a test as GPU resource intensive (runs on scheduled CI, not PR checks).
     us_east_1: mark a test that requires us-east-1 test account credentials (784379639078).
+    import_model: mark a test that creates a Bedrock model import job. Concurrent model import jobs are capped at 1 by a non-raisable Bedrock service quota, so these run serially in a dedicated scheduled CI run, not in PR checks.
 
 [testenv]
 setenv =