Skip to content

Commit 95560e9

Browse files
authored
Merge branch 'master' into master-mtrl-eval-issue-fix
2 parents 3dd7e63 + f93524b commit 95560e9

5 files changed

Lines changed: 51 additions & 8 deletions

File tree

.github/workflows/gpu-integ-tests.yml

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -79,13 +79,36 @@ jobs:
7979
project-name: sagemaker-python-sdk-ci-health-gpu-integ-tests
8080
source-version: refs/heads/master
8181

82+
# Bedrock model-import integ tests. Run serially (concurrency 1) in their own
83+
# CodeBuild project because the "Concurrent model import jobs" Bedrock quota is
84+
# fixed at 1 and not raisable; running them in parallel (as PR checks did)
85+
# makes them collide and flake. us-west-2 only (no us_east_1-marked tests).
86+
# Folded into the same run-level pass/fail metric as the GPU jobs below, so it
87+
# shares the GpuIntegRunAlarm rather than getting a separate alarm.
88+
import-model-integ-tests:
89+
needs: check-prior-success
90+
if: needs.check-prior-success.outputs.already_succeeded != 'true'
91+
runs-on: ubuntu-latest
92+
steps:
93+
- name: Configure AWS Credentials
94+
uses: aws-actions/configure-aws-credentials@v4
95+
with:
96+
role-to-assume: ${{ secrets.CI_AWS_ROLE_ARN }}
97+
aws-region: us-west-2
98+
role-duration-seconds: 10800
99+
- name: Run Bedrock Model-Import Integ Tests
100+
uses: aws-actions/aws-codebuild-run-build@v1
101+
with:
102+
project-name: sagemaker-python-sdk-ci-health-import-model-integ-tests
103+
source-version: refs/heads/master
104+
82105
# Run-level result: a run is successful only if both region jobs AND the Bedrock model-import job succeeded.
83106
# Emits GpuIntegRunFailure = 1 (failed) / 0 (succeeded) to CloudWatch in
84107
# us-west-2. The CDK alarm (GpuIntegRunAlarm) sums this over a UTC day and
85108
# cuts a daytime sev2 when all of the day's runs failed. Skipped when the gate
86109
# short-circuited today's run (an earlier run already succeeded).
87110
report-result:
88-
needs: [check-prior-success, gpu-integ-tests, gpu-integ-tests-us-east-1]
111+
needs: [check-prior-success, gpu-integ-tests, gpu-integ-tests-us-east-1, import-model-integ-tests]
89112
# Only emit the daily alarm metric for scheduled runs that actually executed
90113
# the test jobs:
91114
# - check-prior-success.result == 'success': if the gate job itself failed,
@@ -111,12 +134,13 @@ jobs:
111134
exit 0
112135
fi
113136
if [ "${{ needs.gpu-integ-tests.result }}" == "success" ] && \
114-
[ "${{ needs.gpu-integ-tests-us-east-1.result }}" == "success" ]; then
137+
[ "${{ needs.gpu-integ-tests-us-east-1.result }}" == "success" ] && \
138+
[ "${{ needs.import-model-integ-tests.result }}" == "success" ]; then
115139
value=0
116-
echo "Both region jobs succeeded; emitting GpuIntegRunFailure=0"
140+
echo "All region/import jobs succeeded; emitting GpuIntegRunFailure=0"
117141
else
118142
value=1
119-
echo "At least one region job did not succeed; emitting GpuIntegRunFailure=1"
143+
echo "At least one region/import job did not succeed; emitting GpuIntegRunFailure=1"
120144
fi
121145
aws cloudwatch put-metric-data \
122146
--namespace GpuIntegRunMetrics \

sagemaker-mlops/tox.ini

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ markers =
6464
release
6565
image_uris_unit_test
6666
timeout: mark a test as a timeout.
67+
serial: mark a test that must not run concurrently with others sharing the same resources.
6768

6869
[testenv]
6970
setenv =

sagemaker-serve/tests/integ/test_bedrock_provisioned_throughput.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,7 @@ def _setup_model_files(s3_artifacts_uri, s3_client):
170170

171171

172172
@pytest.mark.serial
173+
@pytest.mark.import_model
173174
class TestBedrockImportJobPolling:
174175
"""Test import job polling for OSS models (Option C: deploy only waits for import)."""
175176

@@ -236,6 +237,7 @@ def test_deploy_oss_model_waits_for_import_completion(
236237

237238

238239
@pytest.mark.serial
240+
@pytest.mark.import_model
239241
class TestBedrockProvisionedThroughput:
240242
"""Test create_provisioned_throughput as a standalone method.
241243

sagemaker-serve/tests/integ/test_model_customization_deployment.py

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,8 @@ def test_deploy_from_training_job(self, training_job_name, endpoint_name, cleanu
115115
from sagemaker.serve import ModelBuilder
116116
import time
117117

118+
from sagemaker.core.utils.exceptions import FailedStatusError
119+
118120
training_job = TrainingJob.get(training_job_name=training_job_name, region=AWS_REGION)
119121
model_builder = ModelBuilder(model=training_job, instance_type="ml.g5.4xlarge", sagemaker_session=sagemaker_session)
120122
model_builder.accept_eula = True
@@ -123,10 +125,21 @@ def test_deploy_from_training_job(self, training_job_name, endpoint_name, cleanu
123125
peft_type = model_builder._fetch_peft()
124126
adapter_name = f"{endpoint_name}-adapter"
125127

126-
endpoint = model_builder.deploy(
127-
endpoint_name=endpoint_name,
128-
inference_component_name=adapter_name if peft_type == "LORA" else None,
129-
)
128+
try:
129+
endpoint = model_builder.deploy(
130+
endpoint_name=endpoint_name,
131+
inference_component_name=adapter_name if peft_type == "LORA" else None,
132+
)
133+
except FailedStatusError as e:
134+
# Endpoint provisioning can fail when the region is temporarily out of
135+
# capacity for the requested instance type. This is an environmental
136+
# condition unrelated to the SDK, so xfail rather than fail the build.
137+
if "InsufficientInstanceCapacity" in str(e):
138+
cleanup_endpoints.append(endpoint_name)
139+
pytest.xfail(
140+
f"InsufficientInstanceCapacity for ml.g5.4xlarge in {AWS_REGION}: {e}"
141+
)
142+
raise
130143

131144
cleanup_endpoints.append(endpoint_name)
132145

@@ -575,6 +588,7 @@ def test_bedrock_model_builder_creation(self, training_job):
575588
f"BedrockModelBuilder creation failed: {str(e)}. This might be due to sagemaker-core integration issues.")
576589

577590
@pytest.mark.slow
591+
@pytest.mark.import_model
578592
def test_bedrock_job_created(self, deployed_model_arn):
579593
"""Test that Bedrock import job was created successfully."""
580594
assert deployed_model_arn is not None
@@ -583,6 +597,7 @@ def test_bedrock_job_created(self, deployed_model_arn):
583597
# Documentation recommends retries: https://docs.aws.amazon.com/bedrock/latest/userguide/invoke-imported-model.html#handle-model-not-ready-exception.
584598
# TODO: Fix using provisioned throughput or better wait mechanism
585599
@pytest.mark.slow
600+
@pytest.mark.import_model
586601
def test_bedrock_model_invoke(self, deployed_model_arn, bedrock_runtime):
587602
logger.warning(
588603
"This test is known to be flaky due to 'model not ready' exceptions from Bedrock. "

sagemaker-serve/tox.ini

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ markers =
6565
timeout: mark a test as a timeout.
6666
gpu_intensive: mark a test as GPU resource intensive (runs on scheduled CI, not PR checks).
6767
us_east_1: mark a test that requires us-east-1 test account credentials (784379639078).
68+
import_model: mark a test that creates a Bedrock model import job. Concurrent model import jobs are capped at 1 by a non-raisable Bedrock service quota, so these run serially in a dedicated scheduled CI run, not in PR checks.
6869

6970
[testenv]
7071
setenv =

0 commit comments

Comments
 (0)