Skip to content

Commit 796bdf3

Browse files
authored
test: fix canaries-v3 (#5940)
* chore: trigger PR check to run integ tests * test(serve): eagerly reap leaked provisioned throughputs in bedrock PT test The provisioned throughput integ test only deleted the PT it created in its own teardown, so a process killed before teardown leaked the PT. These accumulated and exhausted the model-unit quota, causing ServiceQuotaExceededException in later runs. Add eager cleanup in the bedrock_client fixture to delete test-pt-integ-* throughputs older than 2h on setup, matching the import-job test's self-healing pattern. * test: make evaluator integ fixtures race-safe under pytest-xdist The MTRL evaluator integ test fixtures used a check-then-create pattern for the shared model package group and package. Under `pytest -n auto`, concurrent workers race: both see the group missing, both call CreateModelPackageGroup, and the loser gets "ValidationException: Model Package Group already exists", which crashed the module-scoped fixture and errored the dependent test (test_model_package_config_fields). Wrap the create calls and treat "already exists" / concurrent creation as success, falling back to reusing the existing resource.
1 parent 6e3a6b4 commit 796bdf3

3 files changed

Lines changed: 91 additions & 10 deletions

File tree

sagemaker-core/src/sagemaker/core/user_agent.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,3 +74,5 @@ def get_user_agent_extra_suffix():
7474
suffix = "{} md/{}#{}".format(suffix, STUDIO_PREFIX, studio_app_type)
7575

7676
return suffix
77+
78+
# Trigger PR check: run full integ test suite.

sagemaker-serve/tests/integ/test_bedrock_provisioned_throughput.py

Lines changed: 52 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
import time
1818
import random
1919
import logging
20+
from datetime import datetime, timezone, timedelta
2021
from urllib.parse import urlparse
2122

2223
import boto3
@@ -43,10 +44,59 @@ def role_arn():
4344
return get_execution_role()
4445

4546

47+
# Prefix used for all provisioned throughputs created by this test module.
PT_TEST_PREFIX = "test-pt-integ-"
# Provisioned throughputs older than this are considered leaked and reaped on setup.
PT_STALE_AGE = timedelta(hours=2)
# Statuses for which eager cleanup attempts a delete; any other status is skipped.
_PT_REAPABLE_STATUSES = ("InService", "Failed")


def _is_stale_test_throughput(pt, cutoff):
    """Return True if ``pt`` is a leaked test provisioned throughput to reap.

    Args:
        pt: One entry of ``provisionedModelSummaries`` from
            ``list_provisioned_model_throughputs``.
        cutoff: Timezone-aware datetime; only throughputs created strictly
            before it are considered stale.

    Returns:
        True only when the name starts with ``PT_TEST_PREFIX``, the creation
        time is known and older than ``cutoff``, and the status is one of
        ``_PT_REAPABLE_STATUSES``.
    """
    if not pt.get("provisionedModelName", "").startswith(PT_TEST_PREFIX):
        return False
    created = pt.get("creationTime")
    # Fail closed: without a creation time we cannot prove the throughput is
    # old, and deleting it could race a concurrent run that just created it.
    if not created or created >= cutoff:
        return False
    return pt.get("status") in _PT_REAPABLE_STATUSES


def _reap_stale_provisioned_throughputs(client):
    """Delete every stale ``test-pt-integ-*`` provisioned throughput.

    Walks all pages of ``list_provisioned_model_throughputs``. A failure to
    delete one throughput is logged and does not stop the sweep; a failure to
    list propagates to the caller.

    Args:
        client: Bedrock client used for the list and delete calls.
    """
    cutoff = datetime.now(timezone.utc) - PT_STALE_AGE
    next_token = None
    while True:
        params = {"maxResults": 100}
        if next_token:
            params["nextToken"] = next_token
        response = client.list_provisioned_model_throughputs(**params)
        for pt in response.get("provisionedModelSummaries", []):
            if not _is_stale_test_throughput(pt, cutoff):
                continue
            name = pt["provisionedModelName"]
            try:
                logger.info("Eager cleanup of stale provisioned throughput: %s", name)
                client.delete_provisioned_model_throughput(
                    provisionedModelId=pt["provisionedModelArn"]
                )
            except Exception as e:
                # Best effort: one undeletable throughput must not block the rest.
                logger.warning("Eager cleanup failed for %s: %s", name, e)
        next_token = response.get("nextToken")
        if not next_token:
            break


@pytest.fixture(scope="module")
def bedrock_client():
    """Create Bedrock client and eagerly reap leaked test provisioned throughputs.

    Provisioned throughputs cost money and consume a small, easily-exhausted
    model-unit quota. A test process killed before its teardown runs (CodeBuild
    timeout, worker crash, etc.) leaks its PT, and these accumulate across runs
    until the quota is full and CreateProvisionedModelThroughput starts failing.

    To stay self-healing, on setup we delete any ``test-pt-integ-*`` PT older
    than PT_STALE_AGE. The age guard avoids racing a PT that another concurrent
    run just created; a PT whose creation time is unknown is left alone for the
    same reason.

    Returns:
        The Bedrock client for ``AWS_REGION``. Cleanup is best effort: any
        error while listing is logged as a warning and the client is still
        returned.
    """
    client = boto3.client("bedrock", region_name=AWS_REGION)

    try:
        _reap_stale_provisioned_throughputs(client)
    except Exception as e:
        logger.warning("Failed to list provisioned throughputs for eager cleanup: %s", e)

    return client
50100

51101

52102
@pytest.fixture(scope="module")

sagemaker-train/tests/integ/train/test_mtrl_evaluator.py

Lines changed: 37 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -60,18 +60,37 @@ def test_config():
6060

6161

6262
def _ensure_model_package_group_exists(sm_client, group_name):
    """Make sure the model package group ``group_name`` exists.

    Safe under pytest-xdist (``-n auto``), where several workers may run this
    at the same time. The group is probed first and created only if the probe
    fails. If another worker creates it in between, the resulting
    "already exists" error from the create call is treated as success.

    Args:
        sm_client: SageMaker client providing ``describe_model_package_group``
            and ``create_model_package_group``.
        group_name: Name of the model package group to ensure.

    Raises:
        Exception: Any create failure whose message does not contain
            "already exists" is re-raised unchanged.
    """
    try:
        sm_client.describe_model_package_group(ModelPackageGroupName=group_name)
    except Exception:
        # Any describe failure is taken to mean "not there yet"; try to create.
        group_found = False
    else:
        group_found = True

    if group_found:
        return

    create_request = {
        "ModelPackageGroupName": group_name,
        "ModelPackageGroupDescription": "Auto-created for MTRL evaluator integ tests",
    }
    try:
        sm_client.create_model_package_group(**create_request)
    except Exception as create_error:
        # A concurrent worker winning the create race is fine; anything else is not.
        if "already exists" not in str(create_error):
            raise
7186

7287

7388
def _ensure_model_package_exists(sm_client, group_name, base_model_name):
74-
"""Create a model package in the group if none exists, for test purposes."""
89+
"""Create a model package in the group if none exists, for test purposes.
90+
91+
Race-safe: if a concurrent worker creates one between our list and create,
92+
fall back to listing again and reusing whatever package now exists.
93+
"""
7594
resp = sm_client.list_model_packages(
7695
ModelPackageGroupName=group_name,
7796
MaxResults=1,
@@ -80,12 +99,22 @@ def _ensure_model_package_exists(sm_client, group_name, base_model_name):
8099
return resp["ModelPackageSummaryList"][0]["ModelPackageArn"]
81100

82101
# Create a minimal unversioned model package (no InferenceSpecification needed)
83-
resp = sm_client.create_model_package(
84-
ModelPackageGroupName=group_name,
85-
ModelPackageDescription="Test model package for MTRL evaluator integ tests",
86-
ModelApprovalStatus="Approved",
87-
)
88-
return resp["ModelPackageArn"]
102+
try:
103+
resp = sm_client.create_model_package(
104+
ModelPackageGroupName=group_name,
105+
ModelPackageDescription="Test model package for MTRL evaluator integ tests",
106+
ModelApprovalStatus="Approved",
107+
)
108+
return resp["ModelPackageArn"]
109+
except Exception:
110+
# A concurrent worker may have created one; reuse the existing package.
111+
resp = sm_client.list_model_packages(
112+
ModelPackageGroupName=group_name,
113+
MaxResults=1,
114+
)
115+
if resp.get("ModelPackageSummaryList"):
116+
return resp["ModelPackageSummaryList"][0]["ModelPackageArn"]
117+
raise
89118

90119

91120
@pytest.fixture(scope="module")

0 commit comments

Comments
 (0)