Skip to content

Commit d9b47ef

Browse files
authored
fix: aws_batch integ test resources are now uniquely named by test run. (#5666)
1 parent 10e3310 commit d9b47ef

File tree

4 files changed

+47
-59
lines changed

4 files changed

+47
-59
lines changed

sagemaker-train/tests/integ/train/aws_batch/manager.py

Lines changed: 22 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -21,16 +21,18 @@ class BatchTestResourceManager:
2121
def __init__(
    self,
    batch_client,
    test_id,
    queue_name="pysdk-test-qm-queue",
    service_env_name="pysdk-test-qm-queue-service-environment",
    scheduling_policy_name="pysdk-test-qm-scheduling-policy",
    quota_share_name="pysdk-test-quota-share",
):
    """Store the Batch client and derive the per-run resource names.

    Every resource name is the corresponding base name with ``-<test_id>``
    appended, so that separate test runs do not share AWS Batch resources.

    Args:
        batch_client: Client object used for the AWS Batch API calls.
        test_id: Identifier of the current test run; appended to every name.
        queue_name: Base name of the job queue.
        service_env_name: Base name of the service environment.
        scheduling_policy_name: Base name of the scheduling policy.
        quota_share_name: Base name of the quota share.

    Raises:
        ValueError: If ``test_id`` is ``None`` or renders as an empty string.
    """
    # A missing id would silently yield names such as "...-None" or a
    # trailing "-", which defeats the purpose of per-run naming.
    if test_id is None or str(test_id) == "":
        raise ValueError("test_id must be a non-empty value")

    self.test_id = test_id
    self.batch_client = batch_client
    self.queue_name = f"{queue_name}-{test_id}"
    self.service_environment_name = f"{service_env_name}-{test_id}"
    self.scheduling_policy_name = f"{scheduling_policy_name}-{test_id}"
    self.quota_share_name = f"{quota_share_name}-{test_id}"
3436

3537
def _create_or_get_service_environment(self, service_environment_name):
3638
print(f"Creating service environment: {service_environment_name}")
@@ -277,65 +279,31 @@ def _delete_quota_share(self, quota_share_arn: str):
277279
print("Waiting for QuotaShare deletion to finish...")
278280
self._wait_for_quota_share_state(quota_share_arn, "DELETED", "DISABLED")
279281

280-
def get_or_create_resources(self):
    """Create or look up the run's Batch resources and return them.

    Resources are requested in this order: service environment, scheduling
    policy, job queue, quota share. After the queue and the quota share are
    requested, the method waits for each to report VALID / ENABLED.

    Returns:
        tuple: ``(queue, service_environment, scheduling_policy, quota_share)``
        exactly as returned by the respective ``_create_or_get_*`` helpers.
    """
    service_environment = self._create_or_get_service_environment(
        self.service_environment_name
    )
    scheduling_policy = self._create_or_get_scheduling_policy(
        self.scheduling_policy_name
    )

    # The queue is attached to both the service environment and the policy.
    service_environment_arn = service_environment["serviceEnvironmentArn"]
    scheduling_policy_arn = scheduling_policy.get("arn")
    queue = self._create_or_get_queue(
        self.queue_name, service_environment_arn, scheduling_policy_arn
    )
    self._wait_for_queue_state(self.queue_name, "VALID", "ENABLED")

    quota_share = self._create_or_get_quota_share(self.quota_share_name, self.queue_name)
    quota_share_arn = quota_share["quotaShareArn"]
    self._wait_for_quota_share_state(quota_share_arn, "VALID", "ENABLED")

    return queue, service_environment, scheduling_policy, quota_share
327294

295+
def delete_resources(self):
    """Delete the run's Batch resources.

    Deletion order: quota share (only when the job queue is found), job
    queue, scheduling policy (only when it is found), service environment.
    """
    # NOTE(review): the nesting of the two `if` bodies below is inferred from
    # the blank-line layout of the change; confirm against the real file.

    # Get ARNs needed for deletion
    described = self.batch_client.describe_job_queues(jobQueues=[self.queue_name])
    job_queues = described["jobQueues"]
    if job_queues:
        # The quota share ARN is built from the ARN of its job queue.
        queue_arn = job_queues[0]["jobQueueArn"]
        self._delete_quota_share(f"{queue_arn}/quota-share/{self.quota_share_name}")

    self._delete_job_queue(self.queue_name)

    policy = self._find_scheduling_policy(self.scheduling_policy_name)
    if policy:
        self._delete_scheduling_policy(policy["arn"])

    self._delete_service_environment(self.service_environment_name)

sagemaker-train/tests/integ/train/aws_batch/test_queue.py

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
import boto3
1616
import botocore
1717
import pytest
18+
import random
19+
import string
1820

1921
from sagemaker.train.model_trainer import ModelTrainer
2022
from sagemaker.train.configs import SourceCode, InputData, Compute
@@ -29,16 +31,34 @@
2931
from .manager import BatchTestResourceManager
3032

3133

34+
class ShortId:
35+
ALPHABET = string.ascii_lowercase + string.digits
36+
DEFAULT_LENGTH = 8
37+
38+
@staticmethod
39+
def get(length=DEFAULT_LENGTH):
40+
return "".join(random.choices(ShortId.ALPHABET, k=length))
41+
42+
3243
@pytest.fixture(scope="module")
def batch_client():
    """Provide a module-scoped boto3 client for AWS Batch in us-west-2."""
    client = boto3.client("batch", region_name="us-west-2")
    return client
3546

3647

3748
@pytest.fixture(scope="function")
def batch_test_resource_manager(batch_client):
    """Yield a resource manager whose AWS Batch resources exist for one test.

    A fresh test id is generated for every invocation and passed to the
    manager. The resources are created before the test body runs and are
    deleted afterwards, including when setup fails part-way.
    """
    # Guarantee AWS Batch resource name uniqueness across concurrent test runtimes
    test_id = ShortId.get()
    print(f"Integration test ID (used in AWS Batch resource naming): {test_id}")

    resource_manager = BatchTestResourceManager(batch_client=batch_client, test_id=test_id)

    try:
        resource_manager.get_or_create_resources()
        yield resource_manager
    except Exception as e:
        print(f"Exception thrown while creating or yielding AWS Batch resources: {str(e)}")
        # Re-raise: swallowing a setup failure makes the generator finish
        # without yielding, and pytest then reports "did not yield a value"
        # instead of the real error.
        raise
    finally:
        # Runs on success, on failure, and when the generator is closed, so
        # partially created resources are not left behind.
        resource_manager.delete_resources()
4363

4464

v3-examples/training-examples/aws_batch/sm-training-queues_quota-management.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -177,7 +177,7 @@
177177
"metadata": {},
178178
"source": [
179179
"## Create TrainingQueue object\n",
180-
"Using our queue is as easy as referring to it by name in the TrainingQueue contructor. The TrainingQueue class within the SageMaker Python SDK provides built in support for working with Batch queues."
180+
"Using our queue is as easy as referring to it by name in the TrainingQueue constructor. The TrainingQueue class within the SageMaker Python SDK provides built in support for working with Batch queues."
181181
]
182182
},
183183
{

v3-examples/training-examples/aws_batch/utils/aws_batch_resource_management.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -473,7 +473,7 @@ def create_quota_share(self, create_qs_request: dict):
473473
jobQueues=[create_qs_request["jobQueue"]]
474474
)
475475
jq_arn = desc_jqs_resp["jobQueues"][0]["jobQueueArn"]
476-
quota_share_arn = f"{jq_arn}/quota-share/{create_qs_request["quotaShareName"]}"
476+
quota_share_arn = f"{jq_arn}/quota-share/{create_qs_request['quotaShareName']}"
477477
return {
478478
"quotaShareName": create_qs_request["quotaShareName"],
479479
"quotaShareArn": quota_share_arn,

0 commit comments

Comments
 (0)