Skip to content

Commit c475138

Browse files
committed
fix: aws_batch integ test resources are now uniquely named by test run.
1 parent 6192859 commit c475138

File tree

3 files changed

+46
-58
lines changed

3 files changed

+46
-58
lines changed

sagemaker-train/tests/integ/train/aws_batch/manager.py

Lines changed: 22 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -21,16 +21,18 @@ class BatchTestResourceManager:
2121
def __init__(
    self,
    batch_client,
    test_id,
    queue_name="pysdk-test-qm-queue",
    service_env_name="pysdk-test-qm-queue-service-environment",
    scheduling_policy_name="pysdk-test-qm-scheduling-policy",
    quota_share_name="pysdk-test-quota-share",
):
    """Store the Batch client and derive the per-run resource names.

    Every base name is combined with ``test_id`` as ``"<base>-<test_id>"``
    so that each test run works with its own set of resource names.

    Args:
        batch_client: Client object kept as ``self.batch_client``.
        test_id: Identifier appended to every resource name.
        queue_name: Base name of the job queue.
        service_env_name: Base name of the service environment.
        scheduling_policy_name: Base name of the scheduling policy.
        quota_share_name: Base name of the quota share.
    """
    self.batch_client = batch_client
    self.test_id = test_id

    # One shared suffix keeps all four resource names tied to the same run.
    suffix = f"-{test_id}"
    self.queue_name = f"{queue_name}{suffix}"
    self.service_environment_name = f"{service_env_name}{suffix}"
    self.scheduling_policy_name = f"{scheduling_policy_name}{suffix}"
    self.quota_share_name = f"{quota_share_name}{suffix}"
3436

3537
def _create_or_get_service_environment(self, service_environment_name):
3638
print(f"Creating service environment: {service_environment_name}")
@@ -277,65 +279,31 @@ def _delete_quota_share(self, quota_share_arn: str):
277279
print("Waiting for QuotaShare deletion to finish...")
278280
self._wait_for_quota_share_state(quota_share_arn, "DELETED", "DISABLED")
279281

280-
def get_or_create_resources(self):
    """Create or look up the resources named on this instance.

    The service environment and scheduling policy are resolved first, then
    the job queue (which is given both of their ARNs), then the quota share
    on that queue. The method waits for the queue and the quota share to
    reach the ``VALID`` / ``ENABLED`` state before moving on.

    Returns:
        tuple: ``(queue, service_environment, scheduling_policy,
        quota_share)`` exactly as returned by the ``_create_or_get_*``
        helpers.
    """
    env = self._create_or_get_service_environment(self.service_environment_name)
    policy = self._create_or_get_scheduling_policy(self.scheduling_policy_name)

    env_arn = env["serviceEnvironmentArn"]
    policy_arn = policy.get("arn")
    job_queue = self._create_or_get_queue(self.queue_name, env_arn, policy_arn)
    self._wait_for_queue_state(self.queue_name, "VALID", "ENABLED")

    share = self._create_or_get_quota_share(self.quota_share_name, self.queue_name)
    self._wait_for_quota_share_state(share["quotaShareArn"], "VALID", "ENABLED")

    return job_queue, env, policy, share
327294

295+
def delete_resources(self):
# Tear down the resources named on this instance. Statements appear in this
# order: quota share, job queue, scheduling policy, service environment.
# NOTE(review): this is a rendered diff -- lines following a "NNN-" gutter
# marker are the removed (pre-commit) form, lines following "NNN+" are the
# added form that reads the names from ``self``.
328296
# Get ARNs needed for deletion
329-
desc_jq = self.batch_client.describe_job_queues(jobQueues=[queue_name])
297+
desc_jq = self.batch_client.describe_job_queues(jobQueues=[self.queue_name])
330298
# Only build the quota-share ARN when the describe call returned a queue;
# the ARN is formed as "<job queue ARN>/quota-share/<quota share name>".
if desc_jq["jobQueues"]:
331299
jq_arn = desc_jq["jobQueues"][0]["jobQueueArn"]
332-
quota_share_arn = f"{jq_arn}/quota-share/{quota_share_name}"
300+
quota_share_arn = f"{jq_arn}/quota-share/{self.quota_share_name}"
333301
self._delete_quota_share(quota_share_arn)
334302

# NOTE(review): indentation is lost in this view; presumably the queue
# deletion runs regardless of the check above -- confirm against the file.
335-
self._delete_job_queue(queue_name)
303+
self._delete_job_queue(self.queue_name)
336304

# The scheduling policy is looked up by name and deleted by ARN only if found.
337-
sp = self._find_scheduling_policy(scheduling_policy_name)
305+
sp = self._find_scheduling_policy(self.scheduling_policy_name)
338306
if sp:
339307
self._delete_scheduling_policy(sp["arn"])
340308

# The service environment is the last resource removed.
341-
self._delete_service_environment(service_environment_name)
309+
self._delete_service_environment(self.service_environment_name)

sagemaker-train/tests/integ/train/aws_batch/test_queue.py

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
import boto3
1616
import botocore
1717
import pytest
18+
import random
19+
import string
1820

1921
from sagemaker.train.model_trainer import ModelTrainer
2022
from sagemaker.train.configs import SourceCode, InputData, Compute
@@ -29,16 +31,34 @@
2931
from .manager import BatchTestResourceManager
3032

3133

34+
class ShortId:
    """Generate short random identifiers used to name per-run test resources."""

    # Characters an ID may contain: lowercase ASCII letters and digits.
    ALPHABET = string.ascii_lowercase + string.digits
    DEFAULT_LENGTH = 8

    # OS-entropy-backed generator. Unlike the module-level ``random``
    # functions it ignores ``random.seed()``, so a seeded test process
    # (fixed seed for reproducibility, seeding plugins, ...) cannot make two
    # concurrent runs produce the same "unique" ID.
    _RNG = random.SystemRandom()

    @staticmethod
    def get(length=DEFAULT_LENGTH):
        """Return a random ID of ``length`` characters drawn from ``ALPHABET``.

        Args:
            length: Number of characters to generate (defaults to
                ``DEFAULT_LENGTH``).

        Returns:
            str: The generated identifier; empty when ``length`` is 0.
        """
        return "".join(ShortId._RNG.choices(ShortId.ALPHABET, k=length))
41+
42+
3243
@pytest.fixture(scope="module")
def batch_client():
    """Provide a module-scoped boto3 client for AWS Batch in ``us-west-2``."""
    client = boto3.client("batch", region_name="us-west-2")
    return client
3546

3647

3748
@pytest.fixture(scope="function")
def batch_test_resource_manager(batch_client):
    """Yield a ``BatchTestResourceManager`` whose resources exist for one test.

    A fresh random test ID is generated per invocation and passed to the
    manager, which uses it to name its AWS Batch resources. The resources are
    created before the test and deleted afterwards.

    Args:
        batch_client: The AWS Batch client fixture.

    Yields:
        BatchTestResourceManager: Manager whose resources have been created.

    Raises:
        Exception: Any error raised while creating the resources is logged
            and re-raised so pytest reports the real failure.
    """
    # Guarantee AWS Batch resource name uniqueness across concurrent test runs
    test_id = ShortId.get()
    print(f"Integration test ID (used in AWS Batch resource naming): {test_id}")

    resource_manager = BatchTestResourceManager(batch_client=batch_client, test_id=test_id)

    try:
        resource_manager.get_or_create_resources()
        yield resource_manager
    except Exception as e:
        print(f"Exception thrown while creating or yielding AWS Batch resources: {e}")
        # Re-raise: swallowing a setup failure would end the generator without
        # yielding, and pytest would report that instead of the real error.
        raise
    finally:
        # Always attempt cleanup, including after a partial or failed setup,
        # so uniquely named resources are not left behind.
        resource_manager.delete_resources()
4363

4464

v3-examples/training-examples/aws_batch/utils/aws_batch_resource_management.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -473,7 +473,7 @@ def create_quota_share(self, create_qs_request: dict):
473473
jobQueues=[create_qs_request["jobQueue"]]
474474
)
475475
jq_arn = desc_jqs_resp["jobQueues"][0]["jobQueueArn"]
476-
quota_share_arn = f"{jq_arn}/quota-share/{create_qs_request["quotaShareName"]}"
476+
quota_share_arn = f"{jq_arn}/quota-share/{create_qs_request['quotaShareName']}"
477477
return {
478478
"quotaShareName": create_qs_request["quotaShareName"],
479479
"quotaShareArn": quota_share_arn,

0 commit comments

Comments
 (0)