Skip to content

Commit 7e54110

Browse files
committed
fix(integ): Fix region handling and add get-or-create Nova training job
The TestModelCustomizationDeployment integ tests were failing with DescribeTrainingJob 'Requested resource not found' because the SageMaker SDK caches the first session's region internally. The session-scoped cleanup_e2e_endpoints fixture (autouse) was creating a session in us-east-1 (default) before the class fixtures could set us-west-2, causing all subsequent TrainingJob.get calls to hit the wrong region.

Fix by setting AWS_DEFAULT_REGION=us-west-2 in the cleanup_e2e_endpoints fixture before any SageMaker session is created.

Add tests/integ/conftest.py with a session-scoped nova_training_job_name fixture that implements get-or-create:
- Checks if sdk-integ-nova-micro-sft exists and is Completed
- If InProgress, waits for completion
- If not found, uploads minimal training data to S3 and launches a Nova Micro SFT training job via SFTTrainer
- Reused across test_bedrock_nova_e2e.py and TestBedrockNovaDeployment in test_model_customization_deployment

Update both Nova test files to use the shared fixture instead of hardcoded training job names.
1 parent 3b6a716 commit 7e54110

3 files changed

Lines changed: 167 additions & 11 deletions

File tree

Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License"). You
4+
# may not use this file except in compliance with the License. A copy of
5+
# the License is located at
6+
#
7+
# http://aws.amazon.com/apache2.0/
8+
#
9+
# or in the "license" file accompanying this file. This file is
10+
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
11+
# ANY KIND, either express or implied. See the License for the specific
12+
# language governing permissions and limitations under the License.
13+
"""Shared fixtures for integration tests.
14+
15+
Provides a "get or create" Nova training job that is reused across all
16+
integ test modules in the session.
17+
"""
18+
from __future__ import absolute_import
19+
20+
import json
21+
import logging
22+
import time
23+
24+
import boto3
25+
import pytest
26+
27+
logger = logging.getLogger(__name__)

# ── Constants ───────────────────────────────────────────────────────────────

# Region used for every Nova-related client created in this module.
NOVA_REGION = "us-east-1"
# Fixed name that is looked up first so an existing job can be reused.
NOVA_TRAINING_JOB_NAME = "sdk-integ-nova-micro-sft"
# Model identifier handed to SFTTrainer when a new job has to be launched.
NOVA_MODEL_ID = "nova-textgeneration-micro"
# Derived from NOVA_REGION so the bucket prefix and the region cannot drift
# apart; the account id is appended at runtime to form the bucket name.
NOVA_BUCKET_PREFIX = f"sagemaker-{NOVA_REGION}"
35+
36+
37+
def _ensure_nova_training_data(s3, bucket, train_key):
    """Ensure the bucket and the minimal SFT dataset exist; return the dataset's S3 URI."""
    try:
        s3.head_bucket(Bucket=bucket)
    except s3.exceptions.ClientError as err:
        # Only service-side errors trigger a create attempt; credential or
        # connection problems propagate instead of being masked.
        logger.info("Bucket %s not reachable (%s), creating it", bucket, err)
        create_kwargs = {"Bucket": bucket}
        # S3 requires a LocationConstraint everywhere except us-east-1,
        # where it must be omitted.
        if NOVA_REGION != "us-east-1":
            create_kwargs["CreateBucketConfiguration"] = {
                "LocationConstraint": NOVA_REGION,
            }
        s3.create_bucket(**create_kwargs)

    train_uri = f"s3://{bucket}/{train_key}"

    # Only upload if not already there
    try:
        s3.head_object(Bucket=bucket, Key=train_key)
    except s3.exceptions.ClientError:
        rows = [
            json.dumps({
                "messages": [
                    {"role": "user", "content": f"What is {n} + {n}?"},
                    {"role": "assistant", "content": f"The answer is {n * 2}."},
                ]
            })
            for n in range(1, 21)
        ]
        s3.put_object(Bucket=bucket, Key=train_key, Body="\n".join(rows).encode())
        logger.info("Uploaded training data to %s", train_uri)

    return train_uri


def _launch_nova_sft_job(train_uri):
    """Start a Nova Micro SFT job via SFTTrainer without waiting; return its name."""
    import os

    # SFTTrainer receives no explicit session or region below, so the region is
    # supplied through AWS_DEFAULT_REGION for the duration of the launch.
    # NOTE(review): presumably the SDK resolves its region from this variable — confirm.
    previous_region = os.environ.get("AWS_DEFAULT_REGION")
    os.environ["AWS_DEFAULT_REGION"] = NOVA_REGION
    try:
        from sagemaker.train.sft_trainer import SFTTrainer

        trainer = SFTTrainer(
            model=NOVA_MODEL_ID,
            training_dataset=train_uri,
            accept_eula=True,
            model_package_group="sdk-integ-nova-models",
        )
        trainer.train(wait=False)
        launched_name = trainer._latest_training_job.training_job_name
        logger.info("Started training job: %s", launched_name)
        return launched_name
    finally:
        # Compare against None so a pre-existing empty value is restored
        # rather than removed.
        if previous_region is not None:
            os.environ["AWS_DEFAULT_REGION"] = previous_region
        else:
            os.environ.pop("AWS_DEFAULT_REGION", None)


def _get_or_create_nova_training_job():
    """Return the name of a completed Nova training job, creating one if needed.

    Looks up the fixed job name NOVA_TRAINING_JOB_NAME first:

    * ``Completed``  -> reuse it.
    * ``InProgress`` -> wait for it, then reuse it.
    * any other status (``Failed``, ``Stopped``, ``Stopping``) or not found
      -> upload the minimal dataset if missing, launch a new SFT job and
      wait for it.

    Returns:
        The name of a training job that has reached ``Completed``.

    Raises:
        RuntimeError: if the awaited job fails, is stopped, or times out
            (raised by ``_wait_for_training_job``).
        botocore ClientError: for any DescribeTrainingJob error other than
            "Requested resource not found".
    """
    sm = boto3.client("sagemaker", region_name=NOVA_REGION)

    # ── Check if the job already exists ─────────────────────────────────
    try:
        resp = sm.describe_training_job(TrainingJobName=NOVA_TRAINING_JOB_NAME)
    except sm.exceptions.ClientError as err:
        if "Requested resource not found" not in str(err):
            raise
        logger.info("Training job %s not found, creating...", NOVA_TRAINING_JOB_NAME)
    else:
        status = resp["TrainingJobStatus"]
        logger.info("Found existing training job %s (status=%s)", NOVA_TRAINING_JOB_NAME, status)

        if status == "Completed":
            return NOVA_TRAINING_JOB_NAME
        if status == "InProgress":
            logger.info("Training job in progress, waiting for completion...")
            _wait_for_training_job(sm, NOVA_TRAINING_JOB_NAME)
            return NOVA_TRAINING_JOB_NAME

        # Failed / Stopped / Stopping: the existing job will never be usable,
        # so it must not be returned as if it were complete.
        logger.warning(
            "Training job %s has status %s — launching a replacement job",
            NOVA_TRAINING_JOB_NAME, status,
        )

    # ── Upload minimal training data ────────────────────────────────────
    account_id = boto3.client("sts").get_caller_identity()["Account"]
    bucket = f"{NOVA_BUCKET_PREFIX}-{account_id}"
    s3 = boto3.client("s3", region_name=NOVA_REGION)
    train_uri = _ensure_nova_training_data(
        s3, bucket, "integ-test-data/nova-sft-train.jsonl"
    )

    # ── Launch training job via SFTTrainer ──────────────────────────────
    # NOTE(review): no job name is passed to SFTTrainer, so the launched job's
    # name comes from the trainer. Unless that equals NOVA_TRAINING_JOB_NAME,
    # later sessions will not find it by the fixed name — confirm and, if the
    # trainer supports it, pass the desired name explicitly.
    actual_name = _launch_nova_sft_job(train_uri)

    _wait_for_training_job(sm, actual_name)
    return actual_name
124+
125+
126+
def _wait_for_training_job(sm_client, job_name, poll_interval=30, max_wait=7200):
    """Poll a training job until it completes, fails, is stopped, or times out.

    Args:
        sm_client: SageMaker client exposing ``describe_training_job``.
        job_name: Name of the training job to watch.
        poll_interval: Seconds to sleep between status checks.
        max_wait: Maximum wall-clock seconds to wait before giving up.

    Returns:
        None, once the job status is ``Completed``.

    Raises:
        RuntimeError: if the job ends as ``Failed`` or ``Stopped``, or if it
            has not completed within ``max_wait`` seconds.
    """
    # A monotonic deadline counts API latency as well as sleep time, so the
    # total wait cannot silently exceed max_wait.
    started = time.monotonic()
    deadline = started + max_wait

    while True:
        resp = sm_client.describe_training_job(TrainingJobName=job_name)
        status = resp["TrainingJobStatus"]
        elapsed = int(time.monotonic() - started)
        logger.info("Training job %s status: %s (elapsed %ds)", job_name, status, elapsed)

        if status == "Completed":
            return
        if status in ("Failed", "Stopped"):
            reason = resp.get("FailureReason", "unknown")
            raise RuntimeError(
                f"Training job {job_name} ended with status {status}: {reason}"
            )

        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        # Never sleep past the deadline; the loop then performs one final
        # status check before timing out.
        time.sleep(min(poll_interval, remaining))

    raise RuntimeError(f"Timed out after {max_wait}s waiting for training job {job_name}")
143+
144+
145+
# ── Session-scoped fixtures ─────────────────────────────────────────────────
146+
147+
148+
@pytest.fixture(scope="session")
def nova_training_job_name():
    """Session-wide Nova training job name, looked up or created once and shared by all tests."""
    job_name = _get_or_create_nova_training_job()
    return job_name

sagemaker-serve/tests/integ/test_bedrock_nova_e2e.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,6 @@
3535

3636
logger = logging.getLogger(__name__)
3737

38-
# Nova training job in us-east-1 (from existing test pattern)
39-
NOVA_TRAINING_JOB_NAME = "nova-textgeneration-lite-v2-sft-20251202132123"
4038
REGION = "us-east-1"
4139

4240

@@ -68,11 +66,11 @@ def bedrock_client(boto_session):
6866

6967

7068
@pytest.fixture(scope="module")
71-
def training_job(region_env):
72-
"""Fetch the Nova training job."""
69+
def training_job(region_env, nova_training_job_name):
70+
"""Fetch the Nova training job using the shared get-or-create fixture."""
7371
session = boto3.Session(region_name=region_env)
7472
return TrainingJob.get(
75-
training_job_name=NOVA_TRAINING_JOB_NAME,
73+
training_job_name=nova_training_job_name,
7674
session=session,
7775
region=region_env,
7876
)

sagemaker-serve/tests/integ/test_model_customization_deployment.py

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -51,9 +51,14 @@ def endpoint_name():
5151
@pytest.fixture(scope="session", autouse=True)
5252
def cleanup_e2e_endpoints():
5353
"""Cleanup e2e endpoints before and after tests."""
54+
import os
5455
from sagemaker.core.resources import Endpoint
5556
from botocore.exceptions import ClientError
5657

58+
# Ensure region is set before any SageMaker session is created
59+
if "AWS_DEFAULT_REGION" not in os.environ:
60+
os.environ["AWS_DEFAULT_REGION"] = "us-west-2"
61+
5762
# Cleanup before tests
5863
try:
5964
for endpoint in Endpoint.get_all():
@@ -329,7 +334,9 @@ def setup_config(self, training_job_name):
329334
@pytest.fixture(scope="class")
330335
def training_job(self, setup_config):
331336
"""Get the training job."""
332-
return TrainingJob.get(training_job_name=setup_config["training_job_name"])
337+
return TrainingJob.get(
338+
training_job_name=setup_config["training_job_name"],
339+
)
333340

334341
@pytest.fixture(scope="class")
335342
def s3_client(self, setup_config):
@@ -553,7 +560,6 @@ def test_model_customization_workflow(training_job_name):
553560

554561
class TestBedrockNovaDeployment:
555562
"""Test suite for deploying Nova models to Bedrock."""
556-
NOVA_TRAINING_JOB_NAME = "nova-textgeneration-lite-v2-sft-20251202132123"
557563

558564
@pytest.fixture(scope="class", autouse=True)
559565
def setup_region(self):
@@ -568,14 +574,15 @@ def setup_region(self):
568574
os.environ.pop('AWS_DEFAULT_REGION', None)
569575

570576
@pytest.fixture(scope="class")
571-
def training_job(self, setup_region):
572-
"""Get Nova training job."""
577+
def training_job(self, setup_region, nova_training_job_name):
578+
"""Get Nova training job using the shared get-or-create fixture."""
573579
import boto3
574580
session = boto3.Session(region_name="us-east-1")
575581
return TrainingJob.get(
576-
training_job_name=self.NOVA_TRAINING_JOB_NAME,
582+
training_job_name=nova_training_job_name,
577583
session=session,
578-
region="us-east-1")
584+
region="us-east-1",
585+
)
579586

580587
@pytest.mark.skip(reason="Bedrock Nova deployment test skipped per team decision")
581588
def test_bedrock_model_builder_creation(self, training_job):

0 commit comments

Comments
 (0)