Skip to content

Commit ccb0e37

Browse files
authored
Merge branch 'master' into feature/lumen-ai-inference-recommender
2 parents 19cba63 + 7444009 commit ccb0e37

19 files changed

Lines changed: 299 additions & 69 deletions

File tree

.github/workflows/gpu-integ-tests.yml

Lines changed: 15 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -22,3 +22,18 @@ jobs:
2222
with:
2323
project-name: sagemaker-python-sdk-ci-health-gpu-integ-tests
2424
source-version: refs/heads/master
25+
26+
gpu-integ-tests-us-east-1:
27+
runs-on: ubuntu-latest
28+
steps:
29+
- name: Configure AWS Credentials (us-east-1)
30+
uses: aws-actions/configure-aws-credentials@v4
31+
with:
32+
role-to-assume: ${{ secrets.CI_AWS_ROLE_US_EAST_1_ARN }}
33+
aws-region: us-east-1
34+
role-duration-seconds: 10800
35+
- name: Run GPU Integ Tests (us-east-1)
36+
uses: aws-actions/aws-codebuild-run-build@v1
37+
with:
38+
project-name: sagemaker-python-sdk-ci-health-gpu-integ-tests
39+
source-version: refs/heads/master

.github/workflows/pr-checks-master.yml

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -215,3 +215,21 @@ jobs:
215215
with:
216216
project-name: ${{ github.event.repository.name }}-ci-${{ matrix.submodule }}-integ-tests
217217
source-version-override: 'refs/pull/${{ github.event.pull_request.number }}/head^{${{ github.event.pull_request.head.sha }}}'
218+
219+
integ-tests-us-east-1:
220+
runs-on: ubuntu-latest
221+
needs: [detect-changes]
222+
if: contains(fromJson(needs.detect-changes.outputs.submodules), 'sagemaker-train')
223+
steps:
224+
- name: Configure AWS Credentials (us-east-1)
225+
uses: aws-actions/configure-aws-credentials@v4
226+
with:
227+
role-to-assume: ${{ secrets.CI_AWS_ROLE_US_EAST_1_ARN }}
228+
aws-region: us-east-1
229+
role-duration-seconds: 10800
230+
231+
- name: Run us-east-1 Integ Tests for sagemaker-train
232+
uses: aws-actions/aws-codebuild-run-build@v1
233+
with:
234+
project-name: ${{ github.event.repository.name }}-ci-sagemaker-train-integ-tests
235+
source-version-override: 'refs/pull/${{ github.event.pull_request.number }}/head^{${{ github.event.pull_request.head.sha }}}'

sagemaker-core/src/sagemaker/core/remote_function/job.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -860,6 +860,10 @@ def _get_default_spark_image(session):
860860
except ImportError:
861861
pass
862862

863+
# Spark 3.3 and below do not support py312; use 3.5 which supports both py39 and py312
864+
if py_version == "312" and spark_version in ("2.4", "3.0", "3.1", "3.2", "3.3"):
865+
spark_version = "3.5"
866+
863867
image_uri = image_uris.retrieve(
864868
framework=SPARK_NAME,
865869
region=region,

sagemaker-core/tests/integ/remote_function/conftest.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,52 @@ def spark_test_container(sagemaker_session, sagemaker_sdk_tar_path, tmp_path_fac
171171
)
172172

173173

174+
@pytest.fixture(scope="session")
175+
def spark_pre_execution_commands(sagemaker_session):
176+
"""Build sagemaker-core wheel, upload to S3, and return pre-execution install commands.
177+
178+
This mirrors the pattern used in sagemaker-mlops feature_processor integ tests.
179+
The Spark processing image does not have sagemaker-core pre-installed, so we must
180+
build the local dev wheel and install it in the container via pre_execution_commands.
181+
"""
182+
import subprocess
183+
import glob
184+
import tempfile
185+
from sagemaker.core.s3 import S3Uploader
186+
187+
repo_root = os.path.abspath(
188+
os.path.join(os.path.dirname(__file__), "..", "..", "..", "..")
189+
)
190+
core_dir = os.path.join(repo_root, "sagemaker-core")
191+
192+
with tempfile.TemporaryDirectory() as dist_dir:
193+
subprocess.run(
194+
f"python -m build --wheel --outdir {dist_dir}",
195+
shell=True,
196+
cwd=core_dir,
197+
check=True,
198+
)
199+
wheels = glob.glob(os.path.join(dist_dir, "sagemaker_core-*.whl"))
200+
if not wheels:
201+
raise FileNotFoundError(f"No sagemaker-core wheel found in {dist_dir}")
202+
wheel_path = wheels[0]
203+
wheel_name = os.path.basename(wheel_path)
204+
205+
s3_prefix = "s3://{}/spark-integ-test/wheels".format(
206+
sagemaker_session.default_bucket()
207+
)
208+
S3Uploader.upload(wheel_path, s3_prefix, sagemaker_session=sagemaker_session)
209+
210+
PIP = "python3 -m pip install --root-user-action=ignore"
211+
AWS = "python3 -m awscli"
212+
cmds = [
213+
f"{PIP} awscli",
214+
f"{AWS} s3 cp {s3_prefix}/{wheel_name} /tmp/{wheel_name}",
215+
f"{PIP} /tmp/{wheel_name}",
216+
]
217+
return cmds
218+
219+
174220
@pytest.fixture(scope="session")
175221
def conda_env_yml():
176222
"""Write conda yml file needed for tests."""

sagemaker-core/tests/integ/remote_function/test_decorator.py

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -574,16 +574,18 @@ def my_func():
574574
assert client_error_message in str(error)
575575

576576

577-
@pytest.mark.skipif(
578-
sys.version_info[:2] not in [(3, 9), (3, 12)],
579-
reason="SageMaker Spark image only available for Python 3.9 and 3.12",
580-
)
581-
def test_decorator_with_spark_job(sagemaker_session, cpu_instance_type):
577+
# @pytest.mark.skipif(
578+
# sys.version_info[:2] not in [(3, 9), (3, 12)],
579+
# reason="SageMaker Spark image only available for Python 3.9 and 3.12",
580+
# )
581+
@pytest.mark.spark_py312
582+
def test_decorator_with_spark_job(sagemaker_session, cpu_instance_type, spark_pre_execution_commands):
582583
@remote(
583584
role=ROLE,
584585
instance_type=cpu_instance_type,
585586
sagemaker_session=sagemaker_session,
586587
keep_alive_period_in_seconds=60,
588+
pre_execution_commands=spark_pre_execution_commands,
587589
spark_config=SparkConfig(
588590
configuration=[
589591
{
@@ -598,7 +600,14 @@ def test_spark_transform():
598600

599601
spark = SparkSession.builder.getOrCreate()
600602

601-
assert spark.conf.get("spark.app.name") == "remote-spark-test"
603+
# Avoid bare assert here: pytest's assertion rewriting injects _pytest
604+
# module references into the function bytecode, which causes
605+
# deserialization to fail in the Spark container (no pytest installed).
606+
app_name = spark.conf.get("spark.app.name")
607+
if app_name != "remote-spark-test":
608+
raise RuntimeError(
609+
f"Expected spark.app.name='remote-spark-test', got '{app_name}'"
610+
)
602611

603612
test_spark_transform()
604613

sagemaker-mlops/tests/integ/feature_store/feature_processor/test_feature_processor_integ.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -798,11 +798,11 @@ def transform(raw_s3_data_as_df):
798798
# sys.version_info[:2] not in [(3, 9), (3, 12)],
799799
# reason=f"SageMaker Spark image only supports Python 3.9 and 3.12, got {sys.version_info[:2]}",
800800
# )
801-
@pytest.mark.skip(
802-
reason="Lake Formation credential vending (GetTemporaryGlueTableCredentials) requires "
803-
"full LF environment setup (resource registration, trust policy, data location grants) "
804-
"that is not configured in CI. See quip-amazon.com/S3FEAMMMuKm0 for details."
805-
)
801+
# @pytest.mark.skip(
802+
# reason="Lake Formation credential vending (GetTemporaryGlueTableCredentials) requires "
803+
# "full LF environment setup (resource registration, trust policy, data location grants) "
804+
# "that is not configured in CI. See quip-amazon.com/S3FEAMMMuKm0 for details."
805+
# )
806806
@pytest.mark.spark_py312
807807
@pytest.mark.slow_test
808808
def test_to_pipeline_and_execute_with_lake_formation(

sagemaker-train/src/sagemaker/train/container_drivers/distributed_drivers/mpi_driver.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,11 @@
1717
import sys
1818
import json
1919

20-
from sagemaker.train.container_drivers.distributed_drivers.mpi_utils import (
20+
from pathlib import Path
21+
22+
sys.path.insert(0, str(Path(__file__).parent.parent))
23+
24+
from distributed_drivers.mpi_utils import ( # noqa: E402 # pylint: disable=C0413,E0611
2125
start_sshd_daemon,
2226
bootstrap_master_node,
2327
bootstrap_worker_node,
@@ -27,7 +31,7 @@
2731
)
2832

2933

30-
from sagemaker.train.container_drivers.common.utils import (
34+
from common.utils import ( # noqa: E402 # pylint: disable=C0413,E0611
3135
logger,
3236
hyperparameters_to_cli_args,
3337
get_process_count,

sagemaker-train/tests/integ/ai_registry/test_dataset.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -79,8 +79,9 @@ def test_create_dataset_from_s3_oss_dpo(self, unique_name, test_bucket, cleanup_
7979
assert dataset.name == unique_name
8080
assert dataset.customization_technique == CustomizationTechnique.DPO
8181

82+
@pytest.mark.us_east_1
8283
def test_create_dataset_from_s3_nova_sft(self, unique_name, test_bucket, cleanup_list):
83-
"""Test creating RLVR dataset from S3 URI."""
84+
"""Test creating Nova SFT dataset from S3 URI."""
8485
s3_uri = f"s3://{test_bucket}/test_datasets/Nova/nova_sft_train.jsonl"
8586
dataset = DataSet.create(
8687
name=unique_name,
@@ -92,8 +93,9 @@ def test_create_dataset_from_s3_nova_sft(self, unique_name, test_bucket, cleanup
9293
assert dataset.name == unique_name
9394
assert dataset.customization_technique == CustomizationTechnique.SFT
9495

96+
@pytest.mark.us_east_1
9597
def test_create_dataset_from_s3_nova_dpo(self, unique_name, test_bucket, cleanup_list):
96-
"""Test creating RLVR dataset from S3 URI."""
98+
"""Test creating Nova DPO dataset from S3 URI."""
9799
s3_uri = f"s3://{test_bucket}/test_datasets/Nova/nova_dpo_train.jsonl"
98100
dataset = DataSet.create(
99101
name=unique_name,
@@ -105,8 +107,9 @@ def test_create_dataset_from_s3_nova_dpo(self, unique_name, test_bucket, cleanup
105107
assert dataset.name == unique_name
106108
assert dataset.customization_technique == CustomizationTechnique.DPO
107109

110+
@pytest.mark.us_east_1
108111
def test_create_dataset_from_s3_nova_rft(self, unique_name, test_bucket, cleanup_list):
109-
"""Test creating RLVR dataset from S3 URI."""
112+
"""Test creating Nova RFT dataset from S3 URI."""
110113
s3_uri = f"s3://{test_bucket}/test_datasets/Nova/nova_rft_train.jsonl"
111114
dataset = DataSet.create(
112115
name=unique_name,
@@ -118,8 +121,9 @@ def test_create_dataset_from_s3_nova_rft(self, unique_name, test_bucket, cleanup
118121
assert dataset.name == unique_name
119122
assert dataset.customization_technique == CustomizationTechnique.RLVR
120123

124+
@pytest.mark.us_east_1
121125
def test_create_dataset_from_s3_nova_eval(self, unique_name, test_bucket, cleanup_list):
122-
"""Test creating RLVR dataset from S3 URI."""
126+
"""Test creating Nova eval dataset from S3 URI."""
123127
s3_uri = f"s3://{test_bucket}/test_datasets/Nova/nova_eval.jsonl"
124128
dataset = DataSet.create(
125129
name=unique_name,

sagemaker-train/tests/integ/train/test_benchmark_evaluator.py

Lines changed: 31 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,6 @@
2323
EvaluationPipelineExecution,
2424
)
2525

26-
pytestmark = pytest.mark.gpu_intensive
27-
2826
# Configure logging
2927
logging.basicConfig(
3028
level=logging.INFO,
@@ -63,13 +61,11 @@
6361
"region": "us-west-2",
6462
}
6563

66-
# Nova model evaluation configuration (from commented section in notebook)
64+
# Nova model evaluation configuration (uses dedicated test account in us-east-1)
6765
NOVA_CONFIG = {
68-
"model_package_arn": "arn:aws:sagemaker:us-east-1:052150106756:model-package/test-nova-finetuned-models/3",
69-
"dataset_s3_uri": "s3://sagemaker-us-east-1-052150106756/studio-users/d20251107t195443/datasets/2025-11-07T19-55-37-609Z/zc_test.jsonl",
70-
"s3_output_path": "s3://mufi-test-serverless-iad/eval/",
71-
"mlflow_tracking_server_arn": "arn:aws:sagemaker:us-east-1:052150106756:mlflow-tracking-server/mlflow-prod-server",
72-
"model_package_group_arn": "arn:aws:sagemaker:us-east-1:052150106756:model-package-group/test-nova-finetuned-models",
66+
"dataset_s3_uri": "s3://sagemaker-us-east-1-784379639078/model-customization/eval/zc_test.jsonl",
67+
"s3_output_path": "s3://sagemaker-us-east-1-784379639078/model-customization/eval/",
68+
"model_package_group_arn": "arn:aws:sagemaker:us-east-1:784379639078:model-package-group/sdk-test-finetuned-models",
7369
"region": "us-east-1",
7470
}
7571

@@ -288,7 +284,8 @@ def test_benchmark_subtasks_validation(self):
288284

289285
logger.info("Subtask validation tests passed")
290286

291-
@pytest.mark.skip(reason="Pipeline creation fails - under investigation")
287+
# @pytest.mark.skip(reason="Pipeline creation fails - under investigation")
288+
@pytest.mark.gpu_intensive
292289
def test_benchmark_evaluation_base_model_only(self):
293290
"""
294291
Test benchmark evaluation with base model only (no fine-tuned model).
@@ -341,28 +338,45 @@ def test_benchmark_evaluation_base_model_only(self):
341338
assert execution.status.overall_status == "Succeeded"
342339
logger.info("Base model only evaluation completed successfully")
343340

344-
@pytest.mark.skip(reason="Requires us-east-1 test infrastructure - tracked in AI-5")
341+
@pytest.mark.gpu_intensive
342+
@pytest.mark.us_east_1
345343
def test_benchmark_evaluation_nova_model(self):
346344
"""
347345
Test benchmark evaluation with Nova model.
348346
349347
This test uses a Nova fine-tuned model package in us-east-1 region.
350348
Configuration from commented section in benchmark_demo.ipynb.
351349
352-
Note: This test is currently skipped. Remove the @pytest.mark.skip decorator
353-
when you want to enable it.
350+
Note: This test requires a model package to exist in the model package group.
351+
It should be run after a successful SFT or RLVR training job has produced one.
354352
"""
353+
import boto3
354+
355355
# Get benchmarks
356356
Benchmark = get_benchmarks()
357357

358+
# Dynamically find the latest model package in the group
359+
sm_client = boto3.client("sagemaker", region_name=NOVA_CONFIG["region"])
360+
packages = sm_client.list_model_packages(
361+
ModelPackageGroupName="sdk-test-finetuned-models",
362+
SortBy="CreationTime",
363+
SortOrder="Descending",
364+
MaxResults=1,
365+
)
366+
367+
if not packages["ModelPackageSummaryList"]:
368+
pytest.skip("No model packages available in sdk-test-finetuned-models group. Run SFT/RLVR training first.")
369+
370+
model_package_arn = packages["ModelPackageSummaryList"][0]["ModelPackageArn"]
371+
logger.info(f"Using model package: {model_package_arn}")
372+
358373
logger.info("Creating BenchmarkEvaluator with Nova model")
359374

360375
# Create evaluator with Nova model package
361376
evaluator = BenchMarkEvaluator(
362377
benchmark=Benchmark.MMLU,
363-
model=NOVA_CONFIG["model_package_arn"],
378+
model=model_package_arn,
364379
s3_output_path=NOVA_CONFIG["s3_output_path"],
365-
mlflow_resource_arn=NOVA_CONFIG["mlflow_tracking_server_arn"],
366380
model_package_group=NOVA_CONFIG["model_package_group_arn"],
367381
base_eval_name="integ-test-nova-eval",
368382
region=NOVA_CONFIG["region"],
@@ -371,7 +385,7 @@ def test_benchmark_evaluation_nova_model(self):
371385
# Verify evaluator was created
372386
assert evaluator is not None
373387
assert evaluator.benchmark == Benchmark.MMLU
374-
assert evaluator.model == NOVA_CONFIG["model_package_arn"]
388+
assert evaluator.model == model_package_arn
375389
assert evaluator.region == NOVA_CONFIG["region"]
376390

377391
logger.info(f"Created evaluator: {evaluator.base_eval_name}")
@@ -401,8 +415,8 @@ def test_benchmark_evaluation_nova_model(self):
401415
logger.info(f"Status after refresh: {execution.status.overall_status}")
402416

403417
# Wait for completion
404-
logger.info("Waiting for evaluation to complete (timeout: 1 hour)")
405-
execution.wait(target_status="Succeeded", poll=30, timeout=3600)
418+
logger.info("Waiting for evaluation to complete (timeout: 3 hours)")
419+
execution.wait(target_status="Succeeded", poll=30, timeout=10800)
406420

407421
# Verify completion
408422
assert execution.status.overall_status == "Succeeded"

0 commit comments

Comments
 (0)