Skip to content

Commit 57e1de7

Browse files
authored
Merge branch 'master' into context_length
2 parents 165bcad + 7444009 commit 57e1de7

10 files changed

Lines changed: 246 additions & 42 deletions

File tree

sagemaker-core/src/sagemaker/core/remote_function/job.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -860,6 +860,10 @@ def _get_default_spark_image(session):
860860
except ImportError:
861861
pass
862862

863+
# Spark 3.3 and below do not support py312; use 3.5 which supports both py39 and py312
864+
if py_version == "312" and spark_version in ("2.4", "3.0", "3.1", "3.2", "3.3"):
865+
spark_version = "3.5"
866+
863867
image_uri = image_uris.retrieve(
864868
framework=SPARK_NAME,
865869
region=region,

sagemaker-core/tests/integ/remote_function/conftest.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,52 @@ def spark_test_container(sagemaker_session, sagemaker_sdk_tar_path, tmp_path_fac
171171
)
172172

173173

174+
@pytest.fixture(scope="session")
175+
def spark_pre_execution_commands(sagemaker_session):
176+
"""Build sagemaker-core wheel, upload to S3, and return pre-execution install commands.
177+
178+
This mirrors the pattern used in sagemaker-mlops feature_processor integ tests.
179+
The Spark processing image does not have sagemaker-core pre-installed, so we must
180+
build the local dev wheel and install it in the container via pre_execution_commands.
181+
"""
182+
import subprocess
183+
import glob
184+
import tempfile
185+
from sagemaker.core.s3 import S3Uploader
186+
187+
repo_root = os.path.abspath(
188+
os.path.join(os.path.dirname(__file__), "..", "..", "..", "..")
189+
)
190+
core_dir = os.path.join(repo_root, "sagemaker-core")
191+
192+
with tempfile.TemporaryDirectory() as dist_dir:
193+
subprocess.run(
194+
f"python -m build --wheel --outdir {dist_dir}",
195+
shell=True,
196+
cwd=core_dir,
197+
check=True,
198+
)
199+
wheels = glob.glob(os.path.join(dist_dir, "sagemaker_core-*.whl"))
200+
if not wheels:
201+
raise FileNotFoundError(f"No sagemaker-core wheel found in {dist_dir}")
202+
wheel_path = wheels[0]
203+
wheel_name = os.path.basename(wheel_path)
204+
205+
s3_prefix = "s3://{}/spark-integ-test/wheels".format(
206+
sagemaker_session.default_bucket()
207+
)
208+
S3Uploader.upload(wheel_path, s3_prefix, sagemaker_session=sagemaker_session)
209+
210+
PIP = "python3 -m pip install --root-user-action=ignore"
211+
AWS = "python3 -m awscli"
212+
cmds = [
213+
f"{PIP} awscli",
214+
f"{AWS} s3 cp {s3_prefix}/{wheel_name} /tmp/{wheel_name}",
215+
f"{PIP} /tmp/{wheel_name}",
216+
]
217+
return cmds
218+
219+
174220
@pytest.fixture(scope="session")
175221
def conda_env_yml():
176222
"""Write conda yml file needed for tests."""

sagemaker-core/tests/integ/remote_function/test_decorator.py

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -574,16 +574,18 @@ def my_func():
574574
assert client_error_message in str(error)
575575

576576

577-
@pytest.mark.skipif(
578-
sys.version_info[:2] not in [(3, 9), (3, 12)],
579-
reason="SageMaker Spark image only available for Python 3.9 and 3.12",
580-
)
581-
def test_decorator_with_spark_job(sagemaker_session, cpu_instance_type):
577+
# @pytest.mark.skipif(
578+
# sys.version_info[:2] not in [(3, 9), (3, 12)],
579+
# reason="SageMaker Spark image only available for Python 3.9 and 3.12",
580+
# )
581+
@pytest.mark.spark_py312
582+
def test_decorator_with_spark_job(sagemaker_session, cpu_instance_type, spark_pre_execution_commands):
582583
@remote(
583584
role=ROLE,
584585
instance_type=cpu_instance_type,
585586
sagemaker_session=sagemaker_session,
586587
keep_alive_period_in_seconds=60,
588+
pre_execution_commands=spark_pre_execution_commands,
587589
spark_config=SparkConfig(
588590
configuration=[
589591
{
@@ -598,7 +600,14 @@ def test_spark_transform():
598600

599601
spark = SparkSession.builder.getOrCreate()
600602

601-
assert spark.conf.get("spark.app.name") == "remote-spark-test"
603+
# Avoid bare assert here: pytest's assertion rewriting injects _pytest
604+
# module references into the function bytecode, which causes
605+
# deserialization to fail in the Spark container (no pytest installed).
606+
app_name = spark.conf.get("spark.app.name")
607+
if app_name != "remote-spark-test":
608+
raise RuntimeError(
609+
f"Expected spark.app.name='remote-spark-test', got '{app_name}'"
610+
)
602611

603612
test_spark_transform()
604613

sagemaker-mlops/tests/integ/feature_store/feature_processor/test_feature_processor_integ.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -798,11 +798,11 @@ def transform(raw_s3_data_as_df):
798798
# sys.version_info[:2] not in [(3, 9), (3, 12)],
799799
# reason=f"SageMaker Spark image only supports Python 3.9 and 3.12, got {sys.version_info[:2]}",
800800
# )
801-
@pytest.mark.skip(
802-
reason="Lake Formation credential vending (GetTemporaryGlueTableCredentials) requires "
803-
"full LF environment setup (resource registration, trust policy, data location grants) "
804-
"that is not configured in CI. See quip-amazon.com/S3FEAMMMuKm0 for details."
805-
)
801+
# @pytest.mark.skip(
802+
# reason="Lake Formation credential vending (GetTemporaryGlueTableCredentials) requires "
803+
# "full LF environment setup (resource registration, trust policy, data location grants) "
804+
# "that is not configured in CI. See quip-amazon.com/S3FEAMMMuKm0 for details."
805+
# )
806806
@pytest.mark.spark_py312
807807
@pytest.mark.slow_test
808808
def test_to_pipeline_and_execute_with_lake_formation(

sagemaker-train/tests/integ/ai_registry/test_dataset.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -79,8 +79,9 @@ def test_create_dataset_from_s3_oss_dpo(self, unique_name, test_bucket, cleanup_
7979
assert dataset.name == unique_name
8080
assert dataset.customization_technique == CustomizationTechnique.DPO
8181

82+
@pytest.mark.us_east_1
8283
def test_create_dataset_from_s3_nova_sft(self, unique_name, test_bucket, cleanup_list):
83-
"""Test creating RLVR dataset from S3 URI."""
84+
"""Test creating Nova SFT dataset from S3 URI."""
8485
s3_uri = f"s3://{test_bucket}/test_datasets/Nova/nova_sft_train.jsonl"
8586
dataset = DataSet.create(
8687
name=unique_name,
@@ -92,8 +93,9 @@ def test_create_dataset_from_s3_nova_sft(self, unique_name, test_bucket, cleanup
9293
assert dataset.name == unique_name
9394
assert dataset.customization_technique == CustomizationTechnique.SFT
9495

96+
@pytest.mark.us_east_1
9597
def test_create_dataset_from_s3_nova_dpo(self, unique_name, test_bucket, cleanup_list):
96-
"""Test creating RLVR dataset from S3 URI."""
98+
"""Test creating Nova DPO dataset from S3 URI."""
9799
s3_uri = f"s3://{test_bucket}/test_datasets/Nova/nova_dpo_train.jsonl"
98100
dataset = DataSet.create(
99101
name=unique_name,
@@ -105,8 +107,9 @@ def test_create_dataset_from_s3_nova_dpo(self, unique_name, test_bucket, cleanup
105107
assert dataset.name == unique_name
106108
assert dataset.customization_technique == CustomizationTechnique.DPO
107109

110+
@pytest.mark.us_east_1
108111
def test_create_dataset_from_s3_nova_rft(self, unique_name, test_bucket, cleanup_list):
109-
"""Test creating RLVR dataset from S3 URI."""
112+
"""Test creating Nova RFT dataset from S3 URI."""
110113
s3_uri = f"s3://{test_bucket}/test_datasets/Nova/nova_rft_train.jsonl"
111114
dataset = DataSet.create(
112115
name=unique_name,
@@ -118,8 +121,9 @@ def test_create_dataset_from_s3_nova_rft(self, unique_name, test_bucket, cleanup
118121
assert dataset.name == unique_name
119122
assert dataset.customization_technique == CustomizationTechnique.RLVR
120123

124+
@pytest.mark.us_east_1
121125
def test_create_dataset_from_s3_nova_eval(self, unique_name, test_bucket, cleanup_list):
122-
"""Test creating RLVR dataset from S3 URI."""
126+
"""Test creating Nova eval dataset from S3 URI."""
123127
s3_uri = f"s3://{test_bucket}/test_datasets/Nova/nova_eval.jsonl"
124128
dataset = DataSet.create(
125129
name=unique_name,

sagemaker-train/tests/integ/train/test_benchmark_evaluator.py

Lines changed: 29 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -61,12 +61,11 @@
6161
"region": "us-west-2",
6262
}
6363

64-
# Nova model evaluation configuration (uses our own test account in us-east-1)
64+
# Nova model evaluation configuration (uses dedicated test account in us-east-1)
6565
NOVA_CONFIG = {
66-
"model_package_arn": "arn:aws:sagemaker:us-east-1:729646638167:model-package/sdk-test-finetuned-models/65",
67-
"dataset_s3_uri": "s3://sagemaker-us-east-1-729646638167/model-customization/eval/zc_test.jsonl",
68-
"s3_output_path": "s3://sagemaker-us-east-1-729646638167/model-customization/eval/",
69-
"model_package_group_arn": "arn:aws:sagemaker:us-east-1:729646638167:model-package-group/sdk-test-finetuned-models",
66+
"dataset_s3_uri": "s3://sagemaker-us-east-1-784379639078/model-customization/eval/zc_test.jsonl",
67+
"s3_output_path": "s3://sagemaker-us-east-1-784379639078/model-customization/eval/",
68+
"model_package_group_arn": "arn:aws:sagemaker:us-east-1:784379639078:model-package-group/sdk-test-finetuned-models",
7069
"region": "us-east-1",
7170
}
7271

@@ -339,25 +338,44 @@ def test_benchmark_evaluation_base_model_only(self):
339338
assert execution.status.overall_status == "Succeeded"
340339
logger.info("Base model only evaluation completed successfully")
341340

342-
@pytest.mark.skip(reason="Pending us-east-1 test infrastructure migration to dedicated test account")
341+
@pytest.mark.gpu_intensive
342+
@pytest.mark.us_east_1
343343
def test_benchmark_evaluation_nova_model(self):
344344
"""
345345
Test benchmark evaluation with Nova model.
346346
347347
This test uses a Nova fine-tuned model package in us-east-1 region.
348348
Configuration from commented section in benchmark_demo.ipynb.
349349
350-
Note: This test is currently skipped pending us-east-1 test infra migration.
350+
Note: This test requires a model package to exist in the model package group.
351+
It should be run after a successful SFT or RLVR training job has produced one.
351352
"""
353+
import boto3
354+
352355
# Get benchmarks
353356
Benchmark = get_benchmarks()
354357

358+
# Dynamically find the latest model package in the group
359+
sm_client = boto3.client("sagemaker", region_name=NOVA_CONFIG["region"])
360+
packages = sm_client.list_model_packages(
361+
ModelPackageGroupName="sdk-test-finetuned-models",
362+
SortBy="CreationTime",
363+
SortOrder="Descending",
364+
MaxResults=1,
365+
)
366+
367+
if not packages["ModelPackageSummaryList"]:
368+
pytest.skip("No model packages available in sdk-test-finetuned-models group. Run SFT/RLVR training first.")
369+
370+
model_package_arn = packages["ModelPackageSummaryList"][0]["ModelPackageArn"]
371+
logger.info(f"Using model package: {model_package_arn}")
372+
355373
logger.info("Creating BenchmarkEvaluator with Nova model")
356374

357375
# Create evaluator with Nova model package
358376
evaluator = BenchMarkEvaluator(
359377
benchmark=Benchmark.MMLU,
360-
model=NOVA_CONFIG["model_package_arn"],
378+
model=model_package_arn,
361379
s3_output_path=NOVA_CONFIG["s3_output_path"],
362380
model_package_group=NOVA_CONFIG["model_package_group_arn"],
363381
base_eval_name="integ-test-nova-eval",
@@ -367,7 +385,7 @@ def test_benchmark_evaluation_nova_model(self):
367385
# Verify evaluator was created
368386
assert evaluator is not None
369387
assert evaluator.benchmark == Benchmark.MMLU
370-
assert evaluator.model == NOVA_CONFIG["model_package_arn"]
388+
assert evaluator.model == model_package_arn
371389
assert evaluator.region == NOVA_CONFIG["region"]
372390

373391
logger.info(f"Created evaluator: {evaluator.base_eval_name}")
@@ -397,8 +415,8 @@ def test_benchmark_evaluation_nova_model(self):
397415
logger.info(f"Status after refresh: {execution.status.overall_status}")
398416

399417
# Wait for completion
400-
logger.info("Waiting for evaluation to complete (timeout: 1 hour)")
401-
execution.wait(target_status="Succeeded", poll=30, timeout=3600)
418+
logger.info("Waiting for evaluation to complete (timeout: 3 hours)")
419+
execution.wait(target_status="Succeeded", poll=30, timeout=10800)
402420

403421
# Verify completion
404422
assert execution.status.overall_status == "Succeeded"

sagemaker-train/tests/integ/train/test_custom_scorer_evaluator.py

Lines changed: 127 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,16 @@
5454
"region": "us-west-2",
5555
}
5656

57+
# Base model only evaluation configuration (uses JumpStart model ID directly, no model package)
58+
BASE_MODEL_ONLY_CONFIG = {
59+
"base_model_id": "meta-textgeneration-llama-3-2-1b-instruct",
60+
"evaluator_arn": "arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/JsonDoc/eval-lambda-test/0.0.1",
61+
"dataset_s3_uri": "s3://sagemaker-us-west-2-729646638167/model-customization/eval/zc_test.jsonl",
62+
"s3_output_path": "s3://sagemaker-us-west-2-729646638167/model-customization/eval/",
63+
"mlflow_tracking_server_arn": "arn:aws:sagemaker:us-west-2:729646638167:mlflow-app/app-W7FOBBXZANVX",
64+
"region": "us-west-2",
65+
}
66+
5767

5868
# @pytest.mark.skip(reason="Temporarily skipped - moved from tests/integ/sagemaker/modules/evaluate/")
5969
@pytest.mark.xdist_group("custom_scorer_evaluator")
@@ -288,13 +298,125 @@ def test_custom_scorer_with_builtin_metric(self):
288298
logger.info("Built-in metric evaluation completed successfully")
289299

290300
# @pytest.mark.skip(reason="Base model only evaluation - not working yet per notebook")
301+
@pytest.mark.gpu_intensive
291302
def test_custom_scorer_base_model_only(self):
292303
"""
293304
Test custom scorer evaluation with base model only (no fine-tuned model).
294305
295-
Note: Per the notebook, "Evaluation with Base Model Only is yet to be
296-
implemented/tested - Not Working currently". This test is skipped until
297-
that functionality is available.
306+
This test uses a JumpStart model ID directly instead of a model package ARN,
307+
which triggers the CUSTOM_SCORER_TEMPLATE_BASE_MODEL_ONLY template path.
308+
The evaluation runs against only the base model without any fine-tuned weights.
309+
310+
This test covers:
311+
1. Creating CustomScorerEvaluator with a JumpStart model ID (base model only)
312+
2. Accessing hyperparameters
313+
3. Starting evaluation
314+
4. Monitoring execution
315+
5. Waiting for completion
316+
6. Viewing results
317+
7. Retrieving execution by ARN
298318
"""
299-
logger.info("Base model only evaluation - not yet implemented")
300-
pass
319+
# Step 1: Create CustomScorerEvaluator with JumpStart model ID
320+
logger.info("Creating CustomScorerEvaluator with base model only (JumpStart model ID)")
321+
322+
evaluator = CustomScorerEvaluator(
323+
evaluator=BASE_MODEL_ONLY_CONFIG["evaluator_arn"],
324+
dataset=BASE_MODEL_ONLY_CONFIG["dataset_s3_uri"],
325+
model=BASE_MODEL_ONLY_CONFIG["base_model_id"],
326+
s3_output_path=BASE_MODEL_ONLY_CONFIG["s3_output_path"],
327+
evaluate_base_model=False,
328+
)
329+
330+
# Verify evaluator was created with base model ID
331+
assert evaluator is not None
332+
assert evaluator.evaluator == BASE_MODEL_ONLY_CONFIG["evaluator_arn"]
333+
assert evaluator.model == BASE_MODEL_ONLY_CONFIG["base_model_id"]
334+
assert evaluator.dataset == BASE_MODEL_ONLY_CONFIG["dataset_s3_uri"]
335+
336+
logger.info(f"Created evaluator with base model: {BASE_MODEL_ONLY_CONFIG['base_model_id']}")
337+
338+
# Step 2: Access hyperparameters
339+
logger.info("Accessing hyperparameters")
340+
hyperparams = evaluator.hyperparameters.to_dict()
341+
342+
# Verify hyperparameters structure
343+
assert isinstance(hyperparams, dict)
344+
assert "max_new_tokens" in hyperparams
345+
assert "temperature" in hyperparams
346+
347+
logger.info(f"Hyperparameters: {hyperparams}")
348+
349+
# Step 3: Start evaluation
350+
logger.info("Starting evaluation execution")
351+
execution = evaluator.evaluate()
352+
353+
# Verify execution was created
354+
assert execution is not None
355+
assert execution.arn is not None
356+
assert execution.name is not None
357+
assert execution.eval_type is not None
358+
359+
logger.info(f"Pipeline Execution ARN: {execution.arn}")
360+
logger.info(f"Initial Status: {execution.status.overall_status}")
361+
362+
# Step 4: Monitor execution
363+
logger.info("Refreshing execution status")
364+
execution.refresh()
365+
366+
# Verify status was updated
367+
assert execution.status.overall_status is not None
368+
369+
# Log step details if available
370+
if execution.status.step_details:
371+
logger.info("Step Details:")
372+
for step in execution.status.step_details:
373+
logger.info(f" {step.name}: {step.status}")
374+
375+
# Step 5: Wait for completion
376+
logger.info(f"Waiting for evaluation to complete (timeout: {EVALUATION_TIMEOUT_SECONDS}s / {EVALUATION_TIMEOUT_SECONDS//3600}h)")
377+
378+
try:
379+
execution.wait(target_status="Succeeded", poll=30, timeout=EVALUATION_TIMEOUT_SECONDS)
380+
logger.info(f"Final Status: {execution.status.overall_status}")
381+
382+
# Verify completion
383+
assert execution.status.overall_status == "Succeeded"
384+
385+
# Step 6: View results
386+
logger.info("Displaying results")
387+
execution.show_results()
388+
389+
# Verify S3 output path is set
390+
assert execution.s3_output_path is not None
391+
logger.info(f"Results stored at: {execution.s3_output_path}")
392+
393+
except Exception as e:
394+
logger.error(f"Evaluation failed or timed out: {e}")
395+
logger.error(f"Final status: {execution.status.overall_status}")
396+
if execution.status.failure_reason:
397+
logger.error(f"Failure reason: {execution.status.failure_reason}")
398+
399+
# Log step failures
400+
if execution.status.step_details:
401+
for step in execution.status.step_details:
402+
if "failed" in step.status.lower():
403+
logger.error(f"Failed step: {step.name}")
404+
if step.failure_reason:
405+
logger.error(f" Reason: {step.failure_reason}")
406+
407+
# Re-raise to fail the test
408+
raise
409+
410+
# Step 7: Retrieve execution by ARN
411+
logger.info("Retrieving execution by ARN")
412+
retrieved_execution = EvaluationPipelineExecution.get(
413+
arn=execution.arn,
414+
region=BASE_MODEL_ONLY_CONFIG["region"]
415+
)
416+
417+
# Verify retrieved execution matches
418+
assert retrieved_execution.arn == execution.arn
419+
assert retrieved_execution.status.overall_status == "Succeeded"
420+
421+
logger.info(f"Retrieved execution status: {retrieved_execution.status.overall_status}")
422+
logger.info("Base model only evaluation completed successfully")

0 commit comments

Comments
 (0)