2323 EvaluationPipelineExecution ,
2424)
2525
26- pytestmark = pytest .mark .gpu_intensive
27-
2826# Configure logging
2927logging .basicConfig (
3028 level = logging .INFO ,
6361 "region" : "us-west-2" ,
6462}
6563
66- # Nova model evaluation configuration (from commented section in notebook )
64+ # Nova model evaluation configuration (uses dedicated test account in us-east-1 )
6765NOVA_CONFIG = {
68- "model_package_arn" : "arn:aws:sagemaker:us-east-1:052150106756:model-package/test-nova-finetuned-models/3" ,
69- "dataset_s3_uri" : "s3://sagemaker-us-east-1-052150106756/studio-users/d20251107t195443/datasets/2025-11-07T19-55-37-609Z/zc_test.jsonl" ,
70- "s3_output_path" : "s3://mufi-test-serverless-iad/eval/" ,
71- "mlflow_tracking_server_arn" : "arn:aws:sagemaker:us-east-1:052150106756:mlflow-tracking-server/mlflow-prod-server" ,
72- "model_package_group_arn" : "arn:aws:sagemaker:us-east-1:052150106756:model-package-group/test-nova-finetuned-models" ,
66+ "dataset_s3_uri" : "s3://sagemaker-us-east-1-784379639078/model-customization/eval/zc_test.jsonl" ,
67+ "s3_output_path" : "s3://sagemaker-us-east-1-784379639078/model-customization/eval/" ,
68+ "model_package_group_arn" : "arn:aws:sagemaker:us-east-1:784379639078:model-package-group/sdk-test-finetuned-models" ,
7369 "region" : "us-east-1" ,
7470}
7571
@@ -288,7 +284,8 @@ def test_benchmark_subtasks_validation(self):
288284
289285 logger .info ("Subtask validation tests passed" )
290286
291- @pytest .mark .skip (reason = "Pipeline creation fails - under investigation" )
287+ # @pytest.mark.skip(reason="Pipeline creation fails - under investigation")
288+ @pytest .mark .gpu_intensive
292289 def test_benchmark_evaluation_base_model_only (self ):
293290 """
294291 Test benchmark evaluation with base model only (no fine-tuned model).
@@ -341,28 +338,45 @@ def test_benchmark_evaluation_base_model_only(self):
341338 assert execution .status .overall_status == "Succeeded"
342339 logger .info ("Base model only evaluation completed successfully" )
343340
344- @pytest .mark .skip (reason = "Requires us-east-1 test infrastructure - tracked in AI-5" )
341+ @pytest .mark .gpu_intensive
342+ @pytest .mark .us_east_1
345343 def test_benchmark_evaluation_nova_model (self ):
346344 """
347345 Test benchmark evaluation with Nova model.
348346
349347 This test uses a Nova fine-tuned model package in us-east-1 region.
350348 Configuration comes from NOVA_CONFIG; the model package ARN is looked up at runtime.
351349
352- Note: This test is currently skipped. Remove the @pytest.mark.skip decorator
353- when you want to enable it .
350+ Note: This test requires a model package to exist in the model package group.
351+ It should be run after a successful SFT or RLVR training job has produced one .
354352 """
353+ import boto3
354+
355355 # Get benchmarks
356356 Benchmark = get_benchmarks ()
357357
358+ # Dynamically find the latest model package in the group
359+ sm_client = boto3 .client ("sagemaker" , region_name = NOVA_CONFIG ["region" ])
360+ packages = sm_client .list_model_packages (
361+ ModelPackageGroupName = "sdk-test-finetuned-models" ,
362+ SortBy = "CreationTime" ,
363+ SortOrder = "Descending" ,
364+ MaxResults = 1 ,
365+ )
366+
367+ if not packages ["ModelPackageSummaryList" ]:
368+ pytest .skip ("No model packages available in sdk-test-finetuned-models group. Run SFT/RLVR training first." )
369+
370+ model_package_arn = packages ["ModelPackageSummaryList" ][0 ]["ModelPackageArn" ]
371+ logger .info (f"Using model package: { model_package_arn } " )
372+
358373 logger .info ("Creating BenchmarkEvaluator with Nova model" )
359374
360375 # Create evaluator with Nova model package
361376 evaluator = BenchMarkEvaluator (
362377 benchmark = Benchmark .MMLU ,
363- model = NOVA_CONFIG [ " model_package_arn" ] ,
378+ model = model_package_arn ,
364379 s3_output_path = NOVA_CONFIG ["s3_output_path" ],
365- mlflow_resource_arn = NOVA_CONFIG ["mlflow_tracking_server_arn" ],
366380 model_package_group = NOVA_CONFIG ["model_package_group_arn" ],
367381 base_eval_name = "integ-test-nova-eval" ,
368382 region = NOVA_CONFIG ["region" ],
@@ -371,7 +385,7 @@ def test_benchmark_evaluation_nova_model(self):
371385 # Verify evaluator was created
372386 assert evaluator is not None
373387 assert evaluator .benchmark == Benchmark .MMLU
374- assert evaluator .model == NOVA_CONFIG [ " model_package_arn" ]
388+ assert evaluator .model == model_package_arn
375389 assert evaluator .region == NOVA_CONFIG ["region" ]
376390
377391 logger .info (f"Created evaluator: { evaluator .base_eval_name } " )
@@ -401,8 +415,8 @@ def test_benchmark_evaluation_nova_model(self):
401415 logger .info (f"Status after refresh: { execution .status .overall_status } " )
402416
403417 # Wait for completion
404- logger .info ("Waiting for evaluation to complete (timeout: 1 hour )" )
405- execution .wait (target_status = "Succeeded" , poll = 30 , timeout = 3600 )
418+ logger .info ("Waiting for evaluation to complete (timeout: 3 hours )" )
419+ execution .wait (target_status = "Succeeded" , poll = 30 , timeout = 10800 )
406420
407421 # Verify completion
408422 assert execution .status .overall_status == "Succeeded"
0 commit comments