Skip to content

Commit 92f8d42

Browse files
authored
fix: unskip evaluator integ test classes in sm-train (#5854)
* fix: unskip evaluator integ test classes in sm-train * debug: unskip all sm-train integ tests * test: replace redundant LLM-as-judge integ tests with unit tests for None metrics handling * mark TestCustomScorerEvaluatorIntegration tests as serial * update test config for test_benchmark_evaluation_base_model_only * unskip test_hp_contract_mpi_script * fix: comment out non-existent mlflow_resource_arn in base_model_only benchmark test * fix: mark three unfixed tests as skipped, to fix them in other pr
1 parent 9fc50b5 commit 92f8d42

5 files changed

Lines changed: 103 additions & 107 deletions

File tree

sagemaker-train/tests/integ/train/test_benchmark_evaluator.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -52,12 +52,12 @@
5252
"region": "us-west-2",
5353
}
5454

55-
# Base model only evaluation configuration (from commented section in notebook)
55+
# Base model only evaluation configuration
5656
BASE_MODEL_ONLY_CONFIG = {
5757
"base_model_id": "meta-textgeneration-llama-3-2-1b-instruct",
58-
"dataset_s3_uri": "s3://sagemaker-us-west-2-052150106756/studio-users/d20251107t195443/datasets/2025-11-07T19-55-37-609Z/zc_test.jsonl",
59-
"s3_output_path": "s3://mufi-test-serverless-smtj/eval/",
60-
"mlflow_tracking_server_arn": "arn:aws:sagemaker:us-west-2:052150106756:mlflow-tracking-server/mmlu-eval-experiment",
58+
"dataset_s3_uri": "s3://sagemaker-us-west-2-729646638167/model-customization/eval/zc_test.jsonl",
59+
"s3_output_path": "s3://sagemaker-us-west-2-729646638167/model-customization/eval/",
60+
"mlflow_tracking_server_arn": "arn:aws:sagemaker:us-west-2:729646638167:mlflow-app/app-W7FOBBXZANVX",
6161
"region": "us-west-2",
6262
}
6363

@@ -72,7 +72,7 @@
7272
}
7373

7474

75-
@pytest.mark.skip(reason="Temporarily skipped - moved from tests/integ/sagemaker/modules/evaluate/")
75+
# @pytest.mark.skip(reason="Temporarily skipped - moved from tests/integ/sagemaker/modules/evaluate/")
7676
class TestBenchmarkEvaluatorIntegration:
7777
"""Integration tests for BenchmarkEvaluator with fine-tuned model package"""
7878

@@ -286,7 +286,7 @@ def test_benchmark_subtasks_validation(self):
286286

287287
logger.info("Subtask validation tests passed")
288288

289-
@pytest.mark.skip(reason="Base model only evaluation - to be enabled when needed")
289+
@pytest.mark.skip(reason="Pipeline creation fails - under investigation")
290290
def test_benchmark_evaluation_base_model_only(self):
291291
"""
292292
Test benchmark evaluation with base model only (no fine-tuned model).
@@ -307,7 +307,7 @@ def test_benchmark_evaluation_base_model_only(self):
307307
benchmark=Benchmark.MMLU,
308308
model=BASE_MODEL_ONLY_CONFIG["base_model_id"],
309309
s3_output_path=BASE_MODEL_ONLY_CONFIG["s3_output_path"],
310-
mlflow_resource_arn=BASE_MODEL_ONLY_CONFIG["mlflow_tracking_server_arn"],
310+
# mlflow_resource_arn=BASE_MODEL_ONLY_CONFIG["mlflow_tracking_server_arn"],
311311
base_eval_name="integ-test-base-model-only",
312312
# Note: model_package_group not needed for JumpStart models
313313
)
@@ -339,7 +339,7 @@ def test_benchmark_evaluation_base_model_only(self):
339339
assert execution.status.overall_status == "Succeeded"
340340
logger.info("Base model only evaluation completed successfully")
341341

342-
@pytest.mark.skip(reason="Nova model evaluation - to be enabled when needed")
342+
@pytest.mark.skip(reason="Requires us-east-1 test infrastructure - tracked in AI-5")
343343
def test_benchmark_evaluation_nova_model(self):
344344
"""
345345
Test benchmark evaluation with Nova model.

sagemaker-train/tests/integ/train/test_custom_scorer_evaluator.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,8 @@
5555
}
5656

5757

58-
@pytest.mark.skip(reason="Temporarily skipped - moved from tests/integ/sagemaker/modules/evaluate/")
58+
# @pytest.mark.skip(reason="Temporarily skipped - moved from tests/integ/sagemaker/modules/evaluate/")
59+
@pytest.mark.xdist_group("custom_scorer_evaluator")
5960
class TestCustomScorerEvaluatorIntegration:
6061
"""Integration tests for CustomScorerEvaluator with custom evaluator"""
6162

@@ -233,7 +234,7 @@ def test_custom_scorer_evaluator_validation(self):
233234

234235
logger.info("Validation tests passed")
235236

236-
@pytest.mark.skip(reason="Built-in metric evaluation - to be enabled when needed")
237+
# @pytest.mark.skip(reason="Built-in metric evaluation - to be enabled when needed")
237238
def test_custom_scorer_with_builtin_metric(self):
238239
"""
239240
Test custom scorer evaluation with built-in metric.
@@ -285,7 +286,7 @@ def test_custom_scorer_with_builtin_metric(self):
285286
assert execution.status.overall_status == "Succeeded"
286287
logger.info("Built-in metric evaluation completed successfully")
287288

288-
@pytest.mark.skip(reason="Base model only evaluation - not working yet per notebook")
289+
# @pytest.mark.skip(reason="Base model only evaluation - not working yet per notebook")
289290
def test_custom_scorer_base_model_only(self):
290291
"""
291292
Test custom scorer evaluation with base model only (no fine-tuned model).

sagemaker-train/tests/integ/train/test_llm_as_judge_evaluator.py

Lines changed: 1 addition & 95 deletions
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@
8484
}
8585

8686

87-
@pytest.mark.skip(reason="Temporarily skipped - moved from tests/integ/sagemaker/modules/evaluate/")
87+
# @pytest.mark.skip(reason="Temporarily skipped - moved from tests/integ/sagemaker/modules/evaluate/")
8888
class TestLLMAsJudgeEvaluatorIntegration:
8989
"""Integration tests for LLMAsJudgeEvaluator"""
9090

@@ -254,98 +254,4 @@ def test_llm_as_judge_builtin_metrics_prefix_handling(self):
254254

255255
logger.info("Built-in metrics prefix handling tests passed")
256256

257-
@pytest.mark.skip(reason="Built-in metrics only test - to be enabled when needed")
258-
def test_llm_as_judge_builtin_metrics_only(self):
259-
"""
260-
Test LLM-as-Judge evaluation with only built-in metrics (no custom metrics).
261-
262-
This test uses only built-in metrics without custom metrics.
263-
264-
Note: This test is currently skipped. Remove the @pytest.mark.skip decorator
265-
when you want to enable it.
266-
"""
267-
logger.info("Creating LLMAsJudgeEvaluator with built-in metrics only")
268-
269-
# Create evaluator with only built-in metrics
270-
evaluator = LLMAsJudgeEvaluator(
271-
model=TEST_CONFIG["model_package_arn"],
272-
evaluator_model=TEST_CONFIG["evaluator_model"],
273-
dataset=TEST_CONFIG["dataset_s3_uri"],
274-
builtin_metrics=["Completeness", "Faithfulness", "Helpfulness"],
275-
# mlflow_resource_arn=TEST_CONFIG["mlflow_tracking_server_arn"],
276-
s3_output_path=TEST_CONFIG["s3_output_path"],
277-
evaluate_base_model=False,
278-
)
279-
280-
# Verify evaluator was created
281-
assert evaluator is not None
282-
assert evaluator.builtin_metrics == ["Completeness", "Faithfulness", "Helpfulness"]
283-
assert evaluator.custom_metrics is None
284-
285-
logger.info("Created evaluator with built-in metrics only")
286-
287-
# Start evaluation
288-
logger.info("Starting evaluation execution")
289-
execution = evaluator.evaluate()
290-
291-
# Verify execution was created
292-
assert execution is not None
293-
assert execution.arn is not None
294-
295-
logger.info(f"Pipeline Execution ARN: {execution.arn}")
296-
297-
# Wait for completion
298-
logger.info(f"Waiting for evaluation to complete (timeout: {EVALUATION_TIMEOUT_SECONDS}s / {EVALUATION_TIMEOUT_SECONDS//3600}h)")
299-
execution.wait(target_status="Succeeded", poll=30, timeout=EVALUATION_TIMEOUT_SECONDS)
300-
301-
# Verify completion
302-
assert execution.status.overall_status == "Succeeded"
303-
logger.info("Built-in metrics only evaluation completed successfully")
304257

305-
@pytest.mark.skip(reason="Custom metrics only test - to be enabled when needed")
306-
def test_llm_as_judge_custom_metrics_only(self):
307-
"""
308-
Test LLM-as-Judge evaluation with only custom metrics (no built-in metrics).
309-
310-
This test uses only custom metrics without built-in metrics.
311-
312-
Note: This test is currently skipped. Remove the @pytest.mark.skip decorator
313-
when you want to enable it.
314-
"""
315-
logger.info("Creating LLMAsJudgeEvaluator with custom metrics only")
316-
317-
# Create evaluator with only custom metrics
318-
evaluator = LLMAsJudgeEvaluator(
319-
model=TEST_CONFIG["model_package_arn"],
320-
evaluator_model=TEST_CONFIG["evaluator_model"],
321-
dataset=TEST_CONFIG["dataset_s3_uri"],
322-
custom_metrics=TEST_CONFIG["custom_metrics_json"],
323-
# mlflow_resource_arn=TEST_CONFIG["mlflow_tracking_server_arn"],
324-
s3_output_path=TEST_CONFIG["s3_output_path"],
325-
evaluate_base_model=False,
326-
)
327-
328-
# Verify evaluator was created
329-
assert evaluator is not None
330-
assert evaluator.custom_metrics == TEST_CONFIG["custom_metrics_json"]
331-
assert evaluator.builtin_metrics is None
332-
333-
logger.info("Created evaluator with custom metrics only")
334-
335-
# Start evaluation
336-
logger.info("Starting evaluation execution")
337-
execution = evaluator.evaluate()
338-
339-
# Verify execution was created
340-
assert execution is not None
341-
assert execution.arn is not None
342-
343-
logger.info(f"Pipeline Execution ARN: {execution.arn}")
344-
345-
# Wait for completion
346-
logger.info(f"Waiting for evaluation to complete (timeout: {EVALUATION_TIMEOUT_SECONDS}s / {EVALUATION_TIMEOUT_SECONDS//3600}h)")
347-
execution.wait(target_status="Succeeded", poll=30, timeout=EVALUATION_TIMEOUT_SECONDS)
348-
349-
# Verify completion
350-
assert execution.status.overall_status == "Succeeded"
351-
logger.info("Custom metrics only evaluation completed successfully")

sagemaker-train/tests/integ/train/test_model_trainer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ def test_hp_contract_basic_sh_script(sagemaker_session):
9696

9797

9898
# skip this test for now as requirments.txt is not resolved
99-
@pytest.mark.skip
99+
@pytest.mark.skip(reason="MPI distributed training does not resolve requirements.txt on worker nodes")
100100
def test_hp_contract_mpi_script(sagemaker_session):
101101
compute = Compute(instance_type="ml.m5.xlarge", instance_count=2)
102102
model_trainer = ModelTrainer(

sagemaker-train/tests/unit/train/evaluate/test_llm_as_judge_evaluator.py

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -515,6 +515,95 @@ def test_llm_as_judge_evaluator_get_llmaj_template_additions_no_metrics(mock_art
515515
assert additions['custom_metrics'] is None
516516

517517

518+
@patch('sagemaker.train.common_utils.model_resolution._resolve_base_model')
519+
@patch('sagemaker.core.resources.Artifact')
520+
def test_llm_as_judge_evaluator_builtin_metrics_only_no_custom(mock_artifact, mock_resolve):
521+
"""Test that evaluator handles builtin_metrics with custom_metrics=None correctly."""
522+
mock_info = Mock()
523+
mock_info.base_model_name = DEFAULT_MODEL
524+
mock_info.base_model_arn = DEFAULT_BASE_MODEL_ARN
525+
mock_info.source_model_package_arn = None
526+
mock_resolve.return_value = mock_info
527+
528+
mock_artifact.get_all.return_value = iter([])
529+
mock_artifact_instance = Mock()
530+
mock_artifact_instance.artifact_arn = DEFAULT_ARTIFACT_ARN
531+
mock_artifact.create.return_value = mock_artifact_instance
532+
533+
mock_session = Mock()
534+
mock_session.boto_region_name = DEFAULT_REGION
535+
mock_session.boto_session = Mock()
536+
mock_session.get_caller_identity_arn.return_value = DEFAULT_ROLE
537+
538+
evaluator = LLMAsJudgeEvaluator(
539+
evaluator_model=DEFAULT_EVALUATOR_MODEL,
540+
dataset=DEFAULT_DATASET,
541+
model=DEFAULT_MODEL,
542+
builtin_metrics=["Completeness", "Faithfulness"],
543+
custom_metrics=None,
544+
s3_output_path=DEFAULT_S3_OUTPUT,
545+
mlflow_resource_arn=DEFAULT_MLFLOW_ARN,
546+
model_package_group=DEFAULT_MODEL_PACKAGE_GROUP_ARN,
547+
sagemaker_session=mock_session,
548+
)
549+
550+
assert evaluator.builtin_metrics == ["Completeness", "Faithfulness"]
551+
assert evaluator.custom_metrics is None
552+
553+
eval_name = "test-eval"
554+
additions = evaluator._get_llmaj_template_additions(eval_name)
555+
556+
assert additions['llmaj_metrics'] == json.dumps(["Completeness", "Faithfulness"])
557+
assert additions['custom_metrics'] is None
558+
559+
560+
@patch('sagemaker.core.s3.client.S3Uploader.upload_string_as_file_body')
561+
@patch('sagemaker.train.common_utils.model_resolution._resolve_base_model')
562+
@patch('sagemaker.core.resources.Artifact')
563+
def test_llm_as_judge_evaluator_custom_metrics_only_no_builtin(mock_artifact, mock_resolve, mock_s3_upload):
564+
"""Test that evaluator handles custom_metrics with builtin_metrics=None correctly."""
565+
mock_info = Mock()
566+
mock_info.base_model_name = DEFAULT_MODEL
567+
mock_info.base_model_arn = DEFAULT_BASE_MODEL_ARN
568+
mock_info.source_model_package_arn = None
569+
mock_resolve.return_value = mock_info
570+
571+
mock_artifact.get_all.return_value = iter([])
572+
mock_artifact_instance = Mock()
573+
mock_artifact_instance.artifact_arn = DEFAULT_ARTIFACT_ARN
574+
mock_artifact.create.return_value = mock_artifact_instance
575+
576+
mock_session = Mock()
577+
mock_session.boto_region_name = DEFAULT_REGION
578+
mock_session.boto_session = Mock()
579+
mock_session.get_caller_identity_arn.return_value = DEFAULT_ROLE
580+
581+
custom_metrics_json = json.dumps([{"customMetricDefinition": {"name": "TestMetric"}}])
582+
583+
evaluator = LLMAsJudgeEvaluator(
584+
evaluator_model=DEFAULT_EVALUATOR_MODEL,
585+
dataset=DEFAULT_DATASET,
586+
model=DEFAULT_MODEL,
587+
builtin_metrics=None,
588+
custom_metrics=custom_metrics_json,
589+
s3_output_path=DEFAULT_S3_OUTPUT,
590+
mlflow_resource_arn=DEFAULT_MLFLOW_ARN,
591+
model_package_group=DEFAULT_MODEL_PACKAGE_GROUP_ARN,
592+
sagemaker_session=mock_session,
593+
)
594+
595+
assert evaluator.builtin_metrics is None
596+
assert evaluator.custom_metrics == custom_metrics_json
597+
598+
eval_name = "test-eval"
599+
additions = evaluator._get_llmaj_template_additions(eval_name)
600+
601+
assert additions['llmaj_metrics'] == json.dumps([])
602+
assert additions['custom_metrics'] is not None
603+
assert additions['custom_metrics'].startswith("s3://")
604+
mock_s3_upload.assert_called_once()
605+
606+
518607
@pytest.mark.skip(reason="Integration test - requires full pipeline execution setup")
519608
@patch('sagemaker.train.evaluate.execution.Pipeline')
520609
@patch('sagemaker.train.evaluate.llm_as_judge_evaluator.EvaluationPipelineExecution')

0 commit comments

Comments
 (0)