Skip to content

Commit 16ffb94

Browse files
authored
tests: unskip three tests with individual issues in train (#5894)
* tests: unskip three tests

* fix: use relative imports in mpi_driver.py for container compatibility

  The MPI driver script used absolute imports (from sagemaker.train.container_drivers...) which fail at runtime in the training container because sagemaker-train is not installed there. The driver scripts are copied to /opt/ml/input/data/sm_drivers/ and executed directly by the container entrypoint. Changed to sys.path-based relative imports matching the pattern used by torchrun_driver.py, which works correctly in the container environment.

* test: migrate nova eval test to use own account resources

  Remove cross-account dependency in test_benchmark_evaluation_nova_model by replacing resources from account 052150106756 with our test account (729646638167) in us-east-1. Also removed mlflow_tracking_server_arn since no MLflow server exists in us-east-1. Test remains skipped pending us-east-1 test infrastructure migration to a dedicated test account.

* test: remove module level skip
1 parent 4374751 commit 16ffb94

3 files changed

Lines changed: 15 additions & 16 deletions

File tree

sagemaker-train/src/sagemaker/train/container_drivers/distributed_drivers/mpi_driver.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,11 @@
1717
import sys
1818
import json
1919

20-
from sagemaker.train.container_drivers.distributed_drivers.mpi_utils import (
20+
from pathlib import Path
21+
22+
sys.path.insert(0, str(Path(__file__).parent.parent))
23+
24+
from distributed_drivers.mpi_utils import ( # noqa: E402 # pylint: disable=C0413,E0611
2125
start_sshd_daemon,
2226
bootstrap_master_node,
2327
bootstrap_worker_node,
@@ -27,7 +31,7 @@
2731
)
2832

2933

30-
from sagemaker.train.container_drivers.common.utils import (
34+
from common.utils import ( # noqa: E402 # pylint: disable=C0413,E0611
3135
logger,
3236
hyperparameters_to_cli_args,
3337
get_process_count,

sagemaker-train/tests/integ/train/test_benchmark_evaluator.py

Lines changed: 8 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,6 @@
2323
EvaluationPipelineExecution,
2424
)
2525

26-
pytestmark = pytest.mark.gpu_intensive
27-
2826
# Configure logging
2927
logging.basicConfig(
3028
level=logging.INFO,
@@ -63,13 +61,12 @@
6361
"region": "us-west-2",
6462
}
6563

66-
# Nova model evaluation configuration (from commented section in notebook)
64+
# Nova model evaluation configuration (uses our own test account in us-east-1)
6765
NOVA_CONFIG = {
68-
"model_package_arn": "arn:aws:sagemaker:us-east-1:052150106756:model-package/test-nova-finetuned-models/3",
69-
"dataset_s3_uri": "s3://sagemaker-us-east-1-052150106756/studio-users/d20251107t195443/datasets/2025-11-07T19-55-37-609Z/zc_test.jsonl",
70-
"s3_output_path": "s3://mufi-test-serverless-iad/eval/",
71-
"mlflow_tracking_server_arn": "arn:aws:sagemaker:us-east-1:052150106756:mlflow-tracking-server/mlflow-prod-server",
72-
"model_package_group_arn": "arn:aws:sagemaker:us-east-1:052150106756:model-package-group/test-nova-finetuned-models",
66+
"model_package_arn": "arn:aws:sagemaker:us-east-1:729646638167:model-package/sdk-test-finetuned-models/65",
67+
"dataset_s3_uri": "s3://sagemaker-us-east-1-729646638167/model-customization/eval/zc_test.jsonl",
68+
"s3_output_path": "s3://sagemaker-us-east-1-729646638167/model-customization/eval/",
69+
"model_package_group_arn": "arn:aws:sagemaker:us-east-1:729646638167:model-package-group/sdk-test-finetuned-models",
7370
"region": "us-east-1",
7471
}
7572

@@ -288,7 +285,7 @@ def test_benchmark_subtasks_validation(self):
288285

289286
logger.info("Subtask validation tests passed")
290287

291-
@pytest.mark.skip(reason="Pipeline creation fails - under investigation")
288+
# @pytest.mark.skip(reason="Pipeline creation fails - under investigation")
292289
@pytest.mark.gpu_intensive
293290
def test_benchmark_evaluation_base_model_only(self):
294291
"""
@@ -342,16 +339,15 @@ def test_benchmark_evaluation_base_model_only(self):
342339
assert execution.status.overall_status == "Succeeded"
343340
logger.info("Base model only evaluation completed successfully")
344341

345-
@pytest.mark.skip(reason="Requires us-east-1 test infrastructure - tracked in AI-5")
342+
@pytest.mark.skip(reason="Pending us-east-1 test infrastructure migration to dedicated test account")
346343
def test_benchmark_evaluation_nova_model(self):
347344
"""
348345
Test benchmark evaluation with Nova model.
349346
350347
This test uses a Nova fine-tuned model package in us-east-1 region.
351348
Configuration from commented section in benchmark_demo.ipynb.
352349
353-
Note: This test is currently skipped. Remove the @pytest.mark.skip decorator
354-
when you want to enable it.
350+
Note: This test is currently skipped pending us-east-1 test infra migration.
355351
"""
356352
# Get benchmarks
357353
Benchmark = get_benchmarks()
@@ -363,7 +359,6 @@ def test_benchmark_evaluation_nova_model(self):
363359
benchmark=Benchmark.MMLU,
364360
model=NOVA_CONFIG["model_package_arn"],
365361
s3_output_path=NOVA_CONFIG["s3_output_path"],
366-
mlflow_resource_arn=NOVA_CONFIG["mlflow_tracking_server_arn"],
367362
model_package_group=NOVA_CONFIG["model_package_group_arn"],
368363
base_eval_name="integ-test-nova-eval",
369364
region=NOVA_CONFIG["region"],

sagemaker-train/tests/integ/train/test_model_trainer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ def test_hp_contract_basic_sh_script(sagemaker_session):
9696

9797

9898
# skip this test for now as requirements.txt is not resolved
99-
@pytest.mark.skip(reason="MPI distributed training does not resolve requirements.txt on worker nodes")
99+
# @pytest.mark.skip(reason="MPI distributed training does not resolve requirements.txt on worker nodes")
100100
def test_hp_contract_mpi_script(sagemaker_session):
101101
compute = Compute(instance_type="ml.m5.xlarge", instance_count=2)
102102
model_trainer = ModelTrainer(

0 commit comments

Comments
 (0)