Merge branch 'master' into context_length

guanweim · web-flow · commit 57e1de7d623f · 2026-06-01T10:21:28.000-07:00
diff --git a/sagemaker-core/src/sagemaker/core/remote_function/job.py b/sagemaker-core/src/sagemaker/core/remote_function/job.py
@@ -860,6 +860,10 @@ def _get_default_spark_image(session):
         except ImportError:
             pass
 
+        # Spark 3.3 and below do not support py312; use 3.5 which supports both py39 and py312
+        if py_version == "312" and spark_version in ("2.4", "3.0", "3.1", "3.2", "3.3"):
+            spark_version = "3.5"
+
         image_uri = image_uris.retrieve(
             framework=SPARK_NAME,
             region=region,
diff --git a/sagemaker-core/tests/integ/remote_function/conftest.py b/sagemaker-core/tests/integ/remote_function/conftest.py
@@ -171,6 +171,52 @@ def spark_test_container(sagemaker_session, sagemaker_sdk_tar_path, tmp_path_fac
     )
 
 
+@pytest.fixture(scope="session")
+def spark_pre_execution_commands(sagemaker_session):
+    """Build sagemaker-core wheel, upload to S3, and return pre-execution install commands.
+
+    This mirrors the pattern used in sagemaker-mlops feature_processor integ tests.
+    The Spark processing image does not have sagemaker-core pre-installed, so we must
+    build the local dev wheel and install it in the container via pre_execution_commands.
+
+    Returns:
+        list[str]: commands that install awscli, copy the wheel from S3, and pip-install it.
+    """
+    import glob
+    import subprocess
+    import sys
+    import tempfile
+    from sagemaker.core.s3 import S3Uploader
+
+    repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "..", ".."))
+    core_dir = os.path.join(repo_root, "sagemaker-core")
+
+    with tempfile.TemporaryDirectory() as dist_dir:
+        # Argument list instead of a shell string: dist_dir needs no quoting, and
+        # sys.executable builds with the interpreter that is running pytest.
+        subprocess.run(
+            [sys.executable, "-m", "build", "--wheel", "--outdir", dist_dir],
+            cwd=core_dir,
+            check=True,
+        )
+        wheels = glob.glob(os.path.join(dist_dir, "sagemaker_core-*.whl"))
+        if not wheels:
+            raise FileNotFoundError(f"No sagemaker-core wheel found in {dist_dir}")
+        wheel_path = wheels[0]
+        wheel_name = os.path.basename(wheel_path)
+
+        s3_prefix = f"s3://{sagemaker_session.default_bucket()}/spark-integ-test/wheels"
+        S3Uploader.upload(wheel_path, s3_prefix, sagemaker_session=sagemaker_session)
+
+    pip_install = "python3 -m pip install --root-user-action=ignore"
+    aws_cli = "python3 -m awscli"
+    return [
+        f"{pip_install} awscli",
+        f"{aws_cli} s3 cp {s3_prefix}/{wheel_name} /tmp/{wheel_name}",
+        f"{pip_install} /tmp/{wheel_name}",
+    ]
+
+
 @pytest.fixture(scope="session")
 def conda_env_yml():
     """Write conda yml file needed for tests."""
diff --git a/sagemaker-core/tests/integ/remote_function/test_decorator.py b/sagemaker-core/tests/integ/remote_function/test_decorator.py
@@ -574,16 +574,18 @@ def my_func():
     assert client_error_message in str(error)
 
 
-@pytest.mark.skipif(
-    sys.version_info[:2] not in [(3, 9), (3, 12)],
-    reason="SageMaker Spark image only available for Python 3.9 and 3.12",
-)
-def test_decorator_with_spark_job(sagemaker_session, cpu_instance_type):
+# @pytest.mark.skipif(
+#     sys.version_info[:2] not in [(3, 9), (3, 12)],
+#     reason="SageMaker Spark image only available for Python 3.9 and 3.12",
+# )
+@pytest.mark.spark_py312
+def test_decorator_with_spark_job(sagemaker_session, cpu_instance_type, spark_pre_execution_commands):
     @remote(
         role=ROLE,
         instance_type=cpu_instance_type,
         sagemaker_session=sagemaker_session,
         keep_alive_period_in_seconds=60,
+        pre_execution_commands=spark_pre_execution_commands,
         spark_config=SparkConfig(
             configuration=[
                 {
@@ -598,7 +600,14 @@ def test_spark_transform():
 
         spark = SparkSession.builder.getOrCreate()
 
-        assert spark.conf.get("spark.app.name") == "remote-spark-test"
+        # Avoid bare assert here: pytest's assertion rewriting injects _pytest
+        # module references into the function bytecode, which causes
+        # deserialization to fail in the Spark container (no pytest installed).
+        app_name = spark.conf.get("spark.app.name")
+        if app_name != "remote-spark-test":
+            raise RuntimeError(
+                f"Expected spark.app.name='remote-spark-test', got '{app_name}'"
+            )
 
     test_spark_transform()
 
diff --git a/sagemaker-mlops/tests/integ/feature_store/feature_processor/test_feature_processor_integ.py b/sagemaker-mlops/tests/integ/feature_store/feature_processor/test_feature_processor_integ.py
@@ -798,11 +798,11 @@ def transform(raw_s3_data_as_df):
 #     sys.version_info[:2] not in [(3, 9), (3, 12)],
 #     reason=f"SageMaker Spark image only supports Python 3.9 and 3.12, got {sys.version_info[:2]}",
 # )
-@pytest.mark.skip(
-    reason="Lake Formation credential vending (GetTemporaryGlueTableCredentials) requires "
-    "full LF environment setup (resource registration, trust policy, data location grants) "
-    "that is not configured in CI. See quip-amazon.com/S3FEAMMMuKm0 for details."
-)
+# @pytest.mark.skip(
+#     reason="Lake Formation credential vending (GetTemporaryGlueTableCredentials) requires "
+#     "full LF environment setup (resource registration, trust policy, data location grants) "
+#     "that is not configured in CI. See quip-amazon.com/S3FEAMMMuKm0 for details."
+# )
 @pytest.mark.spark_py312
 @pytest.mark.slow_test
 def test_to_pipeline_and_execute_with_lake_formation(
diff --git a/sagemaker-train/tests/integ/ai_registry/test_dataset.py b/sagemaker-train/tests/integ/ai_registry/test_dataset.py
@@ -79,8 +79,9 @@ def test_create_dataset_from_s3_oss_dpo(self, unique_name, test_bucket, cleanup_
         assert dataset.name == unique_name
         assert dataset.customization_technique == CustomizationTechnique.DPO
 
+    @pytest.mark.us_east_1
     def test_create_dataset_from_s3_nova_sft(self, unique_name, test_bucket, cleanup_list):
-        """Test creating RLVR dataset from S3 URI."""
+        """Test creating Nova SFT dataset from S3 URI."""
         s3_uri = f"s3://{test_bucket}/test_datasets/Nova/nova_sft_train.jsonl"
         dataset = DataSet.create(
             name=unique_name,
@@ -92,8 +93,9 @@ def test_create_dataset_from_s3_nova_sft(self, unique_name, test_bucket, cleanup
         assert dataset.name == unique_name
         assert dataset.customization_technique == CustomizationTechnique.SFT
 
+    @pytest.mark.us_east_1
     def test_create_dataset_from_s3_nova_dpo(self, unique_name, test_bucket, cleanup_list):
-        """Test creating RLVR dataset from S3 URI."""
+        """Test creating Nova DPO dataset from S3 URI."""
         s3_uri = f"s3://{test_bucket}/test_datasets/Nova/nova_dpo_train.jsonl"
         dataset = DataSet.create(
             name=unique_name,
@@ -105,8 +107,9 @@ def test_create_dataset_from_s3_nova_dpo(self, unique_name, test_bucket, cleanup
         assert dataset.name == unique_name
         assert dataset.customization_technique == CustomizationTechnique.DPO
 
+    @pytest.mark.us_east_1
     def test_create_dataset_from_s3_nova_rft(self, unique_name, test_bucket, cleanup_list):
-        """Test creating RLVR dataset from S3 URI."""
+        """Test creating Nova RFT dataset from S3 URI."""
         s3_uri = f"s3://{test_bucket}/test_datasets/Nova/nova_rft_train.jsonl"
         dataset = DataSet.create(
             name=unique_name,
@@ -118,8 +121,9 @@ def test_create_dataset_from_s3_nova_rft(self, unique_name, test_bucket, cleanup
         assert dataset.name == unique_name
         assert dataset.customization_technique == CustomizationTechnique.RLVR
 
+    @pytest.mark.us_east_1
     def test_create_dataset_from_s3_nova_eval(self, unique_name, test_bucket, cleanup_list):
-        """Test creating RLVR dataset from S3 URI."""
+        """Test creating Nova eval dataset from S3 URI."""
         s3_uri = f"s3://{test_bucket}/test_datasets/Nova/nova_eval.jsonl"
         dataset = DataSet.create(
             name=unique_name,
diff --git a/sagemaker-train/tests/integ/train/test_benchmark_evaluator.py b/sagemaker-train/tests/integ/train/test_benchmark_evaluator.py
@@ -61,12 +61,11 @@
     "region": "us-west-2",
 }
 
-# Nova model evaluation configuration (uses our own test account in us-east-1)
+# Nova model evaluation configuration (uses dedicated test account in us-east-1)
 NOVA_CONFIG = {
-    "model_package_arn": "arn:aws:sagemaker:us-east-1:729646638167:model-package/sdk-test-finetuned-models/65",
-    "dataset_s3_uri": "s3://sagemaker-us-east-1-729646638167/model-customization/eval/zc_test.jsonl",
-    "s3_output_path": "s3://sagemaker-us-east-1-729646638167/model-customization/eval/",
-    "model_package_group_arn": "arn:aws:sagemaker:us-east-1:729646638167:model-package-group/sdk-test-finetuned-models",
+    "dataset_s3_uri": "s3://sagemaker-us-east-1-784379639078/model-customization/eval/zc_test.jsonl",
+    "s3_output_path": "s3://sagemaker-us-east-1-784379639078/model-customization/eval/",
+    "model_package_group_arn": "arn:aws:sagemaker:us-east-1:784379639078:model-package-group/sdk-test-finetuned-models",
     "region": "us-east-1",
 }
 
@@ -339,25 +338,44 @@ def test_benchmark_evaluation_base_model_only(self):
         assert execution.status.overall_status == "Succeeded"
         logger.info("Base model only evaluation completed successfully")
 
-    @pytest.mark.skip(reason="Pending us-east-1 test infrastructure migration to dedicated test account")
+    @pytest.mark.gpu_intensive
+    @pytest.mark.us_east_1
     def test_benchmark_evaluation_nova_model(self):
         """
         Test benchmark evaluation with Nova model.
         
         This test uses a Nova fine-tuned model package in us-east-1 region.
         Configuration from commented section in benchmark_demo.ipynb.
         
-        Note: This test is currently skipped pending us-east-1 test infra migration.
+        Note: This test requires a model package to exist in the model package group.
+        It should be run after a successful SFT or RLVR training job has produced one.
         """
+        import boto3
+        
         # Get benchmarks
         Benchmark = get_benchmarks()
         
+        # Dynamically find the latest model package in the group
+        sm_client = boto3.client("sagemaker", region_name=NOVA_CONFIG["region"])
+        packages = sm_client.list_model_packages(
+            ModelPackageGroupName="sdk-test-finetuned-models",
+            SortBy="CreationTime",
+            SortOrder="Descending",
+            MaxResults=1,
+        )
+        
+        if not packages["ModelPackageSummaryList"]:
+            pytest.skip("No model packages available in sdk-test-finetuned-models group. Run SFT/RLVR training first.")
+        
+        model_package_arn = packages["ModelPackageSummaryList"][0]["ModelPackageArn"]
+        logger.info(f"Using model package: {model_package_arn}")
+        
         logger.info("Creating BenchmarkEvaluator with Nova model")
         
         # Create evaluator with Nova model package
         evaluator = BenchMarkEvaluator(
             benchmark=Benchmark.MMLU,
-            model=NOVA_CONFIG["model_package_arn"],
+            model=model_package_arn,
             s3_output_path=NOVA_CONFIG["s3_output_path"],
             model_package_group=NOVA_CONFIG["model_package_group_arn"],
             base_eval_name="integ-test-nova-eval",
@@ -367,7 +385,7 @@ def test_benchmark_evaluation_nova_model(self):
         # Verify evaluator was created
         assert evaluator is not None
         assert evaluator.benchmark == Benchmark.MMLU
-        assert evaluator.model == NOVA_CONFIG["model_package_arn"]
+        assert evaluator.model == model_package_arn
         assert evaluator.region == NOVA_CONFIG["region"]
         
         logger.info(f"Created evaluator: {evaluator.base_eval_name}")
@@ -397,8 +415,8 @@ def test_benchmark_evaluation_nova_model(self):
         logger.info(f"Status after refresh: {execution.status.overall_status}")
         
         # Wait for completion
-        logger.info("Waiting for evaluation to complete (timeout: 1 hour)")
-        execution.wait(target_status="Succeeded", poll=30, timeout=3600)
+        logger.info("Waiting for evaluation to complete (timeout: 3 hours)")
+        execution.wait(target_status="Succeeded", poll=30, timeout=10800)
         
         # Verify completion
         assert execution.status.overall_status == "Succeeded"
diff --git a/sagemaker-train/tests/integ/train/test_custom_scorer_evaluator.py b/sagemaker-train/tests/integ/train/test_custom_scorer_evaluator.py
@@ -54,6 +54,16 @@
     "region": "us-west-2",
 }
 
+# Base model only evaluation configuration (uses JumpStart model ID directly, no model package)
+BASE_MODEL_ONLY_CONFIG = {
+    "base_model_id": "meta-textgeneration-llama-3-2-1b-instruct",  # passed as model=
+    "evaluator_arn": "arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/JsonDoc/eval-lambda-test/0.0.1",
+    "dataset_s3_uri": "s3://sagemaker-us-west-2-729646638167/model-customization/eval/zc_test.jsonl",
+    "s3_output_path": "s3://sagemaker-us-west-2-729646638167/model-customization/eval/",
+    "mlflow_tracking_server_arn": "arn:aws:sagemaker:us-west-2:729646638167:mlflow-app/app-W7FOBBXZANVX",
+    "region": "us-west-2",  # used when re-fetching the execution by ARN
+}
+
 
 # @pytest.mark.skip(reason="Temporarily skipped - moved from tests/integ/sagemaker/modules/evaluate/")
 @pytest.mark.xdist_group("custom_scorer_evaluator")
@@ -288,13 +298,125 @@ def test_custom_scorer_with_builtin_metric(self):
         logger.info("Built-in metric evaluation completed successfully")
 
     # @pytest.mark.skip(reason="Base model only evaluation - not working yet per notebook")
+    @pytest.mark.gpu_intensive
     def test_custom_scorer_base_model_only(self):
         """
         Test custom scorer evaluation with base model only (no fine-tuned model).
         
-        Note: Per the notebook, "Evaluation with Base Model Only is yet to be 
-        implemented/tested - Not Working currently". This test is skipped until
-        that functionality is available.
+        This test uses a JumpStart model ID directly instead of a model package ARN,
+        which triggers the CUSTOM_SCORER_TEMPLATE_BASE_MODEL_ONLY template path.
+        The evaluation runs against only the base model without any fine-tuned weights.
+        
+        This test covers:
+        1. Creating CustomScorerEvaluator with a JumpStart model ID (base model only)
+        2. Accessing hyperparameters
+        3. Starting evaluation
+        4. Monitoring execution
+        5. Waiting for completion
+        6. Viewing results
+        7. Retrieving execution by ARN
         """
-        logger.info("Base model only evaluation - not yet implemented")
-        pass
+        # Step 1: Create CustomScorerEvaluator with JumpStart model ID
+        logger.info("Creating CustomScorerEvaluator with base model only (JumpStart model ID)")
+        
+        evaluator = CustomScorerEvaluator(
+            evaluator=BASE_MODEL_ONLY_CONFIG["evaluator_arn"],
+            dataset=BASE_MODEL_ONLY_CONFIG["dataset_s3_uri"],
+            model=BASE_MODEL_ONLY_CONFIG["base_model_id"],
+            s3_output_path=BASE_MODEL_ONLY_CONFIG["s3_output_path"],
+            evaluate_base_model=False,
+        )
+        
+        # Verify evaluator was created with base model ID
+        assert evaluator is not None
+        assert evaluator.evaluator == BASE_MODEL_ONLY_CONFIG["evaluator_arn"]
+        assert evaluator.model == BASE_MODEL_ONLY_CONFIG["base_model_id"]
+        assert evaluator.dataset == BASE_MODEL_ONLY_CONFIG["dataset_s3_uri"]
+        
+        logger.info(f"Created evaluator with base model: {BASE_MODEL_ONLY_CONFIG['base_model_id']}")
+        
+        # Step 2: Access hyperparameters
+        logger.info("Accessing hyperparameters")
+        hyperparams = evaluator.hyperparameters.to_dict()
+        
+        # Verify hyperparameters structure
+        assert isinstance(hyperparams, dict)
+        assert "max_new_tokens" in hyperparams
+        assert "temperature" in hyperparams
+        
+        logger.info(f"Hyperparameters: {hyperparams}")
+        
+        # Step 3: Start evaluation
+        logger.info("Starting evaluation execution")
+        execution = evaluator.evaluate()
+        
+        # Verify execution was created
+        assert execution is not None
+        assert execution.arn is not None
+        assert execution.name is not None
+        assert execution.eval_type is not None
+        
+        logger.info(f"Pipeline Execution ARN: {execution.arn}")
+        logger.info(f"Initial Status: {execution.status.overall_status}")
+        
+        # Step 4: Monitor execution
+        logger.info("Refreshing execution status")
+        execution.refresh()
+        
+        # Verify status was updated
+        assert execution.status.overall_status is not None
+        
+        # Log step details if available
+        if execution.status.step_details:
+            logger.info("Step Details:")
+            for step in execution.status.step_details:
+                logger.info(f"  {step.name}: {step.status}")
+        
+        # Step 5: Wait for completion
+        logger.info(f"Waiting for evaluation to complete (timeout: {EVALUATION_TIMEOUT_SECONDS}s / {EVALUATION_TIMEOUT_SECONDS//3600}h)")
+        
+        try:
+            execution.wait(target_status="Succeeded", poll=30, timeout=EVALUATION_TIMEOUT_SECONDS)
+            logger.info(f"Final Status: {execution.status.overall_status}")
+            
+            # Verify completion
+            assert execution.status.overall_status == "Succeeded"
+            
+            # Step 6: View results
+            logger.info("Displaying results")
+            execution.show_results()
+            
+            # Verify S3 output path is set
+            assert execution.s3_output_path is not None
+            logger.info(f"Results stored at: {execution.s3_output_path}")
+            
+        except Exception as e:
+            logger.error(f"Evaluation failed or timed out: {e}")
+            logger.error(f"Final status: {execution.status.overall_status}")
+            if execution.status.failure_reason:
+                logger.error(f"Failure reason: {execution.status.failure_reason}")
+            
+            # Log step failures
+            if execution.status.step_details:
+                for step in execution.status.step_details:
+                    if "failed" in step.status.lower():
+                        logger.error(f"Failed step: {step.name}")
+                        if step.failure_reason:
+                            logger.error(f"  Reason: {step.failure_reason}")
+            
+            # Re-raise to fail the test
+            raise
+        
+        # Step 7: Retrieve execution by ARN
+        logger.info("Retrieving execution by ARN")
+        retrieved_execution = EvaluationPipelineExecution.get(
+            arn=execution.arn,
+            region=BASE_MODEL_ONLY_CONFIG["region"]
+        )
+        
+        # Verify retrieved execution matches
+        assert retrieved_execution.arn == execution.arn
+        assert retrieved_execution.status.overall_status == "Succeeded"
+        
+        logger.info(f"Retrieved execution status: {retrieved_execution.status.overall_status}")
+        logger.info("Base model only evaluation completed successfully")
diff --git a/sagemaker-train/tests/integ/train/test_rlvr_trainer_integration.py b/sagemaker-train/tests/integ/train/test_rlvr_trainer_integration.py
diff --git a/sagemaker-train/tests/integ/train/test_sft_trainer_integration.py b/sagemaker-train/tests/integ/train/test_sft_trainer_integration.py
diff --git a/sagemaker-train/tox.ini b/sagemaker-train/tox.ini