rsareddy0329
diff --git a/‎sagemaker-mlops/tests/integ/test_feature_store.py‎
Lines changed: 20 additions & 4 deletions b/‎sagemaker-mlops/tests/integ/test_feature_store.py‎
Lines changed: 20 additions & 4 deletions
diff --git a/‎sagemaker-mlops/tests/integ/test_feature_store_lakeformation.py‎
Lines changed: 31 additions & 5 deletions b/‎sagemaker-mlops/tests/integ/test_feature_store_lakeformation.py‎
Lines changed: 31 additions & 5 deletions
diff --git a/‎sagemaker-serve/src/sagemaker/serve/bedrock_model_builder.py‎
Lines changed: 11 additions & 13 deletions b/‎sagemaker-serve/src/sagemaker/serve/bedrock_model_builder.py‎
Lines changed: 11 additions & 13 deletions
diff --git a/‎sagemaker-serve/src/sagemaker/serve/model_builder.py‎
Lines changed: 3 additions & 0 deletions b/‎sagemaker-serve/src/sagemaker/serve/model_builder.py‎
Lines changed: 3 additions & 0 deletions
@@ -155,10 +155,26 @@ def test_delete_feature_group(feature_group_name, sample_dataframe, bucket, role
     fg.wait_for_status("Created")
 
     fg.delete()
-    time.sleep(2)
-    
-    with pytest.raises(Exception):
-        FeatureGroup.get(feature_group_name=feature_group_name)
+
+    # FeatureGroup deletion is asynchronous: after delete() returns the group
+    # stays in "Deleting" status and is still describable for a while, so a
+    # fixed short sleep + single get() is racy. Poll until get() raises (the
+    # group is fully gone) or we hit the timeout.
+    deadline = time.time() + 120
+    last_exc = None
+    while time.time() < deadline:
+        try:
+            FeatureGroup.get(feature_group_name=feature_group_name)
+        except Exception as e:  # noqa: BLE001 - any error means it's no longer retrievable
+            last_exc = e
+            break
+        time.sleep(5)
+    else:
+        pytest.fail(
+            f"FeatureGroup {feature_group_name} was still retrievable 120s after delete()"
+        )
+
+    assert last_exc is not None
 
 
 # Test 7: Ingest to both OnlineStore and OfflineStore
 
@@ -160,7 +160,12 @@ def test_create_feature_group_and_enable_lake_formation(s3_uri, role, region):
         assert fg.feature_group_status == "Created"
 
         # Enable Lake Formation governance
-        result = fg.enable_lake_formation(hybrid_access_mode_enabled=False, acknowledge_risk=True)
+        result = fg.enable_lake_formation(
+            hybrid_access_mode_enabled=False,
+            acknowledge_risk=True,
+            use_service_linked_role=False,
+            registration_role_arn=role,
+        )
 
         # Verify all phases completed successfully
         assert result["s3_location_registered"] is True
@@ -198,6 +203,8 @@ def test_create_feature_group_with_lake_formation_enabled(s3_uri, role, region):
             enabled=True,
             hybrid_access_mode_enabled = False,
             acknowledge_risk=True,
+            use_service_linked_role=False,
+            registration_role_arn=role,
         )
 
         fg = FeatureGroupManager.create(
@@ -467,8 +474,17 @@ def test_enable_lake_formation_fails_with_nonexistent_role(
     # Verify we got an appropriate error
     error_msg = str(exc_info.value)
     print(exc_info)
-    # Should mention role-related issues (not found, invalid, access denied, etc.)
-    assert "EntityNotFoundException" in error_msg
+    # The registration must fail because the role is not usable. Depending on
+    # how the build/execution role's iam:PassRole policy is scoped, this surfaces
+    # either as Lake Formation rejecting the unknown role (EntityNotFoundException)
+    # or as IAM denying PassRole before the call reaches Lake Formation
+    # (AccessDeniedException on iam:PassRole). Both are valid "nonexistent / not
+    # usable role" outcomes for this negative test.
+    assert (
+        "EntityNotFoundException" in error_msg
+        or "AccessDeniedException" in error_msg
+        or "iam:PassRole" in error_msg
+    ), f"Unexpected error for nonexistent role registration: {error_msg}"
 
 
 # ============================================================================
@@ -503,7 +519,12 @@ def test_enable_lake_formation_full_flow_with_policy_output(s3_uri, role, region
 
         # Enable Lake Formation governance
         with caplog.at_level(logging.WARNING, logger="sagemaker.mlops.feature_store.feature_group_manager"):
-            result = fg.enable_lake_formation(hybrid_access_mode_enabled=False, acknowledge_risk=True)
+            result = fg.enable_lake_formation(
+                hybrid_access_mode_enabled=False,
+                acknowledge_risk=True,
+                use_service_linked_role=False,
+                registration_role_arn=role,
+            )
 
         # Verify all phases completed successfully
         assert result["s3_location_registered"] is True
@@ -546,7 +567,12 @@ def test_enable_lake_formation_default_logs_recommended_policy(s3_uri, role, reg
 
         # Enable Lake Formation governance with hybrid_access_mode_enabled=False
         with caplog.at_level(logging.WARNING, logger="sagemaker.mlops.feature_store.feature_group_manager"):
-            result = fg.enable_lake_formation(hybrid_access_mode_enabled=False, acknowledge_risk=True)
+            result = fg.enable_lake_formation(
+                hybrid_access_mode_enabled=False,
+                acknowledge_risk=True,
+                use_service_linked_role=False,
+                registration_role_arn=role,
+            )
 
         # Verify phases completed successfully
         assert result["s3_location_registered"] is True
 
@@ -645,10 +645,9 @@ def _get_checkpoint_uri_from_manifest(self) -> Optional[str]:
         """Get checkpoint URI from manifest.json for Nova models.
 
         Steps:
-        1. Fetch S3 model artifacts from training job
-        2. Construct path to manifest.json in the output directory
-        3. Read and parse manifest.json
-        4. Return checkpoint_s3_bucket value
+        1. Build the manifest.json path from the training job output_data_config
+        2. Read and parse manifest.json
+        3. Return checkpoint_s3_bucket value
 
         Returns:
             Checkpoint URI from manifest.json.
@@ -660,16 +659,15 @@ def _get_checkpoint_uri_from_manifest(self) -> Optional[str]:
         if not isinstance(self.model, TrainingJob):
             raise ValueError("Model must be a TrainingJob instance for Nova models")
 
-        s3_artifacts = self.model.model_artifacts.s3_model_artifacts
-        if not s3_artifacts:
-            raise ValueError("No S3 model artifacts found in training job")
+        # Nova serverless training jobs have no model_artifacts; the manifest
+        # lives under the job's output_data_config path.
+        output_data_config = getattr(self.model, "output_data_config", None)
+        s3_output_path = getattr(output_data_config, "s3_output_path", None)
+        if not s3_output_path:
+            raise ValueError("No S3 output path found in training job output_data_config")
 
-        logger.info("S3 artifacts path: %s", s3_artifacts)
-
-        # Construct manifest path
-        # s3://bucket/path/output/model.tar.gz -> s3://bucket/path/output/output/manifest.json
-        parts = s3_artifacts.rstrip("/").rsplit("/", 1)
-        manifest_path = parts[0] + "/output/manifest.json"
+        output_path = s3_output_path.rstrip("/")
+        manifest_path = f"{output_path}/{self.model.training_job_name}/output/output/manifest.json"
 
         logger.info("Manifest path: %s", manifest_path)
 
 
@@ -4699,6 +4699,9 @@ def _resolve_nova_escrow_uri(self) -> str:
             training_job = self.model
         elif isinstance(self.model, ModelTrainer):
             training_job = self.model._latest_training_job
+        elif isinstance(self.model, BaseTrainer) and hasattr(self.model, "_latest_training_job"):
+            # SFTTrainer / RLVRTrainer / DPOTrainer expose the job via _latest_training_job.
+            training_job = self.model._latest_training_job
         else:
             raise ValueError("Nova escrow URI resolution requires a TrainingJob or ModelTrainer")