@@ -148,7 +148,7 @@ def _get_3p_test_config():
148148 boto_session = boto3 .Session (region_name = _REGION )
149149 account_id = boto_session .client ("sts" ).get_caller_identity ()["Account" ]
150150 return {
151- "base_model" : "openai-reasoning-gpt- oss-20b " ,
151+ "base_model" : "mock- oss-test " ,
152152 "dataset" : os .environ .get (
153153 "MTRL_3P_DATASET" ,
154154 f"s3://sagemaker-rft-{ account_id } /prompts/gsm8k_small/prompts.parquet" ,
@@ -240,21 +240,23 @@ def lambda_agent_arn(test_config):
240240 return _ensure_lambda_exists (test_config ["account_id" ])
241241
242242
243+ @pytest .mark .gpu_intensive
244+ @pytest .mark .serial
243245class TestMTRLEvaluator3PAgentIntegration :
244246 """Integration tests for MultiTurnRLEvaluator with Lambda-based 3P agent."""
245247
246- def test_evaluate_base_model_with_lambda_agent (self , lambda_agent_arn , test_config ):
247- """Test evaluating a base model using a Lambda ARN as agent_config .
248+ def test_evaluate_with_lambda_agent_wait_for_completion (self , lambda_agent_arn , test_config ):
249+ """Test full end-to-end: start evaluation, wait for completion, and verify discoverability .
248250
249- This is the primary 3P integration pattern: customer provides a
250- Lambda function that wraps their agent (LangChain, Strands, etc.)
251- and the evaluator runs rollouts against it .
251+ This test validates the complete lifecycle including wait() using
252+ the standard sagemaker-core pipeline execution path, and verifies
253+ the evaluation is discoverable via get_all() .
252254 """
253255 evaluator = MultiTurnRLEvaluator (
254256 model = test_config ["base_model" ],
255257 dataset = test_config ["dataset" ],
256258 agent_config = lambda_agent_arn ,
257- s3_output_path = f'{ test_config ["s3_output_path" ]} lambda-base-model /' ,
259+ s3_output_path = f'{ test_config ["s3_output_path" ]} lambda-e2e /' ,
258260 mlflow_resource_arn = test_config ["mlflow_resource_arn" ],
259261 role = test_config ["role" ],
260262 region = test_config ["region" ],
@@ -267,9 +269,27 @@ def test_evaluate_base_model_with_lambda_agent(self, lambda_agent_arn, test_conf
267269 assert execution .arn is not None
268270 assert "pipeline" in execution .arn .lower ()
269271 logger .info (f"Started 3P agent base model evaluation: { execution .arn } " )
270- logger .info (f"Status: { execution .status .overall_status } " )
271272
272- @pytest .mark .skip (reason = "Quota limited (1 concurrent eval job) - run manually" )
273+ execution .wait (timeout = EVALUATION_TIMEOUT_SECONDS )
274+ assert execution .status .overall_status in ("Succeeded" , "Failed" , "Stopped" )
275+ logger .info (f"Execution completed: { execution .status .overall_status } " )
276+
277+ if execution .status .overall_status == "Failed" :
278+ logger .error (f"Failure reason: { execution .status .failure_reason } " )
279+
280+ # Verify it's discoverable via get_all
281+ found = False
282+ for ex in MultiTurnRLEvaluator .get_all (region = test_config ["region" ]):
283+ if ex .arn == execution .arn :
284+ found = True
285+ break
286+
287+ assert found , (
288+ f"Evaluation { execution .arn } not found via get_all(). "
289+ "Pipeline tagging may not be working correctly."
290+ )
291+ logger .info (f"Successfully discovered evaluation via get_all: { execution .arn } " )
292+
273293 def test_evaluate_base_model_with_agent_lambda_object (self , lambda_agent_arn , test_config ):
274294 """Test evaluating using a CustomAgentLambda object as agent_config.
275295
@@ -295,83 +315,15 @@ def test_evaluate_base_model_with_agent_lambda_object(self, lambda_agent_arn, te
295315 assert execution .arn is not None
296316 logger .info (f"Started CustomAgentLambda object evaluation: { execution .arn } " )
297317
298- @pytest .mark .skip (reason = "Quota limited (1 concurrent eval job) - run manually" )
299- def test_evaluate_with_lambda_agent_wait_for_completion (self , lambda_agent_arn , test_config ):
300- """Test full end-to-end: start evaluation and wait for completion.
301-
302- This test validates the complete lifecycle including wait() using
303- the standard sagemaker-core pipeline execution path.
304- """
305- evaluator = MultiTurnRLEvaluator (
306- model = test_config ["base_model" ],
307- dataset = test_config ["dataset" ],
308- agent_config = lambda_agent_arn ,
309- s3_output_path = f'{ test_config ["s3_output_path" ]} lambda-e2e/' ,
310- mlflow_resource_arn = test_config ["mlflow_resource_arn" ],
311- role = test_config ["role" ],
312- region = test_config ["region" ],
313- accept_eula = True ,
314- )
315-
316- execution = evaluator .evaluate ()
317- assert execution is not None
318-
319- logger .info (f"Waiting for execution: { execution .arn } " )
320- execution .wait ()
321-
322- assert execution .status .overall_status in ("Succeeded" , "Failed" , "Stopped" )
323- logger .info (f"Execution completed: { execution .status .overall_status } " )
324-
325- if execution .status .overall_status == "Failed" :
326- logger .error (f"Failure reason: { execution .status .failure_reason } " )
327-
328- @pytest .mark .skip (reason = "Quota limited (1 concurrent eval job) - run manually" )
329- def test_evaluate_lambda_agent_discoverable_via_get_all (self , lambda_agent_arn , test_config ):
330- """Test that 3P agent evaluations are discoverable via get_all.
331-
332- Validates that evaluations started with Lambda agents show up in
333- the standard get_all() discovery path (pipeline tagging works).
334- """
335- evaluator = MultiTurnRLEvaluator (
336- model = test_config ["base_model" ],
337- dataset = test_config ["dataset" ],
338- agent_config = lambda_agent_arn ,
339- s3_output_path = f'{ test_config ["s3_output_path" ]} lambda-discovery/' ,
340- mlflow_resource_arn = test_config ["mlflow_resource_arn" ],
341- role = test_config ["role" ],
342- region = test_config ["region" ],
343- accept_eula = True ,
344- )
345-
346- execution = evaluator .evaluate ()
347- assert execution is not None
348- started_arn = execution .arn
349-
350- # Give pipeline time to register
351- time .sleep (10 )
352-
353- # Verify it's discoverable via get_all
354- found = False
355- for ex in MultiTurnRLEvaluator .get_all (region = test_config ["region" ]):
356- if ex .arn == started_arn :
357- found = True
358- break
359-
360- assert found , (
361- f"Evaluation { started_arn } not found via get_all(). "
362- "Pipeline tagging may not be working correctly."
363- )
364- logger .info (f"Successfully discovered evaluation via get_all: { started_arn } " )
365-
318+ execution .wait (timeout = EVALUATION_TIMEOUT_SECONDS )
319+ assert execution .status .overall_status == "Succeeded"
366320
367-
368- @pytest .mark .skip (reason = "Quota limited (1 concurrent eval job) - run manually" )
369321 def test_evaluate_with_attached_trainer (self , lambda_agent_arn , test_config ):
370322 """Test evaluating a fine-tuned model by attaching to an existing training job."""
371323 from sagemaker .train .multi_turn_rl_trainer import MultiTurnRLTrainer
372324
373325 attached_job = MultiTurnRLTrainer .attach (
374- "openai-reasoning-gpt- oss-20b -mtrl-20260602164546 " , session = boto3 .Session (region_name = _REGION )
326+ "mock- oss-test -mtrl-20260615143910 " , session = boto3 .Session (region_name = _REGION )
375327 )
376328
377329 evaluator = MultiTurnRLEvaluator (
@@ -390,3 +342,6 @@ def test_evaluate_with_attached_trainer(self, lambda_agent_arn, test_config):
390342 assert execution is not None
391343 assert execution .arn is not None
392344 logger .info (f"Started attached trainer evaluation: { execution .arn } " )
345+
346+ execution .wait (timeout = EVALUATION_TIMEOUT_SECONDS )
347+ assert execution .status .overall_status == "Succeeded"
0 commit comments