@@ -1407,3 +1407,178 @@ def test_goal_verification_no_extra_context_when_all_verified(
14071407
14081408 # No step summary noise when everything is fully verified
14091409 assert "STEP VERIFICATION SUMMARY" not in goal_text_arg
1410+
1411+
1412+ # ---------------------------------------------------------------------------
1413+ # Integration tests: agent plan-progress suppression under external control
1414+ # ---------------------------------------------------------------------------
1415+
1416+
class TestAgentPlanProgressSuppression:
    """Tests that the agent suppresses its own stale plan-progress injection
    when ``_external_step_control`` is True (set by DemoController).

    This addresses the drift issue where the agent and controller could
    show conflicting step progress to the Claude model.
    """

    @staticmethod
    def _make_bare_agent(external_step_control):
        """Build a ClaudeComputerUseAgent without running ``__init__``.

        Only the attributes these tests populate are set; they are
        presumably the minimal state ``_build_initial_messages`` reads
        (verify against the agent implementation if that method changes).

        Args:
            external_step_control: Value stored in
                ``agent._external_step_control``.

        Returns:
            The partially initialised agent instance.
        """
        # Imported lazily so that collecting this module does not require
        # the agent module to be importable.
        from openadapt_evals.agents.claude_computer_use_agent import (
            ClaudeComputerUseAgent,
        )

        # __new__ bypasses __init__ and whatever setup it performs.
        agent = ClaudeComputerUseAgent.__new__(ClaudeComputerUseAgent)
        agent._plan_steps = [
            {"step_num": 1, "text": "Create sheet", "status": "in_progress"},
            {"step_num": 2, "text": "Type headers", "status": "pending"},
        ]
        agent._goal = "Test goal"
        agent._trajectory = []
        agent._step_count = 1
        agent.demo = "demo text"
        agent._external_step_control = external_step_control
        return agent

    @staticmethod
    def _make_mock_adapter():
        """Return a MagicMock adapter with canned reset/step/evaluate results.

        ``step`` always reports ``done=False`` so that the controller, not
        the environment, decides when the run ends.
        """
        adapter = MagicMock()
        adapter.reset.return_value = _make_obs()
        adapter.step.return_value = (_make_obs(), False, {})
        adapter.evaluate.return_value = BenchmarkResult(
            task_id="test-task-001", success=True, score=1.0
        )
        return adapter

    def test_agent_does_not_inject_stale_progress_under_external_control(self):
        """When _external_step_control=True, the agent should NOT inject
        plan progress text from its own (stale) _plan_steps into messages.

        The controller provides its own step-aware prompt via the augmented
        task instruction.
        """
        agent = self._make_bare_agent(external_step_control=True)

        # Call the first message builder
        messages = agent._build_initial_messages(
            instruction="Controller says: do step 3",
            screenshot_b64="fake_b64",
        )

        # The text should NOT contain plan progress from the agent's stale state
        msg_text = messages[0]["content"][0]["text"]
        assert "PLAN PROGRESS" not in msg_text
        assert "Create sheet" not in msg_text
        # It should contain the controller's instruction directly
        assert "Controller says: do step 3" in msg_text

    def test_agent_injects_progress_without_external_control(self):
        """When _external_step_control=False (default), the agent should
        inject plan progress normally.
        """
        agent = self._make_bare_agent(external_step_control=False)

        messages = agent._build_initial_messages(
            instruction="Do the task",
            screenshot_b64="fake_b64",
        )

        msg_text = messages[0]["content"][0]["text"]
        # Either marker is accepted as evidence that plan progress was added.
        assert "PLAN PROGRESS" in msg_text or "structured plan" in msg_text
        assert "Create sheet" in msg_text

    @patch("openadapt_evals.demo_controller.verify_goal_completion")
    @patch("openadapt_evals.demo_controller.verify_step")
    def test_controller_sets_external_control_preventing_stale_progress(
        self, mock_verify_step, mock_verify_goal
    ):
        """End-to-end: DemoController sets _external_step_control on agent,
        which prevents the agent from injecting its own stale plan progress.

        This is the integration test that verifies the components work
        together: controller init -> flag set -> controller-built prompt is
        what reaches ``agent.act``.

        Args:
            mock_verify_step: Patched ``verify_step`` (innermost decorator,
                so it is passed first).
            mock_verify_goal: Patched ``verify_goal_completion``.
        """
        mock_agent = MagicMock()
        # Start explicitly False so the assertion below proves the
        # controller flipped the flag, rather than matching a MagicMock
        # auto-attribute.
        mock_agent._external_step_control = False
        mock_agent.act.return_value = _make_click_action()
        mock_adapter = self._make_mock_adapter()

        controller = DemoController(
            agent=mock_agent,
            adapter=mock_adapter,
            demo_text=SAMPLE_DEMO,
        )

        # Verify the flag was set
        assert mock_agent._external_step_control is True

        mock_verify_step.return_value = _make_verified()
        mock_verify_goal.return_value = _make_goal_verified()

        task = _make_task()
        controller.execute(task, max_steps=30)

        # Verify that the augmented task passed to agent.act() contains
        # the controller's step prompt, not the agent's stale progress
        assert mock_agent.act.call_count >= 3
        for act_call in mock_agent.act.call_args_list:
            augmented_task = act_call.args[1]  # second positional arg is task
            # The controller's prompt contains these markers
            assert "GOAL:" in augmented_task.instruction
            assert "YOUR CURRENT TASK:" in augmented_task.instruction

    @patch("openadapt_evals.demo_controller.verify_goal_completion")
    @patch("openadapt_evals.demo_controller.verify_step")
    def test_done_override_handled_by_controller_not_agent(
        self, mock_verify_step, mock_verify_goal
    ):
        """When the agent returns 'done' prematurely, the CONTROLLER should
        handle the override (not the agent's internal done-override logic).

        With _external_step_control=True, the agent's done-override should
        be skipped, allowing the controller to manage it. The agent here is
        a MagicMock, so only the controller side of that contract is
        exercised by this test.

        Args:
            mock_verify_step: Patched ``verify_step`` (innermost decorator,
                so it is passed first).
            mock_verify_goal: Patched ``verify_goal_completion``.
        """
        mock_agent = MagicMock()
        # Start explicitly False so the assertion below proves the
        # controller flipped the flag.
        mock_agent._external_step_control = False
        # Agent says done on first call, then gives click actions
        mock_agent.act.side_effect = [
            _make_done_action(),  # Step 1: agent says done prematurely
            _make_click_action(),  # Step 2 (after controller override)
            _make_click_action(),  # Step 3
        ]
        mock_adapter = self._make_mock_adapter()

        controller = DemoController(
            agent=mock_agent,
            adapter=mock_adapter,
            demo_text=SAMPLE_DEMO,
        )

        # The controller should have set the flag
        assert mock_agent._external_step_control is True

        mock_verify_step.side_effect = [
            _make_verified(),  # Step 2
            _make_verified(),  # Step 3
        ]
        mock_verify_goal.return_value = _make_goal_verified()

        task = _make_task()
        # The return value is not inspected; the assertions below look at
        # the controller's plan state instead.
        controller.execute(task, max_steps=30)

        # Step 1 was force-marked done by the controller's override
        assert controller.plan_state.steps[0].status == "done"
        # Steps 2 and 3 completed normally
        assert controller.plan_state.steps[1].status == "done"
        assert controller.plan_state.steps[2].status == "done"
0 commit comments