Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 69 additions & 0 deletions codex-rs/mock-acp-agent/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -312,6 +312,75 @@ impl acp::Agent for MockAgent {
}
}

// Support interleaved text and tool calls to test for duplicate message bug
// This sends text DURING the tool call, which should trigger the bug where
// the incomplete ExecCell gets flushed to history, creating duplicates.
if std::env::var("MOCK_AGENT_INTERLEAVED_TOOL_CALL").is_ok() {
eprintln!("Mock agent: sending interleaved tool call sequence");

let tool_call_id = acp::ToolCallId("interleaved-tool-001".to_string().into());

// Step 1: Send tool call (begin)
self.send_tool_call(
session_id.clone(),
acp::ToolCall {
id: tool_call_id.clone(),
title: "Executing interleaved command".to_string(),
kind: acp::ToolKind::Execute,
status: acp::ToolCallStatus::Pending,
content: vec![],
locations: vec![],
raw_input: Some(json!({"command": "test"})),
raw_output: None,
meta: None,
},
)
.await?;

// Small delay to ensure the begin event is processed
sleep(Duration::from_millis(50)).await;

// Step 2: Send text DURING the tool call - this triggers the bug!
// When this text arrives, handle_streaming_delta calls flush_active_cell()
// which moves the incomplete ExecCell to history.
self.send_text_chunk(session_id.clone(), "Processing command...")
.await?;

// Small delay
sleep(Duration::from_millis(50)).await;

// Step 3: Send tool call completion
// At this point, the ExecCell is no longer in active_cell, so a new one
// will be created, resulting in duplicate entries.
self.send_tool_call_update(
session_id.clone(),
acp::ToolCallUpdate {
id: tool_call_id.clone(),
fields: acp::ToolCallUpdateFields {
title: None,
kind: None,
status: Some(acp::ToolCallStatus::Completed),
content: Some(vec![acp::ToolCallContent::Content {
content: acp::ContentBlock::Text(acp::TextContent {
text: "Command completed".to_string(),
annotations: None,
meta: None,
}),
}]),
locations: None,
raw_input: None,
raw_output: Some(json!({"exit_code": 0})),
},
meta: None,
},
)
.await?;

// Final text
self.send_text_chunk(session_id.clone(), "Interleaved test done.")
.await?;
}

// Support sending tool calls for testing ACP tool call display
if std::env::var("MOCK_AGENT_SEND_TOOL_CALL").is_ok() {
eprintln!("Mock agent: sending tool call sequence");
Expand Down
92 changes: 92 additions & 0 deletions codex-rs/tui-pty-e2e/tests/acp_tool_calls.rs
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,98 @@ fn test_acp_tool_call_completion_rendered_in_tui() {
insta::assert_snapshot!("acp_tool_call_echo", normalize_for_input_snapshot(contents));
}

/// Test that ACP tool calls do NOT appear twice (once as Running, once as Ran)
///
/// This test verifies that when a tool call completes, there is only ONE entry
/// in the TUI output, not duplicate entries showing both "Running" and "Ran"
/// states. The expected behavior is that the "Running" state should be
/// updated in-place to become "Ran" when the tool call completes.
///
/// ## Bug being tested:
/// When agent text streams while a tool call is active, the incomplete ExecCell
/// gets flushed to history. Then when the tool call completes, a new ExecCell
/// is created, resulting in duplicate entries:
/// 1. "Running ..." (flushed incomplete cell)
/// 2. "Ran ..." (new completed cell)
///
/// This test uses MOCK_AGENT_INTERLEAVED_TOOL_CALL which sends text DURING
/// the tool call to trigger this exact scenario.
#[test]
fn test_acp_tool_call_no_duplicate_messages() {
// Configure mock agent to send interleaved text and tool calls
// This triggers the bug by sending text DURING the tool call execution
let config = SessionConfig::new()
.with_model("mock-model".to_owned())
.with_agent_env("MOCK_AGENT_INTERLEAVED_TOOL_CALL", "1");

let mut session =
TuiSession::spawn_with_config(24, 80, config).expect("Failed to spawn codex in ACP mode");

// Wait for startup
session
.wait_for_text("›", TIMEOUT)
.expect("ACP mode should start");

std::thread::sleep(TIMEOUT_INPUT);

// Send a prompt to trigger the interleaved tool call
session.send_str("Test interleaved").unwrap();
std::thread::sleep(TIMEOUT_INPUT);
session.send_key(Key::Enter).unwrap();

// Wait for the final text which means everything completed
session
.wait_for_text("Interleaved test done", Duration::from_secs(10))
.expect("Should receive completion response");

std::thread::sleep(TIMEOUT_PRESNAPSHOT);

let contents = session.screen_contents();

// Count occurrences of the tool title "Executing interleaved command"
// It should appear exactly ONCE (in the completed "Ran" form)
let tool_title = "Executing interleaved command";
let count = contents.matches(tool_title).count();

assert_eq!(
count, 1,
"Tool call '{}' should appear exactly once, but appeared {} times.\n\
This indicates duplicate messages (both 'Running' and 'Ran' states visible).\n\
Screen contents:\n{}",
tool_title, count, contents
);

// Also verify we see "Ran" (completed state)
assert!(
contents.contains("Ran"),
"Should show completed 'Ran' state. Screen contents:\n{}",
contents
);

// Verify we don't have both "Running" AND "Ran" for this tool call
// (which would indicate duplicates)
let has_running = contents
.lines()
.any(|line| line.contains("Running") && line.contains("Executing interleaved"));
let has_ran = contents
.lines()
.any(|line| line.contains("Ran") && line.contains("Executing interleaved"));

assert!(
!(has_running && has_ran),
"Should NOT have both 'Running' and 'Ran' states for the same tool call.\n\
This indicates duplicate messages.\n\
Screen contents:\n{}",
contents
);

// Snapshot for visual verification
insta::assert_snapshot!(
"acp_tool_call_no_duplicates",
normalize_for_input_snapshot(contents)
);
}

/// Snapshot test for ACP tool call rendering
///
/// This captures the exact visual rendering of an ACP tool call
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
---
source: tui-pty-e2e/tests/acp_tool_calls.rs
assertion_line: 248
expression: normalize_for_input_snapshot(contents)
---
■ Operation 'ListCustomPrompts' is not supported in ACP mode


› Test interleaved


■ Operation 'AddToHistory' is not supported in ACP mode

─ Worked for 0s ────────────────────────────────────────────────────────────────

• Test message 1Test message 2

─ Worked for 0s ────────────────────────────────────────────────────────────────

• Processing command...Interleaved test done.

• Ran 'Executing interleaved command'
└ (no output)


› [DEFAULT_PROMPT]

100% context left · ? for shortcuts
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,6 @@
source: tui-pty-e2e/tests/startup.rs
expression: normalize_for_input_snapshot(contents)
---
■ Operation 'ListCustomPrompts' is not supported in ACP mode


› [DEFAULT_PROMPT]

100% context left · ? for shortcuts
48 changes: 45 additions & 3 deletions codex-rs/tui-pty-e2e/tests/streaming.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,51 @@ fn test_escape_cancels_streaming() {
// Press Escape to cancel doesn't work?
session.send_key(Key::Escape).unwrap();
std::thread::sleep(TIMEOUT_INPUT);

std::thread::sleep(TIMEOUT);
// Verify cancellation completed
// (exact behavior depends on TUI implementation)
session
.wait_for_text(
"Conversation interrupted - tell the model what to do differently",
TIMEOUT,
)
.expect("No interrupt reported");

std::thread::sleep(TIMEOUT);
assert_snapshot!(
"cancelled_stream",
normalize_for_input_snapshot(session.screen_contents())
)
}

#[test]
fn test_ctrl_c_cancels_streaming() {
// Use git_init to prevent "Snapshots disabled" from racing with "Working" status
let config = SessionConfig::new().with_stream_until_cancel();
let mut session = TuiSession::spawn_with_config(24, 80, config).unwrap();

// Wait for the prompt to appear (indicated by the chevron character)
session
.wait_for_text("›", TIMEOUT)
.expect("Prompt did not appear");
std::thread::sleep(TIMEOUT_INPUT);

// Submit prompt
session.send_str("testing!!!").unwrap();
session.wait_for_text("testing!!!", TIMEOUT).unwrap();
std::thread::sleep(TIMEOUT_INPUT);
session.send_key(Key::Enter).unwrap();
std::thread::sleep(TIMEOUT_INPUT);

// Wait for streaming to start
session
.wait_for_text("Working", TIMEOUT)
.expect("Streaming did not start");

// Press ctrl-c to cancel doesn't work?
// session.send_key(Key::Ctrl('c')).unwrap();
// std::thread::sleep(TIMEOUT_INPUT);
session.send_key(Key::Ctrl('c')).unwrap();
std::thread::sleep(TIMEOUT_INPUT);

std::thread::sleep(TIMEOUT);
// Verify cancellation completed
Expand All @@ -74,7 +116,7 @@ fn test_escape_cancels_streaming() {
)
.expect("No interrupt reported");

std::thread::sleep(TIMEOUT_PRESNAPSHOT);
std::thread::sleep(TIMEOUT);
assert_snapshot!(
"cancelled_stream",
normalize_for_input_snapshot(session.screen_contents())
Expand Down
17 changes: 17 additions & 0 deletions codex-rs/tui/docs.md
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,23 @@ Most event types (exec begin/end, MCP calls, elicitation) are queued during acti
- The `InterruptManager` still contains `ExecApproval` and `ApplyPatchApproval` variants for completeness, but these methods are marked `#[allow(dead_code)]`
- `on_task_complete()` calls `flush_interrupt_queue()` for any remaining queued items

**Pending ExecCell Tracking:**

The `PendingExecCellTracker` (`chatwidget/pending_exec_cells.rs`) prevents duplicate ACP tool call messages in the chat history. The problem it solves:

1. Agent makes a tool call (e.g., `shell`) which creates an ExecCell in `active_cell`
2. Agent streams text *during* the tool call execution
3. Streaming text causes `flush_active_cell()`, which would normally push the incomplete ExecCell to history and clear `active_cell`
4. When `ExecCommandEnd` arrives, `handle_exec_end_now()` would create a *new* ExecCell since `active_cell` is empty
5. Result: duplicate entries for the same tool call

The tracker intercepts this by:
- `save_pending()`: Called during flush if the ExecCell has pending (incomplete) call_ids - saves the cell keyed by call_id instead of pushing to history
- `retrieve()`: Called in `handle_exec_end_now()` - retrieves and removes the saved cell, restoring it to `active_cell` for completion
- `drain_failed()`: Called in `on_task_complete()` - marks any uncompleted pending cells as failed and returns them for insertion into history

This follows the same encapsulation pattern as `InterruptManager`: self-contained state in its own module file with typed public methods instead of exposing raw data structures.

**ACP File Tracing:**

- The TUI calls `codex_acp::init_file_tracing()` at startup (`tui/src/lib.rs`) to write `.codex-acp.log` in the current directory. Every mock agent logs `ACP agent spawned (pid: ...)` there, which makes the agent-switching tests in `tui-pty-e2e` deterministic and ensures developers can inspect agent subprocess lifecycles during debugging.
Expand Down
Loading
Loading