Skip to content

Commit c6c608e

Browse files
CSResselclaudenori-agent
authored
fix(tui): Fix duplicate tool execution messages in ACP (#88)
When agent text streams during an ACP tool call execution, the incomplete ExecCell was being flushed to history. When the tool call completed, a new ExecCell was created, resulting in duplicate entries showing both "Running" and "Ran" states for the same command. This fix tracks incomplete ExecCells that get flushed during streaming in a pending map keyed by call_id. When ExecCommandEnd arrives, the pending cell is retrieved and completed instead of creating a new one. Changes: - Add `pending_exec_cells` field to ChatWidget to track incomplete cells - Modify `flush_active_cell` to save incomplete ExecCells to pending map - Modify `handle_exec_end_now` to check pending map before creating new - Add `pending_call_ids` method to ExecCell for tracking - Add cleanup of pending cells in `on_task_complete` - Add E2E test with interleaved text/tool call events to verify fix --------- Co-authored-by: Claude <noreply@anthropic.com> Co-authored-by: Nori <contact@tilework.tech>
1 parent cec1462 commit c6c608e

10 files changed

Lines changed: 457 additions & 21 deletions

File tree

codex-rs/mock-acp-agent/src/main.rs

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -312,6 +312,75 @@ impl acp::Agent for MockAgent {
312312
}
313313
}
314314

315+
// Support interleaved text and tool calls to test for duplicate message bug
316+
// This sends text DURING the tool call, which should trigger the bug where
317+
// the incomplete ExecCell gets flushed to history, creating duplicates.
318+
if std::env::var("MOCK_AGENT_INTERLEAVED_TOOL_CALL").is_ok() {
319+
eprintln!("Mock agent: sending interleaved tool call sequence");
320+
321+
let tool_call_id = acp::ToolCallId("interleaved-tool-001".to_string().into());
322+
323+
// Step 1: Send tool call (begin)
324+
self.send_tool_call(
325+
session_id.clone(),
326+
acp::ToolCall {
327+
id: tool_call_id.clone(),
328+
title: "Executing interleaved command".to_string(),
329+
kind: acp::ToolKind::Execute,
330+
status: acp::ToolCallStatus::Pending,
331+
content: vec![],
332+
locations: vec![],
333+
raw_input: Some(json!({"command": "test"})),
334+
raw_output: None,
335+
meta: None,
336+
},
337+
)
338+
.await?;
339+
340+
// Small delay to ensure the begin event is processed
341+
sleep(Duration::from_millis(50)).await;
342+
343+
// Step 2: Send text DURING the tool call - this triggers the bug!
344+
// When this text arrives, handle_streaming_delta calls flush_active_cell()
345+
// which moves the incomplete ExecCell to history.
346+
self.send_text_chunk(session_id.clone(), "Processing command...")
347+
.await?;
348+
349+
// Small delay
350+
sleep(Duration::from_millis(50)).await;
351+
352+
// Step 3: Send tool call completion
353+
// At this point, the ExecCell is no longer in active_cell, so a new one
354+
// will be created, resulting in duplicate entries.
355+
self.send_tool_call_update(
356+
session_id.clone(),
357+
acp::ToolCallUpdate {
358+
id: tool_call_id.clone(),
359+
fields: acp::ToolCallUpdateFields {
360+
title: None,
361+
kind: None,
362+
status: Some(acp::ToolCallStatus::Completed),
363+
content: Some(vec![acp::ToolCallContent::Content {
364+
content: acp::ContentBlock::Text(acp::TextContent {
365+
text: "Command completed".to_string(),
366+
annotations: None,
367+
meta: None,
368+
}),
369+
}]),
370+
locations: None,
371+
raw_input: None,
372+
raw_output: Some(json!({"exit_code": 0})),
373+
},
374+
meta: None,
375+
},
376+
)
377+
.await?;
378+
379+
// Final text
380+
self.send_text_chunk(session_id.clone(), "Interleaved test done.")
381+
.await?;
382+
}
383+
315384
// Support sending tool calls for testing ACP tool call display
316385
if std::env::var("MOCK_AGENT_SEND_TOOL_CALL").is_ok() {
317386
eprintln!("Mock agent: sending tool call sequence");

codex-rs/tui-pty-e2e/tests/acp_tool_calls.rs

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,98 @@ fn test_acp_tool_call_completion_rendered_in_tui() {
159159
insta::assert_snapshot!("acp_tool_call_echo", normalize_for_input_snapshot(contents));
160160
}
161161

162+
/// Test that ACP tool calls do NOT appear twice (once as Running, once as Ran)
163+
///
164+
/// This test verifies that when a tool call completes, there is only ONE entry
165+
/// in the TUI output, not duplicate entries showing both "Running" and "Ran"
166+
/// states. The expected behavior is that the "Running" state should be
167+
/// updated in-place to become "Ran" when the tool call completes.
168+
///
169+
/// ## Bug being tested:
170+
/// When agent text streams while a tool call is active, the incomplete ExecCell
171+
/// gets flushed to history. Then when the tool call completes, a new ExecCell
172+
/// is created, resulting in duplicate entries:
173+
/// 1. "Running ..." (flushed incomplete cell)
174+
/// 2. "Ran ..." (new completed cell)
175+
///
176+
/// This test uses MOCK_AGENT_INTERLEAVED_TOOL_CALL which sends text DURING
177+
/// the tool call to trigger this exact scenario.
178+
#[test]
179+
fn test_acp_tool_call_no_duplicate_messages() {
180+
// Configure mock agent to send interleaved text and tool calls
181+
// This triggers the bug by sending text DURING the tool call execution
182+
let config = SessionConfig::new()
183+
.with_model("mock-model".to_owned())
184+
.with_agent_env("MOCK_AGENT_INTERLEAVED_TOOL_CALL", "1");
185+
186+
let mut session =
187+
TuiSession::spawn_with_config(24, 80, config).expect("Failed to spawn codex in ACP mode");
188+
189+
// Wait for startup
190+
session
191+
.wait_for_text("›", TIMEOUT)
192+
.expect("ACP mode should start");
193+
194+
std::thread::sleep(TIMEOUT_INPUT);
195+
196+
// Send a prompt to trigger the interleaved tool call
197+
session.send_str("Test interleaved").unwrap();
198+
std::thread::sleep(TIMEOUT_INPUT);
199+
session.send_key(Key::Enter).unwrap();
200+
201+
// Wait for the final text which means everything completed
202+
session
203+
.wait_for_text("Interleaved test done", Duration::from_secs(10))
204+
.expect("Should receive completion response");
205+
206+
std::thread::sleep(TIMEOUT_PRESNAPSHOT);
207+
208+
let contents = session.screen_contents();
209+
210+
// Count occurrences of the tool title "Executing interleaved command"
211+
// It should appear exactly ONCE (in the completed "Ran" form)
212+
let tool_title = "Executing interleaved command";
213+
let count = contents.matches(tool_title).count();
214+
215+
assert_eq!(
216+
count, 1,
217+
"Tool call '{}' should appear exactly once, but appeared {} times.\n\
218+
This indicates duplicate messages (both 'Running' and 'Ran' states visible).\n\
219+
Screen contents:\n{}",
220+
tool_title, count, contents
221+
);
222+
223+
// Also verify we see "Ran" (completed state)
224+
assert!(
225+
contents.contains("Ran"),
226+
"Should show completed 'Ran' state. Screen contents:\n{}",
227+
contents
228+
);
229+
230+
// Verify we don't have both "Running" AND "Ran" for this tool call
231+
// (which would indicate duplicates)
232+
let has_running = contents
233+
.lines()
234+
.any(|line| line.contains("Running") && line.contains("Executing interleaved"));
235+
let has_ran = contents
236+
.lines()
237+
.any(|line| line.contains("Ran") && line.contains("Executing interleaved"));
238+
239+
assert!(
240+
!(has_running && has_ran),
241+
"Should NOT have both 'Running' and 'Ran' states for the same tool call.\n\
242+
This indicates duplicate messages.\n\
243+
Screen contents:\n{}",
244+
contents
245+
);
246+
247+
// Snapshot for visual verification
248+
insta::assert_snapshot!(
249+
"acp_tool_call_no_duplicates",
250+
normalize_for_input_snapshot(contents)
251+
);
252+
}
253+
162254
/// Snapshot test for ACP tool call rendering
163255
///
164256
/// This captures the exact visual rendering of an ACP tool call
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
---
2+
source: tui-pty-e2e/tests/acp_tool_calls.rs
3+
assertion_line: 248
4+
expression: normalize_for_input_snapshot(contents)
5+
---
6+
Operation 'ListCustomPrompts' is not supported in ACP mode
7+
8+
9+
Test interleaved
10+
11+
12+
Operation 'AddToHistory' is not supported in ACP mode
13+
14+
Worked for 0s ────────────────────────────────────────────────────────────────
15+
16+
Test message 1Test message 2
17+
18+
Worked for 0s ────────────────────────────────────────────────────────────────
19+
20+
Processing command...Interleaved test done.
21+
22+
Ran 'Executing interleaved command'
23+
└ (no output)
24+
25+
26+
› [DEFAULT_PROMPT]
27+
28+
100% context left · ? for shortcuts

codex-rs/tui-pty-e2e/tests/snapshots/startup__startup_welcome_dimensions_40x120.snap

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,6 @@
22
source: tui-pty-e2e/tests/startup.rs
33
expression: normalize_for_input_snapshot(contents)
44
---
5-
Operation 'ListCustomPrompts' is not supported in ACP mode
6-
7-
85
› [DEFAULT_PROMPT]
96

107
100% context left · ? for shortcuts

codex-rs/tui-pty-e2e/tests/streaming.rs

Lines changed: 45 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -60,9 +60,51 @@ fn test_escape_cancels_streaming() {
6060
// Press Escape to cancel doesn't work?
6161
session.send_key(Key::Escape).unwrap();
6262
std::thread::sleep(TIMEOUT_INPUT);
63+
64+
std::thread::sleep(TIMEOUT);
65+
// Verify cancellation completed
66+
// (exact behavior depends on TUI implementation)
67+
session
68+
.wait_for_text(
69+
"Conversation interrupted - tell the model what to do differently",
70+
TIMEOUT,
71+
)
72+
.expect("No interrupt reported");
73+
74+
std::thread::sleep(TIMEOUT);
75+
assert_snapshot!(
76+
"cancelled_stream",
77+
normalize_for_input_snapshot(session.screen_contents())
78+
)
79+
}
80+
81+
#[test]
82+
fn test_ctrl_c_cancels_streaming() {
83+
// Use git_init to prevent "Snapshots disabled" from racing with "Working" status
84+
let config = SessionConfig::new().with_stream_until_cancel();
85+
let mut session = TuiSession::spawn_with_config(24, 80, config).unwrap();
86+
87+
// Wait for the prompt to appear (indicated by the chevron character)
88+
session
89+
.wait_for_text("›", TIMEOUT)
90+
.expect("Prompt did not appear");
91+
std::thread::sleep(TIMEOUT_INPUT);
92+
93+
// Submit prompt
94+
session.send_str("testing!!!").unwrap();
95+
session.wait_for_text("testing!!!", TIMEOUT).unwrap();
96+
std::thread::sleep(TIMEOUT_INPUT);
97+
session.send_key(Key::Enter).unwrap();
98+
std::thread::sleep(TIMEOUT_INPUT);
99+
100+
// Wait for streaming to start
101+
session
102+
.wait_for_text("Working", TIMEOUT)
103+
.expect("Streaming did not start");
104+
63105
// Press ctrl-c to cancel doesn't work?
64-
// session.send_key(Key::Ctrl('c')).unwrap();
65-
// std::thread::sleep(TIMEOUT_INPUT);
106+
session.send_key(Key::Ctrl('c')).unwrap();
107+
std::thread::sleep(TIMEOUT_INPUT);
66108

67109
std::thread::sleep(TIMEOUT);
68110
// Verify cancellation completed
@@ -74,7 +116,7 @@ fn test_escape_cancels_streaming() {
74116
)
75117
.expect("No interrupt reported");
76118

77-
std::thread::sleep(TIMEOUT_PRESNAPSHOT);
119+
std::thread::sleep(TIMEOUT);
78120
assert_snapshot!(
79121
"cancelled_stream",
80122
normalize_for_input_snapshot(session.screen_contents())

codex-rs/tui/docs.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,23 @@ Most event types (exec begin/end, MCP calls, elicitation) are queued during acti
130130
- The `InterruptManager` still contains `ExecApproval` and `ApplyPatchApproval` variants for completeness, but these methods are marked `#[allow(dead_code)]`
131131
- `on_task_complete()` calls `flush_interrupt_queue()` for any remaining queued items
132132

133+
**Pending ExecCell Tracking:**
134+
135+
The `PendingExecCellTracker` (`chatwidget/pending_exec_cells.rs`) prevents duplicate ACP tool call messages in the chat history. The problem it solves:
136+
137+
1. Agent makes a tool call (e.g., `shell`) which creates an ExecCell in `active_cell`
138+
2. Agent streams text *during* the tool call execution
139+
3. Streaming text causes `flush_active_cell()`, which would normally push the incomplete ExecCell to history and clear `active_cell`
140+
4. When `ExecCommandEnd` arrives, `handle_exec_end_now()` would create a *new* ExecCell since `active_cell` is empty
141+
5. Result: duplicate entries for the same tool call
142+
143+
The tracker intercepts this by:
144+
- `save_pending()`: Called during flush if the ExecCell has pending (incomplete) call_ids - saves the cell keyed by call_id instead of pushing to history
145+
- `retrieve()`: Called in `handle_exec_end_now()` - retrieves and removes the saved cell, restoring it to `active_cell` for completion
146+
- `drain_failed()`: Called in `on_task_complete()` - marks any uncompleted pending cells as failed and returns them for insertion into history
147+
148+
This follows the same encapsulation pattern as `InterruptManager`: self-contained state in its own module file with typed public methods instead of exposing raw data structures.
149+
133150
**ACP File Tracing:**
134151

135152
- The TUI calls `codex_acp::init_file_tracing()` at startup (`tui/src/lib.rs`) to write `.codex-acp.log` in the current directory. Every mock agent logs `ACP agent spawned (pid: ...)` there, which makes the agent-switching tests in `tui-pty-e2e` deterministic and ensures developers can inspect agent subprocess lifecycles during debugging.

0 commit comments

Comments
 (0)