Skip to content

Commit fcc12e4

Browse files
committed
fix(screentracker): stop writeStabilize from fatally timing out when agents don't echo input
Phase 1 of writeStabilize waited 15s for the screen to change after writing message text (echo detection), returning HTTP 500 if it didn't. Many TUI agents using bracketed paste don't echo input until Enter is pressed, causing every message send to fail. Phase 1 timeout is now non-fatal (2s) — if the screen doesn't change, we log at Info level and proceed to Phase 2 (send carriage return). Phase 2 remains the real indicator of agent responsiveness. Key changes: - Guard non-fatal path with errors.Is(err, util.WaitTimedOut) so context cancellation still propagates as a fatal error - Reduce Phase 1 timeout from 15s to 2s (echo is near-instant) - Extract named constants for both timeouts - Add tests for no-echo-success and no-echo-no-react-failure - Add documentation test for TUI selection prompt ESC limitation Closes #123
1 parent 1a7a3c0 commit fcc12e4

File tree

2 files changed

+179
-4
lines changed

2 files changed

+179
-4
lines changed

lib/screentracker/pty_conversation.go

Lines changed: 46 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package screentracker
33
import (
44
"context"
55
"encoding/json"
6+
"errors"
67
"fmt"
78
"log/slog"
89
"os"
@@ -16,6 +17,24 @@ import (
1617
"golang.org/x/xerrors"
1718
)
1819

20+
const (
21+
// writeStabilizeEchoTimeout is the maximum time to wait for
22+
// the screen to change after writing message text to the PTY
23+
// (echo detection). This is intentionally short: terminal
24+
// echo is near-instant when it occurs. Non-echoing agents
25+
// (e.g. TUI agents using bracketed paste) will hit this
26+
// timeout, which is non-fatal — see the Phase 1 error
27+
// handler in writeStabilize. Move to PTYConversationConfig
28+
// if agents need different echo detection windows.
29+
writeStabilizeEchoTimeout = 2 * time.Second
30+
31+
// writeStabilizeProcessTimeout is the maximum time to wait
32+
// for the screen to change after sending a carriage return.
33+
// This detects whether the agent is actually processing the
34+
// input.
35+
writeStabilizeProcessTimeout = 15 * time.Second
36+
)
37+
1938
// A screenSnapshot represents a snapshot of the PTY at a specific time.
2039
type screenSnapshot struct {
2140
timestamp time.Time
@@ -411,7 +430,19 @@ func (c *PTYConversation) sendMessage(ctx context.Context, messageParts ...Messa
411430
return nil
412431
}
413432

414-
// writeStabilize writes messageParts to the screen and waits for the screen to stabilize after the message is written.
433+
// writeStabilize writes messageParts to the PTY and waits for
434+
// the agent to process them. It operates in two phases:
435+
//
436+
// Phase 1 (echo detection): writes the message text and waits
437+
// for the screen to change and stabilize. This detects agents
438+
// that echo typed input. If the screen doesn't change within
439+
// writeStabilizeEchoTimeout, this is non-fatal — many TUI
440+
// agents buffer bracketed-paste input without rendering it.
441+
//
442+
// Phase 2 (processing detection): writes a carriage return
443+
// and waits for the screen to change, indicating the agent
444+
// started processing. This phase is fatal on timeout — if the
445+
// agent doesn't react to Enter, it's unresponsive.
415446
func (c *PTYConversation) writeStabilize(ctx context.Context, messageParts ...MessagePart) error {
416447
screenBeforeMessage := c.cfg.AgentIO.ReadScreen()
417448
for _, part := range messageParts {
@@ -421,7 +452,7 @@ func (c *PTYConversation) writeStabilize(ctx context.Context, messageParts ...Me
421452
}
422453
// wait for the screen to stabilize after the message is written
423454
if err := util.WaitFor(ctx, util.WaitTimeout{
424-
Timeout: 15 * time.Second,
455+
Timeout: writeStabilizeEchoTimeout,
425456
MinInterval: 50 * time.Millisecond,
426457
InitialWait: true,
427458
Clock: c.cfg.Clock,
@@ -438,14 +469,25 @@ func (c *PTYConversation) writeStabilize(ctx context.Context, messageParts ...Me
438469
}
439470
return false, nil
440471
}); err != nil {
441-
return xerrors.Errorf("failed to wait for screen to stabilize: %w", err)
472+
if !errors.Is(err, util.WaitTimedOut) {
473+
// Context cancellation or condition errors are fatal.
474+
return xerrors.Errorf("failed to wait for screen to stabilize: %w", err)
475+
}
476+
// Phase 1 timeout is non-fatal: the agent may not echo
477+
// input (e.g. TUI agents buffer bracketed-paste content
478+
// internally). Proceed to Phase 2 to send the carriage
479+
// return.
480+
c.cfg.Logger.Info(
481+
"screen did not stabilize after writing message, proceeding to send carriage return",
482+
"timeout", writeStabilizeEchoTimeout,
483+
)
442484
}
443485

444486
// wait for the screen to change after the carriage return is written
445487
screenBeforeCarriageReturn := c.cfg.AgentIO.ReadScreen()
446488
lastCarriageReturnTime := time.Time{}
447489
if err := util.WaitFor(ctx, util.WaitTimeout{
448-
Timeout: 15 * time.Second,
490+
Timeout: writeStabilizeProcessTimeout,
449491
MinInterval: 25 * time.Millisecond,
450492
Clock: c.cfg.Clock,
451493
}, func() (bool, error) {

lib/screentracker/pty_conversation_test.go

Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,15 @@
11
package screentracker_test
22

33
import (
4+
"bytes"
45
"context"
56
"encoding/json"
67
"fmt"
78
"io"
89
"log/slog"
910
"os"
1011
"sync"
12+
"sync/atomic"
1113
"testing"
1214
"time"
1315

@@ -447,6 +449,137 @@ func TestMessages(t *testing.T) {
447449
c, _, _ := newConversation(context.Background(), t)
448450
assert.ErrorIs(t, c.Send(st.MessagePartText{Content: ""}), st.ErrMessageValidationEmpty)
449451
})
452+
453+
t.Run("send-no-echo-agent-reacts", func(t *testing.T) {
454+
ctx, cancel := context.WithTimeout(context.Background(), testTimeout)
455+
t.Cleanup(cancel)
456+
457+
agent := &testAgent{screen: "prompt"}
458+
// Agent doesn't echo input text, but reacts to carriage return.
459+
agent.onWrite = func(data []byte) {
460+
if string(data) == "\r" {
461+
agent.screen = "processing..."
462+
}
463+
}
464+
mClock := quartz.NewMock(t)
465+
mClock.Set(time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC))
466+
cfg := st.PTYConversationConfig{
467+
Clock: mClock,
468+
AgentIO: agent,
469+
SnapshotInterval: interval,
470+
ScreenStabilityLength: 200 * time.Millisecond,
471+
Logger: slog.New(slog.NewTextHandler(io.Discard, nil)),
472+
}
473+
c := st.NewPTY(ctx, cfg, &testEmitter{})
474+
c.Start(ctx)
475+
476+
// Stabilize: fill snapshot buffer so status becomes stable.
477+
advanceFor(ctx, t, mClock, interval*threshold)
478+
479+
// Send and advance. Phase 1 times out (2s, non-fatal),
480+
// Phase 2 writes \r → onWrite changes screen → succeeds.
481+
sendAndAdvance(ctx, t, c, mClock, st.MessagePartText{Content: "hello"})
482+
483+
// User message was recorded.
484+
msgs := c.Messages()
485+
require.True(t, len(msgs) >= 2)
486+
assert.Equal(t, st.ConversationRoleUser, msgs[len(msgs)-1].Role)
487+
assert.Equal(t, "hello", msgs[len(msgs)-1].Message)
488+
})
489+
490+
t.Run("send-no-echo-no-react", func(t *testing.T) {
491+
ctx, cancel := context.WithTimeout(context.Background(), testTimeout)
492+
t.Cleanup(cancel)
493+
494+
agent := &testAgent{screen: "prompt"}
495+
// Agent is completely unresponsive: no echo, no reaction.
496+
agent.onWrite = func(data []byte) {}
497+
mClock := quartz.NewMock(t)
498+
mClock.Set(time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC))
499+
cfg := st.PTYConversationConfig{
500+
Clock: mClock,
501+
AgentIO: agent,
502+
SnapshotInterval: interval,
503+
ScreenStabilityLength: 200 * time.Millisecond,
504+
Logger: slog.New(slog.NewTextHandler(io.Discard, nil)),
505+
}
506+
c := st.NewPTY(ctx, cfg, &testEmitter{})
507+
c.Start(ctx)
508+
509+
// Stabilize.
510+
advanceFor(ctx, t, mClock, interval*threshold)
511+
512+
// Send in a goroutine; can't use sendAndAdvance because it
513+
// calls require.NoError internally.
514+
var sendErr error
515+
var sendDone atomic.Bool
516+
go func() {
517+
sendErr = c.Send(st.MessagePartText{Content: "hello"})
518+
sendDone.Store(true)
519+
}()
520+
advanceUntil(ctx, t, mClock, func() bool { return sendDone.Load() })
521+
522+
require.Error(t, sendErr)
523+
assert.Contains(t, sendErr.Error(), "failed to wait for processing to start")
524+
})
525+
526+
t.Run("send-tui-selection-esc-cancels", func(t *testing.T) {
527+
// Documents a known limitation: when a TUI agent shows a
528+
// selection prompt, sending a user message wraps it in
529+
// bracketed paste. The ESC (\x1b) in the paste-start
530+
// sequence cancels the selection widget. The user's intended
531+
// choice never reaches the selection handler.
532+
//
533+
// For selection prompts, callers should use MessageTypeRaw
534+
// to send raw keystrokes directly to the PTY.
535+
//
536+
// See lib/httpapi/claude.go for the full formatClaudeCodeMessage
537+
// format; this test focuses on the ESC invariant only.
538+
ctx, cancel := context.WithTimeout(context.Background(), testTimeout)
539+
t.Cleanup(cancel)
540+
541+
agent := &testAgent{screen: "selection prompt"}
542+
selectionCancelled := false
543+
agent.onWrite = func(data []byte) {
544+
// Simulate TUI selection widget: ESC cancels the
545+
// selection, changing the screen.
546+
if bytes.Contains(data, []byte("\x1b")) {
547+
selectionCancelled = true
548+
agent.screen = "selection cancelled"
549+
} else if string(data) == "\r" {
550+
agent.screen = "post-cancel"
551+
}
552+
}
553+
mClock := quartz.NewMock(t)
554+
mClock.Set(time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC))
555+
cfg := st.PTYConversationConfig{
556+
Clock: mClock,
557+
AgentIO: agent,
558+
SnapshotInterval: interval,
559+
ScreenStabilityLength: 200 * time.Millisecond,
560+
Logger: slog.New(slog.NewTextHandler(io.Discard, nil)),
561+
}
562+
c := st.NewPTY(ctx, cfg, &testEmitter{})
563+
c.Start(ctx)
564+
565+
// Stabilize.
566+
advanceFor(ctx, t, mClock, interval*threshold)
567+
568+
// Send using bracketed paste, which contains ESC. The
569+
// test focuses on the ESC invariant — any input wrapped
570+
// in bracketed paste will trigger this.
571+
sendAndAdvance(ctx, t, c, mClock,
572+
st.MessagePartText{Content: "\x1b[200~", Hidden: true},
573+
st.MessagePartText{Content: "2"},
574+
st.MessagePartText{Content: "\x1b[201~", Hidden: true},
575+
)
576+
577+
// The send succeeded, but the selection was cancelled by
578+
// ESC — not option "2" being selected.
579+
assert.True(t, selectionCancelled,
580+
"ESC in bracketed paste cancels TUI selection prompts; "+
581+
"use MessageTypeRaw for selection prompts instead")
582+
})
450583
}
451584

452585
func TestStatePersistence(t *testing.T) {

0 commit comments

Comments
 (0)