diff --git a/internal/session/empty_command_claude_restart_test.go b/internal/session/empty_command_claude_restart_test.go new file mode 100644 index 00000000..0252eb1d --- /dev/null +++ b/internal/session/empty_command_claude_restart_test.go @@ -0,0 +1,84 @@ +package session + +import ( + "os" + "strings" + "testing" +) + +// TestBuildClaudeCommand_EmptyBaseCommand_StillEmitsClaudeBinary pins the +// regression behind the user-reported "R/Enter loop forever" symptom on +// feature/sessions-dispear-on-restart. Repro state, captured live from +// the user's state.db on 2026-04-27: +// +// row: tool=claude, command="", tool_data="{}" (no ClaudeSessionID) +// +// Restart() at instance.go fallback path picks +// +// command = i.buildClaudeCommand(i.Command) // i.Command == "" +// +// because IsClaudeCompatible(i.Tool) holds but ClaudeSessionID is empty. +// The current implementation of buildClaudeCommandWithMessage gates its +// claude-build branch on `baseCommand == "claude"` and falls through to +// the custom-command branch when baseCommand is empty, returning JUST +// the env exports (`export AGENTDECK_INSTANCE_ID=...; ` with no claude +// invocation). The pane runs that, exits, status flips back to error, +// and R/Enter loop indefinitely. +// +// Captured restart command from the user's debug.log: +// +// export COLORFGBG='15;0' && unset TELEGRAM_STATE_DIR && export +// COLORFGBG='15;0' && unset TELEGRAM_STATE_DIR && export +// AGENTDECK_INSTANCE_ID=9e618f9f-1773801500; +// +// Note: zero `claude` invocations. +// +// Contract being pinned: when Tool is Claude-compatible, the produced +// command MUST contain a `claude` binary invocation regardless of +// whether i.Command was "" or "claude". An empty command on a Claude +// tool must default to "claude". +func TestBuildClaudeCommand_EmptyBaseCommand_StillEmitsClaudeBinary(t *testing.T) { + origHome := os.Getenv("HOME") + t.Setenv("HOME", t.TempDir()) + ClearUserConfigCache() + t.Cleanup(func() { + os.Setenv("HOME", origHome) + ClearUserConfigCache() + }) + + inst := NewInstanceWithTool("Smithy", t.TempDir(), "claude") + // Smithy's stored state: empty Command, no ClaudeSessionID, no extra + // per-instance ClaudeOptions overrides. Default-shaped Claude row + // from a session that lost its tool-data fingerprint. + inst.Command = "" + inst.ClaudeSessionID = "" + + cmd := inst.buildClaudeCommand(inst.Command) + + // The regression: cmd is just env exports terminated by `; `. The + // strict assertion: a `claude` binary invocation must appear after + // the env prefix. We accept any of: + // - `claude ` (bare binary + flag) + // - `claude\n` (end of command) + // - `claude --` (flag follows) + // To stay robust to the env-prefix-with-claude-substring case, + // require both a claude token AND a --session-id flag (the marker + // of a real new-session start path inside buildClaudeCommandWithMessage). + if !strings.Contains(cmd, "claude") { + t.Fatalf("buildClaudeCommand(\"\") for Tool=claude must invoke the `claude` binary;\n"+ + "got: %q", cmd) + } + if !strings.Contains(cmd, "--session-id") { + t.Errorf("expected --session-id flag (new-session start path) when ClaudeSessionID is empty;\n"+ + "got: %q", cmd) + } + + // Negative assertion: the produced command must not be just env + // exports with no actual binary to run. The simplest signature of + // the bug is the trailing semicolon with nothing executable after + // the AGENTDECK_INSTANCE_ID export. + if strings.HasSuffix(strings.TrimSpace(cmd), "AGENTDECK_INSTANCE_ID="+inst.ID+";") { + t.Errorf("produced command ends at the AGENTDECK_INSTANCE_ID export with no executable;\n"+ + "this is the live-bug signature. got: %q", cmd) + } +} diff --git a/internal/session/instance.go b/internal/session/instance.go index cd9f8683..993fa429 100644 --- a/internal/session/instance.go +++ b/internal/session/instance.go @@ -590,6 +590,17 @@ func (i *Instance) buildClaudeCommandWithMessage(baseCommand, message string) st return baseCommand } + // Default empty baseCommand to "claude" so the Claude-build branch below + // runs. An Instance row with tool=claude and an empty Command field + // (e.g. a session whose tool_data lost its ClaudeSessionID and was + // never assigned an explicit Command) otherwise falls all the way + // through to the custom-command branch and returns just the env + // prefix — pane runs `export ...;` and exits, status loops to error. + // See feature/sessions-dispear-on-restart, Smithy repro 2026-04-27. + if baseCommand == "" { + baseCommand = "claude" + } + // Get the configured Claude command (e.g., "claude", "cdw", "cdp") // If a custom command is set, we skip CLAUDE_CONFIG_DIR prefix since the alias handles it claudeCmd := GetClaudeCommand() diff --git a/internal/session/shell_restart_test.go b/internal/session/shell_restart_test.go new file mode 100644 index 00000000..34f3d27b --- /dev/null +++ b/internal/session/shell_restart_test.go @@ -0,0 +1,184 @@ +package session + +import ( + "fmt" + "os" + "os/exec" + "strings" + "testing" + "time" + + "github.com/asheshgoplani/agent-deck/internal/tmux" +) + +// Tests in this file pin the regression that owns this branch +// (feature/sessions-dispear-on-restart). User report: pressing R on a +// busted Shell-type session leaves the TUI showing "no tmux session +// running" even though `tmux ls` confirms the underlying tmux session +// is alive on the default socket. +// +// All tests use a unique title per run to avoid collisions on a host +// that might already have agent-deck sessions running. + +// TestRestart_ShellSession_PostRestartIsHealthy is the basic contract: +// after Restart() on a Shell instance, Status is not Error, the tmux +// session backing the instance exists, and exactly one tmux session +// matches the title prefix (no orphans). +func TestRestart_ShellSession_PostRestartIsHealthy(t *testing.T) { + skipIfNoTmuxBinary(t) + isolateUserHomeForShellRestart(t) + + title := uniqueShellTestTitle("HealthyRestart") + inst := NewInstance(title, t.TempDir()) + if inst.Tool != "shell" { + t.Fatalf("setup: NewInstance default Tool = %q, want shell", inst.Tool) + } + inst.Command = "" + + if err := inst.Start(); err != nil { + t.Fatalf("Start failed: %v", err) + } + t.Cleanup(func() { cleanupShellSessions(title) }) + + if !waitForTmuxSession(inst.tmuxSession.Name, 1*time.Second) { + t.Fatalf("tmux session %q never appeared after Start", inst.tmuxSession.Name) + } + + if err := inst.Restart(); err != nil { + t.Fatalf("Restart returned error: %v", err) + } + + // Settle: respawn-pane / new-session is async on some platforms. + if !waitForTmuxSession(inst.tmuxSession.Name, 1*time.Second) { + t.Fatalf("tmux session %q does not exist after Restart", inst.tmuxSession.Name) + } + + if inst.Status == StatusError { + t.Errorf("after Restart, Status = %s; want != error", inst.Status) + } + if !inst.tmuxSession.Exists() { + t.Errorf("after Restart, inst.tmuxSession.Exists() = false; expected live session %q", + inst.tmuxSession.Name) + } + + matches := listTmuxSessionsWithPrefix(tmux.SessionPrefix + sanitizeTitleForPrefix(title) + "_") + if len(matches) != 1 { + t.Errorf("after Restart, expected exactly 1 tmux session matching prefix; got %d (%v) — orphan check", + len(matches), matches) + } +} + +// TestRestart_ShellSession_AdoptsLiveTmuxOnNameMismatch reproduces the +// user's reported state: Instance.tmuxSession.Name does NOT point at any +// live tmux session, but a live tmux session matching the title prefix +// DOES exist on the same socket. Pressing R must heal the instance — +// either by adopting the live session, or by killing it and creating a +// fresh one — never silently leave the orphan running while the +// instance stays in StatusError. +func TestRestart_ShellSession_AdoptsLiveTmuxOnNameMismatch(t *testing.T) { + skipIfNoTmuxBinary(t) + isolateUserHomeForShellRestart(t) + + title := uniqueShellTestTitle("AdoptOnMismatch") + inst := NewInstance(title, t.TempDir()) + inst.Command = "" + + if err := inst.Start(); err != nil { + t.Fatalf("Start failed: %v", err) + } + t.Cleanup(func() { cleanupShellSessions(title) }) + + if !waitForTmuxSession(inst.tmuxSession.Name, 1*time.Second) { + t.Fatalf("tmux session %q never appeared after Start", inst.tmuxSession.Name) + } + liveName := inst.tmuxSession.Name + + // Mutate inst.tmuxSession.Name to a stale value that does not exist. + // This exactly reproduces the user-reported state where agent-deck's + // view of "my tmux session is dead" disagrees with reality. + staleSess := tmux.NewSession(title, t.TempDir()) + staleSess.SocketName = inst.TmuxSocketName + if staleSess.Name == liveName { + t.Skip("setup: random suffix collision picked the live name; rerun") + } + inst.tmuxSession = staleSess + + if inst.tmuxSession.Exists() { + t.Skip("setup: stale name unexpectedly resolved to a live session; rerun") + } + + if err := inst.Restart(); err != nil { + t.Fatalf("Restart returned error on stale-name adoption path: %v", err) + } + + if inst.Status == StatusError { + t.Errorf("after Restart, Status = error; expected the adoption path to heal status") + } + + matches := listTmuxSessionsWithPrefix(tmux.SessionPrefix + sanitizeTitleForPrefix(title) + "_") + _ = liveName + if len(matches) > 1 { + t.Errorf("after Restart, expected at most 1 tmux session matching prefix; got %d (%v) — orphan", + len(matches), matches) + } + if len(matches) == 0 { + t.Errorf("after Restart, expected exactly 1 tmux session matching prefix; got 0") + } +} + +// --- helpers --- + +func uniqueShellTestTitle(tag string) string { + return fmt.Sprintf("ShellRestart-%s-%d", tag, time.Now().UnixNano()) +} + +// sanitizeTitleForPrefix mirrors tmux.sanitizeName for the limited +// alphabet our titles use ([A-Za-z0-9-] passes through unchanged). +func sanitizeTitleForPrefix(title string) string { return title } + +func cleanupShellSessions(title string) { + for _, name := range listTmuxSessionsWithPrefix(tmux.SessionPrefix + sanitizeTitleForPrefix(title) + "_") { + _ = exec.Command("tmux", "kill-session", "-t", name).Run() + } +} + +func listTmuxSessionsWithPrefix(prefix string) []string { + out, err := exec.Command("tmux", "list-sessions", "-F", "#{session_name}").Output() + if err != nil { + return nil + } + var matches []string + for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") { + line = strings.TrimSpace(line) + if strings.HasPrefix(line, prefix) { + matches = append(matches, line) + } + } + return matches +} + +func waitForTmuxSession(name string, timeout time.Duration) bool { + deadline := time.Now().Add(timeout) + for time.Now().Before(deadline) { + if exec.Command("tmux", "has-session", "-t", name).Run() == nil { + return true + } + time.Sleep(50 * time.Millisecond) + } + return false +} + +// isolateUserHomeForShellRestart prevents these tests from picking up +// the developer's ~/.agent-deck/config.toml (which could carry a custom +// TmuxSocketName, status-bar setting, etc.) Mirrors the pattern used in +// TestInstance_Restart_InterruptsAndResumes. +func isolateUserHomeForShellRestart(t *testing.T) { + t.Helper() + origHome := os.Getenv("HOME") + t.Setenv("HOME", t.TempDir()) + ClearUserConfigCache() + t.Cleanup(func() { + os.Setenv("HOME", origHome) + ClearUserConfigCache() + }) +} diff --git a/internal/ui/sessions_disappear_on_restart_test.go b/internal/ui/sessions_disappear_on_restart_test.go new file mode 100644 index 00000000..183cfa95 --- /dev/null +++ b/internal/ui/sessions_disappear_on_restart_test.go @@ -0,0 +1,136 @@ +package ui + +import ( + "testing" + "time" + + "github.com/asheshgoplani/agent-deck/internal/session" +) + +// Tests in this file are regression guards for the bug that owns this branch +// (feature/sessions-dispear-on-restart). User report: pressing R on a busted +// session caused all OTHER error sessions to vanish from the unfiltered TUI +// view. The user has reported the symptom is intermittent and could not be +// reproduced after restarting agent-deck from the CLI — these tests therefore +// codify the contract ("a successful restart must not prune peer sessions +// from view") so any future regression that DOES make it deterministic gets +// caught here, even if today's run passes. +// +// All tests run pure in-memory: no SQLite, no tmux, no PTY. h.storage is +// nilled out so saveInstances() is a no-op and we can drive the message +// handler directly. + +// TestRestartMsg_DoesNotPrunePeerErrorSessions_Unfiltered pins the contract: +// when sessionRestartedMsg fires successfully for one session, peer sessions +// (including other error-state ones) must remain visible in flatItems with +// no statusFilter active. +func TestRestartMsg_DoesNotPrunePeerErrorSessions_Unfiltered(t *testing.T) { + home := NewHome() + home.storage = nil // prevent disk writes during saveInstances + home.statusFilter = "" + + smithy := newShellInstanceForRestartTest("smithy-id", "Smithy", session.StatusError) + foo := newShellInstanceForRestartTest("foo-id", "Foo", session.StatusError) + bar := newShellInstanceForRestartTest("bar-id", "Bar", session.StatusRunning) + + home.instances = []*session.Instance{smithy, foo, bar} + home.instanceByID = map[string]*session.Instance{ + smithy.ID: smithy, + foo.ID: foo, + bar.ID: bar, + } + home.groupTree = session.NewGroupTree(home.instances) + home.rebuildFlatItems() + + preIDs := visibleSessionIDsFromFlat(home) + if len(preIDs) != 3 { + t.Fatalf("setup: expected 3 visible sessions, got %d (%v)", len(preIDs), preIDs) + } + + model, _ := home.Update(sessionRestartedMsg{sessionID: smithy.ID, err: nil}) + h, ok := model.(*Home) + if !ok { + t.Fatalf("Update returned %T, want *Home", model) + } + + postIDs := visibleSessionIDsFromFlat(h) + for _, want := range []string{smithy.ID, foo.ID, bar.ID} { + if !sliceContainsString(postIDs, want) { + t.Errorf("after sessionRestartedMsg, flatItems missing %q; got %v", want, postIDs) + } + } + if h.statusFilter != "" { + t.Errorf("statusFilter mutated to %q; expected unfiltered", h.statusFilter) + } +} + +// TestRestartMsg_DoesNotPrunePeerErrorSessions_FilteredToErrors pins the +// contract under the error-only filter, where the visual surface for "all +// my errors disappeared" is most visible — pressing R on one error must +// not collapse the other still-erroring peers. +func TestRestartMsg_DoesNotPrunePeerErrorSessions_FilteredToErrors(t *testing.T) { + home := NewHome() + home.storage = nil + home.statusFilter = session.StatusError + + smithy := newShellInstanceForRestartTest("smithy-id", "Smithy", session.StatusError) + foo := newShellInstanceForRestartTest("foo-id", "Foo", session.StatusError) + bar := newShellInstanceForRestartTest("bar-id", "Bar", session.StatusRunning) + + home.instances = []*session.Instance{smithy, foo, bar} + home.instanceByID = map[string]*session.Instance{ + smithy.ID: smithy, + foo.ID: foo, + bar.ID: bar, + } + home.groupTree = session.NewGroupTree(home.instances) + home.rebuildFlatItems() + + preIDs := visibleSessionIDsFromFlat(home) + if !sliceContainsString(preIDs, smithy.ID) || !sliceContainsString(preIDs, foo.ID) { + t.Fatalf("setup: error filter should show smithy + foo, got %v", preIDs) + } + if sliceContainsString(preIDs, bar.ID) { + t.Fatalf("setup: error filter should hide bar, got %v", preIDs) + } + + model, _ := home.Update(sessionRestartedMsg{sessionID: smithy.ID, err: nil}) + h, _ := model.(*Home) + + postIDs := visibleSessionIDsFromFlat(h) + if !sliceContainsString(postIDs, foo.ID) { + t.Errorf("after sessionRestartedMsg, error-filtered list lost peer Foo; got %v", postIDs) + } +} + +// --- helpers --- + +func newShellInstanceForRestartTest(id, title string, status session.Status) *session.Instance { + return &session.Instance{ + ID: id, + Title: title, + Tool: "shell", + Status: status, + GroupPath: session.DefaultGroupPath, + CreatedAt: time.Now(), + } +} + +func visibleSessionIDsFromFlat(h *Home) []string { + var ids []string + for _, item := range h.flatItems { + if item.Type == session.ItemTypeSession && item.Session != nil { + ids = append(ids, item.Session.ID) + } + } + return ids +} + +func sliceContainsString(s []string, want string) bool { + for _, v := range s { + if v == want { + return true + } + } + return false +}