diff --git a/.gitignore b/.gitignore index 425b31d7..c883e6e6 100644 --- a/.gitignore +++ b/.gitignore @@ -26,6 +26,9 @@ agent-orchestrator.yaml session-events.jsonl session-events.jsonl.* +# Agent Orchestrator local session state +.ao/ + # Environment .env .env.* diff --git a/README.md b/README.md index 61a639d2..33e08dae 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # agent-orchestrator Rewrite of the agent-orchestrator: a long-running Go backend daemon (`backend/`) -paired with an Electron + TypeScript frontend (`frontend/`). +paired with a placeholder Electron + TypeScript frontend shell (`frontend/`). See [`docs/`](docs/README.md) for architecture and status — start with the Lifecycle Manager + Session Manager lane in [`docs/architecture.md`](docs/architecture.md). @@ -31,8 +31,8 @@ AO_PORT=3019 go run ./cmd/ao start # override per invocation Health check: ```bash -curl localhost:3001/healthz # {"status":"ok"} -curl localhost:3001/readyz # {"status":"ready"} +curl localhost:3001/healthz # includes status/service/pid +curl localhost:3001/readyz # includes status/service/pid ``` ### Configuration (env only) @@ -47,10 +47,12 @@ is intentionally not env-configurable. | `AO_REQUEST_TIMEOUT` | `60s` | per-request timeout (Go duration) | | `AO_SHUTDOWN_TIMEOUT` | `10s` | graceful-shutdown hard cap | | `AO_RUN_FILE` | `/agent-orchestrator/running.json` | PID + port handshake path | +| `AO_DATA_DIR` | `/agent-orchestrator/data` | SQLite DB, WAL files, and managed state | ### Test ```bash -cd backend -gofmt -l . && go build ./... && go vet ./... && go test -race ./... +npm run lint +# optional deeper backend pass: +cd backend && go test -race ./... 
``` diff --git a/backend/internal/adapters/runtime/tmux/commands.go b/backend/internal/adapters/runtime/tmux/commands.go deleted file mode 100644 index 6cf8739e..00000000 --- a/backend/internal/adapters/runtime/tmux/commands.go +++ /dev/null @@ -1,97 +0,0 @@ -package tmux - -import ( - "fmt" - "sort" - "strings" - - "github.com/aoagents/agent-orchestrator/backend/internal/ports" -) - -const runtimeName = "tmux" - -func newSessionArgs(id, workspacePath, shellPath, script string) []string { - return []string{"new-session", "-d", "-s", id, "-c", workspacePath, shellPath, "-lc", script} -} - -func setStatusOffArgs(id string) []string { - return []string{"set-option", "-t", exactSessionTarget(id), "status", "off"} -} - -func hasSessionArgs(id string) []string { - return []string{"has-session", "-t", exactSessionTarget(id)} -} - -func killSessionArgs(id string) []string { - return []string{"kill-session", "-t", exactSessionTarget(id)} -} - -func capturePaneArgs(id string, lines int) []string { - return []string{"capture-pane", "-p", "-t", exactPaneTarget(id), "-S", fmt.Sprintf("-%d", lines)} -} - -func sendLiteralArgs(id, message string) []string { - return []string{"send-keys", "-t", exactPaneTarget(id), "-l", message} -} - -func sendEnterArgs(id string) []string { - return []string{"send-keys", "-t", exactPaneTarget(id), "C-m"} -} - -func loadBufferArgs(bufferName, path string) []string { - return []string{"load-buffer", "-b", bufferName, path} -} - -func pasteBufferArgs(id, bufferName string) []string { - return []string{"paste-buffer", "-d", "-t", exactPaneTarget(id), "-b", bufferName} -} - -func exactSessionTarget(id string) string { - return "=" + id + ":" -} - -func exactPaneTarget(id string) string { - return "=" + id + ":0.0" -} - -func wrapLaunchCommand(cfg ports.RuntimeConfig, shellPath string) string { - path := cfg.Env["PATH"] - if path == "" { - path = getenv("PATH") - } - - var b strings.Builder - for _, key := range sortedKeys(cfg.Env) { - if key == 
"PATH" { - continue - } - b.WriteString("export ") - b.WriteString(key) - b.WriteString("=") - b.WriteString(shellQuote(cfg.Env[key])) - b.WriteString("; ") - } - if path != "" { - b.WriteString("export PATH=") - b.WriteString(shellQuote(path)) - b.WriteString("; ") - } - b.WriteString(cfg.LaunchCommand) - b.WriteString("; exec ") - b.WriteString(shellQuote(shellPath)) - b.WriteString(" -i") - return b.String() -} - -func sortedKeys(m map[string]string) []string { - keys := make([]string, 0, len(m)) - for k := range m { - keys = append(keys, k) - } - sort.Strings(keys) - return keys -} - -func shellQuote(s string) string { - return "'" + strings.ReplaceAll(s, "'", "'\\''") + "'" -} diff --git a/backend/internal/adapters/runtime/tmux/tmux.go b/backend/internal/adapters/runtime/tmux/tmux.go deleted file mode 100644 index ae0d0445..00000000 --- a/backend/internal/adapters/runtime/tmux/tmux.go +++ /dev/null @@ -1,296 +0,0 @@ -// Package tmux implements ports.Runtime using tmux sessions. -package tmux - -import ( - "context" - "crypto/sha256" - "encoding/hex" - "errors" - "fmt" - "os" - "os/exec" - "path/filepath" - "regexp" - "strings" - "time" - - "github.com/aoagents/agent-orchestrator/backend/internal/domain" - "github.com/aoagents/agent-orchestrator/backend/internal/ports" -) - -const defaultTimeout = 5 * time.Second -const longMessageThreshold = 512 - -var sessionIDPattern = regexp.MustCompile(`^[a-zA-Z0-9_-]+$`) - -var getenv = os.Getenv - -// Options configures a tmux Runtime; every field has a default (see New). -type Options struct { - Binary string - Timeout time.Duration - Shell string -} - -// Runtime runs agent sessions inside tmux sessions, driving them via the tmux -// CLI. It implements ports.Runtime. 
-type Runtime struct { - binary string - timeout time.Duration - shell string - runner runner -} - -var _ ports.Runtime = (*Runtime)(nil) - -type runner interface { - Run(ctx context.Context, name string, args ...string) ([]byte, error) -} - -type execRunner struct{} - -func (execRunner) Run(ctx context.Context, name string, args ...string) ([]byte, error) { - return exec.CommandContext(ctx, name, args...).CombinedOutput() -} - -// New builds a tmux Runtime, filling unset Options with defaults: binary -// "tmux", shell from $SHELL (else /bin/sh), and the default timeout. -func New(opts Options) *Runtime { - binary := opts.Binary - if binary == "" { - binary = "tmux" - } - timeout := opts.Timeout - if timeout == 0 { - timeout = defaultTimeout - } - shellPath := opts.Shell - if shellPath == "" { - shellPath = os.Getenv("SHELL") - } - if shellPath == "" { - shellPath = "/bin/sh" - } - return &Runtime{binary: binary, timeout: timeout, shell: shellPath, runner: execRunner{}} -} - -// Create starts a new tmux session in the workspace, running the agent's -// launch command, and returns a handle to it. 
-func (r *Runtime) Create(ctx context.Context, cfg ports.RuntimeConfig) (ports.RuntimeHandle, error) { - id, err := tmuxSessionName(cfg.SessionID) - if err != nil { - return ports.RuntimeHandle{}, err - } - if cfg.WorkspacePath == "" { - return ports.RuntimeHandle{}, errors.New("tmux runtime: workspace path is required") - } - if cfg.LaunchCommand == "" { - return ports.RuntimeHandle{}, errors.New("tmux runtime: launch command is required") - } - - script := wrapLaunchCommand(cfg, r.shell) - if _, err := r.run(ctx, newSessionArgs(id, cfg.WorkspacePath, r.shell, script)...); err != nil { - return ports.RuntimeHandle{}, fmt.Errorf("tmux runtime: create session %s: %w", id, err) - } - if _, err := r.run(ctx, setStatusOffArgs(id)...); err != nil { - _ = r.Destroy(context.Background(), ports.RuntimeHandle{ID: id, RuntimeName: runtimeName}) - return ports.RuntimeHandle{}, fmt.Errorf("tmux runtime: disable status %s: %w", id, err) - } - return ports.RuntimeHandle{ID: id, RuntimeName: runtimeName}, nil -} - -// Destroy kills the handle's tmux session. An already-gone session is treated -// as success. -func (r *Runtime) Destroy(ctx context.Context, handle ports.RuntimeHandle) error { - id, err := handleID(handle) - if err != nil { - return err - } - if _, err := r.run(ctx, killSessionArgs(id)...); err != nil { - var exitErr *exec.ExitError - if errors.As(err, &exitErr) { - return nil - } - return fmt.Errorf("tmux runtime: destroy session %s: %w", id, err) - } - return nil -} - -// SendMessage types a message into the session's pane and presses Enter, -// routing large messages through a tmux paste buffer. 
-func (r *Runtime) SendMessage(ctx context.Context, handle ports.RuntimeHandle, message string) error { - id, err := handleID(handle) - if err != nil { - return err - } - if useBuffer(message) { - return r.sendViaBuffer(ctx, id, message) - } - if _, err := r.run(ctx, sendLiteralArgs(id, message)...); err != nil { - return fmt.Errorf("tmux runtime: send message %s: %w", id, err) - } - if _, err := r.run(ctx, sendEnterArgs(id)...); err != nil { - return fmt.Errorf("tmux runtime: send enter %s: %w", id, err) - } - return nil -} - -// GetOutput captures the last `lines` lines of the session pane. -func (r *Runtime) GetOutput(ctx context.Context, handle ports.RuntimeHandle, lines int) (string, error) { - id, err := handleID(handle) - if err != nil { - return "", err - } - if lines <= 0 { - return "", errors.New("tmux runtime: lines must be positive") - } - out, err := r.run(ctx, capturePaneArgs(id, lines)...) - if err != nil { - return "", fmt.Errorf("tmux runtime: capture output %s: %w", id, err) - } - return string(out), nil -} - -// IsAlive reports whether the handle's tmux session still exists. -func (r *Runtime) IsAlive(ctx context.Context, handle ports.RuntimeHandle) (bool, error) { - id, err := handleID(handle) - if err != nil { - return false, err - } - _, err = r.run(ctx, hasSessionArgs(id)...) - if err == nil { - return true, nil - } - var exitErr *exec.ExitError - if errors.As(err, &exitErr) { - return false, nil - } - return false, fmt.Errorf("tmux runtime: probe session %s: %w", id, err) -} - -// AttachCommand returns the argv a human runs to attach their terminal to the -// session. 
-func (r *Runtime) AttachCommand(handle ports.RuntimeHandle) ([]string, error) { - id, err := handleID(handle) - if err != nil { - return nil, err - } - return append([]string{r.binary}, "attach", "-t", exactSessionTarget(id)), nil -} - -func (r *Runtime) sendViaBuffer(ctx context.Context, id, message string) error { - dir := os.TempDir() - file, err := os.CreateTemp(dir, "ao-tmux-message-*") - if err != nil { - return fmt.Errorf("tmux runtime: create message temp file: %w", err) - } - path := file.Name() - defer func() { _ = os.Remove(path) }() - if _, err := file.WriteString(message); err != nil { - _ = file.Close() - return fmt.Errorf("tmux runtime: write message temp file: %w", err) - } - if err := file.Close(); err != nil { - return fmt.Errorf("tmux runtime: close message temp file: %w", err) - } - - bufferName := "ao-" + filepath.Base(path) - if _, err := r.run(ctx, loadBufferArgs(bufferName, path)...); err != nil { - return fmt.Errorf("tmux runtime: load buffer %s: %w", id, err) - } - if _, err := r.run(ctx, pasteBufferArgs(id, bufferName)...); err != nil { - return fmt.Errorf("tmux runtime: paste buffer %s: %w", id, err) - } - if _, err := r.run(ctx, sendEnterArgs(id)...); err != nil { - return fmt.Errorf("tmux runtime: send enter %s: %w", id, err) - } - return nil -} - -func (r *Runtime) run(ctx context.Context, args ...string) ([]byte, error) { - cmdCtx, cancel := context.WithTimeout(ctx, r.timeout) - defer cancel() - out, err := r.runner.Run(cmdCtx, r.binary, args...) 
- if cmdCtx.Err() != nil { - return out, cmdCtx.Err() - } - if err != nil { - return out, commandError{err: err, output: strings.TrimSpace(string(out))} - } - return out, nil -} - -func tmuxSessionName(id domain.SessionID) (string, error) { - raw := string(id) - if raw == "" { - return "", errors.New("tmux runtime: session id is required") - } - if sessionIDPattern.MatchString(raw) { - return raw, nil - } - return sanitizedSessionName(raw), nil -} - -func sanitizedSessionName(raw string) string { - var b strings.Builder - lastDash := false - for _, r := range raw { - valid := (r >= 'a' && r <= 'z') || (r >= 'A' && r <= 'Z') || (r >= '0' && r <= '9') || r == '_' || r == '-' - if valid { - b.WriteRune(r) - lastDash = false - continue - } - if !lastDash { - b.WriteByte('-') - lastDash = true - } - } - base := strings.Trim(b.String(), "-") - if base == "" { - base = "session" - } - if len(base) > 40 { - base = strings.TrimRight(base[:40], "-") - } - sum := sha256.Sum256([]byte(raw)) - return base + "-" + hex.EncodeToString(sum[:4]) -} - -func validateSessionID(id string) error { - if id == "" { - return errors.New("tmux runtime: session id is required") - } - if !sessionIDPattern.MatchString(id) { - return fmt.Errorf("tmux runtime: invalid session id %q", id) - } - return nil -} - -func handleID(handle ports.RuntimeHandle) (string, error) { - if handle.RuntimeName != "" && handle.RuntimeName != runtimeName { - return "", fmt.Errorf("tmux runtime: wrong runtime %q", handle.RuntimeName) - } - if err := validateSessionID(handle.ID); err != nil { - return "", err - } - return handle.ID, nil -} - -func useBuffer(message string) bool { - return strings.Contains(message, "\n") || len(message) > longMessageThreshold -} - -type commandError struct { - err error - output string -} - -func (e commandError) Error() string { - if e.output == "" { - return e.err.Error() - } - return e.err.Error() + ": " + e.output -} - -func (e commandError) Unwrap() error { return e.err } diff 
--git a/backend/internal/adapters/runtime/tmux/tmux_integration_test.go b/backend/internal/adapters/runtime/tmux/tmux_integration_test.go deleted file mode 100644 index 7e798673..00000000 --- a/backend/internal/adapters/runtime/tmux/tmux_integration_test.go +++ /dev/null @@ -1,112 +0,0 @@ -package tmux - -import ( - "context" - "os/exec" - "strings" - "testing" - "time" - - "github.com/aoagents/agent-orchestrator/backend/internal/ports" -) - -func TestRuntimeIntegration(t *testing.T) { - if _, err := exec.LookPath("tmux"); err != nil { - t.Skip("tmux unavailable") - } - - r := New(Options{Timeout: 5 * time.Second}) - ctx := context.Background() - id := "ao_itest_tmux" - _ = r.Destroy(ctx, ports.RuntimeHandle{ID: id, RuntimeName: runtimeName}) - - h, err := r.Create(ctx, ports.RuntimeConfig{ - SessionID: "ao_itest_tmux", - WorkspacePath: t.TempDir(), - LaunchCommand: "printf ready\\n", - Env: map[string]string{"AO_SESSION_ID": id}, - }) - if err != nil { - t.Fatalf("Create: %v", err) - } - defer r.Destroy(ctx, h) - - alive, err := r.IsAlive(ctx, h) - if err != nil { - t.Fatalf("IsAlive: %v", err) - } - if !alive { - t.Fatal("alive = false, want true") - } - - if err := r.SendMessage(ctx, h, "printf hello-from-tmux"); err != nil { - t.Fatalf("SendMessage: %v", err) - } - deadline := time.Now().Add(2 * time.Second) - var out string - for time.Now().Before(deadline) { - out, err = r.GetOutput(ctx, h, 20) - if err != nil { - t.Fatalf("GetOutput: %v", err) - } - if strings.Contains(out, "hello-from-tmux") { - break - } - time.Sleep(100 * time.Millisecond) - } - if !strings.Contains(out, "hello-from-tmux") { - t.Fatalf("output = %q, want sent command output", out) - } - - if err := r.Destroy(ctx, h); err != nil { - t.Fatalf("Destroy: %v", err) - } - alive, err = r.IsAlive(ctx, h) - if err != nil { - t.Fatalf("IsAlive after destroy: %v", err) - } - if alive { - t.Fatal("alive after destroy = true, want false") - } -} - -func TestRuntimeIntegrationUsesExactTargets(t 
*testing.T) { - if _, err := exec.LookPath("tmux"); err != nil { - t.Skip("tmux unavailable") - } - - r := New(Options{Timeout: 5 * time.Second}) - ctx := context.Background() - longID := "ao_exact_target_long" - prefixID := "ao_exact_target" - _ = r.Destroy(ctx, ports.RuntimeHandle{ID: longID, RuntimeName: runtimeName}) - _ = r.Destroy(ctx, ports.RuntimeHandle{ID: prefixID, RuntimeName: runtimeName}) - - h, err := r.Create(ctx, ports.RuntimeConfig{ - SessionID: "ao_exact_target_long", - WorkspacePath: t.TempDir(), - LaunchCommand: "cat", - }) - if err != nil { - t.Fatalf("Create: %v", err) - } - defer r.Destroy(ctx, h) - - alive, err := r.IsAlive(ctx, ports.RuntimeHandle{ID: prefixID, RuntimeName: runtimeName}) - if err != nil { - t.Fatalf("IsAlive prefix: %v", err) - } - if alive { - t.Fatal("prefix handle reported alive; tmux target matching is not exact") - } - if err := r.Destroy(ctx, ports.RuntimeHandle{ID: prefixID, RuntimeName: runtimeName}); err != nil { - t.Fatalf("Destroy prefix: %v", err) - } - alive, err = r.IsAlive(ctx, h) - if err != nil { - t.Fatalf("IsAlive long after prefix destroy: %v", err) - } - if !alive { - t.Fatal("destroying prefix handle killed longer session") - } -} diff --git a/backend/internal/adapters/runtime/tmux/tmux_test.go b/backend/internal/adapters/runtime/tmux/tmux_test.go deleted file mode 100644 index cb56db35..00000000 --- a/backend/internal/adapters/runtime/tmux/tmux_test.go +++ /dev/null @@ -1,256 +0,0 @@ -package tmux - -import ( - "context" - "errors" - "os/exec" - "reflect" - "strings" - "testing" - "time" - - "github.com/aoagents/agent-orchestrator/backend/internal/ports" -) - -func TestNewDefaultsToPortableShell(t *testing.T) { - t.Setenv("SHELL", "") - r := New(Options{}) - if got, want := r.shell, "/bin/sh"; got != want { - t.Fatalf("default shell = %q, want %q", got, want) - } -} - -func TestCommandBuilders(t *testing.T) { - if got, want := newSessionArgs("sess-1", "/tmp/ws", "/bin/zsh", "echo hi"), 
[]string{"new-session", "-d", "-s", "sess-1", "-c", "/tmp/ws", "/bin/zsh", "-lc", "echo hi"}; !reflect.DeepEqual(got, want) { - t.Fatalf("newSessionArgs = %#v, want %#v", got, want) - } - if got, want := setStatusOffArgs("sess-1"), []string{"set-option", "-t", "=sess-1:", "status", "off"}; !reflect.DeepEqual(got, want) { - t.Fatalf("setStatusOffArgs = %#v, want %#v", got, want) - } - if got, want := capturePaneArgs("sess-1", 42), []string{"capture-pane", "-p", "-t", "=sess-1:0.0", "-S", "-42"}; !reflect.DeepEqual(got, want) { - t.Fatalf("capturePaneArgs = %#v, want %#v", got, want) - } -} - -func TestExactTargets(t *testing.T) { - if got, want := exactSessionTarget("abc"), "=abc:"; got != want { - t.Fatalf("exactSessionTarget = %q, want %q", got, want) - } - if got, want := exactPaneTarget("abc"), "=abc:0.0"; got != want { - t.Fatalf("exactPaneTarget = %q, want %q", got, want) - } -} - -func TestTmuxSessionNameSanitizesIssueRefs(t *testing.T) { - got, err := tmuxSessionName("repo/issue#42.1") - if err != nil { - t.Fatalf("tmuxSessionName: %v", err) - } - if err := validateSessionID(got); err != nil { - t.Fatalf("sanitized id %q is invalid: %v", got, err) - } - if !strings.HasPrefix(got, "repo-issue-42-1-") { - t.Fatalf("sanitized id = %q, want readable prefix", got) - } - if got == "repo/issue#42.1" { - t.Fatal("sanitized id still contains raw unsafe characters") - } -} - -func TestValidateSessionID(t *testing.T) { - valid := []string{"sess-1", "S_2", "abc123"} - for _, id := range valid { - if err := validateSessionID(id); err != nil { - t.Fatalf("validateSessionID(%q): %v", id, err) - } - } - invalid := []string{"", "sess.1", "sess/1", "$(boom)", "with space"} - for _, id := range invalid { - if err := validateSessionID(id); err == nil { - t.Fatalf("validateSessionID(%q): got nil, want error", id) - } - } -} - -func TestWrapLaunchCommandExportsEnvAndKeepsPaneAlive(t *testing.T) { - oldGetenv := getenv - getenv = func(key string) string { - if key == "PATH" { - 
return "/usr/bin:/bin" - } - return "" - } - defer func() { getenv = oldGetenv }() - - got := wrapLaunchCommand(ports.RuntimeConfig{LaunchCommand: "ao run", Env: map[string]string{ - "AO_SESSION_ID": "sess-1", - "ODD": "can't", - "PATH": "/custom/bin:/usr/bin", - }}, "/bin/zsh") - - for _, want := range []string{ - "export AO_SESSION_ID='sess-1';", - "export ODD='can'\\''t';", - "export PATH='/custom/bin:/usr/bin';", - "ao run; exec '/bin/zsh' -i", - } { - if !strings.Contains(got, want) { - t.Fatalf("wrapped command missing %q in %q", want, got) - } - } -} - -func TestCreateRunsNewSessionAndDisablesStatus(t *testing.T) { - fr := &fakeRunner{} - r := New(Options{Binary: "tmux-test", Timeout: time.Second, Shell: "/bin/zsh"}) - r.runner = fr - - handle, err := r.Create(context.Background(), ports.RuntimeConfig{ - SessionID: "sess-1", - WorkspacePath: "/tmp/ws", - LaunchCommand: "echo ready", - Env: map[string]string{"AO_SESSION_ID": "sess-1"}, - }) - if err != nil { - t.Fatalf("Create: %v", err) - } - if handle != (ports.RuntimeHandle{ID: "sess-1", RuntimeName: runtimeName}) { - t.Fatalf("handle = %+v, want tmux handle", handle) - } - if len(fr.calls) != 2 { - t.Fatalf("calls = %d, want 2", len(fr.calls)) - } - if got, want := fr.calls[0].args[:6], []string{"new-session", "-d", "-s", "sess-1", "-c", "/tmp/ws"}; !reflect.DeepEqual(got, want) { - t.Fatalf("create args prefix = %#v, want %#v", got, want) - } - if got, want := fr.calls[1].args, setStatusOffArgs("sess-1"); !reflect.DeepEqual(got, want) { - t.Fatalf("status args = %#v, want %#v", got, want) - } -} - -func TestCreateNormalizesUnsafeSessionID(t *testing.T) { - fr := &fakeRunner{} - r := New(Options{Binary: "tmux-test", Timeout: time.Second, Shell: "/bin/sh"}) - r.runner = fr - - handle, err := r.Create(context.Background(), ports.RuntimeConfig{ - SessionID: "repo/issue#42", - WorkspacePath: "/tmp/ws", - LaunchCommand: "echo ready", - }) - if err != nil { - t.Fatalf("Create: %v", err) - } - if err := 
validateSessionID(handle.ID); err != nil { - t.Fatalf("handle id %q invalid: %v", handle.ID, err) - } - if handle.ID == "repo/issue#42" { - t.Fatal("handle kept unsafe raw session id") - } - if got := fr.calls[0].args[3]; got != handle.ID { - t.Fatalf("tmux session arg = %q, want handle id %q", got, handle.ID) - } -} - -func TestSendMessageUsesLiteralForShortInput(t *testing.T) { - fr := &fakeRunner{} - r := New(Options{Timeout: time.Second}) - r.runner = fr - - if err := r.SendMessage(context.Background(), ports.RuntimeHandle{ID: "sess-1", RuntimeName: runtimeName}, "hello"); err != nil { - t.Fatalf("SendMessage: %v", err) - } - if got, want := fr.calls[0].args, sendLiteralArgs("sess-1", "hello"); !reflect.DeepEqual(got, want) { - t.Fatalf("literal args = %#v, want %#v", got, want) - } - if got, want := fr.calls[1].args, sendEnterArgs("sess-1"); !reflect.DeepEqual(got, want) { - t.Fatalf("enter args = %#v, want %#v", got, want) - } -} - -func TestSendMessageUsesBufferForMultilineInput(t *testing.T) { - fr := &fakeRunner{} - r := New(Options{Timeout: time.Second}) - r.runner = fr - - if err := r.SendMessage(context.Background(), ports.RuntimeHandle{ID: "sess-1", RuntimeName: runtimeName}, "hello\nworld"); err != nil { - t.Fatalf("SendMessage: %v", err) - } - if len(fr.calls) != 3 { - t.Fatalf("calls = %d, want 3", len(fr.calls)) - } - if fr.calls[0].args[0] != "load-buffer" { - t.Fatalf("first command = %#v, want load-buffer", fr.calls[0].args) - } - if got := fr.calls[1].args; !reflect.DeepEqual(got[:4], []string{"paste-buffer", "-d", "-t", "=sess-1:0.0"}) { - t.Fatalf("paste args = %#v", got) - } - if got, want := fr.calls[2].args, sendEnterArgs("sess-1"); !reflect.DeepEqual(got, want) { - t.Fatalf("enter args = %#v, want %#v", got, want) - } -} - -func TestIsAliveTreatsExitStatusAsNotAlive(t *testing.T) { - fr := &fakeRunner{err: &exec.ExitError{}} - r := New(Options{Timeout: time.Second}) - r.runner = fr - - alive, err := r.IsAlive(context.Background(), 
ports.RuntimeHandle{ID: "sess-1", RuntimeName: runtimeName}) - if err != nil { - t.Fatalf("IsAlive: %v", err) - } - if alive { - t.Fatal("alive = true, want false") - } -} - -func TestDestroyIsIdempotentWhenSessionMissing(t *testing.T) { - fr := &fakeRunner{err: &exec.ExitError{}} - r := New(Options{Timeout: time.Second}) - r.runner = fr - - if err := r.Destroy(context.Background(), ports.RuntimeHandle{ID: "sess-1", RuntimeName: runtimeName}); err != nil { - t.Fatalf("Destroy: %v", err) - } - if len(fr.calls) != 1 || fr.calls[0].args[0] != "kill-session" { - t.Fatalf("calls = %#v, want only kill-session", fr.calls) - } -} - -func TestGetOutputValidatesLines(t *testing.T) { - r := New(Options{Timeout: time.Second}) - _, err := r.GetOutput(context.Background(), ports.RuntimeHandle{ID: "sess-1", RuntimeName: runtimeName}, 0) - if err == nil { - t.Fatal("GetOutput lines=0: got nil, want error") - } -} - -type fakeRunner struct { - calls []runnerCall - out []byte - err error -} - -type runnerCall struct { - name string - args []string -} - -func (f *fakeRunner) Run(_ context.Context, name string, args ...string) ([]byte, error) { - f.calls = append(f.calls, runnerCall{name: name, args: append([]string(nil), args...)}) - if f.err != nil { - return f.out, f.err - } - return f.out, nil -} - -func TestCommandErrorUnwraps(t *testing.T) { - base := errors.New("base") - err := commandError{err: base, output: "details"} - if !errors.Is(err, base) { - t.Fatal("commandError should unwrap base error") - } - if !strings.Contains(err.Error(), "details") { - t.Fatalf("error = %q, want output details", err.Error()) - } -} diff --git a/backend/internal/adapters/runtime/zellij/commands.go b/backend/internal/adapters/runtime/zellij/commands.go index d4ca7104..1ecb9c31 100644 --- a/backend/internal/adapters/runtime/zellij/commands.go +++ b/backend/internal/adapters/runtime/zellij/commands.go @@ -10,7 +10,6 @@ import ( ) const ( - runtimeName = "zellij" agentPaneName = "agent" 
defaultChunkBytes = 16 * 1024 ) @@ -188,6 +187,31 @@ func wrapLaunchCommandCmd(cfg ports.RuntimeConfig) string { return b.String() } +func validateEnvKeys(env map[string]string) error { + for key := range env { + if !validEnvKey(key) { + return fmt.Errorf("zellij runtime: invalid env key %q", key) + } + } + return nil +} + +func validEnvKey(key string) bool { + if key == "" { + return false + } + for i, r := range key { + if r == '_' || (r >= 'A' && r <= 'Z') || (r >= 'a' && r <= 'z') { + continue + } + if i > 0 && r >= '0' && r <= '9' { + continue + } + return false + } + return true +} + func sortedKeys(m map[string]string) []string { keys := make([]string, 0, len(m)) for k := range m { diff --git a/backend/internal/adapters/runtime/zellij/zellij.go b/backend/internal/adapters/runtime/zellij/zellij.go index aade6490..71acb635 100644 --- a/backend/internal/adapters/runtime/zellij/zellij.go +++ b/backend/internal/adapters/runtime/zellij/zellij.go @@ -115,6 +115,9 @@ func (r *Runtime) Create(ctx context.Context, cfg ports.RuntimeConfig) (ports.Ru if cfg.LaunchCommand == "" { return ports.RuntimeHandle{}, errors.New("zellij runtime: launch command is required") } + if err := validateEnvKeys(cfg.Env); err != nil { + return ports.RuntimeHandle{}, err + } if err := r.ensureSupportedVersion(ctx); err != nil { return ports.RuntimeHandle{}, err } @@ -130,10 +133,10 @@ func (r *Runtime) Create(ctx context.Context, cfg ports.RuntimeConfig) (ports.Ru } paneID, err := r.findAgentPane(ctx, id) if err != nil { - _ = r.Destroy(context.Background(), ports.RuntimeHandle{ID: id, RuntimeName: runtimeName}) + _ = r.Destroy(context.Background(), ports.RuntimeHandle{ID: id}) return ports.RuntimeHandle{}, err } - return ports.RuntimeHandle{ID: handleIDValue(id, paneID), RuntimeName: runtimeName}, nil + return ports.RuntimeHandle{ID: handleIDValue(id, paneID)}, nil } // Destroy kills the handle's zellij session. 
An already-gone session is treated @@ -225,13 +228,9 @@ func (r *Runtime) ensureSupportedVersion(ctx context.Context) error { if err != nil { return fmt.Errorf("zellij runtime: check version: %w", err) } - version, err := parseVersion(string(out)) - if err != nil { + if _, err := CheckVersionOutput(string(out)); err != nil { return fmt.Errorf("zellij runtime: check version: %w", err) } - if compareVersion(version, semver{minMajor, minMinor, minPatch}) < 0 { - return fmt.Errorf("zellij runtime: unsupported zellij version %s; require >= %d.%d.%d", version, minMajor, minMinor, minPatch) - } return nil } @@ -384,9 +383,6 @@ func validatePaneID(id string) error { } func handleID(handle ports.RuntimeHandle) (string, string, error) { - if handle.RuntimeName != "" && handle.RuntimeName != runtimeName { - return "", "", fmt.Errorf("zellij runtime: wrong runtime %q", handle.RuntimeName) - } parts := strings.Split(handle.ID, "/") if len(parts) == 1 { if err := validateSessionID(parts[0]); err != nil { @@ -471,6 +467,25 @@ func tailLines(s string, n int) string { return strings.Join(lines[len(lines)-n:], "") } +// RequiredVersion returns the minimum Zellij version AO's runtime adapter +// supports. +func RequiredVersion() string { return minSupportedVersion().String() } + +// CheckVersionOutput parses `zellij --version` output, returning the parsed +// version when it satisfies AO's minimum runtime requirement. 
+func CheckVersionOutput(out string) (string, error) { + version, err := parseVersion(out) + if err != nil { + return "", err + } + if compareVersion(version, minSupportedVersion()) < 0 { + return version.String(), fmt.Errorf("unsupported zellij version %s; require >= %s", version, RequiredVersion()) + } + return version.String(), nil +} + +func minSupportedVersion() semver { return semver{minMajor, minMinor, minPatch} } + type semver struct { major int minor int diff --git a/backend/internal/adapters/runtime/zellij/zellij_integration_test.go b/backend/internal/adapters/runtime/zellij/zellij_integration_test.go index 6729cc3b..fcc57eaa 100644 --- a/backend/internal/adapters/runtime/zellij/zellij_integration_test.go +++ b/backend/internal/adapters/runtime/zellij/zellij_integration_test.go @@ -25,7 +25,7 @@ func TestRuntimeIntegration(t *testing.T) { } configDir := t.TempDir() r := New(Options{Timeout: 5 * time.Second, SocketDir: socketDir, ConfigDir: configDir}) - _ = r.Destroy(ctx, ports.RuntimeHandle{ID: id, RuntimeName: runtimeName}) + _ = r.Destroy(ctx, ports.RuntimeHandle{ID: id}) h, err := r.Create(ctx, ports.RuntimeConfig{ SessionID: "ao_itest_zj", @@ -90,8 +90,8 @@ func TestRuntimeIntegrationUsesExactSessionParsing(t *testing.T) { r := New(Options{Timeout: 5 * time.Second, SocketDir: socketDir, ConfigDir: t.TempDir()}) longID := "ao_zj_exact_long" prefixID := "ao_zj_exact" - _ = r.Destroy(ctx, ports.RuntimeHandle{ID: longID, RuntimeName: runtimeName}) - _ = r.Destroy(ctx, ports.RuntimeHandle{ID: prefixID, RuntimeName: runtimeName}) + _ = r.Destroy(ctx, ports.RuntimeHandle{ID: longID}) + _ = r.Destroy(ctx, ports.RuntimeHandle{ID: prefixID}) h, err := r.Create(ctx, ports.RuntimeConfig{ SessionID: "ao_zj_exact_long", @@ -103,7 +103,7 @@ func TestRuntimeIntegrationUsesExactSessionParsing(t *testing.T) { } defer r.Destroy(ctx, h) - alive, err := r.IsAlive(ctx, ports.RuntimeHandle{ID: prefixID, RuntimeName: runtimeName}) + alive, err := r.IsAlive(ctx, 
ports.RuntimeHandle{ID: prefixID}) if err != nil { t.Fatalf("IsAlive prefix: %v", err) } diff --git a/backend/internal/adapters/runtime/zellij/zellij_test.go b/backend/internal/adapters/runtime/zellij/zellij_test.go index a690af03..3f0dc143 100644 --- a/backend/internal/adapters/runtime/zellij/zellij_test.go +++ b/backend/internal/adapters/runtime/zellij/zellij_test.go @@ -83,17 +83,13 @@ func TestValidateSessionAndPaneID(t *testing.T) { } func TestHandleID(t *testing.T) { - session, pane, err := handleID(ports.RuntimeHandle{ID: "sess-1/terminal_7", RuntimeName: runtimeName}) + session, pane, err := handleID(ports.RuntimeHandle{ID: "sess-1/terminal_7"}) if err != nil { t.Fatalf("handleID: %v", err) } if session != "sess-1" || pane != "terminal_7" { t.Fatalf("handleID = %q/%q", session, pane) } - _, _, err = handleID(ports.RuntimeHandle{ID: "sess-1/terminal_7", RuntimeName: "tmux"}) - if err == nil { - t.Fatal("wrong runtime: got nil, want error") - } } func TestBuildLayoutExportsEnvAndKeepsPaneAlive(t *testing.T) { @@ -176,6 +172,20 @@ func TestBuildLayoutUsesCmdLaunchOnCmdShells(t *testing.T) { } } +func TestCreateRejectsInvalidEnvKeys(t *testing.T) { + r := New(Options{Binary: "zellij-test", Timeout: time.Second, Shell: "/bin/zsh"}) + r.runner = &fakeRunner{} + _, err := r.Create(context.Background(), ports.RuntimeConfig{ + SessionID: "sess-1", + WorkspacePath: "/tmp/ws", + LaunchCommand: "echo ready", + Env: map[string]string{"BAD KEY": "x"}, + }) + if err == nil || !strings.Contains(err.Error(), "invalid env key") { + t.Fatalf("Create err = %v, want invalid env key", err) + } +} + func TestCreateStartsSessionAndDiscoversPane(t *testing.T) { fr := &fakeRunner{outputs: [][]byte{[]byte("zellij 0.44.3"), nil, []byte(`[{"id":0,"is_plugin":true,"title":"zellij:tab-bar"},{"id":3,"is_plugin":false,"title":"agent"}]`)}} r := New(Options{Binary: "zellij-test", Timeout: time.Second, Shell: "/bin/zsh", SocketDir: "/tmp/zj", ConfigDir: "/tmp/cfg"}) @@ -190,7 +200,7 @@ func 
TestCreateStartsSessionAndDiscoversPane(t *testing.T) { if err != nil { t.Fatalf("Create: %v", err) } - if handle != (ports.RuntimeHandle{ID: "sess-1/terminal_3", RuntimeName: runtimeName}) { + if handle != (ports.RuntimeHandle{ID: "sess-1/terminal_3"}) { t.Fatalf("handle = %+v, want zellij handle", handle) } if len(fr.calls) != 3 { @@ -212,7 +222,7 @@ func TestCreateStartsSessionAndDiscoversPane(t *testing.T) { func TestAttachCommandUsesSocketDir(t *testing.T) { r := New(Options{SocketDir: "/tmp/zj"}) - args, err := r.AttachCommand(ports.RuntimeHandle{ID: "sess-1/terminal_0", RuntimeName: runtimeName}) + args, err := r.AttachCommand(ports.RuntimeHandle{ID: "sess-1/terminal_0"}) if err != nil { t.Fatalf("AttachCommand: %v", err) } @@ -270,6 +280,15 @@ func TestParseVersion(t *testing.T) { if compareVersion(semver{0, 44, 2}, semver{0, 44, 3}) >= 0 { t.Fatal("compareVersion should order 0.44.2 before 0.44.3") } + if got := RequiredVersion(); got != "0.44.3" { + t.Fatalf("RequiredVersion = %q, want 0.44.3", got) + } + if got, err := CheckVersionOutput("zellij 0.44.3"); err != nil || got != "0.44.3" { + t.Fatalf("CheckVersionOutput supported = %q, %v", got, err) + } + if _, err := CheckVersionOutput("zellij 0.44.2"); err == nil { + t.Fatal("CheckVersionOutput unsupported: got nil error") + } } func TestSendMessageChunksAndSendsEnter(t *testing.T) { @@ -277,7 +296,7 @@ func TestSendMessageChunksAndSendsEnter(t *testing.T) { r := New(Options{Timeout: time.Second, ChunkSize: 5}) r.runner = fr - if err := r.SendMessage(context.Background(), ports.RuntimeHandle{ID: "sess-1/terminal_0", RuntimeName: runtimeName}, "hello世界"); err != nil { + if err := r.SendMessage(context.Background(), ports.RuntimeHandle{ID: "sess-1/terminal_0"}, "hello世界"); err != nil { t.Fatalf("SendMessage: %v", err) } if len(fr.calls) != 4 { @@ -302,7 +321,7 @@ func TestGetOutputTrimsLines(t *testing.T) { r := New(Options{Timeout: time.Second}) r.runner = fr - out, err := 
r.GetOutput(context.Background(), ports.RuntimeHandle{ID: "sess-1/terminal_0", RuntimeName: runtimeName}, 2) + out, err := r.GetOutput(context.Background(), ports.RuntimeHandle{ID: "sess-1/terminal_0"}, 2) if err != nil { t.Fatalf("GetOutput: %v", err) } @@ -316,7 +335,7 @@ func TestIsAliveParsesNoFormattingOutput(t *testing.T) { r := New(Options{Timeout: time.Second}) r.runner = fr - alive, err := r.IsAlive(context.Background(), ports.RuntimeHandle{ID: "sess-1/terminal_0", RuntimeName: runtimeName}) + alive, err := r.IsAlive(context.Background(), ports.RuntimeHandle{ID: "sess-1/terminal_0"}) if err != nil { t.Fatalf("IsAlive: %v", err) } @@ -336,7 +355,7 @@ func TestIsAliveTreatsExitStatusAsNotAlive(t *testing.T) { r := New(Options{Timeout: time.Second}) r.runner = fr - alive, err := r.IsAlive(context.Background(), ports.RuntimeHandle{ID: "sess-1/terminal_0", RuntimeName: runtimeName}) + alive, err := r.IsAlive(context.Background(), ports.RuntimeHandle{ID: "sess-1/terminal_0"}) if err != nil { t.Fatalf("IsAlive: %v", err) } @@ -350,7 +369,7 @@ func TestDestroyIsIdempotentWhenSessionMissing(t *testing.T) { r := New(Options{Timeout: time.Second}) r.runner = fr - if err := r.Destroy(context.Background(), ports.RuntimeHandle{ID: "sess-1/terminal_0", RuntimeName: runtimeName}); err != nil { + if err := r.Destroy(context.Background(), ports.RuntimeHandle{ID: "sess-1/terminal_0"}); err != nil { t.Fatalf("Destroy: %v", err) } if len(fr.calls) != 1 || fr.calls[0].args[0] != "kill-session" { @@ -360,7 +379,7 @@ func TestDestroyIsIdempotentWhenSessionMissing(t *testing.T) { func TestGetOutputValidatesLines(t *testing.T) { r := New(Options{Timeout: time.Second}) - _, err := r.GetOutput(context.Background(), ports.RuntimeHandle{ID: "sess-1/terminal_0", RuntimeName: runtimeName}, 0) + _, err := r.GetOutput(context.Background(), ports.RuntimeHandle{ID: "sess-1/terminal_0"}, 0) if err == nil { t.Fatal("GetOutput lines=0: got nil, want error") } diff --git 
a/backend/internal/adapters/tracker/github/doc.go b/backend/internal/adapters/tracker/github/doc.go index f37c4c90..53acf229 100644 --- a/backend/internal/adapters/tracker/github/doc.go +++ b/backend/internal/adapters/tracker/github/doc.go @@ -36,7 +36,7 @@ // - No List pagination beyond a single page (callers requesting more than // 100 results need to wait for the observer/polling work in issue #35). // - No webhook receiver, no polling goroutine, no fact projection into -// LCM (issue #35). +// the PR service (issue #35). // - No richer per-provider metadata on Issue (milestones, project boards, // reactions); the port only carries fields all v1 providers can fill. package github diff --git a/backend/internal/adapters/tracker/github/tracker.go b/backend/internal/adapters/tracker/github/tracker.go index a184fb14..1d5d6c5d 100644 --- a/backend/internal/adapters/tracker/github/tracker.go +++ b/backend/internal/adapters/tracker/github/tracker.go @@ -230,10 +230,10 @@ func mapStateFromGitHub(state, reason string, labels []string) domain.Normalized } var hasProgress, hasReview bool for _, l := range labels { - switch l { - case labelInProgress: + switch { + case strings.EqualFold(l, labelInProgress): hasProgress = true - case labelInReview: + case strings.EqualFold(l, labelInReview): hasReview = true } } @@ -376,7 +376,10 @@ func (t *Tracker) do(ctx context.Context, method, path string, body any) ([]byte return nil, fmt.Errorf("github tracker: %s %s: %w", method, path, err) } defer func() { _ = resp.Body.Close() }() - respBody, _ := io.ReadAll(resp.Body) + respBody, readErr := io.ReadAll(resp.Body) + if readErr != nil { + return nil, fmt.Errorf("github tracker: read response body: %w", readErr) + } if resp.StatusCode >= 200 && resp.StatusCode < 300 { return respBody, nil } @@ -473,14 +476,9 @@ func parseGitHubID(native string) (owner, repo string, number int, err error) { } repoPart := native[:hash] numPart := native[hash+1:] - slash := strings.IndexByte(repoPart, 
'/') - if slash < 0 { - return "", "", 0, fmt.Errorf("%w: missing owner/repo separator", ErrBadID) - } - owner = repoPart[:slash] - repo = repoPart[slash+1:] - if owner == "" || repo == "" { - return "", "", 0, fmt.Errorf("%w: empty owner or repo", ErrBadID) + owner, repo, err = parseGitHubRepo(repoPart) + if err != nil { + return "", "", 0, err } n, parseErr := strconv.Atoi(numPart) if parseErr != nil || n <= 0 { diff --git a/backend/internal/adapters/tracker/github/tracker_test.go b/backend/internal/adapters/tracker/github/tracker_test.go index a61a6899..57585b74 100644 --- a/backend/internal/adapters/tracker/github/tracker_test.go +++ b/backend/internal/adapters/tracker/github/tracker_test.go @@ -115,6 +115,8 @@ func TestParseID(t *testing.T) { {"missing slash", "octocat#42", "", "", 0, true}, {"empty owner", "/repo#1", "", "", 0, true}, {"empty repo", "owner/#1", "", "", 0, true}, + {"embedded slash", "o/r/x#1", "", "", 0, true}, + {"space", "o/r space#1", "", "", 0, true}, {"non-numeric", "o/r#abc", "", "", 0, true}, {"zero", "o/r#0", "", "", 0, true}, {"negative", "o/r#-1", "", "", 0, true}, @@ -184,7 +186,7 @@ func TestGet_StateMappingFromGitHubFields(t *testing.T) { wantState domain.NormalizedIssueState }{ {"plain open", "open", "", nil, domain.IssueOpen}, - {"open with in-progress label", "open", "", []string{"in-progress"}, domain.IssueInProgress}, + {"open with in-progress label", "open", "", []string{"In-Progress"}, domain.IssueInProgress}, {"open with in-review label", "open", "", []string{"in-review"}, domain.IssueInReview}, {"review wins over progress when both present", "open", "", []string{"in-progress", "in-review"}, domain.IssueInReview}, {"closed completed", "closed", "completed", nil, domain.IssueDone}, @@ -288,7 +290,7 @@ func TestGet_SecondaryRateLimit(t *testing.T) { func TestGet_RejectsWrongProvider(t *testing.T) { f := newFakeGH(t) tr := newTrackerForTest(t, f) - _, err := tr.Get(ctx(), domain.TrackerID{Provider: 
domain.TrackerProviderGitLab, Native: "g/p#1"}) + _, err := tr.Get(ctx(), domain.TrackerID{Provider: domain.TrackerProvider("gitlab"), Native: "g/p#1"}) if !errors.Is(err, ErrWrongProvider) { t.Fatalf("err = %v, want ErrWrongProvider", err) } @@ -518,7 +520,7 @@ func TestList_QueryEncoding(t *testing.T) { func TestList_RejectsWrongProvider(t *testing.T) { f := newFakeGH(t) tr := newTrackerForTest(t, f) - _, err := tr.List(ctx(), domain.TrackerRepo{Provider: domain.TrackerProviderGitLab, Native: "g/p"}, domain.ListFilter{}) + _, err := tr.List(ctx(), domain.TrackerRepo{Provider: domain.TrackerProvider("gitlab"), Native: "g/p"}, domain.ListFilter{}) if !errors.Is(err, ErrWrongProvider) { t.Fatalf("err = %v, want ErrWrongProvider", err) } diff --git a/backend/internal/adapters/workspace/gitworktree/commands.go b/backend/internal/adapters/workspace/gitworktree/commands.go index 5a417dd7..356a50e1 100644 --- a/backend/internal/adapters/workspace/gitworktree/commands.go +++ b/backend/internal/adapters/workspace/gitworktree/commands.go @@ -1,5 +1,7 @@ package gitworktree +import "strings" + func checkRefFormatBranchArgs(repo, branch string) []string { return []string{"-C", repo, "check-ref-format", "--branch", branch} } @@ -34,12 +36,11 @@ func worktreeListPorcelainArgs(repo string) []string { } func baseRefCandidates(branch, defaultBranch string) []string { - return []string{"origin/" + branch, "origin/" + defaultBranch, branch} -} - -func chooseWorktreeAddArgs(repo, path, branch, baseRef string, localBranchExists bool) []string { - if localBranchExists { - return worktreeAddBranchArgs(repo, path, branch) + candidates := []string{"origin/" + branch} + if strings.Contains(defaultBranch, "/") { + candidates = append(candidates, defaultBranch) + } else { + candidates = append(candidates, "origin/"+defaultBranch) } - return worktreeAddNewBranchArgs(repo, branch, path, baseRef) + return append(candidates, branch) } diff --git 
a/backend/internal/adapters/workspace/gitworktree/workspace.go b/backend/internal/adapters/workspace/gitworktree/workspace.go index 9c4cc993..a2a9cd97 100644 --- a/backend/internal/adapters/workspace/gitworktree/workspace.go +++ b/backend/internal/adapters/workspace/gitworktree/workspace.go @@ -143,7 +143,7 @@ func (w *Workspace) Destroy(ctx context.Context, info ports.WorkspaceInfo) error if err != nil { return err } - if worktreeRegistered(records, path) { + if _, ok := findWorktree(records, path); ok { if removeErr != nil { return fmt.Errorf("gitworktree: refusing to remove %q: path is still registered after git worktree prune (worktree remove: %w)", path, removeErr) } @@ -155,26 +155,6 @@ func (w *Workspace) Destroy(ctx context.Context, info ports.WorkspaceInfo) error return nil } -// List returns the managed worktrees that belong to a project. -func (w *Workspace) List(ctx context.Context, project domain.ProjectID) ([]ports.WorkspaceInfo, error) { - if project == "" { - return nil, errors.New("gitworktree: project id is required") - } - repo, err := w.repoPath(project) - if err != nil { - return nil, err - } - records, err := w.listRecords(ctx, repo) - if err != nil { - return nil, err - } - projectRoot, err := w.projectRoot(project) - if err != nil { - return nil, err - } - return filterProjectWorktrees(records, projectRoot, project), nil -} - // Restore re-attaches to an existing worktree for the session if one is still // present, recreating the handle without disturbing its contents. 
func (w *Workspace) Restore(ctx context.Context, cfg ports.WorkspaceConfig) (ports.WorkspaceInfo, error) { @@ -220,7 +200,7 @@ func (w *Workspace) addWorktree(ctx context.Context, repo, path, branch string) return err } if localBranch { - if _, err := w.run(ctx, w.binary, chooseWorktreeAddArgs(repo, path, branch, "", true)...); err != nil { + if _, err := w.run(ctx, w.binary, worktreeAddBranchArgs(repo, path, branch)...); err != nil { return fmt.Errorf("gitworktree: worktree add existing branch %q: %w", branch, err) } return nil @@ -229,7 +209,7 @@ func (w *Workspace) addWorktree(ctx context.Context, repo, path, branch string) if err != nil { return err } - if _, err := w.run(ctx, w.binary, chooseWorktreeAddArgs(repo, path, branch, baseRef, false)...); err != nil { + if _, err := w.run(ctx, w.binary, worktreeAddNewBranchArgs(repo, branch, path, baseRef)...); err != nil { return fmt.Errorf("gitworktree: worktree add branch %q from %q: %w", branch, baseRef, err) } return nil @@ -358,11 +338,6 @@ func (w *Workspace) managedPath(project domain.ProjectID, session domain.Session return w.validateManagedPath(path) } -func (w *Workspace) projectRoot(project domain.ProjectID) (string, error) { - path := filepath.Join(w.managedRoot, string(project)) - return w.validateManagedPath(path) -} - func (w *Workspace) validateManagedPath(path string) (string, error) { if path == "" { return "", fmt.Errorf("%w: empty path", ErrUnsafePath) @@ -397,29 +372,6 @@ func pathWithin(root, path string) (bool, error) { return rel == "." || (rel != "" && rel != ".." 
&& !strings.HasPrefix(rel, ".."+string(os.PathSeparator))), nil } -func filterProjectWorktrees(records []worktreeRecord, projectRoot string, project domain.ProjectID) []ports.WorkspaceInfo { - out := make([]ports.WorkspaceInfo, 0, len(records)) - for _, rec := range records { - path := filepath.Clean(rec.Path) - inside, err := pathWithin(projectRoot, path) - if err != nil || !inside || path == projectRoot { - continue - } - out = append(out, ports.WorkspaceInfo{ - Path: path, - Branch: rec.Branch, - SessionID: domain.SessionID(filepath.Base(path)), - ProjectID: project, - }) - } - return out -} - -func worktreeRegistered(records []worktreeRecord, path string) bool { - _, ok := findWorktree(records, path) - return ok -} - func findWorktree(records []worktreeRecord, path string) (worktreeRecord, bool) { clean := filepath.Clean(path) for _, rec := range records { diff --git a/backend/internal/adapters/workspace/gitworktree/workspace_integration_test.go b/backend/internal/adapters/workspace/gitworktree/workspace_integration_test.go index 2b435c85..6dc50f76 100644 --- a/backend/internal/adapters/workspace/gitworktree/workspace_integration_test.go +++ b/backend/internal/adapters/workspace/gitworktree/workspace_integration_test.go @@ -12,7 +12,7 @@ import ( "github.com/aoagents/agent-orchestrator/backend/internal/ports" ) -func TestWorkspaceIntegrationCreateListRestoreDestroy(t *testing.T) { +func TestWorkspaceIntegrationCreateRestoreDestroy(t *testing.T) { git := requireGit(t) tmp := t.TempDir() repo := setupOriginClone(t, git, tmp) @@ -35,14 +35,6 @@ func TestWorkspaceIntegrationCreateListRestoreDestroy(t *testing.T) { t.Fatalf("created worktree missing seed file: %v", err) } - listed, err := ws.List(ctx, "proj") - if err != nil { - t.Fatalf("list: %v", err) - } - if len(listed) != 1 || listed[0].Path != info.Path || listed[0].Branch != cfg.Branch || listed[0].SessionID != cfg.SessionID { - t.Fatalf("listed = %#v", listed) - } - restored, err := ws.Restore(ctx, cfg) if 
err != nil { t.Fatalf("restore registered: %v", err) diff --git a/backend/internal/adapters/workspace/gitworktree/workspace_test.go b/backend/internal/adapters/workspace/gitworktree/workspace_test.go index afa7872f..fa14f527 100644 --- a/backend/internal/adapters/workspace/gitworktree/workspace_test.go +++ b/backend/internal/adapters/workspace/gitworktree/workspace_test.go @@ -9,7 +9,6 @@ import ( "strings" "testing" - "github.com/aoagents/agent-orchestrator/backend/internal/domain" "github.com/aoagents/agent-orchestrator/backend/internal/ports" ) @@ -25,8 +24,8 @@ func TestCommandArgs(t *testing.T) { }{ {"check ref", checkRefFormatBranchArgs(repo, branch), []string{"-C", repo, "check-ref-format", "--branch", branch}}, {"rev parse", revParseVerifyArgs(repo, "origin/main"), []string{"-C", repo, "rev-parse", "--verify", "--quiet", "origin/main"}}, - {"add existing", chooseWorktreeAddArgs(repo, path, branch, "", true), []string{"-C", repo, "worktree", "add", path, branch}}, - {"add new", chooseWorktreeAddArgs(repo, path, branch, "origin/main", false), []string{"-C", repo, "worktree", "add", "-b", branch, path, "origin/main"}}, + {"add existing", worktreeAddBranchArgs(repo, path, branch), []string{"-C", repo, "worktree", "add", path, branch}}, + {"add new", worktreeAddNewBranchArgs(repo, branch, path, "origin/main"), []string{"-C", repo, "worktree", "add", "-b", branch, path, "origin/main"}}, // No --force: a dirty worktree must cause `git worktree remove` to fail so // the post-prune safety check surfaces the refusal instead of deleting // uncommitted agent work (review item RA). 
@@ -49,6 +48,12 @@ func TestBaseRefCandidates(t *testing.T) { if !reflect.DeepEqual(got, want) { t.Fatalf("candidates = %#v, want %#v", got, want) } + + got = baseRefCandidates("feature/test", "upstream/main") + want = []string{"origin/feature/test", "upstream/main", "feature/test"} + if !reflect.DeepEqual(got, want) { + t.Fatalf("qualified candidates = %#v, want %#v", got, want) + } } func TestParseWorktreePorcelain(t *testing.T) { @@ -88,26 +93,6 @@ func TestParseWorktreePorcelain(t *testing.T) { } } -func TestFilterProjectWorktrees(t *testing.T) { - root := filepath.Clean("/managed/proj") - recs := []worktreeRecord{ - {Path: "/repo", Branch: "main"}, - {Path: "/managed/proj/s1", Branch: "feature/one"}, - {Path: "/managed/proj/s2", Branch: ""}, - {Path: "/managed/other/s3", Branch: "feature/three"}, - } - got := filterProjectWorktrees(recs, root, domain.ProjectID("proj")) - if len(got) != 2 { - t.Fatalf("len = %d, want 2: %#v", len(got), got) - } - if got[0].SessionID != "s1" || got[0].Branch != "feature/one" || got[0].ProjectID != "proj" { - t.Fatalf("first = %#v", got[0]) - } - if got[1].SessionID != "s2" || got[1].Branch != "" { - t.Fatalf("second = %#v", got[1]) - } -} - func TestManagedPathSafety(t *testing.T) { root := t.TempDir() ws, err := New(Options{ManagedRoot: root, RepoResolver: StaticRepoResolver{"proj": root}}) diff --git a/backend/internal/cdc/broadcast.go b/backend/internal/cdc/broadcast.go index b914f766..13937559 100644 --- a/backend/internal/cdc/broadcast.go +++ b/backend/internal/cdc/broadcast.go @@ -5,11 +5,11 @@ import ( "sync" ) -// Broadcaster is the in-process fan-out the poller feeds. Subscribers (the -// WS/SSE transport, wired in the frontend task) register a callback; every -// polled Event is delivered to all current subscribers. It is the single seam -// between the CDC poller and live delivery, so the transport can be built and -// swapped without touching the poller. +// Broadcaster is the in-process fan-out the poller feeds. 
Subscribers such as +// terminal session-state fan-out register a callback; every polled Event is +// delivered to all current subscribers. It is the single seam between the CDC +// poller and live delivery, so transports can be built and swapped without +// touching the poller. type Broadcaster struct { mu sync.RWMutex nextID int diff --git a/backend/internal/cdc/cdc_test.go b/backend/internal/cdc/cdc_test.go index 52a0c574..14ad640c 100644 --- a/backend/internal/cdc/cdc_test.go +++ b/backend/internal/cdc/cdc_test.go @@ -9,33 +9,10 @@ import ( "github.com/aoagents/agent-orchestrator/backend/internal/cdc" "github.com/aoagents/agent-orchestrator/backend/internal/domain" + "github.com/aoagents/agent-orchestrator/backend/internal/project" "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite" ) -// storeSource adapts sqlite.Store to cdc.Source — the same glue the daemon wires. -type storeSource struct{ s *sqlite.Store } - -func (a storeSource) EventsAfter(ctx context.Context, after int64, limit int) ([]cdc.Event, error) { - rows, err := a.s.ReadChangeLogAfter(ctx, after, limit) - if err != nil { - return nil, err - } - out := make([]cdc.Event, len(rows)) - for i, r := range rows { - out[i] = cdc.Event{ - Seq: r.Seq, - ProjectID: r.ProjectID, - SessionID: r.SessionID, - Type: cdc.EventType(r.EventType), - Payload: json.RawMessage(r.Payload), - CreatedAt: r.CreatedAt, - } - } - return out, nil -} - -func (a storeSource) LatestSeq(ctx context.Context) (int64, error) { return a.s.MaxChangeLogSeq(ctx) } - func newStore(t *testing.T) *sqlite.Store { t.Helper() s, err := sqlite.Open(t.TempDir()) @@ -50,15 +27,12 @@ func seedSession(t *testing.T, s *sqlite.Store) domain.SessionRecord { t.Helper() ctx := context.Background() now := time.Now().UTC().Truncate(time.Second) - if err := s.UpsertProject(ctx, sqlite.ProjectRow{ID: "mer", Path: "/m", RegisteredAt: now}); err != nil { + if err := s.Upsert(ctx, project.Row{ID: "mer", Path: "/m", RegisteredAt: now}); 
err != nil { t.Fatal(err) } r, err := s.CreateSession(ctx, domain.SessionRecord{ ProjectID: "mer", Kind: domain.KindWorker, - Lifecycle: domain.CanonicalSessionLifecycle{ - Session: domain.SessionSubstate{State: domain.SessionWorking}, - Activity: domain.ActivitySubstate{State: domain.ActivityActive, LastActivityAt: now, Source: domain.SourceNative}, - }, + Activity: domain.ActivitySubstate{State: domain.ActivityActive, LastActivityAt: now, Source: domain.SourceNative}, CreatedAt: now, UpdatedAt: now, }) if err != nil { @@ -74,18 +48,18 @@ func TestE2E_StoreWriteToBroadcast(t *testing.T) { s := newStore(t) r := seedSession(t, s) // -> session_created (seq 1) - r.Lifecycle.Session.State = domain.SessionIdle + r.Activity.State = domain.ActivityIdle if err := s.UpdateSession(ctx, r); err != nil { // -> session_updated (seq 2) t.Fatal(err) } - if err := s.UpsertPR(ctx, domain.PRRow{URL: "pr1", SessionID: string(r.ID), UpdatedAt: r.UpdatedAt}); err != nil { // -> pr_created (seq 3) + if err := s.WritePR(ctx, domain.PullRequest{URL: "pr1", SessionID: r.ID, UpdatedAt: r.UpdatedAt}, nil, nil); err != nil { // -> pr_created (seq 3) t.Fatal(err) } var got []cdc.Event bc := cdc.NewBroadcaster() bc.Subscribe(func(e cdc.Event) { got = append(got, e) }) - p := cdc.NewPoller(storeSource{s}, bc, cdc.PollerConfig{}) // StartSeq 0: read from the top + p := cdc.NewPoller(s, bc, cdc.PollerConfig{}) // StartSeq 0: read from the top if err := p.Poll(ctx); err != nil { t.Fatal(err) } @@ -109,7 +83,7 @@ func TestE2E_StoreWriteToBroadcast(t *testing.T) { if err := json.Unmarshal(got[0].Payload, &payload); err != nil { t.Fatalf("payload not JSON: %v", err) } - if payload["id"] != string(r.ID) || payload["state"] != "working" { + if payload["id"] != string(r.ID) || payload["activity"] != "active" { t.Fatalf("payload = %v", payload) } @@ -135,17 +109,21 @@ func TestE2E_ConcurrentPollerLiveDelivery(t *testing.T) { bc := cdc.NewBroadcaster() bc.Subscribe(func(e cdc.Event) { mu.Lock(); got = 
append(got, e); mu.Unlock() }) - p := cdc.NewPoller(storeSource{s}, bc, cdc.PollerConfig{}) // from the top + p := cdc.NewPoller(s, bc, cdc.PollerConfig{}) // from the top done := p.Start(ctx) const n = 6 for i := 0; i < n; i++ { - r.Lifecycle.IsAlive = i%2 == 0 // toggles is_alive -> sessions_cdc_update fires + if i%2 == 0 { + r.Activity.State = domain.ActivityActive + } else { + r.Activity.State = domain.ActivityIdle + } if err := s.UpdateSession(ctx, r); err != nil { t.Fatal(err) } } - want := 1 + n // session_created + n updates + want := n // session_created + n-1 activity updates; first write is unchanged deadline := time.Now().Add(5 * time.Second) for { diff --git a/backend/internal/cdc/event.go b/backend/internal/cdc/event.go index 16caaf74..571ede2d 100644 --- a/backend/internal/cdc/event.go +++ b/backend/internal/cdc/event.go @@ -1,7 +1,8 @@ // Package cdc is the change-data-capture delivery layer. Change events are // captured durably by SQLite triggers into the change_log table (see the storage // migrations); this package POLLS that log and fans new events out, in order, to -// in-process subscribers (the WS/SSE transport, wired in the frontend task). +// in-process subscribers such as terminal session-state fan-out. Future SSE/event +// endpoints can subscribe here too. // // There is no durable outbox/JSONL/janitor machinery: the change_log table IS // the durable, ordered source of truth, and clients catch up by reading it from @@ -19,13 +20,11 @@ type EventType string // Event types, one per row-change the DB triggers emit into change_log. 
const ( - EventSessionCreated EventType = "session_created" - EventSessionUpdated EventType = "session_updated" - EventPRCreated EventType = "pr_created" - EventPRUpdated EventType = "pr_updated" - EventPRCheckRecorded EventType = "pr_check_recorded" - EventNotificationCreated EventType = "notification_created" - EventNotificationUpdated EventType = "notification_updated" + EventSessionCreated EventType = "session_created" + EventSessionUpdated EventType = "session_updated" + EventPRCreated EventType = "pr_created" + EventPRUpdated EventType = "pr_updated" + EventPRCheckRecorded EventType = "pr_check_recorded" ) // Event is one CDC change read from change_log. Seq is the monotonic ordering + diff --git a/backend/internal/cli/doctor.go b/backend/internal/cli/doctor.go index 59ad221c..02cd671c 100644 --- a/backend/internal/cli/doctor.go +++ b/backend/internal/cli/doctor.go @@ -10,6 +10,7 @@ import ( "github.com/spf13/cobra" + "github.com/aoagents/agent-orchestrator/backend/internal/adapters/runtime/zellij" "github.com/aoagents/agent-orchestrator/backend/internal/config" ) @@ -38,6 +39,7 @@ func newDoctorCommand(ctx *commandContext) *cobra.Command { cmd := &cobra.Command{ Use: "doctor", Short: "Run local AO health checks", + Args: noArgs, RunE: func(cmd *cobra.Command, args []string) error { checks := ctx.runDoctor(cmd.Context()) failures := 0 @@ -97,12 +99,12 @@ func (c *commandContext) runDoctor(ctx context.Context) []doctorCheck { } else { level := doctorPass switch st.State { - case "stale", "not_ready": + case stateStale, stateNotReady: level = doctorWarn - case "unhealthy": + case stateUnhealthy: level = doctorFail } - msg := st.State + msg := string(st.State) if st.PID != 0 { msg = fmt.Sprintf("%s pid=%d port=%d", msg, st.PID, st.Port) } @@ -114,8 +116,7 @@ func (c *commandContext) runDoctor(ctx context.Context) []doctorCheck { checks = append(checks, c.checkTool("git", true), - c.checkTool("tmux", false), - c.checkTool("zellij", false), + c.checkZellij(ctx), ) 
return checks } @@ -145,6 +146,24 @@ func checkStore(dataDir string) doctorCheck { } } +func (c *commandContext) checkZellij(ctx context.Context) doctorCheck { + path, err := c.deps.LookPath("zellij") + if err != nil { + return doctorCheck{Level: doctorWarn, Name: "zellij", Message: "not found in PATH"} + } + reqCtx, cancel := context.WithTimeout(ctx, probeTimeout) + defer cancel() + out, err := c.deps.CommandOutput(reqCtx, path, "--version") + if err != nil { + return doctorCheck{Level: doctorFail, Name: "zellij", Message: fmt.Sprintf("%s: %v", path, err)} + } + version, err := zellij.CheckVersionOutput(string(out)) + if err != nil { + return doctorCheck{Level: doctorFail, Name: "zellij", Message: fmt.Sprintf("%s: %v", path, err)} + } + return doctorCheck{Level: doctorPass, Name: "zellij", Message: fmt.Sprintf("%s (%s)", path, version)} +} + func (c *commandContext) checkTool(name string, required bool) doctorCheck { path, err := c.deps.LookPath(name) if err == nil { diff --git a/backend/internal/cli/doctor_test.go b/backend/internal/cli/doctor_test.go new file mode 100644 index 00000000..14dfcb2c --- /dev/null +++ b/backend/internal/cli/doctor_test.go @@ -0,0 +1,71 @@ +package cli + +import ( + "context" + "errors" + "strings" + "testing" +) + +func TestDoctorChecksZellijVersion(t *testing.T) { + setConfigEnv(t) + cmdPath := map[string]string{"git": "/bin/git", "zellij": "/bin/zellij"} + c := &commandContext{deps: Deps{ + LookPath: func(name string) (string, error) { return cmdPath[name], nil }, + CommandOutput: func(_ context.Context, name string, args ...string) ([]byte, error) { + if name != "/bin/zellij" || len(args) != 1 || args[0] != "--version" { + t.Fatalf("unexpected command: %s %v", name, args) + } + return []byte("zellij 0.44.3\n"), nil + }, + }.withDefaults()} + + check := findDoctorCheck(t, c.runDoctor(context.Background()), "zellij") + if check.Level != doctorPass || !strings.Contains(check.Message, "0.44.3") { + t.Fatalf("zellij check = %+v, want 
PASS with version", check) + } +} + +func TestDoctorFailsUnsupportedZellijVersion(t *testing.T) { + setConfigEnv(t) + cmdPath := map[string]string{"git": "/bin/git", "zellij": "/bin/zellij"} + c := &commandContext{deps: Deps{ + LookPath: func(name string) (string, error) { return cmdPath[name], nil }, + CommandOutput: func(context.Context, string, ...string) ([]byte, error) { + return []byte("zellij 0.44.2\n"), nil + }, + }.withDefaults()} + + check := findDoctorCheck(t, c.runDoctor(context.Background()), "zellij") + if check.Level != doctorFail || !strings.Contains(check.Message, "require >= 0.44.3") { + t.Fatalf("zellij check = %+v, want FAIL with minimum version", check) + } +} + +func TestDoctorWarnsWhenZellijMissing(t *testing.T) { + setConfigEnv(t) + c := &commandContext{deps: Deps{ + LookPath: func(name string) (string, error) { + if name == "git" { + return "/bin/git", nil + } + return "", errors.New("missing") + }, + }.withDefaults()} + + check := findDoctorCheck(t, c.runDoctor(context.Background()), "zellij") + if check.Level != doctorWarn { + t.Fatalf("zellij check = %+v, want WARN", check) + } +} + +func findDoctorCheck(t *testing.T, checks []doctorCheck, name string) doctorCheck { + t.Helper() + for _, check := range checks { + if check.Name == name { + return check + } + } + t.Fatalf("doctor check %q not found in %+v", name, checks) + return doctorCheck{} +} diff --git a/backend/internal/cli/process.go b/backend/internal/cli/process.go index 19c4d19f..c81a0361 100644 --- a/backend/internal/cli/process.go +++ b/backend/internal/cli/process.go @@ -13,11 +13,7 @@ type processStartConfig struct { Stderr *os.File } -type processHandle struct { - PID int -} - -func startProcess(cfg processStartConfig) (processHandle, error) { +func startProcess(cfg processStartConfig) error { cmd := exec.Command(cfg.Path, cfg.Args...) 
cmd.Env = cfg.Env cmd.Stdout = cfg.Stdout @@ -27,8 +23,8 @@ func startProcess(cfg processStartConfig) (processHandle, error) { // freshly spawned daemon (it would otherwise share the launcher's group). cmd.SysProcAttr = detachSysProcAttr() if err := cmd.Start(); err != nil { - return processHandle{}, err + return err } go func() { _ = cmd.Wait() }() - return processHandle{PID: cmd.Process.Pid}, nil + return nil } diff --git a/backend/internal/cli/process_unix.go b/backend/internal/cli/process_unix.go index 9963d9e9..edb610a4 100644 --- a/backend/internal/cli/process_unix.go +++ b/backend/internal/cli/process_unix.go @@ -2,18 +2,7 @@ package cli -import ( - "errors" - "syscall" -) - -func processAlive(pid int) bool { - if pid <= 0 { - return false - } - err := syscall.Kill(pid, 0) - return err == nil || errors.Is(err, syscall.EPERM) -} +import "syscall" // detachSysProcAttr puts the daemon in a new session (Setsid) so it is no // longer in the launcher's foreground process group and won't receive the diff --git a/backend/internal/cli/process_windows.go b/backend/internal/cli/process_windows.go index 3ff8190a..03cc81a1 100644 --- a/backend/internal/cli/process_windows.go +++ b/backend/internal/cli/process_windows.go @@ -3,32 +3,11 @@ package cli import ( - "errors" "syscall" "golang.org/x/sys/windows" ) -func processAlive(pid int) bool { - if pid <= 0 { - return false - } - handle, err := windows.OpenProcess(windows.SYNCHRONIZE, false, uint32(pid)) - if err != nil { - if errors.Is(err, windows.ERROR_ACCESS_DENIED) { - return true - } - return false - } - defer windows.CloseHandle(handle) - - status, err := windows.WaitForSingleObject(handle, 0) - if err != nil { - return false - } - return status == uint32(windows.WAIT_TIMEOUT) -} - // detachSysProcAttr starts the daemon in a new process group so it does not // receive the console's CTRL_C/CTRL_BREAK while `ao start` waits for readiness. 
func detachSysProcAttr() *syscall.SysProcAttr { diff --git a/backend/internal/cli/root.go b/backend/internal/cli/root.go index 36e83e5a..ce015738 100644 --- a/backend/internal/cli/root.go +++ b/backend/internal/cli/root.go @@ -3,6 +3,7 @@ package cli import ( + "context" "errors" "io" "net/http" @@ -13,6 +14,7 @@ import ( "github.com/spf13/cobra" "github.com/aoagents/agent-orchestrator/backend/internal/daemon" + "github.com/aoagents/agent-orchestrator/backend/internal/processalive" ) // Execute runs the ao CLI with process stdio. @@ -48,31 +50,37 @@ type Deps struct { Out io.Writer Err io.Writer - HTTPClient *http.Client - Executable func() (string, error) - StartProcess func(processStartConfig) (processHandle, error) - ProcessAlive func(pid int) bool - LookPath func(file string) (string, error) - Now func() time.Time - Sleep func(time.Duration) + HTTPClient *http.Client + Executable func() (string, error) + StartProcess func(processStartConfig) error + ProcessAlive func(pid int) bool + LookPath func(file string) (string, error) + CommandOutput func(ctx context.Context, name string, args ...string) ([]byte, error) + Now func() time.Time + Sleep func(time.Duration) } // DefaultDeps returns production dependencies. 
func DefaultDeps() Deps { return Deps{ - In: os.Stdin, - Out: os.Stdout, - Err: os.Stderr, - HTTPClient: &http.Client{Timeout: 2 * time.Second}, - Executable: os.Executable, - StartProcess: startProcess, - ProcessAlive: processAlive, - LookPath: exec.LookPath, - Now: time.Now, - Sleep: time.Sleep, + In: os.Stdin, + Out: os.Stdout, + Err: os.Stderr, + HTTPClient: &http.Client{Timeout: 2 * time.Second}, + Executable: os.Executable, + StartProcess: startProcess, + ProcessAlive: processalive.Alive, + LookPath: exec.LookPath, + CommandOutput: commandOutput, + Now: time.Now, + Sleep: time.Sleep, } } +func commandOutput(ctx context.Context, name string, args ...string) ([]byte, error) { + return exec.CommandContext(ctx, name, args...).CombinedOutput() +} + func (d Deps) withDefaults() Deps { def := DefaultDeps() if d.In == nil { @@ -99,6 +107,9 @@ func (d Deps) withDefaults() Deps { if d.LookPath == nil { d.LookPath = def.LookPath } + if d.CommandOutput == nil { + d.CommandOutput = def.CommandOutput + } if d.Now == nil { d.Now = def.Now } @@ -146,11 +157,19 @@ type commandContext struct { deps Deps } +func noArgs(cmd *cobra.Command, args []string) error { + if err := cobra.ExactArgs(0)(cmd, args); err != nil { + return usageError{err} + } + return nil +} + func newDaemonCommand() *cobra.Command { return &cobra.Command{ Use: "daemon", Short: "Run the AO backend daemon", Hidden: true, + Args: noArgs, RunE: func(cmd *cobra.Command, args []string) error { return daemon.Run() }, diff --git a/backend/internal/cli/root_test.go b/backend/internal/cli/root_test.go index 5b920531..f9576cb9 100644 --- a/backend/internal/cli/root_test.go +++ b/backend/internal/cli/root_test.go @@ -34,6 +34,27 @@ func TestRootHelpDoesNotShowDaemon(t *testing.T) { } } +func TestCommandsRejectUnexpectedArgs(t *testing.T) { + for _, args := range [][]string{ + {"daemon", "extra"}, + {"start", "extra"}, + {"stop", "extra"}, + {"status", "extra"}, + {"doctor", "extra"}, + {"version", "extra"}, + } { + 
t.Run(strings.Join(args, " "), func(t *testing.T) { + _, _, err := executeCLI(t, Deps{}, args...) + if err == nil { + t.Fatal("expected usage error") + } + if got := ExitCode(err); got != 2 { + t.Fatalf("ExitCode(%v) = %d, want 2", err, got) + } + }) + } +} + func TestStatusStoppedJSON(t *testing.T) { setConfigEnv(t) @@ -71,9 +92,9 @@ func TestStartReturnsExistingReadyDaemon(t *testing.T) { var started bool out, _, err := executeCLI(t, Deps{ ProcessAlive: func(pid int) bool { return pid == os.Getpid() }, - StartProcess: func(processStartConfig) (processHandle, error) { + StartProcess: func(processStartConfig) error { started = true - return processHandle{}, nil + return nil }, Now: func() time.Time { return time.Unix(110, 0).UTC() }, }, "start", "--json") @@ -115,7 +136,7 @@ func TestStartClearsStaleRunFileBeforeSpawning(t *testing.T) { out, _, err := executeCLI(t, Deps{ ProcessAlive: func(pid int) bool { return pid == 4242 || pid == os.Getpid() }, - StartProcess: func(processStartConfig) (processHandle, error) { + StartProcess: func(processStartConfig) error { info, err := runfile.Read(cfg.runFile) if err != nil { t.Fatal(err) @@ -127,7 +148,7 @@ func TestStartClearsStaleRunFileBeforeSpawning(t *testing.T) { if err := runfile.Write(cfg.runFile, runfile.Info{PID: os.Getpid(), Port: port, StartedAt: time.Unix(110, 0).UTC()}); err != nil { t.Fatal(err) } - return processHandle{PID: os.Getpid()}, nil + return nil }, Now: func() time.Time { return time.Unix(120, 0).UTC() }, }, "start", "--json") @@ -301,9 +322,9 @@ func TestStartDoesNotSpawnWhenLiveProbeFails(t *testing.T) { var started bool _, _, err := executeCLI(t, Deps{ ProcessAlive: func(pid int) bool { return pid == 4242 }, - StartProcess: func(processStartConfig) (processHandle, error) { + StartProcess: func(processStartConfig) error { started = true - return processHandle{}, nil + return nil }, }, "start", "--timeout", "1ns", "--json") if err == nil { diff --git a/backend/internal/cli/start.go 
b/backend/internal/cli/start.go index c6e7ee72..a67e4007 100644 --- a/backend/internal/cli/start.go +++ b/backend/internal/cli/start.go @@ -26,6 +26,7 @@ func newStartCommand(ctx *commandContext) *cobra.Command { cmd := &cobra.Command{ Use: "start", Short: "Start the AO daemon", + Args: noArgs, RunE: func(cmd *cobra.Command, args []string) error { st, err := ctx.startDaemon(cmd.Context(), opts) if err != nil { @@ -34,7 +35,7 @@ func newStartCommand(ctx *commandContext) *cobra.Command { if opts.json { return writeJSON(cmd.OutOrStdout(), st) } - if st.State == "ready" { + if st.State == stateReady { _, err = fmt.Fprintf(cmd.OutOrStdout(), "AO daemon ready (pid %d, port %d)\n", st.PID, st.Port) return err } @@ -57,17 +58,17 @@ func (c *commandContext) startDaemon(ctx context.Context, opts startOptions) (da if err != nil { return daemonStatus{}, err } - if st.State == "ready" { + if st.State == stateReady { return st, nil } - if st.State != "stopped" && st.State != "stale" { + if st.State != stateStopped && st.State != stateStale { ready, waitErr := c.waitForReady(ctx, opts.timeout) if waitErr == nil { return ready, nil } return daemonStatus{}, fmt.Errorf("daemon process exists but did not become ready: %w", waitErr) } - if st.State == "stale" { + if st.State == stateStale { if err := runfile.Remove(cfg.RunFilePath); err != nil { return daemonStatus{}, err } @@ -91,7 +92,7 @@ func (c *commandContext) startDaemon(ctx context.Context, opts startOptions) (da } defer func() { _ = logFile.Close() }() - if _, err := c.deps.StartProcess(processStartConfig{ + if err := c.deps.StartProcess(processStartConfig{ Path: exe, Args: []string{"daemon"}, Env: os.Environ(), @@ -128,7 +129,7 @@ func (c *commandContext) waitForReady(ctx context.Context, timeout time.Duration lastErr = err } else { last = st - if st.State == "ready" { + if st.State == stateReady { return st, nil } } diff --git a/backend/internal/cli/status.go b/backend/internal/cli/status.go index 8a020d5d..d0b3995b 100644 
--- a/backend/internal/cli/status.go +++ b/backend/internal/cli/status.go @@ -20,17 +20,27 @@ type statusOptions struct { json bool } +type daemonState string + +const ( + stateReady daemonState = "ready" + stateStopped daemonState = "stopped" + stateStale daemonState = "stale" + stateUnhealthy daemonState = "unhealthy" + stateNotReady daemonState = "not_ready" +) + type daemonStatus struct { - State string `json:"state"` - PID int `json:"pid,omitempty"` - Port int `json:"port,omitempty"` - StartedAt *time.Time `json:"startedAt,omitempty"` - Uptime string `json:"uptime,omitempty"` - RunFile string `json:"runFile"` - DataDir string `json:"dataDir"` - Health string `json:"health,omitempty"` - Ready string `json:"ready,omitempty"` - Error string `json:"error,omitempty"` + State daemonState `json:"state"` + PID int `json:"pid,omitempty"` + Port int `json:"port,omitempty"` + StartedAt *time.Time `json:"startedAt,omitempty"` + Uptime string `json:"uptime,omitempty"` + RunFile string `json:"runFile"` + DataDir string `json:"dataDir"` + Health string `json:"health,omitempty"` + Ready string `json:"ready,omitempty"` + Error string `json:"error,omitempty"` owned bool } @@ -45,6 +55,7 @@ func newStatusCommand(ctx *commandContext) *cobra.Command { cmd := &cobra.Command{ Use: "status", Short: "Show AO daemon status", + Args: noArgs, RunE: func(cmd *cobra.Command, args []string) error { st, err := ctx.inspectDaemon(cmd.Context()) if err != nil { @@ -65,7 +76,7 @@ func (c *commandContext) inspectDaemon(ctx context.Context) (daemonStatus, error if err != nil { return daemonStatus{}, err } - st := daemonStatus{State: "stopped", RunFile: cfg.RunFilePath, DataDir: cfg.DataDir} + st := daemonStatus{State: stateStopped, RunFile: cfg.RunFilePath, DataDir: cfg.DataDir} info, err := runfile.Read(cfg.RunFilePath) if err != nil { @@ -82,47 +93,47 @@ func (c *commandContext) inspectDaemon(ctx context.Context) (daemonStatus, error st.Uptime = formatUptime(c.deps.Now().Sub(info.StartedAt)) if 
!c.deps.ProcessAlive(info.PID) { - st.State = "stale" + st.State = stateStale st.Error = "run-file points to a dead process" return st, nil } health, err := c.readProbe(ctx, info.Port, "healthz") if err != nil { - st.State = "unhealthy" + st.State = stateUnhealthy st.Error = err.Error() return st, nil } if err := verifyProbeOwner(health, info.PID, "healthz"); err != nil { - st.State = "stale" + st.State = stateStale st.Error = err.Error() return st, nil } st.owned = true st.Health = health.Status if health.Status != "ok" { - st.State = "unhealthy" + st.State = stateUnhealthy return st, nil } ready, err := c.readProbe(ctx, info.Port, "readyz") if err != nil { - st.State = "not_ready" + st.State = stateNotReady st.Error = err.Error() return st, nil } if err := verifyProbeOwner(ready, info.PID, "readyz"); err != nil { - st.State = "stale" + st.State = stateStale st.owned = false st.Error = err.Error() return st, nil } st.Ready = ready.Status - if ready.Status == "ready" { - st.State = "ready" + if ready.Status == string(stateReady) { + st.State = stateReady return st, nil } - st.State = "not_ready" + st.State = stateNotReady return st, nil } diff --git a/backend/internal/cli/stop.go b/backend/internal/cli/stop.go index b363b463..d77f508a 100644 --- a/backend/internal/cli/stop.go +++ b/backend/internal/cli/stop.go @@ -24,6 +24,7 @@ func newStopCommand(ctx *commandContext) *cobra.Command { cmd := &cobra.Command{ Use: "stop", Short: "Stop the AO daemon", + Args: noArgs, RunE: func(cmd *cobra.Command, args []string) error { st, err := ctx.stopDaemon(cmd.Context(), opts) if err != nil { @@ -32,7 +33,7 @@ func newStopCommand(ctx *commandContext) *cobra.Command { if opts.json { return writeJSON(cmd.OutOrStdout(), st) } - if st.State == "stopped" { + if st.State == stateStopped { _, err = fmt.Fprintln(cmd.OutOrStdout(), "AO daemon stopped") return err } @@ -54,13 +55,13 @@ func (c *commandContext) stopDaemon(ctx context.Context, opts stopOptions) (daem return daemonStatus{}, 
err } switch st.State { - case "stopped": + case stateStopped: return st, nil - case "stale": + case stateStale: if err := runfile.Remove(cfg.RunFilePath); err != nil { return daemonStatus{}, err } - return daemonStatus{State: "stopped", RunFile: cfg.RunFilePath, DataDir: cfg.DataDir}, nil + return daemonStatus{State: stateStopped, RunFile: cfg.RunFilePath, DataDir: cfg.DataDir}, nil } if !st.owned { if st.Error != "" { @@ -112,7 +113,7 @@ func (c *commandContext) waitForStopped(ctx context.Context, pid int, runFilePat } alive := c.deps.ProcessAlive(pid) if info == nil { - return daemonStatus{State: "stopped", RunFile: runFilePath, DataDir: dataDir}, nil + return daemonStatus{State: stateStopped, RunFile: runFilePath, DataDir: dataDir}, nil } if !alive { // Only remove the run-file if it still belongs to the process we @@ -124,7 +125,7 @@ func (c *commandContext) waitForStopped(ctx context.Context, pid int, runFilePat return daemonStatus{}, err } } - return daemonStatus{State: "stopped", RunFile: runFilePath, DataDir: dataDir}, nil + return daemonStatus{State: stateStopped, RunFile: runFilePath, DataDir: dataDir}, nil } if !c.deps.Now().Before(deadline) { return daemonStatus{}, fmt.Errorf("daemon pid %d did not stop within %s", pid, timeout) diff --git a/backend/internal/cli/stop_test.go b/backend/internal/cli/stop_test.go index 85b6a509..e364dce1 100644 --- a/backend/internal/cli/stop_test.go +++ b/backend/internal/cli/stop_test.go @@ -33,7 +33,7 @@ func TestWaitForStoppedKeepsRunFileFromConcurrentStart(t *testing.T) { if err != nil { t.Fatal(err) } - if st.State != "stopped" { + if st.State != stateStopped { t.Fatalf("state = %q, want stopped", st.State) } @@ -70,7 +70,7 @@ func TestWaitForStoppedRemovesOwnRunFile(t *testing.T) { if err != nil { t.Fatal(err) } - if st.State != "stopped" { + if st.State != stateStopped { t.Fatalf("state = %q, want stopped", st.State) } info, err := runfile.Read(runFile) diff --git a/backend/internal/cli/version.go 
b/backend/internal/cli/version.go index 7297cc13..9af3ee29 100644 --- a/backend/internal/cli/version.go +++ b/backend/internal/cli/version.go @@ -31,6 +31,7 @@ func newVersionCommand() *cobra.Command { return &cobra.Command{ Use: "version", Short: "Print version information", + Args: noArgs, RunE: func(cmd *cobra.Command, args []string) error { _, err := fmt.Fprintln(cmd.OutOrStdout(), VersionString()) return err diff --git a/backend/internal/config/config.go b/backend/internal/config/config.go index 719e7524..529e3707 100644 --- a/backend/internal/config/config.go +++ b/backend/internal/config/config.go @@ -16,17 +16,14 @@ import ( const ( // LoopbackHost is the only host the daemon ever binds. There is deliberately // no AO_HOST env var: the daemon has no auth/CORS/TLS and a stray - // AO_HOST=0.0.0.0 would turn it into a public no-auth service. The legacy - // TS server bound all-interfaces by accident and docs/CROSS_PLATFORM.md - // already calls that out as a bug; the Go rewrite fixes it by removing the - // knob entirely. If a non-default loopback (e.g. ::1, 127.0.0.2) is ever - // needed, add it back with an IsLoopback() validator — not a raw env read. + // AO_HOST=0.0.0.0 would turn it into a public no-auth service. If a + // non-default loopback (e.g. ::1, 127.0.0.2) is ever needed, add it back with + // an IsLoopback() validator — not a raw env read. LoopbackHost = "127.0.0.1" - // DefaultPort is the single port the whole surface (REST, SSE, WS, static) - // is served from. Single-port keeps it same-origin: no CORS, one lifecycle. + // DefaultPort is the single port for REST, terminal mux, health, and control. DefaultPort = 3001 - // DefaultRequestTimeout bounds a single request. Long-lived surfaces (SSE, - // WS) are mounted outside this timeout; it guards the REST surface only. + // DefaultRequestTimeout bounds a single REST request. Long-lived terminal mux + // connections are mounted outside this timeout. 
DefaultRequestTimeout = 60 * time.Second // DefaultShutdownTimeout is the hard cap on graceful shutdown. After this // the process exits even if connections are still draining. @@ -47,8 +44,8 @@ type Config struct { // RunFilePath is where the PID + port handshake file (running.json) is // written so the Electron supervisor can discover and reap the daemon. RunFilePath string - // DataDir is the directory holding durable state (the SQLite database and - // the CDC JSONL log). It is created on first use by the storage layer. + // DataDir is the directory holding durable SQLite state: DB and WAL files. + // It is created on first use by the storage layer. DataDir string } @@ -136,7 +133,7 @@ func parsePositiveDuration(name, raw string) (time.Duration, error) { } // resolveRunFilePath picks where running.json lives. An explicit AO_RUN_FILE -// wins; otherwise it sits under the per-user state directory so multiple repos +// wins; otherwise it sits under the per-user config directory so multiple repos // share one supervisor handshake location. func resolveRunFilePath() (string, error) { if p, ok := os.LookupEnv("AO_RUN_FILE"); ok && p != "" { @@ -150,7 +147,7 @@ func resolveRunFilePath() (string, error) { } // resolveDataDir picks where durable state (the SQLite DB) lives. An explicit -// AO_DATA_DIR wins; otherwise it sits under the per-user state directory +// AO_DATA_DIR wins; otherwise it sits under the per-user config directory // alongside running.json. 
func resolveDataDir() (string, error) { if p, ok := os.LookupEnv("AO_DATA_DIR"); ok && p != "" { diff --git a/backend/internal/config/config_test.go b/backend/internal/config/config_test.go index dfcb5b8a..2d910f9c 100644 --- a/backend/internal/config/config_test.go +++ b/backend/internal/config/config_test.go @@ -1,6 +1,8 @@ package config import ( + "path/filepath" + "strings" "testing" "time" ) @@ -8,7 +10,7 @@ import ( func TestLoadDefaults(t *testing.T) { // Clear every recognised var so we observe pure defaults regardless of the // surrounding environment. - for _, k := range []string{"AO_PORT", "AO_REQUEST_TIMEOUT", "AO_SHUTDOWN_TIMEOUT", "AO_RUN_FILE"} { + for _, k := range []string{"AO_PORT", "AO_REQUEST_TIMEOUT", "AO_SHUTDOWN_TIMEOUT", "AO_RUN_FILE", "AO_DATA_DIR"} { t.Setenv(k, "") } @@ -31,6 +33,15 @@ func TestLoadDefaults(t *testing.T) { if cfg.RunFilePath == "" { t.Error("RunFilePath is empty, want a resolved default path") } + if !strings.HasSuffix(cfg.RunFilePath, filepath.Join("agent-orchestrator", "running.json")) { + t.Errorf("RunFilePath = %q, want agent-orchestrator/running.json suffix", cfg.RunFilePath) + } + if cfg.DataDir == "" { + t.Error("DataDir is empty, want a resolved default path") + } + if !strings.HasSuffix(cfg.DataDir, filepath.Join("agent-orchestrator", "data")) { + t.Errorf("DataDir = %q, want agent-orchestrator/data suffix", cfg.DataDir) + } } func TestLoadOverrides(t *testing.T) { @@ -38,6 +49,7 @@ func TestLoadOverrides(t *testing.T) { t.Setenv("AO_REQUEST_TIMEOUT", "5s") t.Setenv("AO_SHUTDOWN_TIMEOUT", "3s") t.Setenv("AO_RUN_FILE", "/tmp/ao-test-running.json") + t.Setenv("AO_DATA_DIR", "/tmp/ao-test-data") cfg, err := Load() if err != nil { @@ -55,6 +67,9 @@ func TestLoadOverrides(t *testing.T) { if cfg.RunFilePath != "/tmp/ao-test-running.json" { t.Errorf("RunFilePath = %q, want /tmp/ao-test-running.json", cfg.RunFilePath) } + if cfg.DataDir != "/tmp/ao-test-data" { + t.Errorf("DataDir = %q, want /tmp/ao-test-data", 
cfg.DataDir) + } } func TestLoadInvalid(t *testing.T) { diff --git a/backend/internal/daemon/cdc_wiring.go b/backend/internal/daemon/cdc_wiring.go index a76c5c78..8a0ebbcf 100644 --- a/backend/internal/daemon/cdc_wiring.go +++ b/backend/internal/daemon/cdc_wiring.go @@ -2,18 +2,17 @@ package daemon import ( "context" - "encoding/json" "log/slog" "github.com/aoagents/agent-orchestrator/backend/internal/cdc" "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite" ) -// cdcPipeline owns the running CDC poller and the broadcaster the SSE transport -// subscribes to. The DB triggers write change_log; the poller tails it and fans -// each new event out through the broadcaster. Durable catch-up is the client's -// job (it reads change_log from its own Last-Event-ID), so the poller only -// pushes live events and re-seeks to head on restart. +// cdcPipeline owns the running CDC poller and live-event broadcaster. The DB +// triggers write change_log; the poller tails it and fans each new event out to +// live transports such as terminal session-state subscriptions. Durable catch-up +// is a client concern; the poller only pushes live events and re-seeks to head +// on restart. type cdcPipeline struct { Broadcaster *cdc.Broadcaster done <-chan struct{} @@ -23,7 +22,7 @@ type cdcPipeline struct { // when ctx is cancelled; Stop waits for it to drain. func startCDC(ctx context.Context, store *sqlite.Store, logger *slog.Logger) (*cdcPipeline, error) { bcast := cdc.NewBroadcaster() - poller := cdc.NewPoller(cdcSource{store}, bcast, cdc.PollerConfig{Logger: logger}) + poller := cdc.NewPoller(store, bcast, cdc.PollerConfig{Logger: logger}) if err := poller.SeekToHead(ctx); err != nil { return nil, err } @@ -36,29 +35,3 @@ func (p *cdcPipeline) Stop() error { <-p.done return nil } - -// cdcSource adapts *sqlite.Store's change_log reads to cdc.Source. 
-type cdcSource struct{ store *sqlite.Store } - -func (s cdcSource) EventsAfter(ctx context.Context, after int64, limit int) ([]cdc.Event, error) { - rows, err := s.store.ReadChangeLogAfter(ctx, after, limit) - if err != nil { - return nil, err - } - out := make([]cdc.Event, len(rows)) - for i, r := range rows { - out[i] = cdc.Event{ - Seq: r.Seq, - ProjectID: r.ProjectID, - SessionID: r.SessionID, - Type: cdc.EventType(r.EventType), - Payload: json.RawMessage(r.Payload), - CreatedAt: r.CreatedAt, - } - } - return out, nil -} - -func (s cdcSource) LatestSeq(ctx context.Context) (int64, error) { - return s.store.MaxChangeLogSeq(ctx) -} diff --git a/backend/internal/daemon/daemon.go b/backend/internal/daemon/daemon.go index 3cb4f45c..b8d89053 100644 --- a/backend/internal/daemon/daemon.go +++ b/backend/internal/daemon/daemon.go @@ -11,9 +11,10 @@ import ( "os/signal" "syscall" - "github.com/aoagents/agent-orchestrator/backend/internal/adapters/runtime/tmux" + "github.com/aoagents/agent-orchestrator/backend/internal/adapters/runtime/zellij" "github.com/aoagents/agent-orchestrator/backend/internal/config" "github.com/aoagents/agent-orchestrator/backend/internal/httpd" + "github.com/aoagents/agent-orchestrator/backend/internal/project" "github.com/aoagents/agent-orchestrator/backend/internal/runfile" "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite" "github.com/aoagents/agent-orchestrator/backend/internal/terminal" @@ -38,13 +39,9 @@ func Run() error { return fmt.Errorf("daemon already running (pid %d, port %d); refusing to start", live.PID, live.Port) } - // Open the durable store and bring up the CDC substrate: the DB triggers - // capture changes into change_log, the poller tails it, and the broadcaster - // fans events out to the SSE transport. 
The LCM/Session Manager and the HTTP - // API routes that drive and read this store are owned by the daemon lane and - // are wired there once their collaborators (Notifier, AgentMessenger, and the - // runtime/agent/workspace plugins) have production implementations; here we - // stand up the persistence + change-delivery foundation they build on. + // Open the durable store and bring up the CDC substrate: DB triggers capture + // changes into change_log, the poller tails it, and the broadcaster fans + // events out to live transports. store, err := sqlite.Open(cfg.DataDir) if err != nil { return fmt.Errorf("open store: %w", err) @@ -61,46 +58,27 @@ func Run() error { return err } - // Terminal streaming: the tmux runtime supplies the PTY-attach command and + // Terminal streaming: the Zellij runtime supplies the PTY-attach command and // liveness; the CDC broadcaster feeds the session-state channel. The manager // is handed to httpd, which mounts it at /mux. Raw PTY bytes never flow // through the CDC change_log — only session-state events do. - runtimeAdapter := tmux.New(tmux.Options{}) + runtimeAdapter := zellij.New(zellij.Options{}) termMgr := terminal.NewManager(runtimeAdapter, cdcPipe.Broadcaster, log) defer termMgr.Close() - srv, err := httpd.New(cfg, log, termMgr) + srv, err := httpd.NewWithDeps(cfg, log, termMgr, httpd.APIDeps{Projects: project.NewManager(store)}) if err != nil { - return err - } - - // Bring up the Lifecycle Manager (sole store writer) and the reaper (OBSERVE - // timer). This makes the write path live end-to-end: LCM write -> store -> DB - // trigger -> change_log -> poller -> broadcaster. - lcStack := startLifecycle(ctx, store, log) - - // Bring up the Session Manager. Runtime (tmux) and Workspace (gitworktree) - // are real on main; ports.Agent has no production adapter yet, so a loud - // stub returns a sentinel command that makes any Spawn fail at the runtime - // layer rather than start a broken session quietly. 
Notifier and - // AgentMessenger remain stubbed alongside the LCM until their multiplexers - // land. No HTTP routes wire to this yet — the daemon lane (#10) owns API - // surfacing — so we hold the SM in a local until it does. - sStack, err := startSession(ctx, cfg, lcStack, log) - if err != nil { - // startSession is the first start* call after this point that can - // realistically fail while the cdc poller and the reaper are already - // running. Mirror the bottom-of-run shutdown sequence so both have - // drained before the deferred store.Close() fires. Defers would hit - // the LIFO trap (see comment after srv.Run), hence explicit. stop() - lcStack.Stop() if cdcErr := cdcPipe.Stop(); cdcErr != nil { log.Error("cdc pipeline shutdown", "err", cdcErr) } return err } - _ = sStack + + // Bring up the Lifecycle Manager and the reaper. This makes the session + // lifecycle write path live end-to-end: reducer write -> store -> DB trigger + // -> change_log -> poller -> broadcaster. + lcStack := startLifecycle(ctx, store, runtimeAdapter, log) runErr := srv.Run(ctx) diff --git a/backend/internal/daemon/lifecycle_wiring.go b/backend/internal/daemon/lifecycle_wiring.go index 65308f0e..5c04002d 100644 --- a/backend/internal/daemon/lifecycle_wiring.go +++ b/backend/internal/daemon/lifecycle_wiring.go @@ -3,147 +3,27 @@ package daemon import ( "context" "log/slog" - "path/filepath" - "sync" - "github.com/aoagents/agent-orchestrator/backend/internal/adapters/runtime/tmux" - "github.com/aoagents/agent-orchestrator/backend/internal/adapters/workspace/gitworktree" - "github.com/aoagents/agent-orchestrator/backend/internal/config" - "github.com/aoagents/agent-orchestrator/backend/internal/domain" "github.com/aoagents/agent-orchestrator/backend/internal/lifecycle" - "github.com/aoagents/agent-orchestrator/backend/internal/notification" "github.com/aoagents/agent-orchestrator/backend/internal/observe/reaper" "github.com/aoagents/agent-orchestrator/backend/internal/ports" - 
"github.com/aoagents/agent-orchestrator/backend/internal/session" "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite" ) -// lifecycleStack owns the running LCM + reaper. The LCM is the sole writer of -// canonical transitions; the reaper is the OBSERVE-layer timer that probes live -// runtimes and reports facts back through it. Store is exposed so the Session -// Manager construction in startSession can plug the same SessionStore + PRWriter -// instance the LCM already holds (*sqlite.Store satisfies both ports directly). +// lifecycleStack owns the runtime reaper goroutine started with the lifecycle +// reducer. The reducer itself is only used for wiring observations into storage. type lifecycleStack struct { - LCM *lifecycle.Manager - Store *sqlite.Store reaperDone <-chan struct{} } -// startLifecycle constructs the LCM over the store adapter and starts the reaper. -// The goroutine stops when ctx is cancelled; Stop waits for it to drain. -// -// TEMPORARY STUBS (replace as the daemon lane lands the collaborators): -// - noopMessenger — swap for the runtime/agent-plugin-backed AgentMessenger. -// - reaper.MapRegistry{} — empty runtime registry, so the reaper ticks -// escalations but probes nothing until the runtime plugins exist. -func startLifecycle(ctx context.Context, store *sqlite.Store, logger *slog.Logger) *lifecycleStack { - renderer := notification.NewRenderer(store) - notifier := notification.NewEnqueuer(store, renderer, logger) - lcm := lifecycle.New(store, store, notifier, noopMessenger{}) - rp := reaper.New(lcm, reaper.MapRegistry{}, reaper.Config{Logger: logger}) - return &lifecycleStack{LCM: lcm, Store: store, reaperDone: rp.Start(ctx)} +// startLifecycle constructs the Lifecycle Manager over the store and starts the +// reaper. The goroutine stops when ctx is cancelled; Stop waits for it to drain. 
+func startLifecycle(ctx context.Context, store *sqlite.Store, runtime ports.Runtime, logger *slog.Logger) *lifecycleStack { + lcm := lifecycle.New(store, nil) + rp := reaper.New(lcm, store, runtime, reaper.Config{Logger: logger}) + return &lifecycleStack{reaperDone: rp.Start(ctx)} } -// Stop waits for the reaper goroutine to exit (the caller must have cancelled the -// ctx passed to startLifecycle). +// Stop waits for the reaper goroutine to exit. The caller must cancel the ctx +// passed to startLifecycle before calling Stop. func (l *lifecycleStack) Stop() { <-l.reaperDone } - -// sessionStack holds the daemon's live Session Manager. It mirrors -// lifecycleStack's shape so a future teardown hook (worktree drain, runtime -// shutdown) has a place to attach. -type sessionStack struct { - SM *session.Manager -} - -// startSession constructs the Session Manager over the real tmux Runtime and -// gitworktree Workspace, the LCM and adapter created by startLifecycle, and the -// loud-stub Agent / Messenger / Notifier ports that have no production -// implementations yet. It does NOT mount any HTTP routes — those come with the -// daemon lane (#10). Returning the SM here lets main hold the wired-but-quiet -// instance so future route wiring is a one-line plumb-through. -func startSession(ctx context.Context, cfg config.Config, ls *lifecycleStack, log *slog.Logger) (*sessionStack, error) { - _ = ctx // reserved for future ctx-aware plugin construction; today's tmux/gitworktree constructors are synchronous. - runtime := tmux.New(tmux.Options{}) - - ws, err := gitworktree.New(gitworktree.Options{ - // ManagedRoot is the directory under which per-session worktrees are - // materialised. Co-located with the SQLite DB so a single AO_DATA_DIR - // override moves all durable per-user state together. - ManagedRoot: filepath.Join(cfg.DataDir, "worktrees"), - // An empty resolver fails every project lookup with a clear - // `no repo configured for project %q` error. 
That's the right loud - // failure until the projects table feeds repo paths into the resolver - // — hard-coding a single repo here would silently misroute spawns. - RepoResolver: gitworktree.StaticRepoResolver{}, - }) - if err != nil { - return nil, err - } - - agent := newNoopAgent(log) - - sm := session.New(session.Deps{ - Runtime: runtime, - Agent: agent, - Workspace: ws, - Store: ls.Store, - Messenger: noopMessenger{}, - Lifecycle: ls.LCM, - }) - - return &sessionStack{SM: sm}, nil -} - -// noopMessenger is a TEMPORARY stub (see startLifecycle): the canonical write -// path and durable notifications work without it; only live agent nudges are -// absent until the real runtime/agent plugin is wired. -type noopMessenger struct{} - -func (noopMessenger) Send(context.Context, domain.SessionID, string) error { return nil } - -// agentNotWiredSentinel is the launch / restore command (and env-var key) -// noopAgent returns. tmux will try to exec a binary named exactly this and fail -// fast, so a Spawn against the loud stub surfaces a clear runtime error rather -// than starting a quiet, broken session. -const agentNotWiredSentinel = "AO_AGENT_HARNESS_NOT_WIRED" - -// noopAgent is a loud stub for ports.Agent. There is no production Agent -// adapter on main yet; rather than panic at construction, this struct lets the -// daemon stand up the Session Manager, then logs a single warning the first -// time any SM call route through it and returns sentinel commands that make -// the runtime layer fail loudly. 
-type noopAgent struct { - log *slog.Logger - once *sync.Once -} - -var _ ports.Agent = (*noopAgent)(nil) - -func newNoopAgent(log *slog.Logger) *noopAgent { - return &noopAgent{log: log, once: &sync.Once{}} -} - -func (n *noopAgent) warn() { - n.once.Do(func() { - n.log.Warn( - "agent harness not wired: Spawn/Restore will fail at the runtime layer until a ports.Agent adapter is built", - "sentinel", agentNotWiredSentinel, - "next_step", "implement a per-harness ports.Agent adapter and plug it into startSession", - ) - }) -} - -func (n *noopAgent) GetLaunchCommand(ports.AgentConfig) string { - n.warn() - return agentNotWiredSentinel -} - -func (n *noopAgent) GetEnvironment(ports.AgentConfig) map[string]string { - n.warn() - return map[string]string{agentNotWiredSentinel: "1"} -} - -func (n *noopAgent) GetRestoreCommand(string) string { - n.warn() - return agentNotWiredSentinel -} diff --git a/backend/internal/daemon/wiring_test.go b/backend/internal/daemon/wiring_test.go index 3568eeb7..d743fcee 100644 --- a/backend/internal/daemon/wiring_test.go +++ b/backend/internal/daemon/wiring_test.go @@ -2,27 +2,21 @@ package daemon import ( "context" - "io" - "log/slog" - "reflect" "sync" "testing" "time" - "unsafe" "github.com/aoagents/agent-orchestrator/backend/internal/cdc" - "github.com/aoagents/agent-orchestrator/backend/internal/config" "github.com/aoagents/agent-orchestrator/backend/internal/domain" "github.com/aoagents/agent-orchestrator/backend/internal/lifecycle" - "github.com/aoagents/agent-orchestrator/backend/internal/notification" "github.com/aoagents/agent-orchestrator/backend/internal/ports" - "github.com/aoagents/agent-orchestrator/backend/internal/session" + "github.com/aoagents/agent-orchestrator/backend/internal/project" "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite" ) // TestWiring_WriteFlowsToBroadcaster exercises the real boot path end to end: // a lifecycle write -> sqlite -> DB trigger -> change_log -> CDC poller -> -// 
broadcaster, through the production wiring.Adapter and cdcSource. +// broadcaster, through the same cdc.Source implementation the daemon uses. func TestWiring_WriteFlowsToBroadcaster(t *testing.T) { ctx := context.Background() store, err := sqlite.Open(t.TempDir()) @@ -31,13 +25,10 @@ func TestWiring_WriteFlowsToBroadcaster(t *testing.T) { } defer store.Close() - renderer := notification.NewRenderer(store) - logger := slog.New(slog.NewTextHandler(io.Discard, nil)) - notifier := notification.NewEnqueuer(store, renderer, logger) - lcm := lifecycle.New(store, store, notifier, noopMessenger{}) + lcm := lifecycle.New(store, nil) bcast := cdc.NewBroadcaster() - poller := cdc.NewPoller(cdcSource{store}, bcast, cdc.PollerConfig{}) + poller := cdc.NewPoller(store, bcast, cdc.PollerConfig{}) if err := poller.SeekToHead(ctx); err != nil { t.Fatal(err) } @@ -46,19 +37,19 @@ func TestWiring_WriteFlowsToBroadcaster(t *testing.T) { var got []cdc.Event bcast.Subscribe(func(e cdc.Event) { mu.Lock(); got = append(got, e); mu.Unlock() }) - if err := store.UpsertProject(ctx, sqlite.ProjectRow{ID: "mer", Path: "/repo/mer"}); err != nil { + if err := store.Upsert(ctx, project.Row{ID: "mer", Path: "/repo/mer"}); err != nil { t.Fatal(err) } rec, err := store.CreateSession(ctx, domain.SessionRecord{ ProjectID: "mer", Kind: domain.KindWorker, - Lifecycle: domain.CanonicalSessionLifecycle{Version: domain.LifecycleVersion, Session: domain.SessionSubstate{State: domain.SessionNotStarted}}, + Activity: domain.ActivitySubstate{State: domain.ActivityIdle, LastActivityAt: time.Now(), Source: domain.SourceNone}, }) if err != nil { t.Fatal(err) } // A real transition through the engine, which writes the row and fires the - // is_alive/activity_state CDC trigger. - if err := lcm.ApplyActivitySignal(ctx, rec.ID, ports.ActivitySignal{Valid: true, State: domain.ActivityActive, Timestamp: time.Now()}); err != nil { + // activity_state/is_terminated CDC trigger. 
+ if err := lcm.ApplyActivitySignal(ctx, rec.ID, ports.ActivitySignal{Valid: true, State: domain.ActivityActive, Timestamp: time.Now(), Source: domain.SourceNative}); err != nil { t.Fatal(err) } @@ -78,78 +69,3 @@ func TestWiring_WriteFlowsToBroadcaster(t *testing.T) { t.Fatalf("expected a change_log event for %s to reach the broadcaster, got %d events", rec.ID, len(got)) } } - -// TestWiring_SessionManagerSharesLifecycleStoreAndLCM verifies that startSession -// constructs an SM whose Store and Lifecycle dependencies are the exact same -// values the LCM holds: a single canonical-store + LCM pair, not two parallel -// stacks that would diverge under concurrent writes. The brief constraint -// forbids modifying session/manager.go to add accessors, so the assertion -// reaches into the unexported fields via reflect + unsafe — scoped to the test -// and isolated in inspectSessionDeps. -func TestWiring_SessionManagerSharesLifecycleStoreAndLCM(t *testing.T) { - ctx, cancel := context.WithCancel(context.Background()) - - store, err := sqlite.Open(t.TempDir()) - if err != nil { - t.Fatal(err) - } - // Registered first so it runs LAST (after the reaper has drained). - t.Cleanup(func() { _ = store.Close() }) - - log := slog.New(slog.NewTextHandler(io.Discard, nil)) - cfg := config.Config{DataDir: t.TempDir()} - - lcStack := startLifecycle(ctx, store, log) - // lcStack.Stop blocks on the reaper goroutine, which only exits once its - // ctx is cancelled. Production main.go calls stop() before lcStack.Stop() - // for the same reason — same ordering here. - t.Cleanup(func() { - cancel() - lcStack.Stop() - }) - - sStack, err := startSession(ctx, cfg, lcStack, log) - if err != nil { - t.Fatal(err) - } - if sStack == nil || sStack.SM == nil { - t.Fatal("startSession returned nil Session Manager") - } - - gotStore, gotLCM := inspectSessionDeps(t, sStack.SM) - - // Store should be the exact *sqlite.Store the LCM was constructed with. 
- gotSqlite, ok := gotStore.(*sqlite.Store) - if !ok { - t.Fatalf("SM.store is %T, want *sqlite.Store", gotStore) - } - if gotSqlite != lcStack.Store { - t.Fatalf("SM.store is a different *sqlite.Store than lcStack.Store") - } - - // Lifecycle should be the exact *lifecycle.Manager pointer from startLifecycle. - gotLCMPtr, ok := gotLCM.(*lifecycle.Manager) - if !ok { - t.Fatalf("SM.lcm is %T, want *lifecycle.Manager", gotLCM) - } - if gotLCMPtr != lcStack.LCM { - t.Fatalf("SM.lcm pointer (%p) differs from lcStack.LCM (%p)", gotLCMPtr, lcStack.LCM) - } -} - -// inspectSessionDeps reads session.Manager's unexported store and lcm fields. -// The brief forbids modifying session/manager.go to expose them; we settle for -// reflect + unsafe scoped to this one test helper. If the field names change -// upstream, the type assertion (and this helper) is the only place to touch. -func inspectSessionDeps(t *testing.T, sm *session.Manager) (store any, lcm any) { - t.Helper() - v := reflect.ValueOf(sm).Elem() - storeField := v.FieldByName("store") - lcmField := v.FieldByName("lcm") - if !storeField.IsValid() || !lcmField.IsValid() { - t.Fatalf("session.Manager fields renamed: store.IsValid=%v lcm.IsValid=%v — update inspectSessionDeps", storeField.IsValid(), lcmField.IsValid()) - } - storeVal := reflect.NewAt(storeField.Type(), unsafe.Pointer(storeField.UnsafeAddr())).Elem() - lcmVal := reflect.NewAt(lcmField.Type(), unsafe.Pointer(lcmField.UnsafeAddr())).Elem() - return storeVal.Interface(), lcmVal.Interface() -} diff --git a/backend/internal/domain/activity.go b/backend/internal/domain/activity.go new file mode 100644 index 00000000..c725a38c --- /dev/null +++ b/backend/internal/domain/activity.go @@ -0,0 +1,63 @@ +package domain + +import "time" + +// ActivityState is how busy the agent is, derived from its output/JSONL. +type ActivityState string + +// Activity states. WaitingInput and Blocked are sticky (see IsSticky). 
+const ( + ActivityActive ActivityState = "active" + ActivityIdle ActivityState = "idle" + ActivityWaitingInput ActivityState = "waiting_input" + ActivityBlocked ActivityState = "blocked" + ActivityExited ActivityState = "exited" +) + +// IsSticky reports whether an activity state must NOT be aged/demoted by the +// passage of time (a paused agent is still paused until a new signal says so). +func (a ActivityState) IsSticky() bool { + return a == ActivityWaitingInput || a == ActivityBlocked +} + +// ActivitySource records where an activity reading came from, so a weaker +// source can't override a stronger one. +type ActivitySource string + +// Activity signal sources, strongest first. +const ( + SourceNative ActivitySource = "native" + SourceTerminal ActivitySource = "terminal" + SourceHook ActivitySource = "hook" + SourceRuntime ActivitySource = "runtime" + SourceNone ActivitySource = "none" +) + +// CanOverride reports whether a reading from source a may replace a current +// reading from source current. Unknown sources are treated as weakest. +func (a ActivitySource) CanOverride(current ActivitySource) bool { + return activitySourceRank(a) <= activitySourceRank(current) +} + +func activitySourceRank(s ActivitySource) int { + switch s { + case SourceNative: + return 0 + case SourceTerminal: + return 1 + case SourceHook: + return 2 + case SourceRuntime: + return 3 + default: + return 4 + } +} + +// ActivitySubstate is the persisted activity reading: the state, when it was +// last observed, and which source reported it. 
+type ActivitySubstate struct { + State ActivityState `json:"state"` + LastActivityAt time.Time `json:"lastActivityAt"` + Source ActivitySource `json:"source"` +} diff --git a/backend/internal/domain/decide/decide.go b/backend/internal/domain/decide/decide.go deleted file mode 100644 index be195aef..00000000 --- a/backend/internal/domain/decide/decide.go +++ /dev/null @@ -1,158 +0,0 @@ -// Package decide is the pure DECIDE core: total, deterministic, zero I/O. It -// collapses observed liveness facts (plus the prior detecting memory) into one -// LifecycleDecision. Every function here is side-effect free so the whole -// liveness truth-table can be tested in isolation. -// -// PR-driven behaviour is NOT here: PR display status is derived by -// domain.DeriveStatus from the pr table, and PR-driven nudges are the reaction -// engine's job. decide is only about liveness + the anti-flap quarantine. -package decide - -import ( - "crypto/sha256" - "encoding/hex" - "fmt" - "regexp" - "strings" - "time" - - "github.com/aoagents/agent-orchestrator/backend/internal/domain" -) - -// Anti-flap tuning. detecting escalates to stuck only after this many -// consecutive unchanged-evidence ticks OR once this much wallclock has elapsed -// since first entering detecting. -const ( - DetectingMaxAttempts = 3 - DetectingMaxDuration = 5 * time.Minute -) - -// ResolveProbeDecision reconciles runtime/process liveness into a decision. -// -// The ordering encodes the load-bearing invariants: -// - an explicit kill short-circuits straight to terminal (the only inferred -// terminal this decider may reach without quarantine); -// - a *failed* probe (timeout/error) is never read as death — it routes to -// detecting, as does any disagreement between the two probes; -// - only runtime-down + process-dead + no-recent-activity reaches terminal. 
-func ResolveProbeDecision(in ProbeInput) LifecycleDecision { - if in.KillRequested { - reason := in.KillReason - if reason == "" { - reason = domain.TermManuallyKilled - } - return LifecycleDecision{ - Evidence: "manual kill requested", - SessionState: domain.SessionTerminated, - TerminationReason: reason, - IsAlive: false, - } - } - - if in.RuntimeFailed || in.ProcessFailed { - ev := fmt.Sprintf("probe_failed runtimeFailed=%t process=%s processFailed=%t", in.RuntimeFailed, in.Process, in.ProcessFailed) - return detecting(in, ev) - } - - if in.RuntimeAlive { - if in.Process == ProcessDead { - // Runtime up but the agent process is gone: probes disagree. - ev := fmt.Sprintf("disagree runtime=alive process=%s recentActivity=%t", in.Process, in.RecentActivity) - return detecting(in, ev) - } - return LifecycleDecision{ - Evidence: fmt.Sprintf("alive runtime=alive process=%s", in.Process), - SessionState: domain.SessionWorking, - IsAlive: true, - } - } - - // Runtime is gone. Death is only concluded when the process is *also* - // confirmed dead AND nothing has been heard from the agent recently; any - // other shape is ambiguous and quarantines. - if in.Process == ProcessAlive || in.RecentActivity { - ev := fmt.Sprintf("disagree runtime=down process=%s recentActivity=%t", in.Process, in.RecentActivity) - return detecting(in, ev) - } - if in.Process == ProcessDead { - return LifecycleDecision{ - Evidence: "dead runtime=down process=dead recentActivity=false", - SessionState: domain.SessionTerminated, - TerminationReason: domain.TermRuntimeLost, - IsAlive: false, - } - } - // Process indeterminate: cannot confirm death, so quarantine. - ev := fmt.Sprintf("runtime_lost runtime=down process=%s recentActivity=false", in.Process) - return detecting(in, ev) -} - -// CreateDetectingDecision advances or escalates the anti-flap quarantine. 
-// -// The attempt counter climbs only while the (timestamp-stripped) evidence hash -// is unchanged and resets the moment the evidence moves; StartedAt is preserved -// across the whole detecting episode so the duration cap is a real wall-clock -// safety net even when the evidence keeps flapping. Escalation to stuck fires at -// DetectingMaxAttempts consecutive unchanged ticks OR DetectingMaxDuration -// elapsed since first entering detecting. Detecting/stuck leave IsAlive true: -// the probe was ambiguous, so the session is not confirmed dead. -func CreateDetectingDecision(in DetectingInput) LifecycleDecision { - hash := HashEvidence(in.Evidence) - - attempts := 1 - startedAt := in.Now - if in.Prior != nil { - startedAt = in.Prior.StartedAt - if in.Prior.EvidenceHash == hash { - attempts = in.Prior.Attempts + 1 - } - } - - escalate := attempts >= DetectingMaxAttempts || !in.Now.Before(startedAt.Add(DetectingMaxDuration)) - if escalate { - return LifecycleDecision{ - Evidence: in.Evidence, - SessionState: domain.SessionStuck, - IsAlive: true, - } - } - - return LifecycleDecision{ - Evidence: in.Evidence, - Detecting: &domain.DetectingState{Attempts: attempts, StartedAt: startedAt, EvidenceHash: hash}, - SessionState: domain.SessionDetecting, - IsAlive: true, - } -} - -// HashEvidence normalises an evidence string (stripping timestamps and -// collapsing whitespace) and hashes it, so unchanged-but-restamped signals -// compare equal and the detecting counter is not reset by clock movement alone. -func HashEvidence(evidence string) string { - s := evidence - for _, re := range timestampPatterns { - s = re.ReplaceAllString(s, "") - } - s = strings.Join(strings.Fields(s), " ") - sum := sha256.Sum256([]byte(s)) - return hex.EncodeToString(sum[:]) -} - -// timestampPatterns is the list of regexes HashEvidence applies (in order) to -// delete the time-varying parts of an evidence string before hashing. 
-var timestampPatterns = []*regexp.Regexp{ - regexp.MustCompile(`\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:?\d{2})?`), - regexp.MustCompile(`\d{2}:\d{2}:\d{2}(?:\.\d+)?`), - regexp.MustCompile(`\b\d{10,13}\b`), -} - -// detecting packages a probe verdict into the shared anti-flap path, so every -// probe-driven ambiguity is counted and escalated by the identical quarantine -// logic instead of each probe branch re-implementing the counter. -func detecting(in ProbeInput, evidence string) LifecycleDecision { - return CreateDetectingDecision(DetectingInput{ - Evidence: evidence, - Prior: in.Prior, - Now: in.Now, - }) -} diff --git a/backend/internal/domain/decide/decide_test.go b/backend/internal/domain/decide/decide_test.go deleted file mode 100644 index bc25af55..00000000 --- a/backend/internal/domain/decide/decide_test.go +++ /dev/null @@ -1,164 +0,0 @@ -package decide - -import ( - "testing" - "time" - - "github.com/aoagents/agent-orchestrator/backend/internal/domain" -) - -var t0 = time.Date(2026, 5, 31, 12, 0, 0, 0, time.UTC) - -func TestResolveProbeDecision(t *testing.T) { - tests := []struct { - name string - in ProbeInput - wantState domain.SessionState - wantReason domain.TerminationReason - wantAlive bool - wantDetect bool // expect a detecting verdict (first attempt -> SessionDetecting) - }{ - { - name: "kill requested -> terminated with reason", - in: ProbeInput{KillRequested: true, KillReason: domain.TermManuallyKilled, Now: t0}, - wantState: domain.SessionTerminated, wantReason: domain.TermManuallyKilled, wantAlive: false, - }, - { - name: "kill requested without reason defaults to manually_killed", - in: ProbeInput{KillRequested: true, Now: t0}, - wantState: domain.SessionTerminated, wantReason: domain.TermManuallyKilled, wantAlive: false, - }, - { - name: "runtime probe failed -> detecting (not death)", - in: ProbeInput{RuntimeFailed: true, Now: t0}, - wantState: domain.SessionDetecting, wantAlive: true, wantDetect: true, - }, - { 
- name: "process probe failed -> detecting", - in: ProbeInput{RuntimeAlive: true, ProcessFailed: true, Now: t0}, - wantState: domain.SessionDetecting, wantAlive: true, wantDetect: true, - }, - { - name: "runtime alive + process alive -> working", - in: ProbeInput{RuntimeAlive: true, Process: ProcessAlive, Now: t0}, - wantState: domain.SessionWorking, wantAlive: true, - }, - { - name: "runtime alive + process indeterminate -> working", - in: ProbeInput{RuntimeAlive: true, Process: ProcessIndeterminate, Now: t0}, - wantState: domain.SessionWorking, wantAlive: true, - }, - { - name: "runtime alive + process dead -> detecting (disagree)", - in: ProbeInput{RuntimeAlive: true, Process: ProcessDead, Now: t0}, - wantState: domain.SessionDetecting, wantAlive: true, wantDetect: true, - }, - { - name: "runtime down + process dead + no activity -> terminated runtime_lost", - in: ProbeInput{RuntimeAlive: false, Process: ProcessDead, RecentActivity: false, Now: t0}, - wantState: domain.SessionTerminated, wantReason: domain.TermRuntimeLost, wantAlive: false, - }, - { - name: "runtime down + process alive -> detecting (disagree)", - in: ProbeInput{RuntimeAlive: false, Process: ProcessAlive, Now: t0}, - wantState: domain.SessionDetecting, wantAlive: true, wantDetect: true, - }, - { - name: "runtime down + process dead + recent activity -> detecting", - in: ProbeInput{RuntimeAlive: false, Process: ProcessDead, RecentActivity: true, Now: t0}, - wantState: domain.SessionDetecting, wantAlive: true, wantDetect: true, - }, - { - name: "runtime down + process indeterminate -> detecting", - in: ProbeInput{RuntimeAlive: false, Process: ProcessIndeterminate, Now: t0}, - wantState: domain.SessionDetecting, wantAlive: true, wantDetect: true, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - d := ResolveProbeDecision(tt.in) - if d.SessionState != tt.wantState { - t.Errorf("state = %q, want %q", d.SessionState, tt.wantState) - } - if d.TerminationReason != 
tt.wantReason { - t.Errorf("reason = %q, want %q", d.TerminationReason, tt.wantReason) - } - if d.IsAlive != tt.wantAlive { - t.Errorf("isAlive = %v, want %v", d.IsAlive, tt.wantAlive) - } - if tt.wantDetect && d.Detecting == nil { - t.Errorf("expected detecting memory, got nil") - } - }) - } -} - -func TestCreateDetectingDecision(t *testing.T) { - t.Run("first entry sets attempts 1", func(t *testing.T) { - d := CreateDetectingDecision(DetectingInput{Evidence: "runtime down", Now: t0}) - if d.SessionState != domain.SessionDetecting || d.Detecting == nil || d.Detecting.Attempts != 1 { - t.Fatalf("got %+v", d) - } - }) - t.Run("same evidence climbs the counter", func(t *testing.T) { - prior := &domain.DetectingState{Attempts: 1, StartedAt: t0, EvidenceHash: HashEvidence("runtime down")} - d := CreateDetectingDecision(DetectingInput{Evidence: "runtime down", Prior: prior, Now: t0.Add(time.Second)}) - if d.Detecting == nil || d.Detecting.Attempts != 2 { - t.Fatalf("attempts = %+v, want 2", d.Detecting) - } - }) - t.Run("changed evidence resets the counter", func(t *testing.T) { - prior := &domain.DetectingState{Attempts: 2, StartedAt: t0, EvidenceHash: HashEvidence("runtime down")} - d := CreateDetectingDecision(DetectingInput{Evidence: "process dead", Prior: prior, Now: t0.Add(time.Second)}) - if d.Detecting == nil || d.Detecting.Attempts != 1 { - t.Fatalf("attempts = %+v, want 1 (evidence changed)", d.Detecting) - } - }) - t.Run("escalates to stuck at the attempt cap", func(t *testing.T) { - prior := &domain.DetectingState{Attempts: DetectingMaxAttempts - 1, StartedAt: t0, EvidenceHash: HashEvidence("runtime down")} - d := CreateDetectingDecision(DetectingInput{Evidence: "runtime down", Prior: prior, Now: t0.Add(time.Second)}) - if d.SessionState != domain.SessionStuck { - t.Fatalf("state = %q, want stuck", d.SessionState) - } - }) - t.Run("escalates to stuck past the duration cap", func(t *testing.T) { - prior := &domain.DetectingState{Attempts: 1, StartedAt: t0, 
EvidenceHash: HashEvidence("runtime down")} - d := CreateDetectingDecision(DetectingInput{Evidence: "runtime down", Prior: prior, Now: t0.Add(DetectingMaxDuration + time.Second)}) - if d.SessionState != domain.SessionStuck { - t.Fatalf("state = %q, want stuck (duration cap)", d.SessionState) - } - }) -} - -func TestProbeDetectingEscalationFlow(t *testing.T) { - in := ProbeInput{RuntimeAlive: false, Process: ProcessIndeterminate, Now: t0} - var prior *domain.DetectingState - for i := 1; i < DetectingMaxAttempts; i++ { - in.Prior = prior - in.Now = t0.Add(time.Duration(i) * time.Second) - d := ResolveProbeDecision(in) - if d.SessionState != domain.SessionDetecting { - t.Fatalf("attempt %d: state = %q, want detecting", i, d.SessionState) - } - prior = d.Detecting - } - in.Prior = prior - in.Now = t0.Add(time.Hour) - if d := ResolveProbeDecision(in); d.SessionState != domain.SessionStuck { - t.Fatalf("final attempt: state = %q, want stuck", d.SessionState) - } -} - -func TestHashEvidence(t *testing.T) { - // timestamp-only differences hash equal; a real change differs. 
- a := HashEvidence("runtime down at 2026-05-31T12:00:00Z") - b := HashEvidence("runtime down at 2026-05-31T13:30:45Z") - if a != b { - t.Errorf("restamped evidence should hash equal") - } - c := HashEvidence("process dead at 2026-05-31T12:00:00Z") - if a == c { - t.Errorf("different evidence should hash differently") - } -} diff --git a/backend/internal/domain/decide/types.go b/backend/internal/domain/decide/types.go deleted file mode 100644 index 2e9a5c84..00000000 --- a/backend/internal/domain/decide/types.go +++ /dev/null @@ -1,58 +0,0 @@ -package decide - -import ( - "time" - - "github.com/aoagents/agent-orchestrator/backend/internal/domain" -) - -// LifecycleDecision is the output of a decider: the canonical session sub-state -// to persist (state, the liveness bool, and — only for a terminal state — the -// termination reason), the human-readable evidence, and the (possibly updated) -// detecting memory. The display status is NOT here — it is derived on read by -// domain.DeriveStatus from the persisted lifecycle plus the pr table. -// -// PR facts are likewise not here: a liveness verdict knows nothing about the PR, -// and PR-driven display/reactions are handled off the pr table, not the session -// state machine. -type LifecycleDecision struct { - Evidence string - Detecting *domain.DetectingState - SessionState domain.SessionState - TerminationReason domain.TerminationReason // set only when SessionState is terminated - IsAlive bool -} - -// ProbeInput reconciles runtime + process liveness. A *failed* probe (timeout or -// error) is distinct from a "dead" verdict and must route to detecting, never to -// a death conclusion. KillRequested short-circuits to terminal with KillReason. 
-type ProbeInput struct { - RuntimeAlive bool // the runtime probe reports the backing runtime is up - RuntimeFailed bool // the runtime probe itself failed (timeout/error) — not "dead" - Process ProcessLiveness - ProcessFailed bool - RecentActivity bool - KillRequested bool - KillReason domain.TerminationReason // the terminal reason when KillRequested - Prior *domain.DetectingState - Now time.Time -} - -// ProcessLiveness mirrors isProcessRunning's three-valued answer. -type ProcessLiveness string - -// Process liveness readings. -const ( - ProcessAlive ProcessLiveness = "alive" - ProcessDead ProcessLiveness = "dead" - ProcessIndeterminate ProcessLiveness = "indeterminate" -) - -// DetectingInput feeds the anti-flap quarantine counter. Evidence is hashed with -// timestamps stripped, so "same ambiguous signal" keeps the counter climbing -// while any real change resets it. -type DetectingInput struct { - Evidence string - Prior *domain.DetectingState - Now time.Time -} diff --git a/backend/internal/domain/doc.go b/backend/internal/domain/doc.go new file mode 100644 index 00000000..60e53a81 --- /dev/null +++ b/backend/internal/domain/doc.go @@ -0,0 +1,5 @@ +// Package domain holds shared vocabulary for sessions, activity, and PR facts. +// Session state is deliberately small: durable session rows carry activity_state +// plus an is_terminated bit; user-facing status is derived from those fields and +// PR facts at read time. +package domain diff --git a/backend/internal/domain/harness.go b/backend/internal/domain/harness.go new file mode 100644 index 00000000..90d86171 --- /dev/null +++ b/backend/internal/domain/harness.go @@ -0,0 +1,12 @@ +package domain + +// AgentHarness identifies which agent CLI/runtime a session drives. +type AgentHarness string + +// Supported agent harnesses. 
+const ( + HarnessClaudeCode AgentHarness = "claude-code" + HarnessCodex AgentHarness = "codex" + HarnessAider AgentHarness = "aider" + HarnessOpenCode AgentHarness = "opencode" +) diff --git a/backend/internal/domain/lifecycle.go b/backend/internal/domain/lifecycle.go deleted file mode 100644 index 155c0999..00000000 --- a/backend/internal/domain/lifecycle.go +++ /dev/null @@ -1,209 +0,0 @@ -// Package domain holds the shared contract types for the LCM + Session Manager -// lane: the canonical session state model, the derived display status, and the -// session read-model. It has no behaviour beyond pure derivation (status.go) -// and imports nothing outside the standard library, so every other package can -// depend on it without creating cycles. -package domain - -import "time" - -// LifecycleVersion is the schema version stamped onto every persisted record. -// Greenfield: we start at 1 and carry no migration/synthesis code. -const LifecycleVersion = 1 - -// CanonicalSessionLifecycle is the ONLY lifecycle state persisted for a session. -// The display status is derived from it (plus the session's PR facts, which live -// in the separate pr table) on read — see DeriveStatus — and is never stored, so -// canonical truth and display cannot drift. -// -// PR facts are deliberately NOT here: a session can own several PRs over its -// life, and PR state is owned by the pr table. The runtime axis is collapsed to -// a single IsAlive boolean. Activity and Detecting are decider *inputs* that -// must survive between observations, so they live in the persisted record. -type CanonicalSessionLifecycle struct { - // Version is the Go-only schema-shape constant for this record. It is not - // persisted and is not part of the CDC payload. - Version int - - Session SessionSubstate `json:"session"` - Activity ActivitySubstate `json:"activity"` - - // TerminationReason is set only when Session.State is terminated; '' otherwise. 
- TerminationReason TerminationReason `json:"terminationReason,omitempty"` - - // IsAlive is the single liveness fact: is the runtime/process backing this - // session still up? It replaces the old runtime (state, reason) axis — the - // nuance the probe decider needs (failed-probe != dead, anti-flap) lives in - // the decide core's inputs, not in a persisted enum. - IsAlive bool `json:"isAlive"` - - // Harness is the agent harness the session runs (claude-code, codex, ...). - Harness AgentHarness `json:"harness,omitempty"` - - // Detecting is the anti-flap quarantine memory. It is non-nil only while - // the session is in the detecting state; it carries the attempt counter, - // the first-entry time, and a hash of the (timestamp-stripped) evidence so - // the decider can tell "same ambiguous signal N times" from "signal moved". - Detecting *DetectingState `json:"detecting,omitempty"` -} - -// ---- agent harness ---- - -// AgentHarness identifies which agent CLI/runtime a session drives. -type AgentHarness string - -// Supported agent harnesses. -const ( - HarnessClaudeCode AgentHarness = "claude-code" - HarnessCodex AgentHarness = "codex" - HarnessAider AgentHarness = "aider" - HarnessOpenCode AgentHarness = "opencode" -) - -// ---- session sub-state ---- - -// SessionState is the canonical lifecycle phase of a session. -type SessionState string - -// The canonical session states (see the package doc for the transition model). -const ( - SessionNotStarted SessionState = "not_started" - SessionWorking SessionState = "working" - SessionIdle SessionState = "idle" - SessionNeedsInput SessionState = "needs_input" - SessionStuck SessionState = "stuck" - SessionDetecting SessionState = "detecting" - SessionDone SessionState = "done" - SessionTerminated SessionState = "terminated" -) - -// TerminationReason is the typed "why" for a terminated session — the only -// state that carries a reason. Empty for every non-terminal state. 
It decides -// the terminal display status (killed / cleanup / errored). The PR-pipeline -// "why" (fixing CI, awaiting review, …) is NOT here; it is derived on read from -// the pr table, not persisted on the session. -type TerminationReason string - -// Termination reasons; TermNone is the non-terminal zero value. -const ( - TermNone TerminationReason = "" - TermManuallyKilled TerminationReason = "manually_killed" - TermRuntimeLost TerminationReason = "runtime_lost" - TermAgentProcessExited TerminationReason = "agent_process_exited" - TermProbeFailure TerminationReason = "probe_failure" - TermErrorInProcess TerminationReason = "error_in_process" - TermAutoCleanup TerminationReason = "auto_cleanup" - TermPRMerged TerminationReason = "pr_merged" -) - -// SessionSubstate wraps the session phase in a struct so the persisted/CDC JSON -// shape can gain fields without a migration. -type SessionSubstate struct { - State SessionState `json:"state"` -} - -// ---- PR facts (NOT persisted on the session; sourced from the pr table) ---- - -// PRFacts is the per-session PR snapshot the status/reaction derivation reads -// from the pr table. It is the decider input that replaces the old persisted PR -// axis. The zero value (Exists=false) means "no PR", which derivation treats as -// "session has no PR". -type PRFacts struct { - URL string - Number int - Exists bool - Draft bool - Merged bool - Closed bool - CI CIState - Review ReviewDecision - Mergeability Mergeability - ReviewComments bool // has unresolved review comments (any author) to address -} - -// CIState is the aggregate CI status of a PR. -type CIState string - -// CI states. -const ( - CIUnknown CIState = "unknown" - CIPending CIState = "pending" - CIPassing CIState = "passing" - CIFailing CIState = "failing" -) - -// ReviewDecision is the aggregate human-review verdict on a PR. -type ReviewDecision string - -// Review decisions. 
-const ( - ReviewNone ReviewDecision = "none" - ReviewApproved ReviewDecision = "approved" - ReviewChangesRequest ReviewDecision = "changes_requested" - ReviewRequired ReviewDecision = "review_required" -) - -// Mergeability is whether a PR can currently be merged. -type Mergeability string - -// Mergeability states. -const ( - MergeUnknown Mergeability = "unknown" - MergeMergeable Mergeability = "mergeable" - MergeConflicting Mergeability = "conflicting" - MergeBlocked Mergeability = "blocked" - MergeUnstable Mergeability = "unstable" -) - -// ---- activity sub-state (decider input) ---- - -// ActivityState is how busy the agent is, derived from its output/JSONL. -type ActivityState string - -// Activity states. WaitingInput and Blocked are sticky (see IsSticky). -const ( - ActivityActive ActivityState = "active" - ActivityReady ActivityState = "ready" - ActivityIdle ActivityState = "idle" - ActivityWaitingInput ActivityState = "waiting_input" // sticky: does not decay by wallclock - ActivityBlocked ActivityState = "blocked" // sticky: does not decay by wallclock - ActivityExited ActivityState = "exited" -) - -// IsSticky reports whether an activity state must NOT be aged/demoted by the -// passage of time (a paused agent is still paused until a new signal says so). -func (a ActivityState) IsSticky() bool { - return a == ActivityWaitingInput || a == ActivityBlocked -} - -// ActivitySource records where an activity reading came from, so a weaker -// source can't override a stronger one. -type ActivitySource string - -// Activity signal sources, strongest first. -const ( - SourceNative ActivitySource = "native" - SourceTerminal ActivitySource = "terminal" - SourceHook ActivitySource = "hook" - SourceRuntime ActivitySource = "runtime" - SourceNone ActivitySource = "none" -) - -// ActivitySubstate is the persisted activity reading: the state, when it was -// last observed, and which source reported it. 
-type ActivitySubstate struct { - State ActivityState `json:"state"` - LastActivityAt time.Time `json:"lastActivityAt"` - Source ActivitySource `json:"source"` -} - -// ---- detecting quarantine memory (decider input) ---- - -// DetectingState is the anti-flap quarantine memory carried while a session is -// detecting: how many ambiguous observations, since when, and a hash of the -// (timestamp-stripped) evidence to tell "same signal again" from "signal moved". -type DetectingState struct { - Attempts int `json:"attempts"` - StartedAt time.Time `json:"startedAt"` - EvidenceHash string `json:"evidenceHash"` -} diff --git a/backend/internal/domain/notification.go b/backend/internal/domain/notification.go deleted file mode 100644 index 8c64c9bc..00000000 --- a/backend/internal/domain/notification.go +++ /dev/null @@ -1,44 +0,0 @@ -package domain - -import ( - "encoding/json" - "time" -) - -// NotificationID is the stable public identifier for a persisted notification. -type NotificationID string - -// Notification is the provider-neutral durable notification read model. It is -// sink-agnostic: desktop, dashboard, Slack, webhooks, etc. all consume the same -// semantic payload and actions later. -type Notification struct { - Seq int64 - ID NotificationID - ProjectID ProjectID - SessionID SessionID - Source string - EventType string - SemanticType string - Priority string - Message string - Payload json.RawMessage - Actions []NotificationAction - DedupeKey string - CauseKey string - ReadAt time.Time - ArchivedAt time.Time - CreatedAt time.Time - UpdatedAt time.Time -} - -// NotificationAction is a provider-neutral action descriptor. Renderers may use -// Route for app-local navigation, URL for external navigation, or Method for a -// future command/action endpoint. 
-type NotificationAction struct { - ID string `json:"id"` - Kind string `json:"kind"` - Label string `json:"label"` - Route string `json:"route,omitempty"` - URL string `json:"url,omitempty"` - Method string `json:"method,omitempty"` -} diff --git a/backend/internal/domain/pr.go b/backend/internal/domain/pr.go index a31b9958..8d1c2451 100644 --- a/backend/internal/domain/pr.go +++ b/backend/internal/domain/pr.go @@ -2,17 +2,28 @@ package domain import "time" -// The PR rows are the canonical shapes for the pr / pr_checks / pr_comment -// tables, shared by the PRWriter port and the sqlite store (the store maps them -// to/from the sqlc gen.* models). They are flat by design — these tables carry -// no nesting or derivation, so a single definition serves every layer. - -// PRRow is the scalar facts of one tracked pull request (the pr table). A session -// can own several PRs; a PR belongs to one session. PRFacts is the read-model -// derived from these for display status; PRRow is what gets written. -type PRRow struct { +// ---- PR read model ---- + +// PRFacts is the per-session PR snapshot the status derivation reads from the +// pr table. +type PRFacts struct { + URL string + Number int + Draft bool + Merged bool + Closed bool + CI CIState + Review ReviewDecision + Mergeability Mergeability + ReviewComments bool // has unresolved review comments (any author) to address +} + +// PullRequest is the app-level representation of one tracked pull request as +// persisted by the PR store. It is intentionally separate from the sqlc +// generated sqlite row type so storage details do not leak outside sqlite. +type PullRequest struct { URL string - SessionID string + SessionID SessionID Number int Draft bool Merged bool @@ -23,20 +34,18 @@ type PRRow struct { UpdatedAt time.Time } -// PRCheckRow is one CI check run — one row per check name per commit. -type PRCheckRow struct { - PRURL string +// PullRequestCheck is one normalized CI check run for a pull request. 
+type PullRequestCheck struct { Name string CommitHash string - Status string + Status PRCheckStatus URL string LogTail string CreatedAt time.Time } -// PRComment is one review comment. Feedback is injected into the agent -// regardless of author, so there is no bot/human distinction. -type PRComment struct { +// PullRequestComment is one normalized review comment for a pull request. +type PullRequestComment struct { ID string Author string File string @@ -45,3 +54,63 @@ type PRComment struct { Resolved bool CreatedAt time.Time } + +// CIState is the aggregate CI status of a PR. +type CIState string + +// CI states. +const ( + CIUnknown CIState = "unknown" + CIPending CIState = "pending" + CIPassing CIState = "passing" + CIFailing CIState = "failing" +) + +// ReviewDecision is the aggregate human-review verdict on a PR. +type ReviewDecision string + +// Review decisions. +const ( + ReviewNone ReviewDecision = "none" + ReviewApproved ReviewDecision = "approved" + ReviewChangesRequest ReviewDecision = "changes_requested" + ReviewRequired ReviewDecision = "review_required" +) + +// Mergeability is whether a PR can currently be merged. +type Mergeability string + +// Mergeability states. +const ( + MergeUnknown Mergeability = "unknown" + MergeMergeable Mergeability = "mergeable" + MergeConflicting Mergeability = "conflicting" + MergeBlocked Mergeability = "blocked" + MergeUnstable Mergeability = "unstable" +) + +// PRState is the normalized lifecycle of one tracked pull request as stored in +// the pr table. +type PRState string + +// PR states. +const ( + PRStateDraft PRState = "draft" + PRStateOpen PRState = "open" + PRStateMerged PRState = "merged" + PRStateClosed PRState = "closed" +) + +// PRCheckStatus is one CI check run's normalized status. +type PRCheckStatus string + +// PR check statuses. 
+const ( + PRCheckUnknown PRCheckStatus = "unknown" + PRCheckQueued PRCheckStatus = "queued" + PRCheckInProgress PRCheckStatus = "in_progress" + PRCheckPassed PRCheckStatus = "passed" + PRCheckFailed PRCheckStatus = "failed" + PRCheckSkipped PRCheckStatus = "skipped" + PRCheckCancelled PRCheckStatus = "cancelled" +) diff --git a/backend/internal/domain/session.go b/backend/internal/domain/session.go index 4d436e2a..76e799fb 100644 --- a/backend/internal/domain/session.go +++ b/backend/internal/domain/session.go @@ -22,50 +22,34 @@ const ( KindOrchestrator SessionKind = "orchestrator" ) -// SessionMetadata is the typed, off-canonical metadata for a session: the -// operational handles and seed inputs the Session Manager and reaper need but -// that are NOT part of the canonical lifecycle. The set of fields is fixed here -// (no free-form keys), so what a session can carry is a compile-time fact, and -// it is folded into the sessions row off the CDC path. -// -// Empty fields mean "unset": the LCM merges metadata without overwriting a -// stored value with an empty one, so a partial write (spawn setting only the -// runtime handle) does not clobber a value set earlier (the branch at creation). +// SessionMetadata is the typed, off-status metadata for a session: operational +// handles and seed inputs used by Session Manager and reaper. type SessionMetadata struct { Branch string `json:"branch,omitempty"` WorkspacePath string `json:"workspacePath,omitempty"` RuntimeHandleID string `json:"runtimeHandleId,omitempty"` - RuntimeName string `json:"runtimeName,omitempty"` AgentSessionID string `json:"agentSessionId,omitempty"` Prompt string `json:"prompt,omitempty"` } -// IsZero reports whether no metadata field is set. -func (m SessionMetadata) IsZero() bool { return m == SessionMetadata{} } - -// SessionRecord is the PERSISTENCE shape: identity, canonical lifecycle, and -// metadata — everything the store holds, and nothing derived. 
The store reads -// and writes records; it never produces the derived display status. -// -// Metadata is json:"-" on purpose: it lives off the canonical path, so it must -// never ride along in the change_log / snapshot payloads. Enforcing that at the -// type level means no caller has to remember to scrub it before marshalling. +// SessionRecord is the persistence shape. It intentionally stores only durable +// facts: identity, agent harness, activity_state, is_terminated, and operational +// metadata. The user-facing Status is derived from these facts plus PR facts. type SessionRecord struct { - ID SessionID `json:"id"` - ProjectID ProjectID `json:"projectId"` - IssueID IssueID `json:"issueId,omitempty"` - Kind SessionKind `json:"kind"` - Lifecycle CanonicalSessionLifecycle `json:"lifecycle"` - Metadata SessionMetadata `json:"-"` - CreatedAt time.Time `json:"createdAt"` - UpdatedAt time.Time `json:"updatedAt"` + ID SessionID `json:"id"` + ProjectID ProjectID `json:"projectId"` + IssueID IssueID `json:"issueId,omitempty"` + Kind SessionKind `json:"kind"` + Harness AgentHarness `json:"harness,omitempty"` + Activity ActivitySubstate `json:"activity"` + IsTerminated bool `json:"isTerminated"` + Metadata SessionMetadata `json:"-"` + CreatedAt time.Time `json:"createdAt"` + UpdatedAt time.Time `json:"updatedAt"` } -// Session is the read-model returned across the API boundary (to controllers, -// then the frontend): a SessionRecord plus the DERIVED display Status. The -// Session Manager is the single producer of Status — it builds a Session from a -// stored SessionRecord by calling DeriveLegacyStatus, so the store and API -// never recompute (or accidentally persist) it. +// Session is the read-model returned across the API boundary: a SessionRecord +// plus the derived display Status. 
type Session struct { SessionRecord Status SessionStatus `json:"status"` diff --git a/backend/internal/domain/status.go b/backend/internal/domain/status.go index 5fa0f721..d02ddcb3 100644 --- a/backend/internal/domain/status.go +++ b/backend/internal/domain/status.go @@ -1,15 +1,12 @@ package domain // SessionStatus is the single-word DISPLAY status the dashboard renders. It is -// derived from the canonical lifecycle (plus the session's PR facts) on read and -// never persisted. +// derived from persisted session facts plus PR facts and is never stored. type SessionStatus string // The display statuses the dashboard renders. const ( - StatusSpawning SessionStatus = "spawning" StatusWorking SessionStatus = "working" - StatusDetecting SessionStatus = "detecting" StatusPROpen SessionStatus = "pr_open" StatusDraft SessionStatus = "draft" StatusCIFailed SessionStatus = "ci_failed" @@ -18,78 +15,45 @@ const ( StatusApproved SessionStatus = "approved" StatusMergeable SessionStatus = "mergeable" StatusMerged SessionStatus = "merged" - StatusCleanup SessionStatus = "cleanup" StatusNeedsInput SessionStatus = "needs_input" StatusStuck SessionStatus = "stuck" - StatusErrored SessionStatus = "errored" - StatusKilled SessionStatus = "killed" StatusIdle SessionStatus = "idle" - StatusDone SessionStatus = "done" StatusTerminated SessionStatus = "terminated" ) -// DeriveStatus is the ONLY producer of the display status. It is a pure, total -// function of the canonical record plus the session's PR facts (read from the pr -// table by the caller, since PR state is no longer persisted on the session). -// -// Order matters: -// 1. Terminal / hard session states (done, terminated, needs_input, stuck, -// detecting, not_started) map directly — these OUTRANK PR facts. -// 2. Otherwise, if the session has a PR: a merged PR wins, else the PR pipeline -// ladder (CI failure dominates, then draft/review/merge states). -// 3. Otherwise fall through to the SOFT session state (idle/working). 
-// -// So "PR facts dominate session facts" applies only to the soft states: an idle -// or working session with an open, CI-failing PR displays as ci_failed — but a -// session that is stuck or needs_input shows that regardless, since it needs a -// human either way. -func DeriveStatus(l CanonicalSessionLifecycle, pr PRFacts) SessionStatus { - switch l.Session.State { - case SessionDone: - return StatusDone - case SessionTerminated: - return terminatedStatus(l.TerminationReason) - case SessionNeedsInput: +// DeriveStatus is the ONLY producer of display status. It is a pure function of +// persisted session facts and PR facts: is_terminated, activity_state, and the PR +// table are the durable facts that tell the UI what it needs to know. +func DeriveStatus(rec SessionRecord, pr *PRFacts) SessionStatus { + if rec.IsTerminated { + if pr != nil && pr.Merged { + return StatusMerged + } + return StatusTerminated + } + + switch rec.Activity.State { + case ActivityWaitingInput: return StatusNeedsInput - case SessionStuck: + case ActivityBlocked: return StatusStuck - case SessionDetecting: - return StatusDetecting - case SessionNotStarted: - return StatusSpawning } - if pr.Exists { + if pr != nil { if pr.Merged { return StatusMerged } if !pr.Closed { - return prPipelineStatus(pr) + return prPipelineStatus(*pr) } } - if l.Session.State == SessionIdle { - return StatusIdle - } - return StatusWorking -} - -func terminatedStatus(r TerminationReason) SessionStatus { - switch r { - case TermManuallyKilled, TermRuntimeLost, TermAgentProcessExited: - return StatusKilled - case TermAutoCleanup, TermPRMerged: - return StatusCleanup - case TermErrorInProcess, TermProbeFailure: - return StatusErrored - default: - return StatusTerminated + if rec.Activity.State == ActivityActive { + return StatusWorking } + return StatusIdle } -// prPipelineStatus maps an open/draft PR's facts to a display status, preserving -// the old ladder: CI failure dominates everything, then draft, then the 
review / -// merge states. func prPipelineStatus(pr PRFacts) SessionStatus { switch { case pr.CI == CIFailing: diff --git a/backend/internal/domain/status_test.go b/backend/internal/domain/status_test.go index 57512577..7bd02dbf 100644 --- a/backend/internal/domain/status_test.go +++ b/backend/internal/domain/status_test.go @@ -2,58 +2,37 @@ package domain import "testing" -func TestDeriveStatus(t *testing.T) { - // sess builds a non-terminal lifecycle (no reason). - sess := func(s SessionState) CanonicalSessionLifecycle { - return CanonicalSessionLifecycle{Session: SessionSubstate{State: s}} - } - // term builds a terminated lifecycle carrying a TerminationReason. - term := func(r TerminationReason) CanonicalSessionLifecycle { - return CanonicalSessionLifecycle{Session: SessionSubstate{State: SessionTerminated}, TerminationReason: r} - } - openPR := func(mut func(*PRFacts)) PRFacts { - f := PRFacts{Exists: true, CI: CIUnknown, Review: ReviewNone, Mergeability: MergeUnknown} - if mut != nil { - mut(&f) - } - return f - } +func rec(activity ActivityState, terminated bool) SessionRecord { + return SessionRecord{Activity: ActivitySubstate{State: activity}, IsTerminated: terminated} +} +func pr(facts PRFacts) *PRFacts { return &facts } + +func TestDeriveStatusFromSessionFactsAndPR(t *testing.T) { tests := []struct { name string - in CanonicalSessionLifecycle - pr PRFacts + rec SessionRecord + pr *PRFacts want SessionStatus }{ - {"not_started maps to spawning", sess(SessionNotStarted), PRFacts{}, StatusSpawning}, - {"terminated+manually_killed -> killed", term(TermManuallyKilled), PRFacts{}, StatusKilled}, - {"terminated+runtime_lost -> killed", term(TermRuntimeLost), PRFacts{}, StatusKilled}, - {"terminated+auto_cleanup -> cleanup", term(TermAutoCleanup), PRFacts{}, StatusCleanup}, - {"terminated+pr_merged -> cleanup", term(TermPRMerged), PRFacts{}, StatusCleanup}, - {"terminated+error -> errored", term(TermErrorInProcess), PRFacts{}, StatusErrored}, - {"needs_input 
maps directly", sess(SessionNeedsInput), PRFacts{}, StatusNeedsInput}, - {"stuck dominates any PR", sess(SessionStuck), openPR(func(f *PRFacts) { f.CI = CIFailing }), StatusStuck}, - - {"no PR + idle -> idle", sess(SessionIdle), PRFacts{}, StatusIdle}, - {"no PR + working -> working", sess(SessionWorking), PRFacts{}, StatusWorking}, - - {"merged PR dominates idle session", sess(SessionIdle), PRFacts{Exists: true, Merged: true}, StatusMerged}, - {"open PR failing CI -> ci_failed", sess(SessionIdle), openPR(func(f *PRFacts) { f.CI = CIFailing }), StatusCIFailed}, - {"draft PR failing CI -> ci_failed (CI dominates)", sess(SessionWorking), openPR(func(f *PRFacts) { f.Draft = true; f.CI = CIFailing }), StatusCIFailed}, - {"draft PR ignores review state -> draft", sess(SessionWorking), openPR(func(f *PRFacts) { f.Draft = true; f.Review = ReviewApproved }), StatusDraft}, - {"open PR changes_requested", sess(SessionWorking), openPR(func(f *PRFacts) { f.Review = ReviewChangesRequest }), StatusChangesRequested}, - {"open PR review comments -> changes_requested", sess(SessionWorking), openPR(func(f *PRFacts) { f.ReviewComments = true }), StatusChangesRequested}, - {"open PR mergeable", sess(SessionWorking), openPR(func(f *PRFacts) { f.Mergeability = MergeMergeable }), StatusMergeable}, - {"open PR approved", sess(SessionWorking), openPR(func(f *PRFacts) { f.Review = ReviewApproved }), StatusApproved}, - {"open PR review required -> review_pending", sess(SessionWorking), openPR(func(f *PRFacts) { f.Review = ReviewRequired }), StatusReviewPending}, - {"open PR no signal -> pr_open", sess(SessionWorking), openPR(nil), StatusPROpen}, - {"closed PR falls through to soft state", sess(SessionIdle), PRFacts{Exists: true, Closed: true}, StatusIdle}, + {"terminated", rec(ActivityExited, true), nil, StatusTerminated}, + {"merged-pr", rec(ActivityIdle, true), pr(PRFacts{Merged: true}), StatusMerged}, + {"needs-input", rec(ActivityWaitingInput, false), pr(PRFacts{CI: CIFailing}), 
StatusNeedsInput}, + {"blocked", rec(ActivityBlocked, false), pr(PRFacts{CI: CIFailing}), StatusStuck}, + {"ci-failed", rec(ActivityIdle, false), pr(PRFacts{CI: CIFailing}), StatusCIFailed}, + {"draft", rec(ActivityIdle, false), pr(PRFacts{Draft: true}), StatusDraft}, + {"changes-requested", rec(ActivityIdle, false), pr(PRFacts{Review: ReviewChangesRequest}), StatusChangesRequested}, + {"mergeable", rec(ActivityIdle, false), pr(PRFacts{Mergeability: MergeMergeable}), StatusMergeable}, + {"approved", rec(ActivityIdle, false), pr(PRFacts{Review: ReviewApproved}), StatusApproved}, + {"review-pending", rec(ActivityIdle, false), pr(PRFacts{Review: ReviewRequired}), StatusReviewPending}, + {"pr-open", rec(ActivityIdle, false), pr(PRFacts{}), StatusPROpen}, + {"working", rec(ActivityActive, false), nil, StatusWorking}, + {"idle", rec(ActivityIdle, false), nil, StatusIdle}, } - for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - if got := DeriveStatus(tt.in, tt.pr); got != tt.want { - t.Errorf("DeriveStatus() = %q, want %q", got, tt.want) + if got := DeriveStatus(tt.rec, tt.pr); got != tt.want { + t.Fatalf("got %q, want %q", got, tt.want) } }) } diff --git a/backend/internal/domain/tracker.go b/backend/internal/domain/tracker.go index c5f22262..fde1631b 100644 --- a/backend/internal/domain/tracker.go +++ b/backend/internal/domain/tracker.go @@ -1,23 +1,13 @@ package domain // TrackerProvider identifies an issue-tracker provider implementation. -// Provider differences (label-driven vs state-machine vs close-reason) are -// absorbed inside each adapter; the rest of the system only sees -// NormalizedIssueState. type TrackerProvider string -// Supported tracker providers. -const ( - TrackerProviderGitHub TrackerProvider = "github" - TrackerProviderGitLab TrackerProvider = "gitlab" - TrackerProviderLinear TrackerProvider = "linear" -) +// TrackerProviderGitHub is the only supported issue-tracker provider. 
+const TrackerProviderGitHub TrackerProvider = "github" -// TrackerID identifies a single issue across providers. Native is the -// provider's own canonical form ("owner/repo#123" for GitHub, -// "group/project#456" for GitLab, "ABC-789" for Linear) and is parsed by the -// adapter. Provider is the discriminator the Session Manager uses to pick an -// adapter. +// TrackerID identifies one issue. Native is the provider's own canonical form +// ("owner/repo#123" for GitHub) and is parsed by the adapter. type TrackerID struct { Provider TrackerProvider `json:"provider"` Native string `json:"native"` @@ -37,9 +27,8 @@ const ( IssueCancelled NormalizedIssueState = "cancelled" ) -// Issue is the minimum projection every tracker can produce. Fields are -// added only when all v1 providers (GitHub, GitLab, Linear) can populate -// them faithfully; richer metadata stays inside provider-specific code paths. +// Issue is the minimum projection every tracker can produce. Provider-specific +// metadata stays inside provider-specific code paths. type Issue struct { ID TrackerID `json:"id"` Title string `json:"title"` @@ -50,11 +39,9 @@ type Issue struct { Assignees []string `json:"assignees,omitempty"` } -// TrackerRepo identifies a repository (or its provider-equivalent) for -// cross-issue queries like Tracker.List. Native is the provider's canonical -// owner/project form: "owner/repo" for GitHub, "group/project" for GitLab. -// Linear has no native repo concept; adapters may use a team or workspace -// identifier in Native when this port reaches Linear. +// TrackerRepo identifies a repository for cross-issue queries like Tracker.List. +// Native is the provider's canonical owner/project form, e.g. "owner/repo" for +// GitHub. type TrackerRepo struct { Provider TrackerProvider `json:"provider"` Native string `json:"native"` @@ -77,12 +64,8 @@ const ( // ListFilter is the query the Session Manager passes to Tracker.List. 
// Empty / zero values mean "no filter on this dimension". // -// Limit is the requested page size. The adapter applies its own default -// when zero and SILENTLY CAPS at the provider's per-page maximum — a -// caller asking for more than the cap gets exactly cap items back with no -// error and no indication of truncation. v1 has no auto-pagination; -// callers needing more results need to wait for the observer/polling work -// in issue #35. +// Limit is the requested page size. The adapter applies its own default when +// zero and caps at the provider's per-page maximum. type ListFilter struct { State ListStateFilter `json:"state,omitempty"` Labels []string `json:"labels,omitempty"` diff --git a/backend/internal/httpd/api.go b/backend/internal/httpd/api.go index 124a8d78..9480cdad 100644 --- a/backend/internal/httpd/api.go +++ b/backend/internal/httpd/api.go @@ -9,27 +9,20 @@ import ( "github.com/aoagents/agent-orchestrator/backend/internal/config" "github.com/aoagents/agent-orchestrator/backend/internal/httpd/apispec" "github.com/aoagents/agent-orchestrator/backend/internal/httpd/controllers" + "github.com/aoagents/agent-orchestrator/backend/internal/httpd/envelope" "github.com/aoagents/agent-orchestrator/backend/internal/project" ) -// APIDeps bundles every Manager the API layer's controllers depend on. There -// is exactly one Manager per resource, defined in that resource's own package -// (project.Manager, later session.Manager, ...), and the controllers see ONLY -// that interface — they don't reach past it to the LCM, adapters, or stores. -// Whether a Manager impl talks to the registry, the LCM, or an outbound port -// is its own concern. -// -// The route-shell PR (#20) leaves every field nil — handlers answer via -// apispec.NotImplemented and don't dereference them yet. The handler-impl PR -// wires real Managers and flips stubs to real logic one route at a time. +// APIDeps bundles every Manager the API layer's controllers depend on. 
+// Controllers see only resource-level interfaces; they do not reach through to +// lifecycle reducers, adapters, or storage. A nil dependency keeps its routes +// registered but returns the OpenAPI-backed 501 response. type APIDeps struct { Projects project.Manager } // API owns one controller per resource and is the single Register call the -// router invokes to mount the /api/v1 surface. Splitting per-resource means -// later PRs can land a controller's real handlers without touching the -// surrounding wiring. +// router invokes to mount the /api/v1 surface. type API struct { cfg config.Config projects *controllers.ProjectsController @@ -47,13 +40,8 @@ func NewAPI(cfg config.Config, deps APIDeps) *API { } } -// Register mounts the API surface on root. /api/v1 hosts the REST group with -// the per-request Timeout that the skeleton router (router.go) deliberately -// kept off the global stack — REST routes are bounded, but long-lived surfaces -// (/events SSE, /mux WS) live outside this group when they land. -// -// /mux is mounted outside /api/v1 for parity with the legacy TS surface; it is -// a phase-4 placeholder and stays unregistered here until that lane starts. +// Register mounts the bounded /api/v1 REST surface. Long-lived surfaces such +// as muxed terminal streams stay outside this timeout group. func (a *API) Register(root chi.Router) { timeout := a.cfg.RequestTimeout if timeout <= 0 { @@ -61,20 +49,15 @@ func (a *API) Register(root chi.Router) { } root.Route("/api/v1", func(r chi.Router) { - // The OpenAPI document is the source of truth for every contract on - // this surface; serve it so tooling (SDK generators, the OpenAPI - // validator in #19, the dashboard's developer tools) can fetch the - // whole spec from the same origin as the routes it describes. - apispec.RegisterServe(r, "/openapi.yaml") + // Serve the OpenAPI document from the same origin as the routes it describes. 
+ r.Get("/openapi.yaml", apispec.ServeYAML) r.Group(func(r chi.Router) { r.Use(middleware.Timeout(timeout)) a.projects.Register(r) - // Sibling controllers (sessions, issues, prs, ...) plug in here in - // follow-up PRs #21 / #22 without touching the timeout group. + // Sibling REST controllers plug in here. }) - // Surfaces that intentionally bypass the REST timeout (SSE, future WS) - // register at this level — none exist in the route-shell PR. + // Surfaces that intentionally bypass the REST timeout register at this level. }) } @@ -82,7 +65,7 @@ func (a *API) Register(root chi.Router) { // 404 is a text/plain body; the API surface must answer JSON so consumers can // parse it uniformly. func notFoundJSON(w http.ResponseWriter, r *http.Request) { - writeAPIError(w, r, http.StatusNotFound, "not_found", "ROUTE_NOT_FOUND", + envelope.WriteAPIError(w, r, http.StatusNotFound, "not_found", "ROUTE_NOT_FOUND", r.Method+" "+r.URL.Path+" has no handler", nil) } @@ -90,6 +73,6 @@ func notFoundJSON(w http.ResponseWriter, r *http.Request) { // known path without a matching verb (e.g. PUT /projects/{id} after we drop // the legacy PUT alias). func methodNotAllowedJSON(w http.ResponseWriter, r *http.Request) { - writeAPIError(w, r, http.StatusMethodNotAllowed, "method_not_allowed", "METHOD_NOT_ALLOWED", + envelope.WriteAPIError(w, r, http.StatusMethodNotAllowed, "method_not_allowed", "METHOD_NOT_ALLOWED", r.Method+" not allowed on "+r.URL.Path, nil) } diff --git a/backend/internal/httpd/apispec/apispec.go b/backend/internal/httpd/apispec/apispec.go index 627ad5db..2603820f 100644 --- a/backend/internal/httpd/apispec/apispec.go +++ b/backend/internal/httpd/apispec/apispec.go @@ -16,7 +16,6 @@ import ( "strings" "sync" - "github.com/go-chi/chi/v5" "github.com/go-chi/chi/v5/middleware" yaml "gopkg.in/yaml.v3" ) @@ -65,10 +64,6 @@ func New(yamlBytes []byte) (*Spec, error) { return &Spec{doc: doc}, nil } -// YAML returns the raw embedded document bytes. 
Used by the /openapi.yaml -// handler. -func (s *Spec) YAML() []byte { return openapiYAML } - // Operation returns the spec slice for a single (method, path) pair, ready // to be JSON-serialised. The slice is the OpenAPI Operation object (the // inner block under e.g. paths./projects.get), with parent path-level @@ -119,9 +114,7 @@ type notImplementedResponse struct { } // NotImplemented writes the locked 501 envelope, embedding the OpenAPI -// Operation slice that documents what this route WILL do. Replaces the -// throwaway PlannedRoute literals that the first cut of the route shell -// duplicated in controller code. +// Operation slice for the capability that is currently unavailable. func NotImplemented(w http.ResponseWriter, r *http.Request, method, path string) { op := Default().Operation(method, path) if op == nil { @@ -130,7 +123,7 @@ func NotImplemented(w http.ResponseWriter, r *http.Request, method, path string) body := notImplementedResponse{ Error: "not_implemented", Code: "NOT_IMPLEMENTED", - Message: method + " " + path + " is registered but not yet implemented", + Message: method + " " + path + " is unavailable in this daemon", RequestID: middleware.GetReqID(r.Context()), Spec: op, } @@ -140,18 +133,9 @@ func NotImplemented(w http.ResponseWriter, r *http.Request, method, path string) _ = json.NewEncoder(w).Encode(body) } -// ServeYAML serves the embedded openapi.yaml document. Mounted at -// /api/v1/openapi.yaml so spec-consuming tooling (#19's validator, -// SDK generators, the dashboard's developer tools) can fetch the -// whole document in one request. +// ServeYAML serves the embedded OpenAPI document for SDK generators, tests, and +// developer tooling. func ServeYAML(w http.ResponseWriter, _ *http.Request) { w.Header().Set("Content-Type", "application/yaml; charset=utf-8") _, _ = w.Write(openapiYAML) } - -// RegisterServe mounts ServeYAML on the supplied router. 
Kept as a -// helper so the router code only references one symbol from apispec -// for the static serve path. -func RegisterServe(r chi.Router, path string) { - r.Get(path, ServeYAML) -} diff --git a/backend/internal/httpd/apispec/openapi.yaml b/backend/internal/httpd/apispec/openapi.yaml index 2b60a3a5..970ec7f4 100644 --- a/backend/internal/httpd/apispec/openapi.yaml +++ b/backend/internal/httpd/apispec/openapi.yaml @@ -1,13 +1,12 @@ openapi: 3.1.0 info: title: Agent Orchestrator HTTP daemon - version: 0.1.0-route-shell + version: 0.1.0 description: | - Loopback-only HTTP surface served by the Go daemon. This spec is the - source of truth for every route's contract — the 501 stubs in the - route-shell phase return the matching Operation slice as a `spec` - field, so consumers discover the contract by calling the endpoint - they care about. Real handlers in later PRs satisfy this same spec. + Loopback-only HTTP surface served by the Go daemon. This document describes + the registered /api/v1 project routes and the shared error envelope used by + OpenAPI-backed 501 responses. Daemon control endpoints such as /healthz, + /readyz, /shutdown, and /mux are intentionally outside this REST spec. servers: - url: http://127.0.0.1:3001 @@ -135,12 +134,6 @@ paths: schema: { $ref: "#/components/schemas/APIError" } example: { error: internal, code: PROJECT_LOAD_FAILED, message: "Failed to load project" } "501": { $ref: "#/components/responses/NotImplemented" } - x-rest-audit-notes: | - R5: degraded projects return 200 with a `status` discriminator - instead of 200 with an `error` field (as the legacy TS surface did). - Archived projects are hidden from list responses but still resolve by - id so historical sessions can keep their project_id reference. 
- patch: operationId: updateProjectConfig tags: [projects] @@ -180,13 +173,6 @@ paths: application/json: schema: { $ref: "#/components/schemas/APIError" } example: { error: not_implemented, code: PROJECT_CONFIG_NOT_IMPLEMENTED, message: "Project config patching is not available until config persistence is wired" } - x-rest-audit-notes: | - R3: legacy `PUT /projects/{id}` (a TS alias of PATCH) is NOT - registered. PUT returns 405 Method Not Allowed. - R6: when config persistence lands this route returns { project }, not - { ok: true }. Until then, config patches return 501 instead of - pretending to persist fields the current project store cannot store. - delete: operationId: removeProject tags: [projects] @@ -221,10 +207,6 @@ paths: summary: Repair a degraded project where automatic recovery is available x-replaces: - "POST /api/v1/projects/{id}" - x-rest-audit-notes: | - R4: this canonical path replaces the overloaded - `POST /api/v1/projects/{id}` from the legacy TS surface. - The legacy path is NOT registered; consumers must use /repair. responses: "200": description: Project repaired @@ -323,12 +305,8 @@ components: description: "\"owner/name\" or empty string when unset" defaultBranch: { type: string, default: main } agent: { type: string } - runtime: { type: string } tracker: { $ref: "#/components/schemas/TrackerConfig" } scm: { $ref: "#/components/schemas/SCMConfig" } - reactions: - type: object - additionalProperties: { $ref: "#/components/schemas/ReactionConfig" } DegradedProject: type: object @@ -373,12 +351,8 @@ components: persistence exists. 
properties: agent: { type: string } - runtime: { type: string } tracker: { $ref: "#/components/schemas/TrackerConfig" } scm: { $ref: "#/components/schemas/SCMConfig" } - reactions: - type: object - additionalProperties: { $ref: "#/components/schemas/ReactionConfig" } RemoveProjectResult: type: object @@ -395,9 +369,7 @@ components: projectCount: { type: integer } degradedCount: { type: integer } - # ---- Behaviour config blobs (ported from the TS Zod schemas) ---- - # These are the known config shapes only. The current Go handler does not - # preserve unknown passthrough keys until config persistence is implemented. + # ---- Behaviour config blobs ---- TrackerConfig: type: object @@ -424,23 +396,3 @@ components: eventHeader: { type: string } deliveryHeader: { type: string } maxBodyBytes: { type: integer } - - ReactionConfig: - type: object - properties: - auto: { type: boolean } - action: - type: string - enum: [send-to-agent, notify, auto-merge] - message: { type: string } - priority: - type: string - enum: [urgent, action, warning, info] - retries: { type: integer } - escalateAfter: - oneOf: - - { type: number } - - { type: string } - description: Either ms (number) or duration string ("30m"). - threshold: { type: string } - includeSummary: { type: boolean } diff --git a/backend/internal/httpd/controllers/projects.go b/backend/internal/httpd/controllers/projects.go index 60e8159e..91a1e47d 100644 --- a/backend/internal/httpd/controllers/projects.go +++ b/backend/internal/httpd/controllers/projects.go @@ -1,16 +1,7 @@ // Package controllers holds the HTTP-facing controllers for the /api/v1 // surface. Each controller groups one resource's routes, exposes a Register -// method that wires them on a chi.Router, and depends on exactly one -// *Manager interface from ports/inbound.go — never on a store, the LCM, an -// adapter, or any other port. Whether the Manager impl reaches past that -// boundary is its own concern. 
-// -// In the route-shell PR (#20) every handler is a one-line apispec.NotImplemented -// call: the contract lives in the OpenAPI document (apispec/openapi.yaml), and -// the 501 body returns that document's slice for the route so consumers can -// discover the contract from the endpoint itself. When real handlers land, -// the stub one-liner is replaced with the impl; no per-route planned -// metadata in code ever has to be deleted. +// method, and depends on exactly one resource-level Manager interface — never +// directly on stores, lifecycle reducers, or adapters. package controllers import ( @@ -28,10 +19,8 @@ import ( "github.com/aoagents/agent-orchestrator/backend/internal/project" ) -// ProjectsController owns the 7 canonical /projects routes. The controller -// depends ONLY on project.Manager — it doesn't know whether the impl reaches -// into the registry, the LCM, an adapter, or all three. Mgr is nil while -// handlers are stubs; the handler-impl PR supplies a real project.Manager. +// ProjectsController owns the /projects routes. The controller depends only on +// project.Manager; nil keeps routes registered but returns OpenAPI-backed 501s. type ProjectsController struct { Mgr project.Manager } @@ -39,12 +28,6 @@ type ProjectsController struct { // Register mounts the project routes on the supplied router. Route order // matters: /projects/reload must register before /projects/{id} for the POST // verb, otherwise chi would treat "reload" as an {id} match for repair. -// -// Legacy paths that the REST audit dropped are deliberately NOT registered -// here. They surface as 405 (sibling method exists, e.g. PUT /projects/{id}) -// or 404 (no sibling). The mapping lives in apispec/openapi.yaml as -// `x-replaces` on the canonical operation so consumers discover the -// migration without leaving the spec. 
func (c *ProjectsController) Register(r chi.Router) { r.Get("/projects", c.list) r.Post("/projects", c.add) diff --git a/backend/internal/httpd/controllers/projects_test.go b/backend/internal/httpd/controllers/projects_test.go index d1ca2442..8d303da5 100644 --- a/backend/internal/httpd/controllers/projects_test.go +++ b/backend/internal/httpd/controllers/projects_test.go @@ -136,7 +136,7 @@ func TestProjectsAPI_UpdateDeleteRepair(t *testing.T) { t.Fatalf("seed create = %d, want 201; body=%s", status, body) } - body, status, _ = doRequest(t, srv, "PATCH", "/api/v1/projects/proj", `{"agent":"claude","runtime":"tmux"}`) + body, status, _ = doRequest(t, srv, "PATCH", "/api/v1/projects/proj", `{"agent":"claude"}`) assertErrorCode(t, body, status, http.StatusNotImplemented, "PROJECT_CONFIG_NOT_IMPLEMENTED") body, status, _ = doRequest(t, srv, "PATCH", "/api/v1/projects/proj", `{"path":"elsewhere"}`) @@ -229,7 +229,6 @@ type projectBody struct { Repo string `json:"repo"` DefaultBranch string `json:"defaultBranch"` Agent string `json:"agent"` - Runtime string `json:"runtime"` } type errorBody struct { diff --git a/backend/internal/httpd/errors.go b/backend/internal/httpd/errors.go deleted file mode 100644 index 8b41c99f..00000000 --- a/backend/internal/httpd/errors.go +++ /dev/null @@ -1,22 +0,0 @@ -package httpd - -import ( - "net/http" - - "github.com/aoagents/agent-orchestrator/backend/internal/httpd/envelope" -) - -// APIError is the locked wire shape for every non-2xx response. It supersedes -// the legacy TS `{error: "msg"}` bag with a machine-readable Code and a -// RequestID for log correlation (sourced from chi's RequestID middleware). -// -// Details is open so collision-style errors can carry typed sub-fields -// (e.g. existingProjectId, suggestedProjectId on POST /projects 409s). -type APIError = envelope.APIError - -// writeAPIError emits the locked envelope for any non-2xx response. 
The -// request id falls back to empty when the chi middleware hasn't tagged the -// request (e.g. in tests that bypass NewRouter). -func writeAPIError(w http.ResponseWriter, r *http.Request, status int, kind, code, message string, details map[string]any) { - envelope.WriteAPIError(w, r, status, kind, code, message, details) -} diff --git a/backend/internal/httpd/json.go b/backend/internal/httpd/json.go deleted file mode 100644 index 64ccb340..00000000 --- a/backend/internal/httpd/json.go +++ /dev/null @@ -1,14 +0,0 @@ -package httpd - -import ( - "net/http" - - "github.com/aoagents/agent-orchestrator/backend/internal/httpd/envelope" -) - -// writeJSON serialises v as JSON with the given status. It is the single JSON -// writer for the skeleton; the typed error envelope (open item Q1.3) will build -// on this in a later phase. -func writeJSON(w http.ResponseWriter, status int, v any) { - envelope.WriteJSON(w, status, v) -} diff --git a/backend/internal/httpd/logger.go b/backend/internal/httpd/logger.go new file mode 100644 index 00000000..0df29da0 --- /dev/null +++ b/backend/internal/httpd/logger.go @@ -0,0 +1,10 @@ +package httpd + +import "log/slog" + +func loggerOrDefault(log *slog.Logger) *slog.Logger { + if log != nil { + return log + } + return slog.Default() +} diff --git a/backend/internal/httpd/logger_test.go b/backend/internal/httpd/logger_test.go new file mode 100644 index 00000000..ddd6d308 --- /dev/null +++ b/backend/internal/httpd/logger_test.go @@ -0,0 +1,19 @@ +package httpd + +import ( + "net/http" + "net/http/httptest" + "testing" + + "github.com/aoagents/agent-orchestrator/backend/internal/config" +) + +func TestNewRouterAllowsNilLogger(t *testing.T) { + router := NewRouter(config.Config{}, nil, nil) + rec := httptest.NewRecorder() + req := httptest.NewRequest(http.MethodGet, "/healthz", nil) + router.ServeHTTP(rec, req) + if rec.Code != http.StatusOK { + t.Fatalf("/healthz status = %d, want 200", rec.Code) + } +} diff --git 
a/backend/internal/httpd/router.go b/backend/internal/httpd/router.go index 19590738..5d73156d 100644 --- a/backend/internal/httpd/router.go +++ b/backend/internal/httpd/router.go @@ -1,7 +1,5 @@ -// Package httpd builds and runs the daemon's HTTP surface. Phase 1a is the -// skeleton: the middleware stack, liveness/readiness probes, and a graceful -// run loop. Route registration (/api/v1, /events, /mux, /) lands in later -// phases on top of the router this package builds. +// Package httpd builds and runs the daemon's HTTP surface: middleware, health +// probes, daemon control, REST APIs, and terminal WebSocket routing. package httpd import ( @@ -15,6 +13,7 @@ import ( "github.com/aoagents/agent-orchestrator/backend/internal/config" "github.com/aoagents/agent-orchestrator/backend/internal/daemonmeta" + "github.com/aoagents/agent-orchestrator/backend/internal/httpd/envelope" "github.com/aoagents/agent-orchestrator/backend/internal/terminal" ) @@ -28,11 +27,8 @@ import ( // requestLogger → slog-backed access log, stderr, carries the request id // RealIP → normalise client IP (loopback proxy from the dev server) // -// The per-request Timeout from the decision table is deliberately NOT applied -// globally: it must wrap only the /api/v1 REST surface, never the long-lived -// SSE (/events) or WebSocket (/mux) surfaces, nor the always-must-answer health -// probes. It is therefore applied per-surface when those subrouters are mounted -// in Phase 1b; cfg.RequestTimeout carries the value through to that point. +// The per-request timeout is deliberately not global: it wraps only bounded +// REST routes, never long-lived terminal streams or health probes. func NewRouter(cfg config.Config, log *slog.Logger, termMgr *terminal.Manager) chi.Router { return NewRouterWithAPI(cfg, log, termMgr, APIDeps{}) } @@ -43,9 +39,8 @@ type ControlDeps struct { RequestShutdown func() } -// NewRouterWithAPI is the dependency-injected variant. 
main.go calls it with -// real Managers when they exist; tests/dev wiring inject mocks explicitly. -// Missing Managers intentionally keep the route-shell 501 behavior. +// NewRouterWithAPI is the dependency-injected variant. Missing Managers keep +// routes registered but return OpenAPI-backed 501 responses. func NewRouterWithAPI(cfg config.Config, log *slog.Logger, termMgr *terminal.Manager, deps APIDeps) chi.Router { return NewRouterWithControl(cfg, log, termMgr, deps, ControlDeps{}) } @@ -53,6 +48,7 @@ func NewRouterWithAPI(cfg config.Config, log *slog.Logger, termMgr *terminal.Man // NewRouterWithControl is NewRouterWithAPI plus daemon-control hooks: it mounts // the same API surface and additionally wires the ControlDeps callbacks. func NewRouterWithControl(cfg config.Config, log *slog.Logger, termMgr *terminal.Manager, deps APIDeps, control ControlDeps) chi.Router { + log = loggerOrDefault(log) r := chi.NewRouter() r.Use(middleware.Recoverer) @@ -67,7 +63,7 @@ func NewRouterWithControl(cfg config.Config, log *slog.Logger, termMgr *terminal r.MethodNotAllowed(methodNotAllowedJSON) mountHealth(r) - mountMux(r, termMgr, log) + mountTerminalMux(r, termMgr, log) mountControl(r, control) NewAPI(cfg, deps).Register(r) @@ -91,13 +87,13 @@ func mountControl(r chi.Router, deps ControlDeps) { } r.Post("/shutdown", func(w http.ResponseWriter, req *http.Request) { if !localControlRequest(req) { - writeJSON(w, http.StatusForbidden, map[string]any{ + envelope.WriteJSON(w, http.StatusForbidden, map[string]any{ "status": "forbidden", "service": daemonmeta.ServiceName, }) return } - writeJSON(w, http.StatusAccepted, map[string]any{ + envelope.WriteJSON(w, http.StatusAccepted, map[string]any{ "status": "shutting_down", "service": daemonmeta.ServiceName, "pid": os.Getpid(), @@ -132,18 +128,17 @@ func localControlRequest(r *http.Request) bool { // handleHealthz is the liveness probe: it answers 200 as long as the process is // up and serving. 
It does no dependency checks by design. func handleHealthz(w http.ResponseWriter, _ *http.Request) { - writeJSON(w, http.StatusOK, map[string]any{ + envelope.WriteJSON(w, http.StatusOK, map[string]any{ "status": "ok", "service": daemonmeta.ServiceName, "pid": os.Getpid(), }) } -// handleReadyz is the readiness probe. In the 1a skeleton the daemon is ready -// as soon as it is listening; later phases will gate this on dependency -// initialisation (e.g. store/event-bus warm-up). +// handleReadyz is the readiness probe. Dependency initialization happens before +// the server is constructed, so a listening daemon is ready to answer requests. func handleReadyz(w http.ResponseWriter, _ *http.Request) { - writeJSON(w, http.StatusOK, map[string]any{ + envelope.WriteJSON(w, http.StatusOK, map[string]any{ "status": "ready", "service": daemonmeta.ServiceName, "pid": os.Getpid(), diff --git a/backend/internal/httpd/server.go b/backend/internal/httpd/server.go index a9ddcbde..a1b8e615 100644 --- a/backend/internal/httpd/server.go +++ b/backend/internal/httpd/server.go @@ -34,6 +34,12 @@ type Server struct { // the returned Server's lifecycle via Run. termMgr may be nil, in which case // the /mux terminal surface is not mounted. func New(cfg config.Config, log *slog.Logger, termMgr *terminal.Manager) (*Server, error) { + return NewWithDeps(cfg, log, termMgr, APIDeps{}) +} + +// NewWithDeps constructs a Server with API dependencies supplied by the daemon. 
+func NewWithDeps(cfg config.Config, log *slog.Logger, termMgr *terminal.Manager, deps APIDeps) (*Server, error) { + log = loggerOrDefault(log) ln, err := net.Listen("tcp", cfg.Addr()) if err != nil { return nil, fmt.Errorf("bind %s (is a daemon already running?): %w", cfg.Addr(), err) @@ -46,7 +52,7 @@ func New(cfg config.Config, log *slog.Logger, termMgr *terminal.Manager) (*Serve shutdownRequested: make(chan struct{}), } srv.http = &http.Server{ - Handler: NewRouterWithControl(cfg, log, termMgr, APIDeps{}, ControlDeps{ + Handler: NewRouterWithControl(cfg, log, termMgr, deps, ControlDeps{ RequestShutdown: srv.requestShutdown, }), // ReadHeaderTimeout guards against slow-loris even on loopback; @@ -75,7 +81,7 @@ func (s *Server) Run(ctx context.Context) error { return fmt.Errorf("write run-file: %w", err) } defer func() { - if err := runfile.Remove(s.cfg.RunFilePath); err != nil { + if err := runfile.RemoveIfOwned(s.cfg.RunFilePath, info.PID); err != nil { s.log.Warn("failed to remove run-file", "path", s.cfg.RunFilePath, "err", err) } }() diff --git a/backend/internal/httpd/mux.go b/backend/internal/httpd/terminal_mux.go similarity index 51% rename from backend/internal/httpd/mux.go rename to backend/internal/httpd/terminal_mux.go index 0c17a548..ef038fd2 100644 --- a/backend/internal/httpd/mux.go +++ b/backend/internal/httpd/terminal_mux.go @@ -12,48 +12,50 @@ import ( "github.com/aoagents/agent-orchestrator/backend/internal/terminal" ) -// muxReadLimit caps a single inbound frame. Client→server frames are small +// terminalMuxReadLimit caps a single inbound frame. Client→server frames are small // (keystrokes, resize, control), so a generous 1 MiB is ample headroom while // still bounding memory per message. -const muxReadLimit = 1 << 20 +const terminalMuxReadLimit = 1 << 20 -// mountMux registers the long-lived terminal-multiplexing WebSocket at /mux. It +// mountTerminalMux registers the long-lived terminal-multiplexing WebSocket at /mux. 
It // is intentionally outside the per-request Timeout middleware (the connection is // long-lived). When mgr is nil the route is not mounted — the daemon simply has // no terminal surface yet. -func mountMux(r chi.Router, mgr *terminal.Manager, log *slog.Logger) { +func mountTerminalMux(r chi.Router, mgr *terminal.Manager, log *slog.Logger) { if mgr == nil { return } - r.Get("/mux", muxHandler(mgr, log)) + r.Get("/mux", terminalMuxHandler(mgr, log)) } -// muxHandler upgrades the request to a WebSocket and hands the connection to the +// terminalMuxHandler upgrades the request to a WebSocket and hands the connection to the // terminal manager. httpd owns only the upgrade and the transport adaptation; // all stream logic lives in internal/terminal. -func muxHandler(mgr *terminal.Manager, log *slog.Logger) http.HandlerFunc { +func terminalMuxHandler(mgr *terminal.Manager, log *slog.Logger) http.HandlerFunc { return func(w http.ResponseWriter, r *http.Request) { // InsecureSkipVerify disables coder/websocket's same-origin check: the // daemon binds loopback only and the desktop renderer's origin differs // from the loopback host, mirroring the legacy Node mux server. c, err := websocket.Accept(w, r, &websocket.AcceptOptions{InsecureSkipVerify: true}) if err != nil { - log.Warn("mux: websocket upgrade failed", "err", err) + log.Warn("terminal mux: websocket upgrade failed", "err", err) return } - c.SetReadLimit(muxReadLimit) - mgr.Serve(r.Context(), &coderConn{c: c}) + c.SetReadLimit(terminalMuxReadLimit) + mgr.Serve(r.Context(), &terminalMuxConn{c: c}) } } -// coderConn adapts a coder/websocket connection to terminal.wsConn. JSON framing +// terminalMuxConn adapts a coder/websocket connection to terminal.wsConn. JSON framing // uses wsjson (text messages); Ping is a control frame; Close sends a normal // closure. 
-type coderConn struct{ c *websocket.Conn } +type terminalMuxConn struct{ c *websocket.Conn } -func (a *coderConn) ReadJSON(ctx context.Context, v any) error { return wsjson.Read(ctx, a.c, v) } -func (a *coderConn) WriteJSON(ctx context.Context, v any) error { return wsjson.Write(ctx, a.c, v) } -func (a *coderConn) Ping(ctx context.Context) error { return a.c.Ping(ctx) } -func (a *coderConn) Close(reason string) error { +func (a *terminalMuxConn) ReadJSON(ctx context.Context, v any) error { return wsjson.Read(ctx, a.c, v) } +func (a *terminalMuxConn) WriteJSON(ctx context.Context, v any) error { + return wsjson.Write(ctx, a.c, v) +} +func (a *terminalMuxConn) Ping(ctx context.Context) error { return a.c.Ping(ctx) } +func (a *terminalMuxConn) Close(reason string) error { return a.c.Close(websocket.StatusNormalClosure, reason) } diff --git a/backend/internal/httpd/mux_test.go b/backend/internal/httpd/terminal_mux_test.go similarity index 90% rename from backend/internal/httpd/mux_test.go rename to backend/internal/httpd/terminal_mux_test.go index b334cf8b..fc7bca5f 100644 --- a/backend/internal/httpd/mux_test.go +++ b/backend/internal/httpd/terminal_mux_test.go @@ -17,9 +17,9 @@ import ( "github.com/aoagents/agent-orchestrator/backend/internal/terminal" ) -// stubSource attaches a throwaway shell command instead of a real tmux pane, so +// stubSource attaches a throwaway shell command instead of a real Zellij pane, so // the /mux path exercises the genuine upgrade + wsjson + Serve + creack/pty flow -// without needing tmux. IsAlive=false means the pane is treated as gone once the +// without needing Zellij. IsAlive=false means the pane is treated as gone once the // command exits (no re-attach). 
type stubSource struct{ argv []string } @@ -28,7 +28,7 @@ func (stubSource) IsAlive(context.Context, ports.RuntimeHandle) (bool, error) { return false, nil } -type muxFrame struct { +type terminalMuxFrame struct { Ch string `json:"ch"` ID string `json:"id"` Type string `json:"type"` @@ -52,12 +52,12 @@ func dialMux(t *testing.T, mgr *terminal.Manager) (*websocket.Conn, func()) { } } -func readFrame(t *testing.T, c *websocket.Conn, ch, typ string, d time.Duration) muxFrame { +func readFrame(t *testing.T, c *websocket.Conn, ch, typ string, d time.Duration) terminalMuxFrame { t.Helper() ctx, cancel := context.WithTimeout(context.Background(), d) defer cancel() for { - var f muxFrame + var f terminalMuxFrame if err := wsjson.Read(ctx, c, &f); err != nil { t.Fatalf("waiting for %s/%s: %v", ch, typ, err) } @@ -81,7 +81,7 @@ func TestMuxUpgradeStreamsTerminal(t *testing.T) { defer done() ctx := context.Background() - if err := wsjson.Write(ctx, c, muxFrame{Ch: "terminal", ID: "t1", Type: "open"}); err != nil { + if err := wsjson.Write(ctx, c, terminalMuxFrame{Ch: "terminal", ID: "t1", Type: "open"}); err != nil { t.Fatalf("write open: %v", err) } diff --git a/backend/internal/integration/lifecycle_sqlite_test.go b/backend/internal/integration/lifecycle_sqlite_test.go index e14a93fe..670fa150 100644 --- a/backend/internal/integration/lifecycle_sqlite_test.go +++ b/backend/internal/integration/lifecycle_sqlite_test.go @@ -1,731 +1,163 @@ -// Package integration exercises the lifecycle + session lane against the real -// SQLite store and the real CDC trigger pipeline. Unit tests stay on the -// in-memory fakes in lifecycle/ and session/; these live-fire tests prove the -// wiring across packages actually flows: SM -> store row -> LCM mutate -> store -// update -> DB trigger -> change_log read. 
package integration import ( "context" - "io" - "log/slog" - "path/filepath" - "strings" - "sync" "testing" "time" "github.com/aoagents/agent-orchestrator/backend/internal/cdc" "github.com/aoagents/agent-orchestrator/backend/internal/domain" "github.com/aoagents/agent-orchestrator/backend/internal/lifecycle" - "github.com/aoagents/agent-orchestrator/backend/internal/notification" "github.com/aoagents/agent-orchestrator/backend/internal/ports" + prsvc "github.com/aoagents/agent-orchestrator/backend/internal/pr" + "github.com/aoagents/agent-orchestrator/backend/internal/project" "github.com/aoagents/agent-orchestrator/backend/internal/session" "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite" ) -// ---- plugin fakes (minimal: only enough to drive SM through real LCM) ---- +type stubRuntime struct{ created, destroyed int } -type stubRuntime struct { - id, name string -} - -func (s *stubRuntime) Create(_ context.Context, cfg ports.RuntimeConfig) (ports.RuntimeHandle, error) { - return ports.RuntimeHandle{ID: s.id, RuntimeName: s.name}, nil -} -func (s *stubRuntime) Destroy(context.Context, ports.RuntimeHandle) error { return nil } -func (s *stubRuntime) IsAlive(context.Context, ports.RuntimeHandle) (bool, error) { - return true, nil +func (s *stubRuntime) Create(context.Context, ports.RuntimeConfig) (ports.RuntimeHandle, error) { + s.created++ + return ports.RuntimeHandle{ID: "h1"}, nil } +func (s *stubRuntime) Destroy(context.Context, ports.RuntimeHandle) error { s.destroyed++; return nil } +func (s *stubRuntime) IsAlive(context.Context, ports.RuntimeHandle) (bool, error) { return true, nil } type stubAgent struct{} -func (stubAgent) GetLaunchCommand(ports.AgentConfig) string { return "launch" } -func (stubAgent) GetEnvironment(ports.AgentConfig) map[string]string { return map[string]string{} } -func (stubAgent) GetRestoreCommand(id string) string { return "resume " + id } - -type stubWorkspace struct { - root string +func (stubAgent) 
GetLaunchCommand(ports.AgentConfig) string { return "launch" } +func (stubAgent) GetEnvironment(ports.AgentConfig) map[string]string { + return map[string]string{"X": "1"} } +func (stubAgent) GetRestoreCommand(id string) string { return "resume " + id } -func (w *stubWorkspace) Create(_ context.Context, cfg ports.WorkspaceConfig) (ports.WorkspaceInfo, error) { - return ports.WorkspaceInfo{ - Path: filepath.Join(w.root, string(cfg.SessionID)), - Branch: cfg.Branch, - SessionID: cfg.SessionID, - ProjectID: cfg.ProjectID, - }, nil -} -func (w *stubWorkspace) Destroy(context.Context, ports.WorkspaceInfo) error { return nil } -func (w *stubWorkspace) Restore(ctx context.Context, cfg ports.WorkspaceConfig) (ports.WorkspaceInfo, error) { - return w.Create(ctx, cfg) -} +type stubWorkspace struct{ destroyed int } -type captureMessenger struct { - mu sync.Mutex - msgs []string +func (s *stubWorkspace) Create(_ context.Context, cfg ports.WorkspaceConfig) (ports.WorkspaceInfo, error) { + return ports.WorkspaceInfo{Path: "/ws/" + string(cfg.SessionID), Branch: cfg.Branch, SessionID: cfg.SessionID, ProjectID: cfg.ProjectID}, nil } - -func (m *captureMessenger) Send(_ context.Context, _ domain.SessionID, msg string) error { - m.mu.Lock() - defer m.mu.Unlock() - m.msgs = append(m.msgs, msg) +func (s *stubWorkspace) Destroy(context.Context, ports.WorkspaceInfo) error { + s.destroyed++ return nil } -func (m *captureMessenger) drain() []string { - m.mu.Lock() - defer m.mu.Unlock() - out := append([]string(nil), m.msgs...) 
- m.msgs = nil - return out +func (s *stubWorkspace) Restore(ctx context.Context, cfg ports.WorkspaceConfig) (ports.WorkspaceInfo, error) { + return s.Create(ctx, cfg) } -type captureNotifier struct { - mu sync.Mutex - events []ports.Event -} +type captureMessenger struct{ msgs []string } -func (n *captureNotifier) Notify(_ context.Context, e ports.Event) error { - n.mu.Lock() - defer n.mu.Unlock() - n.events = append(n.events, e) +func (c *captureMessenger) Send(_ context.Context, _ domain.SessionID, msg string) error { + c.msgs = append(c.msgs, msg) return nil } -func (n *captureNotifier) drain() []ports.Event { - n.mu.Lock() - defer n.mu.Unlock() - out := append([]ports.Event(nil), n.events...) - n.events = nil - return out -} - -// ---- harness: real store + real LCM + real SM + change_log poller ---- -type liveStack struct { - dataDir string - store *sqlite.Store - lcm *lifecycle.Manager - sm *session.Manager - notifier *captureNotifier - messenger *captureMessenger - - closed bool // guard so the explicit close() and t.Cleanup don't double-close +type stack struct { + store *sqlite.Store + sm *session.Manager + lcm *lifecycle.Manager + prm *prsvc.Manager + rt *stubRuntime + ws *stubWorkspace + msg *captureMessenger } -// openLiveStack opens the store + hydrates the LCM/SM and registers an -// idempotent t.Cleanup so a mid-test t.Fatalf can't leak the SQLite handle. -// Tests that need to simulate a daemon restart still call close() explicitly -// between phases; the cleanup hook becomes a no-op once that runs. 
-func openLiveStack(t *testing.T, dataDir string) *liveStack { +func newStack(t *testing.T) *stack { t.Helper() - store, err := sqlite.Open(dataDir) + ctx := context.Background() + store, err := sqlite.Open(t.TempDir()) if err != nil { - t.Fatalf("open sqlite: %v", err) - } - notifier := &captureNotifier{} - messenger := &captureMessenger{} - lcm := lifecycle.New(store, store, notifier, messenger) - - wsRoot := t.TempDir() - sm := session.New(session.Deps{ - Runtime: &stubRuntime{id: "h1", name: "tmux"}, - Agent: stubAgent{}, - Workspace: &stubWorkspace{root: wsRoot}, - Store: store, - Messenger: messenger, - Lifecycle: lcm, - }) - st := &liveStack{ - dataDir: dataDir, - store: store, - lcm: lcm, - sm: sm, - notifier: notifier, - messenger: messenger, - } - t.Cleanup(func() { - if st.closed { - return - } - // Best-effort: failures here would be noise after t.Fatalf already - // recorded the real cause. - _ = st.store.Close() - st.closed = true - }) - return st -} - -func (s *liveStack) close(t *testing.T) { - t.Helper() - if s.closed { - return - } - s.closed = true - if err := s.store.Close(); err != nil { - t.Fatalf("close store: %v", err) + t.Fatal(err) } -} - -func seedProject(t *testing.T, store *sqlite.Store, id string) { - t.Helper() - if err := store.UpsertProject(context.Background(), sqlite.ProjectRow{ - ID: id, Path: "/repo/" + id, RegisteredAt: time.Now(), - }); err != nil { - t.Fatalf("upsert project: %v", err) - } -} - -func durableLifecycle(store *sqlite.Store, messenger ports.AgentMessenger) *lifecycle.Manager { - renderer := notification.NewRenderer(store) - logger := slog.New(slog.NewTextHandler(io.Discard, nil)) - notifier := notification.NewEnqueuer(store, renderer, logger) - return lifecycle.New(store, store, notifier, messenger) -} - -func durableRecord(project, issue, branch string) domain.SessionRecord { - now := time.Now().UTC().Truncate(time.Second) - return domain.SessionRecord{ - ProjectID: domain.ProjectID(project), - IssueID: 
domain.IssueID(issue), - Kind: domain.KindWorker, - Lifecycle: domain.CanonicalSessionLifecycle{ - Version: domain.LifecycleVersion, - Session: domain.SessionSubstate{State: domain.SessionWorking}, - IsAlive: true, - Activity: domain.ActivitySubstate{ - State: domain.ActivityActive, LastActivityAt: now, Source: domain.SourceHook, - }, - }, - Metadata: domain.SessionMetadata{Branch: branch, WorkspacePath: "/workspace/" + branch}, - CreatedAt: now, - UpdatedAt: now, + t.Cleanup(func() { _ = store.Close() }) + if err := store.Upsert(ctx, project.Row{ID: "mer", Path: "/repo/mer", RegisteredAt: time.Now()}); err != nil { + t.Fatal(err) } + msg := &captureMessenger{} + lcm := lifecycle.New(store, msg) + prm := prsvc.New(prsvc.Deps{Writer: store, Lifecycle: lcm}) + rt := &stubRuntime{} + ws := &stubWorkspace{} + sm := session.New(session.Deps{Runtime: rt, Agent: stubAgent{}, Workspace: ws, Store: store, Messenger: msg, Lifecycle: lcm}) + return &stack{store: store, sm: sm, lcm: lcm, prm: prm, rt: rt, ws: ws, msg: msg} } -// ---- tests ---- - -// TestHappyPath drives Spawn -> SCM PR observation (open + CI passing) -> Kill, -// asserting via direct store reads that the canonical row, the PR row, and the -// change_log stream all reflect what each step contributed. -func TestHappyPath_Spawn_PR_Kill(t *testing.T) { - t.Parallel() +func TestSpawnPRKillRoundTrip(t *testing.T) { ctx := context.Background() - st := openLiveStack(t, t.TempDir()) - defer st.close(t) - seedProject(t, st.store, "mer") - - // 1. Spawn — SM inserts the session row, LCM marks it live. We only assert - // the structural invariant of the id (project-scoped, non-empty), not the - // literal counter — that's a store-internal detail. 
- sess, err := st.sm.Spawn(ctx, ports.SpawnConfig{ - ProjectID: "mer", Kind: domain.KindWorker, Prompt: "ship it", - }) + st := newStack(t) + sess, err := st.sm.Spawn(ctx, ports.SpawnConfig{ProjectID: "mer", Kind: domain.KindWorker, Branch: "b", Prompt: "do it"}) if err != nil { - t.Fatalf("spawn: %v", err) + t.Fatal(err) } - if sess.ID == "" || !strings.HasPrefix(string(sess.ID), "mer-") { - t.Fatalf("expected project-scoped id like mer-N, got %q", sess.ID) + if sess.ID != "mer-1" || sess.Status != domain.StatusIdle { + t.Fatalf("spawn got %+v", sess) } - - rec, ok, err := st.store.GetSession(ctx, sess.ID) - if err != nil || !ok { - t.Fatalf("get session: ok=%v err=%v", ok, err) - } - if !rec.Lifecycle.IsAlive { - t.Fatal("post-spawn: is_alive should be true") - } - if rec.Lifecycle.Session.State != domain.SessionNotStarted { - t.Fatalf("post-spawn state want not_started, got %q", rec.Lifecycle.Session.State) - } - if rec.Metadata.RuntimeHandleID != "h1" || rec.Metadata.RuntimeName != "tmux" { - t.Fatalf("post-spawn handles missing: %+v", rec.Metadata) - } - if rec.Metadata.WorkspacePath == "" || rec.Metadata.Prompt != "ship it" { - t.Fatalf("post-spawn metadata missing: %+v", rec.Metadata) + rec, ok, _ := st.store.GetSession(ctx, sess.ID) + if !ok || rec.Metadata.RuntimeHandleID != "h1" || rec.IsTerminated { + t.Fatalf("post-spawn row wrong: %+v", rec) } - - // 2. SCM observes a fresh PR — open, CI passing. LCM writes the pr row - // atomically (one tx, triggers fire pr_created). 
- prURL := "https://github.com/repo/mer/pull/1" - if err := st.lcm.ApplyPRObservation(ctx, sess.ID, ports.PRObservation{ - Fetched: true, URL: prURL, Number: 1, - CI: domain.CIPassing, Review: domain.ReviewNone, Mergeability: domain.MergeMergeable, - Checks: []domain.PRCheckRow{{ - Name: "ci/build", CommitHash: "abc123", Status: "passed", CreatedAt: time.Now(), - }}, - }); err != nil { - t.Fatalf("apply pr: %v", err) + if err := st.prm.ApplyObservation(ctx, sess.ID, ports.PRObservation{Fetched: true, URL: "pr1", Number: 1, CI: domain.CIFailing, Checks: []ports.PRCheckObservation{{Name: "build", CommitHash: "c1", Status: domain.PRCheckFailed, LogTail: "boom"}}}); err != nil { + t.Fatal(err) } - prRow, ok, err := st.store.GetPR(ctx, prURL) - if err != nil || !ok { - t.Fatalf("get pr: ok=%v err=%v", ok, err) + got, err := st.sm.Get(ctx, sess.ID) + if err != nil { + t.Fatal(err) } - if prRow.SessionID != string(sess.ID) || prRow.CI != domain.CIPassing || prRow.Draft || prRow.Merged || prRow.Closed { - t.Fatalf("pr row wrong: %+v", prRow) + if got.Status != domain.StatusCIFailed { + t.Fatalf("want ci_failed, got %q", got.Status) } - - // 3. Kill — SM routes to LCM and tears down runtime+workspace. - freed, err := st.sm.Kill(ctx, sess.ID, domain.TermManuallyKilled) + freed, err := st.sm.Kill(ctx, sess.ID) if err != nil || !freed { t.Fatalf("kill freed=%v err=%v", freed, err) } rec, _, _ = st.store.GetSession(ctx, sess.ID) - if rec.Lifecycle.Session.State != domain.SessionTerminated || - rec.Lifecycle.TerminationReason != domain.TermManuallyKilled || - rec.Lifecycle.IsAlive { - t.Fatalf("post-kill canonical wrong: %+v", rec.Lifecycle) - } - - // 4. Assert the change_log captured the full timeline. The DB triggers - // write the only durable CDC; we don't want to assume an ordering of - // interleaved events, just that each expected event_type shows up. 
- rows, err := st.store.ReadChangeLogAfter(ctx, 0, 100) - if err != nil { - t.Fatalf("read change_log: %v", err) - } - seen := map[string]bool{} - for _, r := range rows { - seen[r.EventType] = true - } - for _, want := range []string{"session_created", "session_updated", "pr_created", "pr_check_recorded"} { - if !seen[want] { - t.Fatalf("missing change_log event %q (got: %v)", want, seen) - } + if !rec.IsTerminated { + t.Fatalf("post-kill row should be terminated: %+v", rec) } } -// TestRestoreRoundTrip simulates a daemon restart: spawn a session, persist the -// kill, fully close the in-process LCM/SM, open a fresh stack against the SAME -// DB file, and Restore. The restored session must keep its metadata (the agent -// session id is the must-survive bit). -func TestRestoreRoundTrip_PreservesMetadata(t *testing.T) { - t.Parallel() +func TestRestoreRoundTripPreservesMetadata(t *testing.T) { ctx := context.Background() - dir := t.TempDir() - st := openLiveStack(t, dir) - seedProject(t, st.store, "mer") - - // Phase A: spawn with an agent session id, then kill so the row is terminal - // and Restore is legal. - sess, err := st.sm.Spawn(ctx, ports.SpawnConfig{ - ProjectID: "mer", Kind: domain.KindWorker, Prompt: "remember me", - }) + st := newStack(t) + sess, err := st.sm.Spawn(ctx, ports.SpawnConfig{ProjectID: "mer", Kind: domain.KindWorker, Branch: "b", Prompt: "prompt"}) if err != nil { - t.Fatalf("spawn: %v", err) - } - // fold an AgentSessionID into the row — the LCM does this through the spawn - // outcome on Restore too, but a fresh spawn doesn't (the agent has not - // reported one yet). We patch via the store so the restore branch has - // something to resume from. Check ok/err: without it, a missed row would - // hand UpdateSession a zero-value record (ID==""), which matches no rows - // and returns nil — Phase B would then fail with a misleading "agent id - // lost across restart" rather than the real cause. 
- rec, ok, err := st.store.GetSession(ctx, sess.ID) - if err != nil || !ok { - t.Fatalf("get session for patch: ok=%v err=%v", ok, err) + t.Fatal(err) } - rec.Metadata.AgentSessionID = "agent-xyz" + rec, _, _ := st.store.GetSession(ctx, sess.ID) + rec.Metadata.AgentSessionID = "agent-x" if err := st.store.UpdateSession(ctx, rec); err != nil { - t.Fatalf("patch agent id: %v", err) - } - if _, err := st.sm.Kill(ctx, sess.ID, domain.TermManuallyKilled); err != nil { - t.Fatalf("kill: %v", err) - } - st.close(t) - - // Phase B: reopen against the same data dir; everything in memory is gone. - st2 := openLiveStack(t, dir) - defer st2.close(t) - - // Confirm the row survived the restart. - rec2, ok, err := st2.store.GetSession(ctx, sess.ID) - if err != nil || !ok { - t.Fatalf("reopen get: ok=%v err=%v", ok, err) - } - if rec2.Metadata.AgentSessionID != "agent-xyz" { - t.Fatalf("agent session id lost across restart: %+v", rec2.Metadata) - } - if rec2.Lifecycle.Session.State != domain.SessionTerminated { - t.Fatalf("expected terminal after reopen, got %q", rec2.Lifecycle.Session.State) - } - - // Phase C: Restore — must drive a fresh OnSpawnCompleted and surface the - // preserved AgentSessionID into the new outcome. - restored, err := st2.sm.Restore(ctx, sess.ID) - if err != nil { - t.Fatalf("restore: %v", err) - } - if !restored.Lifecycle.IsAlive { - t.Fatal("restored session should be is_alive after spawn-completed") - } - if restored.Metadata.AgentSessionID != "agent-xyz" { - t.Fatalf("restored row dropped AgentSessionID: %+v", restored.Metadata) - } -} - -// TestCIFailureAndRecovery drives the CI-failed reaction path: a failing -// observation injects a nudge into the agent (messenger), a recovery -// observation (CI passing) flips state without re-firing the nudge, and the -// pr_checks history records both runs so the brake's "last 3 all failed" query -// reads the truth. 
-func TestCIFailureAndRecovery_NudgeThenClears(t *testing.T) { - t.Parallel() - ctx := context.Background() - st := openLiveStack(t, t.TempDir()) - defer st.close(t) - seedProject(t, st.store, "mer") - - sess, err := st.sm.Spawn(ctx, ports.SpawnConfig{ProjectID: "mer", Kind: domain.KindWorker, Prompt: "."}) - if err != nil { - t.Fatalf("spawn: %v", err) - } - // Move the session out of not_started so the reaction path engages on real - // PR facts (not_started doesn't react on PRs). - if err := st.lcm.ApplyActivitySignal(ctx, sess.ID, ports.ActivitySignal{ - Valid: true, State: domain.ActivityActive, Source: domain.SourceHook, Timestamp: time.Now(), - }); err != nil { - t.Fatalf("activity: %v", err) - } - _ = st.messenger.drain() // ignore startup nudges, focus on CI - - prURL := "https://github.com/repo/mer/pull/2" - // Failing CI: handleCIFailure should send a CI-failed nudge with the log - // tail injected. - if err := st.lcm.ApplyPRObservation(ctx, sess.ID, ports.PRObservation{ - Fetched: true, URL: prURL, Number: 2, - CI: domain.CIFailing, Mergeability: domain.MergeUnstable, - Checks: []domain.PRCheckRow{{ - Name: "ci/build", CommitHash: "c1", Status: "failed", LogTail: "panic: nil map", CreatedAt: time.Now(), - }}, - }); err != nil { - t.Fatalf("apply pr (failing): %v", err) + t.Fatal(err) } - got := st.messenger.drain() - if len(got) == 0 { - t.Fatal("expected CI-failed nudge to the agent") + if _, err := st.sm.Kill(ctx, sess.ID); err != nil { + t.Fatal(err) } - if !strings.Contains(got[0], "CI is failing") || !strings.Contains(got[0], "panic: nil map") { - t.Fatalf("ci-failed message missing content: %q", got[0]) - } - - // Brake confirmation: only one failure so far, RecentCheckStatuses should - // reflect it. 
- history, err := st.store.RecentCheckStatuses(ctx, prURL, "ci/build", 3) + restored, err := st.sm.Restore(ctx, sess.ID) if err != nil { - t.Fatalf("recent checks: %v", err) - } - if len(history) != 1 || history[0] != "failed" { - t.Fatalf("ci history wrong: %v", history) - } - - // Recovery: CI passing on a new commit. With the dedupe slot still on - // rxCIFailed, the dispatch path moves to rxApprovedGreen (mergeable) and - // the human notifier is the one that pages. - if err := st.lcm.ApplyPRObservation(ctx, sess.ID, ports.PRObservation{ - Fetched: true, URL: prURL, Number: 2, - CI: domain.CIPassing, Mergeability: domain.MergeMergeable, - Checks: []domain.PRCheckRow{{ - Name: "ci/build", CommitHash: "c2", Status: "passed", CreatedAt: time.Now(), - }}, - }); err != nil { - t.Fatalf("apply pr (recovery): %v", err) + t.Fatal(err) } - ev := st.notifier.drain() - if len(ev) == 0 { - t.Fatal("recovery: notifier should have received an event (approved-and-green)") - } - if !anyEventType(ev, "reaction.approved-and-green") { - t.Fatalf("recovery should notify approved-and-green, got %+v", ev) - } - - // And the pr row reflects the recovery in the canonical fact store. - prRow, ok, _ := st.store.GetPR(ctx, prURL) - if !ok || prRow.CI != domain.CIPassing { - t.Fatalf("pr ci_state should be passing post-recovery: %+v", prRow) + if restored.IsTerminated || restored.Metadata.AgentSessionID != "agent-x" { + t.Fatalf("restored wrong: %+v", restored) } } -// TestDetectingPersistsAcrossRestart drives the runtime quarantine path: a -// failed probe puts the session into the detecting state, which means the -// decider's anti-flap memory MUST be flushed to the detecting_* columns and -// survive a restart. A subsequent alive probe must clear it. 
-func TestDetectingPersistsAcrossRestart(t *testing.T) { - t.Parallel() +func TestCDCPollerReceivesSessionAndPREvents(t *testing.T) { ctx := context.Background() - dir := t.TempDir() - st := openLiveStack(t, dir) - seedProject(t, st.store, "mer") - - sess, err := st.sm.Spawn(ctx, ports.SpawnConfig{ProjectID: "mer", Kind: domain.KindWorker, Prompt: "."}) + st := newStack(t) + b := cdc.NewBroadcaster() + var got []cdc.Event + b.Subscribe(func(e cdc.Event) { got = append(got, e) }) + poller := cdc.NewPoller(st.store, b, cdc.PollerConfig{}) + sess, err := st.sm.Spawn(ctx, ports.SpawnConfig{ProjectID: "mer", Kind: domain.KindWorker}) if err != nil { - t.Fatalf("spawn: %v", err) - } - // Move to working so the runtime decider doesn't bail on not_started. - if err := st.lcm.ApplyActivitySignal(ctx, sess.ID, ports.ActivitySignal{ - Valid: true, State: domain.ActivityActive, Source: domain.SourceHook, Timestamp: time.Now(), - }); err != nil { - t.Fatalf("activity: %v", err) + t.Fatal(err) } - // One failed probe should park the session in detecting with attempts=1. - if err := st.lcm.ApplyRuntimeObservation(ctx, sess.ID, ports.RuntimeFacts{ - ObservedAt: time.Now(), - Runtime: ports.ProbeFailed, - Process: ports.ProbeFailed, - }); err != nil { - t.Fatalf("apply runtime: %v", err) + if err := st.prm.ApplyObservation(ctx, sess.ID, ports.PRObservation{Fetched: true, URL: "pr1", Number: 1, Review: domain.ReviewApproved}); err != nil { + t.Fatal(err) } - rec, ok, err := st.store.GetSession(ctx, sess.ID) - if err != nil || !ok { - t.Fatalf("get session post-probe: ok=%v err=%v", ok, err) - } - if rec.Lifecycle.Session.State != domain.SessionDetecting { - t.Fatalf("expected detecting state, got %q", rec.Lifecycle.Session.State) - } - if rec.Lifecycle.Detecting == nil || rec.Lifecycle.Detecting.Attempts == 0 { - t.Fatalf("detecting memory should be populated: %+v", rec.Lifecycle.Detecting) - } - - // Restart: close, reopen, verify the detecting_* columns round-tripped. 
- st.close(t) - st2 := openLiveStack(t, dir) - defer st2.close(t) - - rec2, ok, _ := st2.store.GetSession(ctx, sess.ID) - if !ok || rec2.Lifecycle.Detecting == nil { - t.Fatalf("detecting lost across restart: %+v", rec2.Lifecycle) - } - if rec2.Lifecycle.Detecting.Attempts != rec.Lifecycle.Detecting.Attempts { - t.Fatalf("attempts round-trip mismatch: pre=%d post=%d", - rec.Lifecycle.Detecting.Attempts, rec2.Lifecycle.Detecting.Attempts) - } - if rec2.Lifecycle.Detecting.EvidenceHash != rec.Lifecycle.Detecting.EvidenceHash { - t.Fatal("evidence hash dropped across restart") - } - - // Recovery probe — alive — must clear detecting and flip state out of it. - if err := st2.lcm.ApplyRuntimeObservation(ctx, sess.ID, ports.RuntimeFacts{ - ObservedAt: time.Now(), - Runtime: ports.ProbeAlive, - Process: ports.ProbeAlive, - }); err != nil { - t.Fatalf("recovery probe: %v", err) - } - rec3, ok3, err := st2.store.GetSession(ctx, sess.ID) - if err != nil || !ok3 { - t.Fatalf("get session post-recovery: ok=%v err=%v", ok3, err) - } - if rec3.Lifecycle.Detecting != nil { - t.Fatalf("alive probe should clear detecting, got %+v", rec3.Lifecycle.Detecting) - } - if rec3.Lifecycle.Session.State == domain.SessionDetecting { - t.Fatalf("session state should leave detecting, got %q", rec3.Lifecycle.Session.State) - } -} - -// TestCDCPollerReceivesAllStages drives the full real pipeline including the -// in-process CDC poller — proving the trigger writes become broadcaster events -// in the same order the storage layer observes them. 
-func TestCDCPollerReceivesAllStages(t *testing.T) { - t.Parallel() - ctx := context.Background() - st := openLiveStack(t, t.TempDir()) - defer st.close(t) - seedProject(t, st.store, "mer") - - bcast := cdc.NewBroadcaster() - src := pollerSource{st.store} - poller := cdc.NewPoller(src, bcast, cdc.PollerConfig{Batch: 100}) - - var ( - mu sync.Mutex - events []cdc.Event - ) - bcast.Subscribe(func(e cdc.Event) { - mu.Lock() - defer mu.Unlock() - events = append(events, e) - }) - - sess, err := st.sm.Spawn(ctx, ports.SpawnConfig{ProjectID: "mer", Kind: domain.KindWorker, Prompt: "."}) - if err != nil { - t.Fatalf("spawn: %v", err) - } - if err := st.lcm.ApplyActivitySignal(ctx, sess.ID, ports.ActivitySignal{ - Valid: true, State: domain.ActivityActive, Source: domain.SourceHook, Timestamp: time.Now(), - }); err != nil { - t.Fatalf("activity: %v", err) - } - if err := st.lcm.ApplyPRObservation(ctx, sess.ID, ports.PRObservation{ - Fetched: true, URL: "https://github.com/repo/mer/pull/3", Number: 3, - CI: domain.CIPassing, Mergeability: domain.MergeMergeable, - }); err != nil { - t.Fatalf("apply pr: %v", err) - } - if err := poller.Poll(ctx); err != nil { - t.Fatalf("poll: %v", err) - } - - mu.Lock() - defer mu.Unlock() - types := map[cdc.EventType]bool{} - for _, e := range events { - types[e.Type] = true - } - for _, want := range []cdc.EventType{cdc.EventSessionCreated, cdc.EventSessionUpdated, cdc.EventPRCreated} { - if !types[want] { - t.Fatalf("poller missed event %q (got %+v)", want, types) - } - } - // Seq monotonicity invariant — the wiring assumes it; assert it here. 
- var prev int64 - for _, e := range events { - if e.Seq <= prev { - t.Fatalf("seq not monotonic: %d after %d", e.Seq, prev) - } - prev = e.Seq - } -} - -func TestLifecycleDurableNotification_NeedsInput(t *testing.T) { - t.Parallel() - ctx := context.Background() - store, err := sqlite.Open(t.TempDir()) - if err != nil { - t.Fatalf("open sqlite: %v", err) - } - defer store.Close() - seedProject(t, store, "mer") - rec, err := store.CreateSession(ctx, durableRecord("mer", "MER-1", "feat/input")) - if err != nil { - t.Fatalf("create session: %v", err) - } - lcm := durableLifecycle(store, &captureMessenger{}) - startSeq, _ := store.MaxChangeLogSeq(ctx) - - if err := lcm.ApplyActivitySignal(ctx, rec.ID, ports.ActivitySignal{ - Valid: true, State: domain.ActivityWaitingInput, Source: domain.SourceHook, Timestamp: time.Now(), - }); err != nil { - t.Fatalf("activity: %v", err) - } - - notifications, err := store.ListNotifications(ctx, sqlite.NotificationFilter{SessionID: string(rec.ID), Limit: 10}) - if err != nil { - t.Fatalf("list notifications: %v", err) - } - if len(notifications) != 1 || notifications[0].SemanticType != "session.needs_input" || notifications[0].DedupeKey == "" { - t.Fatalf("needs_input notification missing: %+v", notifications) + t.Fatal(err) } - assertNotificationCreatedCDC(t, store, startSeq) -} - -func TestLifecycleDurableNotification_ApprovedAndGreen(t *testing.T) { - t.Parallel() - ctx := context.Background() - store, err := sqlite.Open(t.TempDir()) - if err != nil { - t.Fatalf("open sqlite: %v", err) - } - defer store.Close() - seedProject(t, store, "mer") - rec, err := store.CreateSession(ctx, durableRecord("mer", "MER-2", "feat/green")) - if err != nil { - t.Fatalf("create session: %v", err) - } - lcm := durableLifecycle(store, &captureMessenger{}) - - if err := lcm.ApplyPRObservation(ctx, rec.ID, ports.PRObservation{ - Fetched: true, URL: "https://github.com/org/repo/pull/2", Number: 2, - CI: domain.CIPassing, Review: domain.ReviewApproved, 
Mergeability: domain.MergeMergeable, - }); err != nil { - t.Fatalf("apply pr: %v", err) - } - notifications, err := store.ListNotifications(ctx, sqlite.NotificationFilter{SessionID: string(rec.ID), Limit: 10}) - if err != nil { - t.Fatalf("list notifications: %v", err) - } - if len(notifications) != 1 || notifications[0].SemanticType != "merge.ready" { - t.Fatalf("approved-and-green notification missing: %+v", notifications) - } -} - -func TestLifecycleDurableNotification_PRMerged(t *testing.T) { - t.Parallel() - ctx := context.Background() - store, err := sqlite.Open(t.TempDir()) - if err != nil { - t.Fatalf("open sqlite: %v", err) - } - defer store.Close() - seedProject(t, store, "mer") - rec, err := store.CreateSession(ctx, durableRecord("mer", "MER-3", "feat/merge")) - if err != nil { - t.Fatalf("create session: %v", err) - } - lcm := durableLifecycle(store, &captureMessenger{}) - startSeq, _ := store.MaxChangeLogSeq(ctx) - - if err := lcm.ApplyPRObservation(ctx, rec.ID, ports.PRObservation{ - Fetched: true, URL: "https://github.com/org/repo/pull/3", Number: 3, Merged: true, - CI: domain.CIPassing, Review: domain.ReviewApproved, Mergeability: domain.MergeMergeable, - }); err != nil { - t.Fatalf("apply pr: %v", err) - } - notifications, err := store.ListNotifications(ctx, sqlite.NotificationFilter{SessionID: string(rec.ID), Limit: 10}) - if err != nil { - t.Fatalf("list notifications: %v", err) - } - if len(notifications) != 1 || notifications[0].SemanticType != "pr.merged" { - t.Fatalf("pr_merged notification missing: %+v", notifications) - } - assertNotificationCreatedCDC(t, store, startSeq) -} - -func assertNotificationCreatedCDC(t *testing.T, store *sqlite.Store, after int64) { - t.Helper() - evs, err := store.ReadChangeLogAfter(context.Background(), after, 20) - if err != nil { - t.Fatalf("read change_log: %v", err) - } - for _, e := range evs { - if e.EventType == string(cdc.EventNotificationCreated) { - return - } - } - t.Fatalf("missing 
notification_created CDC after %d: %+v", after, evs) -} - -// ---- small helpers ---- - -type pollerSource struct{ *sqlite.Store } - -func (s pollerSource) EventsAfter(ctx context.Context, after int64, limit int) ([]cdc.Event, error) { - rows, err := s.ReadChangeLogAfter(ctx, after, limit) - if err != nil { - return nil, err - } - out := make([]cdc.Event, len(rows)) - for i, r := range rows { - out[i] = cdc.Event{ - Seq: r.Seq, - ProjectID: r.ProjectID, - SessionID: r.SessionID, - Type: cdc.EventType(r.EventType), - Payload: []byte(r.Payload), - CreatedAt: r.CreatedAt, - } - } - return out, nil -} -func (s pollerSource) LatestSeq(ctx context.Context) (int64, error) { - return s.MaxChangeLogSeq(ctx) -} - -func anyEventType(evs []ports.Event, t string) bool { - for _, e := range evs { - if e.Type == t { - return true - } + if len(got) < 2 { + t.Fatalf("want CDC events, got %d", len(got)) } - return false } diff --git a/backend/internal/lifecycle/decide_bridge.go b/backend/internal/lifecycle/decide_bridge.go deleted file mode 100644 index 4f88cbe5..00000000 --- a/backend/internal/lifecycle/decide_bridge.go +++ /dev/null @@ -1,112 +0,0 @@ -package lifecycle - -import ( - "time" - - "github.com/aoagents/agent-orchestrator/backend/internal/domain" - "github.com/aoagents/agent-orchestrator/backend/internal/domain/decide" - "github.com/aoagents/agent-orchestrator/backend/internal/ports" -) - -// defaultRecentActivityWindow is how fresh the last activity must be for the -// probe decider to treat the agent as "recently active" — which keeps an -// ambiguous dead-runtime probe in detecting instead of concluding death. -const defaultRecentActivityWindow = 60 * time.Second - -// probeInput maps a raw RuntimeFacts (plus the prior detecting memory and last -// activity) into the pure decider's input. A failed/unknown probe is reported as -// such, never as a death — that routes to the detecting quarantine. 
-func probeInput(f ports.RuntimeFacts, cur domain.CanonicalSessionLifecycle, window time.Duration) decide.ProbeInput { - now := nowOr(f.ObservedAt) - - var runtimeAlive, runtimeFailed bool - switch f.Runtime { - case ports.ProbeAlive: - runtimeAlive = true - case ports.ProbeFailed, ports.ProbeUnknown: - runtimeFailed = true // ambiguous: quarantine, never conclude death - } - - var process decide.ProcessLiveness - var processFailed bool - switch f.Process { - case ports.ProbeAlive: - process = decide.ProcessAlive - case ports.ProbeDead: - process = decide.ProcessDead - case ports.ProbeFailed: - process, processFailed = decide.ProcessIndeterminate, true - default: - process = decide.ProcessIndeterminate - } - - return decide.ProbeInput{ - RuntimeAlive: runtimeAlive, - RuntimeFailed: runtimeFailed, - Process: process, - ProcessFailed: processFailed, - RecentActivity: hasRecentActivity(cur.Activity, now, window), - Prior: cur.Detecting, - Now: now, - } -} - -// hasRecentActivity answers the decider's "heard from the agent recently?" -// question. Sticky states (waiting_input/blocked) count as recent (a live-but- -// paused agent); an explicit exited never counts; else age the timestamp. -func hasRecentActivity(a domain.ActivitySubstate, now time.Time, window time.Duration) bool { - switch { - case a.State == domain.ActivityExited: - return false - case a.State.IsSticky(): - return true - case a.LastActivityAt.IsZero(): - return false - default: - return now.Sub(a.LastActivityAt) <= window - } -} - -// activityToSession maps an activity classification onto the session state. -// exited returns ok=false: only the probe pipeline may conclude death. 
-func activityToSession(a domain.ActivityState) (domain.SessionState, bool) { - switch a { - case domain.ActivityActive: - return domain.SessionWorking, true - case domain.ActivityReady, domain.ActivityIdle: - return domain.SessionIdle, true - case domain.ActivityWaitingInput: - return domain.SessionNeedsInput, true - case domain.ActivityBlocked: - return domain.SessionStuck, true - default: - return "", false - } -} - -// isTerminal reports a final session state — reopened only by an explicit -// Restore, never by an observation. -func isTerminal(s domain.SessionState) bool { - return s == domain.SessionDone || s == domain.SessionTerminated -} - -// writeRuntimeSession reports whether a probe verdict may write the session -// state. A death-axis verdict (detecting/stuck/terminated) always writes; a -// healthy "working" verdict only recovers a detecting session — it must not -// clobber an activity-owned idle/needs_input. -func writeRuntimeSession(d decide.LifecycleDecision, cur domain.CanonicalSessionLifecycle) bool { - if isTerminal(cur.Session.State) { - return false - } - if d.SessionState == domain.SessionWorking { - return cur.Session.State == domain.SessionDetecting - } - return true -} - -func nowOr(t time.Time) time.Time { - if t.IsZero() { - return time.Now() - } - return t -} diff --git a/backend/internal/lifecycle/manager.go b/backend/internal/lifecycle/manager.go index 19eada01..03eee005 100644 --- a/backend/internal/lifecycle/manager.go +++ b/backend/internal/lifecycle/manager.go @@ -1,8 +1,7 @@ -// Package lifecycle implements ports.LifecycleManager: the synchronous -// observe -> decide -> persist reducer. Every Apply*/On* entrypoint loads the -// session, runs the pure decider, and persists the full row under a single write -// lock. The DB triggers emit the CDC; the engine never writes the change log. -// After a transition it fires the mapped reaction (see reactions.go). 
+// Package lifecycle implements the synchronous reducer that writes durable +// session lifecycle facts. It deliberately keeps the session model small: +// activity_state plus an is_terminated bit are the only persisted status-like +// facts on the session row. package lifecycle import ( @@ -12,206 +11,92 @@ import ( "time" "github.com/aoagents/agent-orchestrator/backend/internal/domain" - "github.com/aoagents/agent-orchestrator/backend/internal/domain/decide" "github.com/aoagents/agent-orchestrator/backend/internal/ports" ) -// Manager is the lifecycle engine. mu serialises the load->decide->persist -// read-modify-write across sessions; reactions dispatch after the lock releases -// so a slow agent send never blocks the write path. +type sessionStore interface { + GetSession(ctx context.Context, id domain.SessionID) (domain.SessionRecord, bool, error) + UpdateSession(ctx context.Context, rec domain.SessionRecord) error +} + +// Manager reduces runtime, activity, spawn, and termination observations into durable session facts. +// It also owns agent nudges caused by PR observations, including merge-conflict, CI-failure, and review-feedback prompts. type Manager struct { - store ports.SessionStore - pr ports.PRWriter - notifier ports.Notifier + store sessionStore messenger ports.AgentMessenger mu sync.Mutex window time.Duration clock func() time.Time - - // in-memory ACT state (policy, not canonical truth — reset on restart). - react reactionState + react reactionState } -var _ ports.LifecycleManager = (*Manager)(nil) - -// New builds a Lifecycle Manager over its collaborators: the session store it -// is the sole writer of, the PR-facts writer, the notifier, and the messenger -// used to nudge running agents. 
-func New(store ports.SessionStore, pr ports.PRWriter, notifier ports.Notifier, messenger ports.AgentMessenger) *Manager { - return &Manager{ - store: store, - pr: pr, - notifier: notifier, - messenger: messenger, - window: defaultRecentActivityWindow, - clock: time.Now, - react: newReactionState(), - } +// New builds a Lifecycle Manager over the session store it writes and the messenger it uses for agent nudges. +func New(store sessionStore, messenger ports.AgentMessenger) *Manager { + return &Manager{store: store, messenger: messenger, window: defaultRecentActivityWindow, clock: time.Now, react: newReactionState()} } -// mutate runs the shared pipeline: load -> decideFn -> persist (only if changed). -// It returns whether a write happened. A stray observation for an unknown session -// is a clean no-op. -func (m *Manager) mutate( - ctx context.Context, - id domain.SessionID, - fn func(cur domain.CanonicalSessionLifecycle) (domain.CanonicalSessionLifecycle, bool), -) (bool, error) { +func (m *Manager) mutate(ctx context.Context, id domain.SessionID, fn func(domain.SessionRecord, time.Time) (domain.SessionRecord, bool)) error { m.mu.Lock() defer m.mu.Unlock() rec, ok, err := m.store.GetSession(ctx, id) if err != nil || !ok { - return false, err + return err } - next, changed := fn(rec.Lifecycle) + now := m.clock() + next, changed := fn(rec, now) if !changed { - return false, nil + return nil } - next.Version = domain.LifecycleVersion - rec.Lifecycle = next - rec.UpdatedAt = m.clock() - if err := m.store.UpdateSession(ctx, rec); err != nil { - return false, err + next.UpdatedAt = now + if err := m.store.UpdateSession(ctx, next); err != nil { + return err } - return true, nil + return nil } -// ---- OBSERVE entrypoints ---- - -// ApplyRuntimeObservation feeds the probe decider. is_alive always tracks the -// verdict; the session state follows the runtime-write rule; a non-detecting -// verdict clears stale detecting memory. 
+// ApplyRuntimeObservation only writes when runtime liveness is unambiguous. A +// failed probe or liveness disagreement is ignored; no transient lifecycle state is stored. func (m *Manager) ApplyRuntimeObservation(ctx context.Context, id domain.SessionID, f ports.RuntimeFacts) error { - changed, err := m.mutate(ctx, id, func(cur domain.CanonicalSessionLifecycle) (domain.CanonicalSessionLifecycle, bool) { - d := decide.ResolveProbeDecision(probeInput(f, cur, m.window)) - next := cur - ch := false - if next.IsAlive != d.IsAlive { - next.IsAlive, ch = d.IsAlive, true - } - if !isTerminal(cur.Session.State) { - if writeRuntimeSession(d, cur) { - ch = setSessionState(&next, d.SessionState, d.TerminationReason) || ch - } - ch = setDetecting(&next, d.Detecting) || ch + return m.mutate(ctx, id, func(cur domain.SessionRecord, now time.Time) (domain.SessionRecord, bool) { + if cur.IsTerminated || !runtimeClearlyDead(f, cur.Activity, now, m.window) { + return cur, false } - return next, ch + next := cur + next.IsTerminated = true + next.Activity = domain.ActivitySubstate{State: domain.ActivityExited, LastActivityAt: timeOr(f.ObservedAt, now), Source: domain.SourceRuntime} + return next, true }) - if err != nil || !changed { - return err - } - return m.runReactions(ctx, id, reactionContent{}) } -// ApplyActivitySignal updates the activity axis. Only a valid signal is -// authoritative, and it is proof of life: it may resolve a detecting session and -// move the session out of any non-terminal state. +// ApplyActivitySignal records an authoritative agent activity signal. 
func (m *Manager) ApplyActivitySignal(ctx context.Context, id domain.SessionID, s ports.ActivitySignal) error { if !s.Valid { return nil } - changed, err := m.mutate(ctx, id, func(cur domain.CanonicalSessionLifecycle) (domain.CanonicalSessionLifecycle, bool) { - if isTerminal(cur.Session.State) { + return m.mutate(ctx, id, func(cur domain.SessionRecord, now time.Time) (domain.SessionRecord, bool) { + if cur.IsTerminated { return cur, false } - next := cur - ch := false - act := domain.ActivitySubstate{State: s.State, LastActivityAt: nowOr(s.Timestamp), Source: s.Source} - if !sameActivity(cur.Activity, act) { - next.Activity, ch = act, true + if !s.Source.CanOverride(cur.Activity.Source) { + return cur, false } - if st, ok := activityToSession(s.State); ok { - ch = setSessionState(&next, st, domain.TermNone) || ch - if next.Detecting != nil { - next.Detecting, ch = nil, true - } + next := cur + act := domain.ActivitySubstate{State: s.State, LastActivityAt: timeOr(s.Timestamp, now), Source: s.Source} + if sameActivity(cur.Activity, act) { + return cur, false } - if s.State != domain.ActivityExited && !next.IsAlive { - next.IsAlive, ch = true, true + next.Activity = act + if s.State == domain.ActivityExited { + next.IsTerminated = true } - return next, ch + return next, true }) - if err != nil || !changed { - return err - } - return m.runReactions(ctx, id, reactionContent{}) -} - -// ApplyPRObservation records the observed PR facts in the pr tables, terminates -// the session on a merge, and fires the PR-driven reactions. A failed fetch is -// dropped (failed probe != "PR closed"). 
-func (m *Manager) ApplyPRObservation(ctx context.Context, id domain.SessionID, o ports.PRObservation) error { - if !o.Fetched { - return nil - } - rec, ok, err := m.store.GetSession(ctx, id) - if err != nil || !ok { - return err - } - if err := m.writePR(ctx, id, o); err != nil { - return err - } - - if o.Merged { - changed, err := m.mutate(ctx, id, func(cur domain.CanonicalSessionLifecycle) (domain.CanonicalSessionLifecycle, bool) { - if isTerminal(cur.Session.State) { - return cur, false - } - next := cur - next.Session.State = domain.SessionTerminated - next.TerminationReason = domain.TermPRMerged - next.IsAlive = false - next.Detecting = nil - return next, true - }) - if err != nil { - return err - } - if changed { - m.clearReactions(id) - return m.fireNotify(ctx, id, rec.ProjectID, rxMerged, reactions[rxMerged]) - } - return nil - } - - return m.runReactions(ctx, id, prContent(o)) } -// writePR persists the observation's scalar facts, check runs, and comment set -// in one atomic store call. PR-table CDC is emitted by the DB triggers. -func (m *Manager) writePR(ctx context.Context, id domain.SessionID, o ports.PRObservation) error { - now := m.clock() - row := domain.PRRow{ - URL: o.URL, SessionID: string(id), Number: o.Number, - Draft: o.Draft, Merged: o.Merged, Closed: o.Closed, - CI: o.CI, Review: o.Review, Mergeability: o.Mergeability, UpdatedAt: now, - } - checks := make([]domain.PRCheckRow, len(o.Checks)) - for i, c := range o.Checks { - c.PRURL = o.URL - if c.CreatedAt.IsZero() { - c.CreatedAt = now - } - checks[i] = c - } - comments := make([]domain.PRComment, len(o.Comments)) - for i, c := range o.Comments { - if c.CreatedAt.IsZero() { - c.CreatedAt = now - } - comments[i] = c - } - return m.pr.WritePR(ctx, row, checks, comments) -} - -// ---- mutation commands from the Session Manager ---- - -// OnSpawnCompleted marks a session live and folds in its handles. 
It serves a -// fresh spawn (not_started -> live) and a restore (terminal -> reopened): both -// land at not_started + is_alive, with the agent acknowledging via first activity. -func (m *Manager) OnSpawnCompleted(ctx context.Context, id domain.SessionID, o ports.SpawnOutcome) error { +// MarkSpawned marks a newly spawned or restored session live and stores runtime/workspace handles. +func (m *Manager) MarkSpawned(ctx context.Context, id domain.SessionID, metadata domain.SessionMetadata) error { m.mu.Lock() defer m.mu.Unlock() rec, ok, err := m.store.GetSession(ctx, id) @@ -219,115 +104,32 @@ func (m *Manager) OnSpawnCompleted(ctx context.Context, id domain.SessionID, o p return err } if !ok { - return fmt.Errorf("lifecycle: OnSpawnCompleted for unknown session %q", id) + return fmt.Errorf("lifecycle: MarkSpawned for unknown session %q", id) } - rec.Lifecycle.Version = domain.LifecycleVersion - rec.Lifecycle.Session.State = domain.SessionNotStarted - rec.Lifecycle.TerminationReason = domain.TermNone - rec.Lifecycle.IsAlive = true - rec.Lifecycle.Detecting = nil - rec.Metadata = mergeMetadata(rec.Metadata, spawnMetadata(o)) - rec.UpdatedAt = m.clock() + now := m.clock() + rec.IsTerminated = false + rec.Activity = domain.ActivitySubstate{State: domain.ActivityIdle, LastActivityAt: now, Source: domain.SourceRuntime} + rec.Metadata = mergeMetadata(rec.Metadata, metadata) + rec.UpdatedAt = now return m.store.UpdateSession(ctx, rec) } -// OnKillRequested is the explicit terminal-write path (the one terminal that does -// not go through the inferred-death decider). It fires no reaction — an explicit -// kill is a human action — but drops the session's ACT state. 
-func (m *Manager) OnKillRequested(ctx context.Context, id domain.SessionID, reason domain.TerminationReason) error { - _, err := m.mutate(ctx, id, func(cur domain.CanonicalSessionLifecycle) (domain.CanonicalSessionLifecycle, bool) { - if isTerminal(cur.Session.State) { +// MarkTerminated marks a session terminated without tearing down external resources. +func (m *Manager) MarkTerminated(ctx context.Context, id domain.SessionID) error { + return m.mutate(ctx, id, func(cur domain.SessionRecord, now time.Time) (domain.SessionRecord, bool) { + if cur.IsTerminated { return cur, false } - if reason == domain.TermNone { - reason = domain.TermManuallyKilled - } - next := cur - next.Session.State = domain.SessionTerminated - next.TerminationReason = reason - next.IsAlive = false - next.Detecting = nil - return next, true + cur.IsTerminated = true + cur.Activity = domain.ActivitySubstate{State: domain.ActivityExited, LastActivityAt: now, Source: domain.SourceRuntime} + return cur, true }) - m.clearReactions(id) - return err } -// RunningSessions snapshots every non-terminal session for the reaper to probe. -// Detecting sessions are included — a fresh probe is the only fact that recovers -// or escalates them. -func (m *Manager) RunningSessions(ctx context.Context) ([]domain.SessionRecord, error) { - all, err := m.store.ListAllSessions(ctx) - if err != nil { - return nil, err - } - out := make([]domain.SessionRecord, 0, len(all)) - for _, rec := range all { - if !isTerminal(rec.Lifecycle.Session.State) { - out = append(out, rec) - } - } - return out, nil -} - -// ---- diff + metadata helpers ---- - -// setSessionState sets the state (and, for a terminal state, the reason) when it -// differs. An empty state means "decider doesn't address the session axis". 
-func setSessionState(next *domain.CanonicalSessionLifecycle, st domain.SessionState, reason domain.TerminationReason) bool { - if st == "" { - return false - } - changed := false - if next.Session.State != st { - next.Session.State, changed = st, true - } - want := domain.TermNone - if st == domain.SessionTerminated { - want = reason - } - if next.TerminationReason != want { - next.TerminationReason, changed = want, true - } - return changed -} - -func setDetecting(next *domain.CanonicalSessionLifecycle, d *domain.DetectingState) bool { - if d != nil { - if next.Detecting != nil && *next.Detecting == *d { - return false - } - dc := *d - next.Detecting = &dc - return true - } - if next.Detecting != nil { - next.Detecting = nil - return true - } - return false -} - -// sameActivity compares with time-aware equality (== on time.Time is -// monotonic-clock sensitive and would spuriously report changes). func sameActivity(a, b domain.ActivitySubstate) bool { return a.State == b.State && a.Source == b.Source && a.LastActivityAt.Equal(b.LastActivityAt) } -func spawnMetadata(o ports.SpawnOutcome) domain.SessionMetadata { - return domain.SessionMetadata{ - Branch: o.Branch, - WorkspacePath: o.WorkspacePath, - RuntimeHandleID: o.RuntimeHandle.ID, - RuntimeName: o.RuntimeHandle.RuntimeName, - AgentSessionID: o.AgentSessionID, - Prompt: o.Prompt, - } -} - -// mergeMetadata overlays set fields of in onto base without clobbering an -// existing value with an empty one (a partial spawn write keeps the branch set -// at creation). 
func mergeMetadata(base, in domain.SessionMetadata) domain.SessionMetadata { set := func(dst *string, v string) { if v != "" { @@ -337,7 +139,6 @@ func mergeMetadata(base, in domain.SessionMetadata) domain.SessionMetadata { set(&base.Branch, in.Branch) set(&base.WorkspacePath, in.WorkspacePath) set(&base.RuntimeHandleID, in.RuntimeHandleID) - set(&base.RuntimeName, in.RuntimeName) set(&base.AgentSessionID, in.AgentSessionID) set(&base.Prompt, in.Prompt) return base diff --git a/backend/internal/lifecycle/manager_test.go b/backend/internal/lifecycle/manager_test.go index 8adfd862..19f3616c 100644 --- a/backend/internal/lifecycle/manager_test.go +++ b/backend/internal/lifecycle/manager_test.go @@ -2,7 +2,7 @@ package lifecycle import ( "context" - "fmt" + "errors" "strings" "testing" "time" @@ -13,353 +13,199 @@ import ( var ctx = context.Background() -// ---- fakes ---- - -// fakeStore is a mini SessionStore + PRWriter: it derives PRFacts and recent -// check statuses from what the engine writes, so PR-reaction tests exercise the -// write path and the read-back together. 
type fakeStore struct { sessions map[domain.SessionID]domain.SessionRecord - pr map[domain.SessionID]domain.PRRow - comments map[string][]domain.PRComment - checks []domain.PRCheckRow - num int } func newFakeStore() *fakeStore { - return &fakeStore{ - sessions: map[domain.SessionID]domain.SessionRecord{}, - pr: map[domain.SessionID]domain.PRRow{}, - comments: map[string][]domain.PRComment{}, - } + return &fakeStore{sessions: map[domain.SessionID]domain.SessionRecord{}} } -func (f *fakeStore) CreateSession(_ context.Context, rec domain.SessionRecord) (domain.SessionRecord, error) { - f.num++ - rec.ID = domain.SessionID(fmt.Sprintf("%s-%d", rec.ProjectID, f.num)) - f.sessions[rec.ID] = rec - return rec, nil -} -func (f *fakeStore) UpdateSession(_ context.Context, rec domain.SessionRecord) error { - f.sessions[rec.ID] = rec - return nil -} func (f *fakeStore) GetSession(_ context.Context, id domain.SessionID) (domain.SessionRecord, bool, error) { r, ok := f.sessions[id] return r, ok, nil } -func (f *fakeStore) ListSessions(_ context.Context, p domain.ProjectID) ([]domain.SessionRecord, error) { - var out []domain.SessionRecord - for _, r := range f.sessions { - if r.ProjectID == p { - out = append(out, r) - } - } - return out, nil -} -func (f *fakeStore) ListAllSessions(_ context.Context) ([]domain.SessionRecord, error) { - out := make([]domain.SessionRecord, 0, len(f.sessions)) - for _, r := range f.sessions { - out = append(out, r) - } - return out, nil -} -func (f *fakeStore) PRFactsForSession(_ context.Context, id domain.SessionID) (domain.PRFacts, error) { - r, ok := f.pr[id] - if !ok { - return domain.PRFacts{}, nil - } - facts := domain.PRFacts{ - URL: r.URL, Number: r.Number, Exists: true, - Draft: r.Draft, Merged: r.Merged, Closed: r.Closed, - CI: r.CI, Review: r.Review, Mergeability: r.Mergeability, - } - for _, c := range f.comments[r.URL] { - if !c.Resolved { - facts.ReviewComments = true - break - } - } - return facts, nil -} -func (f *fakeStore) 
WritePR(_ context.Context, pr domain.PRRow, checks []domain.PRCheckRow, comments []domain.PRComment) error { - f.pr[domain.SessionID(pr.SessionID)] = pr - f.checks = append(f.checks, checks...) - f.comments[pr.URL] = comments - return nil -} -func (f *fakeStore) RecentCheckStatuses(_ context.Context, url, name string, limit int) ([]string, error) { - var out []string - for i := len(f.checks) - 1; i >= 0 && len(out) < limit; i-- { - if f.checks[i].PRURL == url && f.checks[i].Name == name { - out = append(out, f.checks[i].Status) - } - } - return out, nil -} - -type fakeNotifier struct{ events []ports.Event } - -func (f *fakeNotifier) Notify(_ context.Context, e ports.Event) error { - f.events = append(f.events, e) - return nil -} -func (f *fakeNotifier) last() string { - if len(f.events) == 0 { - return "" - } - return f.events[len(f.events)-1].Type -} - -type fakeMessenger struct{ msgs []string } -func (f *fakeMessenger) Send(_ context.Context, _ domain.SessionID, m string) error { - f.msgs = append(f.msgs, m) +func (f *fakeStore) UpdateSession(_ context.Context, rec domain.SessionRecord) error { + f.sessions[rec.ID] = rec return nil } -func newManager() (*Manager, *fakeStore, *fakeNotifier, *fakeMessenger) { - st, n, msg := newFakeStore(), &fakeNotifier{}, &fakeMessenger{} - return New(st, st, n, msg), st, n, msg +type fakeMessenger struct { + msgs []string + err error } -func working(id domain.SessionID) domain.SessionRecord { - return domain.SessionRecord{ - ID: id, ProjectID: "mer", - Lifecycle: domain.CanonicalSessionLifecycle{ - Version: domain.LifecycleVersion, - Session: domain.SessionSubstate{State: domain.SessionWorking}, - IsAlive: true, - }, +func (f *fakeMessenger) Send(_ context.Context, _ domain.SessionID, msg string) error { + if f.err != nil { + return f.err } + f.msgs = append(f.msgs, msg) + return nil } -func openPR(o ports.PRObservation) ports.PRObservation { - o.Fetched, o.URL, o.Number = true, "https://example/pr/1", 1 - return o -} - -// ---- 
runtime observations ---- - -func TestRuntimeObservation_InferredDeath(t *testing.T) { - m, st, n, _ := newManager() - st.sessions["mer-1"] = working("mer-1") - - if err := m.ApplyRuntimeObservation(ctx, "mer-1", ports.RuntimeFacts{Runtime: ports.ProbeDead, Process: ports.ProbeDead}); err != nil { - t.Fatal(err) - } - got := st.sessions["mer-1"].Lifecycle - if got.Session.State != domain.SessionTerminated || got.TerminationReason != domain.TermRuntimeLost || got.IsAlive { - t.Fatalf("want terminated/runtime_lost/dead, got %+v", got) - } - if n.last() != "reaction.agent-exited" { - t.Fatalf("want agent-exited notify, got %q", n.last()) - } +func newManager() (*Manager, *fakeStore, *fakeMessenger) { + st := newFakeStore() + msg := &fakeMessenger{} + return New(st, msg), st, msg } -func TestRuntimeObservation_FailedProbeQuarantines(t *testing.T) { - m, st, _, _ := newManager() - st.sessions["mer-1"] = working("mer-1") - - if err := m.ApplyRuntimeObservation(ctx, "mer-1", ports.RuntimeFacts{Runtime: ports.ProbeFailed, Process: ports.ProbeFailed}); err != nil { - t.Fatal(err) - } - got := st.sessions["mer-1"].Lifecycle - if got.Session.State != domain.SessionDetecting || !got.IsAlive || got.Detecting == nil { - t.Fatalf("failed probe should quarantine alive, got %+v", got) - } +func working(id domain.SessionID) domain.SessionRecord { + return domain.SessionRecord{ID: id, ProjectID: "mer", Activity: domain.ActivitySubstate{State: domain.ActivityActive, LastActivityAt: time.Now(), Source: domain.SourceNative}} } -func TestRuntimeObservation_RecoversDetecting(t *testing.T) { - m, st, _, _ := newManager() +func TestRuntimeObservation_InferredDeathSetsTerminated(t *testing.T) { + m, st, _ := newManager() rec := working("mer-1") - rec.Lifecycle.Session.State = domain.SessionDetecting - rec.Lifecycle.Detecting = &domain.DetectingState{Attempts: 1} + rec.Activity.LastActivityAt = time.Now().Add(-2 * time.Minute) st.sessions["mer-1"] = rec - - if err := 
m.ApplyRuntimeObservation(ctx, "mer-1", ports.RuntimeFacts{Runtime: ports.ProbeAlive, Process: ports.ProbeAlive}); err != nil { + if err := m.ApplyRuntimeObservation(ctx, "mer-1", ports.RuntimeFacts{Probe: ports.ProbeDead}); err != nil { t.Fatal(err) } - got := st.sessions["mer-1"].Lifecycle - if got.Session.State != domain.SessionWorking || got.Detecting != nil { - t.Fatalf("healthy probe should recover to working, got %+v", got) + got := st.sessions["mer-1"] + if !got.IsTerminated || got.Activity.State != domain.ActivityExited { + t.Fatalf("want terminated/exited, got %+v", got) } } -// ---- activity signals ---- - -func TestActivity_WaitingInputPagesHuman(t *testing.T) { - m, st, n, _ := newManager() +func TestRuntimeObservation_FailedProbeDoesNotMutate(t *testing.T) { + m, st, _ := newManager() st.sessions["mer-1"] = working("mer-1") - - if err := m.ApplyActivitySignal(ctx, "mer-1", ports.ActivitySignal{Valid: true, State: domain.ActivityWaitingInput, Timestamp: time.Now()}); err != nil { + before := st.sessions["mer-1"] + if err := m.ApplyRuntimeObservation(ctx, "mer-1", ports.RuntimeFacts{Probe: ports.ProbeFailed}); err != nil { t.Fatal(err) } - if st.sessions["mer-1"].Lifecycle.Session.State != domain.SessionNeedsInput { - t.Fatalf("want needs_input, got %v", st.sessions["mer-1"].Lifecycle.Session.State) - } - if n.last() != "reaction.agent-needs-input" { - t.Fatalf("want needs-input notify, got %q", n.last()) + if st.sessions["mer-1"] != before { + t.Fatalf("failed probe should not persist a state, got %+v", st.sessions["mer-1"]) } } func TestActivity_InvalidIsIgnored(t *testing.T) { - m, st, _, _ := newManager() + m, st, _ := newManager() st.sessions["mer-1"] = working("mer-1") before := st.sessions["mer-1"] - if err := m.ApplyActivitySignal(ctx, "mer-1", ports.ActivitySignal{Valid: false, State: domain.ActivityIdle}); err != nil { t.Fatal(err) } if st.sessions["mer-1"] != before { - t.Fatal("invalid signal must not mutate the session") + t.Fatal("invalid 
signal must not mutate") } } -// ---- PR observations ---- - -func TestPR_CIFailingNudgesAgentWithLogs(t *testing.T) { - m, st, _, msg := newManager() +func TestActivity_WeakerSourceDoesNotOverrideStronger(t *testing.T) { + m, st, _ := newManager() st.sessions["mer-1"] = working("mer-1") - - o := openPR(ports.PRObservation{CI: domain.CIFailing, Checks: []domain.PRCheckRow{{Name: "build", CommitHash: "c1", Status: "failed", LogTail: "boom"}}}) - if err := m.ApplyPRObservation(ctx, "mer-1", o); err != nil { + before := st.sessions["mer-1"] + if err := m.ApplyActivitySignal(ctx, "mer-1", ports.ActivitySignal{Valid: true, State: domain.ActivityIdle, Source: domain.SourceRuntime}); err != nil { t.Fatal(err) } - if len(msg.msgs) != 1 || !strings.Contains(msg.msgs[0], "boom") { - t.Fatalf("want one CI nudge with log tail, got %v", msg.msgs) + if st.sessions["mer-1"] != before { + t.Fatalf("weaker runtime signal should not override native activity, got %+v", st.sessions["mer-1"]) } } -func TestPR_CIBrakeEscalatesAfterThreeFails(t *testing.T) { - m, st, n, msg := newManager() - st.sessions["mer-1"] = working("mer-1") - - for _, commit := range []string{"c1", "c2", "c3"} { - o := openPR(ports.PRObservation{CI: domain.CIFailing, Checks: []domain.PRCheckRow{{Name: "build", CommitHash: commit, Status: "failed", LogTail: "boom"}}}) - if err := m.ApplyPRObservation(ctx, "mer-1", o); err != nil { - t.Fatal(err) - } - } - if len(msg.msgs) != 2 { - t.Fatalf("want 2 nudges then escalate, got %d nudges", len(msg.msgs)) +func TestActivity_StrongerSourceOverridesWeaker(t *testing.T) { + m, st, _ := newManager() + st.sessions["mer-1"] = domain.SessionRecord{ID: "mer-1", ProjectID: "mer", Activity: domain.ActivitySubstate{State: domain.ActivityIdle, LastActivityAt: time.Now(), Source: domain.SourceRuntime}} + if err := m.ApplyActivitySignal(ctx, "mer-1", ports.ActivitySignal{Valid: true, State: domain.ActivityActive, Source: domain.SourceNative}); err != nil { + t.Fatal(err) } - if 
n.last() != "reaction.escalated" { - t.Fatalf("3rd failure should escalate, got %q", n.last()) + got := st.sessions["mer-1"].Activity + if got.State != domain.ActivityActive || got.Source != domain.SourceNative { + t.Fatalf("stronger native signal should override runtime, got %+v", got) } } -func TestPR_ReviewCommentsInjectedRegardlessOfAuthor(t *testing.T) { - m, st, _, msg := newManager() +func TestMarkTerminated(t *testing.T) { + m, st, _ := newManager() st.sessions["mer-1"] = working("mer-1") - - o := openPR(ports.PRObservation{ - Review: domain.ReviewChangesRequest, - Comments: []domain.PRComment{{ID: "1", Author: "greptileai", Body: "use a constant here"}}, - }) - if err := m.ApplyPRObservation(ctx, "mer-1", o); err != nil { + if err := m.MarkTerminated(ctx, "mer-1"); err != nil { t.Fatal(err) } - if len(msg.msgs) != 1 || !strings.Contains(msg.msgs[0], "use a constant here") { - t.Fatalf("review feedback should be injected verbatim, got %v", msg.msgs) + got := st.sessions["mer-1"] + if !got.IsTerminated || got.Activity.State != domain.ActivityExited { + t.Fatalf("want terminated/exited, got %+v", got) } } -func TestPR_ApprovedAndGreenNotifies(t *testing.T) { - m, st, n, _ := newManager() +func TestMarkSpawnedStoresRuntimeMetadata(t *testing.T) { + m, st, _ := newManager() st.sessions["mer-1"] = working("mer-1") - - o := openPR(ports.PRObservation{Review: domain.ReviewApproved, Mergeability: domain.MergeMergeable}) - if err := m.ApplyPRObservation(ctx, "mer-1", o); err != nil { + st.sessions["mer-1"] = domain.SessionRecord{ID: "mer-1", ProjectID: "mer", IsTerminated: true} + metadata := domain.SessionMetadata{Branch: "b", WorkspacePath: "/ws", RuntimeHandleID: "h1", AgentSessionID: "agent", Prompt: "prompt"} + if err := m.MarkSpawned(ctx, "mer-1", metadata); err != nil { t.Fatal(err) } - if n.last() != "reaction.approved-and-green" { - t.Fatalf("want approved-and-green, got %q", n.last()) + got := st.sessions["mer-1"] + if got.IsTerminated || 
got.Activity.State != domain.ActivityIdle || got.Metadata.RuntimeHandleID != "h1" { + t.Fatalf("spawn metadata wrong: %+v", got) } } -func TestPR_MergeTerminatesSession(t *testing.T) { - m, st, n, _ := newManager() +func TestPRObservation_CIFailingNudgesAgentWithLogs(t *testing.T) { + m, st, msg := newManager() st.sessions["mer-1"] = working("mer-1") - - o := openPR(ports.PRObservation{Merged: true}) + o := ports.PRObservation{Fetched: true, URL: "pr1", CI: domain.CIFailing, Checks: []ports.PRCheckObservation{{Name: "build", CommitHash: "c1", Status: domain.PRCheckFailed, LogTail: "boom"}}} if err := m.ApplyPRObservation(ctx, "mer-1", o); err != nil { t.Fatal(err) } - got := st.sessions["mer-1"].Lifecycle - if got.Session.State != domain.SessionTerminated || got.TerminationReason != domain.TermPRMerged { - t.Fatalf("merge should terminate with pr_merged, got %+v", got) - } - if n.last() != "reaction.pr-merged" { - t.Fatalf("want pr-merged notify, got %q", n.last()) + if len(msg.msgs) != 1 || !strings.Contains(msg.msgs[0], "boom") { + t.Fatalf("want one CI nudge with log tail, got %v", msg.msgs) } } -func TestPR_FailedFetchIsDropped(t *testing.T) { - m, st, _, msg := newManager() +func TestPRObservation_ReviewCommentsNudgeAgent(t *testing.T) { + m, st, msg := newManager() st.sessions["mer-1"] = working("mer-1") - - if err := m.ApplyPRObservation(ctx, "mer-1", ports.PRObservation{Fetched: false, CI: domain.CIFailing}); err != nil { + o := ports.PRObservation{Fetched: true, URL: "pr1", Review: domain.ReviewChangesRequest, Comments: []ports.PRCommentObservation{{ID: "1", Body: "fix this"}}} + if err := m.ApplyPRObservation(ctx, "mer-1", o); err != nil { t.Fatal(err) } - if len(msg.msgs) != 0 || len(st.pr) != 0 { - t.Fatal("a failed fetch must write nothing and fire nothing") + if len(msg.msgs) != 1 || !strings.Contains(msg.msgs[0], "fix this") { + t.Fatalf("want review nudge, got %v", msg.msgs) } } -// ---- explicit kill ---- - -func 
TestKill_TerminatesWithoutReacting(t *testing.T) { - m, st, n, _ := newManager() +func TestPRObservation_MergeConflictNudgesAgent(t *testing.T) { + m, st, msg := newManager() st.sessions["mer-1"] = working("mer-1") - - if err := m.OnKillRequested(ctx, "mer-1", domain.TermManuallyKilled); err != nil { + o := ports.PRObservation{Fetched: true, URL: "pr1", Mergeability: domain.MergeConflicting} + if err := m.ApplyPRObservation(ctx, "mer-1", o); err != nil { t.Fatal(err) } - got := st.sessions["mer-1"].Lifecycle - if got.Session.State != domain.SessionTerminated || got.TerminationReason != domain.TermManuallyKilled || got.IsAlive { - t.Fatalf("want terminated/manually_killed/dead, got %+v", got) - } - if len(n.events) != 0 { - t.Fatal("an explicit kill must not fire a reaction") + if len(msg.msgs) != 1 || !strings.Contains(msg.msgs[0], "merge conflicts") { + t.Fatalf("want merge-conflict nudge, got %v", msg.msgs) } } -// ---- duration escalation ---- - -func TestTickEscalations_DurationPagesHuman(t *testing.T) { - m, st, n, msg := newManager() - now := time.Now() - m.clock = func() time.Time { return now } +func TestPRObservation_MergedTerminatesWithoutNudge(t *testing.T) { + m, st, msg := newManager() st.sessions["mer-1"] = working("mer-1") - - o := openPR(ports.PRObservation{Mergeability: domain.MergeConflicting}) - if err := m.ApplyPRObservation(ctx, "mer-1", o); err != nil { + if err := m.ApplyPRObservation(ctx, "mer-1", ports.PRObservation{Fetched: true, URL: "pr1", Merged: true}); err != nil { t.Fatal(err) } - if len(msg.msgs) != 1 { - t.Fatalf("merge-conflict should nudge once, got %d", len(msg.msgs)) + got := st.sessions["mer-1"] + if !got.IsTerminated || got.Activity.State != domain.ActivityExited { + t.Fatalf("merged PR should terminate session, got %+v", got) } - if err := m.TickEscalations(ctx, now.Add(16*time.Minute)); err != nil { - t.Fatal(err) - } - if n.last() != "reaction.escalated" { - t.Fatalf("unaddressed conflict should escalate after 15m, got 
%q", n.last()) + if len(msg.msgs) != 0 { + t.Fatalf("merged PR should not send nudge, got %v", msg.msgs) } } -func TestRunningSessions_ExcludesTerminal(t *testing.T) { - m, st, _, _ := newManager() +func TestPRObservation_RetriesAfterMessengerFailure(t *testing.T) { + m, st, msg := newManager() st.sessions["mer-1"] = working("mer-1") - dead := working("mer-2") - dead.Lifecycle.Session.State = domain.SessionTerminated - st.sessions["mer-2"] = dead - - got, err := m.RunningSessions(ctx) - if err != nil { + o := ports.PRObservation{Fetched: true, URL: "pr1", Mergeability: domain.MergeConflicting} + msg.err = errors.New("temporary send failure") + if err := m.ApplyPRObservation(ctx, "mer-1", o); err == nil { + t.Fatal("want send error") + } + msg.err = nil + if err := m.ApplyPRObservation(ctx, "mer-1", o); err != nil { t.Fatal(err) } - if len(got) != 1 || got[0].ID != "mer-1" { - t.Fatalf("want only the live session, got %+v", got) + if len(msg.msgs) != 1 { + t.Fatalf("want retry to send once, got %v", msg.msgs) } } diff --git a/backend/internal/lifecycle/reactions.go b/backend/internal/lifecycle/reactions.go index 44419aa6..3f056c55 100644 --- a/backend/internal/lifecycle/reactions.go +++ b/backend/internal/lifecycle/reactions.go @@ -1,413 +1,117 @@ package lifecycle -// reactions.go is the ACT layer: after a persisted transition the engine maps -// the session's (state, PR facts) to at most one reaction and dispatches it — -// nudging the agent or paging the human. Two reactions inject live content (CI -// logs, review comments) and re-fire when that content changes; the rest fire -// once on entry, with duration escalation driven by TickEscalations. -// -// Budgets are in-memory: a restart re-arms them, which costs a few extra nudges, -// never a missed page. 
- import ( "context" - "fmt" "strings" "sync" - "time" "github.com/aoagents/agent-orchestrator/backend/internal/domain" "github.com/aoagents/agent-orchestrator/backend/internal/ports" ) -type reactionKey string - -const ( - rxCIFailed reactionKey = "ci-failed" - rxReviewComments reactionKey = "review-comments" - rxMergeConflicts reactionKey = "merge-conflicts" - rxIdle reactionKey = "agent-idle" - rxApprovedGreen reactionKey = "approved-and-green" - rxStuck reactionKey = "agent-stuck" - rxNeedsInput reactionKey = "agent-needs-input" - rxExited reactionKey = "agent-exited" - rxPRClosed reactionKey = "pr-closed" - rxMerged reactionKey = "pr-merged" -) - -// Brakes: stop auto-handling and page a human after this many failed attempts. -const ( - ciBrakeRuns = 3 // last N runs of a failing check all failed - reviewMaxNudge = 3 // re-nudged the agent N times over new review feedback -) - -// reactionConfig is one row of the reaction table. toAgent reactions nudge the -// agent; the rest notify the human. escalateAfter (when set) drives a -// duration-based escalation via TickEscalations. -type reactionConfig struct { - toAgent bool - message string - eventType string - priority ports.Priority - escalateAfter time.Duration -} - -var reactions = map[reactionKey]reactionConfig{ - rxCIFailed: {toAgent: true, eventType: "reaction.ci-failed", priority: ports.PriorityAction, message: "CI is failing on your PR. Review the output below and push a fix."}, - rxReviewComments: {toAgent: true, eventType: "reaction.review-comments", priority: ports.PriorityAction, message: "A reviewer left feedback on your PR. Address it and push."}, - rxMergeConflicts: {toAgent: true, eventType: "reaction.merge-conflicts", priority: ports.PriorityAction, escalateAfter: 15 * time.Minute, message: "Your PR has merge conflicts. 
Rebase onto the base branch and resolve them."}, - rxIdle: {toAgent: true, eventType: "reaction.agent-idle", priority: ports.PriorityInfo, escalateAfter: 15 * time.Minute, message: "You appear idle. Continue the task or say what is blocking you."}, - rxApprovedGreen: {eventType: "reaction.approved-and-green", priority: ports.PriorityAction, message: "PR is approved and green — ready to merge."}, - rxStuck: {eventType: "reaction.agent-stuck", priority: ports.PriorityUrgent, message: "Agent is stuck and needs attention."}, - rxNeedsInput: {eventType: "reaction.agent-needs-input", priority: ports.PriorityUrgent, message: "Agent needs input to continue."}, - rxExited: {eventType: "reaction.agent-exited", priority: ports.PriorityUrgent, message: "Agent process exited unexpectedly."}, - rxPRClosed: {eventType: "reaction.pr-closed", priority: ports.PriorityAction, message: "PR was closed without merging."}, - rxMerged: {eventType: "reaction.pr-merged", priority: ports.PriorityInfo, message: "PR merged — work complete."}, -} - -// reactionContent carries the live material the feedback reactions inject. Empty -// for runtime/activity transitions; populated from a PR observation. -type reactionContent struct { - ciCheck string - ciCommit string - ciURL string - ciLogTail string - comments []string - reviewSig string -} - -// prContent extracts the CI failure + review feedback from a PR observation. 
-func prContent(o ports.PRObservation) reactionContent { - c := reactionContent{} - for _, ch := range o.Checks { - if ch.Status == "failed" { - c.ciCheck, c.ciCommit, c.ciLogTail, c.ciURL = ch.Name, ch.CommitHash, ch.LogTail, o.URL - break - } - } - var ids []string - for _, cm := range o.Comments { - if cm.Resolved { - continue - } - c.comments = append(c.comments, cm.Body) - ids = append(ids, cm.ID) - } - c.reviewSig = strings.Join(ids, ",") - return c -} - -// ---- in-memory escalation state ---- - -type trackerKey struct { - id domain.SessionID - key reactionKey -} - -type tracker struct { - attempts int - firstAt time.Time - escalated bool - seenSig bool - lastSig string - projectID domain.ProjectID -} +const reviewMaxNudge = 3 type reactionState struct { mu sync.Mutex - trackers map[trackerKey]*tracker - lastKey map[domain.SessionID]reactionKey + seen map[string]string + attempts map[string]int } func newReactionState() reactionState { - return reactionState{trackers: map[trackerKey]*tracker{}, lastKey: map[domain.SessionID]reactionKey{}} + return reactionState{seen: map[string]string{}, attempts: map[string]int{}} } -// trackerFor returns the (id,key) tracker, creating it on first use. Caller holds mu. -func (rs *reactionState) trackerFor(id domain.SessionID, key reactionKey) *tracker { - k := trackerKey{id, key} - t := rs.trackers[k] - if t == nil { - t = &tracker{} - rs.trackers[k] = t +// ApplyPRObservation reacts to a fetched PR observation after the PR service has +// persisted it. It does not write PR rows; it owns PR-driven lifecycle effects +// and sends actionable agent nudges such as rebase, fix-CI, and +// address-review-feedback prompts. 
+func (m *Manager) ApplyPRObservation(ctx context.Context, id domain.SessionID, o ports.PRObservation) error { + if !o.Fetched { + return nil } - return t -} - -func (m *Manager) clearReactions(id domain.SessionID) { - m.react.mu.Lock() - defer m.react.mu.Unlock() - for k := range m.react.trackers { - if k.id == id { - delete(m.react.trackers, k) - } + if o.Merged { + return m.MarkTerminated(ctx, id) + } + if o.Closed { + return nil } - delete(m.react.lastKey, id) -} - -// ---- dispatch ---- - -// runReactions is the chokepoint called after every persisted transition. It -// runs unlocked (the write lock is already released) so a busy agent send never -// blocks the write path. -func (m *Manager) runReactions(ctx context.Context, id domain.SessionID, content reactionContent) error { rec, ok, err := m.store.GetSession(ctx, id) if err != nil || !ok { return err } - lc := rec.Lifecycle - project := rec.ProjectID - - if isTerminal(lc.Session.State) { - err := m.dispatch(ctx, id, project, terminalReaction(lc.TerminationReason)) - m.clearReactions(id) // incident over: drop budgets after the final notify - return err - } - - pr, err := m.store.PRFactsForSession(ctx, id) - if err != nil { - return err + if rec.IsTerminated || rec.Activity.State == domain.ActivityBlocked || rec.Activity.State == domain.ActivityWaitingInput { + return nil } - - // Feedback reactions inject live content and re-fire as it changes — only - // while the agent can actually act on it. - if pr.Exists && !pr.Closed && !needsHuman(lc.Session.State) { - if pr.CI == domain.CIFailing && content.ciCheck != "" { - if err := m.handleCIFailure(ctx, id, project, content); err != nil { - return err + if o.CI == domain.CIFailing { + for _, ch := range o.Checks { + if ch.Status == domain.PRCheckFailed { + msg := "CI is failing on your PR. Review the output below and push a fix." 
+ if ch.LogTail != "" { + msg += "\n\nFailing output:\n" + ch.LogTail + } + return m.sendOnce(ctx, id, "ci:"+o.URL+":"+ch.Name, ch.CommitHash+":"+ch.LogTail, msg, 0) } } - if hasReviewFeedback(pr) { - if err := m.handleReviewFeedback(ctx, id, project, content); err != nil { - return err - } - } - } - - return m.dispatch(ctx, id, project, reactionFor(lc, pr)) -} - -// dispatch fires the entry reaction for key, deduped so a steady state does not -// re-fire. Leaving a reaction drops its budget. -func (m *Manager) dispatch(ctx context.Context, id domain.SessionID, project domain.ProjectID, key reactionKey) error { - m.react.mu.Lock() - if m.react.lastKey[id] == key { - m.react.mu.Unlock() - return nil - } - if prev := m.react.lastKey[id]; prev != "" { - delete(m.react.trackers, trackerKey{id, prev}) - } - m.react.lastKey[id] = key - m.react.mu.Unlock() - - if key == "" { - return nil } - cfg := reactions[key] - if cfg.toAgent { - return m.fireAgentEntry(ctx, id, project, key, cfg) - } - return m.fireNotify(ctx, id, project, key, cfg) -} - -// reactionFor maps (session state, PR facts) to the reaction to enter. CI failure -// and review feedback return "" here — they are handled by the feedback path. -func reactionFor(lc domain.CanonicalSessionLifecycle, pr domain.PRFacts) reactionKey { - switch lc.Session.State { - case domain.SessionStuck: - return rxStuck - case domain.SessionNeedsInput: - return rxNeedsInput - } - if pr.Exists { - if pr.Closed { - if !pr.Merged { - return rxPRClosed - } - return "" + if o.Review == domain.ReviewChangesRequest || hasUnresolvedComments(o.Comments) { + comments, sig := reviewContent(o.Comments) + msg := "A reviewer left feedback on your PR. Address it and push." 
+ if comments != "" { + msg += "\n\n" + comments } - switch { - case pr.CI == domain.CIFailing, hasReviewFeedback(pr): - return "" // feedback path - case pr.Mergeability == domain.MergeConflicting: - return rxMergeConflicts - case pr.Mergeability == domain.MergeMergeable, pr.Review == domain.ReviewApproved: - return rxApprovedGreen + if sig == "" { + sig = string(o.Review) } + return m.sendOnce(ctx, id, "review:"+o.URL, sig, msg, reviewMaxNudge) } - if lc.Session.State == domain.SessionIdle { - return rxIdle + if o.Mergeability == domain.MergeConflicting { + return m.sendOnce(ctx, id, "merge-conflict:"+o.URL, string(o.Mergeability), "Your PR has merge conflicts. Rebase onto the base branch and resolve them.", 0) } - return "" -} - -func hasReviewFeedback(pr domain.PRFacts) bool { - return pr.Review == domain.ReviewChangesRequest || pr.ReviewComments -} - -func needsHuman(s domain.SessionState) bool { - return s == domain.SessionStuck || s == domain.SessionNeedsInput + return nil } -// terminalReaction is the notify fired when a session reaches a terminal state by -// inferred death. An explicit kill goes through OnKillRequested (no reaction); -// auto_cleanup / pr_merged are notified elsewhere. 
-func terminalReaction(r domain.TerminationReason) reactionKey { - switch r { - case domain.TermRuntimeLost, domain.TermAgentProcessExited, domain.TermProbeFailure, domain.TermErrorInProcess: - return rxExited - default: - return "" +func hasUnresolvedComments(comments []ports.PRCommentObservation) bool { + for _, c := range comments { + if !c.Resolved { + return true + } } + return false } -// ---- feedback reactions (content-driven re-fire + brake) ---- - -func (m *Manager) handleCIFailure(ctx context.Context, id domain.SessionID, project domain.ProjectID, c reactionContent) error { - msg := reactions[rxCIFailed].message + "\n\nFailing output:\n" + c.ciLogTail - return m.fireFeedback(ctx, id, project, rxCIFailed, c.ciCommit, msg, func(int) (bool, error) { - st, err := m.pr.RecentCheckStatuses(ctx, c.ciURL, c.ciCheck, ciBrakeRuns) - if err != nil { - return false, err +func reviewContent(comments []ports.PRCommentObservation) (string, string) { + var bodies []string + var ids []string + for _, c := range comments { + if c.Resolved { + continue } - return allFailed(st, ciBrakeRuns), nil - }) -} - -func (m *Manager) handleReviewFeedback(ctx context.Context, id domain.SessionID, project domain.ProjectID, c reactionContent) error { - msg := reactions[rxReviewComments].message - if len(c.comments) > 0 { - msg += "\n\n" + strings.Join(c.comments, "\n\n") + bodies = append(bodies, c.Body) + ids = append(ids, c.ID) } - return m.fireFeedback(ctx, id, project, rxReviewComments, c.reviewSig, msg, func(attempts int) (bool, error) { - return attempts > reviewMaxNudge, nil - }) + return strings.Join(bodies, "\n\n"), strings.Join(ids, ",") } -// fireFeedback nudges the agent with fresh content, deduped by signature so the -// same content is not re-sent each poll. braked decides whether to escalate to a -// human instead (CI: history; review: attempt count). 
-func (m *Manager) fireFeedback(ctx context.Context, id domain.SessionID, project domain.ProjectID, key reactionKey, sig, message string, braked func(attempts int) (bool, error)) error { - m.react.mu.Lock() - t := m.react.trackerFor(id, key) - if project != "" { - t.projectID = project - } - if t.escalated || (t.seenSig && t.lastSig == sig) { - m.react.mu.Unlock() +func (m *Manager) sendOnce(ctx context.Context, id domain.SessionID, key, sig, msg string, maxAttempts int) error { + if m.messenger == nil { return nil } - t.seenSig, t.lastSig = true, sig - t.attempts++ - attempts, pid := t.attempts, t.projectID - m.react.lastKey[id] = key // feedback owns the slot so a later dispatch("") clears it - m.react.mu.Unlock() - - brake, err := braked(attempts) - if err != nil { - return err - } - if brake { - m.react.mu.Lock() - t.escalated = true - m.react.mu.Unlock() - cause := "max_attempts" - if key == rxCIFailed { - cause = "max_retries" - } - return m.escalate(ctx, id, pid, key, ports.EscalationEvent{Attempts: attempts, Cause: cause}) - } - return m.messenger.Send(ctx, id, message) -} - -// ---- entry reactions ---- - -// fireAgentEntry nudges the agent once on entry into a static reaction -// (idle/merge-conflicts); escalation is duration-based via TickEscalations. 
-func (m *Manager) fireAgentEntry(ctx context.Context, id domain.SessionID, project domain.ProjectID, key reactionKey, cfg reactionConfig) error { m.react.mu.Lock() - t := m.react.trackerFor(id, key) - if project != "" { - t.projectID = project - } - if t.escalated { + if m.react.seen[key] == sig { m.react.mu.Unlock() return nil } - if t.firstAt.IsZero() { - t.firstAt = m.clock() - } - t.attempts++ - m.react.mu.Unlock() - return m.messenger.Send(ctx, id, cfg.message) -} - -func (m *Manager) fireNotify(ctx context.Context, id domain.SessionID, project domain.ProjectID, key reactionKey, cfg reactionConfig) error { - return m.notifier.Notify(ctx, ports.Event{ - Type: cfg.eventType, Priority: cfg.priority, - SessionID: id, ProjectID: project, Message: cfg.message, - Reaction: &ports.ReactionEvent{Key: string(key), Action: "notify"}, - CauseKey: string(key), - OccurredAt: m.clock(), - }) -} - -func (m *Manager) escalate(ctx context.Context, id domain.SessionID, project domain.ProjectID, key reactionKey, esc ports.EscalationEvent) error { - if esc.Cause == "" { - esc.Cause = "max_attempts" - } - return m.notifier.Notify(ctx, ports.Event{ - Type: "reaction.escalated", Priority: ports.PriorityUrgent, - SessionID: id, ProjectID: project, - Message: fmt.Sprintf("Automatic handling of %q is exhausted — needs a human.", key), - Reaction: &ports.ReactionEvent{Key: string(key), Action: "escalated"}, - Escalation: &esc, - CauseKey: string(key) + ":" + esc.Cause, - OccurredAt: m.clock(), - }) -} - -// TickEscalations fires the duration-based escalations the synchronous engine -// cannot wake itself for. The reaper calls it on a timer. 
-func (m *Manager) TickEscalations(ctx context.Context, now time.Time) error { - type due struct { - id domain.SessionID - project domain.ProjectID - key reactionKey - attempts int - durationMs int64 + attempts := m.react.attempts[key] + if maxAttempts > 0 && attempts >= maxAttempts { + m.react.mu.Unlock() + return nil } - var fire []due - m.react.mu.Lock() - for k, t := range m.react.trackers { - if t.escalated { - continue - } - cfg := reactions[k.key] - if cfg.escalateAfter > 0 && !t.firstAt.IsZero() && now.Sub(t.firstAt) >= cfg.escalateAfter { - t.escalated = true - fire = append(fire, due{k.id, t.projectID, k.key, t.attempts, now.Sub(t.firstAt).Milliseconds()}) - } + if err := m.messenger.Send(ctx, id, msg); err != nil { + m.react.mu.Unlock() + return err } + m.react.seen[key] = sig + m.react.attempts[key] = attempts + 1 m.react.mu.Unlock() - - for _, d := range fire { - if err := m.escalate(ctx, d.id, d.project, d.key, ports.EscalationEvent{Attempts: d.attempts, Cause: "max_duration", DurationMs: d.durationMs}); err != nil { - return err - } - } return nil } - -func allFailed(statuses []string, n int) bool { - if len(statuses) < n { - return false - } - for i := 0; i < n; i++ { - if statuses[i] != "failed" { - return false - } - } - return true -} diff --git a/backend/internal/lifecycle/runtime.go b/backend/internal/lifecycle/runtime.go new file mode 100644 index 00000000..58de7f56 --- /dev/null +++ b/backend/internal/lifecycle/runtime.go @@ -0,0 +1,35 @@ +package lifecycle + +import ( + "time" + + "github.com/aoagents/agent-orchestrator/backend/internal/domain" + "github.com/aoagents/agent-orchestrator/backend/internal/ports" +) + +const defaultRecentActivityWindow = 60 * time.Second + +func hasRecentActivity(a domain.ActivitySubstate, now time.Time, window time.Duration) bool { + switch { + case a.State == domain.ActivityExited: + return false + case a.State.IsSticky(): + return true + case a.LastActivityAt.IsZero(): + return false + default: + return 
now.Sub(a.LastActivityAt) <= window + } +} + +func runtimeClearlyDead(f ports.RuntimeFacts, activity domain.ActivitySubstate, now time.Time, window time.Duration) bool { + observedAt := timeOr(f.ObservedAt, now) + return f.Probe == ports.ProbeDead && !hasRecentActivity(activity, observedAt, window) +} + +func timeOr(t, fallback time.Time) time.Time { + if t.IsZero() { + return fallback + } + return t +} diff --git a/backend/internal/notification/dedupe.go b/backend/internal/notification/dedupe.go deleted file mode 100644 index a4eaf326..00000000 --- a/backend/internal/notification/dedupe.go +++ /dev/null @@ -1,74 +0,0 @@ -package notification - -import ( - "crypto/sha256" - "encoding/hex" - "encoding/json" - "fmt" - "time" - - "github.com/aoagents/agent-orchestrator/backend/internal/domain" - "github.com/aoagents/agent-orchestrator/backend/internal/ports" -) - -// ConditionHash returns a deterministic, compact hash over a condition vector. -func ConditionHash(parts ...string) string { - b, _ := json.Marshal(parts) - sum := sha256.Sum256(b) - return hex.EncodeToString(sum[:16]) -} - -// DedupeKey returns the stable durable notification idempotency key. -func DedupeKey(projectID domain.ProjectID, sessionID domain.SessionID, reactionKey, conditionHash string) string { - return fmt.Sprintf("v1:lifecycle:%s:%s:%s:%s", projectID, sessionID, reactionKey, conditionHash) -} - -// ComputeDedupeKey derives a restart-safe dedupe key from the lifecycle event -// plus current persisted state. It avoids PR updated_at because re-polling the -// same facts after daemon restart would otherwise create duplicate notifications. 
-func ComputeDedupeKey(event ports.Event, rec domain.SessionRecord, pr domain.PRFacts) string { - projectID := event.ProjectID - if projectID == "" { - projectID = rec.ProjectID - } - reactionKey := reactionKeyForEvent(event) - condition := []string{ - "session_state", string(rec.Lifecycle.Session.State), - "termination", string(rec.Lifecycle.TerminationReason), - "session_updated", timeKey(rec.UpdatedAt), - } - if pr.Exists { - condition = append(condition, - "pr_url", pr.URL, - "pr_number", fmt.Sprint(pr.Number), - "pr_draft", fmt.Sprint(pr.Draft), - "pr_merged", fmt.Sprint(pr.Merged), - "pr_closed", fmt.Sprint(pr.Closed), - "ci", string(pr.CI), - "review", string(pr.Review), - "mergeability", string(pr.Mergeability), - "review_comments", fmt.Sprint(pr.ReviewComments), - ) - } - if event.CauseKey != "" { - condition = append(condition, "cause_key", event.CauseKey) - } - if event.Escalation != nil { - condition = append(condition, "escalation_cause", event.Escalation.Cause) - } - return DedupeKey(projectID, event.SessionID, reactionKey, ConditionHash(condition...)) -} - -func reactionKeyForEvent(event ports.Event) string { - if event.Reaction != nil && event.Reaction.Key != "" { - return event.Reaction.Key - } - return reactionKeyFromType(event.Type) -} - -func timeKey(t time.Time) string { - if t.IsZero() { - return "" - } - return t.UTC().Format(time.RFC3339Nano) -} diff --git a/backend/internal/notification/dedupe_test.go b/backend/internal/notification/dedupe_test.go deleted file mode 100644 index 2730bc10..00000000 --- a/backend/internal/notification/dedupe_test.go +++ /dev/null @@ -1,63 +0,0 @@ -package notification - -import ( - "testing" - "time" - - "github.com/aoagents/agent-orchestrator/backend/internal/domain" - "github.com/aoagents/agent-orchestrator/backend/internal/ports" -) - -func TestDedupeSameReactionConditionProducesSameKey(t *testing.T) { - rec := dedupeRecord("working", time.Date(2026, 1, 2, 3, 4, 5, 0, time.UTC)) - e := 
ports.Event{SessionID: "ao-1", Reaction: &ports.ReactionEvent{Key: "agent-needs-input", Action: "notify"}} - - k1 := ComputeDedupeKey(e, rec, domain.PRFacts{}) - k2 := ComputeDedupeKey(e, rec, domain.PRFacts{}) - if k1 != k2 { - t.Fatalf("dedupe key unstable: %q != %q", k1, k2) - } -} - -func TestDedupeChangedConditionProducesNewKey(t *testing.T) { - e := ports.Event{SessionID: "ao-1", Reaction: &ports.ReactionEvent{Key: "agent-needs-input", Action: "notify"}} - r1 := dedupeRecord("needs_input", time.Date(2026, 1, 2, 3, 4, 5, 0, time.UTC)) - r2 := dedupeRecord("needs_input", time.Date(2026, 1, 2, 3, 4, 6, 0, time.UTC)) - - if ComputeDedupeKey(e, r1, domain.PRFacts{}) == ComputeDedupeKey(e, r2, domain.PRFacts{}) { - t.Fatal("changed session updated timestamp should change dedupe key") - } -} - -func TestDedupeEscalationIncludesCauseAndDoesNotCollideWithBase(t *testing.T) { - rec := dedupeRecord("working", time.Date(2026, 1, 2, 3, 4, 5, 0, time.UTC)) - base := ports.Event{SessionID: "ao-1", Reaction: &ports.ReactionEvent{Key: "ci-failed", Action: "notify"}} - esc := ports.Event{ - SessionID: "ao-1", - Reaction: &ports.ReactionEvent{Key: "ci-failed", Action: "escalated"}, - Escalation: &ports.EscalationEvent{Attempts: 3, Cause: "max_retries"}, - } - otherCause := esc - otherCause.Escalation = &ports.EscalationEvent{Attempts: 3, Cause: "max_duration"} - - baseKey := ComputeDedupeKey(base, rec, domain.PRFacts{Exists: true, URL: "pr", CI: domain.CIFailing}) - escKey := ComputeDedupeKey(esc, rec, domain.PRFacts{Exists: true, URL: "pr", CI: domain.CIFailing}) - otherKey := ComputeDedupeKey(otherCause, rec, domain.PRFacts{Exists: true, URL: "pr", CI: domain.CIFailing}) - if baseKey == escKey { - t.Fatal("escalation dedupe key should not collide with base reaction") - } - if escKey == otherKey { - t.Fatal("escalation cause should affect dedupe key") - } -} - -func dedupeRecord(state domain.SessionState, updated time.Time) domain.SessionRecord { - return domain.SessionRecord{ 
- ID: "ao-1", - ProjectID: "ao", - Lifecycle: domain.CanonicalSessionLifecycle{ - Session: domain.SessionSubstate{State: state}, - }, - UpdatedAt: updated, - } -} diff --git a/backend/internal/notification/enqueuer.go b/backend/internal/notification/enqueuer.go deleted file mode 100644 index 686490d2..00000000 --- a/backend/internal/notification/enqueuer.go +++ /dev/null @@ -1,53 +0,0 @@ -package notification - -import ( - "context" - "log/slog" - - "github.com/aoagents/agent-orchestrator/backend/internal/domain" - "github.com/aoagents/agent-orchestrator/backend/internal/ports" -) - -// Store is the durable write-side used by the enqueuer. *sqlite.Store satisfies -// this interface. -type Store interface { - EnqueueNotification(ctx context.Context, row domain.Notification) (domain.Notification, bool, error) -} - -// Enqueuer is a store-backed ports.Notifier. It does not deliver to external -// sinks; it renders and persists the notification for later dashboard/app sinks. -type Enqueuer struct { - store Store - renderer *Renderer - logger *slog.Logger -} - -var _ ports.Notifier = (*Enqueuer)(nil) - -// NewEnqueuer returns a Notifier that renders events and persists the resulting -// notification rows via store, defaulting the logger to slog.Default. -func NewEnqueuer(store Store, renderer *Renderer, logger *slog.Logger) *Enqueuer { - if logger == nil { - logger = slog.Default() - } - return &Enqueuer{store: store, renderer: renderer, logger: logger} -} - -// Notify renders the event and enqueues the resulting notification row. 
-func (e *Enqueuer) Notify(ctx context.Context, event ports.Event) error { - row, err := e.renderer.Render(ctx, event) - if err != nil { - return err - } - saved, created, err := e.store.EnqueueNotification(ctx, row) - if err != nil { - return err - } - e.logger.DebugContext(ctx, "notification enqueued", - "id", saved.ID, - "session", saved.SessionID, - "semantic_type", saved.SemanticType, - "created", created, - ) - return nil -} diff --git a/backend/internal/notification/enqueuer_test.go b/backend/internal/notification/enqueuer_test.go deleted file mode 100644 index 1ed14461..00000000 --- a/backend/internal/notification/enqueuer_test.go +++ /dev/null @@ -1,38 +0,0 @@ -package notification - -import ( - "context" - "io" - "log/slog" - "testing" - - "github.com/aoagents/agent-orchestrator/backend/internal/domain" - "github.com/aoagents/agent-orchestrator/backend/internal/ports" -) - -type fakeNotificationStore struct { - row domain.Notification - created bool -} - -func (f *fakeNotificationStore) EnqueueNotification(_ context.Context, row domain.Notification) (domain.Notification, bool, error) { - f.row = row - f.created = true - return row, true, nil -} - -func TestEnqueuerRendersAndPersists(t *testing.T) { - store := &fakeNotificationStore{} - renderer := NewRenderer(fakeReader{rec: renderRecord()}) - enq := NewEnqueuer(store, renderer, slog.New(slog.NewTextHandler(io.Discard, nil))) - if err := enq.Notify(context.Background(), ports.Event{ - Type: "reaction.agent-needs-input", Priority: ports.PriorityUrgent, - ProjectID: "ao", SessionID: "ao-7", Message: "needs input", - Reaction: &ports.ReactionEvent{Key: "agent-needs-input", Action: "notify"}, - }); err != nil { - t.Fatal(err) - } - if !store.created || store.row.SemanticType != "session.needs_input" || store.row.DedupeKey == "" { - t.Fatalf("store row not rendered: created=%v row=%+v", store.created, store.row) - } -} diff --git a/backend/internal/notification/payload.go 
b/backend/internal/notification/payload.go deleted file mode 100644 index b4abaaca..00000000 --- a/backend/internal/notification/payload.go +++ /dev/null @@ -1,75 +0,0 @@ -package notification - -// PayloadSchemaVersion is the durable notification payload contract version. -const PayloadSchemaVersion = 3 - -// Payload is the provider-neutral, rich notification data shape persisted in -// SQLite. It intentionally mirrors legacy AO's NotificationData V3 while only -// filling fields the Go rewrite can source today. -type Payload struct { - SchemaVersion int `json:"schemaVersion"` - SemanticType string `json:"semanticType"` - Subject SubjectPayload `json:"subject"` - Reaction *ReactionPayload `json:"reaction,omitempty"` - Escalation *EscalationPayload `json:"escalation,omitempty"` - CI *CIPayload `json:"ci,omitempty"` - Review *ReviewPayload `json:"review,omitempty"` - Merge *MergePayload `json:"merge,omitempty"` -} - -// SubjectPayload identifies what a notification is about — the session and, -// when relevant, its PR, issue, and branch. -type SubjectPayload struct { - Session *SessionSubjectPayload `json:"session,omitempty"` - PR *PRSubjectPayload `json:"pr,omitempty"` - Issue *IssueSubjectPayload `json:"issue,omitempty"` - Branch string `json:"branch,omitempty"` -} - -// SessionSubjectPayload identifies the session a notification concerns. -type SessionSubjectPayload struct { - ID string `json:"id"` - ProjectID string `json:"projectId"` -} - -// PRSubjectPayload identifies the PR a notification concerns. -type PRSubjectPayload struct { - Number int `json:"number,omitempty"` - URL string `json:"url,omitempty"` - Draft bool `json:"draft,omitempty"` -} - -// IssueSubjectPayload identifies the tracker issue a notification concerns. -type IssueSubjectPayload struct { - ID string `json:"id,omitempty"` -} - -// ReactionPayload carries the reaction that produced the notification. 
-type ReactionPayload struct { - Key string `json:"key"` - Action string `json:"action"` -} - -// EscalationPayload carries the escalation that produced the notification. -type EscalationPayload struct { - Attempts int `json:"attempts"` - Cause string `json:"cause"` - DurationMs int64 `json:"durationMs"` -} - -// CIPayload is the CI context of a notification. -type CIPayload struct { - Status string `json:"status"` -} - -// ReviewPayload is the review context of a notification. -type ReviewPayload struct { - Decision string `json:"decision"` -} - -// MergePayload is the merge-readiness context of a notification. -type MergePayload struct { - Ready *bool `json:"ready,omitempty"` - Conflicts *bool `json:"conflicts,omitempty"` - IsBehind *bool `json:"isBehind,omitempty"` -} diff --git a/backend/internal/notification/renderer.go b/backend/internal/notification/renderer.go deleted file mode 100644 index e10872cf..00000000 --- a/backend/internal/notification/renderer.go +++ /dev/null @@ -1,201 +0,0 @@ -package notification - -import ( - "context" - "encoding/json" - "fmt" - "strings" - "time" - - "github.com/aoagents/agent-orchestrator/backend/internal/domain" - "github.com/aoagents/agent-orchestrator/backend/internal/ports" -) - -// Reader is the subset of durable state the renderer rehydrates. *sqlite.Store -// satisfies it directly. -type Reader interface { - GetSession(ctx context.Context, id domain.SessionID) (domain.SessionRecord, bool, error) - PRFactsForSession(ctx context.Context, id domain.SessionID) (domain.PRFacts, error) -} - -// Renderer converts lifecycle notification events into durable notification rows. -type Renderer struct { - reader Reader - clock func() time.Time -} - -// NewRenderer returns a Renderer that sources session/PR facts via reader. -func NewRenderer(reader Reader) *Renderer { - return &Renderer{reader: reader, clock: time.Now} -} - -// Render builds a durable Notification (subject + typed payload) from a -// lifecycle Event. 
-func (r *Renderer) Render(ctx context.Context, event ports.Event) (domain.Notification, error) { - if event.SessionID == "" { - return domain.Notification{}, fmt.Errorf("render notification: missing session id") - } - rec, ok, err := r.reader.GetSession(ctx, event.SessionID) - if err != nil { - return domain.Notification{}, fmt.Errorf("render notification: get session %s: %w", event.SessionID, err) - } - if !ok { - return domain.Notification{}, fmt.Errorf("render notification: session %s not found", event.SessionID) - } - pr, err := r.reader.PRFactsForSession(ctx, event.SessionID) - if err != nil { - return domain.Notification{}, fmt.Errorf("render notification: pr facts for %s: %w", event.SessionID, err) - } - - projectID := event.ProjectID - if projectID == "" { - projectID = rec.ProjectID - } - reaction := reactionPayload(event) - semanticType := SemanticTypeForReaction(reaction.Key) - if semanticType == "" { - semanticType = event.Type - } - payload := Payload{ - SchemaVersion: PayloadSchemaVersion, - SemanticType: semanticType, - Subject: SubjectPayload{ - Session: &SessionSubjectPayload{ID: string(event.SessionID), ProjectID: string(projectID)}, - Branch: rec.Metadata.Branch, - }, - Reaction: &reaction, - } - if rec.IssueID != "" { - payload.Subject.Issue = &IssueSubjectPayload{ID: string(rec.IssueID)} - } - if pr.Exists { - payload.Subject.PR = &PRSubjectPayload{Number: pr.Number, URL: pr.URL, Draft: pr.Draft} - if pr.CI != "" { - payload.CI = &CIPayload{Status: string(pr.CI)} - } - if pr.Review != "" { - payload.Review = &ReviewPayload{Decision: string(pr.Review)} - } - payload.Merge = mergePayload(pr.Mergeability) - } - if event.Escalation != nil { - payload.Escalation = &EscalationPayload{ - Attempts: event.Escalation.Attempts, - Cause: event.Escalation.Cause, - DurationMs: event.Escalation.DurationMs, - } - } - - payloadJSON, err := json.Marshal(payload) - if err != nil { - return domain.Notification{}, fmt.Errorf("render notification payload: %w", err) 
- } - - occurredAt := event.OccurredAt - if occurredAt.IsZero() { - occurredAt = r.clock().UTC() - } - priority := string(event.Priority) - if priority == "" { - priority = string(ports.PriorityInfo) - } - dedupeKey := event.DedupeKey - if dedupeKey == "" { - dedupeKey = ComputeDedupeKey(event, rec, pr) - } - causeKey := event.CauseKey - if causeKey == "" { - causeKey = reaction.Key - if event.Escalation != nil && event.Escalation.Cause != "" { - causeKey += ":" + event.Escalation.Cause - } - } - - return domain.Notification{ - ProjectID: projectID, - SessionID: event.SessionID, - Source: "lifecycle", - EventType: event.Type, - SemanticType: semanticType, - Priority: priority, - Message: event.Message, - Payload: payloadJSON, - Actions: actionsFor(projectID, event.SessionID, pr), - DedupeKey: dedupeKey, - CauseKey: causeKey, - CreatedAt: occurredAt, - UpdatedAt: occurredAt, - }, nil -} - -func reactionPayload(event ports.Event) ReactionPayload { - key := reactionKeyFromType(event.Type) - action := "notify" - if event.Reaction != nil { - if event.Reaction.Key != "" { - key = event.Reaction.Key - } - if event.Reaction.Action != "" { - action = event.Reaction.Action - } - } - if event.Escalation != nil && event.Reaction == nil { - action = "escalated" - } - return ReactionPayload{Key: key, Action: action} -} - -func reactionKeyFromType(t string) string { - if strings.HasPrefix(t, "reaction.") { - return strings.TrimPrefix(t, "reaction.") - } - return t -} - -func mergePayload(m domain.Mergeability) *MergePayload { - if m == "" { - return nil - } - ready := m == domain.MergeMergeable - conflicts := m == domain.MergeConflicting - return &MergePayload{Ready: &ready, Conflicts: &conflicts} -} - -func actionsFor(projectID domain.ProjectID, sessionID domain.SessionID, pr domain.PRFacts) []domain.NotificationAction { - actions := []domain.NotificationAction{{ - ID: "open-session", - Kind: "route", - Label: "Open session", - Route: fmt.Sprintf("/projects/%s/sessions/%s", 
projectID, sessionID), - }} - if pr.Exists && pr.URL != "" { - actions = append(actions, domain.NotificationAction{ID: "open-pr", Kind: "url", Label: "Open PR", URL: pr.URL}) - } - return actions -} - -// SemanticTypeForReaction maps internal reaction keys to public semantic types. -func SemanticTypeForReaction(key string) string { - switch key { - case "approved-and-green": - return "merge.ready" - case "agent-stuck": - return "session.stuck" - case "agent-needs-input": - return "session.needs_input" - case "agent-exited": - return "session.exited" - case "pr-closed": - return "pr.closed" - case "pr-merged": - return "pr.merged" - case "ci-failed": - return "ci.failing" - case "review-comments": - return "review.changes_requested" - case "merge-conflicts": - return "merge.conflicts" - default: - return "" - } -} diff --git a/backend/internal/notification/renderer_test.go b/backend/internal/notification/renderer_test.go deleted file mode 100644 index 4cf70c97..00000000 --- a/backend/internal/notification/renderer_test.go +++ /dev/null @@ -1,133 +0,0 @@ -package notification - -import ( - "context" - "encoding/json" - "testing" - "time" - - "github.com/aoagents/agent-orchestrator/backend/internal/domain" - "github.com/aoagents/agent-orchestrator/backend/internal/ports" -) - -type fakeReader struct { - rec domain.SessionRecord - pr domain.PRFacts -} - -func (f fakeReader) GetSession(context.Context, domain.SessionID) (domain.SessionRecord, bool, error) { - return f.rec, true, nil -} -func (f fakeReader) PRFactsForSession(context.Context, domain.SessionID) (domain.PRFacts, error) { - return f.pr, nil -} - -func TestSemanticTypeMapping(t *testing.T) { - cases := map[string]string{ - "approved-and-green": "merge.ready", - "agent-stuck": "session.stuck", - "agent-needs-input": "session.needs_input", - "agent-exited": "session.exited", - "pr-closed": "pr.closed", - "pr-merged": "pr.merged", - "ci-failed": "ci.failing", - "review-comments": "review.changes_requested", - 
"merge-conflicts": "merge.conflicts", - } - for key, want := range cases { - if got := SemanticTypeForReaction(key); got != want { - t.Fatalf("SemanticTypeForReaction(%q) = %q, want %q", key, got, want) - } - } -} - -func TestRendererPayloadIncludesSessionProjectIssueAndBranch(t *testing.T) { - r := NewRenderer(fakeReader{rec: renderRecord()}) - row, err := r.Render(context.Background(), ports.Event{ - Type: "reaction.agent-needs-input", Priority: ports.PriorityUrgent, - ProjectID: "ao", SessionID: "ao-7", Message: "needs input", - Reaction: &ports.ReactionEvent{Key: "agent-needs-input", Action: "notify"}, - OccurredAt: time.Date(2026, 1, 2, 3, 4, 5, 0, time.UTC), - }) - if err != nil { - t.Fatal(err) - } - var p Payload - if err := json.Unmarshal(row.Payload, &p); err != nil { - t.Fatal(err) - } - if p.SchemaVersion != 3 || p.SemanticType != "session.needs_input" { - t.Fatalf("payload header = %+v", p) - } - if p.Subject.Session == nil || p.Subject.Session.ID != "ao-7" || p.Subject.Session.ProjectID != "ao" { - t.Fatalf("session subject missing: %+v", p.Subject.Session) - } - if p.Subject.Issue == nil || p.Subject.Issue.ID != "AO-12" || p.Subject.Branch != "feat/example" { - t.Fatalf("issue/branch missing: %+v", p.Subject) - } -} - -func TestRendererPRPayloadIncludesFacts(t *testing.T) { - r := NewRenderer(fakeReader{rec: renderRecord(), pr: domain.PRFacts{ - Exists: true, URL: "https://github.com/org/repo/pull/12", Number: 12, - CI: domain.CIFailing, Review: domain.ReviewChangesRequest, Mergeability: domain.MergeConflicting, - }}) - row, err := r.Render(context.Background(), ports.Event{ - Type: "reaction.review-comments", Priority: ports.PriorityAction, - ProjectID: "ao", SessionID: "ao-7", Message: "review", - Reaction: &ports.ReactionEvent{Key: "review-comments", Action: "notify"}, - }) - if err != nil { - t.Fatal(err) - } - var p Payload - if err := json.Unmarshal(row.Payload, &p); err != nil { - t.Fatal(err) - } - if p.Subject.PR == nil || p.Subject.PR.URL 
!= "https://github.com/org/repo/pull/12" || p.Subject.PR.Number != 12 { - t.Fatalf("pr subject missing: %+v", p.Subject.PR) - } - if p.CI == nil || p.CI.Status != "failing" { - t.Fatalf("ci missing: %+v", p.CI) - } - if p.Review == nil || p.Review.Decision != "changes_requested" { - t.Fatalf("review missing: %+v", p.Review) - } - if p.Merge == nil || p.Merge.Conflicts == nil || *p.Merge.Conflicts != true || p.Merge.Ready == nil || *p.Merge.Ready != false { - t.Fatalf("merge missing: %+v", p.Merge) - } -} - -func TestRendererEscalationPayloadIncludesDetails(t *testing.T) { - r := NewRenderer(fakeReader{rec: renderRecord()}) - row, err := r.Render(context.Background(), ports.Event{ - Type: "reaction.escalated", Priority: ports.PriorityUrgent, - ProjectID: "ao", SessionID: "ao-7", Message: "escalated", - Reaction: &ports.ReactionEvent{Key: "ci-failed", Action: "escalated"}, - Escalation: &ports.EscalationEvent{Attempts: 3, Cause: "max_retries", DurationMs: 42}, - }) - if err != nil { - t.Fatal(err) - } - var p Payload - if err := json.Unmarshal(row.Payload, &p); err != nil { - t.Fatal(err) - } - if p.Reaction == nil || p.Reaction.Key != "ci-failed" || p.Reaction.Action != "escalated" { - t.Fatalf("reaction missing: %+v", p.Reaction) - } - if p.Escalation == nil || p.Escalation.Attempts != 3 || p.Escalation.Cause != "max_retries" || p.Escalation.DurationMs != 42 { - t.Fatalf("escalation missing: %+v", p.Escalation) - } -} - -func renderRecord() domain.SessionRecord { - return domain.SessionRecord{ - ID: "ao-7", - ProjectID: "ao", - IssueID: "AO-12", - Lifecycle: domain.CanonicalSessionLifecycle{Session: domain.SessionSubstate{State: domain.SessionNeedsInput}}, - Metadata: domain.SessionMetadata{Branch: "feat/example"}, - UpdatedAt: time.Date(2026, 1, 2, 3, 4, 5, 0, time.UTC), - } -} diff --git a/backend/internal/observe/reaper/reaper.go b/backend/internal/observe/reaper/reaper.go index 7edee2b1..16812c9b 100644 --- a/backend/internal/observe/reaper/reaper.go +++ 
b/backend/internal/observe/reaper/reaper.go @@ -1,13 +1,9 @@ // Package reaper implements the OBSERVE-layer polling timer that supplies the -// LCM with the two facts the LCM cannot wake itself to discover: a periodic -// duration-based escalation heartbeat, and per-session runtime liveness probes. +// LCM with per-session runtime liveness probes. // -// The reaper sits OUTSIDE the LCM's per-session serial loop. It only REPORTS -// facts — it never decides whether a session is "truly" dead. The decider -// (anti-flap Detecting quarantine, terminal-session rules) is owned by the LCM -// and consumes these facts through the regular ApplyRuntimeObservation entry -// point. A probe error is reported as a probe-failure fact, never collapsed to -// "alive" or "dead", so the LCM's failed-probe ≠ dead invariant holds. +// The reaper only reports facts — it never writes session rows directly. The LCM +// consumes these facts through ApplyRuntimeObservation. A probe error is +// reported as a probe-failure fact, never collapsed to "alive" or "dead". package reaper import ( @@ -23,33 +19,14 @@ import ( // the design doc's 5s sampling window for runtime liveness. const DefaultTickInterval = 5 * time.Second -// RuntimeRegistry resolves a runtime adapter by the RuntimeName recorded in a -// session's RuntimeHandle. The reaper looks the runtime up per-session so a -// single reaper instance can probe tmux- and zellij-backed sessions side by -// side without knowing about either at construction. -type RuntimeRegistry interface { - Runtime(name string) (ports.Runtime, bool) -} - -// MapRegistry is the trivial RuntimeRegistry: a name->runtime map. Callers -// that need dynamic registration can implement RuntimeRegistry themselves. -type MapRegistry map[string]ports.Runtime - -// Runtime implements RuntimeRegistry. -func (m MapRegistry) Runtime(name string) (ports.Runtime, bool) { - rt, ok := m[name] - return rt, ok -} - // Config holds the externally-tunable knobs for a Reaper. 
Every field is -// optional; zero values fall back to safe defaults so production wiring (which -// only needs to inject the LCM and registry) and tests (which inject a clock -// plus a fast tick) can both stay terse. +// optional; zero values fall back to safe defaults so production wiring and +// tests can both stay terse. type Config struct { // Tick is the interval between ticks. <=0 means DefaultTickInterval. Tick time.Duration - // Clock supplies ObservedAt and TickEscalations now stamps. nil means - // time.Now. Injected in tests so assertions don't race wallclock. + // Clock supplies ObservedAt stamps. nil means time.Now. Injected in tests so + // assertions don't race wallclock. Clock func() time.Time // Logger receives operational diagnostics (probe errors, skipped sessions, // LCM call failures). The reaper logs but does not propagate these errors @@ -58,23 +35,36 @@ type Config struct { Logger *slog.Logger } +type sessionSource interface { + ListAllSessions(ctx context.Context) ([]domain.SessionRecord, error) +} + +type runtimeObservationSink interface { + ApplyRuntimeObservation(ctx context.Context, id domain.SessionID, f ports.RuntimeFacts) error +} + +type runtimeProber interface { + IsAlive(context.Context, ports.RuntimeHandle) (bool, error) +} + // Reaper is the polling timer. Construct it with New; start the background // goroutine with Start, or drive a single cycle synchronously with Tick. type Reaper struct { - lcm ports.LifecycleManager - registry RuntimeRegistry + sink runtimeObservationSink + sessions sessionSource + runtime runtimeProber tick time.Duration clock func() time.Time logger *slog.Logger } -// New constructs a Reaper. The LCM is the sole writer destination (the reaper -// reports facts via ApplyRuntimeObservation and TickEscalations); the registry -// resolves the runtime adapter to use per session. -func New(lcm ports.LifecycleManager, registry RuntimeRegistry, cfg Config) *Reaper { +// New constructs a Reaper. 
sink is the lifecycle fact destination; sessions +// supplies the rows to probe; runtime checks whether a stored handle is alive. +func New(sink runtimeObservationSink, sessions sessionSource, runtime runtimeProber, cfg Config) *Reaper { r := &Reaper{ - lcm: lcm, - registry: registry, + sink: sink, + sessions: sessions, + runtime: runtime, tick: cfg.Tick, clock: cfg.Clock, logger: cfg.Logger, @@ -117,35 +107,27 @@ func (r *Reaper) loop(ctx context.Context, done chan<- struct{}) { } } -// Tick runs one observation cycle: it always fires TickEscalations first (the -// duration-based escalation heartbeat, which the synchronous LCM cannot wake -// itself to drive), then enumerates the LCM's running sessions, probes each -// one's runtime, and reports any non-alive result back as a fact. +// Tick runs one observation cycle: it enumerates non-terminated sessions, +// probes each one's runtime, and reports each result back as a fact. // // Tick is exported so the daemon (and tests) can drive cycles synchronously, // and so the Start goroutine has a single chokepoint to log against. // -// Errors: only the RunningSessions failure is propagated, since it short- -// circuits the rest of the cycle. TickEscalations and per-session -// ApplyRuntimeObservation failures are logged but never propagated — one -// failed call must not bring down the loop. +// Errors: only the session-listing failure is propagated, since it short- +// circuits the rest of the cycle. Per-session ApplyRuntimeObservation failures +// are logged but never propagated — one failed call must not bring down the loop. func (r *Reaper) Tick(ctx context.Context) error { now := r.clock() - // Heartbeat is best-effort and runs before enumeration so duration-based - // escalations still fire if the running-set lookup is the thing that - // errored. The LCM's TickEscalations is itself idempotent (no canonical - // writes) — at worst we miss escalating once and pick it up next tick. 
- if err := r.lcm.TickEscalations(ctx, now); err != nil { - r.logger.Error("reaper: TickEscalations failed", "err", err) - } - - sessions, err := r.lcm.RunningSessions(ctx) + sessions, err := r.sessions.ListAllSessions(ctx) if err != nil { return err } for _, sess := range sessions { + if sess.IsTerminated { + continue + } r.probeOne(ctx, sess, now) } return nil @@ -153,62 +135,47 @@ func (r *Reaper) Tick(ctx context.Context) error { // probeOne handles a single session's probe + fact-report. Every probe result — // alive, dead, or failed — is reported as a fact to the LCM. The reaper does -// not optimize away the "alive" case, because a session in Detecting (whose -// runtime axis is NOT alive) is included in the running set and needs the -// alive probe to recover; the reaper has no business deciding what counts as -// a no-op. The LCM's ApplyRuntimeObservation diffs against canonical and -// only Upserts on actual change, so steady-state alive is already cheap. +// not optimize away the "alive" case; the reaper has no business deciding what +// counts as a no-op. The LCM diffs and only writes on actual change. func (r *Reaper) probeOne(ctx context.Context, sess domain.SessionRecord, now time.Time) { handle, ok := handleFromRecord(sess) if !ok { // A session in the running-set without a handle is an anomaly worth - // surfacing (OnSpawnCompleted should have set both keys). Warn rather + // surfacing (MarkSpawned should have set both keys). Warn rather // than Debug so it doesn't hide behind a noisy log level. 
r.logger.Warn("reaper: session has no runtime handle metadata, skipping", "session", sess.ID) return } - rt, ok := r.registry.Runtime(handle.RuntimeName) - if !ok { - r.logger.Warn("reaper: no runtime registered for session, skipping", - "session", sess.ID, "runtime", handle.RuntimeName) - return - } - - alive, probeErr := rt.IsAlive(ctx, handle) + alive, probeErr := r.runtime.IsAlive(ctx, handle) facts := ports.RuntimeFacts{ObservedAt: now} switch { case probeErr != nil: // Failed probe must NOT be collapsed to alive — that would let a - // transient tmux/zellij outage hide a really-dead session, and a + // transient Zellij outage hide a really-dead session, and a // transient adapter bug terminate a really-alive one. Report failed - // and let the LCM's detecting quarantine arbitrate. - facts.Runtime = ports.ProbeFailed - facts.Process = ports.ProbeFailed + // and let the LCM arbitrate. + facts.Probe = ports.ProbeFailed r.logger.Debug("reaper: probe error reported as failed fact", - "session", sess.ID, "runtime", handle.RuntimeName, "err", probeErr) + "session", sess.ID, "err", probeErr) case alive: - facts.Runtime = ports.ProbeAlive - facts.Process = ports.ProbeAlive + facts.Probe = ports.ProbeAlive default: - facts.Runtime = ports.ProbeDead - facts.Process = ports.ProbeDead + facts.Probe = ports.ProbeDead } - if err := r.lcm.ApplyRuntimeObservation(ctx, sess.ID, facts); err != nil { + if err := r.sink.ApplyRuntimeObservation(ctx, sess.ID, facts); err != nil { r.logger.Error("reaper: ApplyRuntimeObservation failed", "session", sess.ID, "err", err) } } // handleFromRecord reconstructs the RuntimeHandle stored on the session by -// OnSpawnCompleted. Both fields are required; either being empty is the -// "session lacks a probable handle" signal that probeOne uses to skip. +// MarkSpawned. An empty handle id means the session cannot be probed. 
func handleFromRecord(rec domain.SessionRecord) (ports.RuntimeHandle, bool) { id := rec.Metadata.RuntimeHandleID - name := rec.Metadata.RuntimeName - if id == "" || name == "" { + if id == "" { return ports.RuntimeHandle{}, false } - return ports.RuntimeHandle{ID: id, RuntimeName: name}, true + return ports.RuntimeHandle{ID: id}, true } diff --git a/backend/internal/observe/reaper/reaper_test.go b/backend/internal/observe/reaper/reaper_test.go index ffb3eed4..a2c84578 100644 --- a/backend/internal/observe/reaper/reaper_test.go +++ b/backend/internal/observe/reaper/reaper_test.go @@ -6,7 +6,6 @@ import ( "io" "log/slog" "testing" - "time" "github.com/aoagents/agent-orchestrator/backend/internal/domain" "github.com/aoagents/agent-orchestrator/backend/internal/ports" @@ -15,14 +14,9 @@ import ( var ctx = context.Background() type fakeLCM struct { - running []domain.SessionRecord - observed map[domain.SessionID]ports.RuntimeFacts - escalated int + observed map[domain.SessionID]ports.RuntimeFacts } -func (l *fakeLCM) RunningSessions(context.Context) ([]domain.SessionRecord, error) { - return l.running, nil -} func (l *fakeLCM) ApplyRuntimeObservation(_ context.Context, id domain.SessionID, f ports.RuntimeFacts) error { if l.observed == nil { l.observed = map[domain.SessionID]ports.RuntimeFacts{} @@ -30,18 +24,11 @@ func (l *fakeLCM) ApplyRuntimeObservation(_ context.Context, id domain.SessionID l.observed[id] = f return nil } -func (l *fakeLCM) TickEscalations(context.Context, time.Time) error { l.escalated++; return nil } -func (l *fakeLCM) ApplyActivitySignal(context.Context, domain.SessionID, ports.ActivitySignal) error { - return nil -} -func (l *fakeLCM) ApplyPRObservation(context.Context, domain.SessionID, ports.PRObservation) error { - return nil -} -func (l *fakeLCM) OnSpawnCompleted(context.Context, domain.SessionID, ports.SpawnOutcome) error { - return nil -} -func (l *fakeLCM) OnKillRequested(context.Context, domain.SessionID, domain.TerminationReason) error 
{ - return nil + +type fakeSessions struct{ rows []domain.SessionRecord } + +func (s fakeSessions) ListAllSessions(context.Context) ([]domain.SessionRecord, error) { + return s.rows, nil } type fakeRuntime struct { @@ -49,10 +36,6 @@ type fakeRuntime struct { err error } -func (r fakeRuntime) Create(context.Context, ports.RuntimeConfig) (ports.RuntimeHandle, error) { - return ports.RuntimeHandle{}, nil -} -func (r fakeRuntime) Destroy(context.Context, ports.RuntimeHandle) error { return nil } func (r fakeRuntime) IsAlive(context.Context, ports.RuntimeHandle) (bool, error) { return r.alive, r.err } @@ -60,53 +43,57 @@ func (r fakeRuntime) IsAlive(context.Context, ports.RuntimeHandle) (bool, error) func probableSession(id domain.SessionID) domain.SessionRecord { return domain.SessionRecord{ ID: id, - Metadata: domain.SessionMetadata{RuntimeHandleID: "h1", RuntimeName: "tmux"}, - Lifecycle: domain.CanonicalSessionLifecycle{ - Session: domain.SessionSubstate{State: domain.SessionWorking}, - }, + Activity: domain.ActivitySubstate{State: domain.ActivityActive}, + Metadata: domain.SessionMetadata{RuntimeHandleID: "h1"}, } } func quietLogger() *slog.Logger { return slog.New(slog.NewTextHandler(io.Discard, nil)) } -func newReaper(lcm *fakeLCM, rt fakeRuntime) *Reaper { - return New(lcm, MapRegistry{"tmux": rt}, Config{Logger: quietLogger()}) +func newReaper(lcm *fakeLCM, sessions fakeSessions, rt fakeRuntime) *Reaper { + return New(lcm, sessions, rt, Config{Logger: quietLogger()}) } func TestTick_ReportsAliveProbe(t *testing.T) { - lcm := &fakeLCM{running: []domain.SessionRecord{probableSession("mer-1")}} - if err := newReaper(lcm, fakeRuntime{alive: true}).Tick(ctx); err != nil { + lcm := &fakeLCM{} + sessions := fakeSessions{rows: []domain.SessionRecord{probableSession("mer-1")}} + if err := newReaper(lcm, sessions, fakeRuntime{alive: true}).Tick(ctx); err != nil { t.Fatal(err) } - if lcm.observed["mer-1"].Runtime != ports.ProbeAlive { - t.Fatalf("want alive probe, got 
%q", lcm.observed["mer-1"].Runtime) + if lcm.observed["mer-1"].Probe != ports.ProbeAlive { + t.Fatalf("want alive probe, got %q", lcm.observed["mer-1"].Probe) } } func TestTick_ReportsProbeErrorAsFailed(t *testing.T) { - lcm := &fakeLCM{running: []domain.SessionRecord{probableSession("mer-1")}} - if err := newReaper(lcm, fakeRuntime{err: errors.New("tmux gone")}).Tick(ctx); err != nil { + lcm := &fakeLCM{} + sessions := fakeSessions{rows: []domain.SessionRecord{probableSession("mer-1")}} + if err := newReaper(lcm, sessions, fakeRuntime{err: errors.New("Zellij gone")}).Tick(ctx); err != nil { t.Fatal(err) } - if lcm.observed["mer-1"].Runtime != ports.ProbeFailed { - t.Fatalf("probe error must be reported as failed, got %q", lcm.observed["mer-1"].Runtime) + if lcm.observed["mer-1"].Probe != ports.ProbeFailed { + t.Fatalf("probe error must be reported as failed, got %q", lcm.observed["mer-1"].Probe) } } -func TestTick_FiresEscalationHeartbeat(t *testing.T) { +func TestTick_SkipsTerminatedSession(t *testing.T) { lcm := &fakeLCM{} - if err := newReaper(lcm, fakeRuntime{}).Tick(ctx); err != nil { + dead := probableSession("mer-1") + dead.IsTerminated = true + sessions := fakeSessions{rows: []domain.SessionRecord{dead}} + if err := newReaper(lcm, sessions, fakeRuntime{alive: true}).Tick(ctx); err != nil { t.Fatal(err) } - if lcm.escalated != 1 { - t.Fatalf("tick must drive TickEscalations once, got %d", lcm.escalated) + if _, probed := lcm.observed["mer-1"]; probed { + t.Fatal("terminated sessions must not be probed") } } func TestTick_SkipsSessionWithoutHandle(t *testing.T) { + lcm := &fakeLCM{} noHandle := domain.SessionRecord{ID: "mer-1"} // no runtime metadata - lcm := &fakeLCM{running: []domain.SessionRecord{noHandle}} - if err := newReaper(lcm, fakeRuntime{alive: true}).Tick(ctx); err != nil { + sessions := fakeSessions{rows: []domain.SessionRecord{noHandle}} + if err := newReaper(lcm, sessions, fakeRuntime{alive: true}).Tick(ctx); err != nil { t.Fatal(err) } if _, 
probed := lcm.observed["mer-1"]; probed { diff --git a/backend/internal/ports/doc.go b/backend/internal/ports/doc.go new file mode 100644 index 00000000..cbcc39a9 --- /dev/null +++ b/backend/internal/ports/doc.go @@ -0,0 +1,5 @@ +// Package ports declares boundary interfaces and DTOs used to connect core +// services to replaceable adapters such as runtimes, workspaces, trackers, and +// storage writers. Domain models stay in internal/domain; generated storage rows +// stay inside storage packages. +package ports diff --git a/backend/internal/ports/facts.go b/backend/internal/ports/facts.go deleted file mode 100644 index b119ecf6..00000000 --- a/backend/internal/ports/facts.go +++ /dev/null @@ -1,69 +0,0 @@ -// Package ports declares the boundary contracts for the lifecycle lane: the -// inbound interfaces the engine implements, the outbound interfaces its adapters -// implement, and the plain DTOs that cross those edges. It holds no logic. -package ports - -import ( - "time" - - "github.com/aoagents/agent-orchestrator/backend/internal/domain" -) - -// ProbeResult is a single liveness reading. "failed" (the probe errored/timed -// out) and "unknown" (ran but couldn't tell) are kept distinct from dead — both -// route to the detecting quarantine, never to a death conclusion. -type ProbeResult string - -// Probe readings. Alive/Dead are conclusions; Failed/Unknown route to the -// detecting quarantine instead of a death decision. -const ( - ProbeAlive ProbeResult = "alive" - ProbeDead ProbeResult = "dead" - ProbeFailed ProbeResult = "failed" - ProbeUnknown ProbeResult = "unknown" -) - -// RuntimeFacts is what the reaper reports each probe: is the runtime container -// up, and is the agent process inside it up. -type RuntimeFacts struct { - ObservedAt time.Time - Runtime ProbeResult - Process ProbeResult -} - -// ActivitySignal is pushed by the agent hooks. Only a Valid signal is -// authoritative; a stale/absent one is ignored rather than read as idleness. 
-type ActivitySignal struct { - Valid bool - State domain.ActivityState - Timestamp time.Time - Source domain.ActivitySource -} - -// PRObservation is what the SCM poller reports for one PR. Fetched is the -// failed-fetch guard: when false the rest is meaningless and the engine must not -// read it as "PR closed". Checks/Comments are the current full sets (the engine -// records the checks and replaces the comment set). -type PRObservation struct { - Fetched bool - URL string - Number int - Draft bool - Merged bool - Closed bool - CI domain.CIState - Review domain.ReviewDecision - Mergeability domain.Mergeability - Checks []domain.PRCheckRow - Comments []domain.PRComment -} - -// SpawnOutcome is what the Session Manager reports once a spawn is live: the -// handles needed for later teardown/restore. -type SpawnOutcome struct { - Branch string - WorkspacePath string - RuntimeHandle RuntimeHandle - AgentSessionID string - Prompt string -} diff --git a/backend/internal/ports/inbound.go b/backend/internal/ports/inbound.go deleted file mode 100644 index fa472d00..00000000 --- a/backend/internal/ports/inbound.go +++ /dev/null @@ -1,53 +0,0 @@ -package ports - -import ( - "context" - "time" - - "github.com/aoagents/agent-orchestrator/backend/internal/domain" -) - -// LifecycleManager is the inbound contract the engine implements. Observers -// (reaper, SCM poller, activity hooks) and the Session Manager call in; the LCM -// is the sole writer of canonical transitions and the only place reactions fire. -type LifecycleManager interface { - ApplyRuntimeObservation(ctx context.Context, id domain.SessionID, f RuntimeFacts) error - ApplyActivitySignal(ctx context.Context, id domain.SessionID, s ActivitySignal) error - ApplyPRObservation(ctx context.Context, id domain.SessionID, o PRObservation) error - - // OnSpawnCompleted marks a session live and records its handles. It works for - // a fresh spawn (not_started -> live) and a restore (terminal -> reopened). 
- OnSpawnCompleted(ctx context.Context, id domain.SessionID, o SpawnOutcome) error - OnKillRequested(ctx context.Context, id domain.SessionID, reason domain.TerminationReason) error - - // TickEscalations fires the duration-based escalations the synchronous LCM - // can't wake itself for; the reaper calls it on a timer. - TickEscalations(ctx context.Context, now time.Time) error - // RunningSessions snapshots every non-terminal session for the reaper to probe. - RunningSessions(ctx context.Context) ([]domain.SessionRecord, error) -} - -// SessionManager is the inbound contract the API/CLI call for explicit -// mutations. It drives the runtime/agent/workspace plugins and routes canonical -// writes to the LCM. -type SessionManager interface { - Spawn(ctx context.Context, cfg SpawnConfig) (domain.Session, error) - Kill(ctx context.Context, id domain.SessionID, reason domain.TerminationReason) (freed bool, err error) - Restore(ctx context.Context, id domain.SessionID) (domain.Session, error) - List(ctx context.Context, project domain.ProjectID) ([]domain.Session, error) - Get(ctx context.Context, id domain.SessionID) (domain.Session, error) - Send(ctx context.Context, id domain.SessionID, message string) error - Cleanup(ctx context.Context, project domain.ProjectID) ([]domain.SessionID, error) -} - -// SpawnConfig is the request to start a new session: which project/issue, which -// agent harness, and the branch/prompt/rules the agent launches with. 
-type SpawnConfig struct { - ProjectID domain.ProjectID - IssueID domain.IssueID - Kind domain.SessionKind - Harness domain.AgentHarness - Branch string - Prompt string - AgentRules string -} diff --git a/backend/internal/ports/outbound.go b/backend/internal/ports/outbound.go index 58e1f509..765785c4 100644 --- a/backend/internal/ports/outbound.go +++ b/backend/internal/ports/outbound.go @@ -2,95 +2,28 @@ package ports import ( "context" - "time" "github.com/aoagents/agent-orchestrator/backend/internal/domain" ) -// SessionStore persists session records and serves the derived read-model's PR -// facts. The Session Manager creates rows; the Lifecycle Manager is the sole -// writer of canonical transitions thereafter. -type SessionStore interface { - CreateSession(ctx context.Context, rec domain.SessionRecord) (domain.SessionRecord, error) - UpdateSession(ctx context.Context, rec domain.SessionRecord) error - GetSession(ctx context.Context, id domain.SessionID) (domain.SessionRecord, bool, error) - ListSessions(ctx context.Context, project domain.ProjectID) ([]domain.SessionRecord, error) - ListAllSessions(ctx context.Context) ([]domain.SessionRecord, error) - // PRFactsForSession returns the PR facts that drive a session's display - // status: the most-recently-updated non-closed PR, else the most recent. - // Zero value (Exists=false) means the session has no PR. - PRFactsForSession(ctx context.Context, id domain.SessionID) (domain.PRFacts, error) -} - // PRWriter records the PR facts a PR observation carries. The pr table's own DB // triggers emit the CDC; this just writes the rows. type PRWriter interface { // WritePR persists a full PR observation — scalar facts, check runs, and the // replacement comment set — in one transaction, so the rows and the CDC // events they emit are all-or-nothing. 
- WritePR(ctx context.Context, pr domain.PRRow, checks []domain.PRCheckRow, comments []domain.PRComment) error - // RecentCheckStatuses reads the last `limit` runs of a check (the CI brake). - RecentCheckStatuses(ctx context.Context, prURL, name string, limit int) ([]string, error) -} - -// Notifier delivers an event to the human (desktop/Slack later). Push, never poll. -type Notifier interface { - Notify(ctx context.Context, event Event) error + WritePR(ctx context.Context, pr domain.PullRequest, checks []domain.PullRequestCheck, comments []domain.PullRequestComment) error } -// AgentMessenger injects a message into a running agent (busy-detecting until the -// agent is ready). Used by the auto-nudge reactions. +// AgentMessenger injects a message into a running agent. type AgentMessenger interface { Send(ctx context.Context, id domain.SessionID, message string) error } -// Priority ranks a notification's urgency so a notifier can decide how loudly -// to surface it, from PriorityUrgent down to PriorityInfo. -type Priority string - -// Notification priorities, highest urgency first. -const ( - PriorityUrgent Priority = "urgent" - PriorityAction Priority = "action" - PriorityWarning Priority = "warning" - PriorityInfo Priority = "info" -) - -// Event is a human-facing notification produced by a reaction. It carries the -// stable reaction/escalation context a durable notification renderer needs, -// while lifecycle remains responsible for deciding what should notify. -type Event struct { - Type string - Priority Priority - SessionID domain.SessionID - ProjectID domain.ProjectID - Message string - Reaction *ReactionEvent - Escalation *EscalationEvent - DedupeKey string - CauseKey string - OccurredAt time.Time -} - -// ReactionEvent is the reaction context carried on an Event: which reaction -// fired and whether it merely notified or escalated. -type ReactionEvent struct { - Key string // agent-needs-input, approved-and-green, ci-failed, etc. 
- Action string // notify | escalated -} - -// EscalationEvent is the escalation context carried on an Event once a reaction -// has exhausted its retry/attempt/duration budget. -type EscalationEvent struct { - Attempts int - Cause string // max_retries | max_attempts | max_duration - DurationMs int64 -} - -// ---- runtime / agent / workspace plugin ports (used by the Session Manager) ---- +// ---- runtime / agent / workspace plugin ports ---- -// Runtime is where a session's agent process runs — a tmux/zellij session or a -// bare process. The Session Manager creates one per session and tears it down. +// Runtime is the full runtime adapter contract: session creation/teardown plus +// liveness probing for reapers and terminal attachment. type Runtime interface { Create(ctx context.Context, cfg RuntimeConfig) (RuntimeHandle, error) Destroy(ctx context.Context, handle RuntimeHandle) error @@ -105,10 +38,10 @@ type RuntimeConfig struct { Env map[string]string } -// RuntimeHandle identifies a live runtime instance (e.g. a tmux session). +// RuntimeHandle identifies a live runtime instance. Its ID is opaque outside +// the concrete runtime adapter. type RuntimeHandle struct { - ID string - RuntimeName string + ID string } // Agent is the AI coding tool driving a session (claude-code, codex, …): it diff --git a/backend/internal/ports/pr_observations.go b/backend/internal/ports/pr_observations.go new file mode 100644 index 00000000..91eac64b --- /dev/null +++ b/backend/internal/ports/pr_observations.go @@ -0,0 +1,40 @@ +package ports + +import "github.com/aoagents/agent-orchestrator/backend/internal/domain" + +// PRObservation is what the SCM poller reports for one PR. Fetched is the +// failed-fetch guard: when false the rest is meaningless and lifecycle must not +// read it as "PR closed". Checks/Comments are observation DTOs, not persistence +// rows; the PR Manager owns mapping them into stored domain.PullRequest rows. 
+type PRObservation struct { + Fetched bool + URL string + Number int + Draft bool + Merged bool + Closed bool + CI domain.CIState + Review domain.ReviewDecision + Mergeability domain.Mergeability + Checks []PRCheckObservation + Comments []PRCommentObservation +} + +// PRCheckObservation is one SCM check result on the observed PR. +type PRCheckObservation struct { + Name string + CommitHash string + Status domain.PRCheckStatus + URL string + LogTail string +} + +// PRCommentObservation is one review comment observed on the PR. +type PRCommentObservation struct { + ID string + Author string + File string + Line int + Body string + Resolved bool +} diff --git a/backend/internal/ports/runtime_observations.go b/backend/internal/ports/runtime_observations.go new file mode 100644 index 00000000..f81ffe67 --- /dev/null +++ b/backend/internal/ports/runtime_observations.go @@ -0,0 +1,34 @@ +package ports + +import ( + "time" + + "github.com/aoagents/agent-orchestrator/backend/internal/domain" +) + +// ProbeResult is a single liveness reading. "failed" means the probe errored +// or timed out and is never treated as a death conclusion. +type ProbeResult string + +// Probe readings. Alive/Dead are conclusions; Failed is ignored by lifecycle +// because it is not a reliable death decision. +const ( + ProbeAlive ProbeResult = "alive" + ProbeDead ProbeResult = "dead" + ProbeFailed ProbeResult = "failed" +) + +// RuntimeFacts is what the reaper reports each probe of a session runtime. +type RuntimeFacts struct { + ObservedAt time.Time + Probe ProbeResult +} + +// ActivitySignal is pushed by the agent hooks. Only a Valid signal is +// authoritative; a stale/absent one is ignored rather than read as idleness. 
+type ActivitySignal struct { + Valid bool + State domain.ActivityState + Timestamp time.Time + Source domain.ActivitySource +} diff --git a/backend/internal/ports/session.go b/backend/internal/ports/session.go new file mode 100644 index 00000000..56964245 --- /dev/null +++ b/backend/internal/ports/session.go @@ -0,0 +1,15 @@ +package ports + +import "github.com/aoagents/agent-orchestrator/backend/internal/domain" + +// SpawnConfig is the request to start a new session: which project/issue, which +// agent harness, and the branch/prompt/rules the agent launches with. +type SpawnConfig struct { + ProjectID domain.ProjectID + IssueID domain.IssueID + Kind domain.SessionKind + Harness domain.AgentHarness + Branch string + Prompt string + AgentRules string +} diff --git a/backend/internal/ports/tracker.go b/backend/internal/ports/tracker.go index d9fac910..11411d92 100644 --- a/backend/internal/ports/tracker.go +++ b/backend/internal/ports/tracker.go @@ -6,8 +6,7 @@ import ( "github.com/aoagents/agent-orchestrator/backend/internal/domain" ) -// Tracker is the outbound port for issue trackers (GitHub Issues, GitLab -// Issues, Linear). v1 is read-only: +// Tracker is the outbound read-only port for issue trackers: // // - Get returns a normalized snapshot of one issue, used by spawn-bootstrap // to hydrate the agent prompt. @@ -16,13 +15,8 @@ import ( // - Preflight verifies the configured credential is actually valid against // the provider so daemons fail fast at startup, not at first request. // -// Mirroring agent lifecycle back onto the tracker (Comment, Transition) is -// deferred to issue #40. The observer / polling loop is deferred to #35. -// -// All v1 providers share this interface. Provider differences (label vs -// state machine vs close reason) are absorbed inside each adapter via -// domain.NormalizedIssueState. 
Fields on domain.Issue exist only when every -// provider can populate them; richer per-provider metadata belongs behind a +// Provider differences are absorbed inside each adapter via +// domain.NormalizedIssueState. Richer per-provider metadata belongs behind a // separate port. type Tracker interface { Get(ctx context.Context, id domain.TrackerID) (domain.Issue, error) diff --git a/backend/internal/pr/manager.go b/backend/internal/pr/manager.go new file mode 100644 index 00000000..86696ca0 --- /dev/null +++ b/backend/internal/pr/manager.go @@ -0,0 +1,67 @@ +// Package pr records SCM observations for pull requests associated with sessions. +package pr + +import ( + "context" + "time" + + "github.com/aoagents/agent-orchestrator/backend/internal/domain" + "github.com/aoagents/agent-orchestrator/backend/internal/ports" +) + +type lifecycle interface { + ApplyPRObservation(ctx context.Context, id domain.SessionID, o ports.PRObservation) error +} + +// Manager persists PR observations and forwards them to lifecycle for agent +// nudges and direct lifecycle effects. +type Manager struct { + writer ports.PRWriter + lifecycle lifecycle + clock func() time.Time +} + +// Deps are the collaborators a PR Manager needs. +type Deps struct { + Writer ports.PRWriter + Lifecycle lifecycle + Clock func() time.Time +} + +// New builds a PR Manager from its dependencies, defaulting the clock to time.Now. +func New(d Deps) *Manager { + m := &Manager{writer: d.Writer, lifecycle: d.Lifecycle, clock: d.Clock} + if m.clock == nil { + m.clock = time.Now + } + return m +} + +// ApplyObservation records a successfully fetched PR observation. Failed fetches +// are ignored because their fields are not authoritative facts. 
+func (m *Manager) ApplyObservation(ctx context.Context, id domain.SessionID, o ports.PRObservation) error { + if !o.Fetched { + return nil + } + if err := m.write(ctx, id, o); err != nil { + return err + } + if m.lifecycle == nil { + return nil + } + return m.lifecycle.ApplyPRObservation(ctx, id, o) +} + +func (m *Manager) write(ctx context.Context, id domain.SessionID, o ports.PRObservation) error { + now := m.clock() + row := domain.PullRequest{URL: o.URL, SessionID: id, Number: o.Number, Draft: o.Draft, Merged: o.Merged, Closed: o.Closed, CI: o.CI, Review: o.Review, Mergeability: o.Mergeability, UpdatedAt: now} + checks := make([]domain.PullRequestCheck, len(o.Checks)) + for i, c := range o.Checks { + checks[i] = domain.PullRequestCheck{Name: c.Name, CommitHash: c.CommitHash, Status: c.Status, URL: c.URL, LogTail: c.LogTail, CreatedAt: now} + } + comments := make([]domain.PullRequestComment, len(o.Comments)) + for i, c := range o.Comments { + comments[i] = domain.PullRequestComment{ID: c.ID, Author: c.Author, File: c.File, Line: c.Line, Body: c.Body, Resolved: c.Resolved, CreatedAt: now} + } + return m.writer.WritePR(ctx, row, checks, comments) +} diff --git a/backend/internal/pr/manager_test.go b/backend/internal/pr/manager_test.go new file mode 100644 index 00000000..92acea5b --- /dev/null +++ b/backend/internal/pr/manager_test.go @@ -0,0 +1,87 @@ +package pr + +import ( + "context" + "testing" + "time" + + "github.com/aoagents/agent-orchestrator/backend/internal/domain" + "github.com/aoagents/agent-orchestrator/backend/internal/ports" +) + +type fakeWriter struct { + pr map[domain.SessionID]domain.PullRequest + comments map[string][]domain.PullRequestComment + checks []domain.PullRequestCheck +} + +func (f *fakeWriter) WritePR(_ context.Context, pr domain.PullRequest, checks []domain.PullRequestCheck, comments []domain.PullRequestComment) error { + f.pr[pr.SessionID] = pr + f.checks = append(f.checks, checks...) 
+ f.comments[pr.URL] = comments + return nil +} + +type fakeLifecycle struct { + observed []ports.PRObservation +} + +func (f *fakeLifecycle) ApplyPRObservation(_ context.Context, _ domain.SessionID, o ports.PRObservation) error { + f.observed = append(f.observed, o) + return nil +} + +func newPRManager() (*Manager, *fakeWriter, *fakeLifecycle) { + fw := &fakeWriter{pr: map[domain.SessionID]domain.PullRequest{}, comments: map[string][]domain.PullRequestComment{}} + fl := &fakeLifecycle{} + m := New(Deps{ + Writer: fw, + Lifecycle: fl, + Clock: func() time.Time { return time.Unix(1, 0).UTC() }, + }) + return m, fw, fl +} + +func TestApplyObservation_WritesPRChecksAndComments(t *testing.T) { + m, fw, fl := newPRManager() + o := ports.PRObservation{ + Fetched: true, URL: "https://example/pr/1", Number: 1, CI: domain.CIFailing, + Checks: []ports.PRCheckObservation{{Name: "build", CommitHash: "c1", Status: domain.PRCheckFailed, LogTail: "boom"}}, + Comments: []ports.PRCommentObservation{{ID: "1", Author: "greptileai", Body: "use a constant here"}}, + } + if err := m.ApplyObservation(context.Background(), "mer-1", o); err != nil { + t.Fatal(err) + } + if got := fw.pr["mer-1"]; got.URL != o.URL || got.CI != domain.CIFailing { + t.Fatalf("pr not written: %+v", got) + } + if len(fw.checks) != 1 || fw.checks[0].CreatedAt.IsZero() { + t.Fatalf("checks not normalized: %+v", fw.checks) + } + if len(fw.comments[o.URL]) != 1 || fw.comments[o.URL][0].CreatedAt.IsZero() { + t.Fatalf("comments not normalized: %+v", fw.comments) + } + if len(fl.observed) != 1 || fl.observed[0].URL != o.URL { + t.Fatalf("PR observation should be forwarded to lifecycle, got %v", fl.observed) + } +} + +func TestApplyObservation_MergedForwardsToLifecycle(t *testing.T) { + m, _, fl := newPRManager() + if err := m.ApplyObservation(context.Background(), "mer-1", ports.PRObservation{Fetched: true, URL: "pr1", Number: 1, Merged: true}); err != nil { + t.Fatal(err) + } + if len(fl.observed) != 1 || 
!fl.observed[0].Merged { + t.Fatalf("merged PR should be forwarded to lifecycle, got %v", fl.observed) + } +} + +func TestApplyObservation_FailedFetchIsDropped(t *testing.T) { + m, fw, fl := newPRManager() + if err := m.ApplyObservation(context.Background(), "mer-1", ports.PRObservation{Fetched: false, URL: "pr1", CI: domain.CIFailing}); err != nil { + t.Fatal(err) + } + if len(fw.pr) != 0 || len(fl.observed) != 0 { + t.Fatalf("failed fetch must write nothing, pr=%v observed=%v", fw.pr, fl.observed) + } +} diff --git a/backend/internal/processalive/process_unix.go b/backend/internal/processalive/process_unix.go new file mode 100644 index 00000000..bf9349ad --- /dev/null +++ b/backend/internal/processalive/process_unix.go @@ -0,0 +1,20 @@ +//go:build !windows + +// Package processalive probes whether an operating-system process id still +// maps to a live process. +package processalive + +import ( + "errors" + "syscall" +) + +// Alive reports whether pid exists. EPERM counts as alive: the process exists +// even if the current user cannot signal it. +func Alive(pid int) bool { + if pid <= 0 { + return false + } + err := syscall.Kill(pid, 0) + return err == nil || errors.Is(err, syscall.EPERM) +} diff --git a/backend/internal/processalive/process_windows.go b/backend/internal/processalive/process_windows.go new file mode 100644 index 00000000..225726bf --- /dev/null +++ b/backend/internal/processalive/process_windows.go @@ -0,0 +1,30 @@ +//go:build windows + +// Package processalive probes whether an operating-system process id still +// maps to a live process. +package processalive + +import ( + "errors" + + "golang.org/x/sys/windows" +) + +// Alive reports whether pid exists. Access denied counts as alive: the process +// exists even if the current user cannot wait on it. 
+func Alive(pid int) bool { + if pid <= 0 { + return false + } + handle, err := windows.OpenProcess(windows.SYNCHRONIZE, false, uint32(pid)) + if err != nil { + return errors.Is(err, windows.ERROR_ACCESS_DENIED) + } + defer windows.CloseHandle(handle) + + status, err := windows.WaitForSingleObject(handle, 0) + if err != nil { + return false + } + return status == uint32(windows.WAIT_TIMEOUT) +} diff --git a/backend/internal/project/dto.go b/backend/internal/project/dto.go index 0e6f5ee5..7146d455 100644 --- a/backend/internal/project/dto.go +++ b/backend/internal/project/dto.go @@ -30,11 +30,9 @@ type AddInput struct { // behaviour fields are mutable; identity fields (projectId, path, repo, // defaultBranch) are rejected by the handler with a 400 IDENTITY_FROZEN. type UpdateConfigInput struct { - Agent *string `json:"agent,omitempty"` - Runtime *string `json:"runtime,omitempty"` - Tracker *TrackerConfig `json:"tracker,omitempty"` - SCM *SCMConfig `json:"scm,omitempty"` - Reactions *map[string]*ReactionConfig `json:"reactions,omitempty"` + Agent *string `json:"agent,omitempty"` + Tracker *TrackerConfig `json:"tracker,omitempty"` + SCM *SCMConfig `json:"scm,omitempty"` } // RemoveResult reports what DELETE /api/v1/projects/{id} actually did. diff --git a/backend/internal/project/memory_store.go b/backend/internal/project/memory_store.go index e947136c..c9f91a2a 100644 --- a/backend/internal/project/memory_store.go +++ b/backend/internal/project/memory_store.go @@ -18,8 +18,8 @@ type Row struct { ArchivedAt time.Time } -// Store is the project persistence the manager depends on; both the sqlite -// store and MemoryStore satisfy it. +// Store is the project persistence the manager depends on. MemoryStore is the +// current in-process implementation; the sqlite adapter uses the same row shape. 
type Store interface { List(ctx context.Context) ([]Row, error) Get(ctx context.Context, id string) (Row, bool, error) diff --git a/backend/internal/project/project.go b/backend/internal/project/project.go index a997519d..14bf731a 100644 --- a/backend/internal/project/project.go +++ b/backend/internal/project/project.go @@ -2,12 +2,9 @@ // the HTTP layer calls and the request/response DTOs that cross it (dto.go). // // This is the pilot for the feature-package layout the backend is migrating -// toward: a resource's interface and DTOs live with the resource, not in a -// central catch-all. Controllers depend on project.Manager and nothing -// beneath it — whether the implementation reaches into the config registry, -// the lifecycle manager (to stop sessions on remove), or a workspace adapter -// (to destroy worktrees) is a private concern of the impl, which lands in a -// later handler-impl PR. This PR defines only the contract. +// toward: a resource's interface, implementation, and DTOs live with the +// resource, not in a central catch-all. Controllers depend on project.Manager +// and nothing beneath it. package project import ( @@ -17,7 +14,7 @@ import ( ) // Manager is the inbound contract for the /api/v1/projects surface. One -// implementation (this package, later); the HTTP controller is the consumer. +// implementation lives in this package; the HTTP controller is the consumer. type Manager interface { // List returns every registered project, including degraded entries // (those whose config failed to load but whose registry entry survives). diff --git a/backend/internal/project/types.go b/backend/internal/project/types.go index 65e5daa2..9e1e8b94 100644 --- a/backend/internal/project/types.go +++ b/backend/internal/project/types.go @@ -10,11 +10,9 @@ import "github.com/aoagents/agent-orchestrator/backend/internal/domain" // transport DTOs (dto.go) together is the feature-package layout the backend // is migrating toward. 
-// Summary is the row shape returned by GET /api/v1/projects. It mirrors the TS -// ProjectInfo (packages/web/src/lib/project-name.ts) so the existing dashboard -// list view reads the Go daemon's response unchanged. ResolveError is set only -// for degraded projects (registry entry survives but config failed to load), -// so the list shows them with a warning instead of dropping them silently. +// Summary is the row shape returned by GET /api/v1/projects. ResolveError is +// set only for degraded projects, so the list can show them with a warning +// instead of dropping them silently. type Summary struct { ID domain.ProjectID `json:"id"` Name string `json:"name"` @@ -26,16 +24,14 @@ type Summary struct { // project resolves cleanly. It joins the registry identity fields with the // project's behaviour config. type Project struct { - ID domain.ProjectID `json:"id"` - Name string `json:"name"` - Path string `json:"path"` - Repo string `json:"repo"` // "owner/name" or "" - DefaultBranch string `json:"defaultBranch"` - Agent string `json:"agent,omitempty"` - Runtime string `json:"runtime,omitempty"` - Tracker *TrackerConfig `json:"tracker,omitempty"` - SCM *SCMConfig `json:"scm,omitempty"` - Reactions map[string]*ReactionConfig `json:"reactions,omitempty"` + ID domain.ProjectID `json:"id"` + Name string `json:"name"` + Path string `json:"path"` + Repo string `json:"repo"` // "owner/name" or "" + DefaultBranch string `json:"defaultBranch"` + Agent string `json:"agent,omitempty"` + Tracker *TrackerConfig `json:"tracker,omitempty"` + SCM *SCMConfig `json:"scm,omitempty"` } // Degraded is returned in place of Project when the project's config failed to @@ -49,11 +45,9 @@ type Degraded struct { ResolveError string `json:"resolveError"` } -// Behaviour-config shapes ported from the TS Zod schemas (packages/core/src/ -// config.ts). 
Only the fields the projects API actually exposes are modelled; -// the passthrough/unknown-key round-trip the legacy schemas allowed lands with -// the handler implementation (and the SQLite persistence work), not in this -// interface-only PR. +// Behaviour-config shapes exposed by the projects API. Runtime selection and +// reaction rules are intentionally absent: the daemon has one runtime adapter and +// lifecycle owns agent nudges. // TrackerConfig mirrors TrackerConfigSchema. type TrackerConfig struct { @@ -80,17 +74,3 @@ type SCMWebhookConfig struct { DeliveryHeader string `json:"deliveryHeader,omitempty"` MaxBodyBytes int `json:"maxBodyBytes,omitempty"` } - -// ReactionConfig mirrors ReactionConfigSchema. EscalateAfter is either ms -// (number) or a duration string ("30m") in the TS schema, so it stays open as -// `any` until handler validation lands. -type ReactionConfig struct { - Auto *bool `json:"auto,omitempty"` - Action string `json:"action,omitempty"` // send-to-agent | notify | auto-merge - Message string `json:"message,omitempty"` - Priority string `json:"priority,omitempty"` // urgent | action | warning | info - Retries *int `json:"retries,omitempty"` - EscalateAfter any `json:"escalateAfter,omitempty"` - Threshold string `json:"threshold,omitempty"` - IncludeSummary *bool `json:"includeSummary,omitempty"` -} diff --git a/backend/internal/runfile/process_unix.go b/backend/internal/runfile/process_unix.go deleted file mode 100644 index efe957e1..00000000 --- a/backend/internal/runfile/process_unix.go +++ /dev/null @@ -1,24 +0,0 @@ -//go:build unix - -package runfile - -import ( - "errors" - "os" - "syscall" -) - -// processAlive probes existence with signal 0: kill(pid, 0) returns nil if the -// process exists and we can signal it, EPERM if it exists but is owned by -// another user, and ESRCH (or any other error from FindProcess) if it is gone. 
-func processAlive(pid int) bool { - proc, err := os.FindProcess(pid) - if err != nil { - return false - } - err = proc.Signal(syscall.Signal(0)) - if err == nil { - return true - } - return errors.Is(err, syscall.EPERM) -} diff --git a/backend/internal/runfile/process_windows.go b/backend/internal/runfile/process_windows.go deleted file mode 100644 index 1f8e78fe..00000000 --- a/backend/internal/runfile/process_windows.go +++ /dev/null @@ -1,21 +0,0 @@ -//go:build windows - -package runfile - -import ( - "syscall" -) - -// processAlive opens the process with the minimum-rights query flag. On -// Windows, OpenProcess returns ERROR_INVALID_PARAMETER for a PID that no -// longer maps to a live process, and a usable handle when one is. We close -// the handle immediately; the only thing we needed was the open's outcome. -func processAlive(pid int) bool { - const PROCESS_QUERY_LIMITED_INFORMATION = 0x1000 - h, err := syscall.OpenProcess(PROCESS_QUERY_LIMITED_INFORMATION, false, uint32(pid)) - if err != nil { - return false - } - _ = syscall.CloseHandle(h) - return true -} diff --git a/backend/internal/runfile/runfile.go b/backend/internal/runfile/runfile.go index 3db84590..92718d34 100644 --- a/backend/internal/runfile/runfile.go +++ b/backend/internal/runfile/runfile.go @@ -12,6 +12,8 @@ import ( "os" "path/filepath" "time" + + "github.com/aoagents/agent-orchestrator/backend/internal/processalive" ) // Info is the on-disk handshake payload. @@ -86,6 +88,20 @@ func Remove(path string) error { return nil } +// RemoveIfOwned deletes running.json only if it still belongs to ownerPID. This +// prevents a shutting-down daemon from removing a successor's freshly written +// handshake after an overlapping restart. 
+func RemoveIfOwned(path string, ownerPID int) error { + info, err := Read(path) + if err != nil { + return err + } + if info == nil || info.PID != ownerPID { + return nil + } + return Remove(path) +} + // CheckStale inspects an existing run-file before the new daemon binds. It // returns: // @@ -104,7 +120,7 @@ func CheckStale(path string) (*Info, error) { if info == nil || info.PID <= 0 { return nil, nil } - if processAlive(info.PID) { + if processalive.Alive(info.PID) { return info, nil } return nil, nil diff --git a/backend/internal/runfile/runfile_test.go b/backend/internal/runfile/runfile_test.go index fbdf74e0..6a926874 100644 --- a/backend/internal/runfile/runfile_test.go +++ b/backend/internal/runfile/runfile_test.go @@ -75,6 +75,32 @@ func TestRemoveIdempotent(t *testing.T) { } } +func TestRemoveIfOwnedDoesNotDeleteSuccessorRunfile(t *testing.T) { + path := filepath.Join(t.TempDir(), "running.json") + if err := Write(path, Info{PID: 1, Port: 3001}); err != nil { + t.Fatalf("Write predecessor: %v", err) + } + if err := Write(path, Info{PID: 2, Port: 3002}); err != nil { + t.Fatalf("Write successor: %v", err) + } + if err := RemoveIfOwned(path, 1); err != nil { + t.Fatalf("RemoveIfOwned predecessor: %v", err) + } + got, err := Read(path) + if err != nil { + t.Fatalf("Read: %v", err) + } + if got == nil || got.PID != 2 || got.Port != 3002 { + t.Fatalf("successor runfile was removed or changed: %+v", got) + } + if err := RemoveIfOwned(path, 2); err != nil { + t.Fatalf("RemoveIfOwned successor: %v", err) + } + if got, err := Read(path); err != nil || got != nil { + t.Fatalf("after owner removal got=%+v err=%v", got, err) + } +} + func TestCheckStaleDeadPID(t *testing.T) { path := filepath.Join(t.TempDir(), "running.json") // PID 0x7FFFFFFF is effectively guaranteed not to exist. 
diff --git a/backend/internal/session/manager.go b/backend/internal/session/manager.go index 37b1de81..82576dad 100644 --- a/backend/internal/session/manager.go +++ b/backend/internal/session/manager.go @@ -1,7 +1,6 @@ -// Package session implements ports.SessionManager: the explicit-mutation half of -// the lane. It drives the runtime/agent/workspace plugins to create and tear -// down sessions, routes canonical writes to the LCM, and is the single producer -// of the derived display status (attached on read in List/Get). +// Package session drives the runtime/agent/workspace plugins to create and tear +// down sessions, routes durable lifecycle fact writes through lifecycle, and +// attaches derived display status on read. package session import ( @@ -28,27 +27,43 @@ const ( EnvIssueID = "AO_ISSUE_ID" ) -// Manager implements ports.SessionManager over the outbound ports. +type lifecycleRecorder interface { + MarkSpawned(ctx context.Context, id domain.SessionID, metadata domain.SessionMetadata) error + MarkTerminated(ctx context.Context, id domain.SessionID) error +} + +type runtimeController interface { + Create(ctx context.Context, cfg ports.RuntimeConfig) (ports.RuntimeHandle, error) + Destroy(ctx context.Context, handle ports.RuntimeHandle) error +} + +type sessionStore interface { + CreateSession(ctx context.Context, rec domain.SessionRecord) (domain.SessionRecord, error) + GetSession(ctx context.Context, id domain.SessionID) (domain.SessionRecord, bool, error) + ListSessions(ctx context.Context, project domain.ProjectID) ([]domain.SessionRecord, error) + GetDisplayPRFactsForSession(ctx context.Context, id domain.SessionID) (domain.PRFacts, bool, error) +} + +// Manager coordinates session spawn, restore, kill, listing, and cleanup over +// the outbound ports. 
type Manager struct { - runtime ports.Runtime + runtime runtimeController agent ports.Agent workspace ports.Workspace - store ports.SessionStore + store sessionStore messenger ports.AgentMessenger - lcm ports.LifecycleManager + lcm lifecycleRecorder clock func() time.Time } -var _ ports.SessionManager = (*Manager)(nil) - // Deps are the collaborators a Session Manager needs; New wires them together. type Deps struct { - Runtime ports.Runtime + Runtime runtimeController Agent ports.Agent Workspace ports.Workspace - Store ports.SessionStore + Store sessionStore Messenger ports.AgentMessenger - Lifecycle ports.LifecycleManager + Lifecycle lifecycleRecorder Clock func() time.Time } @@ -72,7 +87,7 @@ func New(d Deps) *Manager { // Spawn creates the session row (which assigns the "{project}-{n}" id), then the // workspace and runtime, then reports completion to the LCM. A failure after the -// row exists routes it to a terminal errored state and rolls back what was built. +// row exists parks it as terminated and rolls back what was built. 
func (m *Manager) Spawn(ctx context.Context, cfg ports.SpawnConfig) (domain.Session, error) { rec, err := m.store.CreateSession(ctx, seedRecord(cfg, m.clock())) if err != nil { @@ -82,7 +97,7 @@ func (m *Manager) Spawn(ctx context.Context, cfg ports.SpawnConfig) (domain.Sess ws, err := m.workspace.Create(ctx, ports.WorkspaceConfig{ProjectID: cfg.ProjectID, SessionID: id, Branch: cfg.Branch}) if err != nil { - m.markErrored(ctx, id) + m.markSpawnFailedTerminated(ctx, id) return domain.Session{}, fmt.Errorf("spawn %s: workspace: %w", id, err) } @@ -95,30 +110,30 @@ func (m *Manager) Spawn(ctx context.Context, cfg ports.SpawnConfig) (domain.Sess }) if err != nil { _ = m.workspace.Destroy(ctx, ws) - m.markErrored(ctx, id) + m.markSpawnFailedTerminated(ctx, id) return domain.Session{}, fmt.Errorf("spawn %s: runtime: %w", id, err) } - outcome := ports.SpawnOutcome{Branch: ws.Branch, WorkspacePath: ws.Path, RuntimeHandle: handle, Prompt: agentCfg.Prompt} - if err := m.lcm.OnSpawnCompleted(ctx, id, outcome); err != nil { + metadata := domain.SessionMetadata{Branch: ws.Branch, WorkspacePath: ws.Path, RuntimeHandleID: handle.ID, Prompt: agentCfg.Prompt} + if err := m.lcm.MarkSpawned(ctx, id, metadata); err != nil { _ = m.runtime.Destroy(ctx, handle) _ = m.workspace.Destroy(ctx, ws) - m.markErrored(ctx, id) + m.markSpawnFailedTerminated(ctx, id) return domain.Session{}, fmt.Errorf("spawn %s: completed: %w", id, err) } return m.Get(ctx, id) } -// markErrored best-effort parks an orphaned spawn in a terminal errored state -// (the store has no delete; a phantom "spawning" row is worse than a terminal one). -func (m *Manager) markErrored(ctx context.Context, id domain.SessionID) { - _ = m.lcm.OnKillRequested(ctx, id, domain.TermErrorInProcess) +// markSpawnFailedTerminated best-effort parks an orphaned spawn as terminated. +// The store has no delete; a phantom half-spawned row is worse than a terminal one. 
+func (m *Manager) markSpawnFailedTerminated(ctx context.Context, id domain.SessionID) { + _ = m.lcm.MarkTerminated(ctx, id) } // Kill records terminal intent with the LCM, then tears down the runtime and // workspace. A workspace teardown refused by the worktree-remove safety // (uncommitted work) surfaces as an error with freed=false and is never forced. -func (m *Manager) Kill(ctx context.Context, id domain.SessionID, reason domain.TerminationReason) (bool, error) { +func (m *Manager) Kill(ctx context.Context, id domain.SessionID) (bool, error) { rec, ok, err := m.store.GetSession(ctx, id) if err != nil { return false, fmt.Errorf("kill %s: %w", id, err) @@ -131,7 +146,7 @@ func (m *Manager) Kill(ctx context.Context, id domain.SessionID, reason domain.T if handle.ID == "" || ws.Path == "" { return false, fmt.Errorf("kill %s: %w", id, ErrIncompleteHandle) } - if err := m.lcm.OnKillRequested(ctx, id, reason); err != nil { + if err := m.lcm.MarkTerminated(ctx, id); err != nil { return false, fmt.Errorf("kill %s: %w", id, err) } if err := m.runtime.Destroy(ctx, handle); err != nil { @@ -144,7 +159,7 @@ func (m *Manager) Kill(ctx context.Context, id domain.SessionID, reason domain.T } // Restore relaunches a torn-down session in its workspace. The fallible I/O runs -// before any canonical write, so a failure never resurrects the row or destroys +// before any durable session write, so a failure never resurrects the row or destroys // the worktree (it may hold the agent's prior work). 
func (m *Manager) Restore(ctx context.Context, id domain.SessionID) (domain.Session, error) { rec, ok, err := m.store.GetSession(ctx, id) @@ -154,7 +169,7 @@ func (m *Manager) Restore(ctx context.Context, id domain.SessionID) (domain.Sess if !ok { return domain.Session{}, fmt.Errorf("restore %s: %w", id, ErrNotFound) } - if !isTerminal(rec.Lifecycle.Session.State) { + if !rec.IsTerminated { return domain.Session{}, fmt.Errorf("restore %s: %w", id, ErrNotRestorable) } meta := rec.Metadata @@ -180,8 +195,8 @@ func (m *Manager) Restore(ctx context.Context, id domain.SessionID) (domain.Sess if err != nil { return domain.Session{}, fmt.Errorf("restore %s: runtime: %w", id, err) } - outcome := ports.SpawnOutcome{Branch: ws.Branch, WorkspacePath: ws.Path, RuntimeHandle: handle, AgentSessionID: meta.AgentSessionID, Prompt: meta.Prompt} - if err := m.lcm.OnSpawnCompleted(ctx, id, outcome); err != nil { + metadata := domain.SessionMetadata{Branch: ws.Branch, WorkspacePath: ws.Path, RuntimeHandleID: handle.ID, AgentSessionID: meta.AgentSessionID, Prompt: meta.Prompt} + if err := m.lcm.MarkSpawned(ctx, id, metadata); err != nil { _ = m.runtime.Destroy(ctx, handle) return domain.Session{}, fmt.Errorf("restore %s: completed: %w", id, err) } @@ -234,7 +249,7 @@ func (m *Manager) Cleanup(ctx context.Context, project domain.ProjectID) ([]doma } var cleaned []domain.SessionID for _, rec := range recs { - if !isTerminal(rec.Lifecycle.Session.State) { + if !rec.IsTerminated { continue } ws := workspaceInfo(rec) @@ -255,15 +270,14 @@ func (m *Manager) Cleanup(ctx context.Context, project domain.ProjectID) ([]doma // ---- helpers ---- func (m *Manager) toSession(ctx context.Context, rec domain.SessionRecord) (domain.Session, error) { - pr, err := m.store.PRFactsForSession(ctx, rec.ID) + pr, ok, err := m.store.GetDisplayPRFactsForSession(ctx, rec.ID) if err != nil { return domain.Session{}, fmt.Errorf("pr facts %s: %w", rec.ID, err) } - return domain.Session{SessionRecord: rec, Status: 
domain.DeriveStatus(rec.Lifecycle, pr)}, nil -} - -func isTerminal(s domain.SessionState) bool { - return s == domain.SessionDone || s == domain.SessionTerminated + if !ok { + return domain.Session{SessionRecord: rec, Status: domain.DeriveStatus(rec, nil)}, nil + } + return domain.Session{SessionRecord: rec, Status: domain.DeriveStatus(rec, &pr)}, nil } func seedRecord(cfg ports.SpawnConfig, now time.Time) domain.SessionRecord { @@ -273,11 +287,8 @@ func seedRecord(cfg ports.SpawnConfig, now time.Time) domain.SessionRecord { Kind: cfg.Kind, CreatedAt: now, UpdatedAt: now, - Lifecycle: domain.CanonicalSessionLifecycle{ - Version: domain.LifecycleVersion, - Session: domain.SessionSubstate{State: domain.SessionNotStarted}, - Harness: cfg.Harness, - }, + Harness: cfg.Harness, + Activity: domain.ActivitySubstate{State: domain.ActivityIdle, LastActivityAt: now, Source: domain.SourceNone}, } } @@ -306,7 +317,7 @@ func spawnEnv(base map[string]string, id domain.SessionID, project domain.Projec } func runtimeHandle(meta domain.SessionMetadata) ports.RuntimeHandle { - return ports.RuntimeHandle{ID: meta.RuntimeHandleID, RuntimeName: meta.RuntimeName} + return ports.RuntimeHandle{ID: meta.RuntimeHandleID} } func workspaceInfo(rec domain.SessionRecord) ports.WorkspaceInfo { diff --git a/backend/internal/session/manager_test.go b/backend/internal/session/manager_test.go index 669e0c25..228fac89 100644 --- a/backend/internal/session/manager_test.go +++ b/backend/internal/session/manager_test.go @@ -13,8 +13,6 @@ import ( var ctx = context.Background() -// ---- fakes ---- - type fakeStore struct { sessions map[domain.SessionID]domain.SessionRecord pr map[domain.SessionID]domain.PRFacts @@ -24,7 +22,6 @@ type fakeStore struct { func newFakeStore() *fakeStore { return &fakeStore{sessions: map[domain.SessionID]domain.SessionRecord{}, pr: map[domain.SessionID]domain.PRFacts{}} } - func (f *fakeStore) CreateSession(_ context.Context, rec domain.SessionRecord) (domain.SessionRecord, 
error) { f.num++ rec.ID = domain.SessionID(fmt.Sprintf("%s-%d", rec.ProjectID, f.num)) @@ -48,59 +45,41 @@ func (f *fakeStore) ListSessions(_ context.Context, p domain.ProjectID) ([]domai } return out, nil } -func (f *fakeStore) ListAllSessions(_ context.Context) ([]domain.SessionRecord, error) { - out := make([]domain.SessionRecord, 0, len(f.sessions)) +func (f *fakeStore) ListAllSessions(context.Context) ([]domain.SessionRecord, error) { + var out []domain.SessionRecord for _, r := range f.sessions { out = append(out, r) } return out, nil } -func (f *fakeStore) PRFactsForSession(_ context.Context, id domain.SessionID) (domain.PRFacts, error) { - return f.pr[id], nil +func (f *fakeStore) GetDisplayPRFactsForSession(_ context.Context, id domain.SessionID) (domain.PRFacts, bool, error) { + if pr := f.pr[id]; pr.URL != "" { + return pr, true, nil + } + return domain.PRFacts{}, false, nil } -// fakeLCM is the minimal lifecycle the Session Manager drives: it persists the -// spawn/kill canonical writes into the store so Get reflects them. 
type fakeLCM struct { store *fakeStore completed int } -func (l *fakeLCM) OnSpawnCompleted(_ context.Context, id domain.SessionID, o ports.SpawnOutcome) error { +func (l *fakeLCM) MarkSpawned(_ context.Context, id domain.SessionID, metadata domain.SessionMetadata) error { l.completed++ rec := l.store.sessions[id] - rec.Lifecycle.Session.State = domain.SessionNotStarted - rec.Lifecycle.IsAlive = true - rec.Lifecycle.TerminationReason = domain.TermNone - rec.Metadata = domain.SessionMetadata{ - Branch: o.Branch, WorkspacePath: o.WorkspacePath, - RuntimeHandleID: o.RuntimeHandle.ID, RuntimeName: o.RuntimeHandle.RuntimeName, - AgentSessionID: o.AgentSessionID, Prompt: o.Prompt, - } + rec.IsTerminated = false + rec.Activity = domain.ActivitySubstate{State: domain.ActivityIdle, LastActivityAt: time.Now(), Source: domain.SourceRuntime} + rec.Metadata = metadata l.store.sessions[id] = rec return nil } -func (l *fakeLCM) OnKillRequested(_ context.Context, id domain.SessionID, reason domain.TerminationReason) error { +func (l *fakeLCM) MarkTerminated(_ context.Context, id domain.SessionID) error { rec := l.store.sessions[id] - rec.Lifecycle.Session.State = domain.SessionTerminated - rec.Lifecycle.TerminationReason = reason - rec.Lifecycle.IsAlive = false + rec.IsTerminated = true + rec.Activity = domain.ActivitySubstate{State: domain.ActivityExited, LastActivityAt: time.Now(), Source: domain.SourceRuntime} l.store.sessions[id] = rec return nil } -func (l *fakeLCM) ApplyRuntimeObservation(context.Context, domain.SessionID, ports.RuntimeFacts) error { - return nil -} -func (l *fakeLCM) ApplyActivitySignal(context.Context, domain.SessionID, ports.ActivitySignal) error { - return nil -} -func (l *fakeLCM) ApplyPRObservation(context.Context, domain.SessionID, ports.PRObservation) error { - return nil -} -func (l *fakeLCM) TickEscalations(context.Context, time.Time) error { return nil } -func (l *fakeLCM) RunningSessions(context.Context) ([]domain.SessionRecord, error) { - return 
nil, nil -} type fakeRuntime struct { createErr error @@ -112,12 +91,9 @@ func (r *fakeRuntime) Create(context.Context, ports.RuntimeConfig) (ports.Runtim return ports.RuntimeHandle{}, r.createErr } r.created++ - return ports.RuntimeHandle{ID: "h1", RuntimeName: "tmux"}, nil + return ports.RuntimeHandle{ID: "h1"}, nil } func (r *fakeRuntime) Destroy(context.Context, ports.RuntimeHandle) error { r.destroyed++; return nil } -func (r *fakeRuntime) IsAlive(context.Context, ports.RuntimeHandle) (bool, error) { - return true, nil -} type fakeAgent struct{} @@ -154,143 +130,111 @@ func newManager() (*Manager, *fakeStore, *fakeRuntime, *fakeWorkspace) { st := newFakeStore() rt := &fakeRuntime{} ws := &fakeWorkspace{} - m := New(Deps{ - Runtime: rt, Agent: fakeAgent{}, Workspace: ws, - Store: st, Messenger: &fakeMessenger{}, Lifecycle: &fakeLCM{store: st}, - }) + m := New(Deps{Runtime: rt, Agent: fakeAgent{}, Workspace: ws, Store: st, Messenger: &fakeMessenger{}, Lifecycle: &fakeLCM{store: st}}) return m, st, rt, ws } - func seedTerminal(st *fakeStore, id domain.SessionID, meta domain.SessionMetadata) { - st.sessions[id] = domain.SessionRecord{ - ID: id, ProjectID: "mer", Metadata: meta, - Lifecycle: domain.CanonicalSessionLifecycle{Session: domain.SessionSubstate{State: domain.SessionTerminated}}, - } + st.sessions[id] = domain.SessionRecord{ID: id, ProjectID: "mer", Metadata: meta, IsTerminated: true, Activity: domain.ActivitySubstate{State: domain.ActivityExited}} +} +func mkLive(id domain.SessionID) domain.SessionRecord { + return domain.SessionRecord{ID: id, ProjectID: "mer", Metadata: domain.SessionMetadata{WorkspacePath: "/ws/" + string(id), RuntimeHandleID: "h1"}, Activity: domain.ActivitySubstate{State: domain.ActivityActive}} } -// ---- tests ---- - -func TestSpawn_AssignsIDAndGoesLive(t *testing.T) { +func TestSpawn_AssignsIDAndGoesIdle(t *testing.T) { m, st, rt, _ := newManager() - s, err := m.Spawn(ctx, ports.SpawnConfig{ProjectID: "mer", Kind: 
domain.KindWorker, Prompt: "do it"}) if err != nil { t.Fatal(err) } if s.ID != "mer-1" { - t.Fatalf("store should assign mer-1, got %q", s.ID) + t.Fatalf("got %q", s.ID) } - if s.Status != domain.StatusSpawning { - t.Fatalf("fresh session displays spawning, got %q", s.Status) + if s.Status != domain.StatusIdle { + t.Fatalf("fresh session displays idle, got %q", s.Status) } if rt.created != 1 { - t.Fatalf("runtime not created") + t.Fatal("runtime not created") } if st.sessions["mer-1"].Metadata.RuntimeHandleID != "h1" { - t.Fatal("spawn handle not folded into the row") + t.Fatal("handle not folded") } } - func TestSpawn_RollsBackOnRuntimeFailure(t *testing.T) { m, st, _, ws := newManager() m.runtime = &fakeRuntime{createErr: errors.New("boom")} - if _, err := m.Spawn(ctx, ports.SpawnConfig{ProjectID: "mer"}); err == nil { - t.Fatal("expected spawn to fail") + t.Fatal("expected failure") } if ws.destroyed != 1 { - t.Fatal("workspace should be rolled back") + t.Fatal("workspace should roll back") } - if st.sessions["mer-1"].Lifecycle.Session.State != domain.SessionTerminated { - t.Fatal("orphaned spawn should be parked terminal") + if !st.sessions["mer-1"].IsTerminated { + t.Fatal("orphaned spawn should be terminated") } } - func TestKill_TearsDownRuntimeAndWorkspace(t *testing.T) { m, st, rt, ws := newManager() st.sessions["mer-1"] = mkLive("mer-1") - - freed, err := m.Kill(ctx, "mer-1", domain.TermManuallyKilled) + freed, err := m.Kill(ctx, "mer-1") if err != nil || !freed { - t.Fatalf("kill should free the workspace: freed=%v err=%v", freed, err) + t.Fatalf("freed=%v err=%v", freed, err) } if rt.destroyed != 1 || ws.destroyed != 1 { t.Fatal("kill should destroy runtime and workspace") } } - func TestKill_RefusesIncompleteHandle(t *testing.T) { m, st, _, _ := newManager() - st.sessions["mer-1"] = domain.SessionRecord{ // live, but no teardown handles - ID: "mer-1", ProjectID: "mer", - Lifecycle: domain.CanonicalSessionLifecycle{Session: domain.SessionSubstate{State: 
domain.SessionWorking}, IsAlive: true}, - } - - if _, err := m.Kill(ctx, "mer-1", domain.TermManuallyKilled); !errors.Is(err, ErrIncompleteHandle) { + st.sessions["mer-1"] = domain.SessionRecord{ID: "mer-1", ProjectID: "mer", Activity: domain.ActivitySubstate{State: domain.ActivityActive}} + if _, err := m.Kill(ctx, "mer-1"); !errors.Is(err, ErrIncompleteHandle) { t.Fatalf("want ErrIncompleteHandle, got %v", err) } } - func TestRestore_ReopensTerminal(t *testing.T) { m, st, rt, _ := newManager() seedTerminal(st, "mer-1", domain.SessionMetadata{WorkspacePath: "/ws/mer-1", Branch: "b", AgentSessionID: "agent-x"}) - s, err := m.Restore(ctx, "mer-1") if err != nil { t.Fatal(err) } - if s.Status != domain.StatusSpawning { - t.Fatalf("restored session displays spawning, got %q", s.Status) + if s.Status != domain.StatusIdle { + t.Fatalf("restored displays idle, got %q", s.Status) } if rt.created != 1 { - t.Fatal("restore should relaunch the runtime") + t.Fatal("restore should relaunch") } } - func TestRestore_RefusesLiveSession(t *testing.T) { m, st, _, _ := newManager() st.sessions["mer-1"] = mkLive("mer-1") - if _, err := m.Restore(ctx, "mer-1"); !errors.Is(err, ErrNotRestorable) { t.Fatalf("want ErrNotRestorable, got %v", err) } } - func TestList_DerivesStatusFromPRFacts(t *testing.T) { m, st, _, _ := newManager() st.sessions["mer-1"] = mkLive("mer-1") - st.pr["mer-1"] = domain.PRFacts{Exists: true, CI: domain.CIFailing} - + st.pr["mer-1"] = domain.PRFacts{URL: "pr1", CI: domain.CIFailing} list, err := m.List(ctx, "mer") if err != nil { t.Fatal(err) } if len(list) != 1 || list[0].Status != domain.StatusCIFailed { - t.Fatalf("status should reflect PR facts, got %+v", list) + t.Fatalf("got %+v", list) } } - func TestCleanup_ReclaimsTerminalWorkspaces(t *testing.T) { m, st, _, ws := newManager() seedTerminal(st, "mer-1", domain.SessionMetadata{WorkspacePath: "/ws/mer-1"}) - st.sessions["mer-2"] = mkLive("mer-2") // live: must be skipped - + st.sessions["mer-2"] = 
mkLive("mer-2") cleaned, err := m.Cleanup(ctx, "mer") if err != nil { t.Fatal(err) } if len(cleaned) != 1 || cleaned[0] != "mer-1" { - t.Fatalf("only the terminal session should be reclaimed, got %v", cleaned) + t.Fatalf("got %v", cleaned) } if ws.destroyed != 1 { - t.Fatal("the live session's workspace must not be destroyed") - } -} - -func mkLive(id domain.SessionID) domain.SessionRecord { - return domain.SessionRecord{ - ID: id, ProjectID: "mer", - Metadata: domain.SessionMetadata{WorkspacePath: "/ws/" + string(id), RuntimeHandleID: "h1", RuntimeName: "tmux"}, - Lifecycle: domain.CanonicalSessionLifecycle{Session: domain.SessionSubstate{State: domain.SessionWorking}, IsAlive: true}, + t.Fatal("live workspace must not be destroyed") } } diff --git a/backend/internal/storage/sqlite/changelog_store.go b/backend/internal/storage/sqlite/changelog_store.go deleted file mode 100644 index 927d7968..00000000 --- a/backend/internal/storage/sqlite/changelog_store.go +++ /dev/null @@ -1,89 +0,0 @@ -package sqlite - -import ( - "context" - "fmt" - "time" - - "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite/gen" -) - -// ChangeLogRow is one durable CDC event. These rows are written by the DB -// triggers (migration 0001), never by application code; the store only reads -// them, for the CDC poller. -type ChangeLogRow struct { - Seq int64 - ProjectID string - SessionID string // empty when the event is project-level (NULL in the DB) - EventType string - Payload string - CreatedAt time.Time -} - -// ReadChangeLogAfter returns up to limit events with seq > after, in seq order -// — the CDC poller's read. The frontend's offset is `after`. 
-func (s *Store) ReadChangeLogAfter(ctx context.Context, after int64, limit int) ([]ChangeLogRow, error) { - rows, err := s.qr.ReadChangeLogAfter(ctx, gen.ReadChangeLogAfterParams{Seq: after, Limit: int64(limit)}) - if err != nil { - return nil, fmt.Errorf("read change_log after %d: %w", after, err) - } - out := make([]ChangeLogRow, 0, len(rows)) - for _, r := range rows { - out = append(out, changeLogRowFromGen(r)) - } - return out, nil -} - -// ReadChangeLogAfterForProject is the project-scoped variant — a client -// subscribed to one project reads only its events. -func (s *Store) ReadChangeLogAfterForProject(ctx context.Context, project string, after int64, limit int) ([]ChangeLogRow, error) { - rows, err := s.qr.ReadChangeLogAfterForProject(ctx, gen.ReadChangeLogAfterForProjectParams{ - ProjectID: project, Seq: after, Limit: int64(limit), - }) - if err != nil { - return nil, fmt.Errorf("read change_log for %s after %d: %w", project, after, err) - } - out := make([]ChangeLogRow, 0, len(rows)) - for _, r := range rows { - out = append(out, changeLogRowFromGen(r)) - } - return out, nil -} - -// MaxChangeLogSeq returns the highest seq (0 if empty) — a fresh consumer's -// starting offset. -func (s *Store) MaxChangeLogSeq(ctx context.Context) (int64, error) { - v, err := s.qr.MaxChangeLogSeq(ctx) - if err != nil { - return 0, fmt.Errorf("max change_log seq: %w", err) - } - return asInt64(v), nil -} - -func changeLogRowFromGen(r gen.ChangeLog) ChangeLogRow { - row := ChangeLogRow{ - Seq: r.Seq, - ProjectID: r.ProjectID, - EventType: r.EventType, - Payload: r.Payload, - CreatedAt: r.CreatedAt, - } - if r.SessionID.Valid { - row.SessionID = r.SessionID.String - } - return row -} - -// asInt64 coerces sqlc's interface{} result for COALESCE(MAX(...)) — sqlc's -// SQLite type inference can't narrow the aggregate, so the generated signature -// is interface{}. modernc returns int64 for an integer aggregate. 
-func asInt64(v interface{}) int64 { - switch n := v.(type) { - case int64: - return n - case int: - return int64(n) - default: - return 0 - } -} diff --git a/backend/internal/storage/sqlite/db.go b/backend/internal/storage/sqlite/db.go index 280b48e0..e9e447e8 100644 --- a/backend/internal/storage/sqlite/db.go +++ b/backend/internal/storage/sqlite/db.go @@ -1,6 +1,6 @@ -// Package sqlite is the durable persistence adapter: the goose-managed schema, -// typed CRUD over sqlc-generated queries, and the read side of the -// trigger-driven CDC (it reads change_log; the DB triggers write it). +// Package sqlite owns SQLite connection setup and goose-managed schema +// migrations. Typed CRUD lives in the store subpackage; this package keeps the +// public Open entrypoint and compatibility aliases for callers. package sqlite import ( @@ -12,12 +12,18 @@ import ( "sync" "github.com/pressly/goose/v3" + + sqlitestore "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite/store" + // modernc.org/sqlite is the pure-Go (CGO-free) SQLite driver — chosen so the // daemon cross-compiles and ships as a static binary with no libsqlite/CGO // toolchain dependency, at the cost of some raw throughput vs a C-backed driver. _ "modernc.org/sqlite" ) +// Store is the SQLite-backed persistence layer. +type Store = sqlitestore.Store + //go:embed migrations/*.sql var migrationsFS embed.FS @@ -68,7 +74,7 @@ func Open(dataDir string) (*Store, error) { readDB.SetMaxOpenConns(maxReaders) readDB.SetMaxIdleConns(maxReaders) - return NewStore(writeDB, readDB), nil + return sqlitestore.NewStore(writeDB, readDB), nil } // gooseMu serialises calls into goose. 
goose v3 keeps its baseFS / logger / diff --git a/backend/internal/storage/sqlite/gen/changelog.sql.go b/backend/internal/storage/sqlite/gen/changelog.sql.go index 6568fdcc..c582a4c3 100644 --- a/backend/internal/storage/sqlite/gen/changelog.sql.go +++ b/backend/internal/storage/sqlite/gen/changelog.sql.go @@ -10,12 +10,12 @@ import ( ) const maxChangeLogSeq = `-- name: MaxChangeLogSeq :one -SELECT COALESCE(MAX(seq), 0) AS seq FROM change_log +SELECT CAST(COALESCE(MAX(seq), 0) AS INTEGER) AS seq FROM change_log ` -func (q *Queries) MaxChangeLogSeq(ctx context.Context) (interface{}, error) { +func (q *Queries) MaxChangeLogSeq(ctx context.Context) (int64, error) { row := q.db.QueryRowContext(ctx, maxChangeLogSeq) - var seq interface{} + var seq int64 err := row.Scan(&seq) return seq, err } @@ -59,44 +59,3 @@ func (q *Queries) ReadChangeLogAfter(ctx context.Context, arg ReadChangeLogAfter } return items, nil } - -const readChangeLogAfterForProject = `-- name: ReadChangeLogAfterForProject :many -SELECT seq, project_id, session_id, event_type, payload, created_at -FROM change_log WHERE project_id = ? AND seq > ? ORDER BY seq LIMIT ? 
-` - -type ReadChangeLogAfterForProjectParams struct { - ProjectID string - Seq int64 - Limit int64 -} - -func (q *Queries) ReadChangeLogAfterForProject(ctx context.Context, arg ReadChangeLogAfterForProjectParams) ([]ChangeLog, error) { - rows, err := q.db.QueryContext(ctx, readChangeLogAfterForProject, arg.ProjectID, arg.Seq, arg.Limit) - if err != nil { - return nil, err - } - defer rows.Close() - items := []ChangeLog{} - for rows.Next() { - var i ChangeLog - if err := rows.Scan( - &i.Seq, - &i.ProjectID, - &i.SessionID, - &i.EventType, - &i.Payload, - &i.CreatedAt, - ); err != nil { - return nil, err - } - items = append(items, i) - } - if err := rows.Close(); err != nil { - return nil, err - } - if err := rows.Err(); err != nil { - return nil, err - } - return items, nil -} diff --git a/backend/internal/storage/sqlite/gen/models.go b/backend/internal/storage/sqlite/gen/models.go index 992c0ca0..720343e0 100644 --- a/backend/internal/storage/sqlite/gen/models.go +++ b/backend/internal/storage/sqlite/gen/models.go @@ -7,100 +7,77 @@ package gen import ( "database/sql" "time" + + "github.com/aoagents/agent-orchestrator/backend/internal/cdc" + "github.com/aoagents/agent-orchestrator/backend/internal/domain" ) type ChangeLog struct { Seq int64 - ProjectID string - SessionID sql.NullString - EventType string + ProjectID domain.ProjectID + SessionID *domain.SessionID + EventType cdc.EventType Payload string CreatedAt time.Time } -type Notification struct { - Seq int64 - ID string - ProjectID string - SessionID string - Source string - EventType string - SemanticType string - Priority string - Message string - PayloadJson string - ActionsJson string - DedupeKey string - CauseKey string - ReadAt sql.NullTime - ArchivedAt sql.NullTime - CreatedAt time.Time - UpdatedAt time.Time -} - -type Pr struct { - Url string - SessionID string +type PR struct { + URL string + SessionID domain.SessionID Number int64 - PrState string - ReviewDecision string - CiState string - 
Mergeability string + PRState domain.PRState + ReviewDecision domain.ReviewDecision + CIState domain.CIState + Mergeability domain.Mergeability UpdatedAt time.Time } -type PrCheck struct { - PrUrl string +type PRCheck struct { + PRURL string Name string CommitHash string - Status string - Url string + Status domain.PRCheckStatus + URL string LogTail string CreatedAt time.Time } -type PrComment struct { - PrUrl string +type PRComment struct { + PRURL string CommentID string Author string File string Line int64 Body string - Resolved int64 + Resolved bool CreatedAt time.Time } type Project struct { - ID string + ID domain.ProjectID Path string - RepoOriginUrl string + RepoOriginURL string DisplayName string RegisteredAt time.Time ArchivedAt sql.NullTime } type Session struct { - ID string - ProjectID string - Num int64 - IssueID string - Kind string - Harness string - SessionState string - TerminationReason string - IsAlive int64 - ActivityState string - ActivityLastAt time.Time - ActivitySource string - DetectingAttempts sql.NullInt64 - DetectingStartedAt sql.NullTime - DetectingEvidenceHash sql.NullString - Branch string - WorkspacePath string - RuntimeHandleID string - RuntimeName string - AgentSessionID string - Prompt string - CreatedAt time.Time - UpdatedAt time.Time + ID domain.SessionID + ProjectID domain.ProjectID + Num int64 + IssueID domain.IssueID + Kind domain.SessionKind + Harness domain.AgentHarness + ActivityState domain.ActivityState + ActivityLastAt time.Time + ActivitySource domain.ActivitySource + IsTerminated bool + Branch string + WorkspacePath string + RuntimeHandleID string + AgentSessionID string + Prompt string + CreatedAt time.Time + UpdatedAt time.Time } diff --git a/backend/internal/storage/sqlite/gen/notifications.sql.go b/backend/internal/storage/sqlite/gen/notifications.sql.go deleted file mode 100644 index 7b2b5493..00000000 --- a/backend/internal/storage/sqlite/gen/notifications.sql.go +++ /dev/null @@ -1,464 +0,0 @@ -// Code 
generated by sqlc. DO NOT EDIT. -// versions: -// sqlc v1.31.1 -// source: notifications.sql - -package gen - -import ( - "context" - "database/sql" - "time" -) - -const archiveNotification = `-- name: ArchiveNotification :one -UPDATE notifications -SET archived_at = ?, updated_at = ? -WHERE id = ? AND archived_at IS NULL -RETURNING seq, id, project_id, session_id, source, event_type, semantic_type, priority, - message, payload_json, actions_json, dedupe_key, cause_key, read_at, archived_at, created_at, updated_at -` - -type ArchiveNotificationParams struct { - ArchivedAt sql.NullTime - UpdatedAt time.Time - ID string -} - -func (q *Queries) ArchiveNotification(ctx context.Context, arg ArchiveNotificationParams) (Notification, error) { - row := q.db.QueryRowContext(ctx, archiveNotification, arg.ArchivedAt, arg.UpdatedAt, arg.ID) - var i Notification - err := row.Scan( - &i.Seq, - &i.ID, - &i.ProjectID, - &i.SessionID, - &i.Source, - &i.EventType, - &i.SemanticType, - &i.Priority, - &i.Message, - &i.PayloadJson, - &i.ActionsJson, - &i.DedupeKey, - &i.CauseKey, - &i.ReadAt, - &i.ArchivedAt, - &i.CreatedAt, - &i.UpdatedAt, - ) - return i, err -} - -const getNotification = `-- name: GetNotification :one -SELECT seq, id, project_id, session_id, source, event_type, semantic_type, priority, - message, payload_json, actions_json, dedupe_key, cause_key, read_at, archived_at, created_at, updated_at -FROM notifications WHERE id = ? 
-` - -func (q *Queries) GetNotification(ctx context.Context, id string) (Notification, error) { - row := q.db.QueryRowContext(ctx, getNotification, id) - var i Notification - err := row.Scan( - &i.Seq, - &i.ID, - &i.ProjectID, - &i.SessionID, - &i.Source, - &i.EventType, - &i.SemanticType, - &i.Priority, - &i.Message, - &i.PayloadJson, - &i.ActionsJson, - &i.DedupeKey, - &i.CauseKey, - &i.ReadAt, - &i.ArchivedAt, - &i.CreatedAt, - &i.UpdatedAt, - ) - return i, err -} - -const getNotificationByDedupeKey = `-- name: GetNotificationByDedupeKey :one -SELECT seq, id, project_id, session_id, source, event_type, semantic_type, priority, - message, payload_json, actions_json, dedupe_key, cause_key, read_at, archived_at, created_at, updated_at -FROM notifications WHERE dedupe_key = ? -` - -func (q *Queries) GetNotificationByDedupeKey(ctx context.Context, dedupeKey string) (Notification, error) { - row := q.db.QueryRowContext(ctx, getNotificationByDedupeKey, dedupeKey) - var i Notification - err := row.Scan( - &i.Seq, - &i.ID, - &i.ProjectID, - &i.SessionID, - &i.Source, - &i.EventType, - &i.SemanticType, - &i.Priority, - &i.Message, - &i.PayloadJson, - &i.ActionsJson, - &i.DedupeKey, - &i.CauseKey, - &i.ReadAt, - &i.ArchivedAt, - &i.CreatedAt, - &i.UpdatedAt, - ) - return i, err -} - -const insertNotification = `-- name: InsertNotification :one -INSERT INTO notifications ( - project_id, session_id, source, event_type, semantic_type, priority, - message, payload_json, actions_json, dedupe_key, cause_key, created_at, updated_at -) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
-ON CONFLICT (dedupe_key) DO NOTHING -RETURNING seq, id, project_id, session_id, source, event_type, semantic_type, priority, - message, payload_json, actions_json, dedupe_key, cause_key, read_at, archived_at, created_at, updated_at -` - -type InsertNotificationParams struct { - ProjectID string - SessionID string - Source string - EventType string - SemanticType string - Priority string - Message string - PayloadJson string - ActionsJson string - DedupeKey string - CauseKey string - CreatedAt time.Time - UpdatedAt time.Time -} - -func (q *Queries) InsertNotification(ctx context.Context, arg InsertNotificationParams) (Notification, error) { - row := q.db.QueryRowContext(ctx, insertNotification, - arg.ProjectID, - arg.SessionID, - arg.Source, - arg.EventType, - arg.SemanticType, - arg.Priority, - arg.Message, - arg.PayloadJson, - arg.ActionsJson, - arg.DedupeKey, - arg.CauseKey, - arg.CreatedAt, - arg.UpdatedAt, - ) - var i Notification - err := row.Scan( - &i.Seq, - &i.ID, - &i.ProjectID, - &i.SessionID, - &i.Source, - &i.EventType, - &i.SemanticType, - &i.Priority, - &i.Message, - &i.PayloadJson, - &i.ActionsJson, - &i.DedupeKey, - &i.CauseKey, - &i.ReadAt, - &i.ArchivedAt, - &i.CreatedAt, - &i.UpdatedAt, - ) - return i, err -} - -const listNotifications = `-- name: ListNotifications :many -SELECT seq, id, project_id, session_id, source, event_type, semantic_type, priority, - message, payload_json, actions_json, dedupe_key, cause_key, read_at, archived_at, created_at, updated_at -FROM notifications -ORDER BY seq DESC -LIMIT ? 
-` - -func (q *Queries) ListNotifications(ctx context.Context, limit int64) ([]Notification, error) { - rows, err := q.db.QueryContext(ctx, listNotifications, limit) - if err != nil { - return nil, err - } - defer rows.Close() - items := []Notification{} - for rows.Next() { - var i Notification - if err := rows.Scan( - &i.Seq, - &i.ID, - &i.ProjectID, - &i.SessionID, - &i.Source, - &i.EventType, - &i.SemanticType, - &i.Priority, - &i.Message, - &i.PayloadJson, - &i.ActionsJson, - &i.DedupeKey, - &i.CauseKey, - &i.ReadAt, - &i.ArchivedAt, - &i.CreatedAt, - &i.UpdatedAt, - ); err != nil { - return nil, err - } - items = append(items, i) - } - if err := rows.Close(); err != nil { - return nil, err - } - if err := rows.Err(); err != nil { - return nil, err - } - return items, nil -} - -const listNotificationsByProject = `-- name: ListNotificationsByProject :many -SELECT seq, id, project_id, session_id, source, event_type, semantic_type, priority, - message, payload_json, actions_json, dedupe_key, cause_key, read_at, archived_at, created_at, updated_at -FROM notifications -WHERE project_id = ? -ORDER BY seq DESC -LIMIT ? 
-` - -type ListNotificationsByProjectParams struct { - ProjectID string - Limit int64 -} - -func (q *Queries) ListNotificationsByProject(ctx context.Context, arg ListNotificationsByProjectParams) ([]Notification, error) { - rows, err := q.db.QueryContext(ctx, listNotificationsByProject, arg.ProjectID, arg.Limit) - if err != nil { - return nil, err - } - defer rows.Close() - items := []Notification{} - for rows.Next() { - var i Notification - if err := rows.Scan( - &i.Seq, - &i.ID, - &i.ProjectID, - &i.SessionID, - &i.Source, - &i.EventType, - &i.SemanticType, - &i.Priority, - &i.Message, - &i.PayloadJson, - &i.ActionsJson, - &i.DedupeKey, - &i.CauseKey, - &i.ReadAt, - &i.ArchivedAt, - &i.CreatedAt, - &i.UpdatedAt, - ); err != nil { - return nil, err - } - items = append(items, i) - } - if err := rows.Close(); err != nil { - return nil, err - } - if err := rows.Err(); err != nil { - return nil, err - } - return items, nil -} - -const listNotificationsBySession = `-- name: ListNotificationsBySession :many -SELECT seq, id, project_id, session_id, source, event_type, semantic_type, priority, - message, payload_json, actions_json, dedupe_key, cause_key, read_at, archived_at, created_at, updated_at -FROM notifications -WHERE session_id = ? -ORDER BY seq DESC -LIMIT ? 
-` - -type ListNotificationsBySessionParams struct { - SessionID string - Limit int64 -} - -func (q *Queries) ListNotificationsBySession(ctx context.Context, arg ListNotificationsBySessionParams) ([]Notification, error) { - rows, err := q.db.QueryContext(ctx, listNotificationsBySession, arg.SessionID, arg.Limit) - if err != nil { - return nil, err - } - defer rows.Close() - items := []Notification{} - for rows.Next() { - var i Notification - if err := rows.Scan( - &i.Seq, - &i.ID, - &i.ProjectID, - &i.SessionID, - &i.Source, - &i.EventType, - &i.SemanticType, - &i.Priority, - &i.Message, - &i.PayloadJson, - &i.ActionsJson, - &i.DedupeKey, - &i.CauseKey, - &i.ReadAt, - &i.ArchivedAt, - &i.CreatedAt, - &i.UpdatedAt, - ); err != nil { - return nil, err - } - items = append(items, i) - } - if err := rows.Close(); err != nil { - return nil, err - } - if err := rows.Err(); err != nil { - return nil, err - } - return items, nil -} - -const listUnreadNotifications = `-- name: ListUnreadNotifications :many -SELECT seq, id, project_id, session_id, source, event_type, semantic_type, priority, - message, payload_json, actions_json, dedupe_key, cause_key, read_at, archived_at, created_at, updated_at -FROM notifications -WHERE read_at IS NULL AND archived_at IS NULL -ORDER BY seq DESC -LIMIT ? 
-` - -func (q *Queries) ListUnreadNotifications(ctx context.Context, limit int64) ([]Notification, error) { - rows, err := q.db.QueryContext(ctx, listUnreadNotifications, limit) - if err != nil { - return nil, err - } - defer rows.Close() - items := []Notification{} - for rows.Next() { - var i Notification - if err := rows.Scan( - &i.Seq, - &i.ID, - &i.ProjectID, - &i.SessionID, - &i.Source, - &i.EventType, - &i.SemanticType, - &i.Priority, - &i.Message, - &i.PayloadJson, - &i.ActionsJson, - &i.DedupeKey, - &i.CauseKey, - &i.ReadAt, - &i.ArchivedAt, - &i.CreatedAt, - &i.UpdatedAt, - ); err != nil { - return nil, err - } - items = append(items, i) - } - if err := rows.Close(); err != nil { - return nil, err - } - if err := rows.Err(); err != nil { - return nil, err - } - return items, nil -} - -const markNotificationRead = `-- name: MarkNotificationRead :one -UPDATE notifications -SET read_at = ?, updated_at = ? -WHERE id = ? AND read_at IS NULL -RETURNING seq, id, project_id, session_id, source, event_type, semantic_type, priority, - message, payload_json, actions_json, dedupe_key, cause_key, read_at, archived_at, created_at, updated_at -` - -type MarkNotificationReadParams struct { - ReadAt sql.NullTime - UpdatedAt time.Time - ID string -} - -func (q *Queries) MarkNotificationRead(ctx context.Context, arg MarkNotificationReadParams) (Notification, error) { - row := q.db.QueryRowContext(ctx, markNotificationRead, arg.ReadAt, arg.UpdatedAt, arg.ID) - var i Notification - err := row.Scan( - &i.Seq, - &i.ID, - &i.ProjectID, - &i.SessionID, - &i.Source, - &i.EventType, - &i.SemanticType, - &i.Priority, - &i.Message, - &i.PayloadJson, - &i.ActionsJson, - &i.DedupeKey, - &i.CauseKey, - &i.ReadAt, - &i.ArchivedAt, - &i.CreatedAt, - &i.UpdatedAt, - ) - return i, err -} - -const markNotificationUnread = `-- name: MarkNotificationUnread :one -UPDATE notifications -SET read_at = NULL, updated_at = ? -WHERE id = ? 
AND read_at IS NOT NULL -RETURNING seq, id, project_id, session_id, source, event_type, semantic_type, priority, - message, payload_json, actions_json, dedupe_key, cause_key, read_at, archived_at, created_at, updated_at -` - -type MarkNotificationUnreadParams struct { - UpdatedAt time.Time - ID string -} - -func (q *Queries) MarkNotificationUnread(ctx context.Context, arg MarkNotificationUnreadParams) (Notification, error) { - row := q.db.QueryRowContext(ctx, markNotificationUnread, arg.UpdatedAt, arg.ID) - var i Notification - err := row.Scan( - &i.Seq, - &i.ID, - &i.ProjectID, - &i.SessionID, - &i.Source, - &i.EventType, - &i.SemanticType, - &i.Priority, - &i.Message, - &i.PayloadJson, - &i.ActionsJson, - &i.DedupeKey, - &i.CauseKey, - &i.ReadAt, - &i.ArchivedAt, - &i.CreatedAt, - &i.UpdatedAt, - ) - return i, err -} diff --git a/backend/internal/storage/sqlite/gen/pr.sql.go b/backend/internal/storage/sqlite/gen/pr.sql.go index f9fa3620..154885cf 100644 --- a/backend/internal/storage/sqlite/gen/pr.sql.go +++ b/backend/internal/storage/sqlite/gen/pr.sql.go @@ -8,31 +8,73 @@ package gen import ( "context" "time" + + "github.com/aoagents/agent-orchestrator/backend/internal/domain" ) -const deletePR = `-- name: DeletePR :exec -DELETE FROM pr WHERE url = ? +const getDisplayPRFactsBySession = `-- name: GetDisplayPRFactsBySession :one +SELECT + pr.url, + pr.number, + pr.pr_state, + pr.review_decision, + pr.ci_state, + pr.mergeability, + EXISTS ( + SELECT 1 + FROM pr_comment + WHERE pr_comment.pr_url = pr.url + AND pr_comment.resolved = 0 + ) AS review_comments +FROM pr +WHERE pr.session_id = ? 
+ORDER BY + CASE WHEN pr.pr_state NOT IN ('merged', 'closed') THEN 0 ELSE 1 END, + pr.updated_at DESC +LIMIT 1 ` -func (q *Queries) DeletePR(ctx context.Context, url string) error { - _, err := q.db.ExecContext(ctx, deletePR, url) - return err +type GetDisplayPRFactsBySessionRow struct { + URL string + Number int64 + PRState domain.PRState + ReviewDecision domain.ReviewDecision + CIState domain.CIState + Mergeability domain.Mergeability + ReviewComments bool +} + +func (q *Queries) GetDisplayPRFactsBySession(ctx context.Context, sessionID domain.SessionID) (GetDisplayPRFactsBySessionRow, error) { + row := q.db.QueryRowContext(ctx, getDisplayPRFactsBySession, sessionID) + var i GetDisplayPRFactsBySessionRow + err := row.Scan( + &i.URL, + &i.Number, + &i.PRState, + &i.ReviewDecision, + &i.CIState, + &i.Mergeability, + &i.ReviewComments, + ) + return i, err } const getPR = `-- name: GetPR :one -SELECT url, session_id, number, pr_state, review_decision, ci_state, mergeability, updated_at FROM pr WHERE url = ? +SELECT url, session_id, number, pr_state, review_decision, ci_state, mergeability, updated_at +FROM pr +WHERE url = ? ` -func (q *Queries) GetPR(ctx context.Context, url string) (Pr, error) { +func (q *Queries) GetPR(ctx context.Context, url string) (PR, error) { row := q.db.QueryRowContext(ctx, getPR, url) - var i Pr + var i PR err := row.Scan( - &i.Url, + &i.URL, &i.SessionID, &i.Number, - &i.PrState, + &i.PRState, &i.ReviewDecision, - &i.CiState, + &i.CIState, &i.Mergeability, &i.UpdatedAt, ) @@ -40,25 +82,28 @@ func (q *Queries) GetPR(ctx context.Context, url string) (Pr, error) { } const listPRsBySession = `-- name: ListPRsBySession :many -SELECT url, session_id, number, pr_state, review_decision, ci_state, mergeability, updated_at FROM pr WHERE session_id = ? ORDER BY updated_at DESC +SELECT url, session_id, number, pr_state, review_decision, ci_state, mergeability, updated_at +FROM pr +WHERE session_id = ? 
+ORDER BY updated_at DESC ` -func (q *Queries) ListPRsBySession(ctx context.Context, sessionID string) ([]Pr, error) { +func (q *Queries) ListPRsBySession(ctx context.Context, sessionID domain.SessionID) ([]PR, error) { rows, err := q.db.QueryContext(ctx, listPRsBySession, sessionID) if err != nil { return nil, err } defer rows.Close() - items := []Pr{} + items := []PR{} for rows.Next() { - var i Pr + var i PR if err := rows.Scan( - &i.Url, + &i.URL, &i.SessionID, &i.Number, - &i.PrState, + &i.PRState, &i.ReviewDecision, - &i.CiState, + &i.CIState, &i.Mergeability, &i.UpdatedAt, ); err != nil { @@ -79,7 +124,6 @@ const upsertPR = `-- name: UpsertPR :exec INSERT INTO pr (url, session_id, number, pr_state, review_decision, ci_state, mergeability, updated_at) VALUES (?, ?, ?, ?, ?, ?, ?, ?) ON CONFLICT (url) DO UPDATE SET - session_id = excluded.session_id, number = excluded.number, pr_state = excluded.pr_state, review_decision = excluded.review_decision, @@ -89,24 +133,24 @@ ON CONFLICT (url) DO UPDATE SET ` type UpsertPRParams struct { - Url string - SessionID string + URL string + SessionID domain.SessionID Number int64 - PrState string - ReviewDecision string - CiState string - Mergeability string + PRState domain.PRState + ReviewDecision domain.ReviewDecision + CIState domain.CIState + Mergeability domain.Mergeability UpdatedAt time.Time } func (q *Queries) UpsertPR(ctx context.Context, arg UpsertPRParams) error { _, err := q.db.ExecContext(ctx, upsertPR, - arg.Url, + arg.URL, arg.SessionID, arg.Number, - arg.PrState, + arg.PRState, arg.ReviewDecision, - arg.CiState, + arg.CIState, arg.Mergeability, arg.UpdatedAt, ) diff --git a/backend/internal/storage/sqlite/gen/pr_checks.sql.go b/backend/internal/storage/sqlite/gen/pr_checks.sql.go index 58668ab1..fde21e67 100644 --- a/backend/internal/storage/sqlite/gen/pr_checks.sql.go +++ b/backend/internal/storage/sqlite/gen/pr_checks.sql.go @@ -8,27 +8,30 @@ package gen import ( "context" "time" + + 
"github.com/aoagents/agent-orchestrator/backend/internal/domain" ) const listChecksByPR = `-- name: ListChecksByPR :many -SELECT pr_url, name, commit_hash, status, url, log_tail, created_at FROM pr_checks WHERE pr_url = ? ORDER BY name, created_at +SELECT pr_url, name, commit_hash, status, url, log_tail, created_at +FROM pr_checks WHERE pr_url = ? ORDER BY name, created_at ` -func (q *Queries) ListChecksByPR(ctx context.Context, prUrl string) ([]PrCheck, error) { +func (q *Queries) ListChecksByPR(ctx context.Context, prUrl string) ([]PRCheck, error) { rows, err := q.db.QueryContext(ctx, listChecksByPR, prUrl) if err != nil { return nil, err } defer rows.Close() - items := []PrCheck{} + items := []PRCheck{} for rows.Next() { - var i PrCheck + var i PRCheck if err := rows.Scan( - &i.PrUrl, + &i.PRURL, &i.Name, &i.CommitHash, &i.Status, - &i.Url, + &i.URL, &i.LogTail, &i.CreatedAt, ); err != nil { @@ -45,47 +48,6 @@ func (q *Queries) ListChecksByPR(ctx context.Context, prUrl string) ([]PrCheck, return items, nil } -const listRecentChecks = `-- name: ListRecentChecks :many -SELECT status, commit_hash, created_at FROM pr_checks -WHERE pr_url = ? AND name = ? -ORDER BY created_at DESC LIMIT ? 
-` - -type ListRecentChecksParams struct { - PrUrl string - Name string - Limit int64 -} - -type ListRecentChecksRow struct { - Status string - CommitHash string - CreatedAt time.Time -} - -func (q *Queries) ListRecentChecks(ctx context.Context, arg ListRecentChecksParams) ([]ListRecentChecksRow, error) { - rows, err := q.db.QueryContext(ctx, listRecentChecks, arg.PrUrl, arg.Name, arg.Limit) - if err != nil { - return nil, err - } - defer rows.Close() - items := []ListRecentChecksRow{} - for rows.Next() { - var i ListRecentChecksRow - if err := rows.Scan(&i.Status, &i.CommitHash, &i.CreatedAt); err != nil { - return nil, err - } - items = append(items, i) - } - if err := rows.Close(); err != nil { - return nil, err - } - if err := rows.Err(); err != nil { - return nil, err - } - return items, nil -} - const upsertPRCheck = `-- name: UpsertPRCheck :exec INSERT INTO pr_checks (pr_url, name, commit_hash, status, url, log_tail, created_at) VALUES (?, ?, ?, ?, ?, ?, ?) @@ -96,22 +58,22 @@ ON CONFLICT (pr_url, name, commit_hash) DO UPDATE SET ` type UpsertPRCheckParams struct { - PrUrl string + PRURL string Name string CommitHash string - Status string - Url string + Status domain.PRCheckStatus + URL string LogTail string CreatedAt time.Time } func (q *Queries) UpsertPRCheck(ctx context.Context, arg UpsertPRCheckParams) error { _, err := q.db.ExecContext(ctx, upsertPRCheck, - arg.PrUrl, + arg.PRURL, arg.Name, arg.CommitHash, arg.Status, - arg.Url, + arg.URL, arg.LogTail, arg.CreatedAt, ) diff --git a/backend/internal/storage/sqlite/gen/pr_comment.sql.go b/backend/internal/storage/sqlite/gen/pr_comment.sql.go index a2f09f34..f69cc17b 100644 --- a/backend/internal/storage/sqlite/gen/pr_comment.sql.go +++ b/backend/internal/storage/sqlite/gen/pr_comment.sql.go @@ -19,21 +19,52 @@ func (q *Queries) DeletePRComments(ctx context.Context, prUrl string) error { return err } +const insertPRComment = `-- name: InsertPRComment :exec +INSERT INTO pr_comment (pr_url, comment_id, 
author, file, line, body, resolved, created_at) +VALUES (?, ?, ?, ?, ?, ?, ?, ?) +` + +type InsertPRCommentParams struct { + PRURL string + CommentID string + Author string + File string + Line int64 + Body string + Resolved bool + CreatedAt time.Time +} + +func (q *Queries) InsertPRComment(ctx context.Context, arg InsertPRCommentParams) error { + _, err := q.db.ExecContext(ctx, insertPRComment, + arg.PRURL, + arg.CommentID, + arg.Author, + arg.File, + arg.Line, + arg.Body, + arg.Resolved, + arg.CreatedAt, + ) + return err +} + const listPRComments = `-- name: ListPRComments :many -SELECT pr_url, comment_id, author, file, line, body, resolved, created_at FROM pr_comment WHERE pr_url = ? ORDER BY created_at, comment_id +SELECT pr_url, comment_id, author, file, line, body, resolved, created_at +FROM pr_comment WHERE pr_url = ? ORDER BY created_at, comment_id ` -func (q *Queries) ListPRComments(ctx context.Context, prUrl string) ([]PrComment, error) { +func (q *Queries) ListPRComments(ctx context.Context, prUrl string) ([]PRComment, error) { rows, err := q.db.QueryContext(ctx, listPRComments, prUrl) if err != nil { return nil, err } defer rows.Close() - items := []PrComment{} + items := []PRComment{} for rows.Next() { - var i PrComment + var i PRComment if err := rows.Scan( - &i.PrUrl, + &i.PRURL, &i.CommentID, &i.Author, &i.File, @@ -54,36 +85,3 @@ func (q *Queries) ListPRComments(ctx context.Context, prUrl string) ([]PrComment } return items, nil } - -const upsertPRComment = `-- name: UpsertPRComment :exec -INSERT INTO pr_comment (pr_url, comment_id, author, file, line, body, resolved, created_at) -VALUES (?, ?, ?, ?, ?, ?, ?, ?) 
-ON CONFLICT (pr_url, comment_id) DO UPDATE SET - author = excluded.author, file = excluded.file, line = excluded.line, - body = excluded.body, resolved = excluded.resolved -` - -type UpsertPRCommentParams struct { - PrUrl string - CommentID string - Author string - File string - Line int64 - Body string - Resolved int64 - CreatedAt time.Time -} - -func (q *Queries) UpsertPRComment(ctx context.Context, arg UpsertPRCommentParams) error { - _, err := q.db.ExecContext(ctx, upsertPRComment, - arg.PrUrl, - arg.CommentID, - arg.Author, - arg.File, - arg.Line, - arg.Body, - arg.Resolved, - arg.CreatedAt, - ) - return err -} diff --git a/backend/internal/storage/sqlite/gen/projects.sql.go b/backend/internal/storage/sqlite/gen/projects.sql.go index a7c953cd..be255c5b 100644 --- a/backend/internal/storage/sqlite/gen/projects.sql.go +++ b/backend/internal/storage/sqlite/gen/projects.sql.go @@ -9,20 +9,44 @@ import ( "context" "database/sql" "time" + + "github.com/aoagents/agent-orchestrator/backend/internal/domain" ) -const archiveProject = `-- name: ArchiveProject :exec +const archiveProject = `-- name: ArchiveProject :execrows UPDATE projects SET archived_at = ? WHERE id = ? ` type ArchiveProjectParams struct { ArchivedAt sql.NullTime - ID string + ID domain.ProjectID } -func (q *Queries) ArchiveProject(ctx context.Context, arg ArchiveProjectParams) error { - _, err := q.db.ExecContext(ctx, archiveProject, arg.ArchivedAt, arg.ID) - return err +func (q *Queries) ArchiveProject(ctx context.Context, arg ArchiveProjectParams) (int64, error) { + result, err := q.db.ExecContext(ctx, archiveProject, arg.ArchivedAt, arg.ID) + if err != nil { + return 0, err + } + return result.RowsAffected() +} + +const findProjectByPath = `-- name: FindProjectByPath :one +SELECT id, path, repo_origin_url, display_name, registered_at, archived_at +FROM projects WHERE path = ? 
+` + +func (q *Queries) FindProjectByPath(ctx context.Context, path string) (Project, error) { + row := q.db.QueryRowContext(ctx, findProjectByPath, path) + var i Project + err := row.Scan( + &i.ID, + &i.Path, + &i.RepoOriginURL, + &i.DisplayName, + &i.RegisteredAt, + &i.ArchivedAt, + ) + return i, err } const getProject = `-- name: GetProject :one @@ -30,13 +54,13 @@ SELECT id, path, repo_origin_url, display_name, registered_at, archived_at FROM projects WHERE id = ? ` -func (q *Queries) GetProject(ctx context.Context, id string) (Project, error) { +func (q *Queries) GetProject(ctx context.Context, id domain.ProjectID) (Project, error) { row := q.db.QueryRowContext(ctx, getProject, id) var i Project err := row.Scan( &i.ID, &i.Path, - &i.RepoOriginUrl, + &i.RepoOriginURL, &i.DisplayName, &i.RegisteredAt, &i.ArchivedAt, @@ -61,7 +85,7 @@ func (q *Queries) ListProjects(ctx context.Context) ([]Project, error) { if err := rows.Scan( &i.ID, &i.Path, - &i.RepoOriginUrl, + &i.RepoOriginURL, &i.DisplayName, &i.RegisteredAt, &i.ArchivedAt, @@ -90,9 +114,9 @@ ON CONFLICT (id) DO UPDATE SET ` type UpsertProjectParams struct { - ID string + ID domain.ProjectID Path string - RepoOriginUrl string + RepoOriginURL string DisplayName string RegisteredAt time.Time ArchivedAt sql.NullTime @@ -102,7 +126,7 @@ func (q *Queries) UpsertProject(ctx context.Context, arg UpsertProjectParams) er _, err := q.db.ExecContext(ctx, upsertProject, arg.ID, arg.Path, - arg.RepoOriginUrl, + arg.RepoOriginURL, arg.DisplayName, arg.RegisteredAt, arg.ArchivedAt, diff --git a/backend/internal/storage/sqlite/gen/querier.go b/backend/internal/storage/sqlite/gen/querier.go deleted file mode 100644 index 4f91a9d5..00000000 --- a/backend/internal/storage/sqlite/gen/querier.go +++ /dev/null @@ -1,48 +0,0 @@ -// Code generated by sqlc. DO NOT EDIT. 
-// versions: -// sqlc v1.31.1 - -package gen - -import ( - "context" -) - -type Querier interface { - ArchiveNotification(ctx context.Context, arg ArchiveNotificationParams) (Notification, error) - ArchiveProject(ctx context.Context, arg ArchiveProjectParams) error - DeletePR(ctx context.Context, url string) error - DeletePRComments(ctx context.Context, prUrl string) error - DeleteSession(ctx context.Context, id string) error - GetNotification(ctx context.Context, id string) (Notification, error) - GetNotificationByDedupeKey(ctx context.Context, dedupeKey string) (Notification, error) - GetPR(ctx context.Context, url string) (Pr, error) - GetProject(ctx context.Context, id string) (Project, error) - GetSession(ctx context.Context, id string) (Session, error) - InsertNotification(ctx context.Context, arg InsertNotificationParams) (Notification, error) - InsertSession(ctx context.Context, arg InsertSessionParams) error - ListAllSessions(ctx context.Context) ([]Session, error) - ListChecksByPR(ctx context.Context, prUrl string) ([]PrCheck, error) - ListNotifications(ctx context.Context, limit int64) ([]Notification, error) - ListNotificationsByProject(ctx context.Context, arg ListNotificationsByProjectParams) ([]Notification, error) - ListNotificationsBySession(ctx context.Context, arg ListNotificationsBySessionParams) ([]Notification, error) - ListPRComments(ctx context.Context, prUrl string) ([]PrComment, error) - ListPRsBySession(ctx context.Context, sessionID string) ([]Pr, error) - ListProjects(ctx context.Context) ([]Project, error) - ListRecentChecks(ctx context.Context, arg ListRecentChecksParams) ([]ListRecentChecksRow, error) - ListSessionsByProject(ctx context.Context, projectID string) ([]Session, error) - ListUnreadNotifications(ctx context.Context, limit int64) ([]Notification, error) - MarkNotificationRead(ctx context.Context, arg MarkNotificationReadParams) (Notification, error) - MarkNotificationUnread(ctx context.Context, arg 
MarkNotificationUnreadParams) (Notification, error) - MaxChangeLogSeq(ctx context.Context) (interface{}, error) - NextSessionNum(ctx context.Context, projectID string) (int64, error) - ReadChangeLogAfter(ctx context.Context, arg ReadChangeLogAfterParams) ([]ChangeLog, error) - ReadChangeLogAfterForProject(ctx context.Context, arg ReadChangeLogAfterForProjectParams) ([]ChangeLog, error) - UpdateSession(ctx context.Context, arg UpdateSessionParams) error - UpsertPR(ctx context.Context, arg UpsertPRParams) error - UpsertPRCheck(ctx context.Context, arg UpsertPRCheckParams) error - UpsertPRComment(ctx context.Context, arg UpsertPRCommentParams) error - UpsertProject(ctx context.Context, arg UpsertProjectParams) error -} - -var _ Querier = (*Queries)(nil) diff --git a/backend/internal/storage/sqlite/gen/sessions.sql.go b/backend/internal/storage/sqlite/gen/sessions.sql.go index 5365a22c..2acc8918 100644 --- a/backend/internal/storage/sqlite/gen/sessions.sql.go +++ b/backend/internal/storage/sqlite/gen/sessions.sql.go @@ -7,24 +7,19 @@ package gen import ( "context" - "database/sql" "time" -) - -const deleteSession = `-- name: DeleteSession :exec -DELETE FROM sessions WHERE id = ? -` -func (q *Queries) DeleteSession(ctx context.Context, id string) error { - _, err := q.db.ExecContext(ctx, deleteSession, id) - return err -} + "github.com/aoagents/agent-orchestrator/backend/internal/domain" +) const getSession = `-- name: GetSession :one -SELECT id, project_id, num, issue_id, kind, harness, session_state, termination_reason, is_alive, activity_state, activity_last_at, activity_source, detecting_attempts, detecting_started_at, detecting_evidence_hash, branch, workspace_path, runtime_handle_id, runtime_name, agent_session_id, prompt, created_at, updated_at FROM sessions WHERE id = ? 
+SELECT id, project_id, num, issue_id, kind, harness, + activity_state, activity_last_at, activity_source, is_terminated, branch, workspace_path, + runtime_handle_id, agent_session_id, prompt, created_at, updated_at +FROM sessions WHERE id = ? ` -func (q *Queries) GetSession(ctx context.Context, id string) (Session, error) { +func (q *Queries) GetSession(ctx context.Context, id domain.SessionID) (Session, error) { row := q.db.QueryRowContext(ctx, getSession, id) var i Session err := row.Scan( @@ -34,19 +29,13 @@ func (q *Queries) GetSession(ctx context.Context, id string) (Session, error) { &i.IssueID, &i.Kind, &i.Harness, - &i.SessionState, - &i.TerminationReason, - &i.IsAlive, &i.ActivityState, &i.ActivityLastAt, &i.ActivitySource, - &i.DetectingAttempts, - &i.DetectingStartedAt, - &i.DetectingEvidenceHash, + &i.IsTerminated, &i.Branch, &i.WorkspacePath, &i.RuntimeHandleID, - &i.RuntimeName, &i.AgentSessionID, &i.Prompt, &i.CreatedAt, @@ -58,38 +47,30 @@ func (q *Queries) GetSession(ctx context.Context, id string) (Session, error) { const insertSession = `-- name: InsertSession :exec INSERT INTO sessions ( id, project_id, num, issue_id, kind, harness, - session_state, termination_reason, is_alive, - activity_state, activity_last_at, activity_source, - detecting_attempts, detecting_started_at, detecting_evidence_hash, - branch, workspace_path, runtime_handle_id, runtime_name, agent_session_id, prompt, + activity_state, activity_last_at, activity_source, is_terminated, + branch, workspace_path, runtime_handle_id, agent_session_id, prompt, created_at, updated_at -) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) +) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
` type InsertSessionParams struct { - ID string - ProjectID string - Num int64 - IssueID string - Kind string - Harness string - SessionState string - TerminationReason string - IsAlive int64 - ActivityState string - ActivityLastAt time.Time - ActivitySource string - DetectingAttempts sql.NullInt64 - DetectingStartedAt sql.NullTime - DetectingEvidenceHash sql.NullString - Branch string - WorkspacePath string - RuntimeHandleID string - RuntimeName string - AgentSessionID string - Prompt string - CreatedAt time.Time - UpdatedAt time.Time + ID domain.SessionID + ProjectID domain.ProjectID + Num int64 + IssueID domain.IssueID + Kind domain.SessionKind + Harness domain.AgentHarness + ActivityState domain.ActivityState + ActivityLastAt time.Time + ActivitySource domain.ActivitySource + IsTerminated bool + Branch string + WorkspacePath string + RuntimeHandleID string + AgentSessionID string + Prompt string + CreatedAt time.Time + UpdatedAt time.Time } func (q *Queries) InsertSession(ctx context.Context, arg InsertSessionParams) error { @@ -100,19 +81,13 @@ func (q *Queries) InsertSession(ctx context.Context, arg InsertSessionParams) er arg.IssueID, arg.Kind, arg.Harness, - arg.SessionState, - arg.TerminationReason, - arg.IsAlive, arg.ActivityState, arg.ActivityLastAt, arg.ActivitySource, - arg.DetectingAttempts, - arg.DetectingStartedAt, - arg.DetectingEvidenceHash, + arg.IsTerminated, arg.Branch, arg.WorkspacePath, arg.RuntimeHandleID, - arg.RuntimeName, arg.AgentSessionID, arg.Prompt, arg.CreatedAt, @@ -122,7 +97,10 @@ func (q *Queries) InsertSession(ctx context.Context, arg InsertSessionParams) er } const listAllSessions = `-- name: ListAllSessions :many -SELECT id, project_id, num, issue_id, kind, harness, session_state, termination_reason, is_alive, activity_state, activity_last_at, activity_source, detecting_attempts, detecting_started_at, detecting_evidence_hash, branch, workspace_path, runtime_handle_id, runtime_name, agent_session_id, prompt, created_at, 
updated_at FROM sessions ORDER BY project_id, num +SELECT id, project_id, num, issue_id, kind, harness, + activity_state, activity_last_at, activity_source, is_terminated, branch, workspace_path, + runtime_handle_id, agent_session_id, prompt, created_at, updated_at +FROM sessions ORDER BY project_id, num ` func (q *Queries) ListAllSessions(ctx context.Context) ([]Session, error) { @@ -141,19 +119,13 @@ func (q *Queries) ListAllSessions(ctx context.Context) ([]Session, error) { &i.IssueID, &i.Kind, &i.Harness, - &i.SessionState, - &i.TerminationReason, - &i.IsAlive, &i.ActivityState, &i.ActivityLastAt, &i.ActivitySource, - &i.DetectingAttempts, - &i.DetectingStartedAt, - &i.DetectingEvidenceHash, + &i.IsTerminated, &i.Branch, &i.WorkspacePath, &i.RuntimeHandleID, - &i.RuntimeName, &i.AgentSessionID, &i.Prompt, &i.CreatedAt, @@ -173,10 +145,13 @@ func (q *Queries) ListAllSessions(ctx context.Context) ([]Session, error) { } const listSessionsByProject = `-- name: ListSessionsByProject :many -SELECT id, project_id, num, issue_id, kind, harness, session_state, termination_reason, is_alive, activity_state, activity_last_at, activity_source, detecting_attempts, detecting_started_at, detecting_evidence_hash, branch, workspace_path, runtime_handle_id, runtime_name, agent_session_id, prompt, created_at, updated_at FROM sessions WHERE project_id = ? ORDER BY num +SELECT id, project_id, num, issue_id, kind, harness, + activity_state, activity_last_at, activity_source, is_terminated, branch, workspace_path, + runtime_handle_id, agent_session_id, prompt, created_at, updated_at +FROM sessions WHERE project_id = ? 
ORDER BY num ` -func (q *Queries) ListSessionsByProject(ctx context.Context, projectID string) ([]Session, error) { +func (q *Queries) ListSessionsByProject(ctx context.Context, projectID domain.ProjectID) ([]Session, error) { rows, err := q.db.QueryContext(ctx, listSessionsByProject, projectID) if err != nil { return nil, err @@ -192,19 +167,13 @@ func (q *Queries) ListSessionsByProject(ctx context.Context, projectID string) ( &i.IssueID, &i.Kind, &i.Harness, - &i.SessionState, - &i.TerminationReason, - &i.IsAlive, &i.ActivityState, &i.ActivityLastAt, &i.ActivitySource, - &i.DetectingAttempts, - &i.DetectingStartedAt, - &i.DetectingEvidenceHash, + &i.IsTerminated, &i.Branch, &i.WorkspacePath, &i.RuntimeHandleID, - &i.RuntimeName, &i.AgentSessionID, &i.Prompt, &i.CreatedAt, @@ -227,7 +196,7 @@ const nextSessionNum = `-- name: NextSessionNum :one SELECT COALESCE(MAX(num), 0) + 1 AS next FROM sessions WHERE project_id = ? ` -func (q *Queries) NextSessionNum(ctx context.Context, projectID string) (int64, error) { +func (q *Queries) NextSessionNum(ctx context.Context, projectID domain.ProjectID) (int64, error) { row := q.db.QueryRowContext(ctx, nextSessionNum, projectID) var next int64 err := row.Scan(&next) @@ -237,35 +206,27 @@ func (q *Queries) NextSessionNum(ctx context.Context, projectID string) (int64, const updateSession = `-- name: UpdateSession :exec UPDATE sessions SET issue_id = ?, kind = ?, harness = ?, - session_state = ?, termination_reason = ?, is_alive = ?, - activity_state = ?, activity_last_at = ?, activity_source = ?, - detecting_attempts = ?, detecting_started_at = ?, detecting_evidence_hash = ?, - branch = ?, workspace_path = ?, runtime_handle_id = ?, runtime_name = ?, agent_session_id = ?, prompt = ?, + activity_state = ?, activity_last_at = ?, activity_source = ?, is_terminated = ?, + branch = ?, workspace_path = ?, runtime_handle_id = ?, agent_session_id = ?, prompt = ?, updated_at = ? WHERE id = ? 
` type UpdateSessionParams struct { - IssueID string - Kind string - Harness string - SessionState string - TerminationReason string - IsAlive int64 - ActivityState string - ActivityLastAt time.Time - ActivitySource string - DetectingAttempts sql.NullInt64 - DetectingStartedAt sql.NullTime - DetectingEvidenceHash sql.NullString - Branch string - WorkspacePath string - RuntimeHandleID string - RuntimeName string - AgentSessionID string - Prompt string - UpdatedAt time.Time - ID string + IssueID domain.IssueID + Kind domain.SessionKind + Harness domain.AgentHarness + ActivityState domain.ActivityState + ActivityLastAt time.Time + ActivitySource domain.ActivitySource + IsTerminated bool + Branch string + WorkspacePath string + RuntimeHandleID string + AgentSessionID string + Prompt string + UpdatedAt time.Time + ID domain.SessionID } func (q *Queries) UpdateSession(ctx context.Context, arg UpdateSessionParams) error { @@ -273,19 +234,13 @@ func (q *Queries) UpdateSession(ctx context.Context, arg UpdateSessionParams) er arg.IssueID, arg.Kind, arg.Harness, - arg.SessionState, - arg.TerminationReason, - arg.IsAlive, arg.ActivityState, arg.ActivityLastAt, arg.ActivitySource, - arg.DetectingAttempts, - arg.DetectingStartedAt, - arg.DetectingEvidenceHash, + arg.IsTerminated, arg.Branch, arg.WorkspacePath, arg.RuntimeHandleID, - arg.RuntimeName, arg.AgentSessionID, arg.Prompt, arg.UpdatedAt, diff --git a/backend/internal/storage/sqlite/mapping.go b/backend/internal/storage/sqlite/mapping.go deleted file mode 100644 index 792854cf..00000000 --- a/backend/internal/storage/sqlite/mapping.go +++ /dev/null @@ -1,125 +0,0 @@ -package sqlite - -import ( - "database/sql" - - "github.com/aoagents/agent-orchestrator/backend/internal/domain" - "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite/gen" -) - -func boolToInt(b bool) int64 { - if b { - return 1 - } - return 0 -} - -// rowToRecord maps a stored session row to a domain record. 
The folded-in -// operational columns become Metadata; the canonical lifecycle is reassembled -// from the typed columns. Display status is never reconstructed here. -func rowToRecord(row gen.Session) domain.SessionRecord { - return domain.SessionRecord{ - ID: domain.SessionID(row.ID), - ProjectID: domain.ProjectID(row.ProjectID), - IssueID: domain.IssueID(row.IssueID), - Kind: domain.SessionKind(row.Kind), - Lifecycle: domain.CanonicalSessionLifecycle{ - Version: domain.LifecycleVersion, - Harness: domain.AgentHarness(row.Harness), - IsAlive: row.IsAlive != 0, - Session: domain.SessionSubstate{State: domain.SessionState(row.SessionState)}, - TerminationReason: domain.TerminationReason(row.TerminationReason), - Activity: domain.ActivitySubstate{ - State: domain.ActivityState(row.ActivityState), - LastActivityAt: row.ActivityLastAt, - Source: domain.ActivitySource(row.ActivitySource), - }, - Detecting: nullToDetecting(row), - }, - Metadata: domain.SessionMetadata{ - Branch: row.Branch, - WorkspacePath: row.WorkspacePath, - RuntimeHandleID: row.RuntimeHandleID, - RuntimeName: row.RuntimeName, - AgentSessionID: row.AgentSessionID, - Prompt: row.Prompt, - }, - CreatedAt: row.CreatedAt, - UpdatedAt: row.UpdatedAt, - } -} - -func recordToInsert(rec domain.SessionRecord, num int64) gen.InsertSessionParams { - da, ds, dh := detectingToNull(rec.Lifecycle.Detecting) - return gen.InsertSessionParams{ - ID: string(rec.ID), - ProjectID: string(rec.ProjectID), - Num: num, - IssueID: string(rec.IssueID), - Kind: string(rec.Kind), - Harness: string(rec.Lifecycle.Harness), - SessionState: string(rec.Lifecycle.Session.State), - TerminationReason: string(rec.Lifecycle.TerminationReason), - IsAlive: boolToInt(rec.Lifecycle.IsAlive), - ActivityState: string(rec.Lifecycle.Activity.State), - ActivityLastAt: rec.Lifecycle.Activity.LastActivityAt, - ActivitySource: string(rec.Lifecycle.Activity.Source), - DetectingAttempts: da, - DetectingStartedAt: ds, - DetectingEvidenceHash: dh, - 
Branch: rec.Metadata.Branch, - WorkspacePath: rec.Metadata.WorkspacePath, - RuntimeHandleID: rec.Metadata.RuntimeHandleID, - RuntimeName: rec.Metadata.RuntimeName, - AgentSessionID: rec.Metadata.AgentSessionID, - Prompt: rec.Metadata.Prompt, - CreatedAt: rec.CreatedAt, - UpdatedAt: rec.UpdatedAt, - } -} - -func recordToUpdate(rec domain.SessionRecord) gen.UpdateSessionParams { - da, ds, dh := detectingToNull(rec.Lifecycle.Detecting) - return gen.UpdateSessionParams{ - IssueID: string(rec.IssueID), - Kind: string(rec.Kind), - Harness: string(rec.Lifecycle.Harness), - SessionState: string(rec.Lifecycle.Session.State), - TerminationReason: string(rec.Lifecycle.TerminationReason), - IsAlive: boolToInt(rec.Lifecycle.IsAlive), - ActivityState: string(rec.Lifecycle.Activity.State), - ActivityLastAt: rec.Lifecycle.Activity.LastActivityAt, - ActivitySource: string(rec.Lifecycle.Activity.Source), - DetectingAttempts: da, - DetectingStartedAt: ds, - DetectingEvidenceHash: dh, - Branch: rec.Metadata.Branch, - WorkspacePath: rec.Metadata.WorkspacePath, - RuntimeHandleID: rec.Metadata.RuntimeHandleID, - RuntimeName: rec.Metadata.RuntimeName, - AgentSessionID: rec.Metadata.AgentSessionID, - Prompt: rec.Metadata.Prompt, - UpdatedAt: rec.UpdatedAt, - ID: string(rec.ID), - } -} - -func detectingToNull(d *domain.DetectingState) (sql.NullInt64, sql.NullTime, sql.NullString) { - if d == nil { - return sql.NullInt64{}, sql.NullTime{}, sql.NullString{} - } - return sql.NullInt64{Int64: int64(d.Attempts), Valid: true}, - sql.NullTime{Time: d.StartedAt, Valid: true}, - sql.NullString{String: d.EvidenceHash, Valid: true} -} - -func nullToDetecting(row gen.Session) *domain.DetectingState { - if !row.DetectingAttempts.Valid { - return nil - } - return &domain.DetectingState{ - Attempts: int(row.DetectingAttempts.Int64), - StartedAt: row.DetectingStartedAt.Time, - EvidenceHash: row.DetectingEvidenceHash.String, - } -} diff --git a/backend/internal/storage/sqlite/migrations/0001_init.sql 
b/backend/internal/storage/sqlite/migrations/0001_init.sql index 9d5a6a22..d308fb33 100644 --- a/backend/internal/storage/sqlite/migrations/0001_init.sql +++ b/backend/internal/storage/sqlite/migrations/0001_init.sql @@ -14,43 +14,30 @@ CREATE TABLE projects ( archived_at TIMESTAMP ); --- sessions is the canonical record. id is "{project_id}-{num}" (e.g. mer-1) — a --- single string key, so every inbound FK is single-column. num is the per-project --- counter (computed at insert under the write mutex). Operational metadata is --- folded in (no separate table). is_alive replaces the old runtime axis; there is --- no revision column — the per-session write mutex serializes and change_log.seq --- orders. The display status is derived on read (from this + the pr row), never --- stored. +-- sessions is the durable session fact row. id is "{project_id}-{num}" +-- (e.g. mer-1), so every inbound FK is single-column. num is the per-project +-- counter. The only persisted status-like facts are activity_state and +-- is_terminated; display status is derived on read from this row plus PR facts. CREATE TABLE sessions ( id TEXT PRIMARY KEY, project_id TEXT NOT NULL REFERENCES projects (id), num INTEGER NOT NULL, issue_id TEXT NOT NULL DEFAULT '', - kind TEXT NOT NULL DEFAULT 'worker', + kind TEXT NOT NULL DEFAULT 'worker' + CHECK (kind IN ('worker', 'orchestrator')), harness TEXT NOT NULL DEFAULT '' CHECK (harness IN ('', 'claude-code', 'codex', 'aider', 'opencode')), - session_state TEXT NOT NULL - CHECK (session_state IN ('not_started', 'working', 'idle', 'needs_input', 'stuck', 'detecting', 'done', 'terminated')), - -- only terminal sessions carry a reason; '' otherwise. 
- termination_reason TEXT NOT NULL DEFAULT '' - CHECK (termination_reason IN ('', 'manually_killed', 'runtime_lost', 'agent_process_exited', 'probe_failure', 'error_in_process', 'auto_cleanup', 'pr_merged')), - is_alive INTEGER NOT NULL DEFAULT 0, - - activity_state TEXT NOT NULL DEFAULT 'idle', + activity_state TEXT NOT NULL DEFAULT 'idle' + CHECK (activity_state IN ('active', 'idle', 'waiting_input', 'blocked', 'exited')), activity_last_at TIMESTAMP NOT NULL, - activity_source TEXT NOT NULL DEFAULT 'none', - - -- detecting quarantine memory; NULL when the session is not in detecting. - detecting_attempts INTEGER, - detecting_started_at TIMESTAMP, - detecting_evidence_hash TEXT, + activity_source TEXT NOT NULL DEFAULT 'none' + CHECK (activity_source IN ('native', 'terminal', 'hook', 'runtime', 'none')), + is_terminated BOOLEAN NOT NULL DEFAULT FALSE, - -- folded-in operational handles (was the session_metadata table) branch TEXT NOT NULL DEFAULT '', workspace_path TEXT NOT NULL DEFAULT '', runtime_handle_id TEXT NOT NULL DEFAULT '', - runtime_name TEXT NOT NULL DEFAULT '', agent_session_id TEXT NOT NULL DEFAULT '', prompt TEXT NOT NULL DEFAULT '', @@ -80,9 +67,8 @@ CREATE TABLE pr ( ); CREATE INDEX idx_pr_session ON pr (session_id); --- pr_checks is CI run history: one row per (PR, check, commit). The CI-fix-loop --- brake is a LIMIT 3 query over it ("last 3 runs of this check all failed?") — no --- counter is stored. Re-polling the same commit upserts the same row. +-- pr_checks is CI run history: one row per (PR, check, commit). Re-polling the +-- same commit upserts the same row. 
CREATE TABLE pr_checks ( pr_url TEXT NOT NULL REFERENCES pr (url) ON DELETE CASCADE, name TEXT NOT NULL, @@ -120,8 +106,9 @@ CREATE TABLE change_log ( seq INTEGER PRIMARY KEY AUTOINCREMENT, project_id TEXT NOT NULL REFERENCES projects (id), session_id TEXT REFERENCES sessions (id), - event_type TEXT NOT NULL, - payload TEXT NOT NULL, + event_type TEXT NOT NULL + CHECK (event_type IN ('session_created', 'session_updated', 'pr_created', 'pr_updated', 'pr_check_recorded')), + payload TEXT NOT NULL CHECK (json_valid(payload)), created_at TIMESTAMP NOT NULL DEFAULT (datetime('now')) ); CREATE INDEX idx_change_log_project ON change_log (project_id, seq); @@ -138,8 +125,7 @@ AFTER INSERT ON sessions BEGIN INSERT INTO change_log (project_id, session_id, event_type, payload, created_at) VALUES (NEW.project_id, NEW.id, 'session_created', - json_object('id', NEW.id, 'state', NEW.session_state, 'terminationReason', NEW.termination_reason, - 'isAlive', NEW.is_alive, 'activity', NEW.activity_state), + json_object('id', NEW.id, 'activity', NEW.activity_state, 'isTerminated', json(CASE WHEN NEW.is_terminated THEN 'true' ELSE 'false' END)), NEW.updated_at); END; -- +goose StatementEnd @@ -147,15 +133,12 @@ END; -- +goose StatementBegin CREATE TRIGGER sessions_cdc_update AFTER UPDATE ON sessions -WHEN OLD.session_state <> NEW.session_state - OR OLD.termination_reason <> NEW.termination_reason - OR OLD.is_alive <> NEW.is_alive - OR OLD.activity_state <> NEW.activity_state +WHEN OLD.activity_state <> NEW.activity_state + OR OLD.is_terminated <> NEW.is_terminated BEGIN INSERT INTO change_log (project_id, session_id, event_type, payload, created_at) VALUES (NEW.project_id, NEW.id, 'session_updated', - json_object('id', NEW.id, 'state', NEW.session_state, 'terminationReason', NEW.termination_reason, - 'isAlive', NEW.is_alive, 'activity', NEW.activity_state), + json_object('id', NEW.id, 'activity', NEW.activity_state, 'isTerminated', json(CASE WHEN NEW.is_terminated THEN 'true' ELSE 
'false' END)), NEW.updated_at); END; -- +goose StatementEnd @@ -217,7 +200,7 @@ BEGIN (SELECT session_id FROM pr WHERE url = NEW.pr_url), 'pr_check_recorded', json_object('pr', NEW.pr_url, 'name', NEW.name, 'commit', NEW.commit_hash, 'status', NEW.status), - NEW.created_at); + datetime('now')); END; -- +goose StatementEnd diff --git a/backend/internal/storage/sqlite/migrations/0002_notifications.sql b/backend/internal/storage/sqlite/migrations/0002_notifications.sql deleted file mode 100644 index 1ab12f5b..00000000 --- a/backend/internal/storage/sqlite/migrations/0002_notifications.sql +++ /dev/null @@ -1,81 +0,0 @@ --- +goose Up --- +goose StatementBegin -CREATE TABLE notifications ( - seq INTEGER PRIMARY KEY AUTOINCREMENT, - id TEXT NOT NULL UNIQUE DEFAULT ('ntf_' || lower(hex(randomblob(16)))), - project_id TEXT NOT NULL REFERENCES projects(id), - session_id TEXT NOT NULL REFERENCES sessions(id), - source TEXT NOT NULL DEFAULT 'lifecycle' CHECK (source IN ('lifecycle')), - event_type TEXT NOT NULL, - semantic_type TEXT NOT NULL, - priority TEXT NOT NULL CHECK (priority IN ('urgent','action','warning','info')), - message TEXT NOT NULL, - payload_json TEXT NOT NULL CHECK (json_valid(payload_json)), - actions_json TEXT NOT NULL DEFAULT '[]' CHECK (json_valid(actions_json)), - dedupe_key TEXT NOT NULL UNIQUE, - cause_key TEXT NOT NULL DEFAULT '', - read_at TIMESTAMP, - archived_at TIMESTAMP, - created_at TIMESTAMP NOT NULL DEFAULT (datetime('now')), - updated_at TIMESTAMP NOT NULL DEFAULT (datetime('now')) -); - -CREATE INDEX idx_notifications_project_seq ON notifications(project_id, seq DESC); -CREATE INDEX idx_notifications_session_seq ON notifications(session_id, seq DESC); -CREATE INDEX idx_notifications_unread ON notifications(seq DESC) - WHERE read_at IS NULL AND archived_at IS NULL; --- +goose StatementEnd - --- +goose StatementBegin -CREATE TRIGGER notifications_cdc_insert -AFTER INSERT ON notifications -BEGIN - INSERT INTO change_log (project_id, 
session_id, event_type, payload, created_at) - VALUES ( - NEW.project_id, - NEW.session_id, - 'notification_created', - json_object( - 'seq', NEW.seq, - 'id', NEW.id, - 'type', NEW.semantic_type, - 'priority', NEW.priority, - 'message', NEW.message, - 'data', json(NEW.payload_json), - 'actions', json(NEW.actions_json), - 'readAt', NEW.read_at, - 'archivedAt', NEW.archived_at - ), - NEW.created_at - ); -END; --- +goose StatementEnd - --- +goose StatementBegin -CREATE TRIGGER notifications_cdc_update -AFTER UPDATE ON notifications -WHEN OLD.read_at IS NOT NEW.read_at - OR OLD.archived_at IS NOT NEW.archived_at -BEGIN - INSERT INTO change_log (project_id, session_id, event_type, payload, created_at) - VALUES ( - NEW.project_id, - NEW.session_id, - 'notification_updated', - json_object( - 'seq', NEW.seq, - 'id', NEW.id, - 'readAt', NEW.read_at, - 'archivedAt', NEW.archived_at - ), - NEW.updated_at - ); -END; --- +goose StatementEnd - --- +goose Down --- +goose StatementBegin -DROP TRIGGER IF EXISTS notifications_cdc_update; -DROP TRIGGER IF EXISTS notifications_cdc_insert; -DROP TABLE IF EXISTS notifications; --- +goose StatementEnd diff --git a/backend/internal/storage/sqlite/notification_store.go b/backend/internal/storage/sqlite/notification_store.go deleted file mode 100644 index 90b84331..00000000 --- a/backend/internal/storage/sqlite/notification_store.go +++ /dev/null @@ -1,242 +0,0 @@ -package sqlite - -import ( - "context" - "database/sql" - "encoding/json" - "errors" - "fmt" - "time" - - "github.com/aoagents/agent-orchestrator/backend/internal/domain" - "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite/gen" -) - -// NotificationRow is the storage-facing notification row. It aliases the -// provider-neutral domain type so callers do not depend on sqlc structs. -type NotificationRow = domain.Notification - -// NotificationFilter constrains ListNotifications. A zero filter returns the -// newest notifications across projects. 
-type NotificationFilter struct { - ProjectID string - SessionID string - UnreadOnly bool - Limit int -} - -const defaultNotificationLimit = 100 - -// EnqueueNotification inserts a notification exactly once per dedupe key. The -// returned bool is true when a new row was created; false means the existing row -// for the same dedupe key was returned. -func (s *Store) EnqueueNotification(ctx context.Context, row NotificationRow) (NotificationRow, bool, error) { - row = normalizeNotification(row) - actionsJSON, err := json.Marshal(row.Actions) - if err != nil { - return NotificationRow{}, false, fmt.Errorf("marshal notification actions: %w", err) - } - - s.writeMu.Lock() - defer s.writeMu.Unlock() - - got, err := s.qw.InsertNotification(ctx, gen.InsertNotificationParams{ - ProjectID: string(row.ProjectID), - SessionID: string(row.SessionID), - Source: row.Source, - EventType: row.EventType, - SemanticType: row.SemanticType, - Priority: row.Priority, - Message: row.Message, - PayloadJson: string(row.Payload), - ActionsJson: string(actionsJSON), - DedupeKey: row.DedupeKey, - CauseKey: row.CauseKey, - CreatedAt: row.CreatedAt, - UpdatedAt: row.UpdatedAt, - }) - if errors.Is(err, sql.ErrNoRows) { - existing, readErr := s.qw.GetNotificationByDedupeKey(ctx, row.DedupeKey) - if readErr != nil { - return NotificationRow{}, false, fmt.Errorf("get notification by dedupe %q: %w", row.DedupeKey, readErr) - } - mapped, mapErr := notificationFromGen(existing) - return mapped, false, mapErr - } - if err != nil { - return NotificationRow{}, false, fmt.Errorf("insert notification: %w", err) - } - mapped, err := notificationFromGen(got) - return mapped, true, err -} - -// GetNotification returns one notification by id, or ok=false if absent. 
-func (s *Store) GetNotification(ctx context.Context, id string) (NotificationRow, bool, error) { - row, err := s.qr.GetNotification(ctx, id) - if errors.Is(err, sql.ErrNoRows) { - return NotificationRow{}, false, nil - } - if err != nil { - return NotificationRow{}, false, fmt.Errorf("get notification %s: %w", id, err) - } - mapped, err := notificationFromGen(row) - return mapped, true, err -} - -// ListNotifications returns notifications in descending seq order. -func (s *Store) ListNotifications(ctx context.Context, filter NotificationFilter) ([]NotificationRow, error) { - limit := int64(filter.Limit) - if limit <= 0 { - limit = defaultNotificationLimit - } - - var ( - rows []gen.Notification - err error - ) - switch { - case filter.UnreadOnly: - rows, err = s.qr.ListUnreadNotifications(ctx, limit) - case filter.SessionID != "": - rows, err = s.qr.ListNotificationsBySession(ctx, gen.ListNotificationsBySessionParams{SessionID: filter.SessionID, Limit: limit}) - case filter.ProjectID != "": - rows, err = s.qr.ListNotificationsByProject(ctx, gen.ListNotificationsByProjectParams{ProjectID: filter.ProjectID, Limit: limit}) - default: - rows, err = s.qr.ListNotifications(ctx, limit) - } - if err != nil { - return nil, fmt.Errorf("list notifications: %w", err) - } - return notificationsFromGen(rows) -} - -// MarkNotificationRead marks an unread notification read. The returned bool is -// true only when the row changed; repeated calls return the existing row with -// changed=false and emit no CDC update. 
-func (s *Store) MarkNotificationRead(ctx context.Context, id string, at time.Time) (NotificationRow, bool, error) { - if at.IsZero() { - at = time.Now().UTC() - } - s.writeMu.Lock() - defer s.writeMu.Unlock() - - row, err := s.qw.MarkNotificationRead(ctx, gen.MarkNotificationReadParams{ - ReadAt: sql.NullTime{Time: at, Valid: true}, - UpdatedAt: at, - ID: id, - }) - return s.changedNotificationResult(ctx, row, id, true, err) -} - -// MarkNotificationUnread clears read_at. Repeated calls are idempotent and emit -// no CDC update. -func (s *Store) MarkNotificationUnread(ctx context.Context, id string) (NotificationRow, bool, error) { - now := time.Now().UTC() - s.writeMu.Lock() - defer s.writeMu.Unlock() - - row, err := s.qw.MarkNotificationUnread(ctx, gen.MarkNotificationUnreadParams{UpdatedAt: now, ID: id}) - return s.changedNotificationResult(ctx, row, id, true, err) -} - -// ArchiveNotification marks a notification archived. Repeated calls are -// idempotent and emit no CDC update. -func (s *Store) ArchiveNotification(ctx context.Context, id string, at time.Time) (NotificationRow, bool, error) { - if at.IsZero() { - at = time.Now().UTC() - } - s.writeMu.Lock() - defer s.writeMu.Unlock() - - row, err := s.qw.ArchiveNotification(ctx, gen.ArchiveNotificationParams{ - ArchivedAt: sql.NullTime{Time: at, Valid: true}, - UpdatedAt: at, - ID: id, - }) - return s.changedNotificationResult(ctx, row, id, true, err) -} - -func (s *Store) changedNotificationResult(ctx context.Context, row gen.Notification, id string, changed bool, err error) (NotificationRow, bool, error) { - if errors.Is(err, sql.ErrNoRows) { - existing, readErr := s.qw.GetNotification(ctx, id) - if errors.Is(readErr, sql.ErrNoRows) { - return NotificationRow{}, false, nil - } - if readErr != nil { - return NotificationRow{}, false, fmt.Errorf("get notification %s: %w", id, readErr) - } - mapped, mapErr := notificationFromGen(existing) - return mapped, false, mapErr - } - if err != nil { - return 
NotificationRow{}, false, err - } - mapped, mapErr := notificationFromGen(row) - return mapped, changed, mapErr -} - -func normalizeNotification(row NotificationRow) NotificationRow { - now := time.Now().UTC() - if row.Source == "" { - row.Source = "lifecycle" - } - if len(row.Payload) == 0 { - row.Payload = json.RawMessage(`{}`) - } - if row.Actions == nil { - row.Actions = []domain.NotificationAction{} - } - if row.CreatedAt.IsZero() { - row.CreatedAt = now - } - if row.UpdatedAt.IsZero() { - row.UpdatedAt = row.CreatedAt - } - return row -} - -func notificationsFromGen(rows []gen.Notification) ([]NotificationRow, error) { - out := make([]NotificationRow, 0, len(rows)) - for _, r := range rows { - row, err := notificationFromGen(r) - if err != nil { - return nil, err - } - out = append(out, row) - } - return out, nil -} - -func notificationFromGen(r gen.Notification) (NotificationRow, error) { - var actions []domain.NotificationAction - if r.ActionsJson == "" { - r.ActionsJson = "[]" - } - if err := json.Unmarshal([]byte(r.ActionsJson), &actions); err != nil { - return NotificationRow{}, fmt.Errorf("decode notification actions %s: %w", r.ID, err) - } - row := NotificationRow{ - Seq: r.Seq, - ID: domain.NotificationID(r.ID), - ProjectID: domain.ProjectID(r.ProjectID), - SessionID: domain.SessionID(r.SessionID), - Source: r.Source, - EventType: r.EventType, - SemanticType: r.SemanticType, - Priority: r.Priority, - Message: r.Message, - Payload: append(json.RawMessage(nil), []byte(r.PayloadJson)...), - Actions: actions, - DedupeKey: r.DedupeKey, - CauseKey: r.CauseKey, - CreatedAt: r.CreatedAt, - UpdatedAt: r.UpdatedAt, - } - if r.ReadAt.Valid { - row.ReadAt = r.ReadAt.Time - } - if r.ArchivedAt.Valid { - row.ArchivedAt = r.ArchivedAt.Time - } - return row, nil -} diff --git a/backend/internal/storage/sqlite/notification_store_test.go b/backend/internal/storage/sqlite/notification_store_test.go deleted file mode 100644 index cd5c44a9..00000000 --- 
a/backend/internal/storage/sqlite/notification_store_test.go +++ /dev/null @@ -1,232 +0,0 @@ -package sqlite - -import ( - "context" - "encoding/json" - "fmt" - "strings" - "sync" - "testing" - "time" - - "github.com/aoagents/agent-orchestrator/backend/internal/domain" - "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite/gen" -) - -func TestNotificationInsertListGetAndDedupe(t *testing.T) { - s, rec := newNotificationTestSession(t) - ctx := context.Background() - - row, created, err := s.EnqueueNotification(ctx, sampleNotification(rec, "dedupe-1")) - if err != nil { - t.Fatal(err) - } - if !created || row.ID == "" || row.Seq == 0 { - t.Fatalf("enqueue created=%v row=%+v", created, row) - } - got, ok, err := s.GetNotification(ctx, string(row.ID)) - if err != nil || !ok { - t.Fatalf("get ok=%v err=%v", ok, err) - } - if got.DedupeKey != "dedupe-1" || got.Actions[0].ID != "open-session" { - t.Fatalf("get mismatch: %+v", got) - } - dup, created, err := s.EnqueueNotification(ctx, sampleNotification(rec, "dedupe-1")) - if err != nil { - t.Fatal(err) - } - if created || dup.ID != row.ID || dup.Seq != row.Seq { - t.Fatalf("duplicate should return existing row created=false: created=%v dup=%+v first=%+v", created, dup, row) - } - all, err := s.ListNotifications(ctx, NotificationFilter{Limit: 10}) - if err != nil || len(all) != 1 { - t.Fatalf("list all len=%d err=%v", len(all), err) - } - byProject, _ := s.ListNotifications(ctx, NotificationFilter{ProjectID: string(rec.ProjectID), Limit: 10}) - bySession, _ := s.ListNotifications(ctx, NotificationFilter{SessionID: string(rec.ID), Limit: 10}) - if len(byProject) != 1 || len(bySession) != 1 { - t.Fatalf("project/session lists = %d/%d", len(byProject), len(bySession)) - } -} - -func TestNotificationReadUnreadArchiveAndIdempotentCDC(t *testing.T) { - s, rec := newNotificationTestSession(t) - ctx := context.Background() - row, _, err := s.EnqueueNotification(ctx, sampleNotification(rec, "dedupe-read")) - if 
err != nil { - t.Fatal(err) - } - createdSeq, _ := s.MaxChangeLogSeq(ctx) - - readAt := time.Date(2026, 1, 2, 3, 4, 5, 0, time.UTC) - read, changed, err := s.MarkNotificationRead(ctx, string(row.ID), readAt) - if err != nil || !changed { - t.Fatalf("mark read changed=%v err=%v", changed, err) - } - if read.ReadAt.IsZero() { - t.Fatal("read_at not set") - } - afterRead, _ := s.MaxChangeLogSeq(ctx) - if afterRead != createdSeq+1 { - t.Fatalf("read should emit one CDC event: before=%d after=%d", createdSeq, afterRead) - } - _, changed, err = s.MarkNotificationRead(ctx, string(row.ID), readAt.Add(time.Second)) - if err != nil || changed { - t.Fatalf("repeated mark read should be idempotent changed=false, got changed=%v err=%v", changed, err) - } - afterRepeat, _ := s.MaxChangeLogSeq(ctx) - if afterRepeat != afterRead { - t.Fatalf("repeated read emitted CDC: before=%d after=%d", afterRead, afterRepeat) - } - - unread, changed, err := s.MarkNotificationUnread(ctx, string(row.ID)) - if err != nil || !changed || !unread.ReadAt.IsZero() { - t.Fatalf("mark unread changed=%v err=%v row=%+v", changed, err, unread) - } - unreadList, err := s.ListNotifications(ctx, NotificationFilter{UnreadOnly: true, Limit: 10}) - if err != nil || len(unreadList) != 1 { - t.Fatalf("unread list len=%d err=%v", len(unreadList), err) - } - - archiveSeq, _ := s.MaxChangeLogSeq(ctx) - archived, changed, err := s.ArchiveNotification(ctx, string(row.ID), readAt.Add(2*time.Second)) - if err != nil || !changed || archived.ArchivedAt.IsZero() { - t.Fatalf("archive changed=%v err=%v row=%+v", changed, err, archived) - } - afterArchive, _ := s.MaxChangeLogSeq(ctx) - if afterArchive != archiveSeq+1 { - t.Fatalf("archive should emit one CDC event: before=%d after=%d", archiveSeq, afterArchive) - } - _, changed, err = s.ArchiveNotification(ctx, string(row.ID), readAt.Add(3*time.Second)) - if err != nil || changed { - t.Fatalf("repeated archive should be idempotent changed=false, got changed=%v err=%v", 
changed, err) - } - afterArchiveRepeat, _ := s.MaxChangeLogSeq(ctx) - if afterArchiveRepeat != afterArchive { - t.Fatalf("repeated archive emitted CDC: before=%d after=%d", afterArchive, afterArchiveRepeat) - } -} - -func TestNotificationJSONConstraintsRejectInvalidPayloadAndActions(t *testing.T) { - s, rec := newNotificationTestSession(t) - ctx := context.Background() - - badPayload := sampleNotification(rec, "bad-payload") - badPayload.Payload = json.RawMessage(`{"nope"`) - if _, _, err := s.EnqueueNotification(ctx, badPayload); err == nil { - t.Fatal("invalid payload JSON should be rejected") - } - - now := time.Now().UTC().Truncate(time.Second) - _, err := s.qw.InsertNotification(ctx, gen.InsertNotificationParams{ - ProjectID: string(rec.ProjectID), - SessionID: string(rec.ID), - Source: "lifecycle", - EventType: "reaction.agent-needs-input", - SemanticType: "session.needs_input", - Priority: "urgent", - Message: "bad actions", - PayloadJson: `{}`, - ActionsJson: `{not-json`, - DedupeKey: "bad-actions", - CauseKey: "agent-needs-input", - CreatedAt: now, - UpdatedAt: now, - }) - if err == nil { - t.Fatal("invalid actions JSON should be rejected") - } -} - -func TestNotificationCDCForCreateReadArchive(t *testing.T) { - s, rec := newNotificationTestSession(t) - ctx := context.Background() - startSeq, _ := s.MaxChangeLogSeq(ctx) - row, _, err := s.EnqueueNotification(ctx, sampleNotification(rec, "dedupe-cdc")) - if err != nil { - t.Fatal(err) - } - _, _, _ = s.MarkNotificationRead(ctx, string(row.ID), time.Now().UTC()) - _, _, _ = s.ArchiveNotification(ctx, string(row.ID), time.Now().UTC()) - - evs, err := s.ReadChangeLogAfter(ctx, startSeq, 10) - if err != nil { - t.Fatal(err) - } - var types []string - for _, e := range evs { - types = append(types, e.EventType) - if e.EventType == "notification_created" && !strings.Contains(e.Payload, `"data"`) { - t.Fatalf("notification_created payload missing data: %s", e.Payload) - } - } - want := 
[]string{"notification_created", "notification_updated", "notification_updated"} - if fmt.Sprint(types) != fmt.Sprint(want) { - t.Fatalf("notification CDC types = %v, want %v", types, want) - } -} - -func TestConcurrentNotificationEnqueueSameDedupeCreatesOneRow(t *testing.T) { - s, rec := newNotificationTestSession(t) - ctx := context.Background() - const n = 20 - var wg sync.WaitGroup - ids := make(chan domain.NotificationID, n) - for i := 0; i < n; i++ { - wg.Add(1) - go func() { - defer wg.Done() - row, _, err := s.EnqueueNotification(ctx, sampleNotification(rec, "dedupe-concurrent")) - if err != nil { - t.Errorf("enqueue: %v", err) - return - } - ids <- row.ID - }() - } - wg.Wait() - close(ids) - var first domain.NotificationID - for id := range ids { - if first == "" { - first = id - } - if id != first { - t.Fatalf("all callers should see same id, got %q and %q", first, id) - } - } - rows, err := s.ListNotifications(ctx, NotificationFilter{Limit: 10}) - if err != nil || len(rows) != 1 { - t.Fatalf("list len=%d err=%v", len(rows), err) - } -} - -func newNotificationTestSession(t *testing.T) (*Store, domain.SessionRecord) { - t.Helper() - s := newTestStore(t) - seedProject(t, s, "ao") - rec, err := s.CreateSession(context.Background(), sampleRecord("ao")) - if err != nil { - t.Fatalf("create session: %v", err) - } - return s, rec -} - -func sampleNotification(rec domain.SessionRecord, dedupe string) NotificationRow { - now := time.Now().UTC().Truncate(time.Second) - return NotificationRow{ - ProjectID: rec.ProjectID, - SessionID: rec.ID, - Source: "lifecycle", - EventType: "reaction.agent-needs-input", - SemanticType: "session.needs_input", - Priority: "urgent", - Message: "Agent needs input to continue.", - Payload: json.RawMessage(`{"schemaVersion":3,"semanticType":"session.needs_input"}`), - Actions: []domain.NotificationAction{{ID: "open-session", Kind: "route", Label: "Open session", Route: "/projects/ao/sessions/ao-1"}}, - DedupeKey: dedupe, - CauseKey: 
"agent-needs-input", - CreatedAt: now, - UpdatedAt: now, - } -} diff --git a/backend/internal/storage/sqlite/pr_facts.go b/backend/internal/storage/sqlite/pr_facts.go deleted file mode 100644 index c0c3068b..00000000 --- a/backend/internal/storage/sqlite/pr_facts.go +++ /dev/null @@ -1,43 +0,0 @@ -package sqlite - -import ( - "context" - - "github.com/aoagents/agent-orchestrator/backend/internal/domain" -) - -// PRFactsForSession picks the PR that drives display/reaction status — the -// newest non-closed PR, else the newest PR — and folds in whether it has -// unresolved review comments. -func (s *Store) PRFactsForSession(ctx context.Context, id domain.SessionID) (domain.PRFacts, error) { - rows, err := s.ListPRsBySession(ctx, string(id)) - if err != nil { - return domain.PRFacts{}, err - } - if len(rows) == 0 { - return domain.PRFacts{}, nil - } - pick := rows[0] - for _, r := range rows { - if !r.Merged && !r.Closed { // newest non-closed (draft or open) - pick = r - break - } - } - facts := domain.PRFacts{ - URL: pick.URL, Number: pick.Number, Exists: true, - Draft: pick.Draft, Merged: pick.Merged, Closed: pick.Closed, - CI: pick.CI, Review: pick.Review, Mergeability: pick.Mergeability, - } - comments, err := s.ListPRComments(ctx, pick.URL) - if err != nil { - return domain.PRFacts{}, err - } - for _, c := range comments { - if !c.Resolved { - facts.ReviewComments = true - break - } - } - return facts, nil -} diff --git a/backend/internal/storage/sqlite/pr_store.go b/backend/internal/storage/sqlite/pr_store.go deleted file mode 100644 index 1d57b40d..00000000 --- a/backend/internal/storage/sqlite/pr_store.go +++ /dev/null @@ -1,246 +0,0 @@ -package sqlite - -import ( - "context" - "database/sql" - "errors" - "fmt" - - "github.com/aoagents/agent-orchestrator/backend/internal/domain" - "github.com/aoagents/agent-orchestrator/backend/internal/ports" - "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite/gen" -) - -// The pr / pr_checks / 
pr_comment rows are modelled by domain.PRRow / -// domain.PRCheckRow / domain.PRComment — flat tables, one shared type per table. -// This layer only maps those to/from the sqlc gen.* params: the bool PR state -// becomes the single pr.state column, empty enums default to their -// "nothing known yet" value (matching the CHECK constraints), and ints widen to -// int64. - -// Compile-time proof that *Store satisfies both ports it is wired into, so a -// drift between either interface and this implementation fails here at the point -// of definition rather than later at the call sites in lifecycle_wiring / tests. -var ( - _ ports.SessionStore = (*Store)(nil) - _ ports.PRWriter = (*Store)(nil) -) - -// UpsertPR inserts or replaces the scalar PR facts for a PR URL. -func (s *Store) UpsertPR(ctx context.Context, r domain.PRRow) error { - s.writeMu.Lock() - defer s.writeMu.Unlock() - return s.qw.UpsertPR(ctx, genPRParams(r)) -} - -// WritePR persists a full PR observation — scalar facts, check runs, and the -// replacement comment set — in one write transaction, so the rows and the -// change_log events their triggers emit are committed all-or-nothing. The scalar -// PR upsert runs first so the checks'/comments' CDC triggers can resolve the -// session id from the pr row within the same transaction. 
-func (s *Store) WritePR(ctx context.Context, pr domain.PRRow, checks []domain.PRCheckRow, comments []domain.PRComment) error { - s.writeMu.Lock() - defer s.writeMu.Unlock() - return s.inTx(ctx, "write pr observation", func(q *gen.Queries) error { - if err := q.UpsertPR(ctx, genPRParams(pr)); err != nil { - return err - } - for _, c := range checks { - if err := q.UpsertPRCheck(ctx, genCheckParams(c)); err != nil { - return err - } - } - if err := q.DeletePRComments(ctx, pr.URL); err != nil { - return err - } - for _, c := range comments { - if err := q.UpsertPRComment(ctx, genCommentParams(pr.URL, c)); err != nil { - return fmt.Errorf("comment %q: %w", c.ID, err) - } - } - return nil - }) -} - -// GetPR returns the PR facts for a URL, or ok=false if absent. -func (s *Store) GetPR(ctx context.Context, url string) (domain.PRRow, bool, error) { - p, err := s.qr.GetPR(ctx, url) - if errors.Is(err, sql.ErrNoRows) { - return domain.PRRow{}, false, nil - } - if err != nil { - return domain.PRRow{}, false, fmt.Errorf("get pr %s: %w", url, err) - } - return prRowFromGen(p), true, nil -} - -// ListPRsBySession returns every PR owned by a session, newest first. -func (s *Store) ListPRsBySession(ctx context.Context, sessionID string) ([]domain.PRRow, error) { - rows, err := s.qr.ListPRsBySession(ctx, sessionID) - if err != nil { - return nil, fmt.Errorf("list prs for %s: %w", sessionID, err) - } - out := make([]domain.PRRow, 0, len(rows)) - for _, p := range rows { - out = append(out, prRowFromGen(p)) - } - return out, nil -} - -// DeletePR removes a PR (cascades to its checks + comments). -func (s *Store) DeletePR(ctx context.Context, url string) error { - s.writeMu.Lock() - defer s.writeMu.Unlock() - return s.qw.DeletePR(ctx, url) -} - -// RecordCheck upserts a CI check run. Re-polling the same (pr, name, commit) -// updates the same row; a new commit creates a new row (a fresh agent attempt). 
-func (s *Store) RecordCheck(ctx context.Context, r domain.PRCheckRow) error { - s.writeMu.Lock() - defer s.writeMu.Unlock() - return s.qw.UpsertPRCheck(ctx, genCheckParams(r)) -} - -// RecentCheckStatuses returns the statuses of the last `limit` runs of a check, -// most-recent first. The CI-fix-loop brake reads this: "last 3 all failed?". -func (s *Store) RecentCheckStatuses(ctx context.Context, prURL, name string, limit int) ([]string, error) { - rows, err := s.qr.ListRecentChecks(ctx, gen.ListRecentChecksParams{ - PrUrl: prURL, Name: name, Limit: int64(limit), - }) - if err != nil { - return nil, fmt.Errorf("recent checks %s/%s: %w", prURL, name, err) - } - out := make([]string, 0, len(rows)) - for _, r := range rows { - out = append(out, r.Status) - } - return out, nil -} - -// ListChecks returns every recorded check run for a PR. -func (s *Store) ListChecks(ctx context.Context, prURL string) ([]domain.PRCheckRow, error) { - rows, err := s.qr.ListChecksByPR(ctx, prURL) - if err != nil { - return nil, fmt.Errorf("list checks %s: %w", prURL, err) - } - out := make([]domain.PRCheckRow, 0, len(rows)) - for _, c := range rows { - out = append(out, checkRowFromGen(c)) - } - return out, nil -} - -// ReplacePRComments atomically replaces the full comment set for a PR (each SCM -// fetch reports the current set, so a replace keeps it in sync). -func (s *Store) ReplacePRComments(ctx context.Context, prURL string, comments []domain.PRComment) error { - s.writeMu.Lock() - defer s.writeMu.Unlock() - return s.inTx(ctx, "replace pr comments", func(q *gen.Queries) error { - if err := q.DeletePRComments(ctx, prURL); err != nil { - return err - } - for _, c := range comments { - if err := q.UpsertPRComment(ctx, genCommentParams(prURL, c)); err != nil { - return fmt.Errorf("comment %q: %w", c.ID, err) - } - } - return nil - }) -} - -// ListPRComments returns a PR's review comments, oldest first. 
-func (s *Store) ListPRComments(ctx context.Context, prURL string) ([]domain.PRComment, error) { - rows, err := s.qr.ListPRComments(ctx, prURL) - if err != nil { - return nil, fmt.Errorf("list pr comments %s: %w", prURL, err) - } - out := make([]domain.PRComment, 0, len(rows)) - for _, c := range rows { - out = append(out, commentFromGen(c)) - } - return out, nil -} - -// ---- domain <-> gen mapping ---- - -// prState collapses the PR's bools into the single pr.state column value. -func prState(r domain.PRRow) string { - switch { - case r.Merged: - return "merged" - case r.Closed: - return "closed" - case r.Draft: - return "draft" - default: - return "open" - } -} - -func orDefault(v, def string) string { - if v == "" { - return def - } - return v -} - -func genPRParams(r domain.PRRow) gen.UpsertPRParams { - return gen.UpsertPRParams{ - Url: r.URL, - SessionID: r.SessionID, - Number: int64(r.Number), - PrState: prState(r), - ReviewDecision: orDefault(string(r.Review), "none"), - CiState: orDefault(string(r.CI), "unknown"), - Mergeability: orDefault(string(r.Mergeability), "unknown"), - UpdatedAt: r.UpdatedAt, - } -} - -func prRowFromGen(p gen.Pr) domain.PRRow { - return domain.PRRow{ - URL: p.Url, - SessionID: p.SessionID, - Number: int(p.Number), - Draft: p.PrState == "draft", - Merged: p.PrState == "merged", - Closed: p.PrState == "closed", - CI: domain.CIState(p.CiState), - Review: domain.ReviewDecision(p.ReviewDecision), - Mergeability: domain.Mergeability(p.Mergeability), - UpdatedAt: p.UpdatedAt, - } -} - -func genCheckParams(c domain.PRCheckRow) gen.UpsertPRCheckParams { - status := c.Status - if status == "" { - status = "unknown" - } - return gen.UpsertPRCheckParams{ - PrUrl: c.PRURL, Name: c.Name, CommitHash: c.CommitHash, - Status: status, Url: c.URL, LogTail: c.LogTail, CreatedAt: c.CreatedAt, - } -} - -func checkRowFromGen(c gen.PrCheck) domain.PRCheckRow { - return domain.PRCheckRow{ - PRURL: c.PrUrl, Name: c.Name, CommitHash: c.CommitHash, - Status: 
c.Status, URL: c.Url, LogTail: c.LogTail, CreatedAt: c.CreatedAt, - } -} - -func genCommentParams(prURL string, c domain.PRComment) gen.UpsertPRCommentParams { - return gen.UpsertPRCommentParams{ - PrUrl: prURL, CommentID: c.ID, Author: c.Author, File: c.File, - Line: int64(c.Line), Body: c.Body, Resolved: boolToInt(c.Resolved), CreatedAt: c.CreatedAt, - } -} - -func commentFromGen(c gen.PrComment) domain.PRComment { - return domain.PRComment{ - ID: c.CommentID, Author: c.Author, File: c.File, Line: int(c.Line), - Body: c.Body, Resolved: c.Resolved != 0, CreatedAt: c.CreatedAt, - } -} diff --git a/backend/internal/storage/sqlite/project_store.go b/backend/internal/storage/sqlite/project_store.go deleted file mode 100644 index d81943c3..00000000 --- a/backend/internal/storage/sqlite/project_store.go +++ /dev/null @@ -1,93 +0,0 @@ -package sqlite - -import ( - "context" - "database/sql" - "errors" - "fmt" - "time" - - "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite/gen" -) - -// ProjectRow is one registered repo (the projects table). id is a short slug -// (mer, ao). ArchivedAt zero means active. -type ProjectRow struct { - ID string - Path string - RepoOriginURL string - DisplayName string - RegisteredAt time.Time - ArchivedAt time.Time -} - -// UpsertProject inserts or updates a registered project. -func (s *Store) UpsertProject(ctx context.Context, r ProjectRow) error { - s.writeMu.Lock() - defer s.writeMu.Unlock() - return s.qw.UpsertProject(ctx, gen.UpsertProjectParams{ - ID: r.ID, - Path: r.Path, - RepoOriginUrl: r.RepoOriginURL, - DisplayName: r.DisplayName, - RegisteredAt: r.RegisteredAt, - ArchivedAt: nullTime(r.ArchivedAt), - }) -} - -// GetProject returns a project by id (active or archived), or ok=false. 
-func (s *Store) GetProject(ctx context.Context, id string) (ProjectRow, bool, error) { - p, err := s.qr.GetProject(ctx, id) - if errors.Is(err, sql.ErrNoRows) { - return ProjectRow{}, false, nil - } - if err != nil { - return ProjectRow{}, false, fmt.Errorf("get project %s: %w", id, err) - } - return projectRowFromGen(p), true, nil -} - -// ListProjects returns active (non-archived) projects, ordered by id. -func (s *Store) ListProjects(ctx context.Context) ([]ProjectRow, error) { - rows, err := s.qr.ListProjects(ctx) - if err != nil { - return nil, fmt.Errorf("list projects: %w", err) - } - out := make([]ProjectRow, 0, len(rows)) - for _, p := range rows { - out = append(out, projectRowFromGen(p)) - } - return out, nil -} - -// ArchiveProject soft-deletes a project (the row stays so session.project_id -// still resolves). -func (s *Store) ArchiveProject(ctx context.Context, id string, at time.Time) error { - s.writeMu.Lock() - defer s.writeMu.Unlock() - return s.qw.ArchiveProject(ctx, gen.ArchiveProjectParams{ - ArchivedAt: nullTime(at), - ID: id, - }) -} - -func projectRowFromGen(p gen.Project) ProjectRow { - r := ProjectRow{ - ID: p.ID, - Path: p.Path, - RepoOriginURL: p.RepoOriginUrl, - DisplayName: p.DisplayName, - RegisteredAt: p.RegisteredAt, - } - if p.ArchivedAt.Valid { - r.ArchivedAt = p.ArchivedAt.Time - } - return r -} - -func nullTime(t time.Time) sql.NullTime { - if t.IsZero() { - return sql.NullTime{} - } - return sql.NullTime{Time: t, Valid: true} -} diff --git a/backend/internal/storage/sqlite/queries/changelog.sql b/backend/internal/storage/sqlite/queries/changelog.sql index 0e11899c..9d41a3e3 100644 --- a/backend/internal/storage/sqlite/queries/changelog.sql +++ b/backend/internal/storage/sqlite/queries/changelog.sql @@ -2,9 +2,6 @@ SELECT seq, project_id, session_id, event_type, payload, created_at FROM change_log WHERE seq > ? 
ORDER BY seq LIMIT ?; --- name: ReadChangeLogAfterForProject :many -SELECT seq, project_id, session_id, event_type, payload, created_at -FROM change_log WHERE project_id = ? AND seq > ? ORDER BY seq LIMIT ?; -- name: MaxChangeLogSeq :one -SELECT COALESCE(MAX(seq), 0) AS seq FROM change_log; +SELECT CAST(COALESCE(MAX(seq), 0) AS INTEGER) AS seq FROM change_log; diff --git a/backend/internal/storage/sqlite/queries/notifications.sql b/backend/internal/storage/sqlite/queries/notifications.sql deleted file mode 100644 index a896b43c..00000000 --- a/backend/internal/storage/sqlite/queries/notifications.sql +++ /dev/null @@ -1,70 +0,0 @@ --- name: InsertNotification :one -INSERT INTO notifications ( - project_id, session_id, source, event_type, semantic_type, priority, - message, payload_json, actions_json, dedupe_key, cause_key, created_at, updated_at -) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) -ON CONFLICT (dedupe_key) DO NOTHING -RETURNING seq, id, project_id, session_id, source, event_type, semantic_type, priority, - message, payload_json, actions_json, dedupe_key, cause_key, read_at, archived_at, created_at, updated_at; - --- name: GetNotification :one -SELECT seq, id, project_id, session_id, source, event_type, semantic_type, priority, - message, payload_json, actions_json, dedupe_key, cause_key, read_at, archived_at, created_at, updated_at -FROM notifications WHERE id = ?; - --- name: GetNotificationByDedupeKey :one -SELECT seq, id, project_id, session_id, source, event_type, semantic_type, priority, - message, payload_json, actions_json, dedupe_key, cause_key, read_at, archived_at, created_at, updated_at -FROM notifications WHERE dedupe_key = ?; - --- name: ListNotifications :many -SELECT seq, id, project_id, session_id, source, event_type, semantic_type, priority, - message, payload_json, actions_json, dedupe_key, cause_key, read_at, archived_at, created_at, updated_at -FROM notifications -ORDER BY seq DESC -LIMIT ?; - --- name: ListNotificationsByProject 
:many -SELECT seq, id, project_id, session_id, source, event_type, semantic_type, priority, - message, payload_json, actions_json, dedupe_key, cause_key, read_at, archived_at, created_at, updated_at -FROM notifications -WHERE project_id = ? -ORDER BY seq DESC -LIMIT ?; - --- name: ListNotificationsBySession :many -SELECT seq, id, project_id, session_id, source, event_type, semantic_type, priority, - message, payload_json, actions_json, dedupe_key, cause_key, read_at, archived_at, created_at, updated_at -FROM notifications -WHERE session_id = ? -ORDER BY seq DESC -LIMIT ?; - --- name: ListUnreadNotifications :many -SELECT seq, id, project_id, session_id, source, event_type, semantic_type, priority, - message, payload_json, actions_json, dedupe_key, cause_key, read_at, archived_at, created_at, updated_at -FROM notifications -WHERE read_at IS NULL AND archived_at IS NULL -ORDER BY seq DESC -LIMIT ?; - --- name: MarkNotificationRead :one -UPDATE notifications -SET read_at = ?, updated_at = ? -WHERE id = ? AND read_at IS NULL -RETURNING seq, id, project_id, session_id, source, event_type, semantic_type, priority, - message, payload_json, actions_json, dedupe_key, cause_key, read_at, archived_at, created_at, updated_at; - --- name: MarkNotificationUnread :one -UPDATE notifications -SET read_at = NULL, updated_at = ? -WHERE id = ? AND read_at IS NOT NULL -RETURNING seq, id, project_id, session_id, source, event_type, semantic_type, priority, - message, payload_json, actions_json, dedupe_key, cause_key, read_at, archived_at, created_at, updated_at; - --- name: ArchiveNotification :one -UPDATE notifications -SET archived_at = ?, updated_at = ? -WHERE id = ? 
AND archived_at IS NULL -RETURNING seq, id, project_id, session_id, source, event_type, semantic_type, priority, - message, payload_json, actions_json, dedupe_key, cause_key, read_at, archived_at, created_at, updated_at; diff --git a/backend/internal/storage/sqlite/queries/pr.sql b/backend/internal/storage/sqlite/queries/pr.sql index e6b41cf1..508eddd4 100644 --- a/backend/internal/storage/sqlite/queries/pr.sql +++ b/backend/internal/storage/sqlite/queries/pr.sql @@ -2,7 +2,6 @@ INSERT INTO pr (url, session_id, number, pr_state, review_decision, ci_state, mergeability, updated_at) VALUES (?, ?, ?, ?, ?, ?, ?, ?) ON CONFLICT (url) DO UPDATE SET - session_id = excluded.session_id, number = excluded.number, pr_state = excluded.pr_state, review_decision = excluded.review_decision, @@ -11,10 +10,34 @@ ON CONFLICT (url) DO UPDATE SET updated_at = excluded.updated_at; -- name: GetPR :one -SELECT * FROM pr WHERE url = ?; +SELECT url, session_id, number, pr_state, review_decision, ci_state, mergeability, updated_at +FROM pr +WHERE url = ?; -- name: ListPRsBySession :many -SELECT * FROM pr WHERE session_id = ? ORDER BY updated_at DESC; +SELECT url, session_id, number, pr_state, review_decision, ci_state, mergeability, updated_at +FROM pr +WHERE session_id = ? +ORDER BY updated_at DESC; --- name: DeletePR :exec -DELETE FROM pr WHERE url = ?; + +-- name: GetDisplayPRFactsBySession :one +SELECT + pr.url, + pr.number, + pr.pr_state, + pr.review_decision, + pr.ci_state, + pr.mergeability, + EXISTS ( + SELECT 1 + FROM pr_comment + WHERE pr_comment.pr_url = pr.url + AND pr_comment.resolved = 0 + ) AS review_comments +FROM pr +WHERE pr.session_id = ? 
+ORDER BY + CASE WHEN pr.pr_state NOT IN ('merged', 'closed') THEN 0 ELSE 1 END, + pr.updated_at DESC +LIMIT 1; diff --git a/backend/internal/storage/sqlite/queries/pr_checks.sql b/backend/internal/storage/sqlite/queries/pr_checks.sql index 2e3e3c15..2e223729 100644 --- a/backend/internal/storage/sqlite/queries/pr_checks.sql +++ b/backend/internal/storage/sqlite/queries/pr_checks.sql @@ -6,10 +6,6 @@ ON CONFLICT (pr_url, name, commit_hash) DO UPDATE SET url = excluded.url, log_tail = excluded.log_tail; --- name: ListRecentChecks :many -SELECT status, commit_hash, created_at FROM pr_checks -WHERE pr_url = ? AND name = ? -ORDER BY created_at DESC LIMIT ?; - -- name: ListChecksByPR :many -SELECT * FROM pr_checks WHERE pr_url = ? ORDER BY name, created_at; +SELECT pr_url, name, commit_hash, status, url, log_tail, created_at +FROM pr_checks WHERE pr_url = ? ORDER BY name, created_at; diff --git a/backend/internal/storage/sqlite/queries/pr_comment.sql b/backend/internal/storage/sqlite/queries/pr_comment.sql index df4f99d0..870a87d7 100644 --- a/backend/internal/storage/sqlite/queries/pr_comment.sql +++ b/backend/internal/storage/sqlite/queries/pr_comment.sql @@ -1,12 +1,10 @@ --- name: UpsertPRComment :exec +-- name: InsertPRComment :exec INSERT INTO pr_comment (pr_url, comment_id, author, file, line, body, resolved, created_at) -VALUES (?, ?, ?, ?, ?, ?, ?, ?) -ON CONFLICT (pr_url, comment_id) DO UPDATE SET - author = excluded.author, file = excluded.file, line = excluded.line, - body = excluded.body, resolved = excluded.resolved; +VALUES (?, ?, ?, ?, ?, ?, ?, ?); -- name: DeletePRComments :exec DELETE FROM pr_comment WHERE pr_url = ?; -- name: ListPRComments :many -SELECT * FROM pr_comment WHERE pr_url = ? ORDER BY created_at, comment_id; +SELECT pr_url, comment_id, author, file, line, body, resolved, created_at +FROM pr_comment WHERE pr_url = ? 
ORDER BY created_at, comment_id; diff --git a/backend/internal/storage/sqlite/queries/projects.sql b/backend/internal/storage/sqlite/queries/projects.sql index 3dc28950..c5706035 100644 --- a/backend/internal/storage/sqlite/queries/projects.sql +++ b/backend/internal/storage/sqlite/queries/projects.sql @@ -15,5 +15,9 @@ FROM projects WHERE id = ?; SELECT id, path, repo_origin_url, display_name, registered_at, archived_at FROM projects WHERE archived_at IS NULL ORDER BY id; --- name: ArchiveProject :exec +-- name: FindProjectByPath :one +SELECT id, path, repo_origin_url, display_name, registered_at, archived_at +FROM projects WHERE path = ?; + +-- name: ArchiveProject :execrows UPDATE projects SET archived_at = ? WHERE id = ?; diff --git a/backend/internal/storage/sqlite/queries/sessions.sql b/backend/internal/storage/sqlite/queries/sessions.sql index 9b294de3..799718b8 100644 --- a/backend/internal/storage/sqlite/queries/sessions.sql +++ b/backend/internal/storage/sqlite/queries/sessions.sql @@ -4,31 +4,34 @@ SELECT COALESCE(MAX(num), 0) + 1 AS next FROM sessions WHERE project_id = ?; -- name: InsertSession :exec INSERT INTO sessions ( id, project_id, num, issue_id, kind, harness, - session_state, termination_reason, is_alive, - activity_state, activity_last_at, activity_source, - detecting_attempts, detecting_started_at, detecting_evidence_hash, - branch, workspace_path, runtime_handle_id, runtime_name, agent_session_id, prompt, + activity_state, activity_last_at, activity_source, is_terminated, + branch, workspace_path, runtime_handle_id, agent_session_id, prompt, created_at, updated_at -) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?); +) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?); -- name: UpdateSession :exec UPDATE sessions SET issue_id = ?, kind = ?, harness = ?, - session_state = ?, termination_reason = ?, is_alive = ?, - activity_state = ?, activity_last_at = ?, activity_source = ?, - detecting_attempts = ?, 
detecting_started_at = ?, detecting_evidence_hash = ?, - branch = ?, workspace_path = ?, runtime_handle_id = ?, runtime_name = ?, agent_session_id = ?, prompt = ?, + activity_state = ?, activity_last_at = ?, activity_source = ?, is_terminated = ?, + branch = ?, workspace_path = ?, runtime_handle_id = ?, agent_session_id = ?, prompt = ?, updated_at = ? WHERE id = ?; -- name: GetSession :one -SELECT * FROM sessions WHERE id = ?; +SELECT id, project_id, num, issue_id, kind, harness, + activity_state, activity_last_at, activity_source, is_terminated, branch, workspace_path, + runtime_handle_id, agent_session_id, prompt, created_at, updated_at +FROM sessions WHERE id = ?; -- name: ListSessionsByProject :many -SELECT * FROM sessions WHERE project_id = ? ORDER BY num; +SELECT id, project_id, num, issue_id, kind, harness, + activity_state, activity_last_at, activity_source, is_terminated, branch, workspace_path, + runtime_handle_id, agent_session_id, prompt, created_at, updated_at +FROM sessions WHERE project_id = ? ORDER BY num; -- name: ListAllSessions :many -SELECT * FROM sessions ORDER BY project_id, num; +SELECT id, project_id, num, issue_id, kind, harness, + activity_state, activity_last_at, activity_source, is_terminated, branch, workspace_path, + runtime_handle_id, agent_session_id, prompt, created_at, updated_at +FROM sessions ORDER BY project_id, num; --- name: DeleteSession :exec -DELETE FROM sessions WHERE id = ?; diff --git a/backend/internal/storage/sqlite/store.go b/backend/internal/storage/sqlite/store.go deleted file mode 100644 index 34d028da..00000000 --- a/backend/internal/storage/sqlite/store.go +++ /dev/null @@ -1,134 +0,0 @@ -package sqlite - -import ( - "context" - "database/sql" - "errors" - "fmt" - "sync" - - "github.com/aoagents/agent-orchestrator/backend/internal/domain" - "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite/gen" -) - -// Store is the SQLite-backed persistence layer. 
It routes writes to a single -// writer connection (qw) and reads to a reader pool (qr) — see Open. writeMu -// guards the read-modify-write write methods (e.g. CreateSession's -// next-num-then-insert) so concurrent writes can't interleave them. -// -// CDC is captured by DB triggers (migration 0001), NOT by this layer: the store -// never writes change_log, it only reads it for the CDC poller. -type Store struct { - writeDB *sql.DB - readDB *sql.DB - qw *gen.Queries // bound to the single writer connection - qr *gen.Queries // bound to the reader pool - writeMu sync.Mutex -} - -// NewStore wraps an opened writer + reader *sql.DB (see Open) as a Store. -func NewStore(writeDB, readDB *sql.DB) *Store { - return &Store{ - writeDB: writeDB, - readDB: readDB, - qw: gen.New(writeDB), - qr: gen.New(readDB), - } -} - -// Close closes both pools. -func (s *Store) Close() error { - err := s.writeDB.Close() - if e := s.readDB.Close(); e != nil && err == nil { - err = e - } - return err -} - -// ---- sessions ---- - -// CreateSession assigns the per-project identity ("{project}-{num}") and inserts -// the record, returning it with ID populated. The next-num read and the insert -// run on the writer connection under writeMu, so two concurrent creates in the -// same project can't collide on num. -func (s *Store) CreateSession(ctx context.Context, rec domain.SessionRecord) (domain.SessionRecord, error) { - s.writeMu.Lock() - defer s.writeMu.Unlock() - - num, err := s.qw.NextSessionNum(ctx, string(rec.ProjectID)) - if err != nil { - return domain.SessionRecord{}, fmt.Errorf("next session num for %s: %w", rec.ProjectID, err) - } - rec.ID = domain.SessionID(fmt.Sprintf("%s-%d", rec.ProjectID, num)) - if err := s.qw.InsertSession(ctx, recordToInsert(rec, num)); err != nil { - return domain.SessionRecord{}, fmt.Errorf("insert session %s: %w", rec.ID, err) - } - return rec, nil -} - -// UpdateSession writes the full mutable state of an existing session. 
The -// id/project/num/created_at are immutable and not touched here. -func (s *Store) UpdateSession(ctx context.Context, rec domain.SessionRecord) error { - s.writeMu.Lock() - defer s.writeMu.Unlock() - return s.qw.UpdateSession(ctx, recordToUpdate(rec)) -} - -// GetSession returns the full record for a session, or ok=false if absent. -func (s *Store) GetSession(ctx context.Context, id domain.SessionID) (domain.SessionRecord, bool, error) { - row, err := s.qr.GetSession(ctx, string(id)) - if errors.Is(err, sql.ErrNoRows) { - return domain.SessionRecord{}, false, nil - } - if err != nil { - return domain.SessionRecord{}, false, fmt.Errorf("get session %s: %w", id, err) - } - return rowToRecord(row), true, nil -} - -// ListSessions returns every session in a project, ordered by num. -func (s *Store) ListSessions(ctx context.Context, project domain.ProjectID) ([]domain.SessionRecord, error) { - rows, err := s.qr.ListSessionsByProject(ctx, string(project)) - if err != nil { - return nil, fmt.Errorf("list sessions for %s: %w", project, err) - } - return mapSessionRows(rows), nil -} - -// ListAllSessions returns every session across all projects. -func (s *Store) ListAllSessions(ctx context.Context) ([]domain.SessionRecord, error) { - rows, err := s.qr.ListAllSessions(ctx) - if err != nil { - return nil, fmt.Errorf("list all sessions: %w", err) - } - return mapSessionRows(rows), nil -} - -// DeleteSession removes a session (cascades to its pr/checks/comments). -func (s *Store) DeleteSession(ctx context.Context, id domain.SessionID) error { - s.writeMu.Lock() - defer s.writeMu.Unlock() - return s.qw.DeleteSession(ctx, string(id)) -} - -func mapSessionRows(rows []gen.Session) []domain.SessionRecord { - out := make([]domain.SessionRecord, 0, len(rows)) - for _, r := range rows { - out = append(out, rowToRecord(r)) - } - return out -} - -// inTx runs fn inside a single write transaction on the writer connection, -// rolling back on error. 
The caller must already hold writeMu. -func (s *Store) inTx(ctx context.Context, what string, fn func(*gen.Queries) error) error { - tx, err := s.writeDB.BeginTx(ctx, nil) - if err != nil { - return fmt.Errorf("begin %s: %w", what, err) - } - defer func() { _ = tx.Rollback() }() - if err := fn(s.qw.WithTx(tx)); err != nil { - return fmt.Errorf("%s: %w", what, err) - } - return tx.Commit() -} diff --git a/backend/internal/storage/sqlite/store/changelog_store.go b/backend/internal/storage/sqlite/store/changelog_store.go new file mode 100644 index 00000000..42b30a30 --- /dev/null +++ b/backend/internal/storage/sqlite/store/changelog_store.go @@ -0,0 +1,46 @@ +package store + +import ( + "context" + "encoding/json" + "fmt" + + "github.com/aoagents/agent-orchestrator/backend/internal/cdc" + "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite/gen" +) + +// EventsAfter implements cdc.Source over the SQLite change_log table. +func (s *Store) EventsAfter(ctx context.Context, after int64, limit int) ([]cdc.Event, error) { + rows, err := s.qr.ReadChangeLogAfter(ctx, gen.ReadChangeLogAfterParams{Seq: after, Limit: int64(limit)}) + if err != nil { + return nil, fmt.Errorf("read change_log after %d: %w", after, err) + } + events := make([]cdc.Event, 0, len(rows)) + for _, r := range rows { + events = append(events, changeLogEventFromGen(r)) + } + return events, nil +} + +// LatestSeq implements cdc.Source by returning the current change_log head. 
+func (s *Store) LatestSeq(ctx context.Context) (int64, error) { + seq, err := s.qr.MaxChangeLogSeq(ctx) + if err != nil { + return 0, fmt.Errorf("max change_log seq: %w", err) + } + return seq, nil +} + +func changeLogEventFromGen(r gen.ChangeLog) cdc.Event { + e := cdc.Event{ + Seq: r.Seq, + ProjectID: string(r.ProjectID), + Type: r.EventType, + Payload: json.RawMessage(r.Payload), + CreatedAt: r.CreatedAt, + } + if r.SessionID != nil { + e.SessionID = string(*r.SessionID) + } + return e +} diff --git a/backend/internal/storage/sqlite/pr_cdc_test.go b/backend/internal/storage/sqlite/store/pr_cdc_test.go similarity index 72% rename from backend/internal/storage/sqlite/pr_cdc_test.go rename to backend/internal/storage/sqlite/store/pr_cdc_test.go index 102e8b4f..82f53b75 100644 --- a/backend/internal/storage/sqlite/pr_cdc_test.go +++ b/backend/internal/storage/sqlite/store/pr_cdc_test.go @@ -1,4 +1,4 @@ -package sqlite +package store_test import ( "context" @@ -6,6 +6,7 @@ import ( "testing" "time" + "github.com/aoagents/agent-orchestrator/backend/internal/cdc" "github.com/aoagents/agent-orchestrator/backend/internal/domain" ) @@ -21,13 +22,9 @@ func TestPRChecksCDC_EmitsOnInsertAndStatusUpdate(t *testing.T) { t.Fatal(err) } url := "https://example/pr/1" - if err := s.UpsertPR(ctx, domain.PRRow{URL: url, SessionID: string(rec.ID), Number: 1}); err != nil { - t.Fatal(err) - } - now := time.Now() - mustCheck := func(status string) { - if err := s.RecordCheck(ctx, domain.PRCheckRow{PRURL: url, Name: "build", CommitHash: "c1", Status: status, CreatedAt: now}); err != nil { + mustCheck := func(status domain.PRCheckStatus) { + if err := s.WritePR(ctx, domain.PullRequest{URL: url, SessionID: rec.ID, Number: 1, UpdatedAt: now}, []domain.PullRequestCheck{{Name: "build", CommitHash: "c1", Status: status, CreatedAt: now}}, nil); err != nil { t.Fatal(err) } } @@ -35,20 +32,20 @@ func TestPRChecksCDC_EmitsOnInsertAndStatusUpdate(t *testing.T) { mustCheck("failed") // status 
change on same commit (update) -> event mustCheck("failed") // no-op re-poll (status unchanged) -> NO event - rows, err := s.ReadChangeLogAfter(ctx, 0, 100) + rows, err := s.EventsAfter(ctx, 0, 100) if err != nil { t.Fatal(err) } - var checkEvents []ChangeLogRow + var checkEvents []cdc.Event for _, r := range rows { - if r.EventType == "pr_check_recorded" { + if r.Type == "pr_check_recorded" { checkEvents = append(checkEvents, r) } } if len(checkEvents) != 2 { t.Fatalf("want 2 check CDC events (insert + status change, no-op suppressed), got %d", len(checkEvents)) } - if !strings.Contains(checkEvents[1].Payload, `"status":"failed"`) { + if !strings.Contains(string(checkEvents[1].Payload), `"status":"failed"`) { t.Fatalf("the update event should carry the new status, got %q", checkEvents[1].Payload) } } @@ -67,9 +64,9 @@ func TestWritePR_PersistsScalarsChecksAndComments(t *testing.T) { now := time.Now() err = s.WritePR(ctx, - domain.PRRow{URL: url, SessionID: string(rec.ID), Number: 7, CI: domain.CIFailing, UpdatedAt: now}, - []domain.PRCheckRow{{PRURL: url, Name: "build", CommitHash: "c1", Status: "failed", CreatedAt: now}}, - []domain.PRComment{{ID: "1", Author: "reviewer", Body: "use a const", CreatedAt: now}}, + domain.PullRequest{URL: url, SessionID: rec.ID, Number: 7, CI: domain.CIFailing, UpdatedAt: now}, + []domain.PullRequestCheck{{Name: "build", CommitHash: "c1", Status: "failed", CreatedAt: now}}, + []domain.PullRequestComment{{ID: "1", Author: "reviewer", Body: "use a const", CreatedAt: now}}, ) if err != nil { t.Fatal(err) diff --git a/backend/internal/storage/sqlite/store/pr_facts.go b/backend/internal/storage/sqlite/store/pr_facts.go new file mode 100644 index 00000000..7bc9a849 --- /dev/null +++ b/backend/internal/storage/sqlite/store/pr_facts.go @@ -0,0 +1,40 @@ +package store + +import ( + "context" + "database/sql" + "errors" + "fmt" + + "github.com/aoagents/agent-orchestrator/backend/internal/domain" + 
"github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite/gen" +) + +// GetDisplayPRFactsForSession returns the PR snapshot that should represent a +// session in derived display status: active PRs first, otherwise the newest +// historical PR. ok=false means the session has no associated PRs. +func (s *Store) GetDisplayPRFactsForSession(ctx context.Context, id domain.SessionID) (domain.PRFacts, bool, error) { + r, err := s.qr.GetDisplayPRFactsBySession(ctx, id) + if errors.Is(err, sql.ErrNoRows) { + return domain.PRFacts{}, false, nil + } + if err != nil { + return domain.PRFacts{}, false, fmt.Errorf("display pr facts for %s: %w", id, err) + } + return prFactsFromGen(r), true, nil +} + +func prFactsFromGen(r gen.GetDisplayPRFactsBySessionRow) domain.PRFacts { + state := r.PRState + return domain.PRFacts{ + URL: r.URL, + Number: int(r.Number), + Draft: state == domain.PRStateDraft, + Merged: state == domain.PRStateMerged, + Closed: state == domain.PRStateClosed, + CI: r.CIState, + Review: r.ReviewDecision, + Mergeability: r.Mergeability, + ReviewComments: r.ReviewComments, + } +} diff --git a/backend/internal/storage/sqlite/store/pr_store.go b/backend/internal/storage/sqlite/store/pr_store.go new file mode 100644 index 00000000..0f609f7b --- /dev/null +++ b/backend/internal/storage/sqlite/store/pr_store.go @@ -0,0 +1,210 @@ +package store + +import ( + "context" + "database/sql" + "errors" + "fmt" + + "github.com/aoagents/agent-orchestrator/backend/internal/domain" + "github.com/aoagents/agent-orchestrator/backend/internal/ports" + "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite/gen" +) + +// The pr / pr_checks / pr_comment rows are modelled by domain.PullRequest / +// domain.PullRequestCheck / domain.PullRequestComment — flat tables, one shared type per table. 
+// This layer only maps those to/from the sqlc gen.* params: the bool PR flags +// become the single pr.pr_state column, empty enums default to their +// "nothing known yet" value (matching the CHECK constraints), and ints widen to +// int64. + +// Compile-time proof that *Store satisfies the PRWriter port it is wired into, so a +// drift between that interface and this implementation fails here at the point +// of definition rather than later at the call sites in lifecycle_wiring / tests. +var ( + _ ports.PRWriter = (*Store)(nil) +) + +// WritePR persists a full PR observation — scalar facts, check runs, and the +// replacement comment set — in one write transaction, so the rows and the +// change_log events their triggers emit are committed all-or-nothing. The scalar +// PR upsert runs first so the checks'/comments' CDC triggers can resolve the +// session id from the pr row within the same transaction. +func (s *Store) WritePR(ctx context.Context, pr domain.PullRequest, checks []domain.PullRequestCheck, comments []domain.PullRequestComment) error { + s.writeMu.Lock() + defer s.writeMu.Unlock() + return s.inTx(ctx, "write pr observation", func(q *gen.Queries) error { + existing, err := q.GetPR(ctx, pr.URL) + if err != nil && !errors.Is(err, sql.ErrNoRows) { + return err + } + if err == nil && existing.SessionID != pr.SessionID { + return fmt.Errorf("pr %s already belongs to session %s", pr.URL, existing.SessionID) + } + if err := q.UpsertPR(ctx, genPRParams(pr)); err != nil { + return err + } + for _, c := range checks { + if err := q.UpsertPRCheck(ctx, genCheckParams(pr.URL, c)); err != nil { + return err + } + } + if err := q.DeletePRComments(ctx, pr.URL); err != nil { + return err + } + for _, c := range comments { + if err := q.InsertPRComment(ctx, genCommentParams(pr.URL, c)); err != nil { + return fmt.Errorf("comment %q: %w", c.ID, err) + } + } + return nil + }) +} + +// GetPR returns the PR facts for a URL, or ok=false if absent.
+func (s *Store) GetPR(ctx context.Context, url string) (domain.PullRequest, bool, error) { + p, err := s.qr.GetPR(ctx, url) + if errors.Is(err, sql.ErrNoRows) { + return domain.PullRequest{}, false, nil + } + if err != nil { + return domain.PullRequest{}, false, fmt.Errorf("get pr %s: %w", url, err) + } + return prRowFromGen(p), true, nil +} + +// ListPRsBySession returns every PR owned by a session, newest first. +func (s *Store) ListPRsBySession(ctx context.Context, sessionID domain.SessionID) ([]domain.PullRequest, error) { + rows, err := s.qr.ListPRsBySession(ctx, sessionID) + if err != nil { + return nil, fmt.Errorf("list prs for %s: %w", sessionID, err) + } + out := make([]domain.PullRequest, 0, len(rows)) + for _, p := range rows { + out = append(out, prRowFromGen(p)) + } + return out, nil +} + +// ListChecks returns every recorded check run for a PR. +func (s *Store) ListChecks(ctx context.Context, prURL string) ([]domain.PullRequestCheck, error) { + rows, err := s.qr.ListChecksByPR(ctx, prURL) + if err != nil { + return nil, fmt.Errorf("list checks %s: %w", prURL, err) + } + out := make([]domain.PullRequestCheck, 0, len(rows)) + for _, c := range rows { + out = append(out, checkRowFromGen(c)) + } + return out, nil +} + +// ListPRComments returns a PR's review comments, oldest first. +func (s *Store) ListPRComments(ctx context.Context, prURL string) ([]domain.PullRequestComment, error) { + rows, err := s.qr.ListPRComments(ctx, prURL) + if err != nil { + return nil, fmt.Errorf("list pr comments %s: %w", prURL, err) + } + out := make([]domain.PullRequestComment, 0, len(rows)) + for _, c := range rows { + out = append(out, commentFromGen(c)) + } + return out, nil +} + +// ---- domain <-> gen mapping ---- + +// prState collapses the PR's bools into the single pr.pr_state column value.
+func prState(r domain.PullRequest) domain.PRState { + switch { + case r.Merged: + return domain.PRStateMerged + case r.Closed: + return domain.PRStateClosed + case r.Draft: + return domain.PRStateDraft + default: + return domain.PRStateOpen + } +} + +func genPRParams(r domain.PullRequest) gen.UpsertPRParams { + return gen.UpsertPRParams{ + URL: r.URL, + SessionID: r.SessionID, + Number: int64(r.Number), + PRState: prState(r), + ReviewDecision: reviewOrDefault(r.Review), + CIState: ciOrDefault(r.CI), + Mergeability: mergeabilityOrDefault(r.Mergeability), + UpdatedAt: r.UpdatedAt, + } +} + +func reviewOrDefault(v domain.ReviewDecision) domain.ReviewDecision { + if v == "" { + return domain.ReviewNone + } + return v +} + +func ciOrDefault(v domain.CIState) domain.CIState { + if v == "" { + return domain.CIUnknown + } + return v +} + +func mergeabilityOrDefault(v domain.Mergeability) domain.Mergeability { + if v == "" { + return domain.MergeUnknown + } + return v +} + +func prRowFromGen(p gen.PR) domain.PullRequest { + return domain.PullRequest{ + URL: p.URL, + SessionID: p.SessionID, + Number: int(p.Number), + Draft: p.PRState == domain.PRStateDraft, + Merged: p.PRState == domain.PRStateMerged, + Closed: p.PRState == domain.PRStateClosed, + CI: p.CIState, + Review: p.ReviewDecision, + Mergeability: p.Mergeability, + UpdatedAt: p.UpdatedAt, + } +} + +func genCheckParams(prURL string, c domain.PullRequestCheck) gen.UpsertPRCheckParams { + status := c.Status + if status == "" { + status = domain.PRCheckUnknown + } + return gen.UpsertPRCheckParams{ + PRURL: prURL, Name: c.Name, CommitHash: c.CommitHash, + Status: status, URL: c.URL, LogTail: c.LogTail, CreatedAt: c.CreatedAt, + } +} + +func checkRowFromGen(c gen.PRCheck) domain.PullRequestCheck { + return domain.PullRequestCheck{ + Name: c.Name, CommitHash: c.CommitHash, Status: c.Status, + URL: c.URL, LogTail: c.LogTail, CreatedAt: c.CreatedAt, + } +} + +func genCommentParams(prURL string, c domain.PullRequestComment) 
gen.InsertPRCommentParams { + return gen.InsertPRCommentParams{ + PRURL: prURL, CommentID: c.ID, Author: c.Author, File: c.File, + Line: int64(c.Line), Body: c.Body, Resolved: c.Resolved, CreatedAt: c.CreatedAt, + } +} + +func commentFromGen(c gen.PRComment) domain.PullRequestComment { + return domain.PullRequestComment{ + ID: c.CommentID, Author: c.Author, File: c.File, Line: int(c.Line), + Body: c.Body, Resolved: c.Resolved, CreatedAt: c.CreatedAt, + } +} diff --git a/backend/internal/storage/sqlite/store/project_store.go b/backend/internal/storage/sqlite/store/project_store.go new file mode 100644 index 00000000..1d216d3e --- /dev/null +++ b/backend/internal/storage/sqlite/store/project_store.go @@ -0,0 +1,101 @@ +package store + +import ( + "context" + "database/sql" + "errors" + "fmt" + "time" + + "github.com/aoagents/agent-orchestrator/backend/internal/domain" + "github.com/aoagents/agent-orchestrator/backend/internal/project" + "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite/gen" +) + +var _ project.Store = (*Store)(nil) + +// Upsert inserts or replaces a registered project row. +func (s *Store) Upsert(ctx context.Context, r project.Row) error { + s.writeMu.Lock() + defer s.writeMu.Unlock() + return s.qw.UpsertProject(ctx, gen.UpsertProjectParams{ + ID: domain.ProjectID(r.ID), + Path: r.Path, + RepoOriginURL: r.RepoOriginURL, + DisplayName: r.DisplayName, + RegisteredAt: r.RegisteredAt, + ArchivedAt: nullTime(r.ArchivedAt), + }) +} + +// Get returns a project by id, active or archived. +func (s *Store) Get(ctx context.Context, id string) (project.Row, bool, error) { + p, err := s.qr.GetProject(ctx, domain.ProjectID(id)) + if errors.Is(err, sql.ErrNoRows) { + return project.Row{}, false, nil + } + if err != nil { + return project.Row{}, false, fmt.Errorf("get project %s: %w", id, err) + } + return projectRowFromGen(p), true, nil +} + +// FindByPath returns a project registered at path, active or archived. 
+func (s *Store) FindByPath(ctx context.Context, path string) (project.Row, bool, error) { + p, err := s.qr.FindProjectByPath(ctx, path) + if errors.Is(err, sql.ErrNoRows) { + return project.Row{}, false, nil + } + if err != nil { + return project.Row{}, false, fmt.Errorf("find project by path %s: %w", path, err) + } + return projectRowFromGen(p), true, nil +} + +// List returns active projects ordered by id. +func (s *Store) List(ctx context.Context) ([]project.Row, error) { + rows, err := s.qr.ListProjects(ctx) + if err != nil { + return nil, fmt.Errorf("list projects: %w", err) + } + out := make([]project.Row, 0, len(rows)) + for _, p := range rows { + out = append(out, projectRowFromGen(p)) + } + return out, nil +} + +// Archive soft-deletes a project and reports whether a row was affected. +func (s *Store) Archive(ctx context.Context, id string, at time.Time) (bool, error) { + s.writeMu.Lock() + defer s.writeMu.Unlock() + n, err := s.qw.ArchiveProject(ctx, gen.ArchiveProjectParams{ + ArchivedAt: nullTime(at), + ID: domain.ProjectID(id), + }) + if err != nil { + return false, err + } + return n > 0, nil +} + +func projectRowFromGen(p gen.Project) project.Row { + r := project.Row{ + ID: string(p.ID), + Path: p.Path, + RepoOriginURL: p.RepoOriginURL, + DisplayName: p.DisplayName, + RegisteredAt: p.RegisteredAt, + } + if p.ArchivedAt.Valid { + r.ArchivedAt = p.ArchivedAt.Time + } + return r +} + +func nullTime(t time.Time) sql.NullTime { + if t.IsZero() { + return sql.NullTime{} + } + return sql.NullTime{Time: t, Valid: true} +} diff --git a/backend/internal/storage/sqlite/store/session_store.go b/backend/internal/storage/sqlite/store/session_store.go new file mode 100644 index 00000000..fefd7f3e --- /dev/null +++ b/backend/internal/storage/sqlite/store/session_store.go @@ -0,0 +1,163 @@ +package store + +import ( + "context" + "database/sql" + "errors" + "fmt" + "time" + + "github.com/aoagents/agent-orchestrator/backend/internal/domain" + 
"github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite/gen" +) + +// ---- sessions ---- + +// CreateSession assigns the per-project identity ("{project}-{num}") and inserts +// the record, returning it with ID populated. The next-num read and the insert +// run on the writer connection under writeMu, so two concurrent creates in the +// same project can't collide on num. +func (s *Store) CreateSession(ctx context.Context, rec domain.SessionRecord) (domain.SessionRecord, error) { + s.writeMu.Lock() + defer s.writeMu.Unlock() + + num, err := s.qw.NextSessionNum(ctx, rec.ProjectID) + if err != nil { + return domain.SessionRecord{}, fmt.Errorf("next session num for %s: %w", rec.ProjectID, err) + } + rec.ID = domain.SessionID(fmt.Sprintf("%s-%d", rec.ProjectID, num)) + if err := s.qw.InsertSession(ctx, recordToInsert(rec, num)); err != nil { + return domain.SessionRecord{}, fmt.Errorf("insert session %s: %w", rec.ID, err) + } + return rec, nil +} + +// UpdateSession writes the full mutable state of an existing session. The +// id/project/num/created_at are immutable and not touched here. +func (s *Store) UpdateSession(ctx context.Context, rec domain.SessionRecord) error { + s.writeMu.Lock() + defer s.writeMu.Unlock() + return s.qw.UpdateSession(ctx, recordToUpdate(rec)) +} + +// GetSession returns the full record for a session, or ok=false if absent. +func (s *Store) GetSession(ctx context.Context, id domain.SessionID) (domain.SessionRecord, bool, error) { + row, err := s.qr.GetSession(ctx, id) + if errors.Is(err, sql.ErrNoRows) { + return domain.SessionRecord{}, false, nil + } + if err != nil { + return domain.SessionRecord{}, false, fmt.Errorf("get session %s: %w", id, err) + } + return rowToRecord(row), true, nil +} + +// ListSessions returns every session in a project, ordered by num. 
+func (s *Store) ListSessions(ctx context.Context, project domain.ProjectID) ([]domain.SessionRecord, error) { + rows, err := s.qr.ListSessionsByProject(ctx, project) + if err != nil { + return nil, fmt.Errorf("list sessions for %s: %w", project, err) + } + return mapSessionRows(rows), nil +} + +// ListAllSessions returns every session across all projects. +func (s *Store) ListAllSessions(ctx context.Context) ([]domain.SessionRecord, error) { + rows, err := s.qr.ListAllSessions(ctx) + if err != nil { + return nil, fmt.Errorf("list all sessions: %w", err) + } + return mapSessionRows(rows), nil +} + +func mapSessionRows(rows []gen.Session) []domain.SessionRecord { + out := make([]domain.SessionRecord, 0, len(rows)) + for _, r := range rows { + out = append(out, rowToRecord(r)) + } + return out +} + +func rowToRecord(row gen.Session) domain.SessionRecord { + return domain.SessionRecord{ + ID: row.ID, + ProjectID: row.ProjectID, + IssueID: row.IssueID, + Kind: row.Kind, + Harness: row.Harness, + Activity: domain.ActivitySubstate{ + State: row.ActivityState, + LastActivityAt: row.ActivityLastAt, + Source: row.ActivitySource, + }, + IsTerminated: row.IsTerminated, + Metadata: domain.SessionMetadata{ + Branch: row.Branch, + WorkspacePath: row.WorkspacePath, + RuntimeHandleID: row.RuntimeHandleID, + AgentSessionID: row.AgentSessionID, + Prompt: row.Prompt, + }, + CreatedAt: row.CreatedAt, + UpdatedAt: row.UpdatedAt, + } +} + +func recordToInsert(rec domain.SessionRecord, num int64) gen.InsertSessionParams { + activity := normalActivity(rec.Activity, rec.CreatedAt) + return gen.InsertSessionParams{ + ID: rec.ID, + ProjectID: rec.ProjectID, + Num: num, + IssueID: rec.IssueID, + Kind: rec.Kind, + Harness: rec.Harness, + ActivityState: activity.State, + ActivityLastAt: activity.LastActivityAt, + ActivitySource: activity.Source, + IsTerminated: rec.IsTerminated, + Branch: rec.Metadata.Branch, + WorkspacePath: rec.Metadata.WorkspacePath, + RuntimeHandleID: 
rec.Metadata.RuntimeHandleID, + AgentSessionID: rec.Metadata.AgentSessionID, + Prompt: rec.Metadata.Prompt, + CreatedAt: rec.CreatedAt, + UpdatedAt: rec.UpdatedAt, + } +} + +func recordToUpdate(rec domain.SessionRecord) gen.UpdateSessionParams { + activity := normalActivity(rec.Activity, rec.UpdatedAt) + return gen.UpdateSessionParams{ + ID: rec.ID, + IssueID: rec.IssueID, + Kind: rec.Kind, + Harness: rec.Harness, + ActivityState: activity.State, + ActivityLastAt: activity.LastActivityAt, + ActivitySource: activity.Source, + IsTerminated: rec.IsTerminated, + Branch: rec.Metadata.Branch, + WorkspacePath: rec.Metadata.WorkspacePath, + RuntimeHandleID: rec.Metadata.RuntimeHandleID, + AgentSessionID: rec.Metadata.AgentSessionID, + Prompt: rec.Metadata.Prompt, + UpdatedAt: rec.UpdatedAt, + } +} + +func normalActivity(a domain.ActivitySubstate, fallback time.Time) domain.ActivitySubstate { + if a.State == "" { + a.State = domain.ActivityIdle + } + if a.Source == "" { + a.Source = domain.SourceNone + } + if a.LastActivityAt.IsZero() { + a.LastActivityAt = fallback + } + if a.LastActivityAt.IsZero() { + a.LastActivityAt = time.Now().UTC() + } + return a +} diff --git a/backend/internal/storage/sqlite/store/store.go b/backend/internal/storage/sqlite/store/store.go new file mode 100644 index 00000000..829e385e --- /dev/null +++ b/backend/internal/storage/sqlite/store/store.go @@ -0,0 +1,60 @@ +// Package store contains SQLite-backed table stores built on sqlc-generated +// queries. +package store + +import ( + "context" + "database/sql" + "fmt" + "sync" + + "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite/gen" +) + +// Store is the SQLite-backed persistence layer. It routes writes to a single +// writer connection (qw) and reads to a reader pool (qr) — see Open. writeMu +// guards the read-modify-write write methods (e.g. CreateSession's +// next-num-then-insert) so concurrent writes can't interleave them. 
+// +// CDC is captured by DB triggers (migration 0001), NOT by this layer: the store +// never writes change_log, it only reads it for the CDC poller. +type Store struct { + writeDB *sql.DB + readDB *sql.DB + qw *gen.Queries // bound to the single writer connection + qr *gen.Queries // bound to the reader pool + writeMu sync.Mutex +} + +// NewStore wraps an opened writer + reader *sql.DB (see Open) as a Store. +func NewStore(writeDB, readDB *sql.DB) *Store { + return &Store{ + writeDB: writeDB, + readDB: readDB, + qw: gen.New(writeDB), + qr: gen.New(readDB), + } +} + +// Close closes both pools. +func (s *Store) Close() error { + err := s.writeDB.Close() + if e := s.readDB.Close(); e != nil && err == nil { + err = e + } + return err +} + +// inTx runs fn inside a single write transaction on the writer connection, +// rolling back on error. The caller must already hold writeMu. +func (s *Store) inTx(ctx context.Context, what string, fn func(*gen.Queries) error) error { + tx, err := s.writeDB.BeginTx(ctx, nil) + if err != nil { + return fmt.Errorf("begin %s: %w", what, err) + } + defer func() { _ = tx.Rollback() }() + if err := fn(s.qw.WithTx(tx)); err != nil { + return fmt.Errorf("%s: %w", what, err) + } + return tx.Commit() +} diff --git a/backend/internal/storage/sqlite/store_test.go b/backend/internal/storage/sqlite/store/store_test.go similarity index 54% rename from backend/internal/storage/sqlite/store_test.go rename to backend/internal/storage/sqlite/store/store_test.go index 426a37d2..669d15a9 100644 --- a/backend/internal/storage/sqlite/store_test.go +++ b/backend/internal/storage/sqlite/store/store_test.go @@ -1,18 +1,20 @@ -package sqlite +package store_test import ( "context" - "fmt" + "encoding/json" "sync" "testing" "time" "github.com/aoagents/agent-orchestrator/backend/internal/domain" + "github.com/aoagents/agent-orchestrator/backend/internal/project" + "github.com/aoagents/agent-orchestrator/backend/internal/storage/sqlite" ) -func newTestStore(t 
*testing.T) *Store { +func newTestStore(t *testing.T) *sqlite.Store { t.Helper() - s, err := Open(t.TempDir()) + s, err := sqlite.Open(t.TempDir()) if err != nil { t.Fatalf("open: %v", err) } @@ -20,9 +22,9 @@ func newTestStore(t *testing.T) *Store { return s } -func seedProject(t *testing.T, s *Store, id string) { +func seedProject(t *testing.T, s *sqlite.Store, id string) { t.Helper() - if err := s.UpsertProject(context.Background(), ProjectRow{ + if err := s.Upsert(context.Background(), project.Row{ ID: id, Path: "/tmp/" + id, RegisteredAt: time.Now().UTC().Truncate(time.Second), }); err != nil { t.Fatalf("seed project %s: %v", id, err) @@ -34,15 +36,8 @@ func sampleRecord(project string) domain.SessionRecord { return domain.SessionRecord{ ProjectID: domain.ProjectID(project), Kind: domain.KindWorker, - Lifecycle: domain.CanonicalSessionLifecycle{ - Version: domain.LifecycleVersion, - Harness: domain.HarnessClaudeCode, - IsAlive: true, - Session: domain.SessionSubstate{State: domain.SessionWorking}, - Activity: domain.ActivitySubstate{ - State: domain.ActivityActive, LastActivityAt: now, Source: domain.SourceNative, - }, - }, + Harness: domain.HarnessClaudeCode, + Activity: domain.ActivitySubstate{State: domain.ActivityActive, LastActivityAt: now, Source: domain.SourceNative}, Metadata: domain.SessionMetadata{Branch: "feat/x", WorkspacePath: "/ws"}, CreatedAt: now, UpdatedAt: now, @@ -54,24 +49,24 @@ func TestProjectCRUDAndArchive(t *testing.T) { ctx := context.Background() seedProject(t, s, "mer") - got, ok, err := s.GetProject(ctx, "mer") + got, ok, err := s.Get(ctx, "mer") if err != nil || !ok { t.Fatalf("get: ok=%v err=%v", ok, err) } if got.ID != "mer" || got.Path != "/tmp/mer" { t.Fatalf("project = %+v", got) } - if list, _ := s.ListProjects(ctx); len(list) != 1 { + if list, _ := s.List(ctx); len(list) != 1 { t.Fatalf("active list = %d, want 1", len(list)) } // archive hides from the active list but still resolves by id. 
- if err := s.ArchiveProject(ctx, "mer", time.Now().UTC()); err != nil { - t.Fatal(err) + if ok, err := s.Archive(ctx, "mer", time.Now().UTC()); err != nil || !ok { + t.Fatalf("archive: ok=%v err=%v", ok, err) } - if list, _ := s.ListProjects(ctx); len(list) != 0 { + if list, _ := s.List(ctx); len(list) != 0 { t.Fatalf("after archive, active list = %d, want 0", len(list)) } - if _, ok, _ := s.GetProject(ctx, "mer"); !ok { + if _, ok, _ := s.Get(ctx, "mer"); !ok { t.Fatal("archived project must still resolve by id") } } @@ -95,8 +90,8 @@ func TestSessionCreateAssignsPerProjectID(t *testing.T) { if err != nil || !ok { t.Fatalf("get: ok=%v err=%v", ok, err) } - if got.Lifecycle.Session.State != domain.SessionWorking || !got.Lifecycle.IsAlive || - got.Lifecycle.Harness != domain.HarnessClaudeCode || got.Metadata.Branch != "feat/x" { + if got.Activity.State != domain.ActivityActive || got.IsTerminated || + got.Harness != domain.HarnessClaudeCode || got.Metadata.Branch != "feat/x" { t.Fatalf("round-trip mismatch: %+v", got) } if list, _ := s.ListSessions(ctx, "mer"); len(list) != 2 { @@ -107,32 +102,28 @@ func TestSessionCreateAssignsPerProjectID(t *testing.T) { } } -func TestSessionUpdateAndDetecting(t *testing.T) { +func TestSessionUpdateActivityAndTermination(t *testing.T) { s := newTestStore(t) ctx := context.Background() seedProject(t, s, "mer") r, _ := s.CreateSession(ctx, sampleRecord("mer")) - r.Lifecycle.Session = domain.SessionSubstate{State: domain.SessionDetecting} - r.Lifecycle.IsAlive = false - r.Lifecycle.Detecting = &domain.DetectingState{Attempts: 2, StartedAt: r.CreatedAt, EvidenceHash: "abc"} + r.Activity = domain.ActivitySubstate{State: domain.ActivityWaitingInput, LastActivityAt: r.CreatedAt, Source: domain.SourceHook} + r.IsTerminated = true if err := s.UpdateSession(ctx, r); err != nil { t.Fatal(err) } got, _, _ := s.GetSession(ctx, r.ID) - if got.Lifecycle.Session.State != domain.SessionDetecting || got.Lifecycle.IsAlive { - t.Fatalf("update not 
persisted: %+v", got.Lifecycle.Session) - } - if got.Lifecycle.Detecting == nil || got.Lifecycle.Detecting.Attempts != 2 || got.Lifecycle.Detecting.EvidenceHash != "abc" { - t.Fatalf("detecting not round-tripped: %+v", got.Lifecycle.Detecting) + if got.Activity.State != domain.ActivityWaitingInput || !got.IsTerminated { + t.Fatalf("update not persisted: %+v", got) } - // clearing detecting persists as nil. - got.Lifecycle.Detecting = nil - got.Lifecycle.Session = domain.SessionSubstate{State: domain.SessionWorking} + + got.IsTerminated = false + got.Activity.State = domain.ActivityActive _ = s.UpdateSession(ctx, got) again, _, _ := s.GetSession(ctx, r.ID) - if again.Lifecycle.Detecting != nil { - t.Fatalf("detecting should clear to nil, got %+v", again.Lifecycle.Detecting) + if again.IsTerminated || again.Activity.State != domain.ActivityActive { + t.Fatalf("activity/termination should update, got %+v", again) } } @@ -143,57 +134,66 @@ func TestPRCRUD(t *testing.T) { r, _ := s.CreateSession(ctx, sampleRecord("mer")) now := time.Now().UTC().Truncate(time.Second) - pr := domain.PRRow{ - URL: "https://gh/pr/1", SessionID: string(r.ID), Number: 1, + pr := domain.PullRequest{ + URL: "https://gh/pr/1", SessionID: r.ID, Number: 1, Review: domain.ReviewRequired, CI: domain.CIFailing, Mergeability: domain.MergeBlocked, UpdatedAt: now, } - if err := s.UpsertPR(ctx, pr); err != nil { + if err := s.WritePR(ctx, pr, nil, nil); err != nil { t.Fatal(err) } got, ok, err := s.GetPR(ctx, pr.URL) if err != nil || !ok || got != pr { t.Fatalf("get pr: ok=%v err=%v got=%+v", ok, err, got) } - if list, _ := s.ListPRsBySession(ctx, string(r.ID)); len(list) != 1 { + if list, _ := s.ListPRsBySession(ctx, r.ID); len(list) != 1 { t.Fatalf("list prs = %d, want 1", len(list)) } - if err := s.DeletePR(ctx, pr.URL); err != nil { +} + +func TestWritePRRejectsSessionReassignment(t *testing.T) { + s := newTestStore(t) + ctx := context.Background() + seedProject(t, s, "mer") + first, _ := 
s.CreateSession(ctx, sampleRecord("mer")) + second, _ := s.CreateSession(ctx, sampleRecord("mer")) + now := time.Now().UTC().Truncate(time.Second) + + pr := domain.PullRequest{URL: "https://gh/pr/1", SessionID: first.ID, Number: 1, UpdatedAt: now} + if err := s.WritePR(ctx, pr, nil, nil); err != nil { t.Fatal(err) } - if _, ok, _ := s.GetPR(ctx, pr.URL); ok { - t.Fatal("pr should be gone") + pr.SessionID = second.ID + if err := s.WritePR(ctx, pr, nil, nil); err == nil { + t.Fatal("expected reassignment to fail") + } + got, ok, err := s.GetPR(ctx, pr.URL) + if err != nil || !ok { + t.Fatalf("get pr: ok=%v err=%v", ok, err) + } + if got.SessionID != first.ID { + t.Fatalf("pr moved to %s, want %s", got.SessionID, first.ID) } } -func TestPRChecksLoopBrakeQuery(t *testing.T) { +func TestDisplayPRFactsPrefersActivePR(t *testing.T) { s := newTestStore(t) ctx := context.Background() seedProject(t, s, "mer") r, _ := s.CreateSession(ctx, sampleRecord("mer")) now := time.Now().UTC().Truncate(time.Second) - _ = s.UpsertPR(ctx, domain.PRRow{URL: "pr1", SessionID: string(r.ID), UpdatedAt: now}) - // three consecutive failing runs of "build" (one per commit). 
- for i := 1; i <= 3; i++ { - if err := s.RecordCheck(ctx, domain.PRCheckRow{ - PRURL: "pr1", Name: "build", CommitHash: fmt.Sprintf("c%d", i), - Status: "failed", CreatedAt: now.Add(time.Duration(i) * time.Second), - }); err != nil { - t.Fatal(err) - } + if err := s.WritePR(ctx, domain.PullRequest{URL: "closed", SessionID: r.ID, Number: 1, Closed: true, UpdatedAt: now.Add(time.Minute)}, nil, nil); err != nil { + t.Fatal(err) } - last3, err := s.RecentCheckStatuses(ctx, "pr1", "build", 3) - if err != nil { + if err := s.WritePR(ctx, domain.PullRequest{URL: "open", SessionID: r.ID, Number: 2, CI: domain.CIFailing, UpdatedAt: now}, nil, nil); err != nil { t.Fatal(err) } - if len(last3) != 3 || last3[0] != "failed" || last3[1] != "failed" || last3[2] != "failed" { - t.Fatalf("recent statuses = %v, want 3x failed (loop brake would trip)", last3) + got, ok, err := s.GetDisplayPRFactsForSession(ctx, r.ID) + if err != nil || !ok { + t.Fatalf("display pr: ok=%v err=%v", ok, err) } - // a pass on a newer commit breaks the streak. 
- _ = s.RecordCheck(ctx, domain.PRCheckRow{PRURL: "pr1", Name: "build", CommitHash: "c4", Status: "passed", CreatedAt: now.Add(4 * time.Second)}) - last3, _ = s.RecentCheckStatuses(ctx, "pr1", "build", 3) - if last3[0] != "passed" { - t.Fatalf("most recent should be passed, got %v", last3) + if got.URL != "open" || got.CI != domain.CIFailing { + t.Fatalf("display pr = %+v", got) } } @@ -203,9 +203,7 @@ func TestPRCommentsReplace(t *testing.T) { seedProject(t, s, "mer") r, _ := s.CreateSession(ctx, sampleRecord("mer")) now := time.Now().UTC().Truncate(time.Second) - _ = s.UpsertPR(ctx, domain.PRRow{URL: "pr1", SessionID: string(r.ID), UpdatedAt: now}) - - _ = s.ReplacePRComments(ctx, "pr1", []domain.PRComment{ + _ = s.WritePR(ctx, domain.PullRequest{URL: "pr1", SessionID: r.ID, UpdatedAt: now}, nil, []domain.PullRequestComment{ {ID: "c1", Author: "a", File: "a.go", Line: 1, Body: "nit", CreatedAt: now}, {ID: "c2", Author: "b", File: "b.go", Line: 2, Body: "bug", Resolved: true, CreatedAt: now.Add(time.Second)}, }) @@ -213,7 +211,7 @@ func TestPRCommentsReplace(t *testing.T) { t.Fatalf("comments = %d, want 2", len(list)) } // replace with a smaller set drops the rest. - _ = s.ReplacePRComments(ctx, "pr1", []domain.PRComment{{ID: "c1", Body: "x", CreatedAt: now}}) + _ = s.WritePR(ctx, domain.PullRequest{URL: "pr1", SessionID: r.ID, UpdatedAt: now}, nil, []domain.PullRequestComment{{ID: "c1", Body: "x", CreatedAt: now}}) if list, _ := s.ListPRComments(ctx, "pr1"); len(list) != 1 { t.Fatalf("after replace, comments = %d, want 1", len(list)) } @@ -226,14 +224,14 @@ func TestCDCTriggersPopulateChangeLog(t *testing.T) { r, _ := s.CreateSession(ctx, sampleRecord("mer")) // a real state change logs; a metadata-only change does not (WHEN guard). 
- r.Lifecycle.Session = domain.SessionSubstate{State: domain.SessionIdle} + r.Activity.State = domain.ActivityIdle _ = s.UpdateSession(ctx, r) r.Metadata.Prompt = "only metadata changed" _ = s.UpdateSession(ctx, r) // a PR insert logs too. - _ = s.UpsertPR(ctx, domain.PRRow{URL: "pr1", SessionID: string(r.ID), UpdatedAt: r.UpdatedAt}) + _ = s.WritePR(ctx, domain.PullRequest{URL: "pr1", SessionID: r.ID, UpdatedAt: r.UpdatedAt}, nil, nil) - evs, err := s.ReadChangeLogAfter(ctx, 0, 100) + evs, err := s.EventsAfter(ctx, 0, 100) if err != nil { t.Fatal(err) } @@ -242,13 +240,20 @@ func TestCDCTriggersPopulateChangeLog(t *testing.T) { if e.ProjectID != "mer" { t.Fatalf("event project = %s, want mer", e.ProjectID) } - types = append(types, e.EventType) + types = append(types, string(e.Type)) } want := []string{"session_created", "session_updated", "pr_created"} if len(types) != 3 || types[0] != want[0] || types[1] != want[1] || types[2] != want[2] { t.Fatalf("change_log event types = %v, want %v (metadata-only update suppressed)", types, want) } - maxSeq, _ := s.MaxChangeLogSeq(ctx) + var payload map[string]any + if err := json.Unmarshal([]byte(evs[0].Payload), &payload); err != nil { + t.Fatalf("session payload JSON: %v", err) + } + if _, ok := payload["isTerminated"].(bool); !ok { + t.Fatalf("isTerminated payload type = %T, want bool", payload["isTerminated"]) + } + maxSeq, _ := s.LatestSeq(ctx) if maxSeq != int64(len(evs)) { t.Fatalf("max seq = %d, want %d", maxSeq, len(evs)) } @@ -287,30 +292,3 @@ func TestConcurrentSessionCreateAssignsUniqueNums(t *testing.T) { t.Fatalf("created %d sessions, want %d", len(all), n) } } - -func TestTerminationReasonRoundTripAndCheck(t *testing.T) { - s := newTestStore(t) - ctx := context.Background() - seedProject(t, s, "mer") - r, _ := s.CreateSession(ctx, sampleRecord("mer")) - - // terminate with a valid reason -> round-trips. 
- r.Lifecycle.Session = domain.SessionSubstate{State: domain.SessionTerminated} - r.Lifecycle.TerminationReason = domain.TermManuallyKilled - if err := s.UpdateSession(ctx, r); err != nil { - t.Fatal(err) - } - got, _, _ := s.GetSession(ctx, r.ID) - if got.Lifecycle.TerminationReason != domain.TermManuallyKilled { - t.Fatalf("termination_reason = %q, want manually_killed", got.Lifecycle.TerminationReason) - } - if domain.DeriveStatus(got.Lifecycle, domain.PRFacts{}) != domain.StatusKilled { - t.Fatal("terminated+manually_killed should derive to killed") - } - - // an off-enum reason is rejected by the CHECK constraint. - r.Lifecycle.TerminationReason = domain.TerminationReason("definitely_not_a_reason") - if err := s.UpdateSession(ctx, r); err == nil { - t.Fatal("expected CHECK constraint to reject an invalid termination_reason") - } -} diff --git a/backend/internal/terminal/doc.go b/backend/internal/terminal/doc.go index e9d3ebba..44878ec3 100644 --- a/backend/internal/terminal/doc.go +++ b/backend/internal/terminal/doc.go @@ -1,9 +1,9 @@ // Package terminal is the live-terminal streaming feature: it attaches to a -// session's tmux pane over a PTY and multiplexes the byte stream to one or more +// session's Zellij pane over a PTY and multiplexes the byte stream to one or more // WebSocket clients, alongside a session-state channel fed by the CDC // broadcaster. // -// Boundaries (see docs/backend-code-structure.md): +// Boundaries (see docs/architecture.md): // // - This package owns the product workflow: PTY attach, output fan-out, a // bounded replay buffer, re-attach resilience, and the ch-tagged wire @@ -11,10 +11,10 @@ // not to any concrete WebSocket library. // - internal/httpd owns the HTTP/WebSocket upgrade and adapts the accepted // socket to wsConn; it does not contain stream logic. 
-// - The PTY itself is reached through PTYSource (satisfied by the tmux runtime +// - The PTY itself is reached through PTYSource (satisfied by the Zellij runtime // adapter's AttachCommand/IsAlive) and spawned through an injectable // spawnFunc, so the fan-out, buffering, and re-attach logic test without a -// real process, tmux, or network. +// real process, Zellij, or network. // // Raw PTY bytes never flow through the CDC change_log; only the session channel // is fed by cdc.Broadcaster. Terminal output is high-volume ephemeral data and diff --git a/backend/internal/terminal/fakes_test.go b/backend/internal/terminal/fakes_test.go index 939f6ecc..247c33bd 100644 --- a/backend/internal/terminal/fakes_test.go +++ b/backend/internal/terminal/fakes_test.go @@ -24,7 +24,7 @@ func (f *fakeSource) AttachCommand(ports.RuntimeHandle) ([]string, error) { return nil, f.attachErr } if f.argv == nil { - return []string{"tmux", "attach"}, nil + return []string{"zellij", "attach"}, nil } return f.argv, nil } @@ -42,7 +42,7 @@ func (f *fakeSource) setAlive(v bool) { } // fakePTY is a scripted ptyProcess: Read drains the out channel, Write records, -// Resize records, Close/Wait unblock on close. +// Resize records, and Close unblocks reads. 
type fakePTY struct { out chan []byte closed chan struct{} @@ -82,11 +82,6 @@ func (p *fakePTY) Resize(rows, cols uint16) error { return nil } -func (p *fakePTY) Wait() error { - <-p.closed - return nil -} - func (p *fakePTY) Close() error { p.once.Do(func() { close(p.closed) }) return nil diff --git a/backend/internal/terminal/logger_test.go b/backend/internal/terminal/logger_test.go new file mode 100644 index 00000000..a1323341 --- /dev/null +++ b/backend/internal/terminal/logger_test.go @@ -0,0 +1,19 @@ +package terminal + +import ( + "testing" + + "github.com/aoagents/agent-orchestrator/backend/internal/ports" +) + +func TestNilLoggerFallsBackToDefault(t *testing.T) { + mgr := NewManager(&fakeSource{}, nil, nil, WithSpawn((&fakeSpawner{}).spawn)) + defer mgr.Close() + if mgr.log == nil { + t.Fatal("manager logger is nil") + } + s := newSession("t1", ports.RuntimeHandle{ID: "t1"}, &fakeSource{}, (&fakeSpawner{}).spawn, nil) + if s.log == nil { + t.Fatal("session logger is nil") + } +} diff --git a/backend/internal/terminal/manager.go b/backend/internal/terminal/manager.go index 895edb6f..79ba6134 100644 --- a/backend/internal/terminal/manager.go +++ b/backend/internal/terminal/manager.go @@ -63,8 +63,11 @@ func WithSpawn(fn spawnFunc) Option { return func(m *Manager) { m.spawn = fn } } func WithHeartbeat(d time.Duration) Option { return func(m *Manager) { m.heartbeat = d } } // NewManager builds a Manager. src attaches PTYs; events feeds the session -// channel (may be nil to disable it); log is required. +// channel (may be nil to disable it). A nil logger falls back to slog.Default. func NewManager(src PTYSource, events EventSource, log *slog.Logger, opts ...Option) *Manager { + if log == nil { + log = slog.Default() + } ctx, cancel := context.WithCancel(context.Background()) m := &Manager{ src: src, @@ -105,7 +108,7 @@ func (m *Manager) Close() { } // openSession returns the live session for id, starting it on first open. 
The id -// is the runtime handle id (tmux target). +// is the runtime handle id (Zellij handle). func (m *Manager) openSession(id string) (*session, error) { m.mu.Lock() defer m.mu.Unlock() @@ -156,7 +159,7 @@ func (m *Manager) Serve(ctx context.Context, conn wsConn) { if ctx.Err() != nil { return } - c.handle(ctx, msg) + c.handle(msg) } } @@ -173,12 +176,12 @@ type connState struct { closed bool } -func (c *connState) handle(ctx context.Context, msg clientMsg) { +func (c *connState) handle(msg clientMsg) { switch msg.Ch { case chTerminal: - c.handleTerminal(ctx, msg) + c.handleTerminal(msg) case chSubscribe: - c.handleSubscribe() + c.handleSubscribe(msg) case chSystem: if msg.Type == msgPing { c.enqueue(serverMsg{Ch: chSystem, Type: msgPong}) @@ -186,10 +189,10 @@ func (c *connState) handle(ctx context.Context, msg clientMsg) { } } -func (c *connState) handleTerminal(ctx context.Context, msg clientMsg) { +func (c *connState) handleTerminal(msg clientMsg) { switch msg.Type { case msgOpen: - c.openTerminal(ctx, msg.ID) + c.openTerminal(msg.ID) case msgData: raw, err := base64.StdEncoding.DecodeString(msg.Data) if err != nil { @@ -207,7 +210,7 @@ func (c *connState) handleTerminal(ctx context.Context, msg clientMsg) { } } -func (c *connState) openTerminal(_ context.Context, id string) { +func (c *connState) openTerminal(id string) { if id == "" { c.enqueue(serverMsg{Ch: chTerminal, Type: msgError, Error: "missing terminal id"}) return @@ -300,8 +303,8 @@ func (c *connState) lookup(id string) *session { return s } -func (c *connState) handleSubscribe() { - if c.mgr.events == nil { +func (c *connState) handleSubscribe(msg clientMsg) { + if msg.Type != msgSubscribe || c.mgr.events == nil { return } c.mu.Lock() diff --git a/backend/internal/terminal/protocol.go b/backend/internal/terminal/protocol.go index 31a47999..163ca3ba 100644 --- a/backend/internal/terminal/protocol.go +++ b/backend/internal/terminal/protocol.go @@ -4,9 +4,9 @@ package terminal // ("ch"), mirroring 
the legacy Node mux server so the existing xterm client can // connect unchanged. One socket carries every logical stream: // -// ch "terminal" — per-pane byte stream, keyed by an opaque client-chosen id +// ch "terminal" — per-pane byte stream, keyed by an opaque runtime handle id // ch "subscribe" — the client opts into the session-state channel -// ch "sessions" — server-pushed session-state notifications (CDC-fed) +// ch "sessions" — server-pushed session-state messages (CDC-fed) // ch "system" — liveness; ws-level ping/pong also runs underneath // // Terminal payloads are base64 in the Data field: PTY output is arbitrary bytes diff --git a/backend/internal/terminal/pty_unix.go b/backend/internal/terminal/pty_unix.go index e5ca6f34..a250a037 100644 --- a/backend/internal/terminal/pty_unix.go +++ b/backend/internal/terminal/pty_unix.go @@ -41,9 +41,7 @@ func (p *creackPTY) Resize(rows, cols uint16) error { return pty.Setsize(p.f, &pty.Winsize{Rows: rows, Cols: cols}) } -func (p *creackPTY) Wait() error { return p.cmd.Wait() } - -// Close stops the attach process and releases the PTY. tmux attach exits cleanly +// Close stops the attach process and releases the PTY. Zellij attach exits cleanly // when the master closes, but kill the process to be sure it does not linger. // // It is idempotent: both the session run loop (after copyOut returns) and diff --git a/backend/internal/terminal/pty_windows.go b/backend/internal/terminal/pty_windows.go index c93465aa..f88ef557 100644 --- a/backend/internal/terminal/pty_windows.go +++ b/backend/internal/terminal/pty_windows.go @@ -7,9 +7,8 @@ import ( "errors" ) -// defaultSpawn is not yet implemented on Windows: the POSIX PTY path uses -// creack/pty. A ConPTY-backed attach (mirroring the legacy named-pipe relay) is -// a follow-up. The rest of the package compiles and tests on Windows with an +// defaultSpawn is not implemented on Windows: the POSIX PTY path uses +// creack/pty. 
The rest of the package compiles and tests on Windows with an // injected spawner. func defaultSpawn(_ context.Context, _ []string) (ptyProcess, error) { return nil, errors.New("terminal: PTY streaming is not supported on Windows yet") diff --git a/backend/internal/terminal/ring.go b/backend/internal/terminal/ring.go index ed55ca65..8ed303cd 100644 --- a/backend/internal/terminal/ring.go +++ b/backend/internal/terminal/ring.go @@ -1,17 +1,13 @@ package terminal -import "sync" - // defaultRingMax caps per-terminal replay history. A late subscriber gets at // most this many bytes of recent output so it can paint a usable screen without // the whole session backlog. Matches the legacy 50KB ring. const defaultRingMax = 50 * 1024 // ringBuffer is a byte ring holding the most recent output of one terminal. It -// keeps a contiguous tail capped at max bytes; snapshot returns a copy for -// replay-on-subscribe. +// is owned by session and accessed under session.mu. type ringBuffer struct { - mu sync.Mutex buf []byte max int } @@ -26,8 +22,6 @@ func newRingBuffer(maxBytes int) *ringBuffer { // append adds p and drops the oldest bytes beyond max. A single write larger // than max is truncated to its last max bytes. func (r *ringBuffer) append(p []byte) { - r.mu.Lock() - defer r.mu.Unlock() if len(p) >= r.max { r.buf = append(r.buf[:0], p[len(p)-r.max:]...) return @@ -40,8 +34,6 @@ func (r *ringBuffer) append(p []byte) { // snapshot returns a copy of the current contents (oldest first). 
func (r *ringBuffer) snapshot() []byte { - r.mu.Lock() - defer r.mu.Unlock() out := make([]byte, len(r.buf)) copy(out, r.buf) return out diff --git a/backend/internal/terminal/session.go b/backend/internal/terminal/session.go index 77fb5147..02d99cbd 100644 --- a/backend/internal/terminal/session.go +++ b/backend/internal/terminal/session.go @@ -13,7 +13,7 @@ import ( // PTYSource is what a terminal needs from the runtime: the argv that attaches a // PTY to a session's pane, and a liveness check used to decide whether a dropped -// PTY should be re-attached or treated as a clean exit. The tmux runtime adapter +// PTY should be re-attached or treated as a clean exit. The Zellij runtime adapter // satisfies this via AttachCommand/IsAlive; the interface lives here, next to its // only consumer, so terminal does not depend on a concrete adapter. type PTYSource interface { @@ -28,14 +28,12 @@ type PTYSource interface { type ptyProcess interface { io.ReadWriteCloser Resize(rows, cols uint16) error - // Wait blocks until the attach process exits. - Wait() error } // spawnFunc starts a PTY for argv. ctx cancellation must terminate the process. type spawnFunc func(ctx context.Context, argv []string) (ptyProcess, error) -// reattach policy: a PTY that drops is re-attached while the underlying tmux +// reattach policy: a PTY that drops is re-attached while the underlying Zellij // session is still alive, up to maxReattach consecutive failures. An attach that // survived longer than reattachResetGrace before dropping resets the counter, so // a long-lived pane that blips recovers but a tight crash-loop gives up. @@ -44,9 +42,9 @@ const ( defaultReattachResetTime = 5 * time.Second ) -// subscriber receives one terminal's output frames. It must not block; the -// session calls it while holding no lock, but a slow consumer stalls fan-out, so -// the WS layer funnels these onto its own buffered writer. +// subscriber receives one terminal's output frames. 
It must not block: session +// fan-out calls subscribers while serializing replay/delivery under its mutex, +// so the WS layer funnels frames onto its own buffered writer. type subscriber func(data []byte) // session is one attached terminal pane, fanned out to N subscribers. It owns a @@ -75,6 +73,9 @@ type session struct { } func newSession(id string, handle ports.RuntimeHandle, src PTYSource, spawn spawnFunc, log *slog.Logger) *session { + if log == nil { + log = slog.Default() + } return &session{ id: id, handle: handle, @@ -152,7 +153,7 @@ func (s *session) copyOut(p ptyProcess) { } // shouldReattach decides whether a dropped/failed PTY warrants another attempt: -// only while not closed/cancelled, the tmux session still exists, and we are +// only while not closed/cancelled, the Zellij session still exists, and we are // under the consecutive-failure cap. A backoff sleep separates attempts. func (s *session) shouldReattach(ctx context.Context, failures int) bool { if s.isClosed() || ctx.Err() != nil || failures > s.maxReattach { diff --git a/backend/internal/terminal/session_integration_test.go b/backend/internal/terminal/session_integration_test.go index 9041d963..1c9fceaf 100644 --- a/backend/internal/terminal/session_integration_test.go +++ b/backend/internal/terminal/session_integration_test.go @@ -1,31 +1,45 @@ +//go:build !windows + package terminal import ( "context" + "os" "os/exec" + "path/filepath" "strings" "testing" "time" - "github.com/aoagents/agent-orchestrator/backend/internal/adapters/runtime/tmux" + "github.com/aoagents/agent-orchestrator/backend/internal/adapters/runtime/zellij" + "github.com/aoagents/agent-orchestrator/backend/internal/domain" "github.com/aoagents/agent-orchestrator/backend/internal/ports" ) -// TestSessionStreamsRealTmuxPane attaches a real PTY to a real tmux session and +// TestSessionStreamsRealZellijPane attaches a real PTY to a real Zellij session and // asserts output streams back, then that killing the pane stops 
the session -// without a re-attach storm. Skipped when tmux is unavailable. -func TestSessionStreamsRealTmuxPane(t *testing.T) { - tmuxBin, err := exec.LookPath("tmux") +// without a re-attach storm. Skipped when Zellij is unavailable. +func TestSessionStreamsRealZellijPane(t *testing.T) { + zellijBin, err := exec.LookPath("zellij") if err != nil { - t.Skip("tmux unavailable") + t.Skip("zellij unavailable") } name := "ao-term-it-" + strings.ReplaceAll(t.Name(), "/", "-") - mustRun(t, tmuxBin, "new-session", "-d", "-s", name, "/bin/sh") - t.Cleanup(func() { _ = exec.Command(tmuxBin, "kill-session", "-t", "="+name).Run() }) - - rt := tmux.New(tmux.Options{Binary: tmuxBin}) - handle := ports.RuntimeHandle{ID: name} + socketDir := filepath.Join(os.TempDir(), name+"-socket") + if err := os.MkdirAll(socketDir, 0o755); err != nil { + t.Fatalf("mkdir socket dir: %v", err) + } + rt := zellij.New(zellij.Options{Binary: zellijBin, SocketDir: socketDir, ConfigDir: t.TempDir(), Timeout: 5 * time.Second}) + handle, err := rt.Create(context.Background(), ports.RuntimeConfig{ + SessionID: domain.SessionID(name), + WorkspacePath: t.TempDir(), + LaunchCommand: "printf AO_READY\n", + }) + if err != nil { + t.Fatalf("Create: %v", err) + } + t.Cleanup(func() { _ = rt.Destroy(context.Background(), handle) }) s := newSession(name, handle, rt, defaultSpawn, testLogger()) ctx, cancel := context.WithCancel(context.Background()) @@ -39,14 +53,9 @@ func TestSessionStreamsRealTmuxPane(t *testing.T) { eventually(t, 3*time.Second, func() bool { return s.write([]byte("echo AO_MARKER_42\n")) == nil }) eventually(t, 5*time.Second, func() bool { return strings.Contains(got.string(), "AO_MARKER_42") }) - // Kill the pane: the session must observe it as gone and not re-attach. 
- mustRun(t, tmuxBin, "kill-session", "-t", "="+name) - eventually(t, 5*time.Second, func() bool { return s.isExited() }) -} - -func mustRun(t *testing.T, name string, args ...string) { - t.Helper() - if out, err := exec.Command(name, args...).CombinedOutput(); err != nil { - t.Fatalf("%s %s: %v\n%s", name, strings.Join(args, " "), err, out) + // Kill the session: the terminal session must observe it as gone and not re-attach. + if err := rt.Destroy(context.Background(), handle); err != nil { + t.Fatalf("Destroy: %v", err) } + eventually(t, 5*time.Second, func() bool { return s.isExited() }) } diff --git a/backend/internal/terminal/session_test.go b/backend/internal/terminal/session_test.go index f7b9ddea..5483117a 100644 --- a/backend/internal/terminal/session_test.go +++ b/backend/internal/terminal/session_test.go @@ -45,13 +45,19 @@ func TestSessionReplaysRingBufferOnSubscribe(t *testing.T) { go s.run(ctx) pty.push([]byte("scrollback")) - eventually(t, time.Second, func() bool { return len(s.ring.snapshot()) == len("scrollback") }) + eventually(t, time.Second, func() bool { return ringLen(s) == len("scrollback") }) var late safeBytes s.subscribe(late.add, nil) eventually(t, time.Second, func() bool { return late.string() == "scrollback" }) } +func ringLen(s *session) int { + s.mu.Lock() + defer s.mu.Unlock() + return len(s.ring.snapshot()) +} + func TestSessionWriteAndResizeReachPTY(t *testing.T) { src := &fakeSource{} pty := newFakePTY() @@ -75,7 +81,7 @@ func TestSessionWriteAndResizeReachPTY(t *testing.T) { } func TestSessionSkipsReattachOnCleanExit(t *testing.T) { - src := &fakeSource{alive: false} // tmux session gone -> no re-attach + src := &fakeSource{alive: false} // Zellij session gone -> no re-attach pty := newFakePTY() sp := &fakeSpawner{ptys: []*fakePTY{pty}} s := newTestSession(src, sp.spawn) @@ -91,7 +97,7 @@ func TestSessionSkipsReattachOnCleanExit(t *testing.T) { select { case <-exited: case <-time.After(time.Second): - t.Fatal("expected exit 
notification after clean pane exit") + t.Fatal("expected exit callback after clean pane exit") } if got := sp.calls(); got != 1 { t.Fatalf("expected exactly one attach, got %d", got) diff --git a/backend/sqlc.yaml b/backend/sqlc.yaml index 9659bf77..3614c425 100644 --- a/backend/sqlc.yaml +++ b/backend/sqlc.yaml @@ -9,5 +9,82 @@ sql: out: "internal/storage/sqlite/gen" emit_json_tags: false emit_prepared_queries: false - emit_interface: true + emit_interface: false emit_empty_slices: true + initialisms: + - id + - url + - pr + - ci + overrides: + - column: "change_log.project_id" + go_type: + import: "github.com/aoagents/agent-orchestrator/backend/internal/domain" + type: "ProjectID" + - column: "change_log.session_id" + go_type: + import: "github.com/aoagents/agent-orchestrator/backend/internal/domain" + type: "SessionID" + pointer: true + - column: "change_log.event_type" + go_type: + import: "github.com/aoagents/agent-orchestrator/backend/internal/cdc" + type: "EventType" + - column: "pr.session_id" + go_type: + import: "github.com/aoagents/agent-orchestrator/backend/internal/domain" + type: "SessionID" + - column: "pr.pr_state" + go_type: + import: "github.com/aoagents/agent-orchestrator/backend/internal/domain" + type: "PRState" + - column: "pr.review_decision" + go_type: + import: "github.com/aoagents/agent-orchestrator/backend/internal/domain" + type: "ReviewDecision" + - column: "pr.ci_state" + go_type: + import: "github.com/aoagents/agent-orchestrator/backend/internal/domain" + type: "CIState" + - column: "pr.mergeability" + go_type: + import: "github.com/aoagents/agent-orchestrator/backend/internal/domain" + type: "Mergeability" + - column: "pr_checks.status" + go_type: + import: "github.com/aoagents/agent-orchestrator/backend/internal/domain" + type: "PRCheckStatus" + - column: "pr_comment.resolved" + go_type: "bool" + - column: "projects.id" + go_type: + import: "github.com/aoagents/agent-orchestrator/backend/internal/domain" + type: "ProjectID" + - 
column: "sessions.id" + go_type: + import: "github.com/aoagents/agent-orchestrator/backend/internal/domain" + type: "SessionID" + - column: "sessions.project_id" + go_type: + import: "github.com/aoagents/agent-orchestrator/backend/internal/domain" + type: "ProjectID" + - column: "sessions.issue_id" + go_type: + import: "github.com/aoagents/agent-orchestrator/backend/internal/domain" + type: "IssueID" + - column: "sessions.kind" + go_type: + import: "github.com/aoagents/agent-orchestrator/backend/internal/domain" + type: "SessionKind" + - column: "sessions.harness" + go_type: + import: "github.com/aoagents/agent-orchestrator/backend/internal/domain" + type: "AgentHarness" + - column: "sessions.activity_state" + go_type: + import: "github.com/aoagents/agent-orchestrator/backend/internal/domain" + type: "ActivityState" + - column: "sessions.activity_source" + go_type: + import: "github.com/aoagents/agent-orchestrator/backend/internal/domain" + type: "ActivitySource" diff --git a/docs/README.md b/docs/README.md index 220dec40..ad4c1453 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,35 +1,17 @@ -# agent-orchestrator (rewrite) — docs +# agent-orchestrator rewrite docs -The agent-orchestrator is being rebuilt as a long-running **Go backend daemon** -(`backend/`) plus an **Electron + TypeScript frontend** (`frontend/`). The -backend supervises a fleet of coding-agent sessions and keeps one true status -per session. +The agent-orchestrator is being rebuilt as a long-running Go backend daemon +(`backend/`) plus an Electron + TypeScript frontend (`frontend/`). The backend +supervises coding-agent sessions and exposes daemon control, project/session +state, terminal streaming, and CDC/event infrastructure. -This folder documents the **Lifecycle Manager (LCM) + Session Manager (SM) -lane** — the deterministic core of the backend that is now implemented (behind -fakes) on the `feat/lcm-sm-contracts` integration branch. 
+Start with [architecture.md](architecture.md) for the current backend model and +[cli/README.md](cli/README.md) for the CLI surface. -## Start here +## Mental model -| Doc | What it covers | -|-----|----------------| -| [architecture.md](architecture.md) | How the lane works: the OBSERVE→DECIDE→ACT loop, the canonical state model, the package layout, every component, and the load-bearing invariants. Read this first. | -| [status.md](status.md) | What's done (PR by PR), what's left, the integration to-dos, the open cross-lane contract questions, and how to build/test. | -| [cli/README.md](cli/README.md) | CLI foundation decisions: Cobra, reference projects, old CLI inventory, and the first command surface. | +Persist durable facts, derive display status: -## The one-paragraph mental model - -The backend is a **stateless supervisor over external ground truth**: git/GitHub -own PR/CI/review truth, the agent's own files own its activity, and the backend -owns no agent state. Its whole job is, per session: **OBSERVE** raw facts → -**DECIDE** one canonical status via pure, deterministic functions → **ACT** -(persist + fire reactions). The LCM is that reducer; the SM is the -explicit-mutation plumbing (spawn/kill/restore/cleanup) that feeds it. - -## Where this lane fits - -Other lanes (built by other people, in parallel) provide the real adapters this -lane depends on through narrow interfaces: the **persistence layer + CDC**, the -**SCM poller**, the **runtime/agent/workspace plugins**, the **backend API + -OpenAPI**, and the **frontend store**. See [status.md](status.md#integration) -for the hand-off points. 
+- session table: `activity_state`, `is_terminated`, identity, metadata +- PR tables: PR/CI/review facts +- derived read model: `domain.DeriveStatus(session, prFacts)` diff --git a/docs/architecture.md b/docs/architecture.md index 9673142c..fe2159bd 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -1,187 +1,95 @@ -# LCM + Session Manager — architecture +# Agent Orchestrator backend architecture -This is the deterministic core of the backend daemon. It supervises agent -sessions and keeps exactly one true status per session. +The backend is a long-running Go daemon that supervises coding-agent sessions. +The current model is intentionally small: session rows persist only durable facts, +and display status is derived at read time. -## 1. Mental model: OBSERVE → DECIDE → ACT - -The backend owns no agent state. git/GitHub own PR/CI/review truth; the agent's -own files own its activity. The job, per session, is one loop: +## Mental model ``` -OBSERVE → DECIDE → ACT -(impure, external) (pure, total) (impure) -raw facts one canonical status persist + react +OBSERVE external facts → UPDATE durable facts → DERIVE display status / ACT ``` -In the rewrite the **OBSERVE** step lives *outside* the LCM (separate owners), -and the LCM is a **synchronous reducer** invoked with facts: +The durable session facts are: + +- `activity_state` — what the agent last reported or what the runtime observer + can safely conclude (`active`, `ready`, `idle`, `waiting_input`, `blocked`, + `exited`). +- `is_terminated` — whether the session should be treated as over. +- PR facts in the `pr`, `pr_checks`, and `pr_comment` tables. + +The UI status is not stored. `domain.DeriveStatus` computes it from the session +record plus PR facts. 
+ +## Package layout ``` -SCM poller ─ ApplySCMObservation ──┐ -reaper ─ ApplyRuntimeObservation┤ -activity hooks ─ ApplyActivitySignal ───┼─▶ LCM: load canonical -Session Mgr ─ OnSpawnCompleted ──────┘ → pure DECIDE - ─ OnKillRequested → diff → persist (merge-patch) -reaper tick ─ TickEscalations → if transition: react (ACT) +backend/internal/domain shared vocabulary and display-status derivation +backend/internal/ports inbound/outbound interfaces +backend/internal/session explicit mutations: spawn, kill, restore, send, cleanup +backend/internal/lifecycle runtime/activity/spawn/termination session fact reducer +backend/internal/pr PR observation ingestion +backend/internal/storage SQLite persistence and DB-triggered CDC +backend/internal/cdc change-log poller and broadcaster +backend/internal/httpd daemon HTTP surface +backend/internal/terminal WebSocket terminal multiplexer +backend/internal/adapters Zellij/git-worktree/GitHub adapters ``` -The LCM **never polls**. The reaper (a timer, owned elsewhere) drives liveness -sampling and duration-based escalation by calling in. +## Status derivation -## 2. Canonical state model — the crown jewel +`session.Manager` selects the display PR from all PR snapshots for a session, then +`domain.DeriveStatus(session, prFacts)` applies this rough precedence: -The **only** thing persisted per session is `CanonicalSessionLifecycle` -(`backend/internal/domain/lifecycle.go`). The single-word display status is -**derived on read and never stored** — this is the most important invariant; it -prevents canonical truth and display from drifting. +1. `is_terminated` → `terminated`, except merged PRs display `merged`. +2. `activity_state=waiting_input` → `needs_input`. +3. `activity_state=blocked` → `stuck`. +4. Open PR facts drive PR pipeline statuses: `ci_failed`, `draft`, + `changes_requested`, `mergeable`, `approved`, `review_pending`, `pr_open`. +5. `activity_state=active` → `working`. +6. Everything else → `idle`. 
-``` -CanonicalSessionLifecycle - Version schema version of the record shape - Revision monotonic write counter (optimistic-concurrency token) - Session (state, reason) working/idle/needs_input/stuck/detecting/done/terminated - PR (state, reason) none/open/merged/closed - Runtime (state, reason) unknown/alive/exited/missing/probe_failed - Activity last-known agent activity (+ timestamp, source) ← decider input - Detecting anti-flap quarantine memory (nil unless quarantined) ← decider input -``` +## Lifecycle manager -`DeriveLegacyStatus` (`domain/status.go`) is the **sole producer** of the -display `SessionStatus`. Precedence: terminal/hard session states map directly -(they outrank PR facts) → a merged PR wins → an open PR maps by reason → else the -soft session state. So an idle worker with a CI-failing open PR displays -`ci_failed`, but a `needs_input` session shows `needs_input` regardless of the PR. +`lifecycle.Manager` is the write path for session lifecycle facts and lifecycle-owned agent nudges: -`Session` (`domain/session.go`) is the read-model: a `SessionRecord` -(persistence shape, identity + lifecycle + metadata) plus the derived `Status`. -The **Session Manager is the single producer of `Status`** — it attaches it on -read; the store and API never recompute or persist it. +- runtime observations can mark a session terminated only when runtime and + process are both clearly dead and recent activity does not contradict that; + failed/unknown probes do not persist a special state. +- activity signals update `activity_state`; `exited` also marks the session + terminated. +- PR observations do not write PR rows here, but after the PR service persists + them, lifecycle sends actionable agent nudges for CI failures, review feedback, + and merge conflicts. -## 3.
Package layout (`backend/internal/`) +## PR manager -``` -domain/ the vocabulary (imports only the std lib → no cycles) - lifecycle.go CanonicalSessionLifecycle + all sub-states/enums - status.go SessionStatus + DeriveLegacyStatus (sole display producer) - session.go SessionRecord (persisted) + Session (read-model) + id types - decide/ the PURE core — total, deterministic, zero I/O - types.go LifecycleDecision + Probe/OpenPR/Detecting inputs + tuning consts - decide.go the deciders + the anti-flap quarantine + HashEvidence -ports/ the boundaries (interfaces + DTOs) - inbound.go LifecycleManager, SessionManager (we implement) - outbound.go LifecycleStore, Notifier, AgentMessenger, Runtime/Agent/Workspace - facts.go SCMFacts, RuntimeFacts, ActivitySignal, SpawnOutcome, KillReason -lifecycle/ the LCM implementation (DECIDE + ACT) - manager.go the Apply* pipeline, per-session lock, patch diffing - decide_bridge.go fact→decide-input translation + the composition rules - reactions.go the reaction table + escalation engine + TickEscalations -session/ the SM implementation (explicit mutations) - manager.go Spawn/Kill/Restore/Cleanup/List/Get/Send + rollback -``` +`pr.Manager` records SCM observations into the PR/check/comment tables, then +forwards the observation to lifecycle for agent nudges. A merged PR marks the +owning session terminated through the lifecycle manager; other PR facts are +consumed at read time for display status. -`domain` + `ports` are the committed, stabilized **integration boundary**. -Everything else implements behind it. +## Session manager -## 4. The pure DECIDE core (`domain/decide`) +`session.Manager` performs explicit user mutations: -Total, deterministic, side-effect-free functions — the highest-value test -surface (table-tested to 100%). Key ones: +- `Spawn` creates a row, creates workspace/runtime resources, and reports the + handles to the lifecycle manager. +- `Kill` marks the row terminated, then tears down runtime/workspace resources. 
+- `Restore` relaunches a terminated session and clears `is_terminated` via the + spawn-completed path. +- `List`/`Get` attach the derived display status. -- `ResolveProbeDecision` — runtime/process liveness. An explicit kill - short-circuits to terminal; a **failed probe is never read as death** (routes - to `detecting`), as does any probe disagreement; only runtime-dead + - process-dead + no-recent-activity reaches `killed`. -- `ResolveOpenPRDecision` — the PR ladder: `ci_failing` → `changes_requested` → - `mergeable` → `approved` → `review_pending` → idle-beyond → else `pr_open`. -- `ResolveTerminalPRStateDecision` — merged → `merged` (park idle awaiting a - human decision); closed → `idle`. -- `CreateDetectingDecision` — the **anti-flap quarantine**. Counts attempts and - hashes the *timestamp-stripped* evidence; escalates to `stuck` only after 3 - consecutive unchanged-evidence ticks **or** 5 minutes since first entering - detecting (`StartedAt` is preserved across the whole episode). Changing - evidence resets the counter. +## Persistence and CDC -## 5. The LCM (`lifecycle`) +SQLite is the durable store. User-visible table changes are captured by database +triggers into `change_log`; the Go store does not manually emit CDC events. A +poller tails `change_log` and publishes live events to in-process subscribers. -Implements `ports.LifecycleManager`. Every `Apply*`/`On*` entrypoint runs the -same pipeline (`manager.go`): +## Load-bearing rules -``` -withLock(session): ← per-session serialization - load canonical → decideFn (build sparse patch) → if changed: persist → load after -return transition (before, after) -``` -then, **after the lock releases**, `react()` fires the mapped reaction. - -- **Per-session serialization** — `keyedMutex` hands out one lock per session id - (parallel across sessions, serial within one). Entries are reference-counted - and evicted when the last holder releases, so the map stays bounded. 
-- **Composition rules** (`decide_bridge.go`) — two observers must not fight over - the session axis. Liveness (runtime probes) owns the runtime + death/detecting - axis; activity owns working/idle/waiting. `isLivenessOwned` decides when a - healthy probe may *recover* a state (e.g. `detecting → working`) vs. when it - must not clobber an activity-owned `needs_input`/`blocked`. A high-confidence - activity signal may resolve a `detecting` session; an open PR writes only the - PR axis and lets `DeriveLegacyStatus` surface it. -- **Detecting-memory lifecycle** — a decision with `Detecting == nil` clears the - persisted quarantine memory (`LifecyclePatch.ClearDetecting`) so a stale prior - can't leak into a later episode. -- **ACT — reactions + escalation** (`reactions.go`) — on a genuine status - transition, `react()` maps it to a reaction (`send-to-agent` / `notify`; - `auto-merge` exists but is off by default) and dispatches it. A - per-`(session,reaction)` escalation tracker counts attempts; it escalates - (notifies a human and silences further auto-dispatch) when a numeric cap or a - duration is exceeded. The `ci-failed` budget is persistent across CI - oscillation within an open PR and re-arms on genuine recovery. `TickEscalations` - (called by the reaper) fires the duration-based escalations the synchronous - LCM can't wake itself for; it notifies outside the lock. - -## 6. The Session Manager (`session`) - -Implements `ports.SessionManager` — the explicit-mutation plumbing. It never -derives/observes lifecycle state; it routes outcomes to the LCM. - -- **Spawn** — `Workspace.Create` → build prompt → `Runtime.Create` (env - `AO_SESSION_ID`/`AO_PROJECT_ID`/`AO_ISSUE_ID`) → **seed** the initial record - (`not_started`/`spawn_requested`) via the store → `LCM.OnSpawnCompleted`. - Eager rollback unwinds prior steps on failure; an `OnSpawnCompleted` failure - routes the seeded orphan to terminal-errored (the store has no delete; a later - `Cleanup` reclaims it). 
-- **Kill** — `LCM.OnKillRequested` → `Runtime.Destroy` → `Workspace.Destroy`, - honoring the **worktree-remove safety**: after `git worktree prune`, a still- - registered path is never `rm -rf`'d (it may hold the agent's uncommitted work) - — the refusal is surfaced, not forced. -- **Restore** — reopen via `PatchLifecycle` (not re-seed): session → - `not_started`, PR → `cleared_on_restore`; relaunch with the agent's resume - command; runtime is rolled back on a post-create failure. -- **List/Get** — read records and attach the derived `Status`. **Send** — via - `AgentMessenger`. **Cleanup** — tear down terminal/stale sessions, skipping - paths with uncommitted work. - -## 7. Load-bearing invariants - -1. **Persist canonical; derive display.** Never store the display status. -2. **One authority for death.** Only the DECIDE pipeline (via `detecting`) writes - inferred terminal states; the SM's explicit-kill path goes through - `OnKillRequested`. Everything else that notices a dead runtime persists - `detecting`, never `terminated`. -3. **Failed probe ≠ dead.** Timed-out/errored probes route to `detecting`. -4. **Evidence-hash debounce** prevents flapping signals from terminating live - work; the 5-minute cap is a whole-episode wall-clock safety net. -5. **PR facts dominate** the soft session states once a PR exists. -6. **Merge-patch persistence** — writes touch only changed keys; the store is the - single disk writer (atomic write + lock + CDC). -7. **Sticky activity states** (`waiting_input`/`blocked`) do not decay by clock. -8. **Worktree-remove safety** on teardown. - -## 8. Concurrency & testing - -- Within a session, the per-session lock serializes the load→decide→persist - read-modify-write. `react()` runs *outside* the lock (so a busy-waiting - send-to-agent never holds the session mutex) — see `status.md` for the - integration-time follow-up this implies. 
-- Tests use **in-memory fakes** for every outbound port, so the LCM and SM are - fully testable with no real adapters. The SM tests drive the **real** - `lifecycle.Manager` for spawn/kill round-trips, so the SM↔LCM contract is - genuinely exercised. The `decide` package is table-tested in isolation. +- Do not store display status. +- Keep session status facts small: `activity_state`, `is_terminated`, and PR + facts are the durable inputs. +- Do not treat failed probes as death. +- Do not force-delete registered dirty worktrees. diff --git a/docs/cli/README.md b/docs/cli/README.md index d78539a0..f4af7107 100644 --- a/docs/cli/README.md +++ b/docs/cli/README.md @@ -1,32 +1,41 @@ -# AO CLI Foundation +# AO CLI -This page is the running decision log for the Agent Orchestrator CLI. Keep new -CLI decisions here as the command surface grows. +The `ao` CLI is a thin Go/Cobra client for the local Agent Orchestrator daemon. +It starts, discovers, inspects, and stops the daemon through the loopback HTTP +surface and the `running.json` handshake. It must not open SQLite directly or +call runtime, workspace, tracker, or agent adapters in-process. -## Current State +## Current commands -This branch implements the daemon-control foundation. AO now has a Go/Cobra -`ao` binary that can start, inspect, diagnose, and stop the local backend daemon -end to end. +| Command | Purpose | +|---|---| +| `ao start` | Start the daemon in the background and wait for `/readyz`. | +| `ao status` | Report daemon state from `running.json`, process liveness, `/healthz`, and `/readyz`. | +| `ao status --json` | Emit the same daemon state as machine-readable JSON. | +| `ao stop` | Gracefully stop the daemon via loopback `POST /shutdown` after verifying daemon identity. | +| `ao doctor` | Check config, data directory, DB-file presence, daemon state, `git`, and optional `zellij`. | +| `ao doctor --json` | Emit doctor checks as JSON. 
| +| `ao completion <shell>` | Generate completions for `bash`, `zsh`, `fish`, or `powershell`. | +| `ao version` / `ao --version` | Print build metadata. | +| `ao daemon` | Hidden internal daemon entrypoint used by `ao start`. | + +`go run .` in `backend/` remains a compatibility wrapper around the daemon. + +## Configuration -What works now: +The CLI and daemon share the same environment-driven config: + +| Var | Default | Purpose | +|---|---|---| +| `AO_PORT` | `3001` | Loopback daemon port. | +| `AO_RUN_FILE` | `/agent-orchestrator/running.json` | PID/port handshake. | +| `AO_DATA_DIR` | `/agent-orchestrator/data` | SQLite data directory. | +| `AO_REQUEST_TIMEOUT` | `60s` | REST request timeout. | +| `AO_SHUTDOWN_TIMEOUT` | `10s` | Graceful shutdown cap. | -- `ao start` starts the daemon in the background and waits for `/readyz`. -- `ao status` and `ao status --json` report stopped, stale, unhealthy, - not-ready, or ready daemon state. -- `ao stop` gracefully stops the daemon via the loopback `POST /shutdown` - endpoint, only after verifying the daemon's identity from `running.json`. -- `ao daemon` is the hidden internal daemon entrypoint used by `ao start`. -- `ao doctor` (and `ao doctor --json`) checks config, data dir, the database - file's presence, daemon state, and local tool availability for `git`, `tmux`, - and `zellij`. It never opens or migrates the store — the daemon is the sole - writer/migrator, so doctor only reports whether the database exists yet. -- `ao completion` generates shell completions for `bash`, `zsh`, `fish`, and - `powershell`. -- `ao version` and `ao --version` print build metadata. -- `go run .` still works as a compatibility wrapper around `internal/daemon.Run`. +The daemon always binds `127.0.0.1`.
-Manual smoke test: +## Manual smoke test ```bash cd backend @@ -43,353 +52,18 @@ export AO_PORT=3037 /tmp/ao status --json /tmp/ao stop /tmp/ao status --json +rm -rf "$tmp" ``` -What is intentionally not implemented yet: - -- `ao project ...` -- `ao spawn` -- `ao session ...` -- `ao send` -- `ao events ...` - -Next steps: - -1. Wire the existing project manager/controller shell into the daemon with a - durable SQLite-backed project store. -2. Implement `ao project list/add/show/remove` against `/api/v1/projects`. -3. Wire production Session Manager dependencies: project-backed repo resolver, - tmux/zellij runtime registry, first agent adapter, and AgentMessenger. -4. Add `/api/v1/sessions`, then implement `ao spawn`, `ao session ...`, and - `ao send`. -5. Add `/events` SSE and durable event-list reads, then implement - `ao events tail/list`. - -## Decision - -AO will use a single Go CLI binary built with -[Cobra](https://github.com/spf13/cobra). - -The CLI is a thin client for the Go daemon. It should not call SQLite, runtime -adapters, agent adapters, workspace adapters, or SCM integrations directly. It -should start, discover, inspect, and command the daemon through the loopback API -and the existing `running.json` handshake. - -Initial rules: - -- The binary name is `ao`. -- `ao daemon` is the hidden/internal entrypoint for the long-running daemon. -- User-facing commands call the daemon over loopback after reading - `running.json`. -- Commands that mutate core AO state go through HTTP API routes, not direct - stores. -- Commands support predictable text output first and `--json` where automation - is likely. -- Do not introduce Viper in the foundation. Start with explicit flags and a - small config/client layer, then add config loading once the shape is real. - -## References - -These projects inform the direction, but AO should keep its own command surface -smaller at first. 
- -| Project | CLI stack | What to take | -|---|---|---| -| [Gastown](https://github.com/gastownhall/gastown) | Go + Cobra, with Charmbracelet packages for richer terminal UI | Simple `cmd//main.go` delegating to internal command construction. Useful confirmation that Cobra is the right default for this size of Go CLI. | -| [GitHub CLI](https://github.com/cli/cli) | Go + Cobra | Command factories, explicit IO streams, JSON output, and testable command construction. | -| [Docker CLI](https://github.com/docker/cli) | Go + Cobra | Daemon/client split, command groups, signal handling, and plugin-aware CLI layout. | -| [kubectl](https://github.com/kubernetes/kubectl) | Go + Cobra | Large command tree patterns and IO abstractions. It is a useful ceiling, not a shape to copy now. | -| [Tailscale CLI](https://github.com/tailscale/tailscale) | Go + ffcli | Useful daemon-backed product model: a CLI talks to a local daemon. Do not copy the framework choice. | - -The old AO TypeScript CLI is a product/workflow reference only. We should not -port its implementation because it mixes CLI, storage, runtime, and project -logic in-process. The rewrite needs the CLI to sit outside the core daemon. - -## Current Legacy CLI Inventory - -Inventory source: installed `ao` binary at version `0.9.2`, plus the old -`packages/cli/src/program.ts` and `packages/cli/src/commands/*.ts` files. - -Count: - -- 25 public top-level commands, excluding Commander-generated `help`. -- 26 visible top-level commands if generated `help` is counted. -- 64 explicit public command nodes when nested subcommands are counted. -- 1 hidden internal command: `completion __complete`. -- No aliases are registered in the old Commander source. - -Top-level commands: - -| Command | Legacy purpose | Foundation decision | -|---|---|---| -| `start` | Start orchestrator agent and dashboard | Keep, but redefine as daemon start. | -| `stop` | Stop orchestrator agent and dashboard | Keep, daemon stop. 
| -| `status` | Show all sessions and project/session health | Keep, daemon and session status. | -| `spawn` | Spawn a single agent session | Keep after session API exists. | -| `batch-spawn` | Spawn many sessions | Defer. | -| `session` | Manage sessions | Keep a smaller subset after session API exists. | -| `send` | Send a message to a session | Keep after messaging API exists. | -| `acknowledge` | Agent self-reporting hook | Defer or replace with internal API. | -| `report` | Agent workflow transition hook | Defer or replace with internal API. | -| `review-check` | Trigger agents from review comments | Defer. | -| `review` | Manage AO-local reviewer runs | Defer. | -| `dashboard` | Start web dashboard | Defer to Electron/frontend lane. | -| `open` | Open terminal/dashboard | Defer. | -| `verify` | Verify issue after staging check | Defer. | -| `doctor` | Run install/env/runtime checks | Keep. | -| `update` | Upgrade AO | Defer to packaging/release lane. | -| `setup` | Configure integrations | Defer. | -| `plugin` | Plugin marketplace/install flow | Defer. | -| `notify` | Notification test commands | Defer. | -| `project` | Manage registered projects | Keep after project API exists. | -| `migrate-storage` | Legacy storage migration | Drop for rewrite unless a real migration appears. | -| `completion` | Generate shell completions | Keep. | -| `events` | Query activity event log | Keep a small `tail`/`list` surface after event API exists. | -| `config` | Read/write old global config | Defer. Avoid until config shape is stable. | -| `config-help` | Print old config schema | Drop. 
| - -Nested legacy commands: - -| Parent | Subcommands | -|---|---| -| `session` | `ls`, `attach`, `kill`, `cleanup`, `claim-pr`, `restore`, `remap` | -| `review` | `run`, `execute`, `send`, `list` | -| `setup` | `dashboard`, `desktop`, `webhook`, `slack`, `discord`, `composio`, `composio-slack`, `composio-discord`, `composio-discord-bot`, `composio-mail`, `openclaw` | -| `plugin` | `list`, `search`, `create`, `install`, `update`, `uninstall` | -| `project` | `ls`, `add`, `rm`, `set-default` | -| `events` | `list`, `search`, `stats` | -| `config` | `set`, `get` | -| `notify` | `test` | -| `completion` | `zsh`, hidden `__complete` | - -## Initial Command Surface - -The first CLI should make AO installable, startable, inspectable, and stoppable -before trying to recreate the old product surface. - -### Foundation Commands - -These are the first commands to implement. - -| Command | Purpose | Notes | -|---|---|---| -| `ao start` | Start the daemon, wait for `/readyz`, and print PID/port. | Reads the same config env as the daemon. Should be idempotent when an existing healthy daemon is already running. | -| `ao stop` | Stop the running daemon. | Reads `running.json`, sends graceful termination, waits for run-file removal, and reports stale/dead daemon state clearly. | -| `ao status` | Show daemon status and, once APIs exist, project/session summary. | First version can show run-file, process liveness, `/healthz`, `/readyz`, uptime, and port. Add `--json`; add `--watch` once useful. | -| `ao daemon` | Hidden internal daemon entrypoint. | This replaces the current direct `go run .` daemon entrypoint once `main.go` is extracted into `internal/daemon`. | -| `ao doctor` | Diagnose the local environment. | Start with daemon/run-file/port checks, required binaries, config dir/data dir permissions, and runtime availability. | -| `ao completion` | Generate shell completions. | Cobra can support `bash`, `zsh`, `fish`, and `powershell`. 
| -| `ao version` | Print CLI and build metadata. | Implement as both `ao version` and Cobra's `--version` flag. | - -This gives a useful first release even before project/session mutation routes are -complete. - -### First Core Application Commands - -These are the next commands once daemon HTTP routes expose the needed managers. - -| Command | Purpose | Depends on | -|---|---|---| -| `ao project list` | List registered projects. | Project API. Alias `ls` is acceptable for old muscle memory. | -| `ao project add ` | Register a project. | Project API and project identity rules. | -| `ao project show ` | Inspect project config and health. | Project API. | -| `ao project remove ` | Archive/remove a project. | Project API. Alias `rm` is acceptable. | -| `ao spawn [issue]` | Spawn one coding-agent session. | Session Manager HTTP route, tracker lookup, workspace/runtime/agent adapters. | -| `ao session list` | List sessions across projects or one project. | Session API. Alias `ls` is acceptable. | -| `ao session show ` | Show one session with lifecycle, PR, CI, runtime, and paths. | Session API. | -| `ao session attach ` | Attach to the runtime terminal. | Runtime API or direct terminal attach contract exposed by daemon. | -| `ao session kill ` | Kill a session and clean up safely. | Session Manager `Kill`. | -| `ao session restore ` | Restore a terminated/crashed session. | Session Manager `Restore`. | -| `ao send [message...]` | Send instructions to a running session. | AgentMessenger route. | -| `ao events tail` | Follow daemon activity events. | SSE/CDC API. | -| `ao events list` | List recent activity events. | Event read API. | - -This is the smallest surface that covers the core product loop: - -1. Register a repo. -2. Start AO. -3. Spawn work. -4. Inspect work. -5. Intervene in work. -6. Stop AO. 
- -## Explicit Deferrals - -Do not include these in the CLI foundation: - -- `batch-spawn`: valuable, but it multiplies error handling before single-spawn - semantics are stable. -- `dashboard` and `open`: frontend/Electron should own the primary dashboard - launch path first. -- `review`, `review-check`, and `verify`: useful workflow automation, but not - required to run core AO. -- `setup`, `plugin`, and `notify`: integration/plugin surface should come after - the daemon API and config model settle. -- `update`: belongs with distribution and release packaging. -- `config` and `config-help`: wait for a stable Go config model. Avoid copying - the old TypeScript global config behavior. -- `migrate-storage`: old storage migration is not part of the rewrite unless a - concrete migration requirement appears. -- `acknowledge` and `report`: these are agent self-reporting hooks. Prefer a - daemon/internal protocol before exposing them as durable user CLI commands. - -## Implementation Plan - -1. Add Cobra to `backend/go.mod`. -2. Move current daemon startup from `backend/main.go` into - `backend/internal/daemon.Run(ctx, opts)`. -3. Add `backend/cmd/ao/main.go` as the only user binary entrypoint. -4. Add `backend/internal/cli` for command construction, IO streams, process - launching, run-file discovery, loopback HTTP client, and output formatting. -5. Implement `ao daemon` first so the current daemon behavior is preserved. -6. Implement `ao start`, `ao stop`, and `ao status` around `running.json` and - `/healthz`/`/readyz`. -7. Add `ao doctor`, `ao completion`, and `ao version`. -8. Add command tests using Cobra command construction with fake IO, fake process - runner, and fake daemon client. Keep daemon integration tests in the daemon - packages. 
- -Suggested package layout: - -```text -backend/ - cmd/ - ao/ - main.go - internal/ - cli/ - root.go - start.go - stop.go - status.go - doctor.go - completion.go - version.go - client.go - output.go - process.go - daemon/ - daemon.go -``` - -Acceptance criteria for the foundation: - -- `go run ./cmd/ao daemon` behaves like today's `go run .`. -- `go run ./cmd/ao start` starts the daemon and waits until `/readyz` returns - ready. -- `go run ./cmd/ao status --json` works when the daemon is running, stopped, and - stale. -- `go run ./cmd/ao stop` gracefully stops the daemon and removes `running.json`. -- `go test ./...`, `go vet ./...`, and `go test -race ./...` pass. - -## Implementation Readiness - -This section records what the CLI can connect to in the current codebase and -what still needs to be built. Inventory date: 2026-05-31 after merging -`origin/main` at `438b830`. - -### Implemented Foundation - -The daemon-control foundation now exists in `backend/cmd/ao` and -`backend/internal/cli`. - -Implemented commands: - -- `ao daemon` hidden/internal daemon entrypoint. -- `ao start` starts the daemon, waits for `/readyz`, and supports `--json`, - `--timeout`, and `--log-file`. -- `ao stop` stops the daemon from `running.json`, removes stale run-files, and - supports `--json` and `--timeout`. -- `ao status` reports stopped/stale/unhealthy/not-ready/ready states and - supports `--json`. -- `ao doctor` checks config, data dir, database-file presence, daemon state, and - local tool availability for `git`, `tmux`, and `zellij`; supports `--json`. It - does not open or migrate the store (the daemon owns that). -- `ao completion` generates `bash`, `zsh`, `fish`, and `powershell` - completions. -- `ao version` prints build metadata. - -The old `backend/main.go` remains as a compatibility wrapper around -`internal/daemon.Run`, so `go run .` still starts the daemon while scripts move -to `go run ./cmd/ao ...`. 
- -### Already Implemented and Directly Usable by the CLI - -These pieces are available now and are enough to build the daemon-management -part of the CLI. - -| Area | Existing code | CLI use | -|---|---|---| -| Daemon config | `backend/internal/config` loads `AO_PORT`, `AO_REQUEST_TIMEOUT`, `AO_SHUTDOWN_TIMEOUT`, `AO_RUN_FILE`, and `AO_DATA_DIR`. Host is fixed to `127.0.0.1`. | `ao start`, `ao daemon`, `ao status`, and `ao doctor` can share the same config resolution. | -| HTTP server lifecycle | `backend/internal/httpd.Server` binds loopback, writes `running.json`, serves until context cancellation, then removes `running.json`. | `ao daemon` can preserve today's daemon behavior after extraction into `internal/daemon`. | -| Health probes | `GET /healthz` and `GET /readyz`. | `ao start` can wait for readiness; `ao status` and `ao doctor` can check daemon health. | -| Run-file handshake | `backend/internal/runfile` reads, writes, removes, and stale-checks `running.json`. | `ao status` can discover PID/port; `ao stop` can find the process; `ao start` can detect an already-running daemon. | -| Durable store | `backend/internal/storage/sqlite` opens SQLite, runs goose migrations, uses WAL, stores projects/sessions/PR/check/comment rows, and reads `change_log`. | Not directly called by user CLI commands, but confirms the daemon has a durable backend once APIs expose it. | -| CDC substrate | `backend/internal/cdc` poller and broadcaster exist; daemon starts the poller with `startCDC`. | Future `ao events tail` can build on this once an SSE/API transport exists. | -| Lifecycle manager | `backend/internal/lifecycle` is implemented and currently wired in daemon startup. | Session/status APIs can use it; CLI must wait for HTTP routes rather than calling it directly. | -| Reaper timer | `backend/internal/observe/reaper` exists and is wired. | Runtime liveness will be available once runtime registry wiring exists. 
| - -### Implemented Internally but Not Reachable by CLI Yet - -These are real backend components, but the CLI cannot responsibly use them until -they are wired into the daemon and exposed through HTTP. - -| Area | Existing code | Missing before CLI can use it | -|---|---|---| -| Project API pieces | `internal/project` has manager/controller DTOs, `/api/v1/projects` routes exist, and `sqlite.Store` has project CRUD. | Durable project-store adapter/wiring in the daemon and CLI commands. The daemon currently constructs the router with nil API deps, so project routes are not product-usable from `ao` yet. | -| Session Manager | `backend/internal/session.Manager` implements `Spawn`, `Kill`, `Restore`, `List`, `Get`, `Send`, and `Cleanup`. | Production daemon wiring with real runtime, agent, workspace, messenger, and HTTP routes. | -| Runtime adapters | tmux and zellij adapters implement `ports.Runtime` and also have attach/send/output helpers. | Runtime registry wiring in daemon, attach/send abstractions in ports/API, and selection config. | -| Workspace adapter | git worktree adapter implements create/destroy/restore/list with safety checks. | Repo resolver backed by registered projects and daemon wiring into Session Manager. | -| GitHub issue tracker | `backend/internal/adapters/tracker/github` implements read-only issue `Get`, `List`, and `Preflight`. | Tracker registry/config, spawn prompt hydration, and project tracker metadata. | -| PR facts storage | SQLite PR/check/comment writes and CDC triggers exist. | SCM/PR observer that fetches GitHub PR/CI/review facts and calls `LCM.ApplyPRObservation`. | -| Session read model | `SessionManager.List/Get` derive display status from canonical lifecycle + PR facts. | HTTP response DTOs and API routes for CLI/frontend reads. | - -### Still Missing - -These are the main gaps before the full initial command set is real. - -| Gap | Blocks | -|---|---| -| Product API client package with run-file discovery. 
| `project`, `spawn`, `session`, `send`, `events list`, richer `status`. | -| Shutdown mechanism choice: PID signal now, optional `POST /api/v1/daemon/shutdown` later. | `ao stop` polish and cross-platform behavior. | -| Session/send API route surface under `/api/v1`. | `spawn`, `session`, `send`, richer `status`. | -| Project API daemon wiring. | `ao project list/add/show/remove`. | -| SSE route for live CDC events plus durable catch-up reads. | `ao events tail`, frontend live updates. | -| Agent adapters for supported harnesses (`codex`, `claude-code`, etc.). | `ao spawn`, `ao session restore`. | -| AgentMessenger implementation over tmux/zellij. | `ao send`, LCM auto-nudge reactions. | -| Runtime registry wired with tmux/zellij. | Reaper liveness, `session attach`, spawn/kill/restore runtime work. | -| Notifier implementation/multiplexer. | Human notifications and LCM escalation side effects. | -| Activity hooks or agent self-report protocol. | Accurate working/idle/needs-input status beyond runtime/PR facts. | -| Project/tracker config model. | `project add/show`, tracker-backed `spawn`, `doctor` config checks. | -| OpenAPI/DTO/error contract. | Stable CLI/frontend API clients and tests. | - -### Command Readiness Matrix +## Product commands not present yet -| Command | Can implement now? | Existing support | Remaining work | -|---|---:|---|---| -| `ao daemon` | Implemented | Current daemon startup is extracted to `internal/daemon.Run`. | None for foundation. | -| `ao start` | Implemented | Config, run-file stale check, HTTP readiness probes. | Later: package-manager/service integration if needed. | -| `ao stop` | Implemented | Run-file discovery gives PID/port; server exits cleanly on SIGINT/SIGTERM. | Optional later shutdown HTTP route. | -| `ao status` | Partially implemented | Run-file, process liveness via PID, `/healthz`, `/readyz`. | Rich project/session summary waits for `/api/v1/projects` and `/api/v1/sessions`. 
| -| `ao doctor` | Partially implemented | Config resolution, run-file, database-file presence (no open/migrate), runtime binary checks. | Deeper adapter preflights need daemon wiring/config and should be queried from the daemon, not run in-process. | -| `ao completion` | Implemented | Cobra generators. | None for foundation. | -| `ao version` | Implemented | Build metadata can be injected with `-ldflags`. | Release tooling needs to set metadata. | -| `ao project list/add/show/remove` | Not yet | Project manager/controller route shell and SQLite project CRUD exist. | Durable project-store adapter, daemon API wiring, and CLI HTTP client. CLI must not write SQLite directly. | -| `ao spawn` | Not yet | Session Manager exists; runtime/workspace/tracker pieces partly exist. | Agent adapters, registry/config wiring, project lookup, tracker hydration, HTTP route. | -| `ao session list/show` | Not yet | Store and Session Manager read model exist. | HTTP routes and response DTOs. | -| `ao session attach` | Not yet | tmux/zellij have attach command helpers. | Runtime attach port/API and terminal-launch policy. | -| `ao session kill/restore` | Not yet | Session Manager implements both. | Production wiring and HTTP routes. | -| `ao send` | Not yet | Session Manager has `Send`; tmux/zellij have send helpers. | AgentMessenger implementation, port/API wiring, busy/idle delivery policy. | -| `ao events tail/list` | Not yet | Durable `change_log`, CDC poller, in-process broadcaster. | SSE route and durable event-list route. | +The backend has project, session, lifecycle, terminal, and CDC building blocks, +but the public CLI currently exposes only daemon-control commands. Add product +commands only when a daemon HTTP route owns the corresponding mutation/read: -### Recommended Build Order +- `ao project ...` should call project HTTP routes. +- `ao spawn`, `ao session ...`, and `ao send` should call session/messaging HTTP routes. +- `ao events ...` should call CDC/event HTTP routes. 
-1. Build CLI foundation around the daemon only: `daemon`, `start`, `stop`, - `status`, `doctor`, `completion`, `version`. -2. Wire the existing project manager/controller shell into the daemon with a - durable SQLite-backed store, then implement `project list/add/show/remove`. -3. Wire production Session Manager dependencies: project-backed repo resolver, - tmux/zellij runtime registry, first agent adapter, and AgentMessenger. -4. Add `/api/v1/sessions` and implement `spawn`, `session list/show/kill/restore`, - and `send`. -5. Add `/events` SSE plus event-list reads, then implement `events tail/list`. +Do not port old in-process TypeScript CLI behavior that mixed command handling +with storage and runtime implementation details. diff --git a/docs/status.md b/docs/status.md index 9bb79cdb..6ca5bc27 100644 --- a/docs/status.md +++ b/docs/status.md @@ -1,98 +1,29 @@ -# LCM + Session Manager — status & roadmap +# agent-orchestrator status -Where the lane stands, what's left, and where to plug in. +Current main contains the Go backend daemon, Cobra CLI foundation, SQLite store, +CDC poller/broadcaster, lifecycle/session managers, terminal mux, project API +controller/manager work, runtime/workspace/tracker adapters, and CDC-backed event rows. -## Branch model +## Build & test -`feat/lcm-sm-contracts` is the **lane integration branch**: each sub-PR below -branched off it and merged **into** it. The whole lane lands on `main` as one -unit once it's ready. Sub-PRs were reviewed against the integration branch; -the eventual lane→main merge is a single cumulative review. 
- -## Done — implementation complete (behind fakes) - -| Area | What landed | PR | -|------|-------------|----| -| Skeleton | `backend/` (Go) + `frontend/` (Electron/TS) | #1 (on `main`) | -| Contracts + CI | `domain/` + `ports/`; Go + gitleaks workflows | #2 | -| Pure DECIDE core | the deciders + anti-flap quarantine + exhaustive truth-table tests | #4 | -| LCM — pipeline | `Apply*` pipeline, per-session serialization, store integration, composition rules, detecting-memory lifecycle | #5 | -| LCM — reactions | reaction table + escalation engine + real `TickEscalations` | #6 | -| Session Manager | spawn / kill / restore / cleanup / list, eager rollback, worktree-remove safety | #7 | - -`gofmt` / `go build` / `go vet` / `go test -race` all green across `domain`, -`domain/decide`, `lifecycle`, and `session`. The `decide` core is at 100% -statement coverage; the impl packages cover the load-bearing logic including the -error/rollback paths. - -### Build & test - -``` -cd backend -gofmt -l . # must print nothing -go build ./... -go vet ./... -go test -race ./... -go test -cover ./... +```bash +npm run lint ``` -## Not done — the integration phase - -Everything above runs against **in-memory fakes**. Making it a live system means -swapping fakes for real adapters (built by other lanes) behind the existing -ports, and resolving the carried-forward items below. - -### Carried-forward items (must be addressed as real adapters land) - -- **`react()` out-of-lock dispatch.** Reactions fire after the per-session lock - releases (deliberate, so a busy-waiting send-to-agent doesn't hold the mutex). - Under a live daemon with concurrent observers this can dispatch on a stale - snapshot / out of order. Give `react()` a per-session ordering (a small react - queue) or re-check the triggering state before dispatching. Documented in - `lifecycle/reactions.go`. -- **`ExpectedRevision` optimistic-concurrency is unused.** The in-process - per-session mutex covers a single daemon. 
Multi-writer or CDC-driven setups - must use the `LifecyclePatch.ExpectedRevision` CAS the contract already exposes. -- **Store `Seed` + `Get` need a real implementation.** The Session Manager added - two record-with-identity methods to `LifecycleStore`; the real persistence - layer must implement them (create-with-identity that rejects an existing id; - full-record read by id). Documented in `ports/outbound.go`. - -### Real adapters needed (other lanes) - -| Port | Real adapter | Owning lane | -|------|--------------|-------------| -| `LifecycleStore` | persistence layer (flat-file/KV + atomic write + lock + CDC) | persistence | -| `SCMFacts` producer | SCM poller (batch PR/CI/review enrichment) | SCM | -| `Runtime` / `Agent` / `Workspace` | tmux runtime, claude-code/codex agent, git-worktree workspace | coding-agents | -| `Notifier` | desktop/Slack notifier | notifications | -| `AgentMessenger` | tmux inject with busy-detect + delivery verify | coding-agents | -| `SessionManager` consumer | backend API (routes/controllers) + OpenAPI | API | - -### Open cross-lane contract questions - -- **SCM facts** — does `SCMFacts` match what the poller can cheaply produce - (batch enrichment, CI log tail as a pointer)? -- **Persistence** — is `LifecycleStore` + `LifecyclePatch` the right boundary? - Per-session lock vs. the `ExpectedRevision` CAS? -- **API** — is the `SessionManager` interface + the `Session` read-model - OpenAPI-friendly? - -### Land the lane → `main` +## Current shape -A final cumulative review of `feat/lcm-sm-contracts` vs. `main`, then merge the -complete lane in one unit. +- CLI: `ao start`, `status`, `stop`, `doctor`, `completion`, `version`, and the + hidden daemon entrypoint. +- Session facts: `activity_state` and `is_terminated`; display status is derived + from those plus PR facts. +- SQLite: migrations create projects, sessions, PR/check/comment, and `change_log` tables. +- CDC: DB triggers append to `change_log`; the poller broadcasts live events. 
+- Session Manager: spawn/kill/restore/list/get/send/cleanup over runtime, + workspace, agent, store, messenger, and lifecycle ports. It is package-level + code today; daemon HTTP routes for session commands are not wired yet. -## Where to plug in (for someone picking this up) +## Next integration work -- **Implementing a real adapter?** Write it to satisfy the matching interface in - `ports/`, then construct the `lifecycle.Manager` / `session.Manager` with it in - place of the fake. Nothing in `domain`/`lifecycle`/`session` should need to - change. -- **Changing decision behavior?** It lives in `domain/decide` (pure) — add a - truth-table case first; nothing there does I/O. -- **Adding a reaction?** Extend the table in `lifecycle/reactions.go` and map the - triggering status in `reactionEventFor`. -- **Don't** persist the display status, conclude death outside the probe - pipeline, or `rm -rf` a still-registered worktree — see the invariants in - [architecture.md](architecture.md#7-load-bearing-invariants). +- Wire production agent adapters. +- Finish project/session HTTP routes and CLI product commands. +- Add SSE/event read endpoints over the CDC log. diff --git a/package.json b/package.json new file mode 100644 index 00000000..4010149d --- /dev/null +++ b/package.json @@ -0,0 +1,9 @@ +{ + "name": "agent-orchestrator", + "private": true, + "scripts": { + "lint": "cd backend && go test ./... 
&& go run github.com/golangci/golangci-lint/v2/cmd/golangci-lint@v2.12.2 run --path-mode=abs", + "frontend:typecheck": "npm --prefix frontend run typecheck", + "sqlc": "cd backend && go run github.com/sqlc-dev/sqlc/cmd/sqlc@v1.31.1 generate" + } +} diff --git a/test/cli/Dockerfile b/test/cli/Dockerfile index fb5d85b2..6ed08cc6 100644 --- a/test/cli/Dockerfile +++ b/test/cli/Dockerfile @@ -24,11 +24,11 @@ RUN cd backend && CGO_ENABLED=0 go build -trimpath -o /out/ao ./cmd/ao # ---- stage 2: a clean machine with NO Go toolchain, just like an end user ---- FROM debian:bookworm-slim AS run -# Runtime deps a fresh user would need: git is required by `ao doctor`; tmux is -# the optional runtime it probes for; curl drives the HTTP-level guard checks; -# ca-certificates for good measure. +# Runtime deps a fresh user would need: git is required by `ao doctor`; curl +# drives the HTTP-level guard checks; ca-certificates for good measure. Zellij is +# optional for this smoke test, so doctor reports a WARN if it is absent. RUN apt-get update \ - && apt-get install -y --no-install-recommends git tmux curl ca-certificates \ + && apt-get install -y --no-install-recommends git curl ca-certificates \ && rm -rf /var/lib/apt/lists/* # "Install" the CLI the way a user would: drop the binary on PATH.