Skip to content

Commit 7d442a7

Browse files
adaltonclaude
andauthored
feat: auto-stop idle sessions and preserve git repo state across restarts (#651)
## Summary - Add configurable inactivity timeout that automatically stops idle sessions, reclaiming cluster resources - Preserve git repo state (local branches, uncommitted/staged changes) to S3 on pod shutdown and restore on resume, so work is never lost when pods are stopped or recycled - Three-tier timeout resolution: session-level `spec.inactivityTimeout` > project-level `ProjectSettings.spec.inactivityTimeoutSeconds` > default 24h; set to 0 to disable ## What changed **CRDs:** - `AgenticSession`: added `spec.inactivityTimeout`, `status.lastActivityTime`, `status.stoppedReason` - `ProjectSettings`: added `spec.inactivityTimeoutSeconds` **Backend:** - Debounced activity tracking in `agui_proxy.go` updates `status.lastActivityTime` on AG-UI events (RUN_STARTED, TEXT_MESSAGE_START, TEXT_MESSAGE_CONTENT, TOOL_CALL_START) - `parseStatus()` extracts new status fields for API responses **Operator:** - New `inactivity.go` with `shouldAutoStop()`, `resolveInactivityTimeout()`, `triggerInactivityStop()`, and per-namespace ProjectSettings cache - `monitorPod()` checks inactivity on each tick; re-reads CR before stopping to avoid races - `reconciler.go` reads `stop-reason` annotation to set `status.stoppedReason` and condition reason **Frontend:** - "Stopped (idle)" badge on session cards - Inactivity alert banner in session header (full/actions-only modes) - "This session was automatically stopped after being idle" message in hibernated section **State-sync:** - `sync.sh`: on SIGTERM, creates git bundles, uncommitted/staged patches, and metadata.json per repo - `hydrate.sh`: restores repos from bundles, checks out saved branch, applies patches (best-effort) - `TerminationGracePeriodSeconds` increased from 30 to 60 **Tests:** - `inactivity_test.go`: 16 tests covering `shouldAutoStop`, `resolveInactivityTimeout`, `triggerInactivityStop`, `getProjectInactivityTimeout` - `agui_proxy_test.go`: 18 subtests for `isActivityEvent` ## Test plan - [x] Operator and backend unit tests 
pass (`go test -race ./...`) - [x] Frontend builds with zero errors/warnings (`npm run build`) - [x] Create session with 180s inactivity timeout, verify auto-stop after idle period - [x] Verify `status.stoppedReason=inactivity` and "Stopped (idle)" badge in UI - [x] Verify git repo with local branch + uncommitted changes survives stop/resume cycle - [x] Verify `inactivityTimeout: 0` disables auto-stop - [x] Verify sessions without explicit timeout fall back to project settings, then 24h default Fixes: RHOAIENG-49782 --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent e4250c7 commit 7d442a7

18 files changed

Lines changed: 1240 additions & 20 deletions

File tree

components/backend/handlers/sessions.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -245,6 +245,14 @@ func parseStatus(status map[string]interface{}) *types.AgenticSessionStatus {
245245
result.CompletionTime = types.StringPtr(completionTime)
246246
}
247247

248+
if lastActivityTime, ok := status["lastActivityTime"].(string); ok && strings.TrimSpace(lastActivityTime) != "" {
249+
result.LastActivityTime = types.StringPtr(lastActivityTime)
250+
}
251+
252+
if stoppedReason, ok := status["stoppedReason"].(string); ok && stoppedReason != "" {
253+
result.StoppedReason = types.StringPtr(stoppedReason)
254+
}
255+
248256
// jobName and runnerPodName removed - they go stale on restarts
249257
// Use GET /k8s-resources endpoint for live job/pod information
250258

components/backend/types/session.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ type AgenticSessionSpec struct {
1818
DisplayName string `json:"displayName"`
1919
LLMSettings LLMSettings `json:"llmSettings"`
2020
Timeout int `json:"timeout"`
21+
InactivityTimeout *int `json:"inactivityTimeout,omitempty"`
2122
UserContext *UserContext `json:"userContext,omitempty"`
2223
BotAccount *BotAccountRef `json:"botAccount,omitempty"`
2324
ResourceOverrides *ResourceOverrides `json:"resourceOverrides,omitempty"`
@@ -41,6 +42,8 @@ type AgenticSessionStatus struct {
4142
Phase string `json:"phase,omitempty"`
4243
StartTime *string `json:"startTime,omitempty"`
4344
CompletionTime *string `json:"completionTime,omitempty"`
45+
LastActivityTime *string `json:"lastActivityTime,omitempty"`
46+
StoppedReason *string `json:"stoppedReason,omitempty"`
4447
ReconciledRepos []ReconciledRepo `json:"reconciledRepos,omitempty"`
4548
ReconciledWorkflow *ReconciledWorkflow `json:"reconciledWorkflow,omitempty"`
4649
SDKSessionID string `json:"sdkSessionId,omitempty"`

components/backend/websocket/agui_proxy.go

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323
"log"
2424
"net/http"
2525
"strings"
26+
"sync"
2627
"time"
2728

2829
"github.com/gin-gonic/gin"
@@ -33,6 +34,30 @@ import (
3334
"k8s.io/client-go/kubernetes"
3435
)
3536

37+
const (
38+
// activityDebounceInterval is the minimum interval between CR status updates for lastActivityTime.
39+
// Inactivity timeouts are normally hours long (the default is 24h), so minute-level granularity is sufficient.
40+
activityDebounceInterval = 60 * time.Second
41+
42+
// activityUpdateTimeout bounds how long a single activity status update can take.
43+
activityUpdateTimeout = 10 * time.Second
44+
)
45+
46+
// activityUpdateSem limits concurrent goroutines spawned by updateLastActivityTime.
47+
// With 60s debounce, at most one goroutine per session is active at a time under normal
48+
// conditions; this cap protects against pathological bursts (e.g., many sessions starting
49+
// simultaneously with immediate=true).
50+
var activityUpdateSem = make(chan struct{}, 50)
51+
52+
// lastActivityUpdateTimes tracks the last time we updated lastActivityTime on the CR
53+
// for each session to avoid excessive API calls. Key: "namespace/sessionName"
54+
var lastActivityUpdateTimes sync.Map
55+
56+
// sessionProjectMap maps sessionName → projectName so that persistStreamedEvent
57+
// (which only receives sessionID) can look up the project for activity tracking.
58+
// Populated by HandleAGUIRunProxy on each run request.
59+
var sessionProjectMap sync.Map
60+
3661
// HandleAGUIEvents serves the AG-UI event stream over SSE. Clients
3762
// (typically EventSource) connect here to receive all events for a
3863
// session — both persisted history and live events from active runs.
@@ -180,6 +205,9 @@ func HandleAGUIRunProxy(c *gin.Context) {
180205

181206
log.Printf("AGUI Proxy: run=%s session=%s/%s msgs=%d", truncID(runID), projectName, sessionName, len(rawMessages))
182207

208+
// Store session→project mapping for activity tracking in persistStreamedEvent
209+
sessionProjectMap.Store(sessionName, projectName)
210+
183211
// Parse messages for display name generation and hidden metadata
184212
var minimalMsgs []types.Message
185213
if len(rawMessages) > 0 {
@@ -350,6 +378,16 @@ func persistStreamedEvent(sessionID, runID, threadID, jsonData string) {
350378
}
351379

352380
persistEvent(sessionID, event)
381+
382+
// Update lastActivityTime on CR for activity events (debounced).
383+
// Extract event type to check; projectName is derived from the
384+
// sessionID-to-project mapping populated by HandleAGUIRunProxy.
385+
eventType, _ := event["type"].(string)
386+
if isActivityEvent(eventType) {
387+
if projectName, ok := sessionProjectMap.Load(sessionID); ok {
388+
updateLastActivityTime(projectName.(string), sessionID, eventType == types.EventTypeRunStarted)
389+
}
390+
}
353391
}
354392

355393
// ─── POST /agui/interrupt ────────────────────────────────────────────
@@ -727,3 +765,76 @@ func triggerDisplayNameGenerationIfNeeded(projectName, sessionName string, messa
727765
sessionCtx := handlers.ExtractSessionContext(spec)
728766
handlers.GenerateDisplayNameAsync(projectName, sessionName, userMessage, sessionCtx)
729767
}
768+
769+
// isActivityEvent returns true for AG-UI event types that indicate session activity.
770+
func isActivityEvent(eventType string) bool {
771+
switch eventType {
772+
case types.EventTypeRunStarted,
773+
types.EventTypeTextMessageStart,
774+
types.EventTypeTextMessageContent,
775+
types.EventTypeToolCallStart:
776+
return true
777+
default:
778+
return false
779+
}
780+
}
781+
782+
// updateLastActivityTime updates the lastActivityTime field on the AgenticSession CR status.
783+
// Updates are debounced to avoid excessive API calls. RUN_STARTED events bypass the debounce
784+
// to immediately mark the session as active.
785+
func updateLastActivityTime(projectName, sessionName string, immediate bool) {
786+
if handlers.DynamicClient == nil {
787+
log.Printf("Activity tracking: DynamicClient is nil, skipping update for %s/%s", projectName, sessionName)
788+
return
789+
}
790+
791+
key := projectName + "/" + sessionName
792+
now := time.Now()
793+
794+
if !immediate {
795+
if lastUpdate, ok := lastActivityUpdateTimes.Load(key); ok {
796+
if now.Sub(lastUpdate.(time.Time)) < activityDebounceInterval {
797+
return // Debounce: too soon since last update
798+
}
799+
}
800+
}
801+
802+
lastActivityUpdateTimes.Store(key, now)
803+
804+
// Bound concurrency: drop the update if all slots are busy (debounce will retry later).
805+
select {
806+
case activityUpdateSem <- struct{}{}:
807+
default:
808+
return
809+
}
810+
811+
// Run in goroutine to avoid blocking event processing
812+
go func() {
813+
defer func() { <-activityUpdateSem }()
814+
815+
gvr := handlers.GetAgenticSessionV1Alpha1Resource()
816+
ctx, cancel := context.WithTimeout(context.Background(), activityUpdateTimeout)
817+
defer cancel()
818+
819+
obj, err := handlers.DynamicClient.Resource(gvr).Namespace(projectName).Get(ctx, sessionName, metav1.GetOptions{})
820+
if err != nil {
821+
log.Printf("Activity tracking: failed to get session %s/%s: %v", projectName, sessionName, err)
822+
return
823+
}
824+
825+
status, _, _ := unstructured.NestedMap(obj.Object, "status")
826+
if status == nil {
827+
status = make(map[string]any)
828+
}
829+
status["lastActivityTime"] = now.UTC().Format(time.RFC3339)
830+
if err := unstructured.SetNestedField(obj.Object, status, "status"); err != nil {
831+
log.Printf("Activity tracking: failed to set status for %s/%s: %v", projectName, sessionName, err)
832+
return
833+
}
834+
835+
_, err = handlers.DynamicClient.Resource(gvr).Namespace(projectName).UpdateStatus(ctx, obj, metav1.UpdateOptions{})
836+
if err != nil {
837+
log.Printf("Activity tracking: failed to update lastActivityTime for %s/%s: %v", projectName, sessionName, err)
838+
}
839+
}()
840+
}
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
package websocket
2+
3+
import (
4+
"testing"
5+
6+
"ambient-code-backend/types"
7+
)
8+
9+
func TestIsActivityEvent(t *testing.T) {
10+
activityEvents := []struct {
11+
name string
12+
eventType string
13+
}{
14+
{"RUN_STARTED", types.EventTypeRunStarted},
15+
{"TEXT_MESSAGE_START", types.EventTypeTextMessageStart},
16+
{"TEXT_MESSAGE_CONTENT", types.EventTypeTextMessageContent},
17+
{"TOOL_CALL_START", types.EventTypeToolCallStart},
18+
}
19+
20+
for _, tc := range activityEvents {
21+
t.Run(tc.name+" is activity", func(t *testing.T) {
22+
if !isActivityEvent(tc.eventType) {
23+
t.Errorf("expected %s to be an activity event", tc.name)
24+
}
25+
})
26+
}
27+
28+
nonActivityEvents := []struct {
29+
name string
30+
eventType string
31+
}{
32+
{"RUN_FINISHED", types.EventTypeRunFinished},
33+
{"RUN_ERROR", types.EventTypeRunError},
34+
{"STEP_STARTED", types.EventTypeStepStarted},
35+
{"STEP_FINISHED", types.EventTypeStepFinished},
36+
{"TEXT_MESSAGE_END", types.EventTypeTextMessageEnd},
37+
{"TOOL_CALL_ARGS", types.EventTypeToolCallArgs},
38+
{"TOOL_CALL_END", types.EventTypeToolCallEnd},
39+
{"STATE_SNAPSHOT", types.EventTypeStateSnapshot},
40+
{"STATE_DELTA", types.EventTypeStateDelta},
41+
{"MESSAGES_SNAPSHOT", types.EventTypeMessagesSnapshot},
42+
{"RAW", types.EventTypeRaw},
43+
{"META", types.EventTypeMeta},
44+
{"empty string", ""},
45+
{"unknown event", "UNKNOWN_EVENT"},
46+
}
47+
48+
for _, tc := range nonActivityEvents {
49+
t.Run(tc.name+" is not activity", func(t *testing.T) {
50+
if isActivityEvent(tc.eventType) {
51+
t.Errorf("expected %s to NOT be an activity event", tc.name)
52+
}
53+
})
54+
}
55+
}

components/frontend/src/app/projects/[name]/sessions/[sessionName]/page.tsx

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1525,6 +1525,11 @@ export default function ProjectSessionDetailPage({
15251525
{["Stopped", "Completed", "Failed"].includes(phase) && (
15261526
<div className="max-w-sm">
15271527
<h3 className="font-semibold text-lg mb-4">Session Hibernated</h3>
1528+
{phase === "Stopped" && session?.status?.stoppedReason === "inactivity" && (
1529+
<p className="text-sm text-muted-foreground mb-4">
1530+
This session was automatically stopped after being idle. You can resume it to continue working.
1531+
</p>
1532+
)}
15281533

15291534
{/* Session details */}
15301535
<div className="space-y-3 mb-6 text-left">
@@ -2048,6 +2053,11 @@ export default function ProjectSessionDetailPage({
20482053
{["Stopped", "Completed", "Failed"].includes(phase) && (
20492054
<div className="max-w-sm">
20502055
<h3 className="font-semibold text-lg mb-4">Session Hibernated</h3>
2056+
{phase === "Stopped" && session?.status?.stoppedReason === "inactivity" && (
2057+
<p className="text-sm text-muted-foreground mb-4">
2058+
This session was automatically stopped after being idle. You can resume it to continue working.
2059+
</p>
2060+
)}
20512061
<Button onClick={handleContinue} size="lg" className="w-full" disabled={continueMutation.isPending}>
20522062
{continueMutation.isPending ? (
20532063
<>

components/frontend/src/app/projects/[name]/sessions/[sessionName]/session-header.tsx

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@
22

33
import { useState } from 'react';
44
import { Button } from '@/components/ui/button';
5-
import { RefreshCw, Octagon, Trash2, Copy, MoreVertical, Info, Play, Pencil, Download, FileText, Printer, Loader2, HardDrive } from 'lucide-react';
5+
import { Alert, AlertDescription } from '@/components/ui/alert';
6+
import { RefreshCw, Octagon, Trash2, Copy, MoreVertical, Info, Play, Pencil, Download, FileText, Printer, Loader2, HardDrive, Clock } from 'lucide-react';
67
import { CloneSessionDialog } from '@/components/clone-session-dialog';
78
import { SessionDetailsModal } from '@/components/session-details-modal';
89
import { EditSessionNameDialog } from '@/components/edit-session-name-dialog';
@@ -51,6 +52,7 @@ export function SessionHeader({
5152
const canStop = isRunning || phase === "Creating";
5253
const canResume = phase === "Stopped";
5354
const canDelete = phase === "Completed" || phase === "Failed" || phase === "Stopped";
55+
const stoppedDueToInactivity = phase === "Stopped" && session.status?.stoppedReason === "inactivity";
5456

5557
const { refetch: fetchExportData } = useSessionExport(projectName, session.metadata.name, false);
5658
const { data: mcpStatus } = useMcpStatus(projectName, session.metadata.name, isRunning);
@@ -270,7 +272,15 @@ export function SessionHeader({
270272
// Actions only (Stop/Resume buttons) - for below breadcrumb
271273
if (renderMode === 'actions-only') {
272274
return (
273-
<div>
275+
<div className="space-y-2">
276+
{stoppedDueToInactivity && (
277+
<Alert variant="info">
278+
<Clock className="h-4 w-4" />
279+
<AlertDescription>
280+
This session was automatically stopped after being idle. You can resume it to continue working.
281+
</AlertDescription>
282+
</Alert>
283+
)}
274284
<div className="flex items-start justify-start">
275285
<div className="flex gap-2">
276286
{canStop && (
@@ -305,7 +315,15 @@ export function SessionHeader({
305315

306316
// Full mode (original layout)
307317
return (
308-
<div>
318+
<div className="space-y-2">
319+
{stoppedDueToInactivity && (
320+
<Alert variant="info">
321+
<Clock className="h-4 w-4" />
322+
<AlertDescription>
323+
This session was automatically stopped after being idle. You can resume it to continue working.
324+
</AlertDescription>
325+
</Alert>
326+
)}
309327
<div className="flex items-start justify-end">
310328
<div className="flex gap-2">
311329
<Button

components/frontend/src/components/status-badge.tsx

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -120,9 +120,10 @@ export function StatusBadge({
120120
}
121121

122122
/**
123-
* Session phase badge with appropriate styling
123+
* Session phase badge with appropriate styling.
124+
* When stoppedReason is "inactivity", the label changes to "Stopped (idle)".
124125
*/
125-
export function SessionPhaseBadge({ phase }: { phase: string }) {
126+
export function SessionPhaseBadge({ phase, stoppedReason }: { phase: string; stoppedReason?: string }) {
126127
const statusMap: Record<string, StatusVariant> = {
127128
pending: 'pending',
128129
creating: 'pending',
@@ -137,7 +138,11 @@ export function SessionPhaseBadge({ phase }: { phase: string }) {
137138
const status = statusMap[phase.toLowerCase()] || 'default';
138139
const shouldAnimate = status === 'running' || status === 'stopping';
139140

140-
return <StatusBadge status={status} label={phase} pulse={shouldAnimate} />;
141+
const label = phase === 'Stopped' && stoppedReason === 'inactivity'
142+
? 'Stopped (idle)'
143+
: phase;
144+
145+
return <StatusBadge status={status} label={label} pulse={shouldAnimate} />;
141146
}
142147

143148
/**

components/frontend/src/components/workspace-sections/sessions-section.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -243,7 +243,7 @@ export function SessionsSection({ projectName }: SessionsSectionProps) {
243243
</Link>
244244
</TableCell>
245245
<TableCell>
246-
<SessionPhaseBadge phase={phase} />
246+
<SessionPhaseBadge phase={phase} stoppedReason={session.status?.stoppedReason} />
247247
</TableCell>
248248
<TableCell>
249249
<span className="text-xs px-2 py-1 rounded border bg-muted/50">

components/frontend/src/types/agentic-session.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ export type AgenticSessionSpec = {
2323
initialPrompt?: string;
2424
llmSettings: LLMSettings;
2525
timeout: number;
26+
inactivityTimeout?: number;
2627
displayName?: string;
2728
project?: string;
2829
interactive?: boolean;
@@ -159,6 +160,8 @@ export type AgenticSessionStatus = {
159160
phase: AgenticSessionPhase;
160161
startTime?: string;
161162
completionTime?: string;
163+
lastActivityTime?: string;
164+
stoppedReason?: "user" | "inactivity";
162165
reconciledRepos?: ReconciledRepo[];
163166
reconciledWorkflow?: ReconciledWorkflow;
164167
sdkSessionId?: string;

components/frontend/src/types/api/sessions.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ export type AgenticSessionSpec = {
4545
initialPrompt?: string;
4646
llmSettings: LLMSettings;
4747
timeout: number;
48+
inactivityTimeout?: number;
4849
displayName?: string;
4950
project?: string;
5051
interactive?: boolean;
@@ -89,6 +90,8 @@ export type AgenticSessionStatus = {
8990
phase: AgenticSessionPhase;
9091
startTime?: string;
9192
completionTime?: string;
93+
lastActivityTime?: string;
94+
stoppedReason?: "user" | "inactivity";
9295
jobName?: string;
9396
runnerPodName?: string;
9497
reconciledRepos?: ReconciledRepo[];

0 commit comments

Comments
 (0)