Skip to content

Commit 89cd2a4

Browse files
committed
fix: per-step timeoutMinutes, tools.dataSource, and review follow-ups
1 parent c718372 commit 89cd2a4

15 files changed

Lines changed: 384 additions & 113 deletions

.ai/spec/how/reconciler.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -146,7 +146,7 @@ Audience: AI agents. Behavioral rules and phase semantics live in **what/** spec
146146
- **`AgentCaller`:** Boundary between reconciler and runtime (stub vs sandbox+HTTP). Methods mirror workflow steps plus `ReleaseSandboxes`.
147147
- **`SandboxProvider`:** Swappable claim/wait/release (tests can fake).
148148
- **`resolveProposal`:** Produces `resolvedWorkflow` with cached `Agent` + `LLMProvider` per name; applies per-stage agent overrides from `ProposalApproval` via `getStageOverrideAgent`; `Execution`/`Verification` steps nil when corresponding spec sections are zero.
149-
- **`EnsureAgentTemplate`:** Deterministic derived `SandboxTemplate` name from hash of LLM spec, model, skills, MCP servers, required secrets, step, and **base template resourceVersion**. Patches pod template env/volumes for credentials, Vertex/Bedrock/Azure extras, skills image/paths, MCP JSON env, `LIGHTSPEED_MODE`. GC older templates labeled for same agent+step.
149+
- **`EnsureAgentTemplate`:** Deterministic derived `SandboxTemplate` name from hash of LLM spec, model, skills, MCP servers, required secrets, dataSource PVC, step, and **base template resourceVersion**. `dataSource` is extracted from `tools.DataSource` (set in `ToolsSpec`). Patches pod template env/volumes for credentials, Vertex/Bedrock/Azure extras, skills image/paths, MCP JSON env, `LIGHTSPEED_MODE`, and optional `/data/input` PVC mount. GC older templates labeled for same agent+step.
150150

151151
---
152152

api/v1alpha1/proposal_types.go

Lines changed: 17 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -258,10 +258,22 @@ type ProposalStep struct {
258258
// for this step. Use this when different steps need different skills.
259259
// +optional
260260
Tools ToolsSpec `json:"tools,omitzero"`
261+
262+
// timeoutMinutes sets the timeout for this step's sandbox agent call.
263+
// This controls how long the operator waits for the sandbox pod to
264+
// become ready and for the agent to complete its work. Increase this
265+
// for long-running tools (e.g., IntelliAide RCA takes 10-30 minutes).
266+
// Defaults to 5 minutes when omitted.
267+
//
268+
// Mutable: can be adjusted before approving a step.
269+
// +optional
270+
// +kubebuilder:validation:Minimum=1
271+
// +kubebuilder:validation:Maximum=60
272+
TimeoutMinutes *int32 `json:"timeoutMinutes,omitempty"`
261273
}
262274

263275
func (s ProposalStep) IsZero() bool {
264-
return s.Agent == "" && s.Tools.IsZero()
276+
return s.Agent == "" && s.Tools.IsZero() && s.TimeoutMinutes == nil
265277
}
266278

267279
// DataSource references a pre-existing PersistentVolumeClaim containing
@@ -294,7 +306,6 @@ type DataSource struct {
294306
// +kubebuilder:validation:XValidation:rule="!has(oldSelf.analysis) || (has(self.analysis) && self.analysis == oldSelf.analysis)",message="analysis is immutable once set"
295307
// +kubebuilder:validation:XValidation:rule="!has(oldSelf.execution) || (has(self.execution) && self.execution == oldSelf.execution)",message="execution is immutable once set"
296308
// +kubebuilder:validation:XValidation:rule="!has(oldSelf.verification) || (has(self.verification) && self.verification == oldSelf.verification)",message="verification is immutable once set"
297-
// +kubebuilder:validation:XValidation:rule="!has(oldSelf.dataSource) || (has(self.dataSource) && self.dataSource == oldSelf.dataSource)",message="dataSource is immutable once set"
298309
type ProposalSpec struct {
299310
// request is the user's original request, alert description, or a
300311
// description of what triggered this proposal. This text is passed to
@@ -342,25 +353,17 @@ type ProposalSpec struct {
342353
AnalysisOutput AnalysisOutput `json:"analysisOutput,omitzero"`
343354

344355
// tools defines the default tools for all steps: skills images,
345-
// MCP servers, and required secrets. Per-step tools
346-
// (analysis.tools, execution.tools, verification.tools) replace
347-
// this default for individual steps.
356+
// MCP servers, required secrets, and an optional dataSource PVC.
357+
// Per-step tools (analysis.tools, execution.tools, verification.tools)
358+
// replace this default for individual steps, so a dataSource set in
359+
// spec.analysis.tools is mounted only in the analysis sandbox.
348360
//
349361
// Immutable: the skills and secrets available to the agent are
350362
// fixed at creation. Changing tools mid-flight could violate the
351363
// assumptions of an in-progress analysis or execution.
352364
// +optional
353365
Tools ToolsSpec `json:"tools,omitzero"`
354366

355-
// dataSource references a PVC containing pre-populated input data
356-
// (e.g., must-gather bundles, diagnostic data). The operator mounts
357-
// it read-only at /data/input in the sandbox pod. Skills discover
358-
// input data at this standard location.
359-
//
360-
// Immutable: input data source is fixed at creation.
361-
// +optional
362-
DataSource *DataSource `json:"dataSource,omitzero"`
363-
364367
// analysis defines per-step configuration for the analysis step,
365368
// including which agent handles it and any per-step tools.
366369
//
@@ -382,18 +385,6 @@ type ProposalSpec struct {
382385
// +optional
383386
Verification ProposalStep `json:"verification,omitzero"`
384387

385-
// timeoutMinutes sets the per-step timeout for sandbox agent calls.
386-
// This controls how long the operator waits for the sandbox pod to
387-
// become ready and for the agent to complete its work. Increase this
388-
// for long-running tools (e.g., IntelliAide RCA takes 10-30 minutes).
389-
// Defaults to 5 minutes when omitted.
390-
//
391-
// Mutable: can be adjusted before approving a step.
392-
// +optional
393-
// +kubebuilder:validation:Minimum=1
394-
// +kubebuilder:validation:Maximum=60
395-
TimeoutMinutes *int32 `json:"timeoutMinutes,omitempty"`
396-
397388
// revisionFeedback is the user's free-text feedback requesting changes
398389
// to the analysis. Patching this field bumps metadata.generation, which
399390
// the operator detects (generation > observedGeneration) and triggers

api/v1alpha1/tools_types.go

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -109,12 +109,15 @@ type SecretRequirement struct {
109109
}
110110

111111
// ToolsSpec defines the tools available to an agent in its sandbox pod.
112-
// This includes skills images, MCP servers, and required secrets.
112+
// This includes skills images, MCP servers, required secrets, and an
113+
// optional data source PVC.
113114
//
114115
// ToolsSpec is specified on a Proposal either as a shared default
115116
// (spec.tools) or per-step (spec.analysis.tools, spec.execution.tools,
116117
// spec.verification.tools). Per-step tools replace the shared default
117-
// for that step.
118+
// for that step, so a dataSource set in spec.analysis.tools is mounted
119+
// only in the analysis sandbox, while one in spec.tools is mounted in
120+
// every step that does not override tools.
118121
//
119122
// +kubebuilder:validation:MinProperties=1
120123
type ToolsSpec struct {
@@ -147,8 +150,16 @@ type ToolsSpec struct {
147150
// +kubebuilder:validation:MinItems=1
148151
// +kubebuilder:validation:MaxItems=20
149152
RequiredSecrets []SecretRequirement `json:"requiredSecrets,omitempty"`
153+
154+
// dataSource references a pre-existing PersistentVolumeClaim containing
155+
// input data for this step (e.g., must-gather bundles, diagnostic data).
156+
// The PVC must already exist in the same namespace as the Proposal and be
157+
// pre-populated with data before the Proposal is created. The operator
158+
// mounts it read-only at /data/input in the sandbox pod.
159+
// +optional
160+
DataSource *DataSource `json:"dataSource,omitempty"`
150161
}
151162

152163
func (t ToolsSpec) IsZero() bool {
153-
return len(t.Skills) == 0 && len(t.MCPServers) == 0 && len(t.RequiredSecrets) == 0
164+
return len(t.Skills) == 0 && len(t.MCPServers) == 0 && len(t.RequiredSecrets) == 0 && t.DataSource == nil
154165
}

api/v1alpha1/zz_generated.deepcopy.go

Lines changed: 10 additions & 5 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

controller/proposal/client_test.go

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66
"net/http"
77
"net/http/httptest"
88
"testing"
9+
"time"
910

1011
agenticv1alpha1 "github.com/openshift/lightspeed-agentic-operator/api/v1alpha1"
1112
)
@@ -179,3 +180,64 @@ func TestAgentHTTPClient_RunWithContext(t *testing.T) {
179180
t.Fatalf("unexpected error: %v", err)
180181
}
181182
}
183+
184+
// TestAgentHTTPClient_TimeoutMs_CustomTimeout verifies that a positive timeout
185+
// is serialized as timeout_ms in the request body sent to the agent.
186+
func TestAgentHTTPClient_TimeoutMs_CustomTimeout(t *testing.T) {
187+
const customTimeout = 10 * time.Second
188+
var gotTimeoutMs *int64
189+
190+
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
191+
var req agentRunRequest
192+
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
193+
t.Fatalf("decode request: %v", err)
194+
}
195+
gotTimeoutMs = req.TimeoutMs
196+
w.Header().Set("Content-Type", "application/json")
197+
w.Write([]byte(`{"success": true}`))
198+
}))
199+
defer server.Close()
200+
201+
c := NewAgentHTTPClient(server.URL, customTimeout)
202+
if _, err := c.Run(context.Background(), "", "ping", nil, nil); err != nil {
203+
t.Fatalf("unexpected error: %v", err)
204+
}
205+
206+
if gotTimeoutMs == nil {
207+
t.Fatal("timeout_ms was not sent in request")
208+
}
209+
wantMs := int64(customTimeout / time.Millisecond)
210+
if *gotTimeoutMs != wantMs {
211+
t.Errorf("timeout_ms = %d, want %d", *gotTimeoutMs, wantMs)
212+
}
213+
}
214+
215+
// TestAgentHTTPClient_TimeoutMs_ZeroFallsBackToDefault verifies that timeout <= 0
216+
// falls back to defaultSandboxTimeout and is reflected in timeout_ms.
217+
func TestAgentHTTPClient_TimeoutMs_ZeroFallsBackToDefault(t *testing.T) {
218+
var gotTimeoutMs *int64
219+
220+
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
221+
var req agentRunRequest
222+
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
223+
t.Fatalf("decode request: %v", err)
224+
}
225+
gotTimeoutMs = req.TimeoutMs
226+
w.Header().Set("Content-Type", "application/json")
227+
w.Write([]byte(`{"success": true}`))
228+
}))
229+
defer server.Close()
230+
231+
c := NewAgentHTTPClient(server.URL, 0) // zero should fall back to defaultSandboxTimeout
232+
if _, err := c.Run(context.Background(), "", "ping", nil, nil); err != nil {
233+
t.Fatalf("unexpected error: %v", err)
234+
}
235+
236+
if gotTimeoutMs == nil {
237+
t.Fatal("timeout_ms was not sent in request")
238+
}
239+
wantMs := int64(defaultSandboxTimeout / time.Millisecond)
240+
if *gotTimeoutMs != wantMs {
241+
t.Errorf("timeout_ms = %d, want %d (defaultSandboxTimeout)", *gotTimeoutMs, wantMs)
242+
}
243+
}

controller/proposal/handlers.go

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ func (r *ProposalReconciler) handleAnalysis(
5858
return ctrl.Result{}, fmt.Errorf("update to Analyzing: %w", err)
5959
}
6060

61-
timeout := proposalTimeout(proposal)
61+
timeout := stepTimeout(resolved.Analysis)
6262
analysisResult, err := r.Agent.Analyze(ctx, proposal, resolved.Analysis, proposal.Spec.Request, timeout)
6363
if err != nil {
6464
return r.failStep(ctx, log, proposal, agenticv1alpha1.ProposalConditionAnalyzed, err)
@@ -125,7 +125,7 @@ func (r *ProposalReconciler) handleRevision(
125125
revisionSuffix := buildRevisionContext(proposal)
126126
requestWithRevision := proposal.Spec.Request + "\n\n" + revisionSuffix
127127

128-
timeout := proposalTimeout(proposal)
128+
timeout := stepTimeout(resolved.Analysis)
129129
analysisResult, err := r.Agent.Analyze(ctx, proposal, resolved.Analysis, requestWithRevision, timeout)
130130
if err != nil {
131131
return r.failStep(ctx, log, proposal, agenticv1alpha1.ProposalConditionAnalyzed, err)
@@ -242,7 +242,7 @@ func (r *ProposalReconciler) handleExecution(
242242
return ctrl.Result{}, fmt.Errorf("update to Executing: %w", err)
243243
}
244244

245-
timeout := proposalTimeout(proposal)
245+
timeout := stepTimeout(*resolved.Execution)
246246
execResult, err := r.Agent.Execute(ctx, proposal, *resolved.Execution, selectedOption, timeout)
247247
if err != nil {
248248
return r.failStep(ctx, log, proposal, agenticv1alpha1.ProposalConditionExecuted, err)
@@ -346,7 +346,7 @@ func (r *ProposalReconciler) handleVerification(
346346
}
347347
}
348348

349-
timeout := proposalTimeout(proposal)
349+
timeout := stepTimeout(*resolved.Verification)
350350
verifyResult, err := r.Agent.Verify(ctx, proposal, *resolved.Verification, selectedOption, execOutput, timeout)
351351
if err != nil {
352352
return r.failStep(ctx, log, proposal, agenticv1alpha1.ProposalConditionVerified, err)
@@ -507,7 +507,7 @@ func (r *ProposalReconciler) handleEscalation(
507507
}
508508

509509
escalationText := buildEscalationRequest(proposal)
510-
timeout := proposalTimeout(proposal)
510+
timeout := stepTimeout(step)
511511
escalationResult, err := r.Agent.Escalate(ctx, proposal, step, escalationText, timeout)
512512
if err != nil {
513513
return r.failStep(ctx, log, proposal, agenticv1alpha1.ProposalConditionEscalated, err)

controller/proposal/reconciler_test.go

Lines changed: 50 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,11 @@ type testAgentCaller struct {
3232
executeResult *ExecutionOutput
3333
verifyResult *VerificationOutput
3434
escalateResult *EscalationOutput
35+
36+
lastAnalyzeTimeout time.Duration
37+
lastExecuteTimeout time.Duration
38+
lastVerifyTimeout time.Duration
39+
lastEscalateTimeout time.Duration
3540
}
3641

3742
func newTestAgentCaller() *testAgentCaller {
@@ -43,25 +48,29 @@ func newTestAgentCaller() *testAgentCaller {
4348
return &testAgentCaller{analyzeResult: a, executeResult: e, verifyResult: v, escalateResult: esc}
4449
}
4550

46-
func (ta *testAgentCaller) Analyze(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ string, _ time.Duration) (*AnalysisOutput, error) {
51+
func (ta *testAgentCaller) Analyze(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ string, timeout time.Duration) (*AnalysisOutput, error) {
52+
ta.lastAnalyzeTimeout = timeout
4753
if ta.analyzeErr != nil {
4854
return nil, ta.analyzeErr
4955
}
5056
return ta.analyzeResult, nil
5157
}
52-
func (ta *testAgentCaller) Execute(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ *agenticv1alpha1.RemediationOption, _ time.Duration) (*ExecutionOutput, error) {
58+
func (ta *testAgentCaller) Execute(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ *agenticv1alpha1.RemediationOption, timeout time.Duration) (*ExecutionOutput, error) {
59+
ta.lastExecuteTimeout = timeout
5360
if ta.executeErr != nil {
5461
return nil, ta.executeErr
5562
}
5663
return ta.executeResult, nil
5764
}
58-
func (ta *testAgentCaller) Verify(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ *agenticv1alpha1.RemediationOption, _ *ExecutionOutput, _ time.Duration) (*VerificationOutput, error) {
65+
func (ta *testAgentCaller) Verify(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ *agenticv1alpha1.RemediationOption, _ *ExecutionOutput, timeout time.Duration) (*VerificationOutput, error) {
66+
ta.lastVerifyTimeout = timeout
5967
if ta.verifyErr != nil {
6068
return nil, ta.verifyErr
6169
}
6270
return ta.verifyResult, nil
6371
}
64-
func (ta *testAgentCaller) Escalate(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ string, _ time.Duration) (*EscalationOutput, error) {
72+
func (ta *testAgentCaller) Escalate(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ string, timeout time.Duration) (*EscalationOutput, error) {
73+
ta.lastEscalateTimeout = timeout
6574
if ta.escalateErr != nil {
6675
return nil, ta.escalateErr
6776
}
@@ -319,3 +328,40 @@ func TestReconcile_Denied_Terminal(t *testing.T) {
319328
t.Fatalf("expected Denied, got %s", agenticv1alpha1.DerivePhase(p.Status.Conditions))
320329
}
321330
}
331+
332+
// TestReconcile_PropagatesStepTimeout verifies that timeoutMinutes set on a
333+
// ProposalStep is resolved and forwarded to the AgentCaller as time.Duration.
334+
func TestReconcile_PropagatesStepTimeout(t *testing.T) {
335+
const wantMinutes int32 = 30
336+
337+
scheme := testScheme()
338+
proposal := &agenticv1alpha1.Proposal{
339+
ObjectMeta: metav1.ObjectMeta{Name: "timeout-check", Namespace: "default"},
340+
Spec: agenticv1alpha1.ProposalSpec{
341+
Request: "Pod crashing",
342+
Tools: testTools(),
343+
Analysis: agenticv1alpha1.ProposalStep{
344+
Agent: "default",
345+
TimeoutMinutes: &[]int32{wantMinutes}[0],
346+
},
347+
Execution: agenticv1alpha1.ProposalStep{Agent: "default"},
348+
Verification: agenticv1alpha1.ProposalStep{Agent: "default"},
349+
},
350+
}
351+
352+
objs := append([]client.Object{proposal}, defaultObjects()...)
353+
fc := fake.NewClientBuilder().WithScheme(scheme).WithObjects(objs...).
354+
WithStatusSubresource(proposal, &agenticv1alpha1.AnalysisResult{}, &agenticv1alpha1.ExecutionResult{}, &agenticv1alpha1.VerificationResult{}, &agenticv1alpha1.EscalationResult{}).Build()
355+
356+
caller := newTestAgentCaller()
357+
r := &ProposalReconciler{Client: fc, Log: logr.Discard(), Agent: caller, Namespace: "default"}
358+
359+
if _, err := reconcileOnce(r, "timeout-check"); err != nil {
360+
t.Fatalf("reconcile: %v", err)
361+
}
362+
363+
want := time.Duration(wantMinutes) * time.Minute
364+
if caller.lastAnalyzeTimeout != want {
365+
t.Errorf("Analyze timeout = %v, want %v", caller.lastAnalyzeTimeout, want)
366+
}
367+
}

0 commit comments

Comments
 (0)