Skip to content

Commit 0468de8

Browse files
committed
feat: add per-proposal timeoutMinutes and configurable sandbox timeout
1 parent 28c4b03 commit 0468de8

11 files changed

Lines changed: 158 additions & 79 deletions

File tree

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
# Binaries and local tool installs (Makefile uses ./bin for controller-gen, kustomize, etc.)
22
/bin/
3+
/oc-agentic
34

45
# IDE / OS
56
.idea/

api/v1alpha1/proposal_types.go

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -373,6 +373,16 @@ type ProposalSpec struct {
373373
// +kubebuilder:validation:MinLength=1
374374
// +kubebuilder:validation:MaxLength=32768
375375
RevisionFeedback string `json:"revisionFeedback,omitempty"`
376+
377+
// timeoutMinutes overrides the default sandbox operation timeout for
378+
// this proposal. When set, all sandbox wait and HTTP client timeouts
379+
// use this value instead of the operator default (5 minutes).
380+
//
381+
// Immutable: timeout policy is fixed at creation.
382+
// +optional
383+
// +kubebuilder:validation:Minimum=1
384+
// +kubebuilder:validation:Maximum=120
385+
TimeoutMinutes *int32 `json:"timeoutMinutes,omitzero"`
376386
}
377387

378388
// ProposalStatus defines the observed state of Proposal. All fields are

config/crd/bases/agentic.openshift.io_proposals.yaml

Lines changed: 38 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -481,6 +481,28 @@ spec:
481481
x-kubernetes-validations:
482482
- message: schema is required when mode is Minimal
483483
rule: self.mode != 'Minimal' || has(self.schema)
484+
dataSource:
485+
description: |-
486+
dataSource references a PVC containing pre-populated input data
487+
(e.g., must-gather bundles, diagnostic data). The operator mounts
488+
it read-only at /data/input in the sandbox pod. Skills discover
489+
input data at this standard location.
490+
491+
Immutable: input data source is fixed at creation.
492+
properties:
493+
claimName:
494+
description: |-
495+
claimName is the name of the PersistentVolumeClaim to mount.
496+
The PVC must exist in the same namespace as the Proposal.
497+
maxLength: 253
498+
minLength: 1
499+
type: string
500+
x-kubernetes-validations:
501+
- message: must be a valid DNS subdomain
502+
rule: '!format.dns1123Subdomain().validate(self).hasValue()'
503+
required:
504+
- claimName
505+
type: object
484506
execution:
485507
description: |-
486508
execution defines per-step configuration for the execution step.
@@ -913,6 +935,19 @@ spec:
913935
x-kubernetes-validations:
914936
- message: each namespace must be a valid DNS label
915937
rule: self.all(ns, !format.dns1123Label().validate(ns).hasValue())
938+
timeoutMinutes:
939+
description: |-
940+
timeoutMinutes sets the per-step timeout for sandbox agent calls.
941+
This controls how long the operator waits for the sandbox pod to
942+
become ready and for the agent to complete its work. Increase this
943+
for long-running tools (e.g., IntelliAide RCA takes 10-30 minutes).
944+
Defaults to 5 minutes when omitted.
945+
946+
Mutable: can be adjusted before approving a step.
947+
format: int32
948+
maximum: 60
949+
minimum: 1
950+
type: integer
916951
tools:
917952
description: |-
918953
tools defines the default tools for all steps: skills images,
@@ -1684,6 +1719,9 @@ spec:
16841719
- message: verification is immutable once set
16851720
rule: '!has(oldSelf.verification) || (has(self.verification) && self.verification
16861721
== oldSelf.verification)'
1722+
- message: dataSource is immutable once set
1723+
rule: '!has(oldSelf.dataSource) || (has(self.dataSource) && self.dataSource
1724+
== oldSelf.dataSource)'
16871725
status:
16881726
description: status defines the observed state of Proposal.
16891727
minProperties: 1

config/rbac/role.yaml

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -33,6 +33,14 @@ rules:
3333
- create
3434
- delete
3535
- get
36+
- apiGroups:
37+
- ""
38+
resources:
39+
- persistentvolumeclaims
40+
verbs:
41+
- get
42+
- list
43+
- watch
3644
- apiGroups:
3745
- agentic.openshift.io
3846
resources:

controller/proposal/agent.go

Lines changed: 9 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@ package proposal
22

33
import (
44
"context"
5+
"time"
56

67
agenticv1alpha1 "github.com/openshift/lightspeed-agentic-operator/api/v1alpha1"
78
)
@@ -42,18 +43,18 @@ type EscalationOutput struct {
4243
// HTTP implementations POST to /v1/agent/run — a step-agnostic
4344
// endpoint where all workflow context is in the request payload.
4445
type AgentCaller interface {
45-
Analyze(ctx context.Context, proposal *agenticv1alpha1.Proposal, step resolvedStep, requestText string, serviceAccount string) (*AnalysisOutput, error)
46-
Execute(ctx context.Context, proposal *agenticv1alpha1.Proposal, step resolvedStep, option *agenticv1alpha1.RemediationOption, serviceAccount string) (*ExecutionOutput, error)
47-
Verify(ctx context.Context, proposal *agenticv1alpha1.Proposal, step resolvedStep, option *agenticv1alpha1.RemediationOption, exec *ExecutionOutput, serviceAccount string) (*VerificationOutput, error)
48-
Escalate(ctx context.Context, proposal *agenticv1alpha1.Proposal, step resolvedStep, requestText string, serviceAccount string) (*EscalationOutput, error)
46+
Analyze(ctx context.Context, proposal *agenticv1alpha1.Proposal, step resolvedStep, requestText string, serviceAccount string, timeout time.Duration) (*AnalysisOutput, error)
47+
Execute(ctx context.Context, proposal *agenticv1alpha1.Proposal, step resolvedStep, option *agenticv1alpha1.RemediationOption, serviceAccount string, timeout time.Duration) (*ExecutionOutput, error)
48+
Verify(ctx context.Context, proposal *agenticv1alpha1.Proposal, step resolvedStep, option *agenticv1alpha1.RemediationOption, exec *ExecutionOutput, serviceAccount string, timeout time.Duration) (*VerificationOutput, error)
49+
Escalate(ctx context.Context, proposal *agenticv1alpha1.Proposal, step resolvedStep, requestText string, serviceAccount string, timeout time.Duration) (*EscalationOutput, error)
4950
ReleaseSandboxes(ctx context.Context, proposal *agenticv1alpha1.Proposal) error
5051
}
5152

5253
// StubAgentCaller returns canned success results. Wire in a real
5354
// implementation (sandbox + HTTP) when the agent infrastructure is ready.
5455
type StubAgentCaller struct{}
5556

56-
func (s *StubAgentCaller) Analyze(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ string, _ string) (*AnalysisOutput, error) {
57+
func (s *StubAgentCaller) Analyze(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ string, _ string, _ time.Duration) (*AnalysisOutput, error) {
5758
return &AnalysisOutput{
5859
Success: true,
5960
Options: []agenticv1alpha1.RemediationOption{{
@@ -73,7 +74,7 @@ func (s *StubAgentCaller) Analyze(_ context.Context, _ *agenticv1alpha1.Proposal
7374
}, nil
7475
}
7576

76-
func (s *StubAgentCaller) Execute(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ *agenticv1alpha1.RemediationOption, _ string) (*ExecutionOutput, error) {
77+
func (s *StubAgentCaller) Execute(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ *agenticv1alpha1.RemediationOption, _ string, _ time.Duration) (*ExecutionOutput, error) {
7778
return &ExecutionOutput{
7879
Success: true,
7980
ActionsTaken: []agenticv1alpha1.ExecutionAction{{
@@ -88,7 +89,7 @@ func (s *StubAgentCaller) Execute(_ context.Context, _ *agenticv1alpha1.Proposal
8889
}, nil
8990
}
9091

91-
func (s *StubAgentCaller) Escalate(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ string, _ string) (*EscalationOutput, error) {
92+
func (s *StubAgentCaller) Escalate(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ string, _ string, _ time.Duration) (*EscalationOutput, error) {
9293
return &EscalationOutput{
9394
Success: true,
9495
Summary: "Stub escalation summary",
@@ -100,7 +101,7 @@ func (s *StubAgentCaller) ReleaseSandboxes(_ context.Context, _ *agenticv1alpha1
100101
return nil
101102
}
102103

103-
func (s *StubAgentCaller) Verify(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ *agenticv1alpha1.RemediationOption, _ *ExecutionOutput, _ string) (*VerificationOutput, error) {
104+
func (s *StubAgentCaller) Verify(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ *agenticv1alpha1.RemediationOption, _ *ExecutionOutput, _ string, _ time.Duration) (*VerificationOutput, error) {
104105
return &VerificationOutput{
105106
Success: true,
106107
Checks: []agenticv1alpha1.VerifyCheck{{

controller/proposal/client.go

Lines changed: 7 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -79,10 +79,13 @@ type AgentHTTPClient struct {
7979
endpoint string
8080
}
8181

82-
func NewAgentHTTPClient(endpoint string) AgentHTTPClientInterface {
82+
func NewAgentHTTPClient(endpoint string, timeout time.Duration) AgentHTTPClientInterface {
83+
if timeout <= 0 {
84+
timeout = defaultSandboxTimeout
85+
}
8386
return &AgentHTTPClient{
8487
httpClient: &http.Client{
85-
Timeout: 5 * time.Minute,
88+
Timeout: timeout,
8689
Transport: &http.Transport{
8790
TLSClientConfig: &tls.Config{InsecureSkipVerify: true}, //nolint:gosec // internal cluster traffic
8891
},
@@ -92,11 +95,13 @@ func NewAgentHTTPClient(endpoint string) AgentHTTPClientInterface {
9295
}
9396

9497
func (c *AgentHTTPClient) Run(ctx context.Context, systemPrompt, query string, outputSchema json.RawMessage, agentCtx *agentContext) (*agentRunResponse, error) {
98+
timeoutMs := int64(c.httpClient.Timeout / time.Millisecond)
9599
req := agentRunRequest{
96100
Query: query,
97101
SystemPrompt: systemPrompt,
98102
OutputSchema: outputSchema,
99103
Context: agentCtx,
104+
TimeoutMs: &timeoutMs,
100105
}
101106

102107
body, err := json.Marshal(req)

controller/proposal/client_test.go

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -35,7 +35,7 @@ func TestAgentHTTPClient_RunSuccess(t *testing.T) {
3535
}))
3636
defer server.Close()
3737

38-
client := NewAgentHTTPClient(server.URL)
38+
client := NewAgentHTTPClient(server.URL, 0)
3939
resp, err := client.Run(context.Background(), "You are an SRE agent", "check health", nil, nil)
4040
if err != nil {
4141
t.Fatalf("unexpected error: %v", err)
@@ -52,15 +52,15 @@ func TestAgentHTTPClient_RunHTTPError(t *testing.T) {
5252
}))
5353
defer server.Close()
5454

55-
client := NewAgentHTTPClient(server.URL)
55+
client := NewAgentHTTPClient(server.URL, 0)
5656
_, err := client.Run(context.Background(), "", "test", nil, nil)
5757
if err == nil {
5858
t.Fatal("expected error for HTTP 500")
5959
}
6060
}
6161

6262
func TestAgentHTTPClient_RunConnectionError(t *testing.T) {
63-
client := NewAgentHTTPClient("http://127.0.0.1:1")
63+
client := NewAgentHTTPClient("http://127.0.0.1:1", 0)
6464
_, err := client.Run(context.Background(), "", "test", nil, nil)
6565
if err == nil {
6666
t.Fatal("expected error for connection failure")
@@ -100,7 +100,7 @@ func TestAgentHTTPClient_RunWithExecutionResult(t *testing.T) {
100100
}))
101101
defer server.Close()
102102

103-
client := NewAgentHTTPClient(server.URL)
103+
client := NewAgentHTTPClient(server.URL, 0)
104104
agentCtx := &agentContext{
105105
TargetNamespaces: []string{"production"},
106106
ExecutionResult: &agentExecutionResult{
@@ -135,7 +135,7 @@ func TestAgentHTTPClient_RunWithoutExecutionResult(t *testing.T) {
135135
}))
136136
defer server.Close()
137137

138-
client := NewAgentHTTPClient(server.URL)
138+
client := NewAgentHTTPClient(server.URL, 0)
139139
agentCtx := &agentContext{
140140
TargetNamespaces: []string{"production"},
141141
}
@@ -169,7 +169,7 @@ func TestAgentHTTPClient_RunWithContext(t *testing.T) {
169169
}))
170170
defer server.Close()
171171

172-
client := NewAgentHTTPClient(server.URL)
172+
client := NewAgentHTTPClient(server.URL, 0)
173173
agentCtx := &agentContext{
174174
TargetNamespaces: []string{"production"},
175175
PreviousAttempts: []agentPreviousAttempt{{Attempt: 1, FailureReason: "timeout"}},

controller/proposal/handlers.go

Lines changed: 10 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -85,7 +85,8 @@ func (r *ProposalReconciler) handleAnalysis(
8585
return ctrl.Result{}, fmt.Errorf("%s: %w", ErrUpdateToAnalyzing, err)
8686
}
8787

88-
analysisResult, err := r.Agent.Analyze(ctx, proposal, resolved.Analysis, proposal.Spec.Request, defaultSandboxSA)
88+
timeout := proposalTimeout(proposal)
89+
analysisResult, err := r.Agent.Analyze(ctx, proposal, resolved.Analysis, proposal.Spec.Request, defaultSandboxSA, timeout)
8990
if err != nil {
9091
return r.failStep(ctx, proposal, agenticv1alpha1.ProposalConditionAnalyzed, err)
9192
}
@@ -151,7 +152,8 @@ func (r *ProposalReconciler) handleRevision(
151152
revisionSuffix := buildRevisionContext(proposal)
152153
requestWithRevision := proposal.Spec.Request + "\n\n" + revisionSuffix
153154

154-
analysisResult, err := r.Agent.Analyze(ctx, proposal, resolved.Analysis, requestWithRevision, defaultSandboxSA)
155+
timeout := proposalTimeout(proposal)
156+
analysisResult, err := r.Agent.Analyze(ctx, proposal, resolved.Analysis, requestWithRevision, defaultSandboxSA, timeout)
155157
if err != nil {
156158
return r.failStep(ctx, proposal, agenticv1alpha1.ProposalConditionAnalyzed, err)
157159
}
@@ -270,7 +272,8 @@ func (r *ProposalReconciler) handleExecution(
270272
return ctrl.Result{}, fmt.Errorf("%s: %w", ErrUpdateToExecuting, err)
271273
}
272274

273-
execResult, err := r.Agent.Execute(ctx, proposal, *resolved.Execution, selectedOption, execSA)
275+
timeout := proposalTimeout(proposal)
276+
execResult, err := r.Agent.Execute(ctx, proposal, *resolved.Execution, selectedOption, execSA, timeout)
274277
if err != nil {
275278
return r.failStep(ctx, proposal, agenticv1alpha1.ProposalConditionExecuted, err)
276279
}
@@ -387,7 +390,8 @@ func (r *ProposalReconciler) handleVerification(
387390
}
388391
}
389392

390-
verifyResult, err := r.Agent.Verify(ctx, proposal, *resolved.Verification, selectedOption, execOutput, defaultSandboxSA)
393+
timeout := proposalTimeout(proposal)
394+
verifyResult, err := r.Agent.Verify(ctx, proposal, *resolved.Verification, selectedOption, execOutput, defaultSandboxSA, timeout)
391395
if err != nil {
392396
return r.failStep(ctx, proposal, agenticv1alpha1.ProposalConditionVerified, err)
393397
}
@@ -586,7 +590,8 @@ func (r *ProposalReconciler) handleEscalation(
586590
}
587591

588592
escalationText := buildEscalationRequest(proposal)
589-
escalationResult, err := r.Agent.Escalate(ctx, proposal, step, escalationText, defaultSandboxSA)
593+
timeout := proposalTimeout(proposal)
594+
escalationResult, err := r.Agent.Escalate(ctx, proposal, step, escalationText, defaultSandboxSA, timeout)
590595
if err != nil {
591596
return r.failStep(ctx, proposal, agenticv1alpha1.ProposalConditionEscalated, err)
592597
}

controller/proposal/reconciler_test.go

Lines changed: 9 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -35,32 +35,32 @@ type testAgentCaller struct {
3535

3636
func newTestAgentCaller() *testAgentCaller {
3737
stub := &StubAgentCaller{}
38-
a, _ := stub.Analyze(context.Background(), nil, resolvedStep{}, "", "")
39-
e, _ := stub.Execute(context.Background(), nil, resolvedStep{}, nil, "")
40-
v, _ := stub.Verify(context.Background(), nil, resolvedStep{}, nil, nil, "")
41-
esc, _ := stub.Escalate(context.Background(), nil, resolvedStep{}, "", "")
38+
a, _ := stub.Analyze(context.Background(), nil, resolvedStep{}, "", "", 0)
39+
e, _ := stub.Execute(context.Background(), nil, resolvedStep{}, nil, "", 0)
40+
v, _ := stub.Verify(context.Background(), nil, resolvedStep{}, nil, nil, "", 0)
41+
esc, _ := stub.Escalate(context.Background(), nil, resolvedStep{}, "", "", 0)
4242
return &testAgentCaller{analyzeResult: a, executeResult: e, verifyResult: v, escalateResult: esc}
4343
}
4444

45-
func (ta *testAgentCaller) Analyze(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ string, _ string) (*AnalysisOutput, error) {
45+
func (ta *testAgentCaller) Analyze(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ string, _ string, _ time.Duration) (*AnalysisOutput, error) {
4646
if ta.analyzeErr != nil {
4747
return nil, ta.analyzeErr
4848
}
4949
return ta.analyzeResult, nil
5050
}
51-
func (ta *testAgentCaller) Execute(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ *agenticv1alpha1.RemediationOption, _ string) (*ExecutionOutput, error) {
51+
func (ta *testAgentCaller) Execute(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ *agenticv1alpha1.RemediationOption, _ string, _ time.Duration) (*ExecutionOutput, error) {
5252
if ta.executeErr != nil {
5353
return nil, ta.executeErr
5454
}
5555
return ta.executeResult, nil
5656
}
57-
func (ta *testAgentCaller) Verify(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ *agenticv1alpha1.RemediationOption, _ *ExecutionOutput, _ string) (*VerificationOutput, error) {
57+
func (ta *testAgentCaller) Verify(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ *agenticv1alpha1.RemediationOption, _ *ExecutionOutput, _ string, _ time.Duration) (*VerificationOutput, error) {
5858
if ta.verifyErr != nil {
5959
return nil, ta.verifyErr
6060
}
6161
return ta.verifyResult, nil
6262
}
63-
func (ta *testAgentCaller) Escalate(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ string, _ string) (*EscalationOutput, error) {
63+
func (ta *testAgentCaller) Escalate(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ string, _ string, _ time.Duration) (*EscalationOutput, error) {
6464
if ta.escalateErr != nil {
6565
return nil, ta.escalateErr
6666
}
@@ -247,7 +247,7 @@ func newMockSandboxAgent(analysisJSON, executionJSON, verificationJSON string) (
247247
caller := &SandboxAgentCaller{
248248
Sandbox: sandbox,
249249
K8sClient: fc,
250-
ClientFactory: func(_ string) AgentHTTPClientInterface {
250+
ClientFactory: func(_ string, _ time.Duration) AgentHTTPClientInterface {
251251
resp := responses[callCount%len(responses)]
252252
callCount++
253253
httpClient.response = &agentRunResponse{Response: json.RawMessage(resp)}

0 commit comments

Comments
 (0)