Skip to content

Commit a324194

Browse files
harche and claude committed
pkg/readiness: Add readiness checks and wire into proposal controller
Add pkg/readiness package with 9 cluster readiness checks that gather pre-upgrade health data: cluster conditions, operator health, API deprecations, node capacity, PDB drain blockers, etcd health, network config, CRD compatibility, and OLM operator lifecycle. Wire readiness.RunAll() into the proposal controller, replacing the hardcoded readinessJSON placeholder with real per-target readiness data that gets embedded in each proposal's request body. Plumb dynamic.Interface from pkg/start through cvo.New() to the proposal controller to support the readiness checks' cluster queries. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 810bfc1 commit a324194

27 files changed

Lines changed: 4046 additions & 161 deletions

.openshift-tests-extension/openshift_payload_cluster-version-operator.json

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,96 @@
110110
"lifecycle": "informing",
111111
"environmentSelector": {}
112112
},
113+
{
114+
"name": "[Jira:\"Cluster Version Operator\"] cluster-version-operator readiness checks should run all checks without errors",
115+
"labels": {},
116+
"resources": {
117+
"isolation": {}
118+
},
119+
"source": "openshift:payload:cluster-version-operator",
120+
"lifecycle": "blocking",
121+
"environmentSelector": {}
122+
},
123+
{
124+
"name": "[Jira:\"Cluster Version Operator\"] cluster-version-operator readiness checks should produce valid JSON that round-trips",
125+
"labels": {},
126+
"resources": {
127+
"isolation": {}
128+
},
129+
"source": "openshift:payload:cluster-version-operator",
130+
"lifecycle": "blocking",
131+
"environmentSelector": {}
132+
},
133+
{
134+
"name": "[Jira:\"Cluster Version Operator\"] cluster-version-operator readiness checks should report node count matching the actual cluster",
135+
"labels": {},
136+
"resources": {
137+
"isolation": {}
138+
},
139+
"source": "openshift:payload:cluster-version-operator",
140+
"lifecycle": "blocking",
141+
"environmentSelector": {}
142+
},
143+
{
144+
"name": "[Jira:\"Cluster Version Operator\"] cluster-version-operator readiness checks should report operator count matching actual ClusterOperators",
145+
"labels": {},
146+
"resources": {
147+
"isolation": {}
148+
},
149+
"source": "openshift:payload:cluster-version-operator",
150+
"lifecycle": "blocking",
151+
"environmentSelector": {}
152+
},
153+
{
154+
"name": "[Jira:\"Cluster Version Operator\"] cluster-version-operator readiness checks should report etcd member count matching actual etcd pods",
155+
"labels": {},
156+
"resources": {
157+
"isolation": {}
158+
},
159+
"source": "openshift:payload:cluster-version-operator",
160+
"lifecycle": "blocking",
161+
"environmentSelector": {}
162+
},
163+
{
164+
"name": "[Jira:\"Cluster Version Operator\"] cluster-version-operator readiness checks should report network type matching actual Network config",
165+
"labels": {},
166+
"resources": {
167+
"isolation": {}
168+
},
169+
"source": "openshift:payload:cluster-version-operator",
170+
"lifecycle": "blocking",
171+
"environmentSelector": {}
172+
},
173+
{
174+
"name": "[Jira:\"Cluster Version Operator\"] cluster-version-operator readiness checks should report PDB count matching actual PodDisruptionBudgets",
175+
"labels": {},
176+
"resources": {
177+
"isolation": {}
178+
},
179+
"source": "openshift:payload:cluster-version-operator",
180+
"lifecycle": "blocking",
181+
"environmentSelector": {}
182+
},
183+
{
184+
"name": "[Jira:\"Cluster Version Operator\"] cluster-version-operator readiness checks should report cluster conditions matching ClusterVersion status",
185+
"labels": {},
186+
"resources": {
187+
"isolation": {}
188+
},
189+
"source": "openshift:payload:cluster-version-operator",
190+
"lifecycle": "blocking",
191+
"environmentSelector": {}
192+
},
193+
{
194+
"name": "[Jira:\"Cluster Version Operator\"] cluster-version-operator readiness checks should complete all checks within 60 seconds",
195+
"labels": {},
196+
"resources": {
197+
"isolation": {}
198+
},
199+
"source": "openshift:payload:cluster-version-operator",
200+
"lifecycle": "blocking",
201+
"environmentSelector": {}
202+
},
113203
{
114204
"name": "[Jira:\"Cluster Version Operator\"] cluster-version-operator must get the APIServer when the TLS profile manager is created",
115205
"labels": {

AGENTS.md

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -188,4 +188,46 @@ Subsystems include: `pkg/cvo`, `pkg/payload`, `lib/resourceapply`, `hack`, etc.
188188

189189
### Development and Testing
190190
- Never test against production clusters - always use disposable test environments
191-
- CVO has significant control over cluster state and can disrupt operations during development
191+
- CVO has significant control over cluster state and can disrupt operations during development
192+
193+
### Deploying CVO with Lightspeed Proposals (Dev)
194+
195+
The proposal controller is gated behind `TechPreviewNoUpgrade`. To test on a Default feature set cluster:
196+
197+
1. **Bypass the feature gate** (local only, do not commit):
198+
```go
199+
// In pkg/cvo/cvo.go, shouldEnableProposalController()
200+
return true
201+
```
202+
203+
2. **Build and push the skills image** to the cluster's internal registry:
204+
```bash
205+
oc new-project cvo-dev
206+
# Build from the agentic-skills repo
207+
podman build -f Dockerfile -t <registry-route>/cvo-dev/agentic-skills:latest .
208+
podman push --tls-verify=false <registry-route>/cvo-dev/agentic-skills:latest
209+
```
210+
211+
3. **Set `LIGHTSPEED_SKILLS_IMAGE`** in `install/0000_00_cluster-version-operator_30_deployment.yaml` to the internal registry image (local only, do not commit):
212+
```yaml
213+
- name: LIGHTSPEED_SKILLS_IMAGE
214+
value: "image-registry.openshift-image-registry.svc:5000/cvo-dev/agentic-skills@sha256:..."
215+
```
216+
217+
4. **Build and deploy** following the standard dev workflow in `docs/dev/README.md`:
218+
```bash
219+
CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -o _output/linux/amd64/cluster-version-operator ./cmd/cluster-version-operator/
220+
# Build image on cluster, then: oc adm release new ...
221+
```
222+
223+
5. **Grant image pull access** so that service accounts in the `openshift-lightspeed` (sandbox) namespace can pull images from `cvo-dev`:
224+
```bash
225+
oc policy add-role-to-group system:image-puller system:serviceaccounts:openshift-lightspeed -n cvo-dev
226+
```
227+
228+
6. **Apply the prompt ConfigMap**:
229+
```bash
230+
oc apply -f install/0000_00_cluster-version-operator_50_lightspeed-prompts.yaml
231+
```
232+
233+
7. **Revert local changes** before committing: undo the local-only edits from steps 1 and 3

install/0000_00_cluster-version-operator_30_deployment.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,8 @@ spec:
7777
fieldPath: spec.nodeName
7878
- name: CLUSTER_PROFILE
7979
value: '{{ .ClusterProfile }}'
80+
- name: LIGHTSPEED_SKILLS_IMAGE
81+
value: "quay.io/openshift/ci:ocp_5.0_agentic-skills"
8082
# this pod is hostNetwork and uses the internal LB DNS name when possible, which the kubelet also uses.
8183
# this dnsPolicy allows us to use the same dnsConfig as the kubelet, without access to read it ourselves.
8284
dnsPolicy: Default

pkg/cvo/availableupdates_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,7 @@ func newOperator(url string, cluster release, promqlMock clusterconditions.Condi
209209
func() ([]configv1.Release, []configv1.ConditionalUpdate, error) {
210210
return nil, nil, nil
211211
},
212-
fake.NewClientBuilder().Build(), func(_ string) (*configv1.ClusterVersion, error) {
212+
fake.NewClientBuilder().Build(), nil, func(_ string) (*configv1.ClusterVersion, error) {
213213
return &configv1.ClusterVersion{}, nil
214214
},
215215
func(_ context.Context, namespace, name string, _ metav1.GetOptions) (*corev1.ConfigMap, error) {

pkg/cvo/cvo.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ import (
1818
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
1919
"k8s.io/apimachinery/pkg/util/sets"
2020
"k8s.io/apimachinery/pkg/util/wait"
21+
"k8s.io/client-go/dynamic"
2122
informerscorev1 "k8s.io/client-go/informers/core/v1"
2223
"k8s.io/client-go/kubernetes"
2324
"k8s.io/client-go/kubernetes/scheme"
@@ -111,6 +112,7 @@ type Operator struct {
111112

112113
client clientset.Interface
113114
kubeClient kubernetes.Interface
115+
dynamicClient dynamic.Interface
114116
operatorClient operatorclientset.Interface
115117
eventRecorder record.EventRecorder
116118

@@ -244,6 +246,7 @@ func New(
244246
overrides *cvotls.Settings,
245247
client clientset.Interface,
246248
kubeClient kubernetes.Interface,
249+
dynamicClient dynamic.Interface,
247250
operatorClient operatorclientset.Interface,
248251
exclude string,
249252
clusterProfile string,
@@ -276,6 +279,7 @@ func New(
276279

277280
client: client,
278281
kubeClient: kubeClient,
282+
dynamicClient: dynamicClient,
279283
operatorClient: operatorClient,
280284
eventRecorder: eventBroadcaster.NewRecorder(scheme.Scheme, corev1.EventSource{Component: namespace}),
281285
queue: workqueue.NewTypedRateLimitingQueueWithConfig(workqueue.DefaultTypedControllerRateLimiter[any](), workqueue.TypedRateLimitingQueueConfig[any]{Name: "clusterversion"}),
@@ -370,6 +374,7 @@ func New(
370374
return availableUpdates.Updates, availableUpdates.ConditionalUpdates, nil
371375
},
372376
rtClient,
377+
dynamicClient,
373378
cvInformer.Lister().Get,
374379
func(ctx context.Context, namespace, name string, opts metav1.GetOptions) (*corev1.ConfigMap, error) {
375380
return kubeClient.CoreV1().ConfigMaps(namespace).Get(ctx, name, opts)

pkg/cvo/cvo_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2756,7 +2756,7 @@ func TestOperator_availableUpdatesSync(t *testing.T) {
27562756
ctx := context.Background()
27572757
optr.proposalController = proposal.NewController(func() ([]configv1.Release, []configv1.ConditionalUpdate, error) {
27582758
return nil, nil, nil
2759-
}, ctrlruntimefake.NewClientBuilder().Build(), func(_ string) (*configv1.ClusterVersion, error) {
2759+
}, ctrlruntimefake.NewClientBuilder().Build(), nil, func(_ string) (*configv1.ClusterVersion, error) {
27602760
return &configv1.ClusterVersion{}, nil
27612761
}, func(_ context.Context, namespace, name string, _ metav1.GetOptions) (*corev1.ConfigMap, error) {
27622762
return &corev1.ConfigMap{}, nil

pkg/payload/testdata/TestRenderManifest_expected_cvo_deployment.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,8 @@ spec:
7777
fieldPath: spec.nodeName
7878
- name: CLUSTER_PROFILE
7979
value: 'some-profile'
80+
- name: LIGHTSPEED_SKILLS_IMAGE
81+
value: "quay.io/openshift/ci:ocp_5.0_agentic-skills"
8082
# this pod is hostNetwork and uses the internal LB DNS name when possible, which the kubelet also uses.
8183
# this dnsPolicy allows us to use the same dnsConfig as the kubelet, without access to read it ourselves.
8284
dnsPolicy: Default

pkg/proposal/analysis_schema.json

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
{
2+
"type": "object",
3+
"required": ["analysisData"],
4+
"properties": {
5+
"analysisData": {
6+
"type": "array",
7+
"description": "Typed components describing upgrade readiness. Must include exactly one ota_readiness_summary. Include one ota_finding per blocker or warning. Include one ota_olm_operator_status if OLM operators are present.",
8+
"minItems": 1,
9+
"items": {
10+
"oneOf": [
11+
{
12+
"type": "object",
13+
"description": "Overall upgrade readiness summary with per-check results.",
14+
"properties": {
15+
"type": { "type": "string", "const": "ota_readiness_summary" },
16+
"decision": {
17+
"type": "string",
18+
"enum": ["recommend", "caution", "block", "escalate"],
19+
"description": "recommend=all clear, caution=warnings only, block=blockers found, escalate=insufficient data"
20+
},
21+
"checks": {
22+
"type": "array",
23+
"description": "One entry per readiness check from the input JSON.",
24+
"items": {
25+
"type": "object",
26+
"properties": {
27+
"name": { "type": "string", "description": "Check name, e.g. Cluster Conditions, Operator Health" },
28+
"status": { "type": "string", "enum": ["pass", "warn", "fail", "error"] },
29+
"detail": { "type": "string", "description": "One-line summary" }
30+
},
31+
"required": ["name", "status"]
32+
}
33+
}
34+
},
35+
"required": ["type", "decision", "checks"]
36+
},
37+
{
38+
"type": "object",
39+
"description": "A specific blocker, warning, or informational finding.",
40+
"properties": {
41+
"type": { "type": "string", "const": "ota_finding" },
42+
"severity": { "type": "string", "enum": ["blocker", "warning", "info"] },
43+
"check": { "type": "string", "description": "Which readiness check surfaced this" },
44+
"detail": { "type": "string", "description": "Description for a cluster administrator" },
45+
"affectedResources": { "type": "array", "items": { "type": "string" } },
46+
"prerequisite": { "type": "string", "description": "Action to resolve before upgrading" },
47+
"verifyCommand": { "type": "string", "description": "Command to verify the finding is resolved" }
48+
},
49+
"required": ["type", "severity", "check", "detail"]
50+
},
51+
{
52+
"type": "object",
53+
"description": "Per-operator OLM lifecycle status.",
54+
"properties": {
55+
"type": { "type": "string", "const": "ota_olm_operator_status" },
56+
"operators": {
57+
"type": "array",
58+
"items": {
59+
"type": "object",
60+
"properties": {
61+
"name": { "type": "string" },
62+
"namespace": { "type": "string" },
63+
"displayName": { "type": "string" },
64+
"installedVersion": { "type": "string" },
65+
"channel": { "type": "string" },
66+
"source": { "type": "string" },
67+
"installPlanApproval": { "type": "string", "enum": ["Automatic", "Manual"] },
68+
"pendingUpgrade": { "type": "boolean" },
69+
"pendingVersion": { "type": "string" },
70+
"compatibleWithTarget": { "type": "boolean" },
71+
"availableChannels": { "type": "array", "items": { "type": "string" } },
72+
"ocpCompat": {
73+
"type": "object",
74+
"properties": { "min": { "type": "string" }, "max": { "type": "string" } }
75+
},
76+
"lifecycle": {
77+
"type": "object",
78+
"properties": {
79+
"productName": { "type": "string" },
80+
"supportPhase": { "type": "string", "enum": ["Full Support", "Maintenance Support", "End of life"] },
81+
"ocpVersions": { "type": "string" },
82+
"maintenanceEnds": { "type": "string" }
83+
}
84+
}
85+
},
86+
"required": ["name", "namespace"]
87+
}
88+
},
89+
"summary": {
90+
"type": "object",
91+
"properties": {
92+
"totalOperators": { "type": "integer" },
93+
"pendingUpgrades": { "type": "integer" },
94+
"manualApproval": { "type": "integer" },
95+
"incompatibleWithTarget": { "type": "integer" }
96+
}
97+
}
98+
},
99+
"required": ["type", "operators", "summary"]
100+
}
101+
]
102+
}
103+
}
104+
}
105+
}

0 commit comments

Comments
 (0)