Skip to content

Commit 53c26a6

Browse files
harche and claude committed
pkg/readiness: Add readiness checks and wire into proposal controller
Add pkg/readiness package with 9 cluster readiness checks that gather pre-upgrade health data: cluster conditions, operator health, API deprecations, node capacity, PDB drain blockers, etcd health, network config, CRD compatibility, and OLM operator lifecycle.

Wire readiness.RunAll() into the proposal controller, replacing the hardcoded readinessJSON placeholder with real per-target readiness data that gets embedded in each proposal's request body.

Plumb dynamic.Interface from pkg/start through cvo.New() to the proposal controller to support the readiness checks' cluster queries.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 52cc8e3 commit 53c26a6

27 files changed

Lines changed: 4063 additions & 161 deletions

.openshift-tests-extension/openshift_payload_cluster-version-operator.json

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,5 +110,95 @@
110110
"source": "openshift:payload:cluster-version-operator",
111111
"lifecycle": "informing",
112112
"environmentSelector": {}
113+
},
114+
{
115+
"name": "[Jira:\"Cluster Version Operator\"] cluster-version-operator readiness checks should run all checks without errors",
116+
"labels": {},
117+
"resources": {
118+
"isolation": {}
119+
},
120+
"source": "openshift:payload:cluster-version-operator",
121+
"lifecycle": "blocking",
122+
"environmentSelector": {}
123+
},
124+
{
125+
"name": "[Jira:\"Cluster Version Operator\"] cluster-version-operator readiness checks should produce valid JSON that round-trips",
126+
"labels": {},
127+
"resources": {
128+
"isolation": {}
129+
},
130+
"source": "openshift:payload:cluster-version-operator",
131+
"lifecycle": "blocking",
132+
"environmentSelector": {}
133+
},
134+
{
135+
"name": "[Jira:\"Cluster Version Operator\"] cluster-version-operator readiness checks should report node count matching the actual cluster",
136+
"labels": {},
137+
"resources": {
138+
"isolation": {}
139+
},
140+
"source": "openshift:payload:cluster-version-operator",
141+
"lifecycle": "blocking",
142+
"environmentSelector": {}
143+
},
144+
{
145+
"name": "[Jira:\"Cluster Version Operator\"] cluster-version-operator readiness checks should report operator count matching actual ClusterOperators",
146+
"labels": {},
147+
"resources": {
148+
"isolation": {}
149+
},
150+
"source": "openshift:payload:cluster-version-operator",
151+
"lifecycle": "blocking",
152+
"environmentSelector": {}
153+
},
154+
{
155+
"name": "[Jira:\"Cluster Version Operator\"] cluster-version-operator readiness checks should report etcd member count matching actual etcd pods",
156+
"labels": {},
157+
"resources": {
158+
"isolation": {}
159+
},
160+
"source": "openshift:payload:cluster-version-operator",
161+
"lifecycle": "blocking",
162+
"environmentSelector": {}
163+
},
164+
{
165+
"name": "[Jira:\"Cluster Version Operator\"] cluster-version-operator readiness checks should report network type matching actual Network config",
166+
"labels": {},
167+
"resources": {
168+
"isolation": {}
169+
},
170+
"source": "openshift:payload:cluster-version-operator",
171+
"lifecycle": "blocking",
172+
"environmentSelector": {}
173+
},
174+
{
175+
"name": "[Jira:\"Cluster Version Operator\"] cluster-version-operator readiness checks should report PDB count matching actual PodDisruptionBudgets",
176+
"labels": {},
177+
"resources": {
178+
"isolation": {}
179+
},
180+
"source": "openshift:payload:cluster-version-operator",
181+
"lifecycle": "blocking",
182+
"environmentSelector": {}
183+
},
184+
{
185+
"name": "[Jira:\"Cluster Version Operator\"] cluster-version-operator readiness checks should report cluster conditions matching ClusterVersion status",
186+
"labels": {},
187+
"resources": {
188+
"isolation": {}
189+
},
190+
"source": "openshift:payload:cluster-version-operator",
191+
"lifecycle": "blocking",
192+
"environmentSelector": {}
193+
},
194+
{
195+
"name": "[Jira:\"Cluster Version Operator\"] cluster-version-operator readiness checks should complete all checks within 60 seconds",
196+
"labels": {},
197+
"resources": {
198+
"isolation": {}
199+
},
200+
"source": "openshift:payload:cluster-version-operator",
201+
"lifecycle": "blocking",
202+
"environmentSelector": {}
113203
}
114204
]

AGENTS.md

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -188,4 +188,46 @@ Subsystems include: `pkg/cvo`, `pkg/payload`, `lib/resourceapply`, `hack`, etc.
188188

189189
### Development and Testing
190190
- Never test against production clusters - always use disposable test environments
191-
- CVO has significant control over cluster state and can disrupt operations during development
191+
- CVO has significant control over cluster state and can disrupt operations during development
192+
193+
### Deploying CVO with Lightspeed Proposals (Dev)
194+
195+
The proposal controller is gated behind `TechPreviewNoUpgrade`. To test on a Default feature set cluster:
196+
197+
1. **Bypass the feature gate** (local only, do not commit):
198+
```go
199+
// In pkg/cvo/cvo.go, shouldEnableProposalController()
200+
return true
201+
```
202+
203+
2. **Build and push the skills image** to the cluster's internal registry:
204+
```bash
205+
oc new-project cvo-dev
206+
# Build from the agentic-skills repo
207+
podman build -f Dockerfile -t <registry-route>/cvo-dev/agentic-skills:latest .
208+
podman push --tls-verify=false <registry-route>/cvo-dev/agentic-skills:latest
209+
```
210+
211+
3. **Set `LIGHTSPEED_SKILLS_IMAGE`** in `install/0000_00_cluster-version-operator_30_deployment.yaml` to the internal registry image (local only, do not commit):
212+
```yaml
213+
- name: LIGHTSPEED_SKILLS_IMAGE
214+
value: "image-registry.openshift-image-registry.svc:5000/cvo-dev/agentic-skills@sha256:..."
215+
```
216+
217+
4. **Build and deploy** following the standard dev workflow in `docs/dev/README.md`:
218+
```bash
219+
CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -o _output/linux/amd64/cluster-version-operator ./cmd/cluster-version-operator/
220+
# Build image on cluster, then: oc adm release new ...
221+
```
222+
223+
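5. **Grant image pull access** so that service accounts in the sandbox namespace (`openshift-lightspeed` in the command below) can pull images from the `cvo-dev` project: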
224+
```bash
225+
oc policy add-role-to-group system:image-puller system:serviceaccounts:openshift-lightspeed -n cvo-dev
226+
```
227+
228+
6. **Apply the prompt ConfigMap**:
229+
```bash
230+
oc apply -f install/0000_00_cluster-version-operator_50_lightspeed-prompts.yaml
231+
```
232+
233+
7. **Revert the local-only changes** made in steps 1 and 3 before committing.

install/0000_00_cluster-version-operator_30_deployment.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,8 @@ spec:
7777
fieldPath: spec.nodeName
7878
- name: CLUSTER_PROFILE
7979
value: '{{ .ClusterProfile }}'
80+
- name: LIGHTSPEED_SKILLS_IMAGE
81+
value: "quay.io/openshift/ci:ocp_5.0_agentic-skills"
8082
# this pod is hostNetwork and uses the internal LB DNS name when possible, which the kubelet also uses.
8183
# this dnsPolicy allows us to use the same dnsConfig as the kubelet, without access to read it ourselves.
8284
dnsPolicy: Default

pkg/cvo/availableupdates_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -208,7 +208,7 @@ func newOperator(url string, cluster release, promqlMock clusterconditions.Condi
208208
func() ([]configv1.Release, []configv1.ConditionalUpdate, error) {
209209
return nil, nil, nil
210210
},
211-
fake.NewClientBuilder().Build(), func(_ string) (*configv1.ClusterVersion, error) {
211+
fake.NewClientBuilder().Build(), nil, func(_ string) (*configv1.ClusterVersion, error) {
212212
return &configv1.ClusterVersion{}, nil
213213
},
214214
func(_ context.Context, namespace, name string, _ metav1.GetOptions) (*corev1.ConfigMap, error) {

pkg/cvo/cvo.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ import (
1717
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
1818
"k8s.io/apimachinery/pkg/util/sets"
1919
"k8s.io/apimachinery/pkg/util/wait"
20+
"k8s.io/client-go/dynamic"
2021
informerscorev1 "k8s.io/client-go/informers/core/v1"
2122
"k8s.io/client-go/kubernetes"
2223
"k8s.io/client-go/kubernetes/scheme"
@@ -109,6 +110,7 @@ type Operator struct {
109110

110111
client clientset.Interface
111112
kubeClient kubernetes.Interface
113+
dynamicClient dynamic.Interface
112114
operatorClient operatorclientset.Interface
113115
eventRecorder record.EventRecorder
114116

@@ -235,6 +237,7 @@ func New(
235237
featureGateInformer configinformersv1.FeatureGateInformer,
236238
client clientset.Interface,
237239
kubeClient kubernetes.Interface,
240+
dynamicClient dynamic.Interface,
238241
operatorClient operatorclientset.Interface,
239242
exclude string,
240243
clusterProfile string,
@@ -267,6 +270,7 @@ func New(
267270

268271
client: client,
269272
kubeClient: kubeClient,
273+
dynamicClient: dynamicClient,
270274
operatorClient: operatorClient,
271275
eventRecorder: eventBroadcaster.NewRecorder(scheme.Scheme, corev1.EventSource{Component: namespace}),
272276
queue: workqueue.NewTypedRateLimitingQueueWithConfig(workqueue.DefaultTypedControllerRateLimiter[any](), workqueue.TypedRateLimitingQueueConfig[any]{Name: "clusterversion"}),
@@ -354,6 +358,7 @@ func New(
354358
return availableUpdates.Updates, availableUpdates.ConditionalUpdates, nil
355359
},
356360
rtClient,
361+
dynamicClient,
357362
cvInformer.Lister().Get,
358363
func(ctx context.Context, namespace, name string, opts metav1.GetOptions) (*corev1.ConfigMap, error) {
359364
return kubeClient.CoreV1().ConfigMaps(namespace).Get(ctx, name, opts)

pkg/cvo/cvo_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2756,7 +2756,7 @@ func TestOperator_availableUpdatesSync(t *testing.T) {
27562756
ctx := context.Background()
27572757
optr.proposalController = proposal.NewController(func() ([]configv1.Release, []configv1.ConditionalUpdate, error) {
27582758
return nil, nil, nil
2759-
}, ctrlruntimefake.NewClientBuilder().Build(), func(_ string) (*configv1.ClusterVersion, error) {
2759+
}, ctrlruntimefake.NewClientBuilder().Build(), nil, func(_ string) (*configv1.ClusterVersion, error) {
27602760
return &configv1.ClusterVersion{}, nil
27612761
}, func(_ context.Context, namespace, name string, _ metav1.GetOptions) (*corev1.ConfigMap, error) {
27622762
return &corev1.ConfigMap{}, nil

pkg/payload/testdata/TestRenderManifest_expected_cvo_deployment.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,8 @@ spec:
7777
fieldPath: spec.nodeName
7878
- name: CLUSTER_PROFILE
7979
value: 'some-profile'
80+
- name: LIGHTSPEED_SKILLS_IMAGE
81+
value: "quay.io/openshift/ci:ocp_5.0_agentic-skills"
8082
# this pod is hostNetwork and uses the internal LB DNS name when possible, which the kubelet also uses.
8183
# this dnsPolicy allows us to use the same dnsConfig as the kubelet, without access to read it ourselves.
8284
dnsPolicy: Default

pkg/proposal/analysis_schema.json

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
{
2+
"type": "object",
3+
"required": ["analysisData"],
4+
"properties": {
5+
"analysisData": {
6+
"type": "array",
7+
"description": "Typed components describing upgrade readiness. Must include exactly one ota_readiness_summary. Include one ota_finding per blocker or warning. Include one ota_olm_operator_status if OLM operators are present.",
8+
"minItems": 1,
9+
"items": {
10+
"oneOf": [
11+
{
12+
"type": "object",
13+
"description": "Overall upgrade readiness summary with per-check results.",
14+
"properties": {
15+
"type": { "type": "string", "const": "ota_readiness_summary" },
16+
"decision": {
17+
"type": "string",
18+
"enum": ["recommend", "caution", "block", "escalate"],
19+
"description": "recommend=all clear, caution=warnings only, block=blockers found, escalate=insufficient data"
20+
},
21+
"checks": {
22+
"type": "array",
23+
"description": "One entry per readiness check from the input JSON.",
24+
"items": {
25+
"type": "object",
26+
"properties": {
27+
"name": { "type": "string", "description": "Check name, e.g. Cluster Conditions, Operator Health" },
28+
"status": { "type": "string", "enum": ["pass", "warn", "fail", "error"] },
29+
"detail": { "type": "string", "description": "One-line summary" }
30+
},
31+
"required": ["name", "status"]
32+
}
33+
}
34+
},
35+
"required": ["type", "decision", "checks"]
36+
},
37+
{
38+
"type": "object",
39+
"description": "A specific blocker, warning, or informational finding.",
40+
"properties": {
41+
"type": { "type": "string", "const": "ota_finding" },
42+
"severity": { "type": "string", "enum": ["blocker", "warning", "info"] },
43+
"check": { "type": "string", "description": "Which readiness check surfaced this" },
44+
"detail": { "type": "string", "description": "Description for a cluster administrator" },
45+
"affectedResources": { "type": "array", "items": { "type": "string" } },
46+
"prerequisite": { "type": "string", "description": "Action to resolve before upgrading" },
47+
"verifyCommand": { "type": "string", "description": "Command to verify the finding is resolved" }
48+
},
49+
"required": ["type", "severity", "check", "detail"]
50+
},
51+
{
52+
"type": "object",
53+
"description": "Per-operator OLM lifecycle status.",
54+
"properties": {
55+
"type": { "type": "string", "const": "ota_olm_operator_status" },
56+
"operators": {
57+
"type": "array",
58+
"items": {
59+
"type": "object",
60+
"properties": {
61+
"name": { "type": "string" },
62+
"namespace": { "type": "string" },
63+
"displayName": { "type": "string" },
64+
"installedVersion": { "type": "string" },
65+
"channel": { "type": "string" },
66+
"source": { "type": "string" },
67+
"installPlanApproval": { "type": "string", "enum": ["Automatic", "Manual"] },
68+
"pendingUpgrade": { "type": "boolean" },
69+
"pendingVersion": { "type": "string" },
70+
"compatibleWithTarget": { "type": "boolean" },
71+
"availableChannels": { "type": "array", "items": { "type": "string" } },
72+
"ocpCompat": {
73+
"type": "object",
74+
"properties": { "min": { "type": "string" }, "max": { "type": "string" } }
75+
},
76+
"lifecycle": {
77+
"type": "object",
78+
"properties": {
79+
"productName": { "type": "string" },
80+
"supportPhase": { "type": "string", "enum": ["Full Support", "Maintenance Support", "End of life"] },
81+
"ocpVersions": { "type": "string" },
82+
"maintenanceEnds": { "type": "string" }
83+
}
84+
}
85+
},
86+
"required": ["name", "namespace"]
87+
}
88+
},
89+
"summary": {
90+
"type": "object",
91+
"properties": {
92+
"totalOperators": { "type": "integer" },
93+
"pendingUpgrades": { "type": "integer" },
94+
"manualApproval": { "type": "integer" },
95+
"incompatibleWithTarget": { "type": "integer" }
96+
}
97+
}
98+
},
99+
"required": ["type", "operators", "summary"]
100+
}
101+
]
102+
}
103+
}
104+
}
105+
}

0 commit comments

Comments
 (0)