Skip to content

Commit dd0a841

Browse files
Merge pull request #1395 from harche/readiness_pkg
OTA-1966: Add readiness checks and wire into proposal controller
2 parents 26871b1 + eb3321d commit dd0a841

28 files changed

Lines changed: 4077 additions & 167 deletions

.openshift-tests-extension/openshift_payload_cluster-version-operator.json

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,96 @@
110110
"lifecycle": "informing",
111111
"environmentSelector": {}
112112
},
113+
{
114+
"name": "[Jira:\"Cluster Version Operator\"] cluster-version-operator readiness checks should run all checks without errors",
115+
"labels": {},
116+
"resources": {
117+
"isolation": {}
118+
},
119+
"source": "openshift:payload:cluster-version-operator",
120+
"lifecycle": "blocking",
121+
"environmentSelector": {}
122+
},
123+
{
124+
"name": "[Jira:\"Cluster Version Operator\"] cluster-version-operator readiness checks should produce valid JSON that round-trips",
125+
"labels": {},
126+
"resources": {
127+
"isolation": {}
128+
},
129+
"source": "openshift:payload:cluster-version-operator",
130+
"lifecycle": "blocking",
131+
"environmentSelector": {}
132+
},
133+
{
134+
"name": "[Jira:\"Cluster Version Operator\"] cluster-version-operator readiness checks should report node count matching the actual cluster",
135+
"labels": {},
136+
"resources": {
137+
"isolation": {}
138+
},
139+
"source": "openshift:payload:cluster-version-operator",
140+
"lifecycle": "blocking",
141+
"environmentSelector": {}
142+
},
143+
{
144+
"name": "[Jira:\"Cluster Version Operator\"] cluster-version-operator readiness checks should report operator count matching actual ClusterOperators",
145+
"labels": {},
146+
"resources": {
147+
"isolation": {}
148+
},
149+
"source": "openshift:payload:cluster-version-operator",
150+
"lifecycle": "blocking",
151+
"environmentSelector": {}
152+
},
153+
{
154+
"name": "[Jira:\"Cluster Version Operator\"] cluster-version-operator readiness checks should report etcd member count matching actual etcd pods",
155+
"labels": {},
156+
"resources": {
157+
"isolation": {}
158+
},
159+
"source": "openshift:payload:cluster-version-operator",
160+
"lifecycle": "blocking",
161+
"environmentSelector": {}
162+
},
163+
{
164+
"name": "[Jira:\"Cluster Version Operator\"] cluster-version-operator readiness checks should report network type matching actual Network config",
165+
"labels": {},
166+
"resources": {
167+
"isolation": {}
168+
},
169+
"source": "openshift:payload:cluster-version-operator",
170+
"lifecycle": "blocking",
171+
"environmentSelector": {}
172+
},
173+
{
174+
"name": "[Jira:\"Cluster Version Operator\"] cluster-version-operator readiness checks should report PDB count matching actual PodDisruptionBudgets",
175+
"labels": {},
176+
"resources": {
177+
"isolation": {}
178+
},
179+
"source": "openshift:payload:cluster-version-operator",
180+
"lifecycle": "blocking",
181+
"environmentSelector": {}
182+
},
183+
{
184+
"name": "[Jira:\"Cluster Version Operator\"] cluster-version-operator readiness checks should report cluster conditions matching ClusterVersion status",
185+
"labels": {},
186+
"resources": {
187+
"isolation": {}
188+
},
189+
"source": "openshift:payload:cluster-version-operator",
190+
"lifecycle": "blocking",
191+
"environmentSelector": {}
192+
},
193+
{
194+
"name": "[Jira:\"Cluster Version Operator\"] cluster-version-operator readiness checks should complete all checks within 60 seconds",
195+
"labels": {},
196+
"resources": {
197+
"isolation": {}
198+
},
199+
"source": "openshift:payload:cluster-version-operator",
200+
"lifecycle": "blocking",
201+
"environmentSelector": {}
202+
},
113203
{
114204
"name": "[Jira:\"Cluster Version Operator\"] cluster-version-operator must get the APIServer when the TLS profile manager is created",
115205
"labels": {

AGENTS.md

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -188,4 +188,46 @@ Subsystems include: `pkg/cvo`, `pkg/payload`, `lib/resourceapply`, `hack`, etc.
188188

189189
### Development and Testing
190190
- Never test against production clusters - always use disposable test environments
191-
- CVO has significant control over cluster state and can disrupt operations during development
191+
- CVO has significant control over cluster state and can disrupt operations during development
192+
193+
### Deploying CVO with Lightspeed Proposals (Dev)
194+
195+
The proposal controller is gated behind `TechPreviewNoUpgrade`. To test on a Default feature set cluster:
196+
197+
1. **Bypass the feature gate** (local only, do not commit):
198+
```go
199+
// In pkg/cvo/cvo.go, shouldEnableProposalController()
200+
return true
201+
```
202+
203+
2. **Build and push the skills image** to the cluster's internal registry:
204+
```bash
205+
oc new-project cvo-dev
206+
# Build from the agentic-skills repo
207+
podman build -f Dockerfile -t <registry-route>/cvo-dev/agentic-skills:latest .
208+
podman push --tls-verify=false <registry-route>/cvo-dev/agentic-skills:latest
209+
```
210+
211+
3. **Set `LIGHTSPEED_SKILLS_IMAGE`** in `install/0000_00_cluster-version-operator_30_deployment.yaml` to the internal registry image (local only, do not commit):
212+
```yaml
213+
- name: LIGHTSPEED_SKILLS_IMAGE
214+
value: "image-registry.openshift-image-registry.svc:5000/cvo-dev/agentic-skills@sha256:..."
215+
```
216+
217+
4. **Build and deploy** following the standard dev workflow in `docs/dev/README.md`:
218+
```bash
219+
CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -o _output/linux/amd64/cluster-version-operator ./cmd/cluster-version-operator/
220+
# Build image on cluster, then: oc adm release new ...
221+
```
222+
223+
5. **Grant image pull access** so service accounts in the `openshift-lightspeed` namespace can pull images from `cvo-dev`:
224+
```bash
225+
oc policy add-role-to-group system:image-puller system:serviceaccounts:openshift-lightspeed -n cvo-dev
226+
```
227+
228+
6. **Apply the prompt ConfigMap**:
229+
```bash
230+
oc apply -f install/0000_00_cluster-version-operator_50_lightspeed-prompts.yaml
231+
```
232+
233+
7. **Revert the local changes** from steps 1 and 3 before committing.

install/0000_00_cluster-version-operator_30_deployment.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,8 @@ spec:
7373
fieldPath: spec.nodeName
7474
- name: CLUSTER_PROFILE
7575
value: '{{ .ClusterProfile }}'
76+
- name: LIGHTSPEED_SKILLS_IMAGE
77+
value: "quay.io/openshift/ci:ocp_5.0_agentic-skills"
7678
# this pod is hostNetwork and uses the internal LB DNS name when possible, which the kubelet also uses.
7779
# this dnsPolicy allows us to use the same dnsConfig as the kubelet, without access to read it ourselves.
7880
dnsPolicy: Default

install/0000_00_cluster-version-operator_50_lightspeed-prompts.yaml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,14 @@ data:
1717
re-collect it. Parse the JSON, evaluate each check's results, and
1818
classify findings as blockers, warnings, or informational.
1919
20-
Use the ota-upgrade-advisor skill for the decision framework and
20+
Use the update-advisor skill for the decision framework and
2121
blocker classification rules. When findings need deeper investigation,
22-
use prometheus, platform-docs, redhat-support, or product-lifecycle
23-
skills.
22+
use prometheus metrics and product-lifecycle skills.
2423
2524
When the readiness data includes olm_operator_lifecycle results, use
2625
the product-lifecycle skill to cross-reference each operator's package
2726
name against the Red Hat Product Life Cycle API. Report support phase,
28-
EOL dates, and OCP compatibility from PLCC alongside the OLM data.
27+
EOL dates, and OCP compatibility from Product Lifecycle alongside the
28+
OLM data.
2929
3030
Do not guess or assume cluster state. Do not execute upgrade commands.

pkg/cvo/availableupdates_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,7 @@ func newOperator(url string, cluster release, promqlMock clusterconditions.Condi
209209
func() ([]configv1.Release, []configv1.ConditionalUpdate, error) {
210210
return nil, nil, nil
211211
},
212-
fake.NewClientBuilder().Build(), func(_ string) (*configv1.ClusterVersion, error) {
212+
fake.NewClientBuilder().Build(), nil, func(_ string) (*configv1.ClusterVersion, error) {
213213
return &configv1.ClusterVersion{}, nil
214214
},
215215
func(_ context.Context, namespace, name string, _ metav1.GetOptions) (*corev1.ConfigMap, error) {

pkg/cvo/cvo.go

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ import (
1818
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
1919
"k8s.io/apimachinery/pkg/util/sets"
2020
"k8s.io/apimachinery/pkg/util/wait"
21+
"k8s.io/client-go/dynamic"
2122
informerscorev1 "k8s.io/client-go/informers/core/v1"
2223
"k8s.io/client-go/kubernetes"
2324
"k8s.io/client-go/kubernetes/scheme"
@@ -111,6 +112,7 @@ type Operator struct {
111112

112113
client clientset.Interface
113114
kubeClient kubernetes.Interface
115+
dynamicClient dynamic.Interface
114116
operatorClient operatorclientset.Interface
115117
eventRecorder record.EventRecorder
116118

@@ -244,6 +246,7 @@ func New(
244246
overrides *cvotls.Settings,
245247
client clientset.Interface,
246248
kubeClient kubernetes.Interface,
249+
dynamicClient dynamic.Interface,
247250
operatorClient operatorclientset.Interface,
248251
exclude string,
249252
clusterProfile string,
@@ -276,6 +279,7 @@ func New(
276279

277280
client: client,
278281
kubeClient: kubeClient,
282+
dynamicClient: dynamicClient,
279283
operatorClient: operatorClient,
280284
eventRecorder: eventBroadcaster.NewRecorder(scheme.Scheme, corev1.EventSource{Component: namespace}),
281285
queue: workqueue.NewTypedRateLimitingQueueWithConfig(workqueue.DefaultTypedControllerRateLimiter[any](), workqueue.TypedRateLimitingQueueConfig[any]{Name: "clusterversion"}),
@@ -370,6 +374,7 @@ func New(
370374
return availableUpdates.Updates, availableUpdates.ConditionalUpdates, nil
371375
},
372376
rtClient,
377+
dynamicClient,
373378
cvInformer.Lister().Get,
374379
func(ctx context.Context, namespace, name string, opts metav1.GetOptions) (*corev1.ConfigMap, error) {
375380
return kubeClient.CoreV1().ConfigMaps(namespace).Get(ctx, name, opts)
@@ -1240,8 +1245,7 @@ func (optr *Operator) shouldReconcileAcceptRisks() bool {
12401245

12411246
// shouldEnableProposalController returns whether the CVO should enable the proposal controller
12421247
func (optr *Operator) shouldEnableProposalController() bool {
1243-
// We do not have a specific gate for the Proposal feature and use the TechPreviewNoUpgrade instead.
1244-
// It can ensure that featuregates.ChangeStopper restarts CVO when the returns of this function flips.
1248+
// Gated behind a feature set so featuregates.ChangeStopper restarts CVO when the return of this function flips.
12451249
return optr.requiredFeatureSet == configv1.TechPreviewNoUpgrade
12461250
}
12471251

pkg/cvo/cvo_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2756,7 +2756,7 @@ func TestOperator_availableUpdatesSync(t *testing.T) {
27562756
ctx := context.Background()
27572757
optr.proposalController = proposal.NewController(func() ([]configv1.Release, []configv1.ConditionalUpdate, error) {
27582758
return nil, nil, nil
2759-
}, ctrlruntimefake.NewClientBuilder().Build(), func(_ string) (*configv1.ClusterVersion, error) {
2759+
}, ctrlruntimefake.NewClientBuilder().Build(), nil, func(_ string) (*configv1.ClusterVersion, error) {
27602760
return &configv1.ClusterVersion{}, nil
27612761
}, func(_ context.Context, namespace, name string, _ metav1.GetOptions) (*corev1.ConfigMap, error) {
27622762
return &corev1.ConfigMap{}, nil

pkg/payload/testdata/TestRenderManifest_expected_cvo_deployment.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,8 @@ spec:
7373
fieldPath: spec.nodeName
7474
- name: CLUSTER_PROFILE
7575
value: 'some-profile'
76+
- name: LIGHTSPEED_SKILLS_IMAGE
77+
value: "quay.io/openshift/ci:ocp_5.0_agentic-skills"
7678
# this pod is hostNetwork and uses the internal LB DNS name when possible, which the kubelet also uses.
7779
# this dnsPolicy allows us to use the same dnsConfig as the kubelet, without access to read it ourselves.
7880
dnsPolicy: Default

pkg/proposal/analysis_schema.json

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
{
2+
"type": "object",
3+
"required": ["analysisData"],
4+
"properties": {
5+
"analysisData": {
6+
"type": "array",
7+
"description": "Typed components describing upgrade readiness. Must include exactly one ota_readiness_summary. Include one ota_finding per blocker or warning. Include one ota_olm_operator_status if OLM operators are present.",
8+
"minItems": 1,
9+
"items": {
10+
"oneOf": [
11+
{
12+
"type": "object",
13+
"description": "Overall upgrade readiness summary with per-check results.",
14+
"properties": {
15+
"type": { "type": "string", "const": "ota_readiness_summary" },
16+
"decision": {
17+
"type": "string",
18+
"enum": ["recommend", "caution", "block", "escalate"],
19+
"description": "recommend=all clear, caution=warnings only, block=blockers found, escalate=insufficient data"
20+
},
21+
"checks": {
22+
"type": "array",
23+
"description": "One entry per readiness check from the input JSON.",
24+
"items": {
25+
"type": "object",
26+
"properties": {
27+
"name": { "type": "string", "description": "Check name, e.g. Cluster Conditions, Operator Health" },
28+
"status": { "type": "string", "enum": ["pass", "warn", "fail", "error"] },
29+
"detail": { "type": "string", "description": "One-line summary" }
30+
},
31+
"required": ["name", "status"]
32+
}
33+
}
34+
},
35+
"required": ["type", "decision", "checks"]
36+
},
37+
{
38+
"type": "object",
39+
"description": "A specific blocker, warning, or informational finding.",
40+
"properties": {
41+
"type": { "type": "string", "const": "ota_finding" },
42+
"severity": { "type": "string", "enum": ["blocker", "warning", "info"] },
43+
"check": { "type": "string", "description": "Which readiness check surfaced this" },
44+
"detail": { "type": "string", "description": "Description for a cluster administrator" },
45+
"affectedResources": { "type": "array", "items": { "type": "string" } },
46+
"prerequisite": { "type": "string", "description": "Action to resolve before upgrading" },
47+
"verifyCommand": { "type": "string", "description": "Command to verify the finding is resolved" }
48+
},
49+
"required": ["type", "severity", "check", "detail"]
50+
},
51+
{
52+
"type": "object",
53+
"description": "Per-operator OLM lifecycle status.",
54+
"properties": {
55+
"type": { "type": "string", "const": "ota_olm_operator_status" },
56+
"operators": {
57+
"type": "array",
58+
"items": {
59+
"type": "object",
60+
"properties": {
61+
"name": { "type": "string" },
62+
"namespace": { "type": "string" },
63+
"displayName": { "type": "string" },
64+
"installedVersion": { "type": "string" },
65+
"channel": { "type": "string" },
66+
"source": { "type": "string" },
67+
"installPlanApproval": { "type": "string", "enum": ["Automatic", "Manual"] },
68+
"pendingUpgrade": { "type": "boolean" },
69+
"pendingVersion": { "type": "string" },
70+
"compatibleWithTarget": { "type": "boolean" },
71+
"availableChannels": { "type": "array", "items": { "type": "string" } },
72+
"ocpCompat": {
73+
"type": "object",
74+
"properties": { "min": { "type": "string" }, "max": { "type": "string" } }
75+
},
76+
"lifecycle": {
77+
"type": "object",
78+
"properties": {
79+
"productName": { "type": "string" },
80+
"supportPhase": { "type": "string", "enum": ["Full Support", "Maintenance Support", "End of life"] },
81+
"ocpVersions": { "type": "string" },
82+
"maintenanceEnds": { "type": "string" }
83+
}
84+
}
85+
},
86+
"required": ["name", "namespace"]
87+
}
88+
},
89+
"summary": {
90+
"type": "object",
91+
"properties": {
92+
"totalOperators": { "type": "integer" },
93+
"pendingUpgrades": { "type": "integer" },
94+
"manualApproval": { "type": "integer" },
95+
"incompatibleWithTarget": { "type": "integer" }
96+
}
97+
}
98+
},
99+
"required": ["type", "operators", "summary"]
100+
}
101+
]
102+
}
103+
}
104+
}
105+
}

0 commit comments

Comments
 (0)