Skip to content

Commit 5449914

Browse files
authored
Merge pull request #18433 from hakman/channels-probe
channels: surface addon apply failures via a readiness probe
2 parents 8a38af9 + 888f5a2 commit 5449914

90 files changed

Lines changed: 872 additions & 1 deletion

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

channels/pkg/cmd/apply_channel.go

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,14 +94,25 @@ func runApplyChannelIteration(ctx context.Context, f *ChannelsFactory, out io.Wr
9494
// ChannelsFactory per iteration drops cached REST configs and the discovery
9595
// cache, picking up cert rotation and new CRDs without a restart.
9696
func runApplyChannelLoop(ctx context.Context, out io.Writer, options *ApplyChannelOptions, args []string) error {
97+
// In daemon mode kops-channels runs as a system-node-critical static pod; serve a
98+
// readiness probe reporting the last apply outcome, so a persistent failure surfaces
99+
// as NotReady (failing `kops validate cluster`, which gates rolling updates) instead
100+
// of only being logged. Starts NotReady until the first successful apply.
101+
readiness, err := serveReadiness(ctx)
102+
if err != nil {
103+
return fmt.Errorf("serving readiness probe: %w", err)
104+
}
105+
97106
// Retry quickly until the first success: the apiserver is usually
98107
// unreachable while the control plane is still coming up.
99108
const startupRetryInterval = 5 * time.Second
100109

101110
settled := false
102111
for {
103112
interval := options.Interval
104-
if err := runApplyChannelIteration(ctx, NewChannelsFactory(), out, options, args); err != nil {
113+
err := runApplyChannelIteration(ctx, NewChannelsFactory(), out, options, args)
114+
readiness.recordApplyResult(err)
115+
if err != nil {
105116
if !settled {
106117
interval = min(startupRetryInterval, options.Interval)
107118
}

channels/pkg/cmd/readiness.go

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
/*
2+
Copyright 2026 The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package cmd
18+
19+
import (
20+
"context"
21+
"errors"
22+
"fmt"
23+
"net"
24+
"net/http"
25+
"strconv"
26+
"sync/atomic"
27+
"time"
28+
29+
"k8s.io/klog/v2"
30+
31+
"k8s.io/kops/pkg/wellknownports"
32+
)
33+
34+
// applyChannelReadiness holds the outcome of the most recent apply iteration
// so that it can be reported by a readiness endpoint.
type applyChannelReadiness struct {
	// ready is true only when the last recorded apply returned no error.
	// Its zero value is false, so nothing is reported as ready before the
	// first successful apply has been recorded.
	ready atomic.Bool
	// addr is the resolved listen address; it is read only by tests, which
	// bind port 0 and need to learn the port that was actually assigned.
	addr string
}

// recordApplyResult stores the outcome of an apply iteration: a nil err marks
// the state ready, any non-nil err marks it not ready.
func (r *applyChannelReadiness) recordApplyResult(err error) {
	succeeded := err == nil
	r.ready.Store(succeeded)
}
42+
43+
// serveReadiness serves /readyz on loopback for the kubelet readiness probe until ctx is cancelled:
44+
// 200 when ready is true, 503 otherwise. The pod runs with hostNetwork, so the kubelet reaches it
45+
// via 127.0.0.1 in the host network namespace.
46+
func serveReadiness(ctx context.Context) (*applyChannelReadiness, error) {
47+
addr := net.JoinHostPort("127.0.0.1", strconv.Itoa(wellknownports.KopsChannelsHealthCheck))
48+
return serveReadinessOnAddr(ctx, addr)
49+
}
50+
51+
func serveReadinessOnAddr(ctx context.Context, addr string) (*applyChannelReadiness, error) {
52+
readiness := &applyChannelReadiness{}
53+
54+
mux := http.NewServeMux()
55+
mux.HandleFunc("/readyz", func(w http.ResponseWriter, _ *http.Request) {
56+
if readiness.ready.Load() {
57+
w.WriteHeader(http.StatusOK)
58+
_, _ = w.Write([]byte("ok\n"))
59+
} else {
60+
w.WriteHeader(http.StatusServiceUnavailable)
61+
_, _ = w.Write([]byte("apply iterations are failing\n"))
62+
}
63+
})
64+
65+
listener, err := net.Listen("tcp", addr)
66+
if err != nil {
67+
return nil, fmt.Errorf("listening on %s: %w", addr, err)
68+
}
69+
readiness.addr = listener.Addr().String()
70+
71+
server := &http.Server{
72+
Handler: mux,
73+
ReadHeaderTimeout: 5 * time.Second,
74+
}
75+
76+
go func() {
77+
<-ctx.Done()
78+
_ = server.Close()
79+
}()
80+
go func() {
81+
if err := server.Serve(listener); err != nil && !errors.Is(err, http.ErrServerClosed) {
82+
klog.Fatalf("kops-channels readiness server stopped: %v", err)
83+
}
84+
}()
85+
return readiness, nil
86+
}

channels/pkg/cmd/readiness_test.go

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
/*
2+
Copyright 2026 The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package cmd
18+
19+
import (
20+
"context"
21+
"errors"
22+
"net"
23+
"net/http"
24+
"testing"
25+
"time"
26+
)
27+
28+
func TestServeReadinessReportsApplyOutcome(t *testing.T) {
29+
ctx, cancel := context.WithCancel(context.Background())
30+
defer cancel()
31+
32+
readiness, err := serveReadinessOnAddr(ctx, "127.0.0.1:0")
33+
if err != nil {
34+
t.Fatalf("serveReadinessOnAddr returned error: %v", err)
35+
}
36+
37+
assertReadinessStatus(t, readiness.addr, http.StatusServiceUnavailable)
38+
39+
readiness.recordApplyResult(nil)
40+
assertReadinessStatus(t, readiness.addr, http.StatusOK)
41+
42+
readiness.recordApplyResult(errors.New("apply failed"))
43+
assertReadinessStatus(t, readiness.addr, http.StatusServiceUnavailable)
44+
}
45+
46+
func TestServeReadinessReturnsBindError(t *testing.T) {
47+
listener, err := net.Listen("tcp", "127.0.0.1:0")
48+
if err != nil {
49+
t.Fatalf("failed to reserve test port: %v", err)
50+
}
51+
defer listener.Close()
52+
53+
_, err = serveReadinessOnAddr(context.Background(), listener.Addr().String())
54+
if err == nil {
55+
t.Fatalf("expected bind error")
56+
}
57+
}
58+
59+
// assertReadinessStatus issues GET http://<addr>/readyz with a one-second
// client timeout and fails the test immediately if the request errors or the
// response status differs from expectedStatus.
func assertReadinessStatus(t *testing.T, addr string, expectedStatus int) {
	t.Helper()

	probeURL := "http://" + addr + "/readyz"
	httpClient := http.Client{Timeout: time.Second}

	resp, err := httpClient.Get(probeURL)
	if err != nil {
		t.Fatalf("GET /readyz failed: %v", err)
	}
	defer resp.Body.Close()

	if got := resp.StatusCode; got != expectedStatus {
		t.Fatalf("expected status %d, got %d", expectedStatus, got)
	}
}

pkg/model/components/channels/model.go

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,12 +23,14 @@ import (
2323
v1 "k8s.io/api/core/v1"
2424
"k8s.io/apimachinery/pkg/api/resource"
2525
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
26+
"k8s.io/apimachinery/pkg/util/intstr"
2627

2728
kopsroot "k8s.io/kops"
2829
"k8s.io/kops/pkg/assets"
2930
"k8s.io/kops/pkg/k8scodecs"
3031
"k8s.io/kops/pkg/kubemanifest"
3132
"k8s.io/kops/pkg/model"
33+
"k8s.io/kops/pkg/wellknownports"
3234
"k8s.io/kops/pkg/wellknownusers"
3335
"k8s.io/kops/upup/pkg/fi"
3436
"k8s.io/kops/upup/pkg/fi/fitasks"
@@ -156,6 +158,23 @@ func (b *ChannelsBuilder) buildPod(channels []string) (*v1.Pod, error) {
156158
v1.ResourceMemory: resource.MustParse("50Mi"),
157159
},
158160
},
161+
// kops-channels is system-node-critical, so a NotReady container fails `kops validate
162+
// cluster`. The apply loop serves /readyz on loopback, publishing the most recent apply
163+
// outcome. failureThreshold 2 (~20s at periodSeconds 10) trips within one apply
164+
// interval (channelsInterval, 60s), so a single failed apply surfaces as NotReady; 2
165+
// (not 1) rides out one flaky probe sample.
166+
ReadinessProbe: &v1.Probe{
167+
ProbeHandler: v1.ProbeHandler{
168+
HTTPGet: &v1.HTTPGetAction{
169+
Host: "127.0.0.1",
170+
Path: "/readyz",
171+
Port: intstr.FromInt(wellknownports.KopsChannelsHealthCheck),
172+
},
173+
},
174+
InitialDelaySeconds: 30,
175+
PeriodSeconds: 10,
176+
FailureThreshold: 2,
177+
},
159178
// ko-distroless's default nonroot uid can't read /var/lib/kops/kubeconfig.
160179
SecurityContext: &v1.SecurityContext{
161180
RunAsUser: fi.PtrTo(int64(wellknownusers.KopsChannelsID)),

pkg/model/components/channels/tests/container_registry/tasks.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,14 @@ Contents: |
3030
value: us-test-1
3131
image: my-mirror.example.com/kops-channels:1.36.0-alpha.1
3232
name: kops-channels
33+
readinessProbe:
34+
failureThreshold: 2
35+
httpGet:
36+
host: 127.0.0.1
37+
path: /readyz
38+
port: 3986
39+
initialDelaySeconds: 30
40+
periodSeconds: 10
3341
resources:
3442
requests:
3543
cpu: 50m

pkg/model/components/channels/tests/minimal/tasks.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,14 @@ Contents: |
3030
value: us-test-1
3131
image: registry.k8s.io/kops/channels:1.36.0-alpha.1
3232
name: kops-channels
33+
readinessProbe:
34+
failureThreshold: 2
35+
httpGet:
36+
host: 127.0.0.1
37+
path: /readyz
38+
port: 3986
39+
initialDelaySeconds: 30
40+
periodSeconds: 10
3341
resources:
3442
requests:
3543
cpu: 50m

pkg/wellknownports/wellknownports.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,9 @@ const (
2626
// EtcdMetricsPort is used to serve etcd metrics
2727
EtcdMetricsPort = 2382
2828

29+
// KopsChannelsHealthCheck is the loopback port the kops-channels static pod serves /readyz on.
30+
KopsChannelsHealthCheck = 3986
31+
2932
// NodeupChallenge is the port where nodeup listens for challenges.
3033
NodeupChallenge = 3987
3134

tests/integration/update_cluster/additionalobjects/data/aws_s3_object_manifests-channels-kops-channels_content

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,14 @@ spec:
2828
value: us-test-1
2929
image: registry.k8s.io/kops/channels:1.34.0-beta.1
3030
name: kops-channels
31+
readinessProbe:
32+
failureThreshold: 2
33+
httpGet:
34+
host: 127.0.0.1
35+
path: /readyz
36+
port: 3986
37+
initialDelaySeconds: 30
38+
periodSeconds: 10
3139
resources:
3240
requests:
3341
cpu: 50m

tests/integration/update_cluster/apiservernodes/data/aws_s3_object_manifests-channels-kops-channels_content

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,14 @@ spec:
2828
value: us-test-1
2929
image: registry.k8s.io/kops/channels:1.34.0-beta.1
3030
name: kops-channels
31+
readinessProbe:
32+
failureThreshold: 2
33+
httpGet:
34+
host: 127.0.0.1
35+
path: /readyz
36+
port: 3986
37+
initialDelaySeconds: 30
38+
periodSeconds: 10
3139
resources:
3240
requests:
3341
cpu: 50m

tests/integration/update_cluster/aws-lb-controller/data/aws_s3_object_manifests-channels-kops-channels_content

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,14 @@ spec:
2828
value: us-test-1
2929
image: registry.k8s.io/kops/channels:1.34.0-beta.1
3030
name: kops-channels
31+
readinessProbe:
32+
failureThreshold: 2
33+
httpGet:
34+
host: 127.0.0.1
35+
path: /readyz
36+
port: 3986
37+
initialDelaySeconds: 30
38+
periodSeconds: 10
3139
resources:
3240
requests:
3341
cpu: 50m

0 commit comments

Comments
 (0)