From 01f52547c506bec1cee67da9dc7c260926125b11 Mon Sep 17 00:00:00 2001 From: Stephen Finucane Date: Thu, 7 May 2026 14:01:58 +0100 Subject: [PATCH 1/2] autohealing: fix insecure TLS certificate verification in endpoint health check Replace the hardcoded InsecureSkipVerify: true with proper TLS certificate verification using the in-cluster Kubernetes CA by default. Add ca-file and insecure-skip-verify config options so operators can override behaviour when needed (e.g. when the API server cert does not include node IPs as SANs). Signed-off-by: Stephen Finucane Assisted-By: Claude Sonnet 4.6 --- .../healthcheck/plugin_endpoint.go | 31 +++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/pkg/autohealing/healthcheck/plugin_endpoint.go b/pkg/autohealing/healthcheck/plugin_endpoint.go index 4ce065fb12..1e41a4d51b 100644 --- a/pkg/autohealing/healthcheck/plugin_endpoint.go +++ b/pkg/autohealing/healthcheck/plugin_endpoint.go @@ -19,6 +19,7 @@ package healthcheck import ( "context" "crypto/tls" + "crypto/x509" "fmt" "net/http" "os" @@ -33,6 +34,7 @@ import ( const ( EndpointType = "Endpoint" TokenPath = "/var/run/secrets/kubernetes.io/serviceaccount/token" + CAPath = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" TimeLayout = "2006-01-02 15:04:05" ) @@ -60,6 +62,13 @@ type EndpointCheck struct { // (Optional) Token to use in the request header. Default: read from TokenPath file Token string `mapstructure:"token"` + + // (Optional) Path to CA certificate file for TLS server verification. Only used when Protocol is HTTPS. + // Default: read from CAPath (the in-cluster Kubernetes CA). + CAFile string `mapstructure:"ca-file"` + + // (Optional) Skip TLS certificate verification. Not recommended for production. Default: false. 
+ InsecureSkipVerify bool `mapstructure:"insecure-skip-verify"` } // GetName returns name of the health check @@ -140,9 +149,27 @@ func (check *EndpointCheck) Check(ctx context.Context, node NodeInfo, controller protocol := strings.ToLower(check.Protocol) switch protocol { case "https": - tr := &http.Transport{ - TLSClientConfig: &tls.Config{InsecureSkipVerify: true}, + tlsConfig := &tls.Config{} + if check.InsecureSkipVerify { + tlsConfig.InsecureSkipVerify = true + } else { + caFile := check.CAFile + if caFile == "" { + caFile = CAPath + } + caCert, err := os.ReadFile(caFile) + if err != nil { + log.Errorf("Node %s, failed to read CA certificate from %s: %v", nodeName, caFile, err) + return true + } + pool := x509.NewCertPool() + if !pool.AppendCertsFromPEM(caCert) { + log.Errorf("Node %s, failed to parse CA certificate from %s", nodeName, caFile) + return true + } + tlsConfig.RootCAs = pool } + tr := &http.Transport{TLSClientConfig: tlsConfig} client = &http.Client{Transport: tr, Timeout: time.Second * 5} case "http": client = &http.Client{Timeout: time.Second * 5} From 7499e11e315a3c209fd634a2a43c53af4f17a1d0 Mon Sep 17 00:00:00 2001 From: Stephen Finucane Date: Thu, 7 May 2026 14:05:35 +0100 Subject: [PATCH 2/2] docs: document Endpoint health check parameters in magnum-auto-healer guide Add a parameter reference table covering all Endpoint check options, including the newly added ca-file and insecure-skip-verify fields. 
Signed-off-by: Stephen Finucane --- .../using-magnum-auto-healer.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/docs/magnum-auto-healer/using-magnum-auto-healer.md b/docs/magnum-auto-healer/using-magnum-auto-healer.md index a6538a7858..def6aa438c 100644 --- a/docs/magnum-auto-healer/using-magnum-auto-healer.md +++ b/docs/magnum-auto-healer/using-magnum-auto-healer.md @@ -183,6 +183,23 @@ spec: EOF ``` +#### Endpoint health check parameters + +The `Endpoint` type health check supports the following parameters: + +| Parameter | Type | Default | Description | +|---|---|---|---| +| `protocol` | string | `HTTPS` | URL scheme to use. `HTTP` or `HTTPS` (case-insensitive). | +| `port` | int | `6443` | Port to connect to on the node. | +| `endpoints` | []string | `["/healthz"]` | List of URL paths to check. | +| `ok-codes` | []int | `[200]` | HTTP response codes considered healthy. | +| `unhealthy-duration` | duration | `300s` | How long a node must be continuously unhealthy before repair is triggered. | +| `unhealthy-annotation` | string | `autohealing.openstack.org/unhealthy-timestamp` | Node annotation used to record when the node first became unhealthy. | +| `require-token` | bool | `false` | Whether to include a bearer token in the request. | +| `token` | string | read from `/var/run/secrets/kubernetes.io/serviceaccount/token` | Bearer token value. Only used when `require-token` is `true`. | +| `ca-file` | string | `/var/run/secrets/kubernetes.io/serviceaccount/ca.crt` | Path to a CA certificate file used to verify the server's TLS certificate. Only used when `protocol` is `HTTPS` and `insecure-skip-verify` is `false`. | +| `insecure-skip-verify` | bool | `false` | Skip TLS certificate verification entirely; when `true`, `ca-file` is ignored. Only applies when `protocol` is `HTTPS`. Not recommended for production use. | + ### Testing magnum-auto-healer We could ssh into a worker node(`lingxian-por-test-1-12-7-ha-bbgjts5g4xhb-minion-1` in this example) and stop the kubelet service to simulate the worker node failure.
The node status check is covered in NodeCondition type of health check plugin(see configuration above).