diff --git a/deploy/releases/v0.0.1/deployment.yaml b/deploy/releases/v0.0.1/deployment.yaml index e106b26..b31a800 100644 --- a/deploy/releases/v0.0.1/deployment.yaml +++ b/deploy/releases/v0.0.1/deployment.yaml @@ -43,17 +43,17 @@ spec: livenessProbe: httpGet: path: /healthz - port: 10268 - scheme: HTTP + port: 10258 + scheme: HTTPS initialDelaySeconds: 15 failureThreshold: 60 periodSeconds: 60 readinessProbe: httpGet: - path: /readyz - port: 10268 - scheme: HTTP - initialDelaySeconds: 5 + # The ccm only exposes healthz. Should work fine for indicating a functional state + path: /healthz + port: 10258 + scheme: HTTPS periodSeconds: 60 resources: requests: diff --git a/go.mod b/go.mod index 80d2acc..7bbfc3a 100644 --- a/go.mod +++ b/go.mod @@ -17,6 +17,7 @@ require ( k8s.io/client-go v0.34.1 k8s.io/cloud-provider v0.34.1 k8s.io/component-base v0.34.1 + k8s.io/controller-manager v0.34.1 k8s.io/klog/v2 v2.130.1 sigs.k8s.io/controller-runtime v0.22.4 ) @@ -160,7 +161,6 @@ require ( gopkg.in/yaml.v3 v3.0.1 // indirect k8s.io/apiserver v0.34.1 // indirect k8s.io/component-helpers v0.34.1 // indirect - k8s.io/controller-manager v0.34.1 // indirect k8s.io/kms v0.34.1 // indirect k8s.io/kube-openapi v0.0.0-20250710124328-f3f2b991d03b // indirect k8s.io/utils v0.0.0-20250604170112-4c0f3b243397 // indirect diff --git a/main.go b/main.go index 1c3a527..0641799 100644 --- a/main.go +++ b/main.go @@ -1,6 +1,7 @@ package main import ( + "context" "flag" "io" "os" @@ -8,19 +9,21 @@ import ( "k8s.io/apimachinery/pkg/util/wait" cloudprovider "k8s.io/cloud-provider" "k8s.io/cloud-provider/app" + cloudcontrollerconfig "k8s.io/cloud-provider/app/config" "k8s.io/cloud-provider/names" "k8s.io/cloud-provider/options" - cloudcontrollerconfig "k8s.io/cloud-provider/app/config" - "k8s.io/component-base/cli" cliflag "k8s.io/component-base/cli/flag" _ "k8s.io/component-base/metrics/prometheus/clientgo" // load all the prometheus client-go plugins _ "k8s.io/component-base/metrics/prometheus/version" // for version metric registration + genericcontrollermanager "k8s.io/controller-manager/app" + "k8s.io/controller-manager/controller" "k8s.io/klog/v2" "github.com/metal-stack/metal-ccm/metal" + "github.com/metal-stack/metal-ccm/pkg/controllers/health" "github.com/metal-stack/metal-ccm/pkg/resources/constants" "github.com/metal-stack/v" "github.com/spf13/pflag" @@ -34,6 +37,21 @@ func main() { opts.KubeCloudShared.CloudProvider.Name = constants.ProviderName controllerInitializers := app.DefaultInitFuncConstructors + controllerInitializers["metal-health"] = app.ControllerInitFuncConstructor{ + InitContext: app.ControllerInitContext{ + ClientName: "metal-health-controller", + }, + Constructor: func(initContext app.ControllerInitContext, completedConfig *cloudcontrollerconfig.CompletedConfig, cloud cloudprovider.Interface) app.InitFunc { + return func(ctx context.Context, controllerContext genericcontrollermanager.ControllerContext) (controller.Interface, bool, error) { + m, err := metal.NewMetalClient() + if err != nil { + return nil, false, err + } + + return health.New(m), true, nil + } + }, + } fss := cliflag.NamedFlagSets{ NormalizeNameFunc: cliflag.WordSepNormalizeFunc, } diff --git a/metal/cloud.go b/metal/cloud.go index a8aaf99..eb2e725 100644 --- a/metal/cloud.go +++ b/metal/cloud.go @@ -40,14 +40,13 @@ type cloud struct { } func NewCloud(_ io.Reader) (cloudprovider.Interface, error) { - url := os.Getenv(constants.MetalAPIUrlEnvVar) - token := os.Getenv(constants.MetalAuthTokenEnvVar) - hmac := os.Getenv(constants.MetalAuthHMACEnvVar) - hmacAuthType := os.Getenv(constants.MetalAuthHMACAuthTypeEnvVar) - projectID := os.Getenv(constants.MetalProjectIDEnvVar) - partitionID := os.Getenv(constants.MetalPartitionIDEnvVar) - clusterID := os.Getenv(constants.MetalClusterIDEnvVar) - defaultExternalNetworkID := os.Getenv(constants.MetalDefaultExternalNetworkEnvVar) + var ( + projectID = os.Getenv(constants.MetalProjectIDEnvVar) + partitionID = os.Getenv(constants.MetalPartitionIDEnvVar) + clusterID = os.Getenv(constants.MetalClusterIDEnvVar) + defaultExternalNetworkID = os.Getenv(constants.MetalDefaultExternalNetworkEnvVar) + ) + loadbalancerType, err := config.LoadBalancerTypeFromString(os.Getenv(constants.Loadbalancer)) if err != nil { return nil, err @@ -76,21 +75,9 @@ func NewCloud(_ io.Reader) (cloudprovider.Interface, error) { return nil, fmt.Errorf("environment variable %q is required", constants.MetalClusterIDEnvVar) } - if url == "" { - return nil, fmt.Errorf("environment variable %q is required", constants.MetalAPIUrlEnvVar) - } - - if (token == "") == (hmac == "") { - return nil, fmt.Errorf("environment variable %q or %q is required", constants.MetalAuthTokenEnvVar, constants.MetalAuthHMACEnvVar) - } - - if hmacAuthType == "" { - hmacAuthType = "Metal-Admin" - } - - metalclient, err = metalgo.NewDriver(url, token, hmac, metalgo.AuthType(hmacAuthType)) + metalclient, err = NewMetalClient() if err != nil { - return nil, fmt.Errorf("unable to initialize metal ccm:%w", err) + return nil, err } resp, err := metalclient.Health().Health(nil, nil) @@ -113,6 +100,34 @@ func NewCloud(_ io.Reader) (cloudprovider.Interface, error) { }, nil } +func NewMetalClient() (metalgo.Client, error) { + var ( + url = os.Getenv(constants.MetalAPIUrlEnvVar) + token = os.Getenv(constants.MetalAuthTokenEnvVar) + hmac = os.Getenv(constants.MetalAuthHMACEnvVar) + hmacAuthType = os.Getenv(constants.MetalAuthHMACAuthTypeEnvVar) + ) + + if url == "" { + return nil, fmt.Errorf("environment variable %q is required", constants.MetalAPIUrlEnvVar) + } + + if (token == "") == (hmac == "") { + return nil, fmt.Errorf("environment variable %q or %q is required", constants.MetalAuthTokenEnvVar, constants.MetalAuthHMACEnvVar) + } + + if hmacAuthType == "" { + hmacAuthType = "Metal-Admin" + } + + m, err := metalgo.NewDriver(url, token, hmac, metalgo.AuthType(hmacAuthType)) + if err != nil { + return nil, fmt.Errorf("unable to initialize metal ccm:%w", err) + } + + return m, nil +} + // Initialize provides the cloud with a kubernetes client builder and may spawn goroutines // to perform housekeeping activities within the cloud provider. func (c *cloud) Initialize(clientBuilder cloudprovider.ControllerClientBuilder, stop <-chan struct{}) { diff --git a/pkg/controllers/health/health.go b/pkg/controllers/health/health.go new file mode 100644 index 0000000..42ca8d4 --- /dev/null +++ b/pkg/controllers/health/health.go @@ -0,0 +1,43 @@ +package health + +import ( + "fmt" + "net/http" + + metalgo "github.com/metal-stack/metal-go" + "github.com/metal-stack/metal-lib/pkg/healthstatus" + "k8s.io/controller-manager/pkg/healthz" + "k8s.io/klog/v2" +) + +// HealthCheck is a controller that exposes metal-api health via the CCM /healthz endpoint. +type HealthCheck struct { + client metalgo.Client +} + +func New(client metalgo.Client) *HealthCheck { + return &HealthCheck{client: client} +} + +func (h *HealthCheck) Name() string { + return "metal-api-health" +} + +func (h *HealthCheck) HealthChecker() healthz.UnnamedHealthChecker { + return h +} + +func (h *HealthCheck) Check(_ *http.Request) error { + resp, err := h.client.Health().Health(nil, nil) + if err != nil { + return fmt.Errorf("metal-api health endpoint not reachable: %w", err) + } + + if resp.Payload == nil || resp.Payload.Status == nil || *resp.Payload.Status != string(healthstatus.HealthStatusHealthy) { + return fmt.Errorf("metal-api not healthy") + } + + klog.Info("external health check: metal-api is healthy") + + return nil +} diff --git a/pkg/controllers/housekeeping/api_probe.go b/pkg/controllers/housekeeping/api_probe.go deleted file mode 100644 index f6c36a7..0000000 --- a/pkg/controllers/housekeeping/api_probe.go +++ /dev/null @@ -1,67 +0,0 @@ -package housekeeping - -import ( - "fmt" - "net/http" - "time" - - "github.com/metal-stack/metal-lib/pkg/healthstatus" - "k8s.io/klog/v2" -) - -const ( - probeAddr = ":10268" - probeReadTimeout = 5 * time.Second - probeWriteTimeout = 5 * time.Second -) - -// Run probe endpoint outside of controller runtime, as there is no compact way -// to share the metal-client to a second controller -// CCM restart on api failure is expected to be configured in the deployment -func (h *Housekeeper) startProbeServer() { - mux := http.NewServeMux() - - metalAPICheckHandler := func(w http.ResponseWriter, _ *http.Request) { - if err := h.checkMetalAPI(); err != nil { - http.Error(w, err.Error(), http.StatusServiceUnavailable) - return - } - if _, err := fmt.Fprintln(w, "ok"); err != nil { - klog.Error(err) - } - } - - mux.HandleFunc("/healthz", metalAPICheckHandler) - mux.HandleFunc("/readyz", metalAPICheckHandler) - - server := &http.Server{ - Addr: probeAddr, - Handler: mux, - ReadHeaderTimeout: probeReadTimeout, - WriteTimeout: probeWriteTimeout, - } - - go func() { - klog.Infof("starting probe server on %s", probeAddr) - if err := server.ListenAndServe(); err != nil && err != http.ErrServerClosed { - klog.Errorf("probe server terminated: %v", err) - } - }() -} - -func (h *Housekeeper) checkMetalAPI() error { - klog.Infof("checking metal-api health") - resp, err := h.client.Health().Health(nil, nil) - if err != nil { - klog.Errorf("metal-api health check failed: %v", err) - return err - } - - if resp.Payload != nil && resp.Payload.Status != nil && *resp.Payload.Status == string(healthstatus.HealthStatusHealthy) { - return nil - } - - err = fmt.Errorf("metal-api is not healthy with response %v: %v", *resp.Payload.Status, resp.Payload.Message) - klog.Error(err) - return err -} diff --git a/pkg/controllers/housekeeping/housekeeper.go b/pkg/controllers/housekeeping/housekeeper.go index 6bef077..24dbf52 100644 --- a/pkg/controllers/housekeeping/housekeeper.go +++ b/pkg/controllers/housekeeping/housekeeper.go @@ -47,8 +47,6 @@ func New(metalClient metalgo.Client, stop <-chan struct{}, lbController *loadbal // Run runs the housekeeper... func (h *Housekeeper) Run() error { - h.startProbeServer() - h.startTagSynching() h.startLoadBalancerConfigSynching() h.startSSHKeysSynching()