openstack-k8s-operators
diff --git a/‎apis/bases/instanceha.openstack.org_instancehas.yaml‎
Lines changed: 12 additions & 0 deletions b/‎apis/bases/instanceha.openstack.org_instancehas.yaml‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎apis/instanceha/v1beta1/instanceha_types.go‎
Lines changed: 5 additions & 0 deletions b/‎apis/instanceha/v1beta1/instanceha_types.go‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎apis/instanceha/v1beta1/zz_generated.deepcopy.go‎
Lines changed: 1 addition & 0 deletions b/‎apis/instanceha/v1beta1/zz_generated.deepcopy.go‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎config/crd/bases/instanceha.openstack.org_instancehas.yaml‎
Lines changed: 12 additions & 0 deletions b/‎config/crd/bases/instanceha.openstack.org_instancehas.yaml‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎docs/instanceha_guide.md‎
Lines changed: 3 additions & 1 deletion b/‎docs/instanceha_guide.md‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎docs/instanceha_prometheus.md‎
Lines changed: 53 additions & 31 deletions b/‎docs/instanceha_prometheus.md‎
Lines changed: 53 additions & 31 deletions
diff --git a/‎internal/controller/instanceha/instanceha_controller.go‎
Lines changed: 71 additions & 2 deletions b/‎internal/controller/instanceha/instanceha_controller.go‎
Lines changed: 71 additions & 2 deletions
diff --git a/‎internal/instanceha/const.go‎
Lines changed: 10 additions & 0 deletions b/‎internal/instanceha/const.go‎
Lines changed: 10 additions & 0 deletions
@@ -100,6 +100,18 @@ spec:
                 default: 7410
                 format: int32
                 type: integer
+              metricsTLS:
+                description: MetricsTLS - Parameters related to TLS for the metrics
+                  endpoint
+                properties:
+                  caBundleSecretName:
+                    description: CaBundleSecretName - holding the CA certs in a pre-created
+                      bundle file
+                    type: string
+                  secretName:
+                    description: SecretName - holding the cert, key for the service
+                    type: string
+                type: object
               networkAttachments:
                 description: |-
                   NetworkAttachments is a list of NetworkAttachment resource names to expose
 
@@ -115,6 +115,11 @@ type InstanceHaSpec struct {
 	// +kubebuilder:validation:Optional
 	// Auth - Parameters related to authentication
 	Auth AuthSpec `json:"auth,omitempty"`
+
+	// +kubebuilder:validation:Optional
+	//+operator-sdk:csv:customresourcedefinitions:type=spec
+	// MetricsTLS - Parameters related to TLS for the metrics endpoint
+	MetricsTLS tls.SimpleService `json:"metricsTLS,omitempty"`
 }
 
 // InstanceHaStatus defines the observed state of InstanceHa
 
@@ -100,6 +100,18 @@ spec:
                 default: 7410
                 format: int32
                 type: integer
+              metricsTLS:
+                description: MetricsTLS - Parameters related to TLS for the metrics
+                  endpoint
+                properties:
+                  caBundleSecretName:
+                    description: CaBundleSecretName - holding the CA certs in a pre-created
+                      bundle file
+                    type: string
+                  secretName:
+                    description: SecretName - holding the cert, key for the service
+                    type: string
+                type: object
               networkAttachments:
                 description: |-
                   NetworkAttachments is a list of NetworkAttachment resource names to expose
 
@@ -190,7 +190,9 @@ groups:
 
 #### Scraping Configuration
 
-The InstanceHA pod exposes metrics on TCP port 8080. To scrape with Prometheus, create a `PodMonitor` or `ServiceMonitor`:
+The InstanceHA pod exposes metrics on TCP port 8080. The infra-operator automatically creates a Kubernetes Service (`<instance-name>-metrics`) with the labels `metrics: enabled` and `service: instanceha`, which the telemetry-operator discovers and scrapes via the COO Prometheus. **No manual configuration is needed when the telemetry-operator is deployed.**
+
+For environments using OpenShift user workload monitoring instead of (or in addition to) the telemetry-operator, create a `PodMonitor`:
 
 ```yaml
 apiVersion: monitoring.coreos.com/v1
 
@@ -6,6 +6,8 @@ InstanceHA exposes Prometheus metrics at `:8080/metrics` on the workload pod, co
 
 The metrics are served by the `prometheus_client` Python library on the same HTTP server used for liveness and readiness probes. No sidecar or additional container is needed.
 
+When pod-level TLS is enabled, the metrics endpoint serves over **HTTPS**. The openstack-operator creates a cert-manager Certificate producing a TLS secret (`cert-instanceha-metrics`), which the infra-operator mounts into the pod. The Python HTTP server wraps its socket with TLS automatically when the certificate files are present.
+
 ---
 
 ## Prerequisites
@@ -17,6 +19,34 @@ The metrics are served by the `prometheus_client` Python library on the same HTT
 
 ---
 
+## TLS Configuration
+
+When `OpenStackControlPlane` has pod-level TLS enabled (`spec.tls.podLevel.enabled: true`), the openstack-operator automatically provisions a cert-manager Certificate for the InstanceHA metrics endpoint. This produces a Kubernetes TLS secret (`cert-instanceha-metrics`) containing `tls.crt`, `tls.key`, and `ca.crt`.
+
+The infra-operator InstanceHA controller **auto-detects** this secret: if the default secret `cert-instanceha-metrics` exists in the namespace, TLS is enabled automatically without any configuration on the InstanceHa CR. The controller:
+1. Validates the TLS secret exists and is well-formed
+2. Mounts the certificate at `/etc/pki/tls/certs/metrics.crt` and the key at `/etc/pki/tls/private/metrics.key`
+3. Sets `METRICS_TLS_CERT` and `METRICS_TLS_KEY` environment variables
+4. Switches liveness and readiness probes to HTTPS
+
+The Python process detects these environment variables and wraps the HTTP server socket with TLS. A single wildcard certificate (`*.NAMESPACE.svc`) covers all InstanceHA instances in a namespace.
+
+To use a custom TLS secret instead of the auto-detected default, set `metricsTLS.secretName` in the InstanceHa CR:
+
+```yaml
+apiVersion: instanceha.openstack.org/v1beta1
+kind: InstanceHa
+metadata:
+  name: instanceha
+spec:
+  metricsTLS:
+    secretName: my-custom-metrics-cert
+```
+
+When the telemetry-operator is deployed, its `ScrapeConfig` automatically switches to `scheme: HTTPS` with the appropriate TLS configuration when `PrometheusTLS` is enabled — no manual changes are needed.
+
+---
+
 ## Enabling Scraping
 
 ### Step 1: Deploy a PodMonitor
@@ -95,8 +125,9 @@ curl -sk -H "Authorization: Bearer $TOKEN" \
 
 ```bash
 # Scrape metrics directly from the pod
+# Assumes TLS is enabled; without TLS use: curl -s http://localhost:8080/metrics
 oc exec -n openstack deployment/instanceha-instanceha -- \
-  curl -s http://localhost:8080/metrics
+  curl -sk https://localhost:8080/metrics
 
 # Query a specific metric in Prometheus
 # (via Prometheus UI or API)
@@ -299,13 +330,17 @@ promtool check rules instanceha-prometheusrule.yaml
 ### Verify Metrics Endpoint
 
 ```bash
-# Scrape all metrics from the pod
+# Scrape all metrics from the pod (HTTP, when TLS is not enabled)
 oc exec -n openstack deployment/instanceha-instanceha -- \
   curl -s http://localhost:8080/metrics
 
+# When TLS is enabled, use HTTPS with -k to skip certificate verification
+oc exec -n openstack deployment/instanceha-instanceha -- \
+  curl -sk https://localhost:8080/metrics
+
 # Check a specific metric family
 oc exec -n openstack deployment/instanceha-instanceha -- \
-  curl -s http://localhost:8080/metrics | grep instanceha_poll_cycles_total
+  curl -sk https://localhost:8080/metrics | grep instanceha_poll_cycles_total
 ```
 
 Expected output (counters start at zero, increment over time):
@@ -319,16 +354,16 @@ instanceha_poll_cycles_total{result="error"} 0.0
 
 ### Verify Poll Loop Metrics
 
-After the pod has been running for a few poll cycles:
+After the pod has been running for a few poll cycles (the commands below assume TLS is enabled; without TLS, use `curl -s http://localhost:8080/metrics` instead):
 
 ```bash
 # Should show increasing success count
 oc exec -n openstack deployment/instanceha-instanceha -- \
-  curl -s http://localhost:8080/metrics | grep poll_cycles
+  curl -sk https://localhost:8080/metrics | grep poll_cycles
 
 # Should show 0 consecutive failures (healthy state)
 oc exec -n openstack deployment/instanceha-instanceha -- \
-  curl -s http://localhost:8080/metrics | grep poll_consecutive_failures
+  curl -sk https://localhost:8080/metrics | grep poll_consecutive_failures
 ```
 
 ### Simulate a Nova API Failure
@@ -353,8 +388,9 @@ Fencing and evacuation counters only increment during actual host failures. To v
 
 ```bash
 # Check that the metric families are registered (even if values are 0)
+# Assumes TLS is enabled; without TLS use: curl -s http://localhost:8080/metrics
 oc exec -n openstack deployment/instanceha-instanceha -- \
-  curl -s http://localhost:8080/metrics | grep "^instanceha_" | grep "# TYPE"
+  curl -sk https://localhost:8080/metrics | grep "^instanceha_" | grep "# TYPE"
 ```
 
 Expected output:
@@ -482,37 +518,23 @@ When the [telemetry-operator](https://github.com/openstack-k8s-operators/telemet
 | OpenShift user workload monitoring | `prometheus-user-workload` in `openshift-user-workload-monitoring` | `thanos-querier` route in `openshift-monitoring` |
 | telemetry-operator (COO) | `prometheus-metric-storage` in `openstack` | `metric-storage-prometheus.openstack.svc:9090` |
 
-The PodMonitor approach described above places InstanceHA metrics in the OpenShift user workload Prometheus. If you want InstanceHA metrics alongside other OpenStack metrics (Ceilometer, RabbitMQ, node-exporter, OVN) in the COO Prometheus, create a `ScrapeConfig` CR instead.
+### Automatic Discovery (default)
 
-### Creating a ScrapeConfig for COO Prometheus
+The telemetry-operator **automatically discovers and scrapes InstanceHA metrics** — no manual configuration is required. The infra-operator creates a Kubernetes Service (`<instance-name>-metrics`) with the labels `metrics: enabled` and `service: instanceha`. The telemetry-operator's `MetricStorage` controller watches for Services with these labels and automatically generates a `ScrapeConfig` CR named `telemetry-instanceha` targeting port 8080.
 
-The COO Prometheus only picks up CRs with the label `service: metricStorage`. Create a `ScrapeConfig` targeting the InstanceHA pod:
+This works the same way as the OVN metrics integration. When a `MetricStorage` CR exists in the namespace:
 
-```yaml
-apiVersion: monitoring.rhobs/v1alpha1
-kind: ScrapeConfig
-metadata:
-  name: instanceha-metrics
-  namespace: openstack
-  labels:
-    service: metricStorage
-spec:
-  scrapeInterval: 30s
-  metricsPath: /metrics
-  staticConfigs:
-    - targets:
-        - "<instanceha-pod-ip>:8080"
-```
+1. The telemetry-operator discovers the InstanceHA metrics Service via label selectors
+2. A `ScrapeConfig` CR is created with the target `<service-name>.<namespace>.svc:8080`
+3. The COO Prometheus picks up the `ScrapeConfig` and begins scraping
+4. If the InstanceHA Service is deleted or recreated, the `ScrapeConfig` is automatically reconciled
 
-To discover the pod IP dynamically:
+To verify that the `ScrapeConfig` was created automatically:
 
 ```bash
-POD_IP=$(oc get pod -n openstack -l service=instanceha -o jsonpath='{.items[0].status.podIP}')
-echo "Target: ${POD_IP}:8080"
+oc get scrapeconfig -n openstack telemetry-instanceha -o yaml
 ```
 
-> **Note**: The COO `ScrapeConfig` uses static targets (IP:port), not label-based pod discovery like a `PodMonitor`. If the InstanceHA pod is rescheduled and gets a new IP, the `ScrapeConfig` must be updated. For automatic discovery, consider requesting native InstanceHA support in the telemetry-operator — the OVN metrics integration uses a label-based service discovery pattern that could be extended to InstanceHA.
-
 ### Alert Rules for COO Prometheus
 
 The alert rules from the [Alert Rules](#alert-rules) section use the `monitoring.coreos.com/v1` API group, which is picked up by OpenShift's built-in Prometheus Operator. To use these alerts with the COO Prometheus instead, change the API group and add the `service: metricStorage` label:
@@ -532,7 +554,7 @@ spec:
 ### Which Approach to Use
 
 - **OpenShift user workload monitoring only** (no telemetry-operator): Use the PodMonitor approach from [Enabling Scraping](#enabling-scraping). This is simpler and uses automatic pod discovery.
-- **telemetry-operator deployed**: Use the ScrapeConfig approach if you want all OpenStack metrics in a single Prometheus. You can also use both approaches simultaneously — the PodMonitor and ScrapeConfig target different Prometheus instances and do not conflict.
+- **telemetry-operator deployed** (default): InstanceHA metrics are automatically scraped by the COO Prometheus alongside other OpenStack metrics (Ceilometer, RabbitMQ, node-exporter, OVN). No manual configuration is needed. You can also deploy the PodMonitor simultaneously — it targets the OpenShift user workload Prometheus and does not conflict with the automatically generated `ScrapeConfig`.
 - **Querying across both**: OpenShift's `thanos-querier` route aggregates the cluster and user workload Prometheus instances. The COO Prometheus is separate and must be queried directly at `metric-storage-prometheus.openstack.svc:9090`.
 
 ---
 
@@ -55,6 +55,7 @@ import (
 
 	commondeployment "github.com/openstack-k8s-operators/lib-common/modules/common/deployment"
 	"github.com/openstack-k8s-operators/lib-common/modules/common/secret"
+	commonservice "github.com/openstack-k8s-operators/lib-common/modules/common/service"
 	"github.com/openstack-k8s-operators/lib-common/modules/common/util"
 
 	networkv1 "github.com/k8snetworkplumbingwg/network-attachment-definition-client/pkg/apis/k8s.cni.cncf.io/v1"
@@ -80,6 +81,7 @@ func (r *Reconciler) GetLogger(ctx context.Context) logr.Logger {
 // +kubebuilder:rbac:groups=instanceha.openstack.org,resources=instancehas/finalizers,verbs=update;patch
 // +kubebuilder:rbac:groups=core,resources=configmaps,verbs=get;list;watch;
 // +kubebuilder:rbac:groups=core,resources=secrets,verbs=get;list;watch;
+// +kubebuilder:rbac:groups=core,resources=services,verbs=get;list;watch;create;update;patch;delete
 // +kubebuilder:rbac:groups=k8s.cni.cncf.io,resources=network-attachment-definitions,verbs=get;list;watch
 // service account, role, rolebinding
 // +kubebuilder:rbac:groups="",resources=serviceaccounts,verbs=get;list;watch;create;update;patch
@@ -164,6 +166,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ct
 		condition.UnknownCondition(condition.RoleReadyCondition, condition.InitReason, condition.RoleReadyInitMessage),
 		condition.UnknownCondition(condition.RoleBindingReadyCondition, condition.InitReason, condition.RoleBindingReadyInitMessage),
 		condition.UnknownCondition(condition.NetworkAttachmentsReadyCondition, condition.InitReason, condition.NetworkAttachmentsReadyInitMessage),
+		condition.UnknownCondition(condition.CreateServiceReadyCondition, condition.InitReason, condition.CreateServiceReadyInitMessage),
 	)
 	instance.Status.Conditions.Init(&cl)
 	instance.Status.ObservedGeneration = instance.Generation
@@ -369,8 +372,6 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ct
 		)
 		if err != nil {
 			if k8s_errors.IsNotFound(err) {
-				// Since the CA cert secret should have been manually created by the user and provided in the spec,
-				// we treat this as a warning because it means that the service will not be able to start.
 				instance.Status.Conditions.Set(condition.FalseCondition(
 					condition.TLSInputReadyCondition,
 					condition.ErrorReason,
@@ -390,6 +391,38 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ct
 		configVars[instance.Spec.CaBundleSecretName] = env.SetValue(secretHash)
 	}
 
+	metricsTLSExplicit := instance.Spec.MetricsTLS.Enabled()
+	if !metricsTLSExplicit {
+		certName := instanceha.DefaultMetricsCertSecret
+		instance.Spec.MetricsTLS.SecretName = &certName
+	}
+
+	hash, err := instance.Spec.MetricsTLS.ValidateCertSecret(ctx, helper, instance.Namespace)
+	if err != nil {
+		if k8s_errors.IsNotFound(err) {
+			if metricsTLSExplicit {
+				instance.Status.Conditions.Set(condition.FalseCondition(
+					condition.TLSInputReadyCondition,
+					condition.RequestedReason,
+					condition.SeverityInfo,
+					condition.TLSInputReadyWaitingMessage, err.Error()))
+				return ctrl.Result{}, nil
+			}
+			// Auto-detect: default cert not found, proceed without TLS
+			instance.Spec.MetricsTLS.SecretName = nil
+		} else {
+			instance.Status.Conditions.Set(condition.FalseCondition(
+				condition.TLSInputReadyCondition,
+				condition.ErrorReason,
+				condition.SeverityWarning,
+				condition.TLSInputErrorMessage,
+				err.Error()))
+			return ctrl.Result{}, err
+		}
+	} else {
+		configVars[tls.TLSHashName+"_metrics"] = env.SetValue(hash)
+	}
+
 	// all cert input checks out so report InputReady
 	instance.Status.Conditions.MarkTrue(condition.TLSInputReadyCondition, condition.InputReadyMessage)
 
@@ -505,6 +538,28 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ct
 		// remove LastAppliedTopology from the .Status
 		instance.Status.LastAppliedTopology = nil
 	}
+	commonsvc, err := commonservice.NewService(instanceha.MetricsService(instance), time.Duration(5)*time.Second, nil)
+	if err != nil {
+		instance.Status.Conditions.Set(condition.FalseCondition(
+			condition.CreateServiceReadyCondition,
+			condition.ErrorReason,
+			condition.SeverityWarning,
+			condition.CreateServiceReadyErrorMessage,
+			err.Error()))
+		return ctrl.Result{}, err
+	}
+	sres, serr := commonsvc.CreateOrPatch(ctx, helper)
+	if serr != nil {
+		instance.Status.Conditions.Set(condition.FalseCondition(
+			condition.CreateServiceReadyCondition,
+			condition.ErrorReason,
+			condition.SeverityWarning,
+			condition.CreateServiceReadyErrorMessage,
+			serr.Error()))
+		return sres, serr
+	}
+	instance.Status.Conditions.MarkTrue(condition.CreateServiceReadyCondition, condition.CreateServiceReadyMessage)
+
 	deployment := commondeployment.NewDeployment(instanceha.Deployment(instance, deploymentLabels, serviceAnnotations, cloud, configVarsHash, containerImage, topology, acSecretName), time.Duration(5)*time.Second)
 	sfres, sferr := deployment.CreateOrPatch(ctx, helper)
 	if sferr != nil {
@@ -558,6 +613,7 @@ const (
 	instanceHaConfigMapField   = ".spec.instanceHaConfigMap"
 	topologyField              = ".spec.topologyRef.Name"
 	acSecretField              = ".spec.auth.applicationCredentialSecret" // #nosec G101
+	metricsTLSField            = ".spec.metricsTLS.secretName"            // #nosec G101
 )
 
 var allWatchFields = []string{
@@ -568,6 +624,7 @@ var allWatchFields = []string{
 	instanceHaConfigMapField,
 	topologyField,
 	acSecretField,
+	metricsTLSField,
 }
 
 // SetupWithManager sets up the controller with the Manager.
@@ -649,9 +706,21 @@ func (r *Reconciler) SetupWithManager(mgr ctrl.Manager) error {
 		return err
 	}
 
+	// index metricsTLSField
+	if err := mgr.GetFieldIndexer().IndexField(context.Background(), &instancehav1.InstanceHa{}, metricsTLSField, func(rawObj client.Object) []string {
+		cr := rawObj.(*instancehav1.InstanceHa)
+		if cr.Spec.MetricsTLS.SecretName == nil {
+			return []string{instanceha.DefaultMetricsCertSecret}
+		}
+		return []string{*cr.Spec.MetricsTLS.SecretName}
+	}); err != nil {
+		return err
+	}
+
 	return ctrl.NewControllerManagedBy(mgr).
 		For(&instancehav1.InstanceHa{}).
 		Owns(&appsv1.Deployment{}).
+		Owns(&corev1.Service{}).
 		Owns(&corev1.ServiceAccount{}).
 		Owns(&rbacv1.Role{}).
 		Owns(&rbacv1.RoleBinding{}).
 
@@ -0,0 +1,10 @@
+package instanceha
+
+const (
+	// MetricsCertPath is the in-pod path where the metrics TLS certificate is mounted
+	MetricsCertPath = "/etc/pki/tls/certs/metrics.crt"
+	// MetricsKeyPath is the in-pod path where the metrics TLS private key is mounted
+	MetricsKeyPath = "/etc/pki/tls/private/metrics.key"
+	// DefaultMetricsCertSecret is the metrics TLS secret name used when spec.metricsTLS.secretName is unset
+	DefaultMetricsCertSecret = "cert-instanceha-metrics" //nolint:gosec
+)