Tune API probes

gibizer · openshift-merge-bot[bot] · commit ed0bef54a7ad · 2026-03-12T11:00:01.000Z
While investigating API scaling issues we noticed that our hard-coded
probe configuration is not optimal for scaling nova-api. Instead of
immediately killing pods when they do not respond within 30 seconds, we
should first remove pods from the load balancer when they are getting
overloaded, let them work through their backlog, and only kill a pod if
it hangs for an excessive amount of time.

Another observation was that we allow configuring the APITimeout
parameter on our routes, but changing that value is not reflected in our
probe configs. So even if the customer decides that it is OK for nova-api
to respond more slowly by increasing the APITimeout, our probes do not
become more forgiving.

This patch changes the probe configuration of nova-api and nova-metadata
to:
* be quick to remove the pod from the load balancer if it is overloaded,
  via the readiness probe config
* be very forgiving about slow responses and only kill the pod if it
  is hanging for a long time, via the liveness probe config
* scale both the readiness and liveness probe timeouts with the
  APITimeout configuration.

Jira: OSPRH-25717
Jira: OSPRH-27192

Signed-off-by: Balazs Gibizer <gibi@redhat.com>
diff --git a/internal/novaapi/deployment.go b/internal/novaapi/deployment.go
@@ -17,6 +17,8 @@ limitations under the License.
 package novaapi
 
 import (
+	"math"
+
 	memcachedv1 "github.com/openstack-k8s-operators/infra-operator/apis/memcached/v1beta1"
 	topologyv1 "github.com/openstack-k8s-operators/infra-operator/apis/topology/v1beta1"
 	common "github.com/openstack-k8s-operators/lib-common/modules/common"
@@ -49,16 +51,31 @@ func StatefulSet(
 		FailureThreshold: 6,
 		PeriodSeconds:    10,
 	}
-	// After the first successful startupProbe, livenessProbe takes over
-	livenessProbe := &corev1.Probe{
-		// TODO might need tuning
-		TimeoutSeconds: 30,
-		PeriodSeconds:  30,
-	}
+	// After the first successful startupProbe, livenessProbe takes over.
+
+	// Set up the readiness probe to detect overload by scheduling
+	// frequent API calls and detect even within the APITimeout period
+	// if API requests are being queued up on the pod. If so the readiness
+	// probe will fail and k8s will pull the pod out from the load balancer
+	// letting it work through the queued work before it gets new requests
+	// forwarded to it.
+	t := float64(instance.Spec.APITimeout)
 	readinessProbe := &corev1.Probe{
-		// TODO might need tuning
-		TimeoutSeconds: 30,
-		PeriodSeconds:  30,
+		TimeoutSeconds:   int32(math.Floor(0.3 * t)),
+		PeriodSeconds:    int32(math.Floor(0.3 * t)),
+		FailureThreshold: 3,
+	}
+	// Set up the liveness probe to be a lot more forgiving than the
+	// readiness probe, so that overload does not directly trigger a pod
+	// restart and the probe only fails if the pod hangs for a long time.
+	// Eventually we want to have a way to assess the health of the pod
+	// without directly calling its API and use a dedicated health check
+	// endpoint instead. But it needs upstream nova work first in
+	// https://blueprints.launchpad.net/nova/+spec/per-process-healthchecks
+	livenessProbe := &corev1.Probe{
+		TimeoutSeconds:   int32(math.Floor(0.5 * t)),
+		PeriodSeconds:    int32(math.Floor(0.5 * t)),
+		FailureThreshold: 10,
 	}
 
 	args := []string{"-c", nova.KollaServiceCommand}
diff --git a/internal/novametadata/deployment.go b/internal/novametadata/deployment.go
@@ -17,6 +17,8 @@ limitations under the License.
 package novametadata
 
 import (
+	"math"
+
 	memcachedv1 "github.com/openstack-k8s-operators/infra-operator/apis/memcached/v1beta1"
 	topologyv1 "github.com/openstack-k8s-operators/infra-operator/apis/topology/v1beta1"
 	common "github.com/openstack-k8s-operators/lib-common/modules/common"
@@ -48,15 +50,30 @@ func StatefulSet(
 		PeriodSeconds:    10,
 	}
 	// After the first successful startupProbe, livenessProbe takes over
-	livenessProbe := &corev1.Probe{
-		// TODO might need tuning
-		TimeoutSeconds: 10,
-		PeriodSeconds:  10,
-	}
+
+	// Set up the readiness probe to detect overload by scheduling
+	// frequent API calls and detect even within the APITimeout period
+	// if API requests are being queued up on the pod. If so the readiness
+	// probe will fail and k8s will pull the pod out from the load balancer
+	// letting it work through the queued work before it gets new requests
+	// forwarded to it.
+	t := float64(instance.Spec.APITimeout)
 	readinessProbe := &corev1.Probe{
-		// TODO might need tuning
-		TimeoutSeconds: 5,
-		PeriodSeconds:  5,
+		TimeoutSeconds:   int32(math.Floor(0.3 * t)),
+		PeriodSeconds:    int32(math.Floor(0.3 * t)),
+		FailureThreshold: 3,
+	}
+	// Set up the liveness probe to be a lot more forgiving than the
+	// readiness probe, so that overload does not directly trigger a pod
+	// restart and the probe only fails if the pod hangs for a long time.
+	// Eventually we want to have a way to assess the health of the pod
+	// without directly calling its API and use a dedicated health check
+	// endpoint instead. But it needs upstream nova work first in
+	// https://blueprints.launchpad.net/nova/+spec/per-process-healthchecks
+	livenessProbe := &corev1.Probe{
+		TimeoutSeconds:   int32(math.Floor(0.5 * t)),
+		PeriodSeconds:    int32(math.Floor(0.5 * t)),
+		FailureThreshold: 10,
 	}
 
 	args := []string{"-c", nova.KollaServiceCommand}
diff --git a/test/functional/nova_metadata_controller_test.go b/test/functional/nova_metadata_controller_test.go
@@ -286,6 +286,13 @@ var _ = Describe("NovaMetadata controller", func() {
 				Expect(container.LivenessProbe.HTTPGet.Port.IntVal).To(Equal(int32(8775)))
 				Expect(container.ReadinessProbe.HTTPGet.Port.IntVal).To(Equal(int32(8775)))
 
+				Expect(container.ReadinessProbe.TimeoutSeconds).To(Equal(int32(18)))
+				Expect(container.ReadinessProbe.PeriodSeconds).To(Equal(int32(18)))
+				Expect(container.ReadinessProbe.FailureThreshold).To(Equal(int32(3)))
+
+				Expect(container.LivenessProbe.TimeoutSeconds).To(Equal(int32(30)))
+				Expect(container.LivenessProbe.PeriodSeconds).To(Equal(int32(30)))
+				Expect(container.LivenessProbe.FailureThreshold).To(Equal(int32(10)))
 			})
 
 			When("the StatefulSet has at least one Replica ready", func() {
diff --git a/test/functional/novaapi_controller_test.go b/test/functional/novaapi_controller_test.go
@@ -426,6 +426,13 @@ endpoint_service_type = compute`))
 			Expect(container.LivenessProbe.HTTPGet.Port.IntVal).To(Equal(int32(8774)))
 			Expect(container.ReadinessProbe.HTTPGet.Port.IntVal).To(Equal(int32(8774)))
 
+			Expect(container.ReadinessProbe.TimeoutSeconds).To(Equal(int32(18)))
+			Expect(container.ReadinessProbe.PeriodSeconds).To(Equal(int32(18)))
+			Expect(container.ReadinessProbe.FailureThreshold).To(Equal(int32(3)))
+
+			Expect(container.LivenessProbe.TimeoutSeconds).To(Equal(int32(30)))
+			Expect(container.LivenessProbe.PeriodSeconds).To(Equal(int32(30)))
+			Expect(container.LivenessProbe.FailureThreshold).To(Equal(int32(10)))
 		})
 
 		When("the StatefulSet has at least one Replica ready", func() {