Skip to content

Commit ed0bef5

Browse files
gibizer authored and openshift-merge-bot[bot] committed
Tune API probes
While investigating API scaling issues we noticed that our hard-coded probe configuration is not optimal for scaling nova-api. Instead of immediately killing pods when they do not respond within 30 seconds, we should first remove overloaded pods from the load balancer, let them work through their backlog, and only kill a pod if it hangs for an excessive amount of time.

Another observation was that we allow configuring the APITimeout parameter on our routes, but changing that value is not reflected in our probe configs. So even if the customer decides that it is OK for nova-api to respond more slowly by increasing the APITimeout, our probes do not become more forgiving.

This patch changes the probe configuration of nova-api and nova-metadata to:
* be quick to remove the pod from the load balancer if it is overloaded, via the readiness probe config
* be very forgiving about slow responses and only kill the pod if it hangs for a long time, via the liveness probe
* scale both the readiness and liveness probe timeouts with the APITimeout configuration

Jira: OSPRH-25717
Jira: OSPRH-27192
Signed-off-by: Balazs Gibizer <gibi@redhat.com>
1 parent 673abc0 commit ed0bef5

4 files changed

Lines changed: 65 additions & 17 deletions

File tree

internal/novaapi/deployment.go

Lines changed: 26 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ limitations under the License.
1717
package novaapi
1818

1919
import (
20+
"math"
21+
2022
memcachedv1 "github.com/openstack-k8s-operators/infra-operator/apis/memcached/v1beta1"
2123
topologyv1 "github.com/openstack-k8s-operators/infra-operator/apis/topology/v1beta1"
2224
common "github.com/openstack-k8s-operators/lib-common/modules/common"
@@ -49,16 +51,31 @@ func StatefulSet(
4951
FailureThreshold: 6,
5052
PeriodSeconds: 10,
5153
}
52-
// After the first successful startupProbe, livenessProbe takes over
53-
livenessProbe := &corev1.Probe{
54-
// TODO might need tuning
55-
TimeoutSeconds: 30,
56-
PeriodSeconds: 30,
57-
}
54+
// After the first successful startupProbe, livenessProbe takes over.
55+
56+
// Set up the readiness probe to detect overload by scheduling
57+
// frequent API calls and detect even within the APITimeout period
58+
// if API requests are being queued up on the pod. If so the readiness
59+
// probe will fail and k8s will pull the pod out from the load balancer
60+
// letting it work through the queued work before it gets new requests
61+
// forwarded to it.
62+
t := float64(instance.Spec.APITimeout)
5863
readinessProbe := &corev1.Probe{
59-
// TODO might need tuning
60-
TimeoutSeconds: 30,
61-
PeriodSeconds: 30,
64+
TimeoutSeconds: int32(math.Floor(0.3 * t)),
65+
PeriodSeconds: int32(math.Floor(0.3 * t)),
66+
FailureThreshold: 3,
67+
}
68+
// Set up the liveness probe to be a lot more forgiving than readiness
69+
// not to trigger a pod restart on overload directly and only fail if the
70+
// pod hangs for a long time.
71+
// Eventually we want to have a way to assess the health of the pod
72+
// without directly calling its API and use a dedicated health check
73+
// endpoint instead. But it needs upstream nova work first in
74+
// https://blueprints.launchpad.net/nova/+spec/per-process-healthchecks
75+
livenessProbe := &corev1.Probe{
76+
TimeoutSeconds: int32(math.Floor(0.5 * t)),
77+
PeriodSeconds: int32(math.Floor(0.5 * t)),
78+
FailureThreshold: 10,
6279
}
6380

6481
args := []string{"-c", nova.KollaServiceCommand}

internal/novametadata/deployment.go

Lines changed: 25 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ limitations under the License.
1717
package novametadata
1818

1919
import (
20+
"math"
21+
2022
memcachedv1 "github.com/openstack-k8s-operators/infra-operator/apis/memcached/v1beta1"
2123
topologyv1 "github.com/openstack-k8s-operators/infra-operator/apis/topology/v1beta1"
2224
common "github.com/openstack-k8s-operators/lib-common/modules/common"
@@ -48,15 +50,30 @@ func StatefulSet(
4850
PeriodSeconds: 10,
4951
}
5052
// After the first successful startupProbe, livenessProbe takes over
51-
livenessProbe := &corev1.Probe{
52-
// TODO might need tuning
53-
TimeoutSeconds: 10,
54-
PeriodSeconds: 10,
55-
}
53+
54+
// Set up the readiness probe to detect overload by scheduling
55+
// frequent API calls and detect even within the APITimeout period
56+
// if API requests are being queued up on the pod. If so the readiness
57+
// probe will fail and k8s will pull the pod out from the load balancer
58+
// letting it work through the queued work before it gets new requests
59+
// forwarded to it.
60+
t := float64(instance.Spec.APITimeout)
5661
readinessProbe := &corev1.Probe{
57-
// TODO might need tuning
58-
TimeoutSeconds: 5,
59-
PeriodSeconds: 5,
62+
TimeoutSeconds: int32(math.Floor(0.3 * t)),
63+
PeriodSeconds: int32(math.Floor(0.3 * t)),
64+
FailureThreshold: 3,
65+
}
66+
// Set up the liveness probe to be a lot more forgiving than readiness
67+
// not to trigger a pod restart on overload directly and only fail if the
68+
// pod hangs for a long time.
69+
// Eventually we want to have a way to assess the health of the pod
70+
// without directly calling its API and use a dedicated health check
71+
// endpoint instead. But it needs upstream nova work first in
72+
// https://blueprints.launchpad.net/nova/+spec/per-process-healthchecks
73+
livenessProbe := &corev1.Probe{
74+
TimeoutSeconds: int32(math.Floor(0.5 * t)),
75+
PeriodSeconds: int32(math.Floor(0.5 * t)),
76+
FailureThreshold: 10,
6077
}
6178

6279
args := []string{"-c", nova.KollaServiceCommand}

test/functional/nova_metadata_controller_test.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -286,6 +286,13 @@ var _ = Describe("NovaMetadata controller", func() {
286286
Expect(container.LivenessProbe.HTTPGet.Port.IntVal).To(Equal(int32(8775)))
287287
Expect(container.ReadinessProbe.HTTPGet.Port.IntVal).To(Equal(int32(8775)))
288288

289+
Expect(container.ReadinessProbe.TimeoutSeconds).To(Equal(int32(18)))
290+
Expect(container.ReadinessProbe.PeriodSeconds).To(Equal(int32(18)))
291+
Expect(container.ReadinessProbe.FailureThreshold).To(Equal(int32(3)))
292+
293+
Expect(container.LivenessProbe.TimeoutSeconds).To(Equal(int32(30)))
294+
Expect(container.LivenessProbe.PeriodSeconds).To(Equal(int32(30)))
295+
Expect(container.LivenessProbe.FailureThreshold).To(Equal(int32(10)))
289296
})
290297

291298
When("the StatefulSet has at least one Replica ready", func() {

test/functional/novaapi_controller_test.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -426,6 +426,13 @@ endpoint_service_type = compute`))
426426
Expect(container.LivenessProbe.HTTPGet.Port.IntVal).To(Equal(int32(8774)))
427427
Expect(container.ReadinessProbe.HTTPGet.Port.IntVal).To(Equal(int32(8774)))
428428

429+
Expect(container.ReadinessProbe.TimeoutSeconds).To(Equal(int32(18)))
430+
Expect(container.ReadinessProbe.PeriodSeconds).To(Equal(int32(18)))
431+
Expect(container.ReadinessProbe.FailureThreshold).To(Equal(int32(3)))
432+
433+
Expect(container.LivenessProbe.TimeoutSeconds).To(Equal(int32(30)))
434+
Expect(container.LivenessProbe.PeriodSeconds).To(Equal(int32(30)))
435+
Expect(container.LivenessProbe.FailureThreshold).To(Equal(int32(10)))
429436
})
430437

431438
When("the StatefulSet has at least one Replica ready", func() {

0 commit comments

Comments
 (0)