Skip to content

Commit a2b3c7a

Browse files
committed
Migrate OCP-44493: configurable terminationGracePeriod for liveness and startup probes
Migrates the test from openshift-tests-private to origin.

The test validates probe-level terminationGracePeriodSeconds for:
- Liveness probes with probe-level terminationGracePeriodSeconds (10s)
- Startup probes with probe-level terminationGracePeriodSeconds (10s)
- Liveness probes without a probe-level value (falls back to the pod-level 60s)

The test creates pods with failing probes and verifies that the time between the probe failure (Killing event) and the container restart (Started event) matches the expected termination grace period within an acceptable range.

The event-matching logic lists the pod's events through the Events API and looks for:
- Killing events that mention the container name
- Started events after the restart

Updates:
- Add test to test/extended/node/node_e2e/node.go
- Document test in test/extended/node/README.md

Relates: https://issues.redhat.com/browse/OCPBUGS-44493
Signed-off-by: Bhargavi Gudi <BhargaviGudi@users.noreply.github.com>
1 parent 8c50bc4 commit a2b3c7a

3 files changed

Lines changed: 321 additions & 0 deletions

File tree

test/extended/node/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ This directory contains OpenShift end-to-end tests for node-related features.
2020
- **image_volume.go** - Tests mounting container images as volumes in pods, including subPath and error handling
2121
- **node_swap.go** - Tests default kubelet swap settings (failSwapOn and swapBehavior) and rejection of user overrides
2222
- **zstd_chunked.go** - Tests building and running images with zstd:chunked compression format
23+
- **node_e2e/probe_termination.go** - Probe-level terminationGracePeriodSeconds (OCP-44493) - Tests configurable termination grace period for liveness and startup probes. Includes 3 test cases: probe-level config for liveness probe, probe-level config for startup probe, and fallback to pod-level config when probe-level is not set [Lifecycle:informing]
2324

2425
## Directory Structure
2526

Lines changed: 305 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,305 @@
1+
package node
2+
3+
import (
4+
"context"
5+
"fmt"
6+
"strings"
7+
"time"
8+
9+
g "github.com/onsi/ginkgo/v2"
10+
o "github.com/onsi/gomega"
11+
ote "github.com/openshift-eng/openshift-tests-extension/pkg/ginkgo"
12+
13+
corev1 "k8s.io/api/core/v1"
14+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
15+
"k8s.io/apimachinery/pkg/util/intstr"
16+
"k8s.io/apimachinery/pkg/util/wait"
17+
e2e "k8s.io/kubernetes/test/e2e/framework"
18+
"k8s.io/utils/ptr"
19+
20+
nodeutils "github.com/openshift/origin/test/extended/node"
21+
exutil "github.com/openshift/origin/test/extended/util"
22+
)
23+
24+
var _ = g.Describe("[sig-node] Probe configuration", func() {
25+
var (
26+
oc = exutil.NewCLIWithoutNamespace("probe-termination")
27+
)
28+
29+
//author: bgudi@redhat.com
30+
g.It("[OTP] Liveness probe should respect probe-level terminationGracePeriodSeconds [OCP-44493]", ote.Informing(), func() {
31+
ctx := context.Background()
32+
33+
oc.SetupProject()
34+
namespace := oc.Namespace()
35+
36+
g.By("Create pod with liveness probe having probe-level terminationGracePeriodSeconds=10s")
37+
pod := &corev1.Pod{
38+
ObjectMeta: metav1.ObjectMeta{
39+
Name: "liveness-probe-level",
40+
Namespace: namespace,
41+
},
42+
Spec: corev1.PodSpec{
43+
TerminationGracePeriodSeconds: ptr.To[int64](60),
44+
SecurityContext: &corev1.PodSecurityContext{
45+
RunAsNonRoot: ptr.To(true),
46+
SeccompProfile: &corev1.SeccompProfile{
47+
Type: corev1.SeccompProfileTypeRuntimeDefault,
48+
},
49+
},
50+
Containers: []corev1.Container{
51+
{
52+
Name: "test",
53+
Image: "quay.io/openshifttest/nginx-alpine@sha256:04f316442d48ba60e3ea0b5a67eb89b0b667abf1c198a3d0056ca748736336a0",
54+
SecurityContext: &corev1.SecurityContext{
55+
AllowPrivilegeEscalation: ptr.To(false),
56+
Capabilities: &corev1.Capabilities{
57+
Drop: []corev1.Capability{"ALL"},
58+
},
59+
},
60+
Command: []string{"sh", "-c", "sleep 100000000"},
61+
Ports: []corev1.ContainerPort{
62+
{ContainerPort: 8080},
63+
},
64+
LivenessProbe: &corev1.Probe{
65+
ProbeHandler: corev1.ProbeHandler{
66+
HTTPGet: &corev1.HTTPGetAction{
67+
Path: "/healthz",
68+
Port: intstr.FromInt(8080),
69+
},
70+
},
71+
InitialDelaySeconds: 5,
72+
FailureThreshold: 1,
73+
PeriodSeconds: 60,
74+
TerminationGracePeriodSeconds: ptr.To[int64](10),
75+
},
76+
},
77+
},
78+
},
79+
}
80+
81+
_, err := oc.KubeClient().CoreV1().Pods(namespace).Create(ctx, pod, metav1.CreateOptions{})
82+
o.Expect(err).NotTo(o.HaveOccurred(), "failed to create liveness probe pod")
83+
84+
g.By("Verify probe-level terminationGracePeriodSeconds is honored (10s)")
85+
expectedSec := 10
86+
minSec := expectedSec - 3
87+
maxSec := expectedSec + 10
88+
timeDiff, err := verifyProbeTermination(ctx, oc, namespace, "liveness-probe-level", "test", expectedSec)
89+
o.Expect(err).NotTo(o.HaveOccurred(), "failed to get probe termination events")
90+
o.Expect(timeDiff).To(o.BeNumerically(">=", minSec), fmt.Sprintf("time difference %ds is less than expected minimum %ds", timeDiff, minSec))
91+
o.Expect(timeDiff).To(o.BeNumerically("<=", maxSec), fmt.Sprintf("time difference %ds is greater than expected maximum %ds", timeDiff, maxSec))
92+
})
93+
94+
//author: bgudi@redhat.com
95+
g.It("[OTP] Startup probe should respect probe-level terminationGracePeriodSeconds [OCP-44493]", ote.Informing(), func() {
96+
ctx := context.Background()
97+
98+
oc.SetupProject()
99+
namespace := oc.Namespace()
100+
101+
g.By("Create pod with startup probe having probe-level terminationGracePeriodSeconds=10s")
102+
pod := &corev1.Pod{
103+
ObjectMeta: metav1.ObjectMeta{
104+
Name: "startup-probe-level",
105+
Namespace: namespace,
106+
},
107+
Spec: corev1.PodSpec{
108+
TerminationGracePeriodSeconds: ptr.To[int64](60),
109+
SecurityContext: &corev1.PodSecurityContext{
110+
RunAsNonRoot: ptr.To(true),
111+
SeccompProfile: &corev1.SeccompProfile{
112+
Type: corev1.SeccompProfileTypeRuntimeDefault,
113+
},
114+
},
115+
Containers: []corev1.Container{
116+
{
117+
Name: "teststartup",
118+
Image: "quay.io/openshifttest/nginx-alpine@sha256:04f316442d48ba60e3ea0b5a67eb89b0b667abf1c198a3d0056ca748736336a0",
119+
SecurityContext: &corev1.SecurityContext{
120+
AllowPrivilegeEscalation: ptr.To(false),
121+
Capabilities: &corev1.Capabilities{
122+
Drop: []corev1.Capability{"ALL"},
123+
},
124+
},
125+
Command: []string{"sh", "-c", "sleep 100000000"},
126+
Ports: []corev1.ContainerPort{
127+
{ContainerPort: 8080},
128+
},
129+
StartupProbe: &corev1.Probe{
130+
ProbeHandler: corev1.ProbeHandler{
131+
HTTPGet: &corev1.HTTPGetAction{
132+
Path: "/healthz",
133+
Port: intstr.FromInt(8080),
134+
},
135+
},
136+
InitialDelaySeconds: 5,
137+
FailureThreshold: 1,
138+
PeriodSeconds: 60,
139+
TerminationGracePeriodSeconds: ptr.To[int64](10),
140+
},
141+
},
142+
},
143+
},
144+
}
145+
146+
_, err := oc.KubeClient().CoreV1().Pods(namespace).Create(ctx, pod, metav1.CreateOptions{})
147+
o.Expect(err).NotTo(o.HaveOccurred(), "failed to create startup probe pod")
148+
149+
g.By("Verify probe-level terminationGracePeriodSeconds is honored (10s)")
150+
expectedSec := 10
151+
minSec := expectedSec - 3
152+
maxSec := expectedSec + 10
153+
timeDiff, err := verifyProbeTermination(ctx, oc, namespace, "startup-probe-level", "teststartup", expectedSec)
154+
o.Expect(err).NotTo(o.HaveOccurred(), "failed to get probe termination events")
155+
o.Expect(timeDiff).To(o.BeNumerically(">=", minSec), fmt.Sprintf("time difference %ds is less than expected minimum %ds", timeDiff, minSec))
156+
o.Expect(timeDiff).To(o.BeNumerically("<=", maxSec), fmt.Sprintf("time difference %ds is greater than expected maximum %ds", timeDiff, maxSec))
157+
})
158+
159+
//author: bgudi@redhat.com
160+
g.It("[OTP] Liveness probe should fall back to pod-level terminationGracePeriodSeconds when probe-level is not set [OCP-44493]", ote.Informing(), func() {
161+
ctx := context.Background()
162+
163+
oc.SetupProject()
164+
namespace := oc.Namespace()
165+
166+
g.By("Create pod with liveness probe without probe-level terminationGracePeriodSeconds")
167+
pod := &corev1.Pod{
168+
ObjectMeta: metav1.ObjectMeta{
169+
Name: "liveness-pod-level",
170+
Namespace: namespace,
171+
},
172+
Spec: corev1.PodSpec{
173+
TerminationGracePeriodSeconds: ptr.To[int64](60),
174+
SecurityContext: &corev1.PodSecurityContext{
175+
RunAsNonRoot: ptr.To(true),
176+
SeccompProfile: &corev1.SeccompProfile{
177+
Type: corev1.SeccompProfileTypeRuntimeDefault,
178+
},
179+
},
180+
Containers: []corev1.Container{
181+
{
182+
Name: "test",
183+
Image: "quay.io/openshifttest/nginx-alpine@sha256:04f316442d48ba60e3ea0b5a67eb89b0b667abf1c198a3d0056ca748736336a0",
184+
SecurityContext: &corev1.SecurityContext{
185+
AllowPrivilegeEscalation: ptr.To(false),
186+
Capabilities: &corev1.Capabilities{
187+
Drop: []corev1.Capability{"ALL"},
188+
},
189+
},
190+
Command: []string{"sh", "-c", "sleep 100000000"},
191+
Ports: []corev1.ContainerPort{
192+
{ContainerPort: 8080},
193+
},
194+
LivenessProbe: &corev1.Probe{
195+
ProbeHandler: corev1.ProbeHandler{
196+
HTTPGet: &corev1.HTTPGetAction{
197+
Path: "/healthz",
198+
Port: intstr.FromInt(8080),
199+
},
200+
},
201+
InitialDelaySeconds: 5,
202+
FailureThreshold: 1,
203+
PeriodSeconds: 60,
204+
// No TerminationGracePeriodSeconds - should use pod-level (60s)
205+
},
206+
},
207+
},
208+
},
209+
}
210+
211+
_, err := oc.KubeClient().CoreV1().Pods(namespace).Create(ctx, pod, metav1.CreateOptions{})
212+
o.Expect(err).NotTo(o.HaveOccurred(), "failed to create liveness probe pod without probe-level termination")
213+
214+
g.By("Verify pod-level terminationGracePeriodSeconds is used (60s)")
215+
expectedSec := 60
216+
minSec := expectedSec - 3
217+
maxSec := expectedSec + 10
218+
timeDiff, err := verifyProbeTermination(ctx, oc, namespace, "liveness-pod-level", "test", expectedSec)
219+
o.Expect(err).NotTo(o.HaveOccurred(), "failed to get probe termination events")
220+
o.Expect(timeDiff).To(o.BeNumerically(">=", minSec), fmt.Sprintf("time difference %ds is less than expected minimum %ds", timeDiff, minSec))
221+
o.Expect(timeDiff).To(o.BeNumerically("<=", maxSec), fmt.Sprintf("time difference %ds is greater than expected maximum %ds", timeDiff, maxSec))
222+
})
223+
})
224+
225+
// findLatestEventByReason finds the latest event matching the given reason and message filter
226+
func findLatestEventByReason(events *corev1.EventList, reason string, msgFilter func(string) bool) *corev1.Event {
227+
var latestEvent *corev1.Event
228+
for i := range events.Items {
229+
event := &events.Items[i]
230+
if event.Reason == reason && msgFilter(event.Message) {
231+
if latestEvent == nil || event.LastTimestamp.Time.After(latestEvent.LastTimestamp.Time) {
232+
latestEvent = event
233+
}
234+
}
235+
}
236+
return latestEvent
237+
}
238+
239+
// findEarliestEventAfter finds the earliest event matching the reason and filter that occurred after the given time
240+
func findEarliestEventAfter(events *corev1.EventList, reason string, msgFilter func(string) bool, afterTime time.Time) *corev1.Event {
241+
var earliestEvent *corev1.Event
242+
for i := range events.Items {
243+
event := &events.Items[i]
244+
if event.Reason == reason && msgFilter(event.Message) && event.FirstTimestamp.Time.After(afterTime) {
245+
if earliestEvent == nil || event.FirstTimestamp.Time.Before(earliestEvent.FirstTimestamp.Time) {
246+
earliestEvent = event
247+
}
248+
}
249+
}
250+
return earliestEvent
251+
}
252+
253+
// verifyProbeTermination verifies that the probe termination grace period is honored
254+
// by checking the time difference between probe failure (Killing) and container restart (Started) events
255+
// Returns the time difference in seconds, or an error if events are not found
256+
func verifyProbeTermination(ctx context.Context, oc *exutil.CLI, namespace, podName, containerName string, expectedTerminationSec int) (int, error) {
257+
var timeDiff int
258+
// Timeout needs to account for: pod start (~30s) + probe period (60s) + termination (up to 60s) + restart (~30s) = ~3 minutes minimum
259+
// Use 6 minutes to be safe for tests with 60s termination grace period
260+
err := wait.PollUntilContextTimeout(ctx, 10*time.Second, 6*time.Minute, true, func(ctx context.Context) (bool, error) {
261+
// Get events using the Events API
262+
events, err := oc.KubeClient().CoreV1().Events(namespace).List(ctx, metav1.ListOptions{
263+
FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.kind=Pod", podName),
264+
})
265+
if err != nil {
266+
e2e.Logf("Error getting events: %v", err)
267+
return false, nil
268+
}
269+
270+
// Find probe failure (Killing) event for the container
271+
killingEvent := findLatestEventByReason(events, "Killing", func(msg string) bool {
272+
return strings.Contains(msg, containerName) &&
273+
strings.Contains(msg, "failed") &&
274+
strings.Contains(msg, "probe")
275+
})
276+
277+
if killingEvent == nil {
278+
e2e.Logf("Waiting for probe failure (Killing) event")
279+
return false, nil
280+
}
281+
282+
// Find container restart (Started) event that occurred after the Killing event
283+
startedEvent := findEarliestEventAfter(events, "Started", func(msg string) bool {
284+
return strings.Contains(msg, "Started container")
285+
}, killingEvent.LastTimestamp.Time)
286+
287+
if startedEvent == nil {
288+
e2e.Logf("Waiting for container restart (Started) event after Killing event")
289+
return false, nil
290+
}
291+
292+
e2e.Logf("Killing event: %s at %v", killingEvent.Message, killingEvent.LastTimestamp)
293+
e2e.Logf("Started event: %s at %v", startedEvent.Message, startedEvent.FirstTimestamp)
294+
295+
// Calculate time difference using the helper function
296+
timeDiff = int(nodeutils.CalculateEventTimeDiff(killingEvent, startedEvent).Seconds())
297+
e2e.Logf("Time difference: %d seconds (expected: %d ±10 seconds)", timeDiff, expectedTerminationSec)
298+
299+
return true, nil
300+
})
301+
if err != nil {
302+
return 0, err
303+
}
304+
return timeDiff, nil
305+
}

test/extended/node/node_utils.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -764,3 +764,18 @@ func GetFirstReadyWorkerNode(oc *exutil.CLI) string {
764764
o.Expect(false).To(o.BeTrue(), "no Ready worker node found among %v", workers)
765765
return "" // unreachable; satisfies compiler
766766
}
767+
768+
// CalculateEventTimeDiff calculates the time difference between two Kubernetes events.
769+
// It uses LastTimestamp for the start event and FirstTimestamp for the end event.
770+
// Falls back to FirstTimestamp/LastTimestamp respectively if the primary timestamp is zero.
771+
func CalculateEventTimeDiff(startEvent, endEvent *corev1.Event) time.Duration {
772+
startTime := startEvent.LastTimestamp.Time
773+
if startTime.IsZero() {
774+
startTime = startEvent.FirstTimestamp.Time
775+
}
776+
endTime := endEvent.FirstTimestamp.Time
777+
if endTime.IsZero() {
778+
endTime = endEvent.LastTimestamp.Time
779+
}
780+
return endTime.Sub(startTime)
781+
}

0 commit comments

Comments
 (0)