Skip to content

Commit 2aca499

Browse files
committed
Migrate OCP-44493: configurable terminationGracePeriod for liveness and startup probes
Migrates the test from openshift-tests-private to origin.

The test validates probe-level terminationGracePeriodSeconds for:
- Liveness probes with probe-level terminationGracePeriodSeconds (10s)
- Startup probes with probe-level terminationGracePeriodSeconds (10s)
- Liveness probes without a probe-level value (falls back to the pod-level 60s)

The test creates pods with failing probes and verifies that the time difference between the probe failure (Killing event) and the container restart (Started event) matches the expected termination grace period within an acceptable range.

The event matching logic parses 'oc describe pod' output for:
- Killing events that mention the container name
- Started events that follow the restart

Updates:
- Add the test to test/extended/node/node_e2e/node.go
- Document the test in test/extended/node/README.md

Relates: https://issues.redhat.com/browse/OCPBUGS-44493
1 parent 38c4fba commit 2aca499

2 files changed

Lines changed: 286 additions & 0 deletions

File tree

test/extended/node/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ This directory contains OpenShift end-to-end tests for node-related features.
1919
- **image_volume.go** - Tests mounting container images as volumes in pods, including subPath and error handling
2020
- **node_swap.go** - Tests default kubelet swap settings (failSwapOn and swapBehavior) and rejection of user overrides
2121
- **zstd_chunked.go** - Tests building and running images with zstd:chunked compression format
22+
- **node_e2e/node.go** - Tests probe-level terminationGracePeriodSeconds for liveness and startup probes, including fallback to the pod-level value when no probe-level value is set (OCP-44493) [Lifecycle:informing]
2223

2324
## Directory Structure
2425

test/extended/node/node_e2e/node.go

Lines changed: 285 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,15 +4,19 @@ import (
44
"context"
55
"fmt"
66
"path/filepath"
7+
"strconv"
78
"strings"
89
"time"
910

1011
g "github.com/onsi/ginkgo/v2"
1112
o "github.com/onsi/gomega"
13+
ote "github.com/openshift-eng/openshift-tests-extension/pkg/ginkgo"
1214

1315
configv1 "github.com/openshift/api/config/v1"
1416
"github.com/openshift/origin/test/extended/imagepolicy"
17+
corev1 "k8s.io/api/core/v1"
1518
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
19+
"k8s.io/apimachinery/pkg/util/intstr"
1620
utilrand "k8s.io/apimachinery/pkg/util/rand"
1721
"k8s.io/apimachinery/pkg/util/wait"
1822
e2e "k8s.io/kubernetes/test/e2e/framework"
@@ -164,6 +168,287 @@ var _ = g.Describe("[sig-node] [Jira:Node/Kubelet] Kubelet, CRI-O, CPU manager",
164168
e2e.Logf("/dev/fuse mount output: %s", output)
165169
o.Expect(output).To(o.ContainSubstring("fuse"), "dev fuse is not mounted inside pod")
166170
})
171+
172+
//author: minmli@redhat.com
173+
//migrated from openshift-tests-private
174+
//automates: https://issues.redhat.com/browse/OCPBUGS-44493
175+
g.It("[OTP] add configurable terminationGracePeriodSeconds to liveness and startup probes [OCP-44493]", ote.Informing(), func() {
176+
ctx := context.Background()
177+
var err error
178+
179+
oc.SetupProject()
180+
namespace := oc.Namespace()
181+
182+
// parseDurationToSeconds converts an event age as printed in the Events table
// of `oc describe pod` (for example "45s", "3m", "1m30s" or "1h5m") into
// whole seconds.
//
// The input must be a non-empty sequence of <integer><unit> pairs, where unit
// is one of h, m or s. Anything else (an empty string, a bare number without
// a unit, an unsupported unit such as "d" or "ms", or a placeholder such as
// "<invalid>") returns an error instead of being silently reported as 0
// seconds, so callers never compare against a bogus timestamp.
var parseDurationToSeconds = func(durationStr string) (int, error) {
	if durationStr == "" {
		return 0, fmt.Errorf("empty duration")
	}

	totalSeconds := 0
	numStart := 0 // index where the digits of the current component begin
	for i := 0; i < len(durationStr); i++ {
		c := durationStr[i]
		if c >= '0' && c <= '9' {
			continue
		}

		var unitSeconds int
		switch c {
		case 'h':
			unitSeconds = 3600
		case 'm':
			unitSeconds = 60
		case 's':
			unitSeconds = 1
		default:
			return 0, fmt.Errorf("unsupported character %q in duration %q", c, durationStr)
		}

		// Atoi rejects an empty digit run, which catches inputs such as
		// "m5s" or "5ms" where a unit is not preceded by a number.
		value, err := strconv.Atoi(durationStr[numStart:i])
		if err != nil {
			return 0, fmt.Errorf("parsing duration %q: %w", durationStr, err)
		}
		totalSeconds += value * unitSeconds
		numStart = i + 1
	}

	// Trailing digits without a unit (e.g. "10" or "1m30") are ambiguous.
	if numStart != len(durationStr) {
		return 0, fmt.Errorf("duration %q is missing a unit", durationStr)
	}
	return totalSeconds, nil
}
212+
213+
// Helper to verify probe termination period
214+
verifyProbeTermination := func(podName, containerName string, expectedTerminationSec int) error {
215+
return wait.PollUntilContextTimeout(ctx, 10*time.Second, 5*time.Minute, true, func(ctx context.Context) (bool, error) {
216+
podDesc, err := oc.AsAdmin().WithoutNamespace().Run("describe").Args("pod", podName, "-n", namespace).Output()
217+
if err != nil {
218+
e2e.Logf("Error describing pod: %v", err)
219+
return false, nil
220+
}
221+
222+
// Look for probe failure (killing) and container restart events
223+
// Event format: "Normal Killing <time> kubelet Container <name> failed <probe> probe, will be restarted"
224+
// Event format: "Normal Started <time> kubelet Container started"
225+
killingLine := ""
226+
restartLine := ""
227+
228+
inEvents := false
229+
for _, line := range strings.Split(podDesc, "\n") {
230+
if strings.Contains(line, "Events:") {
231+
inEvents = true
232+
continue
233+
}
234+
if !inEvents {
235+
continue
236+
}
237+
238+
// Look for killing event with container name
239+
if strings.Contains(line, "Killing") && strings.Contains(line, containerName) &&
240+
strings.Contains(line, "failed") && strings.Contains(line, "probe") &&
241+
strings.Contains(line, "will be restarted") {
242+
killingLine = line
243+
}
244+
// Look for Started event after Killing
245+
if killingLine != "" && strings.Contains(line, "Started") && strings.Contains(line, "Container started") {
246+
restartLine = line
247+
break
248+
}
249+
}
250+
251+
if killingLine == "" || restartLine == "" {
252+
e2e.Logf("Waiting for probe failure (killing) and container restart events")
253+
return false, nil
254+
}
255+
256+
e2e.Logf("Killing event: %s", killingLine)
257+
e2e.Logf("Restart event: %s", restartLine)
258+
259+
// Extract timestamps (format: "1m30s" or "45s")
260+
// Event format: "Normal Killing 2m30s kubelet Container..."
261+
killingFields := strings.Fields(killingLine)
262+
restartFields := strings.Fields(restartLine)
263+
if len(killingFields) < 3 || len(restartFields) < 3 {
264+
e2e.Logf("Unable to parse event timestamps")
265+
return false, nil
266+
}
267+
268+
killingTime := killingFields[2]
269+
restartTime := restartFields[2]
270+
271+
killingSec, err := parseDurationToSeconds(killingTime)
272+
if err != nil {
273+
e2e.Logf("Error parsing killing time: %v", err)
274+
return false, nil
275+
}
276+
277+
restartSec, err := parseDurationToSeconds(restartTime)
278+
if err != nil {
279+
e2e.Logf("Error parsing restart time: %v", err)
280+
return false, nil
281+
}
282+
283+
// Time difference: killing happened earlier, restart happened later
284+
// So we need to calculate how long between them
285+
timeDiff := killingSec - restartSec
286+
e2e.Logf("Time difference: %d seconds (expected: %d ±10 seconds)", timeDiff, expectedTerminationSec)
287+
288+
// Allow range: [expectedTerminationSec-3, expectedTerminationSec+10]
289+
if timeDiff >= (expectedTerminationSec-3) && timeDiff <= (expectedTerminationSec+10) {
290+
e2e.Logf("Termination grace period check passed")
291+
return true, nil
292+
}
293+
294+
e2e.Logf("Time difference %d is outside expected range [%d, %d]", timeDiff, expectedTerminationSec-3, expectedTerminationSec+10)
295+
return false, nil
296+
})
297+
}
298+
299+
g.By("Test liveness probe with probe-level terminationGracePeriodSeconds")
300+
livenessPod := &corev1.Pod{
301+
ObjectMeta: metav1.ObjectMeta{
302+
Name: "liveness-probe",
303+
Namespace: namespace,
304+
},
305+
Spec: corev1.PodSpec{
306+
TerminationGracePeriodSeconds: &[]int64{60}[0],
307+
SecurityContext: &corev1.PodSecurityContext{
308+
RunAsNonRoot: &[]bool{true}[0],
309+
SeccompProfile: &corev1.SeccompProfile{
310+
Type: corev1.SeccompProfileTypeRuntimeDefault,
311+
},
312+
},
313+
Containers: []corev1.Container{
314+
{
315+
Name: "test",
316+
Image: "quay.io/openshifttest/nginx-alpine@sha256:04f316442d48ba60e3ea0b5a67eb89b0b667abf1c198a3d0056ca748736336a0",
317+
SecurityContext: &corev1.SecurityContext{
318+
AllowPrivilegeEscalation: &[]bool{false}[0],
319+
Capabilities: &corev1.Capabilities{
320+
Drop: []corev1.Capability{"ALL"},
321+
},
322+
},
323+
Command: []string{"sh", "-c", "sleep 100000000"},
324+
Ports: []corev1.ContainerPort{
325+
{ContainerPort: 8080},
326+
},
327+
LivenessProbe: &corev1.Probe{
328+
ProbeHandler: corev1.ProbeHandler{
329+
HTTPGet: &corev1.HTTPGetAction{
330+
Path: "/healthz",
331+
Port: intstr.FromInt(8080),
332+
},
333+
},
334+
FailureThreshold: 1,
335+
PeriodSeconds: 60,
336+
TerminationGracePeriodSeconds: &[]int64{10}[0],
337+
},
338+
},
339+
},
340+
},
341+
}
342+
343+
_, err = oc.KubeClient().CoreV1().Pods(namespace).Create(ctx, livenessPod, metav1.CreateOptions{})
344+
o.Expect(err).NotTo(o.HaveOccurred(), "failed to create liveness probe pod")
345+
g.DeferCleanup(oc.KubeClient().CoreV1().Pods(namespace).Delete, ctx, "liveness-probe", metav1.DeleteOptions{})
346+
347+
err = verifyProbeTermination("liveness-probe", "test", 10)
348+
o.Expect(err).NotTo(o.HaveOccurred(), "liveness probe termination grace period not honored")
349+
350+
g.By("Test startup probe with probe-level terminationGracePeriodSeconds")
351+
startupPod := &corev1.Pod{
352+
ObjectMeta: metav1.ObjectMeta{
353+
Name: "startup-probe",
354+
Namespace: namespace,
355+
},
356+
Spec: corev1.PodSpec{
357+
TerminationGracePeriodSeconds: &[]int64{60}[0],
358+
SecurityContext: &corev1.PodSecurityContext{
359+
RunAsNonRoot: &[]bool{true}[0],
360+
SeccompProfile: &corev1.SeccompProfile{
361+
Type: corev1.SeccompProfileTypeRuntimeDefault,
362+
},
363+
},
364+
Containers: []corev1.Container{
365+
{
366+
Name: "teststartup",
367+
Image: "quay.io/openshifttest/nginx-alpine@sha256:04f316442d48ba60e3ea0b5a67eb89b0b667abf1c198a3d0056ca748736336a0",
368+
SecurityContext: &corev1.SecurityContext{
369+
AllowPrivilegeEscalation: &[]bool{false}[0],
370+
Capabilities: &corev1.Capabilities{
371+
Drop: []corev1.Capability{"ALL"},
372+
},
373+
},
374+
Command: []string{"sh", "-c", "sleep 100000000"},
375+
Ports: []corev1.ContainerPort{
376+
{ContainerPort: 8080},
377+
},
378+
StartupProbe: &corev1.Probe{
379+
ProbeHandler: corev1.ProbeHandler{
380+
HTTPGet: &corev1.HTTPGetAction{
381+
Path: "/healthz",
382+
Port: intstr.FromInt(8080),
383+
},
384+
},
385+
FailureThreshold: 1,
386+
PeriodSeconds: 60,
387+
TerminationGracePeriodSeconds: &[]int64{10}[0],
388+
},
389+
},
390+
},
391+
},
392+
}
393+
394+
_, err = oc.KubeClient().CoreV1().Pods(namespace).Create(ctx, startupPod, metav1.CreateOptions{})
395+
o.Expect(err).NotTo(o.HaveOccurred(), "failed to create startup probe pod")
396+
g.DeferCleanup(oc.KubeClient().CoreV1().Pods(namespace).Delete, ctx, "startup-probe", metav1.DeleteOptions{})
397+
398+
err = verifyProbeTermination("startup-probe", "teststartup", 10)
399+
o.Expect(err).NotTo(o.HaveOccurred(), "startup probe termination grace period not honored")
400+
401+
g.By("Test liveness probe without probe-level terminationGracePeriodSeconds (should use pod-level)")
402+
livenessPodNoProbeTerm := &corev1.Pod{
403+
ObjectMeta: metav1.ObjectMeta{
404+
Name: "liveness-probe-no-term",
405+
Namespace: namespace,
406+
},
407+
Spec: corev1.PodSpec{
408+
TerminationGracePeriodSeconds: &[]int64{60}[0],
409+
SecurityContext: &corev1.PodSecurityContext{
410+
RunAsNonRoot: &[]bool{true}[0],
411+
SeccompProfile: &corev1.SeccompProfile{
412+
Type: corev1.SeccompProfileTypeRuntimeDefault,
413+
},
414+
},
415+
Containers: []corev1.Container{
416+
{
417+
Name: "test",
418+
Image: "quay.io/openshifttest/nginx-alpine@sha256:04f316442d48ba60e3ea0b5a67eb89b0b667abf1c198a3d0056ca748736336a0",
419+
SecurityContext: &corev1.SecurityContext{
420+
AllowPrivilegeEscalation: &[]bool{false}[0],
421+
Capabilities: &corev1.Capabilities{
422+
Drop: []corev1.Capability{"ALL"},
423+
},
424+
},
425+
Command: []string{"sh", "-c", "sleep 100000000"},
426+
Ports: []corev1.ContainerPort{
427+
{ContainerPort: 8080},
428+
},
429+
LivenessProbe: &corev1.Probe{
430+
ProbeHandler: corev1.ProbeHandler{
431+
HTTPGet: &corev1.HTTPGetAction{
432+
Path: "/healthz",
433+
Port: intstr.FromInt(8080),
434+
},
435+
},
436+
FailureThreshold: 1,
437+
PeriodSeconds: 60,
438+
// No TerminationGracePeriodSeconds - should use pod-level (60s)
439+
},
440+
},
441+
},
442+
},
443+
}
444+
445+
_, err = oc.KubeClient().CoreV1().Pods(namespace).Create(ctx, livenessPodNoProbeTerm, metav1.CreateOptions{})
446+
o.Expect(err).NotTo(o.HaveOccurred(), "failed to create liveness probe pod without probe termination")
447+
g.DeferCleanup(oc.KubeClient().CoreV1().Pods(namespace).Delete, ctx, "liveness-probe-no-term", metav1.DeleteOptions{})
448+
449+
err = verifyProbeTermination("liveness-probe-no-term", "test", 60)
450+
o.Expect(err).NotTo(o.HaveOccurred(), "liveness probe should use pod-level termination grace period when probe-level not set")
451+
})
167452
})
168453

169454
// author: asahay@redhat.com

0 commit comments

Comments
 (0)