Skip to content

Commit 65f9f4a

Browse files
committed
Switch from oc debug to SSH
1 parent 1325193 commit 65f9f4a

1 file changed

Lines changed: 64 additions & 17 deletions

File tree

test/extended/edge_topologies/tnf_recovery.go

Lines changed: 64 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -421,6 +421,32 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
421421
// Requires resource-agents >= 4.10.0-71.el9_6.13 (RHEL 9) or >= 4.16.0-33.el10 (RHEL 10).
422422
survivedNode := peerNode
423423

424+
// Set up two-hop SSH (local → hypervisor → node) for post-panic verification.
425+
// After kernel panic the Kubernetes API is unstable for minutes, making oc debug
426+
// unreliable. SSH via the hypervisor bypasses the API entirely.
427+
if !exutil.HasHypervisorConfig() {
428+
g.Skip("Hypervisor SSH config required for kernel panic verification")
429+
}
430+
sshCfg := exutil.GetHypervisorConfig()
431+
o.Expect(sshCfg).NotTo(o.BeNil(), "Failed to parse hypervisor config")
432+
hypervisorConfig := core.SSHConfig{
433+
IP: sshCfg.HypervisorIP,
434+
User: sshCfg.SSHUser,
435+
PrivateKeyPath: sshCfg.PrivateKeyPath,
436+
}
437+
localKH, err := core.PrepareLocalKnownHostsFile(&hypervisorConfig)
438+
o.Expect(err).NotTo(o.HaveOccurred(), "Failed to prepare local known hosts")
439+
440+
survivedNodeIP := utils.GetNodeInternalIP(&survivedNode)
441+
o.Expect(survivedNodeIP).NotTo(o.BeEmpty(), "survived node has no internal IP")
442+
targetNodeIP := utils.GetNodeInternalIP(&targetNode)
443+
o.Expect(targetNodeIP).NotTo(o.BeEmpty(), "target node has no internal IP")
444+
445+
survivedRemoteKH, err := core.PrepareRemoteKnownHostsFile(survivedNodeIP, &hypervisorConfig, localKH)
446+
o.Expect(err).NotTo(o.HaveOccurred(), "Failed to prepare remote known hosts for survived node")
447+
targetRemoteKH, err := core.PrepareRemoteKnownHostsFile(targetNodeIP, &hypervisorConfig, localKH)
448+
o.Expect(err).NotTo(o.HaveOccurred(), "Failed to prepare remote known hosts for target node")
449+
424450
g.By("Logging resource-agents RPM version")
425451
raVersion, err := exutil.DebugNodeRetryWithOptionsAndChroot(oc, survivedNode.Name, "openshift-etcd",
426452
"bash", "-c", "rpm -q resource-agents")
@@ -462,8 +488,9 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
462488
g.By("Reading bump-amount from journal log on survived node")
463489
var journalBump int
464490
o.Eventually(func() error {
465-
journalOutput, err := exutil.DebugNodeRetryWithOptionsAndChroot(oc, survivedNode.Name, "openshift-etcd",
466-
"bash", "-c", fmt.Sprintf("journalctl -u pacemaker --since '%s' | grep 'bump-amount' | tail -1", crashTimestamp))
491+
journalOutput, _, err := core.ExecuteRemoteSSHCommand(survivedNodeIP,
492+
fmt.Sprintf("sudo journalctl -u pacemaker --since '%s' | grep 'bump-amount' | tail -1", crashTimestamp),
493+
&hypervisorConfig, localKH, survivedRemoteKH)
467494
if err != nil {
468495
return fmt.Errorf("failed to read journal: %v", err)
469496
}
@@ -482,8 +509,9 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
482509
g.By("Verifying force-new-cluster-bump-amount in config.yaml matches journal bump-amount")
483510
var configBump int
484511
o.Eventually(func() error {
485-
bumpAmountStr, err := exutil.DebugNodeRetryWithOptionsAndChroot(oc, survivedNode.Name, "openshift-etcd",
486-
"bash", "-c", "grep 'force-new-cluster-bump-amount:' /var/lib/etcd/config.yaml | awk '{print $2}'")
512+
bumpAmountStr, _, err := core.ExecuteRemoteSSHCommand(survivedNodeIP,
513+
"grep 'force-new-cluster-bump-amount:' /var/lib/etcd/config.yaml | awk '{print $2}'",
514+
&hypervisorConfig, localKH, survivedRemoteKH)
487515
if err != nil {
488516
return fmt.Errorf("failed to read bump amount: %v", err)
489517
}
@@ -497,8 +525,9 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
497525
fmt.Sprintf("config.yaml bump-amount %d should match journal bump-amount %d", configBump, journalBump))
498526

499527
g.By("Independently verifying bump amount is approximately floor(maxRaftIndex * 0.2)")
500-
raftIndexStr, err := exutil.DebugNodeRetryWithOptionsAndChroot(oc, survivedNode.Name, "openshift-etcd",
501-
"bash", "-c", "jq -r '.maxRaftIndex' /var/lib/etcd/revision.json")
528+
raftIndexStr, _, err := core.ExecuteRemoteSSHCommand(survivedNodeIP,
529+
"jq -r '.maxRaftIndex' /var/lib/etcd/revision.json",
530+
&hypervisorConfig, localKH, survivedRemoteKH)
502531
o.Expect(err).To(o.BeNil())
503532
maxRaftIndex, err := strconv.Atoi(strings.TrimSpace(raftIndexStr))
504533
o.Expect(err).To(o.BeNil())
@@ -521,22 +550,40 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
521550
memberPromotedVotingTimeout, utils.FiveSecondPollInterval)
522551

523552
g.By(fmt.Sprintf("Verifying etcd container is running on %s", targetNode.Name))
524-
got, err := exutil.DebugNodeRetryWithOptionsAndChroot(oc, targetNode.Name, "openshift-etcd",
525-
strings.Split(ensurePodmanEtcdContainerIsRunning, " ")...)
526-
o.Expect(err).To(o.BeNil())
527-
o.Expect(got).To(o.Equal("'true'"), fmt.Sprintf("expected etcd container running on %s", targetNode.Name))
553+
o.Eventually(func() error {
554+
got, _, err := core.ExecuteRemoteSSHCommand(targetNodeIP,
555+
ensurePodmanEtcdContainerIsRunning,
556+
&hypervisorConfig, localKH, targetRemoteKH)
557+
if err != nil {
558+
return fmt.Errorf("failed to inspect etcd container: %v", err)
559+
}
560+
if strings.TrimSpace(got) != "'true'" {
561+
return fmt.Errorf("etcd container not running on %s: got %s", targetNode.Name, got)
562+
}
563+
return nil
564+
}, 5*time.Minute, utils.FiveSecondPollInterval).ShouldNot(o.HaveOccurred(),
565+
fmt.Sprintf("expected etcd container running on %s", targetNode.Name))
528566

529567
g.By(fmt.Sprintf("Verifying etcd-previous container exists on %s", targetNode.Name))
530-
prevOutput, err := exutil.DebugNodeRetryWithOptionsAndChroot(oc, targetNode.Name, "openshift-etcd",
531-
"bash", "-c", "podman ps -a --format '{{.Names}}' | grep -m1 etcd-previous")
532-
o.Expect(err).To(o.BeNil(), fmt.Sprintf("expected etcd-previous container to exist on %s", targetNode.Name))
533-
o.Expect(strings.TrimSpace(prevOutput)).To(o.Equal("etcd-previous"),
534-
fmt.Sprintf("expected etcd-previous container on %s", targetNode.Name))
568+
o.Eventually(func() error {
569+
prevOutput, _, err := core.ExecuteRemoteSSHCommand(targetNodeIP,
570+
"podman ps -a --format '{{.Names}}' | grep -m1 etcd-previous",
571+
&hypervisorConfig, localKH, targetRemoteKH)
572+
if err != nil {
573+
return fmt.Errorf("etcd-previous container not found on %s: %v", targetNode.Name, err)
574+
}
575+
if strings.TrimSpace(prevOutput) != "etcd-previous" {
576+
return fmt.Errorf("expected etcd-previous container on %s, got %q", targetNode.Name, prevOutput)
577+
}
578+
return nil
579+
}, 5*time.Minute, utils.FiveSecondPollInterval).ShouldNot(o.HaveOccurred(),
580+
fmt.Sprintf("expected etcd-previous container to exist on %s", targetNode.Name))
535581

536582
g.By(fmt.Sprintf("Verifying pod.yaml was recreated on %s via pacemaker log", targetNode.Name))
537583
o.Eventually(func() error {
538-
_, err := exutil.DebugNodeRetryWithOptionsAndChroot(oc, targetNode.Name, "openshift-etcd",
539-
"bash", "-c", fmt.Sprintf("journalctl -u pacemaker --since '%s' --no-pager | grep -m1 -i 'a new working copy of /etc/kubernetes/static-pod-resources/etcd-certs/configmaps/external-etcd-pod/pod.yaml was created'", crashTimestamp))
584+
_, _, err := core.ExecuteRemoteSSHCommand(targetNodeIP,
585+
fmt.Sprintf("sudo journalctl -u pacemaker --since '%s' --no-pager | grep -m1 -i 'a new working copy of /etc/kubernetes/static-pod-resources/etcd-certs/configmaps/external-etcd-pod/pod.yaml was created'", crashTimestamp),
586+
&hypervisorConfig, localKH, targetRemoteKH)
540587
return err
541588
}, 5*time.Minute, utils.FiveSecondPollInterval).ShouldNot(o.HaveOccurred(),
542589
"Expected pacemaker log to contain pod.yaml recreation entry after reboot")

0 commit comments

Comments
 (0)