Skip to content

Commit af71706

Browse files
committed
Update ocdebug to ssh
1 parent 1325193 commit af71706

1 file changed

Lines changed: 69 additions & 17 deletions

File tree

test/extended/edge_topologies/tnf_recovery.go

Lines changed: 69 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -421,6 +421,37 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
421421
// Requires resource-agents >= 4.10.0-71.el9_6.13 (RHEL 9) or >= 4.16.0-33.el10 (RHEL 10).
422422
survivedNode := peerNode
423423

424+
// Set up two-hop SSH (local → hypervisor → node) for post-panic verification.
425+
// After kernel panic the Kubernetes API is unstable for minutes, making oc debug
426+
// unreliable. SSH via the hypervisor bypasses the API entirely.
427+
if !exutil.HasHypervisorConfig() {
428+
g.Skip("Hypervisor SSH config required for kernel panic verification")
429+
}
430+
sshCfg := exutil.GetHypervisorConfig()
431+
o.Expect(sshCfg).NotTo(o.BeNil(), "Failed to parse hypervisor config")
432+
o.Expect(sshCfg.HypervisorIP).NotTo(o.BeEmpty(), "Hypervisor IP is empty")
433+
o.Expect(sshCfg.SSHUser).NotTo(o.BeEmpty(), "Hypervisor SSH user is empty")
434+
o.Expect(sshCfg.PrivateKeyPath).NotTo(o.BeEmpty(), "Hypervisor private key path is empty")
435+
_, err := os.Stat(sshCfg.PrivateKeyPath)
436+
o.Expect(err).NotTo(o.HaveOccurred(), "Hypervisor private key not readable at %s", sshCfg.PrivateKeyPath)
437+
hypervisorConfig := core.SSHConfig{
438+
IP: sshCfg.HypervisorIP,
439+
User: sshCfg.SSHUser,
440+
PrivateKeyPath: sshCfg.PrivateKeyPath,
441+
}
442+
localKH, err := core.PrepareLocalKnownHostsFile(&hypervisorConfig)
443+
o.Expect(err).NotTo(o.HaveOccurred(), "Failed to prepare local known hosts")
444+
445+
survivedNodeIP := utils.GetNodeInternalIP(&survivedNode)
446+
o.Expect(survivedNodeIP).NotTo(o.BeEmpty(), "survived node has no internal IP")
447+
targetNodeIP := utils.GetNodeInternalIP(&targetNode)
448+
o.Expect(targetNodeIP).NotTo(o.BeEmpty(), "target node has no internal IP")
449+
450+
survivedRemoteKH, err := core.PrepareRemoteKnownHostsFile(survivedNodeIP, &hypervisorConfig, localKH)
451+
o.Expect(err).NotTo(o.HaveOccurred(), "Failed to prepare remote known hosts for survived node")
452+
targetRemoteKH, err := core.PrepareRemoteKnownHostsFile(targetNodeIP, &hypervisorConfig, localKH)
453+
o.Expect(err).NotTo(o.HaveOccurred(), "Failed to prepare remote known hosts for target node")
454+
424455
g.By("Logging resource-agents RPM version")
425456
raVersion, err := exutil.DebugNodeRetryWithOptionsAndChroot(oc, survivedNode.Name, "openshift-etcd",
426457
"bash", "-c", "rpm -q resource-agents")
@@ -462,8 +493,9 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
462493
g.By("Reading bump-amount from journal log on survived node")
463494
var journalBump int
464495
o.Eventually(func() error {
465-
journalOutput, err := exutil.DebugNodeRetryWithOptionsAndChroot(oc, survivedNode.Name, "openshift-etcd",
466-
"bash", "-c", fmt.Sprintf("journalctl -u pacemaker --since '%s' | grep 'bump-amount' | tail -1", crashTimestamp))
496+
journalOutput, _, err := core.ExecuteRemoteSSHCommand(survivedNodeIP,
497+
fmt.Sprintf("sudo journalctl -u pacemaker --since '%s' | grep 'bump-amount' | tail -1", crashTimestamp),
498+
&hypervisorConfig, localKH, survivedRemoteKH)
467499
if err != nil {
468500
return fmt.Errorf("failed to read journal: %v", err)
469501
}
@@ -482,8 +514,9 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
482514
g.By("Verifying force-new-cluster-bump-amount in config.yaml matches journal bump-amount")
483515
var configBump int
484516
o.Eventually(func() error {
485-
bumpAmountStr, err := exutil.DebugNodeRetryWithOptionsAndChroot(oc, survivedNode.Name, "openshift-etcd",
486-
"bash", "-c", "grep 'force-new-cluster-bump-amount:' /var/lib/etcd/config.yaml | awk '{print $2}'")
517+
bumpAmountStr, _, err := core.ExecuteRemoteSSHCommand(survivedNodeIP,
518+
"grep 'force-new-cluster-bump-amount:' /var/lib/etcd/config.yaml | awk '{print $2}'",
519+
&hypervisorConfig, localKH, survivedRemoteKH)
487520
if err != nil {
488521
return fmt.Errorf("failed to read bump amount: %v", err)
489522
}
@@ -497,8 +530,9 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
497530
fmt.Sprintf("config.yaml bump-amount %d should match journal bump-amount %d", configBump, journalBump))
498531

499532
g.By("Independently verifying bump amount is approximately floor(maxRaftIndex * 0.2)")
500-
raftIndexStr, err := exutil.DebugNodeRetryWithOptionsAndChroot(oc, survivedNode.Name, "openshift-etcd",
501-
"bash", "-c", "jq -r '.maxRaftIndex' /var/lib/etcd/revision.json")
533+
raftIndexStr, _, err := core.ExecuteRemoteSSHCommand(survivedNodeIP,
534+
"jq -r '.maxRaftIndex' /var/lib/etcd/revision.json",
535+
&hypervisorConfig, localKH, survivedRemoteKH)
502536
o.Expect(err).To(o.BeNil())
503537
maxRaftIndex, err := strconv.Atoi(strings.TrimSpace(raftIndexStr))
504538
o.Expect(err).To(o.BeNil())
@@ -521,22 +555,40 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
521555
memberPromotedVotingTimeout, utils.FiveSecondPollInterval)
522556

523557
g.By(fmt.Sprintf("Verifying etcd container is running on %s", targetNode.Name))
524-
got, err := exutil.DebugNodeRetryWithOptionsAndChroot(oc, targetNode.Name, "openshift-etcd",
525-
strings.Split(ensurePodmanEtcdContainerIsRunning, " ")...)
526-
o.Expect(err).To(o.BeNil())
527-
o.Expect(got).To(o.Equal("'true'"), fmt.Sprintf("expected etcd container running on %s", targetNode.Name))
558+
o.Eventually(func() error {
559+
got, _, err := core.ExecuteRemoteSSHCommand(targetNodeIP,
560+
ensurePodmanEtcdContainerIsRunning,
561+
&hypervisorConfig, localKH, targetRemoteKH)
562+
if err != nil {
563+
return fmt.Errorf("failed to inspect etcd container: %v", err)
564+
}
565+
if strings.TrimSpace(got) != "'true'" {
566+
return fmt.Errorf("etcd container not running on %s: got %s", targetNode.Name, got)
567+
}
568+
return nil
569+
}, 5*time.Minute, utils.FiveSecondPollInterval).ShouldNot(o.HaveOccurred(),
570+
fmt.Sprintf("expected etcd container running on %s", targetNode.Name))
528571

529572
g.By(fmt.Sprintf("Verifying etcd-previous container exists on %s", targetNode.Name))
530-
prevOutput, err := exutil.DebugNodeRetryWithOptionsAndChroot(oc, targetNode.Name, "openshift-etcd",
531-
"bash", "-c", "podman ps -a --format '{{.Names}}' | grep -m1 etcd-previous")
532-
o.Expect(err).To(o.BeNil(), fmt.Sprintf("expected etcd-previous container to exist on %s", targetNode.Name))
533-
o.Expect(strings.TrimSpace(prevOutput)).To(o.Equal("etcd-previous"),
534-
fmt.Sprintf("expected etcd-previous container on %s", targetNode.Name))
573+
o.Eventually(func() error {
574+
prevOutput, _, err := core.ExecuteRemoteSSHCommand(targetNodeIP,
575+
"podman ps -a --format '{{.Names}}' | grep -m1 etcd-previous",
576+
&hypervisorConfig, localKH, targetRemoteKH)
577+
if err != nil {
578+
return fmt.Errorf("etcd-previous container not found on %s: %v", targetNode.Name, err)
579+
}
580+
if strings.TrimSpace(prevOutput) != "etcd-previous" {
581+
return fmt.Errorf("expected etcd-previous container on %s, got %q", targetNode.Name, prevOutput)
582+
}
583+
return nil
584+
}, 5*time.Minute, utils.FiveSecondPollInterval).ShouldNot(o.HaveOccurred(),
585+
fmt.Sprintf("expected etcd-previous container to exist on %s", targetNode.Name))
535586

536587
g.By(fmt.Sprintf("Verifying pod.yaml was recreated on %s via pacemaker log", targetNode.Name))
537588
o.Eventually(func() error {
538-
_, err := exutil.DebugNodeRetryWithOptionsAndChroot(oc, targetNode.Name, "openshift-etcd",
539-
"bash", "-c", fmt.Sprintf("journalctl -u pacemaker --since '%s' --no-pager | grep -m1 -i 'a new working copy of /etc/kubernetes/static-pod-resources/etcd-certs/configmaps/external-etcd-pod/pod.yaml was created'", crashTimestamp))
589+
_, _, err := core.ExecuteRemoteSSHCommand(targetNodeIP,
590+
fmt.Sprintf("sudo journalctl -u pacemaker --since '%s' --no-pager | grep -m1 -i 'a new working copy of /etc/kubernetes/static-pod-resources/etcd-certs/configmaps/external-etcd-pod/pod.yaml was created'", crashTimestamp),
591+
&hypervisorConfig, localKH, targetRemoteKH)
540592
return err
541593
}, 5*time.Minute, utils.FiveSecondPollInterval).ShouldNot(o.HaveOccurred(),
542594
"Expected pacemaker log to contain pod.yaml recreation entry after reboot")

0 commit comments

Comments
 (0)