@@ -421,6 +421,32 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
421421 // Requires resource-agents >= 4.10.0-71.el9_6.13 (RHEL 9) or >= 4.16.0-33.el10 (RHEL 10).
422422 survivedNode := peerNode
423423
424+ // Set up two-hop SSH (local → hypervisor → node) for post-panic verification.
425+ // After a kernel panic, the Kubernetes API is unstable for minutes, making oc debug
426+ // unreliable. SSH via the hypervisor bypasses the API entirely.
427+ if ! exutil .HasHypervisorConfig () {
428+ g .Skip ("Hypervisor SSH config required for kernel panic verification" )
429+ }
430+ sshCfg := exutil .GetHypervisorConfig ()
431+ o .Expect (sshCfg ).NotTo (o .BeNil (), "Failed to parse hypervisor config" )
432+ hypervisorConfig := core.SSHConfig {
433+ IP : sshCfg .HypervisorIP ,
434+ User : sshCfg .SSHUser ,
435+ PrivateKeyPath : sshCfg .PrivateKeyPath ,
436+ }
437+ localKH , err := core .PrepareLocalKnownHostsFile (& hypervisorConfig )
438+ o .Expect (err ).NotTo (o .HaveOccurred (), "Failed to prepare local known hosts" )
439+
440+ survivedNodeIP := utils .GetNodeInternalIP (& survivedNode )
441+ o .Expect (survivedNodeIP ).NotTo (o .BeEmpty (), "survived node has no internal IP" )
442+ targetNodeIP := utils .GetNodeInternalIP (& targetNode )
443+ o .Expect (targetNodeIP ).NotTo (o .BeEmpty (), "target node has no internal IP" )
444+
445+ survivedRemoteKH , err := core .PrepareRemoteKnownHostsFile (survivedNodeIP , & hypervisorConfig , localKH )
446+ o .Expect (err ).NotTo (o .HaveOccurred (), "Failed to prepare remote known hosts for survived node" )
447+ targetRemoteKH , err := core .PrepareRemoteKnownHostsFile (targetNodeIP , & hypervisorConfig , localKH )
448+ o .Expect (err ).NotTo (o .HaveOccurred (), "Failed to prepare remote known hosts for target node" )
449+
424450 g .By ("Logging resource-agents RPM version" )
425451 raVersion , err := exutil .DebugNodeRetryWithOptionsAndChroot (oc , survivedNode .Name , "openshift-etcd" ,
426452 "bash" , "-c" , "rpm -q resource-agents" )
@@ -462,8 +488,9 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
462488 g .By ("Reading bump-amount from journal log on survived node" )
463489 var journalBump int
464490 o .Eventually (func () error {
465- journalOutput , err := exutil .DebugNodeRetryWithOptionsAndChroot (oc , survivedNode .Name , "openshift-etcd" ,
466- "bash" , "-c" , fmt .Sprintf ("journalctl -u pacemaker --since '%s' | grep 'bump-amount' | tail -1" , crashTimestamp ))
491+ journalOutput , _ , err := core .ExecuteRemoteSSHCommand (survivedNodeIP ,
492+ fmt .Sprintf ("sudo journalctl -u pacemaker --since '%s' | grep 'bump-amount' | tail -1" , crashTimestamp ),
493+ & hypervisorConfig , localKH , survivedRemoteKH )
467494 if err != nil {
468495 return fmt .Errorf ("failed to read journal: %v" , err )
469496 }
@@ -482,8 +509,9 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
482509 g .By ("Verifying force-new-cluster-bump-amount in config.yaml matches journal bump-amount" )
483510 var configBump int
484511 o .Eventually (func () error {
485- bumpAmountStr , err := exutil .DebugNodeRetryWithOptionsAndChroot (oc , survivedNode .Name , "openshift-etcd" ,
486- "bash" , "-c" , "grep 'force-new-cluster-bump-amount:' /var/lib/etcd/config.yaml | awk '{print $2}'" )
512+ bumpAmountStr , _ , err := core .ExecuteRemoteSSHCommand (survivedNodeIP ,
513+ "grep 'force-new-cluster-bump-amount:' /var/lib/etcd/config.yaml | awk '{print $2}'" ,
514+ & hypervisorConfig , localKH , survivedRemoteKH )
487515 if err != nil {
488516 return fmt .Errorf ("failed to read bump amount: %v" , err )
489517 }
@@ -497,8 +525,9 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
497525 fmt .Sprintf ("config.yaml bump-amount %d should match journal bump-amount %d" , configBump , journalBump ))
498526
499527 g .By ("Independently verifying bump amount is approximately floor(maxRaftIndex * 0.2)" )
500- raftIndexStr , err := exutil .DebugNodeRetryWithOptionsAndChroot (oc , survivedNode .Name , "openshift-etcd" ,
501- "bash" , "-c" , "jq -r '.maxRaftIndex' /var/lib/etcd/revision.json" )
528+ raftIndexStr , _ , err := core .ExecuteRemoteSSHCommand (survivedNodeIP ,
529+ "jq -r '.maxRaftIndex' /var/lib/etcd/revision.json" ,
530+ & hypervisorConfig , localKH , survivedRemoteKH )
502531 o .Expect (err ).To (o .BeNil ())
503532 maxRaftIndex , err := strconv .Atoi (strings .TrimSpace (raftIndexStr ))
504533 o .Expect (err ).To (o .BeNil ())
@@ -521,22 +550,40 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
521550 memberPromotedVotingTimeout , utils .FiveSecondPollInterval )
522551
523552 g .By (fmt .Sprintf ("Verifying etcd container is running on %s" , targetNode .Name ))
524- got , err := exutil .DebugNodeRetryWithOptionsAndChroot (oc , targetNode .Name , "openshift-etcd" ,
525- strings .Split (ensurePodmanEtcdContainerIsRunning , " " )... )
526- o .Expect (err ).To (o .BeNil ())
527- o .Expect (got ).To (o .Equal ("'true'" ), fmt .Sprintf ("expected etcd container running on %s" , targetNode .Name ))
553+ o .Eventually (func () error {
554+ got , _ , err := core .ExecuteRemoteSSHCommand (targetNodeIP ,
555+ ensurePodmanEtcdContainerIsRunning ,
556+ & hypervisorConfig , localKH , targetRemoteKH )
557+ if err != nil {
558+ return fmt .Errorf ("failed to inspect etcd container: %v" , err )
559+ }
560+ if strings .TrimSpace (got ) != "'true'" {
561+ return fmt .Errorf ("etcd container not running on %s: got %s" , targetNode .Name , got )
562+ }
563+ return nil
564+ }, 5 * time .Minute , utils .FiveSecondPollInterval ).ShouldNot (o .HaveOccurred (),
565+ fmt .Sprintf ("expected etcd container running on %s" , targetNode .Name ))
528566
529567 g .By (fmt .Sprintf ("Verifying etcd-previous container exists on %s" , targetNode .Name ))
530- prevOutput , err := exutil .DebugNodeRetryWithOptionsAndChroot (oc , targetNode .Name , "openshift-etcd" ,
531- "bash" , "-c" , "podman ps -a --format '{{.Names}}' | grep -m1 etcd-previous" )
532- o .Expect (err ).To (o .BeNil (), fmt .Sprintf ("expected etcd-previous container to exist on %s" , targetNode .Name ))
533- o .Expect (strings .TrimSpace (prevOutput )).To (o .Equal ("etcd-previous" ),
534- fmt .Sprintf ("expected etcd-previous container on %s" , targetNode .Name ))
568+ o .Eventually (func () error {
569+ prevOutput , _ , err := core .ExecuteRemoteSSHCommand (targetNodeIP ,
570+ "podman ps -a --format '{{.Names}}' | grep -m1 etcd-previous" ,
571+ & hypervisorConfig , localKH , targetRemoteKH )
572+ if err != nil {
573+ return fmt .Errorf ("etcd-previous container not found on %s: %v" , targetNode .Name , err )
574+ }
575+ if strings .TrimSpace (prevOutput ) != "etcd-previous" {
576+ return fmt .Errorf ("expected etcd-previous container on %s, got %q" , targetNode .Name , prevOutput )
577+ }
578+ return nil
579+ }, 5 * time .Minute , utils .FiveSecondPollInterval ).ShouldNot (o .HaveOccurred (),
580+ fmt .Sprintf ("expected etcd-previous container to exist on %s" , targetNode .Name ))
535581
536582 g .By (fmt .Sprintf ("Verifying pod.yaml was recreated on %s via pacemaker log" , targetNode .Name ))
537583 o .Eventually (func () error {
538- _ , err := exutil .DebugNodeRetryWithOptionsAndChroot (oc , targetNode .Name , "openshift-etcd" ,
539- "bash" , "-c" , fmt .Sprintf ("journalctl -u pacemaker --since '%s' --no-pager | grep -m1 -i 'a new working copy of /etc/kubernetes/static-pod-resources/etcd-certs/configmaps/external-etcd-pod/pod.yaml was created'" , crashTimestamp ))
584+ _ , _ , err := core .ExecuteRemoteSSHCommand (targetNodeIP ,
585+ fmt .Sprintf ("sudo journalctl -u pacemaker --since '%s' --no-pager | grep -m1 -i 'a new working copy of /etc/kubernetes/static-pod-resources/etcd-certs/configmaps/external-etcd-pod/pod.yaml was created'" , crashTimestamp ),
586+ & hypervisorConfig , localKH , targetRemoteKH )
540587 return err
541588 }, 5 * time .Minute , utils .FiveSecondPollInterval ).ShouldNot (o .HaveOccurred (),
542589 "Expected pacemaker log to contain pod.yaml recreation entry after reboot" )
0 commit comments