@@ -421,6 +421,37 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
421421 // Requires resource-agents >= 4.10.0-71.el9_6.13 (RHEL 9) or >= 4.16.0-33.el10 (RHEL 10).
422422 survivedNode := peerNode
423423
424+ // Set up two-hop SSH (local → hypervisor → node) for post-panic verification.
425+ // After kernel panic the Kubernetes API is unstable for minutes, making oc debug
426+ // unreliable. SSH via the hypervisor bypasses the API entirely.
427+ if ! exutil .HasHypervisorConfig () {
428+ g .Skip ("Hypervisor SSH config required for kernel panic verification" )
429+ }
430+ sshCfg := exutil .GetHypervisorConfig ()
431+ o .Expect (sshCfg ).NotTo (o .BeNil (), "Failed to parse hypervisor config" )
432+ o .Expect (sshCfg .HypervisorIP ).NotTo (o .BeEmpty (), "Hypervisor IP is empty" )
433+ o .Expect (sshCfg .SSHUser ).NotTo (o .BeEmpty (), "Hypervisor SSH user is empty" )
434+ o .Expect (sshCfg .PrivateKeyPath ).NotTo (o .BeEmpty (), "Hypervisor private key path is empty" )
435+ _ , err := os .Stat (sshCfg .PrivateKeyPath )
436+ o .Expect (err ).NotTo (o .HaveOccurred (), "Hypervisor private key not readable at %s" , sshCfg .PrivateKeyPath )
437+ hypervisorConfig := core.SSHConfig {
438+ IP : sshCfg .HypervisorIP ,
439+ User : sshCfg .SSHUser ,
440+ PrivateKeyPath : sshCfg .PrivateKeyPath ,
441+ }
442+ localKH , err := core .PrepareLocalKnownHostsFile (& hypervisorConfig )
443+ o .Expect (err ).NotTo (o .HaveOccurred (), "Failed to prepare local known hosts" )
444+
445+ survivedNodeIP := utils .GetNodeInternalIP (& survivedNode )
446+ o .Expect (survivedNodeIP ).NotTo (o .BeEmpty (), "survived node has no internal IP" )
447+ targetNodeIP := utils .GetNodeInternalIP (& targetNode )
448+ o .Expect (targetNodeIP ).NotTo (o .BeEmpty (), "target node has no internal IP" )
449+
450+ survivedRemoteKH , err := core .PrepareRemoteKnownHostsFile (survivedNodeIP , & hypervisorConfig , localKH )
451+ o .Expect (err ).NotTo (o .HaveOccurred (), "Failed to prepare remote known hosts for survived node" )
452+ targetRemoteKH , err := core .PrepareRemoteKnownHostsFile (targetNodeIP , & hypervisorConfig , localKH )
453+ o .Expect (err ).NotTo (o .HaveOccurred (), "Failed to prepare remote known hosts for target node" )
454+
424455 g .By ("Logging resource-agents RPM version" )
425456 raVersion , err := exutil .DebugNodeRetryWithOptionsAndChroot (oc , survivedNode .Name , "openshift-etcd" ,
426457 "bash" , "-c" , "rpm -q resource-agents" )
@@ -462,8 +493,9 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
462493 g .By ("Reading bump-amount from journal log on survived node" )
463494 var journalBump int
464495 o .Eventually (func () error {
465- journalOutput , err := exutil .DebugNodeRetryWithOptionsAndChroot (oc , survivedNode .Name , "openshift-etcd" ,
466- "bash" , "-c" , fmt .Sprintf ("journalctl -u pacemaker --since '%s' | grep 'bump-amount' | tail -1" , crashTimestamp ))
496+ journalOutput , _ , err := core .ExecuteRemoteSSHCommand (survivedNodeIP ,
497+ fmt .Sprintf ("sudo journalctl -u pacemaker --since '%s' | grep 'bump-amount' | tail -1" , crashTimestamp ),
498+ & hypervisorConfig , localKH , survivedRemoteKH )
467499 if err != nil {
468500 return fmt .Errorf ("failed to read journal: %v" , err )
469501 }
@@ -482,8 +514,9 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
482514 g .By ("Verifying force-new-cluster-bump-amount in config.yaml matches journal bump-amount" )
483515 var configBump int
484516 o .Eventually (func () error {
485- bumpAmountStr , err := exutil .DebugNodeRetryWithOptionsAndChroot (oc , survivedNode .Name , "openshift-etcd" ,
486- "bash" , "-c" , "grep 'force-new-cluster-bump-amount:' /var/lib/etcd/config.yaml | awk '{print $2}'" )
517+ bumpAmountStr , _ , err := core .ExecuteRemoteSSHCommand (survivedNodeIP ,
518+ "grep 'force-new-cluster-bump-amount:' /var/lib/etcd/config.yaml | awk '{print $2}'" ,
519+ & hypervisorConfig , localKH , survivedRemoteKH )
487520 if err != nil {
488521 return fmt .Errorf ("failed to read bump amount: %v" , err )
489522 }
@@ -497,8 +530,9 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
497530 fmt .Sprintf ("config.yaml bump-amount %d should match journal bump-amount %d" , configBump , journalBump ))
498531
499532 g .By ("Independently verifying bump amount is approximately floor(maxRaftIndex * 0.2)" )
500- raftIndexStr , err := exutil .DebugNodeRetryWithOptionsAndChroot (oc , survivedNode .Name , "openshift-etcd" ,
501- "bash" , "-c" , "jq -r '.maxRaftIndex' /var/lib/etcd/revision.json" )
533+ raftIndexStr , _ , err := core .ExecuteRemoteSSHCommand (survivedNodeIP ,
534+ "jq -r '.maxRaftIndex' /var/lib/etcd/revision.json" ,
535+ & hypervisorConfig , localKH , survivedRemoteKH )
502536 o .Expect (err ).To (o .BeNil ())
503537 maxRaftIndex , err := strconv .Atoi (strings .TrimSpace (raftIndexStr ))
504538 o .Expect (err ).To (o .BeNil ())
@@ -521,22 +555,40 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
521555 memberPromotedVotingTimeout , utils .FiveSecondPollInterval )
522556
523557 g .By (fmt .Sprintf ("Verifying etcd container is running on %s" , targetNode .Name ))
524- got , err := exutil .DebugNodeRetryWithOptionsAndChroot (oc , targetNode .Name , "openshift-etcd" ,
525- strings .Split (ensurePodmanEtcdContainerIsRunning , " " )... )
526- o .Expect (err ).To (o .BeNil ())
527- o .Expect (got ).To (o .Equal ("'true'" ), fmt .Sprintf ("expected etcd container running on %s" , targetNode .Name ))
558+ o .Eventually (func () error {
559+ got , _ , err := core .ExecuteRemoteSSHCommand (targetNodeIP ,
560+ ensurePodmanEtcdContainerIsRunning ,
561+ & hypervisorConfig , localKH , targetRemoteKH )
562+ if err != nil {
563+ return fmt .Errorf ("failed to inspect etcd container: %v" , err )
564+ }
565+ if strings .TrimSpace (got ) != "'true'" {
566+ return fmt .Errorf ("etcd container not running on %s: got %s" , targetNode .Name , got )
567+ }
568+ return nil
569+ }, 5 * time .Minute , utils .FiveSecondPollInterval ).ShouldNot (o .HaveOccurred (),
570+ fmt .Sprintf ("expected etcd container running on %s" , targetNode .Name ))
528571
529572 g .By (fmt .Sprintf ("Verifying etcd-previous container exists on %s" , targetNode .Name ))
530- prevOutput , err := exutil .DebugNodeRetryWithOptionsAndChroot (oc , targetNode .Name , "openshift-etcd" ,
531- "bash" , "-c" , "podman ps -a --format '{{.Names}}' | grep -m1 etcd-previous" )
532- o .Expect (err ).To (o .BeNil (), fmt .Sprintf ("expected etcd-previous container to exist on %s" , targetNode .Name ))
533- o .Expect (strings .TrimSpace (prevOutput )).To (o .Equal ("etcd-previous" ),
534- fmt .Sprintf ("expected etcd-previous container on %s" , targetNode .Name ))
573+ o .Eventually (func () error {
574+ prevOutput , _ , err := core .ExecuteRemoteSSHCommand (targetNodeIP ,
575+ "podman ps -a --format '{{.Names}}' | grep -m1 etcd-previous" ,
576+ & hypervisorConfig , localKH , targetRemoteKH )
577+ if err != nil {
578+ return fmt .Errorf ("etcd-previous container not found on %s: %v" , targetNode .Name , err )
579+ }
580+ if strings .TrimSpace (prevOutput ) != "etcd-previous" {
581+ return fmt .Errorf ("expected etcd-previous container on %s, got %q" , targetNode .Name , prevOutput )
582+ }
583+ return nil
584+ }, 5 * time .Minute , utils .FiveSecondPollInterval ).ShouldNot (o .HaveOccurred (),
585+ fmt .Sprintf ("expected etcd-previous container to exist on %s" , targetNode .Name ))
535586
536587 g .By (fmt .Sprintf ("Verifying pod.yaml was recreated on %s via pacemaker log" , targetNode .Name ))
537588 o .Eventually (func () error {
538- _ , err := exutil .DebugNodeRetryWithOptionsAndChroot (oc , targetNode .Name , "openshift-etcd" ,
539- "bash" , "-c" , fmt .Sprintf ("journalctl -u pacemaker --since '%s' --no-pager | grep -m1 -i 'a new working copy of /etc/kubernetes/static-pod-resources/etcd-certs/configmaps/external-etcd-pod/pod.yaml was created'" , crashTimestamp ))
589+ _ , _ , err := core .ExecuteRemoteSSHCommand (targetNodeIP ,
590+ fmt .Sprintf ("sudo journalctl -u pacemaker --since '%s' --no-pager | grep -m1 -i 'a new working copy of /etc/kubernetes/static-pod-resources/etcd-certs/configmaps/external-etcd-pod/pod.yaml was created'" , crashTimestamp ),
591+ & hypervisorConfig , localKH , targetRemoteKH )
540592 return err
541593 }, 5 * time .Minute , utils .FiveSecondPollInterval ).ShouldNot (o .HaveOccurred (),
542594 "Expected pacemaker log to contain pod.yaml recreation entry after reboot" )
0 commit comments