@@ -406,6 +406,196 @@ func TestResolveHostKeyCallbackNoKnownHosts(t *testing.T) {
406406 }
407407}
408408
409+ // TestSSHJumpHost_BurstCloseDoesNotDropExecReply is a focused regression
410+ // test for the race where an upstream that replies + writes data + sends
411+ // exit-status + closes in one burst (no sleep between exit-status and
412+ // close) caused sluice to close srcChan before the agent-to-upstream
413+ // forwarder finished writing the SUCCESS reply for the agent's
414+ // session.SendRequest("exec", true, ...). gossh closes ch.msg on
415+ // SSH_MSG_CHANNEL_CLOSE, so the blocked SendRequest returns io.EOF and
416+ // session.Output(...) fails with "exec command via SSH: EOF".
417+ //
418+ // startTestSSHServer (used by other tests) papers over the race with a
419+ // 50ms sleep before returning from the channel handler. This test
420+ // spins up its own burst-close server with no such sleep, so the race
421+ // is deterministically triggered without the inflightBarrier fix.
422+ func TestSSHJumpHost_BurstCloseDoesNotDropExecReply (t * testing.T ) {
423+ pubKey , privPEM := generateTestSSHKey (t )
424+ dir := t .TempDir ()
425+ store , err := vault .NewStore (dir )
426+ if err != nil {
427+ t .Fatal (err )
428+ }
429+ if _ , err := store .Add ("ssh_key" , string (privPEM )); err != nil {
430+ t .Fatal (err )
431+ }
432+
433+ sshServer := startBurstCloseSSHServer (t , pubKey )
434+ defer func () { _ = sshServer .Close () }()
435+
436+ proxyHostKey , err := GenerateSSHHostKey ()
437+ if err != nil {
438+ t .Fatal (err )
439+ }
440+
441+ binding := vault.Binding {
442+ Credential : "ssh_key" ,
443+ Template : "testuser" ,
444+ Protocols : []string {"ssh" },
445+ }
446+
447+ jumpHost := NewSSHJumpHost (store , proxyHostKey )
448+ jumpHost .HostKeyCallback = ssh .InsecureIgnoreHostKey ()
449+
450+ // Run the test many times in a single process to maximize the
451+ // chance the close race fires if the fix regresses. Each iteration
452+ // runs through a fresh proxy connection + fresh agent session.
453+ const iterations = 50
454+ for i := 0 ; i < iterations ; i ++ {
455+ agentConn , proxyConn := tcpConnPair (t )
456+
457+ ready := make (chan error , 1 )
458+ errCh := make (chan error , 1 )
459+ go func () {
460+ errCh <- jumpHost .HandleConnection (proxyConn , []string {sshServer .Addr ().String ()}, sshServer .Addr ().String (), binding , ready )
461+ }()
462+
463+ if setupErr := <- ready ; setupErr != nil {
464+ t .Fatalf ("iter %d: handler setup: %v" , i , setupErr )
465+ }
466+
467+ agentSSH , agentChans , agentReqs , err := ssh .NewClientConn (agentConn , "proxy" , & ssh.ClientConfig {
468+ User : "ignored" ,
469+ HostKeyCallback : ssh .InsecureIgnoreHostKey (),
470+ })
471+ if err != nil {
472+ t .Fatalf ("iter %d: agent SSH handshake: %v" , i , err )
473+ }
474+
475+ client := ssh .NewClient (agentSSH , agentChans , agentReqs )
476+ session , err := client .NewSession ()
477+ if err != nil {
478+ t .Fatalf ("iter %d: open session: %v" , i , err )
479+ }
480+
481+ output , err := session .Output ("whoami" )
482+ if err != nil {
483+ t .Fatalf ("iter %d: exec: %v (this is the EOF symptom of the close race)" , i , err )
484+ }
485+ if string (output ) != "ssh-injection-ok\n " {
486+ t .Errorf ("iter %d: expected 'ssh-injection-ok', got %q" , i , string (output ))
487+ }
488+ _ = session .Close ()
489+ _ = client .Close ()
490+ _ = agentSSH .Close ()
491+ _ = agentConn .Close ()
492+
493+ // Wait for HandleConnection to return so a leaked handler
494+ // goroutine (or a connection that fails to teardown after
495+ // close) surfaces as a test timeout rather than as silent
496+ // resource exhaustion on the next iteration. HandleConnection
497+ // returns nil on graceful agent disconnect; a non-nil error
498+ // here would mean the teardown path produced an unexpected
499+ // failure that a future regression could mask.
500+ select {
501+ case handlerErr := <- errCh :
502+ if handlerErr != nil {
503+ t .Fatalf ("iter %d: HandleConnection returned error: %v" , i , handlerErr )
504+ }
505+ case <- time .After (5 * time .Second ):
506+ t .Fatalf ("iter %d: HandleConnection did not return within 5s after close" , i )
507+ }
508+ }
509+ }
510+
511+ // startBurstCloseSSHServer is a test SSH server that, on the first exec
512+ // request, replies + writes output + sends exit-status + closes the
513+ // channel with no delay between exit-status and Close. The lack of any
514+ // sleep is intentional: it deterministically triggers the close race
515+ // in sluice's SSH jump host when the inflightBarrier fix is absent.
516+ func startBurstCloseSSHServer (t * testing.T , authorizedKey ssh.PublicKey ) net.Listener {
517+ t .Helper ()
518+
519+ key , err := ecdsa .GenerateKey (elliptic .P256 (), rand .Reader )
520+ if err != nil {
521+ t .Fatal (err )
522+ }
523+ hostSigner , err := ssh .NewSignerFromKey (key )
524+ if err != nil {
525+ t .Fatal (err )
526+ }
527+
528+ config := & ssh.ServerConfig {
529+ PublicKeyCallback : func (_ ssh.ConnMetadata , pubKey ssh.PublicKey ) (* ssh.Permissions , error ) {
530+ if bytes .Equal (pubKey .Marshal (), authorizedKey .Marshal ()) {
531+ return & ssh.Permissions {}, nil
532+ }
533+ return nil , fmt .Errorf ("unknown public key" )
534+ },
535+ }
536+ config .AddHostKey (hostSigner )
537+
538+ ln , err := net .Listen ("tcp" , "127.0.0.1:0" )
539+ if err != nil {
540+ t .Fatal (err )
541+ }
542+
543+ go func () {
544+ for {
545+ conn , err := ln .Accept ()
546+ if err != nil {
547+ return
548+ }
549+ go func (c net.Conn ) {
550+ sshConn , chans , reqs , err := ssh .NewServerConn (c , config )
551+ if err != nil {
552+ _ = c .Close ()
553+ return
554+ }
555+ defer func () { _ = sshConn .Close () }()
556+ go ssh .DiscardRequests (reqs )
557+ for newChan := range chans {
558+ if newChan .ChannelType () != "session" {
559+ _ = newChan .Reject (ssh .UnknownChannelType , "unsupported" )
560+ continue
561+ }
562+ ch , chReqs , err := newChan .Accept ()
563+ if err != nil {
564+ continue
565+ }
566+ go func (ch ssh.Channel , reqs <- chan * ssh.Request ) {
567+ // Defer close so a request loop that exits without
568+ // hitting the exec path (early agent close,
569+ // non-exec request only) still releases the
570+ // server-side channel.
571+ defer func () { _ = ch .Close () }()
572+ for req := range reqs {
573+ if req .Type != "exec" {
574+ if req .WantReply {
575+ _ = req .Reply (false , nil )
576+ }
577+ continue
578+ }
579+ if req .WantReply {
580+ _ = req .Reply (true , nil )
581+ }
582+ _ , _ = ch .Write ([]byte ("ssh-injection-ok\n " ))
583+ _ , _ = ch .SendRequest ("exit-status" , false , ssh .Marshal (struct { Status uint32 }{0 }))
584+ _ = ch .CloseWrite ()
585+ // NO time.Sleep here. This is the whole point of
586+ // the test: close immediately after exit-status
587+ // to maximally tighten the race window in
588+ // sluice's sshHandleChannel.
589+ return
590+ }
591+ }(ch , chReqs )
592+ }
593+ }(conn )
594+ }
595+ }()
596+ return ln
597+ }
598+
409599// TestGenerateSSHHostKey tests SSH host key generation.
410600func TestGenerateSSHHostKey (t * testing.T ) {
411601 signer , err := GenerateSSHHostKey ()
0 commit comments