Skip to content

Commit 5b584f4

Browse files
committed
fix(ssh): harden reconnect and port-forward recovery
1 parent a53707e commit 5b584f4

10 files changed

Lines changed: 304 additions & 50 deletions

File tree

infra/sidecar/entrypoint.sh

Lines changed: 26 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -23,18 +23,20 @@ OKDEV_WORKSPACE_PATH="__OKDEV_WORKSPACE_PATH__"
2323
2424
# Find the dev container's PID 1 by looking for a process with a different root filesystem.
2525
# In a shared PID namespace, our PID 1 is the pause/sandbox container.
26-
# We look for the first "sleep infinity" or the init process of the dev container.
26+
# We pick the lowest-numbered PID whose root differs from ours (assumed to be in the dev container).
2727
DEV_PID=""
28-
for pid in $(ls /proc | grep -E '^[0-9]+$' | sort -n); do
28+
for pid in $(ls /proc 2>/dev/null | grep -E '^[0-9]+$' | sort -n); do
2929
[ "$pid" = "1" ] && continue
3030
[ "$pid" = "$$" ] && continue
31-
# Skip if we can't read the process
32-
[ -r "/proc/$pid/root" ] || continue
33-
# Check if this process has a different root than us (different container)
34-
if ! [ "/proc/$pid/root" -ef "/proc/self/root" ]; then
31+
# Use 2>/dev/null to suppress error if process disappears while we're checking.
32+
[ -r "/proc/$pid/root" ] 2>/dev/null || continue
33+
if ! [ "/proc/$pid/root" -ef "/proc/self/root" ] 2>/dev/null; then
3534
# Found a process in a different mount namespace — likely the dev container
36-
DEV_PID="$pid"
37-
break
35+
# Verify it still exists before we commit to it.
36+
if [ -d "/proc/$pid" ]; then
37+
DEV_PID="$pid"
38+
break
39+
fi
3840
fi
3941
done
4042
@@ -43,14 +45,16 @@ if [ -z "$DEV_PID" ]; then
4345
exit 1
4446
fi
4547
46-
# Ensure current GID exists in /etc/group inside the target to suppress
48+
# Ensure current GID exists in /etc/group inside BOTH the sidecar and the target to suppress
4749
# "groups: cannot find name for group ID ..." warnings from login shells.
4850
CURRENT_GID="$(id -g 2>/dev/null || true)"
4951
if [ -n "$CURRENT_GID" ]; then
52+
grep -q ":${CURRENT_GID}:" /etc/group 2>/dev/null || \
53+
echo "okdev:x:${CURRENT_GID}:" >> /etc/group 2>/dev/null || true
5054
nsenter --target "$DEV_PID" --mount -- sh -c "
5155
grep -q \":${CURRENT_GID}:\" /etc/group 2>/dev/null || \
5256
echo \"okdev:x:${CURRENT_GID}:\" >> /etc/group 2>/dev/null || true
53-
"
57+
" 2>/dev/null || true
5458
fi
5559
5660
# If a remote command was requested, execute it inside the dev container.
@@ -105,6 +109,18 @@ sed -i \
105109
/usr/local/bin/nsenter-dev.sh
106110
chmod +x /usr/local/bin/nsenter-dev.sh
107111

112+
# Harden sshd_config for long-lived idle sessions.
113+
# Server-side keepalive: probe every 30s, tolerate 10 misses (5min of dead connection).
114+
# This complements the client-side ServerAliveInterval and keeps intermediate
115+
# connections (kubectl port-forward, load balancers) alive with bidirectional traffic.
116+
if ! grep -q "ClientAliveInterval" /etc/ssh/sshd_config; then
117+
cat >> /etc/ssh/sshd_config << 'SSHD_KEEPALIVE'
118+
ClientAliveInterval 30
119+
ClientAliveCountMax 10
120+
TCPKeepAlive yes
121+
SSHD_KEEPALIVE
122+
fi
123+
108124
# Add ForceCommand to sshd_config dynamically
109125
if ! grep -q "ForceCommand" /etc/ssh/sshd_config; then
110126
echo "ForceCommand /usr/local/bin/nsenter-dev.sh" >> /etc/ssh/sshd_config

internal/cli/portforward_helper.go

Lines changed: 43 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import (
44
"context"
55
"fmt"
66
"io"
7+
"log/slog"
78
"net"
89
"strconv"
910
"strings"
@@ -28,7 +29,14 @@ func startManagedPortForwardWithClient(k interface {
2829
doneCh := make(chan struct{})
2930
go func() {
3031
defer close(doneCh)
31-
errCh <- k.PortForward(ctx, namespace, pod, forwards, io.Discard, io.Discard)
32+
slog.Debug("starting port-forward worker", "pod", pod, "forwards", forwards)
33+
err := k.PortForward(ctx, namespace, pod, forwards, io.Discard, io.Discard)
34+
if err != nil {
35+
slog.Debug("port-forward worker exited with error", "pod", pod, "error", err)
36+
} else {
37+
slog.Debug("port-forward worker exited normally", "pod", pod)
38+
}
39+
errCh <- err
3240
}()
3341
cancelAndWait := func() {
3442
cancel()
@@ -70,7 +78,14 @@ func startManagedPortForwardNoProbeWithClient(k interface {
7078
doneCh := make(chan struct{})
7179
go func() {
7280
defer close(doneCh)
73-
errCh <- k.PortForward(ctx, namespace, pod, forwards, io.Discard, io.Discard)
81+
slog.Debug("starting port-forward worker (no-probe)", "pod", pod, "forwards", forwards)
82+
err := k.PortForward(ctx, namespace, pod, forwards, io.Discard, io.Discard)
83+
if err != nil {
84+
slog.Debug("port-forward worker (no-probe) exited with error", "pod", pod, "error", err)
85+
} else {
86+
slog.Debug("port-forward worker (no-probe) exited normally", "pod", pod)
87+
}
88+
errCh <- err
7489
}()
7590
cancelAndWait := func() {
7691
cancel()
@@ -122,3 +137,29 @@ func allPortsReachable(ports []int) bool {
122137
}
123138
return true
124139
}
140+
141+
// startPortForwardKeepalive periodically dials the local port-forward endpoint
142+
// to create new SPDY streams on the underlying connection. Each TCP connection
143+
// triggers the port-forward handler to create a fresh stream pair (data+error),
144+
// which generates SYN/FIN frames visible to all intermediaries (load balancers,
145+
// API server proxies, WebSocket gateways) — keeping the connection alive even
146+
// when they don't count data on existing streams as "activity".
//
// NOTE(review): the stream-level effects described above happen in the
// port-forward handler, not in this function; the code here only opens and
// immediately closes a TCP connection to 127.0.0.1:localPort. Confirm against
// the port-forward implementation in use.
//
// The call returns immediately. Probing runs in a background goroutine that
// exits when ctx is done; the caller has no way to wait for that exit. The
// first probe is sent one interval after the call, not right away.
147+
func startPortForwardKeepalive(ctx context.Context, localPort int, interval time.Duration) {
148+
go func() {
149+
ticker := time.NewTicker(interval)
150+
defer ticker.Stop()
151+
for {
152+
select {
153+
case <-ctx.Done():
154+
return
155+
case <-ticker.C:
// Bounded dial (2s) so an unresponsive endpoint cannot block the loop indefinitely.
156+
conn, err := net.DialTimeout("tcp", fmt.Sprintf("127.0.0.1:%d", localPort), 2*time.Second)
157+
if err != nil {
// A failed probe is only logged at debug level; try again on the next tick.
158+
slog.Debug("port-forward keepalive dial failed", "port", localPort, "error", err)
159+
continue
160+
}
// Only the connect/close matters: no data is written. The Close error is deliberately ignored.
161+
_ = conn.Close()
162+
}
163+
}
164+
}()
165+
}

internal/cli/ports.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ func newPortsCmd(opts *Options) *cobra.Command {
5757
return fmt.Errorf("wait for sshd ready: %w", err)
5858
}
5959
alias := sshHostAlias(sn)
60-
changed, err := ensureSSHConfigEntry(alias, sn, ns, cfg.Spec.SSH.User, cfg.Spec.SSH.RemotePort, keyPath, cfgPath, cfg.Spec.Ports)
60+
changed, err := ensureSSHConfigEntry(alias, sn, ns, cfg.Spec.SSH.User, cfg.Spec.SSH.RemotePort, keyPath, cfgPath, cfg.Spec.Ports, cfg.Spec.SSH)
6161
if err != nil {
6262
return fmt.Errorf("update ~/.ssh/config for managed forwards: %w", err)
6363
}
@@ -72,7 +72,7 @@ func newPortsCmd(opts *Options) *cobra.Command {
7272
if running {
7373
_ = stopManagedSSHForward(alias)
7474
}
75-
if err := startManagedSSHForward(alias); err != nil {
75+
if err := startManagedSSHForwardWithForwards(alias, cfg.Spec.Ports, cfg.Spec.SSH); err != nil {
7676
return fmt.Errorf("start managed SSH forwards: %w", err)
7777
}
7878
if changed {

internal/cli/ssh.go

Lines changed: 68 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"errors"
66
"fmt"
77
"io"
8+
"log/slog"
89
"net"
910
"os"
1011
"os/exec"
@@ -78,9 +79,12 @@ func newSSHCmd(opts *Options) *cobra.Command {
7879
if err != nil {
7980
return err
8081
}
82+
pfKeepAliveCtx, pfKeepAliveCancel := context.WithCancel(cmd.Context())
83+
startPortForwardKeepalive(pfKeepAliveCtx, usedLocalPort, 30*time.Second)
8184
var pfMu sync.Mutex
8285
currentCancelPF := cancelPF
8386
defer func() {
87+
pfKeepAliveCancel()
8488
pfMu.Lock()
8589
defer pfMu.Unlock()
8690
if currentCancelPF != nil {
@@ -90,7 +94,7 @@ func newSSHCmd(opts *Options) *cobra.Command {
9094

9195
sshHost := sshHostAlias(sn)
9296
cfgPath, _ := config.ResolvePath(opts.ConfigPath)
93-
if _, cfgErr := ensureSSHConfigEntry(sshHost, sn, ns, user, remotePort, keyPath, cfgPath, cfg.Spec.Ports); cfgErr != nil {
97+
if _, cfgErr := ensureSSHConfigEntry(sshHost, sn, ns, user, remotePort, keyPath, cfgPath, cfg.Spec.Ports, cfg.Spec.SSH); cfgErr != nil {
9498
fmt.Fprintf(cmd.ErrOrStderr(), "warning: failed to update ~/.ssh/config: %v\n", cfgErr)
9599
}
96100

@@ -100,6 +104,7 @@ func newSSHCmd(opts *Options) *cobra.Command {
100104
RemotePort: remotePort,
101105
KeepAliveInterval: time.Duration(cfg.Spec.SSH.KeepAliveInterval) * time.Second,
102106
KeepAliveTimeout: time.Duration(cfg.Spec.SSH.KeepAliveTimeout) * time.Second,
107+
KeepAliveCountMax: cfg.Spec.SSH.KeepAliveCountMax,
103108
}
104109
if noTmux {
105110
tm.Env = map[string]string{"OKDEV_NO_TMUX": "1"}
@@ -123,11 +128,15 @@ func newSSHCmd(opts *Options) *cobra.Command {
123128
currentCancelPF()
124129
currentCancelPF = nil
125130
}
131+
pfKeepAliveCancel()
126132
cancel, lp, err := startSSHPortForwardWithFallback(newKubeClient(opts), ns, podName(sn), localPort, remotePort)
127133
if err != nil {
128134
return "", 0, err
129135
}
130136
currentCancelPF = cancel
137+
newCtx, newCancel := context.WithCancel(cmd.Context())
138+
pfKeepAliveCancel = newCancel
139+
startPortForwardKeepalive(newCtx, lp, 30*time.Second)
131140
return "127.0.0.1", lp, nil
132141
})
133142
var lastRTTWarnNanos atomic.Int64
@@ -196,14 +205,23 @@ func newSSHCmd(opts *Options) *cobra.Command {
196205
}
197206
return nil
198207
}
199-
if err := tm.OpenShell(); err != nil {
200-
if !tm.IsConnected() || isIgnorableProxyIOError(err) {
201-
fmt.Fprintln(cmd.ErrOrStderr(), "Connection lost. Session ended.")
202-
return nil
208+
// Shell loop: reconnect automatically when the connection drops.
209+
for {
210+
err := tm.OpenShell()
211+
if err == nil {
212+
return nil // clean exit (user typed exit/logout)
213+
}
214+
if isIgnorableProxyIOError(err) || !tm.IsConnected() {
215+
fmt.Fprintln(cmd.ErrOrStderr(), "\nConnection lost. Reconnecting...")
216+
if !tm.WaitConnected(cmd.Context()) {
217+
fmt.Fprintln(cmd.ErrOrStderr(), "Reconnect failed. Session ended.")
218+
return nil
219+
}
220+
fmt.Fprintln(cmd.ErrOrStderr(), "Reconnected.")
221+
continue
203222
}
204223
return fmt.Errorf("ssh shell failed: %w", err)
205224
}
206-
return nil
207225
},
208226
}
209227

@@ -313,7 +331,7 @@ func sshHostAlias(sessionName string) string {
313331
return "okdev-" + sessionName
314332
}
315333

316-
func ensureSSHConfigEntry(hostAlias, sessionName, namespace, user string, remotePort int, keyPath, okdevConfigPath string, forwards []config.PortMapping) (bool, error) {
334+
func ensureSSHConfigEntry(hostAlias, sessionName, namespace, user string, remotePort int, keyPath, okdevConfigPath string, forwards []config.PortMapping, sshSpec config.SSHSpec) (bool, error) {
317335
home, err := os.UserHomeDir()
318336
if err != nil {
319337
return false, err
@@ -341,15 +359,9 @@ func ensureSSHConfigEntry(hostAlias, sessionName, namespace, user string, remote
341359
" UserKnownHostsFile /dev/null",
342360
" ProxyCommand " + proxyCmd,
343361
}
344-
for _, p := range forwards {
345-
if p.Local <= 0 || p.Remote <= 0 {
346-
continue
347-
}
348-
blockLines = append(blockLines, fmt.Sprintf(" LocalForward %d 127.0.0.1:%d", p.Local, p.Remote))
349-
}
350362
blockLines = append(blockLines,
351-
" ServerAliveInterval 30",
352-
" ServerAliveCountMax 10",
363+
fmt.Sprintf(" ServerAliveInterval %d", sshSpec.KeepAliveInterval),
364+
fmt.Sprintf(" ServerAliveCountMax %d", sshSpec.KeepAliveCountMax),
353365
" TCPKeepAlive yes",
354366
" LogLevel ERROR",
355367
end,
@@ -449,14 +461,21 @@ func newSSHProxyCmd(opts *Options) *cobra.Command {
449461
if err != nil {
450462
return err
451463
}
464+
pfKeepAliveCtx, pfKeepAliveCancel := context.WithCancel(context.Background())
465+
startPortForwardKeepalive(pfKeepAliveCtx, usedLocalPort, 30*time.Second)
466+
defer pfKeepAliveCancel()
452467
if cancelPF != nil {
453468
defer cancelPF()
454469
}
470+
slog.Debug("ssh-proxy dialing local port-forward", "port", usedLocalPort)
455471
conn, err := waitDialLocal(usedLocalPort, 10*time.Second)
456472
if err != nil {
473+
slog.Debug("ssh-proxy dial failed", "error", err)
457474
return err
458475
}
459476
defer conn.Close()
477+
slog.Debug("ssh-proxy connection established", "localAddr", conn.LocalAddr(), "remoteAddr", conn.RemoteAddr())
478+
460479
var wg sync.WaitGroup
461480
var copyErr error
462481
var once sync.Once
@@ -465,13 +484,16 @@ func newSSHProxyCmd(opts *Options) *cobra.Command {
465484
if err == nil || errors.Is(err, io.EOF) || errors.Is(err, net.ErrClosed) || errors.Is(err, syscall.EPIPE) || isIgnorableProxyIOError(err) {
466485
return
467486
}
487+
slog.Debug("ssh-proxy copy error", "error", err)
468488
once.Do(func() { copyErr = err })
469489
}
470490
wg.Add(2)
471491
go func() {
472492
defer wg.Done()
493+
slog.Debug("ssh-proxy starting copy: stdin -> conn")
473494
_, err := io.Copy(conn, os.Stdin)
474495
setErr(err)
496+
slog.Debug("ssh-proxy finished copy: stdin -> conn")
475497
select {
476498
case <-done:
477499
default:
@@ -480,12 +502,15 @@ func newSSHProxyCmd(opts *Options) *cobra.Command {
480502
}()
481503
go func() {
482504
defer wg.Done()
505+
slog.Debug("ssh-proxy starting copy: conn -> stdout")
483506
_, err := io.Copy(os.Stdout, conn)
484507
setErr(err)
508+
slog.Debug("ssh-proxy finished copy: conn -> stdout")
485509
close(done)
486510
_ = conn.Close()
487511
}()
488512
<-done
513+
slog.Debug("ssh-proxy session finished", "error", copyErr)
489514
return copyErr
490515
},
491516
}
@@ -500,7 +525,8 @@ func isIgnorableProxyIOError(err error) bool {
500525
msg := strings.ToLower(err.Error())
501526
return strings.Contains(msg, "broken pipe") ||
502527
strings.Contains(msg, "use of closed network connection") ||
503-
strings.Contains(msg, "connection reset by peer")
528+
strings.Contains(msg, "connection reset by peer") ||
529+
strings.Contains(msg, "remote command exited without exit status")
504530
}
505531

506532
func waitDialLocal(localPort int, timeout time.Duration) (net.Conn, error) {
@@ -533,7 +559,7 @@ func sshControlSocketPath(hostAlias string) (string, error) {
533559
return filepath.Join(dir, hostAlias+".sock"), nil
534560
}
535561

536-
func startManagedSSHForward(hostAlias string) error {
562+
func startManagedSSHForward(hostAlias string, sshSpec config.SSHSpec) error {
537563
socketPath, err := sshControlSocketPath(hostAlias)
538564
if err != nil {
539565
return err
@@ -542,19 +568,38 @@ func startManagedSSHForward(hostAlias string) error {
542568
if err := check.Run(); err == nil {
543569
return nil
544570
}
545-
cmd := exec.Command(
571+
return startManagedSSHForwardWithForwards(hostAlias, nil, sshSpec)
572+
}
573+
574+
func startManagedSSHForwardWithForwards(hostAlias string, forwards []config.PortMapping, sshSpec config.SSHSpec) error {
575+
socketPath, err := sshControlSocketPath(hostAlias)
576+
if err != nil {
577+
return err
578+
}
579+
check := exec.Command("ssh", "-S", socketPath, "-O", "check", hostAlias)
580+
if err := check.Run(); err == nil {
581+
return nil
582+
}
583+
args := []string{
546584
"ssh",
547585
"-fN",
548586
"-M",
549587
"-S", socketPath,
550-
"-o", "ControlPersist=600",
588+
"-o", "ControlPersist=3600",
551589
"-o", "ExitOnForwardFailure=no",
552-
"-o", "ServerAliveInterval=30",
553-
"-o", "ServerAliveCountMax=10",
590+
"-o", fmt.Sprintf("ServerAliveInterval=%d", sshSpec.KeepAliveInterval),
591+
"-o", fmt.Sprintf("ServerAliveCountMax=%d", sshSpec.KeepAliveCountMax),
554592
"-o", "TCPKeepAlive=yes",
555593
"-o", "LogLevel=ERROR",
556-
hostAlias,
557-
)
594+
}
595+
for _, p := range forwards {
596+
if p.Local <= 0 || p.Remote <= 0 {
597+
continue
598+
}
599+
args = append(args, "-L", fmt.Sprintf("%d:127.0.0.1:%d", p.Local, p.Remote))
600+
}
601+
args = append(args, hostAlias)
602+
cmd := exec.Command(args[0], args[1:]...)
558603
if out, err := cmd.CombinedOutput(); err != nil {
559604
return fmt.Errorf("start managed ssh forward: %w (%s)", err, strings.TrimSpace(string(out)))
560605
}

0 commit comments

Comments
 (0)