Skip to content

Commit 1ae20a6

Browse files
acmoreclaude
andauthored
feat(config): omit sidecar image from generated config (#54)
* feat(ssh): detect and notify user on dev container restart When the dev container restarts (OOM, crash), okdev ssh now detects the restart by comparing the container's RestartCount before and after connection loss. Instead of burning through the reconnect timeout against a dead sshd, it immediately tells the user what happened: Container restarted (OOMKilled). tmux sessions and running processes were lost. Run "okdev ssh" to reconnect after the container is ready. If the container is in CrashLoopBackOff, that status is also shown. Adds GetContainerRestartInfo to the kube client which reads per-container restart count and last termination state from the Kubernetes API. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * refactor(config): omit sidecar image from generated config Let SetDefaults() resolve the sidecar image at runtime based on the binary version instead of baking it into .okdev.yaml during init. Users who skip sidecar image upgrades automatically get the correct version without a stale tag in their config. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * fix(cli): skip reuse of terminating pod during okdev up WorkloadRef() returns the Kubernetes API kind "Pod" (capital P) but the reuse check compared against workload.TypePod ("pod" lowercase), so the terminating/terminal pod guard was never reached. Use case-insensitive comparison and add regression tests covering kind casing variants. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 145e303 commit 1ae20a6

9 files changed

Lines changed: 257 additions & 24 deletions

File tree

internal/cli/prompt.go

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,6 @@ func (o InitOverrides) hasWorkloadType() bool { return o.WorkloadType != "" }
7979
func (o InitOverrides) hasManifestPath() bool { return o.ManifestPath != "" }
8080
func (o InitOverrides) hasInjectPaths() bool { return len(o.InjectPaths) > 0 }
8181
func (o InitOverrides) hasGenericPreset() bool { return o.GenericPreset != "" }
82-
func (o InitOverrides) hasSidecarImage() bool { return o.SidecarImage != "" }
8382
func (o InitOverrides) hasSyncLocal() bool { return o.SyncLocal != "" }
8483
func (o InitOverrides) hasSyncRemote() bool { return o.SyncRemote != "" }
8584
func (o InitOverrides) hasSSHUser() bool { return o.SSHUser != "" }
@@ -162,16 +161,6 @@ func promptInteractive(vars *config.TemplateVars, overrides InitOverrides, in io
162161
}
163162
}
164163

165-
if !overrides.hasSidecarImage() {
166-
input, err := promptString(reader, out, "Sidecar image (okdev SSH/sync helper image)", vars.SidecarImage)
167-
if err != nil {
168-
return err
169-
}
170-
if input != "" {
171-
vars.SidecarImage = input
172-
}
173-
}
174-
175164
if !overrides.hasSyncLocal() {
176165
input, err := promptString(reader, out, "Sync local path (project directory on this machine)", vars.SyncLocal)
177166
if err != nil {

internal/cli/prompt_test.go

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ func TestPromptInteractiveSkipsOverriddenFields(t *testing.T) {
7272
}
7373
applyOverrides(vars, overrides)
7474

75-
input := strings.NewReader("custom-image\n./src\n/app\nalice\n")
75+
input := strings.NewReader("./src\n/app\nalice\n")
7676
var out bytes.Buffer
7777
if err := promptInteractive(vars, overrides, input, &out, false, true); err != nil {
7878
t.Fatalf("promptInteractive returned error: %v", err)
@@ -84,9 +84,6 @@ func TestPromptInteractiveSkipsOverriddenFields(t *testing.T) {
8484
if vars.Namespace != "flag-ns" {
8585
t.Fatalf("expected overridden namespace to remain unchanged, got %q", vars.Namespace)
8686
}
87-
if vars.SidecarImage != "custom-image" {
88-
t.Fatalf("expected prompted sidecar image, got %q", vars.SidecarImage)
89-
}
9087
if vars.SyncLocal != "./src" {
9188
t.Fatalf("expected prompted sync local, got %q", vars.SyncLocal)
9289
}
@@ -103,7 +100,7 @@ func TestPromptInteractiveSkipsOverriddenFields(t *testing.T) {
103100

104101
func TestPromptInteractiveUsesExplanatoryLabels(t *testing.T) {
105102
vars := config.NewTemplateVars()
106-
input := strings.NewReader("\n\n\n\n\n\n\n")
103+
input := strings.NewReader("\n\n\n\n\n\n")
107104
var out bytes.Buffer
108105

109106
if err := promptInteractive(vars, InitOverrides{}, input, &out, false, true); err != nil {
@@ -115,7 +112,6 @@ func TestPromptInteractiveUsesExplanatoryLabels(t *testing.T) {
115112
"Environment name (used for session labels and default naming)",
116113
"Namespace (where the dev workload will run)",
117114
"Workload type (pod=simple dev pod, job=batch workload, pytorchjob=distributed training, generic=custom manifest)",
118-
"Sidecar image (okdev SSH/sync helper image)",
119115
"Sync local path (project directory on this machine)",
120116
"Sync remote path (workspace path in the container)",
121117
"SSH user (login user inside the dev container)",

internal/cli/ssh.go

Lines changed: 46 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ import (
1717
"time"
1818

1919
"github.com/acmore/okdev/internal/config"
20+
"github.com/acmore/okdev/internal/kube"
2021
"github.com/acmore/okdev/internal/logx"
2122
"github.com/acmore/okdev/internal/session"
2223
"github.com/acmore/okdev/internal/shellutil"
@@ -226,7 +227,23 @@ func newSSHCmd(opts *Options) *cobra.Command {
226227
}
227228
return nil
228229
}
229-
return runSSHShellWithReconnect(cmd.Context(), tm, cc.sessionName, cmd.ErrOrStderr())
230+
var baselineRestarts int32
231+
if ri, riErr := cc.kube.GetContainerRestartInfo(cmd.Context(), cc.namespace, target.PodName, target.Container); riErr == nil && ri != nil {
232+
baselineRestarts = ri.RestartCount
233+
}
234+
checkRestart := func() *kube.ContainerRestartInfo {
235+
ctx, cancel := context.WithTimeout(context.Background(), sessionAccessTimeout)
236+
defer cancel()
237+
ri, err := cc.kube.GetContainerRestartInfo(ctx, cc.namespace, target.PodName, target.Container)
238+
if err != nil || ri == nil {
239+
return nil
240+
}
241+
if ri.RestartCount > baselineRestarts {
242+
return ri
243+
}
244+
return nil
245+
}
246+
return runSSHShellWithReconnectAndRestartCheck(cmd.Context(), tm, cc.sessionName, cmd.ErrOrStderr(), checkRestart)
230247
})
231248
},
232249
}
@@ -285,7 +302,16 @@ type sshShellRunner interface {
285302

286303
const rapidFailureThreshold = 20
287304

305+
// containerRestartCheck is called during SSH reconnection to detect if the
306+
// dev container restarted. It returns the restart info if the restart count
307+
// increased since baseline, or nil if unchanged / unavailable.
308+
type containerRestartCheck func() *kube.ContainerRestartInfo
309+
288310
func runSSHShellWithReconnect(ctx context.Context, tm sshShellRunner, sessionName string, errOut io.Writer) error {
311+
return runSSHShellWithReconnectAndRestartCheck(ctx, tm, sessionName, errOut, nil)
312+
}
313+
314+
func runSSHShellWithReconnectAndRestartCheck(ctx context.Context, tm sshShellRunner, sessionName string, errOut io.Writer, checkRestart containerRestartCheck) error {
289315
rapidFailures := 0
290316
readinessFailures := 0
291317
var firstRapidFailure time.Time
@@ -309,6 +335,12 @@ func runSSHShellWithReconnect(ctx context.Context, tm sshShellRunner, sessionNam
309335
return nil
310336
}
311337
if !inRecovery {
338+
if checkRestart != nil {
339+
if info := checkRestart(); info != nil {
340+
printContainerRestartNotice(errOut, info)
341+
return nil
342+
}
343+
}
312344
fmt.Fprintln(errOut, "\nConnection lost. Reconnecting...")
313345
inRecovery = true
314346
recoveryStart = time.Now()
@@ -356,6 +388,19 @@ func runSSHShellWithReconnect(ctx context.Context, tm sshShellRunner, sessionNam
356388
}
357389
}
358390

391+
func printContainerRestartNotice(w io.Writer, info *kube.ContainerRestartInfo) {
392+
reason := info.LastReason
393+
if reason == "" {
394+
reason = "unknown"
395+
}
396+
fmt.Fprintf(w, "\nContainer restarted (%s).", reason)
397+
if info.CurrentWaiting != "" {
398+
fmt.Fprintf(w, " Status: %s.", info.CurrentWaiting)
399+
}
400+
fmt.Fprintln(w, " tmux sessions and running processes were lost.")
401+
fmt.Fprintln(w, "Run \"okdev ssh\" to reconnect after the container is ready.")
402+
}
403+
359404
func ensureSSHKeyOnPod(opts *Options, namespace, pod, container, keyPath string) error {
360405
if err := ensureCommand("ssh-keygen"); err != nil {
361406
return err

internal/cli/ssh_dev_test.go

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ import (
77
"strings"
88
"sync/atomic"
99
"testing"
10+
11+
"github.com/acmore/okdev/internal/kube"
1012
)
1113

1214
func TestEnsureDevSSHDRunning(t *testing.T) {
@@ -160,3 +162,115 @@ func TestRunSSHShellWithReconnectRapidFailures(t *testing.T) {
160162
t.Fatalf("expected rapid failure circuit breaker, got %v", err)
161163
}
162164
}
165+
166+
func TestRunSSHShellWithReconnectDetectsContainerRestart(t *testing.T) {
167+
t.Setenv("HOME", t.TempDir())
168+
runner := &fakeShellRunner{
169+
shellErrors: []error{errors.New("connection reset by peer")},
170+
}
171+
runner.connected.Store(true)
172+
173+
checkRestart := func() *kube.ContainerRestartInfo {
174+
return &kube.ContainerRestartInfo{
175+
RestartCount: 2,
176+
LastReason: "OOMKilled",
177+
}
178+
}
179+
180+
var errOut bytes.Buffer
181+
err := runSSHShellWithReconnectAndRestartCheck(context.Background(), runner, "test-sess", &errOut, checkRestart)
182+
if err != nil {
183+
t.Fatalf("expected nil return on container restart, got %v", err)
184+
}
185+
output := errOut.String()
186+
if !strings.Contains(output, "OOMKilled") {
187+
t.Fatalf("expected OOMKilled in output, got %q", output)
188+
}
189+
if !strings.Contains(output, "Container restarted") {
190+
t.Fatalf("expected 'Container restarted' in output, got %q", output)
191+
}
192+
}
193+
194+
func TestRunSSHShellWithReconnectNoRestartContinuesRecovery(t *testing.T) {
195+
t.Setenv("HOME", t.TempDir())
196+
runner := &fakeShellRunner{
197+
shellErrors: []error{errors.New("connection reset by peer"), nil},
198+
}
199+
runner.connected.Store(true)
200+
201+
// checkRestart returns nil — no restart detected, should continue reconnecting.
202+
checkRestart := func() *kube.ContainerRestartInfo {
203+
return nil
204+
}
205+
206+
var errOut bytes.Buffer
207+
err := runSSHShellWithReconnectAndRestartCheck(context.Background(), runner, "test-sess", &errOut, checkRestart)
208+
if err != nil {
209+
t.Fatalf("expected successful reconnect, got %v", err)
210+
}
211+
if !strings.Contains(errOut.String(), "Connection lost") {
212+
t.Fatalf("expected 'Connection lost' message, got %q", errOut.String())
213+
}
214+
}
215+
216+
func TestRunSSHShellWithReconnectDetectsContainerCrashLoop(t *testing.T) {
217+
t.Setenv("HOME", t.TempDir())
218+
runner := &fakeShellRunner{
219+
shellErrors: []error{errors.New("connection reset by peer")},
220+
}
221+
runner.connected.Store(true)
222+
223+
checkRestart := func() *kube.ContainerRestartInfo {
224+
return &kube.ContainerRestartInfo{
225+
RestartCount: 5,
226+
LastReason: "Error",
227+
LastExitCode: 137,
228+
CurrentWaiting: "CrashLoopBackOff",
229+
}
230+
}
231+
232+
var errOut bytes.Buffer
233+
err := runSSHShellWithReconnectAndRestartCheck(context.Background(), runner, "test-sess", &errOut, checkRestart)
234+
if err != nil {
235+
t.Fatalf("expected nil return on container restart, got %v", err)
236+
}
237+
output := errOut.String()
238+
if !strings.Contains(output, "CrashLoopBackOff") {
239+
t.Fatalf("expected CrashLoopBackOff in output, got %q", output)
240+
}
241+
}
242+
243+
func TestPrintContainerRestartNotice(t *testing.T) {
244+
tests := []struct {
245+
name string
246+
info kube.ContainerRestartInfo
247+
contains []string
248+
}{
249+
{
250+
name: "OOMKilled",
251+
info: kube.ContainerRestartInfo{RestartCount: 1, LastReason: "OOMKilled"},
252+
contains: []string{"Container restarted (OOMKilled)", "tmux sessions"},
253+
},
254+
{
255+
name: "CrashLoopBackOff",
256+
info: kube.ContainerRestartInfo{RestartCount: 5, LastReason: "Error", CurrentWaiting: "CrashLoopBackOff"},
257+
contains: []string{"Container restarted (Error)", "CrashLoopBackOff"},
258+
},
259+
{
260+
name: "unknown reason",
261+
info: kube.ContainerRestartInfo{RestartCount: 1},
262+
contains: []string{"Container restarted (unknown)"},
263+
},
264+
}
265+
for _, tt := range tests {
266+
t.Run(tt.name, func(t *testing.T) {
267+
var buf bytes.Buffer
268+
printContainerRestartNotice(&buf, &tt.info)
269+
for _, want := range tt.contains {
270+
if !strings.Contains(buf.String(), want) {
271+
t.Errorf("expected output to contain %q, got %q", want, buf.String())
272+
}
273+
}
274+
})
275+
}
276+
}

internal/cli/up.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -963,7 +963,7 @@ func shouldReuseExistingWorkload(ctx context.Context, k workloadExistenceChecker
963963
if !exists {
964964
return false, nil
965965
}
966-
if kind == workload.TypePod {
966+
if strings.EqualFold(kind, workload.TypePod) {
967967
summary, err := k.GetPodSummary(ctx, namespace, name)
968968
if err != nil {
969969
return false, fmt.Errorf("get pod/%s status: %w", name, err)

internal/cli/up_reconcile_test.go

Lines changed: 51 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -126,14 +126,14 @@ func TestShouldReuseExistingWorkloadReusesExistingPodAndSkipsForReconcile(t *tes
126126
}},
127127
}
128128

129-
reuse, err := shouldReuseExistingWorkload(context.Background(), k, "default", &fakeRefRuntime{kind: workload.TypePod, apiVersion: "v1", name: "okdev-sess"})
129+
reuse, err := shouldReuseExistingWorkload(context.Background(), k, "default", &fakeRefRuntime{kind: "Pod", apiVersion: "v1", name: "okdev-sess"})
130130
if err != nil {
131131
t.Fatalf("pod shouldReuseExistingWorkload: %v", err)
132132
}
133133
if !reuse {
134134
t.Fatal("expected pod runtime to reuse existing workload")
135135
}
136-
if k.apiVersion != "v1" || k.kind != workload.TypePod || k.name != "okdev-sess" {
136+
if k.apiVersion != "v1" || k.kind != "Pod" || k.name != "okdev-sess" {
137137
t.Fatalf("unexpected pod workload ref lookup: %#v", k)
138138
}
139139

@@ -154,7 +154,7 @@ func TestShouldReuseExistingWorkloadSkipsTerminatingPod(t *testing.T) {
154154
}},
155155
}
156156

157-
reuse, err := shouldReuseExistingWorkload(context.Background(), k, "default", &fakeRefRuntime{kind: workload.TypePod, apiVersion: "v1", name: "okdev-sess"})
157+
reuse, err := shouldReuseExistingWorkload(context.Background(), k, "default", &fakeRefRuntime{kind: "Pod", apiVersion: "v1", name: "okdev-sess"})
158158
if err != nil {
159159
t.Fatalf("pod shouldReuseExistingWorkload: %v", err)
160160
}
@@ -171,7 +171,7 @@ func TestShouldReuseExistingWorkloadSkipsTerminalPod(t *testing.T) {
171171
}},
172172
}
173173

174-
reuse, err := shouldReuseExistingWorkload(context.Background(), k, "default", &fakeRefRuntime{kind: workload.TypePod, apiVersion: "v1", name: "okdev-sess"})
174+
reuse, err := shouldReuseExistingWorkload(context.Background(), k, "default", &fakeRefRuntime{kind: "Pod", apiVersion: "v1", name: "okdev-sess"})
175175
if err != nil {
176176
t.Fatalf("pod shouldReuseExistingWorkload: %v", err)
177177
}
@@ -180,6 +180,53 @@ func TestShouldReuseExistingWorkloadSkipsTerminalPod(t *testing.T) {
180180
}
181181
}
182182

183+
// Regression: PodRuntime.WorkloadRef() returns API kind "Pod" (capital P),
184+
// while workload.TypePod is "pod" (lowercase). The reuse check must handle
185+
// both casings so that terminating/terminal pod checks are not skipped.
186+
func TestShouldReuseExistingWorkloadSkipsTerminatingPodAPIKind(t *testing.T) {
187+
for _, kind := range []string{"Pod", "pod", "POD"} {
188+
t.Run(kind, func(t *testing.T) {
189+
k := &fakeWorkloadExistenceChecker{
190+
exists: true,
191+
podsSeq: [][]kube.PodSummary{{
192+
{Name: "okdev-sess", Phase: string(corev1.PodRunning), Deleting: true},
193+
}},
194+
}
195+
reuse, err := shouldReuseExistingWorkload(context.Background(), k, "default",
196+
&fakeRefRuntime{kind: kind, apiVersion: "v1", name: "okdev-sess"})
197+
if err != nil {
198+
t.Fatalf("shouldReuseExistingWorkload(%q): %v", kind, err)
199+
}
200+
if reuse {
201+
t.Fatalf("expected terminating pod (kind=%q) to skip reuse", kind)
202+
}
203+
})
204+
}
205+
}
206+
207+
func TestShouldReuseExistingWorkloadSkipsTerminalPodAPIKind(t *testing.T) {
208+
for _, kind := range []string{"Pod", "pod"} {
209+
for _, phase := range []corev1.PodPhase{corev1.PodSucceeded, corev1.PodFailed} {
210+
t.Run(kind+"/"+string(phase), func(t *testing.T) {
211+
k := &fakeWorkloadExistenceChecker{
212+
exists: true,
213+
podsSeq: [][]kube.PodSummary{{
214+
{Name: "okdev-sess", Phase: string(phase)},
215+
}},
216+
}
217+
reuse, err := shouldReuseExistingWorkload(context.Background(), k, "default",
218+
&fakeRefRuntime{kind: kind, apiVersion: "v1", name: "okdev-sess"})
219+
if err != nil {
220+
t.Fatalf("shouldReuseExistingWorkload(%q, %s): %v", kind, phase, err)
221+
}
222+
if reuse {
223+
t.Fatalf("expected terminal pod (kind=%q, phase=%s) to skip reuse", kind, phase)
224+
}
225+
})
226+
}
227+
}
228+
}
229+
183230
func TestShouldReuseExistingWorkloadSurfacesLookupErrors(t *testing.T) {
184231
want := errors.New("boom")
185232
k := &fakeWorkloadExistenceChecker{err: want}

internal/config/template.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@ func NewTemplateVars() *TemplateVars {
102102
return &TemplateVars{
103103
Name: "",
104104
Namespace: "default",
105-
SidecarImage: DefaultSidecarImage,
105+
SidecarImage: "",
106106
SyncthingVersion: DefaultSyncthingVersion,
107107
SyncLocal: ".",
108108
SyncRemote: "/workspace",

internal/config/templates/basic.yaml.tmpl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,9 @@ spec:
3737
keepAliveIntervalSeconds: 30
3838
keepAliveTimeoutSeconds: 90
3939
sidecar:
40+
{{- if .SidecarImage }}
4041
image: {{ .SidecarImage }}
42+
{{- end }}
4143
{{- if and (eq .WorkloadType "pod") .DevImage }}
4244
podTemplate:
4345
spec:

0 commit comments

Comments
 (0)