Skip to content

Commit 5f9b98c

Browse files
committed
fix(truenas): wait for systemd boot completion in Ready
Add a deterministic readiness probe to TrueNAS Ready() that runs `systemctl is-system-running --wait` inside the container, then `id`, in a single SSH session. This blocks until systemd reports a stable boot state instead of relying on client-side polling. Background: Ready() previously declared success once SSH accepted the connection and `ssh ... true` (TestAuth) returned 0. Freshly-booted clones could still have unit activations in flight — agent reports showed the first exec returning exit 0 with empty stdout, with retries working seconds later. Live testing measured ~90-103s from create_sandbox to first successful exec, with the first exec attempt timing out and the retry returning instantly. The systemd probe absorbs that wait inside Ready(). `|| true` swallows the non-zero exit returned for "degraded" (non-essential units that never start are normal); `id` then verifies that the user session is live and that a real command returns non-empty stdout. Fixes #15
1 parent 0e7ab2d commit 5f9b98c

3 files changed

Lines changed: 109 additions & 2 deletions

File tree

sandbox/truenas/backend_test.go

Lines changed: 74 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -672,7 +672,7 @@ func TestWriteFile(t *testing.T) {
672672

673673
func TestReady(t *testing.T) {
674674
t.Run("happy path: RUNNING with IP, auth ok", func(t *testing.T) {
675-
mssh := &mockSSH{}
675+
mssh := &mockSSH{outputFn: probeOK()}
676676
tn, _ := NewForTest(&Client{
677677
Virt: &tnapi.MockVirtService{
678678
GetInstanceFunc: runningInstanceFunc("10.0.0.5"),
@@ -692,7 +692,7 @@ func TestReady(t *testing.T) {
692692
})
693693

694694
t.Run("polls until RUNNING with IP", func(t *testing.T) {
695-
mssh := &mockSSH{}
695+
mssh := &mockSSH{outputFn: probeOK()}
696696
var calls int
697697
tn, _ := NewForTest(&Client{
698698
Virt: &tnapi.MockVirtService{
@@ -728,6 +728,7 @@ func TestReady(t *testing.T) {
728728
testAuthFn: func(ctx context.Context, cc ssh.ConnConfig) error {
729729
return errors.New("permission denied")
730730
},
731+
outputFn: probeOK(),
731732
}
732733

733734
dir := t.TempDir()
@@ -788,6 +789,77 @@ func TestReady(t *testing.T) {
788789
}
789790
})
790791

792+
t.Run("readiness probe runs systemctl + id in one ssh call", func(t *testing.T) {
793+
mssh := &mockSSH{outputFn: probeOK()}
794+
tn, _ := NewForTest(&Client{
795+
Virt: &tnapi.MockVirtService{
796+
GetInstanceFunc: runningInstanceFunc("10.0.0.5"),
797+
},
798+
}, mssh, testCfg())
799+
800+
if err := tn.Ready(context.Background(), "test", 5*time.Second); err != nil {
801+
t.Fatalf("unexpected error: %v", err)
802+
}
803+
if len(mssh.outputCalls) != 1 {
804+
t.Fatalf("expected exactly one OutputQuiet call (single readiness probe); got %d", len(mssh.outputCalls))
805+
}
806+
probe := mssh.outputCalls[0]
807+
if probe.User != "pixel" {
808+
t.Errorf("probe user = %q, want pixel", probe.User)
809+
}
810+
if probe.Host != "px-test" {
811+
t.Errorf("probe host = %q, want px-test", probe.Host)
812+
}
813+
if len(probe.Cmd) != 1 || !strings.Contains(probe.Cmd[0], "systemctl is-system-running --wait") {
814+
t.Errorf("probe cmd = %v, want systemctl is-system-running --wait + id", probe.Cmd)
815+
}
816+
if !strings.Contains(probe.Cmd[0], "; id") {
817+
t.Errorf("probe cmd = %v, missing trailing `id`", probe.Cmd)
818+
}
819+
})
820+
821+
t.Run("readiness probe empty output returns error", func(t *testing.T) {
822+
mssh := &mockSSH{
823+
outputFn: func(ctx context.Context, cc ssh.ConnConfig, cmd []string) ([]byte, error) {
824+
return []byte(""), nil
825+
},
826+
}
827+
tn, _ := NewForTest(&Client{
828+
Virt: &tnapi.MockVirtService{
829+
GetInstanceFunc: runningInstanceFunc("10.0.0.5"),
830+
},
831+
}, mssh, testCfg())
832+
833+
err := tn.Ready(context.Background(), "test", 5*time.Second)
834+
if err == nil {
835+
t.Fatal("expected error when readiness probe returns empty output")
836+
}
837+
if !strings.Contains(err.Error(), "empty output") {
838+
t.Errorf("error %q should mention empty output", err.Error())
839+
}
840+
})
841+
842+
t.Run("readiness probe surfaces ssh error", func(t *testing.T) {
843+
mssh := &mockSSH{
844+
outputFn: func(ctx context.Context, cc ssh.ConnConfig, cmd []string) ([]byte, error) {
845+
return nil, errors.New("connection reset")
846+
},
847+
}
848+
tn, _ := NewForTest(&Client{
849+
Virt: &tnapi.MockVirtService{
850+
GetInstanceFunc: runningInstanceFunc("10.0.0.5"),
851+
},
852+
}, mssh, testCfg())
853+
854+
err := tn.Ready(context.Background(), "test", 5*time.Second)
855+
if err == nil {
856+
t.Fatal("expected ssh error to propagate")
857+
}
858+
if !strings.Contains(err.Error(), "connection reset") {
859+
t.Errorf("error %q should wrap the ssh error", err.Error())
860+
}
861+
})
862+
791863
t.Run("instance never appears: deadline exceeded", func(t *testing.T) {
792864
mssh := &mockSSH{}
793865
tn, _ := NewForTest(&Client{

sandbox/truenas/exec.go

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package truenas
22

33
import (
4+
"bytes"
45
"context"
56
"errors"
67
"fmt"
@@ -128,6 +129,32 @@ func (t *TrueNAS) Ready(ctx context.Context, name string, timeout time.Duration)
128129
return fmt.Errorf("SSH key auth failed; writing key: %w", writeErr)
129130
}
130131
}
132+
133+
// Wait for systemd to reach a stable boot state. TestAuth confirms sshd
134+
// accepts the connection, but freshly-booted clones can still have unit
135+
// activations in flight — first exec has been seen returning exit 0 with
136+
// empty stdout, recovering on retry. `systemctl is-system-running --wait`
137+
// blocks inside the container until boot stabilises ("running" or
138+
// "degraded"), giving us a deterministic readiness signal instead of
139+
// client-side polling. `|| true` swallows the non-zero exit returned for
140+
// "degraded" (non-essential units that never start are normal); `id` runs
141+
// in the same SSH session afterward so we can verify the user's session
142+
// is live and non-empty stdout returns from at least one real command.
143+
remaining = time.Until(deadline)
144+
if remaining <= 0 {
145+
return fmt.Errorf("no time left for readiness probe on %s", name)
146+
}
147+
probeCtx, probeCancel := context.WithTimeout(ctx, remaining)
148+
defer probeCancel()
149+
out, err := t.ssh.OutputQuiet(probeCtx, cc, []string{
150+
"systemctl is-system-running --wait >/dev/null 2>&1 || true; id",
151+
})
152+
if err != nil {
153+
return fmt.Errorf("readiness probe on %s: %w", name, err)
154+
}
155+
if len(bytes.TrimSpace(out)) == 0 {
156+
return fmt.Errorf("readiness probe on %s returned empty output", name)
157+
}
131158
return nil
132159
}
133160

sandbox/truenas/network_test.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,14 @@ func (m *mockSSH) OutputQuiet(ctx context.Context, cc ssh.ConnConfig, cmd []stri
5858
return nil, nil
5959
}
6060

61+
// probeOK returns an outputFn that satisfies Ready()'s smoke probe by
62+
// returning a non-empty `id`-style line for any call.
63+
func probeOK() func(ctx context.Context, cc ssh.ConnConfig, cmd []string) ([]byte, error) {
64+
return func(ctx context.Context, cc ssh.ConnConfig, cmd []string) ([]byte, error) {
65+
return []byte("uid=1000(pixel) gid=1000(pixel)\n"), nil
66+
}
67+
}
68+
6169
func (m *mockSSH) WaitReady(ctx context.Context, host string, timeout time.Duration, log io.Writer) error {
6270
m.waitCalls = append(m.waitCalls, host)
6371
if m.waitFn != nil {

0 commit comments

Comments
 (0)