Skip to content

Commit 7aea850

Browse files
committed
fix(container): WebSocket reload, startup retry, loopback bypass
- Replace `openclaw secrets reload` with a direct WebSocket RPC via node. The openclaw CLI hangs in container environments (confirmed bug in openclaw 2026.4.5 where any subcommand hangs). The node script reads the gateway config from disk, opens a WebSocket to 127.0.0.1, and sends the secrets.reload RPC directly.
- Add retry with backoff for startup env injection. Sluice starts before openclaw (compose healthcheck ordering), so the first few docker exec attempts fail with "container not running". Up to five attempts are made, preceded by delays of 0, 2, 5, 10, and 30 seconds.
- Add --bypass 127.0.0.0/8 and ::1/128 to tun2proxy in all compose files so localhost traffic inside the openclaw container stays on loopback and doesn't get routed through the SOCKS5 proxy.
1 parent 7feb709 commit 7aea850

6 files changed

Lines changed: 58 additions & 16 deletions

File tree

cmd/sluice/main.go

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -341,10 +341,27 @@ func main() {
341341
// Inject phantom env vars into the agent container at startup.
342342
// Bindings with env_var set produce env var entries (e.g. OPENAI_API_KEY=phantom-xxx)
343343
// that are written into the agent's .env file via docker exec.
344+
// Retry with backoff because the agent container may still be starting
345+
// (compose healthcheck ordering ensures sluice starts first).
344346
if containerMgr != nil && db != nil {
345-
if err := injectEnvVarsFromStore(db, containerMgr); err != nil {
346-
log.Printf("WARNING: startup env injection failed: %v", err)
347-
}
347+
go func() {
348+
backoff := []time.Duration{0, 2 * time.Second, 5 * time.Second, 10 * time.Second, 30 * time.Second}
349+
for i, delay := range backoff {
350+
if delay > 0 {
351+
time.Sleep(delay)
352+
}
353+
if err := injectEnvVarsFromStore(db, containerMgr); err != nil {
354+
if i < len(backoff)-1 {
355+
log.Printf("startup env injection attempt %d/%d failed: %v (retrying)", i+1, len(backoff), err)
356+
continue
357+
}
358+
log.Printf("WARNING: startup env injection failed after %d attempts: %v", len(backoff), err)
359+
} else {
360+
log.Printf("startup env injection succeeded (attempt %d/%d)", i+1, len(backoff))
361+
return
362+
}
363+
}
364+
}()
348365
}
349366

350367
// Configure the OAuth refresh callback so that after a token refresh

compose.dev.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ services:
3232
cap_add: [NET_ADMIN]
3333
devices:
3434
- /dev/net/tun:/dev/net/tun
35-
command: ["--proxy", "socks5://sluice:1080"]
35+
command: ["--proxy", "socks5://sluice:1080", "--bypass", "127.0.0.0/8", "--bypass", "::1/128"]
3636
ports:
3737
- "${OPENCLAW_GATEWAY_PORT:-18789}:18789"
3838
healthcheck:

compose.e2e.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ services:
2828
cap_add: [NET_ADMIN]
2929
devices:
3030
- /dev/net/tun:/dev/net/tun
31-
command: ["--proxy", "socks5://sluice:1080"]
31+
command: ["--proxy", "socks5://sluice:1080", "--bypass", "127.0.0.0/8", "--bypass", "::1/128"]
3232
healthcheck:
3333
test: ["CMD-SHELL", "ip link show tun0 || exit 1"]
3434
interval: 5s

compose.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ services:
4646
cap_add: [NET_ADMIN]
4747
devices:
4848
- /dev/net/tun:/dev/net/tun
49-
command: ["--proxy", "socks5://sluice:1080"]
49+
command: ["--proxy", "socks5://sluice:1080", "--bypass", "127.0.0.0/8", "--bypass", "::1/128"]
5050
# OpenClaw uses network_mode: "service:tun2proxy", so its ports are
5151
# exposed here. Gateway (18789) serves the web UI and API.
5252
ports:

internal/container/docker.go

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,14 +92,44 @@ func (m *DockerManager) InjectEnvVars(ctx context.Context, envMap map[string]str
9292
}
9393

9494
// Signal the agent to reload secrets from the updated env file.
95+
// The openclaw CLI hangs in container environments (confirmed bug in
96+
// 2026.4.5), so we send the secrets.reload RPC directly via WebSocket
97+
// using node which is available in the openclaw container.
9598
if reloadErr := m.client.ExecInContainer(ctx, m.containerName,
96-
[]string{"openclaw", "secrets", "reload"}); reloadErr != nil {
99+
[]string{"node", "-e", reloadSecretsScript}); reloadErr != nil {
97100
log.Printf("env vars injected but secrets reload failed: %v", reloadErr)
98101
}
99102

100103
return nil
101104
}
102105

106+
// reloadSecretsScript is a Node.js script (passed to `node -e`) that sends a
// secrets.reload RPC to the openclaw gateway over a hand-rolled WebSocket
// connection to 127.0.0.1. It bypasses the openclaw CLI, which hangs in
// container/non-TTY environments.
//
// Behavior of the script:
//   - Reads $HOME/.openclaw/openclaw.json for gateway.port (default 18789)
//     and gateway.auth.token (default ""). Errors reading or parsing the
//     config are ignored and the defaults are used.
//   - Issues an HTTP Upgrade request with a random Sec-WebSocket-Key and a
//     Bearer Authorization header.
//   - On upgrade, writes a single masked text frame (FIN+text opcode 0x81,
//     mask bit 0x80) carrying {type:"req",id,method:"secrets.reload"}. The
//     16-bit extended length form is used for payloads of 126 bytes or more.
//   - Exits 0 on the first data received from the socket, or after 5 seconds
//     without any data once the frame has been sent.
//   - Exits 1 on a request error, on a 5 second request timeout, or when the
//     gateway answers with a regular HTTP response instead of upgrading
//     (for example a rejected token). Without the "response" handler Node
//     discards such a response and the process exits 0, hiding the failure
//     from the caller.
const reloadSecretsScript = `const fs=require("fs"),http=require("http"),crypto=require("crypto");` +
	`let port=18789,token="";` +
	`try{const c=JSON.parse(fs.readFileSync(process.env.HOME+"/.openclaw/openclaw.json","utf8"));` +
	`port=c.gateway?.port||18789;token=c.gateway?.auth?.token||"";}catch(e){}` +
	`const key=crypto.randomBytes(16).toString("base64");` +
	`const req=http.request({hostname:"127.0.0.1",port,path:"/",headers:{` +
	`"Upgrade":"websocket","Connection":"Upgrade",` +
	`"Sec-WebSocket-Key":key,"Sec-WebSocket-Version":"13",` +
	`"Authorization":"Bearer "+token}});` +
	`req.on("upgrade",(res,socket)=>{` +
	`const id=crypto.randomUUID();` +
	`const msg=JSON.stringify({type:"req",id,method:"secrets.reload"});` +
	`const p=Buffer.from(msg),mask=crypto.randomBytes(4);` +
	`let h;if(p.length<126){h=Buffer.alloc(2);h[0]=0x81;h[1]=0x80|p.length;}` +
	`else{h=Buffer.alloc(4);h[0]=0x81;h[1]=0x80|126;h.writeUInt16BE(p.length,2);}` +
	`const m=Buffer.alloc(p.length);for(let i=0;i<p.length;i++)m[i]=p[i]^mask[i%4];` +
	`socket.write(Buffer.concat([h,mask,m]));` +
	`socket.on("data",()=>{console.log("secrets reloaded");process.exit(0);});` +
	`setTimeout(()=>process.exit(0),5000);});` +
	// A non-101 reply fires "response" rather than "upgrade"; treat it as a failure.
	`req.on("response",res=>{console.error("gateway refused WebSocket upgrade: HTTP "+res.statusCode);process.exit(1);});` +
	`req.on("error",e=>{console.error(e.message);process.exit(1);});` +
	`req.setTimeout(5000,()=>{req.destroy();process.exit(1);});` +
	`req.end();`
132+
103133
// RestartWithEnv recreates the container with updated environment variables.
104134
// It inspects the current container config, stops and removes it, creates a
105135
// new container with the same config plus updated env vars, and starts it.

internal/container/docker_test.go

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -514,16 +514,11 @@ func TestInjectEnvVarsWritesEnvFile(t *testing.T) {
514514
t.Errorf("script should reference .openclaw/.env, got %s", script)
515515
}
516516

517-
// Second call: secrets reload.
518-
wantReload := []string{"openclaw", "secrets", "reload"}
517+
// Second call: secrets reload via node WebSocket script.
519518
if len(mc.execCalls[1]) != 3 {
520-
t.Errorf("reload cmd = %v, want %v", mc.execCalls[1], wantReload)
521-
} else {
522-
for i, w := range wantReload {
523-
if mc.execCalls[1][i] != w {
524-
t.Errorf("reload cmd[%d] = %q, want %q", i, mc.execCalls[1][i], w)
525-
}
526-
}
519+
t.Errorf("reload cmd len = %d, want 3", len(mc.execCalls[1]))
520+
} else if mc.execCalls[1][0] != "node" || mc.execCalls[1][1] != "-e" {
521+
t.Errorf("reload cmd = %v, want [node -e <script>]", mc.execCalls[1][:2])
527522
}
528523
}
529524

0 commit comments

Comments
 (0)