Skip to content

Commit 78833ec

Browse files
committed
fix(container): MITM-compatible phantom tokens, entrypoint env sourcing, reload retry
- Use the SLUICE_PHANTOM:<credname> format for env var injection instead of the random hex produced by GeneratePhantomToken(). The MITM proxy only recognizes the SLUICE_PHANTOM: prefix for byte-level replacement in HTTP headers and body. Previously the env phantom and the MITM phantom used different formats, so the credential swap never worked for env-injected keys (e.g. GEMINI_API_KEY).
- Add an entrypoint wrapper in compose that sources ~/.openclaw/.env before starting openclaw, so child processes (gemini --acp) inherit phantom tokens in their process environment.
- Add a ReloadSecrets method to the ContainerManager interface. All backends (Docker, Apple, Tart) implement it using a WebSocket RPC script that sends secrets.reload directly to the gateway. This bypasses the openclaw CLI, which is slow in container environments.
- Add a phase 2 retry loop for the secrets reload after env injection. The gateway takes longer to start than the container, so the reload retries with backoff (5, 10, 20, 30, 60s) after the env file is written.
- Add NODE_COMPILE_CACHE and NPM_CONFIG_PREFIX env vars (plus the matching PATH entry for the npm global bin directory) in compose for a persistent Node.js compile cache and persistent npm global installs.
1 parent 7aea850 commit 78833ec

9 files changed

Lines changed: 76 additions & 10 deletions

File tree

cmd/sluice/main.go

Lines changed: 31 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -345,7 +345,10 @@ func main() {
345345
// (compose healthcheck ordering ensures sluice starts first).
346346
if containerMgr != nil && db != nil {
347347
go func() {
348+
// Phase 1: write .env file into the agent container.
349+
// Retry with backoff because the container may still be starting.
348350
backoff := []time.Duration{0, 2 * time.Second, 5 * time.Second, 10 * time.Second, 30 * time.Second}
351+
injected := false
349352
for i, delay := range backoff {
350353
if delay > 0 {
351354
time.Sleep(delay)
@@ -358,6 +361,30 @@ func main() {
358361
log.Printf("WARNING: startup env injection failed after %d attempts: %v", len(backoff), err)
359362
} else {
360363
log.Printf("startup env injection succeeded (attempt %d/%d)", i+1, len(backoff))
364+
injected = true
365+
break
366+
}
367+
}
368+
if !injected {
369+
return
370+
}
371+
// Phase 2: signal the agent to reload secrets.
372+
// The gateway takes longer to start than the container itself,
373+
// so retry the reload with a longer backoff.
374+
reloadBackoff := []time.Duration{5 * time.Second, 10 * time.Second, 20 * time.Second, 30 * time.Second, 60 * time.Second}
375+
for i, delay := range reloadBackoff {
376+
time.Sleep(delay)
377+
ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
378+
if err := containerMgr.ReloadSecrets(ctx); err != nil {
379+
cancel()
380+
if i < len(reloadBackoff)-1 {
381+
log.Printf("startup secrets reload attempt %d/%d failed: %v (retrying)", i+1, len(reloadBackoff), err)
382+
continue
383+
}
384+
log.Printf("WARNING: startup secrets reload failed after %d attempts: %v", len(reloadBackoff), err)
385+
} else {
386+
cancel()
387+
log.Printf("startup secrets reload succeeded (attempt %d/%d)", i+1, len(reloadBackoff))
361388
return
362389
}
363390
}
@@ -763,9 +790,10 @@ func injectEnvVarsFromStore(db *store.Store, mgr container.ContainerManager) err
763790
}
764791
envMap := make(map[string]string, len(envBindings))
765792
for _, b := range envBindings {
766-
// For OAuth credentials, the env_var maps to the access token phantom.
767-
// The credential name is used to generate a format-matching phantom.
768-
envMap[b.EnvVar] = vault.GeneratePhantomToken(b.Credential)
793+
// Use the MITM-compatible phantom format (SLUICE_PHANTOM:<credname>)
794+
// so the proxy's byte-level find-and-replace works when the agent
795+
// passes the env var value in HTTP headers or request body.
796+
envMap[b.EnvVar] = proxy.PhantomToken(b.Credential)
769797
}
770798

771799
ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)

cmd/sluice/main_test.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1469,6 +1469,10 @@ func (m *mockContainerMgr) Stop(_ context.Context) error {
14691469
return nil
14701470
}
14711471

1472+
func (m *mockContainerMgr) ReloadSecrets(_ context.Context) error {
1473+
return nil
1474+
}
1475+
14721476
func (m *mockContainerMgr) Runtime() container.Runtime {
14731477
return container.RuntimeDocker
14741478
}

compose.yml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,12 +67,21 @@ services:
6767
restart: unless-stopped
6868
container_name: openclaw
6969
network_mode: "service:tun2proxy"
70+
# Source sluice-injected env vars before starting openclaw so all child
71+
# processes (e.g. gemini --acp) inherit phantom tokens in their env.
72+
entrypoint: ["/bin/sh", "-c", "[ -f \"$HOME/.openclaw/.env\" ] && set -a && . \"$HOME/.openclaw/.env\" && set +a; exec docker-entrypoint.sh node openclaw.mjs gateway --allow-unconfigured"]
7073
environment:
7174
- HOME=/home/node
7275
- OPENCLAW_GATEWAY_TOKEN=${OPENCLAW_GATEWAY_TOKEN:-}
7376
- SSL_CERT_FILE=/usr/local/share/ca-certificates/sluice/sluice-ca.crt
7477
- REQUESTS_CA_BUNDLE=/usr/local/share/ca-certificates/sluice/sluice-ca.crt
7578
- NODE_EXTRA_CA_CERTS=/usr/local/share/ca-certificates/sluice/sluice-ca.crt
79+
# npm user-global directory for persistent CLI tool installs (e.g. gemini-cli).
80+
# The openclaw-home volume is mounted at /home/node so these survive restarts.
81+
- NPM_CONFIG_PREFIX=/home/node/.npm-global
82+
- PATH=/home/node/.npm-global/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
83+
# Persistent V8 compile cache to speed up Node.js CLI cold starts.
84+
- NODE_COMPILE_CACHE=/home/node/.node-compile-cache
7685
volumes:
7786
- openclaw-home:/home/node
7887
- sluice-ca:/usr/local/share/ca-certificates/sluice:ro

internal/api/server_test.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2631,6 +2631,10 @@ func (m *mockContainerMgr) Stop(_ context.Context) error {
26312631
return nil
26322632
}
26332633

2634+
func (m *mockContainerMgr) ReloadSecrets(_ context.Context) error {
2635+
return nil
2636+
}
2637+
26342638
func (m *mockContainerMgr) Runtime() container.Runtime {
26352639
return container.RuntimeDocker
26362640
}

internal/container/apple.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -230,13 +230,19 @@ func (m *AppleManager) InjectEnvVars(ctx context.Context, envMap map[string]stri
230230
return fmt.Errorf("inject env vars: %w", execErr)
231231
}
232232

233-
if _, reloadErr := m.cli.Exec(ctx, m.containerName, []string{"openclaw", "secrets", "reload"}); reloadErr != nil {
233+
if reloadErr := m.ReloadSecrets(ctx); reloadErr != nil {
234234
log.Printf("env vars injected but secrets reload failed: %v", reloadErr)
235235
}
236236

237237
return nil
238238
}
239239

240+
// ReloadSecrets signals the openclaw gateway to re-read secrets via WebSocket RPC.
241+
func (m *AppleManager) ReloadSecrets(ctx context.Context) error {
242+
_, err := m.cli.Exec(ctx, m.containerName, []string{"node", "-e", reloadSecretsScript})
243+
return err
244+
}
245+
240246
// RestartWithEnv stops the VM, removes it, and recreates it with updated
241247
// environment variables merged into the existing config.
242248
func (m *AppleManager) RestartWithEnv(ctx context.Context, envUpdates map[string]string) error {

internal/container/docker.go

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -92,17 +92,19 @@ func (m *DockerManager) InjectEnvVars(ctx context.Context, envMap map[string]str
9292
}
9393

9494
// Signal the agent to reload secrets from the updated env file.
95-
// The openclaw CLI hangs in container environments (confirmed bug in
96-
// 2026.4.5), so we send the secrets.reload RPC directly via WebSocket
97-
// using node which is available in the openclaw container.
98-
if reloadErr := m.client.ExecInContainer(ctx, m.containerName,
99-
[]string{"node", "-e", reloadSecretsScript}); reloadErr != nil {
95+
if reloadErr := m.ReloadSecrets(ctx); reloadErr != nil {
10096
log.Printf("env vars injected but secrets reload failed: %v", reloadErr)
10197
}
10298

10399
return nil
104100
}
105101

102+
// ReloadSecrets signals the openclaw gateway to re-read secrets via WebSocket RPC.
103+
func (m *DockerManager) ReloadSecrets(ctx context.Context) error {
104+
return m.client.ExecInContainer(ctx, m.containerName,
105+
[]string{"node", "-e", reloadSecretsScript})
106+
}
107+
106108
// reloadSecretsScript is a Node.js one-liner that sends a secrets.reload
107109
// RPC to the openclaw gateway via WebSocket. It reads the gateway config
108110
// from disk to discover the port and auth token. This bypasses the openclaw

internal/container/tart.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -270,13 +270,19 @@ func (m *TartManager) InjectEnvVars(ctx context.Context, envMap map[string]strin
270270
return fmt.Errorf("inject env vars: %w", execErr)
271271
}
272272

273-
if _, reloadErr := m.cli.Exec(ctx, m.vmName, []string{"openclaw", "secrets", "reload"}); reloadErr != nil {
273+
if reloadErr := m.ReloadSecrets(ctx); reloadErr != nil {
274274
log.Printf("env vars injected but secrets reload failed: %v", reloadErr)
275275
}
276276

277277
return nil
278278
}
279279

280+
// ReloadSecrets signals the openclaw gateway to re-read secrets via WebSocket RPC.
281+
func (m *TartManager) ReloadSecrets(ctx context.Context) error {
282+
_, err := m.cli.Exec(ctx, m.vmName, []string{"node", "-e", reloadSecretsScript})
283+
return err
284+
}
285+
280286
// RestartWithEnv stops the VM and re-runs it in the background. Unlike Apple
281287
// Container, tart VMs persist state across stop/run cycles so we do NOT
282288
// delete+clone (which takes minutes for macOS images). Environment variables

internal/container/types.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,9 @@ type ContainerManager interface { //nolint:revive // stuttering accepted for cla
6868
// the cert is still available via env vars (SSL_CERT_FILE, etc.).
6969
InjectCACert(ctx context.Context, hostCertPath, certDir string) error
7070

71+
// ReloadSecrets signals the agent to re-read secrets from the env file.
72+
ReloadSecrets(ctx context.Context) error
73+
7174
// Status returns container health information.
7275
Status(ctx context.Context) (ContainerStatus, error)
7376

internal/telegram/approval_test.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1743,6 +1743,10 @@ func (m *mockContainerMgr) InjectCACert(_ context.Context, _, _ string) error {
17431743
return nil
17441744
}
17451745

1746+
func (m *mockContainerMgr) ReloadSecrets(_ context.Context) error {
1747+
return nil
1748+
}
1749+
17461750
func (m *mockContainerMgr) Runtime() container.Runtime {
17471751
return container.RuntimeDocker
17481752
}

0 commit comments

Comments
 (0)