nnemirovsky
diff --git a/‎internal/container/agent_profile_test.go‎
Lines changed: 25 additions & 0 deletions b/‎internal/container/agent_profile_test.go‎
Lines changed: 25 additions & 0 deletions
diff --git a/‎internal/container/types.go‎
Lines changed: 67 additions & 28 deletions b/‎internal/container/types.go‎
Lines changed: 67 additions & 28 deletions
diff --git a/‎internal/proxy/addon.go‎
Lines changed: 152 additions & 5 deletions b/‎internal/proxy/addon.go‎
Lines changed: 152 additions & 5 deletions
@@ -242,6 +242,31 @@ func TestBuildEnvInjectionScript_QuotesValuesForSourcing(t *testing.T) {
 	}
 }
 
+func TestBuildEnvInjectionScript_ChownsEnvFileToDirOwner(t *testing.T) {
+	// Sluice's docker exec runs as the image's USER (root for the
+	// upstream openclaw and hermes images), so the awk rename and
+	// heredoc append leave the file root-owned. The agent runtime
+	// runs as a non-root user, so without a chown back to the dir
+	// owner the agent cannot run `hermes claw migrate` or in-agent
+	// secret edits. Verify the script emits a chown step.
+	for _, hasValues := range []bool{true, false} {
+		envMap := map[string]string{}
+		if hasValues {
+			envMap["KEY"] = "value"
+		}
+		script, err := BuildEnvInjectionScript(envMap, false, true)
+		if err != nil {
+			t.Fatalf("hasValues=%v: build script: %v", hasValues, err)
+		}
+		if !strings.Contains(script, `stat -c '%u:%g'`) {
+			t.Errorf("hasValues=%v: script must stat the parent dir to derive owner: %s", hasValues, script)
+		}
+		if !strings.Contains(script, `chown "$DIR_OWNER" "$ENV_FILE"`) {
+			t.Errorf("hasValues=%v: script must chown the env file to dir owner: %s", hasValues, script)
+		}
+	}
+}
+
 func TestBuildEnvInjectionScript_RejectsNewlineInValue(t *testing.T) {
 	// A newline in the value would split the env-file entry across two
 	// lines. The second line would either be silently lost or interpreted
 
@@ -295,37 +295,76 @@ func BuildEnvInjectionScriptForProfile(profile *AgentProfile, envMap map[string]
 			break
 		}
 	}
-	if !hasContent {
-		return script.String(), nil
-	}
 
-	// Append the new block via a quoted-tag heredoc. The single quotes
-	// around the tag (`<<'TAG'`) tell the shell to perform NO expansion
-	// on the body — every byte we write between the heredoc start and
-	// the end tag lands in the file verbatim. That lets us emit
-	// `KEY='value'` lines whose contents are safe under both shell
-	// `source` and dotenv parsing.
-	script.WriteString(fmt.Sprintf(" && cat >> \"$ENV_FILE\" <<'%s'\n", envHeredocTag))
-	script.WriteString(EnvBlockBegin)
-	script.WriteString("\n")
-	for _, k := range keys {
-		v := envMap[k]
-		if v == "" {
-			continue
+	// Match the env file's ownership to its parent directory. The
+	// docker exec runs the script as the image's USER (typically
+	// root for agent containers), so both the awk rename above and
+	// the heredoc `cat >>` leave the file root-owned. The agent
+	// runtime runs as a non-root user (UID 10000 for the upstream
+	// hermes image, configurable via HERMES_UID); a root-owned env
+	// file blocks agent-side write paths like `hermes claw migrate`
+	// or in-agent secret edits with EACCES. Inheriting the parent
+	// directory's owner is portable across agents because each
+	// agent's entrypoint chowns its data dir to its runtime user
+	// before sluice ever exec's in. The chown is best-effort: a
+	// read-only env file still satisfies sluice's own usage, so a
+	// stat or chown failure does not abort the script.
+	//
+	// Stat flag portability: GNU stat (Linux containers like
+	// alpine, debian) uses `-c`, BSD stat (macOS guests booted via
+	// the tart backend) uses `-f`. The two `stat` calls are run as
+	// fallbacks under a single subshell so whichever userland is
+	// in the agent's container resolves the directory owner.
+	const chownStep = ` && { DIR_OWNER=$(stat -c '%u:%g' "$(dirname "$ENV_FILE")" 2>/dev/null` +
+		` || stat -f '%u:%g' "$(dirname "$ENV_FILE")" 2>/dev/null);` +
+		` [ -n "$DIR_OWNER" ] && chown "$DIR_OWNER" "$ENV_FILE" 2>/dev/null;` +
+		` true; }`
+
+	if hasContent {
+		// Append the new block via a quoted-tag heredoc. The single
+		// quotes around the tag (`<<'TAG'`) tell the shell to perform
+		// NO expansion on the body. Every byte between the heredoc
+		// start and the end tag lands in the file verbatim, so we
+		// can emit KEY='value' lines whose contents are safe under
+		// both shell source and dotenv parsing.
+		//
+		// The chownStep is appended to the SAME line as the heredoc
+		// `<<` operator (before the heredoc body) because shell will
+		// not let a `&&` at the start of a new line attach to a
+		// command whose heredoc body is in between. With this
+		// placement, the parser sees one logical line:
+		//   cat >> file <<TAG && { chown ...; }
+		// whose heredoc body follows. The `&&` short-circuits if
+		// cat fails (disk full, permissions, etc.) so we do not
+		// chown a file that did not get the new managed block.
+		script.WriteString(fmt.Sprintf(" && cat >> \"$ENV_FILE\" <<'%s'%s\n", envHeredocTag, chownStep))
+		script.WriteString(EnvBlockBegin)
+		script.WriteString("\n")
+		for _, k := range keys {
+			v := envMap[k]
+			if v == "" {
+				continue
+			}
+			// Escape embedded single quotes via the four-character
+			// idiom (apostrophe, backslash, apostrophe, apostrophe).
+			// Wrapped in single quotes, the result is one
+			// well-formed dotenv/shell string with no expansion.
+			escaped := strings.ReplaceAll(v, "'", `'\''`)
+			script.WriteString(k)
+			script.WriteString("='")
+			script.WriteString(escaped)
+			script.WriteString("'\n")
 		}
-		// Escape embedded single quotes: 'value' -> 'val'\''ue'.
-		// The result, wrapped in single quotes, is one well-formed
-		// dotenv/shell string with no expansion.
-		escaped := strings.ReplaceAll(v, "'", `'\''`)
-		script.WriteString(k)
-		script.WriteString("='")
-		script.WriteString(escaped)
-		script.WriteString("'\n")
+		script.WriteString(EnvBlockEnd)
+		script.WriteString("\n")
+		script.WriteString(envHeredocTag)
+		script.WriteString("\n")
+	} else {
+		// No managed-block emission. The awk pre-pass still
+		// rewrote the file (removing any prior block), so we
+		// still want to fix ownership of the result.
+		script.WriteString(chownStep)
 	}
-	script.WriteString(EnvBlockEnd)
-	script.WriteString("\n")
-	script.WriteString(envHeredocTag)
-	script.WriteString("\n")
 
 	return script.String(), nil
 }
 
@@ -7,6 +7,7 @@ import (
 	"log"
 	"net"
 	"net/http"
+	"runtime/debug"
 	"sort"
 	"strconv"
 	"strings"
@@ -127,6 +128,14 @@ type SluiceAddon struct {
 	// time.Sleep-based synchronization. Nil in production.
 	persistDone chan struct{}
 
+	// responsePanicHook is a test injection point for the Response
+	// handler's deferred recover. When non-nil it is invoked between
+	// the OAuth swap and the DLP scan, so a test can force the
+	// downstream-of-OAuth panic shape we observed in production
+	// without having to engineer a malformed Flow that triggers a
+	// real nil deref. Always nil in production.
+	responsePanicHook func()
+
 	// auditLog, when non-nil, receives per-request deny/inject events.
 	auditLog *audit.FileLogger
 
@@ -547,11 +556,53 @@ func (a *SluiceAddon) injectHeaders(f *mitmproxy.Flow, host string, port int) {
 	}
 	defer secret.Release()
 
-	f.Request.Header.Set(binding.Header, binding.FormatValue(secret.String()))
+	f.Request.Header.Set(binding.Header, binding.FormatValue(extractInjectableSecret(a.oauthIndex.Load(), binding.Credential, secret.String())))
 	log.Printf("[ADDON-INJECT] injected header %q for %s:%d (credential %q)",
 		binding.Header, host, port, binding.Credential)
 }
 
+// extractInjectableSecret returns the value to substitute into a binding's
+// `{value}` template.
+//
+// Static credentials are plain strings stored as-is in the vault; the
+// value to inject is the string itself. OAuth credentials are
+// JSON-marshalled OAuthCredential structs (access_token + refresh_token
+// + token_url + expires_at); the value to inject is just the
+// access_token, so a binding like `Authorization: Bearer {value}`
+// produces `Bearer <jwt>` rather than `Bearer {"access_token":...}`.
+//
+// We dispatch on the credential's metadata type (looked up via the
+// supplied OAuthIndex, populated from credential_meta on startup and
+// SIGHUP) rather than inferring from the secret's JSON shape. Shape
+// inference would mis-handle a static credential whose value happens
+// to be OAuth-shaped JSON. The credential_meta table is the single
+// source of truth for cred_type elsewhere in sluice; the injection
+// path follows the same rule.
+//
+// If the metadata says oauth but parsing fails (corrupted vault
+// entry, schema drift, etc.) we fall back to the raw secret. That
+// preserves the previous behavior on broken state instead of
+// returning an empty string and silently producing `Bearer ` headers.
+//
+// A nil index (no oauth credentials registered yet, or the QUIC
+// path running before UpdateOAuthIndex fires) means every
+// credential is treated as static.
+func extractInjectableSecret(idx *OAuthIndex, credName, secret string) string {
+	if idx == nil || !idx.Has(credName) {
+		return secret
+	}
+	cred, err := vault.ParseOAuth([]byte(secret))
+	if err != nil || cred == nil || cred.AccessToken == "" {
+		// Generic [INJECT] prefix because both the HTTP/1+2 and the
+		// HTTP/3 (QUIC) header-injection paths share this helper.
+		// An [ADDON-INJECT] tag would mislead a reader who saw the
+		// line in a deployment that uses HTTP/3 exclusively.
+		log.Printf("[INJECT] credential %q registered as oauth but vault payload not parseable; injecting raw secret", credName)
+		return secret
+	}
+	return cred.AccessToken
+}
+
 // Request performs Pass 2 (scoped phantom replacement) and Pass 3 (strip
 // unbound phantoms) on the fully-buffered request body, headers, URL query,
 // and URL path. Called by go-mitmproxy after the request body has been read
@@ -647,12 +698,52 @@ func (a *SluiceAddon) StreamRequestModifier(f *mitmproxy.Flow, in io.Reader) io.
 // redact patterns (see SetRedactRules) so credential strings in upstream
 // responses are scrubbed before being relayed to the agent.
 func (a *SluiceAddon) Response(f *mitmproxy.Flow) {
+	// Top-level recover so a panic inside any sub-step (OAuth swap,
+	// DLP scan, future hooks) cannot escape into go-mitmproxy's
+	// generic recover, which abandons the response body and leaves
+	// the agent reading an empty stream. We log the full stack so
+	// the underlying bug can be diagnosed later, but the response
+	// continues with whatever state f.Response was in at the time
+	// of the panic. Real tokens cannot leak: processOAuthResponseIfMatching
+	// has its own snapshot/rollback, and any panic in DLP runs AFTER
+	// OAuth swap (so tokens are already phantoms by then).
+	defer func() {
+		if r := recover(); r != nil {
+			host := "unknown"
+			method := ""
+			if f != nil && f.Request != nil {
+				if f.Request.URL != nil {
+					host = f.Request.URL.Host
+				}
+				method = f.Request.Method
+			}
+			log.Printf("[ADDON] PANIC in Response handler for %s %s: %v\n%s", method, host, r, debug.Stack())
+		}
+	}()
+
+	// Nil-flow guard. The deferred recover above dereferences f to
+	// build the log line; without this early return, a nil flow
+	// (which go-mitmproxy never produces in practice but tests can)
+	// would hit the recover path on every call. Mirror what
+	// StreamResponseModifier does so both entry points handle nil
+	// flows uniformly.
+	if f == nil {
+		return
+	}
 	if f.Response == nil || f.Request == nil {
 		return
 	}
 
 	a.processOAuthResponseIfMatching(f)
 
+	// Test-only panic injection. Always nil in production. Lets a
+	// regression test exercise the deferred recover above without
+	// having to construct a Flow that triggers a real downstream
+	// nil deref.
+	if a.responsePanicHook != nil {
+		a.responsePanicHook()
+	}
+
 	// Outbound DLP: scan response body and headers for credential
 	// patterns that should not reach the agent. Runs after OAuth
 	// processing so real tokens are already swapped to phantoms.
@@ -711,10 +802,56 @@ func (a *SluiceAddon) processOAuthResponseIfMatching(f *mitmproxy.Flow) {
 // emit a single WARNING per client connection so operators notice the
 // gap without log spam from multi-chunk streams. The dedup state lives
 // on dlpStreamWarned, scoped to the client connection id.
-func (a *SluiceAddon) StreamResponseModifier(f *mitmproxy.Flow, in io.Reader) io.Reader {
-	if f.Request == nil {
-		return in
-	}
+func (a *SluiceAddon) StreamResponseModifier(f *mitmproxy.Flow, in io.Reader) (out io.Reader) {
+	// Default to passing the input through unchanged; named return
+	// lets the deferred recover ensure we always hand SOMETHING
+	// usable back to go-mitmproxy on a panic, instead of letting
+	// the panic escape into mitmproxy's outer recover (which
+	// abandons the response body entirely).
+	out = in
+
+	// Defensive nil-input guard up front, BEFORE the flow checks
+	// below. If both `f` and `in` are nil (rare but possible in tests
+	// or on an unusual go-mitmproxy code path), the f-nil early
+	// return below would otherwise hand back a nil io.Reader, which
+	// the proxy's downstream copy would nil-deref on. http.NoBody
+	// keeps the response well-framed (zero bytes) and the panic is
+	// avoided regardless of what `f` looks like.
+	if in == nil {
+		out = http.NoBody
+		return out
+	}
+
+	if f == nil || f.Request == nil {
+		return out
+	}
+
+	// Known-safe fallback for the panic recover. Set ONLY after the
+	// OAuth phantom swap has produced a clean buffer that contains
+	// no real tokens. Critically: we never assign the raw upstream
+	// bytes here, because a matched OAuth token-endpoint response
+	// contains real access and refresh tokens, and a panic between
+	// io.ReadAll and a successful swapOAuthTokens would otherwise
+	// leak those tokens straight to the agent. If the panic fires
+	// before safeFallback is set, the recover hands back http.NoBody
+	// instead. The agent sees an empty 2xx token body and surfaces
+	// the failure as a parse error, which is the strictly safer
+	// outcome compared to leaking a real bearer.
+	var safeFallback []byte
+	defer func() {
+		if r := recover(); r != nil {
+			host := "unknown"
+			if f.Request != nil && f.Request.URL != nil {
+				host = f.Request.URL.Host
+			}
+			log.Printf("[ADDON] PANIC in StreamResponseModifier for %s: %v\n%s", host, r, debug.Stack())
+			if safeFallback != nil {
+				out = bytes.NewReader(safeFallback)
+			} else {
+				out = http.NoBody
+			}
+		}
+	}()
 
 	// Warn when DLP rules are configured but the response is streamed.
 	// Dedupe by client connection id so we emit at most one warning per
@@ -782,10 +919,20 @@ func (a *SluiceAddon) StreamResponseModifier(f *mitmproxy.Flow, in io.Reader) io
 
 	modified, err := a.swapOAuthTokens(body, contentType, credName)
 	if err != nil {
+		// The body did not parse as an OAuth token response. This is
+		// usually an HTML error page from a misconfigured token
+		// endpoint, not a credentials envelope, so passing it through
+		// is the historical behavior. We deliberately do NOT set
+		// safeFallback here: if a later panic somehow fires while
+		// returning, the recover defaults to http.NoBody rather than
+		// leaking whatever this body contains.
 		log.Printf("[ADDON-OAUTH] stream token parse error for credential %q: %v", credName, err)
 		return bytes.NewReader(body)
 	}
 
+	// Swap completed. The modified buffer is phantom-only and safe to
+	// hand back on a late panic.
+	safeFallback = modified
 	return bytes.NewReader(modified)
 }