fix(entrypoint): relax config permissions before write after CAP_DAC_OVERRIDE drop (#2659)

jyaunches · web-flow · commit 3739258e631d · 2026-04-29T08:34:17.000-07:00
## Summary

Fix the `test/e2e/test-runtime-overrides.sh` E2E crash at baseline config capture. After PR #917 drops `CAP_DAC_OVERRIDE` via `capsh`, root can no longer write to 444-mode config files. The three runtime override functions (`apply_model_override`, `apply_cors_override`, `apply_slack_token_override`) now temporarily relax permissions to 644 before writing and re-lock to 444 afterward, using `CAP_FOWNER` which is retained by design.

## Related Issue

Fixes #2653

## Changes

- **`scripts/lib/sandbox-init.sh`**: Add `relax_config_for_write()` / `lock_config_after_write()` shared helpers that `chmod 644`/`chmod 444` with symlink guards
- **`scripts/nemoclaw-start.sh`**: Wrap python3 write + hash recomputation in all three override functions with the new helpers
- **`scripts/nemoclaw-start.sh`**: Tighten `apply_model_override()` trigger guard — only fire when `NEMOCLAW_MODEL_OVERRIDE` or `NEMOCLAW_INFERENCE_API_OVERRIDE` is explicitly set (Dockerfile ENV defaults no longer trigger spurious override runs)
- **`test/e2e/test-runtime-overrides.sh`**: Create timestamped log file matching CI artifact glob `test-runtime-overrides-*.log`, replace all `2>/dev/null` with `2>>"$LOG_FILE"`
- **`test/e2e/test-runtime-overrides.sh`**: Update tests 3-5, 9-11, 14 to pair standalone parameters with `NEMOCLAW_MODEL_OVERRIDE` (matching the tightened trigger guard)

## Type of Change

- [x] Code change (feature, bug fix, or refactor)
- [ ] Code change with doc updates
- [ ] Doc only (prose changes, no code sample modifications)
- [ ] Doc only (includes code sample changes)

## Verification

- [ ] `npx prek run --all-files` passes
- [x] `npm test` passes
- [x] Tests added or updated for new or changed behavior
- [x] No secrets, API keys, or credentials committed
- [ ] Docs updated for user-facing behavior changes
- [ ] `make docs` builds without warnings (doc changes only)
- [ ] Doc pages follow the [style guide](https://github.com/NVIDIA/NemoClaw/blob/main/docs/CONTRIBUTING.md) (doc changes only)
- [ ] New doc pages include SPDX header and frontmatter (new pages only)

## AI Disclosure

- [x] AI-assisted — tool: Claude Code (pi agent)

---

Signed-off-by: Julie Yaunches <jyaunches@nvidia.com>

## Summary by CodeRabbit

* **Bug Fixes**
  * Overrides now only apply when explicit override flags are set, preventing unintended changes.
  * Safer config writes: permissions are temporarily relaxed with strict safety checks (rejecting unsafe targets, skipping missing files), clear security errors on failure, recompute verification only on successful writes, and permissions are always restored; write/hash failures are surfaced.
* **Tests**
  * Improved test logging with timestamped artifacts and preserved stderr; runtime override tests updated to cover model-override scenarios.

---------

Signed-off-by: Julie Yaunches <jyaunches@nvidia.com>
diff --git a/scripts/lib/sandbox-init.sh b/scripts/lib/sandbox-init.sh
@@ -143,6 +143,52 @@ validate_tmp_permissions() {
   return $failed
 }
 
+# ── Config file permission helpers ────────────────────────────────
+# After drop_capabilities() strips CAP_DAC_OVERRIDE, root can no longer
+# write to files with mode 444. These helpers temporarily relax config
+# files to 644 for writing, then re-lock to 444 afterward.
+#
+# CAP_FOWNER is retained (by design in PR #917), so root can still chmod
+# files it doesn't own. The helpers include symlink guards to prevent
+# symlink-following attacks on the config path.
+#
+# Usage:
+#   relax_config_for_write /sandbox/.openclaw/openclaw.json /sandbox/.openclaw/.config-hash
+#   # ... perform writes ...
+#   lock_config_after_write /sandbox/.openclaw/openclaw.json /sandbox/.openclaw/.config-hash
+#
+# Ref: https://github.com/NVIDIA/NemoClaw/issues/2653
+
+relax_config_for_write() {
+  local f
+  # Vet every path before the first chmod so a refusal leaves nothing at 644.
+  for f in "$@"; do
+    [ -L "$f" ] || continue
+    printf '[SECURITY] Refusing to relax permissions — %s is a symlink\n' "$f" >&2
+    return 1
+  done
+  for f in "$@"; do
+    { [ ! -f "$f" ] || chmod 644 "$f"; } && continue
+    printf '[SECURITY] Failed to relax permissions on %s\n' "$f" >&2
+    return 1
+  done
+}
+
+lock_config_after_write() {
+  # Best effort: keep locking the remaining files after a failure; return 1 if any failed.
+  local f rc=0
+  for f in "$@"; do
+    if [ -L "$f" ]; then
+      printf '[SECURITY] Refusing to lock permissions — %s is a symlink\n' "$f" >&2
+      rc=1
+    elif [ -f "$f" ] && ! chmod 444 "$f"; then
+      printf '[SECURITY] Failed to lock permissions on %s\n' "$f" >&2
+      rc=1
+    fi
+  done
+  return "$rc"
+}
+
 # ── Capability dropping ──────────────────────────────────────────
 # CIS Docker Benchmark 5.3: containers should not run with default caps.
 # OpenShell manages the container runtime so we cannot pass --cap-drop=ALL
diff --git a/scripts/nemoclaw-start.sh b/scripts/nemoclaw-start.sh
@@ -193,12 +193,13 @@ _SANDBOX_HOME="/sandbox"          # Home dir for the sandbox user (useradd -d /s
 # Ref: https://github.com/NVIDIA/NemoClaw/issues/759
 
 apply_model_override() {
-  # Any of these env vars trigger a config patch
+  # Only explicit override env vars trigger a config patch. NEMOCLAW_CONTEXT_WINDOW,
+  # NEMOCLAW_MAX_TOKENS, and NEMOCLAW_REASONING are promoted from Dockerfile build
+  # ARGs to ENV and are always set — they should only take effect when accompanied
+  # by an explicit model or API override. Without this guard the function runs on
+  # every container start even with no override requested. Ref: #2653
   [ -n "${NEMOCLAW_MODEL_OVERRIDE:-}" ] \
     || [ -n "${NEMOCLAW_INFERENCE_API_OVERRIDE:-}" ] \
-    || [ -n "${NEMOCLAW_CONTEXT_WINDOW:-}" ] \
-    || [ -n "${NEMOCLAW_MAX_TOKENS:-}" ] \
-    || [ -n "${NEMOCLAW_REASONING:-}" ] \
     || return 0
 
   # SECURITY: Only root can write to /sandbox/.openclaw (root:root 444).
@@ -272,10 +273,15 @@ apply_model_override() {
   [ -n "$max_tokens" ] && printf '[config] Applying max tokens override: %s\n' "$max_tokens" >&2
   [ -n "$reasoning" ] && printf '[config] Applying reasoning override: %s\n' "$reasoning" >&2
 
+  # Relax 444 → 644 so writes succeed after CAP_DAC_OVERRIDE is dropped (#2653).
+  # Re-lock in all exit paths so files are never left at 644 on failure.
+  relax_config_for_write "$config_file" "$hash_file"
+  local _write_rc=0
+
   NEMOCLAW_CONTEXT_WINDOW="$context_window" \
     NEMOCLAW_MAX_TOKENS="$max_tokens" \
     NEMOCLAW_REASONING="$reasoning" \
-    python3 - "$config_file" "$model_override" "$api_override" <<'PYOVERRIDE'
+    python3 - "$config_file" "$model_override" "$api_override" <<'PYOVERRIDE' || _write_rc=$?
 import json, os, sys
 
 config_file, model_override, api_override = sys.argv[1], sys.argv[2], sys.argv[3]
@@ -311,9 +317,18 @@ with open(config_file, "w") as f:
     json.dump(cfg, f, indent=2)
 PYOVERRIDE
 
-  # Recompute config hash so integrity check passes on next startup
-  (cd /sandbox/.openclaw && sha256sum openclaw.json >"$hash_file")
-  printf '[SECURITY] Config hash recomputed after model override\n' >&2
+  if [ "$_write_rc" -eq 0 ]; then
+    # Recompute config hash so integrity check passes on next startup
+    if (cd /sandbox/.openclaw && sha256sum openclaw.json >"$hash_file"); then
+      printf '[SECURITY] Config hash recomputed after model override\n' >&2
+    else
+      _write_rc=$?
+    fi
+  fi
+
+  # Re-lock 644 → 444 — always runs, even on write/hash failure (#2653)
+  lock_config_after_write "$config_file" "$hash_file"
+  [ "$_write_rc" -eq 0 ] || return "$_write_rc"
 }
 
 # ── Runtime CORS origin override ──────────────────────────────────
@@ -356,7 +371,12 @@ apply_cors_override() {
 
   printf '[config] Adding CORS origin: %s\n' "$cors_origin" >&2
 
-  python3 - "$config_file" "$cors_origin" <<'PYCORS'
+  # Relax 444 → 644 so writes succeed after CAP_DAC_OVERRIDE is dropped (#2653).
+  # Re-lock in all exit paths so files are never left at 644 on failure.
+  relax_config_for_write "$config_file" "$hash_file"
+  local _write_rc=0
+
+  python3 - "$config_file" "$cors_origin" <<'PYCORS' || _write_rc=$?
 import json, sys
 
 config_file, cors_origin = sys.argv[1], sys.argv[2]
@@ -373,8 +393,17 @@ with open(config_file, "w") as f:
     json.dump(cfg, f, indent=2)
 PYCORS
 
-  (cd /sandbox/.openclaw && sha256sum openclaw.json >"$hash_file")
-  printf '[config] Config hash recomputed after CORS override\n' >&2
+  if [ "$_write_rc" -eq 0 ]; then
+    if (cd /sandbox/.openclaw && sha256sum openclaw.json >"$hash_file"); then
+      printf '[config] Config hash recomputed after CORS override\n' >&2
+    else
+      _write_rc=$?
+    fi
+  fi
+
+  # Re-lock 644 → 444 — always runs, even on write/hash failure (#2653)
+  lock_config_after_write "$config_file" "$hash_file"
+  [ "$_write_rc" -eq 0 ] || return "$_write_rc"
 }
 
 # ── Slack token placeholder resolution ────────────────────────────
@@ -431,9 +460,14 @@ apply_slack_token_override() {
 
   printf '[channels] Resolving Slack token placeholders in openclaw.json\n' >&2
 
+  # Relax 444 → 644 so writes succeed after CAP_DAC_OVERRIDE is dropped (#2653).
+  # Re-lock in all exit paths so files are never left at 644 on failure.
+  relax_config_for_write "$config_file" "$hash_file"
+  local _write_rc=0
+
   SLACK_BOT_TOKEN="$SLACK_BOT_TOKEN" \
     SLACK_APP_TOKEN="${SLACK_APP_TOKEN:-}" \
-    python3 - "$config_file" <<'PYSLACK'
+    python3 - "$config_file" <<'PYSLACK' || _write_rc=$?
 import json, os, re, sys
 
 config_file = sys.argv[1]
@@ -463,8 +497,17 @@ with open(config_file, "w") as f:
     f.write(content)
 PYSLACK
 
-  (cd /sandbox/.openclaw && sha256sum openclaw.json >"$hash_file")
-  printf '[channels] Config hash recomputed after Slack token override\n' >&2
+  if [ "$_write_rc" -eq 0 ]; then
+    if (cd /sandbox/.openclaw && sha256sum openclaw.json >"$hash_file"); then
+      printf '[channels] Config hash recomputed after Slack token override\n' >&2
+    else
+      _write_rc=$?
+    fi
+  fi
+
+  # Re-lock 644 → 444 — always runs, even on write/hash failure (#2653)
+  lock_config_after_write "$config_file" "$hash_file"
+  [ "$_write_rc" -eq 0 ] || return "$_write_rc"
 }
 
 # ── Slack channel guard (unhandled-rejection safety net) ─────────
diff --git a/test/e2e/test-runtime-overrides.sh b/test/e2e/test-runtime-overrides.sh
@@ -36,13 +36,22 @@ info() { echo -e "${YELLOW}TEST${NC}: $1"; }
 PASSED=0
 FAILED=0
 
+# ── Log file for CI artifact collection ──────────────────────────
+# Create a timestamped log file whose name matches the CI artifact glob
+# test-runtime-overrides-*.log so Docker stderr is captured automatically.
+LOG_DIR="${SCRIPT_DIR}"
+LOG_FILE="${LOG_DIR}/test-runtime-overrides-$(date +%Y%m%dT%H%M%S).log"
+: >"$LOG_FILE"
+info "Logging Docker stderr to: $LOG_FILE"
+
 # Helper: run entrypoint with env vars, then read a config field via jq.
 # The entrypoint patches config and starts the gateway — we only need the
 # config patch, so we override CMD to just cat the config and exit.
+# Docker stderr is captured to the log file for CI artifact visibility.
 run_override() {
   local env_args=("$@")
   docker run --rm "${env_args[@]}" "$IMAGE" \
-    bash -c 'cat /sandbox/.openclaw/openclaw.json' 2>/dev/null
+    bash -c 'cat /sandbox/.openclaw/openclaw.json' 2>>"$LOG_FILE"
 }
 
 # Helper: run entrypoint with env vars and capture stderr for validation messages.
@@ -53,6 +62,8 @@ run_override_stderr() {
   docker run --rm "${env_args[@]}" "$IMAGE" \
     bash -c 'true' >/dev/null 2>"$tmpfile" || true
   cat "$tmpfile"
+  # Also append to the main log file for CI artifact capture
+  cat "$tmpfile" >>"$LOG_FILE"
   rm -f "$tmpfile"
 }
 
@@ -83,7 +94,7 @@ info "Baseline: model=$BASELINE_MODEL ctx=$BASELINE_CTX max=$BASELINE_MAX reason
 # ── Test 1: No-op baseline ───────────────────────────────────────
 
 info "1. No overrides — config matches build-time defaults"
-HASH_CHECK=$(docker run --rm "$IMAGE" bash -c 'cd /sandbox/.openclaw && sha256sum -c .config-hash --status && echo OK || echo FAIL' 2>/dev/null)
+HASH_CHECK=$(docker run --rm "$IMAGE" bash -c 'cd /sandbox/.openclaw && sha256sum -c .config-hash --status && echo OK || echo FAIL' 2>>"$LOG_FILE")
 if [ "$HASH_CHECK" = "OK" ]; then
   pass "baseline config hash valid"
 else
@@ -104,17 +115,19 @@ fi
 
 # Verify hash was recomputed
 HASH_CHECK=$(docker run --rm -e "NEMOCLAW_MODEL_OVERRIDE=$OVERRIDE_MODEL" "$IMAGE" \
-  bash -c 'cd /sandbox/.openclaw && sha256sum -c .config-hash --status && echo OK || echo FAIL' 2>/dev/null)
+  bash -c 'cd /sandbox/.openclaw && sha256sum -c .config-hash --status && echo OK || echo FAIL' 2>>"$LOG_FILE")
 if [ "$HASH_CHECK" = "OK" ]; then
   pass "config hash valid after model override"
 else
   fail "config hash invalid after model override"
 fi
 
 # ── Test 3: Context window override ──────────────────────────────
+# NEMOCLAW_CONTEXT_WINDOW only takes effect alongside a model override
+# (standalone values are baked at build time). Ref: #2653 Phase 2.
 
-info "3. NEMOCLAW_CONTEXT_WINDOW patches contextWindow"
-CFG=$(run_override -e "NEMOCLAW_CONTEXT_WINDOW=32768")
+info "3. NEMOCLAW_CONTEXT_WINDOW patches contextWindow (with model override)"
+CFG=$(run_override -e "NEMOCLAW_MODEL_OVERRIDE=$OVERRIDE_MODEL" -e "NEMOCLAW_CONTEXT_WINDOW=32768")
 ACTUAL=$(echo "$CFG" | jq -r '.models.providers | to_entries[0].value.models[0].contextWindow')
 if [ "$ACTUAL" = "32768" ]; then
   pass "contextWindow overridden to 32768"
@@ -124,8 +137,8 @@ fi
 
 # ── Test 4: Max tokens override ──────────────────────────────────
 
-info "4. NEMOCLAW_MAX_TOKENS patches maxTokens"
-CFG=$(run_override -e "NEMOCLAW_MAX_TOKENS=16384")
+info "4. NEMOCLAW_MAX_TOKENS patches maxTokens (with model override)"
+CFG=$(run_override -e "NEMOCLAW_MODEL_OVERRIDE=$OVERRIDE_MODEL" -e "NEMOCLAW_MAX_TOKENS=16384")
 ACTUAL=$(echo "$CFG" | jq -r '.models.providers | to_entries[0].value.models[0].maxTokens')
 if [ "$ACTUAL" = "16384" ]; then
   pass "maxTokens overridden to 16384"
@@ -135,8 +148,8 @@ fi
 
 # ── Test 5: Reasoning override ───────────────────────────────────
 
-info "5. NEMOCLAW_REASONING=true patches reasoning"
-CFG=$(run_override -e "NEMOCLAW_REASONING=true")
+info "5. NEMOCLAW_REASONING=true patches reasoning (with model override)"
+CFG=$(run_override -e "NEMOCLAW_MODEL_OVERRIDE=$OVERRIDE_MODEL" -e "NEMOCLAW_REASONING=true")
 ACTUAL=$(echo "$CFG" | jq -r '.models.providers | to_entries[0].value.models[0].reasoning')
 if [ "$ACTUAL" = "true" ]; then
   pass "reasoning overridden to true"
@@ -190,23 +203,23 @@ else
 fi
 
 info "9. NEMOCLAW_CONTEXT_WINDOW with non-integer is rejected"
-STDERR=$(run_override_stderr -e "NEMOCLAW_CONTEXT_WINDOW=notanumber")
+STDERR=$(run_override_stderr -e "NEMOCLAW_MODEL_OVERRIDE=test" -e "NEMOCLAW_CONTEXT_WINDOW=notanumber")
 if echo "$STDERR" | grep -q "must be a positive integer"; then
   pass "non-integer context window rejected"
 else
   fail "non-integer context window was not rejected"
 fi
 
 info "10. NEMOCLAW_MAX_TOKENS with non-integer is rejected"
-STDERR=$(run_override_stderr -e "NEMOCLAW_MAX_TOKENS=abc")
+STDERR=$(run_override_stderr -e "NEMOCLAW_MODEL_OVERRIDE=test" -e "NEMOCLAW_MAX_TOKENS=abc")
 if echo "$STDERR" | grep -q "must be a positive integer"; then
   pass "non-integer max tokens rejected"
 else
   fail "non-integer max tokens was not rejected"
 fi
 
 info "11. NEMOCLAW_REASONING with invalid value is rejected"
-STDERR=$(run_override_stderr -e "NEMOCLAW_REASONING=maybe")
+STDERR=$(run_override_stderr -e "NEMOCLAW_MODEL_OVERRIDE=test" -e "NEMOCLAW_REASONING=maybe")
 if echo "$STDERR" | grep -q 'must be "true" or "false"'; then
   pass "invalid reasoning value rejected"
 else
@@ -232,7 +245,7 @@ fi
 # ── Test 14: Original config unchanged after rejected override ───
 
 info "14. Config unchanged after rejected override"
-CFG=$(run_override -e "NEMOCLAW_CONTEXT_WINDOW=notanumber")
+CFG=$(run_override -e "NEMOCLAW_MODEL_OVERRIDE=test" -e "NEMOCLAW_CONTEXT_WINDOW=notanumber")
 ACTUAL_CTX=$(echo "$CFG" | jq -r '.models.providers | to_entries[0].value.models[0].contextWindow')
 if [ "$ACTUAL_CTX" = "$BASELINE_CTX" ]; then
   pass "config unchanged after rejected override"
diff --git a/test/nemoclaw-start.test.ts b/test/nemoclaw-start.test.ts
@@ -401,16 +401,25 @@ describe("runtime model override (#759)", () => {
     expect(fn[1]).toContain('NEMOCLAW_REASONING must be "true" or "false"');
   });
 
-  it("triggers on any override env var, not just MODEL_OVERRIDE", () => {
+  it("triggers only on explicit override env vars (MODEL_OVERRIDE or INFERENCE_API_OVERRIDE)", () => {
     const fn = src.match(/apply_model_override\(\) \{([\s\S]*?)^}/m);
     expect(fn).toBeTruthy();
-    // The guard should check all five env vars
+    // The guard should only check the two explicit override env vars (#2653).
+    // NEMOCLAW_CONTEXT_WINDOW, NEMOCLAW_MAX_TOKENS, and NEMOCLAW_REASONING are
+    // promoted from Dockerfile ARGs to ENV and always set — they should only
+    // take effect alongside an explicit model or API override.
     const guard = fn[1].split("return 0")[0];
     expect(guard).toContain("NEMOCLAW_MODEL_OVERRIDE");
     expect(guard).toContain("NEMOCLAW_INFERENCE_API_OVERRIDE");
-    expect(guard).toContain("NEMOCLAW_CONTEXT_WINDOW");
-    expect(guard).toContain("NEMOCLAW_MAX_TOKENS");
-    expect(guard).toContain("NEMOCLAW_REASONING");
+    expect(guard).not.toMatch(
+      /\[\s*-n\s*"\$\{NEMOCLAW_CONTEXT_WINDOW:-\}"/,
+    );
+    expect(guard).not.toMatch(
+      /\[\s*-n\s*"\$\{NEMOCLAW_MAX_TOKENS:-\}"/,
+    );
+    expect(guard).not.toMatch(
+      /\[\s*-n\s*"\$\{NEMOCLAW_REASONING:-\}"/,
+    );
   });
 });