@@ -146,6 +146,57 @@ COMPLIANCE_PROXY_API_TOKEN=CHANGE_ME_COMPLIANCE_PROXY_API_TOKEN
146146# transport. There is no separate ai-gateway runtime token (the previous
147147# AI_GATEWAY_API_TOKEN was an unvalidated 4th token; removed in F-0243).
148148
149+ # Performance flags — all OPTIONAL and non-secret. Every one ships with its
150+ # optimal value as the CODE default, so a stock deployment needs none of them;
151+ # set a variable only to diverge from the default.
152+ # NEXUS_LAZY_CANONICAL — compute the request canonical only when a synchronous
153+ # consumer needs it (smart routing / response cache); otherwise leave it nil
154+ # and let the async audit writer derive it off the latency path. On the clean
155+ # path (hooks-off, simple routing, cache-off) this skips the eager request-body
156+ # Normalize entirely (~29% of request-path CPU on a 50 KB body). DEFAULT ON;
157+ # NEXUS_LAZY_CANONICAL=0 forces always-compute. See normalization-architecture.md §5.2.
158+ # NEXUS_CGO_SCAN_LIMIT — cap concurrent hook content-scan cgo crossings.
159+ # DEFAULT "auto" (≈ CPUs − 2): tames the OS-thread (Go "M") oversubscription tail under high
160+ # hooks-on concurrency. "0" disables the cap; a positive integer pins it.
161+ # AI_GATEWAY_AUDIT_CODEC — inline-body compression codec on the audit side-path.
162+ # DEFAULT "s2" (faster; the larger frame is covered by the spool quota).
163+ # "zstd" trades CPU for a smaller frame.
164+ # NEXUS_AUDIT_WIRE — gw→hub audit wire. DEFAULT "binary" (the Hub dual-reads).
165+ # "json" reverts to the legacy text wire.
166+ # AI_GATEWAY_AUDIT_LOSS_MODE — audit overflow policy. DEFAULT "spill"
167+ # (non-blocking spill-defer: no loss until the spill channel + disk are
168+ # saturated; drops past that are counted on dropped_total). "block" = strict
169+ # back-pressure (never drops, slows the request path); "drop" = bounded loss.
170+ # NEXUS_QUOTA_WRITE_BEHIND / NEXUS_CREDSTATS_WRITE_BEHIND — defer quota and
171+ # credential-stats Redis writes off the request hot path (flush on an interval,
172+ # final drain on graceful shutdown). DEFAULT ON (soft quota). Overshoot per
173+ # instance ≤ read-cache TTL + flush interval (~1.25s); across an N-instance
174+ # fleet the blind-spend window is that × N (each instance is unaware of peers'
175+ # un-flushed spend), and a hard kill loses the un-flushed increments. Set to 0
176+ # for strict synchronous per-request accounting.
177+ # NEXUS_EVENTS_MAX_BYTES — NEXUS_EVENTS audit-stream cap. DEFAULT "auto"
178+ # (15% of total RAM; logs a WARN at startup with the chosen value). Pin a fixed
179+ # size to override, e.g. NEXUS_EVENTS_MAX_BYTES=32GB. Alias: NEXUS_STREAM_MAX_BYTES.
180+ # NEXUS_EVENTS_STORAGE — NEXUS_EVENTS storage tier. DEFAULT "memory" (the audit
181+ # stream is a delay-tolerant burst buffer; keeping it in RAM frees the data disk
182+ # for the durable Postgres writes, the single largest single-box throughput
183+ # lever). Trade-off: a NATS broker restart/crash drops published-but-undrained
184+ # events (those already reclaimed from the producer spill); the overflow→disk
185+ # no-loss path only covers the stream-full case, not a broker bounce. Set
186+ # NEXUS_EVENTS_STORAGE=file for a durable file-backed stream that survives a
187+ # broker restart at the cost of the steady-state disk writes.
188+ # GOMEMLIMIT — Go runtime soft memory limit (read by the Go runtime, not our
189+ # code). When UNSET, each service auto-sets it at boot from the cgroup memory
190+ # limit (~70% of the cgroup max) when one is present, and logs a WARN with the
191+ # chosen value and how to override; if no cgroup limit is detectable it is left
192+ # unset (no soft cap). Without a soft cap a burst of large request/response
193+ # bodies can grow the heap until the kernel OOM-kills the service (observed under
194+ # high-concurrency SSE). To pin it explicitly, set ~70% of the box/cgroup memory,
195+ # e.g. GOMEMLIMIT=22GiB on a 32 GiB box. The AMI/systemd deployment also stamps
196+ # it; the auto-set covers hand-rolled and container deployments that don't.
197+ # NEXUS_PPROF_ADDR=:6060 — bind a net/http/pprof server for profiling. Leave unset in
198+ # production unless actively profiling (":6060" listens on all interfaces; prefer 127.0.0.1:6060).
199+
149200# ─────────────────────────────────────────────────────────────────────────────
150201# Infrastructure URLs (REQUIRED; vary per environment)
151202# ─────────────────────────────────────────────────────────────────────────────
@@ -229,6 +280,49 @@ NATS_URL=nats://localhost:4222
229280# default.
230281# AI_GATEWAY_AUDIT_SPOOL_DIR=/var/lib/nexus/audit-spool
231282
283+ # In-heap audit record-buffer cap (overflow → durable spill above). Each queued
284+ # record pins its pooled ~50 KB body until marshaled, so this bound is the primary
285+ # control over the audit side-path's gw heap: 10000 (default) holds the body pool
286+ # near ~1 GB under a slow-publish burst vs ~5 GB at the former 50000, same spill
287+ # rate. Raise on a memory-rich box, lower on a constrained one. 0/unset → 10000.
288+ # AI_GATEWAY_AUDIT_MAX_QUEUED_RECORDS=10000
289+
290+ # Audit overflow policy. Durable audit is a product promise + compliance
291+ # requirement. DEFAULT "spill" is spill-defer: the request path never
292+ # back-pressures — overflow goes to the durable on-disk spool and the
293+ # spill-recovery sweeper replays it to Postgres. No loss UNTIL the in-process
294+ # spill channel + disk are saturated; under sustained overload past that point
295+ # records are dropped and counted on dropped_total (never silently). This lifts
296+ # clean-path RPS past the block-mode ceiling. Alternatives: "block" = hard
297+ # synchronous back-pressure (never drops — slows the request path until the audit
298+ # pipeline drains; strictest compliance posture); "drop" = counted bounded drop
299+ # (lossy, non-compliance only). Empty/unknown → "block" (never silently lossy
300+ # from a typo).
301+ # AI_GATEWAY_AUDIT_LOSS_MODE=spill
302+
303+ # End-to-end compression (AI_GATEWAY_AUDIT_CODEC: "s2" by default, or "zstd") of large captured audit bodies. The producer
304+ # compresses off the request path (async marshal worker), the body rides the
305+ # NATS wire compressed, the Hub persists the compressed bytes verbatim (no
306+ # decompress on ingest), and only the Control-Plane view layer decompresses.
307+ # Captured bodies are JSON/text (~3-10x), and the audit pipeline is disk-I/O-
308+ # bound at the NATS broker, so this is the direct lever on publish throughput.
309+ # Default true; set 0/false to disable.
310+ # AI_GATEWAY_AUDIT_COMPRESS=true
311+ # Smallest captured body worth compressing (zstd frame + base64 overhead can
312+ # exceed savings below this). 0/unset → 1024.
313+ # AI_GATEWAY_AUDIT_COMPRESS_MIN_BYTES=1024
314+ # zstd encoder level (1=fastest, 3=default, higher=better ratio/slower).
315+ # 0/unset → library default.
316+ # AI_GATEWAY_AUDIT_COMPRESS_LEVEL=3
317+ # Spill-recovery sweeper: replays sealed on-disk spool files back into the MQ
318+ # queue so a record that overflowed to disk still reaches the queryable store
319+ # (the drain half of spill-defer). ON by default whenever a spool dir is set —
320+ # a durable spool that never reaches Postgres is a silent data gap. Interval =
321+ # sweep period; pace = throttle between files (yields the box to the request
322+ # path). 0/unset → 2000 ms / 50 ms. Set INTERVAL_MS negative to DISABLE.
323+ # AI_GATEWAY_AUDIT_SPILL_RECOVERY_INTERVAL_MS=2000
324+ # AI_GATEWAY_AUDIT_SPILL_RECOVERY_PACE_MS=50
325+
232326# Service discovery — co-located services on localhost; on prod each is a
233327# domain or LB. Every URL below is bare-named (no service prefix) because
234328# it identifies a shared environment-level entity, not a service-private
@@ -283,6 +377,36 @@ AUTH_SERVER_ISSUER=http://127.0.0.1:3001
283377# Accept localhost WebSocket origins (dev only). MUST stay false/unset in prod.
284378# NEXUS_HUB_DEV_MODE=true
285379
380+ # NEXUS_EVENTS JetStream stream cap (audit side-path burst buffer). Accepts
381+ # "8GB" / "512MB" / a bare byte count, or "auto"/unset. DEFAULT "auto" = 15% of
382+ # total RAM (a WARN at startup logs the chosen value); pin a fixed size to override.
383+ # The producer publishes full-speed and the Hub drains lazily, so this absorbs a
384+ # long burst. The stream uses DiscardNew: at the cap, NEW audit publishes fail and
385+ # the gateway spills them durably to disk — it does NOT discard old un-acked rows.
386+ # Alias NEXUS_STREAM_MAX_BYTES (perf-rig name) is honoured when this is unset.
387+ # NEXUS_EVENTS_MAX_BYTES=auto
388+ #
389+ # NEXUS_EVENTS storage tier. DEFAULT "memory" (in-RAM stream) — keeps the
390+ # delay-tolerant burst buffer off the data disk so the disk serves the durable
391+ # Postgres writes, the single largest single-box throughput lever. NOTE: the cap
392+ # above is committed to RAM, so on a 256 GiB box "auto" (15%) commits ~38 GiB to
393+ # the stream — size GOMEMLIMIT/cgroup accordingly. Trade-off: a NATS broker
394+ # restart/crash drops published-but-undrained events; the overflow→disk no-loss
395+ # path covers only the stream-full case, not a broker bounce. Set "file" for a
396+ # durable file-backed stream that survives a restart at the cost of steady-state
397+ # disk writes.
398+ # NEXUS_EVENTS_STORAGE=memory
399+
400+ # Traffic-event drain duty cycle — how the audit drain yields CPU to a co-located
401+ # AI-gateway core path (yaml: consumers.trafficDrainDutyCycle, this env overrides).
402+ # Default 0.3 = FIXED throttle: reliably yields the single box's memory bandwidth /
403+ # loopback / Postgres to the gateway core path (measured: gateway 200-VU non-SSE
404+ # RPS ~5150 -> ~6300, beating Bifrost 5284, no loss). 0 = ADAPTIVE CPU-pressure
405+ # probe (best on a small/CPU-bound box; cannot see memory-bandwidth contention on
406+ # a core-rich box). >=1 = OFF (dedicated Hub box). The NEXUS_EVENTS stream absorbs the
407+ # backlog while the drain idles; audit is delay-tolerant, and retention keeps it no-loss (barring a broker restart on "memory" storage).
408+ # NEXUS_HUB_AUDIT_DRAIN_DUTY_CYCLE=0.3
409+
286410# Control Plane knobs.
287411# CONTROL_PLANE_PORT=3001
288412# CONTROL_PLANE_HOST=127.0.0.1
@@ -320,6 +444,9 @@ NEXUS_ASSISTANT_SYSTEM_VK=
320444# AI Gateway knobs.
321445# AI_GATEWAY_PORT=3050
322446# AI_GATEWAY_HOST=127.0.0.1
447+ # Initial (pre-grown) size, in KiB, of the request-body read scratch buffer (server.requestReadBufKb).
448+ # Default 64; raise to ~128 for fleets that routinely carry ~128K-token contexts.
449+ # AI_GATEWAY_REQUEST_READ_BUF_KB=64
323450# AI_GATEWAY_CACHE_ENABLED=true
324451# AI_GATEWAY_CACHE_TTL=5m
325452# AI_GATEWAY_CACHE_PREFIX=ai-gw:
0 commit comments