11package adapter
22
33import (
4- "sync"
54 "sync/atomic"
65
76 lua "github.com/yuin/gopher-lua"
@@ -34,32 +33,57 @@ const luaResetKeySlack = 8
3433// table, redis, cjson, cmsgpack).
3534const luaWhitelistedTableHint = 8
3635
36+ // DefaultLuaPoolMaxIdle is the default upper bound on idle pooled
37+ // *lua.LState instances retained for reuse. Each pooled state holds
38+ // the base stdlib + redis/cjson/cmsgpack closures + per-state
39+ // snapshot tables (globals / tables / metatables); empirically ~200
40+ // KiB of long-lived heap per state. 64 is sized to comfortably cover
41+ // typical Redis-side EVAL/EVALSHA concurrency (one in-flight script
42+ // per connection up to redcon's default worker pool) without
43+ // retaining a long tail of warm states after a burst subsides.
44+ //
45+ // Operators expecting sustained higher concurrency can raise the cap
46+ // with --redisLuaMaxIdleStates; concurrency that exceeds the cap
47+ // still works correctly — excess get() calls fall through to a fresh
48+ // allocation (miss) and excess put() calls drop the state for the GC
49+ // (drop). The cap therefore controls memory floor, not throughput
50+ // ceiling.
51+ //
52+ // Exported so main.go can use it as the default for the
53+ // --redisLuaMaxIdleStates flag instead of duplicating the literal
54+ // (which trips the mnd lint and creates a drift source if the
55+ // adapter-side default changes).
56+ const DefaultLuaPoolMaxIdle = 64
57+
3758// luaStatePool pools *lua.LState instances to cut heap/GC pressure on
3859// high-rate EVAL / EVALSHA workloads (e.g. BullMQ ~10 scripts/s, where
3960// each fresh state allocs ~34% of in-use heap via newFuncContext,
4061// newRegistry, newFunctionProto).
4162//
42- // Security invariant: no state must leak between scripts. Each pooled
43- // state is initialised with a fixed set of base globals (redis, cjson,
44- // cmsgpack, table/string/math + base lib helpers, and nil-ed loaders).
45- // Three snapshots are captured at construction time:
63+ // Internal storage is a buffered channel of capacity maxIdle. We
64+ // previously used sync.Pool, which GC-clears on every cycle and has
65+ // no operator-tunable capacity; the bounded channel gives a
66+ // predictable memory floor (maxIdle * per-state footprint) and an
67+ // observable "states dropped because the pool was full" counter that
68+ // makes mis-sizing visible. The channel ops are O(1) atomic CAS on
69+ // the buffered chan; under high concurrency the contention is
70+ // comparable to sync.Pool's per-P slabs and well below the cost of
71+ // a single Lua eval. See TestLua_PoolBoundedOverflow for the
72+ // invariants.
4673//
47- // - globalsSnapshot: the full (*any*-keyed) _G map at init. Using an
48- // LValue-keyed map lets the reset path catch non-string-keyed
49- // leaks like `_G[42] = "secret"` or `_G[true] = "bad"`, which
50- // would otherwise survive a naive string-only wipe.
51- // - tableSnapshots: a shallow map from each whitelisted nested
52- // table (string, math, table, redis, cjson, cmsgpack) to its
53- // init-time field set. This is what blocks table-poisoning
54- // attacks such as `string.upper = function() return "pwned" end`
55- // -- merely restoring the `string` *reference* on _G would leave
56- // the shared table's fields still mutated.
57- // - metatableSnapshots: the init-time raw metatable of _G plus of
58- // every whitelisted nested table. Without this, a script calling
59- // `setmetatable(_G, { __index = function() return "pwned" end })`
60- // could leak a poisoned fallback into the next pooled eval via
61- // any undefined-global access. Same risk for `setmetatable(string,
62- // ...)` etc.
74+ // Security invariant: no state must leak between scripts. Each
75+ // pooled state is initialised with a fixed set of base globals
76+ // (redis, cjson, cmsgpack, table/string/math + base lib helpers,
77+ // and nil-ed loaders). Three snapshots — captured per-state at
78+ // construction and stored on pooledLuaState — back the reset path:
79+ // globalsSnapshot (the full LValue-keyed _G map, so non-string-keyed
80+ // leaks like `_G[42] = "secret"` are caught), tableSnapshots
81+ // (shallow field sets of the whitelisted nested tables, so
82+ // `string.upper = function() return "pwned" end` cannot poison
83+ // reuse), and metatableSnapshots (the init-time raw metatable of
84+ // _G plus each whitelisted nested table, so a script-installed
85+ // `setmetatable(_G, { __index = function() … end })` does not
86+ // leak across evals).
6387//
6488// On release, the reset routine
6589//
@@ -68,33 +92,30 @@ const luaWhitelistedTableHint = 8
6892// poisoning,
6993// 2. walks each snapshotted nested table and restores its contents
7094// (deletes script-added fields, rebinds original fields),
71- // 3. walks the current global table and deletes every key -- of any
72- // type -- that is not present in the globals snapshot (removes
95+ // 3. walks the current global table and deletes every key — of any
96+ // type — that is not present in the globals snapshot (removes
7397// user-added globals such as KEYS, ARGV, GLOBAL_LEAK, _G[42]),
7498// and
75- // 4. restores every globals-snapshot key to its original value (so a
76- // script that did `table = nil` or `redis = evil` cannot poison
77- // the next script).
78- //
79- // Additionally the value stack is truncated to 0 and the script
80- // context binding is cleared so the redis.call/pcall closures cannot
81- // be invoked against a stale context.
99+ // 4. restores every globals-snapshot key to its original value (so
100+ // a script that did `table = nil` or `redis = evil` cannot
101+ // poison the next script).
82102//
83- // The redis / cjson / cmsgpack closures are registered ONCE at pool
84- // fill time and read the per-eval *luaScriptContext out of each
85- // state's own Lua registry (see luaCtxRegistryKey / ctxBinding),
86- // which is set on acquire and cleared on release. Closures that
87- // would otherwise capture a fresh context per eval no longer need
88- // to be re-registered, which is what makes pooling safe and cheap.
89- // The registry-backed binding is also the reason redis.call is
90- // lock-free in the hot path, unlike the first iteration which used
103+ // The value stack is also truncated to 0 and the script-context
104+ // binding is cleared so the redis.call/pcall closures cannot be
105+ // invoked against a stale context. Those closures are registered
106+ // ONCE at pool fill time and read the per-eval *luaScriptContext
107+ // out of each state's own Lua registry (see luaCtxRegistryKey /
108+ // pooledLuaState.ctxBinding); this is what keeps redis.call
109+ // lock-free on the hot path, unlike the first iteration which used
91110// a package-level map guarded by sync.RWMutex.
92111type luaStatePool struct {
93- pool sync.Pool
112+ idle chan * pooledLuaState
113+ maxIdle int
94114
95- // hits / misses are exposed for tests and metrics.
115+ // hits / misses / drops are exposed for tests and metrics.
96116 hits atomic.Uint64
97117 misses atomic.Uint64
118+ drops atomic.Uint64
98119}
99120
100121// pooledLuaState wraps a *lua.LState plus the immutable snapshot of
@@ -188,23 +209,33 @@ func luaLookupContext(state *lua.LState) (*luaScriptContext, bool) {
188209func (r * RedisServer ) getLuaPool () * luaStatePool {
189210 r .luaPoolOnce .Do (func () {
190211 if r .luaPool == nil {
191- r .luaPool = newLuaStatePool ( )
212+ r .luaPool = newLuaStatePoolWithMaxIdle ( r . luaPoolMaxIdle )
192213 }
193214 })
194215 return r .luaPool
195216}
196217
197- // newLuaStatePool returns a pool that lazily allocates
198- // *pooledLuaState instances on demand. The pool deliberately does NOT
199- // set sync.Pool.New: if it did, p.pool.Get() would auto-invoke the
200- // constructor on an empty pool and we could not distinguish a fresh
201- // allocation from a reused instance. Instead, get() inspects the
202- // result of p.pool.Get() -- a nil return signals an empty pool and
203- // drives the miss counter plus an explicit newPooledLuaState() call.
204- // This keeps the hit/miss metrics honest, which is what the serial
205- // reuse tests and the observability counters rely on.
218+ // newLuaStatePool returns a bounded pool sized at DefaultLuaPoolMaxIdle.
219+ // Used by test fixtures and any caller that does not thread a
220+ // configured cap through. Production wires the explicit cap via
221+ // newLuaStatePoolWithMaxIdle from NewRedisServer.
206222func newLuaStatePool () * luaStatePool {
207- return & luaStatePool {}
223+ return newLuaStatePoolWithMaxIdle (DefaultLuaPoolMaxIdle )
224+ }
225+
226+ // newLuaStatePoolWithMaxIdle returns a pool whose idle backing
227+ // channel is sized at maxIdle. Non-positive values clamp to
228+ // DefaultLuaPoolMaxIdle to keep get/put semantics well-defined
229+ // (cap=0 would make every put() drop, which is never what we want
230+ // — callers asking for "no pool" should bypass the pool entirely).
231+ func newLuaStatePoolWithMaxIdle (maxIdle int ) * luaStatePool {
232+ if maxIdle < 1 {
233+ maxIdle = DefaultLuaPoolMaxIdle
234+ }
235+ return & luaStatePool {
236+ idle : make (chan * pooledLuaState , maxIdle ),
237+ maxIdle : maxIdle ,
238+ }
208239}
209240
210241// newPooledLuaState builds a fresh pooled state: base libs, dangerous
@@ -484,36 +515,56 @@ func resetTableContents(tbl *lua.LTable, originalFields map[lua.LValue]lua.LValu
484515// pointer write to the state-local ctxBinding userdata -- no lock,
485516// no global map.
486517//
487- // Because newLuaStatePool does NOT set sync.Pool.New, p.pool.Get()
488- // returns nil when the pool is empty; that is the signal for a miss
489- // (fresh allocation). A non-nil return is a genuine reuse and counts
490- // as a hit. The defensive type-assertion guard preserves behaviour if
491- // a future refactor ever puts something unexpected into the pool.
518+ // A non-blocking recv that does not match either case fires the
519+ // default branch (allocation miss). The defensive nil guard
520+ // preserves behaviour if a future refactor ever sends an unexpected
521+ // value through the idle channel.
522+ //
523+ // The ctxBinding assignment and return are centralized after the
524+ // select so every path goes through one binding step. This was a
525+ // gemini r1 review nit: refactor for clarity rather than duplicate
526+ // the binding in each branch.
492527func (p * luaStatePool ) get (ctx * luaScriptContext ) * pooledLuaState {
493- v := p .pool .Get ()
494- if v == nil {
495- p .misses .Add (1 )
496- pls := newPooledLuaState ()
497- pls .ctxBinding .Value = ctx
498- return pls
499- }
500- pls , ok := v .(* pooledLuaState )
501- if ! ok || pls == nil {
502- // Defence in depth: anything other than a *pooledLuaState is
503- // treated as an allocation miss rather than a silent hit.
528+ var pls * pooledLuaState
529+ select {
530+ case pls = <- p .idle :
531+ if pls != nil {
532+ p .hits .Add (1 )
533+ } else {
534+ // Intentionally-unreachable defence in depth:
535+ // put() above already guards `if pls == nil ||
536+ // pls.state == nil { return }`, so no caller can
537+ // enqueue a nil. If a future refactor breaks that
538+ // invariant the nil arrives here as a miss + fresh
539+ // allocation rather than a runtime panic. Not a
540+ // branch a reader should try to hit.
541+ p .misses .Add (1 )
542+ pls = newPooledLuaState ()
543+ }
544+ default :
504545 p .misses .Add (1 )
505546 pls = newPooledLuaState ()
506- pls .ctxBinding .Value = ctx
507- return pls
508547 }
509- p .hits .Add (1 )
510548 pls .ctxBinding .Value = ctx
511549 return pls
512550}
513551
514- // put resets the state and returns it to the pool. If the state is
515- // somehow closed (shouldn't happen on the happy path), it is dropped
516- // so a dead VM is never handed out again.
552+ // put resets the state and tries to return it to the pool. If the
553+ // state is closed (shouldn't happen on the happy path) or the idle
554+ // channel is full it is dropped so the GC can reclaim it; the drop
555+ // counter makes pool saturation observable so operators can tune
556+ // maxIdle.
557+ //
558+ // Fast-path: when len(p.idle) already equals maxIdle the put is
559+ // guaranteed to overflow, so skip the (non-trivial) pls.reset()
560+ // — restoring the globals / tables / metatables snapshots is
561+ // pointless work for a state that is about to be Close()'d. This is
562+ // a gemini r1 review optimisation; under saturation the EVAL hot
563+ // path spends much less CPU on doomed resets. The check is a racy
564+ // snapshot of len(); concurrent puts can still observe stale
565+ // "len < max" and race into the select below, where the same
566+ // channel-full guard catches them — so the fast-path is a strict
567+ // improvement, not a new correctness requirement.
517568func (p * luaStatePool ) put (pls * pooledLuaState ) {
518569 if pls == nil || pls .state == nil {
519570 return
@@ -527,15 +578,37 @@ func (p *luaStatePool) put(pls *pooledLuaState) {
527578 if pls .state .IsClosed () {
528579 return
529580 }
581+ if len (p .idle ) >= p .maxIdle {
582+ p .drops .Add (1 )
583+ pls .state .Close ()
584+ return
585+ }
530586 pls .reset ()
531- p .pool .Put (pls )
587+ select {
588+ case p .idle <- pls :
589+ default :
590+ // Idle channel filled between the fast-path check and the
591+ // select (concurrent puts winning the race). Drop the state.
592+ p .drops .Add (1 )
593+ pls .state .Close ()
594+ }
532595}
533596
534- // Hits / Misses are test hooks. They count Get() outcomes, not
535- // allocations proper, but in practice they track allocation avoidance
536- // well enough for the "is the pool actually being used?" test.
597+ // Hits / Misses / Drops are test hooks. They count Get/Put outcomes,
598+ // not allocations proper, but in practice they track allocation
599+ // avoidance well enough for the "is the pool actually being used?"
600+ // test and the "is maxIdle too low for the workload?" diagnostic.
537601func (p * luaStatePool ) Hits () uint64 { return p .hits .Load () }
538602func (p * luaStatePool ) Misses () uint64 { return p .misses .Load () }
603+ func (p * luaStatePool ) Drops () uint64 { return p .drops .Load () }
604+
605+ // Idle reports the number of states currently sitting in the pool.
606+ // Useful for metrics gauges and for tests asserting bounded retention.
607+ func (p * luaStatePool ) Idle () int { return len (p .idle ) }
608+
609+ // MaxIdle reports the configured cap. Exposed for diagnostics so
610+ // /admin can surface "(idle / maxIdle)" to operators.
611+ func (p * luaStatePool ) MaxIdle () int { return p .maxIdle }
539612
540613// registerPooledRedisModule installs redis.call / redis.pcall /
541614// redis.sha1hex / redis.status_reply / redis.error_reply where the
0 commit comments