@@ -20,9 +20,9 @@ const (
2020 // being it is almost certainly greater than the number of models that most
2121 // developers' systems will be able to load.
2222 maximumRunnerSlots = 8
23- // runnerIdleTimeout is the maximum amount of time that a runner can sit
24- // idle (i.e. without any requests) before being terminated.
25- runnerIdleTimeout = 5 * time .Minute
23+ // defaultRunnerIdleTimeout is the default maximum amount of time that a
24+ // runner can sit idle (i.e. without any requests) before being terminated.
25+ defaultRunnerIdleTimeout = 5 * time .Minute
2626)
2727
2828var (
@@ -58,6 +58,8 @@ type loader struct {
5858 backends map [string ]inference.Backend
5959 // modelManager is the shared model manager.
6060 modelManager * models.Manager
61+ // runnerIdleTimeout is the loader-specific default runner idle timeout.
62+ runnerIdleTimeout time.Duration
6163 // totalMemory is the total system memory allocated to the loader.
6264 totalMemory uint64
6365 // idleCheck is used to signal the run loop when timestamps have updated.
@@ -103,6 +105,19 @@ func newLoader(
103105 // tune this heuristic for systems with enormous amounts of VRAM.
104106 nSlots := min (runtime .NumCPU (), maximumRunnerSlots )
105107
108+ // Check if we are running in a GPU-enabled cloud environment.
109+ isGPUEnabledCloudEnvironment := environment .Get () == environment .EnvironmentCloud &&
110+ os .Getenv ("NVIDIA_VISIBLE_DEVICES" ) != ""
111+
112+ // Compute the idle runner timeout.
113+ //
114+ // HACK: On GPU-enabled cloud engines, we'll bump this to 8 hours. We can
115+ // remove this once we have configurable TTLs.
116+ runnerIdleTimeout := defaultRunnerIdleTimeout
117+ if isGPUEnabledCloudEnvironment {
118+ runnerIdleTimeout = 8 * time .Hour
119+ }
120+
106121 // Compute the amount of available memory.
107122 //
108123 // TODO: For now, we treat the system as having memory size 1 and all models
@@ -113,28 +128,30 @@ func newLoader(
113128 // computing model size through estimation (using parameter count and
114129 // quantization data type size).
115130 //
116- // HACK: On GPU-enabled cloud engines, we'll temporarily bump this to 2.
131+ // HACK: On GPU-enabled cloud engines, we'll bump this to 2. We can remove
132+ // this once we have VRAM estimation.
117133 totalMemory := uint64 (1 )
118- if environment . Get () == environment . EnvironmentCloud && os . Getenv ( "NVIDIA_VISIBLE_DEVICES" ) != "" {
134+ if isGPUEnabledCloudEnvironment {
119135 totalMemory = 2
120136 }
121137
122138 // Create the loader.
123139 l := & loader {
124- log : log ,
125- backends : backends ,
126- modelManager : modelManager ,
127- totalMemory : totalMemory ,
128- idleCheck : make (chan struct {}, 1 ),
129- guard : make (chan struct {}, 1 ),
130- availableMemory : totalMemory ,
131- waiters : make (map [chan <- struct {}]bool ),
132- runners : make (map [runnerKey ]int , nSlots ),
133- slots : make ([]* runner , nSlots ),
134- references : make ([]uint , nSlots ),
135- allocations : make ([]uint64 , nSlots ),
136- timestamps : make ([]time.Time , nSlots ),
137- runnerConfigs : make (map [runnerKey ]inference.BackendConfiguration ),
140+ log : log ,
141+ backends : backends ,
142+ modelManager : modelManager ,
143+ runnerIdleTimeout : runnerIdleTimeout ,
144+ totalMemory : totalMemory ,
145+ idleCheck : make (chan struct {}, 1 ),
146+ guard : make (chan struct {}, 1 ),
147+ availableMemory : totalMemory ,
148+ waiters : make (map [chan <- struct {}]bool ),
149+ runners : make (map [runnerKey ]int , nSlots ),
150+ slots : make ([]* runner , nSlots ),
151+ references : make ([]uint , nSlots ),
152+ allocations : make ([]uint64 , nSlots ),
153+ timestamps : make ([]time.Time , nSlots ),
154+ runnerConfigs : make (map [runnerKey ]inference.BackendConfiguration ),
138155 }
139156 l .guard <- struct {}{}
140157 return l
@@ -175,7 +192,7 @@ func (l *loader) evict(idleOnly bool) int {
175192 now := time .Now ()
176193 for r , slot := range l .runners {
177194 unused := l .references [slot ] == 0
178- idle := unused && now .Sub (l .timestamps [slot ]) > runnerIdleTimeout
195+ idle := unused && now .Sub (l .timestamps [slot ]) > l . runnerIdleTimeout
179196 defunct := false
180197 select {
181198 case <- l .slots [slot ].done :
@@ -282,7 +299,7 @@ func (l *loader) idleCheckDuration() time.Duration {
282299 // Compute the remaining duration. If negative, check immediately, otherwise
283300 // wait until 100 milliseconds after expiration time (to avoid checking
284301 // right on the expiration boundary).
285- if remaining := runnerIdleTimeout - time .Since (oldest ); remaining < 0 {
302+ if remaining := l . runnerIdleTimeout - time .Since (oldest ); remaining < 0 {
286303 return 0
287304 } else {
288305 return remaining + 100 * time .Millisecond
0 commit comments