Skip to content

Commit e0ddfdf

Browse files
committed
Temporarily bump GPU-enabled cloud idle timeout to 8 hours.
Signed-off-by: Jacob Howard <jacob.howard@docker.com>
1 parent 64153a7 commit e0ddfdf

1 file changed

Lines changed: 38 additions & 21 deletions

File tree

pkg/inference/scheduling/loader.go

Lines changed: 38 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -20,9 +20,9 @@ const (
2020
// being it is almost certainly greater than the number of models that most
2121
// developers' systems will be able to load.
2222
maximumRunnerSlots = 8
23-
// runnerIdleTimeout is the maximum amount of time that a runner can sit
24-
// idle (i.e. without any requests) before being terminated.
25-
runnerIdleTimeout = 5 * time.Minute
23+
// defaultRunnerIdleTimeout is the default maximum amount of time that a
24+
// runner can sit idle (i.e. without any requests) before being terminated.
25+
defaultRunnerIdleTimeout = 5 * time.Minute
2626
)
2727

2828
var (
@@ -58,6 +58,8 @@ type loader struct {
5858
backends map[string]inference.Backend
5959
// modelManager is the shared model manager.
6060
modelManager *models.Manager
61+
// runnerIdleTimeout is the loader-specific default runner idle timeout.
62+
runnerIdleTimeout time.Duration
6163
// totalMemory is the total system memory allocated to the loader.
6264
totalMemory uint64
6365
// idleCheck is used to signal the run loop when timestamps have updated.
@@ -103,6 +105,19 @@ func newLoader(
103105
// tune this heuristic for systems with enormous amounts of VRAM.
104106
nSlots := min(runtime.NumCPU(), maximumRunnerSlots)
105107

108+
// Check if we have a special environment.
109+
isGPUEnabledCloudEnvironment := environment.Get() == environment.EnvironmentCloud &&
110+
os.Getenv("NVIDIA_VISIBLE_DEVICES") != ""
111+
112+
// Compute the idle runner timeout.
113+
//
114+
// HACK: On GPU-enabled cloud engines, we'll bump this to 8 hours. We can
115+
// remove this once we have configurable TTLs.
116+
runnerIdleTimeout := defaultRunnerIdleTimeout
117+
if isGPUEnabledCloudEnvironment {
118+
runnerIdleTimeout = 8 * time.Hour
119+
}
120+
106121
// Compute the amount of available memory.
107122
//
108123
// TODO: For now, we treat the system as having memory size 1 and all models
@@ -113,28 +128,30 @@ func newLoader(
113128
// computing model size through estimation (using parameter count and
114129
// quantization data type size).
115130
//
116-
// HACK: On GPU-enabled cloud engines, we'll temporarily bump this to 2.
131+
// HACK: On GPU-enabled cloud engines, we'll bump this to 2. We can remove
132+
// this once we have VRAM estimation.
117133
totalMemory := uint64(1)
118-
if environment.Get() == environment.EnvironmentCloud && os.Getenv("NVIDIA_VISIBLE_DEVICES") != "" {
134+
if isGPUEnabledCloudEnvironment {
119135
totalMemory = 2
120136
}
121137

122138
// Create the loader.
123139
l := &loader{
124-
log: log,
125-
backends: backends,
126-
modelManager: modelManager,
127-
totalMemory: totalMemory,
128-
idleCheck: make(chan struct{}, 1),
129-
guard: make(chan struct{}, 1),
130-
availableMemory: totalMemory,
131-
waiters: make(map[chan<- struct{}]bool),
132-
runners: make(map[runnerKey]int, nSlots),
133-
slots: make([]*runner, nSlots),
134-
references: make([]uint, nSlots),
135-
allocations: make([]uint64, nSlots),
136-
timestamps: make([]time.Time, nSlots),
137-
runnerConfigs: make(map[runnerKey]inference.BackendConfiguration),
140+
log: log,
141+
backends: backends,
142+
modelManager: modelManager,
143+
runnerIdleTimeout: runnerIdleTimeout,
144+
totalMemory: totalMemory,
145+
idleCheck: make(chan struct{}, 1),
146+
guard: make(chan struct{}, 1),
147+
availableMemory: totalMemory,
148+
waiters: make(map[chan<- struct{}]bool),
149+
runners: make(map[runnerKey]int, nSlots),
150+
slots: make([]*runner, nSlots),
151+
references: make([]uint, nSlots),
152+
allocations: make([]uint64, nSlots),
153+
timestamps: make([]time.Time, nSlots),
154+
runnerConfigs: make(map[runnerKey]inference.BackendConfiguration),
138155
}
139156
l.guard <- struct{}{}
140157
return l
@@ -175,7 +192,7 @@ func (l *loader) evict(idleOnly bool) int {
175192
now := time.Now()
176193
for r, slot := range l.runners {
177194
unused := l.references[slot] == 0
178-
idle := unused && now.Sub(l.timestamps[slot]) > runnerIdleTimeout
195+
idle := unused && now.Sub(l.timestamps[slot]) > l.runnerIdleTimeout
179196
defunct := false
180197
select {
181198
case <-l.slots[slot].done:
@@ -282,7 +299,7 @@ func (l *loader) idleCheckDuration() time.Duration {
282299
// Compute the remaining duration. If negative, check immediately, otherwise
283300
// wait until 100 milliseconds after expiration time (to avoid checking
284301
// right on the expiration boundary).
285-
if remaining := runnerIdleTimeout - time.Since(oldest); remaining < 0 {
302+
if remaining := l.runnerIdleTimeout - time.Since(oldest); remaining < 0 {
286303
return 0
287304
} else {
288305
return remaining + 100*time.Millisecond

0 commit comments

Comments
 (0)