@@ -80,6 +80,15 @@ type runnerInfo struct {
8080 modelRef string
8181}
8282
83+ // loadingInfo holds metadata about a runner that is being initialized.
84+ type loadingInfo struct {
85+ backendName string
86+ modelID string
87+ draftModelID string
88+ modelRef string
89+ mode inference.BackendMode
90+ }
91+
8392// loader manages the loading and unloading of backend runners. It regulates
8493// active backends in a manner that avoids exhausting system resources. Loaders
8594// assume that all of their backends have been installed, so no load requests
@@ -109,6 +118,12 @@ type loader struct {
109118 waiters map [chan <- struct {}]bool
110119 // runners maps runner keys to their slot index.
111120 runners map [runnerKey ]runnerInfo
121+ // loading tracks slots that have a runner being initialized. This
122+ // allows the lock to be released during long-running operations
123+ // (run + wait) while still preventing other goroutines from using
124+ // or evicting those slots. The value contains metadata needed to
125+ // report loading status.
126+ loading map [int ]loadingInfo
112127 // slots maps slot indices to associated runners. A slot is considered free
113128 // if the runner value in it is nil.
114129 slots []* runner
@@ -157,6 +172,7 @@ func newLoader(
157172 guard : make (chan struct {}, 1 ),
158173 waiters : make (map [chan <- struct {}]bool ),
159174 runners : make (map [runnerKey ]runnerInfo , nSlots ),
175+ loading : make (map [int ]loadingInfo ),
160176 slots : make ([]* runner , nSlots ),
161177 references : make ([]uint , nSlots ),
162178 timestamps : make ([]time.Time , nSlots ),
@@ -411,6 +427,30 @@ func (l *loader) run(ctx context.Context) {
411427 }
412428}
413429
430+ // usedSlots returns the number of slots that are either occupied by a
431+ // registered runner or reserved for a runner being loaded.
432+ func (l * loader ) usedSlots () int {
433+ return len (l .runners ) + len (l .loading )
434+ }
435+
436+ // isSlotLoading reports whether the given slot is reserved for a runner
437+ // that is currently being initialized.
438+ func (l * loader ) isSlotLoading (slot int ) bool {
439+ _ , ok := l .loading [slot ]
440+ return ok
441+ }
442+
443+ // isModelLoading reports whether a runner for the given model is currently
444+ // being initialized by another goroutine.
445+ func (l * loader ) isModelLoading (backendName , modelID , draftModelID string , mode inference.BackendMode ) bool {
446+ for _ , info := range l .loading {
447+ if info .backendName == backendName && info .modelID == modelID && info .draftModelID == draftModelID && info .mode == mode {
448+ return true
449+ }
450+ }
451+ return false
452+ }
453+
414454// load allocates a runner using the specified backend and modelID. If allocated,
415455// it should be released by the caller using the release mechanism (once the
416456// runner is no longer needed).
@@ -427,7 +467,9 @@ func (l *loader) load(ctx context.Context, backendName, modelID, modelRef string
427467 if ! l .lock (ctx ) {
428468 return nil , context .Canceled
429469 }
430- defer l .unlock ()
470+ // Note: the lock is managed explicitly throughout this function rather
471+ // than via defer, because it is released during long-running operations
472+ // (run + wait) and re-acquired afterwards.
431473
432474 // Get runner configuration if available (must be done under lock since
433475 // runnerConfigs can be modified concurrently by setRunnerConfig).
@@ -468,22 +510,37 @@ func (l *loader) load(ctx context.Context, backendName, modelID, modelRef string
468510 // ensure that it's deregistered by the time we return.
469511 poll := make (chan struct {}, 1 )
470512 l .waiters [poll ] = true
471- defer func () {
513+
514+ // cleanupAndReturn is a helper that cleans up the waiter registration,
515+ // releases the lock, and returns. All exit paths must go through this
516+ // to avoid leaking the poll channel or double-unlocking.
517+ cleanupAndReturn := func (r * runner , err error ) (* runner , error ) {
472518 delete (l .waiters , poll )
473- }()
519+ l .unlock ()
520+ return r , err
521+ }
474522
475523 // Loop until we can satisfy the request or an error occurs.
524+ // These are declared outside the loop to avoid goto-over-declaration errors.
525+ var existing runnerInfo
526+ var existingOK bool
476527 for {
477528 slot := - 1
478529
479530 // If loads are disabled, then there's nothing we can do.
480531 if ! l .loadsEnabled {
481- return nil , errLoadsDisabled
532+ return cleanupAndReturn (nil , errLoadsDisabled )
533+ }
534+
535+ // See if another goroutine is already loading this runner.
536+ // If so, wait for it to finish rather than starting a duplicate load.
537+ if l .isModelLoading (backendName , modelID , draftModelID , mode ) {
538+ goto WaitForChange
482539 }
483540
484541 // See if we can satisfy the request with an existing runner.
485- existing , ok : = l .runners [makeRunnerKey (backendName , modelID , draftModelID , mode )]
486- if ok {
542+ existing , existingOK = l .runners [makeRunnerKey (backendName , modelID , draftModelID , mode )]
543+ if existingOK {
487544 select {
488545 case <- l .slots [existing .slot ].done :
489546 l .log .Warn ("Runner is defunct, waiting for eviction" , "backend" , backendName , "model" , existing .modelRef )
@@ -497,13 +554,13 @@ func (l *loader) load(ctx context.Context, backendName, modelID, modelRef string
497554 default :
498555 l .references [existing .slot ]++
499556 l .timestamps [existing .slot ] = time.Time {}
500- return l .slots [existing .slot ], nil
557+ return cleanupAndReturn ( l .slots [existing .slot ], nil )
501558 }
502559 }
503560
504- // If all slots are full, try evicting unused runners.
505- if len ( l . runners ) = = len (l .slots ) {
506- l .log .Info ("Evicting to make room" , "runners" , len (l .runners ), "slots" , len (l .slots ))
561+ // If all slots are full (including loading reservations) , try evicting unused runners.
562+ if l . usedSlots () > = len (l .slots ) {
563+ l .log .Info ("Evicting to make room" , "runners" , len (l .runners ), "loading" , len ( l . loading ), " slots" , len (l .slots ))
507564 runnerCountAtLoopStart := len (l .runners )
508565 remainingRunners := l .evict (false )
509566 // Restart the loop if eviction happened
@@ -512,46 +569,62 @@ func (l *loader) load(ctx context.Context, backendName, modelID, modelRef string
512569 }
513570 }
514571
515- // If there's a free slot, then find the slot .
516- if len ( l . runners ) < len (l .slots ) {
572+ // If there's a free slot, then find one that is not reserved for loading .
573+ if l . usedSlots ( ) < len (l .slots ) {
517574 for s , runner := range l .slots {
518- if runner == nil {
575+ if runner == nil && ! l . isSlotLoading ( s ) {
519576 slot = s
520577 break
521578 }
522579 }
523580 }
524581
525582 if slot < 0 {
526- l .log .Debug ("Cannot load model yet" , "runners" , len (l .runners ), "slots" , len (l .slots ))
583+ l .log .Debug ("Cannot load model yet" , "runners" , len (l .runners ), "loading" , len ( l . loading ), " slots" , len (l .slots ))
527584 }
528585
529586 // If we've identified a slot, then we're ready to start a runner.
530587 if slot >= 0 {
531- // Create the runner.
532- runner , err := run (l .log , backend , modelID , modelRef , mode , slot , runnerConfig , l .openAIRecorder )
588+ // Reserve the slot and release the lock for the long-running
589+ // operations (run + wait). This allows other goroutines to
590+ // proceed with loading different models, releasing runners, etc.
591+ l .loading [slot ] = loadingInfo {
592+ backendName : backendName ,
593+ modelID : modelID ,
594+ draftModelID : draftModelID ,
595+ modelRef : modelRef ,
596+ mode : mode ,
597+ }
598+ l .unlock ()
599+
600+ newRunner , err := run (l .log , backend , modelID , modelRef , mode , slot , runnerConfig , l .openAIRecorder )
533601 if err != nil {
534602 l .log .Warn ("Unable to start backend runner" , "backend" , backendName , "model" , modelID , "mode" , mode , "error" , err )
535- return nil , fmt .Errorf ("unable to start runner: %w" , err )
603+ l .lock (context .Background ())
604+ delete (l .loading , slot )
605+ l .broadcast ()
606+ return cleanupAndReturn (nil , fmt .Errorf ("unable to start runner: %w" , err ))
536607 }
537608
538- // Wait for the runner to be ready. In theory it's a little
539- // inefficient to block all other loaders (including those that
540- // might not want this runner), but in reality they would probably
541- // be blocked by the underlying loading anyway (in terms of disk and
542- // GPU performance). We have to retain a lock here though to enforce
543- // deduplication of runners and keep slot / memory reservations.
544- if err := runner .wait (ctx ); err != nil {
545- runner .terminate ()
609+ if err := newRunner .wait (ctx ); err != nil {
610+ newRunner .terminate ()
546611 l .log .Warn ("Backend runner initialization failed" , "backend" , backendName , "model" , modelID , "mode" , mode , "error" , err )
547- return nil , fmt .Errorf ("error waiting for runner to be ready: %w" , err )
612+ l .lock (context .Background ())
613+ delete (l .loading , slot )
614+ l .broadcast ()
615+ return cleanupAndReturn (nil , fmt .Errorf ("error waiting for runner to be ready: %w" , err ))
548616 }
549617
618+ // Re-acquire lock and register the runner.
619+ l .lock (context .Background ())
620+ delete (l .loading , slot )
621+
550622 // Perform registration and return the runner.
551623 l .runners [makeRunnerKey (backendName , modelID , draftModelID , mode )] = runnerInfo {slot , modelRef }
552- l .slots [slot ] = runner
624+ l .slots [slot ] = newRunner
553625 l .references [slot ] = 1
554- return runner , nil
626+ l .broadcast ()
627+ return cleanupAndReturn (newRunner , nil )
555628 }
556629
557630 // Wait for something to change. Note that we always re-lock with
@@ -562,7 +635,7 @@ func (l *loader) load(ctx context.Context, backendName, modelID, modelRef string
562635 select {
563636 case <- ctx .Done ():
564637 l .lock (context .Background ())
565- return nil , context .Canceled
638+ return cleanupAndReturn ( nil , context .Canceled )
566639 case <- poll :
567640 l .lock (context .Background ())
568641 }
0 commit comments