@@ -1138,55 +1138,85 @@ func (s *Gopher) downloadFromHuggingFace(ctx context.Context, task *GopherTask,
11381138
11391139// waitForP2PAvailability waits for the model to become available via P2P.
11401140// This is used when another node holds the download lease.
1141- // It uses exponential backoff to avoid overwhelming the cluster with DNS lookups.
1142- func (s * Gopher ) waitForP2PAvailability (ctx context.Context , modelHash , modelInfo string ) error {
1141+ // It checks the lease status to determine whether to keep waiting:
1142+ // - If lease is complete: P2P should be available, try download
1143+ // - If lease exists and not expired: keep waiting (lease holder still downloading)
1144+ // - If lease expired or not found: give up (lease holder crashed)
1145+ // - If model is being deleted: abort early
1146+ func (s * Gopher ) waitForP2PAvailability (ctx context.Context , task * GopherTask , modelHash , modelInfo , leaseName string ) error {
11431147 if s .p2pDistributor == nil {
11441148 return fmt .Errorf ("P2P distributor not configured" )
11451149 }
11461150
11471151 // Use constants for configurable wait behavior
1148- maxAttempts := constants .P2PDefaultWaitMaxAttempts
1149- baseDelay := time .Duration (constants .P2PDefaultWaitBaseDelayMs ) * time .Millisecond
1150- maxDelay := time .Duration (constants .P2PDefaultWaitMaxDelayMs ) * time .Millisecond
1151- backoffDivisor := constants .P2PDefaultWaitBackoffDivisor
1152+ checkInterval := time .Duration (constants .P2PDefaultWaitBaseDelayMs ) * time .Millisecond
1153+ maxWaitTime := time .Duration (constants .P2PMaxWaitTimeMinutes ) * time .Minute
1154+ startTime := time .Now ()
1155+
1156+ for {
1157+ elapsed := time .Since (startTime )
11521158
1153- for attempt := 0 ; attempt < maxAttempts ; attempt ++ {
11541159 // Check context cancellation first
11551160 select {
11561161 case <- ctx .Done ():
11571162 return ctx .Err ()
11581163 default :
11591164 }
11601165
1166+ // Check absolute maximum wait time
1167+ if elapsed > maxWaitTime {
1168+ return fmt .Errorf ("absolute timeout waiting for P2P availability for model %s after %v" , modelInfo , elapsed )
1169+ }
1170+
1171+ // Check if the model is being deleted - abort early to allow cleanup
1172+ if s .isModelBeingDeleted (task ) {
1173+ s .logger .Infof ("Model %s is being deleted, aborting P2P wait" , modelInfo )
1174+ return fmt .Errorf ("model %s is being deleted, aborting P2P wait" , modelInfo )
1175+ }
1176+
1177+ // Check lease status to decide whether to keep waiting
1178+ lease , err := s .p2pLeaseManager .Get (ctx , leaseName )
1179+ if err != nil {
1180+ s .logger .Debugf ("Failed to get lease %s: %v, will retry" , leaseName , err )
1181+ } else if lease != nil {
1182+ // Check if lease is complete (download finished, seeding started)
1183+ if s .p2pLeaseManager .IsComplete (lease ) {
1184+ s .logger .Infof ("Lease %s is complete, P2P should be available for model %s" , leaseName , modelInfo )
1185+ // Give a short delay for seeding to fully start
1186+ time .Sleep (2 * time .Second )
1187+ } else if s .p2pLeaseManager .IsExpired (lease ) {
1188+ // Lease expired - holder might have crashed
1189+ s .logger .Warnf ("Lease %s expired for model %s, giving up on P2P wait" , leaseName , modelInfo )
1190+ return fmt .Errorf ("lease expired while waiting for P2P availability for model %s" , modelInfo )
1191+ } else {
1192+ // Lease is active but not complete - holder still downloading
1193+ s .logger .Debugf ("Lease %s still active (holder: %s) for model %s, waiting... (elapsed: %v)" ,
1194+ leaseName , * lease .Spec .HolderIdentity , modelInfo , elapsed .Round (time .Second ))
1195+ }
1196+ }
1197+
11611198 // Check if model is available via P2P
11621199 if s .p2pDistributor .HasPeers (ctx , modelHash ) {
11631200 s .logger .Infof ("P2P peers now available for model %s, attempting download" , modelInfo )
11641201 if err := s .p2pDistributor .TryP2PDownload (ctx , modelHash , s .p2pTimeout ); err == nil {
1165- s .logger .Infof ("Successfully downloaded model %s via P2P after waiting" , modelInfo )
1202+ s .logger .Infof ("Successfully downloaded model %s via P2P after waiting %v " , modelInfo , elapsed . Round ( time . Second ) )
11661203 return nil
11671204 } else {
11681205 s .logger .Warnf ("P2P download attempt failed for model %s: %v" , modelInfo , err )
11691206 }
11701207 }
11711208
1172- // Calculate delay with exponential backoff
1173- // Backoff increases every backoffDivisor attempts to avoid rapid polling
1174- delay := baseDelay * time .Duration (1 << uint (attempt / backoffDivisor ))
1175- if delay > maxDelay {
1176- delay = maxDelay
1209+ // Log progress periodically (every 30 seconds)
1210+ if int (elapsed .Seconds ())% 30 == 0 && elapsed .Seconds () > 0 {
1211+ s .logger .Infof ("Still waiting for P2P availability for model %s (elapsed: %v)" , modelInfo , elapsed .Round (time .Second ))
11771212 }
11781213
1179- s .logger .Debugf ("Waiting %v before next P2P check for model %s (attempt %d/%d)" ,
1180- delay , modelInfo , attempt + 1 , maxAttempts )
1181-
11821214 select {
11831215 case <- ctx .Done ():
11841216 return ctx .Err ()
1185- case <- time .After (delay ):
1217+ case <- time .After (checkInterval ):
11861218 }
11871219 }
1188-
1189- return fmt .Errorf ("timeout waiting for P2P availability for model %s after %d attempts" , modelInfo , maxAttempts )
11901220}
11911221
11921222// downloadWithP2P orchestrates the model download with P2P support.
@@ -1240,7 +1270,7 @@ func (s *Gopher) downloadWithP2P(ctx context.Context, task *GopherTask, baseMode
12401270
12411271 // Lease held by another node - wait for P2P availability
12421272 s .logger .Infof ("Lease held by another node for model %s, waiting for P2P availability" , modelInfo )
1243- if err := s .waitForP2PAvailability (ctx , modelHash , modelInfo ); err == nil {
1273+ if err := s .waitForP2PAvailability (ctx , task , modelHash , modelInfo , leaseName ); err == nil {
12441274 s .logger .Infof ("Model %s now available via P2P" , modelInfo )
12451275 return nil
12461276 } else {
@@ -1286,6 +1316,28 @@ func (s *Gopher) downloadWithLeaseHeld(ctx context.Context, task *GopherTask, ba
12861316 return nil
12871317}
12881318
1319+ // isModelBeingDeleted checks if the model resource is being deleted (has deletionTimestamp).
1320+ // This is used to abort long-running operations early when the resource is deleted.
1321+ func (s * Gopher ) isModelBeingDeleted (task * GopherTask ) bool {
1322+ if task .BaseModel != nil {
1323+ bm , err := s .baseModelLister .BaseModels (task .BaseModel .Namespace ).Get (task .BaseModel .Name )
1324+ if err != nil {
1325+ // If we can't get the resource, it might be deleted
1326+ return true
1327+ }
1328+ return ! bm .ObjectMeta .DeletionTimestamp .IsZero ()
1329+ }
1330+ if task .ClusterBaseModel != nil {
1331+ cbm , err := s .clusterBaseModelLister .Get (task .ClusterBaseModel .Name )
1332+ if err != nil {
1333+ // If we can't get the resource, it might be deleted
1334+ return true
1335+ }
1336+ return ! cbm .ObjectMeta .DeletionTimestamp .IsZero ()
1337+ }
1338+ return false
1339+ }
1340+
12891341// startSeeding begins seeding the model to peers. Errors are logged but not returned
12901342// since seeding failure shouldn't fail the overall download operation.
12911343func (s * Gopher ) startSeeding (destPath , modelHash , modelInfo string ) {
0 commit comments