Skip to content

Commit 51a3715

Browse files
committed
replicationcontroller: fix 100 % CPU when all queue blobs inaccessible
1 parent 5168ff6 commit 51a3715

2 files changed

Lines changed: 15 additions & 7 deletions

File tree

pkg/stoserver/restapi.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1308,7 +1308,7 @@ func (h *handlers) DownloadBlob(rctx *httpauth.RequestContext, w http.ResponseWr
1308 1308
1309 1309   	bestVolumeID, err := h.conf.DiskAccess.BestVolumeID(blobMetadata.Volumes)
1310 1310   	if err != nil {
1311      - 		http.Error(w, stotypes.ErrBlobNotAccessibleOnThisNode.Error(), http.StatusInternalServerError)
     1311 + 		http.Error(w, err.Error(), http.StatusInternalServerError)
1312 1312   		return
1313 1313   	}
1314 1314

pkg/stoserver/storeplication/replicationcontroller.go

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
   8    8   	"fmt"
   9    9   	"log"
  10   10   	"slices"
       11 + 	"strconv"
  11   12   	"sync"
  12   13   	"sync/atomic"
  13   14   	"time"
@@ -187,19 +188,25 @@ func (c *Controller) discoverReplicationJobs(continueToken []byte) ([]*replicati
 187  188   	}
 188  189   	defer func() { ignoreError(tx.Rollback()) }()
 189  190
      191 + 	// discover blobs to replicate in batches (instead of buffering unbounded # of jobs in RAM)
 190  192   	batchLimit := 500
      193 + 	// if all volumes are offline for this replication target's queue we'd end up hammering 100 % CPU with
      194 + 	// this discovery process unless we have some throttling in place.
      195 + 	notAccessibleLimit := 25_000
 191  196
 192  197   	jobs := []*replicationJob{}
 193  198
      199 + 	notAccessiblesEncountered := 0
      200 +
 194  201   	nextContinueToken := stodb.StartFromFirst
 195  202
 196  203   	err = stodb.BlobsPendingReplicationByVolumeIndex.Query(volIDToBytesForIndex(c.toVolumeID), continueToken, func(id []byte) error {
 197      - 		if len(jobs) == batchLimit {
      204 + 		batchLimitHit := len(jobs) == batchLimit
      205 + 		notAccessibleLimitHit := notAccessiblesEncountered == notAccessibleLimit
      206 + 		if batchLimitHit || notAccessibleLimitHit {
 198  207   			nextContinueToken = id
 199  208
 200      - 			c.logl.Info.Printf(
 201      - 				"operating @ batchLimit (%d)",
 202      - 				batchLimit)
      209 + 			c.logl.Info.Printf("batchLimitHit=%v (%d) notAccessibleLimitHit=%v (%d)", batchLimitHit, batchLimit, notAccessibleLimitHit, notAccessibleLimit)
 203  210   			return stodb.StopIteration
 204  211   		}
 205  212

@@ -221,8 +228,9 @@ func (c *Controller) discoverReplicationJobs(continueToken []byte) ([]*replicati
 221  228   		if err != nil {
 222  229   			if err == stotypes.ErrBlobNotAccessibleOnThisNode {
 223  230   				c.stats.blobVolumeNotAccessible++
      231 + 				notAccessiblesEncountered++
 224  232   				return nil
 225      - 			} else {
      233 + 			} else { // not expected (above func shouldn't return any other error)
 226  234   				c.stats.otherErrors++
 227  235   				return err
 228  236   			}
@@ -257,7 +265,7 @@ func HasQueuedWriteIOsForVolume(volID int, tx *bbolt.Tx) (bool, error) {
 257  265   }
 258  266
 259  267   func volIDToBytesForIndex(volID int) []byte {
 260      - 	return []byte(fmt.Sprintf("%d", volID))
      268 + 	return []byte(strconv.Itoa(volID))
 261  269   }
 262  270
 263  271   type atomicInt32 struct {

0 commit comments

Comments
 (0)