Skip to content

Commit fd65906

Browse files
authored
Push maintenance leader startup logic down into maintenance package (#1196)
This one's a quick follow up to #1184. Unfortunately in order to cover all the edge cases that Codex found during review we ended up with fairly complex start logic for the queue maintainer because it needs to have a separate goroutine, do a lot of context checking, and track its current leadership "epoch" to make sure that an older start attempt doesn't affect the start attempt booted off by a newer leadership signal. This had left an unfortunate amount of code in `client.go`. Here, break this all out into its own file in `internal/maintenance/`. This lets us streamline `Client` back to a nicer point.
1 parent d55e3fc commit fd65906

File tree

5 files changed

+362
-188
lines changed

5 files changed

+362
-188
lines changed

client.go

Lines changed: 23 additions & 172 deletions
Original file line numberDiff line numberDiff line change
@@ -30,10 +30,8 @@ import (
3030
"github.com/riverqueue/river/rivershared/riverpilot"
3131
"github.com/riverqueue/river/rivershared/riversharedmaintenance"
3232
"github.com/riverqueue/river/rivershared/startstop"
33-
"github.com/riverqueue/river/rivershared/testsignal"
3433
"github.com/riverqueue/river/rivershared/util/dbutil"
3534
"github.com/riverqueue/river/rivershared/util/maputil"
36-
"github.com/riverqueue/river/rivershared/util/serviceutil"
3735
"github.com/riverqueue/river/rivershared/util/sliceutil"
3836
"github.com/riverqueue/river/rivershared/util/testutil"
3937
"github.com/riverqueue/river/rivershared/util/valutil"
@@ -605,18 +603,10 @@ type Client[TTx any] struct {
605603
notifier *notifier.Notifier // may be nil in poll-only mode
606604
periodicJobs *PeriodicJobBundle
607605
pilot riverpilot.Pilot
608-
producersByQueueName map[string]*producer
609-
queueMaintainer *maintenance.QueueMaintainer
610-
611-
// queueMaintainerEpoch is incremented each time leadership is gained,
612-
// giving each tryStartQueueMaintainer goroutine a term number.
613-
// queueMaintainerMu serializes epoch checks with Stop calls so that a
614-
// stale goroutine from an older term cannot tear down a maintainer
615-
// started by a newer term.
616-
queueMaintainerEpoch int64
617-
queueMaintainerMu sync.Mutex
618-
619-
queues *QueueBundle
606+
producersByQueueName map[string]*producer
607+
queueMaintainer *maintenance.QueueMaintainer
608+
queueMaintainerLeader *maintenance.QueueMaintainerLeader
609+
queues *QueueBundle
620610
services []startstop.Service
621611
stopped <-chan struct{}
622612
subscriptionManager *subscriptionManager
@@ -629,23 +619,16 @@ type Client[TTx any] struct {
629619

630620
// Test-only signals.
631621
type clientTestSignals struct {
632-
electedLeader testsignal.TestSignal[struct{}] // notifies when elected leader
633-
queueMaintainerStartError testsignal.TestSignal[error] // notifies on each failed queue maintainer start attempt
634-
queueMaintainerStartRetriesExhausted testsignal.TestSignal[struct{}] // notifies when leader resignation is requested after all queue maintainer start retries have been exhausted
635-
636-
jobCleaner *maintenance.JobCleanerTestSignals
637-
jobRescuer *maintenance.JobRescuerTestSignals
638-
jobScheduler *maintenance.JobSchedulerTestSignals
639-
periodicJobEnqueuer *maintenance.PeriodicJobEnqueuerTestSignals
640-
queueCleaner *maintenance.QueueCleanerTestSignals
641-
reindexer *maintenance.ReindexerTestSignals
622+
jobCleaner *maintenance.JobCleanerTestSignals
623+
jobRescuer *maintenance.JobRescuerTestSignals
624+
jobScheduler *maintenance.JobSchedulerTestSignals
625+
periodicJobEnqueuer *maintenance.PeriodicJobEnqueuerTestSignals
626+
queueCleaner *maintenance.QueueCleanerTestSignals
627+
queueMaintainerLeader *maintenance.QueueMaintainerLeaderTestSignals
628+
reindexer *maintenance.ReindexerTestSignals
642629
}
643630

644631
func (ts *clientTestSignals) Init(tb testutil.TestingTB) {
645-
ts.electedLeader.Init(tb)
646-
ts.queueMaintainerStartError.Init(tb)
647-
ts.queueMaintainerStartRetriesExhausted.Init(tb)
648-
649632
if ts.jobCleaner != nil {
650633
ts.jobCleaner.Init(tb)
651634
}
@@ -661,6 +644,9 @@ func (ts *clientTestSignals) Init(tb testutil.TestingTB) {
661644
if ts.queueCleaner != nil {
662645
ts.queueCleaner.Init(tb)
663646
}
647+
if ts.queueMaintainerLeader != nil {
648+
ts.queueMaintainerLeader.Init(tb)
649+
}
664650
if ts.reindexer != nil {
665651
ts.reindexer.Init(tb)
666652
}
@@ -867,9 +853,6 @@ func NewClient[TTx any](driver riverdriver.Driver[TTx], config *Config) (*Client
867853
client.services = append(client.services,
868854
startstop.StartStopFunc(client.logStatsLoop))
869855

870-
client.services = append(client.services,
871-
startstop.StartStopFunc(client.handleLeadershipChangeLoop))
872-
873856
if pluginPilot != nil {
874857
client.services = append(client.services, pluginPilot.PluginServices()...)
875858
}
@@ -972,6 +955,15 @@ func NewClient[TTx any](driver riverdriver.Driver[TTx], config *Config) (*Client
972955
if config.TestOnly {
973956
client.queueMaintainer.StaggerStartupDisable(true)
974957
}
958+
959+
client.queueMaintainerLeader = maintenance.NewQueueMaintainerLeader(archetype, &maintenance.QueueMaintainerLeaderConfig{
960+
ClientID: config.ID,
961+
Elector: client.elector,
962+
QueueMaintainer: client.queueMaintainer,
963+
RequestResignFunc: client.clientNotifyBundle.RequestResign,
964+
})
965+
client.services = append(client.services, client.queueMaintainerLeader)
966+
client.testSignals.queueMaintainerLeader = &client.queueMaintainerLeader.TestSignals
975967
}
976968

977969
return client, nil
@@ -1292,147 +1284,6 @@ func (c *Client[TTx]) logStatsLoop(ctx context.Context, shouldStart bool, starte
12921284
return nil
12931285
}
12941286

1295-
func (c *Client[TTx]) handleLeadershipChangeLoop(ctx context.Context, shouldStart bool, started, stopped func()) error {
1296-
if !shouldStart {
1297-
return nil
1298-
}
1299-
1300-
go func() {
1301-
started()
1302-
defer stopped() // this defer should come first so it's last out
1303-
1304-
sub := c.elector.Listen()
1305-
defer sub.Unlisten()
1306-
1307-
// Cancel function for an in-progress tryStartQueueMaintainer. If
1308-
// leadership is lost while the start process is still retrying, used to
1309-
// abort it promptly instead of waiting for retries to finish.
1310-
var cancelQueueMaintainerStart context.CancelCauseFunc = func(_ error) {}
1311-
1312-
for {
1313-
select {
1314-
case <-ctx.Done():
1315-
cancelQueueMaintainerStart(context.Cause(ctx))
1316-
return
1317-
1318-
case notification := <-sub.C():
1319-
c.baseService.Logger.DebugContext(ctx, c.baseService.Name+": Election change received",
1320-
slog.String("client_id", c.config.ID), slog.Bool("is_leader", notification.IsLeader))
1321-
1322-
switch {
1323-
case notification.IsLeader:
1324-
// Starting the queue maintainer takes time, so send the
1325-
// test signal first. Tests waiting on it can receive it,
1326-
// cancel the queue maintainer start, and finish faster.
1327-
c.testSignals.electedLeader.Signal(struct{}{})
1328-
1329-
// Start the queue maintainer with retries and exponential
1330-
// backoff in a separate goroutine so the leadership change
1331-
// loop remains responsive to new notifications. startCtx is
1332-
// used for cancellation in case leadership is lost while
1333-
// retries are in progress.
1334-
//
1335-
// Epoch is incremented so stale tryStartQueueMaintainer
1336-
// goroutines from a previous term cannot call Stop after a
1337-
// new term has begun.
1338-
var startCtx context.Context
1339-
startCtx, cancelQueueMaintainerStart = context.WithCancelCause(ctx)
1340-
1341-
c.queueMaintainerMu.Lock()
1342-
c.queueMaintainerEpoch++
1343-
epoch := c.queueMaintainerEpoch
1344-
c.queueMaintainerMu.Unlock()
1345-
1346-
go c.tryStartQueueMaintainer(startCtx, epoch)
1347-
1348-
default:
1349-
// Cancel any in-progress start attempts before stopping.
1350-
// Send a startstop.ErrStop to make sure services like
1351-
// Reindexer run any specific cleanup code for stops.
1352-
cancelQueueMaintainerStart(startstop.ErrStop)
1353-
cancelQueueMaintainerStart = func(_ error) {}
1354-
1355-
c.queueMaintainer.Stop()
1356-
}
1357-
}
1358-
}
1359-
}()
1360-
1361-
return nil
1362-
}
1363-
1364-
// Tries to start the queue maintainer after gaining leadership. We allow some
1365-
// retries with exponential backoff in case of failure, and in case the queue
1366-
// maintainer can't be started, we request resignation to allow another client
1367-
// to try and take over.
1368-
func (c *Client[TTx]) tryStartQueueMaintainer(ctx context.Context, epoch int64) {
1369-
const maxStartAttempts = 3
1370-
1371-
ctxCancelled := func() bool {
1372-
if ctx.Err() != nil {
1373-
c.baseService.Logger.InfoContext(ctx, c.baseService.Name+": Queue maintainer start cancelled")
1374-
return true
1375-
}
1376-
return false
1377-
}
1378-
1379-
// stopIfCurrentEpoch atomically checks whether this goroutine's epoch is
1380-
// still the active one and calls Stop only if it is. Combined with the
1381-
// epoch increment in handleLeadershipChangeLoop, prevents stale goroutine
1382-
// from stopping a maintainer started by a newer leadership term.
1383-
stopIfCurrentEpoch := func() bool {
1384-
c.queueMaintainerMu.Lock()
1385-
defer c.queueMaintainerMu.Unlock()
1386-
1387-
if c.queueMaintainerEpoch != epoch {
1388-
return false
1389-
}
1390-
1391-
c.queueMaintainer.Stop()
1392-
return true
1393-
}
1394-
1395-
var lastErr error
1396-
for attempt := 1; attempt <= maxStartAttempts; attempt++ {
1397-
if ctxCancelled() {
1398-
return
1399-
}
1400-
1401-
if lastErr = c.queueMaintainer.Start(ctx); lastErr == nil {
1402-
return
1403-
}
1404-
1405-
c.baseService.Logger.ErrorContext(ctx, c.baseService.Name+": Error starting queue maintainer",
1406-
slog.String("err", lastErr.Error()), slog.Int("attempt", attempt))
1407-
1408-
c.testSignals.queueMaintainerStartError.Signal(lastErr)
1409-
1410-
// Stop the queue maintainer to fully reset its state (and any
1411-
// sub-services) before retrying. The epoch check ensures a stale
1412-
// goroutine cannot stop a maintainer from a newer leadership term.
1413-
if !stopIfCurrentEpoch() {
1414-
return
1415-
}
1416-
1417-
if attempt < maxStartAttempts {
1418-
serviceutil.CancellableSleep(ctx, serviceutil.ExponentialBackoff(attempt, serviceutil.MaxAttemptsBeforeResetDefault))
1419-
}
1420-
}
1421-
1422-
if ctxCancelled() {
1423-
return
1424-
}
1425-
1426-
c.baseService.Logger.ErrorContext(ctx, c.baseService.Name+": Queue maintainer failed to start after all attempts, requesting leader resignation",
1427-
slog.String("err", lastErr.Error()))
1428-
1429-
c.testSignals.queueMaintainerStartRetriesExhausted.Signal(struct{}{})
1430-
1431-
if err := c.clientNotifyBundle.RequestResign(ctx); err != nil {
1432-
c.baseService.Logger.ErrorContext(ctx, c.baseService.Name+": Error requesting leader resignation", slog.String("err", err.Error()))
1433-
}
1434-
}
1435-
14361287
// Driver exposes the underlying driver used by the client.
14371288
//
14381289
// API is not stable. DO NOT USE.

client_pilot_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -281,7 +281,7 @@ func Test_Client_PilotUsage(t *testing.T) {
281281
pilot.testSignals.Init(t)
282282

283283
startClient(ctx, t, client)
284-
client.testSignals.electedLeader.WaitOrTimeout()
284+
client.queueMaintainerLeader.TestSignals.ElectedLeader.WaitOrTimeout()
285285

286286
pilot.testSignals.PeriodicJobGetAll.WaitOrTimeout()
287287
pilot.testSignals.PeriodicJobUpsertMany.WaitOrTimeout()

client_test.go

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1116,15 +1116,15 @@ func Test_Client_Common(t *testing.T) {
11161116
startClient(ctx, t, client)
11171117

11181118
client.config.Logger.InfoContext(ctx, "Test waiting for client to be elected leader for the first time")
1119-
client.testSignals.electedLeader.WaitOrTimeout()
1119+
client.queueMaintainerLeader.TestSignals.ElectedLeader.WaitOrTimeout()
11201120
client.config.Logger.InfoContext(ctx, "Client was elected leader for the first time")
11211121

11221122
// We test the function with a forced resignation, but this is a general
11231123
// Notify test case so this could be changed to any notification.
11241124
require.NoError(t, client.Notify().RequestResign(ctx))
11251125

11261126
client.config.Logger.InfoContext(ctx, "Test waiting for client to be elected leader after forced resignation")
1127-
client.testSignals.electedLeader.WaitOrTimeout()
1127+
client.queueMaintainerLeader.TestSignals.ElectedLeader.WaitOrTimeout()
11281128
client.config.Logger.InfoContext(ctx, "Client was elected leader after forced resignation")
11291129
})
11301130

@@ -1137,7 +1137,7 @@ func Test_Client_Common(t *testing.T) {
11371137
startClient(ctx, t, client)
11381138

11391139
client.config.Logger.InfoContext(ctx, "Test waiting for client to be elected leader for the first time")
1140-
client.testSignals.electedLeader.WaitOrTimeout()
1140+
client.queueMaintainerLeader.TestSignals.ElectedLeader.WaitOrTimeout()
11411141
client.config.Logger.InfoContext(ctx, "Client was elected leader for the first time")
11421142

11431143
tx, err := bundle.dbPool.Begin(ctx)
@@ -1149,7 +1149,7 @@ func Test_Client_Common(t *testing.T) {
11491149
require.NoError(t, tx.Commit(ctx))
11501150

11511151
client.config.Logger.InfoContext(ctx, "Test waiting for client to be elected leader after forced resignation")
1152-
client.testSignals.electedLeader.WaitOrTimeout()
1152+
client.queueMaintainerLeader.TestSignals.ElectedLeader.WaitOrTimeout()
11531153
client.config.Logger.InfoContext(ctx, "Client was elected leader after forced resignation")
11541154
})
11551155

@@ -1422,7 +1422,7 @@ func Test_Client_Common(t *testing.T) {
14221422

14231423
// Despite no notifier, the client should still be able to elect itself
14241424
// leader.
1425-
client.testSignals.electedLeader.WaitOrTimeout()
1425+
client.queueMaintainerLeader.TestSignals.ElectedLeader.WaitOrTimeout()
14261426

14271427
event := riversharedtest.WaitOrTimeout(t, subscribeChan)
14281428
require.Equal(t, EventKindJobCompleted, event.Kind)
@@ -1450,7 +1450,7 @@ func Test_Client_Common(t *testing.T) {
14501450

14511451
// Despite no notifier, the client should still be able to elect itself
14521452
// leader.
1453-
client.testSignals.electedLeader.WaitOrTimeout()
1453+
client.queueMaintainerLeader.TestSignals.ElectedLeader.WaitOrTimeout()
14541454

14551455
event := riversharedtest.WaitOrTimeout(t, subscribeChan)
14561456
require.Equal(t, EventKindJobCompleted, event.Kind)
@@ -4881,7 +4881,7 @@ func Test_Client_Maintenance(t *testing.T) {
48814881
t.Helper()
48824882

48834883
startClient(ctx, t, client)
4884-
client.testSignals.electedLeader.WaitOrTimeout()
4884+
client.queueMaintainerLeader.TestSignals.ElectedLeader.WaitOrTimeout()
48854885
riversharedtest.WaitOrTimeout(t, client.queueMaintainer.Started())
48864886
}
48874887

@@ -5158,16 +5158,16 @@ func Test_Client_Maintenance(t *testing.T) {
51585158
client, _ := setup(t, config)
51595159

51605160
startClient(ctx, t, client)
5161-
client.testSignals.electedLeader.WaitOrTimeout()
5161+
client.queueMaintainerLeader.TestSignals.ElectedLeader.WaitOrTimeout()
51625162

51635163
// Wait for all 3 retry attempts to fail.
51645164
for range 3 {
5165-
err := client.testSignals.queueMaintainerStartError.WaitOrTimeout()
5165+
err := client.queueMaintainerLeader.TestSignals.StartError.WaitOrTimeout()
51665166
require.EqualError(t, err, "hook start error")
51675167
}
51685168

51695169
// After all retries exhausted, the client should request resignation.
5170-
client.testSignals.queueMaintainerStartRetriesExhausted.WaitOrTimeout()
5170+
client.queueMaintainerLeader.TestSignals.StartRetriesExhausted.WaitOrTimeout()
51715171
})
51725172

51735173
t.Run("PeriodicJobEnqueuerWithInsertOpts", func(t *testing.T) {
@@ -5276,7 +5276,7 @@ func Test_Client_Maintenance(t *testing.T) {
52765276

52775277
exec := client.driver.GetExecutor()
52785278

5279-
client.testSignals.electedLeader.WaitOrTimeout()
5279+
client.queueMaintainerLeader.TestSignals.ElectedLeader.WaitOrTimeout()
52805280

52815281
client.PeriodicJobs().Add(
52825282
NewPeriodicJob(cron.Every(15*time.Minute), func() (JobArgs, *InsertOpts) {
@@ -5322,7 +5322,7 @@ func Test_Client_Maintenance(t *testing.T) {
53225322

53235323
startClient(ctx, t, client)
53245324

5325-
client.testSignals.electedLeader.WaitOrTimeout()
5325+
client.queueMaintainerLeader.TestSignals.ElectedLeader.WaitOrTimeout()
53265326

53275327
svc := maintenance.GetService[*maintenance.PeriodicJobEnqueuer](client.queueMaintainer)
53285328
svc.TestSignals.EnteredLoop.WaitOrTimeout()
@@ -5395,7 +5395,7 @@ func Test_Client_Maintenance(t *testing.T) {
53955395

53965396
startClient(ctx, t, client)
53975397

5398-
client.testSignals.electedLeader.WaitOrTimeout()
5398+
client.queueMaintainerLeader.TestSignals.ElectedLeader.WaitOrTimeout()
53995399

54005400
svc := maintenance.GetService[*maintenance.PeriodicJobEnqueuer](client.queueMaintainer)
54015401
svc.TestSignals.EnteredLoop.WaitOrTimeout()
@@ -5440,7 +5440,7 @@ func Test_Client_Maintenance(t *testing.T) {
54405440

54415441
startClient(ctx, t, client)
54425442

5443-
client.testSignals.electedLeader.WaitOrTimeout()
5443+
client.queueMaintainerLeader.TestSignals.ElectedLeader.WaitOrTimeout()
54445444

54455445
svc := maintenance.GetService[*maintenance.PeriodicJobEnqueuer](client.queueMaintainer)
54465446
svc.TestSignals.EnteredLoop.WaitOrTimeout()
@@ -5535,7 +5535,7 @@ func Test_Client_Maintenance(t *testing.T) {
55355535

55365536
startClient(ctx, t, client)
55375537

5538-
client.testSignals.electedLeader.WaitOrTimeout()
5538+
client.queueMaintainerLeader.TestSignals.ElectedLeader.WaitOrTimeout()
55395539
qc := maintenance.GetService[*maintenance.QueueCleaner](client.queueMaintainer)
55405540
qc.TestSignals.DeletedBatch.WaitOrTimeout()
55415541

0 commit comments

Comments
 (0)