 	"bytes"
 	"context"
 	"fmt"
+	"math/rand"
 	"path/filepath"
 	"strings"
 	"sync"
@@ -132,6 +133,9 @@ const (
 	// NOTE: this is only used during the initial sync.
 	syncWorkerLastBlockTimeDiffThreshold = 1 * time.Minute
 
+	minUpgradeStopWaitPeriod = 5 * time.Second
+	upgradeStopDelay         = 30 * time.Second
+
 	// tmSubscriberID is the subscriber identifier used for all internal Tendermint pubsub
 	// subscriptions. If any other subscriber IDs need to be derived they will be under this prefix.
 	tmSubscriberID = "oasis-core"
@@ -181,8 +185,10 @@ type fullService struct { // nolint: maligned
 	isInitialized, isStarted bool
 	startedCh                chan struct{}
 	syncedCh                 chan struct{}
+	quitCh                   chan struct{}
 
-	startFn func() error
+	startFn  func() error
+	stopOnce sync.Once
 
 	nextSubscriberID uint64
}
@@ -219,6 +225,19 @@ func (t *fullService) Start() error {
 		return fmt.Errorf("tendermint: failed to start service: %w", err)
 	}
 
+	// Make sure the quit channel is closed when the node shuts down.
+	go func() {
+		select {
+		case <-t.quitCh:
+		case <-t.node.Quit():
+			select {
+			case <-t.quitCh:
+			default:
+				close(t.quitCh)
+			}
+		}
+	}()
+
 	// Start event dispatchers for all the service clients.
 	t.serviceClientsWg.Add(len(t.serviceClients))
 	for _, svc := range t.serviceClients {
@@ -247,11 +266,7 @@ func (t *fullService) Start() error {
 
 // Implements service.BackgroundService.
 func (t *fullService) Quit() <-chan struct{} {
-	if !t.started() {
-		return make(chan struct{})
-	}
-
-	return t.node.Quit()
+	return t.quitCh
 }
 
 // Implements service.BackgroundService.
@@ -266,14 +281,15 @@ func (t *fullService) Stop() {
 		return
 	}
 
-	t.failMonitor.markCleanShutdown()
-	if err := t.node.Stop(); err != nil {
-		t.Logger.Error("Error on stopping node", err)
-	}
+	t.stopOnce.Do(func() {
+		t.failMonitor.markCleanShutdown()
+		if err := t.node.Stop(); err != nil {
+			t.Logger.Error("Error on stopping node", err)
+		}
 
-	t.svcMgr.Stop()
-	t.mux.Stop()
-	t.node.Wait()
+		t.svcMgr.Stop()
+		t.mux.Stop()
+	})
 }
 
 func (t *fullService) Started() <-chan struct{} {
@@ -401,7 +417,7 @@ func (t *fullService) GetGenesisDocument(ctx context.Context) (*genesisAPI.Document, error) {
 	return t.genesis, nil
 }
 
-func (t *fullService) RegisterHaltHook(hook func(context.Context, int64, epochtimeAPI.EpochTime)) {
+func (t *fullService) RegisterHaltHook(hook consensusAPI.HaltHook) {
 	if !t.initialized() {
 		return
 	}
@@ -1301,6 +1317,41 @@ func (t *fullService) lazyInit() error {
 	t.client = tmcli.New(t.node)
 	t.failMonitor = newFailMonitor(t.ctx, t.Logger, t.node.ConsensusState().Wait)
 
+	// Register a halt hook that handles upgrades gracefully.
+	t.RegisterHaltHook(func(ctx context.Context, blockHeight int64, epoch epochtimeAPI.EpochTime, err error) {
+		if !errors.Is(err, upgradeAPI.ErrStopForUpgrade) {
+			return
+		}
+
+		// Mark this as a clean shutdown and request the node to stop gracefully.
+		t.failMonitor.markCleanShutdown()
+
+		// Wait before stopping to give time for P2P messages to propagate. Sleep for at least
+		// minUpgradeStopWaitPeriod or the configured commit timeout.
+		t.Logger.Info("waiting a bit before stopping the node for upgrade")
+		waitPeriod := minUpgradeStopWaitPeriod
+		if tc := t.genesis.Consensus.Parameters.TimeoutCommit; tc > waitPeriod {
+			waitPeriod = tc
+		}
+		time.Sleep(waitPeriod)
+
+		go func() {
+			// Sleep another period so there is some time between when consensus shuts down and
+			// when all the other services start shutting down.
+			//
+			// Randomize the period so that not all nodes shut down at the same time.
+			delay := getRandomValueFromInterval(0.5, rand.Float64(), upgradeStopDelay)
+			time.Sleep(delay)
+
+			t.Logger.Info("stopping the node for upgrade")
+			t.Stop()
+
+			// Close the quit channel early to force the node to stop. This is needed because
+			// the Tendermint node will otherwise never quit.
+			close(t.quitCh)
+		}()
+	})
+
 	return nil
 }
 
@@ -1462,6 +1513,7 @@ func New(
 		dataDir:   dataDir,
 		startedCh: make(chan struct{}),
 		syncedCh:  make(chan struct{}),
+		quitCh:    make(chan struct{}),
 	}
 
 	t.Logger.Info("starting a full consensus node")