Skip to content

Commit dcadb23

Browse files
authored
grpc: implement OnHealthy (#8686)
This allows service implementations to report their health as a callback, rather than by polling. That, in turn, allows services to become healthy more quickly after startup.
1 parent 429d8b5 commit dcadb23

4 files changed

Lines changed: 61 additions & 14 deletions

File tree

docs/health.md

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# gRPC Health checking
2+
3+
We implement the [gRPC health service] automatically for all our
4+
gRPC servers.
5+
6+
There are two ways a service implementation can offer health information:
7+
8+
- By implementing `Health(context.Context) error`, which will be called every
9+
5s. If it returns `nil`, the service is set to healthy. If it returns
10+
non-`nil`, the service is set to unhealthy. The health check interval can be
11+
controlled with `grpc.serverBuilder.WithCheckInterval` at build time.
12+
- By implementing `OnHealthy(func())`. This will be called by
13+
`grpc.serverBuilder.Build`, passing in a closure that sets the service status
14+
to healthy. This is useful for services that start unhealthy and then become
15+
healthy exactly once. At the protocol layer, setting the service healthy [pushes]
16+
out a message to clients immediately, so they don't need to wait on the next
17+
health check poll.
18+
19+
[gRPC health service]: https://pkg.go.dev/google.golang.org/grpc/health
20+
[pushes]: https://github.com/grpc/grpc/blob/5b6492ea90b2b867a6adad1b10a6edda28e860d1/src/proto/grpc/health/v1/health.proto#L47-L62

grpc/server.go

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -31,15 +31,21 @@ var CodedError = status.Errorf
3131

3232
var errNilTLS = errors.New("boulder/grpc: received nil tls.Config")
3333

34-
// checker is an interface for checking the health of a grpc service
35-
// implementation.
34+
// checker can be implemented by services to receive health checks via polling.
3635
type checker interface {
3736
// Health returns nil if the service is healthy, or an error if it is not.
3837
// If the passed context is canceled, it should return immediately with an
3938
// error.
4039
Health(context.Context) error
4140
}
4241

42+
// onHealthy can be implemented by services to report health to gRPC by callbacks.
43+
type onHealthy interface {
44+
// During server setup, OnHealthy will be passed a callback. The implementation
45+
// should arrange for this callback to be invoked once the service is healthy.
46+
OnHealthy(func())
47+
}
48+
4349
// service represents a single gRPC service that can be registered with a gRPC
4450
// server.
4551
type service struct {
@@ -214,12 +220,17 @@ func (sb *serverBuilder) Build(tlsConfig *tls.Config, statsRegistry prometheus.R
214220
healthCtx, stopHealthChecks := context.WithCancel(context.Background())
215221
for _, s := range sb.services {
216222
check, ok := s.impl.(checker)
217-
if !ok {
218-
continue
223+
if ok {
224+
sb.initLongRunningCheck(healthCtx, s.desc.ServiceName, check.Health)
219225
}
220-
sb.initLongRunningCheck(healthCtx, s.desc.ServiceName, check.Health)
221-
}
222226

227+
registerReady, ok := s.impl.(onHealthy)
228+
if ok {
229+
registerReady.OnHealthy(func() {
230+
sb.healthSrv.SetServingStatus(s.desc.ServiceName, healthpb.HealthCheckResponse_SERVING)
231+
})
232+
}
233+
}
223234
// Start a goroutine which listens for a termination signal, and then
224235
// gracefully stops the gRPC server. This in turn causes the start() function
225236
// to exit, allowing its caller (generally a main() function) to exit.

ra/ra.go

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -105,13 +105,11 @@ type RegistrationAuthorityImpl struct {
105105

106106
var _ rapb.RegistrationAuthorityServer = (*RegistrationAuthorityImpl)(nil)
107107

108-
// Health implements our grpc.checker interface. This method will be called
109-
// periodically to set the gRPC service's healthpb.Health.Check() status.
110-
func (ra *RegistrationAuthorityImpl) Health(ctx context.Context) error {
111-
if ra.txnBuilder.Ready() {
112-
return nil
113-
}
114-
return errors.New("waiting for overrides")
108+
// OnHealthy registers the callback to be invoked once the txnBuilder is healthy.
109+
//
110+
// That happens when the overrides are loaded.
111+
func (ra *RegistrationAuthorityImpl) OnHealthy(cb func()) {
112+
ra.txnBuilder.OnHealthy(cb)
115113
}
116114

117115
// NewRegistrationAuthorityImpl constructs a new RA object.

ratelimits/limit.go

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -322,6 +322,9 @@ type limitRegistry struct {
322322
// overrides stores override limits by 'name:id'.
323323
overrides Limits
324324

325+
// Called once the overrides are loaded.
326+
healthyCallbacks []func()
327+
325328
// overridesLoaded is true if at least one loadOverrides attempt has
326329
// completed successfully. Callers should check this using the Ready()
327330
// method.
@@ -372,7 +375,14 @@ func (l *limitRegistry) loadOverrides(ctx context.Context) error {
372375

373376
l.Lock()
374377
defer l.Unlock()
375-
l.overridesLoaded = true
378+
379+
if !l.overridesLoaded {
380+
l.overridesLoaded = true
381+
for _, cb := range l.healthyCallbacks {
382+
cb()
383+
l.healthyCallbacks = nil
384+
}
385+
}
376386

377387
if len(newOverrides) < 1 {
378388
// If it's an empty set, don't replace any current overrides.
@@ -403,6 +413,14 @@ func (l *limitRegistry) Ready() bool {
403413
return l.overridesLoaded
404414
}
405415

416+
// OnHealthy registers the callback to be invoked once the overrides are loaded.
417+
func (l *limitRegistry) OnHealthy(cb func()) {
418+
l.Lock()
419+
defer l.Unlock()
420+
421+
l.healthyCallbacks = append(l.healthyCallbacks, cb)
422+
}
423+
406424
// loadOverridesWithRetry tries to loadOverrides, retrying at least every 30
407425
// seconds upon failure.
408426
func (l *limitRegistry) loadOverridesWithRetry(ctx context.Context) error {

0 commit comments

Comments
 (0)