From fc7183168361089b040eae8dab04cfbe0b604c80 Mon Sep 17 00:00:00 2001 From: Jacob Hoffman-Andrews Date: Fri, 13 Mar 2026 16:50:28 -0700 Subject: [PATCH 1/5] noncebalancer: use endpointsharding, ignore ready status The old noncebalancer only saw READY SubConns, which was a problem during the brief periods when a SubConn needed to reconnect (for instance due to a GOAWAY from the server). Unfortunately, that's all the balancer interface provides. And we can't get it to pass non-READY SubConns to our picker without reimplementing or copying all its SubConn management logic. Luckily, grpc provides the [`endpointsharding`] balancer implementation that does exactly what we want. It maintains a collection of child balancers each owning a single endpoint (note: for our purposes an endpoint is equivalent to a single address, though in general one endpoint can have many addresses). It also lets us query the [state] of each child, including the endpoint it's responsible for. This allows us to construct a picker that is aware of all available backends, even those that aren't currently READY. That, in turn, prevents us from temporarily serving errors while a given nonce redemption backend reconnects. To see an example of `endpointsharding` in use, see the [`customroundrobin`] implementation. For more context on how `endpointsharding` came to be implemented, see [gRFC A61: IPv4 and IPv6 Dualstack Backend Support][a61]. 
[`endpointsharding`]: https://pkg.go.dev/google.golang.org/grpc/balancer/endpointsharding [state]: https://pkg.go.dev/google.golang.org/grpc/balancer/endpointsharding#ChildState [a61]: https://github.com/grpc/proposal/blob/master/A61-IPv4-IPv6-dualstack-backends.md [`customroundrobin`]: https://github.com/grpc/grpc-go/blob/99f36d4a0c28bc967a8d3fe23ebc2a264b322070/examples/features/customloadbalancer/client/customroundrobin/customroundrobin.go --- grpc/noncebalancer/noncebalancer.go | 123 ++++++++++++++++------- grpc/noncebalancer/noncebalancer_test.go | 96 +++++++++--------- 2 files changed, 135 insertions(+), 84 deletions(-) diff --git a/grpc/noncebalancer/noncebalancer.go b/grpc/noncebalancer/noncebalancer.go index 4867e400dd5..49f5278e4b1 100644 --- a/grpc/noncebalancer/noncebalancer.go +++ b/grpc/noncebalancer/noncebalancer.go @@ -2,12 +2,14 @@ package noncebalancer import ( "errors" + "google.golang.org/grpc/balancer/endpointsharding" + "google.golang.org/grpc/balancer/pickfirst" + "google.golang.org/grpc/connectivity" "sync" "github.com/letsencrypt/boulder/nonce" "google.golang.org/grpc/balancer" - "google.golang.org/grpc/balancer/base" "google.golang.org/grpc/codes" "google.golang.org/grpc/status" ) @@ -40,38 +42,43 @@ var errMissingHMACKeyCtxKey = errors.New("nonce.HMACKeyCtxKey value required in var errInvalidPrefixCtxKeyType = errors.New("nonce.PrefixCtxKey value in RPC context must be a string") var errInvalidHMACKeyCtxKeyType = errors.New("nonce.HMACKeyCtxKey value in RPC context must be a byte slice") -// pickerBuilder implements the base.PickerBuilder interface. It's used to -// create new Picker instances. It should only be used by nonce-service clients. -type pickerBuilder struct{} - -// Build implements the base.PickerBuilder interface. It is called by the gRPC -// runtime when the balancer is first initialized and when the set of backend -// (SubConn) addresses changes. 
-func (b *pickerBuilder) Build(buildInfo base.PickerBuildInfo) balancer.Picker { - if len(buildInfo.ReadySCs) == 0 { - // The Picker must be rebuilt if there are no backends available. - return base.NewErrPicker(balancer.ErrNoSubConnAvailable) - } - return &picker{ - backends: buildInfo.ReadySCs, - } +// picker implements the balancer.Picker interface. It delegates to a child Picker +// based on the endpoint (IP address and port) that Picker represents. +// The child picker is provided by endpointsharding's Balancer implementation +// (https://pkg.go.dev/google.golang.org/grpc/balancer/endpointsharding), which +// abstracts away the creation and management of SubConns for us. +// +// We happen to know the child Picker is created by the "pickfirst" balancer, but +// since each child Picker only has a single Endpoint anyhow, it doesn't really matter. +type picker struct { + // This is the full list of (address -> Picker) pairs passed in by the nonceBalancer. + // In particular it is not filtered based on the state of any SubConn, since a given + // address' SubConn may be temporarily unavailable while reconnecting, and we still + // want to attempt sending traffic to that endpoint if we receive the corresponding + // prefix. + addrToPicker map[string]balancer.Picker + + // A mapping from nonce prefix to the child picker for that backend. This is derived, + // on first Pick call, from the address of each backend plus the HMAC key passed in a + // context.Context. We don't derive it on construction because we don't have access to + // the HMAC key then. + prefixToPicker map[string]balancer.Picker + prefixToPickerOnce sync.Once } -// picker implements the balancer.Picker interface. It picks a backend (SubConn) -// based on the nonce prefix contained in each request's Context. 
-type picker struct { - backends map[balancer.SubConn]base.SubConnInfo - prefixToBackend map[string]balancer.SubConn - prefixToBackendOnce sync.Once +// newPicker creates a picker with the given address-to-child picker map. +func newPicker(m map[string]balancer.Picker) *picker { + return &picker{ + addrToPicker: m, + } } // Pick implements the balancer.Picker interface. It is called by the gRPC // runtime for each RPC message. It is responsible for picking a backend // (SubConn) based on the context of each RPC message. func (p *picker) Pick(info balancer.PickInfo) (balancer.PickResult, error) { - if len(p.backends) == 0 { - // This should never happen, the Picker should only be built when there - // are backends available. + if len(p.addrToPicker) == 0 { + // Should never happen. return balancer.PickResult{}, balancer.ErrNoSubConnAvailable } @@ -87,14 +94,14 @@ func (p *picker) Pick(info balancer.PickInfo) (balancer.PickResult, error) { return balancer.PickResult{}, errInvalidHMACKeyCtxKeyType } - p.prefixToBackendOnce.Do(func() { + p.prefixToPickerOnce.Do(func() { // First call to Pick with a new Picker. - prefixToBackend := make(map[string]balancer.SubConn) - for sc, scInfo := range p.backends { - scPrefix := nonce.DerivePrefix(scInfo.Address.Addr, hmacKey) - prefixToBackend[scPrefix] = sc + prefixToPicker := make(map[string]balancer.Picker) + for addr, picker := range p.addrToPicker { + prefix := nonce.DerivePrefix(addr, hmacKey) + prefixToPicker[prefix] = picker } - p.prefixToBackend = prefixToBackend + p.prefixToPicker = prefixToPicker }) // Get the destination prefix from the RPC context. @@ -109,16 +116,60 @@ func (p *picker) Pick(info balancer.PickInfo) (balancer.PickResult, error) { return balancer.PickResult{}, errInvalidPrefixCtxKeyType } - sc, ok := p.prefixToBackend[destPrefix] + childPicker, ok := p.prefixToPicker[destPrefix] if !ok { // No backend SubConn was found for the destination prefix. 
return balancer.PickResult{}, ErrNoBackendsMatchPrefix.Err() } - return balancer.PickResult{SubConn: sc}, nil + return childPicker.Pick(info) } func init() { - balancer.Register( - base.NewBalancerBuilder(Name, &pickerBuilder{}, base.Config{}), - ) + balancer.Register(builder{}) +} + +// builder builds a nonceBalancer (which internally uses `endpointsharding.NewBalancer`) +type builder struct{} + +func (b builder) Name() string { + return Name +} + +func (b builder) Build(cc balancer.ClientConn, bOpts balancer.BuildOptions) balancer.Balancer { + childBalancerBuilder := balancer.Get(pickfirst.Name).Build + nb := &nonceBalancer{ + ClientConn: cc, + } + nb.Balancer = endpointsharding.NewBalancer(nb, bOpts, childBalancerBuilder, endpointsharding.Options{}) + return nb +} + +// nonceBalancer sends nonce redemption requests to backends based on the nonce prefix, +// which maps to a specific IP address and port pair. +type nonceBalancer struct { + balancer.Balancer + balancer.ClientConn +} + +// UpdateState creates a `picker` that is aware of the IP address and port of all +// the child pickers available, including ones that may not have an active connection. +func (b *nonceBalancer) UpdateState(state balancer.State) { + if state.ConnectivityState != connectivity.Ready { + b.ClientConn.UpdateState(state) + return + } + + addrToPicker := make(map[string]balancer.Picker) + for _, childState := range endpointsharding.ChildStatesFromPicker(state.Picker) { + // We expect our Endpoints to always have single Addresses, but might as well + // be robust to the possibility there are more. + for _, addr := range childState.Endpoint.Addresses { + addrToPicker[addr.Addr] = childState.State.Picker + } + } + b.ClientConn.UpdateState(balancer.State{ + ConnectivityState: state.ConnectivityState, + // Here's where we build our nonce-aware picker. 
+ Picker: newPicker(addrToPicker), + }) } diff --git a/grpc/noncebalancer/noncebalancer_test.go b/grpc/noncebalancer/noncebalancer_test.go index 1cade2f52c5..62dddbbbf7f 100644 --- a/grpc/noncebalancer/noncebalancer_test.go +++ b/grpc/noncebalancer/noncebalancer_test.go @@ -4,30 +4,56 @@ import ( "context" "testing" - "google.golang.org/grpc/balancer" - "google.golang.org/grpc/balancer/base" - "google.golang.org/grpc/resolver" - "github.com/letsencrypt/boulder/nonce" "github.com/letsencrypt/boulder/test" + "google.golang.org/grpc/balancer" ) +// mockPicker implements the balancer.Picker interface. +// +// In this test it's used to fill the role of a child picker. +type mockPicker struct { + called bool +} + +func (mp *mockPicker) Pick(info balancer.PickInfo) (balancer.PickResult, error) { + mp.called = true + return balancer.PickResult{}, nil +} + func TestPickerPicksCorrectBackend(t *testing.T) { - _, p, subConns := setupTest(false) - prefix := nonce.DerivePrefix(subConns[0].addrs[0].Addr, []byte("Kala namak")) + addr1 := "10.77.77.77:8080" + addr2 := "10.88.88.88:9090" + prefix := nonce.DerivePrefix(addr1, []byte("Kala namak")) testCtx := context.WithValue(context.Background(), nonce.PrefixCtxKey{}, "HNmOnt8w") testCtx = context.WithValue(testCtx, nonce.HMACKeyCtxKey{}, []byte(prefix)) info := balancer.PickInfo{Ctx: testCtx} - gotPick, err := p.Pick(info) - test.AssertNotError(t, err, "Pick failed") - test.AssertDeepEquals(t, subConns[0], gotPick.SubConn) + childPicker1 := &mockPicker{} + childPicker2 := &mockPicker{} + + p := newPicker(map[string]balancer.Picker{ + addr1: childPicker1, + addr2: childPicker2, + }) + + _, err := p.Pick(info) + if err != nil { + t.Fatalf("Pick failed: %v", err) + } + + if !childPicker1.called { + t.Errorf("childPicker1 not called") + } + if childPicker2.called { + t.Errorf("childPicker2 called, should not have been") + } } func TestPickerMissingPrefixInCtx(t *testing.T) { - _, p, subConns := setupTest(false) - prefix := 
nonce.DerivePrefix(subConns[0].addrs[0].Addr, []byte("Kala namak")) + p, addr := setupTest() + prefix := nonce.DerivePrefix(addr, []byte("Kala namak")) testCtx := context.WithValue(context.Background(), nonce.HMACKeyCtxKey{}, []byte(prefix)) info := balancer.PickInfo{Ctx: testCtx} @@ -38,7 +64,7 @@ func TestPickerMissingPrefixInCtx(t *testing.T) { } func TestPickerInvalidPrefixInCtx(t *testing.T) { - _, p, _ := setupTest(false) + p, _ := setupTest() testCtx := context.WithValue(context.Background(), nonce.PrefixCtxKey{}, 9) testCtx = context.WithValue(testCtx, nonce.HMACKeyCtxKey{}, []byte("foobar")) @@ -50,7 +76,7 @@ func TestPickerInvalidPrefixInCtx(t *testing.T) { } func TestPickerMissingHMACKeyInCtx(t *testing.T) { - _, p, _ := setupTest(false) + p, _ := setupTest() testCtx := context.WithValue(context.Background(), nonce.PrefixCtxKey{}, "HNmOnt8w") info := balancer.PickInfo{Ctx: testCtx} @@ -61,7 +87,7 @@ func TestPickerMissingHMACKeyInCtx(t *testing.T) { } func TestPickerInvalidHMACKeyInCtx(t *testing.T) { - _, p, _ := setupTest(false) + p, _ := setupTest() testCtx := context.WithValue(context.Background(), nonce.PrefixCtxKey{}, "HNmOnt8w") testCtx = context.WithValue(testCtx, nonce.HMACKeyCtxKey{}, 9) @@ -73,8 +99,8 @@ func TestPickerInvalidHMACKeyInCtx(t *testing.T) { } func TestPickerNoMatchingSubConnAvailable(t *testing.T) { - _, p, subConns := setupTest(false) - prefix := nonce.DerivePrefix(subConns[0].addrs[0].Addr, []byte("Kala namak")) + p, addr := setupTest() + prefix := nonce.DerivePrefix(addr, []byte("Kala namak")) testCtx := context.WithValue(context.Background(), nonce.PrefixCtxKey{}, "rUsTrUin") testCtx = context.WithValue(testCtx, nonce.HMACKeyCtxKey{}, []byte(prefix)) @@ -86,41 +112,15 @@ func TestPickerNoMatchingSubConnAvailable(t *testing.T) { } func TestPickerNoSubConnsAvailable(t *testing.T) { - b, p, _ := setupTest(true) - b.Build(base.PickerBuildInfo{}) + p := newPicker(map[string]balancer.Picker{}) info := balancer.PickInfo{Ctx: 
context.Background()} - gotPick, err := p.Pick(info) + _, err := p.Pick(info) test.AssertErrorIs(t, err, balancer.ErrNoSubConnAvailable) - test.AssertNil(t, gotPick.SubConn, "subConn should be nil") -} - -func setupTest(noSubConns bool) (*pickerBuilder, balancer.Picker, []*subConn) { - var subConns []*subConn - bi := base.PickerBuildInfo{ - ReadySCs: make(map[balancer.SubConn]base.SubConnInfo), - } - - sc := &subConn{} - addr := resolver.Address{Addr: "10.77.77.77:8080"} - sc.UpdateAddresses([]resolver.Address{addr}) - - if !noSubConns { - bi.ReadySCs[sc] = base.SubConnInfo{Address: addr} - subConns = append(subConns, sc) - } - - b := &pickerBuilder{} - p := b.Build(bi) - return b, p, subConns -} - -// subConn is a test mock which implements the balancer.SubConn interface. -type subConn struct { - balancer.SubConn - addrs []resolver.Address } -func (s *subConn) UpdateAddresses(addrs []resolver.Address) { - s.addrs = addrs +func setupTest() (*picker, string) { + addr := "10.77.77.77:8080" + p := newPicker(map[string]balancer.Picker{addr: &mockPicker{}}) + return p, addr } From 4ac9a53258a53aa2920dcfcec990dbe7b431cf74 Mon Sep 17 00:00:00 2001 From: Jacob Hoffman-Andrews Date: Mon, 16 Mar 2026 10:19:13 -0700 Subject: [PATCH 2/5] Re-add noncebalancerv1 --- cmd/boulder-wfe2/main.go | 7 +- cmd/config.go | 2 +- grpc/internal/resolver/dns/dns_resolver.go | 17 ++- grpc/noncebalancer/noncebalancer.go | 36 +++--- grpc/noncebalancer/noncebalancer_test.go | 2 +- grpc/noncebalancerv1/noncebalancer.go | 124 +++++++++++++++++++ grpc/noncebalancerv1/noncebalancer_test.go | 126 ++++++++++++++++++++ test/config-next/wfe2.json | 2 +- test/config/wfe2.json | 2 +- test/integration/nonce_test.go | 28 ++++- test/integration/testdata/nonce-client.json | 2 +- wfe2/verify.go | 3 +- 12 files changed, 322 insertions(+), 29 deletions(-) create mode 100644 grpc/noncebalancerv1/noncebalancer.go create mode 100644 grpc/noncebalancerv1/noncebalancer_test.go diff --git a/cmd/boulder-wfe2/main.go 
b/cmd/boulder-wfe2/main.go index 93d5b987a42..a5aff0b0bfa 100644 --- a/cmd/boulder-wfe2/main.go +++ b/cmd/boulder-wfe2/main.go @@ -19,6 +19,7 @@ import ( "github.com/letsencrypt/boulder/goodkey/sagoodkey" bgrpc "github.com/letsencrypt/boulder/grpc" "github.com/letsencrypt/boulder/grpc/noncebalancer" + noncebalancerv1 "github.com/letsencrypt/boulder/grpc/noncebalancerv1" "github.com/letsencrypt/boulder/issuance" "github.com/letsencrypt/boulder/nonce" rapb "github.com/letsencrypt/boulder/ra/proto" @@ -318,9 +319,11 @@ func main() { cmd.FailOnError(err, "Failed to load credentials and create gRPC connection to get nonce service") gnc := nonce.NewGetter(getNonceConn) - if c.WFE.RedeemNonceService.SRVResolver != noncebalancer.SRVResolverScheme { + if c.WFE.RedeemNonceService.SRVResolver != noncebalancer.SRVResolverScheme && + c.WFE.RedeemNonceService.SRVResolver != noncebalancerv1.SRVResolverScheme { cmd.Fail(fmt.Sprintf( - "'redeemNonceService.SRVResolver' must be set to %q", noncebalancer.SRVResolverScheme), + "'redeemNonceService.SRVResolver' must be set to %q or %q", + noncebalancer.SRVResolverScheme, noncebalancerv1.SRVResolverScheme), ) } redeemNonceConn, err := bgrpc.ClientSetup(c.WFE.RedeemNonceService, tlsConfig, stats, clk) diff --git a/cmd/config.go b/cmd/config.go index 414214e282a..acbfb4aff9c 100644 --- a/cmd/config.go +++ b/cmd/config.go @@ -306,7 +306,7 @@ type GRPCClientConfig struct { // implementation of the SRV resolver should be used. The default is 'srv' // For more details, see the documentation in: // grpc/internal/resolver/dns/dns_resolver.go. - SRVResolver string `validate:"excluded_with=ServerAddress,isdefault|oneof=srv nonce-srv"` + SRVResolver string `validate:"excluded_with=ServerAddress,isdefault|oneof=srv nonce-srv nonce-srv-v2"` // ServerAddress is a single : or `:` that // the gRPC client will, if necessary, resolve via DNS and then connect to. 
diff --git a/grpc/internal/resolver/dns/dns_resolver.go b/grpc/internal/resolver/dns/dns_resolver.go index a25bee078df..eb5f2e014a5 100644 --- a/grpc/internal/resolver/dns/dns_resolver.go +++ b/grpc/internal/resolver/dns/dns_resolver.go @@ -26,6 +26,7 @@ import ( "context" "errors" "fmt" + "google.golang.org/grpc/serviceconfig" "net" "net/netip" "strconv" @@ -33,13 +34,12 @@ import ( "sync" "time" - "google.golang.org/grpc/grpclog" - "google.golang.org/grpc/resolver" - "google.golang.org/grpc/serviceconfig" - "github.com/letsencrypt/boulder/bdns" "github.com/letsencrypt/boulder/grpc/internal/backoff" "github.com/letsencrypt/boulder/grpc/noncebalancer" + noncebalancerv1 "github.com/letsencrypt/boulder/grpc/noncebalancerv1" + "google.golang.org/grpc/grpclog" + "google.golang.org/grpc/resolver" ) var logger = grpclog.Component("srv") @@ -54,6 +54,7 @@ var ( func init() { resolver.Register(NewDefaultSRVBuilder()) resolver.Register(NewNonceSRVBuilder()) + resolver.Register(NewNonceSRVBuilderV2()) } const defaultDNSSvrPort = "53" @@ -90,9 +91,15 @@ func NewDefaultSRVBuilder() resolver.Builder { return &srvBuilder{scheme: "srv"} } -// NewNonceSRVBuilder creates a srvBuilder which is used to factory SRV DNS +// NewNonceSRVBuilder creates a srvBuilder which is used to build SRV DNS // resolvers with a custom grpc.Balancer used by nonce-service clients. func NewNonceSRVBuilder() resolver.Builder { + return &srvBuilder{scheme: noncebalancerv1.SRVResolverScheme, balancer: noncebalancerv1.Name} +} + +// NewNonceSRVBuilderV2 creates a srvBuilder which is used to build SRV DNS +// resolvers with a custom nonce balancer used by nonce-service clients. 
+func NewNonceSRVBuilderV2() resolver.Builder { return &srvBuilder{scheme: noncebalancer.SRVResolverScheme, balancer: noncebalancer.Name} } diff --git a/grpc/noncebalancer/noncebalancer.go b/grpc/noncebalancer/noncebalancer.go index 49f5278e4b1..034cda93fe6 100644 --- a/grpc/noncebalancer/noncebalancer.go +++ b/grpc/noncebalancer/noncebalancer.go @@ -17,13 +17,13 @@ import ( const ( // Name is the name used to register the nonce balancer with the gRPC // runtime. - Name = "nonce" + Name = "noncev2" // SRVResolverScheme is the scheme used to invoke an instance of the SRV // resolver which will use the noncebalancer to pick backends. It would be // ideal to export this from the SRV resolver package but that package is // internal. - SRVResolverScheme = "nonce-srv" + SRVResolverScheme = "nonce-srv-v2" ) // ErrNoBackendsMatchPrefix indicates that no backends were found which match @@ -42,7 +42,7 @@ var errMissingHMACKeyCtxKey = errors.New("nonce.HMACKeyCtxKey value required in var errInvalidPrefixCtxKeyType = errors.New("nonce.PrefixCtxKey value in RPC context must be a string") var errInvalidHMACKeyCtxKeyType = errors.New("nonce.HMACKeyCtxKey value in RPC context must be a byte slice") -// picker implements the balancer.Picker interface. It delegates to a child Picker +// prefixBasedPicker implements the balancer.Picker interface. It delegates to a child Picker // based on the endpoint (IP address and port) that Picker represents. // The child picker is provided by endpointsharding's Balancer implementation // (https://pkg.go.dev/google.golang.org/grpc/balancer/endpointsharding), which @@ -50,7 +50,7 @@ var errInvalidHMACKeyCtxKeyType = errors.New("nonce.HMACKeyCtxKey value in RPC c // // We happen to know the child Picker is created by the "pickfirst" balancer, but // since each child Picker only has a single Endpoint anyhow, it doesn't really matter. 
-type picker struct { +type prefixBasedPicker struct { // This is the full list of (address -> Picker) pairs passed in by the nonceBalancer. // In particular it is not filtered based on the state of any SubConn, since a given // address' SubConn may be temporarily unavailable while reconnecting, and we still @@ -61,14 +61,14 @@ type picker struct { // A mapping from nonce prefix to the child picker for that backend. This is derived, // on first Pick call, from the address of each backend plus the HMAC key passed in a // context.Context. We don't derive it on construction because we don't have access to - // the HMAC key then. + // the HMAC key at that point. prefixToPicker map[string]balancer.Picker prefixToPickerOnce sync.Once } -// newPicker creates a picker with the given address-to-child picker map. -func newPicker(m map[string]balancer.Picker) *picker { - return &picker{ +// newPicker creates a prefixBasedPicker with the given map of addresses to child pickers. +func newPicker(m map[string]balancer.Picker) *prefixBasedPicker { + return &prefixBasedPicker{ addrToPicker: m, } } @@ -76,7 +76,7 @@ func newPicker(m map[string]balancer.Picker) *picker { // Pick implements the balancer.Picker interface. It is called by the gRPC // runtime for each RPC message. It is responsible for picking a backend // (SubConn) based on the context of each RPC message. -func (p *picker) Pick(info balancer.PickInfo) (balancer.PickResult, error) { +func (p *prefixBasedPicker) Pick(info balancer.PickInfo) (balancer.PickResult, error) { if len(p.addrToPicker) == 0 { // Should never happen. 
return balancer.PickResult{}, balancer.ErrNoSubConnAvailable @@ -124,11 +124,11 @@ func (p *picker) Pick(info balancer.PickInfo) (balancer.PickResult, error) { return childPicker.Pick(info) } -func init() { - balancer.Register(builder{}) -} - -// builder builds a nonceBalancer (which internally uses `endpointsharding.NewBalancer`) +// builder builds a nonceBalancer, which internally uses `endpointsharding.NewBalancer`. +// +// The embedded `endpointsharding` balancer manages a set of child pickers that all use +// `pickfirst` on an endpoint that consists of a single IP address (because our `"nonce-srv"` +// resolver returns single-IP endpoints). type builder struct{} func (b builder) Name() string { @@ -151,8 +151,10 @@ type nonceBalancer struct { balancer.ClientConn } -// UpdateState creates a `picker` that is aware of the IP address and port of all +// UpdateState creates a `prefixBasedPicker` that is aware of the IP address and port of all // the child pickers available, including ones that may not have an active connection. +// +// The child pickers are all `pickfirst` across a single IP address. 
func (b *nonceBalancer) UpdateState(state balancer.State) { if state.ConnectivityState != connectivity.Ready { b.ClientConn.UpdateState(state) @@ -173,3 +175,7 @@ func (b *nonceBalancer) UpdateState(state balancer.State) { Picker: newPicker(addrToPicker), }) } + +func init() { + balancer.Register(builder{}) +} diff --git a/grpc/noncebalancer/noncebalancer_test.go b/grpc/noncebalancer/noncebalancer_test.go index 62dddbbbf7f..3b4718b9059 100644 --- a/grpc/noncebalancer/noncebalancer_test.go +++ b/grpc/noncebalancer/noncebalancer_test.go @@ -119,7 +119,7 @@ func TestPickerNoSubConnsAvailable(t *testing.T) { test.AssertErrorIs(t, err, balancer.ErrNoSubConnAvailable) } -func setupTest() (*picker, string) { +func setupTest() (*prefixBasedPicker, string) { addr := "10.77.77.77:8080" p := newPicker(map[string]balancer.Picker{addr: &mockPicker{}}) return p, addr diff --git a/grpc/noncebalancerv1/noncebalancer.go b/grpc/noncebalancerv1/noncebalancer.go new file mode 100644 index 00000000000..4867e400dd5 --- /dev/null +++ b/grpc/noncebalancerv1/noncebalancer.go @@ -0,0 +1,124 @@ +package noncebalancer + +import ( + "errors" + "sync" + + "github.com/letsencrypt/boulder/nonce" + + "google.golang.org/grpc/balancer" + "google.golang.org/grpc/balancer/base" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" +) + +const ( + // Name is the name used to register the nonce balancer with the gRPC + // runtime. + Name = "nonce" + + // SRVResolverScheme is the scheme used to invoke an instance of the SRV + // resolver which will use the noncebalancer to pick backends. It would be + // ideal to export this from the SRV resolver package but that package is + // internal. + SRVResolverScheme = "nonce-srv" +) + +// ErrNoBackendsMatchPrefix indicates that no backends were found which match +// the nonce prefix provided in the RPC context. 
This can happen when the +// provided nonce is stale, valid but the backend has since been removed from +// the balancer, or valid but the backend has not yet been added to the +// balancer. +// +// In any case, when the WFE receives this error it will return a badNonce error +// to the ACME client. Note that the WFE uses exact pointer comparison to +// detect that the status it receives is this exact status object, so don't +// wrap this with fmt.Errorf when returning it. +var ErrNoBackendsMatchPrefix = status.New(codes.Unavailable, "no backends match the nonce prefix") +var errMissingPrefixCtxKey = errors.New("nonce.PrefixCtxKey value required in RPC context") +var errMissingHMACKeyCtxKey = errors.New("nonce.HMACKeyCtxKey value required in RPC context") +var errInvalidPrefixCtxKeyType = errors.New("nonce.PrefixCtxKey value in RPC context must be a string") +var errInvalidHMACKeyCtxKeyType = errors.New("nonce.HMACKeyCtxKey value in RPC context must be a byte slice") + +// pickerBuilder implements the base.PickerBuilder interface. It's used to +// create new Picker instances. It should only be used by nonce-service clients. +type pickerBuilder struct{} + +// Build implements the base.PickerBuilder interface. It is called by the gRPC +// runtime when the balancer is first initialized and when the set of backend +// (SubConn) addresses changes. +func (b *pickerBuilder) Build(buildInfo base.PickerBuildInfo) balancer.Picker { + if len(buildInfo.ReadySCs) == 0 { + // The Picker must be rebuilt if there are no backends available. + return base.NewErrPicker(balancer.ErrNoSubConnAvailable) + } + return &picker{ + backends: buildInfo.ReadySCs, + } +} + +// picker implements the balancer.Picker interface. It picks a backend (SubConn) +// based on the nonce prefix contained in each request's Context. 
+type picker struct { + backends map[balancer.SubConn]base.SubConnInfo + prefixToBackend map[string]balancer.SubConn + prefixToBackendOnce sync.Once +} + +// Pick implements the balancer.Picker interface. It is called by the gRPC +// runtime for each RPC message. It is responsible for picking a backend +// (SubConn) based on the context of each RPC message. +func (p *picker) Pick(info balancer.PickInfo) (balancer.PickResult, error) { + if len(p.backends) == 0 { + // This should never happen, the Picker should only be built when there + // are backends available. + return balancer.PickResult{}, balancer.ErrNoSubConnAvailable + } + + // Get the HMAC key from the RPC context. + hmacKeyVal := info.Ctx.Value(nonce.HMACKeyCtxKey{}) + if hmacKeyVal == nil { + // This should never happen. + return balancer.PickResult{}, errMissingHMACKeyCtxKey + } + hmacKey, ok := hmacKeyVal.([]byte) + if !ok { + // This should never happen. + return balancer.PickResult{}, errInvalidHMACKeyCtxKeyType + } + + p.prefixToBackendOnce.Do(func() { + // First call to Pick with a new Picker. + prefixToBackend := make(map[string]balancer.SubConn) + for sc, scInfo := range p.backends { + scPrefix := nonce.DerivePrefix(scInfo.Address.Addr, hmacKey) + prefixToBackend[scPrefix] = sc + } + p.prefixToBackend = prefixToBackend + }) + + // Get the destination prefix from the RPC context. + destPrefixVal := info.Ctx.Value(nonce.PrefixCtxKey{}) + if destPrefixVal == nil { + // This should never happen. + return balancer.PickResult{}, errMissingPrefixCtxKey + } + destPrefix, ok := destPrefixVal.(string) + if !ok { + // This should never happen. + return balancer.PickResult{}, errInvalidPrefixCtxKeyType + } + + sc, ok := p.prefixToBackend[destPrefix] + if !ok { + // No backend SubConn was found for the destination prefix. 
+ return balancer.PickResult{}, ErrNoBackendsMatchPrefix.Err() + } + return balancer.PickResult{SubConn: sc}, nil +} + +func init() { + balancer.Register( + base.NewBalancerBuilder(Name, &pickerBuilder{}, base.Config{}), + ) +} diff --git a/grpc/noncebalancerv1/noncebalancer_test.go b/grpc/noncebalancerv1/noncebalancer_test.go new file mode 100644 index 00000000000..1cade2f52c5 --- /dev/null +++ b/grpc/noncebalancerv1/noncebalancer_test.go @@ -0,0 +1,126 @@ +package noncebalancer + +import ( + "context" + "testing" + + "google.golang.org/grpc/balancer" + "google.golang.org/grpc/balancer/base" + "google.golang.org/grpc/resolver" + + "github.com/letsencrypt/boulder/nonce" + "github.com/letsencrypt/boulder/test" +) + +func TestPickerPicksCorrectBackend(t *testing.T) { + _, p, subConns := setupTest(false) + prefix := nonce.DerivePrefix(subConns[0].addrs[0].Addr, []byte("Kala namak")) + + testCtx := context.WithValue(context.Background(), nonce.PrefixCtxKey{}, "HNmOnt8w") + testCtx = context.WithValue(testCtx, nonce.HMACKeyCtxKey{}, []byte(prefix)) + info := balancer.PickInfo{Ctx: testCtx} + + gotPick, err := p.Pick(info) + test.AssertNotError(t, err, "Pick failed") + test.AssertDeepEquals(t, subConns[0], gotPick.SubConn) +} + +func TestPickerMissingPrefixInCtx(t *testing.T) { + _, p, subConns := setupTest(false) + prefix := nonce.DerivePrefix(subConns[0].addrs[0].Addr, []byte("Kala namak")) + + testCtx := context.WithValue(context.Background(), nonce.HMACKeyCtxKey{}, []byte(prefix)) + info := balancer.PickInfo{Ctx: testCtx} + + gotPick, err := p.Pick(info) + test.AssertErrorIs(t, err, errMissingPrefixCtxKey) + test.AssertNil(t, gotPick.SubConn, "subConn should be nil") +} + +func TestPickerInvalidPrefixInCtx(t *testing.T) { + _, p, _ := setupTest(false) + + testCtx := context.WithValue(context.Background(), nonce.PrefixCtxKey{}, 9) + testCtx = context.WithValue(testCtx, nonce.HMACKeyCtxKey{}, []byte("foobar")) + info := balancer.PickInfo{Ctx: testCtx} + + gotPick, err 
:= p.Pick(info) + test.AssertErrorIs(t, err, errInvalidPrefixCtxKeyType) + test.AssertNil(t, gotPick.SubConn, "subConn should be nil") +} + +func TestPickerMissingHMACKeyInCtx(t *testing.T) { + _, p, _ := setupTest(false) + + testCtx := context.WithValue(context.Background(), nonce.PrefixCtxKey{}, "HNmOnt8w") + info := balancer.PickInfo{Ctx: testCtx} + + gotPick, err := p.Pick(info) + test.AssertErrorIs(t, err, errMissingHMACKeyCtxKey) + test.AssertNil(t, gotPick.SubConn, "subConn should be nil") +} + +func TestPickerInvalidHMACKeyInCtx(t *testing.T) { + _, p, _ := setupTest(false) + + testCtx := context.WithValue(context.Background(), nonce.PrefixCtxKey{}, "HNmOnt8w") + testCtx = context.WithValue(testCtx, nonce.HMACKeyCtxKey{}, 9) + info := balancer.PickInfo{Ctx: testCtx} + + gotPick, err := p.Pick(info) + test.AssertErrorIs(t, err, errInvalidHMACKeyCtxKeyType) + test.AssertNil(t, gotPick.SubConn, "subConn should be nil") +} + +func TestPickerNoMatchingSubConnAvailable(t *testing.T) { + _, p, subConns := setupTest(false) + prefix := nonce.DerivePrefix(subConns[0].addrs[0].Addr, []byte("Kala namak")) + + testCtx := context.WithValue(context.Background(), nonce.PrefixCtxKey{}, "rUsTrUin") + testCtx = context.WithValue(testCtx, nonce.HMACKeyCtxKey{}, []byte(prefix)) + info := balancer.PickInfo{Ctx: testCtx} + + gotPick, err := p.Pick(info) + test.AssertErrorIs(t, err, ErrNoBackendsMatchPrefix.Err()) + test.AssertNil(t, gotPick.SubConn, "subConn should be nil") +} + +func TestPickerNoSubConnsAvailable(t *testing.T) { + b, p, _ := setupTest(true) + b.Build(base.PickerBuildInfo{}) + info := balancer.PickInfo{Ctx: context.Background()} + + gotPick, err := p.Pick(info) + test.AssertErrorIs(t, err, balancer.ErrNoSubConnAvailable) + test.AssertNil(t, gotPick.SubConn, "subConn should be nil") +} + +func setupTest(noSubConns bool) (*pickerBuilder, balancer.Picker, []*subConn) { + var subConns []*subConn + bi := base.PickerBuildInfo{ + ReadySCs: 
make(map[balancer.SubConn]base.SubConnInfo), + } + + sc := &subConn{} + addr := resolver.Address{Addr: "10.77.77.77:8080"} + sc.UpdateAddresses([]resolver.Address{addr}) + + if !noSubConns { + bi.ReadySCs[sc] = base.SubConnInfo{Address: addr} + subConns = append(subConns, sc) + } + + b := &pickerBuilder{} + p := b.Build(bi) + return b, p, subConns +} + +// subConn is a test mock which implements the balancer.SubConn interface. +type subConn struct { + balancer.SubConn + addrs []resolver.Address +} + +func (s *subConn) UpdateAddresses(addrs []resolver.Address) { + s.addrs = addrs +} diff --git a/test/config-next/wfe2.json b/test/config-next/wfe2.json index 6c14df15a83..0009838809c 100644 --- a/test/config-next/wfe2.json +++ b/test/config-next/wfe2.json @@ -74,7 +74,7 @@ "domain": "service.consul" } ], - "srvResolver": "nonce-srv", + "srvResolver": "nonce-srv-v2", "timeout": "15s", "noWaitForReady": true, "hostOverride": "nonce.boulder" diff --git a/test/config/wfe2.json b/test/config/wfe2.json index aede35e094a..5f90ed203af 100644 --- a/test/config/wfe2.json +++ b/test/config/wfe2.json @@ -66,7 +66,7 @@ "domain": "service.consul" } ], - "srvResolver": "nonce-srv", + "srvResolver": "nonce-srv-v2", "timeout": "15s", "noWaitForReady": true, "hostOverride": "nonce.boulder" diff --git a/test/integration/nonce_test.go b/test/integration/nonce_test.go index 8475463aff4..dc843579e41 100644 --- a/test/integration/nonce_test.go +++ b/test/integration/nonce_test.go @@ -4,7 +4,9 @@ package integration import ( "context" + "google.golang.org/protobuf/types/known/emptypb" "testing" + "time" "github.com/jmhodges/clock" "google.golang.org/grpc/status" @@ -46,6 +48,11 @@ func TestNonceBalancer_NoBackendMatchingPrefix(t *testing.T) { clk := clock.New() + getNonceConn, err := bgrpc.ClientSetup(c.NotWFE.GetNonceService, tlsConfig, metrics.NoopRegisterer, clk) + test.AssertNotError(t, err, "Failed to load credentials and create gRPC connection to get nonce service") + + gnc := 
nonce.NewGetter(getNonceConn) + redeemNonceConn, err := bgrpc.ClientSetup(c.NotWFE.RedeemNonceService, tlsConfig, metrics.NoopRegisterer, clk) test.AssertNotError(t, err, "Failed to load credentials and create gRPC connection to redeem nonce service") rnc := nonce.NewRedeemer(redeemNonceConn) @@ -58,5 +65,24 @@ func TestNonceBalancer_NoBackendMatchingPrefix(t *testing.T) { // We expect to get a specific gRPC status error with code NotFound. gotRPCStatus, ok := status.FromError(err) test.Assert(t, ok, "Failed to convert error to status") - test.AssertEquals(t, gotRPCStatus, nb.ErrNoBackendsMatchPrefix) + if gotRPCStatus != nb.ErrNoBackendsMatchPrefix && gotRPCStatus != nbv1.ErrNoBackendsMatchPrefix { + t.Errorf("redeeming nonce with unknown prefix: got %v, want %v", gotRPCStatus, nb.ErrNoBackendsMatchPrefix) + } + + var nonces []*noncepb.NonceMessage + for i := 0; i < 300; i++ { + nonceMsg, err := gnc.Nonce(ctx, &emptypb.Empty{}) + test.AssertNotError(t, err, "getting nonce") + + nonces = append(nonces, nonceMsg) + } + + for _, nonceMsg := range nonces { + ctx := context.WithValue(ctx, nonce.PrefixCtxKey{}, nonceMsg.Nonce[:nonce.PrefixLen]) + ctx = context.WithValue(ctx, nonce.HMACKeyCtxKey{}, rncKey) + + _, err = rnc.Redeem(ctx, &noncepb.NonceMessage{Nonce: nonceMsg.Nonce}) + test.AssertNotError(t, err, "redeeming nonce") + time.Sleep(10 * time.Millisecond) + } } diff --git a/test/integration/testdata/nonce-client.json b/test/integration/testdata/nonce-client.json index a66077e2690..2e9b0c054e1 100644 --- a/test/integration/testdata/nonce-client.json +++ b/test/integration/testdata/nonce-client.json @@ -27,7 +27,7 @@ "domain": "service.consul" } ], - "srvResolver": "nonce-srv", + "srvResolver": "nonce-srv-v2", "timeout": "15s", "noWaitForReady": true, "hostOverride": "nonce.boulder" diff --git a/wfe2/verify.go b/wfe2/verify.go index c7431d1c6c4..88ff4864f15 100644 --- a/wfe2/verify.go +++ b/wfe2/verify.go @@ -24,6 +24,7 @@ import ( 
"github.com/letsencrypt/boulder/goodkey" "github.com/letsencrypt/boulder/grpc" nb "github.com/letsencrypt/boulder/grpc/noncebalancer" + nbv1 "github.com/letsencrypt/boulder/grpc/noncebalancerv1" "github.com/letsencrypt/boulder/nonce" noncepb "github.com/letsencrypt/boulder/nonce/proto" sapb "github.com/letsencrypt/boulder/sa/proto" @@ -228,7 +229,7 @@ func (wfe *WebFrontEndImpl) validNonce(ctx context.Context, header jose.Header) resp, err := wfe.rnc.Redeem(ctx, &noncepb.NonceMessage{Nonce: header.Nonce}) if err != nil { rpcStatus, ok := status.FromError(err) - if ok && rpcStatus == nb.ErrNoBackendsMatchPrefix { + if ok && (rpcStatus == nb.ErrNoBackendsMatchPrefix || rpcStatus == nbv1.ErrNoBackendsMatchPrefix) { // Getting our sentinel ErrNoBackendsMatchPrefix status.Status means that // the nonce backend which issued this nonce is presently unreachable or // unrecognized by this WFE. As this is a transient failure, the client From 8fa3ca962f51014d4a0940df249ff6daae19fcf6 Mon Sep 17 00:00:00 2001 From: Jacob Hoffman-Andrews Date: Thu, 12 Mar 2026 14:52:59 -0700 Subject: [PATCH 3/5] noncebalancer: integration test reconnects Set maxConnectionAge to 1s, and make nonce_test.go collect 300 nonces, then redeem them one at a time, separated by 10ms. This creates a high likelihood of a redemption request occuring during a reconnect. 
--- test/config-next/nonce-a.json | 2 +- test/config-next/nonce-b.json | 2 +- test/integration/nonce_test.go | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/test/config-next/nonce-a.json b/test/config-next/nonce-a.json index 0b1f8c13519..516c05357b4 100644 --- a/test/config-next/nonce-a.json +++ b/test/config-next/nonce-a.json @@ -13,7 +13,7 @@ "sampleratio": 1 }, "grpc": { - "maxConnectionAge": "30m", + "maxConnectionAge": "1s", "services": { "nonce.NonceService": { "clientNames": [ diff --git a/test/config-next/nonce-b.json b/test/config-next/nonce-b.json index 0b1f8c13519..516c05357b4 100644 --- a/test/config-next/nonce-b.json +++ b/test/config-next/nonce-b.json @@ -13,7 +13,7 @@ "sampleratio": 1 }, "grpc": { - "maxConnectionAge": "30m", + "maxConnectionAge": "1s", "services": { "nonce.NonceService": { "clientNames": [ diff --git a/test/integration/nonce_test.go b/test/integration/nonce_test.go index dc843579e41..f836bede43f 100644 --- a/test/integration/nonce_test.go +++ b/test/integration/nonce_test.go @@ -4,6 +4,7 @@ package integration import ( "context" + nbv1 "github.com/letsencrypt/boulder/grpc/noncebalancerv1" "google.golang.org/protobuf/types/known/emptypb" "testing" "time" From 6726cfc36586a1742ce4b3fc139afd22691f3a97 Mon Sep 17 00:00:00 2001 From: Jacob Hoffman-Andrews Date: Tue, 17 Mar 2026 12:57:42 -0700 Subject: [PATCH 4/5] Fix import grouping --- grpc/internal/resolver/dns/dns_resolver.go | 5 +++-- grpc/noncebalancer/noncebalancer.go | 10 +++++----- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/grpc/internal/resolver/dns/dns_resolver.go b/grpc/internal/resolver/dns/dns_resolver.go index eb5f2e014a5..e9445e0f768 100644 --- a/grpc/internal/resolver/dns/dns_resolver.go +++ b/grpc/internal/resolver/dns/dns_resolver.go @@ -34,12 +34,13 @@ import ( "sync" "time" + "google.golang.org/grpc/grpclog" + "google.golang.org/grpc/resolver" + "github.com/letsencrypt/boulder/bdns" 
"github.com/letsencrypt/boulder/grpc/internal/backoff" "github.com/letsencrypt/boulder/grpc/noncebalancer" noncebalancerv1 "github.com/letsencrypt/boulder/grpc/noncebalancerv1" - "google.golang.org/grpc/grpclog" - "google.golang.org/grpc/resolver" ) var logger = grpclog.Component("srv") diff --git a/grpc/noncebalancer/noncebalancer.go b/grpc/noncebalancer/noncebalancer.go index 034cda93fe6..e6d7b43001c 100644 --- a/grpc/noncebalancer/noncebalancer.go +++ b/grpc/noncebalancer/noncebalancer.go @@ -2,16 +2,16 @@ package noncebalancer import ( "errors" - "google.golang.org/grpc/balancer/endpointsharding" - "google.golang.org/grpc/balancer/pickfirst" - "google.golang.org/grpc/connectivity" "sync" - "github.com/letsencrypt/boulder/nonce" - "google.golang.org/grpc/balancer" + "google.golang.org/grpc/balancer/endpointsharding" + "google.golang.org/grpc/balancer/pickfirst" "google.golang.org/grpc/codes" + "google.golang.org/grpc/connectivity" "google.golang.org/grpc/status" + + "github.com/letsencrypt/boulder/nonce" ) const ( From 7c713e8c0fb34cd160946b571935094e6c0853b0 Mon Sep 17 00:00:00 2001 From: Jacob Hoffman-Andrews Date: Thu, 19 Mar 2026 11:44:47 -0700 Subject: [PATCH 5/5] review feedback --- grpc/internal/resolver/dns/dns_resolver.go | 2 +- grpc/noncebalancer/noncebalancer.go | 2 +- test/config/wfe2.json | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/grpc/internal/resolver/dns/dns_resolver.go b/grpc/internal/resolver/dns/dns_resolver.go index e9445e0f768..46b20d02a3e 100644 --- a/grpc/internal/resolver/dns/dns_resolver.go +++ b/grpc/internal/resolver/dns/dns_resolver.go @@ -26,7 +26,6 @@ import ( "context" "errors" "fmt" - "google.golang.org/grpc/serviceconfig" "net" "net/netip" "strconv" @@ -36,6 +35,7 @@ import ( "google.golang.org/grpc/grpclog" "google.golang.org/grpc/resolver" + "google.golang.org/grpc/serviceconfig" "github.com/letsencrypt/boulder/bdns" "github.com/letsencrypt/boulder/grpc/internal/backoff" diff --git 
a/grpc/noncebalancer/noncebalancer.go b/grpc/noncebalancer/noncebalancer.go index e6d7b43001c..2a1c10274be 100644 --- a/grpc/noncebalancer/noncebalancer.go +++ b/grpc/noncebalancer/noncebalancer.go @@ -127,7 +127,7 @@ func (p *prefixBasedPicker) Pick(info balancer.PickInfo) (balancer.PickResult, e // builder builds a nonceBalancer, which internally uses `endpointsharding.NewBalancer`. // // The embedded `endpointsharding` balancer manages a set of child pickers that all use -// `pickfirst` on an endpoint that consists of a single IP address (because our `"nonce-srv"` +// `pickfirst` on an endpoint that consists of a single IP address (because our `"nonce-srv-v2"` // resolver returns single-IP endpoints). type builder struct{} diff --git a/test/config/wfe2.json b/test/config/wfe2.json index 5f90ed203af..aede35e094a 100644 --- a/test/config/wfe2.json +++ b/test/config/wfe2.json @@ -66,7 +66,7 @@ "domain": "service.consul" } ], - "srvResolver": "nonce-srv-v2", + "srvResolver": "nonce-srv", "timeout": "15s", "noWaitForReady": true, "hostOverride": "nonce.boulder"