Skip to content

Commit e5de061

Browse files
va: Add experimental VA for testing Hickory
1 parent 429d8b5 commit e5de061

5 files changed

Lines changed: 294 additions & 26 deletions

File tree

cmd/boulder-va/main.go

Lines changed: 68 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import (
77
"time"
88

99
"github.com/jmhodges/clock"
10+
"github.com/prometheus/client_golang/prometheus"
1011

1112
"github.com/letsencrypt/boulder/bdns"
1213
"github.com/letsencrypt/boulder/cmd"
@@ -57,7 +58,23 @@ type Config struct {
5758
// when the VA first gets a quorum of (un)successful remote results.
5859
// Leaving this value zero means the VA won't early-cancel slow remotes.
5960
SlowRemoteTimeout config.Duration
60-
Features features.Config
61+
62+
// ExperimentalVA configures an optional parallel VA that shadows the
63+
// primary VA's DCV and CAA checks using an alternative DNS resolver,
64+
// emitting comparison metrics without affecting the real validation
65+
// decision.
66+
ExperimentalVA *struct {
67+
// DNSProvider is the dynamic DNS provider config for the
68+
// experimental VA's resolver.
69+
DNSProvider *cmd.DNSProvider `validate:"required"`
70+
// DNSTimeout is the timeout for DNS queries. Defaults to the
71+
// primary VA's DNSTimeout if unset.
72+
DNSTimeout config.Duration `validate:"omitempty"`
73+
// SampleRate controls the rate of validations that are shadowed
74+
// (0.0 to 1.0). A value of 0 disables shadowing.
75+
SampleRate float64 `validate:"min=0,max=1"`
76+
}
77+
Features features.Config
6178
}
6279

6380
Syslog cmd.SyslogConfig
@@ -130,6 +147,54 @@ func main() {
130147
}
131148
}
132149

150+
var experimentalVA *va.ValidationAuthorityImpl
151+
var experimentalVASampleRate float64
152+
if c.VA.ExperimentalVA != nil {
153+
servers, err := bdns.StartDynamicProvider(c.VA.ExperimentalVA.DNSProvider, 60*time.Second, "tcp")
154+
cmd.FailOnError(err, "Couldn't start experimental dynamic DNS server resolver")
155+
defer servers.Stop()
156+
157+
dnsTimeout := c.VA.ExperimentalVA.DNSTimeout.Duration
158+
if dnsTimeout <= 0 {
159+
dnsTimeout = c.VA.DNSTimeout.Duration
160+
}
161+
162+
// Prefix experimental VA metrics to avoid metric name collisions with
163+
// the primary VA.
164+
scope := prometheus.WrapRegistererWithPrefix("experimental_", scope)
165+
166+
resolver := bdns.New(
167+
dnsTimeout,
168+
servers,
169+
scope,
170+
clk,
171+
c.VA.DNSTries,
172+
c.VA.UserAgent,
173+
logger,
174+
tlsConfig,
175+
)
176+
177+
experimentalVA, err = va.NewValidationAuthorityImpl(
178+
resolver,
179+
nil,
180+
c.VA.UserAgent,
181+
c.VA.IssuerDomain,
182+
scope,
183+
clk,
184+
logger,
185+
c.VA.AccountURIPrefixes,
186+
"Experimental",
187+
"",
188+
iana.IsReservedAddr,
189+
0,
190+
c.VA.DNSAllowLoopbackAddresses,
191+
nil,
192+
0,
193+
)
194+
cmd.FailOnError(err, "Unable to create experimental VA")
195+
experimentalVASampleRate = c.VA.ExperimentalVA.SampleRate
196+
}
197+
133198
vai, err := va.NewValidationAuthorityImpl(
134199
resolver,
135200
remotes,
@@ -144,6 +209,8 @@ func main() {
144209
iana.IsReservedAddr,
145210
c.VA.SlowRemoteTimeout.Duration,
146211
c.VA.DNSAllowLoopbackAddresses,
212+
experimentalVA,
213+
experimentalVASampleRate,
147214
)
148215
cmd.FailOnError(err, "Unable to create VA server")
149216

cmd/remoteva/main.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,8 @@ func main() {
130130
iana.IsReservedAddr,
131131
0,
132132
c.RVA.DNSAllowLoopbackAddresses,
133+
nil,
134+
0,
133135
)
134136
cmd.FailOnError(err, "Unable to create Remote-VA server")
135137

va/caa.go

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,25 @@ func (va *ValidationAuthorityImpl) DoCAA(ctx context.Context, req *vapb.IsCAAVal
103103
logEvent.InternalError = err.Error()
104104
prob = detailedError(err)
105105
prob.Detail = fmt.Sprintf("While processing CAA for %s: %s", ident.Value, prob.Detail)
106-
return bgrpc.CAAResultToPB(filterProblemDetails(prob), va.perspective, va.rir)
106+
}
107+
108+
var localResult remoteResult
109+
if va.shouldDispatchExperiment() {
110+
defer func() {
111+
va.dispatchExperiment(opCAA, localResult, func(ctx context.Context) (remoteResult, error) {
112+
return va.experimentalVA.DoCAA(ctx, req)
113+
})
114+
}()
115+
}
116+
117+
// Capture the local validation result for experimental resolver comparison
118+
// before MPIC can influence the outcome.
119+
localResult, err = bgrpc.CAAResultToPB(filterProblemDetails(prob), va.perspective, va.rir)
120+
if err != nil {
121+
return nil, err
122+
}
123+
if prob != nil {
124+
return localResult.(*vapb.IsCAAValidResponse), nil
107125
}
108126

109127
if va.isPrimaryVA() {

va/va.go

Lines changed: 92 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,11 @@ type vaMetrics struct {
113113
http01Redirects prometheus.Counter
114114
caaCounter *prometheus.CounterVec
115115
ipv4FallbackCounter prometheus.Counter
116+
// experimentConcurrence tracks whether the primary and experimental VAs
117+
// reached the same outcome. It's labelled by:
118+
// - operation: [dcv|caa]
119+
// - concurrence: [true|false]
120+
experimentConcurrence *prometheus.CounterVec
116121
}
117122

118123
func initMetrics(stats prometheus.Registerer) *vaMetrics {
@@ -145,6 +150,10 @@ func initMetrics(stats prometheus.Registerer) *vaMetrics {
145150
Name: "tls_alpn_ipv4_fallback",
146151
Help: "A counter of IPv4 fallbacks during TLS ALPN validation",
147152
})
153+
experimentConcurrence := promauto.With(stats).NewCounterVec(prometheus.CounterOpts{
154+
Name: "experiment_concurrence",
155+
Help: "Count of validations where the experimental VA did or did not concur with the primary VA",
156+
}, []string{"operation", "concurrence"})
148157

149158
return &vaMetrics{
150159
validationLatency: validationLatency,
@@ -154,6 +163,7 @@ func initMetrics(stats prometheus.Registerer) *vaMetrics {
154163
http01Redirects: http01Redirects,
155164
caaCounter: caaCounter,
156165
ipv4FallbackCounter: ipv4FallbackCounter,
166+
experimentConcurrence: experimentConcurrence,
157167
}
158168
}
159169

@@ -188,23 +198,25 @@ func newDefaultPortConfig() *portConfig {
188198
type ValidationAuthorityImpl struct {
189199
vapb.UnsafeVAServer
190200
vapb.UnsafeCAAServer
191-
log blog.Logger
192-
dnsClient bdns.Client
193-
issuerDomain string
194-
httpPort int
195-
httpsPort int
196-
tlsPort int
197-
userAgent string
198-
clk clock.Clock
199-
remoteVAs []RemoteVA
200-
maxRemoteFailures int
201-
accountURIPrefixes []string
202-
singleDialTimeout time.Duration
203-
slowRemoteTimeout time.Duration
204-
perspective string
205-
rir string
206-
isReservedIPFunc func(netip.Addr) error
207-
allowRestrictedAddrs bool
201+
log blog.Logger
202+
dnsClient bdns.Client
203+
issuerDomain string
204+
httpPort int
205+
httpsPort int
206+
tlsPort int
207+
userAgent string
208+
clk clock.Clock
209+
remoteVAs []RemoteVA
210+
maxRemoteFailures int
211+
accountURIPrefixes []string
212+
singleDialTimeout time.Duration
213+
slowRemoteTimeout time.Duration
214+
perspective string
215+
rir string
216+
isReservedIPFunc func(netip.Addr) error
217+
allowRestrictedAddrs bool
218+
experimentalVA *ValidationAuthorityImpl
219+
experimentalVASampleRate float64
208220

209221
metrics *vaMetrics
210222
}
@@ -227,6 +239,8 @@ func NewValidationAuthorityImpl(
227239
reservedIPChecker func(netip.Addr) error,
228240
slowRemoteTimeout time.Duration,
229241
allowRestrictedAddrs bool,
242+
experimentalVA *ValidationAuthorityImpl,
243+
experimentalVASampleRate float64,
230244
) (*ValidationAuthorityImpl, error) {
231245

232246
if len(accountURIPrefixes) == 0 {
@@ -268,17 +282,53 @@ func NewValidationAuthorityImpl(
268282
// before timing out. This timeout ignores the base RPC timeout and is strictly
269283
// used for the DialContext operations that take place during an
270284
// HTTP-01 challenge validation.
271-
singleDialTimeout: 10 * time.Second,
272-
slowRemoteTimeout: slowRemoteTimeout,
273-
perspective: perspective,
274-
rir: rir,
275-
isReservedIPFunc: reservedIPChecker,
276-
allowRestrictedAddrs: allowRestrictedAddrs,
285+
singleDialTimeout: 10 * time.Second,
286+
slowRemoteTimeout: slowRemoteTimeout,
287+
perspective: perspective,
288+
rir: rir,
289+
isReservedIPFunc: reservedIPChecker,
290+
allowRestrictedAddrs: allowRestrictedAddrs,
291+
experimentalVA: experimentalVA,
292+
experimentalVASampleRate: experimentalVASampleRate,
277293
}
278294

279295
return va, nil
280296
}
281297

298+
func (va *ValidationAuthorityImpl) shouldDispatchExperiment() bool {
299+
return va.experimentalVA != nil && rand.Float64() < va.experimentalVASampleRate
300+
}
301+
302+
func (va *ValidationAuthorityImpl) dispatchExperiment(operation string, primary remoteResult, experimentFunc func(context.Context) (remoteResult, error)) {
303+
go func() {
304+
ctx, cancel := context.WithTimeout(context.Background(), va.slowRemoteTimeout)
305+
defer cancel()
306+
307+
experimentResult, err := experimentFunc(ctx)
308+
309+
primaryPassed := primary.GetProblem() == nil
310+
experimentPassed := (err == nil) && (experimentResult.GetProblem() == nil)
311+
312+
if primaryPassed == experimentPassed {
313+
va.metrics.experimentConcurrence.WithLabelValues(operation, "true").Inc()
314+
return
315+
}
316+
va.metrics.experimentConcurrence.WithLabelValues(operation, "false").Inc()
317+
318+
logArgs := map[string]any{
319+
"operation": operation,
320+
"primaryPassed": primaryPassed,
321+
"primaryResult": primary,
322+
"experimentPassed": experimentPassed,
323+
"experimentResult": experimentResult,
324+
}
325+
if err != nil {
326+
logArgs["experimentErr"] = err.Error()
327+
}
328+
va.log.AuditInfo("Primary VA disagreed with experimental VA", logArgs)
329+
}()
330+
}
331+
282332
// maxAllowedFailures returns the maximum number of allowed failures
283333
// for a given number of remote perspectives, according to the "Quorum
284334
// Requirements" table in BRs Section 3.2.2.9, as follows:
@@ -767,7 +817,25 @@ func (va *ValidationAuthorityImpl) DoDCV(ctx context.Context, req *vapb.PerformV
767817
if err != nil {
768818
logEvent.InternalError = err.Error()
769819
prob = detailedError(err)
770-
return bgrpc.ValidationResultToPB(records, filterProblemDetails(prob), va.perspective, va.rir)
820+
}
821+
822+
var localResult remoteResult
823+
if va.shouldDispatchExperiment() {
824+
defer func() {
825+
va.dispatchExperiment(opDCV, localResult, func(ctx context.Context) (remoteResult, error) {
826+
return va.experimentalVA.DoDCV(ctx, req)
827+
})
828+
}()
829+
}
830+
831+
// Capture the local validation result for experimental resolver comparison
832+
// before MPIC can influence the outcome.
833+
localResult, err = bgrpc.ValidationResultToPB(records, filterProblemDetails(prob), va.perspective, va.rir)
834+
if err != nil {
835+
return nil, err
836+
}
837+
if prob != nil {
838+
return localResult.(*vapb.ValidationResult), nil
771839
}
772840

773841
if va.isPrimaryVA() {

0 commit comments

Comments
 (0)