Skip to content

Commit d51bd6a

Browse files
va: Add experimental VA for testing Hickory (#8688)
1 parent 9872323 commit d51bd6a

5 files changed

Lines changed: 318 additions & 26 deletions

File tree

cmd/boulder-va/main.go

Lines changed: 76 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,7 @@ import (
77
"time"
88

99
"github.com/jmhodges/clock"
10+
"github.com/prometheus/client_golang/prometheus"
1011

1112
"github.com/letsencrypt/boulder/bdns"
1213
"github.com/letsencrypt/boulder/cmd"
@@ -57,7 +58,27 @@ type Config struct {
5758
// when the VA first gets a quorum of (un)successful remote results.
5859
// Leaving this value zero means the VA won't early-cancel slow remotes.
5960
SlowRemoteTimeout config.Duration
60-
Features features.Config
61+
62+
// ExperimentalVA configures an optional parallel VA that repeats the
63+
// primary VA's DCV and CAA checks using an alternative DNS resolver,
64+
// emitting comparison metrics without affecting the real validation
65+
// decision.
66+
ExperimentalVA *struct {
67+
// DNSProvider is the dynamic DNS provider config for the
68+
// experimental VA's resolver.
69+
DNSProvider *cmd.DNSProvider `validate:"required"`
70+
// DNSTimeout is the timeout for DNS queries. Defaults to the
71+
// primary VA's DNSTimeout if unset.
72+
DNSTimeout config.Duration `validate:"omitempty"`
73+
// SampleRate controls the rate of validations that are repeated
74+
// (0.0 to 1.0). A value of 0 disables it entirely, while 1 repeats
75+
// all validations.
76+
SampleRate float64 `validate:"min=0,max=1"`
77+
// Timeout is the timeout for experimental validation operations.
78+
// This should be configured to match the RA->VA timeout.
79+
Timeout config.Duration `validate:"required"`
80+
}
81+
Features features.Config
6182
}
6283

6384
Syslog cmd.SyslogConfig
@@ -130,6 +151,57 @@ func main() {
130151
}
131152
}
132153

154+
var experimentalVA *va.ValidationAuthorityImpl
155+
var experimentalVASampleRate float64
156+
var experimentalVATimeout time.Duration
157+
if c.VA.ExperimentalVA != nil {
158+
servers, err := bdns.StartDynamicProvider(c.VA.ExperimentalVA.DNSProvider, 60*time.Second, "tcp")
159+
cmd.FailOnError(err, "Couldn't start experimental dynamic DNS server resolver")
160+
defer servers.Stop()
161+
162+
dnsTimeout := c.VA.ExperimentalVA.DNSTimeout.Duration
163+
if dnsTimeout <= 0 {
164+
dnsTimeout = c.VA.DNSTimeout.Duration
165+
}
166+
167+
// Prefix experimental VA metrics to avoid metric name collisions with
168+
// the primary VA.
169+
scope := prometheus.WrapRegistererWithPrefix("experimental_", scope)
170+
171+
resolver := bdns.New(
172+
dnsTimeout,
173+
servers,
174+
scope,
175+
clk,
176+
c.VA.DNSTries,
177+
c.VA.UserAgent,
178+
logger,
179+
tlsConfig,
180+
)
181+
182+
experimentalVA, err = va.NewValidationAuthorityImpl(
183+
resolver,
184+
nil,
185+
c.VA.UserAgent,
186+
c.VA.IssuerDomain,
187+
scope,
188+
clk,
189+
logger,
190+
c.VA.AccountURIPrefixes,
191+
"Experimental",
192+
"",
193+
iana.IsReservedAddr,
194+
0,
195+
c.VA.DNSAllowLoopbackAddresses,
196+
nil,
197+
0,
198+
0,
199+
)
200+
cmd.FailOnError(err, "Unable to create experimental VA")
201+
experimentalVASampleRate = c.VA.ExperimentalVA.SampleRate
202+
experimentalVATimeout = c.VA.ExperimentalVA.Timeout.Duration
203+
}
204+
133205
vai, err := va.NewValidationAuthorityImpl(
134206
resolver,
135207
remotes,
@@ -144,6 +216,9 @@ func main() {
144216
iana.IsReservedAddr,
145217
c.VA.SlowRemoteTimeout.Duration,
146218
c.VA.DNSAllowLoopbackAddresses,
219+
experimentalVA,
220+
experimentalVASampleRate,
221+
experimentalVATimeout,
147222
)
148223
cmd.FailOnError(err, "Unable to create VA server")
149224

cmd/remoteva/main.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,9 @@ func main() {
130130
iana.IsReservedAddr,
131131
0,
132132
c.RVA.DNSAllowLoopbackAddresses,
133+
nil,
134+
0,
135+
0,
133136
)
134137
cmd.FailOnError(err, "Unable to create Remote-VA server")
135138

va/caa.go

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,27 @@ func (va *ValidationAuthorityImpl) DoCAA(ctx context.Context, req *vapb.IsCAAVal
103103
logEvent.InternalError = err.Error()
104104
prob = detailedError(err)
105105
prob.Detail = fmt.Sprintf("While processing CAA for %s: %s", ident.Value, prob.Detail)
106-
return bgrpc.CAAResultToPB(filterProblemDetails(prob), va.perspective, va.rir)
106+
}
107+
108+
// Capture the local validation result for experimental resolver comparison
109+
// before MPIC can influence the outcome.
110+
localResult, err := bgrpc.CAAResultToPB(filterProblemDetails(prob), va.perspective, va.rir)
111+
if err != nil {
112+
return nil, err
113+
}
114+
115+
if va.shouldRunExperiment() {
116+
go va.runExperiment(
117+
ctx,
118+
opCAA,
119+
proto.Clone(localResult).(*vapb.IsCAAValidResponse),
120+
func(ctx context.Context) (remoteResult, error) {
121+
return va.experimentalVA.DoCAA(ctx, req)
122+
})
123+
}
124+
125+
if prob != nil {
126+
return localResult, nil
107127
}
108128

109129
if va.isPrimaryVA() {

va/va.go

Lines changed: 99 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,11 @@ type vaMetrics struct {
113113
http01Redirects prometheus.Counter
114114
caaCounter *prometheus.CounterVec
115115
ipv4FallbackCounter prometheus.Counter
116+
// experimentConcurrence tracks whether the primary and experimental VAs
117+
// reached the same outcome. It's labelled by:
118+
// - operation: [dcv|caa]
119+
// - concurrence: [true|false]
120+
experimentConcurrence *prometheus.CounterVec
116121
}
117122

118123
func initMetrics(stats prometheus.Registerer) *vaMetrics {
@@ -145,6 +150,10 @@ func initMetrics(stats prometheus.Registerer) *vaMetrics {
145150
Name: "tls_alpn_ipv4_fallback",
146151
Help: "A counter of IPv4 fallbacks during TLS ALPN validation",
147152
})
153+
experimentConcurrence := promauto.With(stats).NewCounterVec(prometheus.CounterOpts{
154+
Name: "experiment_concurrence",
155+
Help: "Count of validations where the experimental VA did or did not concur with the primary VA",
156+
}, []string{"operation", "concurrence"})
148157

149158
return &vaMetrics{
150159
validationLatency: validationLatency,
@@ -154,6 +163,7 @@ func initMetrics(stats prometheus.Registerer) *vaMetrics {
154163
http01Redirects: http01Redirects,
155164
caaCounter: caaCounter,
156165
ipv4FallbackCounter: ipv4FallbackCounter,
166+
experimentConcurrence: experimentConcurrence,
157167
}
158168
}
159169

@@ -188,23 +198,26 @@ func newDefaultPortConfig() *portConfig {
188198
type ValidationAuthorityImpl struct {
189199
vapb.UnsafeVAServer
190200
vapb.UnsafeCAAServer
191-
log blog.Logger
192-
dnsClient bdns.Client
193-
issuerDomain string
194-
httpPort int
195-
httpsPort int
196-
tlsPort int
197-
userAgent string
198-
clk clock.Clock
199-
remoteVAs []RemoteVA
200-
maxRemoteFailures int
201-
accountURIPrefixes []string
202-
singleDialTimeout time.Duration
203-
slowRemoteTimeout time.Duration
204-
perspective string
205-
rir string
206-
isReservedIPFunc func(netip.Addr) error
207-
allowRestrictedAddrs bool
201+
log blog.Logger
202+
dnsClient bdns.Client
203+
issuerDomain string
204+
httpPort int
205+
httpsPort int
206+
tlsPort int
207+
userAgent string
208+
clk clock.Clock
209+
remoteVAs []RemoteVA
210+
maxRemoteFailures int
211+
accountURIPrefixes []string
212+
singleDialTimeout time.Duration
213+
slowRemoteTimeout time.Duration
214+
perspective string
215+
rir string
216+
isReservedIPFunc func(netip.Addr) error
217+
allowRestrictedAddrs bool
218+
experimentalVA *ValidationAuthorityImpl
219+
experimentalVASampleRate float64
220+
experimentalVATimeout time.Duration
208221

209222
metrics *vaMetrics
210223
}
@@ -227,6 +240,9 @@ func NewValidationAuthorityImpl(
227240
reservedIPChecker func(netip.Addr) error,
228241
slowRemoteTimeout time.Duration,
229242
allowRestrictedAddrs bool,
243+
experimentalVA *ValidationAuthorityImpl,
244+
experimentalVASampleRate float64,
245+
experimentalVATimeout time.Duration,
230246
) (*ValidationAuthorityImpl, error) {
231247

232248
if len(accountURIPrefixes) == 0 {
@@ -268,17 +284,56 @@ func NewValidationAuthorityImpl(
268284
// before timing out. This timeout ignores the base RPC timeout and is strictly
269285
// used for the DialContext operations that take place during an
270286
// HTTP-01 challenge validation.
271-
singleDialTimeout: 10 * time.Second,
272-
slowRemoteTimeout: slowRemoteTimeout,
273-
perspective: perspective,
274-
rir: rir,
275-
isReservedIPFunc: reservedIPChecker,
276-
allowRestrictedAddrs: allowRestrictedAddrs,
287+
singleDialTimeout: 10 * time.Second,
288+
slowRemoteTimeout: slowRemoteTimeout,
289+
perspective: perspective,
290+
rir: rir,
291+
isReservedIPFunc: reservedIPChecker,
292+
allowRestrictedAddrs: allowRestrictedAddrs,
293+
experimentalVA: experimentalVA,
294+
experimentalVASampleRate: experimentalVASampleRate,
295+
experimentalVATimeout: experimentalVATimeout,
277296
}
278297

279298
return va, nil
280299
}
281300

301+
func (va *ValidationAuthorityImpl) shouldRunExperiment() bool {
302+
return va.experimentalVA != nil && rand.Float64() < va.experimentalVASampleRate
303+
}
304+
305+
// runExperiment compares the primary VA's local result against the experimental
306+
// VA's result and records a concurrence metric. On disagreement, it logs a
307+
// structured event with both results. The primary argument must be non-nil.
308+
// Callers should invoke this in a goroutine.
309+
func (va *ValidationAuthorityImpl) runExperiment(ctx context.Context, operation string, primary remoteResult, experimentFunc func(context.Context) (remoteResult, error)) {
310+
ctx, cancel := context.WithTimeout(context.WithoutCancel(ctx), va.experimentalVATimeout)
311+
defer cancel()
312+
313+
experimentResult, err := experimentFunc(ctx)
314+
315+
primaryPassed := primary.GetProblem() == nil
316+
experimentPassed := (err == nil) && (experimentResult.GetProblem() == nil)
317+
318+
if primaryPassed == experimentPassed {
319+
va.metrics.experimentConcurrence.WithLabelValues(operation, "true").Inc()
320+
return
321+
}
322+
va.metrics.experimentConcurrence.WithLabelValues(operation, "false").Inc()
323+
324+
logArgs := map[string]any{
325+
"operation": operation,
326+
"primaryPassed": primaryPassed,
327+
"primaryResult": primary,
328+
"experimentPassed": experimentPassed,
329+
"experimentResult": experimentResult,
330+
}
331+
if err != nil {
332+
logArgs["experimentErr"] = err.Error()
333+
}
334+
va.log.AuditInfo("Primary VA disagreed with experimental VA", logArgs)
335+
}
336+
282337
// maxAllowedFailures returns the maximum number of allowed failures
283338
// for a given number of remote perspectives, according to the "Quorum
284339
// Requirements" table in BRs Section 3.2.2.9, as follows:
@@ -767,7 +822,27 @@ func (va *ValidationAuthorityImpl) DoDCV(ctx context.Context, req *vapb.PerformV
767822
if err != nil {
768823
logEvent.InternalError = err.Error()
769824
prob = detailedError(err)
770-
return bgrpc.ValidationResultToPB(records, filterProblemDetails(prob), va.perspective, va.rir)
825+
}
826+
827+
// Capture the local validation result for experimental resolver comparison
828+
// before MPIC can influence the outcome.
829+
localResult, err := bgrpc.ValidationResultToPB(records, filterProblemDetails(prob), va.perspective, va.rir)
830+
if err != nil {
831+
return nil, err
832+
}
833+
834+
if va.shouldRunExperiment() {
835+
go va.runExperiment(
836+
ctx,
837+
opDCV,
838+
proto.Clone(localResult).(*vapb.ValidationResult),
839+
func(ctx context.Context) (remoteResult, error) {
840+
return va.experimentalVA.DoDCV(ctx, req)
841+
})
842+
}
843+
844+
if prob != nil {
845+
return localResult, nil
771846
}
772847

773848
if va.isPrimaryVA() {

0 commit comments

Comments
 (0)