@@ -113,6 +113,11 @@ type vaMetrics struct {
113113 http01Redirects prometheus.Counter
114114 caaCounter * prometheus.CounterVec
115115 ipv4FallbackCounter prometheus.Counter
116+ // experimentConcurrence tracks whether the primary and experimental VAs
117+ // reached the same outcome. It's labelled by:
118+ // - operation: [dcv|caa]
119+ // - concurrence: [true|false]
120+ experimentConcurrence * prometheus.CounterVec
116121}
117122
118123func initMetrics (stats prometheus.Registerer ) * vaMetrics {
@@ -145,6 +150,10 @@ func initMetrics(stats prometheus.Registerer) *vaMetrics {
145150 Name : "tls_alpn_ipv4_fallback" ,
146151 Help : "A counter of IPv4 fallbacks during TLS ALPN validation" ,
147152 })
153+ experimentConcurrence := promauto .With (stats ).NewCounterVec (prometheus.CounterOpts {
154+ Name : "experiment_concurrence" ,
155+ Help : "Count of validations where the experimental VA did or did not concur with the primary VA" ,
156+ }, []string {"operation" , "concurrence" })
148157
149158 return & vaMetrics {
150159 validationLatency : validationLatency ,
@@ -154,6 +163,7 @@ func initMetrics(stats prometheus.Registerer) *vaMetrics {
154163 http01Redirects : http01Redirects ,
155164 caaCounter : caaCounter ,
156165 ipv4FallbackCounter : ipv4FallbackCounter ,
166+ experimentConcurrence : experimentConcurrence ,
157167 }
158168}
159169
@@ -188,23 +198,26 @@ func newDefaultPortConfig() *portConfig {
188198type ValidationAuthorityImpl struct {
189199 vapb.UnsafeVAServer
190200 vapb.UnsafeCAAServer
191- log blog.Logger
192- dnsClient bdns.Client
193- issuerDomain string
194- httpPort int
195- httpsPort int
196- tlsPort int
197- userAgent string
198- clk clock.Clock
199- remoteVAs []RemoteVA
200- maxRemoteFailures int
201- accountURIPrefixes []string
202- singleDialTimeout time.Duration
203- slowRemoteTimeout time.Duration
204- perspective string
205- rir string
206- isReservedIPFunc func (netip.Addr ) error
207- allowRestrictedAddrs bool
201+ log blog.Logger
202+ dnsClient bdns.Client
203+ issuerDomain string
204+ httpPort int
205+ httpsPort int
206+ tlsPort int
207+ userAgent string
208+ clk clock.Clock
209+ remoteVAs []RemoteVA
210+ maxRemoteFailures int
211+ accountURIPrefixes []string
212+ singleDialTimeout time.Duration
213+ slowRemoteTimeout time.Duration
214+ perspective string
215+ rir string
216+ isReservedIPFunc func (netip.Addr ) error
217+ allowRestrictedAddrs bool
218+ experimentalVA * ValidationAuthorityImpl
219+ experimentalVASampleRate float64
220+ experimentalVATimeout time.Duration
208221
209222 metrics * vaMetrics
210223}
@@ -227,6 +240,9 @@ func NewValidationAuthorityImpl(
227240 reservedIPChecker func (netip.Addr ) error ,
228241 slowRemoteTimeout time.Duration ,
229242 allowRestrictedAddrs bool ,
243+ experimentalVA * ValidationAuthorityImpl ,
244+ experimentalVASampleRate float64 ,
245+ experimentalVATimeout time.Duration ,
230246) (* ValidationAuthorityImpl , error ) {
231247
232248 if len (accountURIPrefixes ) == 0 {
@@ -268,17 +284,56 @@ func NewValidationAuthorityImpl(
268284 // before timing out. This timeout ignores the base RPC timeout and is strictly
269285 // used for the DialContext operations that take place during an
270286 // HTTP-01 challenge validation.
271- singleDialTimeout : 10 * time .Second ,
272- slowRemoteTimeout : slowRemoteTimeout ,
273- perspective : perspective ,
274- rir : rir ,
275- isReservedIPFunc : reservedIPChecker ,
276- allowRestrictedAddrs : allowRestrictedAddrs ,
287+ singleDialTimeout : 10 * time .Second ,
288+ slowRemoteTimeout : slowRemoteTimeout ,
289+ perspective : perspective ,
290+ rir : rir ,
291+ isReservedIPFunc : reservedIPChecker ,
292+ allowRestrictedAddrs : allowRestrictedAddrs ,
293+ experimentalVA : experimentalVA ,
294+ experimentalVASampleRate : experimentalVASampleRate ,
295+ experimentalVATimeout : experimentalVATimeout ,
277296 }
278297
279298 return va , nil
280299}
281300
301+ func (va * ValidationAuthorityImpl ) shouldRunExperiment () bool {
302+ return va .experimentalVA != nil && rand .Float64 () < va .experimentalVASampleRate
303+ }
304+
305+ // runExperiment compares the primary VA's local result against the experimental
306+ // VA's result and records a concurrence metric. On disagreement, it logs a
307+ // structured event with both results. The primary argument must be non-nil.
308+ // Callers should invoke this in a goroutine.
309+ func (va * ValidationAuthorityImpl ) runExperiment (ctx context.Context , operation string , primary remoteResult , experimentFunc func (context.Context ) (remoteResult , error )) {
310+ ctx , cancel := context .WithTimeout (context .WithoutCancel (ctx ), va .experimentalVATimeout )
311+ defer cancel ()
312+
313+ experimentResult , err := experimentFunc (ctx )
314+
315+ primaryPassed := primary .GetProblem () == nil
316+ experimentPassed := (err == nil ) && (experimentResult .GetProblem () == nil )
317+
318+ if primaryPassed == experimentPassed {
319+ va .metrics .experimentConcurrence .WithLabelValues (operation , "true" ).Inc ()
320+ return
321+ }
322+ va .metrics .experimentConcurrence .WithLabelValues (operation , "false" ).Inc ()
323+
324+ logArgs := map [string ]any {
325+ "operation" : operation ,
326+ "primaryPassed" : primaryPassed ,
327+ "primaryResult" : primary ,
328+ "experimentPassed" : experimentPassed ,
329+ "experimentResult" : experimentResult ,
330+ }
331+ if err != nil {
332+ logArgs ["experimentErr" ] = err .Error ()
333+ }
334+ va .log .AuditInfo ("Primary VA disagreed with experimental VA" , logArgs )
335+ }
336+
282337// maxAllowedFailures returns the maximum number of allowed failures
283338// for a given number of remote perspectives, according to the "Quorum
284339// Requirements" table in BRs Section 3.2.2.9, as follows:
@@ -767,7 +822,27 @@ func (va *ValidationAuthorityImpl) DoDCV(ctx context.Context, req *vapb.PerformV
767822 if err != nil {
768823 logEvent .InternalError = err .Error ()
769824 prob = detailedError (err )
770- return bgrpc .ValidationResultToPB (records , filterProblemDetails (prob ), va .perspective , va .rir )
825+ }
826+
827+ // Capture the local validation result for experimental resolver comparison
828+ // before MPIC can influence the outcome.
829+ localResult , err := bgrpc .ValidationResultToPB (records , filterProblemDetails (prob ), va .perspective , va .rir )
830+ if err != nil {
831+ return nil , err
832+ }
833+
834+ if va .shouldRunExperiment () {
835+ go va .runExperiment (
836+ ctx ,
837+ opDCV ,
838+ proto .Clone (localResult ).(* vapb.ValidationResult ),
839+ func (ctx context.Context ) (remoteResult , error ) {
840+ return va .experimentalVA .DoDCV (ctx , req )
841+ })
842+ }
843+
844+ if prob != nil {
845+ return localResult , nil
771846 }
772847
773848 if va .isPrimaryVA () {
0 commit comments