Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 76 additions & 1 deletion cmd/boulder-va/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"time"

"github.com/jmhodges/clock"
"github.com/prometheus/client_golang/prometheus"

"github.com/letsencrypt/boulder/bdns"
"github.com/letsencrypt/boulder/cmd"
Expand Down Expand Up @@ -57,7 +58,27 @@ type Config struct {
// when the VA first gets a quorum of (un)successful remote results.
// Leaving this value zero means the VA won't early-cancel slow remotes.
SlowRemoteTimeout config.Duration
Features features.Config

// ExperimentalVA configures an optional parallel VA that repeats the
// primary VA's DCV and CAA checks using an alternative DNS resolver,
// emitting comparison metrics without affecting the real validation
// decision.
ExperimentalVA *struct {
// DNSProvider is the dynamic DNS provider config for the
// experimental VA's resolver.
DNSProvider *cmd.DNSProvider `validate:"required"`
// DNSTimeout is the timeout for DNS queries. Defaults to the
// primary VA's DNSTimeout if unset.
DNSTimeout config.Duration `validate:"omitempty"`
// SampleRate controls the rate of validations that are repeated
// (0.0 to 1.0). A value of 0 disables it entirely, while 1 repeats
// all validations.
SampleRate float64 `validate:"min=0,max=1"`
// Timeout is the timeout for experimental validation operations.
// This should be configured to match the RA->VA timeout.
Timeout config.Duration `validate:"required"`
}
Features features.Config
}

Syslog cmd.SyslogConfig
Expand Down Expand Up @@ -130,6 +151,57 @@ func main() {
}
}

var experimentalVA *va.ValidationAuthorityImpl
var experimentalVASampleRate float64
var experimentalVATimeout time.Duration
if c.VA.ExperimentalVA != nil {
servers, err := bdns.StartDynamicProvider(c.VA.ExperimentalVA.DNSProvider, 60*time.Second, "tcp")
cmd.FailOnError(err, "Couldn't start experimental dynamic DNS server resolver")
defer servers.Stop()

dnsTimeout := c.VA.ExperimentalVA.DNSTimeout.Duration
if dnsTimeout <= 0 {
dnsTimeout = c.VA.DNSTimeout.Duration
}

// Prefix experimental VA metrics to avoid metric name collisions with
// the primary VA.
scope := prometheus.WrapRegistererWithPrefix("experimental_", scope)

resolver := bdns.New(
dnsTimeout,
servers,
scope,
clk,
c.VA.DNSTries,
c.VA.UserAgent,
logger,
tlsConfig,
)

experimentalVA, err = va.NewValidationAuthorityImpl(
resolver,
nil,
c.VA.UserAgent,
c.VA.IssuerDomain,
scope,
clk,
logger,
c.VA.AccountURIPrefixes,
"Experimental",
"",
iana.IsReservedAddr,
0,
c.VA.DNSAllowLoopbackAddresses,
nil,
0,
0,
)
cmd.FailOnError(err, "Unable to create experimental VA")
experimentalVASampleRate = c.VA.ExperimentalVA.SampleRate
experimentalVATimeout = c.VA.ExperimentalVA.Timeout.Duration
}

vai, err := va.NewValidationAuthorityImpl(
resolver,
remotes,
Expand All @@ -144,6 +216,9 @@ func main() {
iana.IsReservedAddr,
c.VA.SlowRemoteTimeout.Duration,
c.VA.DNSAllowLoopbackAddresses,
experimentalVA,
experimentalVASampleRate,
experimentalVATimeout,
)
cmd.FailOnError(err, "Unable to create VA server")

Expand Down
3 changes: 3 additions & 0 deletions cmd/remoteva/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,9 @@ func main() {
iana.IsReservedAddr,
0,
c.RVA.DNSAllowLoopbackAddresses,
nil,
0,
0,
)
cmd.FailOnError(err, "Unable to create Remote-VA server")

Expand Down
22 changes: 21 additions & 1 deletion va/caa.go
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,27 @@ func (va *ValidationAuthorityImpl) DoCAA(ctx context.Context, req *vapb.IsCAAVal
logEvent.InternalError = err.Error()
prob = detailedError(err)
prob.Detail = fmt.Sprintf("While processing CAA for %s: %s", ident.Value, prob.Detail)
return bgrpc.CAAResultToPB(filterProblemDetails(prob), va.perspective, va.rir)
}

// Capture the local validation result for experimental resolver comparison
// before MPIC can influence the outcome.
localResult, err := bgrpc.CAAResultToPB(filterProblemDetails(prob), va.perspective, va.rir)
if err != nil {
return nil, err
}

if va.shouldRunExperiment() {
go va.runExperiment(
ctx,
opCAA,
proto.Clone(localResult).(*vapb.IsCAAValidResponse),
func(ctx context.Context) (remoteResult, error) {
return va.experimentalVA.DoCAA(ctx, req)
})
}

if prob != nil {
return localResult, nil
}

if va.isPrimaryVA() {
Expand Down
123 changes: 99 additions & 24 deletions va/va.go
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,11 @@ type vaMetrics struct {
http01Redirects prometheus.Counter
caaCounter *prometheus.CounterVec
ipv4FallbackCounter prometheus.Counter
// experimentConcurrence tracks whether the primary and experimental VAs
// reached the same outcome. It's labelled by:
// - operation: [dcv|caa]
// - concurrence: [true|false]
experimentConcurrence *prometheus.CounterVec
}

func initMetrics(stats prometheus.Registerer) *vaMetrics {
Expand Down Expand Up @@ -145,6 +150,10 @@ func initMetrics(stats prometheus.Registerer) *vaMetrics {
Name: "tls_alpn_ipv4_fallback",
Help: "A counter of IPv4 fallbacks during TLS ALPN validation",
})
experimentConcurrence := promauto.With(stats).NewCounterVec(prometheus.CounterOpts{
Name: "experiment_concurrence",
Help: "Count of validations where the experimental VA did or did not concur with the primary VA",
}, []string{"operation", "concurrence"})

return &vaMetrics{
validationLatency: validationLatency,
Expand All @@ -154,6 +163,7 @@ func initMetrics(stats prometheus.Registerer) *vaMetrics {
http01Redirects: http01Redirects,
caaCounter: caaCounter,
ipv4FallbackCounter: ipv4FallbackCounter,
experimentConcurrence: experimentConcurrence,
}
}

Expand Down Expand Up @@ -188,23 +198,26 @@ func newDefaultPortConfig() *portConfig {
// ValidationAuthorityImpl is the validation authority implementation. It
// embeds vapb.UnsafeVAServer and vapb.UnsafeCAAServer and carries the
// dependencies and settings supplied to NewValidationAuthorityImpl.
type ValidationAuthorityImpl struct {
vapb.UnsafeVAServer
vapb.UnsafeCAAServer
log blog.Logger
dnsClient bdns.Client
issuerDomain string
httpPort int
httpsPort int
tlsPort int
userAgent string
clk clock.Clock
remoteVAs []RemoteVA
maxRemoteFailures int
accountURIPrefixes []string
singleDialTimeout time.Duration
slowRemoteTimeout time.Duration
perspective string
rir string
isReservedIPFunc func(netip.Addr) error
allowRestrictedAddrs bool
log blog.Logger
dnsClient bdns.Client
issuerDomain string
httpPort int
httpsPort int
tlsPort int
userAgent string
clk clock.Clock
remoteVAs []RemoteVA
maxRemoteFailures int
accountURIPrefixes []string
singleDialTimeout time.Duration
slowRemoteTimeout time.Duration
perspective string
rir string
isReservedIPFunc func(netip.Addr) error
allowRestrictedAddrs bool
// experimentalVA, when non-nil, is a second VA whose DoDCV and DoCAA
// are invoked to repeat sampled validations for comparison. A nil value
// means shouldRunExperiment always reports false.
experimentalVA *ValidationAuthorityImpl
// experimentalVASampleRate is the threshold a random draw must fall
// below for a validation to be repeated by experimentalVA.
experimentalVASampleRate float64
// experimentalVATimeout is the deadline applied to each experimental
// validation run by runExperiment.
experimentalVATimeout time.Duration

metrics *vaMetrics
}
Expand All @@ -227,6 +240,9 @@ func NewValidationAuthorityImpl(
reservedIPChecker func(netip.Addr) error,
slowRemoteTimeout time.Duration,
allowRestrictedAddrs bool,
experimentalVA *ValidationAuthorityImpl,
experimentalVASampleRate float64,
experimentalVATimeout time.Duration,
) (*ValidationAuthorityImpl, error) {

if len(accountURIPrefixes) == 0 {
Expand Down Expand Up @@ -268,17 +284,56 @@ func NewValidationAuthorityImpl(
// before timing out. This timeout ignores the base RPC timeout and is strictly
// used for the DialContext operations that take place during an
// HTTP-01 challenge validation.
singleDialTimeout: 10 * time.Second,
slowRemoteTimeout: slowRemoteTimeout,
perspective: perspective,
rir: rir,
isReservedIPFunc: reservedIPChecker,
allowRestrictedAddrs: allowRestrictedAddrs,
singleDialTimeout: 10 * time.Second,
slowRemoteTimeout: slowRemoteTimeout,
perspective: perspective,
rir: rir,
isReservedIPFunc: reservedIPChecker,
allowRestrictedAddrs: allowRestrictedAddrs,
experimentalVA: experimentalVA,
experimentalVASampleRate: experimentalVASampleRate,
experimentalVATimeout: experimentalVATimeout,
}

return va, nil
}

// shouldRunExperiment reports whether the current validation should also be
// repeated by the experimental VA. It is always false when no experimental VA
// is configured; otherwise a random draw is compared against the configured
// sample rate.
func (va *ValidationAuthorityImpl) shouldRunExperiment() bool {
	// Bail out first so no random draw is made when the experiment is off.
	if va.experimentalVA == nil {
		return false
	}
	return rand.Float64() < va.experimentalVASampleRate
}

// runExperiment invokes experimentFunc and checks whether the experimental VA
// reached the same pass/fail outcome as the primary VA's local result. The
// outcome is counted in the experimentConcurrence metric, labelled by
// operation. On disagreement an audit log entry carrying both results (and the
// experiment's error, if any) is emitted as well.
//
// The experiment runs under its own experimentalVATimeout deadline and does
// not inherit cancellation from ctx. The primary argument must be non-nil.
// Callers should invoke this in a goroutine.
func (va *ValidationAuthorityImpl) runExperiment(ctx context.Context, operation string, primary remoteResult, experimentFunc func(context.Context) (remoteResult, error)) {
	experimentCtx, cancel := context.WithTimeout(context.WithoutCancel(ctx), va.experimentalVATimeout)
	defer cancel()

	experimentResult, experimentErr := experimentFunc(experimentCtx)

	// An experiment that returns an error is treated as a failed validation.
	primaryPassed := primary.GetProblem() == nil
	experimentPassed := experimentErr == nil && experimentResult.GetProblem() == nil

	agreed := primaryPassed == experimentPassed
	concurrence := "false"
	if agreed {
		concurrence = "true"
	}
	va.metrics.experimentConcurrence.WithLabelValues(operation, concurrence).Inc()
	if agreed {
		return
	}

	// Only disagreements are logged, with enough detail to compare both sides.
	details := map[string]any{
		"operation":        operation,
		"primaryPassed":    primaryPassed,
		"primaryResult":    primary,
		"experimentPassed": experimentPassed,
		"experimentResult": experimentResult,
	}
	if experimentErr != nil {
		details["experimentErr"] = experimentErr.Error()
	}
	va.log.AuditInfo("Primary VA disagreed with experimental VA", details)
}

// maxAllowedFailures returns the maximum number of allowed failures
// for a given number of remote perspectives, according to the "Quorum
// Requirements" table in BRs Section 3.2.2.9, as follows:
Expand Down Expand Up @@ -767,7 +822,27 @@ func (va *ValidationAuthorityImpl) DoDCV(ctx context.Context, req *vapb.PerformV
if err != nil {
logEvent.InternalError = err.Error()
prob = detailedError(err)
return bgrpc.ValidationResultToPB(records, filterProblemDetails(prob), va.perspective, va.rir)
}

// Capture the local validation result for experimental resolver comparison
// before MPIC can influence the outcome.
localResult, err := bgrpc.ValidationResultToPB(records, filterProblemDetails(prob), va.perspective, va.rir)
if err != nil {
return nil, err
}

if va.shouldRunExperiment() {
go va.runExperiment(
ctx,
opDCV,
proto.Clone(localResult).(*vapb.ValidationResult),
func(ctx context.Context) (remoteResult, error) {
return va.experimentalVA.DoDCV(ctx, req)
})
}

if prob != nil {
return localResult, nil
}

if va.isPrimaryVA() {
Expand Down
Loading
Loading