Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
fae0b03
cert-checker: fix logging & push metrics
lenaunderwood22 May 26, 2026
1bbc8e4
add pushgatewayURL to test configs
lenaunderwood22 May 26, 2026
600b6c3
Merge branch 'main' into cert-checker-logs
lenaunderwood22 May 26, 2026
7d26e04
lena can't syntax in the merge conflict editor
lenaunderwood22 May 26, 2026
a26009e
Update cmd/cert-checker/main.go
lenaunderwood22 May 28, 2026
37ec383
Update cmd/cert-checker/main.go
lenaunderwood22 May 28, 2026
bd919b8
rename certCheckerMetrics to metrics; remove unexpiredOnly conf; smal…
lenaunderwood22 May 28, 2026
e02607b
remove 'entries' from report struct
lenaunderwood22 May 28, 2026
be9ce40
drop reportEntry too
lenaunderwood22 May 28, 2026
c5e0cf6
remove badresultsonly references
lenaunderwood22 May 28, 2026
96eb1c2
one more
lenaunderwood22 May 28, 2026
a09d4b1
more cleanup
lenaunderwood22 May 28, 2026
15ce586
rename metrics back to certCheckerMetrics
lenaunderwood22 May 29, 2026
02e95e5
lookup pushgatewayurl
lenaunderwood22 May 29, 2026
90f7d8f
update key validations
lenaunderwood22 May 29, 2026
c50dce2
fix nil pointer ref
lenaunderwood22 May 29, 2026
4b4d70e
Potential fix for pull request finding
lenaunderwood22 May 29, 2026
2a03d31
context with 10s timeout for lookups & push
lenaunderwood22 May 29, 2026
dad8b8c
hardcode http
lenaunderwood22 May 29, 2026
ad64b08
add comments to getPushgatewayURL, add tests for it
lenaunderwood22 May 29, 2026
6c75a9c
test fixes
lenaunderwood22 May 29, 2026
72fc6a7
don't need rmu anymore
lenaunderwood22 Jun 4, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
174 changes: 119 additions & 55 deletions cmd/cert-checker/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,14 @@ import (
"bytes"
"context"
"crypto/x509"
"encoding/json"
"flag"
"fmt"
"net"
"net/netip"
"os"
"regexp"
"slices"
"strings"
"sync"
"sync/atomic"
"time"
Expand Down Expand Up @@ -38,6 +39,36 @@ import (
"github.com/letsencrypt/boulder/sa"
)

type certCheckerMetrics struct {
checkerLatency prometheus.Histogram
checkerTimestamp prometheus.Gauge
checkerGoodCount prometheus.Gauge
checkerBadCount prometheus.Gauge
}

func NewCertCheckerMetrics(stats prometheus.Registerer) *certCheckerMetrics {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This constructor is only called from inside this package, so it doesn't need to be Exported.

Suggested change
func NewCertCheckerMetrics(stats prometheus.Registerer) *certCheckerMetrics {
func newCertCheckerMetrics(stats prometheus.Registerer) *certCheckerMetrics {

checkerLatency := promauto.With(stats).NewHistogram(prometheus.HistogramOpts{
Name: "cert_checker_latency",
Help: "Histogram of latencies a cert-checker worker takes to complete a batch",
})

checkerTimestamp := promauto.With(stats).NewGauge(prometheus.GaugeOpts{
Name: "cert_checker_last_run_timestamp",
Help: "Timestamp of cert-checker's last run",
})

checkerGoodCount := promauto.With(stats).NewGauge(prometheus.GaugeOpts{
Name: "cert_checker_good_count",
Help: "Cert-checker count of good certificates",
})

checkerBadCount := promauto.With(stats).NewGauge(prometheus.GaugeOpts{
Name: "cert_checker_bad_count",
Help: "Cert-checker count of bad certificates",
})
return &certCheckerMetrics{checkerLatency, checkerTimestamp, checkerGoodCount, checkerBadCount}
}

// For defense-in-depth in addition to using the PA & its identPolicy to check
// domain names we also perform a check against the regex's from the
// forbiddenDomains array
Expand All @@ -62,25 +93,9 @@ var batchSize = 1000
type report struct {
begin time.Time
end time.Time
GoodCerts int64 `json:"good-certs"`
BadCerts int64 `json:"bad-certs"`
DbErrs int64 `json:"db-errs"`
Entries map[string]reportEntry `json:"entries"`
}

func (r *report) dump() error {
content, err := json.MarshalIndent(r, "", " ")
if err != nil {
return err
}
fmt.Fprintln(os.Stdout, string(content))
return nil
}

type reportEntry struct {
Valid bool `json:"valid"`
SANs []string `json:"sans"`
Problems []string `json:"problems,omitempty"`
GoodCerts int64 `json:"good-certs"`
BadCerts int64 `json:"bad-certs"`
DbErrs int64 `json:"db-errs"`
}

// certDB is an interface collecting the borp.DbMap functions that the various
Expand All @@ -102,7 +117,6 @@ type certChecker struct {
getPrecert precertGetter
certs chan *corepb.Certificate
clock clock.Clock
rMu *sync.Mutex
issuedReport report
checkPeriod time.Duration
acceptableValidityDurations map[time.Duration]bool
Expand Down Expand Up @@ -132,9 +146,7 @@ func newChecker(saDbMap certDB,
dbMap: saDbMap,
getPrecert: precertGetter,
certs: make(chan *corepb.Certificate, batchSize),
rMu: new(sync.Mutex),
clock: clk,
issuedReport: report{Entries: make(map[string]reportEntry)},
checkPeriod: period,
acceptableValidityDurations: avd,
lints: lints,
Expand Down Expand Up @@ -265,26 +277,17 @@ func (c *certChecker) getCerts(ctx context.Context) error {
return nil
}

func (c *certChecker) processCerts(ctx context.Context, wg *sync.WaitGroup, badResultsOnly bool) {
func (c *certChecker) processCerts(ctx context.Context) {
for cert := range c.certs {
sans, problems := c.checkCert(ctx, cert)
valid := len(problems) == 0
c.rMu.Lock()
if !badResultsOnly || (badResultsOnly && !valid) {
c.issuedReport.Entries[cert.Serial] = reportEntry{
Valid: valid,
SANs: sans,
Problems: problems,
}
}
c.rMu.Unlock()
if !valid {
atomic.AddInt64(&c.issuedReport.BadCerts, 1)
c.logger.AuditErr("certificate error found", nil, map[string]any{"serial": cert.Serial, "sans": sans, "problems": problems})
Comment thread
aarongable marked this conversation as resolved.
} else {
atomic.AddInt64(&c.issuedReport.GoodCerts, 1)
}
}
wg.Done()
}

// Extensions that we allow in certificates
Expand Down Expand Up @@ -540,8 +543,19 @@ type Config struct {
cmd.HostnamePolicyConfig

Workers int `validate:"required,min=1"`
// Deprecated: this is ignored, and cert checker always checks both expired and unexpired.
UnexpiredOnly bool
// LookupDNSAuthority can only be specified with PushgatewayService. It's a single
// <hostname|IPv4|[IPv6]>:<port> of the DNS server to be used for resolution
// of pushgateway backends. If the address contains a hostname it will be resolved
// using system DNS. If the address contains a port, the client will use it
// directly, otherwise port 53 is used.
LookupDNSAuthority string `validate:"excluded_without=PushgatewayService,required_with=PushgatewayService,omitempty,ip|hostname|hostname_port"`
// PushgatewayService entry contains a service and domain name that will be used
// to construct a SRV DNS query to lookup pushgateway backends. For example: if
// the resource record is 'foo.service.consul', then the 'Service' is 'foo'
// and the 'Domain' is 'service.consul'. The expected dNSName to be
// authenticated in the server certificate would be 'foo.service.consul'.
PushgatewayService *cmd.ServiceDomain `validate:"required_with=LookupDNSAuthority"`
// Deprecated: cert-checker only logs bad results anyway.
BadResultsOnly bool
CheckPeriod config.Duration

Expand Down Expand Up @@ -577,6 +591,47 @@ type Config struct {
Syslog cmd.SyslogConfig
}

// getPushgatewayURL resolves svc via SRV+A lookups against dnsAuthority and
// returns an http:// URL whose host is an IP address. Both lookups go through
// dnsAuthority (typically Consul DNS) because the system resolver can't answer
// queries for the .consul domain. The SRV target is then flattened to an IP
// because the returned URL is consumed by net/http via cmd.PushMetrics, which
// resolves hostnames using the system resolver. Scheme is fixed to http:
// pushgateway is assumed to be on an internal network
func getPushgatewayURL(ctx context.Context, dnsAuthority string, svc cmd.ServiceDomain) (string, error) {
host, port, err := net.SplitHostPort(dnsAuthority)
if err != nil {
// Assume only hostname or IPv4 address was specified.
host = dnsAuthority
port = "53"
}
r := &net.Resolver{
PreferGo: true,
Dial: func(ctx context.Context, network, _ string) (net.Conn, error) {
return (&net.Dialer{}).DialContext(ctx, network, net.JoinHostPort(host, port))
},
}
_, targets, err := r.LookupSRV(ctx, svc.Service, "tcp", svc.Domain)
if err != nil {
return "", fmt.Errorf("SRV lookup of _%s._tcp.%s failed: %w", svc.Service, svc.Domain, err)
}
if len(targets) == 0 {
return "", fmt.Errorf("SRV lookup of _%s._tcp.%s returned 0 results", svc.Service, svc.Domain)
}
// Flatten the SRV target to an IP using the same Consul authority; net/http
// (used downstream) would otherwise try to resolve names like
// *.addr.dc1.consul via the system resolver and fail.
target := strings.TrimSuffix(targets[0].Target, ".")
addrs, err := r.LookupHost(ctx, target)
if err != nil {
return "", fmt.Errorf("A/AAAA lookup of %q failed: %w", target, err)
}
if len(addrs) == 0 {
return "", fmt.Errorf("A/AAAA lookup of %q returned 0 results", target)
}
return fmt.Sprintf("http://%s", net.JoinHostPort(addrs[0], fmt.Sprint(targets[0].Port))), nil
}

func main() {
configFile := flag.String("config", "", "File path to the configuration file for this service")
flag.Parse()
Expand All @@ -594,6 +649,9 @@ func main() {
logger := cmd.NewLogger(config.Syslog)
cmd.LogStartup(logger)

reg := prometheus.NewRegistry()
metrics := NewCertCheckerMetrics(reg)

acceptableValidityDurations := make(map[time.Duration]bool)
if len(config.CertChecker.AcceptableValidityDurations) > 0 {
for _, entry := range config.CertChecker.AcceptableValidityDurations {
Expand All @@ -616,11 +674,6 @@ func main() {
saDbMap, err := sa.InitWrappedDb(config.CertChecker.DB, prometheus.DefaultRegisterer, logger)
cmd.FailOnError(err, "While initializing dbMap")

checkerLatency := promauto.NewHistogram(prometheus.HistogramOpts{
Name: "cert_checker_latency",
Help: "Histogram of latencies a cert-checker worker takes to complete a batch",
})

pa, err := policy.New(config.PA.Identifiers, config.PA.Challenges, logger)
cmd.FailOnError(err, "Failed to create PA")

Expand Down Expand Up @@ -663,23 +716,34 @@ func main() {
fmt.Fprintf(os.Stderr, "# Processing certificates using %d workers\n", config.CertChecker.Workers)
wg := new(sync.WaitGroup)
for range config.CertChecker.Workers {
wg.Add(1)
go func() {
wg.Go(func() {
s := checker.clock.Now()
checker.processCerts(context.TODO(), wg, config.CertChecker.BadResultsOnly)
checkerLatency.Observe(checker.clock.Since(s).Seconds())
}()
checker.processCerts(context.TODO())
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
checker.processCerts(context.TODO())
checker.processCerts(context.Background())

metrics.checkerLatency.Observe(checker.clock.Since(s).Seconds())
})
}
wg.Wait()
fmt.Fprintf(
os.Stderr,
"# Finished processing certificates, report length: %d, good: %d, bad: %d\n",
len(checker.issuedReport.Entries),
checker.issuedReport.GoodCerts,
checker.issuedReport.BadCerts,
)
err = checker.issuedReport.dump()
cmd.FailOnError(err, "Failed to dump results: %s\n")
logger.AuditInfo("Finished processing certificates", checker.issuedReport)

metrics.checkerTimestamp.SetToCurrentTime()
metrics.checkerGoodCount.Set(float64(checker.issuedReport.GoodCerts))
metrics.checkerBadCount.Set(float64(checker.issuedReport.BadCerts))

if config.CertChecker.PushgatewayService != nil {
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
pushgatewayURL, err := getPushgatewayURL(ctx, config.CertChecker.LookupDNSAuthority, *config.CertChecker.PushgatewayService)
if err != nil {
logger.Errf("failed to get pushgateway URL: %s", err)
} else {
err = cmd.PushMetrics("cert-checker", pushgatewayURL, reg, logger)
if err != nil {
logger.Errf("failed to push metrics to pushgateway: %s", err)
} else {
logger.Debugf("pushed metrics to pushgateway at %s", pushgatewayURL)
}
}
}

if checker.issuedReport.BadCerts > 0 {
os.Exit(1)
Expand Down
73 changes: 43 additions & 30 deletions cmd/cert-checker/main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,19 @@ import (
"log"
"math/big"
mrand "math/rand/v2"
"net"
"net/url"
"os"
"slices"
"strconv"
"strings"
"sync"
"testing"
"time"

"github.com/jmhodges/clock"
"google.golang.org/protobuf/types/known/timestamppb"

"github.com/letsencrypt/boulder/cmd"
"github.com/letsencrypt/boulder/core"
corepb "github.com/letsencrypt/boulder/core/proto"
"github.com/letsencrypt/boulder/ctpolicy/loglist"
Expand Down Expand Up @@ -336,7 +339,8 @@ func TestGetAndProcessCerts(t *testing.T) {
fc := clock.NewFake()
fc.Set(fc.Now().Add(time.Hour))

checker := newChecker(saDbMap, fc, pa, kp, time.Hour, testValidityDurations, nil, blog.NewMock())
mocklog := blog.NewMock()
checker := newChecker(saDbMap, fc, pa, kp, time.Hour, testValidityDurations, nil, mocklog)
sa, err := sa.NewSQLStorageAuthority(saDbMap, saDbMap, nil, 0, fc, blog.NewMock(), metrics.NoopRegisterer)
test.AssertNotError(t, err, "Couldn't create SA to insert certificates")
saCleanUp := test.ResetBoulderTestDatabase(t)
Expand Down Expand Up @@ -375,11 +379,9 @@ func TestGetAndProcessCerts(t *testing.T) {
err = checker.getCerts(context.Background())
test.AssertNotError(t, err, "Failed to retrieve certificates")
test.AssertEquals(t, len(checker.certs), 5)
wg := new(sync.WaitGroup)
wg.Add(1)
checker.processCerts(context.Background(), wg, false)
checker.processCerts(context.Background())
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In unit tests, always use the context provided by the test harness:

Suggested change
checker.processCerts(context.Background())
checker.processCerts(t.Context())

test.AssertEquals(t, checker.issuedReport.BadCerts, int64(5))
test.AssertEquals(t, len(checker.issuedReport.Entries), 5)
test.AssertEquals(t, len(mocklog.GetAllMatching("certificate error found")), 5)
}

// mismatchedCountDB is a certDB implementation for `getCerts` that returns one
Expand Down Expand Up @@ -507,30 +509,6 @@ func TestGetCertsLate(t *testing.T) {
}
}

func TestSaveReport(t *testing.T) {
r := report{
begin: time.Time{},
end: time.Time{},
GoodCerts: 2,
BadCerts: 1,
Entries: map[string]reportEntry{
"020000000000004b475da49b91da5c17": {
Valid: true,
},
"020000000000004d1613e581432cba7e": {
Valid: true,
},
"020000000000004e402bc21035c6634a": {
Valid: false,
Problems: []string{"None really..."},
},
},
}

err := r.dump()
test.AssertNotError(t, err, "Failed to dump results")
}

func TestIsForbiddenDomain(t *testing.T) {
// Note: These testcases are not an exhaustive representation of domains
// Boulder won't issue for, but are instead testing the defense-in-depth
Expand Down Expand Up @@ -698,3 +676,38 @@ func TestPrecertCorrespond(t *testing.T) {
}
t.Fatalf("expected precert correspondence problem, but got: %v", problems)
}

func TestGetPushgatewayURL(t *testing.T) {
ctx := context.Background()
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same comment here. In fact, probably leave this line out entirely, and just use t.Context() in the three places below.

t.Run("happy path", func(t *testing.T) {
gotURL, err := getPushgatewayURL(ctx, "consul.service.consul:53",
cmd.ServiceDomain{Service: "redisratelimits", Domain: "service.consul"})
test.AssertNotError(t, err, "")
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For a new test, we prefer to follow the Google Go style guide. That means using traditional error checking, and not using the assertion helper library.

Suggested change
test.AssertNotError(t, err, "")
if err != nil {
t.Fatalf("getPushgatewayURL(consul.service.consul:53) = %s, but want success", err)
}

The same applies for the assertions below. Use t.Fatalf for places where test execution should stop, and t.Errorf for places where the test should fail but we should maybe also see the results of the next thing as well.


parsed, err := url.Parse(gotURL)
test.AssertNotError(t, err, "returned URL should be parseable")
test.AssertEquals(t, parsed.Scheme, "http")

host, port, err := net.SplitHostPort(parsed.Host)
test.AssertNotError(t, err, "URL host should contain a port")
test.AssertNotNil(t, net.ParseIP(host), "host should be an IP (LookupHost flatten step)")
portNum, err := strconv.Atoi(port)
test.AssertNotError(t, err, "port should be numeric")
test.Assert(t, portNum > 0 && portNum < 65536, "port should be in valid range")
})
t.Run("DNS authority no port specified", func(t *testing.T) {
_, err := getPushgatewayURL(ctx, "consul.service.consul",
cmd.ServiceDomain{Service: "redisratelimits", Domain: "service.consul"})
test.AssertNotError(t, err, "")
})
t.Run("SRV not found", func(t *testing.T) {
_, err := getPushgatewayURL(ctx, "consul.service.consul:53",
cmd.ServiceDomain{Service: "doesnotexist", Domain: "service.consul"})
test.AssertError(t, err, "")
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In addition to the comment above about not using the test assertion helper, these tests should probably also ensure that they're getting the right error:

Suggested change
test.AssertError(t, err, "")
if err != nil {
t.Fatalf("getPushgatewayURL(consul.service.consul:53) = %s, but want success", err)
}
if !strings.Contains(err.Error(), "SRV lookup of doesnotexist._tcp.service.consul failed") {
t.Errorf("getPushgatewayURL(doesnotexist) = %q, but want 'SRV lookup failed'"), err)
}

or something like that

})
t.Run("DNS authority unreachable", func(t *testing.T) {
_, err := getPushgatewayURL(ctx, "doesnotexist.invalid:53",
cmd.ServiceDomain{Service: "redisratelimits", Domain: "service.consul"})
test.AssertError(t, err, "")
})
}
Loading