-
-
Notifications
You must be signed in to change notification settings - Fork 637
cert-checker: fix logging & push metrics #8763
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 16 commits
fae0b03
1bbc8e4
600b6c3
7d26e04
a26009e
37ec383
bd919b8
e02607b
be9ce40
c5e0cf6
96eb1c2
a09d4b1
15ce586
02e95e5
90f7d8f
c50dce2
4b4d70e
2a03d31
dad8b8c
ad64b08
6c75a9c
72fc6a7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -4,13 +4,14 @@ import ( | |||||
| "bytes" | ||||||
| "context" | ||||||
| "crypto/x509" | ||||||
| "encoding/json" | ||||||
| "flag" | ||||||
| "fmt" | ||||||
| "net" | ||||||
| "net/netip" | ||||||
| "os" | ||||||
| "regexp" | ||||||
| "slices" | ||||||
| "strings" | ||||||
| "sync" | ||||||
| "sync/atomic" | ||||||
| "time" | ||||||
|
|
@@ -38,6 +39,36 @@ import ( | |||||
| "github.com/letsencrypt/boulder/sa" | ||||||
| ) | ||||||
|
|
||||||
| type certCheckerMetrics struct { | ||||||
| checkerLatency prometheus.Histogram | ||||||
| checkerTimestamp prometheus.Gauge | ||||||
| checkerGoodCount prometheus.Gauge | ||||||
| checkerBadCount prometheus.Gauge | ||||||
| } | ||||||
|
|
||||||
| func NewCertCheckerMetrics(stats prometheus.Registerer) *certCheckerMetrics { | ||||||
|
aarongable marked this conversation as resolved.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This constructor is only called from inside this package, so it doesn't need to be Exported.
Suggested change
|
||||||
| checkerLatency := promauto.With(stats).NewHistogram(prometheus.HistogramOpts{ | ||||||
| Name: "cert_checker_latency", | ||||||
| Help: "Histogram of latencies a cert-checker worker takes to complete a batch", | ||||||
| }) | ||||||
|
|
||||||
| checkerTimestamp := promauto.With(stats).NewGauge(prometheus.GaugeOpts{ | ||||||
| Name: "cert_checker_last_run_timestamp", | ||||||
| Help: "Timestamp of cert-checker's last run", | ||||||
| }) | ||||||
|
|
||||||
| checkerGoodCount := promauto.With(stats).NewGauge(prometheus.GaugeOpts{ | ||||||
| Name: "cert_checker_good_count", | ||||||
| Help: "Cert-checker count of good certificates", | ||||||
| }) | ||||||
|
|
||||||
| checkerBadCount := promauto.With(stats).NewGauge(prometheus.GaugeOpts{ | ||||||
| Name: "cert_checker_bad_count", | ||||||
| Help: "Cert-checker count of bad certificates", | ||||||
| }) | ||||||
| return &certCheckerMetrics{checkerLatency, checkerTimestamp, checkerGoodCount, checkerBadCount} | ||||||
| } | ||||||
|
|
||||||
| // For defense-in-depth in addition to using the PA & its identPolicy to check | ||||||
| // domain names we also perform a check against the regex's from the | ||||||
| // forbiddenDomains array | ||||||
|
|
@@ -62,25 +93,9 @@ var batchSize = 1000 | |||||
| type report struct { | ||||||
| begin time.Time | ||||||
| end time.Time | ||||||
| GoodCerts int64 `json:"good-certs"` | ||||||
| BadCerts int64 `json:"bad-certs"` | ||||||
| DbErrs int64 `json:"db-errs"` | ||||||
| Entries map[string]reportEntry `json:"entries"` | ||||||
| } | ||||||
|
|
||||||
| func (r *report) dump() error { | ||||||
| content, err := json.MarshalIndent(r, "", " ") | ||||||
| if err != nil { | ||||||
| return err | ||||||
| } | ||||||
| fmt.Fprintln(os.Stdout, string(content)) | ||||||
| return nil | ||||||
| } | ||||||
|
|
||||||
| type reportEntry struct { | ||||||
| Valid bool `json:"valid"` | ||||||
| SANs []string `json:"sans"` | ||||||
| Problems []string `json:"problems,omitempty"` | ||||||
| GoodCerts int64 `json:"good-certs"` | ||||||
| BadCerts int64 `json:"bad-certs"` | ||||||
| DbErrs int64 `json:"db-errs"` | ||||||
| } | ||||||
|
|
||||||
| // certDB is an interface collecting the borp.DbMap functions that the various | ||||||
|
|
@@ -134,7 +149,6 @@ func newChecker(saDbMap certDB, | |||||
| certs: make(chan *corepb.Certificate, batchSize), | ||||||
| rMu: new(sync.Mutex), | ||||||
|
lenaunderwood22 marked this conversation as resolved.
Outdated
|
||||||
| clock: clk, | ||||||
| issuedReport: report{Entries: make(map[string]reportEntry)}, | ||||||
| checkPeriod: period, | ||||||
| acceptableValidityDurations: avd, | ||||||
| lints: lints, | ||||||
|
|
@@ -265,26 +279,17 @@ func (c *certChecker) getCerts(ctx context.Context) error { | |||||
| return nil | ||||||
| } | ||||||
|
|
||||||
| func (c *certChecker) processCerts(ctx context.Context, wg *sync.WaitGroup, badResultsOnly bool) { | ||||||
| func (c *certChecker) processCerts(ctx context.Context) { | ||||||
| for cert := range c.certs { | ||||||
| sans, problems := c.checkCert(ctx, cert) | ||||||
| valid := len(problems) == 0 | ||||||
| c.rMu.Lock() | ||||||
| if !badResultsOnly || (badResultsOnly && !valid) { | ||||||
| c.issuedReport.Entries[cert.Serial] = reportEntry{ | ||||||
| Valid: valid, | ||||||
| SANs: sans, | ||||||
| Problems: problems, | ||||||
| } | ||||||
| } | ||||||
| c.rMu.Unlock() | ||||||
| if !valid { | ||||||
| atomic.AddInt64(&c.issuedReport.BadCerts, 1) | ||||||
| c.logger.AuditErr("certificate error found", nil, map[string]any{"serial": cert.Serial, "sans": sans, "problems": problems}) | ||||||
|
aarongable marked this conversation as resolved.
|
||||||
| } else { | ||||||
| atomic.AddInt64(&c.issuedReport.GoodCerts, 1) | ||||||
| } | ||||||
| } | ||||||
| wg.Done() | ||||||
| } | ||||||
|
|
||||||
| // Extensions that we allow in certificates | ||||||
|
|
@@ -540,8 +545,20 @@ type Config struct { | |||||
| cmd.HostnamePolicyConfig | ||||||
|
|
||||||
| Workers int `validate:"required,min=1"` | ||||||
| // Deprecated: this is ignored, and cert checker always checks both expired and unexpired. | ||||||
| UnexpiredOnly bool | ||||||
| // LookupDNSAuthority can only be specified with PushgatewayService. It's a single | ||||||
| // <hostname|IPv4|[IPv6]>:<port> of the DNS server to be used for resolution | ||||||
| // of pushgateway backends. If the address contains a hostname it will be resolved | ||||||
| // using system DNS. If the address contains a port, the client will use it | ||||||
| // directly, otherwise port 53 is used. | ||||||
| LookupDNSAuthority string `validate:"required_with=PushgatewayService,omitempty,ip|hostname|hostname_port"` | ||||||
|
lenaunderwood22 marked this conversation as resolved.
Outdated
|
||||||
| // PushgatewayService entry contains a service and domain name that will be used | ||||||
| // to construct a SRV DNS query to lookup pushgateway backends. For example: if | ||||||
| // the resource record is 'foo.service.consul', then the 'Service' is 'foo' | ||||||
| // and the 'Domain' is 'service.consul'. The expected dNSName to be | ||||||
| // authenticated in the server certificate would be 'foo.service.consul'. | ||||||
| PushgatewayService *cmd.ServiceDomain `validate:"required_with=LookupDNSAuthority"` | ||||||
| PushgatewayScheme string `validate:"required_with=PushgatewayService,omitempty,oneof=http https"` | ||||||
|
lenaunderwood22 marked this conversation as resolved.
Outdated
|
||||||
| // Deprecated: cert-checker only logs bad results anyway. | ||||||
| BadResultsOnly bool | ||||||
| CheckPeriod config.Duration | ||||||
|
|
||||||
|
|
@@ -577,6 +594,39 @@ type Config struct { | |||||
| Syslog cmd.SyslogConfig | ||||||
| } | ||||||
|
|
||||||
| func getPushgatewayURL(dnsAuthority, scheme string, svc cmd.ServiceDomain) (string, error) { | ||||||
| host, port, err := net.SplitHostPort(dnsAuthority) | ||||||
| if err != nil { | ||||||
| // Assume only hostname or IPv4 address was specified. | ||||||
| host = dnsAuthority | ||||||
| port = "53" | ||||||
| } | ||||||
|
lenaunderwood22 marked this conversation as resolved.
Outdated
|
||||||
| r := &net.Resolver{ | ||||||
| PreferGo: true, | ||||||
| Dial: func(ctx context.Context, network, _ string) (net.Conn, error) { | ||||||
| return (&net.Dialer{}).DialContext(ctx, network, net.JoinHostPort(host, port)) | ||||||
| }, | ||||||
| } | ||||||
| lookupCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) | ||||||
| defer cancel() | ||||||
| _, targets, err := r.LookupSRV(lookupCtx, svc.Service, "tcp", svc.Domain) | ||||||
| if err != nil { | ||||||
| return "", fmt.Errorf("SRV lookup of _%s._tcp.%s failed: %w", svc.Service, svc.Domain, err) | ||||||
| } | ||||||
| if len(targets) == 0 { | ||||||
| return "", fmt.Errorf("SRV lookup of _%s._tcp.%s returned 0 results", svc.Service, svc.Domain) | ||||||
| } | ||||||
| target := strings.TrimSuffix(targets[0].Target, ".") | ||||||
| addrs, err := r.LookupHost(lookupCtx, target) | ||||||
| if err != nil { | ||||||
| return "", fmt.Errorf("A/AAAA lookup of %q failed: %w", target, err) | ||||||
| } | ||||||
| if len(addrs) == 0 { | ||||||
| return "", fmt.Errorf("A/AAAA lookup of %q returned 0 results", target) | ||||||
| } | ||||||
| return fmt.Sprintf("%s://%s", scheme, net.JoinHostPort(addrs[0], fmt.Sprint(targets[0].Port))), nil | ||||||
| } | ||||||
|
lenaunderwood22 marked this conversation as resolved.
Outdated
lenaunderwood22 marked this conversation as resolved.
Outdated
|
||||||
|
|
||||||
| func main() { | ||||||
| configFile := flag.String("config", "", "File path to the configuration file for this service") | ||||||
| flag.Parse() | ||||||
|
|
@@ -594,6 +644,9 @@ func main() { | |||||
| logger := cmd.NewLogger(config.Syslog) | ||||||
| cmd.LogStartup(logger) | ||||||
|
|
||||||
| reg := prometheus.NewRegistry() | ||||||
| metrics := NewCertCheckerMetrics(reg) | ||||||
|
|
||||||
| acceptableValidityDurations := make(map[time.Duration]bool) | ||||||
| if len(config.CertChecker.AcceptableValidityDurations) > 0 { | ||||||
| for _, entry := range config.CertChecker.AcceptableValidityDurations { | ||||||
|
|
@@ -616,11 +669,6 @@ func main() { | |||||
| saDbMap, err := sa.InitWrappedDb(config.CertChecker.DB, prometheus.DefaultRegisterer, logger) | ||||||
| cmd.FailOnError(err, "While initializing dbMap") | ||||||
|
|
||||||
| checkerLatency := promauto.NewHistogram(prometheus.HistogramOpts{ | ||||||
| Name: "cert_checker_latency", | ||||||
| Help: "Histogram of latencies a cert-checker worker takes to complete a batch", | ||||||
| }) | ||||||
|
|
||||||
| pa, err := policy.New(config.PA.Identifiers, config.PA.Challenges, logger) | ||||||
| cmd.FailOnError(err, "Failed to create PA") | ||||||
|
|
||||||
|
|
@@ -663,23 +711,32 @@ func main() { | |||||
| fmt.Fprintf(os.Stderr, "# Processing certificates using %d workers\n", config.CertChecker.Workers) | ||||||
| wg := new(sync.WaitGroup) | ||||||
| for range config.CertChecker.Workers { | ||||||
| wg.Add(1) | ||||||
| go func() { | ||||||
| wg.Go(func() { | ||||||
| s := checker.clock.Now() | ||||||
| checker.processCerts(context.TODO(), wg, config.CertChecker.BadResultsOnly) | ||||||
| checkerLatency.Observe(checker.clock.Since(s).Seconds()) | ||||||
| }() | ||||||
| checker.processCerts(context.TODO()) | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
| metrics.checkerLatency.Observe(checker.clock.Since(s).Seconds()) | ||||||
| }) | ||||||
| } | ||||||
| wg.Wait() | ||||||
| fmt.Fprintf( | ||||||
| os.Stderr, | ||||||
| "# Finished processing certificates, report length: %d, good: %d, bad: %d\n", | ||||||
| len(checker.issuedReport.Entries), | ||||||
| checker.issuedReport.GoodCerts, | ||||||
| checker.issuedReport.BadCerts, | ||||||
| ) | ||||||
| err = checker.issuedReport.dump() | ||||||
| cmd.FailOnError(err, "Failed to dump results: %s\n") | ||||||
| logger.AuditInfo("Finished processing certificates", checker.issuedReport) | ||||||
|
|
||||||
| metrics.checkerTimestamp.SetToCurrentTime() | ||||||
| metrics.checkerGoodCount.Set(float64(checker.issuedReport.GoodCerts)) | ||||||
| metrics.checkerBadCount.Set(float64(checker.issuedReport.BadCerts)) | ||||||
|
|
||||||
| if config.CertChecker.PushgatewayService != nil { | ||||||
| pushgatewayURL, err := getPushgatewayURL(config.CertChecker.LookupDNSAuthority, config.CertChecker.PushgatewayScheme, *config.CertChecker.PushgatewayService) | ||||||
| if err != nil { | ||||||
| logger.Errf("failed to get pushgateway URL: %s", err) | ||||||
| } else { | ||||||
| err = cmd.PushMetrics("cert-checker", pushgatewayURL, reg, logger) | ||||||
| if err != nil { | ||||||
| logger.Errf("failed to push metrics to pushgateway: %s", err) | ||||||
| } else { | ||||||
| logger.Debugf("pushed metrics to pushgateway at %s", pushgatewayURL) | ||||||
| } | ||||||
| } | ||||||
| } | ||||||
|
|
||||||
| if checker.issuedReport.BadCerts > 0 { | ||||||
| os.Exit(1) | ||||||
|
|
||||||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -17,7 +17,6 @@ import ( | |||||
| "os" | ||||||
| "slices" | ||||||
| "strings" | ||||||
| "sync" | ||||||
| "testing" | ||||||
| "time" | ||||||
|
|
||||||
|
|
@@ -336,7 +335,8 @@ func TestGetAndProcessCerts(t *testing.T) { | |||||
| fc := clock.NewFake() | ||||||
| fc.Set(fc.Now().Add(time.Hour)) | ||||||
|
|
||||||
| checker := newChecker(saDbMap, fc, pa, kp, time.Hour, testValidityDurations, nil, blog.NewMock()) | ||||||
| mocklog := blog.NewMock() | ||||||
| checker := newChecker(saDbMap, fc, pa, kp, time.Hour, testValidityDurations, nil, mocklog) | ||||||
| sa, err := sa.NewSQLStorageAuthority(saDbMap, saDbMap, nil, 0, fc, blog.NewMock(), metrics.NoopRegisterer) | ||||||
| test.AssertNotError(t, err, "Couldn't create SA to insert certificates") | ||||||
| saCleanUp := test.ResetBoulderTestDatabase(t) | ||||||
|
|
@@ -375,11 +375,9 @@ func TestGetAndProcessCerts(t *testing.T) { | |||||
| err = checker.getCerts(context.Background()) | ||||||
| test.AssertNotError(t, err, "Failed to retrieve certificates") | ||||||
| test.AssertEquals(t, len(checker.certs), 5) | ||||||
| wg := new(sync.WaitGroup) | ||||||
| wg.Add(1) | ||||||
| checker.processCerts(context.Background(), wg, false) | ||||||
| checker.processCerts(context.Background()) | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In unit tests, always use the context provided by the test harness:
Suggested change
|
||||||
| test.AssertEquals(t, checker.issuedReport.BadCerts, int64(5)) | ||||||
| test.AssertEquals(t, len(checker.issuedReport.Entries), 5) | ||||||
| test.AssertEquals(t, len(mocklog.GetAllMatching("certificate error found")), 5) | ||||||
| } | ||||||
|
|
||||||
| // mismatchedCountDB is a certDB implementation for `getCerts` that returns one | ||||||
|
|
@@ -507,30 +505,6 @@ func TestGetCertsLate(t *testing.T) { | |||||
| } | ||||||
| } | ||||||
|
|
||||||
| func TestSaveReport(t *testing.T) { | ||||||
| r := report{ | ||||||
| begin: time.Time{}, | ||||||
| end: time.Time{}, | ||||||
| GoodCerts: 2, | ||||||
| BadCerts: 1, | ||||||
| Entries: map[string]reportEntry{ | ||||||
| "020000000000004b475da49b91da5c17": { | ||||||
| Valid: true, | ||||||
| }, | ||||||
| "020000000000004d1613e581432cba7e": { | ||||||
| Valid: true, | ||||||
| }, | ||||||
| "020000000000004e402bc21035c6634a": { | ||||||
| Valid: false, | ||||||
| Problems: []string{"None really..."}, | ||||||
| }, | ||||||
| }, | ||||||
| } | ||||||
|
|
||||||
| err := r.dump() | ||||||
| test.AssertNotError(t, err, "Failed to dump results") | ||||||
| } | ||||||
|
|
||||||
| func TestIsForbiddenDomain(t *testing.T) { | ||||||
| // Note: These testcases are not an exhaustive representation of domains | ||||||
| // Boulder won't issue for, but are instead testing the defense-in-depth | ||||||
|
|
||||||
Uh oh!
There was an error while loading. Please reload this page.