|
| 1 | +// Unless explicitly stated otherwise all files in this repository are licensed |
| 2 | +// under the Apache License Version 2.0. |
| 3 | +// This product includes software developed at Datadog (https://www.datadoghq.com/). |
| 4 | +// Copyright 2016-present Datadog, Inc. |
| 5 | + |
| 6 | +package metrics |
| 7 | + |
| 8 | +import ( |
| 9 | + "github.com/prometheus/client_golang/prometheus" |
| 10 | + "sigs.k8s.io/controller-runtime/pkg/metrics" |
| 11 | +) |
| 12 | + |
// Values accepted by the "reason" label of TaintTimeoutsTotal.
const (
	// UntaintTimeoutReasonReadiness is reported when an agent pod was present
	// on the node but did not turn Ready before --untaintControllerTimeout
	// elapsed.
	UntaintTimeoutReasonReadiness = "readiness"

	// UntaintTimeoutReasonScheduling is reported when no agent pod got
	// scheduled on the node before --untaintControllerSchedulingTimeout
	// elapsed.
	UntaintTimeoutReasonScheduling = "scheduling"
)

// Values accepted by the "policy" label of TaintTimeoutsTotal.
const (
	// UntaintTimeoutPolicyRemove means the node is untainted even though the
	// agent is not ready.
	UntaintTimeoutPolicyRemove = "remove"

	// UntaintTimeoutPolicyKeep means the taint stays on the node while
	// observability signals are still emitted.
	UntaintTimeoutPolicyKeep = "keep"
)
| 27 | + |
| 28 | +var ( |
| 29 | + // TaintRemovalsTotal is the total number of taints removed from nodes. |
| 30 | + TaintRemovalsTotal = prometheus.NewCounter( |
| 31 | + prometheus.CounterOpts{ |
| 32 | + Subsystem: untaintSubsystem, |
| 33 | + Name: "taint_removals_total", |
| 34 | + Help: "Total number of taints removed from nodes", |
| 35 | + }, |
| 36 | + ) |
| 37 | + |
| 38 | + // TaintRemovalLatency is the time between agent pod becoming Ready and taint removal. |
| 39 | + TaintRemovalLatency = prometheus.NewHistogram( |
| 40 | + prometheus.HistogramOpts{ |
| 41 | + Subsystem: untaintSubsystem, |
| 42 | + Name: "taint_removal_latency_seconds", |
| 43 | + Help: "Time between agent pod becoming Ready and taint removal from the node", |
| 44 | + Buckets: prometheus.DefBuckets, |
| 45 | + }, |
| 46 | + ) |
| 47 | + |
| 48 | + // TaintTimeoutsTotal counts timeout decisions broken down by reason and policy. |
| 49 | + TaintTimeoutsTotal = prometheus.NewCounterVec( |
| 50 | + prometheus.CounterOpts{ |
| 51 | + Subsystem: untaintSubsystem, |
| 52 | + Name: "taint_timeouts_total", |
| 53 | + Help: "Total number of untaint-controller timeout decisions, by reason and policy", |
| 54 | + }, |
| 55 | + []string{"reason", "policy"}, |
| 56 | + ) |
| 57 | + |
| 58 | + // TaintRemovalErrorsTotal counts hard errors encountered while attempting to |
| 59 | + // remove the taint (apiserver Patch failures, JSON marshal failures, …). |
| 60 | + // Benign optimistic-concurrency races (IsConflict/IsInvalid) are NOT counted |
| 61 | + // here — they're handled by requeueing. Inspect the operator's ERROR-level |
| 62 | + // logs for the specific failure cause. |
| 63 | + TaintRemovalErrorsTotal = prometheus.NewCounter( |
| 64 | + prometheus.CounterOpts{ |
| 65 | + Subsystem: untaintSubsystem, |
| 66 | + Name: "taint_removal_errors_total", |
| 67 | + Help: "Total number of errors encountered while attempting to remove the agent-not-ready taint from a node", |
| 68 | + }, |
| 69 | + ) |
| 70 | +) |
| 71 | + |
| 72 | +func init() { |
| 73 | + metrics.Registry.MustRegister(TaintRemovalsTotal) |
| 74 | + metrics.Registry.MustRegister(TaintRemovalLatency) |
| 75 | + metrics.Registry.MustRegister(TaintTimeoutsTotal) |
| 76 | + metrics.Registry.MustRegister(TaintRemovalErrorsTotal) |
| 77 | +} |
0 commit comments