Skip to content

Commit d7ab4fa

Browse files
authored
feat(metrics): add per-instance abnormal_instance gauge for stuck-state alerts (#6835)
1 parent 1ec456d commit d7ab4fa

5 files changed

Lines changed: 267 additions & 0 deletions

File tree

pkg/controllers/common/task_finalizer.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import (
2424
utilerr "k8s.io/apimachinery/pkg/util/errors"
2525

2626
"github.com/pingcap/tidb-operator/v2/pkg/client"
27+
"github.com/pingcap/tidb-operator/v2/pkg/metrics"
2728
"github.com/pingcap/tidb-operator/v2/pkg/runtime"
2829
"github.com/pingcap/tidb-operator/v2/pkg/runtime/scope"
2930
"github.com/pingcap/tidb-operator/v2/pkg/utils/k8s"
@@ -226,6 +227,12 @@ func TaskInstanceFinalizerDel[
226227
return task.Fail().With("cannot remove finalizer: %v", err)
227228
}
228229

230+
// Finalize succeeded; drop per-instance metric series so deleted
231+
// instances do not leave stale samples in Prometheus. Doing this here
232+
// (rather than as a separate task in every component builder) ensures
233+
// new instance controllers cannot forget the cleanup.
234+
metrics.ClearInstanceConditionMetrics(state.Object())
235+
229236
return task.Complete().With("finalizer is removed")
230237
})
231238
}

pkg/controllers/common/task_status.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import (
2424
"github.com/pingcap/tidb-operator/api/v2/core/v1alpha1"
2525
coreutil "github.com/pingcap/tidb-operator/v2/pkg/apiutil/core/v1alpha1"
2626
"github.com/pingcap/tidb-operator/v2/pkg/client"
27+
"github.com/pingcap/tidb-operator/v2/pkg/metrics"
2728
"github.com/pingcap/tidb-operator/v2/pkg/runtime"
2829
"github.com/pingcap/tidb-operator/v2/pkg/runtime/scope"
2930
"github.com/pingcap/tidb-operator/v2/pkg/utils/podutil"
@@ -220,6 +221,11 @@ func TaskInstanceConditionReady[
220221
state.SetStatusChanged()
221222
}
222223

224+
// Refresh the abnormal-instance gauge so a long-running Ready=False
225+
// (e.g. pod up but cannot serve) becomes alertable without any
226+
// per-component glue.
227+
metrics.ObserveCondition(instance, coreutil.StatusConditions[S](instance), v1alpha1.CondReady)
228+
223229
if !isReady {
224230
return task.Wait().With("instance is unready: %s", coreutil.SprintCondition(cond))
225231
}
@@ -280,6 +286,11 @@ func TaskInstanceConditionSynced[
280286
state.SetStatusChanged()
281287
}
282288

289+
// Refresh the abnormal-instance gauge so a long-running Synced=False
290+
// (rolling restart stuck, scale-in stuck, ...) becomes alertable
291+
// without any per-component glue.
292+
metrics.ObserveCondition(instance, coreutil.StatusConditions[S](instance), v1alpha1.CondSynced)
293+
283294
if !isSynced {
284295
return task.Wait().With("instance is unsynced: %s", coreutil.SprintCondition(cond))
285296
}

pkg/metrics/abnormal_instance.go

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
// Copyright 2024 PingCAP, Inc.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package metrics
16+
17+
import (
18+
"k8s.io/apimachinery/pkg/api/meta"
19+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
20+
"sigs.k8s.io/controller-runtime/pkg/client"
21+
22+
"github.com/pingcap/tidb-operator/api/v2/core/v1alpha1"
23+
)
24+
25+
// trackedConditions are the condition types observed by ObserveCondition and
26+
// cleared by ClearInstanceConditionMetrics. Add new types here when extending
27+
// the AbnormalInstance gauge to additional condition signals.
28+
var trackedConditions = []string{
	v1alpha1.CondSynced, // observed by TaskInstanceConditionSynced
	v1alpha1.CondReady,  // observed by TaskInstanceConditionReady
}
29+
30+
// instanceMetricBaseLabels reads the standard well-known labels carried by
31+
// every managed instance (LabelKeyCluster / LabelKeyComponent / LabelKeyGroup)
32+
// plus its namespace and name. The caller appends the per-condition label.
33+
func instanceMetricBaseLabels(obj client.Object) []string {
34+
ls := obj.GetLabels()
35+
return []string{
36+
obj.GetNamespace(),
37+
ls[v1alpha1.LabelKeyCluster],
38+
ls[v1alpha1.LabelKeyComponent],
39+
ls[v1alpha1.LabelKeyGroup],
40+
obj.GetName(),
41+
}
42+
}
43+
44+
// ObserveCondition writes 1 to the abnormal-instance gauge when the named
45+
// condition is False; 0 otherwise (True or absent are treated as healthy).
46+
// The series stays present so PromQL `for:` alerts can fire reliably without
47+
// gaps, and so dashboards never see missing samples for managed instances.
48+
//
49+
// condType must be one of trackedConditions so the finalize-time cleanup in
50+
// ClearInstanceConditionMetrics covers the same set of series this writes.
51+
func ObserveCondition(obj client.Object, conds []metav1.Condition, condType string) {
52+
labels := append(instanceMetricBaseLabels(obj), condType)
53+
value := 0.0
54+
if cond := meta.FindStatusCondition(conds, condType); cond != nil && cond.Status == metav1.ConditionFalse {
55+
value = 1
56+
}
57+
AbnormalInstance.WithLabelValues(labels...).Set(value)
58+
}
59+
60+
// ClearInstanceConditionMetrics removes every tracked-condition series for
61+
// the given instance.
62+
//
63+
// Called from TaskInstanceFinalizerDel after the finalizer is removed, so
64+
// every component that uses the standard finalize task is covered without
65+
// per-builder wiring. Component builders short-circuit the deletion path
66+
// with task.IfBreak around CondClusterIsDeleting / CondObjectIsDeleting, so
67+
// the normal TaskInstanceConditionSynced / TaskInstanceConditionReady tasks
68+
// (where ObserveCondition lives) never run during finalization; without
69+
// this explicit cleanup, the gauge series would stay present at its last
70+
// value forever, triggering false-positive `metric == 1 for: <duration>`
71+
// alerts on a non-existent instance and growing label cardinality across
72+
// each cluster lifecycle.
73+
func ClearInstanceConditionMetrics(obj client.Object) {
74+
base := instanceMetricBaseLabels(obj)
75+
for _, condType := range trackedConditions {
76+
AbnormalInstance.DeleteLabelValues(append(base, condType)...)
77+
}
78+
}
Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
// Copyright 2024 PingCAP, Inc.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package metrics
16+
17+
import (
18+
"testing"
19+
20+
"github.com/prometheus/client_golang/prometheus"
21+
dto "github.com/prometheus/client_model/go"
22+
"github.com/stretchr/testify/assert"
23+
"github.com/stretchr/testify/require"
24+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
25+
26+
"github.com/pingcap/tidb-operator/api/v2/core/v1alpha1"
27+
)
28+
29+
func newTiKVForMetricTest(name string) *v1alpha1.TiKV {
30+
return &v1alpha1.TiKV{
31+
ObjectMeta: metav1.ObjectMeta{
32+
Namespace: "test-ns",
33+
Name: name,
34+
Labels: map[string]string{
35+
v1alpha1.LabelKeyCluster: "test-cluster",
36+
v1alpha1.LabelKeyComponent: "tikv",
37+
v1alpha1.LabelKeyGroup: "test-group",
38+
},
39+
},
40+
}
41+
}
42+
43+
// abnormalGaugeValue reads the AbnormalInstance gauge for the given instance
44+
// + condition. Returns (value, present). present is false when the series has
45+
// not been touched (collector returns the default zero metric without a Gauge
46+
// payload set).
47+
func abnormalGaugeValue(t *testing.T, instance, condition string) (float64, bool) {
48+
t.Helper()
49+
g, err := AbnormalInstance.GetMetricWithLabelValues(
50+
"test-ns", "test-cluster", "tikv", "test-group", instance, condition,
51+
)
52+
require.NoError(t, err)
53+
m := &dto.Metric{}
54+
require.NoError(t, g.Write(m))
55+
if m.Gauge == nil {
56+
return 0, false
57+
}
58+
return m.Gauge.GetValue(), true
59+
}
60+
61+
func gaugeSeriesExists(t *testing.T, instance, condition string) bool {
62+
t.Helper()
63+
ch := make(chan prometheus.Metric, 16)
64+
AbnormalInstance.Collect(ch)
65+
close(ch)
66+
want := []string{"test-ns", "test-cluster", "tikv", "test-group", instance, condition}
67+
for m := range ch {
68+
dm := &dto.Metric{}
69+
if err := m.Write(dm); err != nil {
70+
continue
71+
}
72+
if labelsEqual(dm.GetLabel(), want) {
73+
return true
74+
}
75+
}
76+
return false
77+
}
78+
79+
func labelsEqual(got []*dto.LabelPair, wantValues []string) bool {
80+
if len(got) != len(InstanceAbnormalMetricLabels) {
81+
return false
82+
}
83+
want := map[string]string{}
84+
for i, k := range InstanceAbnormalMetricLabels {
85+
want[k] = wantValues[i]
86+
}
87+
for _, p := range got {
88+
if want[p.GetName()] != p.GetValue() {
89+
return false
90+
}
91+
}
92+
return true
93+
}
94+
95+
func TestObserveCondition(t *testing.T) {
96+
cases := []struct {
97+
name string
98+
condType string
99+
status metav1.ConditionStatus
100+
omitCond bool
101+
wantValue float64
102+
}{
103+
{name: "Synced=False sets 1", condType: v1alpha1.CondSynced, status: metav1.ConditionFalse, wantValue: 1},
104+
{name: "Synced=True sets 0", condType: v1alpha1.CondSynced, status: metav1.ConditionTrue, wantValue: 0},
105+
{name: "Synced absent treated as healthy", condType: v1alpha1.CondSynced, omitCond: true, wantValue: 0},
106+
{name: "Ready=False sets 1", condType: v1alpha1.CondReady, status: metav1.ConditionFalse, wantValue: 1},
107+
{name: "Ready=True sets 0", condType: v1alpha1.CondReady, status: metav1.ConditionTrue, wantValue: 0},
108+
}
109+
110+
for _, tc := range cases {
111+
t.Run(tc.name, func(t *testing.T) {
112+
instanceName := "tikv-" + tc.name
113+
defer AbnormalInstance.DeleteLabelValues(
114+
"test-ns", "test-cluster", "tikv", "test-group", instanceName, tc.condType,
115+
)
116+
117+
obj := newTiKVForMetricTest(instanceName)
118+
var conds []metav1.Condition
119+
if !tc.omitCond {
120+
conds = []metav1.Condition{{Type: tc.condType, Status: tc.status}}
121+
}
122+
ObserveCondition(obj, conds, tc.condType)
123+
124+
val, present := abnormalGaugeValue(t, instanceName, tc.condType)
125+
require.True(t, present)
126+
assert.InDelta(t, tc.wantValue, val, 1e-9)
127+
assert.True(t, gaugeSeriesExists(t, instanceName, tc.condType),
128+
"series must remain present so PromQL `for:` alerts have continuous samples")
129+
})
130+
}
131+
}
132+
133+
func TestClearInstanceConditionMetrics(t *testing.T) {
134+
name := "tikv-clear"
135+
obj := newTiKVForMetricTest(name)
136+
137+
// Pre-populate both tracked conditions.
138+
ObserveCondition(obj, []metav1.Condition{{Type: v1alpha1.CondSynced, Status: metav1.ConditionFalse}}, v1alpha1.CondSynced)
139+
ObserveCondition(obj, []metav1.Condition{{Type: v1alpha1.CondReady, Status: metav1.ConditionFalse}}, v1alpha1.CondReady)
140+
require.True(t, gaugeSeriesExists(t, name, v1alpha1.CondSynced))
141+
require.True(t, gaugeSeriesExists(t, name, v1alpha1.CondReady))
142+
143+
ClearInstanceConditionMetrics(obj)
144+
145+
assert.False(t, gaugeSeriesExists(t, name, v1alpha1.CondSynced),
146+
"Synced series must be removed on clear")
147+
assert.False(t, gaugeSeriesExists(t, name, v1alpha1.CondReady),
148+
"Ready series must be removed on clear")
149+
}

pkg/metrics/metrics.go

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,11 @@ import (
1919
"sigs.k8s.io/controller-runtime/pkg/metrics"
2020
)
2121

22+
// InstanceAbnormalMetricLabels is the canonical label order for the
23+
// per-instance abnormal-condition gauge. Keep in sync with WithLabelValues /
24+
// DeleteLabelValues callers.
25+
var InstanceAbnormalMetricLabels = []string{
	// Identity of the instance.
	"namespace",
	"cluster",
	"component",
	"group",
	"instance",
	// Condition type the sample refers to.
	"condition",
}
26+
2227
var (
2328

2429
// ControllerPanic is a counter to record the number of panics in the controller.
@@ -30,9 +35,26 @@ var (
3035
Help: "The total number of panics in the controller",
3136
}, []string{},
3237
)
38+
39+
// AbnormalInstance is 1 when the named condition on the instance is False
40+
// (abnormal), 0 otherwise. The series stays present while the operator
41+
// manages the instance and is removed only when the instance is finalized.
42+
//
43+
// Use `metric == 1` together with PromQL `for: <duration>` to alert on
44+
// instances stuck in an abnormal state, e.g. a rolling restart that cannot
45+
// converge or a pod that is up but cannot serve.
46+
AbnormalInstance = prometheus.NewGaugeVec(
47+
prometheus.GaugeOpts{
48+
Namespace: "tidb_operator",
49+
Name: "abnormal_instance",
50+
Help: "1 when the named condition on the instance is False, 0 otherwise. " +
51+
"Use `metric == 1` with PromQL `for: <duration>` to alert on stuck state.",
52+
}, InstanceAbnormalMetricLabels,
53+
)
3354
)
3455

3556
func init() {
3657
// Register custom metrics with the global prometheus registry
3758
metrics.Registry.MustRegister(ControllerPanic)
59+
metrics.Registry.MustRegister(AbnormalInstance)
3860
}

0 commit comments

Comments
 (0)