Skip to content

Commit 818d6d3

Browse files
authored
fix(metrics): centralize abnormal_instance observe/clear at reconcile entry (#6840)
1 parent bfc8846 commit 818d6d3

16 files changed

Lines changed: 284 additions & 0 deletions

File tree

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
// Copyright 2024 PingCAP, Inc.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package common
16+
17+
import (
18+
"context"
19+
20+
coreutil "github.com/pingcap/tidb-operator/v2/pkg/apiutil/core/v1alpha1"
21+
"github.com/pingcap/tidb-operator/v2/pkg/metrics"
22+
"github.com/pingcap/tidb-operator/v2/pkg/runtime"
23+
"github.com/pingcap/tidb-operator/v2/pkg/runtime/scope"
24+
"github.com/pingcap/tidb-operator/v2/pkg/utils/task/v3"
25+
)
26+
27+
// TaskObserveInstance refreshes the AbnormalInstance gauge for the reconciled
28+
// instance every time the pipeline runs, and clears it when the instance CR
29+
// has been removed from the API server (state.Object() == nil).
30+
//
31+
// It mirrors the shape of TaskTrack so observe and clear live in one place:
32+
// the same branch that decides whether the object still exists also decides
33+
// whether to publish or retract the gauge. Placing this after TaskTrack and
34+
// before the `CondObjectHasBeenDeleted` IfBreak means it always runs once
35+
// per reconcile, including the reconcile triggered by the informer's DELETE
36+
// event - covering graceful delete, force-delete that strips finalizers, and
37+
// any other path that lands in the watch stream.
38+
func TaskObserveInstance[
39+
S scope.Instance[F, T],
40+
F Object[P],
41+
T runtime.Instance,
42+
P any,
43+
](state TrackState[F]) task.Task {
44+
return task.NameTaskFunc("ObserveInstance", func(context.Context) task.Result {
45+
obj := state.Object()
46+
if obj == nil {
47+
key := state.Key()
48+
// scope.Component[S]() is a compile-time constant for the reconcile
49+
// kind, so we can qualify the sweep even after the CR is gone and
50+
// its labels are no longer readable.
51+
metrics.ClearInstanceConditionMetricsByKey(key.Namespace, scope.Component[S](), key.Name)
52+
return task.Complete().With("cleared metrics for deleted %s", key)
53+
}
54+
conds := coreutil.StatusConditions[S](obj)
55+
metrics.ObserveConditions(obj, conds)
56+
return task.Complete().With("observed metrics for %s/%s", obj.GetNamespace(), obj.GetName())
57+
})
58+
}
Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
// Copyright 2024 PingCAP, Inc.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package common
16+
17+
import (
18+
"context"
19+
"testing"
20+
21+
"github.com/prometheus/client_golang/prometheus"
22+
dto "github.com/prometheus/client_model/go"
23+
"github.com/stretchr/testify/assert"
24+
"github.com/stretchr/testify/require"
25+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
26+
27+
"github.com/pingcap/tidb-operator/api/v2/core/v1alpha1"
28+
"github.com/pingcap/tidb-operator/v2/pkg/metrics"
29+
"github.com/pingcap/tidb-operator/v2/pkg/runtime/scope"
30+
"github.com/pingcap/tidb-operator/v2/pkg/utils/fake"
31+
"github.com/pingcap/tidb-operator/v2/pkg/utils/task/v3"
32+
)
33+
34+
// hasGaugeSample returns true if the AbnormalInstance gauge has any sample
35+
// whose (namespace, instance) labels match the inputs.
36+
func hasGaugeSample(t *testing.T, namespace, instance string) bool {
37+
t.Helper()
38+
ch := make(chan prometheus.Metric, 32)
39+
metrics.AbnormalInstance.Collect(ch)
40+
close(ch)
41+
for m := range ch {
42+
dm := &dto.Metric{}
43+
if err := m.Write(dm); err != nil {
44+
continue
45+
}
46+
labels := map[string]string{}
47+
for _, lp := range dm.GetLabel() {
48+
labels[lp.GetName()] = lp.GetValue()
49+
}
50+
if labels["namespace"] == namespace && labels["instance"] == instance {
51+
return true
52+
}
53+
}
54+
return false
55+
}
56+
57+
func TestTaskObserveInstance_ObservesConditions(t *testing.T) {
58+
const ns, name = "ns-observe", "pd-observe"
59+
defer metrics.ClearInstanceConditionMetricsByKey(ns, v1alpha1.LabelValComponentPD, name)
60+
61+
obj := fake.FakeObj(name, func(o *v1alpha1.PD) *v1alpha1.PD {
62+
o.Namespace = ns
63+
o.Labels = map[string]string{
64+
v1alpha1.LabelKeyCluster: "c",
65+
v1alpha1.LabelKeyComponent: "pd",
66+
v1alpha1.LabelKeyGroup: "g",
67+
}
68+
o.Status.Conditions = []metav1.Condition{
69+
{Type: v1alpha1.CondReady, Status: metav1.ConditionFalse},
70+
{Type: v1alpha1.CondSynced, Status: metav1.ConditionTrue},
71+
}
72+
return o
73+
})
74+
state := &fakeState[v1alpha1.PD]{ns: ns, name: name, obj: obj}
75+
76+
res, done := task.RunTask(context.Background(), TaskObserveInstance[scope.PD](state))
77+
require.Equal(t, task.SComplete, res.Status())
78+
require.False(t, done)
79+
assert.True(t, hasGaugeSample(t, ns, name),
80+
"ObserveInstance must write at least one series for the instance")
81+
}
82+
83+
func TestTaskObserveInstance_ClearsOnMissingObject(t *testing.T) {
84+
const ns, name = "ns-clear", "pd-clear"
85+
86+
// Pre-populate a series as if a previous reconcile had observed the instance.
87+
seed := fake.FakeObj(name, func(o *v1alpha1.PD) *v1alpha1.PD {
88+
o.Namespace = ns
89+
o.Labels = map[string]string{
90+
v1alpha1.LabelKeyCluster: "c",
91+
v1alpha1.LabelKeyComponent: "pd",
92+
v1alpha1.LabelKeyGroup: "g",
93+
}
94+
return o
95+
})
96+
metrics.ObserveConditions(seed, []metav1.Condition{
97+
{Type: v1alpha1.CondReady, Status: metav1.ConditionFalse},
98+
})
99+
require.True(t, hasGaugeSample(t, ns, name), "test precondition: seed series exists")
100+
101+
// Simulate a reconcile where TaskContextObject saw NotFound.
102+
state := &fakeState[v1alpha1.PD]{ns: ns, name: name, obj: nil}
103+
104+
res, done := task.RunTask(context.Background(), TaskObserveInstance[scope.PD](state))
105+
require.Equal(t, task.SComplete, res.Status())
106+
require.False(t, done)
107+
assert.False(t, hasGaugeSample(t, ns, name),
108+
"ObserveInstance must clear every series for the instance when the object is gone")
109+
}

pkg/controllers/pd/builder.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ func (r *Reconciler) NewRunner(state *tasks.ReconcileContext, reporter task.Task
2626
// get pd
2727
common.TaskContextObject[scope.PD](state, r.Client),
2828
common.TaskTrack[scope.PD](state, r.Tracker),
29+
// refresh the abnormal_instance gauge, or clear it if the CR is gone
30+
common.TaskObserveInstance[scope.PD](state),
2931
// if it's gone just return
3032
task.IfBreak(common.CondObjectHasBeenDeleted[scope.PD](state)),
3133

pkg/controllers/resourcemanager/builder.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@ func (r *Reconciler) NewRunner(state *tasks.ReconcileContext, reporter task.Task
2525
runner := task.NewTaskRunner(reporter,
2626
common.TaskContextObject[scope.ResourceManager](state, r.Client),
2727
common.TaskTrack[scope.ResourceManager](state, r.Tracker),
28+
// refresh the abnormal_instance gauge, or clear it if the CR is gone
29+
common.TaskObserveInstance[scope.ResourceManager](state),
2830
task.IfBreak(common.CondObjectHasBeenDeleted[scope.ResourceManager](state)),
2931

3032
task.IfBreak(common.CondObjectIsDeleting[scope.ResourceManager](state),

pkg/controllers/router/builder.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@ func (r *Reconciler) NewRunner(state *tasks.ReconcileContext, reporter task.Task
2525
runner := task.NewTaskRunner(reporter,
2626
common.TaskContextObject[scope.Router](state, r.Client),
2727
common.TaskTrack[scope.Router](state, r.Tracker),
28+
// refresh the abnormal_instance gauge, or clear it if the CR is gone
29+
common.TaskObserveInstance[scope.Router](state),
2830
task.IfBreak(common.CondObjectHasBeenDeleted[scope.Router](state)),
2931

3032
task.IfBreak(common.CondObjectIsDeleting[scope.Router](state),

pkg/controllers/scheduler/builder.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ func (r *Reconciler) NewRunner(state *tasks.ReconcileContext, reporter task.Task
2626
// get scheduler
2727
common.TaskContextObject[scope.Scheduler](state, r.Client),
2828
common.TaskTrack[scope.Scheduler](state, r.Tracker),
29+
// refresh the abnormal_instance gauge, or clear it if the CR is gone
30+
common.TaskObserveInstance[scope.Scheduler](state),
2931
// if it's gone just return
3032
task.IfBreak(common.CondObjectHasBeenDeleted[scope.Scheduler](state)),
3133

pkg/controllers/scheduling/builder.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ func (r *Reconciler) NewRunner(state *tasks.ReconcileContext, reporter task.Task
2626
// get scheduling
2727
common.TaskContextObject[scope.Scheduling](state, r.Client),
2828
common.TaskTrack[scope.Scheduling](state, r.Tracker),
29+
// refresh the abnormal_instance gauge, or clear it if the CR is gone
30+
common.TaskObserveInstance[scope.Scheduling](state),
2931
// if it's gone just return
3032
task.IfBreak(common.CondObjectHasBeenDeleted[scope.Scheduling](state)),
3133

pkg/controllers/ticdc/builder.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ func (r *Reconciler) NewRunner(state *tasks.ReconcileContext, reporter task.Task
2626
// get TiCDC
2727
common.TaskContextObject[scope.TiCDC](state, r.Client),
2828
common.TaskTrack[scope.TiCDC](state, r.Tracker),
29+
// refresh the abnormal_instance gauge, or clear it if the CR is gone
30+
common.TaskObserveInstance[scope.TiCDC](state),
2931
// if it's deleted just return
3032
task.IfBreak(common.CondObjectHasBeenDeleted[scope.TiCDC](state)),
3133

pkg/controllers/tidb/builder.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@ func (r *Reconciler) NewRunner(state *tasks.ReconcileContext, reporter task.Task
3131
// get tidb
3232
common.TaskContextObject[scope.TiDB](state, r.Client),
3333
common.TaskTrack[scope.TiDB](state, r.Tracker),
34+
// refresh the abnormal_instance gauge, or clear it if the CR is gone
35+
common.TaskObserveInstance[scope.TiDB](state),
3436
tasks.TaskRegisterForAdoption(state, r.AdoptManager),
3537
// if it's deleted just return
3638
task.IfBreak(common.CondObjectHasBeenDeleted[scope.TiDB](state)),

pkg/controllers/tiflash/builder.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@ func (r *Reconciler) NewRunner(state *tasks.ReconcileContext, reporter task.Task
2828
// Get tiflash
2929
common.TaskContextObject[scope.TiFlash](state, r.Client),
3030
common.TaskTrack[scope.TiFlash](state, r.Tracker),
31+
// refresh the abnormal_instance gauge, or clear it if the CR is gone
32+
common.TaskObserveInstance[scope.TiFlash](state),
3133
// if it's deleted just return
3234
task.IfBreak(common.CondObjectHasBeenDeleted[scope.TiFlash](state), tasks.TaskDeregisterTiFlashClient(state, r.TiFlashClientManager)),
3335

0 commit comments

Comments
 (0)