Skip to content

Commit dbb0214

Browse files
authored
feat: support prechecking down peers before restarting tikv pod (#6877) (#6882)
1 parent 72613a0 commit dbb0214

15 files changed

Lines changed: 646 additions & 131 deletions

File tree

api/core/v1alpha1/tikv_types.go

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -45,9 +45,12 @@ const (
4545
// condition
4646
TiKVCondLeadersEvicted = "LeadersEvicted"
4747
// reason
48-
ReasonNotEvicted = "NotEvicted"
49-
ReasonEvicting = "Evicting"
50-
ReasonEvicted = "Evicted"
48+
ReasonNotEvicted = "NotEvicted"
49+
ReasonEvicting = "Evicting"
50+
ReasonEvicted = "Evicted"
51+
ReasonStoreNotExist = "StoreNotExist"
52+
53+
// Deprecated: replaced by ReasonStoreNotExist
5154
ReasonStoreIsRemoved = "StoreIsRemoved"
5255
)
5356

pkg/controllers/tikv/builder.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ func (r *Reconciler) NewRunner(state *tasks.ReconcileContext, reporter task.Task
8888
tasks.TaskOfflineStore(state, r.PDClientManager),
8989
tasks.TaskConfigMap(state, r.Client),
9090
common.TaskPVC[scope.TiKV](state, r.Client, r.VolumeModifierFactory, tasks.PVCNewer()),
91-
tasks.TaskPod(state, r.Client),
91+
tasks.TaskPod(state, r.Client, r.PDClientManager),
9292
tasks.TaskStoreLabels(state, r.Client, r.PDClientManager),
9393
tasks.TaskEvictLeader(state, r.PDClientManager),
9494
common.TaskInstanceConditionSynced[scope.TiKV](state),

pkg/controllers/tikv/tasks/ctx.go

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,11 +32,13 @@ import (
3232
type ReconcileContext struct {
3333
State
3434

35-
LeaderEvicting bool
35+
ShouldEvictLeader bool
36+
LeaderEvicting bool
3637

37-
Store *pdv1.Store
3838
PDSynced bool
3939

40+
Store *pdv1.Store
41+
4042
// IsStoreReady will be set only when pd is synced and the store is ok
4143
// It may be outdated so the tikv is healthy only when the pod is also available
4244
// If it's true and the pod is ready but not available,
@@ -97,6 +99,7 @@ func TaskContextInfoFromPD(state *ReconcileContext, cm pdm.PDClientManager) task
9799
if coreutil.ShouldSuspendCompute(state.Cluster()) {
98100
return task.Complete().With("cluster is suspending")
99101
}
102+
100103
scheduler, err := c.Underlay().GetEvictLeaderScheduler(ctx, state.Store.ID)
101104
if err != nil {
102105
return task.Fail().With("pd is unexpectedly crashed: %v", err)

pkg/controllers/tikv/tasks/evict_leader.go

Lines changed: 58 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,13 @@ package tasks
1616

1717
import (
1818
"context"
19+
"fmt"
1920

21+
"k8s.io/apimachinery/pkg/api/meta"
22+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
23+
24+
"github.com/pingcap/tidb-operator/api/v2/core/v1alpha1"
25+
pdv1 "github.com/pingcap/tidb-operator/v2/pkg/timanager/apis/pd/v1"
2026
pdm "github.com/pingcap/tidb-operator/v2/pkg/timanager/pd"
2127
"github.com/pingcap/tidb-operator/v2/pkg/utils/task/v3"
2228
)
@@ -27,26 +33,33 @@ func TaskEvictLeader(state *ReconcileContext, m pdm.PDClientManager) task.Task {
2733
if !ok {
2834
return task.Wait().With("wait if pd client is not registered")
2935
}
30-
switch {
31-
case !state.PDSynced:
32-
return task.Wait().With("pd is unsynced")
33-
case state.Store == nil:
36+
if state.Store == nil {
37+
if syncLeadersEvictedCond(state.TiKV(), nil, state.LeaderEvicting) {
38+
state.SetStatusChanged()
39+
}
3440
return task.Complete().With("store has been deleted or not created")
35-
case state.Instance().IsOffline() || state.IsPodTerminating():
36-
if !state.LeaderEvicting {
37-
if err := pc.Underlay().BeginEvictLeader(ctx, state.Store.ID); err != nil {
38-
return task.Fail().With("cannot add evict leader scheduler: %v", err)
39-
}
41+
}
42+
43+
if state.ShouldEvictLeader && !state.LeaderEvicting {
44+
if err := pc.Underlay().BeginEvictLeader(ctx, state.Store.ID); err != nil {
45+
return task.Fail().With("cannot add evict leader scheduler: %v", err)
4046
}
41-
return task.Complete().With("ensure evict leader scheduler exists")
42-
default:
43-
if state.LeaderEvicting {
44-
if err := pc.Underlay().EndEvictLeader(ctx, state.Store.ID); err != nil {
45-
return task.Fail().With("cannot remove evict leader scheduler: %v", err)
46-
}
47+
state.LeaderEvicting = true
48+
}
49+
50+
if state.LeaderEvicting && !state.ShouldEvictLeader {
51+
if err := pc.Underlay().EndEvictLeader(ctx, state.Store.ID); err != nil {
52+
return task.Fail().With("cannot remove evict leader scheduler: %v", err)
4753
}
48-
return task.Complete().With("ensure evict leader scheduler doesn't exist")
54+
state.LeaderEvicting = false
4955
}
56+
57+
needUpdate := syncLeadersEvictedCond(state.TiKV(), state.Store, state.LeaderEvicting)
58+
if needUpdate {
59+
state.SetStatusChanged()
60+
}
61+
62+
return task.Complete().With("sync evict leader scheduler, expected: %v, actual: %v", state.ShouldEvictLeader, state.LeaderEvicting)
5063
})
5164
}
5265

@@ -69,3 +82,32 @@ func TaskEndEvictLeader(state *ReconcileContext, m pdm.PDClientManager) task.Tas
6982
return task.Complete().With(msg)
7083
})
7184
}
85+
86+
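// syncLeadersEvictedCond sets the LeadersEvicted condition on tikv and returns whether the condition was changed. The condition is True when the store does not exist or when evicting and the store has no leaders left; otherwise it is False.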
87+
func syncLeadersEvictedCond(tikv *v1alpha1.TiKV, store *pdv1.Store, isEvicting bool) bool {
88+
status := metav1.ConditionFalse
89+
reason := v1alpha1.ReasonNotEvicted
90+
msg := "leaders are not all evicted"
91+
switch {
92+
case store == nil:
93+
status = metav1.ConditionTrue
94+
reason = v1alpha1.ReasonStoreNotExist
95+
msg = "store does not exist"
96+
case isEvicting && store.LeaderCount == 0:
97+
status = metav1.ConditionTrue
98+
reason = v1alpha1.ReasonEvicted
99+
msg = "all leaders are evicted"
100+
case isEvicting:
101+
status = metav1.ConditionFalse
102+
reason = v1alpha1.ReasonEvicting
103+
msg = fmt.Sprintf("not all leaders are evicted, still: %v", store.LeaderCount)
104+
}
105+
106+
return meta.SetStatusCondition(&tikv.Status.Conditions, metav1.Condition{
107+
Type: v1alpha1.TiKVCondLeadersEvicted,
108+
Status: status,
109+
ObservedGeneration: tikv.Generation,
110+
Reason: reason,
111+
Message: msg,
112+
})
113+
}
Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
// Copyright 2024 PingCAP, Inc.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package tasks
16+
17+
import (
18+
"context"
19+
"testing"
20+
21+
"github.com/stretchr/testify/assert"
22+
"github.com/stretchr/testify/require"
23+
"go.uber.org/mock/gomock"
24+
corev1 "k8s.io/api/core/v1"
25+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
26+
27+
"github.com/pingcap/tidb-operator/api/v2/core/v1alpha1"
28+
pdapi "github.com/pingcap/tidb-operator/v2/pkg/pdapi/v1"
29+
stateutil "github.com/pingcap/tidb-operator/v2/pkg/state"
30+
pdv1 "github.com/pingcap/tidb-operator/v2/pkg/timanager/apis/pd/v1"
31+
pdm "github.com/pingcap/tidb-operator/v2/pkg/timanager/pd"
32+
"github.com/pingcap/tidb-operator/v2/pkg/utils/fake"
33+
"github.com/pingcap/tidb-operator/v2/pkg/utils/task/v3"
34+
)
35+
36+
func TestTaskEvictLeader(t *testing.T) {
37+
cases := []struct {
38+
desc string
39+
state *ReconcileContext
40+
expectBegin bool
41+
expectEnd bool
42+
expectEvicting bool
43+
expectedStatus task.Status
44+
}{
45+
{
46+
desc: "begin evict leader when requested",
47+
state: &ReconcileContext{
48+
State: &state{
49+
tikv: fake.FakeObj[v1alpha1.TiKV]("aaa-xxx"),
50+
pod: fake.FakeObj[corev1.Pod]("aaa-tikv-xxx"),
51+
},
52+
ShouldEvictLeader: true,
53+
PDSynced: true,
54+
Store: &pdv1.Store{
55+
ID: "1",
56+
},
57+
},
58+
expectBegin: true,
59+
expectEvicting: true,
60+
expectedStatus: task.SComplete,
61+
},
62+
{
63+
desc: "end evict leader when annotation is absent",
64+
state: &ReconcileContext{
65+
State: &state{
66+
tikv: fake.FakeObj[v1alpha1.TiKV]("aaa-xxx"),
67+
pod: fake.FakeObj[corev1.Pod]("aaa-tikv-xxx"),
68+
},
69+
PDSynced: true,
70+
LeaderEvicting: true,
71+
Store: &pdv1.Store{
72+
ID: "1",
73+
},
74+
},
75+
expectEnd: true,
76+
expectEvicting: false,
77+
expectedStatus: task.SComplete,
78+
},
79+
{
80+
desc: "sync leaders evicted condition when store is absent",
81+
state: &ReconcileContext{
82+
State: &state{
83+
tikv: fake.FakeObj("aaa-xxx", func(obj *v1alpha1.TiKV) *v1alpha1.TiKV {
84+
obj.Generation = 3
85+
return obj
86+
}),
87+
pod: fake.FakeObj[corev1.Pod]("aaa-tikv-xxx"),
88+
},
89+
PDSynced: true,
90+
LeaderEvicting: true,
91+
Store: nil,
92+
},
93+
expectEvicting: true,
94+
expectedStatus: task.SComplete,
95+
},
96+
}
97+
98+
for i := range cases {
99+
c := &cases[i]
100+
t.Run(c.desc, func(tt *testing.T) {
101+
tt.Parallel()
102+
103+
ctrl := gomock.NewController(tt)
104+
mockPDClient := pdm.NewMockPDClient(ctrl)
105+
mockUnderlay := pdapi.NewMockPDClient(ctrl)
106+
mockPDClient.EXPECT().Underlay().Return(mockUnderlay).AnyTimes()
107+
if c.expectBegin {
108+
mockUnderlay.EXPECT().BeginEvictLeader(gomock.Any(), "1").Return(nil)
109+
}
110+
if c.expectEnd {
111+
mockUnderlay.EXPECT().EndEvictLeader(gomock.Any(), "1").Return(nil)
112+
}
113+
114+
s := c.state.State.(*state)
115+
s.IPDClient = &stubPDClientState{client: mockPDClient}
116+
117+
res, done := task.RunTask(context.Background(), TaskEvictLeader(c.state, nil))
118+
assert.Equal(tt, c.expectedStatus, res.Status())
119+
assert.False(tt, done)
120+
assert.Equal(tt, c.expectEvicting, c.state.LeaderEvicting)
121+
if c.state.Store == nil {
122+
cond := findCondition(c.state.TiKV().Status.Conditions, v1alpha1.TiKVCondLeadersEvicted)
123+
require.NotNil(tt, cond)
124+
assert.Equal(tt, metav1.ConditionTrue, cond.Status)
125+
assert.Equal(tt, v1alpha1.ReasonStoreNotExist, cond.Reason)
126+
}
127+
})
128+
}
129+
}
130+
131+
type stubPDClientState struct {
132+
client pdm.PDClient
133+
}
134+
135+
func (s *stubPDClientState) GetPDClient(pdm.PDClientManager) (pdm.PDClient, bool) {
136+
return s.client, true
137+
}
138+
139+
var _ stateutil.IPDClient = (*stubPDClientState)(nil)
140+
141+
type stubPDClientUnavailableState struct{}
142+
143+
func (s *stubPDClientUnavailableState) GetPDClient(pdm.PDClientManager) (pdm.PDClient, bool) {
144+
return nil, false
145+
}
146+
147+
var _ stateutil.IPDClient = (*stubPDClientUnavailableState)(nil)
148+
149+
func findCondition(conds []metav1.Condition, typ string) *metav1.Condition {
150+
for i := range conds {
151+
if conds[i].Type == typ {
152+
return &conds[i]
153+
}
154+
}
155+
return nil
156+
}

pkg/controllers/tikv/tasks/offline.go

Lines changed: 3 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,6 @@ import (
1818
"context"
1919
"time"
2020

21-
"k8s.io/apimachinery/pkg/api/meta"
22-
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
23-
"k8s.io/utils/ptr"
24-
25-
"github.com/pingcap/tidb-operator/api/v2/core/v1alpha1"
2621
coreutil "github.com/pingcap/tidb-operator/v2/pkg/apiutil/core/v1alpha1"
2722
"github.com/pingcap/tidb-operator/v2/pkg/controllers/common"
2823
"github.com/pingcap/tidb-operator/v2/pkg/runtime/scope"
@@ -47,18 +42,11 @@ func TaskOfflineStore(state *ReconcileContext, m pdm.PDClientManager) task.Task
4742
// If the store is nil, it means the store has been deleted or not created yet.
4843
// No need to check if leaders are evicted.
4944
if state.Store != nil && isOffline {
50-
var reason string
51-
beginTime := getBeginEvictLeaderTime(tikv)
52-
switch {
53-
case state.LeaderEvicting && state.GetLeaderCount() == 0:
54-
reason = "leaders have been all evicted"
55-
case beginTime != nil && beginTime.Add(defaultLeaderEvictTimeout).Before(time.Now()):
56-
reason = "leader eviction timeout"
57-
}
45+
state.ShouldEvictLeader = true
5846

59-
if reason == "" {
47+
if err := CheckTiKVLeadersEvictedOrTimeout(tikv, defaultLeaderEvictTimeout); err != nil {
6048
return task.Retry(defaultLeaderEvictTimeout+jitter).
61-
With("waiting for leaders evicted or timeout, current leader count: %d", state.GetLeaderCount())
49+
With("waiting for leaders evicted or timeout: %v", err)
6250
}
6351
}
6452

@@ -87,13 +75,3 @@ func TaskOfflineStore(state *ReconcileContext, m pdm.PDClientManager) task.Task
8775
return task.Complete().With("offline is completed or no need, spec.offline: %v", isOffline)
8876
})
8977
}
90-
91-
// getBeginEvictLeaderTime returns the time when the leader eviction started.
92-
// If the condition is not found or the status is not False, it returns nil.
93-
func getBeginEvictLeaderTime(tikv *v1alpha1.TiKV) *time.Time {
94-
cond := meta.FindStatusCondition(tikv.Status.Conditions, v1alpha1.TiKVCondLeadersEvicted)
95-
if cond != nil && cond.Status == metav1.ConditionFalse {
96-
return ptr.To(cond.LastTransitionTime.Time)
97-
}
98-
return nil
99-
}

0 commit comments

Comments
 (0)