Skip to content

Commit 8229389

Browse files
tennix and claude committed
feat: pause DDL during TiDB version upgrade (smooth upgrade)
Introduce /upgrade/start and /upgrade/finish lifecycle hooks in the TiDBGroup reconciliation loop, mirroring TiUP's smooth upgrade behavior.

- Before the rolling upgrade begins, call POST /upgrade/start on a healthy TiDB instance to pause DDL (global for Dedicated, keyspace-scoped for Premium / TiDB Worker).
- After all instances reach the new version, call POST /upgrade/finish to resume DDL.
- Guards: only fires when spec.version changes AND both source and target versions support smooth upgrade (>= v7.5.0); no-op for scale/config changes.
- The annotation tidb.core.pingcap.com/smooth-upgrade-phase tracks in-flight state across operator restarts.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 2b81667 commit 8229389

9 files changed

Lines changed: 702 additions & 0 deletions

File tree

api/core/v1alpha1/tidb_types.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,13 @@ const (
5151
TiDBGroupAvailableReason = "TiDBGroupAvailable"
5252
)
5353

54+
// Annotation key/value pair used to mark a TiDBGroup whose smooth upgrade
// (DDL pause) has been started but not yet finished.
const (
	// AnnoKeySmoothUpgradePhase is the annotation key placed on a TiDBGroup
	// for the duration of a smooth upgrade.
	AnnoKeySmoothUpgradePhase = "tidb.core.pingcap.com/smooth-upgrade-phase"

	// AnnoValSmoothUpgradePhaseInProgress is the value stored under
	// AnnoKeySmoothUpgradePhase while the smooth upgrade is active.
	AnnoValSmoothUpgradePhaseInProgress = "in-progress"
)
60+
5461
// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
5562
// +kubebuilder:object:root=true
5663

pkg/compatibility/semver.go

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,19 @@ func (c *constraints) check(v *semver.Version) bool {
4242
return c.Check(v)
4343
}
4444

45+
// smoothUpgradeMinVersion is the minimum version that supports smooth upgrade DDL pause.
46+
var smoothUpgradeMinVersion = MustNewConstraints(">= 7.5.0")
47+
48+
// SupportsSmoothUpgrade returns true if the given version string supports the smooth upgrade
49+
// DDL pause/resume mechanism (requires TiDB >= v7.5.0).
50+
func SupportsSmoothUpgrade(version string) bool {
51+
v, err := semver.NewVersion(version)
52+
if err != nil {
53+
return false
54+
}
55+
return Check(v, smoothUpgradeMinVersion)
56+
}
57+
4558
func MustNewConstraints(expr string) Constraints {
4659
v, err := semver.NewConstraint(expr)
4760
if err != nil {

pkg/compatibility/semver_test.go

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,29 @@ import (
2121
"github.com/stretchr/testify/assert"
2222
)
2323

24+
func TestSupportsSmoothUpgrade(t *testing.T) {
25+
cases := []struct {
26+
version string
27+
want bool
28+
}{
29+
{"v7.5.0", true},
30+
{"v7.5.1", true},
31+
{"v8.0.0", true},
32+
{"v7.5.0-alpha", false}, // pre-release of 7.5.0 is before 7.5.0
33+
{"v7.5.1-alpha", true}, // pre-release of 7.5.1 is after 7.5.0
34+
{"v7.4.99", false},
35+
{"v7.4.0", false},
36+
{"v6.0.0", false},
37+
{"invalid", false},
38+
{"", false},
39+
}
40+
for _, c := range cases {
41+
t.Run(c.version, func(tt *testing.T) {
42+
assert.Equal(tt, c.want, SupportsSmoothUpgrade(c.version))
43+
})
44+
}
45+
}
46+
2447
func TestCheck(t *testing.T) {
2548
cases := []struct {
2649
desc string

pkg/controllers/tidbgroup/builder.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,13 +58,15 @@ func (r *Reconciler) NewRunner(state *tasks.ReconcileContext, reporter task.Task
5858
),
5959

6060
tasks.TaskService(state, r.Client),
61+
tasks.TaskSmoothUpgradeStart(state, r.Client),
6162
tasks.TaskUpdater(state, r.Client, r.AllocateFactory, r.AdoptManager),
6263
common.TaskGroupStatusSelector[scope.TiDBGroup](state),
6364
common.TaskGroupConditionSuspended[scope.TiDBGroup](state),
6465
common.TaskGroupConditionReady[scope.TiDBGroup](state),
6566
common.TaskGroupConditionSynced[scope.TiDBGroup](state),
6667
common.TaskStatusRevisionAndReplicas[scope.TiDBGroup](state),
6768
tasks.TaskStatusAvailable(state),
69+
tasks.TaskSmoothUpgradeFinish(state, r.Client),
6870
common.TaskStatusPersister[scope.TiDBGroup](state, r.Client),
6971
)
7072

Lines changed: 179 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,179 @@
1+
// Copyright 2024 PingCAP, Inc.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package tasks
16+
17+
import (
18+
"context"
19+
"encoding/json"
20+
"fmt"
21+
"time"
22+
23+
"k8s.io/apimachinery/pkg/types"
24+
25+
"github.com/pingcap/tidb-operator/api/v2/core/v1alpha1"
26+
"github.com/pingcap/tidb-operator/v2/pkg/apicall"
27+
coreutil "github.com/pingcap/tidb-operator/v2/pkg/apiutil/core/v1alpha1"
28+
"github.com/pingcap/tidb-operator/v2/pkg/client"
29+
"github.com/pingcap/tidb-operator/v2/pkg/compatibility"
30+
"github.com/pingcap/tidb-operator/v2/pkg/runtime/scope"
31+
tidbapi "github.com/pingcap/tidb-operator/v2/pkg/tidbapi/v1"
32+
"github.com/pingcap/tidb-operator/v2/pkg/utils/task/v3"
33+
)
34+
35+
const (
	// smoothUpgradeRequestTimeout is the timeout handed to the TiDB HTTP
	// client built for smooth upgrade calls.
	smoothUpgradeRequestTimeout time.Duration = 10 * time.Second
	// smoothUpgradeRetryInterval is the delay requested before a smooth
	// upgrade task is retried.
	smoothUpgradeRetryInterval time.Duration = 10 * time.Second
)
39+
40+
// tidbClientFactory creates a TiDB HTTP client for the given instance.
41+
// Accepting this as a parameter enables test injection without changing task semantics.
42+
type tidbClientFactory func(ctx context.Context, c client.Client, ck *v1alpha1.Cluster, tidb *v1alpha1.TiDB) (tidbapi.TiDBClient, error)
43+
44+
// TaskSmoothUpgradeStart calls /upgrade/start on a healthy TiDB instance before rolling upgrade begins.
45+
// It is a no-op when the change is not a version upgrade, or when either the source or target version
46+
// does not support smooth upgrade (< v7.5.0).
47+
func TaskSmoothUpgradeStart(state *ReconcileContext, c client.Client) task.Task {
48+
return taskSmoothUpgradeStart(state, c, newTiDBClientForGroup)
49+
}
50+
51+
func taskSmoothUpgradeStart(state *ReconcileContext, c client.Client, factory tidbClientFactory) task.Task {
52+
return task.NameTaskFunc("SmoothUpgradeStart", func(ctx context.Context) task.Result {
53+
dbg := state.TiDBGroup()
54+
55+
if !needVersionUpgrade(dbg) {
56+
return task.Complete().With("not a version upgrade, skipping smooth upgrade start")
57+
}
58+
if !compatibility.SupportsSmoothUpgrade(dbg.Status.Version) ||
59+
!compatibility.SupportsSmoothUpgrade(dbg.Spec.Template.Spec.Version) {
60+
return task.Complete().With("version does not support smooth upgrade, skipping")
61+
}
62+
if dbg.Annotations[v1alpha1.AnnoKeySmoothUpgradePhase] == v1alpha1.AnnoValSmoothUpgradePhaseInProgress {
63+
return task.Complete().With("smooth upgrade already started")
64+
}
65+
66+
tidb := pickReadyTiDB(state.TiDBSlice())
67+
if tidb == nil {
68+
return task.Retry(smoothUpgradeRetryInterval).With("no ready TiDB instance available for upgrade/start")
69+
}
70+
71+
tidbClient, err := factory(ctx, c, state.Cluster(), tidb)
72+
if err != nil {
73+
return task.Retry(smoothUpgradeRetryInterval).With("cannot create TiDB client for upgrade/start: %v", err)
74+
}
75+
76+
if err := tidbClient.UpgradeStart(ctx, dbg.Spec.Template.Spec.Keyspace); err != nil {
77+
return task.Retry(smoothUpgradeRetryInterval).With("upgrade/start failed, will retry: %v", err)
78+
}
79+
80+
phase := v1alpha1.AnnoValSmoothUpgradePhaseInProgress
81+
if err := patchSmoothUpgradeAnnotation(ctx, c, dbg, &phase); err != nil {
82+
return task.Fail().With("failed to set smooth upgrade annotation: %w", err)
83+
}
84+
85+
return task.Complete().With("smooth upgrade started, DDL paused")
86+
})
87+
}
88+
89+
// TaskSmoothUpgradeFinish calls /upgrade/finish on a healthy TiDB instance after all pods are upgraded.
90+
// It must run after TaskStatusRevisionAndReplicas so that dbg.Status.Version reflects the new version,
91+
// making needVersionUpgrade() return false as the "all done" signal.
92+
func TaskSmoothUpgradeFinish(state *ReconcileContext, c client.Client) task.Task {
93+
return taskSmoothUpgradeFinish(state, c, newTiDBClientForGroup)
94+
}
95+
96+
func taskSmoothUpgradeFinish(state *ReconcileContext, c client.Client, factory tidbClientFactory) task.Task {
97+
return task.NameTaskFunc("SmoothUpgradeFinish", func(ctx context.Context) task.Result {
98+
dbg := state.TiDBGroup()
99+
100+
if dbg.Annotations[v1alpha1.AnnoKeySmoothUpgradePhase] != v1alpha1.AnnoValSmoothUpgradePhaseInProgress {
101+
return task.Complete().With("no smooth upgrade in progress")
102+
}
103+
if needVersionUpgrade(dbg) {
104+
return task.Complete().With("upgrade still in progress, finish not yet")
105+
}
106+
107+
tidb := pickReadyTiDB(state.TiDBSlice())
108+
if tidb == nil {
109+
return task.Retry(smoothUpgradeRetryInterval).With("no ready TiDB instance available for upgrade/finish")
110+
}
111+
112+
tidbClient, err := factory(ctx, c, state.Cluster(), tidb)
113+
if err != nil {
114+
return task.Retry(smoothUpgradeRetryInterval).With("cannot create TiDB client for upgrade/finish: %v", err)
115+
}
116+
117+
if err := tidbClient.UpgradeFinish(ctx); err != nil {
118+
return task.Retry(smoothUpgradeRetryInterval).With("upgrade/finish failed, will retry: %v", err)
119+
}
120+
121+
if err := patchSmoothUpgradeAnnotation(ctx, c, dbg, nil); err != nil {
122+
return task.Fail().With("failed to remove smooth upgrade annotation: %w", err)
123+
}
124+
125+
return task.Complete().With("smooth upgrade finished, DDL resumed")
126+
})
127+
}
128+
129+
// pickReadyTiDB returns the first TiDB instance that is in the Ready state.
130+
func pickReadyTiDB(dbs []*v1alpha1.TiDB) *v1alpha1.TiDB {
131+
for _, db := range dbs {
132+
if coreutil.IsReady[scope.TiDB](db) {
133+
return db
134+
}
135+
}
136+
return nil
137+
}
138+
139+
// newTiDBClientForGroup creates a TiDB HTTP client targeting the given TiDB instance.
140+
func newTiDBClientForGroup(ctx context.Context, c client.Client, ck *v1alpha1.Cluster, tidb *v1alpha1.TiDB) (tidbapi.TiDBClient, error) {
141+
url := coreutil.InstanceAdvertiseURL[scope.TiDB](ck, tidb, coreutil.TiDBStatusPort(tidb))
142+
if !coreutil.IsTLSClusterEnabled(ck) {
143+
return tidbapi.NewTiDBClient(url, smoothUpgradeRequestTimeout, nil), nil
144+
}
145+
tlsConfig, err := apicall.GetClientTLSConfig(ctx, c, ck)
146+
if err != nil {
147+
return nil, fmt.Errorf("cannot get TLS config: %w", err)
148+
}
149+
return tidbapi.NewTiDBClient(url, smoothUpgradeRequestTimeout, tlsConfig), nil
150+
}
151+
152+
type (
	// annotationPatch is the JSON body of a merge patch that touches only
	// object metadata.
	annotationPatch struct {
		Metadata annotationPatchMetadata `json:"metadata"`
	}

	// annotationPatchMetadata is the metadata portion of annotationPatch.
	annotationPatchMetadata struct {
		// ResourceVersion is sent along with the patch; presumably it serves
		// as an optimistic-concurrency precondition — confirm against the
		// API server's merge patch handling.
		ResourceVersion string `json:"resourceVersion"`
		// Annotations maps annotation keys to their new values. A nil
		// pointer is encoded as JSON null.
		Annotations map[string]*string `json:"annotations"`
	}
)
160+
161+
// patchSmoothUpgradeAnnotation sets (value non-nil) or deletes (value nil) the smooth upgrade annotation.
162+
func patchSmoothUpgradeAnnotation(ctx context.Context, c client.Client, dbg *v1alpha1.TiDBGroup, value *string) error {
163+
p := annotationPatch{
164+
Metadata: annotationPatchMetadata{
165+
ResourceVersion: dbg.GetResourceVersion(),
166+
Annotations: map[string]*string{
167+
v1alpha1.AnnoKeySmoothUpgradePhase: value,
168+
},
169+
},
170+
}
171+
data, err := json.Marshal(&p)
172+
if err != nil {
173+
return fmt.Errorf("invalid patch: %w", err)
174+
}
175+
if err := c.Patch(ctx, dbg, client.RawPatch(types.MergePatchType, data)); err != nil {
176+
return fmt.Errorf("cannot patch smooth upgrade annotation on %s/%s: %w", dbg.Namespace, dbg.Name, err)
177+
}
178+
return nil
179+
}

0 commit comments

Comments
 (0)