From 71ae34792b6588642deba0f2da057a67f8f5a358 Mon Sep 17 00:00:00 2001 From: tennix Date: Tue, 9 Jun 2026 18:18:54 +0800 Subject: [PATCH] Support TiDB smooth upgrade in v2 Add TiDB smooth-upgrade orchestration to the v2 TiDBGroup controller. Switch-controlled TiDB version upgrades now call /upgrade/start before mutating TiDB instances, persist controller-owned pause annotations, recover stale annotation state with idempotent finish, and call /upgrade/finish after the group is fully updated and ready. Extend the v2 TiDB API client with start/finish methods and document the design in an RFC.\n\nTested: go test ./pkg/tidbapi/v1 ./pkg/controllers/tidb ./pkg/controllers/tidb/tasks ./pkg/controllers/tidbgroup ./pkg/controllers/tidbgroup/tasks -count=1\nTested: go test ./pkg/controllers/... -count=1\nTested: git diff --check --- docs/rfcs/260609-tidb-smooth-upgrade.md | 380 ++++++++++++++++++ pkg/controllers/tidbgroup/builder.go | 1 + .../tidbgroup/tasks/smooth_upgrade.go | 360 +++++++++++++++++ .../tidbgroup/tasks/smooth_upgrade_test.go | 311 ++++++++++++++ pkg/controllers/tidbgroup/tasks/updater.go | 6 + pkg/tidbapi/v1/client.go | 79 +++- pkg/tidbapi/v1/client_test.go | 79 ++++ 7 files changed, 1211 insertions(+), 5 deletions(-) create mode 100644 docs/rfcs/260609-tidb-smooth-upgrade.md create mode 100644 pkg/controllers/tidbgroup/tasks/smooth_upgrade.go create mode 100644 pkg/controllers/tidbgroup/tasks/smooth_upgrade_test.go diff --git a/docs/rfcs/260609-tidb-smooth-upgrade.md b/docs/rfcs/260609-tidb-smooth-upgrade.md new file mode 100644 index 00000000000..dc03042fd04 --- /dev/null +++ b/docs/rfcs/260609-tidb-smooth-upgrade.md @@ -0,0 +1,380 @@ +# TiDB Smooth Upgrade + + +- [Release Signoff Checklist](#release-signoff-checklist) +- [Summary](#summary) +- [Motivation](#motivation) + - [Goals](#goals) + - [Non-Goals](#non-goals) +- [Proposal](#proposal) + - [User Stories](#user-stories) + - [Story 1: switch-controlled TiDB 
upgrade](#story-1-switch-controlled-tidb-upgrade) + - [Story 2: unsupported smooth-upgrade version pair](#story-2-unsupported-smooth-upgrade-version-pair) + - [Story 3: operator restart during upgrade](#story-3-operator-restart-during-upgrade) + - [Risks and Mitigations](#risks-and-mitigations) +- [Design Details](#design-details) + - [Current v2 Upgrade Flow](#current-v2-upgrade-flow) + - [TiDB HTTP API](#tidb-http-api) + - [Version Matrix](#version-matrix) + - [Unsupported](#unsupported) + - [Auto-supported, no HTTP switch needed](#auto-supported-no-http-switch-needed) + - [HTTP switch-controlled](#http-switch-controlled) + - [Detecting TiDB Version Upgrades](#detecting-tidb-version-upgrades) + - [Persisted Controller State](#persisted-controller-state) + - [Start Gate](#start-gate) + - [Finish Gate](#finish-gate) + - [API Client Changes](#api-client-changes) + - [Events and Conditions](#events-and-conditions) + - [Test Plan](#test-plan) + - [Unit Tests](#unit-tests) + - [E2E Tests](#e2e-tests) + - [Feature Gate](#feature-gate) +- [Drawbacks](#drawbacks) +- [Alternatives](#alternatives) + - [Store active state only in status conditions](#store-active-state-only-in-status-conditions) + - [Call TiDB smooth-upgrade APIs for every TiDB version change](#call-tidb-smooth-upgrade-apis-for-every-tidb-version-change) + - [Implement the gate in each TiDB instance controller](#implement-the-gate-in-each-tidb-instance-controller) + - [Use Upgrade Show as the reconciliation source of truth](#use-upgrade-show-as-the-reconciliation-source-of-truth) + + +## Release Signoff Checklist + +Items marked with (R) are required *prior to targeting to a release*. 
+ +- [ ] (R) This design doc has been discussed and approved +- [ ] (R) Test plan is in place + - [ ] (R) e2e tests in kind +- [ ] (R) Graduation criteria is in place if required +- [ ] (R) User-facing documentation has been created in [pingcap/docs-tidb-operator] + +## Summary + +TiDB Operator v2 upgrades TiDB by reconciling a `TiDBGroup` template version into one `TiDB` instance CR per TiDB server. The existing updater coordinates Kubernetes-level rolling changes, but it does not coordinate TiDB smooth-upgrade mode. Users still need to avoid user DDL manually during some TiDB binary upgrade windows. + +This RFC adds smooth-upgrade orchestration for TiDB Operator v2. For TiDB version pairs that require the TiDB HTTP switch, the `TiDBGroup` controller calls `POST /upgrade/start` before it creates, updates, or deletes any `TiDB` instance as part of the version rollout. After every managed TiDB instance reaches the target revision, target version, and Ready state, the controller calls `POST /upgrade/finish`. The active pause window is recorded in controller-owned `TiDBGroup` annotations so the controller can recover after restarts and always finish a pause window that it started. + +## Motivation + +TiDB smooth upgrade reduces DDL-related risk during TiDB binary upgrades. The TiDB HTTP API flips the cluster upgrade state. TiDB DDL then pauses non-system user DDL while allowing internal/system DDL, and resumes paused user jobs after finish. TiUP already uses this mechanism during `tiup cluster upgrade`; TiDB Operator should provide the same automated behavior for Kubernetes-managed TiDB clusters. + +### Goals + +- Pause user DDL automatically during eligible TiDB Server version upgrades managed by `TiDBGroup`. +- Call TiDB's bodyless `POST /upgrade/start` before the v2 updater performs the first version rollout action. +- Call `POST /upgrade/finish` after all `TiDB` instances in the group are updated and ready. 
+- Apply only to TiDB version upgrades, not scale-only, config-only, resource-only, metadata-only, suspend, adoption, or restart-only changes. +- Persist active pause state on `TiDBGroup` so operator restart does not lose the obligation to call `/upgrade/finish`. +- Preserve the existing rollout behavior for unsupported and no-switch-needed version pairs. +- Keep this behavior internal to the controller with no new required user-facing CRD field. + +### Non-Goals + +- Pausing DDL for PD, TiKV, TiFlash, TiCDC, TiProxy, TSO, Scheduling, ResourceManager, DM, Router, or BR components. +- Preventing every risky user operation described in TiDB smooth-upgrade limitations. TiDB enforces its own smooth-upgrade semantics after `/upgrade/start`. +- Changing v2's component upgrade ordering policy. +- Supporting downgrades with TiDB smooth-upgrade mode. +- Requiring `POST /upgrade/show` as a normal reconcile gate. + +## Proposal + +Add a smooth-upgrade gate to the `TiDBGroup` controller's updater task: + +1. Detect whether the current reconciliation is a TiDB version upgrade. +2. Classify the source and target version pair. +3. For switch-controlled pairs, call `/upgrade/start` and persist an active pause annotation before the updater mutates any `TiDB` instance for the rollout. +4. Let the existing updater perform the rolling change. +5. On later reconciliations, when all managed `TiDB` instances are updated and ready, call `/upgrade/finish` and clear the annotations. + +### User Stories + +#### Story 1: switch-controlled TiDB upgrade + +A user runs a `TiDBGroup` at `v7.5.0` and updates `spec.template.spec.version` to `v7.5.3`. The controller detects an eligible switch-controlled version upgrade. 
Before the updater changes any `TiDB` instance CR, the controller chooses a healthy TiDB endpoint and sends: + +```bash +curl -X POST http://{TiDB}:10080/upgrade/start +``` + +After TiDB returns HTTP 200, the controller records smooth-upgrade annotations on the `TiDBGroup` and proceeds with the existing rolling update. When every managed `TiDB` instance is on the target revision/version and Ready, the controller sends: + +```bash +curl -X POST http://{TiDB}:10080/upgrade/finish +``` + +After finish succeeds, the controller removes the annotations. + +#### Story 2: unsupported smooth-upgrade version pair + +A user upgrades from `v6.5.10` to `v8.1.0`. The version matrix marks this pair unsupported. The controller does not call `/upgrade/start` or `/upgrade/finish`, emits a warning event, and keeps the existing updater behavior. + +#### Story 3: operator restart during upgrade + +The controller calls `/upgrade/start`, persists annotations, and then restarts while TiDB instances are still rolling. When reconciliation resumes, the controller reads `core.pingcap.com/smooth-upgrade-ddl-paused: "true"` from the `TiDBGroup`, skips duplicate start, waits for rollout completion, calls `/upgrade/finish`, and clears the annotations. + +### Risks and Mitigations + +- **TiDB returns handler/session errors as HTTP 400**: keep HTTP status and response body in returned errors. Start failure must return retry and must not allow the updater to mutate any versioned `TiDB` instance. +- **`/upgrade/start` can take about 10 seconds**: use a 30-second client timeout for start. TiDB waits for the DDL owner to sync upgrading state before returning success, so a 200 start response is the safe point to begin rollout. +- **Operator crashes after `/upgrade/start` but before annotation persistence**: retrying start is acceptable because TiDB returns a successful duplicate-upgrading response. 
The controller should call start before patching annotations, then patch annotations before running the updater. +- **Annotation update conflicts**: patch only the smooth-upgrade annotation keys with retry-on-conflict semantics. Do not rewrite user annotations. +- **Manual annotation removal**: document the annotations as controller-owned. If annotations are missing, the controller cannot infer a previous successful start from Kubernetes state alone; users should not edit these keys. +- **Stale or mismatched annotations**: if active annotations refer to a different source/target pair, call `/upgrade/finish`, clear the annotations only after finish succeeds, emit a warning event, and requeue before starting the current upgrade. +- **Stalled rollout holds DDL pause**: keep annotations and do not call finish until the group is healthy and up to date. Users must fix the underlying pod/image/config issue to resume DDL safely. + +## Design Details + +### Current v2 Upgrade Flow + +`pkg/controllers/tidbgroup/tasks/TaskUpdater` is the natural boundary for smooth-upgrade start: + +- It checks whether a version upgrade is needed by comparing `dbg.Spec.Template.Spec.Version` with `dbg.Status.Version`. +- It blocks TiDB upgrades until the configured cluster upgrade policy allows TiDB to upgrade after dependent components. +- It waits for ready-but-not-available instances to satisfy `minReadySeconds`. +- It computes the desired group revision. +- It calls `pkg/updater` to create, update, or delete `TiDB` instance CRs according to surge, unavailable, topology, adoption, and restart rules. + +The finish gate should also live in the `TiDBGroup` controller because only the group controller has the complete group-level view of desired replicas, revisions, and all managed `TiDB` instances. The instance controller should continue to own per-instance health, pod reconciliation, and TiDB status API calls used for ordinary readiness. 
+ +### TiDB HTTP API + +TiDB exposes classic smooth-upgrade endpoints: + +```text +POST /upgrade/start +POST /upgrade/finish +POST /upgrade/show +``` + +This feature uses only: + +```text +POST /upgrade/start +POST /upgrade/finish +``` + +The calls are bodyless. TiDB returns HTTP 400 for non-POST requests and handler/session errors. Successful and idempotent responses are HTTP 200 with JSON string bodies such as: + +```text +"success!" +"It's a duplicated operation and the cluster is already in upgrading state." +"It's a duplicated operation and the cluster is already in normal state." +``` + +The v2 client must decode JSON string bodies before matching responses. It should treat success and duplicate-success bodies as success, and return errors containing HTTP status, response body, operation, and URL for non-200 or unexpected responses. + +### Version Matrix + +The controller classifies TiDB version upgrades into three categories. Any pair where `target <= source` is unsupported and must never call smooth-upgrade APIs. + +#### Unsupported + +- Source `< v7.1.0` to any target. +- Source `v7.1.0`, `v7.1.1`, `v7.2.0`, or `v7.3.0` to target `>= v7.4.0`. +- Any version pair that cannot be parsed as semantic TiDB versions. +- Any downgrade or same-version pair. +- Any pair not matched by the auto-supported or HTTP switch-controlled categories below. + +For unsupported pairs, do not call `/upgrade/start` or `/upgrade/finish`. Keep existing rollout behavior and emit a warning event. + +#### Auto-supported, no HTTP switch needed + +- `v7.1.0` to `v7.1.1`, `v7.2.0`, or `v7.3.0`. +- `v7.1.1` to `v7.2.0` or `v7.3.0`. +- `v7.2.0` to `v7.3.0`. + +For these pairs, do not call `/upgrade/start` or `/upgrade/finish`; smooth upgrade is automatic in TiDB. + +#### HTTP switch-controlled + +- Source in `[v7.1.2, v7.2.0)` to target in `[v7.1.2, v7.2.0)`. +- Source in `[v7.1.2, v7.2.0)` or `>= v7.4.0` to target `>= v7.4.0`. 
+ +Only these pairs require TiDB Operator v2 to call `/upgrade/start` and `/upgrade/finish`. + +### Detecting TiDB Version Upgrades + +In v2, the desired target is `dbg.Spec.Template.Spec.Version`. The source version should come from observed group state, not from the desired spec after the user edits it. + +Use this order: + +1. If `dbg.Status.Version` is non-empty, use it as the source version. +2. If status version is empty, derive source candidates from managed `TiDB` instances that do not have the update revision and use their `spec.version` only when all outdated candidates agree. +3. If the source cannot be determined or the versions cannot be parsed, classify the pair as unsupported for smooth-upgrade API calls. + +This logic must return "not a version upgrade" when the version is unchanged even if the group revision changes for config, resource, label, annotation, feature, or scheduling changes. + +### Persisted Controller State + +Use controller-owned annotations on the `TiDBGroup` object: + +```text +core.pingcap.com/smooth-upgrade-ddl-paused: "true" +core.pingcap.com/smooth-upgrade-source-version: "{source version}" +core.pingcap.com/smooth-upgrade-target-version: "{target version}" +core.pingcap.com/smooth-upgrade-started-at: "{RFC 3339 UTC timestamp}" +``` + +Rules: + +1. Set annotations only after `/upgrade/start` succeeds. +2. Treat `smooth-upgrade-ddl-paused=true` as meaning `/upgrade/start` has succeeded and `/upgrade/finish` is still owed. +3. Do not call `/upgrade/start` again while active annotations match the current source/target pair. +4. Remove all smooth-upgrade annotations only after `/upgrade/finish` succeeds. +5. If active annotations survive operator restart, continue waiting for rollout completion and then call finish. +6. If active annotations conflict with the current source/target pair, call `/upgrade/finish`, clear annotations after finish succeeds, emit a warning, and requeue. +7. 
Do not store this state in `TiDBGroup.spec.template.metadata.annotations`; it must not be propagated to `TiDB` instances. + +Status conditions may be added for visibility, but annotations are the durable workflow state. + +### Start Gate + +Add `ensureSmoothUpgradeStarted(ctx, state, client)` before `updater.New(...).Build().Do(ctx)` in `TaskUpdater`. + +The function should: + +1. Return immediately when the group does not need a version upgrade. +2. Determine `source` and `target`. +3. Classify the version pair. +4. For auto-supported pairs, proceed without HTTP calls. +5. For unsupported pairs, emit `SmoothUpgradeUnsupported` and proceed without HTTP calls. +6. If active annotations match the current source/target pair, proceed without duplicate start. +7. If active annotations are stale or mismatched, call finish, clear annotations, and return retry so the next reconcile starts from clean state. +8. Select one healthy `TiDB` instance endpoint. +9. Call `POST /upgrade/start` with a 30-second timeout. +10. If start fails or times out, return retry/fail before invoking the updater. +11. If start succeeds, patch the four annotations and continue to the updater. + +Healthy endpoint selection should prefer deterministic ordering, for example by sorted `TiDB` name. It must use only instances that are Ready and have a reachable status endpoint. + +### Finish Gate + +Add `maybeFinishSmoothUpgrade(ctx, state, client)` after the updater and before final status persistence in the `TiDBGroup` runner. It can also be called inside `TaskUpdater` after a no-op updater result, but it must run only after the state has enough information to prove group completion. + +The function should: + +1. Return immediately when the pause annotation is not active. +2. 
Verify the group has no pending version rollout: + - `dbg.Status.ObservedGeneration == dbg.Generation` + - `dbg.Status.Version == target` + - desired replicas equal status replicas + - ready replicas, updated replicas, and current replicas all equal desired replicas + - update revision equals current revision + - every managed `TiDB` instance is Ready + - every managed `TiDB` instance has `spec.version == target` +3. If the group is not complete, return wait/retry and keep annotations. +4. Select one healthy `TiDB` endpoint. +5. Call `POST /upgrade/finish`. +6. If finish fails, return retry/fail and keep annotations. +7. If finish succeeds, delete all smooth-upgrade annotations. + +The finish gate should tolerate TiDB's duplicate-normal response as success. This is required for recovery from client timeouts and stale annotations. + +### API Client Changes + +Extend `pkg/tidbapi/v1.TiDBClient`: + +```go +type TiDBClient interface { + GetHealth(ctx context.Context) (bool, error) + GetInfo(ctx context.Context) (*ServerInfo, error) + SetServerLabels(ctx context.Context, labels map[string]string) error + GetPoolStatus(ctx context.Context) (*PoolStatus, error) + Activate(ctx context.Context, keyspace string) error + StartUpgrade(ctx context.Context) error + FinishUpgrade(ctx context.Context) error +} +``` + +Implementation notes: + +- Add path constants for `upgrade/start` and `upgrade/finish`. +- Keep the call body empty. +- Decode TiDB's JSON string response before comparing with known success bodies. +- Use 30 seconds for `StartUpgrade`. This can be a dedicated client, a per-call timeout, or a small helper that preserves the same TLS transport behavior as `NewTiDBClient`. +- `FinishUpgrade` may use the normal TiDB request timeout, but accepting the same 30-second path is also acceptable. 
+ +The `TiDBGroup` controller needs a small helper to construct a `TiDBClient` for a selected `TiDB` instance, reusing the TLS setup already used by `pkg/controllers/tidb/tasks/TaskContextInfoFromPDAndTiDB`. + +### Events and Conditions + +Emit Kubernetes events on the `TiDBGroup`: + +- `SmoothUpgradeStarted` +- `SmoothUpgradeFinished` +- `SmoothUpgradeStartFailed` +- `SmoothUpgradeFinishFailed` +- `SmoothUpgradeUnsupported` +- `SmoothUpgradeSkipped` +- `SmoothUpgradeRecovered` + +Optionally add a `TiDBGroup` condition such as `SmoothUpgradePaused` for visibility. It must be derived from annotations and current reconciliation state, not used as the source of truth. + +### Test Plan + +#### Unit Tests + +- `pkg/tidbapi/v1/client_test.go` + - `StartUpgrade` sends bodyless `POST /upgrade/start`. + - `StartUpgrade` accepts `"success!"` and duplicate-upgrading HTTP 200 responses. + - `FinishUpgrade` sends bodyless `POST /upgrade/finish`. + - `FinishUpgrade` accepts `"success!"` and duplicate-normal HTTP 200 responses. + - HTTP 400, 5xx, client timeout, and unexpected response bodies return useful errors. + - JSON string response bodies are decoded before matching. + +- `pkg/controllers/tidbgroup/tasks/smooth_upgrade_test.go` + - Classify every row in the version matrix. + - Downgrades and same-version changes are unsupported. + - Invalid versions are unsupported. + - Config/resource/metadata-only revisions are not version upgrades. + - Read, write, clear, and patch only the four smooth-upgrade annotations. + - Active matching annotations skip duplicate start. + - Stale or mismatched annotations call finish, clear annotations, and requeue. + - Source version detection uses `status.version` first and falls back to outdated instances only when unambiguous. + +- `pkg/controllers/tidbgroup/tasks/updater_test.go` + - Switch-controlled version upgrade calls start before the updater mutates any `TiDB` instance. + - Start failure blocks rollout. 
+ - Annotation patch failure blocks rollout and retries safely. + - Auto-supported and unsupported pairs skip start. + - Finish runs only after all group status and instance readiness checks pass. + - Finish failure keeps annotations. + - Finish success removes annotations. + +#### E2E Tests + +- Deploy a switch-controlled source version, for example `v7.5.0`, with at least two TiDB replicas. +- Upgrade to a supported target, for example `v7.5.3`. +- Verify smooth-upgrade annotations appear before rollout completes. +- Verify source and target annotation values. +- Verify annotations are removed after all TiDB instances are upgraded and ready. +- Restart the operator after start and before finish; verify finish still happens and annotations are cleared. +- Upgrade an unsupported pair, for example `v6.5.10` to the latest supported image; verify no smooth-upgrade annotations are set and rollout still completes. + +### Feature Gate + +No feature gate. This behavior is internal, applies only to supported switch-controlled TiDB version upgrades, and preserves existing behavior for every unsupported or unrelated rollout. + +## Drawbacks + +- The `TiDBGroup` controller must update metadata in addition to status and instance reconciliation, which adds conflict handling. +- A stuck TiDB rollout keeps user DDL paused until the rollout is fixed and finish succeeds. +- Unsupported pairs still rely on users avoiding DDL manually; the operator can only warn and preserve the previous behavior. +- Endpoint selection and TiDB API calls add another dependency to the group updater path. + +## Alternatives + +### Store active state only in status conditions + +Status is observational and can be recalculated. After `/upgrade/start` succeeds, the controller owns a future `/upgrade/finish` obligation. Annotations on the main object are a better durable workflow marker. + +### Call TiDB smooth-upgrade APIs for every TiDB version change + +This is simpler but incorrect. 
Unsupported pairs could fail upgrades that currently work, and auto-supported pairs do not need HTTP switching. + +### Implement the gate in each TiDB instance controller + +Each instance controller sees only one `TiDB` instance and cannot reliably decide the group-level first rollout action or final group completion. The start and finish decisions belong in `TiDBGroup`. + +### Use Upgrade Show as the reconciliation source of truth + +`/upgrade/show` can help diagnostics, but live TiDB state alone does not record whether this operator started the pause window or which source/target pair it owns. Controller-owned annotations provide deterministic recovery. `/upgrade/show` can be added later as an optional defensive check. diff --git a/pkg/controllers/tidbgroup/builder.go b/pkg/controllers/tidbgroup/builder.go index e193ab0eb95..60768dee82a 100644 --- a/pkg/controllers/tidbgroup/builder.go +++ b/pkg/controllers/tidbgroup/builder.go @@ -65,6 +65,7 @@ func (r *Reconciler) NewRunner(state *tasks.ReconcileContext, reporter task.Task common.TaskGroupConditionSynced[scope.TiDBGroup](state), common.TaskStatusRevisionAndReplicas[scope.TiDBGroup](state), tasks.TaskStatusAvailable(state), + tasks.TaskFinishSmoothUpgrade(state, r.Client), common.TaskStatusPersister[scope.TiDBGroup](state, r.Client), ) diff --git a/pkg/controllers/tidbgroup/tasks/smooth_upgrade.go b/pkg/controllers/tidbgroup/tasks/smooth_upgrade.go new file mode 100644 index 00000000000..0cc5bd7abdc --- /dev/null +++ b/pkg/controllers/tidbgroup/tasks/smooth_upgrade.go @@ -0,0 +1,360 @@ +// Copyright 2024 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tasks
+
+import (
+	"cmp"
+	"context"
+	"crypto/tls"
+	"encoding/json"
+	"fmt"
+	"maps"
+	"slices"
+	"time"
+
+	"github.com/Masterminds/semver/v3"
+	"k8s.io/apimachinery/pkg/types"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+
+	"github.com/pingcap/tidb-operator/api/v2/core/v1alpha1"
+	"github.com/pingcap/tidb-operator/v2/pkg/apicall"
+	coreutil "github.com/pingcap/tidb-operator/v2/pkg/apiutil/core/v1alpha1"
+	operatorclient "github.com/pingcap/tidb-operator/v2/pkg/client"
+	"github.com/pingcap/tidb-operator/v2/pkg/runtime/scope"
+	"github.com/pingcap/tidb-operator/v2/pkg/tidbapi/v1"
+	"github.com/pingcap/tidb-operator/v2/pkg/utils/task/v3"
+)
+
+// Annotation keys written on the TiDBGroup object itself to remember that a
+// smooth-upgrade pause window was opened and still has to be closed.
+const (
+	// annSmoothUpgradeDDLPaused is "true" while a pause window is open.
+	annSmoothUpgradeDDLPaused = v1alpha1.AnnoKeyPrefix + "smooth-upgrade-ddl-paused"
+	// annSmoothUpgradeSourceVersion records the version the rollout started from.
+	annSmoothUpgradeSourceVersion = v1alpha1.AnnoKeyPrefix + "smooth-upgrade-source-version"
+	// annSmoothUpgradeTargetVersion records the version the rollout is moving to.
+	annSmoothUpgradeTargetVersion = v1alpha1.AnnoKeyPrefix + "smooth-upgrade-target-version"
+	// annSmoothUpgradeStartedAt records when the annotations were set (RFC 3339, UTC).
+	annSmoothUpgradeStartedAt = v1alpha1.AnnoKeyPrefix + "smooth-upgrade-started-at"
+
+	// smoothUpgradeRetryAfter is the requeue delay used after stale
+	// annotations have been cleared.
+	smoothUpgradeRetryAfter = 5 * time.Second
+)
+
+// smoothUpgradeSupport classifies a source/target version pair.
+type smoothUpgradeSupport string
+
+const (
+	// smoothUpgradeNotVersionUpgrade means the source is unknown or equals the target.
+	smoothUpgradeNotVersionUpgrade smoothUpgradeSupport = "NotVersionUpgrade"
+	// smoothUpgradeUnsupported means no smooth-upgrade call is made for the pair.
+	smoothUpgradeUnsupported smoothUpgradeSupport = "Unsupported"
+	// smoothUpgradeAutoSupported means the pair needs no explicit start/finish call.
+	smoothUpgradeAutoSupported smoothUpgradeSupport = "AutoSupportedNoop"
+	// smoothUpgradeSwitchControlled means start/finish must be called explicitly.
+	smoothUpgradeSwitchControlled smoothUpgradeSupport = "SwitchControlled"
+)
+
+// tidbClientFactory builds a TiDB API client for one TiDB instance.
+type tidbClientFactory func(context.Context, operatorclient.Client, *v1alpha1.Cluster, *v1alpha1.TiDB) (tidbapi.TiDBClient, error)
+
+// newSmoothUpgradeTiDBClient is the factory used by startSmoothUpgrade and
+// finishSmoothUpgrade. It is a package variable so it can be replaced.
+var newSmoothUpgradeTiDBClient tidbClientFactory = defaultSmoothUpgradeTiDBClient
+
+// ensureSmoothUpgradeStarted is the start gate of the smooth upgrade.
+//
+// It returns:
+//   - Complete when the change is not a version upgrade, when matching active
+//     annotations already exist, when the pair is auto-supported or
+//     unsupported, or after start succeeded and the annotations were persisted;
+//   - Retry after stale annotations (active, but for another source/target
+//     pair) were finished and cleared;
+//   - Fail when the finish call, the start call, or an annotation patch fails.
+func ensureSmoothUpgradeStarted(ctx context.Context, state *ReconcileContext, c operatorclient.Client) task.Result {
+	dbg := state.TiDBGroup()
+	updateRevision, _, _ := state.Revision()
+	source, target, support := detectSmoothUpgrade(dbg, state.TiDBSlice(), updateRevision)
+	if support == smoothUpgradeNotVersionUpgrade {
+		return task.Complete().With("smooth upgrade is not needed")
+	}
+
+	ann := smoothUpgradeAnnotations(dbg)
+	if ann.active {
+		if ann.source == source && ann.target == target {
+			return task.Complete().With("smooth upgrade already started")
+		}
+		// The recorded pause window belongs to another version pair: close it
+		// first and only drop the annotations once finish has succeeded.
+		if err := finishSmoothUpgrade(ctx, state, c); err != nil {
+			return task.Fail().With("cannot finish stale smooth upgrade: %w", err)
+		}
+		clearSmoothUpgradeAnnotations(dbg)
+		if err := patchSmoothUpgradeAnnotations(ctx, c, dbg, true); err != nil {
+			return task.Fail().With("cannot clear stale smooth-upgrade annotations: %w", err)
+		}
+		return task.Retry(smoothUpgradeRetryAfter).With("stale smooth-upgrade annotations are cleared")
+	}
+
+	switch support {
+	case smoothUpgradeAutoSupported:
+		return task.Complete().With("smooth upgrade is auto-supported by TiDB")
+	case smoothUpgradeUnsupported:
+		return task.Complete().With("smooth upgrade is unsupported for %s -> %s", source, target)
+	case smoothUpgradeSwitchControlled:
+		// Deliberately empty: leave the switch and start the pause window below.
+	default:
+		return task.Complete().With("smooth upgrade is not needed")
+	}
+
+	// Call start before persisting the annotations, so the annotations are
+	// never written for a pause window that was not opened.
+	if err := startSmoothUpgrade(ctx, state, c); err != nil {
+		return task.Fail().With("cannot start smooth upgrade: %w", err)
+	}
+	setSmoothUpgradeAnnotations(dbg, source, target)
+	if err := patchSmoothUpgradeAnnotations(ctx, c, dbg, false); err != nil {
+		return task.Fail().With("cannot persist smooth-upgrade annotations: %w", err)
+	}
+	return task.Complete().With("smooth upgrade started")
+}
+
+// TaskFinishSmoothUpgrade returns the task that closes an open pause window.
+//
+// The task completes immediately when the pause annotation is not active,
+// waits while smoothUpgradeRolloutComplete reports false, and otherwise calls
+// finish and then removes the smooth-upgrade annotations. A failed finish call
+// or annotation patch yields a Fail result.
+func TaskFinishSmoothUpgrade(state *ReconcileContext, c operatorclient.Client) task.Task {
+	return task.NameTaskFunc("FinishSmoothUpgrade", func(ctx context.Context) task.Result {
+		dbg := state.TiDBGroup()
+		ann := smoothUpgradeAnnotations(dbg)
+		if !ann.active {
+			return task.Complete().With("smooth upgrade is not active")
+		}
+		if !smoothUpgradeRolloutComplete(dbg, state.TiDBSlice(), ann.target) {
+			return task.Wait().With("wait for tidb smooth-upgrade rollout to complete")
+		}
+		if err := finishSmoothUpgrade(ctx, state, c); err != nil {
+			return task.Fail().With("cannot finish smooth upgrade: %w", err)
+		}
+		clearSmoothUpgradeAnnotations(dbg)
+		if err := patchSmoothUpgradeAnnotations(ctx, c, dbg, true); err != nil {
+			return task.Fail().With("cannot clear smooth-upgrade annotations: %w", err)
+		}
+		return task.Complete().With("smooth upgrade finished")
+	})
+}
+
+// detectSmoothUpgrade determines the source and target versions of the group
+// and classifies the pair.
+//
+// The target is the template version from the spec. The source is the status
+// version, or, when that is empty, the single version shared by all instances
+// that are not on updateRevision. An empty source or a source equal to the
+// target is reported as smoothUpgradeNotVersionUpgrade.
+func detectSmoothUpgrade(
+	dbg *v1alpha1.TiDBGroup,
+	dbs []*v1alpha1.TiDB,
+	updateRevision string,
+) (source, target string, support smoothUpgradeSupport) {
+	target = dbg.Spec.Template.Spec.Version
+	source = dbg.Status.Version
+	if source == "" {
+		source = sourceVersionFromOutdatedTiDBs(dbs, updateRevision)
+	}
+	if source == "" || source == target {
+		return source, target, smoothUpgradeNotVersionUpgrade
+	}
+	return source, target, classifySmoothUpgrade(source, target)
+}
+
+// sourceVersionFromOutdatedTiDBs returns the spec version shared by every
+// instance whose current revision differs from updateRevision. It returns ""
+// when any such instance has an empty version, when there is no such instance,
+// or when they do not all agree on one version.
+func sourceVersionFromOutdatedTiDBs(dbs []*v1alpha1.TiDB, updateRevision string) string {
+	versions := map[string]struct{}{}
+	for _, db := range dbs {
+		if db.Status.CurrentRevision == updateRevision {
+			continue
+		}
+		if db.Spec.Version == "" {
+			return ""
+		}
+		versions[db.Spec.Version] = struct{}{}
+	}
+	if len(versions) != 1 {
+		return ""
+	}
+	// Exactly one entry: return it.
+	for v := range versions {
+		return v
+	}
+	return ""
+}
+
+// classifySmoothUpgrade classifies a source/target pair. Versions that cannot
+// be parsed, targets that are not strictly greater than the source, and pairs
+// matched by neither the auto-supported nor the switch-controlled rules are
+// all smoothUpgradeUnsupported.
+func classifySmoothUpgrade(source, target string) smoothUpgradeSupport {
+	sourceVer, err := semver.NewVersion(source)
+	if err != nil {
+		return smoothUpgradeUnsupported
+	}
+	targetVer, err := semver.NewVersion(target)
+	if err != nil {
+		return smoothUpgradeUnsupported
+	}
+	if !targetVer.GreaterThan(sourceVer) {
+		return smoothUpgradeUnsupported
+	}
+	if isAutoSmoothUpgradePair(sourceVer, targetVer) {
+		return smoothUpgradeAutoSupported
+	}
+	if isSwitchControlledSmoothUpgradePair(sourceVer, targetVer) {
+		return smoothUpgradeSwitchControlled
+	}
+	return smoothUpgradeUnsupported
+}
+
+// isAutoSmoothUpgradePair reports whether source -> target is one of the
+// explicitly listed pairs that need no start/finish call. The source is looked
+// up by its String() form and the target is compared with Equal.
+func isAutoSmoothUpgradePair(source, target *semver.Version) bool {
+	pairs := map[string][]string{
+		"7.1.0": {"7.1.1", "7.2.0", "7.3.0"},
+		"7.1.1": {"7.2.0", "7.3.0"},
+		"7.2.0": {"7.3.0"},
+	}
+	for _, t := range pairs[source.String()] {
+		v, err := semver.NewVersion(t)
+		if err == nil && target.Equal(v) {
+			return true
+		}
+	}
+	return false
+}
+
+// isSwitchControlledSmoothUpgradePair reports whether source -> target needs
+// explicit start/finish calls: either both versions are in [7.1.2, 7.2.0), or
+// the target is >= 7.4.0 and the source is in [7.1.2, 7.2.0) or >= 7.4.0.
+func isSwitchControlledSmoothUpgradePair(source, target *semver.Version) bool {
+	sourceIn712To720 := inRange(source, ">= 7.1.2, < 7.2.0")
+	targetIn712To720 := inRange(target, ">= 7.1.2, < 7.2.0")
+	sourceGE740 := inRange(source, ">= 7.4.0")
+	targetGE740 := inRange(target, ">= 7.4.0")
+	return (sourceIn712To720 && targetIn712To720) ||
+		((sourceIn712To720 || sourceGE740) && targetGE740)
+}
+
+// inRange reports whether v satisfies constraint, with IncludePrerelease
+// enabled on the parsed constraint. A constraint that cannot be parsed yields
+// false.
+func inRange(v *semver.Version, constraint string) bool {
+	c, err := semver.NewConstraint(constraint)
+	if err != nil {
+		return false
+	}
+	c.IncludePrerelease = true
+	return c.Check(v)
+}
+
+// smoothUpgradeAnnotationState is the decoded form of the smooth-upgrade
+// annotations on a TiDBGroup.
+type smoothUpgradeAnnotationState struct {
+	active bool   // the ddl-paused annotation equals v1alpha1.AnnoValTrue
+	source string // value of the source-version annotation
+	target string // value of the target-version annotation
+}
+
+// smoothUpgradeAnnotations reads the smooth-upgrade annotations from dbg.
+// Missing annotations decode to the zero value of each field.
+func smoothUpgradeAnnotations(dbg *v1alpha1.TiDBGroup) smoothUpgradeAnnotationState {
+	ann := dbg.GetAnnotations()
+	return smoothUpgradeAnnotationState{
+		active: ann[annSmoothUpgradeDDLPaused] == v1alpha1.AnnoValTrue,
+		source: ann[annSmoothUpgradeSourceVersion],
+		target: ann[annSmoothUpgradeTargetVersion],
+	}
+}
+
+// setSmoothUpgradeAnnotations sets the four smooth-upgrade annotations on the
+// in-memory dbg. It works on a copy of the annotation map and then installs
+// the copy with SetAnnotations; nothing is sent to the API server here.
+func setSmoothUpgradeAnnotations(dbg *v1alpha1.TiDBGroup, source, target string) {
+	ann := maps.Clone(dbg.GetAnnotations())
+	if ann == nil {
+		// maps.Clone keeps nil as nil, and writing to a nil map panics.
+		ann = map[string]string{}
+	}
+	ann[annSmoothUpgradeDDLPaused] = v1alpha1.AnnoValTrue
+	ann[annSmoothUpgradeSourceVersion] = source
+	ann[annSmoothUpgradeTargetVersion] = target
+	ann[annSmoothUpgradeStartedAt] = time.Now().UTC().Format(time.RFC3339)
+	dbg.SetAnnotations(ann)
+}
+
+// clearSmoothUpgradeAnnotations removes the four smooth-upgrade annotations
+// from the in-memory dbg, leaving all other annotations untouched.
+func clearSmoothUpgradeAnnotations(dbg *v1alpha1.TiDBGroup) {
+	ann := maps.Clone(dbg.GetAnnotations())
+	for _, key := range smoothUpgradeAnnotationKeys() {
+		// delete on a nil map is a no-op, so no nil check is needed.
+		delete(ann, key)
+	}
+	dbg.SetAnnotations(ann)
+}
+
+// smoothUpgradeAnnotationKeys lists every annotation key owned by the
+// smooth-upgrade workflow.
+func smoothUpgradeAnnotationKeys() []string {
+	return []string{
+		annSmoothUpgradeDDLPaused,
+		annSmoothUpgradeSourceVersion,
+		annSmoothUpgradeTargetVersion,
+		annSmoothUpgradeStartedAt,
+	}
+}
+
+// patchSmoothUpgradeAnnotations sends a JSON merge patch that touches only the
+// smooth-upgrade annotation keys. When remove is true each key is patched to
+// null (which deletes it); otherwise each key is patched to the value
+// currently held by the in-memory dbg.
+func patchSmoothUpgradeAnnotations(ctx context.Context, c operatorclient.Client, dbg *v1alpha1.TiDBGroup, remove bool) error {
+	annotations := map[string]any{}
+	for _, key := range smoothUpgradeAnnotationKeys() {
+		if remove {
+			annotations[key] = nil
+			continue
+		}
+		annotations[key] = dbg.Annotations[key]
+	}
+	patch, err := json.Marshal(map[string]any{
+		"metadata": map[string]any{
+			"annotations": annotations,
+		},
+	})
+	if err != nil {
+		return err
+	}
+	return c.Patch(ctx, dbg, client.RawPatch(types.MergePatchType, patch))
+}
+
+// startSmoothUpgrade picks a ready TiDB instance and calls StartUpgrade on it.
+func startSmoothUpgrade(ctx context.Context, state *ReconcileContext, c operatorclient.Client) error {
+	tidb, err := chooseSmoothUpgradeTiDB(state.TiDBSlice())
+	if err != nil {
+		return err
+	}
+	cli, err := newSmoothUpgradeTiDBClient(ctx, c, state.Cluster(), tidb)
+	if err != nil {
+		return err
+	}
+	return cli.StartUpgrade(ctx)
+}
+
+// finishSmoothUpgrade picks a ready TiDB instance and calls FinishUpgrade on it.
+func finishSmoothUpgrade(ctx context.Context, state *ReconcileContext, c operatorclient.Client) error {
+	tidb, err := chooseSmoothUpgradeTiDB(state.TiDBSlice())
+	if err != nil {
+		return err
+	}
+	cli, err := newSmoothUpgradeTiDBClient(ctx, c, state.Cluster(), tidb)
+	if err != nil {
+		return err
+	}
+	return cli.FinishUpgrade(ctx)
+}
+
+// chooseSmoothUpgradeTiDB returns the ready instance with the smallest name,
+// so the choice is deterministic for a given set of instances. The input slice
+// is not reordered. It returns an error when no instance is ready.
+func chooseSmoothUpgradeTiDB(dbs []*v1alpha1.TiDB) (*v1alpha1.TiDB, error) {
+	candidates := slices.Clone(dbs)
+	slices.SortFunc(candidates, func(a, b *v1alpha1.TiDB) int {
+		return cmp.Compare(a.Name, b.Name)
+	})
+	for _, db := range candidates {
+		if coreutil.IsReady[scope.TiDB](db) {
+			return db, nil
+		}
+	}
+	return nil, fmt.Errorf("no ready tidb instance for smooth upgrade")
+}
+
+// defaultSmoothUpgradeTiDBClient builds a TiDB API client for the advertise
+// URL of tidb on its status port. A client TLS config is loaded only when
+// cluster TLS is enabled; otherwise a nil TLS config is passed.
+func defaultSmoothUpgradeTiDBClient(ctx context.Context, c operatorclient.Client, cluster *v1alpha1.Cluster, tidb *v1alpha1.TiDB) (tidbapi.TiDBClient, error) {
+	var tlsConfig *tls.Config
+	if coreutil.IsTLSClusterEnabled(cluster) {
+		var err error
+		tlsConfig, err = apicall.GetClientTLSConfig(ctx, c, cluster)
+		if err != nil {
+			return nil, fmt.Errorf("cannot get tls config from secret: %w", err)
+		}
+	}
+	// NOTE(review): the start call is expected to take around 10 seconds;
+	// confirm StartUpgrade applies its own longer timeout rather than relying
+	// on the 10-second value passed here.
+	return tidbapi.NewTiDBClient(
+		coreutil.InstanceAdvertiseURL[scope.TiDB](cluster, tidb, coreutil.TiDBStatusPort(tidb)),
+		10*time.Second,
+		tlsConfig,
+	), nil
+}
+
+// smoothUpgradeRolloutComplete reports whether the group has fully reached
+// target. All of the following must hold:
+//   - the status version equals target;
+//   - status, ready, updated and current replicas all equal the desired count;
+//   - the update revision is non-empty and equals the current revision;
+//   - dbs holds exactly the desired number of instances, each with spec
+//     version target and Ready.
+func smoothUpgradeRolloutComplete(dbg *v1alpha1.TiDBGroup, dbs []*v1alpha1.TiDB, target string) bool {
+	desired := coreutil.Replicas[scope.TiDBGroup](dbg)
+	if dbg.Status.Version != target {
+		return false
+	}
+	if dbg.Status.Replicas != desired ||
+		dbg.Status.ReadyReplicas != desired ||
+		dbg.Status.UpdatedReplicas != desired ||
+		dbg.Status.CurrentReplicas != desired {
+		return false
+	}
+	if dbg.Status.UpdateRevision == "" || dbg.Status.UpdateRevision != dbg.Status.CurrentRevision {
+		return false
+	}
+	if int32(len(dbs)) != desired {
+		return false
+	}
+	for _, db := range dbs {
+		if db.Spec.Version != target || !coreutil.IsReady[scope.TiDB](db) {
+			return false
+		}
+	}
+	return true
+}
diff --git a/pkg/controllers/tidbgroup/tasks/smooth_upgrade_test.go b/pkg/controllers/tidbgroup/tasks/smooth_upgrade_test.go
new file mode 100644
index 00000000000..7b5e622cea8
--- /dev/null
+++ b/pkg/controllers/tidbgroup/tasks/smooth_upgrade_test.go
@@ -0,0 +1,311 @@
+// Copyright 2024 PingCAP, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tasks
+
+import (
+	"context"
+	"errors"
+	"testing"
+
+	"github.com/go-logr/logr"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/utils/ptr"
+
+	"github.com/pingcap/tidb-operator/api/v2/core/v1alpha1"
+	"github.com/pingcap/tidb-operator/v2/pkg/adoption"
+	operatorclient "github.com/pingcap/tidb-operator/v2/pkg/client"
+	"github.com/pingcap/tidb-operator/v2/pkg/tidbapi/v1"
+	"github.com/pingcap/tidb-operator/v2/pkg/utils/fake"
+	"github.com/pingcap/tidb-operator/v2/pkg/utils/task/v3"
+	"github.com/pingcap/tidb-operator/v2/pkg/utils/tracker"
+)
+
+// TestClassifySmoothUpgrade checks classifySmoothUpgrade against a table of
+// source/target pairs covering the auto-supported, switch-controlled and
+// unsupported outcomes, including a downgrade, an equal pair and a source
+// that is not a semantic version.
+func TestClassifySmoothUpgrade(t *testing.T) {
+	cases := []struct {
+		source string
+		target string
+		want   smoothUpgradeSupport
+	}{
+		{"v7.1.0", "v7.1.1", smoothUpgradeAutoSupported},
+		{"v7.1.1", "v7.3.0", smoothUpgradeAutoSupported},
+		{"v7.2.0", "v7.3.0", smoothUpgradeAutoSupported},
+		{"v7.1.2", "v7.1.3", smoothUpgradeSwitchControlled},
+		{"v7.1.3", "v7.4.0", smoothUpgradeSwitchControlled},
+		{"v7.4.0", "v7.5.0", smoothUpgradeSwitchControlled},
+		{"v6.5.10", "v8.1.0", smoothUpgradeUnsupported},
+		{"v7.3.0", "v7.4.0", smoothUpgradeUnsupported},
+		{"v7.5.0", "v7.4.0", smoothUpgradeUnsupported},
+		{"v7.5.0", "v7.5.0", smoothUpgradeUnsupported},
+		{"nightly", "v7.5.0", smoothUpgradeUnsupported},
+	}
+	for _, c := range cases {
+		assert.Equal(t, c.want, classifySmoothUpgrade(c.source, c.target), "%s -> %s", c.source, c.target)
+	}
+}
+
+// TestEnsureSmoothUpgradeStarted checks the first start of a v7.5.0 -> v7.5.3
+// upgrade: StartUpgrade is called once and the paused, source and target
+// annotations are set on the group.
+func TestEnsureSmoothUpgradeStarted(t *testing.T) {
+	// Swap the package-level client factory for a mock and restore it on exit.
+	oldFactory := newSmoothUpgradeTiDBClient
+	defer func() { newSmoothUpgradeTiDBClient = oldFactory }()
+
+	ctx := context.Background()
+	mock := &mockTiDBClient{}
+	newSmoothUpgradeTiDBClient = func(context.Context, operatorclient.Client, *v1alpha1.Cluster, *v1alpha1.TiDB) (tidbapi.TiDBClient, error) {
+		return mock, nil
+	}
+
+	dbg := smoothUpgradeTiDBGroup("db", "v7.5.0", "v7.5.3")
+	db := smoothUpgradeTiDB("db-a", "v7.5.0", oldRevision)
+	state := &ReconcileContext{State: &state{
+		dbg:            dbg,
+		cluster:        fake.FakeObj[v1alpha1.Cluster]("cluster"),
+		dbs:            []*v1alpha1.TiDB{db},
+		updateRevision: newRevision,
+	}}
+	fc := operatorclient.NewFakeClient(dbg, state.Cluster())
+
+	res := ensureSmoothUpgradeStarted(ctx, state, fc)
+	require.Equal(t, task.SComplete, res.Status())
+	assert.Equal(t, 1, mock.startCalls)
+	assert.Equal(t, v1alpha1.AnnoValTrue, dbg.Annotations[annSmoothUpgradeDDLPaused])
+	assert.Equal(t, "v7.5.0", dbg.Annotations[annSmoothUpgradeSourceVersion])
+	assert.Equal(t, "v7.5.3", dbg.Annotations[annSmoothUpgradeTargetVersion])
+}
+
+// TestEnsureSmoothUpgradeStartedSkipsActiveAnnotation checks that when the
+// annotations already record the same source/target pair, neither
+// StartUpgrade nor FinishUpgrade is called and the result is complete.
+func TestEnsureSmoothUpgradeStartedSkipsActiveAnnotation(t *testing.T) {
+	oldFactory := newSmoothUpgradeTiDBClient
+	defer func() { newSmoothUpgradeTiDBClient = oldFactory }()
+
+	mock := &mockTiDBClient{}
+	newSmoothUpgradeTiDBClient = func(context.Context, operatorclient.Client, *v1alpha1.Cluster, *v1alpha1.TiDB) (tidbapi.TiDBClient, error) {
+		return mock, nil
+	}
+
+	dbg := smoothUpgradeTiDBGroup("db", "v7.5.0", "v7.5.3")
+	setSmoothUpgradeAnnotations(dbg, "v7.5.0", "v7.5.3")
+	state := &ReconcileContext{State: &state{
+		dbg:            dbg,
+		cluster:        fake.FakeObj[v1alpha1.Cluster]("cluster"),
+		dbs:            []*v1alpha1.TiDB{smoothUpgradeTiDB("db-a", "v7.5.0", oldRevision)},
+		updateRevision: newRevision,
+	}}
+
+	res := ensureSmoothUpgradeStarted(context.Background(), state, operatorclient.NewFakeClient(dbg, state.Cluster()))
+	require.Equal(t, task.SComplete, res.Status())
+	assert.Zero(t, mock.startCalls)
+	assert.Zero(t, mock.finishCalls)
+}
+
+// TestEnsureSmoothUpgradeStartedRecoversStaleAnnotation checks recovery from
+// annotations that record a different target (v7.5.1) than the spec (v7.5.3):
+// FinishUpgrade is called once, StartUpgrade is not called, the paused
+// annotation is removed and the task asks for a retry.
+func TestEnsureSmoothUpgradeStartedRecoversStaleAnnotation(t *testing.T) {
+	oldFactory := newSmoothUpgradeTiDBClient
+	defer func() { newSmoothUpgradeTiDBClient = oldFactory }()
+
+	mock := &mockTiDBClient{}
+	newSmoothUpgradeTiDBClient = func(context.Context, operatorclient.Client, *v1alpha1.Cluster, *v1alpha1.TiDB) (tidbapi.TiDBClient, error) {
+		return mock, nil
+	}
+
+	dbg := smoothUpgradeTiDBGroup("db", "v7.5.0", "v7.5.3")
+	setSmoothUpgradeAnnotations(dbg, "v7.5.0", "v7.5.1")
+	state := &ReconcileContext{State: &state{
+		dbg:            dbg,
+		cluster:        fake.FakeObj[v1alpha1.Cluster]("cluster"),
+		dbs:            []*v1alpha1.TiDB{smoothUpgradeTiDB("db-a", "v7.5.0", oldRevision)},
+		updateRevision: newRevision,
+	}}
+
+	res := ensureSmoothUpgradeStarted(context.Background(), state, operatorclient.NewFakeClient(dbg, state.Cluster()))
+	require.Equal(t, task.SRetry, res.Status())
+	assert.Zero(t, mock.startCalls)
+	assert.Equal(t, 1, mock.finishCalls)
+	assert.NotContains(t, dbg.Annotations, annSmoothUpgradeDDLPaused)
+}
+
+// TestEnsureSmoothUpgradeStartedUnsupportedPairSkipsHTTP checks that an
+// unsupported pair (v6.5.10 -> v8.1.0) completes without calling the TiDB
+// upgrade API and without setting the paused annotation.
+func TestEnsureSmoothUpgradeStartedUnsupportedPairSkipsHTTP(t *testing.T) {
+	oldFactory := newSmoothUpgradeTiDBClient
+	defer func() { newSmoothUpgradeTiDBClient = oldFactory }()
+
+	mock := &mockTiDBClient{}
+	newSmoothUpgradeTiDBClient = func(context.Context, operatorclient.Client, *v1alpha1.Cluster, *v1alpha1.TiDB) (tidbapi.TiDBClient, error) {
+		return mock, nil
+	}
+
+	dbg := smoothUpgradeTiDBGroup("db", "v6.5.10", "v8.1.0")
+	state := &ReconcileContext{State: &state{
+		dbg:            dbg,
+		cluster:        fake.FakeObj[v1alpha1.Cluster]("cluster"),
+		dbs:            []*v1alpha1.TiDB{smoothUpgradeTiDB("db-a", "v6.5.10", oldRevision)},
+		updateRevision: newRevision,
+	}}
+
+	res := ensureSmoothUpgradeStarted(context.Background(), state, operatorclient.NewFakeClient(dbg, state.Cluster()))
+	require.Equal(t, task.SComplete, res.Status())
+	assert.Zero(t, mock.startCalls)
+	assert.Zero(t, mock.finishCalls)
+	assert.NotContains(t, dbg.Annotations, annSmoothUpgradeDDLPaused)
+}
+
+// TestTaskFinishSmoothUpgrade checks that once the group status and its single
+// instance are fully on the target version and ready, the task calls
+// FinishUpgrade once and removes the paused annotation.
+func TestTaskFinishSmoothUpgrade(t *testing.T) {
+	oldFactory := newSmoothUpgradeTiDBClient
+	defer func() { newSmoothUpgradeTiDBClient = oldFactory }()
+
+	mock := &mockTiDBClient{}
+	newSmoothUpgradeTiDBClient = func(context.Context, operatorclient.Client, *v1alpha1.Cluster, *v1alpha1.TiDB) (tidbapi.TiDBClient, error) {
+		return mock, nil
+	}
+
+	dbg := smoothUpgradeTiDBGroup("db", "v7.5.0", "v7.5.3")
+	setSmoothUpgradeAnnotations(dbg, "v7.5.0", "v7.5.3")
+	// Status fields below make smoothUpgradeRolloutComplete return true.
+	dbg.Status.Version = "v7.5.3"
+	dbg.Status.Replicas = 1
+	dbg.Status.ReadyReplicas = 1
+	dbg.Status.UpdatedReplicas = 1
+	dbg.Status.CurrentReplicas = 1
+	dbg.Status.UpdateRevision = newRevision
+	dbg.Status.CurrentRevision = newRevision
+	db := smoothUpgradeTiDB("db-a", "v7.5.3", newRevision)
+	state := &ReconcileContext{State: &state{
+		dbg:            dbg,
+		cluster:        fake.FakeObj[v1alpha1.Cluster]("cluster"),
+		dbs:            []*v1alpha1.TiDB{db},
+		updateRevision: newRevision,
+	}}
+
+	res, done := task.RunTask(context.Background(), TaskFinishSmoothUpgrade(state, operatorclient.NewFakeClient(dbg, state.Cluster())))
+	require.False(t, done)
+	require.Equal(t, task.SComplete, res.Status())
+	assert.Equal(t, 1, mock.finishCalls)
+	assert.NotContains(t, dbg.Annotations, annSmoothUpgradeDDLPaused)
+}
+
+// TestTaskFinishSmoothUpgradeKeepsAnnotationOnFailure checks that when
+// FinishUpgrade returns an error the task fails and the paused annotation
+// stays on the group.
+func TestTaskFinishSmoothUpgradeKeepsAnnotationOnFailure(t *testing.T) {
+	oldFactory := newSmoothUpgradeTiDBClient
+	defer func() { newSmoothUpgradeTiDBClient = oldFactory }()
+
+	mock := &mockTiDBClient{finishErr: errors.New("boom")}
+	newSmoothUpgradeTiDBClient = func(context.Context, operatorclient.Client, *v1alpha1.Cluster, *v1alpha1.TiDB) (tidbapi.TiDBClient, error) {
+		return mock, nil
+	}
+
+	dbg := smoothUpgradeTiDBGroup("db", "v7.5.0", "v7.5.3")
+	setSmoothUpgradeAnnotations(dbg, "v7.5.0", "v7.5.3")
+	dbg.Status.Version = "v7.5.3"
+	dbg.Status.Replicas = 1
+	dbg.Status.ReadyReplicas = 1
+	dbg.Status.UpdatedReplicas = 1
+	dbg.Status.CurrentReplicas = 1
+	dbg.Status.UpdateRevision = newRevision
+	dbg.Status.CurrentRevision = newRevision
+	state := &ReconcileContext{State: &state{
+		dbg:            dbg,
+		cluster:        fake.FakeObj[v1alpha1.Cluster]("cluster"),
+		dbs:            []*v1alpha1.TiDB{smoothUpgradeTiDB("db-a", "v7.5.3", newRevision)},
+		updateRevision: newRevision,
+	}}
+
+	res, _ := task.RunTask(context.Background(), TaskFinishSmoothUpgrade(state, operatorclient.NewFakeClient(dbg, state.Cluster())))
+	require.Equal(t, task.SFail, res.Status())
+	assert.Equal(t, v1alpha1.AnnoValTrue, dbg.Annotations[annSmoothUpgradeDDLPaused])
+}
+
+// TestTaskUpdaterSmoothUpgradeStartFailureBlocksRollout checks that a failing
+// StartUpgrade makes TaskUpdater fail after one start attempt, leaves the
+// existing TiDB list at one item and does not set the paused annotation.
+func TestTaskUpdaterSmoothUpgradeStartFailureBlocksRollout(t *testing.T) {
+	oldFactory := newSmoothUpgradeTiDBClient
+	defer func() { newSmoothUpgradeTiDBClient = oldFactory }()
+
+	mock := &mockTiDBClient{startErr: errors.New("boom")}
+	newSmoothUpgradeTiDBClient = func(context.Context, operatorclient.Client, *v1alpha1.Cluster, *v1alpha1.TiDB) (tidbapi.TiDBClient, error) {
+		return mock, nil
+	}
+
+	dbg := smoothUpgradeTiDBGroup("db", "v7.5.0", "v7.5.3")
+	db := smoothUpgradeTiDB("db-a", "v7.5.0", oldRevision)
+	state := &ReconcileContext{State: &state{
+		dbg:            dbg,
+		cluster:        fake.FakeObj[v1alpha1.Cluster]("cluster"),
+		dbs:            []*v1alpha1.TiDB{db},
+		updateRevision: newRevision,
+	}}
+	fc := operatorclient.NewFakeClient(dbg, state.Cluster(), db)
+
+	res, _ := task.RunTask(context.Background(), TaskUpdater(state, fc, tracker.New().AllocateFactory("tidb"), adoption.New(logr.Discard())))
+	require.Equal(t, task.SFail, res.Status())
+	assert.Equal(t, 1, mock.startCalls)
+
+	var dbs v1alpha1.TiDBList
+	require.NoError(t, fc.List(context.Background(), &dbs))
+	assert.Len(t, dbs.Items, 1)
+	assert.NotContains(t, dbg.Annotations, annSmoothUpgradeDDLPaused)
+}
+
+// smoothUpgradeTiDBGroup builds a one-replica TiDBGroup fixture in cluster
+// "cluster" whose status version is source and whose template version is
+// target.
+func smoothUpgradeTiDBGroup(name, source, target string) *v1alpha1.TiDBGroup {
+	return fake.FakeObj(name, func(obj *v1alpha1.TiDBGroup) *v1alpha1.TiDBGroup {
+		obj.Spec.Cluster.Name = "cluster"
+		obj.Spec.Replicas = ptr.To[int32](1)
+		obj.Spec.Template.Spec.Version = target
+		obj.Status.Version = source
+		return obj
+	})
+}
+
+// smoothUpgradeTiDB builds a TiDB fixture with the given spec version and
+// current revision and a Ready condition set to true.
+func smoothUpgradeTiDB(name, version, revision string) *v1alpha1.TiDB {
+	return fake.FakeObj(name, func(obj *v1alpha1.TiDB) *v1alpha1.TiDB {
+		obj.Spec.Version = version
+		obj.Status.CurrentRevision = revision
+		obj.Status.Conditions = append(obj.Status.Conditions, metav1.Condition{
+			Type:               v1alpha1.CondReady,
+			Status:             metav1.ConditionTrue,
+			LastTransitionTime: metav1.Unix(0, 0),
+		})
+		return obj
+	})
+}
+
+// mockTiDBClient is a TiDB API client stub that counts StartUpgrade and
+// FinishUpgrade calls and returns the configured errors from them. Its other
+// methods return fixed values.
+type mockTiDBClient struct {
+	startCalls  int
+	finishCalls int
+	startErr    error
+	finishErr   error
+}
+
+// GetHealth always reports healthy.
+func (m *mockTiDBClient) GetHealth(context.Context) (bool, error) {
+	return true, nil
+}
+
+// GetInfo returns no info and no error.
+func (m *mockTiDBClient) GetInfo(context.Context) (*tidbapi.ServerInfo, error) {
+	return nil, nil
+}
+
+// SetServerLabels does nothing.
+func (m *mockTiDBClient) SetServerLabels(context.Context, map[string]string) error {
+	return nil
+}
+
+// GetPoolStatus returns no status and no error.
+func (m *mockTiDBClient) GetPoolStatus(context.Context) (*tidbapi.PoolStatus, error) {
+	return nil, nil
+}
+
+// Activate does nothing.
+func (m *mockTiDBClient) Activate(context.Context, string) error {
+	return nil
+}
+
+// StartUpgrade records the call and returns startErr.
+func (m *mockTiDBClient) StartUpgrade(context.Context) error {
+	m.startCalls++
+	return m.startErr
+}
+
+// FinishUpgrade records the call and returns finishErr.
+func (m *mockTiDBClient) FinishUpgrade(context.Context) error {
+	m.finishCalls++
+	return m.finishErr
+}
diff --git a/pkg/controllers/tidbgroup/tasks/updater.go b/pkg/controllers/tidbgroup/tasks/updater.go
index 0fa5d447760..21c50be27a6 100644
--- a/pkg/controllers/tidbgroup/tasks/updater.go
+++ b/pkg/controllers/tidbgroup/tasks/updater.go
@@ -95,6 +95,12 @@ func TaskUpdater(state *ReconcileContext, c client.Client, af tracker.AllocateFa
 			noUpdate = true
 		}
 
+		if needVersionUpgrade(dbg) {
+			if res := ensureSmoothUpgradeStarted(ctx, state, c); res.Status() != task.SComplete {
+				return res
+			}
+		}
+
 		var instances []string
 		for _, in := range dbs {
 			instances = append(instances, in.Name)
diff --git a/pkg/tidbapi/v1/client.go b/pkg/tidbapi/v1/client.go
index afd6f7d99ef..5d45078265b 100644
--- a/pkg/tidbapi/v1/client.go
+++ b/pkg/tidbapi/v1/client.go
@@ -20,6
+20,7 @@ import (
 	"crypto/tls"
 	"encoding/json"
 	"fmt"
+	"io"
 	"net"
 	"net/http"
 	"time"
@@ -28,11 +29,19 @@ import (
 )
 
 const (
-	statusPath           = "status"
-	infoPath             = "info"
-	labelsPath           = "labels"
-	tidbPoolActivatePath = "tidb-pool/activate"
-	tidbPoolStatusPath   = "tidb-pool/status"
+	statusPath            = "status"
+	infoPath              = "info"
+	labelsPath            = "labels"
+	tidbPoolActivatePath  = "tidb-pool/activate"
+	tidbPoolStatusPath    = "tidb-pool/status"
+	tidbUpgradeStartPath  = "upgrade/start"
+	tidbUpgradeFinishPath = "upgrade/finish"
+
+	// Reply bodies compared against the normalized upgrade/start and upgrade/finish responses.
+	tidbUpgradeSuccessBody            = "success!"
+	tidbUpgradeDuplicateUpgradingBody = "It's a duplicated operation and the cluster is already in upgrading state."
+	tidbUpgradeDuplicateNormalBody    = "It's a duplicated operation and the cluster is already in normal state."
+	tidbUpgradeStartTimeout           = 30 * time.Second // HTTP client timeout StartUpgrade passes for its request
 )
 
 // TiDBClient provides TiDB server's APIs used by TiDB Operator.
@@ -48,6 +57,11 @@ type TiDBClient interface {
 	GetPoolStatus(ctx context.Context) (*PoolStatus, error)
 	// Activate sets the keyspace of a standby TiDB instance.
 	Activate(ctx context.Context, keyspace string) error
+
+	// StartUpgrade sets TiDB cluster upgrade state before rolling TiDB binaries.
+	StartUpgrade(ctx context.Context) error
+	// FinishUpgrade clears TiDB cluster upgrade state after rolling TiDB binaries.
+	FinishUpgrade(ctx context.Context) error
 }
 
 // tidbClient is the default implementation of TiDBClient.
@@ -153,3 +166,69 @@ func (c *tidbClient) GetPoolStatus(ctx context.Context) (*PoolStatus, error) {
 	}
 	return &status, nil
 }
+
+// StartUpgrade posts to upgrade/start using tidbUpgradeStartTimeout. A reply
+// saying the cluster is already in upgrading state also counts as success.
+func (c *tidbClient) StartUpgrade(ctx context.Context) error {
+	return c.upgrade(ctx, tidbUpgradeStartPath, tidbUpgradeDuplicateUpgradingBody, tidbUpgradeStartTimeout)
+}
+
+// FinishUpgrade posts to upgrade/finish using the client's own timeout. A
+// reply saying the cluster is already in normal state also counts as success.
+func (c *tidbClient) FinishUpgrade(ctx context.Context) error {
+	return c.upgrade(ctx, tidbUpgradeFinishPath, tidbUpgradeDuplicateNormalBody, 0)
+}
+
+// upgrade sends a POST without a body to path and validates the reply.
+//
+// It returns nil only when the status is 200 and the body, after
+// normalizeTiDBUpgradeResponse, equals tidbUpgradeSuccessBody or
+// duplicateSuccessBody. A non-zero timeout overrides the client timeout for
+// this call; zero keeps the configured one.
+func (c *tidbClient) upgrade(ctx context.Context, path, duplicateSuccessBody string, timeout time.Duration) error {
+	httpClient := c.httpClient
+	if timeout != 0 {
+		// Shallow-copy the configured client rather than building a new one, so
+		// everything except the timeout (Transport, CheckRedirect, Jar) carries
+		// over. A nil Transport still means http.DefaultTransport.
+		override := *c.httpClient
+		override.Timeout = timeout
+		httpClient = &override
+	}
+
+	apiURL := fmt.Sprintf("%s/%s", c.url, path)
+	req, err := http.NewRequestWithContext(ctx, http.MethodPost, apiURL, nil)
+	if err != nil {
+		return err
+	}
+	req.Header.Add("Content-Type", "application/json")
+	res, err := httpClient.Do(req) //nolint:gosec // G704: URL is constructed from trusted internal config
+	if err != nil {
+		return err
+	}
+	defer httputil.DeferClose(res.Body)
+	body, err := io.ReadAll(res.Body)
+	if err != nil {
+		return err
+	}
+
+	bodyText := string(body)
+	if res.StatusCode != http.StatusOK {
+		return fmt.Errorf("tidb %s error response %d %q URL: %s", path, res.StatusCode, bodyText, apiURL)
+	}
+	successText := normalizeTiDBUpgradeResponse(body)
+	if successText == tidbUpgradeSuccessBody || successText == duplicateSuccessBody {
+		return nil
+	}
+	return fmt.Errorf("tidb %s unexpected response %d %q URL: %s", path, res.StatusCode, bodyText, apiURL)
+}
+
+// normalizeTiDBUpgradeResponse returns the decoded string when body is a JSON
+// string literal, and the raw body text otherwise.
+func normalizeTiDBUpgradeResponse(body []byte) string {
+	var decoded string
+	if err := json.Unmarshal(body, &decoded); err == nil {
+		return decoded
+	}
+	return string(body)
+}
diff --git a/pkg/tidbapi/v1/client_test.go b/pkg/tidbapi/v1/client_test.go
index caf46fd80a6..a3697fffdc8
100644
--- a/pkg/tidbapi/v1/client_test.go
+++ b/pkg/tidbapi/v1/client_test.go
@@ -158,3 +158,87 @@ func TestTiDBClient_GetPoolStatus(t *testing.T) {
 		})
 	}
 }
+
+// TestTiDBClient_Upgrade checks that StartUpgrade and FinishUpgrade POST an
+// empty body to the expected path and accept only the known success replies.
+func TestTiDBClient_Upgrade(t *testing.T) {
+	tests := []struct {
+		desc    string
+		call    func(TiDBClient, context.Context) error
+		path    string
+		reply   string
+		wantErr bool
+	}{
+		{
+			desc:  "start success",
+			call:  TiDBClient.StartUpgrade,
+			path:  "/upgrade/start",
+			reply: `"success!"`,
+		},
+		{
+			desc:  "start duplicate success",
+			call:  TiDBClient.StartUpgrade,
+			path:  "/upgrade/start",
+			reply: `"It's a duplicated operation and the cluster is already in upgrading state."`,
+		},
+		{
+			desc:  "finish success",
+			call:  TiDBClient.FinishUpgrade,
+			path:  "/upgrade/finish",
+			reply: `"success!"`,
+		},
+		{
+			desc:  "finish duplicate success",
+			call:  TiDBClient.FinishUpgrade,
+			path:  "/upgrade/finish",
+			reply: `"It's a duplicated operation and the cluster is already in normal state."`,
+		},
+		{
+			desc:    "unexpected body",
+			call:    TiDBClient.StartUpgrade,
+			path:    "/upgrade/start",
+			reply:   `"not success"`,
+			wantErr: true,
+		},
+	}
+	for _, tc := range tests {
+		t.Run(tc.desc, func(tt *testing.T) {
+			handler := func(w http.ResponseWriter, r *http.Request) {
+				assert.Equal(tt, tc.path, r.URL.Path)
+				assert.Equal(tt, http.MethodPost, r.Method)
+				payload, readErr := io.ReadAll(r.Body)
+				assert.NoError(tt, readErr)
+				assert.Empty(tt, payload)
+				_, writeErr := w.Write([]byte(tc.reply))
+				assert.NoError(tt, writeErr)
+			}
+			server := httptest.NewServer(http.HandlerFunc(handler))
+			defer server.Close()
+
+			err := tc.call(NewTiDBClient(server.URL, 5*time.Second, nil), context.Background())
+			if tc.wantErr {
+				require.Error(tt, err)
+				return
+			}
+			require.NoError(tt, err)
+		})
+	}
+}
+
+// TestTiDBClient_UpgradeErrorIncludesStatusAndBody checks that a non-200 reply
+// surfaces both the status code and the response body in the returned error.
+func TestTiDBClient_UpgradeErrorIncludesStatusAndBody(t *testing.T) {
+	const reply = `ddl owner not ready`
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+		w.WriteHeader(http.StatusBadRequest)
+		_, err := w.Write([]byte(reply))
+		assert.NoError(t, err)
+	}))
+	defer server.Close()
+
+	err := NewTiDBClient(server.URL, 5*time.Second, nil).StartUpgrade(context.Background())
+	require.Error(t, err)
+	for _, want := range []string{"400", reply} {
+		assert.Contains(t, err.Error(), want)
+	}
+}