Skip to content

Commit c132acf

Browse files
committed
fix: disable auth before backuping
1 parent 9361cb4 commit c132acf

3 files changed

Lines changed: 110 additions & 18 deletions

File tree

cmd/etcd-migrate/main.go

Lines changed: 35 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,9 @@ import (
2828
"k8s.io/apimachinery/pkg/runtime"
2929
"k8s.io/client-go/dynamic"
3030
"k8s.io/client-go/kubernetes"
31-
"k8s.io/client-go/rest"
3231
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
3332
_ "k8s.io/client-go/plugin/pkg/client/auth" // Import all auth providers
33+
"k8s.io/client-go/rest"
3434
"k8s.io/client-go/tools/clientcmd"
3535
"sigs.k8s.io/controller-runtime/pkg/client"
3636

@@ -186,24 +186,18 @@ func runMigration(ctx context.Context, cfg *Config, stdin io.Reader, stdout io.W
186186
return fmt.Errorf("aborted")
187187
}
188188

189-
// Backup phase: every to-be-adopted cluster is snapshotted before any
190-
// mutation; a failed backup excludes that cluster from the apply.
189+
var backup func() error
191190
if cfg.backupConfigured() {
192-
if err := runBackups(ctx, cfg, restCfg, kube, ctrlClient, plans, d, stdout); err != nil {
193-
return err
194-
}
191+
backup = func() error { return runBackups(ctx, cfg, restCfg, kube, ctrlClient, plans, d, stdout) }
195192
} else {
196193
fmt.Fprintln(stdout, "! pre-adoption backup skipped (--skip-backup)")
197194
}
198195

199-
// Auth phase: the legacy NoPassword root cannot match a credentials
200-
// Secret, so auth is switched off on the live etcd; the new operator
201-
// re-enables it with the referenced Secret once it takes over.
202-
if err := disableAuthForAdoptions(ctx, restCfg, kube, plans, d, facts, stdout); err != nil {
203-
return err
204-
}
205-
206-
stats, err := applyPlans(ctx, ctrlClient, dyn, plans, stdout)
196+
stats, err := runMutationPhases(
197+
func() error { return disableAuthForAdoptions(ctx, restCfg, kube, plans, d, facts, stdout) },
198+
backup,
199+
func() (applyStats, error) { return applyPlans(ctx, ctrlClient, dyn, plans, stdout) },
200+
)
207201
if err != nil {
208202
return err
209203
}
@@ -217,6 +211,33 @@ func runMigration(ctx context.Context, cfg *Config, stdin io.Reader, stdout io.W
217211
return errorIfPlanFailed(plans)
218212
}
219213

214+
// runMutationPhases runs the post-confirmation adoption phases in the order
215+
// their inter-phase contracts REQUIRE, and returns the apply stats:
216+
//
217+
// 1. authDisable — switch auth off on every auth-enabled legacy etcd.
218+
// 2. backup — snapshot each to-be-adopted cluster (nil ⇒ --skip-backup).
219+
// 3. apply — re-own the data plane and create the new CRs.
220+
//
221+
// Auth-disable MUST precede backup: the snapshot Job dials etcd anonymously
222+
// (cert-only, no user), and etcd gates the Maintenance Snapshot RPC behind
223+
// auth when it is enabled — so for an auth-enabled cluster (the Cozystack/
224+
// Kamaji case) the backup can only succeed once auth is off. Running them in
225+
// the reverse order silently flips exactly those clusters to ActionError and
226+
// excludes them from adoption. A cluster whose auth-disable fails is itself
227+
// flipped to ActionError and then skipped by the backup and apply phases, so
228+
// an unprotected cluster is never adopted.
229+
func runMutationPhases(authDisable func() error, backup func() error, apply func() (applyStats, error)) (applyStats, error) {
230+
if err := authDisable(); err != nil {
231+
return applyStats{}, err
232+
}
233+
if backup != nil {
234+
if err := backup(); err != nil {
235+
return applyStats{}, err
236+
}
237+
}
238+
return apply()
239+
}
240+
220241
// mustNamespace/mustName split a pre-validated namespace/name ref.
221242
func mustNamespace(ref string) string { ns, _, _ := splitRef(ref); return ns }
222243
func mustName(ref string) string { _, n, _ := splitRef(ref); return n }

cmd/etcd-migrate/main_test.go

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -447,6 +447,73 @@ func TestApplyAdoption(t *testing.T) {
447447
}
448448
}
449449

450+
// TestRunMutationPhases_AuthDisableBeforeBackup pins the inter-phase contract
451+
// that the snapshot Job depends on: auth-disable MUST run before the backup,
452+
// because the Job dials etcd anonymously and etcd gates the Maintenance
453+
// Snapshot RPC behind auth. The historical bug ran backup first, which made
454+
// the safety backup fail for exactly the auth-enabled clusters the tool
455+
// targets. This test fails if the order regresses.
456+
func TestRunMutationPhases_AuthDisableBeforeBackup(t *testing.T) {
457+
var order []string
458+
authDisable := func() error { order = append(order, "auth"); return nil }
459+
backup := func() error { order = append(order, "backup"); return nil }
460+
apply := func() (applyStats, error) { order = append(order, "apply"); return applyStats{Adopted: 1}, nil }
461+
462+
stats, err := runMutationPhases(authDisable, backup, apply)
463+
if err != nil {
464+
t.Fatalf("runMutationPhases: %v", err)
465+
}
466+
if stats.Adopted != 1 {
467+
t.Errorf("stats not propagated from apply: %+v", stats)
468+
}
469+
want := []string{"auth", "backup", "apply"}
470+
if len(order) != len(want) {
471+
t.Fatalf("phase order = %v, want %v", order, want)
472+
}
473+
for i := range want {
474+
if order[i] != want[i] {
475+
t.Fatalf("phase order = %v, want %v (auth-disable must precede backup)", order, want)
476+
}
477+
}
478+
}
479+
480+
// TestRunMutationPhases_AuthFailureSkipsBackupAndApply: when auth-disable
481+
// errors, neither backup nor apply runs and the error propagates — an
482+
// unprotected, still-auth'd cluster must never reach the mutation phases.
483+
func TestRunMutationPhases_AuthFailureSkipsBackupAndApply(t *testing.T) {
484+
wantErr := fmt.Errorf("auth boom")
485+
var ran []string
486+
_, err := runMutationPhases(
487+
func() error { ran = append(ran, "auth"); return wantErr },
488+
func() error { ran = append(ran, "backup"); return nil },
489+
func() (applyStats, error) { ran = append(ran, "apply"); return applyStats{}, nil },
490+
)
491+
if err == nil {
492+
t.Fatal("expected auth-disable error to propagate")
493+
}
494+
if len(ran) != 1 || ran[0] != "auth" {
495+
t.Errorf("after auth-disable failure, ran = %v; want only [auth]", ran)
496+
}
497+
}
498+
499+
// TestRunMutationPhases_NilBackupSkips: --skip-backup (nil backup fn) still
500+
// runs auth-disable then apply, in that order.
501+
func TestRunMutationPhases_NilBackupSkips(t *testing.T) {
502+
var order []string
503+
_, err := runMutationPhases(
504+
func() error { order = append(order, "auth"); return nil },
505+
nil,
506+
func() (applyStats, error) { order = append(order, "apply"); return applyStats{}, nil },
507+
)
508+
if err != nil {
509+
t.Fatalf("runMutationPhases: %v", err)
510+
}
511+
want := []string{"auth", "apply"}
512+
if len(order) != len(want) || order[0] != want[0] || order[1] != want[1] {
513+
t.Errorf("phase order with nil backup = %v, want %v", order, want)
514+
}
515+
}
516+
450517
func assertControllerOwner(t *testing.T, refs []metav1.OwnerReference, kind, name string) {
451518
t.Helper()
452519
for _, o := range refs {

docs/migration.md

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -54,9 +54,11 @@ Per cluster, the tool:
5454
1. **Inspects** the live etcd (read-only, over a port-forward with the legacy
5555
operator's client certificate): member list, cluster ID, auth status.
5656
Runs in dry-run too, so the printed plan shows the real IDs.
57-
2. **Backs up** the cluster (see below) — before anything is mutated.
58-
3. **Disables legacy auth** if enabled (the legacy NoPassword root can never
59-
match a credentials Secret; the new operator re-enables auth itself).
57+
2. **Disables legacy auth** if enabled (the legacy NoPassword root can never
58+
match a credentials Secret; the new operator re-enables auth itself). This
59+
runs **before** the backup on purpose: the snapshot Job dials etcd
60+
anonymously, and etcd rejects the Maintenance Snapshot RPC while auth is on.
61+
3. **Backs up** the cluster (see below) — before anything is mutated.
6062
4. **Creates the new CRs with prefilled status**: the `EtcdCluster` gets
6163
`status.clusterID`/`clusterToken`/`observed` (so the operator's bootstrap
6264
branch never fires against a cluster that already exists), and one
@@ -148,7 +150,9 @@ merge the CA into the referenced Secret **before** starting the new operator
148150

149151
Adoption rewires ownership of live storage, so the tool snapshots every
150152
cluster to the `--backup-s3-*`/`--backup-pvc-claim` destination **before any
151-
mutation** (a one-off Job running the operator image's snapshot agent —
153+
ownership/data-plane mutation** — the only step that precedes it is the
154+
auth-disable above, which the snapshot Job's anonymous dial depends on (a
155+
one-off Job running the operator image's snapshot agent —
152156
`--agent-image` overrides; by default the image is read from the new
153157
controller Deployment's spec, which works at replicas=0). Nothing is restored
154158
from the artifact — the data never moves — it exists purely for disaster

0 commit comments

Comments
 (0)