Skip to content

Commit f4afff4

Browse files
committed
fix(main): add support peer-auto-tls
Signed-off-by: Andrey Kolkov <androndo@gmail.com>
1 parent d7dcf6a commit f4afff4

12 files changed

Lines changed: 319 additions & 19 deletions

File tree

api/v1alpha2/cel_validation_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -477,7 +477,7 @@ func TestCEL_TLSPeerCertManagerAndSecretRefMutuallyExclusive(t *testing.T) {
477477
_ = k8s.Delete(ctx, c)
478478
t.Fatalf("apiserver accepted both peer.secretRef and peer.certManager; expected rejection")
479479
}
480-
if !strings.Contains(err.Error(), "exactly one of spec.tls.peer.secretRef or spec.tls.peer.certManager") {
480+
if !strings.Contains(err.Error(), "exactly one of spec.tls.peer.secretRef") {
481481
t.Fatalf("error did not mention peer mutual exclusion: %v", err)
482482
}
483483
}

api/v1alpha2/etcdmember_types.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,17 @@ type EtcdMemberTLS struct {
4444
// mTLS (--peer-client-cert-auth=true).
4545
// +optional
4646
PeerSecretRef *corev1.LocalObjectReference `json:"peerSecretRef,omitempty"`
47+
48+
// PeerAutoTLS is operator-managed plumbing: it carries the cluster's
49+
// reserved "etcd-operator.cozystack.io/peer-auto-tls" annotation down to
50+
// the member so buildPod renders etcd's --peer-auto-tls (self-signed, no
51+
// shared CA) instead of mounting a peer secret. INSECURE — peer is
52+
// encrypted but NOT authenticated. Set only on clusters adopted from a
53+
// legacy --peer-auto-tls cluster, and never together with PeerSecretRef
54+
// (an explicit peer secret supersedes the annotation). Users do not set
55+
// this directly; the cluster controller derives it.
56+
// +optional
57+
PeerAutoTLS bool `json:"peerAutoTLS,omitempty"`
4758
}
4859

4960
// Condition types for EtcdMember.

charts/etcd-operator/crd-bases/etcd-operator.cozystack.io_etcdmembers.yaml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1336,6 +1336,17 @@ spec:
13361336
type: string
13371337
type: object
13381338
x-kubernetes-map-type: atomic
1339+
peerAutoTLS:
1340+
description: |-
1341+
PeerAutoTLS is operator-managed plumbing: it carries the cluster's
1342+
reserved "etcd-operator.cozystack.io/peer-auto-tls" annotation down to
1343+
the member so buildPod renders etcd's --peer-auto-tls (self-signed, no
1344+
shared CA) instead of mounting a peer secret. INSECURE — peer is
1345+
encrypted but NOT authenticated. Set only on clusters adopted from a
1346+
legacy --peer-auto-tls cluster, and never together with PeerSecretRef
1347+
(an explicit peer secret supersedes the annotation). Users do not set
1348+
this directly; the cluster controller derives it.
1349+
type: boolean
13391350
peerSecretRef:
13401351
description: |-
13411352
PeerSecretRef mirrors EtcdClusterTLS.Peer.SecretRef. When nil, the

cmd/etcd-migrate/main.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,7 @@ func runMigration(ctx context.Context, cfg *Config, stdin io.Reader, stdout io.W
221221
fmt.Fprintln(stdout, "\nNEXT: scale the new operator up — it will take over the adopted clusters without touching the pods:\n kubectl -n "+
222222
mustNamespace(cfg.NewController)+" scale deploy "+mustName(cfg.NewController)+" --replicas=1")
223223
}
224+
renderSecuritySummary(stdout, plans)
224225
printCRDNotice(stdout)
225226
return errorIfPlanFailed(plans)
226227
}

cmd/etcd-migrate/output.go

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,9 @@ func render(w io.Writer, plans []migrate.ResourcePlan) {
3030
for _, e := range p.Errors {
3131
fmt.Fprintf(w, " ERROR: %s\n", e)
3232
}
33+
for _, sw := range p.SecurityWarnings {
34+
fmt.Fprintf(w, " ⚠️ SECURITY: %s\n", sw)
35+
}
3336
for _, warn := range p.Warnings {
3437
fmt.Fprintf(w, " warning: %s\n", warn)
3538
}
@@ -85,6 +88,29 @@ func renderManifest(w io.Writer, obj client.Object) {
8588
_, _ = w.Write(data)
8689
}
8790

91+
// renderSecuritySummary re-surfaces every SecurityWarning from the plans that
92+
// were actually adopted, AFTER --apply has run. The pre-apply plan already
93+
// shows them, but for a security-posture downgrade (e.g. an unauthenticated
94+
// --peer-auto-tls peer plane) that is not enough: the plan scrolls past, so the
95+
// human operator must see the downgrade again in the closing summary, once it is a
96+
// fait accompli. No-op when nothing was downgraded.
97+
func renderSecuritySummary(w io.Writer, plans []migrate.ResourcePlan) {
98+
var any bool
99+
for i := range plans {
100+
p := &plans[i]
101+
if p.Action != migrate.ActionAdopt || len(p.SecurityWarnings) == 0 {
102+
continue
103+
}
104+
if !any {
105+
fmt.Fprintln(w, "\n⚠️ SECURITY — review before relying on the adopted clusters:")
106+
any = true
107+
}
108+
for _, sw := range p.SecurityWarnings {
109+
fmt.Fprintf(w, " • %s/%s: %s\n", p.Namespace, p.SourceName, sw)
110+
}
111+
}
112+
}
113+
88114
// printCRDNotice reminds about the one cleanup step the tool never performs.
89115
func printCRDNotice(w io.Writer) {
90116
fmt.Fprintln(w, `

controllers/etcdmember_controller.go

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -712,7 +712,16 @@ func (r *EtcdMemberReconciler) buildPod(member *lll.EtcdMember) *corev1.Pod {
712712
Name: "tls-client", MountPath: "/etc/etcd/tls/client", ReadOnly: true,
713713
})
714714
}
715-
if peerTLS {
715+
switch {
716+
case member.Spec.TLS != nil && member.Spec.TLS.PeerAutoTLS:
717+
// INSECURE legacy-compat peer mode: etcd generates a self-signed peer
718+
// cert per member with no shared CA, so peer is encrypted but NOT
719+
// authenticated and there is nothing to mount. Only reached for
720+
// clusters adopted from a --peer-auto-tls legacy cluster: the cluster
721+
// controller derives this from the reserved AnnPeerAutoTLS annotation
722+
// etcd-migrate stamps (see AnnPeerAutoTLS).
723+
cmd = append(cmd, "--peer-auto-tls")
724+
case peerTLS:
716725
cmd = append(cmd,
717726
"--peer-cert-file=/etc/etcd/tls/peer/tls.crt",
718727
"--peer-key-file=/etc/etcd/tls/peer/tls.key",

controllers/etcdmember_controller_test.go

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2150,6 +2150,39 @@ func TestBuildPod_PeerTLSAlwaysMTLS(t *testing.T) {
21502150
}
21512151
}
21522152

2153+
// TestBuildPod_PeerAutoTLS: the legacy-compat insecure peer mode emits
2154+
// --peer-auto-tls on an https peer listener and mounts NO peer secret (etcd
2155+
// self-signs; there is no shared CA and no client-cert-auth).
2156+
func TestBuildPod_PeerAutoTLS(t *testing.T) {
2157+
r := &EtcdMemberReconciler{}
2158+
pod := r.buildPod(&lll.EtcdMember{
2159+
ObjectMeta: metav1.ObjectMeta{Name: "m", Namespace: "ns"},
2160+
Spec: lll.EtcdMemberSpec{
2161+
ClusterName: "test", Version: "3.5.17", Storage: lll.StorageSpec{Size: quickQty(t, "1Gi")},
2162+
TLS: &lll.EtcdMemberTLS{PeerAutoTLS: true},
2163+
},
2164+
})
2165+
cmd := pod.Spec.Containers[0].Command
2166+
if !cmdContains(cmd, "--listen-peer-urls=https://0.0.0.0:2380") {
2167+
t.Fatalf("peer listen URL not https: %v", cmd)
2168+
}
2169+
if !cmdContains(cmd, "--peer-auto-tls") {
2170+
t.Fatalf("expected --peer-auto-tls; got %v", cmd)
2171+
}
2172+
for _, unwanted := range []string{
2173+
"--peer-cert-file=/etc/etcd/tls/peer/tls.crt",
2174+
"--peer-trusted-ca-file=/etc/etcd/tls/peer/ca.crt",
2175+
"--peer-client-cert-auth=true",
2176+
} {
2177+
if cmdContains(cmd, unwanted) {
2178+
t.Fatalf("auto-tls must not set BYO peer flag %q: %v", unwanted, cmd)
2179+
}
2180+
}
2181+
if v := volumeFor(pod, "tls-peer"); v != nil {
2182+
t.Fatalf("auto-tls must mount no peer secret; got volume %+v", v)
2183+
}
2184+
}
2185+
21532186
// TestBuildPod_AlwaysExposesMetricsPort guards the cozystack-shaped
21542187
// monitoring contract: VMPodScrape (and equivalent Prometheus scrapers)
21552188
// target the named "metrics" container port unconditionally, and the
@@ -2460,6 +2493,7 @@ func TestDeriveMemberTLS(t *testing.T) {
24602493
hasClient bool
24612494
hasPeer bool
24622495
clientMTLS bool
2496+
peerAutoTLS bool
24632497
serverSecret string
24642498
opSecret string
24652499
peerSecret string
@@ -2536,6 +2570,29 @@ func TestDeriveMemberTLS(t *testing.T) {
25362570
}}}),
25372571
want: want{hasPeer: true, peerSecret: "etcd-peer-tls"},
25382572
},
2573+
{
2574+
// Legacy-compat --peer-auto-tls carried on the reserved cluster
2575+
// annotation (no typed spec.tls.peer) projects to PeerAutoTLS.
2576+
name: "peer-auto-tls annotation only",
2577+
in: func() *lll.EtcdCluster {
2578+
c := withName(&lll.EtcdCluster{})
2579+
c.Annotations = map[string]string{AnnPeerAutoTLS: "true"}
2580+
return c
2581+
}(),
2582+
want: want{peerAutoTLS: true},
2583+
},
2584+
{
2585+
// An explicit peer secretRef supersedes the annotation.
2586+
name: "peer secretRef beats peer-auto-tls annotation",
2587+
in: func() *lll.EtcdCluster {
2588+
c := withName(&lll.EtcdCluster{Spec: lll.EtcdClusterSpec{TLS: &lll.EtcdClusterTLS{
2589+
Peer: &lll.PeerTLS{SecretRef: &corev1.LocalObjectReference{Name: "p"}},
2590+
}}})
2591+
c.Annotations = map[string]string{AnnPeerAutoTLS: "true"}
2592+
return c
2593+
}(),
2594+
want: want{hasPeer: true, peerSecret: "p"},
2595+
},
25392596
}
25402597
for _, tc := range cases {
25412598
t.Run(tc.name, func(t *testing.T) {
@@ -2555,6 +2612,9 @@ func TestDeriveMemberTLS(t *testing.T) {
25552612
if (got.PeerSecretRef != nil) != tc.want.hasPeer {
25562613
t.Fatalf("hasPeer = %v; want %v", got.PeerSecretRef != nil, tc.want.hasPeer)
25572614
}
2615+
if got.PeerAutoTLS != tc.want.peerAutoTLS {
2616+
t.Fatalf("PeerAutoTLS = %v; want %v", got.PeerAutoTLS, tc.want.peerAutoTLS)
2617+
}
25582618
if got.ClientMTLS != tc.want.clientMTLS {
25592619
t.Fatalf("ClientMTLS = %v; want %v", got.ClientMTLS, tc.want.clientMTLS)
25602620
}

controllers/helpers.go

Lines changed: 55 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,22 @@ const (
5858
// (validDataDirSubPath) — an annotation has no apiserver schema, so the
5959
// controller fails closed against a mount-escaping value.
6060
AnnDataDirSubPath = ReservedAnnotationPrefix + "data-dir-subpath"
61+
62+
// AnnPeerAutoTLS, set to "true" on an EtcdCluster, runs the peer plane
63+
// with etcd's --peer-auto-tls: per-member self-signed certs, NO shared
64+
// CA, so peer traffic is encrypted but NOT authenticated. This is a
65+
// migration-only knob etcd-migrate stamps when adopting a legacy cluster
66+
// that ran the previous operator's unconditional --peer-auto-tls default
67+
// (no CA exists to do real mTLS, so a strict-mTLS replacement could never
68+
// rejoin the still-auto-tls members). Unlike AnnHeadlessServiceName /
69+
// AnnDataDirSubPath it is cluster-level and does NOT self-wipe: the
70+
// controller propagates it to every member it builds so replacement/
71+
// scaled members keep interoperating. Deliberately NOT a typed spec field
72+
// — an unauthenticated peer plane must not be a discoverable, CEL-blessed
73+
// option for new clusters; an undocumented reserved key is the lesser
74+
// footgun. Superseded by an explicit spec.tls.peer.secretRef/certManager
75+
// (real mTLS wins; precedence lives in clusterPeerAutoTLS).
76+
AnnPeerAutoTLS = ReservedAnnotationPrefix + "peer-auto-tls"
6177
)
6278

6379
// etcdDataDirRoot is the mount path of every member's data volume; --data-dir
@@ -136,14 +152,33 @@ func clusterClientScheme(cluster *lll.EtcdCluster) string {
136152
}
137153

138154
// clusterPeerScheme returns "https" when the cluster has peer TLS configured,
139-
// "http" otherwise.
155+
// "http" otherwise. The legacy-compat --peer-auto-tls mode (carried on the
156+
// AnnPeerAutoTLS annotation, no typed spec.tls.peer) also serves peer over
157+
// https, so it counts too.
140158
func clusterPeerScheme(cluster *lll.EtcdCluster) string {
141159
if cluster != nil && cluster.Spec.TLS != nil && cluster.Spec.TLS.Peer != nil {
142160
return "https"
143161
}
162+
if clusterPeerAutoTLS(cluster) {
163+
return "https"
164+
}
144165
return "http"
145166
}
146167

168+
// clusterPeerAutoTLS reports whether the cluster runs the legacy-compat
169+
// --peer-auto-tls peer mode, carried on the reserved AnnPeerAutoTLS annotation
170+
// (see its doc). An explicit typed peer TLS mode (secretRef/certManager) always
171+
// wins, so the annotation is honoured only when spec.tls.peer is unset.
172+
func clusterPeerAutoTLS(cluster *lll.EtcdCluster) bool {
173+
if cluster == nil {
174+
return false
175+
}
176+
if cluster.Spec.TLS != nil && cluster.Spec.TLS.Peer != nil {
177+
return false
178+
}
179+
return cluster.Annotations[AnnPeerAutoTLS] == "true"
180+
}
181+
147182
// memberClientScheme is the per-member counterpart to clusterClientScheme,
148183
// keyed off the propagated EtcdMemberSpec.TLS.
149184
func memberClientScheme(member *lll.EtcdMember) string {
@@ -155,7 +190,8 @@ func memberClientScheme(member *lll.EtcdMember) string {
155190

156191
// memberPeerScheme is the per-member counterpart to clusterPeerScheme.
157192
func memberPeerScheme(member *lll.EtcdMember) string {
158-
if member != nil && member.Spec.TLS != nil && member.Spec.TLS.PeerSecretRef != nil {
193+
if member != nil && member.Spec.TLS != nil &&
194+
(member.Spec.TLS.PeerSecretRef != nil || member.Spec.TLS.PeerAutoTLS) {
159195
return "https"
160196
}
161197
return "http"
@@ -179,19 +215,27 @@ func buildInitialCluster(peerScheme string, names []string, service, namespace s
179215
// Secret names regardless of source, so buildPod / ensurePod /
180216
// buildOperatorTLSConfig stay source-agnostic.
181217
func deriveMemberTLS(cluster *lll.EtcdCluster) *lll.EtcdMemberTLS {
182-
if cluster == nil || cluster.Spec.TLS == nil {
183-
return nil
184-
}
185-
if cluster.Spec.TLS.Client == nil && cluster.Spec.TLS.Peer == nil {
218+
if cluster == nil {
186219
return nil
187220
}
188221
out := &lll.EtcdMemberTLS{}
189-
if name := serverSecretName(cluster); name != "" {
190-
out.ClientServerSecretRef = &corev1.LocalObjectReference{Name: name}
191-
out.ClientMTLS = operatorClientSecretName(cluster) != ""
222+
if cluster.Spec.TLS != nil {
223+
if name := serverSecretName(cluster); name != "" {
224+
out.ClientServerSecretRef = &corev1.LocalObjectReference{Name: name}
225+
out.ClientMTLS = operatorClientSecretName(cluster) != ""
226+
}
227+
if name := peerSecretName(cluster); name != "" {
228+
out.PeerSecretRef = &corev1.LocalObjectReference{Name: name}
229+
}
230+
}
231+
// Carry the legacy-compat --peer-auto-tls posture (a cluster-level
232+
// reserved annotation, not typed spec) down to the member. clusterPeerAutoTLS
233+
// already yields false when an explicit peer mode is set, so real mTLS wins.
234+
if out.PeerSecretRef == nil && clusterPeerAutoTLS(cluster) {
235+
out.PeerAutoTLS = true
192236
}
193-
if name := peerSecretName(cluster); name != "" {
194-
out.PeerSecretRef = &corev1.LocalObjectReference{Name: name}
237+
if out.ClientServerSecretRef == nil && out.PeerSecretRef == nil && !out.PeerAutoTLS {
238+
return nil
195239
}
196240
return out
197241
}

docs/migration.md

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,47 @@ TLS caveat: the legacy API kept CAs in separate Secrets
159159
merge the CA into the referenced Secret **before** starting the new operator
160160
(with cert-manager-issued secrets, `ca.crt` is typically already in place).
161161

162+
### Peer auto-TLS (legacy `--peer-auto-tls`)
163+
164+
The legacy operator ran etcd with `--peer-auto-tls` **by default** unless
165+
you supplied a BYO peer Secret. Under that flag each member generates its own
166+
self-signed peer certificate and there is **no shared CA**: peer traffic is
167+
encrypted but **not authenticated** — any TLS-capable workload that can reach a
168+
member's `:2380` can peer with the cluster or impersonate a member. This is a
169+
weaker posture than the real mutual-TLS the native operator offers via
170+
`spec.tls.peer.secretRef` / `spec.tls.peer.certManager`, and it is **not** the
171+
same thing as the [SAN-coverage caveat](#endpoint-compatibility) above (that is
172+
about explicit mTLS certs needing both DNS domains during rollover — a different
173+
scenario; don't conflate them).
174+
175+
The tool **detects this and carries it forward**, because it has to: with no CA
176+
in existence there is nothing to mint real mTLS certs from, so a replacement or
177+
scaled-up member running strict mTLS (or plaintext peer) could never rejoin the
178+
still-auto-tls members. Carry-forward keeps replacement/scale working.
179+
180+
It is **not** exposed as a typed spec field — an unauthenticated peer plane must
181+
not be a discoverable, first-class option for new clusters. Instead the tool
182+
stamps a reserved cluster annotation:
183+
184+
```yaml
185+
metadata:
186+
annotations:
187+
etcd-operator.cozystack.io/peer-auto-tls: "true"
188+
```
189+
190+
The operator reads it and propagates `--peer-auto-tls` to every member it builds
191+
for that cluster. It is superseded by an explicit `spec.tls.peer.secretRef` /
192+
`certManager` (real mTLS always wins). The dry-run plan flags the adoption with a
193+
loud `⚠️ SECURITY:` line, and the post-`--apply` summary re-surfaces it — you
194+
cannot complete a migration without being told you adopted an unauthenticated
195+
peer plane.
196+
197+
**The only off-ramp to real mTLS is delete-and-recreate** (`spec.tls` is
198+
immutable), or a careful manual rolling restart onto BYO/cert-manager peer
199+
certs. Because strict-mTLS and auto-tls members **cannot peer with each other**,
200+
either route has a brief no-quorum window at the cutover — plan it like any
201+
peer-cert rotation.
202+
162203
### The safety backup
163204

164205
Adoption rewires ownership of live storage, so the tool snapshots every

0 commit comments

Comments
 (0)