Skip to content

Commit 0e0963a

Browse files
jparrillclaude
andcommitted
feat(backup): integrate HCPEtcdBackup lifecycle into OADP backup flow
Add etcdSnapshot backup method that creates and monitors HCPEtcdBackup CRs during Velero backup. When etcdBackupMethod=etcdSnapshot is configured in the plugin ConfigMap, the plugin: - Creates an HCPEtcdBackup CR in the HCP namespace using BSL storage config - Copies BSL credentials to the HO namespace (remapping key for controller) - Polls the CR until backup completes or fails - Excludes etcd pods and PVCs from Velero backup (no CSI/FS backup needed) - Stores the etcd snapshot alongside the Velero backup data in the BSL The default method remains volumeSnapshot (unchanged behavior). Also cleans up dead config parameters (readoptNodes, managedServices, awsRegenPrivateLink) and registers apiextensionsv1 in the scheme for CRD existence checks. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> Signed-off-by: Juan Manuel Parrilla Madrid <jparrill@redhat.com>
1 parent 94b9444 commit 0e0963a

12 files changed

Lines changed: 1182 additions & 55 deletions

File tree

pkg/common/scheme.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import (
77
veleroapiv1 "github.com/vmware-tanzu/velero/pkg/apis/velero/v1"
88
veleroapiv2alpha1 "github.com/vmware-tanzu/velero/pkg/apis/velero/v2alpha1"
99
corev1 "k8s.io/api/core/v1"
10+
apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
1011
"k8s.io/apimachinery/pkg/runtime"
1112
)
1213

@@ -35,6 +36,9 @@ func init() {
3536
if err := hive.AddToScheme(CustomScheme); err != nil {
3637
errs = append(errs, err)
3738
}
39+
if err := apiextensionsv1.AddToScheme(CustomScheme); err != nil {
40+
errs = append(errs, err)
41+
}
3842

3943
if len(errs) > 0 {
4044
panic(errs)

pkg/common/types.go

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,9 @@ const (
1616

1717
// Integration with Hypershift, more info here: https://github.com/openshift/hypershift/pull/6195
1818
HostedClusterRestoredFromBackupAnnotation string = "hypershift.openshift.io/restored-from-backup"
19+
// Etcd snapshot URL annotation: set during backup so the restore plugin can read it
20+
// (Velero strips status from items during restore, so we persist it as an annotation)
21+
EtcdSnapshotURLAnnotation string = "hypershift.openshift.io/etcd-snapshot-url"
1922

2023
// hypershift/cluster-api kinds
2124
HostedClusterKind string = "HostedCluster"
@@ -25,6 +28,29 @@ const (
2528
PersistentVolumeClaimKind string = "PersistentVolumeClaim"
2629
ClusterDeploymentKind string = "ClusterDeployment"
2730
DataVolumeKind string = "DataVolume"
31+
HCPEtcdBackupKind string = "HCPEtcdBackup"
32+
33+
// Default HyperShift Operator namespace
34+
DefaultHONamespace string = "hypershift"
35+
// ConfigMap key to override the HO namespace
36+
ConfigKeyHONamespace string = "hoNamespace"
37+
38+
// Etcd backup method configuration
39+
ConfigKeyEtcdBackupMethod string = "etcdBackupMethod"
40+
EtcdBackupMethodVolume string = "volumeSnapshot"
41+
EtcdBackupMethodEtcdSnapshot string = "etcdSnapshot"
42+
43+
// Velero annotation to exclude specific volumes from backup
44+
BackupVolumesExcludesAnnotation string = "backup.velero.io/backup-volumes-excludes"
45+
// Etcd data volume name in the StatefulSet pod
46+
EtcdDataVolumeName string = "data"
47+
// Etcd PVC name prefix (StatefulSet pattern: {volumeName}-{stsName}-{index})
48+
EtcdPVCPrefix string = "data-etcd-"
49+
50+
// TODO(CNTRLPLANE-2685): Remove these local constants once openshift/hypershift#8139 is merged
51+
// and the vendor is updated. These must match the values used by the HCPEtcdBackup controller.
52+
BackupInProgressReason string = "BackupInProgress"
53+
BackupRejectedReason string = "BackupRejected"
2854
)
2955

3056
var (

pkg/common/utils.go

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package common
22

33
import (
44
"context"
5+
"errors"
56
"fmt"
67
"os"
78
"path/filepath"
@@ -163,6 +164,32 @@ func GetHCPNamespace(name, namespace string) string {
163164
return fmt.Sprintf("%s-%s", namespace, name)
164165
}
165166

167+
// GetHostedCluster finds the HostedCluster that owns the HCP by deriving
168+
// its namespace and name from the HCP namespace convention: {hc-namespace}-{hc-name}.
169+
func GetHostedCluster(ctx context.Context, c crclient.Client, includedNamespaces []string, hcpNamespace string) (*hyperv1.HostedCluster, error) {
170+
var errs []error
171+
for _, ns := range includedNamespaces {
172+
if ns == hcpNamespace {
173+
continue
174+
}
175+
hcList := &hyperv1.HostedClusterList{}
176+
if err := c.List(ctx, hcList, crclient.InNamespace(ns)); err != nil {
177+
errs = append(errs, fmt.Errorf("list HostedClusters in namespace %s: %w", ns, err))
178+
continue
179+
}
180+
for i := range hcList.Items {
181+
hc := &hcList.Items[i]
182+
if GetHCPNamespace(hc.Name, hc.Namespace) == hcpNamespace {
183+
return hc, nil
184+
}
185+
}
186+
}
187+
if len(errs) > 0 {
188+
return nil, errors.Join(errs...)
189+
}
190+
return nil, nil
191+
}
192+
166193
// ShouldEndPluginExecution checks if the plugin should end execution by verifying if the required
167194
// Hypershift resources (HostedControlPlane and HostedCluster) exist in the cluster.
168195
// Returns true if the plugin should end execution (i.e., if this is not a Hypershift cluster).

pkg/common/utils_test.go

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -573,6 +573,34 @@ func TestShouldEndPluginExecution(t *testing.T) {
573573
}
574574
}
575575

576+
func TestGetHostedCluster(t *testing.T) {
577+
scheme := runtime.NewScheme()
578+
_ = hyperv1.AddToScheme(scheme)
579+
580+
t.Run("When GetHostedCluster runs with a HostedCluster matching HCP namespace, It Should return that cluster", func(t *testing.T) {
581+
g := NewWithT(t)
582+
hc := &hyperv1.HostedCluster{
583+
ObjectMeta: metav1.ObjectMeta{Name: "my-cluster", Namespace: "clusters"},
584+
}
585+
c := fake.NewClientBuilder().WithScheme(scheme).WithObjects(hc).Build()
586+
587+
result, err := GetHostedCluster(context.TODO(), c, []string{"clusters", "clusters-my-cluster"}, "clusters-my-cluster")
588+
g.Expect(err).NotTo(HaveOccurred())
589+
g.Expect(result).NotTo(BeNil())
590+
g.Expect(result.Name).To(Equal("my-cluster"))
591+
g.Expect(result.Namespace).To(Equal("clusters"))
592+
})
593+
594+
t.Run("When GetHostedCluster runs with no HostedClusters in client, It Should return nil", func(t *testing.T) {
595+
g := NewWithT(t)
596+
c := fake.NewClientBuilder().WithScheme(scheme).Build()
597+
598+
result, err := GetHostedCluster(context.TODO(), c, []string{"clusters", "clusters-my-cluster"}, "clusters-my-cluster")
599+
g.Expect(err).NotTo(HaveOccurred())
600+
g.Expect(result).To(BeNil())
601+
})
602+
}
603+
576604
func TestCRDExists(t *testing.T) {
577605
scheme := runtime.NewScheme()
578606
_ = hyperv1.AddToScheme(scheme)

pkg/core/backup.go

Lines changed: 166 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88
common "github.com/openshift/hypershift-oadp-plugin/pkg/common"
99
plugtypes "github.com/openshift/hypershift-oadp-plugin/pkg/core/types"
1010
validation "github.com/openshift/hypershift-oadp-plugin/pkg/core/validation"
11+
"github.com/openshift/hypershift-oadp-plugin/pkg/etcdbackup"
1112
"github.com/openshift/hypershift-oadp-plugin/pkg/platform/agent"
1213
hyperv1 "github.com/openshift/hypershift/api/hypershift/v1beta1"
1314
"github.com/sirupsen/logrus"
@@ -31,6 +32,12 @@ type BackupPlugin struct {
3132
validator validation.BackupValidator
3233
hcp *hyperv1.HostedControlPlane
3334
*plugtypes.BackupOptions
35+
36+
// Etcd backup orchestration
37+
etcdOrchestrator *etcdbackup.Orchestrator
38+
hoNamespace string
39+
etcdBackupMethod string
40+
etcdSnapshotURL string // populated after HCPEtcdBackup completes
3441
}
3542

3643
// NewBackupPlugin instantiates BackupPlugin.
@@ -71,12 +78,27 @@ func NewBackupPlugin(logger logrus.FieldLogger) (*BackupPlugin, error) {
7178
Client: client,
7279
}
7380

81+
hoNamespace := common.DefaultHONamespace
82+
if v, ok := pluginConfig.Data[common.ConfigKeyHONamespace]; ok && v != "" {
83+
hoNamespace = v
84+
}
85+
86+
etcdBackupMethod := common.EtcdBackupMethodVolume
87+
if v, ok := pluginConfig.Data[common.ConfigKeyEtcdBackupMethod]; ok && v != "" {
88+
etcdBackupMethod = v
89+
}
90+
if etcdBackupMethod != common.EtcdBackupMethodVolume && etcdBackupMethod != common.EtcdBackupMethodEtcdSnapshot {
91+
return nil, fmt.Errorf("invalid etcdBackupMethod %q: must be %q or %q", etcdBackupMethod, common.EtcdBackupMethodVolume, common.EtcdBackupMethodEtcdSnapshot)
92+
}
93+
7494
bp := &BackupPlugin{
75-
log: logger,
76-
client: client,
77-
config: pluginConfig.Data,
78-
ctx: ctx,
79-
validator: validator,
95+
log: logger,
96+
client: client,
97+
config: pluginConfig.Data,
98+
ctx: ctx,
99+
validator: validator,
100+
hoNamespace: hoNamespace,
101+
etcdBackupMethod: etcdBackupMethod,
80102
}
81103

82104
if bp.BackupOptions, err = bp.validator.ValidatePluginConfig(bp.config); err != nil {
@@ -119,7 +141,6 @@ func (p *BackupPlugin) Execute(item runtime.Unstructured, backup *velerov1.Backu
119141
}
120142
return nil, nil, fmt.Errorf("error getting HCP namespace: %v", err)
121143
}
122-
123144
}
124145

125146
kind := item.GetObjectKind().GroupVersionKind().Kind
@@ -134,6 +155,16 @@ func (p *BackupPlugin) Execute(item runtime.Unstructured, backup *velerov1.Backu
134155
return nil, nil, fmt.Errorf("error checking platform configuration: %v", err)
135156
}
136157

158+
// Etcd backup: create after validation, wait for completion
159+
if p.etcdBackupMethod == common.EtcdBackupMethodEtcdSnapshot {
160+
if err := p.createEtcdBackup(ctx, backup); err != nil {
161+
return nil, nil, fmt.Errorf("error creating HCPEtcdBackup: %v", err)
162+
}
163+
}
164+
if err := p.waitForEtcdBackupCompletion(ctx); err != nil {
165+
return nil, nil, err
166+
}
167+
137168
case kind == common.HostedClusterKind:
138169
metadata, err := meta.Accessor(item)
139170
if err != nil {
@@ -142,16 +173,53 @@ func (p *BackupPlugin) Execute(item runtime.Unstructured, backup *velerov1.Backu
142173
common.AddAnnotation(metadata, common.HostedClusterRestoredFromBackupAnnotation, "")
143174
p.log.Infof("Added restore annotation to HostedCluster %s", metadata.GetName())
144175

145-
case kind == "Pod":
146-
// In case of FSBackup, we need to add the label to the pod
147-
if backup.Spec.DefaultVolumesToFsBackup != nil && !*backup.Spec.DefaultVolumesToFsBackup {
148-
metadata, err := meta.Accessor(item)
149-
if err != nil {
150-
return nil, nil, fmt.Errorf("error getting metadata accessor: %v", err)
176+
// Etcd backup: create if not yet created (HC may arrive before HCP),
177+
// wait for completion, and inject snapshotURL into the HC item.
178+
// Velero captures the item as-is from the API server before the HCPEtcdBackup
179+
// controller updates the HC status with lastSuccessfulEtcdBackupURL.
180+
// We must inject it here so the backed-up HC contains the URL for restore.
181+
if p.etcdBackupMethod == common.EtcdBackupMethodEtcdSnapshot {
182+
if err := p.createEtcdBackup(ctx, backup); err != nil {
183+
return nil, nil, fmt.Errorf("error creating HCPEtcdBackup: %v", err)
151184
}
185+
}
186+
if err := p.waitForEtcdBackupCompletion(ctx); err != nil {
187+
return nil, nil, err
188+
}
189+
if p.etcdSnapshotURL != "" {
190+
// Persist as annotation so the restore plugin can read it
191+
// (Velero strips status from items during restore)
192+
common.AddAnnotation(metadata, common.EtcdSnapshotURLAnnotation, p.etcdSnapshotURL)
193+
p.log.Infof("Added etcd snapshot URL annotation to HostedCluster %s: %s", metadata.GetName(), p.etcdSnapshotURL)
152194

153-
if strings.Contains(metadata.GetName(), "etcd-") {
154-
common.AddLabel(metadata, common.FSBackupLabelName, "true")
195+
unstructuredContent := item.UnstructuredContent()
196+
status, ok := unstructuredContent["status"].(map[string]interface{})
197+
if !ok {
198+
status = map[string]interface{}{}
199+
unstructuredContent["status"] = status
200+
}
201+
status["lastSuccessfulEtcdBackupURL"] = p.etcdSnapshotURL
202+
item.SetUnstructuredContent(unstructuredContent)
203+
p.log.Infof("Injected lastSuccessfulEtcdBackupURL into HostedCluster %s: %s", metadata.GetName(), p.etcdSnapshotURL)
204+
}
205+
206+
case kind == "Pod":
207+
metadata, err := meta.Accessor(item)
208+
if err != nil {
209+
return nil, nil, fmt.Errorf("error getting metadata accessor: %v", err)
210+
}
211+
212+
if strings.Contains(metadata.GetName(), "etcd-") {
213+
switch p.etcdBackupMethod {
214+
case common.EtcdBackupMethodEtcdSnapshot:
215+
// Skip etcd pods entirely, snapshot is handled by HCPEtcdBackup.
216+
// This prevents both FSBackup and CSI VolumeSnapshots of etcd volumes.
217+
p.log.Infof("Skipping etcd pod %s from backup (using etcdSnapshot method)", metadata.GetName())
218+
return nil, nil, nil
219+
case common.EtcdBackupMethodVolume:
220+
if backup.Spec.DefaultVolumesToFsBackup != nil && !*backup.Spec.DefaultVolumesToFsBackup {
221+
common.AddLabel(metadata, common.FSBackupLabelName, "true")
222+
}
155223
}
156224
}
157225

@@ -172,7 +240,91 @@ func (p *BackupPlugin) Execute(item runtime.Unstructured, backup *velerov1.Backu
172240
if _, exists := labels[common.KubevirtRHCOSLabel]; exists {
173241
return nil, nil, nil
174242
}
243+
244+
// Exclude etcd data PVCs when using etcdSnapshot method.
245+
// PVC names follow the StatefulSet pattern: data-etcd-{index}
246+
if kind == common.PersistentVolumeClaimKind &&
247+
strings.HasPrefix(metadata.GetName(), common.EtcdPVCPrefix) &&
248+
p.etcdBackupMethod == common.EtcdBackupMethodEtcdSnapshot {
249+
p.log.Infof("Excluding etcd PVC %s from backup (using etcdSnapshot method)", metadata.GetName())
250+
return nil, nil, nil
251+
}
175252
}
176253

177254
return item, nil, nil
178255
}
256+
257+
// createEtcdBackup creates an HCPEtcdBackup CR in the HCP namespace.
258+
// It is idempotent: if the orchestrator already created a backup, it returns immediately.
259+
// Requires the HCPEtcdBackup CRD to exist in the cluster (safenet check).
260+
func (p *BackupPlugin) createEtcdBackup(ctx context.Context, backup *velerov1.Backup) error {
261+
// Already created by a previous Execute() call
262+
if p.etcdOrchestrator != nil && p.etcdOrchestrator.IsCreated() {
263+
return nil
264+
}
265+
266+
crdExists, err := common.CRDExists(ctx, "hcpetcdbackups.hypershift.openshift.io", p.client)
267+
if err != nil {
268+
return fmt.Errorf("failed to check for HCPEtcdBackup CRD: %w", err)
269+
}
270+
if !crdExists {
271+
return fmt.Errorf("etcdBackupMethod is %q but HCPEtcdBackup CRD not found in the cluster", common.EtcdBackupMethodEtcdSnapshot)
272+
}
273+
274+
oadpNS, err := common.GetCurrentNamespace()
275+
if err != nil {
276+
return fmt.Errorf("failed to get OADP namespace: %w", err)
277+
}
278+
279+
p.etcdOrchestrator = etcdbackup.NewOrchestrator(p.log, p.client, p.hoNamespace, oadpNS)
280+
281+
// Fetch the HostedCluster for encryption config
282+
hc, err := common.GetHostedCluster(ctx, p.client, backup.Spec.IncludedNamespaces, p.hcp.Namespace)
283+
if err != nil {
284+
p.log.Warnf("Could not find HostedCluster for encryption config: %v", err)
285+
}
286+
287+
if err := p.etcdOrchestrator.CreateEtcdBackup(ctx, backup, p.hcp.Namespace, hc); err != nil {
288+
if cleanupErr := p.etcdOrchestrator.CleanupCredentialSecret(ctx); cleanupErr != nil {
289+
p.log.Warnf("Failed to cleanup credential Secret after create error: %v", cleanupErr)
290+
}
291+
return err
292+
}
293+
294+
if err := p.etcdOrchestrator.VerifyInProgress(ctx); err != nil {
295+
if cleanupErr := p.etcdOrchestrator.CleanupCredentialSecret(ctx); cleanupErr != nil {
296+
p.log.Warnf("Failed to cleanup credential Secret after verify error: %v", cleanupErr)
297+
}
298+
return err
299+
}
300+
301+
return nil
302+
}
303+
304+
// waitForEtcdBackupCompletion waits for the HCPEtcdBackup to finish and cleans up
305+
// the copied credential Secret. Caches the snapshotURL on the plugin struct so it
306+
// is available regardless of item processing order (HC before HCP or vice versa).
307+
// It is a no-op if no etcd backup was created.
308+
func (p *BackupPlugin) waitForEtcdBackupCompletion(ctx context.Context) error {
309+
if p.etcdOrchestrator == nil || !p.etcdOrchestrator.IsCreated() {
310+
return nil
311+
}
312+
313+
// Already completed in a previous Execute() call
314+
if p.etcdSnapshotURL != "" {
315+
return nil
316+
}
317+
318+
snapshotURL, err := p.etcdOrchestrator.WaitForCompletion(ctx)
319+
if err != nil {
320+
return fmt.Errorf("HCPEtcdBackup failed: %v", err)
321+
}
322+
p.etcdSnapshotURL = snapshotURL
323+
p.log.Infof("HCPEtcdBackup completed, snapshotURL: %s", snapshotURL)
324+
325+
if cleanupErr := p.etcdOrchestrator.CleanupCredentialSecret(ctx); cleanupErr != nil {
326+
p.log.Warnf("Failed to cleanup etcd backup credential Secret: %v", cleanupErr)
327+
}
328+
329+
return nil
330+
}

pkg/core/types/types.go

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package types
33
var (
44
BackupCommonResources = []string{
55
"hostedclusters", "hostedcluster", "hostedcontrolplanes", "hostedcontrolplane", "nodepools", "nodepool",
6+
"hcpetcdbackups", "hcpetcdbackup",
67
"secrets", "secret", "configmaps", "configmap", "persistentvolumes", "persistentvolume", "persistentvolumeclaims", "persistentvolumeclaim", "pods", "pod", "statefulsets", "statefulset", "deployments", "deployment",
78
"clusters", "cluster", "machines", "machine", "machinedeployments", "machinedeployment", "machinesets", "machineset",
89
"serviceaccounts", "serviceaccount", "roles", "role", "rolebindings", "rolebinding",
@@ -20,17 +21,9 @@ var (
2021
type BackupOptions struct {
2122
// Migration is a flag to indicate if the backup is for migration purposes.
2223
Migration bool
23-
// Readopt Nodes is a flag to indicate if the nodes should be reprovisioned or not during restore.
24-
ReadoptNodes bool
25-
// ManagedServices is a flag to indicate if the backup is done for ManagedServices like ROSA, ARO, etc.
26-
ManagedServices bool
2724
}
2825

2926
type RestoreOptions struct {
3027
// Migration is a flag to indicate if the backup is for migration purposes.
3128
Migration bool
32-
// Readopt Nodes is a flag to indicate if the nodes should be reprovisioned or not during restore.
33-
ReadoptNodes bool
34-
// ManagedServices is a flag to indicate if the backup is done for ManagedServices like ROSA, ARO, etc.
35-
ManagedServices bool
3629
}

0 commit comments

Comments
 (0)