Skip to content

Commit 1d580b1

Browse files
jparrill and claude committed
feat(backup): integrate HCPEtcdBackup lifecycle into OADP backup flow
Add etcdSnapshot backup method that creates and monitors HCPEtcdBackup CRs during Velero backup.

When etcdBackupMethod=etcdSnapshot is configured in the plugin ConfigMap, the plugin:
- Creates an HCPEtcdBackup CR in the HCP namespace using BSL storage config
- Copies BSL credentials to the HO namespace (remapping key for controller)
- Polls the CR until backup completes or fails
- Excludes etcd pods and PVCs from Velero backup (no CSI/FS backup needed)
- Stores the etcd snapshot alongside the Velero backup data in the BSL

The default method remains volumeSnapshot (unchanged behavior).

Also cleans up dead config parameters (readoptNodes, managedServices, awsRegenPrivateLink) and registers apiextensionsv1 in the scheme for CRD existence checks.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Signed-off-by: Juan Manuel Parrilla Madrid <jparrill@redhat.com>
1 parent 36545d4 commit 1d580b1

12 files changed

Lines changed: 1122 additions & 54 deletions

File tree

pkg/common/scheme.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import (
77
veleroapiv1 "github.com/vmware-tanzu/velero/pkg/apis/velero/v1"
88
veleroapiv2alpha1 "github.com/vmware-tanzu/velero/pkg/apis/velero/v2alpha1"
99
corev1 "k8s.io/api/core/v1"
10+
apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
1011
"k8s.io/apimachinery/pkg/runtime"
1112
)
1213

@@ -35,6 +36,9 @@ func init() {
3536
if err := hive.AddToScheme(CustomScheme); err != nil {
3637
errs = append(errs, err)
3738
}
39+
if err := apiextensionsv1.AddToScheme(CustomScheme); err != nil {
40+
errs = append(errs, err)
41+
}
3842

3943
if len(errs) > 0 {
4044
panic(errs)

pkg/common/types.go

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,30 @@ const (
2525
PersistentVolumeClaimKind string = "PersistentVolumeClaim"
2626
ClusterDeploymentKind string = "ClusterDeployment"
2727
DataVolumeKind string = "DataVolume"
28+
HCPEtcdBackupKind string = "HCPEtcdBackup"
29+
30+
// Default HyperShift Operator namespace
31+
DefaultHONamespace string = "hypershift"
32+
// ConfigMap key to override the HO namespace
33+
ConfigKeyHONamespace string = "hoNamespace"
34+
35+
// Etcd backup method configuration
36+
ConfigKeyEtcdBackupMethod string = "etcdBackupMethod"
37+
EtcdBackupMethodVolume string = "volumeSnapshot"
38+
EtcdBackupMethodEtcdSnapshot string = "etcdSnapshot"
39+
40+
// Velero annotation to exclude specific volumes from backup
41+
BackupVolumesExcludesAnnotation string = "backup.velero.io/backup-volumes-excludes"
42+
// Etcd data volume name in the StatefulSet pod
43+
EtcdDataVolumeName string = "data"
44+
// Etcd PVC name prefix (StatefulSet pattern: {volumeName}-{stsName}-{index})
45+
EtcdPVCPrefix string = "data-etcd-"
46+
47+
// TODO(CNTRLPLANE-2685): Remove these local constants once openshift/hypershift#8139 is merged
48+
// and the vendor is updated. These must match the values used by the HCPEtcdBackup controller.
49+
BackupInProgressReason string = "BackupInProgress"
50+
BackupRejectedReason string = "BackupRejected"
51+
EtcdBackupSucceeded string = "EtcdBackupSucceeded"
2852
)
2953

3054
var (

pkg/common/utils.go

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,27 @@ func GetHCPNamespace(name, namespace string) string {
163163
return fmt.Sprintf("%s-%s", namespace, name)
164164
}
165165

166+
// GetHostedCluster finds the HostedCluster that owns the HCP by deriving
167+
// its namespace and name from the HCP namespace convention: {hc-namespace}-{hc-name}.
168+
func GetHostedCluster(ctx context.Context, c crclient.Client, includedNamespaces []string, hcpNamespace string) (*hyperv1.HostedCluster, error) {
169+
for _, ns := range includedNamespaces {
170+
if ns == hcpNamespace {
171+
continue
172+
}
173+
hcList := &hyperv1.HostedClusterList{}
174+
if err := c.List(ctx, hcList, crclient.InNamespace(ns)); err != nil {
175+
continue
176+
}
177+
for i := range hcList.Items {
178+
hc := &hcList.Items[i]
179+
if GetHCPNamespace(hc.Name, hc.Namespace) == hcpNamespace {
180+
return hc, nil
181+
}
182+
}
183+
}
184+
return nil, nil
185+
}
186+
166187
// ShouldEndPluginExecution checks if the plugin should end execution by verifying if the required
167188
// Hypershift resources (HostedControlPlane and HostedCluster) exist in the cluster.
168189
// Returns true if the plugin should end execution (i.e., if this is not a Hypershift cluster).

pkg/common/utils_test.go

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -573,6 +573,34 @@ func TestShouldEndPluginExecution(t *testing.T) {
573573
}
574574
}
575575

576+
func TestGetHostedCluster(t *testing.T) {
577+
scheme := runtime.NewScheme()
578+
_ = hyperv1.AddToScheme(scheme)
579+
580+
t.Run("When GetHostedCluster runs with a HostedCluster matching HCP namespace, It Should return that cluster", func(t *testing.T) {
581+
g := NewWithT(t)
582+
hc := &hyperv1.HostedCluster{
583+
ObjectMeta: metav1.ObjectMeta{Name: "my-cluster", Namespace: "clusters"},
584+
}
585+
c := fake.NewClientBuilder().WithScheme(scheme).WithObjects(hc).Build()
586+
587+
result, err := GetHostedCluster(context.TODO(), c, []string{"clusters", "clusters-my-cluster"}, "clusters-my-cluster")
588+
g.Expect(err).NotTo(HaveOccurred())
589+
g.Expect(result).NotTo(BeNil())
590+
g.Expect(result.Name).To(Equal("my-cluster"))
591+
g.Expect(result.Namespace).To(Equal("clusters"))
592+
})
593+
594+
t.Run("When GetHostedCluster runs with no HostedClusters in client, It Should return nil", func(t *testing.T) {
595+
g := NewWithT(t)
596+
c := fake.NewClientBuilder().WithScheme(scheme).Build()
597+
598+
result, err := GetHostedCluster(context.TODO(), c, []string{"clusters", "clusters-my-cluster"}, "clusters-my-cluster")
599+
g.Expect(err).NotTo(HaveOccurred())
600+
g.Expect(result).To(BeNil())
601+
})
602+
}
603+
576604
func TestCRDExists(t *testing.T) {
577605
scheme := runtime.NewScheme()
578606
_ = hyperv1.AddToScheme(scheme)

pkg/core/backup.go

Lines changed: 128 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88
common "github.com/openshift/hypershift-oadp-plugin/pkg/common"
99
plugtypes "github.com/openshift/hypershift-oadp-plugin/pkg/core/types"
1010
validation "github.com/openshift/hypershift-oadp-plugin/pkg/core/validation"
11+
"github.com/openshift/hypershift-oadp-plugin/pkg/etcdbackup"
1112
"github.com/openshift/hypershift-oadp-plugin/pkg/platform/agent"
1213
hyperv1 "github.com/openshift/hypershift/api/hypershift/v1beta1"
1314
"github.com/sirupsen/logrus"
@@ -31,6 +32,11 @@ type BackupPlugin struct {
3132
validator validation.BackupValidator
3233
hcp *hyperv1.HostedControlPlane
3334
*plugtypes.BackupOptions
35+
36+
// Etcd backup orchestration
37+
etcdOrchestrator *etcdbackup.Orchestrator
38+
hoNamespace string
39+
etcdBackupMethod string
3440
}
3541

3642
// NewBackupPlugin instantiates BackupPlugin.
@@ -71,12 +77,27 @@ func NewBackupPlugin(logger logrus.FieldLogger) (*BackupPlugin, error) {
7177
Client: client,
7278
}
7379

80+
hoNamespace := common.DefaultHONamespace
81+
if v, ok := pluginConfig.Data[common.ConfigKeyHONamespace]; ok && v != "" {
82+
hoNamespace = v
83+
}
84+
85+
etcdBackupMethod := common.EtcdBackupMethodVolume
86+
if v, ok := pluginConfig.Data[common.ConfigKeyEtcdBackupMethod]; ok && v != "" {
87+
etcdBackupMethod = v
88+
}
89+
if etcdBackupMethod != common.EtcdBackupMethodVolume && etcdBackupMethod != common.EtcdBackupMethodEtcdSnapshot {
90+
return nil, fmt.Errorf("invalid etcdBackupMethod %q: must be %q or %q", etcdBackupMethod, common.EtcdBackupMethodVolume, common.EtcdBackupMethodEtcdSnapshot)
91+
}
92+
7493
bp := &BackupPlugin{
75-
log: logger,
76-
client: client,
77-
config: pluginConfig.Data,
78-
ctx: ctx,
79-
validator: validator,
94+
log: logger,
95+
client: client,
96+
config: pluginConfig.Data,
97+
ctx: ctx,
98+
validator: validator,
99+
hoNamespace: hoNamespace,
100+
etcdBackupMethod: etcdBackupMethod,
80101
}
81102

82103
if bp.BackupOptions, err = bp.validator.ValidatePluginConfig(bp.config); err != nil {
@@ -119,7 +140,14 @@ func (p *BackupPlugin) Execute(item runtime.Unstructured, backup *velerov1.Backu
119140
}
120141
return nil, nil, fmt.Errorf("error getting HCP namespace: %v", err)
121142
}
143+
}
122144

145+
// Etcd backup: create HCPEtcdBackup CR as early as possible (once).
146+
// Only when etcdBackupMethod is "etcdSnapshot".
147+
if p.etcdBackupMethod == common.EtcdBackupMethodEtcdSnapshot {
148+
if err := p.createEtcdBackup(ctx, backup); err != nil {
149+
return nil, nil, fmt.Errorf("error creating HCPEtcdBackup: %v", err)
150+
}
123151
}
124152

125153
kind := item.GetObjectKind().GroupVersionKind().Kind
@@ -134,6 +162,11 @@ func (p *BackupPlugin) Execute(item runtime.Unstructured, backup *velerov1.Backu
134162
return nil, nil, fmt.Errorf("error checking platform configuration: %v", err)
135163
}
136164

165+
// Etcd backup: wait for completion
166+
if err := p.waitForEtcdBackupCompletion(ctx); err != nil {
167+
return nil, nil, err
168+
}
169+
137170
case kind == common.HostedClusterKind:
138171
metadata, err := meta.Accessor(item)
139172
if err != nil {
@@ -142,16 +175,28 @@ func (p *BackupPlugin) Execute(item runtime.Unstructured, backup *velerov1.Backu
142175
common.AddAnnotation(metadata, common.HostedClusterRestoredFromBackupAnnotation, "")
143176
p.log.Infof("Added restore annotation to HostedCluster %s", metadata.GetName())
144177

178+
// Etcd backup: wait for completion
179+
if err := p.waitForEtcdBackupCompletion(ctx); err != nil {
180+
return nil, nil, err
181+
}
182+
145183
case kind == "Pod":
146-
// In case of FSBackup, we need to add the label to the pod
147-
if backup.Spec.DefaultVolumesToFsBackup != nil && !*backup.Spec.DefaultVolumesToFsBackup {
148-
metadata, err := meta.Accessor(item)
149-
if err != nil {
150-
return nil, nil, fmt.Errorf("error getting metadata accessor: %v", err)
151-
}
184+
metadata, err := meta.Accessor(item)
185+
if err != nil {
186+
return nil, nil, fmt.Errorf("error getting metadata accessor: %v", err)
187+
}
152188

153-
if strings.Contains(metadata.GetName(), "etcd-") {
154-
common.AddLabel(metadata, common.FSBackupLabelName, "true")
189+
if strings.Contains(metadata.GetName(), "etcd-") {
190+
switch p.etcdBackupMethod {
191+
case common.EtcdBackupMethodEtcdSnapshot:
192+
// Skip etcd pods entirely, snapshot is handled by HCPEtcdBackup.
193+
// This prevents both FSBackup and CSI VolumeSnapshots of etcd volumes.
194+
p.log.Infof("Skipping etcd pod %s from backup (using etcdSnapshot method)", metadata.GetName())
195+
return nil, nil, nil
196+
case common.EtcdBackupMethodVolume:
197+
if backup.Spec.DefaultVolumesToFsBackup != nil && !*backup.Spec.DefaultVolumesToFsBackup {
198+
common.AddLabel(metadata, common.FSBackupLabelName, "true")
199+
}
155200
}
156201
}
157202

@@ -172,7 +217,77 @@ func (p *BackupPlugin) Execute(item runtime.Unstructured, backup *velerov1.Backu
172217
if _, exists := labels[common.KubevirtRHCOSLabel]; exists {
173218
return nil, nil, nil
174219
}
220+
221+
// Exclude etcd data PVCs when using etcdSnapshot method.
222+
// PVC names follow the StatefulSet pattern: data-etcd-{index}
223+
if kind == common.PersistentVolumeClaimKind &&
224+
strings.HasPrefix(metadata.GetName(), common.EtcdPVCPrefix) &&
225+
p.etcdBackupMethod == common.EtcdBackupMethodEtcdSnapshot {
226+
p.log.Infof("Excluding etcd PVC %s from backup (using etcdSnapshot method)", metadata.GetName())
227+
return nil, nil, nil
228+
}
175229
}
176230

177231
return item, nil, nil
178232
}
233+
234+
// createEtcdBackup creates an HCPEtcdBackup CR in the HCP namespace.
235+
// It is idempotent: if the orchestrator already created a backup, it returns immediately.
236+
// Requires the HCPEtcdBackup CRD to exist in the cluster (safenet check).
237+
func (p *BackupPlugin) createEtcdBackup(ctx context.Context, backup *velerov1.Backup) error {
238+
// Already created by a previous Execute() call
239+
if p.etcdOrchestrator != nil && p.etcdOrchestrator.IsCreated() {
240+
return nil
241+
}
242+
243+
crdExists, err := common.CRDExists(ctx, "hcpetcdbackups.hypershift.openshift.io", p.client)
244+
if err != nil {
245+
return fmt.Errorf("failed to check for HCPEtcdBackup CRD: %w", err)
246+
}
247+
if !crdExists {
248+
return fmt.Errorf("etcdBackupMethod is %q but HCPEtcdBackup CRD not found in the cluster", common.EtcdBackupMethodEtcdSnapshot)
249+
}
250+
251+
oadpNS, err := common.GetCurrentNamespace()
252+
if err != nil {
253+
return fmt.Errorf("failed to get OADP namespace: %w", err)
254+
}
255+
256+
p.etcdOrchestrator = etcdbackup.NewOrchestrator(p.log, p.client, p.hoNamespace, oadpNS)
257+
258+
// Fetch the HostedCluster for encryption config
259+
hc, err := common.GetHostedCluster(ctx, p.client, backup.Spec.IncludedNamespaces, p.hcp.Namespace)
260+
if err != nil {
261+
p.log.Warnf("Could not find HostedCluster for encryption config: %v", err)
262+
}
263+
264+
if err := p.etcdOrchestrator.CreateEtcdBackup(ctx, backup, p.hcp.Namespace, hc); err != nil {
265+
return err
266+
}
267+
268+
if err := p.etcdOrchestrator.VerifyInProgress(ctx); err != nil {
269+
return err
270+
}
271+
272+
return nil
273+
}
274+
275+
// waitForEtcdBackupCompletion waits for the HCPEtcdBackup to finish and cleans up
276+
// the copied credential Secret. It is a no-op if no etcd backup was created.
277+
func (p *BackupPlugin) waitForEtcdBackupCompletion(ctx context.Context) error {
278+
if p.etcdOrchestrator == nil || !p.etcdOrchestrator.IsCreated() {
279+
return nil
280+
}
281+
282+
snapshotURL, err := p.etcdOrchestrator.WaitForCompletion(ctx)
283+
if err != nil {
284+
return fmt.Errorf("HCPEtcdBackup failed: %v", err)
285+
}
286+
p.log.Infof("HCPEtcdBackup completed, snapshotURL: %s", snapshotURL)
287+
288+
if cleanupErr := p.etcdOrchestrator.CleanupCredentialSecret(ctx); cleanupErr != nil {
289+
p.log.Warnf("Failed to cleanup etcd backup credential Secret: %v", cleanupErr)
290+
}
291+
292+
return nil
293+
}

pkg/core/types/types.go

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package types
33
var (
44
BackupCommonResources = []string{
55
"hostedclusters", "hostedcluster", "hostedcontrolplanes", "hostedcontrolplane", "nodepools", "nodepool",
6+
"hcpetcdbackups", "hcpetcdbackup",
67
"secrets", "secret", "configmaps", "configmap", "persistentvolumes", "persistentvolume", "persistentvolumeclaims", "persistentvolumeclaim", "pods", "pod", "statefulsets", "statefulset", "deployments", "deployment",
78
"clusters", "cluster", "machines", "machine", "machinedeployments", "machinedeployment", "machinesets", "machineset",
89
"serviceaccounts", "serviceaccount", "roles", "role", "rolebindings", "rolebinding",
@@ -20,17 +21,9 @@ var (
2021
type BackupOptions struct {
2122
// Migration is a flag to indicate if the backup is for migration purposes.
2223
Migration bool
23-
// Readopt Nodes is a flag to indicate if the nodes should be reprovisioned or not during restore.
24-
ReadoptNodes bool
25-
// ManagedServices is a flag to indicate if the backup is done for ManagedServices like ROSA, ARO, etc.
26-
ManagedServices bool
2724
}
2825

2926
type RestoreOptions struct {
3027
// Migration is a flag to indicate if the backup is for migration purposes.
3128
Migration bool
32-
// Readopt Nodes is a flag to indicate if the nodes should be reprovisioned or not during restore.
33-
ReadoptNodes bool
34-
// ManagedServices is a flag to indicate if the backup is done for ManagedServices like ROSA, ARO, etc.
35-
ManagedServices bool
3629
}

pkg/core/validation/backup.go

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -35,12 +35,8 @@ func (p *BackupPluginValidator) ValidatePluginConfig(config map[string]string) (
3535
case "migration":
3636
p.Log.Debugf("reading/parsing migration %s", value)
3737
bo.Migration = value == "true"
38-
case "readoptNodes":
39-
p.Log.Debugf("reading/parsing readoptNodes %s", value)
40-
bo.ReadoptNodes = value == "true"
41-
case "managedServices":
42-
p.Log.Debugf("reading/parsing managedServices %s", value)
43-
bo.ManagedServices = value == "true"
38+
case "etcdBackupMethod", "hoNamespace":
39+
p.Log.Debugf("configuration key %s=%s handled by plugin init", key, value)
4440
default:
4541
p.Log.Warnf("unknown configuration key: %s with value %s", key, value)
4642
}

pkg/core/validation/backup_test.go

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,11 +21,9 @@ func TestValidatePluginConfig(t *testing.T) {
2121
expectError: false,
2222
},
2323
{
24-
name: "valid config with all options",
24+
name: "valid config with migration",
2525
config: map[string]string{
26-
"migration": "true",
27-
"readoptNodes": "true",
28-
"managedServices": "true",
26+
"migration": "true",
2927
},
3028
expectError: false,
3129
},

pkg/core/validation/restore.go

Lines changed: 2 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@ import (
44
"fmt"
55

66
plugtypes "github.com/openshift/hypershift-oadp-plugin/pkg/core/types"
7-
aws "github.com/openshift/hypershift-oadp-plugin/pkg/platform/aws"
87
hyperv1 "github.com/openshift/hypershift/api/hypershift/v1beta1"
98
"github.com/sirupsen/logrus"
109
crclient "sigs.k8s.io/controller-runtime/pkg/client"
@@ -36,12 +35,8 @@ func (p *RestorePluginValidator) ValidatePluginConfig(config map[string]string)
3635
case "migration":
3736
p.Log.Debugf("reading/parsing migration %s", value)
3837
bo.Migration = value == "true"
39-
case "readoptNodes":
40-
p.Log.Debugf("reading/parsing readoptNodes %s", value)
41-
bo.ReadoptNodes = value == "true"
42-
case "managedServices":
43-
p.Log.Debugf("reading/parsing managedServices %s", value)
44-
bo.ManagedServices = value == "true"
38+
case "etcdBackupMethod", "hoNamespace":
39+
p.Log.Debugf("configuration key %s=%s handled by plugin init", key, value)
4540
default:
4641
p.Log.Warnf("unknown configuration key: %s with value %s", key, value)
4742
}
@@ -77,13 +72,6 @@ func (p *RestorePluginValidator) validateAWSPlatform(hcp *hyperv1.HostedControlP
7772
// Validate if the AWS platform is configured properly
7873
// Validate ROSA
7974
p.Log.Infof("%s AWS platform configuration is valid for HCP: %s", p.LogHeader, hcp.Name)
80-
81-
if config["managedServices"] == "true" || config["awsRegenPrivateLink"] == "true" {
82-
p.Log.Infof("%s AWS platform restore tasks for HCP: %s", p.LogHeader, hcp.Name)
83-
if err := aws.RestoreTasks(hcp, p.Client); err != nil {
84-
return fmt.Errorf("error executing ROSA platform restore tasks: %s", err.Error())
85-
}
86-
}
8775
return nil
8876
}
8977

0 commit comments

Comments
 (0)