Skip to content

Commit b8d069e

Browse files
committed
feat(backup): cascade-delete + chain repair for NAS incrementals
Adds the delete-with-chain-repair semantics agreed in the RFC review: scripts/vm/hypervisor/kvm/nasbackup.sh - New '-o rebase' operation: rebases an existing on-NAS qcow2 onto a new backing parent. Uses a SAFE rebase (no -u) so the target absorbs blocks of the about-to-be-deleted parent before the backing pointer is moved up to the grandparent. Writes the new backing reference relative to the target's directory so it survives mount-point changes. - New CLI flags --rebase-target, --rebase-new-backing (both passed mount-relative). RebaseBackupCommand + LibvirtRebaseBackupCommandWrapper - New agent command that wraps the script's rebase operation. The provider sends one of these per child that needs re-pointing. NASBackupProvider.deleteBackup - Now plans the chain repair before touching files via computeChainRepair(): * No chain metadata -> single-file delete (legacy behaviour) * Tail incremental -> single delete, no rebase * Middle incremental -> rebase immediate child onto our parent, then delete; shift chain_position of all later descendants by -1 * Full with descendants -> refuse unless forced=true; with forced=true delete full + every descendant newest-first - Updates parent_backup_id, chain_position metadata in backup_details after each rebase so the model in the DB matches the on-disk chain. This implements the cascade-delete behaviour requested in @abh1sar's review point #7. Refs: #12899
1 parent 39303fb commit b8d069e

4 files changed

Lines changed: 425 additions & 13 deletions

File tree

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
//
2+
// Licensed to the Apache Software Foundation (ASF) under one
3+
// or more contributor license agreements. See the NOTICE file
4+
// distributed with this work for additional information
5+
// regarding copyright ownership. The ASF licenses this file
6+
// to you under the Apache License, Version 2.0 (the
7+
// "License"); you may not use this file except in compliance
8+
// with the License. You may obtain a copy of the License at
9+
//
10+
// http://www.apache.org/licenses/LICENSE-2.0
11+
//
12+
// Unless required by applicable law or agreed to in writing,
13+
// software distributed under the License is distributed on an
14+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
// KIND, either express or implied. See the License for the
16+
// specific language governing permissions and limitations
17+
// under the License.
18+
//
19+
20+
package org.apache.cloudstack.backup;
21+
22+
import com.cloud.agent.api.Command;
23+
import com.cloud.agent.api.LogLevel;
24+
25+
/**
26+
* Tells the KVM agent to rebase a NAS backup qcow2 onto a new backing parent. Used by the
27+
* NAS backup provider during chain repair when a middle incremental is being deleted: the
28+
* immediate child must absorb the soon-to-be-deleted parent's blocks and then re-link to
29+
* the grandparent. Both target and new-backing paths are NAS-mount-relative.
30+
*/
31+
public class RebaseBackupCommand extends Command {
32+
private String targetPath; // mount-relative path of the qcow2 to repoint
33+
private String newBackingPath; // mount-relative path of the new backing parent
34+
private String backupRepoType;
35+
private String backupRepoAddress;
36+
@LogLevel(LogLevel.Log4jLevel.Off)
37+
private String mountOptions;
38+
39+
public RebaseBackupCommand(String targetPath, String newBackingPath,
40+
String backupRepoType, String backupRepoAddress, String mountOptions) {
41+
super();
42+
this.targetPath = targetPath;
43+
this.newBackingPath = newBackingPath;
44+
this.backupRepoType = backupRepoType;
45+
this.backupRepoAddress = backupRepoAddress;
46+
this.mountOptions = mountOptions;
47+
}
48+
49+
public String getTargetPath() {
50+
return targetPath;
51+
}
52+
53+
public String getNewBackingPath() {
54+
return newBackingPath;
55+
}
56+
57+
public String getBackupRepoType() {
58+
return backupRepoType;
59+
}
60+
61+
public String getBackupRepoAddress() {
62+
return backupRepoAddress;
63+
}
64+
65+
public String getMountOptions() {
66+
return mountOptions == null ? "" : mountOptions;
67+
}
68+
69+
@Override
70+
public boolean executeInSequence() {
71+
return true;
72+
}
73+
}

plugins/backup/nas/src/main/java/org/apache/cloudstack/backup/NASBackupProvider.java

Lines changed: 233 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -693,24 +693,244 @@ public boolean deleteBackup(Backup backup, boolean forced) {
693693
throw new CloudRuntimeException(String.format("Unable to find a running KVM host in zone %d to delete backup %s", backup.getZoneId(), backup.getUuid()));
694694
}
695695

696-
DeleteBackupCommand command = new DeleteBackupCommand(backup.getExternalId(), backupRepository.getType(),
697-
backupRepository.getAddress(), backupRepository.getMountOptions());
696+
// Repair the chain (if any) before removing the backup file. For chained backups,
697+
// children that point at this backup must be re-pointed at this backup's parent
698+
// (with their blocks merged via qemu-img rebase). For a full at the head of a chain
699+
// with surviving children, refuse unless forced — `forced=true` then deletes the
700+
// full plus every descendant.
701+
ChainRepairPlan plan = computeChainRepair(backup, forced);
702+
if (!plan.proceed) {
703+
throw new CloudRuntimeException(plan.reason);
704+
}
698705

699-
BackupAnswer answer;
700-
try {
701-
answer = (BackupAnswer) agentManager.send(host.getId(), command);
702-
} catch (AgentUnavailableException e) {
703-
throw new CloudRuntimeException("Unable to contact backend control plane to initiate backup");
704-
} catch (OperationTimedoutException e) {
705-
throw new CloudRuntimeException("Operation to delete backup timed out, please try again");
706+
// Issue rebase commands for each child that needs re-pointing (ordered so each rebase
707+
// operates on a chain that still resolves: children first if there are nested ones).
708+
for (RebaseStep step : plan.rebaseSteps) {
709+
RebaseBackupCommand rebase = new RebaseBackupCommand(step.targetMountRelativePath,
710+
step.newBackingMountRelativePath, backupRepository.getType(),
711+
backupRepository.getAddress(), backupRepository.getMountOptions());
712+
BackupAnswer rebaseAnswer;
713+
try {
714+
rebaseAnswer = (BackupAnswer) agentManager.send(host.getId(), rebase);
715+
} catch (AgentUnavailableException e) {
716+
throw new CloudRuntimeException("Unable to contact backend control plane to repair backup chain");
717+
} catch (OperationTimedoutException e) {
718+
throw new CloudRuntimeException("Backup chain repair (rebase) timed out, please try again");
719+
}
720+
if (rebaseAnswer == null || !rebaseAnswer.getResult()) {
721+
throw new CloudRuntimeException(String.format(
722+
"Backup chain repair failed: rebase of %s onto %s returned %s",
723+
step.targetMountRelativePath, step.newBackingMountRelativePath,
724+
rebaseAnswer == null ? "no answer" : rebaseAnswer.getDetails()));
725+
}
726+
// Update the rebased child's parent reference + position in backup_details.
727+
BackupDetailVO parentDetail = backupDetailsDao.findDetail(step.childBackupId, NASBackupChainKeys.PARENT_BACKUP_ID);
728+
if (parentDetail != null) {
729+
parentDetail.setValue(step.newParentUuid == null ? "" : step.newParentUuid);
730+
backupDetailsDao.update(parentDetail.getId(), parentDetail);
731+
} else if (step.newParentUuid != null) {
732+
backupDetailsDao.persist(new BackupDetailVO(step.childBackupId,
733+
NASBackupChainKeys.PARENT_BACKUP_ID, step.newParentUuid, true));
734+
}
735+
BackupDetailVO posDetail = backupDetailsDao.findDetail(step.childBackupId, NASBackupChainKeys.CHAIN_POSITION);
736+
if (posDetail != null) {
737+
posDetail.setValue(String.valueOf(step.newChainPosition));
738+
backupDetailsDao.update(posDetail.getId(), posDetail);
739+
}
706740
}
707741

708-
if (answer != null && answer.getResult()) {
709-
return backupDao.remove(backup.getId());
742+
// Now delete this backup's files. For a forced delete of a full with descendants we
743+
// also delete all descendants' files (newest first so each rm targets a leaf).
744+
for (Backup victim : plan.toDelete) {
745+
DeleteBackupCommand command = new DeleteBackupCommand(victim.getExternalId(), backupRepository.getType(),
746+
backupRepository.getAddress(), backupRepository.getMountOptions());
747+
BackupAnswer answer;
748+
try {
749+
answer = (BackupAnswer) agentManager.send(host.getId(), command);
750+
} catch (AgentUnavailableException e) {
751+
throw new CloudRuntimeException("Unable to contact backend control plane to initiate backup");
752+
} catch (OperationTimedoutException e) {
753+
throw new CloudRuntimeException("Operation to delete backup timed out, please try again");
754+
}
755+
if (answer == null || !answer.getResult()) {
756+
logger.warn("Failed to delete backup file for {} ({}); leaving DB row intact", victim.getUuid(), victim.getExternalId());
757+
return false;
758+
}
759+
backupDao.remove(victim.getId());
710760
}
711761

712-
logger.debug("There was an error removing the backup with id {}", backup.getId());
713-
return false;
762+
// Shift chain_position down by 1 for any survivors deeper in the chain than the
763+
// backup we just removed (their direct parent reference is unchanged, but their
764+
// numeric position needs to stay consistent so future full-every cadence math works).
765+
if (plan.shiftPositionsBelow != null) {
766+
for (Backup b : backupDao.listByVmId(null, backup.getVmId())) {
767+
if (!plan.shiftPositionsBelow.chainId.equals(readDetail(b, NASBackupChainKeys.CHAIN_ID))) {
768+
continue;
769+
}
770+
int pos = chainPosition(b);
771+
if (pos > plan.shiftPositionsBelow.afterPosition && pos != Integer.MAX_VALUE) {
772+
BackupDetailVO posDetail = backupDetailsDao.findDetail(b.getId(), NASBackupChainKeys.CHAIN_POSITION);
773+
if (posDetail != null) {
774+
posDetail.setValue(String.valueOf(pos - 1));
775+
backupDetailsDao.update(posDetail.getId(), posDetail);
776+
}
777+
}
778+
}
779+
}
780+
781+
return true;
782+
}
783+
784+
private static final class PositionShift {
785+
final String chainId;
786+
final int afterPosition; // shift positions strictly greater than this by -1
787+
PositionShift(String chainId, int afterPosition) {
788+
this.chainId = chainId;
789+
this.afterPosition = afterPosition;
790+
}
791+
}
792+
793+
/**
794+
* Result of {@link #computeChainRepair}: whether to proceed, what to rebase, what to delete.
795+
*/
796+
private static final class ChainRepairPlan {
797+
final boolean proceed;
798+
final String reason;
799+
final List<RebaseStep> rebaseSteps;
800+
final List<Backup> toDelete;
801+
final PositionShift shiftPositionsBelow;
802+
803+
private ChainRepairPlan(boolean proceed, String reason, List<RebaseStep> rebaseSteps, List<Backup> toDelete,
804+
PositionShift shiftPositionsBelow) {
805+
this.proceed = proceed;
806+
this.reason = reason;
807+
this.rebaseSteps = rebaseSteps;
808+
this.toDelete = toDelete;
809+
this.shiftPositionsBelow = shiftPositionsBelow;
810+
}
811+
812+
static ChainRepairPlan refuse(String reason) {
813+
return new ChainRepairPlan(false, reason, Collections.emptyList(), Collections.emptyList(), null);
814+
}
815+
816+
static ChainRepairPlan proceed(List<RebaseStep> rebaseSteps, List<Backup> toDelete) {
817+
return new ChainRepairPlan(true, null, rebaseSteps, toDelete, null);
818+
}
819+
820+
static ChainRepairPlan proceed(List<RebaseStep> rebaseSteps, List<Backup> toDelete, PositionShift shift) {
821+
return new ChainRepairPlan(true, null, rebaseSteps, toDelete, shift);
822+
}
823+
}
824+
825+
private static final class RebaseStep {
826+
final long childBackupId;
827+
final String targetMountRelativePath;
828+
final String newBackingMountRelativePath;
829+
final String newParentUuid; // null when re-pointed onto an existing full's UUID is desired but unavailable
830+
final int newChainPosition;
831+
832+
RebaseStep(long childBackupId, String targetMountRelativePath, String newBackingMountRelativePath,
833+
String newParentUuid, int newChainPosition) {
834+
this.childBackupId = childBackupId;
835+
this.targetMountRelativePath = targetMountRelativePath;
836+
this.newBackingMountRelativePath = newBackingMountRelativePath;
837+
this.newParentUuid = newParentUuid;
838+
this.newChainPosition = newChainPosition;
839+
}
840+
}
841+
842+
/**
843+
* Compute the chain-repair plan for deleting {@code backup}. Conservative semantics:
844+
* - Backups outside any tracked chain (no NAS chain metadata) are deleted as-is.
845+
* - A standalone backup with no children is deleted as-is.
846+
* - A middle incremental: rebase its immediate child onto its own parent, then delete it.
847+
* Descendants of that child are unaffected (their backing chain still resolves).
848+
* - A full with surviving descendants: refuse unless {@code forced=true}; then delete
849+
* full + every descendant (newest first).
850+
*/
851+
private ChainRepairPlan computeChainRepair(Backup backup, boolean forced) {
852+
String chainId = readDetail(backup, NASBackupChainKeys.CHAIN_ID);
853+
if (chainId == null) {
854+
// Pre-incremental backups (or callers that never wrote chain metadata) — single delete.
855+
return ChainRepairPlan.proceed(Collections.emptyList(), Collections.singletonList(backup));
856+
}
857+
858+
// Gather every backup in the same chain for this VM.
859+
List<Backup> chain = new ArrayList<>();
860+
for (Backup b : backupDao.listByVmId(null, backup.getVmId())) {
861+
if (chainId.equals(readDetail(b, NASBackupChainKeys.CHAIN_ID))) {
862+
chain.add(b);
863+
}
864+
}
865+
chain.sort(Comparator.comparingInt(b -> chainPosition(b)));
866+
867+
int targetPos = chainPosition(backup);
868+
boolean isFull = targetPos == 0;
869+
List<Backup> descendants = chain.stream()
870+
.filter(b -> chainPosition(b) > targetPos)
871+
.collect(Collectors.toList());
872+
873+
if (isFull) {
874+
if (descendants.isEmpty()) {
875+
return ChainRepairPlan.proceed(Collections.emptyList(), Collections.singletonList(backup));
876+
}
877+
if (!forced) {
878+
return ChainRepairPlan.refuse(String.format(
879+
"Backup %s is the full anchor of a chain with %d incremental(s). Delete the incrementals first, " +
880+
"or pass forced=true to remove the entire chain.",
881+
backup.getUuid(), descendants.size()));
882+
}
883+
// Forced delete: remove descendants newest first, then the full.
884+
List<Backup> victims = new ArrayList<>(descendants);
885+
victims.sort(Comparator.comparingInt((Backup b) -> chainPosition(b)).reversed());
886+
victims.add(backup);
887+
return ChainRepairPlan.proceed(Collections.emptyList(), victims);
888+
}
889+
890+
// Middle (or tail) incremental.
891+
if (descendants.isEmpty()) {
892+
// Tail: nothing to rebase, just delete.
893+
return ChainRepairPlan.proceed(Collections.emptyList(), Collections.singletonList(backup));
894+
}
895+
896+
// Middle: only the immediate child needs to absorb our blocks and rebase onto our parent.
897+
Backup immediateChild = descendants.stream()
898+
.min(Comparator.comparingInt(b -> chainPosition(b)))
899+
.orElseThrow(() -> new CloudRuntimeException("Internal error: no immediate child found for chain repair"));
900+
Backup ourParent = chain.stream()
901+
.filter(b -> chainPosition(b) == targetPos - 1)
902+
.findFirst()
903+
.orElseThrow(() -> new CloudRuntimeException(String.format(
904+
"Cannot delete %s: its parent (chain_position=%d) is missing from the chain",
905+
backup.getUuid(), targetPos - 1)));
906+
907+
VolumeVO rootVolume = volumeDao.getInstanceRootVolume(backup.getVmId());
908+
String volUuid = rootVolume == null ? "root" : rootVolume.getUuid();
909+
String childPath = immediateChild.getExternalId() + "/root." + volUuid + ".qcow2";
910+
String parentPath = ourParent.getExternalId() + "/root." + volUuid + ".qcow2";
911+
912+
RebaseStep step = new RebaseStep(immediateChild.getId(), childPath, parentPath,
913+
ourParent.getUuid(), chainPosition(immediateChild) - 1);
914+
915+
// After we delete the middle backup, every descendant's numeric chain_position
916+
// becomes stale (off by one). Their backing-file pointers don't need re-writing
917+
// (only the immediate child changed parents) but their position metadata does.
918+
return ChainRepairPlan.proceed(
919+
Collections.singletonList(step),
920+
Collections.singletonList(backup),
921+
new PositionShift(chainId, targetPos));
922+
}
923+
924+
private int chainPosition(Backup b) {
925+
String s = readDetail(b, NASBackupChainKeys.CHAIN_POSITION);
926+
if (s == null) {
927+
return Integer.MAX_VALUE; // no metadata => sort to end
928+
}
929+
try {
930+
return Integer.parseInt(s);
931+
} catch (NumberFormatException e) {
932+
return Integer.MAX_VALUE;
933+
}
714934
}
715935

716936
public void syncBackupMetrics(Long zoneId) {
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
//
2+
// Licensed to the Apache Software Foundation (ASF) under one
3+
// or more contributor license agreements. See the NOTICE file
4+
// distributed with this work for additional information
5+
// regarding copyright ownership. The ASF licenses this file
6+
// to you under the Apache License, Version 2.0 (the
7+
// "License"); you may not use this file except in compliance
8+
// with the License. You may obtain a copy of the License at
9+
//
10+
// http://www.apache.org/licenses/LICENSE-2.0
11+
//
12+
// Unless required by applicable law or agreed to in writing,
13+
// software distributed under the License is distributed on an
14+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
// KIND, either express or implied. See the License for the
16+
// specific language governing permissions and limitations
17+
// under the License.
18+
//
19+
20+
package com.cloud.hypervisor.kvm.resource.wrapper;
21+
22+
import com.cloud.agent.api.Answer;
23+
import com.cloud.hypervisor.kvm.resource.LibvirtComputingResource;
24+
import com.cloud.resource.CommandWrapper;
25+
import com.cloud.resource.ResourceWrapper;
26+
import com.cloud.utils.Pair;
27+
import com.cloud.utils.script.Script;
28+
import org.apache.cloudstack.backup.BackupAnswer;
29+
import org.apache.cloudstack.backup.RebaseBackupCommand;
30+
31+
import java.util.ArrayList;
32+
import java.util.List;
33+
34+
@ResourceWrapper(handles = RebaseBackupCommand.class)
35+
public class LibvirtRebaseBackupCommandWrapper extends CommandWrapper<RebaseBackupCommand, Answer, LibvirtComputingResource> {
36+
@Override
37+
public Answer execute(RebaseBackupCommand command, LibvirtComputingResource libvirtComputingResource) {
38+
List<String[]> commands = new ArrayList<>();
39+
commands.add(new String[]{
40+
libvirtComputingResource.getNasBackupPath(),
41+
"-o", "rebase",
42+
"-t", command.getBackupRepoType(),
43+
"-s", command.getBackupRepoAddress(),
44+
"-m", command.getMountOptions(),
45+
"--rebase-target", command.getTargetPath(),
46+
"--rebase-new-backing", command.getNewBackingPath()
47+
});
48+
49+
Pair<Integer, String> result = Script.executePipedCommands(commands, libvirtComputingResource.getCmdsTimeout());
50+
logger.debug("Backup rebase result: {} , exit code: {}", result.second(), result.first());
51+
52+
if (result.first() != 0) {
53+
logger.warn("Failed to rebase backup file {} onto {}: {}",
54+
command.getTargetPath(), command.getNewBackingPath(), result.second());
55+
return new BackupAnswer(command, false, result.second());
56+
}
57+
return new BackupAnswer(command, true, null);
58+
}
59+
}

0 commit comments

Comments
 (0)