@@ -693,24 +693,244 @@ public boolean deleteBackup(Backup backup, boolean forced) {
693693 throw new CloudRuntimeException (String .format ("Unable to find a running KVM host in zone %d to delete backup %s" , backup .getZoneId (), backup .getUuid ()));
694694 }
695695
696- DeleteBackupCommand command = new DeleteBackupCommand (backup .getExternalId (), backupRepository .getType (),
697- backupRepository .getAddress (), backupRepository .getMountOptions ());
696+ // Repair the chain (if any) before removing the backup file. For chained backups,
697+ // children that point at this backup must be re-pointed at this backup's parent
698+ // (with their blocks merged via qemu-img rebase). For a full at the head of a chain
699+ // with surviving children, refuse unless forced — `forced=true` then deletes the
700+ // full plus every descendant.
701+ ChainRepairPlan plan = computeChainRepair (backup , forced );
702+ if (!plan .proceed ) {
703+ throw new CloudRuntimeException (plan .reason );
704+ }
698705
699- BackupAnswer answer ;
700- try {
701- answer = (BackupAnswer ) agentManager .send (host .getId (), command );
702- } catch (AgentUnavailableException e ) {
703- throw new CloudRuntimeException ("Unable to contact backend control plane to initiate backup" );
704- } catch (OperationTimedoutException e ) {
705- throw new CloudRuntimeException ("Operation to delete backup timed out, please try again" );
706+ // Issue rebase commands for each child that needs re-pointing (ordered so each rebase
707+ // operates on a chain that still resolves: children first if there are nested ones).
708+ for (RebaseStep step : plan .rebaseSteps ) {
709+ RebaseBackupCommand rebase = new RebaseBackupCommand (step .targetMountRelativePath ,
710+ step .newBackingMountRelativePath , backupRepository .getType (),
711+ backupRepository .getAddress (), backupRepository .getMountOptions ());
712+ BackupAnswer rebaseAnswer ;
713+ try {
714+ rebaseAnswer = (BackupAnswer ) agentManager .send (host .getId (), rebase );
715+ } catch (AgentUnavailableException e ) {
716+ throw new CloudRuntimeException ("Unable to contact backend control plane to repair backup chain" );
717+ } catch (OperationTimedoutException e ) {
718+ throw new CloudRuntimeException ("Backup chain repair (rebase) timed out, please try again" );
719+ }
720+ if (rebaseAnswer == null || !rebaseAnswer .getResult ()) {
721+ throw new CloudRuntimeException (String .format (
722+ "Backup chain repair failed: rebase of %s onto %s returned %s" ,
723+ step .targetMountRelativePath , step .newBackingMountRelativePath ,
724+ rebaseAnswer == null ? "no answer" : rebaseAnswer .getDetails ()));
725+ }
726+ // Update the rebased child's parent reference + position in backup_details.
727+ BackupDetailVO parentDetail = backupDetailsDao .findDetail (step .childBackupId , NASBackupChainKeys .PARENT_BACKUP_ID );
728+ if (parentDetail != null ) {
729+ parentDetail .setValue (step .newParentUuid == null ? "" : step .newParentUuid );
730+ backupDetailsDao .update (parentDetail .getId (), parentDetail );
731+ } else if (step .newParentUuid != null ) {
732+ backupDetailsDao .persist (new BackupDetailVO (step .childBackupId ,
733+ NASBackupChainKeys .PARENT_BACKUP_ID , step .newParentUuid , true ));
734+ }
735+ BackupDetailVO posDetail = backupDetailsDao .findDetail (step .childBackupId , NASBackupChainKeys .CHAIN_POSITION );
736+ if (posDetail != null ) {
737+ posDetail .setValue (String .valueOf (step .newChainPosition ));
738+ backupDetailsDao .update (posDetail .getId (), posDetail );
739+ }
706740 }
707741
708- if (answer != null && answer .getResult ()) {
709- return backupDao .remove (backup .getId ());
742+ // Now delete this backup's files. For a forced delete of a full with descendants we
743+ // also delete all descendants' files (newest first so each rm targets a leaf).
744+ for (Backup victim : plan .toDelete ) {
745+ DeleteBackupCommand command = new DeleteBackupCommand (victim .getExternalId (), backupRepository .getType (),
746+ backupRepository .getAddress (), backupRepository .getMountOptions ());
747+ BackupAnswer answer ;
748+ try {
749+ answer = (BackupAnswer ) agentManager .send (host .getId (), command );
750+ } catch (AgentUnavailableException e ) {
751+ throw new CloudRuntimeException ("Unable to contact backend control plane to initiate backup" );
752+ } catch (OperationTimedoutException e ) {
753+ throw new CloudRuntimeException ("Operation to delete backup timed out, please try again" );
754+ }
755+ if (answer == null || !answer .getResult ()) {
756+ logger .warn ("Failed to delete backup file for {} ({}); leaving DB row intact" , victim .getUuid (), victim .getExternalId ());
757+ return false ;
758+ }
759+ backupDao .remove (victim .getId ());
710760 }
711761
712- logger .debug ("There was an error removing the backup with id {}" , backup .getId ());
713- return false ;
762+ // Shift chain_position down by 1 for any survivors deeper in the chain than the
763+ // backup we just removed (their direct parent reference is unchanged, but their
764+ // numeric position needs to stay consistent so future full-every cadence math works).
765+ if (plan .shiftPositionsBelow != null ) {
766+ for (Backup b : backupDao .listByVmId (null , backup .getVmId ())) {
767+ if (!plan .shiftPositionsBelow .chainId .equals (readDetail (b , NASBackupChainKeys .CHAIN_ID ))) {
768+ continue ;
769+ }
770+ int pos = chainPosition (b );
771+ if (pos > plan .shiftPositionsBelow .afterPosition && pos != Integer .MAX_VALUE ) {
772+ BackupDetailVO posDetail = backupDetailsDao .findDetail (b .getId (), NASBackupChainKeys .CHAIN_POSITION );
773+ if (posDetail != null ) {
774+ posDetail .setValue (String .valueOf (pos - 1 ));
775+ backupDetailsDao .update (posDetail .getId (), posDetail );
776+ }
777+ }
778+ }
779+ }
780+
781+ return true ;
782+ }
783+
784+ private static final class PositionShift {
785+ final String chainId ;
786+ final int afterPosition ; // shift positions strictly greater than this by -1
787+ PositionShift (String chainId , int afterPosition ) {
788+ this .chainId = chainId ;
789+ this .afterPosition = afterPosition ;
790+ }
791+ }
792+
793+ /**
794+ * Result of {@link #computeChainRepair}: whether to proceed, what to rebase, what to delete.
795+ */
796+ private static final class ChainRepairPlan {
797+ final boolean proceed ;
798+ final String reason ;
799+ final List <RebaseStep > rebaseSteps ;
800+ final List <Backup > toDelete ;
801+ final PositionShift shiftPositionsBelow ;
802+
803+ private ChainRepairPlan (boolean proceed , String reason , List <RebaseStep > rebaseSteps , List <Backup > toDelete ,
804+ PositionShift shiftPositionsBelow ) {
805+ this .proceed = proceed ;
806+ this .reason = reason ;
807+ this .rebaseSteps = rebaseSteps ;
808+ this .toDelete = toDelete ;
809+ this .shiftPositionsBelow = shiftPositionsBelow ;
810+ }
811+
812+ static ChainRepairPlan refuse (String reason ) {
813+ return new ChainRepairPlan (false , reason , Collections .emptyList (), Collections .emptyList (), null );
814+ }
815+
816+ static ChainRepairPlan proceed (List <RebaseStep > rebaseSteps , List <Backup > toDelete ) {
817+ return new ChainRepairPlan (true , null , rebaseSteps , toDelete , null );
818+ }
819+
820+ static ChainRepairPlan proceed (List <RebaseStep > rebaseSteps , List <Backup > toDelete , PositionShift shift ) {
821+ return new ChainRepairPlan (true , null , rebaseSteps , toDelete , shift );
822+ }
823+ }
824+
825+ private static final class RebaseStep {
826+ final long childBackupId ;
827+ final String targetMountRelativePath ;
828+ final String newBackingMountRelativePath ;
829+ final String newParentUuid ; // null when re-pointed onto an existing full's UUID is desired but unavailable
830+ final int newChainPosition ;
831+
832+ RebaseStep (long childBackupId , String targetMountRelativePath , String newBackingMountRelativePath ,
833+ String newParentUuid , int newChainPosition ) {
834+ this .childBackupId = childBackupId ;
835+ this .targetMountRelativePath = targetMountRelativePath ;
836+ this .newBackingMountRelativePath = newBackingMountRelativePath ;
837+ this .newParentUuid = newParentUuid ;
838+ this .newChainPosition = newChainPosition ;
839+ }
840+ }
841+
842+ /**
843+ * Compute the chain-repair plan for deleting {@code backup}. Conservative semantics:
844+ * - Backups outside any tracked chain (no NAS chain metadata) are deleted as-is.
845+ * - A standalone backup with no children is deleted as-is.
846+ * - A middle incremental: rebase its immediate child onto its own parent, then delete it.
847+ * Descendants of that child are unaffected (their backing chain still resolves).
848+ * - A full with surviving descendants: refuse unless {@code forced=true}; then delete
849+ * full + every descendant (newest first).
850+ */
851+ private ChainRepairPlan computeChainRepair (Backup backup , boolean forced ) {
852+ String chainId = readDetail (backup , NASBackupChainKeys .CHAIN_ID );
853+ if (chainId == null ) {
854+ // Pre-incremental backups (or callers that never wrote chain metadata) — single delete.
855+ return ChainRepairPlan .proceed (Collections .emptyList (), Collections .singletonList (backup ));
856+ }
857+
858+ // Gather every backup in the same chain for this VM.
859+ List <Backup > chain = new ArrayList <>();
860+ for (Backup b : backupDao .listByVmId (null , backup .getVmId ())) {
861+ if (chainId .equals (readDetail (b , NASBackupChainKeys .CHAIN_ID ))) {
862+ chain .add (b );
863+ }
864+ }
865+ chain .sort (Comparator .comparingInt (b -> chainPosition (b )));
866+
867+ int targetPos = chainPosition (backup );
868+ boolean isFull = targetPos == 0 ;
869+ List <Backup > descendants = chain .stream ()
870+ .filter (b -> chainPosition (b ) > targetPos )
871+ .collect (Collectors .toList ());
872+
873+ if (isFull ) {
874+ if (descendants .isEmpty ()) {
875+ return ChainRepairPlan .proceed (Collections .emptyList (), Collections .singletonList (backup ));
876+ }
877+ if (!forced ) {
878+ return ChainRepairPlan .refuse (String .format (
879+ "Backup %s is the full anchor of a chain with %d incremental(s). Delete the incrementals first, " +
880+ "or pass forced=true to remove the entire chain." ,
881+ backup .getUuid (), descendants .size ()));
882+ }
883+ // Forced delete: remove descendants newest first, then the full.
884+ List <Backup > victims = new ArrayList <>(descendants );
885+ victims .sort (Comparator .comparingInt ((Backup b ) -> chainPosition (b )).reversed ());
886+ victims .add (backup );
887+ return ChainRepairPlan .proceed (Collections .emptyList (), victims );
888+ }
889+
890+ // Middle (or tail) incremental.
891+ if (descendants .isEmpty ()) {
892+ // Tail: nothing to rebase, just delete.
893+ return ChainRepairPlan .proceed (Collections .emptyList (), Collections .singletonList (backup ));
894+ }
895+
896+ // Middle: only the immediate child needs to absorb our blocks and rebase onto our parent.
897+ Backup immediateChild = descendants .stream ()
898+ .min (Comparator .comparingInt (b -> chainPosition (b )))
899+ .orElseThrow (() -> new CloudRuntimeException ("Internal error: no immediate child found for chain repair" ));
900+ Backup ourParent = chain .stream ()
901+ .filter (b -> chainPosition (b ) == targetPos - 1 )
902+ .findFirst ()
903+ .orElseThrow (() -> new CloudRuntimeException (String .format (
904+ "Cannot delete %s: its parent (chain_position=%d) is missing from the chain" ,
905+ backup .getUuid (), targetPos - 1 )));
906+
907+ VolumeVO rootVolume = volumeDao .getInstanceRootVolume (backup .getVmId ());
908+ String volUuid = rootVolume == null ? "root" : rootVolume .getUuid ();
909+ String childPath = immediateChild .getExternalId () + "/root." + volUuid + ".qcow2" ;
910+ String parentPath = ourParent .getExternalId () + "/root." + volUuid + ".qcow2" ;
911+
912+ RebaseStep step = new RebaseStep (immediateChild .getId (), childPath , parentPath ,
913+ ourParent .getUuid (), chainPosition (immediateChild ) - 1 );
914+
915+ // After we delete the middle backup, every descendant's numeric chain_position
916+ // becomes stale (off by one). Their backing-file pointers don't need re-writing
917+ // (only the immediate child changed parents) but their position metadata does.
918+ return ChainRepairPlan .proceed (
919+ Collections .singletonList (step ),
920+ Collections .singletonList (backup ),
921+ new PositionShift (chainId , targetPos ));
922+ }
923+
924+ private int chainPosition (Backup b ) {
925+ String s = readDetail (b , NASBackupChainKeys .CHAIN_POSITION );
926+ if (s == null ) {
927+ return Integer .MAX_VALUE ; // no metadata => sort to end
928+ }
929+ try {
930+ return Integer .parseInt (s );
931+ } catch (NumberFormatException e ) {
932+ return Integer .MAX_VALUE ;
933+ }
714934 }
715935
716936 public void syncBackupMetrics (Long zoneId ) {
0 commit comments