Skip to content

Commit 058007e

Browse files
committed
fix lockhost on creation of volumes from snap and fix bitmap issue when migrating a vol with incremental snap
1 parent 237f074 commit 058007e

File tree

3 files changed

+246
-5
lines changed

3 files changed

+246
-5
lines changed

engine/storage/datamotion/src/main/java/org/apache/cloudstack/storage/motion/AncientDataMotionStrategy.java

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
import javax.inject.Inject;
2828

2929
import com.cloud.agent.api.to.DiskTO;
30+
import com.cloud.storage.ClvmLockManager;
3031
import com.cloud.storage.Storage;
3132
import org.apache.cloudstack.engine.subsystem.api.storage.ClusterScope;
3233
import org.apache.cloudstack.engine.subsystem.api.storage.CopyCommandResult;
@@ -108,6 +109,8 @@ public class AncientDataMotionStrategy implements DataMotionStrategy {
108109
StorageCacheManager cacheMgr;
109110
@Inject
110111
VolumeDataStoreDao volumeDataStoreDao;
112+
@Inject
113+
ClvmLockManager clvmLockManager;
111114

112115
@Inject
113116
StorageManager storageManager;
@@ -309,6 +312,8 @@ protected Answer copyVolumeFromSnapshot(DataObject snapObj, DataObject volObj) {
309312
ep = selector.select(srcData, volObj);
310313
}
311314

315+
updateLockHostForVolume(ep, volObj);
316+
312317
CopyCommand cmd = new CopyCommand(srcData.getTO(), addFullCloneAndDiskprovisiongStrictnessFlagOnVMwareDest(volObj.getTO()), _createVolumeFromSnapshotWait, VirtualMachineManager.ExecuteInSequence.value());
313318

314319
Answer answer = null;
@@ -331,6 +336,21 @@ protected Answer copyVolumeFromSnapshot(DataObject snapObj, DataObject volObj) {
331336
}
332337
}
333338

339+
private void updateLockHostForVolume(EndPoint ep, DataObject volObj) {
340+
if (ep != null && volObj instanceof VolumeInfo) {
341+
VolumeInfo volumeInfo = (VolumeInfo) volObj;
342+
StoragePool destPool = (StoragePool) volObj.getDataStore();
343+
if (destPool != null && ClvmLockManager.isClvmPoolType(destPool.getPoolType())) {
344+
Long hostId = ep.getId();
345+
Long existingHostId = clvmLockManager.getClvmLockHostId(volumeInfo.getId(), volumeInfo.getUuid());
346+
if (existingHostId == null) {
347+
clvmLockManager.setClvmLockHostId(volumeInfo.getId(), hostId);
348+
logger.debug("Set lock host ID {} for CLVM volume {} being created from snapshot", hostId, volumeInfo.getId());
349+
}
350+
}
351+
}
352+
}
353+
334354
protected Answer cloneVolume(DataObject template, DataObject volume) {
335355
CopyCommand cmd = new CopyCommand(template.getTO(), addFullCloneAndDiskprovisiongStrictnessFlagOnVMwareDest(volume.getTO()), 0, VirtualMachineManager.ExecuteInSequence.value());
336356
try {

engine/storage/src/main/java/org/apache/cloudstack/storage/endpoint/DefaultEndPointSelector.java

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -449,11 +449,22 @@ public EndPoint select(DataObject object, boolean encryptionSupportRequired) {
449449
DataStore store = object.getDataStore();
450450

451451
// This ensures volumes are created on the correct host with exclusive locks
452-
if (object instanceof VolumeInfo && store.getRole() == DataStoreRole.Primary) {
453-
VolumeInfo volInfo = (VolumeInfo) object;
454-
EndPoint clvmEndpoint = selectClvmEndpointIfApplicable(volInfo, "volume creation");
455-
if (clvmEndpoint != null) {
456-
return clvmEndpoint;
452+
String operation = "";
453+
if (DataStoreRole.Primary == store.getRole()) {
454+
VolumeInfo volume = null;
455+
if (object instanceof VolumeInfo) {
456+
volume = (VolumeInfo) object;
457+
operation = "volume creation";
458+
} else if (object instanceof SnapshotInfo) {
459+
volume = ((SnapshotInfo) object).getBaseVolume();
460+
operation = "snapshot creation";
461+
}
462+
463+
if (volume != null) {
464+
EndPoint clvmEndpoint = selectClvmEndpointIfApplicable(volume, operation);
465+
if (clvmEndpoint != null) {
466+
return clvmEndpoint;
467+
}
457468
}
458469
}
459470

plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/storage/KVMStorageProcessor.java

Lines changed: 210 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2119,6 +2119,13 @@ private SnapshotObjectTO takeIncrementalVolumeSnapshotOfStoppedVm(SnapshotObject
21192119
logger.debug("Taking incremental volume snapshot of volume [{}]. Snapshot will be copied to [{}].", volumeObjectTo,
21202120
ObjectUtils.defaultIfNull(secondaryPool, primaryPool));
21212121
try {
2122+
// For CLVM_NG incremental snapshots, validate bitmap before proceeding
2123+
SnapshotObjectTO bitmapValidationResult = validateClvmNgBitmapAndFallbackIfNeeded(snapshotObjectTO, primaryPool,
2124+
secondaryPool, secondaryPoolUrl, snapshotName, volumeObjectTo, conn, wait);
2125+
if (bitmapValidationResult != null) {
2126+
return bitmapValidationResult;
2127+
}
2128+
21222129
String vmName = String.format("DUMMY-VM-%s", snapshotName);
21232130

21242131
String vmXml = getVmXml(primaryPool, volumeObjectTo, vmName);
@@ -2309,6 +2316,209 @@ protected String getParentCheckpointName(String[] parents) {
23092316
return immediateParentPath.substring(immediateParentPath.lastIndexOf(File.separator) + 1);
23102317
}
23112318

2319+
/**
2320+
* Checks if a QEMU bitmap exists in the volume and is usable (not in-use or corrupted).
2321+
* This is important after lock migration where bitmaps may be left in "in-use" state.
2322+
*
2323+
* @param pool The storage pool containing the volume
2324+
* @param volume The volume to check
2325+
* @param bitmapName The name of the bitmap to check
2326+
* @return true if bitmap exists and is usable, false if missing, in-use, or corrupted
2327+
*/
2328+
protected boolean isBitmapUsable(KVMStoragePool pool, VolumeObjectTO volume, String bitmapName) {
2329+
try {
2330+
String volumePath = pool.getLocalPathFor(volume.getPath());
2331+
2332+
String command = String.format("qemu-img info --output=json -U %s", volumePath);
2333+
String jsonOutput = Script.runSimpleBashScriptWithFullResult(command, 30);
2334+
2335+
if (jsonOutput == null || jsonOutput.trim().isEmpty()) {
2336+
logger.warn("Failed to get qemu-img info for volume [{}]", volumePath);
2337+
return false;
2338+
}
2339+
2340+
logger.debug("qemu-img info output for volume [{}]: {}", volumePath, jsonOutput);
2341+
2342+
try {
2343+
ObjectMapper mapper = new ObjectMapper();
2344+
JsonNode root = mapper.readTree(jsonOutput);
2345+
2346+
JsonNode formatSpecific = root.path("format-specific");
2347+
if (formatSpecific.isMissingNode()) {
2348+
logger.debug("No format-specific data found for volume [{}], bitmap check skipped", volumePath);
2349+
return false;
2350+
}
2351+
2352+
JsonNode data = formatSpecific.path("data");
2353+
JsonNode bitmaps = data.path("bitmaps");
2354+
2355+
if (bitmaps.isMissingNode() || !bitmaps.isArray()) {
2356+
logger.debug("No bitmaps found in volume [{}]", volumePath);
2357+
return false;
2358+
}
2359+
2360+
for (JsonNode bitmap : bitmaps) {
2361+
String name = bitmap.path("name").asText();
2362+
if (bitmapName.equals(name)) {
2363+
JsonNode flags = bitmap.path("flags");
2364+
if (flags.isArray()) {
2365+
for (JsonNode flag : flags) {
2366+
String flagValue = flag.asText();
2367+
if ("in-use".equals(flagValue)) {
2368+
logger.warn("Bitmap [{}] in volume [{}] is marked as 'in-use' and cannot be used for incremental snapshot",
2369+
bitmapName, volumePath);
2370+
return false;
2371+
}
2372+
}
2373+
}
2374+
logger.debug("Bitmap [{}] found in volume [{}] and is usable", bitmapName, volumePath);
2375+
return true;
2376+
}
2377+
}
2378+
2379+
logger.warn("Bitmap [{}] not found in volume [{}]", bitmapName, volumePath);
2380+
return false;
2381+
2382+
} catch (JsonProcessingException e) {
2383+
logger.error("Failed to parse qemu-img JSON output for volume [{}]: {}", volumePath, e.getMessage(), e);
2384+
return false;
2385+
}
2386+
2387+
} catch (Exception e) {
2388+
logger.error("Error checking bitmap [{}] for volume [{}]: {}", bitmapName, volume, e.getMessage(), e);
2389+
return false;
2390+
}
2391+
}
2392+
2393+
/**
2394+
* Removes a broken or unusable bitmap from a volume.
2395+
* Called before falling back to full snapshot to keep volume metadata clean.
2396+
*
2397+
* @param pool Storage pool containing the volume
2398+
* @param volume Volume with broken bitmap
2399+
* @param bitmapName Name of bitmap to remove
2400+
*/
2401+
private void cleanupBrokenBitmap(KVMStoragePool pool, VolumeObjectTO volume, String bitmapName) {
2402+
try {
2403+
String volumePath = pool.getLocalPathFor(volume.getPath());
2404+
2405+
logger.info("Removing broken bitmap [{}] from volume [{}] before taking full snapshot",
2406+
bitmapName, volumePath);
2407+
2408+
QemuImgFile volumeFile = new QemuImgFile(volumePath, PhysicalDiskFormat.QCOW2);
2409+
QemuImg qemuImg = new QemuImg(0);
2410+
2411+
try {
2412+
qemuImg.bitmap(QemuImg.BitmapOperation.Remove, volumeFile, bitmapName);
2413+
logger.info("Successfully removed broken bitmap [{}] from volume [{}]",
2414+
bitmapName, volume.getPath());
2415+
} catch (QemuImgException e) {
2416+
logger.warn("Failed to remove broken bitmap [{}] from volume [{}]: {}. " +
2417+
"Proceeding with fallback anyway.",
2418+
bitmapName, volume.getPath(), e.getMessage());
2419+
}
2420+
2421+
} catch (Exception e) {
2422+
logger.warn("Exception while cleaning up broken bitmap [{}] for volume [{}]: {}. " +
2423+
"Proceeding with fallback anyway.",
2424+
bitmapName, volume.getPath(), e.getMessage());
2425+
}
2426+
}
2427+
2428+
/**
2429+
* Validates QEMU bitmap for CLVM_NG incremental snapshots and falls back to full snapshot if needed.
2430+
* This method checks if the bitmap from the parent checkpoint is usable. If the bitmap is corrupted,
2431+
* in-use, or missing, it cleans up the broken bitmap and falls back to taking a full snapshot with
2432+
* a new checkpoint to restart the incremental chain.
2433+
*
2434+
* @param snapshotObjectTO Snapshot being created
2435+
* @param primaryPool Primary storage pool
2436+
* @param secondaryPool Secondary storage pool for backup
2437+
* @param secondaryPoolUrl Secondary pool URL
2438+
* @param snapshotName Name of the snapshot
2439+
* @param volumeObjectTo Volume being snapshotted
2440+
* @param conn Libvirt connection
2441+
* @param wait Timeout for operations
2442+
* @return SnapshotObjectTO if fallback to full snapshot occurred, null if validation passed
2443+
* @throws LibvirtException if libvirt operations fail
2444+
*/
2445+
private SnapshotObjectTO validateClvmNgBitmapAndFallbackIfNeeded(SnapshotObjectTO snapshotObjectTO,
2446+
KVMStoragePool primaryPool,
2447+
KVMStoragePool secondaryPool,
2448+
String secondaryPoolUrl,
2449+
String snapshotName,
2450+
VolumeObjectTO volumeObjectTo,
2451+
Connect conn,
2452+
int wait) throws LibvirtException {
2453+
if (primaryPool.getType() != StoragePoolType.CLVM_NG || snapshotObjectTO.getParentSnapshotPath() == null) {
2454+
return null;
2455+
}
2456+
2457+
String[] parents = snapshotObjectTO.getParents();
2458+
if (parents == null || parents.length == 0) {
2459+
return null;
2460+
}
2461+
2462+
String parentCheckpointName = getParentCheckpointName(parents);
2463+
logger.debug("Validating bitmap [{}] for CLVM_NG volume [{}] before taking incremental snapshot",
2464+
parentCheckpointName, volumeObjectTo.getPath());
2465+
2466+
if (!isBitmapUsable(primaryPool, volumeObjectTo, parentCheckpointName)) {
2467+
logger.warn("Bitmap [{}] is not usable for volume [{}]. Falling back to full snapshot with new checkpoint.",
2468+
parentCheckpointName, volumeObjectTo.getPath());
2469+
cleanupBrokenBitmap(primaryPool, volumeObjectTo, parentCheckpointName);
2470+
return takeFullVolumeSnapshotOfStoppedVmForIncremental(snapshotObjectTO, primaryPool, secondaryPool,
2471+
secondaryPoolUrl, snapshotName, volumeObjectTo, conn, wait);
2472+
}
2473+
2474+
logger.debug("Bitmap [{}] is valid and usable for incremental snapshot", parentCheckpointName);
2475+
return null;
2476+
}
2477+
2478+
/**
2479+
* Takes a full snapshot of a stopped VM and creates a new checkpoint to restart the incremental chain.
2480+
* This is used as a fallback when incremental snapshot fails due to bitmap issues.
2481+
*/
2482+
private SnapshotObjectTO takeFullVolumeSnapshotOfStoppedVmForIncremental(SnapshotObjectTO snapshotObjectTO,
2483+
KVMStoragePool primaryPool,
2484+
KVMStoragePool secondaryPool,
2485+
String secondaryPoolUrl,
2486+
String snapshotName,
2487+
VolumeObjectTO volumeObjectTo,
2488+
Connect conn,
2489+
int wait) throws LibvirtException {
2490+
resource.validateLibvirtAndQemuVersionForIncrementalSnapshots();
2491+
Domain vm = null;
2492+
logger.info("Taking full volume snapshot (with new checkpoint) of volume [{}] to restart incremental chain. " +
2493+
"Snapshot will be copied to [{}].", volumeObjectTo, ObjectUtils.defaultIfNull(secondaryPool, primaryPool));
2494+
try {
2495+
String vmName = String.format("DUMMY-VM-%s", snapshotName);
2496+
2497+
String vmXml = getVmXml(primaryPool, volumeObjectTo, vmName);
2498+
2499+
logger.debug("Creating dummy VM with volume [{}] to take a full snapshot with checkpoint.", volumeObjectTo);
2500+
resource.startVM(conn, vmName, vmXml, Domain.CreateFlags.PAUSED);
2501+
2502+
vm = resource.getDomain(conn, vmName);
2503+
2504+
SnapshotObjectTO fullSnapshotTO = new SnapshotObjectTO();
2505+
fullSnapshotTO.setPath(snapshotObjectTO.getPath());
2506+
fullSnapshotTO.setVolume(snapshotObjectTO.getVolume());
2507+
fullSnapshotTO.setParentSnapshotPath(null); // No parent - forces full snapshot
2508+
2509+
return takeIncrementalVolumeSnapshotOfRunningVm(fullSnapshotTO, primaryPool, secondaryPool,
2510+
secondaryPoolUrl, snapshotName, volumeObjectTo, vm, conn, wait);
2511+
} catch (InternalErrorException | LibvirtException | CloudRuntimeException e) {
2512+
logger.error("Failed to take full volume snapshot with checkpoint for volume [{}] due to {}.",
2513+
volumeObjectTo, e.getMessage(), e);
2514+
throw new CloudRuntimeException(e);
2515+
} finally {
2516+
if (vm != null) {
2517+
vm.destroy();
2518+
}
2519+
}
2520+
}
2521+
23122522
private Path createFileAndWrite(String content, String dir, String fileName) {
23132523
File dirFile = new File(dir);
23142524
if (!dirFile.exists()) {

0 commit comments

Comments
 (0)