Skip to content

Commit 1a1f8ff

Browse files
committed
fix(linstor): pre-flight check destination is a LINSTOR satellite before live migration
LinstorDataMotionStrategy.copyAsync would call createResource on the destination pool's controller without first verifying that the destination KVM host is registered as a LINSTOR satellite there. Two failure modes:
1. The resource group's auto-placement filter happens to match a different node (a registered satellite that is NOT the migration destination), and the resource is silently created on the wrong node. The subsequent migrate then fails because the destination KVM host has no DRBD device for the resource.
2. The auto-placement filter has no candidates and the LINSTOR API returns an opaque error. The operator has to correlate the migration failure with an unrelated controller log entry to understand what happened.
This change adds verifyDestinationIsLinstorSatellite(), called early in copyAsync. For each LINSTOR-typed destination pool it:
- fetches the controller's node list via LinstorUtil.getLinstorNodeNames
- throws CloudRuntimeException with a clear, actionable message (listing the known satellites) if destHost.getName() is missing from that list
- logs a warning and skips the check on transient controller errors, so a network blip against the controller doesn't block an otherwise valid migration
Non-LINSTOR destination pools in the volumeDataStoreMap are skipped (mixed-storage migrations are unaffected).
1 parent 6f4445c commit 1a1f8ff

1 file changed

Lines changed: 61 additions & 0 deletions

File tree

plugins/storage/volume/linstor/src/main/java/org/apache/cloudstack/storage/motion/LinstorDataMotionStrategy.java

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -314,6 +314,58 @@ private boolean needsExactSizeProp(VolumeInfo srcVolumeInfo) {
314314
return true;
315315
}
316316

317+
/**
318+
* Verify that the destination KVM host is a registered LINSTOR satellite on the controller
319+
* backing every destination pool involved in this migration. Throws CloudRuntimeException
320+
* with a clear message when it isn't, instead of letting the resource creation later fail
321+
* obscurely inside auto-placement.
322+
*
323+
* Best-effort: a transient controller error during this check does not block the migration
324+
* — we log a warning and let the downstream resource-create surface the real issue. Only a
325+
* confirmed "host not in node list" outcome aborts the migration up-front.
326+
*/
327+
private void verifyDestinationIsLinstorSatellite(Map<VolumeInfo, DataStore> volumeDataStoreMap, Host destHost) {
328+
if (destHost == null || destHost.getName() == null) {
329+
// Without a destination host name to match, the only sensible thing is to let the
330+
// existing flow run and report whatever it would have reported.
331+
return;
332+
}
333+
for (Map.Entry<VolumeInfo, DataStore> entry : volumeDataStoreMap.entrySet()) {
334+
DataStore destDataStore = entry.getValue();
335+
StoragePoolVO destStoragePool = _storagePool.findById(destDataStore.getId());
336+
if (destStoragePool == null
337+
|| destStoragePool.getPoolType() != Storage.StoragePoolType.Linstor) {
338+
continue;
339+
}
340+
DevelopersApi api = LinstorUtil.getLinstorAPI(destStoragePool.getHostAddress());
341+
try {
342+
List<String> nodes = LinstorUtil.getLinstorNodeNames(api);
343+
if (nodes == null) {
344+
logger.warn("LINSTOR controller {} returned null node list; skipping pre-flight",
345+
destStoragePool.getHostAddress());
346+
return;
347+
}
348+
if (!nodes.contains(destHost.getName())) {
349+
throw new CloudRuntimeException(String.format(
350+
"Cannot migrate to host '%s': it is not a registered LINSTOR satellite on " +
351+
"controller %s (pool '%s'). Known satellites: %s. Either register the " +
352+
"host with `linstor node create` or pick a different destination.",
353+
destHost.getName(),
354+
destStoragePool.getHostAddress(),
355+
destStoragePool.getName(),
356+
nodes));
357+
}
358+
} catch (ApiException apiEx) {
359+
// Don't block migration on a transient controller hiccup — log and let the
360+
// downstream resource creation handle the real failure.
361+
logger.warn("LINSTOR pre-flight check could not contact controller {}: {}; " +
362+
"letting downstream resource creation proceed",
363+
destStoragePool.getHostAddress(), apiEx.getBestMessage());
364+
return;
365+
}
366+
}
367+
}
368+
317369
@Override
318370
public void copyAsync(Map<VolumeInfo, DataStore> volumeDataStoreMap, VirtualMachineTO vmTO, Host srcHost,
319371
Host destHost, AsyncCompletionCallback<CopyCommandResult> callback) {
@@ -323,6 +375,15 @@ public void copyAsync(Map<VolumeInfo, DataStore> volumeDataStoreMap, VirtualMach
323375
String.format("Invalid hypervisor type [%s]. Only KVM supported", srcHost.getHypervisorType()));
324376
}
325377

378+
// Pre-flight: verify the destination KVM host is registered as a satellite on the
379+
// LINSTOR controller backing each destination pool. Without this check, resource
380+
// creation falls through to the resource-group's auto-placement filters and may
381+
// either silently place the resource on the wrong node or fail with an opaque
382+
// auto-place error from the LINSTOR API. Failing fast here gives operators a clear
383+
// actionable message instead of having to correlate the live-migration failure with
384+
// an unrelated LINSTOR controller log entry.
385+
verifyDestinationIsLinstorSatellite(volumeDataStoreMap, destHost);
386+
326387
String errMsg = null;
327388
VMInstanceVO vmInstance = _vmDao.findById(vmTO.getId());
328389
vmTO.setState(vmInstance.getState());

0 commit comments

Comments
 (0)