Skip to content

Commit 21c6110

Browse files
committed
fix(linstor): verify resource definition deletion completes; warn if stuck
The LINSTOR plugin treats a successful HTTP response from resourceDefinitionDelete as proof that the resource is gone, then drops the volume from CloudStack's accounting. In practice, LINSTOR can return success while the resource lingers in DELETING state — for example when a DRBD peer is unreachable, quorum was lost, or a satellite is down. The plugin had no retry, no verification, and no sweeper. Operators have been finding hundreds of stuck DELETING resources accumulated over weeks because nothing surfaced the divergence between the CloudStack (CS) view and the LINSTOR view.

This change adds two helpers to LinstorUtil:

- isResourceDefinitionGone(api, rscName): quick existence check via resourceDefinitionList.
- waitForResourceDefinitionDeleted(api, rscName, timeoutMillis): polls every second until the resource is gone OR the timeout elapses; returns true on confirmed-gone, false on timeout.

It also calls waitForResourceDefinitionDeleted from both delete sites (driver: LinstorPrimaryDataStoreDriverImpl.deleteResourceDefinition; adaptor: LinstorStorageAdaptor.deRefOrDeleteResource) with a 30s default timeout. On timeout the plugin logs a WARN with the resource name and a hint pointing at `linstor resource list`.

We deliberately do NOT throw on timeout: the CS-side accounting has already moved on, and throwing would create a different inconsistency.

This is the minimal Tier-1 fix that surfaces the problem in the operator's view. A follow-up could add a periodic sweeper that attempts force-delete on long-stuck DELETING resources.
1 parent 6f4445c commit 21c6110

3 files changed

Lines changed: 76 additions & 0 deletions

File tree

plugins/storage/volume/linstor/src/main/java/com/cloud/hypervisor/kvm/storage/LinstorStorageAdaptor.java

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -514,6 +514,17 @@ private boolean deRefOrDeleteResource(DevelopersApi api, String rscName, String
514514
ApiCallRcList answers = api.resourceDefinitionDelete(rd.getName());
515515
checkLinstorAnswersThrow(answers);
516516
deleted = true;
517+
518+
// LINSTOR can return success here while the resource lingers in DELETING state
519+
// on the controller (down peer, lost quorum, etc.). Confirm it's actually gone
520+
// — if not, log a WARN so operators can clear it manually. Don't throw: the
521+
// CloudStack-side accounting has already moved on.
522+
if (!LinstorUtil.waitForResourceDefinitionDeleted(api, rd.getName(),
523+
LinstorUtil.DEFAULT_RD_DELETE_VERIFY_TIMEOUT_MILLIS)) {
524+
logger.warn("Linstor: resource {} still present {}ms after delete returned success — " +
525+
"may be stuck in DELETING. Check the LINSTOR controller (linstor resource list).",
526+
rd.getName(), LinstorUtil.DEFAULT_RD_DELETE_VERIFY_TIMEOUT_MILLIS);
527+
}
517528
}
518529
}
519530
return deleted;

plugins/storage/volume/linstor/src/main/java/org/apache/cloudstack/storage/datastore/driver/LinstorPrimaryDataStoreDriverImpl.java

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -230,6 +230,20 @@ private void deleteResourceDefinition(StoragePoolVO storagePoolVO, String rscDef
230230
throw new CloudRuntimeException("Linstor: Unable to delete resource definition: " + rscDefName);
231231
}
232232
logger.info("Linstor: Deleted resource {}", rscDefName);
233+
234+
// LINSTOR can return success on the delete API call while the resource lingers in
235+
// DELETING state (peer issues, lost quorum, satellite down). Verify the resource is
236+
// actually gone — if not, log a WARN so operators see it. We deliberately do NOT
237+
// throw here: the volume is already considered gone on the CloudStack side, and
238+
// throwing would leave the CS DB and LINSTOR in different states.
239+
if (!LinstorUtil.waitForResourceDefinitionDeleted(linstorApi, rscDefName,
240+
LinstorUtil.DEFAULT_RD_DELETE_VERIFY_TIMEOUT_MILLIS))
241+
{
242+
logger.warn("Linstor: resource {} still present {}ms after delete returned success — " +
243+
"may be stuck in DELETING. Check the LINSTOR controller (linstor resource list) " +
244+
"and clear manually if the resource has no live peers.",
245+
rscDefName, LinstorUtil.DEFAULT_RD_DELETE_VERIFY_TIMEOUT_MILLIS);
246+
}
233247
} catch (ApiException apiEx)
234248
{
235249
logger.error("Linstor: ApiEx - " + apiEx.getMessage());

plugins/storage/volume/linstor/src/main/java/org/apache/cloudstack/storage/datastore/util/LinstorUtil.java

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -401,6 +401,57 @@ public static List<ResourceDefinition> getRDListStartingWith(DevelopersApi api,
401401
.collect(Collectors.toList());
402402
}
403403

404+
/**
 * Default per-call timeout, in milliseconds (30 seconds), for
 * {@link #waitForResourceDefinitionDeleted}. Long enough for a healthy LINSTOR controller to
 * finish a normal delete; short enough not to block the calling agent thread for too long if
 * the delete is genuinely stuck.
 */
409+
public static final long DEFAULT_RD_DELETE_VERIFY_TIMEOUT_MILLIS = 30_000L;
410+
411+
/**
412+
* Returns {@code true} if the named resource definition is no longer present on the LINSTOR
413+
* controller. Used after a {@code resourceDefinitionDelete} to verify the delete actually
414+
* completed (LINSTOR can return success on the API call while the resource lingers in
415+
* DELETING state due to peer issues, lost quorum, or down satellites).
416+
*/
417+
public static boolean isResourceDefinitionGone(DevelopersApi api, String rscName) throws ApiException {
418+
List<ResourceDefinition> all = api.resourceDefinitionList(null, false, null, null, null);
419+
if (all == null) {
420+
return true;
421+
}
422+
return all.stream().noneMatch(rd -> rscName.equalsIgnoreCase(rd.getName()));
423+
}
424+
425+
/**
426+
* Polls the controller until the named resource definition is gone or the timeout elapses.
427+
* Returns {@code true} if the resource was confirmed gone, {@code false} if it was still
428+
* present (or the controller kept erroring) at the deadline. Callers should NOT throw on a
429+
* {@code false} return — the upstream API call already reported success and the operator
430+
* may need to investigate manually. Log a WARN with the resource name instead.
431+
*/
432+
public static boolean waitForResourceDefinitionDeleted(DevelopersApi api, String rscName, long timeoutMillis) {
433+
final long deadline = System.currentTimeMillis() + timeoutMillis;
434+
while (true) {
435+
try {
436+
if (isResourceDefinitionGone(api, rscName)) {
437+
return true;
438+
}
439+
} catch (ApiException e) {
440+
LOGGER.debug("LINSTOR delete-verify poll failed for {}: {}", rscName, e.getMessage());
441+
// Keep polling — controller may be transiently unavailable.
442+
}
443+
if (System.currentTimeMillis() >= deadline) {
444+
return false;
445+
}
446+
try {
447+
Thread.sleep(1_000L);
448+
} catch (InterruptedException ie) {
449+
Thread.currentThread().interrupt();
450+
return false;
451+
}
452+
}
453+
}
454+
404455
/**
405456
* Returns a pair list of resource-definitions with their 1:1 mapped resource-group objects that start with the
406457
* resource name `startWith`

0 commit comments

Comments
 (0)