Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions changelog.d/deploy-lock-correctness.fixed.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
- `wheels deploy` lock acquisition is now all-or-nothing across the fleet: the lock is acquired on every (deduped) host sequentially in config order with failures surfaced, already-acquired locks are rolled back on a partial failure (`Wheels.Deploy.LockAcquireFailed` names the contended host; the contended host's own lock is never touched), and release fans out to every acquired host. Previously the first-success-wins `onAny` dispatch swallowed contention on one host and silently re-acquired on another, so concurrent deploys were only mutually excluded on single-host configs — and release could target a different host than acquire, stranding stale locks. The manual `wheels deploy lock acquire/release/status` verbs follow the same fleet-wide semantics (#2957)
- Deploy lock metadata now actually expands `$(hostname)` and `$(date --iso-8601=seconds)` on the remote: the symlink target double-quotes the substitution segment while keeping the user and message inert via `shellEscape` single-quoting — previously the whole target was single-quoted, which suppressed command substitution and recorded the literal `$(hostname)` text (#2957)
218 changes: 184 additions & 34 deletions cli/lucli/services/deploy/cli/DeployLockCli.cfc
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,10 @@ component {
user: $currentUser(),
message: arguments.opts.message ?: "manual acquire"
});
$dispatchAny($allHosts(cfg), cmd, dryRun);
// The deploy flow holds the lock on EVERY host (##2957 DEP-1), so a
// manual acquire must match — locking a single host would let a
// concurrent deploy that probes another host proceed.
$acquireLockAllOrNothing($uniqueHosts($allHosts(cfg)), cmd, lock, dryRun);
return $renderResult(arguments.opts, "Acquired deploy lock for " & cfg.service());
}

Expand All @@ -37,8 +40,17 @@ component {
var lock = new modules.wheels.services.deploy.commands.LockCommands(cfg);
// rm -f is idempotent; surfacing a failure here only obscures the
// operator's intent ("clear the lock if it's there"). #2696.
$dispatchAny($allHosts(cfg), lock.release(), dryRun, true);
return $renderResult(arguments.opts, "Released deploy lock for " & cfg.service());
// Fan out to every host — the lock lives fleet-wide (##2957 DEP-1),
// so clearing one host would strand stale locks on the rest — but
// per host, best-effort: fleet-wide stale locks most plausibly
// exist BECAUSE a host died mid-deploy, so the recovery path must
// keep working around an unreachable host instead of aborting on it.
var failed = $dispatchPerHostTolerant($uniqueHosts($allHosts(cfg)), lock.release(), dryRun);
return $renderResult(
arguments.opts,
"Released deploy lock for " & cfg.service()
& $skippedHostsSuffix(failed, "the lock was NOT released there; re-run 'wheels deploy lock release' when the host is back")
);
}

public string function status(required struct opts) {
Expand All @@ -48,12 +60,16 @@ component {
var lock = new modules.wheels.services.deploy.commands.LockCommands(cfg);
// readlink exits nonzero when the lock file is missing — which is
// exactly what the operator wants to learn from `status`. Treat that
// as advisory output, not a thrown error. #2696.
// #2957 DEP-6a: surface readlink's output (the lock holder on stdout,
// or the "No such file" diagnostic on stderr) instead of dropping it.
var lines = $dispatchAnyCollect($allHosts(cfg), lock.status(), dryRun, true);
// as advisory output, not a thrown error. #2696. Checked on every
// host since the lock lives fleet-wide (##2957 DEP-1) — and the same
// advisory contract covers an unreachable host: report it, don't throw.
// #2957 DEP-6a: also surface readlink's output (the lock holder on
// stdout, or the "No such file" diagnostic on stderr) per host instead
// of dropping it.
var collected = $collectPerHostTolerant($uniqueHosts($allHosts(cfg)), lock.status(), dryRun);
var summary = "Checked deploy lock status for " & cfg.service();
if (arrayLen(lines)) summary &= chr(10) & arrayToList(lines, chr(10));
if (arrayLen(collected.lines)) summary &= chr(10) & arrayToList(collected.lines, chr(10));
summary &= $skippedHostsSuffix(collected.failed, "the lock state there is unknown");
return $renderResult(arguments.opts, summary);
}

Expand All @@ -77,47 +93,181 @@ component {
return out;
}

private void function $dispatchAny(required array hosts, required string cmd, required boolean dryRun, boolean allowFail = false) {
/**
* All-or-nothing lock acquisition across every host, in config order,
* with rollback of already-acquired locks on the first failure.
*
* MIRROR: DeployMainCli.$acquireLockAllOrNothing is the deploy-flow
* twin of this contract (##2957 DEP-1) — keep them in lockstep.
*
* @hosts Hosts to lock; passed as-is to sshPool.sequential.
* @acquireCmd Command string run on each host with {raise: true}.
* @lock Object whose release() supplies the command used for rollback.
* @dryRun When true, only records "[host] cmd" lines in
* variables.dryRunBuffer and returns without running anything.
* @throws Wheels.Deploy.LockAcquireFailed when any step of the
* sequential acquire raises; the original message/detail are carried
* along in the new exception.
*/
private void function $acquireLockAllOrNothing(
required array hosts,
required string acquireCmd,
required any lock,
required boolean dryRun
) {
if (arguments.dryRun) {
if (arrayLen(arguments.hosts)) {
arrayAppend(variables.dryRunBuffer, "[" & arguments.hosts[1] & "] " & arguments.cmd);
for (var h in arguments.hosts) {
arrayAppend(variables.dryRunBuffer, "[" & h & "] " & arguments.acquireCmd);
}
return;
}
// Lock ops target just one host (the lock file lives on one path; any host works).
// #2696: acquire stays strict (contention should surface); release/status tolerate.
var c = arguments.cmd;
var doRaise = !arguments.allowFail;
variables.sshPool.onAny(arguments.hosts, function(ssh, host) { ssh.run(c, {raise: doRaise}); });
var c = arguments.acquireCmd;
// Shared struct so the callback can record progress — closures can't
// reliably mutate outer scalars across engines (anti-pattern ##10).
var state = {acquired: [], lastHost: ""};
try {
variables.sshPool.sequential(arguments.hosts, function(ssh, host) {
// Recorded BEFORE the command runs, so a raise from ssh.run leaves
// lastHost naming the host whose acquire failed.
state.lastHost = host;
ssh.run(c, {raise: true});
// Only reached when ssh.run did not raise: `acquired` therefore
// holds exactly the hosts that need rolling back.
arrayAppend(state.acquired, host);
});
} catch (any e) {
// NOTE(review): if sequential() can throw before invoking the callback
// for a host (e.g. while connecting), lastHost would still hold the
// previous host or "" — confirm against SshPool.sequential.
$rollbackAcquiredLocks(state.acquired, arguments.lock);
// NOTE(review): the count below is the number of hosts a rollback was
// ATTEMPTED on; $rollbackAcquiredLocks discards per-host release
// failures, so it is not proof every lock was actually cleared.
throw(
type = "Wheels.Deploy.LockAcquireFailed",
message = "Could not acquire the deploy lock on " & state.lastHost
& " — another deploy may hold it. Rolled back "
& arrayLen(state.acquired) & " already-acquired lock(s). "
& "Inspect with 'wheels deploy lock status'; clear a stale lock with "
& "'wheels deploy lock release'. Cause: " & e.message,
detail = e.detail ?: ""
);
}
}

/**
 * Undo a partially-successful acquire by sending the release command to
 * every host that already took the lock. The failure list returned by
 * $dispatchPerHostTolerant is intentionally dropped: a rollback problem
 * must not mask the LockAcquireFailed the caller raises next, and the
 * per-host dispatch keeps one dead host from blocking cleanup elsewhere.
 *
 * @hosts Hosts on which the acquire already succeeded; may be empty.
 * @lock  Object whose release() yields the command to run on each host.
 */
private void function $rollbackAcquiredLocks(required array hosts, required any lock) {
	var acquiredCount = arrayLen(arguments.hosts);
	if (acquiredCount > 0) {
		// Always a real run (dryRun = false): rollback only happens after
		// real acquires. Nothing is built or sent when the list is empty.
		$dispatchPerHostTolerant(arguments.hosts, arguments.lock.release(), false);
	}
}

/**
 * Collapse repeated hosts while keeping first-occurrence order, so a host
 * that serves several roles is only visited once.
 *
 * @hosts Host names, possibly containing duplicates.
 * @return New array with each host listed once, in original order.
 */
private array function $uniqueHosts(required array hosts) {
	var alreadyListed = {};
	var deduped = [];
	for (var candidate in arguments.hosts) {
		if (structKeyExists(alreadyListed, candidate)) {
			continue;
		}
		alreadyListed[candidate] = true;
		arrayAppend(deduped, candidate);
	}
	return deduped;
}

/**
* Like $dispatchAny, but returns the remote output host-prefixed
* (`[host] line`). Stdout wins; stderr is the fallback so tolerated
* failures (allowFail) still surface their diagnostic. #2957 DEP-6a.
* Per-host best-effort dispatch. allowFail-style onEach is NOT enough
* for tolerant fan-out: the real SshPool.onEach pre-resolves a
* connection for EVERY host before submitting any task, so a single
* unreachable host throws before the command runs anywhere, and a
* transport failure inside a task (dead cached connection) is rethrown
* from future.get() regardless of {raise: false}. Dispatching each host
* in its own sequential([host]) call with a per-host try/catch confines
* every failure mode — connect and transport alike — to its host.
*
* @hosts Hosts to run the command on, one sequential([host]) call each.
* @cmd Command string to run on every host.
* @dryRun When true, only records "[host] cmd" lines in
* variables.dryRunBuffer and returns an empty array.
* @return array of {host, message} structs for hosts that failed.
* "Failed" here means the dispatch threw; a command that merely exits
* nonzero is run with {raise: false} and is not recorded.
*
* MIRROR: DeployMainCli.$dispatchPerHostTolerant is the deploy-flow
* twin of this helper — keep them in lockstep.
*/
private array function $dispatchAnyCollect(required array hosts, required string cmd, required boolean dryRun, boolean allowFail = false) {
private array function $dispatchPerHostTolerant(
required array hosts,
required string cmd,
required boolean dryRun
) {
if (arguments.dryRun) {
if (arrayLen(arguments.hosts)) {
arrayAppend(variables.dryRunBuffer, "[" & arguments.hosts[1] & "] " & arguments.cmd);
for (var h in arguments.hosts) {
arrayAppend(variables.dryRunBuffer, "[" & h & "] " & arguments.cmd);
}
return [];
}
// Copy into a local so the closure below captures a plain variable
// rather than reaching into this function's arguments scope.
var c = arguments.cmd;
var doRaise = !arguments.allowFail;
var failed = [];
for (var h in arguments.hosts) {
try {
// One-element host list: anything thrown here belongs to `h` alone.
variables.sshPool.sequential([h], function(ssh, host) {
ssh.run(c, {raise: false});
});
} catch (any e) {
// Keep going — the caller decides how to report the skipped hosts.
arrayAppend(failed, {host: h, message: e.message});
}
}
return failed;
}

/**
 * Build the warning that a verb appends to its summary when some hosts
 * could not be processed.
 *
 * @failed      Array of {host, message} structs, one per skipped host.
 * @consequence Clause describing what being skipped means for those hosts;
 *              inserted after the host list, before the closing period.
 * @return "" when `failed` is empty; otherwise a newline-prefixed
 *         "WARNING: skipped N unreachable host(s): ..." line.
 */
private string function $skippedHostsSuffix(required array failed, required string consequence) {
	var failureCount = arrayLen(arguments.failed);
	if (failureCount == 0) {
		return "";
	}
	// Render each entry as "host (message)".
	var described = arrayMap(arguments.failed, function(entry) {
		return entry.host & " (" & entry.message & ")";
	});
	return chr(10) & "WARNING: skipped " & failureCount
		& " unreachable host(s): " & arrayToList(described, "; ")
		& " — " & arguments.consequence & ".";
}

/**
* Per-host best-effort dispatch that ALSO collects the remote output,
* host-prefixed (`[host] line`). Combines two ##2957 contracts that the
* `lock status` verb needs at once:
* - DEP-1: read the lock on EVERY host (the lock lives fleet-wide), and
* tolerate an unreachable host — report it, don't throw. Each host
* runs in its own sequential([host]) with a per-host try/catch so a
* dead connect or a dead cached session is confined to that host (see
* $dispatchPerHostTolerant for why allowFail-style onEach/onAny is not
* enough).
* - DEP-6a: surface what readlink actually said. Stdout wins (the lock
* holder); stderr is the fallback so the "No such file" diagnostic on
* an unheld lock still reaches the operator. The command is run with
* {raise: false} so a nonzero exit (no lock held) is advisory output,
* not a thrown error.
*
* @hosts Hosts to query, one sequential([host]) call each.
* @cmd Command string to run on every host.
* @dryRun When true, only records "[host] cmd" lines in
* variables.dryRunBuffer and returns empty `lines` and `failed`.
* @return struct {lines: array of "[host] line", failed: array of
* {host, message} for hosts that were unreachable}.
*/
private struct function $collectPerHostTolerant(
required array hosts,
required string cmd,
required boolean dryRun
) {
if (arguments.dryRun) {
for (var h in arguments.hosts) {
arrayAppend(variables.dryRunBuffer, "[" & h & "] " & arguments.cmd);
}
return {lines: [], failed: []};
}
var c = arguments.cmd;
// Closures can't write outer locals reliably — collect via a shared struct.
var ctx = {lines: []};
variables.sshPool.onAny(arguments.hosts, function(ssh, host) {
var res = ssh.run(c, {raise: doRaise});
var text = trim(res.stdout ?: "");
if (!len(text)) text = trim(res.stderr ?: "");
if (!len(text)) return;
text = replace(text, chr(13), "", "all");
for (var line in listToArray(text, chr(10))) {
arrayAppend(ctx.lines, "[" & host & "] " & line);
var ctx = {lines: [], failed: []};
for (var h in arguments.hosts) {
try {
variables.sshPool.sequential([h], function(ssh, host) {
var res = ssh.run(c, {raise: false});
// Prefer stdout; fall back to stderr only when stdout is blank.
var text = trim(res.stdout ?: "");
if (!len(text)) text = trim(res.stderr ?: "");
// Nothing on either stream: contribute no lines for this host.
if (!len(text)) return;
// Drop carriage returns so CRLF output splits cleanly on LF.
text = replace(text, chr(13), "", "all");
for (var line in listToArray(text, chr(10))) {
arrayAppend(ctx.lines, "[" & host & "] " & line);
}
});
} catch (any e) {
// Anything thrown by this one-host dispatch is attributed to `h`.
arrayAppend(ctx.failed, {host: h, message: e.message});
}
});
return ctx.lines;
}
return ctx;
}

private string function $currentUser() {
Expand Down
Loading
Loading