wheels-dev
diff --git a/‎changelog.d/deploy-lock-correctness.fixed.md‎
Lines changed: 2 additions & 0 deletions b/‎changelog.d/deploy-lock-correctness.fixed.md‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎cli/lucli/services/deploy/cli/DeployLockCli.cfc‎
Lines changed: 184 additions & 34 deletions b/‎cli/lucli/services/deploy/cli/DeployLockCli.cfc‎
Lines changed: 184 additions & 34 deletions
@@ -0,0 +1,2 @@
+- `wheels deploy` lock acquisition is now all-or-nothing across the fleet: the lock is acquired on every (deduped) host sequentially in config order with failures surfaced, already-acquired locks are rolled back on a partial failure (`Wheels.Deploy.LockAcquireFailed` names the contended host; the contended host's own lock is never touched), and release fans out to every acquired host. Previously the first-success-wins `onAny` dispatch swallowed contention on one host and silently re-acquired on another, so concurrent deploys were only mutually excluded on single-host configs — and release could target a different host than acquire, stranding stale locks. The manual `wheels deploy lock acquire/release/status` verbs follow the same fleet-wide semantics (#2957)
+- Deploy lock metadata now actually expands `$(hostname)` and `$(date --iso-8601=seconds)` on the remote: the symlink target double-quotes the substitution segment while keeping the user and message inert via `shellEscape` single-quoting — previously the whole target was single-quoted, which suppressed command substitution and recorded the literal `$(hostname)` text (#2957)
@@ -26,7 +26,10 @@ component {
             user: $currentUser(),
             message: arguments.opts.message ?: "manual acquire"
         });
-        $dispatchAny($allHosts(cfg), cmd, dryRun);
+        // The deploy flow holds the lock on EVERY host (##2957 DEP-1), so a
+        // manual acquire must match — locking a single host would let a
+        // concurrent deploy that probes another host proceed.
+        $acquireLockAllOrNothing($uniqueHosts($allHosts(cfg)), cmd, lock, dryRun);
         return $renderResult(arguments.opts, "Acquired deploy lock for " & cfg.service());
     }
 
@@ -37,8 +40,17 @@ component {
         var lock = new modules.wheels.services.deploy.commands.LockCommands(cfg);
         // rm -f is idempotent; surfacing a failure here only obscures the
         // operator's intent ("clear the lock if it's there"). #2696.
-        $dispatchAny($allHosts(cfg), lock.release(), dryRun, true);
-        return $renderResult(arguments.opts, "Released deploy lock for " & cfg.service());
+        // Fan out to every host — the lock lives fleet-wide (##2957 DEP-1),
+        // so clearing one host would strand stale locks on the rest — but
+        // per host, best-effort: fleet-wide stale locks most plausibly
+        // exist BECAUSE a host died mid-deploy, so the recovery path must
+        // keep working around an unreachable host instead of aborting on it.
+        var failed = $dispatchPerHostTolerant($uniqueHosts($allHosts(cfg)), lock.release(), dryRun);
+        return $renderResult(
+            arguments.opts,
+            "Released deploy lock for " & cfg.service()
+                & $skippedHostsSuffix(failed, "the lock was NOT released there; re-run 'wheels deploy lock release' when the host is back")
+        );
     }
 
     public string function status(required struct opts) {
@@ -48,12 +60,16 @@ component {
         var lock = new modules.wheels.services.deploy.commands.LockCommands(cfg);
         // readlink exits nonzero when the lock file is missing — which is
         // exactly what the operator wants to learn from `status`. Treat that
-        // as advisory output, not a thrown error. #2696.
-        // #2957 DEP-6a: surface readlink's output (the lock holder on stdout,
-        // or the "No such file" diagnostic on stderr) instead of dropping it.
-        var lines = $dispatchAnyCollect($allHosts(cfg), lock.status(), dryRun, true);
+        // as advisory output, not a thrown error. #2696. Checked on every
+        // host since the lock lives fleet-wide (##2957 DEP-1) — and the same
+        // advisory contract covers an unreachable host: report it, don't throw.
+        // #2957 DEP-6a: also surface readlink's output (the lock holder on
+        // stdout, or the "No such file" diagnostic on stderr) per host instead
+        // of dropping it.
+        var collected = $collectPerHostTolerant($uniqueHosts($allHosts(cfg)), lock.status(), dryRun);
         var summary = "Checked deploy lock status for " & cfg.service();
-        if (arrayLen(lines)) summary &= chr(10) & arrayToList(lines, chr(10));
+        if (arrayLen(collected.lines)) summary &= chr(10) & arrayToList(collected.lines, chr(10));
+        summary &= $skippedHostsSuffix(collected.failed, "the lock state there is unknown");
         return $renderResult(arguments.opts, summary);
     }
 
@@ -77,47 +93,181 @@ component {
         return out;
     }
 
-    private void function $dispatchAny(required array hosts, required string cmd, required boolean dryRun, boolean allowFail = false) {
+    /**
+     * All-or-nothing lock acquisition across every host, in config order,
+     * with rollback of already-acquired locks on the first failure.
+     *
+     * @hosts      hosts to lock, processed in the order given.
+     * @acquireCmd command string run on each host to take the lock.
+     * @lock       lock command builder; its release() is used for rollback.
+     * @dryRun     when true, only record "[host] cmd" per host in
+     *             variables.dryRunBuffer and run nothing.
+     * @throws     Wheels.Deploy.LockAcquireFailed on the first failure, after
+     *             the best-effort rollback of the locks already placed.
+     *
+     * MIRROR: DeployMainCli.$acquireLockAllOrNothing is the deploy-flow
+     * twin of this contract (##2957 DEP-1) — keep them in lockstep.
+     */
+    private void function $acquireLockAllOrNothing(
+        required array hosts,
+        required string acquireCmd,
+        required any lock,
+        required boolean dryRun
+    ) {
         if (arguments.dryRun) {
-            if (arrayLen(arguments.hosts)) {
-                arrayAppend(variables.dryRunBuffer, "[" & arguments.hosts[1] & "] " & arguments.cmd);
+            for (var h in arguments.hosts) {
+                arrayAppend(variables.dryRunBuffer, "[" & h & "] " & arguments.acquireCmd);
             }
             return;
         }
-        // Lock ops target just one host (the lock file lives on one path; any host works).
-        // #2696: acquire stays strict (contention should surface); release/status tolerate.
-        var c = arguments.cmd;
-        var doRaise = !arguments.allowFail;
-        variables.sshPool.onAny(arguments.hosts, function(ssh, host) { ssh.run(c, {raise: doRaise}); });
+        var c = arguments.acquireCmd;
+        // Shared struct so the callback can record progress — closures can't
+        // reliably mutate outer scalars across engines (anti-pattern ##10).
+        // inFlight names the host whose lock command is currently running and
+        // is cleared as soon as that host is recorded as acquired.
+        var state = {acquired: [], inFlight: ""};
+        try {
+            variables.sshPool.sequential(arguments.hosts, function(ssh, host) {
+                state.inFlight = host;
+                ssh.run(c, {raise: true});
+                arrayAppend(state.acquired, host);
+                state.inFlight = "";
+            });
+        } catch (any e) {
+            $rollbackAcquiredLocks(state.acquired, arguments.lock);
+            // An empty inFlight means the failure was raised outside the
+            // callback (e.g. while connecting to the next host), so the
+            // failing host is not known here. Never name a host whose lock
+            // was acquired and has just been rolled back.
+            var failedOn = len(state.inFlight)
+                ? state.inFlight
+                : "a host that failed before the lock command ran";
+            throw(
+                type = "Wheels.Deploy.LockAcquireFailed",
+                message = "Could not acquire the deploy lock on " & failedOn
+                    & " — another deploy may hold it. Rolled back "
+                    & arrayLen(state.acquired) & " already-acquired lock(s). "
+                    & "Inspect with 'wheels deploy lock status'; clear a stale lock with "
+                    & "'wheels deploy lock release'. Cause: " & e.message,
+                detail = e.detail ?: ""
+            );
+        }
+    }
+
+    /**
+     * Undo step for a partially-failed acquire: releases the lock on each
+     * host that had already taken it. Strictly best-effort and per host, so
+     * one unreachable host cannot keep the healthy ones locked, and a
+     * rollback problem can never replace the LockAcquireFailed error the
+     * caller raises next.
+     *
+     * @hosts hosts whose lock was acquired before the failure.
+     * @lock  lock command builder providing release().
+     */
+    private void function $rollbackAcquiredLocks(required array hosts, required any lock) {
+        var toRelease = arguments.hosts;
+        if (arrayLen(toRelease) > 0) {
+            // The tolerant dispatcher swallows per-host failures on purpose;
+            // its list of failed hosts is intentionally ignored here.
+            var releaseCmd = arguments.lock.release();
+            $dispatchPerHostTolerant(toRelease, releaseCmd, false);
+        }
+    }
+
+    /**
+     * Removes repeated hosts while keeping first-seen order, so a host that
+     * serves several roles is processed exactly once.
+     *
+     * @hosts  host list, possibly containing repeats.
+     * @return a new array holding each host once, in original order.
+     */
+    private array function $uniqueHosts(required array hosts) {
+        var deduped = [];
+        var known = {};
+        for (var candidate in arguments.hosts) {
+            // Already emitted earlier in the list — skip the repeat.
+            if (structKeyExists(known, candidate)) continue;
+            known[candidate] = true;
+            arrayAppend(deduped, candidate);
+        }
+        return deduped;
     }
 
     /**
-     * Like $dispatchAny, but returns the remote output host-prefixed
-     * (`[host] line`). Stdout wins; stderr is the fallback so tolerated
-     * failures (allowFail) still surface their diagnostic. #2957 DEP-6a.
+     * Per-host best-effort dispatch. allowFail-style onEach is NOT enough
+     * for tolerant fan-out: the real SshPool.onEach pre-resolves a
+     * connection for EVERY host before submitting any task, so a single
+     * unreachable host throws before the command runs anywhere, and a
+     * transport failure inside a task (dead cached connection) is rethrown
+     * from future.get() regardless of {raise: false}. Dispatching each host
+     * in its own sequential([host]) call with a per-host try/catch confines
+     * every failure mode — connect and transport alike — to its host.
+     *
+     * @hosts  hosts to run the command on, one sequential([host]) call each.
+     * @cmd    command string handed to ssh.run on every host.
+     * @dryRun when true, only record "[host] cmd" per host in
+     *         variables.dryRunBuffer and return an empty array.
+     *
+     * @return array of {host, message} structs for hosts that failed.
+     *
+     * MIRROR: DeployMainCli.$dispatchPerHostTolerant is the deploy-flow
+     * twin of this helper — keep them in lockstep.
      */
-    private array function $dispatchAnyCollect(required array hosts, required string cmd, required boolean dryRun, boolean allowFail = false) {
+    private array function $dispatchPerHostTolerant(
+        required array hosts,
+        required string cmd,
+        required boolean dryRun
+    ) {
         if (arguments.dryRun) {
-            if (arrayLen(arguments.hosts)) {
-                arrayAppend(variables.dryRunBuffer, "[" & arguments.hosts[1] & "] " & arguments.cmd);
+            // Dry run: buffer the command once per host; nothing is executed.
+            for (var h in arguments.hosts) {
+                arrayAppend(variables.dryRunBuffer, "[" & h & "] " & arguments.cmd);
             }
             return [];
         }
         var c = arguments.cmd;
-        var doRaise = !arguments.allowFail;
+        var failed = [];
+        for (var h in arguments.hosts) {
+            // One sequential([h]) call per host keeps any exception scoped to
+            // that host; the loop then carries on with the next one.
+            try {
+                variables.sshPool.sequential([h], function(ssh, host) {
+                    // raise: false — the command is run without asking
+                    // ssh.run to raise; only exceptions that still escape
+                    // sequential() reach the catch below.
+                    ssh.run(c, {raise: false});
+                });
+            } catch (any e) {
+                // Record the host and the exception message for the caller.
+                arrayAppend(failed, {host: h, message: e.message});
+            }
+        }
+        return failed;
+    }
+
+    /**
+     * Builds the trailing warning a verb's summary carries when some hosts
+     * could not be processed. Each failed host is listed as `host (message)`
+     * and the caller-supplied consequence closes the sentence.
+     *
+     * @failed      array of {host, message} structs.
+     * @consequence text describing what the skip means for the operator.
+     * @return      the newline-prefixed warning, or "" when nothing failed.
+     */
+    private string function $skippedHostsSuffix(required array failed, required string consequence) {
+        var total = arrayLen(arguments.failed);
+        if (total == 0) return "";
+        var listing = "";
+        for (var i = 1; i <= total; i++) {
+            var entry = arguments.failed[i];
+            // "; " goes between entries only, never before the first.
+            if (i > 1) listing &= "; ";
+            listing &= entry.host & " (" & entry.message & ")";
+        }
+        return chr(10) & "WARNING: skipped " & total
+            & " unreachable host(s): " & listing
+            & " — " & arguments.consequence & ".";
+    }
+
+    /**
+     * Per-host best-effort dispatch that ALSO collects the remote output,
+     * host-prefixed (`[host] line`). Combines two ##2957 contracts that the
+     * `lock status` verb needs at once:
+     *   - DEP-1: read the lock on EVERY host (the lock lives fleet-wide), and
+     *     tolerate an unreachable host — report it, don't throw. Each host
+     *     runs in its own sequential([host]) with a per-host try/catch so a
+     *     dead connect or a dead cached session is confined to that host (see
+     *     $dispatchPerHostTolerant for why allowFail-style onEach/onAny is not
+     *     enough).
+     *   - DEP-6a: surface what readlink actually said. Stdout wins (the lock
+     *     holder); stderr is the fallback so the "No such file" diagnostic on
+     *     an unheld lock still reaches the operator. The command is run with
+     *     {raise: false} so a nonzero exit (no lock held) is advisory output,
+     *     not a thrown error.
+     *
+     * @hosts  hosts to query, one sequential([host]) call each.
+     * @cmd    command string handed to ssh.run on every host.
+     * @dryRun when true, only record "[host] cmd" per host in
+     *         variables.dryRunBuffer and return empty lines/failed arrays.
+     *
+     * @return struct {lines: array of "[host] line", failed: array of
+     *         {host, message} for hosts that were unreachable}.
+     */
+    private struct function $collectPerHostTolerant(
+        required array hosts,
+        required string cmd,
+        required boolean dryRun
+    ) {
+        if (arguments.dryRun) {
+            // Dry run: buffer the command once per host; nothing is executed.
+            for (var h in arguments.hosts) {
+                arrayAppend(variables.dryRunBuffer, "[" & h & "] " & arguments.cmd);
+            }
+            return {lines: [], failed: []};
+        }
+        var c = arguments.cmd;
         // Closures can't write outer locals reliably — collect via a shared struct.
-        var ctx = {lines: []};
-        variables.sshPool.onAny(arguments.hosts, function(ssh, host) {
-            var res = ssh.run(c, {raise: doRaise});
-            var text = trim(res.stdout ?: "");
-            if (!len(text)) text = trim(res.stderr ?: "");
-            if (!len(text)) return;
-            text = replace(text, chr(13), "", "all");
-            for (var line in listToArray(text, chr(10))) {
-                arrayAppend(ctx.lines, "[" & host & "] " & line);
+        var ctx = {lines: [], failed: []};
+        for (var h in arguments.hosts) {
+            try {
+                variables.sshPool.sequential([h], function(ssh, host) {
+                    var res = ssh.run(c, {raise: false});
+                    // Prefer stdout; fall back to stderr when stdout is blank.
+                    var text = trim(res.stdout ?: "");
+                    if (!len(text)) text = trim(res.stderr ?: "");
+                    // Neither stream produced text — nothing to report here.
+                    if (!len(text)) return;
+                    // Drop carriage returns so splitting on LF gives clean lines.
+                    text = replace(text, chr(13), "", "all");
+                    for (var line in listToArray(text, chr(10))) {
+                        arrayAppend(ctx.lines, "[" & host & "] " & line);
+                    }
+                });
+            } catch (any e) {
+                // Record the host and the exception message, then continue
+                // with the remaining hosts.
+                arrayAppend(ctx.failed, {host: h, message: e.message});
             }
-        });
-        return ctx.lines;
+        }
+        return ctx;
     }
 
     private string function $currentUser() {
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	+- `wheels deploy` lock acquisition is now all-or-nothing across the fleet: the lock is acquired on every (deduped) host sequentially in config order with failures surfaced, already-acquired locks are rolled back on a partial failure (`Wheels.Deploy.LockAcquireFailed` names the contended host; the contended host's own lock is never touched), and release fans out to every acquired host. Previously the first-success-wins `onAny` dispatch swallowed contention on one host and silently re-acquired on another, so concurrent deploys were only mutually excluded on single-host configs — and release could target a different host than acquire, stranding stale locks. The manual `wheels deploy lock acquire/release/status` verbs follow the same fleet-wide semantics (#2957)
	`2`	+- Deploy lock metadata now actually expands `$(hostname)` and `$(date --iso-8601=seconds)` on the remote: the symlink target double-quotes the substitution segment while keeping the user and message inert via `shellEscape` single-quoting — previously the whole target was single-quoted, which suppressed command substitution and recorded the literal `$(hostname)` text (#2957)