diff --git a/changelog.d/deploy-lock-correctness.fixed.md b/changelog.d/deploy-lock-correctness.fixed.md new file mode 100644 index 000000000..01c7fb2a7 --- /dev/null +++ b/changelog.d/deploy-lock-correctness.fixed.md @@ -0,0 +1,2 @@ +- `wheels deploy` lock acquisition is now all-or-nothing across the fleet: the lock is acquired on every (deduped) host sequentially in config order with failures surfaced, already-acquired locks are rolled back on a partial failure (`Wheels.Deploy.LockAcquireFailed` names the contended host; the contended host's own lock is never touched), and release fans out to every acquired host. Previously the first-success-wins `onAny` dispatch swallowed contention on one host and silently re-acquired on another, so concurrent deploys were only mutually excluded on single-host configs — and release could target a different host than acquire, stranding stale locks. The manual `wheels deploy lock acquire/release/status` verbs follow the same fleet-wide semantics (#2957) +- Deploy lock metadata now actually expands `$(hostname)` and `$(date --iso-8601=seconds)` on the remote: the symlink target double-quotes the substitution segment while keeping the user and message inert via `shellEscape` single-quoting — previously the whole target was single-quoted, which suppressed command substitution and recorded the literal `$(hostname)` text (#2957) diff --git a/cli/lucli/services/deploy/cli/DeployLockCli.cfc b/cli/lucli/services/deploy/cli/DeployLockCli.cfc index 61a8a0b76..b24f41acc 100644 --- a/cli/lucli/services/deploy/cli/DeployLockCli.cfc +++ b/cli/lucli/services/deploy/cli/DeployLockCli.cfc @@ -26,7 +26,10 @@ component { user: $currentUser(), message: arguments.opts.message ?: "manual acquire" }); - $dispatchAny($allHosts(cfg), cmd, dryRun); + // The deploy flow holds the lock on EVERY host (##2957 DEP-1), so a + // manual acquire must match — locking a single host would let a + // concurrent deploy that probes another host proceed. + $acquireLockAllOrNothing($uniqueHosts($allHosts(cfg)), cmd, lock, dryRun); return $renderResult(arguments.opts, "Acquired deploy lock for " & cfg.service()); } @@ -37,8 +40,17 @@ component { var lock = new modules.wheels.services.deploy.commands.LockCommands(cfg); // rm -f is idempotent; surfacing a failure here only obscures the // operator's intent ("clear the lock if it's there"). #2696. - $dispatchAny($allHosts(cfg), lock.release(), dryRun, true); - return $renderResult(arguments.opts, "Released deploy lock for " & cfg.service()); + // Fan out to every host — the lock lives fleet-wide (##2957 DEP-1), + // so clearing one host would strand stale locks on the rest — but + // per host, best-effort: fleet-wide stale locks most plausibly + // exist BECAUSE a host died mid-deploy, so the recovery path must + // keep working around an unreachable host instead of aborting on it. + var failed = $dispatchPerHostTolerant($uniqueHosts($allHosts(cfg)), lock.release(), dryRun); + return $renderResult( + arguments.opts, + "Released deploy lock for " & cfg.service() + & $skippedHostsSuffix(failed, "the lock was NOT released there; re-run 'wheels deploy lock release' when the host is back") + ); } public string function status(required struct opts) { @@ -48,12 +60,16 @@ component { var lock = new modules.wheels.services.deploy.commands.LockCommands(cfg); // readlink exits nonzero when the lock file is missing — which is // exactly what the operator wants to learn from `status`. Treat that - // as advisory output, not a thrown error. #2696. - // #2957 DEP-6a: surface readlink's output (the lock holder on stdout, - // or the "No such file" diagnostic on stderr) instead of dropping it. - var lines = $dispatchAnyCollect($allHosts(cfg), lock.status(), dryRun, true); + // as advisory output, not a thrown error. #2696. Checked on every + // host since the lock lives fleet-wide (##2957 DEP-1) — and the same + // advisory contract covers an unreachable host: report it, don't throw. + // #2957 DEP-6a: also surface readlink's output (the lock holder on + // stdout, or the "No such file" diagnostic on stderr) per host instead + // of dropping it. + var collected = $collectPerHostTolerant($uniqueHosts($allHosts(cfg)), lock.status(), dryRun); var summary = "Checked deploy lock status for " & cfg.service(); - if (arrayLen(lines)) summary &= chr(10) & arrayToList(lines, chr(10)); + if (arrayLen(collected.lines)) summary &= chr(10) & arrayToList(collected.lines, chr(10)); + summary &= $skippedHostsSuffix(collected.failed, "the lock state there is unknown"); return $renderResult(arguments.opts, summary); } @@ -77,47 +93,181 @@ component { return out; } - private void function $dispatchAny(required array hosts, required string cmd, required boolean dryRun, boolean allowFail = false) { + /** + * All-or-nothing lock acquisition across every host, in config order, + * with rollback of already-acquired locks on the first failure. + * + * MIRROR: DeployMainCli.$acquireLockAllOrNothing is the deploy-flow + * twin of this contract (##2957 DEP-1) — keep them in lockstep. + */ + private void function $acquireLockAllOrNothing( + required array hosts, + required string acquireCmd, + required any lock, + required boolean dryRun + ) { if (arguments.dryRun) { - if (arrayLen(arguments.hosts)) { - arrayAppend(variables.dryRunBuffer, "[" & arguments.hosts[1] & "] " & arguments.cmd); + for (var h in arguments.hosts) { + arrayAppend(variables.dryRunBuffer, "[" & h & "] " & arguments.acquireCmd); } return; } - // Lock ops target just one host (the lock file lives on one path; any host works). - // #2696: acquire stays strict (contention should surface); release/status tolerate. - var c = arguments.cmd; - var doRaise = !arguments.allowFail; - variables.sshPool.onAny(arguments.hosts, function(ssh, host) { ssh.run(c, {raise: doRaise}); }); + var c = arguments.acquireCmd; + // Shared struct so the callback can record progress — closures can't + // reliably mutate outer scalars across engines (anti-pattern ##10). + var state = {acquired: [], lastHost: ""}; + try { + variables.sshPool.sequential(arguments.hosts, function(ssh, host) { + state.lastHost = host; + ssh.run(c, {raise: true}); + arrayAppend(state.acquired, host); + }); + } catch (any e) { + $rollbackAcquiredLocks(state.acquired, arguments.lock); + throw( + type = "Wheels.Deploy.LockAcquireFailed", + message = "Could not acquire the deploy lock on " & state.lastHost + & " — another deploy may hold it. Rolled back " + & arrayLen(state.acquired) & " already-acquired lock(s). " + & "Inspect with 'wheels deploy lock status'; clear a stale lock with " + & "'wheels deploy lock release'. Cause: " & e.message, + detail = e.detail ?: "" + ); + } + } + + /** + * Best-effort release of the locks a partially-failed acquire already + * placed. A rollback failure must never shadow the LockAcquireFailed + * the caller is about to throw. Host-granular: one unreachable host + * must not stop the rollback from clearing the remaining healthy hosts. + */ + private void function $rollbackAcquiredLocks(required array hosts, required any lock) { + if (!arrayLen(arguments.hosts)) return; + // $dispatchPerHostTolerant never throws — per-host failures are + // swallowed deliberately; the acquire error is the one the operator + // needs to see. + $dispatchPerHostTolerant(arguments.hosts, arguments.lock.release(), false); + } + + /** Order-preserving dedupe — a host serving several roles appears once. */ + private array function $uniqueHosts(required array hosts) { + var seen = {}; + var out = []; + for (var h in arguments.hosts) { + if (!structKeyExists(seen, h)) { + seen[h] = true; + arrayAppend(out, h); + } + } + return out; } /** - * Like $dispatchAny, but returns the remote output host-prefixed - * (`[host] line`). Stdout wins; stderr is the fallback so tolerated - * failures (allowFail) still surface their diagnostic. #2957 DEP-6a. + * Per-host best-effort dispatch. allowFail-style onEach is NOT enough + * for tolerant fan-out: the real SshPool.onEach pre-resolves a + * connection for EVERY host before submitting any task, so a single + * unreachable host throws before the command runs anywhere, and a + * transport failure inside a task (dead cached connection) is rethrown + * from future.get() regardless of {raise: false}. Dispatching each host + * in its own sequential([host]) call with a per-host try/catch confines + * every failure mode — connect and transport alike — to its host. + * + * @return array of {host, message} structs for hosts that failed. + * + * MIRROR: DeployMainCli.$dispatchPerHostTolerant is the deploy-flow + * twin of this helper — keep them in lockstep. */ - private array function $dispatchAnyCollect(required array hosts, required string cmd, required boolean dryRun, boolean allowFail = false) { + private array function $dispatchPerHostTolerant( + required array hosts, + required string cmd, + required boolean dryRun + ) { if (arguments.dryRun) { - if (arrayLen(arguments.hosts)) { - arrayAppend(variables.dryRunBuffer, "[" & arguments.hosts[1] & "] " & arguments.cmd); + for (var h in arguments.hosts) { + arrayAppend(variables.dryRunBuffer, "[" & h & "] " & arguments.cmd); } return []; } var c = arguments.cmd; - var doRaise = !arguments.allowFail; + var failed = []; + for (var h in arguments.hosts) { + try { + variables.sshPool.sequential([h], function(ssh, host) { + ssh.run(c, {raise: false}); + }); + } catch (any e) { + arrayAppend(failed, {host: h, message: e.message}); + } + } + return failed; + } + + /** + * Render the unreachable-host warning appended to a verb's summary. + * Empty string when nothing failed. + */ + private string function $skippedHostsSuffix(required array failed, required string consequence) { + if (!arrayLen(arguments.failed)) return ""; + var parts = []; + for (var f in arguments.failed) { + arrayAppend(parts, f.host & " (" & f.message & ")"); + } + return chr(10) & "WARNING: skipped " & arrayLen(arguments.failed) + & " unreachable host(s): " & arrayToList(parts, "; ") + & " — " & arguments.consequence & "."; + } + + /** + * Per-host best-effort dispatch that ALSO collects the remote output, + * host-prefixed (`[host] line`). Combines two ##2957 contracts that the + * `lock status` verb needs at once: + * - DEP-1: read the lock on EVERY host (the lock lives fleet-wide), and + * tolerate an unreachable host — report it, don't throw. Each host + * runs in its own sequential([host]) with a per-host try/catch so a + * dead connect or a dead cached session is confined to that host (see + * $dispatchPerHostTolerant for why allowFail-style onEach/onAny is not + * enough). + * - DEP-6a: surface what readlink actually said. Stdout wins (the lock + * holder); stderr is the fallback so the "No such file" diagnostic on + * an unheld lock still reaches the operator. The command is run with + * {raise: false} so a nonzero exit (no lock held) is advisory output, + * not a thrown error. + * + * @return struct {lines: array of "[host] line", failed: array of + * {host, message} for hosts that were unreachable}. + */ + private struct function $collectPerHostTolerant( + required array hosts, + required string cmd, + required boolean dryRun + ) { + if (arguments.dryRun) { + for (var h in arguments.hosts) { + arrayAppend(variables.dryRunBuffer, "[" & h & "] " & arguments.cmd); + } + return {lines: [], failed: []}; + } + var c = arguments.cmd; // Closures can't write outer locals reliably — collect via a shared struct. - var ctx = {lines: []}; - variables.sshPool.onAny(arguments.hosts, function(ssh, host) { - var res = ssh.run(c, {raise: doRaise}); - var text = trim(res.stdout ?: ""); - if (!len(text)) text = trim(res.stderr ?: ""); - if (!len(text)) return; - text = replace(text, chr(13), "", "all"); - for (var line in listToArray(text, chr(10))) { - arrayAppend(ctx.lines, "[" & host & "] " & line); + var ctx = {lines: [], failed: []}; + for (var h in arguments.hosts) { + try { + variables.sshPool.sequential([h], function(ssh, host) { + var res = ssh.run(c, {raise: false}); + var text = trim(res.stdout ?: ""); + if (!len(text)) text = trim(res.stderr ?: ""); + if (!len(text)) return; + text = replace(text, chr(13), "", "all"); + for (var line in listToArray(text, chr(10))) { + arrayAppend(ctx.lines, "[" & host & "] " & line); + } + }); + } catch (any e) { + arrayAppend(ctx.failed, {host: h, message: e.message}); } - }); - return ctx.lines; + } + return ctx; } private string function $currentUser() { diff --git a/cli/lucli/services/deploy/cli/DeployMainCli.cfc b/cli/lucli/services/deploy/cli/DeployMainCli.cfc index 77c0628ef..17e1f5356 100644 --- a/cli/lucli/services/deploy/cli/DeployMainCli.cfc +++ b/cli/lucli/services/deploy/cli/DeployMainCli.cfc @@ -107,10 +107,16 @@ component { $fireHook(hooks, "pre-deploy", hookEnv, dryRun); + // The lock lives on EVERY host (deduped — a host serving two roles + // must not contend with itself) so a concurrent deploy collides with + // it no matter which host it probes (##2957 DEP-1). + var lockHosts = $uniqueHosts(hosts); + try { - $dispatchAny( - hosts, + $acquireLockAllOrNothing( + lockHosts, lock.acquire({user: $currentUser(), message: "deploy " & ver}), + lock, dryRun ); @@ -162,10 +168,16 @@ component { $dispatch(hosts, auditor.record("completed deploy of version " & ver), dryRun, true); } finally { // Tolerate release failures so they can never shadow the - // original deploy exception inside this finally block. rm -f - // is idempotent; if it genuinely fails on a remote, the deploy - // already has a real error to surface from the try body. - $dispatchAny(hosts, lock.release(), dryRun, true); + // original deploy exception inside this finally block — and + // "tolerate" must cover transport failures, not just exit + // codes: allowFail only maps to {raise: false}, while a host + // that died mid-deploy (the correlated case) makes the + // release's startSession throw out of onEach and REPLACE the + // in-flight deploy error. Per-host best-effort dispatch + // confines each failure to its host, so every healthy host + // is still released (##2957 DEP-1b). rm -f is idempotent; + // skipped hosts are logged for manual cleanup. + $logSkippedLockReleases($dispatchPerHostTolerant(lockHosts, lock.release(), dryRun)); } hookEnv.KAMAL_RUNTIME = int((getTickCount() - deployStart) / 1000); @@ -545,6 +557,143 @@ component { }); } + /** + * All-or-nothing deploy-lock acquisition (##2957 DEP-1). + * + * The lock only provides mutual exclusion if a concurrent deploy is + * guaranteed to collide with it. The old $dispatchAny acquire was + * first-success-wins: contention on host 1 was swallowed by + * SshPool.onAny and a fresh lock was acquired on host 2, so two deploys + * could run side by side on any multi-host fleet. Instead: acquire on + * EVERY host, in deterministic (config) order, sequentially — two + * concurrent deploys probe hosts in the same order, so exactly one wins + * the first host and the other aborts there. On a partial failure, roll + * back ONLY the locks already acquired (the contended host's lock + * belongs to the other deploy) and surface the per-host error. + * + * MIRROR: DeployLockCli.$acquireLockAllOrNothing implements the same + * contract for the manual lock verbs — keep them in lockstep. + */ + private void function $acquireLockAllOrNothing( + required array hosts, + required string acquireCmd, + required any lock, + required boolean dryRun + ) { + if (arguments.dryRun) { + for (var h in arguments.hosts) { + arrayAppend(variables.dryRunBuffer, "[" & h & "] " & arguments.acquireCmd); + } + return; + } + var c = arguments.acquireCmd; + // Shared struct so the callback can record progress — closures can't + // reliably mutate outer scalars across engines (anti-pattern ##10). + var state = {acquired: [], lastHost: ""}; + try { + variables.sshPool.sequential(arguments.hosts, function(ssh, host) { + state.lastHost = host; + ssh.run(c, {raise: true}); + arrayAppend(state.acquired, host); + }); + } catch (any e) { + $rollbackAcquiredLocks(state.acquired, arguments.lock); + throw( + type = "Wheels.Deploy.LockAcquireFailed", + message = "Could not acquire the deploy lock on " & state.lastHost + & " — another deploy may hold it. Rolled back " + & arrayLen(state.acquired) & " already-acquired lock(s). " + & "Inspect with 'wheels deploy lock status'; clear a stale lock with " + & "'wheels deploy lock release'. Cause: " & e.message, + detail = e.detail ?: "" + ); + } + } + + /** + * Best-effort release of the locks a partially-failed acquire already + * placed. A rollback failure must never shadow the LockAcquireFailed + * the caller is about to throw. Host-granular: one unreachable host + * must not stop the rollback from clearing the remaining healthy hosts. + */ + private void function $rollbackAcquiredLocks(required array hosts, required any lock) { + if (!arrayLen(arguments.hosts)) return; + // $dispatchPerHostTolerant never throws — per-host failures are + // swallowed deliberately; the acquire error is the one the operator + // needs to see. Stale locks are recoverable via + // `wheels deploy lock release`. + $dispatchPerHostTolerant(arguments.hosts, arguments.lock.release(), false); + } + + /** + * Per-host best-effort dispatch. allowFail-style onEach is NOT enough + * for tolerant fan-out: the real SshPool.onEach pre-resolves a + * connection for EVERY host before submitting any task, so a single + * unreachable host throws before the command runs anywhere, and a + * transport failure inside a task (dead cached connection) is rethrown + * from future.get() regardless of {raise: false}. Dispatching each host + * in its own sequential([host]) call with a per-host try/catch confines + * every failure mode — connect and transport alike — to its host. + * + * @return array of {host, message} structs for hosts that failed. + * + * MIRROR: DeployLockCli.$dispatchPerHostTolerant is the manual-verb + * twin of this helper — keep them in lockstep. + */ + private array function $dispatchPerHostTolerant( + required array hosts, + required string cmd, + required boolean dryRun + ) { + if (arguments.dryRun) { + for (var h in arguments.hosts) { + arrayAppend(variables.dryRunBuffer, "[" & h & "] " & arguments.cmd); + } + return []; + } + var c = arguments.cmd; + var failed = []; + for (var h in arguments.hosts) { + try { + variables.sshPool.sequential([h], function(ssh, host) { + ssh.run(c, {raise: false}); + }); + } catch (any e) { + arrayAppend(failed, {host: h, message: e.message}); + } + } + return failed; + } + + /** + * Surface (but never throw) the hosts a best-effort lock release could + * not reach. Mirrors the post-deploy-failure hook logging (#3087): + * visibility without shadowing whatever error is already in flight. + * Lives in a helper because loops inside `finally` miscompile on + * Lucee 7 (cross-engine invariant 12). + */ + private void function $logSkippedLockReleases(required array failed) { + for (var f in arguments.failed) { + writeOutput( + "[lock:release] " & f.host & ": " & f.message + & " (ignored — run 'wheels deploy lock release' when the host is back)" & chr(10) + ); + } + } + + /** Order-preserving dedupe — a host serving several roles appears once. */ + private array function $uniqueHosts(required array hosts) { + var seen = {}; + var out = []; + for (var h in arguments.hosts) { + if (!structKeyExists(seen, h)) { + seen[h] = true; + arrayAppend(out, h); + } + } + return out; + } + /** * Resolved key→value map from the SecretResolver the loader built for * the most recent load(). Empty struct when no resolver exists (e.g. a @@ -645,7 +794,9 @@ component { /** * Dispatch a single command to "any one" host — used for operations - * that only need to happen once across the fleet (lock acquire/release). + * that only need to happen once across the fleet (proxy boot check). + * NOT suitable for the deploy lock: onAny swallows per-host failures, + * which is exactly the multi-host lock bypass fixed in ##2957 DEP-1. * FakeSshPool.onAny records exactly one call. */ private void function $dispatchAny( diff --git a/cli/lucli/services/deploy/commands/LockCommands.cfc b/cli/lucli/services/deploy/commands/LockCommands.cfc index 8bc3a7d38..48a199d5d 100644 --- a/cli/lucli/services/deploy/commands/LockCommands.cfc +++ b/cli/lucli/services/deploy/commands/LockCommands.cfc @@ -29,14 +29,17 @@ component extends="Base" { public string function acquire(required struct opts) { var user = arguments.opts.user ?: "unknown"; var message = arguments.opts.message ?: ""; - // $(hostname) and $(date ...) resolved by the remote shell; user and - // message are escaped by surrounding the whole target in single - // quotes so shell metacharacters don't blow up the ln command. - // Inner single quotes in either value are closed and re-opened - // ( "'\''"). - var safeUser = replace(user, "'", "'\''", "all"); - var safeMessage = replace(message, "'", "'\''", "all"); - var target = "'" & safeUser & "@$(hostname)/$(date --iso-8601=seconds)/" & safeMessage & "'"; + // The symlink target is three concatenated shell words: the single- + // quoted (inert) user, a double-quoted middle segment whose + // $(hostname) and $(date ...) ARE resolved by the remote shell, and + // the single-quoted (inert) message. Adjacent quoted segments join + // into one argument, so metacharacters in user/message can't execute + // while the metadata substitutions still expand. Previously the whole + // target was single-quoted, which suppressed command substitution and + // recorded the literal "$(hostname)" text (##2957 DEP-10). + var target = shellEscape(user) + & """@$(hostname)/$(date --iso-8601=seconds)/""" + & shellEscape(message); return "ln -s " & target & " " & lockPath(); } diff --git a/cli/lucli/services/deploy/lib/FakeSshPool.cfc b/cli/lucli/services/deploy/lib/FakeSshPool.cfc index d5fd46d91..4deec523c 100644 --- a/cli/lucli/services/deploy/lib/FakeSshPool.cfc +++ b/cli/lucli/services/deploy/lib/FakeSshPool.cfc @@ -14,31 +14,93 @@ component { variables.strict = arguments.opts.strict ?: false; variables.calls = []; variables.expectations = {}; + variables.connectFailures = {}; return this; } + /** + * Scripted result for a (host, cmd) pair. Beyond the usual + * {exitCode, stdout, stderr} shape, a result containing a + * `transportError` key makes the fake's run() THROW that message + * regardless of opts.raise — modeling a dead cached connection whose + * startSession() fails inside the real SshClient.run (a transport + * failure is a thrown Java exception, never an exit code, so + * {raise: false} cannot suppress it). + */ public void function expect(required string host, required string cmd, required struct result) { variables.expectations["#arguments.host#|#arguments.cmd#"] = arguments.result; } + /** + * Script a transport-level connection failure for a host — models the + * real SshPool.getConnection, whose eager SshClient.init connect throws + * on an unreachable host. Mirrored semantics per entry point: + * - onEach pre-resolves EVERY host before running any task, so one + * dead host aborts the whole fan-out with zero commands executed; + * - sequential resolves lazily, so earlier hosts have already run; + * - onAny catches per host and tries the next. + */ + public void function failConnection(required string host, string message = "") { + variables.connectFailures[arguments.host] = len(arguments.message) + ? arguments.message + : "Connection refused: " & arguments.host & " (scripted transport failure)"; + } + public array function calls() { return variables.calls; } public void function reset() { arrayClear(variables.calls); } public void function onEach(required array hosts, required any callback) { + // Mirror real SshPool.onEach: connections are pre-resolved for EVERY + // host on the submitting thread before any task runs, so a single + // unreachable host aborts the whole fan-out before any command + // executes anywhere. + for (var host in arguments.hosts) { + $resolveOrThrow(host); + } for (var host in arguments.hosts) { var ssh = $makeFakeSsh(host); arguments.callback(ssh, host); } } - public void function onAny(required array hosts, required any callback) { + public any function onAny(required array hosts, required any callback) { + // Mirror real SshPool.onAny: serial, first success wins, per-host + // failures swallowed, last error rethrown if every host fails. if (arrayLen(arguments.hosts) == 0) return; - var ssh = $makeFakeSsh(arguments.hosts[1]); - arguments.callback(ssh, arguments.hosts[1]); + var state = {lastError: "", haveError: false}; + for (var host in arguments.hosts) { + try { + $resolveOrThrow(host); + var ssh = $makeFakeSsh(host); + return arguments.callback(ssh, host); + } catch (any e) { + state.lastError = e; + state.haveError = true; + } + } + if (state.haveError) { + throw(object = state.lastError); + } } public void function sequential(required array hosts, required any callback) { - onEach(arguments.hosts, arguments.callback); + // Mirror real SshPool.sequential: connections resolve lazily per + // host, in order — earlier hosts have already executed when a later + // host's connect fails. + for (var host in arguments.hosts) { + $resolveOrThrow(host); + var ssh = $makeFakeSsh(host); + arguments.callback(ssh, host); + } + } + + private void function $resolveOrThrow(required string host) { + if (structKeyExists(variables.connectFailures, arguments.host)) { + throw( + type = "FakeSshPool.ConnectionFailure", + message = variables.connectFailures[arguments.host] + ); + } } // Closure accessors — closures can't reach variables scope directly on Adobe. @@ -93,6 +155,15 @@ component { } else { result = {exitCode: 0, stdout: "", stderr: "", durationMs: 0}; } + // A scripted transport failure throws BEFORE the raise check — + // the real SshClient.run dies in startSession() on a dead + // cached connection, so {raise: false} can never suppress it. + if (structKeyExists(result, "transportError")) { + throw( + type = "FakeSshPool.TransportFailure", + message = result.transportError + ); + } // Mirror SshClient.run's opts.raise contract — #2696. Tests // assert that the deploy dispatch layer surfaces nonzero exit // codes by opting into raise=true at the call site. diff --git a/cli/lucli/tests/_fixtures/deploy/configs/multi-host.yml b/cli/lucli/tests/_fixtures/deploy/configs/multi-host.yml new file mode 100644 index 000000000..516acf3a3 --- /dev/null +++ b/cli/lucli/tests/_fixtures/deploy/configs/multi-host.yml @@ -0,0 +1,9 @@ +service: demo +image: acme/demo +servers: + - 10.0.0.1 + - 10.0.0.2 +registry: + username: demo + password: + - REGISTRY_PASSWORD diff --git a/cli/lucli/tests/_fixtures/deploy/configs/shared-host.yml b/cli/lucli/tests/_fixtures/deploy/configs/shared-host.yml new file mode 100644 index 000000000..6f6b69466 --- /dev/null +++ b/cli/lucli/tests/_fixtures/deploy/configs/shared-host.yml @@ -0,0 +1,11 @@ +service: demo +image: acme/demo +servers: + web: + - 10.0.0.5 + job: + - 10.0.0.5 +registry: + username: demo + password: + - REGISTRY_PASSWORD diff --git a/cli/lucli/tests/specs/deploy/cli/DeployLockAcquisitionSpec.cfc b/cli/lucli/tests/specs/deploy/cli/DeployLockAcquisitionSpec.cfc new file mode 100644 index 000000000..b7c1ba73f --- /dev/null +++ b/cli/lucli/tests/specs/deploy/cli/DeployLockAcquisitionSpec.cfc @@ -0,0 +1,184 @@ +/** + * Regression suite for #2957 DEP-1 — the deploy lock was silently bypassable + * on any multi-host fleet. + * + * The old flow acquired the lock via $dispatchAny → SshPool.onAny + * (first-success-wins, every per-host exception swallowed): contention on + * host 1 raised, onAny caught it, acquired a fresh lock on host 2, and the + * concurrent deploy proceeded. Release was also $dispatchAny, so the released + * host could differ from the acquired host, stranding stale locks. + * + * The fixed contract, asserted here via FakeSshPool: + * - acquire on EVERY (deduped) host, sequentially, with raise=true; + * - on a partial failure, roll back ONLY the already-acquired locks (the + * contended host's lock belongs to the other deploy) and abort with + * Wheels.Deploy.LockAcquireFailed naming the failing host; + * - release fans out to every host the lock was acquired on. + */ +component extends="wheels.wheelstest.system.BaseSpec" { + + function beforeAll() { + variables.multiHostFixture = expandPath("/cli/lucli/tests/_fixtures/deploy/configs/multi-host.yml"); + variables.sharedHostFixture = expandPath("/cli/lucli/tests/_fixtures/deploy/configs/shared-host.yml"); + } + + function run() { + describe("deploy lock acquisition is all-or-nothing across the fleet (##2957 DEP-1)", () => { + + it("acquires the lock on every host before pulling and releases it on every host", () => { + var fake = new cli.lucli.services.deploy.lib.FakeSshPool(); + var dc = new cli.lucli.services.deploy.cli.DeployMainCli(fake); + dc.deploy({configPath: variables.multiHostFixture, version: "v1"}); + + var calls = fake.calls(); + var acquireHosts = []; + var releaseHosts = []; + var lastAcquireIdx = 0; + var firstPullIdx = 0; + var firstReleaseIdx = 0; + for (var i = 1; i <= arrayLen(calls); i++) { + var cmd = calls[i].cmd ?: ""; + if (findNoCase("ln -s ", cmd) && findNoCase("kamal_deploy_lock_demo", cmd)) { + arrayAppend(acquireHosts, calls[i].host); + lastAcquireIdx = i; + } + if (findNoCase("rm -f ", cmd) && findNoCase("kamal_deploy_lock_demo", cmd)) { + arrayAppend(releaseHosts, calls[i].host); + if (!firstReleaseIdx) firstReleaseIdx = i; + } + if (!firstPullIdx && findNoCase("docker pull", cmd)) firstPullIdx = i; + } + + // Lock lives on EVERY host so a concurrent deploy collides no + // matter which host it tries first. + expect(acquireHosts).toBe(["10.0.0.1", "10.0.0.2"]); + // Release targets the same hosts the lock was acquired on (DEP-1b). + expect(releaseHosts).toBe(["10.0.0.1", "10.0.0.2"]); + // Bracketing: every acquire precedes the pull; release follows it. + expect(firstPullIdx).toBeGT(lastAcquireIdx); + expect(firstReleaseIdx).toBeGT(firstPullIdx); + }); + + it("rolls back already-acquired locks and aborts when a later host's acquire fails", () => { + var fake = new cli.lucli.services.deploy.lib.FakeSshPool(); + var cfg = new cli.lucli.services.deploy.config.ConfigLoader().load(variables.multiHostFixture); + var lockCmds = new cli.lucli.services.deploy.commands.LockCommands(cfg); + var acquireCmd = lockCmds.acquire({user: $envUser(), message: "deploy v1"}); + // Host 2 is contended — another deploy holds the lock there. + fake.expect("10.0.0.2", acquireCmd, { + exitCode: 1, + stdout: "", + stderr: "ln: failed to create symbolic link '/tmp/kamal_deploy_lock_demo': File exists" + }); + + var dc = new cli.lucli.services.deploy.cli.DeployMainCli(fake); + var state = {threw = false, errType = "", errMsg = "", errDetail = ""}; + try { + dc.deploy({configPath: variables.multiHostFixture, version: "v1"}); + } catch (any e) { + state.threw = true; + state.errType = e.type; + state.errMsg = e.message; + state.errDetail = e.detail ?: ""; + } + + expect(state.threw).toBeTrue(); + expect(state.errType).toBe("Wheels.Deploy.LockAcquireFailed"); + // The per-host error is surfaced, not swallowed. + expect(state.errMsg).toInclude("10.0.0.2"); + expect(state.errDetail).toInclude("File exists"); + + // The deploy body never started. + var sawPull = false; + var releaseHosts = []; + for (var c in fake.calls()) { + var cmd = c.cmd ?: ""; + if (findNoCase("docker pull", cmd)) sawPull = true; + if (findNoCase("rm -f ", cmd) && findNoCase("kamal_deploy_lock_demo", cmd)) { + arrayAppend(releaseHosts, c.host); + } + } + expect(sawPull).toBeFalse(); + // Rollback released ONLY the lock we acquired on host 1 — the + // contended lock on host 2 belongs to the other deploy and + // must never be removed. + expect(releaseHosts).toBe(["10.0.0.1"]); + }); + + it("a contended first host aborts with no rollback releases and no deploy body", () => { + var fake = new cli.lucli.services.deploy.lib.FakeSshPool(); + var cfg = new cli.lucli.services.deploy.config.ConfigLoader().load(variables.multiHostFixture); + var lockCmds = new cli.lucli.services.deploy.commands.LockCommands(cfg); + var acquireCmd = lockCmds.acquire({user: $envUser(), message: "deploy v1"}); + fake.expect("10.0.0.1", acquireCmd, { + exitCode: 1, + stdout: "", + stderr: "ln: failed to create symbolic link '/tmp/kamal_deploy_lock_demo': File exists" + }); + + var dc = new cli.lucli.services.deploy.cli.DeployMainCli(fake); + var state = {threw = false, errType = ""}; + try { + dc.deploy({configPath: variables.multiHostFixture, version: "v1"}); + } catch (any e) { + state.threw = true; + state.errType = e.type; + } + expect(state.threw).toBeTrue(); + expect(state.errType).toBe("Wheels.Deploy.LockAcquireFailed"); + + // Nothing was acquired, so nothing may be released — and the + // acquire on host 2 must never have been attempted. + for (var c in fake.calls()) { + var cmd = c.cmd ?: ""; + expect(findNoCase("rm -f ", cmd)).toBe(0); + expect(findNoCase("docker pull", cmd)).toBe(0); + if (findNoCase("ln -s ", cmd)) { + expect(c.host).toBe("10.0.0.1"); + } + } + }); + + it("locks a host serving multiple roles only once (no self-contention)", () => { + var fake = new cli.lucli.services.deploy.lib.FakeSshPool(); + var dc = new cli.lucli.services.deploy.cli.DeployMainCli(fake); + dc.deploy({configPath: variables.sharedHostFixture, version: "v1"}); + + var acquires = []; + for (var c in fake.calls()) { + var cmd = c.cmd ?: ""; + if (findNoCase("ln -s ", cmd) && findNoCase("kamal_deploy_lock_demo", cmd)) { + arrayAppend(acquires, c.host); + } + } + // 10.0.0.5 serves both the web and job roles; acquiring twice + // would fail the second ln -s and deadlock the deploy on itself. + expect(acquires).toBe(["10.0.0.5"]); + }); + + it("dry-run renders the lock acquire per host without touching the pool", () => { + var fake = new cli.lucli.services.deploy.lib.FakeSshPool(); + var dc = new cli.lucli.services.deploy.cli.DeployMainCli(fake); + var out = dc.deploy({ + configPath: variables.multiHostFixture, + version: "v1", + dryRun: true + }); + expect(arrayLen(fake.calls())).toBe(0); + expect(out).toInclude("[10.0.0.1] ln -s"); + expect(out).toInclude("[10.0.0.2] ln -s"); + expect(out).toInclude("[10.0.0.1] rm -f"); + expect(out).toInclude("[10.0.0.2] rm -f"); + }); + }); + } + + // Mirrors DeployMainCli.$currentUser so FakeSshPool expectations match + // the exact acquire command the deploy flow builds. + private string function $envUser() { + var sys = createObject("java", "java.lang.System"); + var user = sys.getenv("USER"); + if (isNull(user) || !len(user)) user = "unknown"; + return user; + } +} diff --git a/cli/lucli/tests/specs/deploy/cli/DeployLockCliSpec.cfc b/cli/lucli/tests/specs/deploy/cli/DeployLockCliSpec.cfc index 987ac7cac..b65475e2b 100644 --- a/cli/lucli/tests/specs/deploy/cli/DeployLockCliSpec.cfc +++ b/cli/lucli/tests/specs/deploy/cli/DeployLockCliSpec.cfc @@ -2,6 +2,7 @@ component extends="wheels.wheelstest.system.BaseSpec" { function beforeAll() { variables.fixture = expandPath("/cli/lucli/tests/_fixtures/deploy/configs/minimal.yml"); + variables.multiHostFixture = expandPath("/cli/lucli/tests/_fixtures/deploy/configs/multi-host.yml"); } function run() { @@ -39,6 +40,142 @@ component extends="wheels.wheelstest.system.BaseSpec" { expect(fake.calls()[1].cmd).toInclude("readlink /tmp/kamal_deploy_lock_demo"); }); + // Regression suite for #2957 DEP-1 — the manual lock verbs used the + // same first-success-wins $dispatchAny as the deploy flow, so a + // manual acquire could land on a different host than a concurrent + // deploy probed, and a manual release could clear only one host of + // a fleet-wide lock, stranding the rest. + + it("acquire dispatches to every host in order (##2957 DEP-1)", () => { + var fake = new cli.lucli.services.deploy.lib.FakeSshPool(); + var cli = new cli.lucli.services.deploy.cli.DeployLockCli(fake); + cli.acquire({configPath: variables.multiHostFixture, message: "surgery"}); + var hosts = []; + for (var c in fake.calls()) { + expect(c.cmd).toInclude("ln -s"); + arrayAppend(hosts, c.host); + } + expect(hosts).toBe(["10.0.0.1", "10.0.0.2"]); + }); + + it("acquire rolls back already-acquired locks when a later host is contended (##2957 DEP-1)", () => { + var fake = new cli.lucli.services.deploy.lib.FakeSshPool(); + var cfg = new cli.lucli.services.deploy.config.ConfigLoader().load(variables.multiHostFixture); + var lockCmds = new cli.lucli.services.deploy.commands.LockCommands(cfg); + var acquireCmd = lockCmds.acquire({user: $envUser(), message: "surgery"}); + fake.expect("10.0.0.2", acquireCmd, { + exitCode: 1, + stdout: "", + stderr: "ln: failed to create symbolic link '/tmp/kamal_deploy_lock_demo': File exists" + }); + + var cli = new cli.lucli.services.deploy.cli.DeployLockCli(fake); + var state = {threw = false, errType = "", errMsg = ""}; + try { + cli.acquire({configPath: variables.multiHostFixture, message: "surgery"}); + } catch (any e) { + state.threw = true; + state.errType = e.type; + state.errMsg = e.message; + } + expect(state.threw).toBeTrue(); + expect(state.errType).toBe("Wheels.Deploy.LockAcquireFailed"); + expect(state.errMsg).toInclude("10.0.0.2"); + + var releaseHosts = []; + for (var c in fake.calls()) { + if (findNoCase("rm -f ", c.cmd ?: "")) arrayAppend(releaseHosts, c.host); + } + // Only the acquired host is rolled back — the contended lock + // on host 2 belongs to someone else. + expect(releaseHosts).toBe(["10.0.0.1"]); + }); + + it("release fans out to every host (##2957 DEP-1)", () => { + var fake = new cli.lucli.services.deploy.lib.FakeSshPool(); + var cli = new cli.lucli.services.deploy.cli.DeployLockCli(fake); + cli.release({configPath: variables.multiHostFixture}); + var hosts = []; + for (var c in fake.calls()) { + expect(c.cmd).toInclude("rm -f /tmp/kamal_deploy_lock_demo"); + arrayAppend(hosts, c.host); + } + expect(hosts).toBe(["10.0.0.1", "10.0.0.2"]); + }); + + it("status fans out to every host (##2957 DEP-1)", () => { + var fake = new cli.lucli.services.deploy.lib.FakeSshPool(); + var cli = new cli.lucli.services.deploy.cli.DeployLockCli(fake); + cli.status({configPath: variables.multiHostFixture}); + var hosts = []; + for (var c in fake.calls()) { + expect(c.cmd).toInclude("readlink /tmp/kamal_deploy_lock_demo"); + arrayAppend(hosts, c.host); + } + expect(hosts).toBe(["10.0.0.1", "10.0.0.2"]); + }); + + // Regression suite for the ##2957 review: release/status fanned out + // via SshPool.onEach, which pre-resolves a connection for EVERY + // host before running anything — one dead host aborted the whole + // verb with zero commands executed. That turned the prescribed + // stale-lock recovery path ("clear a stale lock with 'wheels + // deploy lock release'") into a dead-end exactly when a host died + // mid-deploy, the most plausible cause of fleet-wide stale locks. + + it("release skips an unreachable host and still clears the lock on the rest", () => { + var fake = new cli.lucli.services.deploy.lib.FakeSshPool(); + fake.failConnection("10.0.0.1", "Connection refused"); + var cli = new cli.lucli.services.deploy.cli.DeployLockCli(fake); + var out = cli.release({configPath: variables.multiHostFixture}); + var releaseHosts = []; + for (var c in fake.calls()) { + if (findNoCase("rm -f ", c.cmd ?: "")) arrayAppend(releaseHosts, c.host); + } + expect(releaseHosts).toBe(["10.0.0.2"]); + // The summary names the skipped host so the operator knows the + // release there is still pending. + expect(out).toInclude("Released deploy lock"); + expect(out).toInclude("WARNING"); + expect(out).toInclude("10.0.0.1"); + expect(out).toInclude("Connection refused"); + }); + + it("status reports an unreachable host as advisory output instead of throwing", () => { + var fake = new cli.lucli.services.deploy.lib.FakeSshPool(); + fake.failConnection("10.0.0.2"); + var cli = new cli.lucli.services.deploy.cli.DeployLockCli(fake); + var out = cli.status({configPath: variables.multiHostFixture}); + var statusHosts = []; + for (var c in fake.calls()) { + if (findNoCase("readlink ", c.cmd ?: "")) arrayAppend(statusHosts, c.host); + } + expect(statusHosts).toBe(["10.0.0.1"]); + expect(out).toInclude("Checked deploy lock status"); + expect(out).toInclude("WARNING"); + expect(out).toInclude("10.0.0.2"); + }); + + it("a transport-dead release still proceeds when the failure is a dead session, not a dead connect", () => { + var fake = new cli.lucli.services.deploy.lib.FakeSshPool(); + var cfg = new cli.lucli.services.deploy.config.ConfigLoader().load(variables.multiHostFixture); + var lockCmds = new cli.lucli.services.deploy.commands.LockCommands(cfg); + // Host 1's connection is cached-but-dead: the connect succeeds, + // the rm -f session throws a transport error despite raise=false. + fake.expect("10.0.0.1", lockCmds.release(), {transportError: "Broken pipe"}); + var cli = new cli.lucli.services.deploy.cli.DeployLockCli(fake); + var out = cli.release({configPath: variables.multiHostFixture}); + var releaseHosts = []; + for (var c in fake.calls()) { + if (findNoCase("rm -f ", c.cmd ?: "")) arrayAppend(releaseHosts, c.host); + } + // Host 1's release was attempted (recorded) before the session + // died; host 2's still went through. + expect(releaseHosts).toBe(["10.0.0.1", "10.0.0.2"]); + expect(out).toInclude("WARNING"); + expect(out).toInclude("10.0.0.1"); + }); + // Regression suite for #2957 (Wave 3, DEP-6a) — `lock status` // dropped the ssh.run() result, so the operator never saw who // holds the lock (readlink's stdout) in live mode. @@ -79,4 +216,13 @@ component extends="wheels.wheelstest.system.BaseSpec" { }); }); } + + // Mirrors DeployLockCli.$currentUser so FakeSshPool expectations match + // the exact acquire command the verb builds. + private string function $envUser() { + var sys = createObject("java", "java.lang.System"); + var user = sys.getenv("USER"); + if (isNull(user) || !len(user)) user = "unknown"; + return user; + } } diff --git a/cli/lucli/tests/specs/deploy/cli/DeployMainCliSpec.cfc b/cli/lucli/tests/specs/deploy/cli/DeployMainCliSpec.cfc index 0df29f0d4..80ba88069 100644 --- a/cli/lucli/tests/specs/deploy/cli/DeployMainCliSpec.cfc +++ b/cli/lucli/tests/specs/deploy/cli/DeployMainCliSpec.cfc @@ -3,6 +3,7 @@ component extends="wheels.wheelstest.system.BaseSpec" { function beforeAll() { variables.fixture = expandPath("/cli/lucli/tests/_fixtures/deploy/configs/minimal.yml"); variables.proxyFixture = expandPath("/cli/lucli/tests/_fixtures/deploy/configs/with-proxy.yml"); + variables.multiHostFixture = expandPath("/cli/lucli/tests/_fixtures/deploy/configs/multi-host.yml"); variables.multiRoleFixture = expandPath("/cli/lucli/tests/_fixtures/deploy/configs/full.yml"); variables.accessoriesFixture = expandPath("/cli/lucli/tests/_fixtures/deploy/configs/with-accessories.yml"); } @@ -588,6 +589,67 @@ component extends="wheels.wheelstest.system.BaseSpec" { } }); + // Regression suite for the ##2957 review: the ##2696 guard above only + // scripts an exit-code release failure, which {raise: false} already + // tolerates. A TRANSPORT failure (host died mid-deploy, the cached + // connection's startSession throws inside the release) used to + // propagate out of the finally block and REPLACE the in-flight + // deploy exception — the operator saw a connect error instead of + // the real deploy failure, and the post-deploy-failure hook got the + // wrong KAMAL_ERROR. + + it("a transport-dead lock release in the finally block does not mask the original deploy exception", () => { + var fake = new cli.lucli.services.deploy.lib.FakeSshPool(); + var cfg = new cli.lucli.services.deploy.config.ConfigLoader().load(variables.fixture); + var builder = new cli.lucli.services.deploy.commands.BuilderCommands(cfg); + var lockCmds = new cli.lucli.services.deploy.commands.LockCommands(cfg); + fake.expect("1.2.3.4", builder.pull("v1"), { + exitCode: 1, stdout: "", stderr: "manifest unknown" + }); + // The same host then drops off the network: the release in the + // finally dies in the SSH transport, not in an exit code. + fake.expect("1.2.3.4", lockCmds.release(), {transportError: "Broken pipe"}); + var dc = new cli.lucli.services.deploy.cli.DeployMainCli(fake); + var state = {threw = false, errType = "", errMsg = "", logged = ""}; + savecontent variable="state.logged" { + try { + dc.deploy({configPath: variables.fixture, version: "v1"}); + } catch (any e) { + state.threw = true; + state.errType = e.type; + state.errMsg = e.message; + } + } + expect(state.threw).toBeTrue(); + expect(state.errType).toBe("Wheels.Deploy.RemoteExecutionFailed"); + expect(state.errMsg).toInclude("docker pull"); + // ...and the skipped release is logged, not thrown. + expect(state.logged).toInclude("[lock:release]"); + expect(state.logged).toInclude("1.2.3.4"); + }); + + it("a transport-dead host during the finally lock release does not fail an otherwise-successful deploy and still releases the rest", () => { + var fake = new cli.lucli.services.deploy.lib.FakeSshPool(); + var cfg = new cli.lucli.services.deploy.config.ConfigLoader().load(variables.multiHostFixture); + var lockCmds = new cli.lucli.services.deploy.commands.LockCommands(cfg); + fake.expect("10.0.0.1", lockCmds.release(), {transportError: "Connection reset"}); + var dc = new cli.lucli.services.deploy.cli.DeployMainCli(fake); + var state = {out = "", logged = ""}; + savecontent variable="state.logged" { + state.out = dc.deploy({configPath: variables.multiHostFixture, version: "v1"}); + } + expect(state.out).toInclude("Deployed"); + // Host 2's lock was still released even though host 1 dropped. + var releaseHosts = []; + for (var c in fake.calls()) { + if (findNoCase("rm -f ", c.cmd ?: "") && findNoCase("kamal_deploy_lock_demo", c.cmd ?: "")) { + arrayAppend(releaseHosts, c.host); + } + } + expect(releaseHosts).toInclude("10.0.0.2"); + expect(state.logged).toInclude("[lock:release] 10.0.0.1"); + }); + // Regression suite for #3087 — a failing post-deploy-failure hook used to // throw DeployMainCli.HookFailed from inside the deploy catch block, // replacing the original deploy error. The hook fires on an already-failed diff --git a/cli/lucli/tests/specs/deploy/commands/LockCommandsSpec.cfc b/cli/lucli/tests/specs/deploy/commands/LockCommandsSpec.cfc index bb4852306..af973b253 100644 --- a/cli/lucli/tests/specs/deploy/commands/LockCommandsSpec.cfc +++ b/cli/lucli/tests/specs/deploy/commands/LockCommandsSpec.cfc @@ -49,6 +49,28 @@ component extends="wheels.wheelstest.system.BaseSpec" { expect(cmd).toInclude("ln -s"); expect(cmd).toInclude("/tmp/kamal_deploy_lock_demo"); }); + + // Regression suite for #2957 DEP-10 — the whole symlink target used + // to be single-quoted, which suppressed command substitution: lock + // metadata recorded the LITERAL "$(hostname)/$(date ...)" text. + // The fixed target concatenates three shell words — single-quoted + // user, double-quoted substitution segment, single-quoted message — + // so the metadata expands while hostile chars stay inert. + + it("acquire() exposes $(hostname)/$(date) in a double-quoted segment so the remote shell expands them (##2957 DEP-10)", () => { + var cmd = variables.lock.acquire({user: "deploy", message: "deploying v1"}); + expect(cmd).toInclude('"@$(hostname)/$(date --iso-8601=seconds)/"'); + }); + + it("acquire() keeps shell metacharacters in the message inert via single quotes (##2957 DEP-10)", () => { + var cmd = variables.lock.acquire({user: "deploy", message: "pwn$(touch /tmp/pwned)"}); + expect(cmd).toInclude("'pwn$(touch /tmp/pwned)'"); + }); + + it("acquire() keeps shell metacharacters in the user inert via single quotes (##2957 DEP-10)", () => { + var cmd = variables.lock.acquire({user: "$(whoami)", message: "x"}); + expect(cmd).toInclude("'$(whoami)'"); + }); }); } } diff --git a/cli/lucli/tests/specs/deploy/lib/FakeSshPoolSpec.cfc b/cli/lucli/tests/specs/deploy/lib/FakeSshPoolSpec.cfc index 070ae734c..dcd37d921 100644 --- a/cli/lucli/tests/specs/deploy/lib/FakeSshPoolSpec.cfc +++ b/cli/lucli/tests/specs/deploy/lib/FakeSshPoolSpec.cfc @@ -128,6 +128,70 @@ component extends="wheels.wheelstest.system.BaseSpec" { expect(threw).toBeFalse(); }); + // Transport-failure modeling — mirrors the REAL pool's semantics so + // specs can exercise unreachable-host paths (##2957 review follow-up): + // - onEach pre-resolves every connection first, so one dead host + // aborts the whole fan-out with zero commands executed; + // - sequential resolves lazily, so earlier hosts already ran; + // - onAny catches per host and falls through to the next; + // - a scripted `transportError` result throws from run() itself, + // regardless of {raise: false} (dead cached connection). + + it("failConnection makes onEach abort wholesale before any command runs (mirrors real pre-resolve)", () => { + var p = new cli.lucli.services.deploy.lib.FakeSshPool(); + p.failConnection("h2"); + expect(() => p.onEach(["h1", "h2"], function(ssh, host) { ssh.run("x"); })) + .toThrow(type="FakeSshPool.ConnectionFailure"); + expect(arrayLen(p.calls())).toBe(0); + }); + + it("failConnection makes sequential fail at that host after earlier hosts ran", () => { + var p = new cli.lucli.services.deploy.lib.FakeSshPool(); + p.failConnection("h2", "No route to host"); + var state = {threw = false, errMsg = ""}; + try { + p.sequential(["h1", "h2", "h3"], function(ssh, host) { ssh.run("x"); }); + } catch (any e) { + state.threw = true; + state.errMsg = e.message; + } + expect(state.threw).toBeTrue(); + expect(state.errMsg).toInclude("No route to host"); + expect(arrayLen(p.calls())).toBe(1); + expect(p.calls()[1].host).toBe("h1"); + }); + + it("failConnection makes onAny skip to the next host", () => { + var p = new cli.lucli.services.deploy.lib.FakeSshPool(); + p.failConnection("h1"); + p.onAny(["h1", "h2"], function(ssh, host) { ssh.run("x"); }); + expect(arrayLen(p.calls())).toBe(1); + expect(p.calls()[1].host).toBe("h2"); + }); + + it("onAny rethrows the last error when every host is unreachable", () => { + var p = new cli.lucli.services.deploy.lib.FakeSshPool(); + p.failConnection("h1"); + p.failConnection("h2", "last one"); + var state = {threw = false, errMsg = ""}; + try { + p.onAny(["h1", "h2"], function(ssh, host) { ssh.run("x"); }); + } catch (any e) { + state.threw = true; + state.errMsg = e.message; + } + expect(state.threw).toBeTrue(); + expect(state.errMsg).toInclude("last one"); + }); + + it("a scripted transportError throws from run regardless of raise=false", () => { + var p = new cli.lucli.services.deploy.lib.FakeSshPool(); + p.expect("h1", "rm -f /lock", {transportError: "Broken pipe"}); + expect(() => p.onEach(["h1"], function(ssh, host) { + ssh.run("rm -f /lock", {raise: false}); + })).toThrow(type="FakeSshPool.TransportFailure", regex="Broken pipe"); + }); + it("inline run trims very long stderr in the thrown error detail", () => { var p = new cli.lucli.services.deploy.lib.FakeSshPool(); var longErr = repeatString("x", 800);