diff --git a/changelog.d/2957-deploy-w4-rollup.fixed.md b/changelog.d/2957-deploy-w4-rollup.fixed.md new file mode 100644 index 000000000..bbec575c6 --- /dev/null +++ b/changelog.d/2957-deploy-w4-rollup.fixed.md @@ -0,0 +1,5 @@ +- `wheels deploy`: a same-version redeploy no longer hits a guaranteed `docker run --name` conflict — deploy now force-removes an existing same-name container (exact-anchored, idempotent) before each app run, and stops superseded versions best-effort after the kamal-proxy cutover instead of leaving every old container running forever (#2957 DEP-11a) +- `wheels deploy secrets extract ` now matches the key case-sensitively, like Kamal — previously CFML's case-insensitive `==` let `extract path` return the `PATH` line's value (#2957 DEP-11b) +- `wheels deploy`: the kamal-proxy config volume now derives the remote home from the ssh user (`/root` for the default root user, `/home/` otherwise) instead of hardcoding the `/home/` layout, which was wrong on every default config (#2957 DEP-11c) +- `wheels deploy`: secret resolution from `.kamal/secrets` is now bounded by a timeout (default 60s, configurable via `timeoutSeconds`) — a command blocking on interactive input (e.g. `op read` prompting for sign-in) used to hang the deploy thread forever; it now kills the bash process and throws a clear `SecretResolver.ResolutionFailed` (#2957) +- `wheels deploy rollback --destination=X` now applies the destination overlay — rollback was the last verb loading `deploy.yml` without forwarding `--destination`, so it targeted the base config's hosts (#2957; same defect class as the `config` fix in #3085) diff --git a/cli/lucli/services/deploy/cli/DeployMainCli.cfc b/cli/lucli/services/deploy/cli/DeployMainCli.cfc index 17e1f5356..52b0285d7 100644 --- a/cli/lucli/services/deploy/cli/DeployMainCli.cfc +++ b/cli/lucli/services/deploy/cli/DeployMainCli.cfc @@ -142,6 +142,9 @@ component { for (var role in cfg.roles()) { for (var host in role.hosts()) { + // A same-version redeploy would hit a guaranteed + // docker run --name conflict otherwise (#2957 DEP-11a). + $dispatch([host], app.remove_conflicting(role, ver), dryRun); if (arrayLen(secretNames)) { $deliverEnvFile( [host], @@ -163,6 +166,11 @@ component { dryRun ); } + // Post-cutover cleanup: stop superseded versions + // (#2957 DEP-11a). Best-effort (allowFail) — a failed + // stop of an OLD container must not fail an + // otherwise-complete deploy of the NEW one. + $dispatch([host], app.stop_old_versions(role, ver), dryRun, true); } } $dispatch(hosts, auditor.record("completed deploy of version " & ver), dryRun, true); @@ -220,7 +228,13 @@ component { message = "rollback requires a version (pass opts.version)" ); } - var cfg = variables.loader.load(arguments.opts.configPath); + // Forward the destination overlay — rollback() was the last + // loader.load() call site not doing so (#2957; same defect class + // as the config() fix in #3085). + var cfg = variables.loader.load( + arguments.opts.configPath, + {destination: arguments.opts.destination ?: ""} + ); var app = new modules.wheels.services.deploy.commands.AppCommands(cfg); var proxy = new modules.wheels.services.deploy.commands.ProxyCommands(cfg); var dryRun = arguments.opts.dryRun ?: false; diff --git a/cli/lucli/services/deploy/cli/DeploySecretsCli.cfc b/cli/lucli/services/deploy/cli/DeploySecretsCli.cfc index fd780bd54..08d2a18ac 100644 --- a/cli/lucli/services/deploy/cli/DeploySecretsCli.cfc +++ b/cli/lucli/services/deploy/cli/DeploySecretsCli.cfc @@ -67,7 +67,10 @@ component { var eq = find("=", line); // eq > 1, not > 0: a line starting with '=' has no key, and // left(line, 0) crashes Lucee 7 (Cross-Engine Invariant 8). - if (eq > 1 && left(line, eq - 1) == key) { + // compare(), not ==: CFML == is case-insensitive, but env var + // names are case-sensitive and Kamal's extract is an exact + // match (#2957 DEP-11b). + if (eq > 1 && compare(left(line, eq - 1), key) == 0) { return mid(line, eq + 1, 99999); } } diff --git a/cli/lucli/services/deploy/commands/AppCommands.cfc b/cli/lucli/services/deploy/commands/AppCommands.cfc index d0651856f..f65868c74 100644 --- a/cli/lucli/services/deploy/commands/AppCommands.cfc +++ b/cli/lucli/services/deploy/commands/AppCommands.cfc @@ -29,6 +29,43 @@ component extends="Base" { ); } + /** + * Idempotent same-name conflict guard (#2957 DEP-11a): `docker run + * --name X` hard-fails when ANY container — running or stopped — + * already holds the name, which is the guaranteed state on a + * same-version redeploy. The name filter is regex-anchored for an + * exact match and `xargs -r` makes the whole pipeline a no-op + * (exit 0) when nothing matches. + */ + public string function remove_conflicting(required any role, required string version) { + return pipe([ + docker("container", "ls", "--all", + "--filter name=^#container_name(arguments.role, arguments.version)#$", + "--quiet"), + "xargs -r docker container rm --force" + ]); + } + + /** + * Stop every superseded version of this role's container after the + * proxy cutover (#2957 DEP-11a) — previously old versions kept + * running (and auto-restarting) forever. Scoped by the same + * service/role/destination labels $labelArgs stamps on at run time; + * the just-deployed container is excluded by exact name. `xargs -r` + * keeps a first deploy (no old versions) a no-op. + */ + public string function stop_old_versions(required any role, required string version) { + return pipe([ + docker("ps", + "--filter label=service=#variables.config.service()#", + "--filter label=role=#arguments.role.name()#", + "--filter label=destination=#variables.config.destination()#", + "--format {{.Names}}"), + "grep -v '^#container_name(arguments.role, arguments.version)#$'", + "xargs -r docker stop" + ]); + } + public string function start(required any role, required string version) { return docker("start", container_name(arguments.role, arguments.version)); } diff --git a/cli/lucli/services/deploy/commands/ProxyCommands.cfc b/cli/lucli/services/deploy/commands/ProxyCommands.cfc index f1e31ce89..bca4c6c60 100644 --- a/cli/lucli/services/deploy/commands/ProxyCommands.cfc +++ b/cli/lucli/services/deploy/commands/ProxyCommands.cfc @@ -27,7 +27,7 @@ component extends="Base" { "--network kamal", "--publish 80:80", "--publish 443:443", - "--volume /home/#variables.config.ssh().user()#/.config/kamal-proxy:/home/kamal-proxy/.config/kamal-proxy", + "--volume #$remoteHome()#/.config/kamal-proxy:/home/kamal-proxy/.config/kamal-proxy", variables.PROXY_IMAGE ); } @@ -91,4 +91,15 @@ component extends="Base" { public string function restart() { return docker("restart", variables.PROXY_CONTAINER_NAME); } + + /** + * Home directory of the deploy user on the remote host, for the proxy + * config volume. The previous hardcoded `/home/` was wrong for + * the DEFAULT ssh user (root's home is /root) — #2957 DEP-11c. + * compare(), not ==: unix usernames are case-sensitive. + */ + private string function $remoteHome() { + var sshUser = variables.config.ssh().user(); + return compare(sshUser, "root") == 0 ? "/root" : "/home/" & sshUser; + } } diff --git a/cli/lucli/services/deploy/lib/SecretResolver.cfc b/cli/lucli/services/deploy/lib/SecretResolver.cfc index f0a08e3d2..37aecc444 100644 --- a/cli/lucli/services/deploy/lib/SecretResolver.cfc +++ b/cli/lucli/services/deploy/lib/SecretResolver.cfc @@ -38,6 +38,11 @@ component { // (e.g. a non-standard Git Bash install on Windows). Also the // seam the BashUnavailable spec uses. variables.bashCmd = arguments.opts.bashCmd ?: "bash"; + // Upper bound on resolution, in seconds (#2957): a command inside + // the secrets file that blocks on interactive input — `op read` + // prompting for sign-in, `bw get` waiting to unlock — used to hang + // the deploy thread forever on an unbounded waitFor()/stdout read. + variables.timeoutSeconds = arguments.opts.timeoutSeconds ?: 60; variables.resolved = $loadAll(); return this; } @@ -165,7 +170,7 @@ component { // the stderr write while we block on the stdout read. A file sink // never fills, so bash always runs to completion. Secret values // travel on stdout (read in-memory); only diagnostics touch disk, - // and the file is deleted in the finally block even when waitFor() + // and the file is deleted in the finally block even when the drain // or the throw paths interrupt the happy path. var errPath = getTempFile(getTempDirectory(), "wheels-secret-err"); try { @@ -181,7 +186,9 @@ component { detail = "Secret resolution requires a local bash for $(cmd) expansion. On Windows, run inside WSL or Git Bash." ); } - var out = $readStream(proc.getInputStream()); + var out = $drainWithDeadline(proc); + // The drain only returns once the process has exited, so this + // waitFor() is an immediate exit-code read, not a blocking wait. var exitCode = proc.waitFor(); var err = fileExists(errPath) ? fileRead(errPath, "UTF-8") : ""; return {exitCode: exitCode, out: out, err: err}; @@ -190,12 +197,43 @@ component { } } - private string function $readStream(required any inputStream) { - var scanner = createObject("java", "java.util.Scanner").init(arguments.inputStream, "UTF-8"); - scanner.useDelimiter("\A"); - var content = scanner.hasNext() ? scanner.next() : ""; - scanner.close(); - return content; + /** + * Drain the process's stdout to completion, bounded by + * variables.timeoutSeconds (#2957). The previous shape — a blocking + * read-to-EOF followed by an unbounded waitFor() — hung the deploy + * thread forever when a command inside the secrets file blocked on + * interactive input while holding stdout open. Polling `available()` + * against a deadline bounds both hang shapes (silent never-exits AND + * endless streamers); on expiry the bash process is force-killed and + * a clear ResolutionFailed surfaces. + */ + private string function $drainWithDeadline(required any proc) { + var deadlineAt = getTickCount() + (variables.timeoutSeconds * 1000); + var stdoutStream = arguments.proc.getInputStream(); + var buffer = createObject("java", "java.io.ByteArrayOutputStream").init(); + while (arguments.proc.isAlive()) { + if (getTickCount() >= deadlineAt) { + arguments.proc.destroyForcibly(); + throw( + type = "SecretResolver.ResolutionFailed", + message = "Resolving .kamal/secrets timed out after " & variables.timeoutSeconds + & " second(s) — a command inside the secrets file may be waiting for" + & " interactive input (e.g. a credential manager prompting for sign-in)." + & " The bash process was killed.", + detail = "Sign in to the secret manager in a terminal first, or raise the limit" + & " via SecretResolver init opts.timeoutSeconds for legitimately slow commands." + ); + } + var avail = stdoutStream.available(); + if (avail > 0) { + buffer.writeBytes(stdoutStream.readNBytes(avail)); + } else { + sleep(25); + } + } + // Process exited — anything left in the pipe reads to EOF instantly. + buffer.writeBytes(stdoutStream.readAllBytes()); + return buffer.toString("UTF-8"); } private string function $shellEscape(required string path) { diff --git a/cli/lucli/tests/specs/deploy/cli/DeployMainCliSpec.cfc b/cli/lucli/tests/specs/deploy/cli/DeployMainCliSpec.cfc index 80ba88069..cb9d60746 100644 --- a/cli/lucli/tests/specs/deploy/cli/DeployMainCliSpec.cfc +++ b/cli/lucli/tests/specs/deploy/cli/DeployMainCliSpec.cfc @@ -961,6 +961,59 @@ component extends="wheels.wheelstest.system.BaseSpec" { expect(out).toInclude("--name demo-web-v1"); }); + // Regression suite for #2957 (Wave 4 roll-ups). + // (DEP-11a) deploy() issued `docker run --name --` + // with no prior conflict removal — a same-version redeploy hit a + // guaranteed name conflict — and superseded containers were never + // stopped after cutover. (rollback --destination) rollback() was the + // last loader.load() call site not forwarding the destination overlay + // (same defect class as the config() fix in #3085). + + it("deploy removes a conflicting same-name container before docker run (##2957 DEP-11a)", () => { + var fake = new cli.lucli.services.deploy.lib.FakeSshPool(); + var dc = new cli.lucli.services.deploy.cli.DeployMainCli(fake); + dc.deploy({configPath: variables.fixture, version: "v1"}); + var cmds = $cmds(fake); + var rmIdx = 0; var runIdx = 0; + for (var i = 1; i <= arrayLen(cmds); i++) { + if (!rmIdx && findNoCase("name=^demo-web-v1$", cmds[i]) + && findNoCase("docker container rm --force", cmds[i])) rmIdx = i; + // "--name demo-web-v1" pins the APP container run — the + // proxy's start_or_run also embeds a "docker run --detach". + if (!runIdx && findNoCase("docker run --detach", cmds[i]) + && findNoCase("--name demo-web-v1", cmds[i])) runIdx = i; + } + expect(rmIdx).toBeGT(0); + expect(runIdx).toBeGT(rmIdx); + }); + + it("deploy stops superseded versions after the proxy cutover (##2957 DEP-11a)", () => { + var fake = new cli.lucli.services.deploy.lib.FakeSshPool(); + var dc = new cli.lucli.services.deploy.cli.DeployMainCli(fake); + dc.deploy({configPath: variables.fixture, version: "v1"}); + var cmds = $cmds(fake); + var proxyIdx = 0; var stopOldIdx = 0; + for (var i = 1; i <= arrayLen(cmds); i++) { + if (!proxyIdx && findNoCase("kamal-proxy deploy", cmds[i])) proxyIdx = i; + if (!stopOldIdx && findNoCase("grep -v '^demo-web-v1$'", cmds[i]) + && findNoCase("xargs -r docker stop", cmds[i])) stopOldIdx = i; + } + expect(proxyIdx).toBeGT(0); + expect(stopOldIdx).toBeGT(proxyIdx); + }); + + it("a failing old-version stop does not fail an otherwise-complete deploy (##2957 DEP-11a)", () => { + var fake = new cli.lucli.services.deploy.lib.FakeSshPool(); + var cfg = new cli.lucli.services.deploy.config.ConfigLoader().load(variables.fixture); + var app = new cli.lucli.services.deploy.commands.AppCommands(cfg); + fake.expect("1.2.3.4", app.stop_old_versions(cfg.roles()[1], "v1"), { + exitCode: 1, stdout: "", stderr: "Cannot connect to the Docker daemon" + }); + var dc = new cli.lucli.services.deploy.cli.DeployMainCli(fake); + var out = dc.deploy({configPath: variables.fixture, version: "v1"}); + expect(out).toInclude("Deployed"); + }); + // Regression suite for #2957 (Wave 3 — observability). (DEP-6a) the // dispatch closures dropped every ssh.run() result, so read verbs // (`audit`, `details`) returned only host-count summaries in live @@ -1059,6 +1112,34 @@ component extends="wheels.wheelstest.system.BaseSpec" { expect(out).toInclude("Deployed"); }); + it("rollback --dry-run applies the destination overlay (##2957, same class as ##3085)", () => { + var base = getTempFile(getTempDirectory(), "yml"); + fileWrite( + base, + "service: demo#chr(10)#image: acme/demo#chr(10)#servers: [192.0.2.10]" + & "#chr(10)#registry: {username: u, password: [X]}" + ); + var overlay = new cli.lucli.services.deploy.config.ConfigLoader() + .$overlayPathFor(base, "staging"); + fileWrite(overlay, "servers:#chr(10)# - 192.0.2.99"); + try { + var dc = new cli.lucli.services.deploy.cli.DeployMainCli( + new cli.lucli.services.deploy.lib.FakeSshPool() + ); + var out = dc.rollback({ + configPath: base, + destination: "staging", + version: "v1", + dryRun: true + }); + expect(out).toInclude("192.0.2.99"); + expect(out).notToInclude("192.0.2.10"); + } finally { + fileDelete(base); + fileDelete(overlay); + } + }); + // Regression for #2671 — git's stderr ("fatal: not a git repository...") used to leak through as the version string. it("$gitShortSha() returns 'unknown' when run outside a git repo", () => { var nonGitDir = getTempDirectory() & "/wheels-2671-main-" & createUUID(); diff --git a/cli/lucli/tests/specs/deploy/cli/DeploySecretsCliSpec.cfc b/cli/lucli/tests/specs/deploy/cli/DeploySecretsCliSpec.cfc index d0183f05c..751e85f02 100644 --- a/cli/lucli/tests/specs/deploy/cli/DeploySecretsCliSpec.cfc +++ b/cli/lucli/tests/specs/deploy/cli/DeploySecretsCliSpec.cfc @@ -56,6 +56,17 @@ component extends="wheels.wheelstest.system.BaseSpec" { expect(cli.extract({key: "", from: "FOO=bar"})).toBe(""); }); + // #2957 DEP-11b — CFML `==` is case-insensitive, so `extract path` + // matched the PATH line. Kamal's extract is an exact (case-sensitive) + // key match; env var names are case-sensitive in every shell. + it("extract key match is case-sensitive (##2957 DEP-11b)", () => { + var cli = new cli.lucli.services.deploy.cli.DeploySecretsCli(); + var block = "PATH=/usr/bin" & chr(10) & "path=lower-value"; + expect(cli.extract({key: "path", from: block})).toBe("lower-value"); + expect(cli.extract({key: "PATH", from: block})).toBe("/usr/bin"); + expect(cli.extract({key: "Path", from: block})).toBe(""); + }); + it("extract skips malformed lines that start with '='", () => { // left(line, 0) on a keyless '=...' line crashes Lucee 7 // (Cross-Engine Invariant 8); such lines must be skipped. diff --git a/cli/lucli/tests/specs/deploy/commands/AppCommandsSpec.cfc b/cli/lucli/tests/specs/deploy/commands/AppCommandsSpec.cfc index 41046ed2f..81cc7b3d6 100644 --- a/cli/lucli/tests/specs/deploy/commands/AppCommandsSpec.cfc +++ b/cli/lucli/tests/specs/deploy/commands/AppCommandsSpec.cfc @@ -161,6 +161,32 @@ component extends="wheels.wheelstest.system.BaseSpec" { expect(state.message).notToInclude("PRESENT"); }); + // #2957 DEP-11a — `docker run --name X` hard-fails when ANY container + // (running or stopped) already holds the name, which is the guaranteed + // state on a same-version redeploy. The conflict guard must be exact-name + // anchored and a no-op when nothing matches (xargs -r). + it("remove_conflicting() force-removes an existing same-name container, idempotently (##2957 DEP-11a)", () => { + var cmd = new cli.lucli.services.deploy.commands.AppCommands(variables.cfg) + .remove_conflicting(variables.cfg.roles()[1], "v1"); + expect(cmd).toInclude("docker container ls --all"); + expect(cmd).toInclude("--filter name=^demo-web-v1$"); + expect(cmd).toInclude("--quiet"); + expect(cmd).toInclude("xargs -r docker container rm --force"); + }); + + // #2957 DEP-11a — superseded containers were never stopped after + // cutover, so every old version kept running (and restarting) forever. + it("stop_old_versions() stops same-role containers except the current version (##2957 DEP-11a)", () => { + var cmd = new cli.lucli.services.deploy.commands.AppCommands(variables.cfg) + .stop_old_versions(variables.cfg.roles()[1], "v2"); + expect(cmd).toInclude("docker ps"); + expect(cmd).toInclude("--filter label=service=demo"); + expect(cmd).toInclude("--filter label=role=web"); + expect(cmd).toInclude("--filter label=destination="); + expect(cmd).toInclude("grep -v '^demo-web-v2$'"); + expect(cmd).toInclude("xargs -r docker stop"); + }); + it("remove() chains docker stop and docker rm", () => { var cmd = new cli.lucli.services.deploy.commands.AppCommands(variables.cfg) .remove(variables.cfg.roles()[1], "v9"); diff --git a/cli/lucli/tests/specs/deploy/commands/ProxyCommandsSpec.cfc b/cli/lucli/tests/specs/deploy/commands/ProxyCommandsSpec.cfc index 68c6cb2ee..6a0a0aad7 100644 --- a/cli/lucli/tests/specs/deploy/commands/ProxyCommandsSpec.cfc +++ b/cli/lucli/tests/specs/deploy/commands/ProxyCommandsSpec.cfc @@ -80,6 +80,23 @@ component extends="wheels.wheelstest.system.BaseSpec" { expect(cmd).toInclude("docker start kamal-proxy || docker run"); expect(cmd).notToInclude("docker ps"); }); + + // #2957 DEP-11c — boot() hardcoded the config volume to + // /home/, but the DEFAULT ssh user is root, whose home + // is /root. The mount path must derive the real remote home from + // the ssh user instead of assuming the /home/ layout. + it("boot() mounts /root for the default root ssh user (##2957 DEP-11c)", () => { + var cmd = new cli.lucli.services.deploy.commands.ProxyCommands(variables.cfg).boot(); + expect(cmd).toInclude("--volume /root/.config/kamal-proxy:/home/kamal-proxy/.config/kamal-proxy"); + expect(cmd).notToInclude("/home/root"); + }); + + it("boot() mounts /home/ for a non-root ssh user (##2957 DEP-11c)", () => { + var sshCfg = new cli.lucli.services.deploy.config.ConfigLoader() + .load(expandPath("/cli/lucli/tests/_fixtures/deploy/configs/with-ssh.yml")); + var cmd = new cli.lucli.services.deploy.commands.ProxyCommands(sshCfg).boot(); + expect(cmd).toInclude("--volume /home/admin/.config/kamal-proxy:/home/kamal-proxy/.config/kamal-proxy"); + }); }); } } diff --git a/cli/lucli/tests/specs/deploy/lib/SecretResolverSpec.cfc b/cli/lucli/tests/specs/deploy/lib/SecretResolverSpec.cfc index 48d36f6a9..66057102a 100644 --- a/cli/lucli/tests/specs/deploy/lib/SecretResolverSpec.cfc +++ b/cli/lucli/tests/specs/deploy/lib/SecretResolverSpec.cfc @@ -135,6 +135,25 @@ component extends="wheels.wheelstest.system.BaseSpec" { }).toThrow(type="SecretResolver.ResolutionFailed"); }); + // #2957 — $runBash had an unbounded proc.waitFor()/stdout read, so a + // secrets command blocking on interactive input (op/bw prompting for + // sign-in) hung the deploy thread forever. Resolution is now bounded + // by opts.timeoutSeconds (default 60); on expiry the bash process is + // killed and a clear ResolutionFailed surfaces. + it("throws ResolutionFailed when a secrets command hangs past the timeout (##2957)", () => { + fileWrite(variables.tempRoot & "/.kamal/secrets", "SLOW=$(sleep 30)"); + var startedAt = getTickCount(); + expect(() => { + new cli.lucli.services.deploy.lib.SecretResolver({ + projectRoot: variables.tempRoot, + timeoutSeconds: 1 + }); + }).toThrow(type="SecretResolver.ResolutionFailed", regex="timed out"); + // The hang must be cut at the deadline, not ridden out to the + // command's own 30s completion. + expect(getTickCount() - startedAt).toBeLT(10000); + }); + it("throws BashUnavailable when bash cannot be launched", () => { fileWrite(variables.tempRoot & "/.kamal/secrets", "FOO=bar"); expect(() => {