From e1a1f447246a8fd24be1a70214ab3c57774b2b46 Mon Sep 17 00:00:00 2001 From: sajdakabir Date: Wed, 6 May 2026 22:54:46 +0530 Subject: [PATCH] fix(deploy): notify on container crash loop after deployment --- packages/server/src/services/application.ts | 22 +++++- packages/server/src/utils/docker/utils.ts | 83 +++++++++++++++++++++ 2 files changed, 104 insertions(+), 1 deletion(-) diff --git a/packages/server/src/services/application.ts b/packages/server/src/services/application.ts index ac1fbb4492..c6a5ef6059 100644 --- a/packages/server/src/services/application.ts +++ b/packages/server/src/services/application.ts @@ -30,7 +30,7 @@ import { createTraefikConfig } from "@dokploy/server/utils/traefik/application"; import { TRPCError } from "@trpc/server"; import { eq } from "drizzle-orm"; import type { z } from "zod"; -import { encodeBase64 } from "../utils/docker/utils"; +import { encodeBase64, waitForSwarmServiceStable } from "../utils/docker/utils"; import { getDokployUrl } from "./admin"; import { createDeployment, @@ -222,6 +222,16 @@ export const deployApplication = async ({ } await mechanizeDockerContainer(application); + + const stability = await waitForSwarmServiceStable(application.appName, { + serverId, + }); + if (!stability.stable) { + throw new Error( + `Container did not stay running after deployment: ${stability.reason}`, + ); + } + await updateDeploymentStatus(deployment.deploymentId, "done"); await updateApplicationStatus(applicationId, "done"); @@ -313,6 +323,16 @@ export const rebuildApplication = async ({ await execAsync(commandWithLog); } await mechanizeDockerContainer(application); + + const stability = await waitForSwarmServiceStable(application.appName, { + serverId, + }); + if (!stability.stable) { + throw new Error( + `Container did not stay running after rebuild: ${stability.reason}`, + ); + } + await updateDeploymentStatus(deployment.deploymentId, "done"); await updateApplicationStatus(applicationId, "done"); diff --git 
/**
 * Outcome of watching a Swarm service after a deployment: either it stayed
 * up, or it did not and `reason` carries a human-readable explanation.
 */
export type SwarmStabilityResult =
	| { stable: true }
	| { stable: false; reason: string };

/** The fields of a Swarm task object that the stability watcher reads. */
interface SwarmTaskView {
	ID?: string;
	UpdatedAt?: string;
	DesiredState?: string;
	Status?: { State?: string; Err?: string; Message?: string };
}

/** Task states that mean the container crashed or could not be scheduled. */
const SWARM_TASK_FAILURE_STATES: readonly string[] = ["failed", "rejected"];

/** Task states a replica passes through before it reaches `running`. */
const SWARM_TASK_STARTING_STATES: readonly string[] = [
	"new",
	"allocated",
	"pending",
	"assigned",
	"accepted",
	"ready",
	"preparing",
	"starting",
];

/** Desired states of tasks the orchestrator is deliberately taking down. */
const SWARM_TASK_RETIRING_DESIRED_STATES: readonly string[] = [
	"shutdown",
	"remove",
];

/**
 * Polls the tasks of a Swarm service for a fixed window and reports whether
 * the service came up and stayed up.
 *
 * The service is reported unstable when:
 * - a task enters `failed`/`rejected` during the window (failures that already
 *   existed on the first successful poll are treated as history of an earlier
 *   deployment and ignored);
 * - a task was seen running and later no task is running while a replacement
 *   is still starting (a restart);
 * - the window ends and the last successful poll saw no running task.
 *
 * Errors thrown while listing tasks are not fatal: the message is remembered
 * as the potential failure reason and polling continues.
 *
 * NOTE(review): a task of the previous spec that is still `running` with
 * `DesiredState: "running"` on the very first poll is indistinguishable from a
 * new one here; confirm whether callers wait for the update to be scheduled.
 *
 * @param appName - Swarm service name used as the `service` task filter.
 * @param options.serverId - Passed through to `getRemoteDocker`.
 * @param options.windowMs - How long to watch the service, in milliseconds.
 * @param options.pollMs - Delay between two polls, in milliseconds.
 * @returns `{ stable: true }` or `{ stable: false, reason }`.
 */
export const waitForSwarmServiceStable = async (
	appName: string,
	{
		serverId,
		windowMs = 60_000,
		pollMs = 5_000,
	}: { serverId?: string | null; windowMs?: number; pollMs?: number } = {},
): Promise<SwarmStabilityResult> => {
	const remoteDocker = await getRemoteDocker(serverId);
	const deadline = Date.now() + windowMs;

	const stateOf = (task: SwarmTaskView): string => task.Status?.State ?? "";
	const isFailed = (task: SwarmTaskView): boolean =>
		SWARM_TASK_FAILURE_STATES.includes(stateOf(task));
	// Tasks the orchestrator is replacing must not count as healthy replicas.
	const isWanted = (task: SwarmTaskView): boolean =>
		!SWARM_TASK_RETIRING_DESIRED_STATES.includes(task.DesiredState ?? "");

	// IDs of tasks that had already failed when the watch started.
	let knownFailures: Set<string> | null = null;
	let everRunning = false;
	let runningAtLastCheck = false;
	let lastReason = "Service did not reach running state";

	while (Date.now() < deadline) {
		try {
			const tasks: SwarmTaskView[] = await remoteDocker.listTasks({
				filters: JSON.stringify({ service: [appName] }),
			});

			// Most recently updated task first.
			const sorted = [...tasks].sort((a, b) => {
				const at = new Date(a.UpdatedAt ?? 0).getTime();
				const bt = new Date(b.UpdatedAt ?? 0).getTime();
				return bt - at;
			});

			if (knownFailures === null) {
				knownFailures = new Set(
					sorted
						.filter(isFailed)
						.map((task) => task.ID)
						.filter((id): id is string => id !== undefined),
				);
			}
			const baseline = knownFailures;

			// Any failure that appeared during the window means a crash, even
			// when a replacement task is already running again.
			const freshFailure = sorted.find(
				(task) =>
					isFailed(task) && !(task.ID !== undefined && baseline.has(task.ID)),
			);
			if (freshFailure) {
				const failedState = stateOf(freshFailure);
				const failedMessage =
					freshFailure.Status?.Err || freshFailure.Status?.Message || "";
				return {
					stable: false,
					reason: failedMessage
						? `Task ${failedState}: ${failedMessage}`
						: `Task entered ${failedState} state`,
				};
			}

			const latest = sorted[0];
			const state = latest?.Status?.State;
			const message = latest?.Status?.Err || latest?.Status?.Message || "";

			const runningCount = sorted.filter(
				(task) => stateOf(task) === "running" && isWanted(task),
			).length;
			const startingCount = sorted.filter(
				(task) =>
					SWARM_TASK_STARTING_STATES.includes(stateOf(task)) && isWanted(task),
			).length;

			runningAtLastCheck = runningCount > 0;
			if (runningAtLastCheck) {
				everRunning = true;
			} else if (everRunning && startingCount > 0) {
				return {
					stable: false,
					reason: message
						? `Container restarted after running: ${message}`
						: "Container restarted after reaching running state",
				};
			}

			lastReason = message
				? `Latest task state: ${state ?? "unknown"} (${message})`
				: `Latest task state: ${state ?? "unknown"}`;
		} catch (error) {
			// Best effort: keep the previous observation and try again.
			lastReason =
				error instanceof Error ? error.message : "Failed to inspect service";
		}

		// Never sleep past the end of the observation window.
		const remaining = deadline - Date.now();
		if (remaining <= 0) {
			break;
		}
		await new Promise<void>((resolve) =>
			setTimeout(resolve, Math.min(pollMs, remaining)),
		);
	}

	// Having been up at some point is not enough; it must still be up.
	if (everRunning && runningAtLastCheck) {
		return { stable: true };
	}
	return { stable: false, reason: lastReason };
};