Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .server-changes/fix-worker-deployment-version-race.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
area: webapp
type: fix
---

Retry on unique-constraint collisions when assigning the next worker deployment version so concurrent deploys to the same environment no longer fail with P2002.
147 changes: 73 additions & 74 deletions apps/webapp/app/v3/services/initializeDeployment.server.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,13 @@ import { type AuthenticatedEnvironment } from "~/services/apiAuth.server";
import { logger } from "~/services/logger.server";
import { generateFriendlyId } from "../friendlyIdentifiers";
import { createRemoteImageBuild, remoteBuildsEnabled } from "../remoteImageBuilder.server";
import { calculateNextBuildVersion } from "../utils/calculateNextBuildVersion";
import { BaseService, ServiceValidationError } from "./baseService.server";
import { TimeoutDeploymentService } from "./timeoutDeployment.server";
import { getDeploymentImageRef } from "../getDeploymentImageRef.server";
import { tryCatch } from "@trigger.dev/core";
import { getRegistryConfig } from "../registryConfig.server";
import { DeploymentService } from "./deployment.server";
import { createDeploymentWithNextVersion } from "./initializeDeployment/createDeploymentWithNextVersion.server";
import { errAsync } from "neverthrow";

const nanoid = customAlphabet("1234567890abcdefghijklmnopqrstuvwxyz", 8);
Expand Down Expand Up @@ -97,18 +97,6 @@ export class InitializeDeploymentService extends BaseService {
});
}

const latestDeployment = await this._prisma.workerDeployment.findFirst({
where: {
environmentId: environment.id,
},
orderBy: {
createdAt: "desc",
},
take: 1,
});

const nextVersion = calculateNextBuildVersion(latestDeployment?.version);

if (payload.selfHosted && remoteBuildsEnabled()) {
throw new ServiceValidationError(
"Self-hosted deployments are not supported on this instance"
Expand Down Expand Up @@ -146,30 +134,6 @@ export class InitializeDeploymentService extends BaseService {

const deploymentShortCode = nanoid(8);

const [imageRefError, imageRefResult] = await tryCatch(
getDeploymentImageRef({
registry: registryConfig,
projectRef: environment.project.externalRef,
nextVersion,
environmentType: environment.type,
deploymentShortCode,
})
);

if (imageRefError) {
logger.error("Failed to get deployment image ref", {
environmentId: environment.id,
projectId: environment.projectId,
version: nextVersion,
triggeredById: triggeredBy?.id,
type: payload.type,
cause: imageRefError.message,
});
throw new ServiceValidationError("Failed to get deployment image ref");
}

const { imageRef, isEcr, repoCreated } = imageRefResult;

// We keep using `BUILDING` as the initial status if not explicitly set
// to avoid changing the behavior for deployments not created in the build server.
// Native builds always start in the `PENDING` status.
Expand Down Expand Up @@ -208,20 +172,6 @@ export class InitializeDeploymentService extends BaseService {
}
: undefined;

logger.debug("Creating deployment", {
environmentId: environment.id,
projectId: environment.projectId,
version: nextVersion,
triggeredById: triggeredBy?.id,
type: payload.type,
imageRef,
isEcr,
repoCreated,
initialStatus,
artifactKey: payload.isNativeBuild ? payload.artifactKey : undefined,
isNativeBuild: payload.isNativeBuild,
});

const buildServerMetadata: BuildServerMetadata | undefined =
payload.isNativeBuild || payload.buildId
? {
Expand All @@ -238,28 +188,77 @@ export class InitializeDeploymentService extends BaseService {
}
: undefined;

const deployment = await this._prisma.workerDeployment.create({
data: {
friendlyId: generateFriendlyId("deployment"),
contentHash: payload.contentHash,
shortCode: deploymentShortCode,
version: nextVersion,
status: initialStatus,
environmentId: environment.id,
projectId: environment.projectId,
externalBuildData,
buildServerMetadata,
triggeredById: triggeredBy?.id,
type: payload.type,
imageReference: imageRef,
imagePlatform: env.DEPLOY_IMAGE_PLATFORM,
git: payload.gitMeta ?? undefined,
commitSHA: payload.gitMeta?.commitSha ?? undefined,
runtime: payload.runtime ?? undefined,
triggeredVia: payload.triggeredVia ?? undefined,
startedAt: initialStatus === "BUILDING" ? new Date() : undefined,
},
});
// Concurrent deploys to the same environment race on the
// `(environmentId, version)` unique constraint. The helper retries on
// P2002, recomputing the version (and re-running the image ref call so
// the persisted imageReference always matches the persisted version)
// each attempt.
const deployment = await createDeploymentWithNextVersion(
this._prisma,
environment.id,
async (nextVersion) => {
const [imageRefError, imageRefResult] = await tryCatch(
getDeploymentImageRef({
registry: registryConfig,
projectRef: environment.project.externalRef,
nextVersion,
environmentType: environment.type,
deploymentShortCode,
})
);

if (imageRefError) {
logger.error("Failed to get deployment image ref", {
environmentId: environment.id,
projectId: environment.projectId,
version: nextVersion,
triggeredById: triggeredBy?.id,
type: payload.type,
cause: imageRefError.message,
});
throw new ServiceValidationError("Failed to get deployment image ref");
}

const { imageRef, isEcr, repoCreated } = imageRefResult;

logger.debug("Creating deployment", {
environmentId: environment.id,
projectId: environment.projectId,
version: nextVersion,
triggeredById: triggeredBy?.id,
type: payload.type,
imageRef,
isEcr,
repoCreated,
initialStatus,
artifactKey: payload.isNativeBuild ? payload.artifactKey : undefined,
isNativeBuild: payload.isNativeBuild,
});

return {
// Regenerated per attempt: each attempt is a fresh `create` that
// must satisfy `WorkerDeployment.friendlyId @unique`, so reusing a
// friendlyId across retries would risk a spurious P2002 on
// friendlyId instead of the version collision we're retrying.
friendlyId: generateFriendlyId("deployment"),
contentHash: payload.contentHash,
shortCode: deploymentShortCode,
status: initialStatus,
projectId: environment.projectId,
externalBuildData,
buildServerMetadata,
triggeredById: triggeredBy?.id,
type: payload.type,
imageReference: imageRef,
imagePlatform: env.DEPLOY_IMAGE_PLATFORM,
git: payload.gitMeta ?? undefined,
commitSHA: payload.gitMeta?.commitSha ?? undefined,
runtime: payload.runtime ?? undefined,
triggeredVia: payload.triggeredVia ?? undefined,
startedAt: initialStatus === "BUILDING" ? new Date() : undefined,
};
}
);

const timeoutMs =
deployment.status === "PENDING" ? env.DEPLOY_QUEUE_TIMEOUT_MS : env.DEPLOY_TIMEOUT_MS;
Expand Down Expand Up @@ -309,7 +308,7 @@ export class InitializeDeploymentService extends BaseService {

return {
deployment,
imageRef,
imageRef: deployment.imageReference ?? "",
eventStream,
};
});
Expand Down
Comment thread
d-cs marked this conversation as resolved.
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
import {
isUniqueConstraintError,
type Prisma,
type PrismaClientOrTransaction,
type WorkerDeployment,
} from "@trigger.dev/database";
import { setTimeout as sleep } from "node:timers/promises";
import { logger } from "~/services/logger.server";
import { calculateNextBuildVersion } from "../../utils/calculateNextBuildVersion";

export type CreateDeploymentData = Omit<
Prisma.WorkerDeploymentUncheckedCreateInput,
"version" | "environmentId"
>;

export type CreateDeploymentWithNextVersionOptions = {
maxRetries?: number;
jitterMs?: { min: number; max: number };
};

const DEFAULT_MAX_RETRIES = 5;
const DEFAULT_JITTER_MS = { min: 5, max: 50 };

export class DeploymentVersionCollisionError extends Error {
readonly name = "DeploymentVersionCollisionError";
readonly environmentId: string;
readonly attempts: number;
readonly lastAttemptedVersion: string;

constructor(args: {
environmentId: string;
attempts: number;
lastAttemptedVersion: string;
cause: unknown;
}) {
super(
`Failed to allocate a unique worker deployment version for environment ${args.environmentId} after ${args.attempts} attempt(s); last tried "${args.lastAttemptedVersion}"`,
{ cause: args.cause }
);
this.environmentId = args.environmentId;
this.attempts = args.attempts;
this.lastAttemptedVersion = args.lastAttemptedVersion;
}
}

export async function createDeploymentWithNextVersion(
prisma: PrismaClientOrTransaction,
environmentId: string,
buildData: (nextVersion: string) => CreateDeploymentData | Promise<CreateDeploymentData>,
options: CreateDeploymentWithNextVersionOptions = {}
): Promise<WorkerDeployment> {
const maxRetries = options.maxRetries ?? DEFAULT_MAX_RETRIES;
const jitterMs = options.jitterMs ?? DEFAULT_JITTER_MS;

let lastError: unknown;
let lastVersion = "";

for (let attempt = 0; attempt <= maxRetries; attempt++) {
const latest = await prisma.workerDeployment.findFirst({
where: { environmentId },
orderBy: { createdAt: "desc" },
take: 1,
});

const version = calculateNextBuildVersion(latest?.version);
lastVersion = version;
const data = await buildData(version);

try {
return await prisma.workerDeployment.create({
data: { ...data, environmentId, version },
});
} catch (error) {
if (!isUniqueConstraintError(error, ["environmentId", "version"])) {
throw error;
}

lastError = error;
logger.warn("Worker deployment version collided, retrying", {
environmentId,
attempt: attempt + 1,
maxRetries,
attemptedVersion: version,
});

// Randomised backoff so N concurrent racers don't loop in lockstep into the
// same collision again.
const delay = jitterMs.min + Math.random() * (jitterMs.max - jitterMs.min);
await sleep(delay);
}
}

throw new DeploymentVersionCollisionError({
environmentId,
attempts: maxRetries + 1,
lastAttemptedVersion: lastVersion,
cause: lastError,
});
}
Loading
Loading