Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 76 additions & 0 deletions packages/config/local/cron-jobs.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,12 @@ import {
clearCronJobsForTests,
closeCronJobDatabaseForTests,
createCronJob,
disableCronJob,
getCronJobById,
markCronJobCompleted,
markCronJobFailed,
markCronJobRunning,
patchCronJob,
reconcileInterruptedCronJobs,
} from "./cron-jobs";

Expand Down Expand Up @@ -202,3 +204,77 @@ describe("reconcileInterruptedCronJobs", () => {
expect(getCronJobById(job.id)?.lastRunStatus).toBe("failed");
});
});

describe("disableCronJob", () => {
test("flips enabled to false and reports the transition", () => {
const job = createCronJob({
title: "to-disable",
cronExpression: "*/5 * * * *",
channelId: "C_TEST",
messageText: "hi",
});
expect(getCronJobById(job.id)?.enabled).toBe(true);

const changed = disableCronJob(job.id);
expect(changed).toBe(true);
expect(getCronJobById(job.id)?.enabled).toBe(false);
});

test("returns false when the row is already disabled (idempotent)", () => {
const job = createCronJob({
title: "already-off",
cronExpression: "*/5 * * * *",
channelId: "C_TEST",
messageText: "hi",
enabled: false,
});
const changed = disableCronJob(job.id);
expect(changed).toBe(false);
expect(getCronJobById(job.id)?.enabled).toBe(false);
});

test("returns false when the row does not exist", () => {
expect(disableCronJob("does-not-exist")).toBe(false);
});

test("succeeds even when the row's channel was removed from local config", () => {
// This is the scenario `disableCronJob` exists to solve:
// a cron job points at a channel that has since been removed from
// ode.json. `patchCronJob` would throw via `getChannelSnapshot`, but
// the direct-SQL `disableCronJob` must still flip `enabled` to false
// so the scheduler stops firing the row.
const job = createCronJob({
title: "stale-channel",
cronExpression: "*/5 * * * *",
channelId: "C_TEST",
messageText: "hi",
});

// Drop the channel from the on-disk config so getChannelSnapshot fails.
const emptyConfig = {
user: {},
workspaces: [
{
id: "ws-test",
name: "Test Workspace",
type: "slack",
channelDetails: [], // C_TEST is gone
},
],
};
fs.writeFileSync(ODE_CONFIG_FILE, JSON.stringify(emptyConfig));
invalidateOdeConfigCache();

// Sanity check: patchCronJob blows up because it re-validates the
// channel before writing. This is exactly the bug Codex flagged on
// PR #211 and is the reason we route around it.
expect(() => patchCronJob(job.id, { enabled: false })).toThrow(
/Channel not found/i,
);
expect(getCronJobById(job.id)?.enabled).toBe(true);

// disableCronJob bypasses channel resolution entirely.
expect(disableCronJob(job.id)).toBe(true);
expect(getCronJobById(job.id)?.enabled).toBe(false);
});
});
25 changes: 25 additions & 0 deletions packages/config/local/cron-jobs.ts
Original file line number Diff line number Diff line change
Expand Up @@ -384,6 +384,31 @@ export function patchCronJob(id: string, params: PatchCronJobParams): CronJobRec
return updateCronJob(id, merged);
}

/**
* Directly flip `enabled` to 0 without re-validating the rest of the cron
* row. Unlike `patchCronJob`, this does NOT re-resolve the row's channel
* via `getChannelSnapshot`, so it is safe to call when the destination
* channel was removed from the local config — which is exactly the
* scenario that motivates auto-disable in the first place (bot kicked,
* workspace re-onboarded, channel id stale, etc.).
*
* Returns `true` if a row actually transitioned from enabled→disabled, so
* callers can avoid duplicate log/notification work if another writer beat
* them to it. A missing row returns `false` as well.
*/
export function disableCronJob(id: string): boolean {
const db = getDatabase();
const result = db.query(`
UPDATE cron_jobs
SET
enabled = 0,
updated_at = ?
WHERE id = ?
AND enabled = 1
`).run(Date.now(), id);
return result.changes > 0;
}

export function markCronJobTriggered(id: string, minuteStartMs: number): boolean {
const db = getDatabase();
const result = db.query(`
Expand Down
98 changes: 97 additions & 1 deletion packages/core/cron/scheduler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import {
} from "@/config";
import {
type CronJobRecord,
disableCronJob,
getCronJobById,
listEnabledCronJobs,
markCronJobCompleted,
Expand Down Expand Up @@ -37,6 +38,7 @@ import { sendSlackChannelMessage } from "@/core/runtime/slack-senders";
import { buildSessionEnvironment, prepareSessionWorkspace } from "@/core/session";
import { sendChannelMessage as sendDiscordChannelMessage } from "@/ims/discord/client";
import { sendChannelMessage as sendLarkChannelMessage } from "@/ims/lark/client";
import { isPermanentChannelError } from "@/shared/delivery/permanent-error";
import { log } from "@/utils";

const CRON_POLL_INTERVAL_MS = 15_000;
Expand Down Expand Up @@ -265,6 +267,57 @@ async function prepareCronSession(job: CronJobRecord, runId: string): Promise<{
return { session, sessionId, cwd, created };
}

/**
* Auto-disable a cron job whose destination channel is permanently
* unreachable (bot kicked, channel archived, token revoked, etc.). Without
* this, every cron tick would re-attempt the same send and capture an
* identical Sentry event for the lifetime of the daemon — see
* `isPermanentChannelError` for the precise classification.
*
* Best-effort: we never let bookkeeping failures here shadow the original
* delivery error. The function does NOT throw.
*/
async function disableCronJobForPermanentChannelError(
job: CronJobRecord,
error: unknown,
agentResultDetailId: string | null,
): Promise<void> {
const reason = error instanceof Error ? error.message : String(error);
const summary = `auto-disabled: destination channel unreachable (${reason})`;
if (agentResultDetailId) {
try {
failAgentResult({ detailId: agentResultDetailId, errorText: summary });
} catch (failError) {
log.warn("Failed to mark cron agent_result detail as failed during auto-disable", {
detailId: agentResultDetailId,
error: String(failError),
});
}
}
try {
disableCronJob(job.id);
} catch (disableError) {
log.warn("Failed to disable cron job after permanent channel error", {
cronJobId: job.id,
error: String(disableError),
});
}
try {
markCronJobFailed(job.id, summary);
} catch (markError) {
log.warn("Failed to mark auto-disabled cron job as failed", {
cronJobId: job.id,
error: String(markError),
});
}
log.warn("Auto-disabled cron job: destination channel unreachable", {
cronJobId: job.id,
title: job.title,
channelId: job.channelId,
error: reason,
});
}

async function runCronJob(job: CronJobRecord, minuteStartMs: number): Promise<void> {
const agent = createAgentAdapter();
const runId = getCronRunId(minuteStartMs);
Expand Down Expand Up @@ -350,7 +403,22 @@ async function runCronJob(job: CronJobRecord, minuteStartMs: number): Promise<vo
);
const finalText = buildFinalResponseText(responses) ?? "_Done_";

const realThreadId = await sendResultToChannel(job, finalText);
let realThreadId: string | undefined;
try {
realThreadId = await sendResultToChannel(job, finalText);

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Treat undefined sends as delivery failures

When a Lark cron job's channel/workspace credentials have been removed, sendLarkChannelMessage returns undefined instead of throwing (packages/ims/lark/client.ts:383-384), so this await falls through the new permanent-error handling and the run is later marked completed while the job remains enabled and undelivered. Fresh evidence after the stale-channel fix is that the direct-SQL disable helper is never reached for this no-credentials Lark path because no error is raised.

Useful? React with 👍 / 👎.

} catch (sendError) {
// The agent turn succeeded, but we can't deliver the result. If the
// channel is permanently unreachable (bot kicked, channel archived,
// token revoked), auto-disable the job so the scheduler stops
// generating identical Sentry events on every tick. Transient send
// failures fall through to the generic error handler below and the
// job stays enabled for the next tick.
if (isPermanentChannelError(sendError)) {
await disableCronJobForPermanentChannelError(job, sendError, agentResultDetailId);
return;
}
throw sendError;
}
if (realThreadId) {
seedCronChannelThreadSession({
platform: job.platform,
Expand Down Expand Up @@ -409,6 +477,34 @@ async function runCronJob(job: CronJobRecord, minuteStartMs: number): Promise<vo
const failureText = `*Cron job failed:* ${job.title}\n${message}`;
await sendResultToChannel(job, failureText);
} catch (notifyError) {
// If the failure-notification itself hits a permanent channel error,
// the destination is the real problem (not the agent turn). Auto-
// disable so we don't spam the same channel on every tick. We
// intentionally check the *notify* error here, not the original
// `error`, because the original could be a transient agent timeout
// while the channel is still healthy.
if (isPermanentChannelError(notifyError)) {
try {
disableCronJob(job.id);
markCronJobFailed(
job.id,
`auto-disabled: channel unreachable (${message})`,
);
log.warn("Auto-disabled cron job: destination channel unreachable", {
cronJobId: job.id,
title: job.title,
channelId: job.channelId,
originalError: message,
sendError: String(notifyError),
});
} catch (disableError) {
log.warn("Failed to auto-disable cron job after permanent channel error", {
cronJobId: job.id,
error: String(disableError),
});
}
return;
}
log.warn("Failed to send cron job failure notification", {
cronJobId: job.id,
error: String(notifyError),
Expand Down
76 changes: 75 additions & 1 deletion packages/ims/discord/client.ts
Original file line number Diff line number Diff line change
Expand Up @@ -117,16 +117,52 @@ async function maybeHandleLauncherCommand(params: {
}
async function resolveTextChannel(channelId: string, processorId?: string) {
const attempts: string[] = [];
// Collect Discord API error codes across every fetch attempt so the
// caller can detect permanent-access failures (Unknown Channel / Missing
// Access / etc.) even though we re-throw a generic wrapper Error below.
// discord.js surfaces these on `err.code` as numbers; we forward them on
// the wrapper as `discordErrorCodes` and also expose the first as `code`
// so `isPermanentChannelError` can pick it up via the `code` shape.
//
// The pinned bot's code is tracked separately: when `processorId` is
// provided we know which bot actually owns the channel, so its result
// is authoritative and unrelated bots' "permanent" errors (e.g. another
// workspace's bot legitimately returning `Missing Access` for a channel
// it cannot see) MUST NOT promote the wrapper to permanent. See PR #211
// discussion: "Avoid disabling Discord jobs on mixed fetch failures".
const discordErrorCodes: number[] = [];
let pinnedBotErrorCode: number | undefined;
let pinnedBotAttempted = false;
// Count every fetch attempt that ended in a thrown error — not just the
// subset that carried a numeric `err.code`. The wrapper's
// `isPermanentChannelError` classification must NOT promote to permanent
// when any attempt failed transiently (e.g. ECONNRESET, generic network
// error), because that bot might succeed on retry. See PR #211 discussion:
// "Require every Discord fetch attempt to be permanent".
let totalFailedAttempts = 0;
const captureCode = (error: unknown, isPinnedBot: boolean) => {
totalFailedAttempts += 1;
if (typeof error === "object" && error !== null) {
const code = (error as { code?: unknown }).code;
if (typeof code === "number") {
discordErrorCodes.push(code);
if (isPinnedBot) pinnedBotErrorCode = code;
}
}
};

if (processorId) {
const pinnedClient = discordClientByProcessorId.get(processorId);
if (pinnedClient) {
pinnedBotAttempted = true;
try {
const channel = await pinnedClient.channels.fetch(channelId);
if (channel && channel.isTextBased()) {
return channel as any;
}
attempts.push(`bot=${pinnedClient.user?.id || "unknown"}: channel_not_text_or_missing`);
} catch (error) {
captureCode(error, true);
const errorMessage = error instanceof Error ? error.message : String(error);
attempts.push(`bot=${pinnedClient.user?.id || "unknown"}: ${errorMessage}`);
}
Expand All @@ -142,6 +178,7 @@ async function resolveTextChannel(channelId: string, processorId?: string) {
}
attempts.push(`bot=${client.user?.id || "unknown"}: channel_not_text_or_missing`);
} catch (error) {
captureCode(error, false);
const errorMessage = error instanceof Error ? error.message : String(error);
attempts.push(`bot=${client.user?.id || "unknown"}: ${errorMessage}`);
}
Expand All @@ -155,7 +192,44 @@ async function resolveTextChannel(channelId: string, processorId?: string) {
});
}

throw new Error(`Discord channel ${channelId} is not text-based or inaccessible`);
const wrapper = new Error(`Discord channel ${channelId} is not text-based or inaccessible`);
if (discordErrorCodes.length > 0) {
const PERMANENT_PRIORITY = new Set([10003, 50001, 50013, 50007]);

// Pick which code to forward as the `code` field consumed by
// `isPermanentChannelError`. Rules:
// 1. If a pinned bot was attempted, its outcome is authoritative —
// forward its code (if any). Other bots' permanent codes for the
// same channel are unrelated noise.
// 2. If no pinned bot was attempted (e.g. cron top-level send with
// no processorId), only treat the failure as permanent when every
// failed attempt carried a permanent code. A transient failure
// (ECONNRESET, network blip) without a numeric `code` would not
// appear in `discordErrorCodes`, so counting only those would let
// an unrelated bot's permanent code promote the wrapper even though
// a retry might succeed. We therefore also require that the number
// of captured permanent codes equals the number of failed attempts.
let forwardedCode: number | undefined;
if (pinnedBotAttempted) {
forwardedCode = pinnedBotErrorCode;
} else {
const permanentCodes = discordErrorCodes.filter((c) => PERMANENT_PRIORITY.has(c));
const allAttemptsPermanent =
permanentCodes.length === totalFailedAttempts && totalFailedAttempts > 0;
if (allAttemptsPermanent) {
forwardedCode = permanentCodes[0];
} else {
forwardedCode = discordErrorCodes[0];
}
}

if (forwardedCode !== undefined) {
(wrapper as Error & { code?: number; discordErrorCodes?: number[] }).code = forwardedCode;
}
(wrapper as Error & { code?: number; discordErrorCodes?: number[] }).discordErrorCodes =
[...discordErrorCodes];
}
throw wrapper;
}

async function buildDiscordContext(
Expand Down
Loading
Loading