Skip to content

Commit e8f635b

Browse files
authored
Merge branch 'main' into christiaan/minimax-free
2 parents 935f090 + a443127 commit e8f635b

43 files changed

Lines changed: 3748 additions & 228 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

cloudflare-gastown/src/dos/Town.do.ts

Lines changed: 64 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -240,7 +240,11 @@ export class TownDO extends DurableObject<Env> {
240240
}
241241

242242
private emitEvent(data: Omit<GastownEventData, 'userId' | 'delivery'>): void {
243-
writeEvent(this.env, { ...data, delivery: 'internal', userId: this._ownerUserId });
243+
writeEvent(this.env, {
244+
...data,
245+
delivery: 'internal',
246+
userId: this._ownerUserId,
247+
});
244248
}
245249

246250
/** Build the context object used by the scheduling sub-module. */
@@ -290,7 +294,9 @@ export class TownDO extends DurableObject<Env> {
290294
});
291295
}
292296

293-
return scheduling.dispatchAgent(schedulingCtx, agent, bead, { systemPromptOverride });
297+
return scheduling.dispatchAgent(schedulingCtx, agent, bead, {
298+
systemPromptOverride,
299+
});
294300
},
295301
stopAgent: async agentId => {
296302
await dispatch.stopAgentInContainer(this.env, this.townId, agentId);
@@ -677,7 +683,9 @@ export class TownDO extends DurableObject<Env> {
677683
const townConfig = await this.getTownConfig();
678684
if (!townConfig.kilocode_token || townConfig.kilocode_token !== rigConfig.kilocodeToken) {
679685
console.log(`${TOWN_LOG} configureRig: propagating kilocodeToken to town config`);
680-
await this.updateTownConfig({ kilocode_token: rigConfig.kilocodeToken });
686+
await this.updateTownConfig({
687+
kilocode_token: rigConfig.kilocodeToken,
688+
});
681689
}
682690
}
683691

@@ -1214,10 +1222,14 @@ export class TownDO extends DurableObject<Env> {
12141222
* Return undelivered, non-expired nudges for an agent.
12151223
* Urgent nudges are returned first, then FIFO within same priority.
12161224
*/
1217-
async getPendingNudges(
1218-
agentId: string
1219-
): Promise<
1220-
{ nudge_id: string; message: string; mode: string; priority: string; source: string }[]
1225+
async getPendingNudges(agentId: string): Promise<
1226+
{
1227+
nudge_id: string;
1228+
message: string;
1229+
mode: string;
1230+
priority: string;
1231+
source: string;
1232+
}[]
12211233
> {
12221234
const rows = [
12231235
...query(
@@ -1722,6 +1734,14 @@ export class TownDO extends DurableObject<Env> {
17221734
return reviewQueue.advanceMoleculeStep(this.sql, agentId, summary);
17231735
}
17241736

1737+
async getMergeQueueData(params: {
1738+
rigId?: string;
1739+
limit?: number;
1740+
since?: string;
1741+
}): Promise<reviewQueue.MergeQueueData> {
1742+
return reviewQueue.getMergeQueueData(this.sql, params);
1743+
}
1744+
17251745
// ══════════════════════════════════════════════════════════════════
17261746
// Atomic Sling (create bead + agent + hook)
17271747
// ══════════════════════════════════════════════════════════════════
@@ -1768,7 +1788,12 @@ export class TownDO extends DurableObject<Env> {
17681788

17691789
/** Build the rig list for mayor agent startup (browse worktree setup on fresh containers). */
17701790
private async rigListForMayor(): Promise<
1771-
Array<{ rigId: string; gitUrl: string; defaultBranch: string; platformIntegrationId?: string }>
1791+
Array<{
1792+
rigId: string;
1793+
gitUrl: string;
1794+
defaultBranch: string;
1795+
platformIntegrationId?: string;
1796+
}>
17721797
> {
17731798
const rigRecords = rigs.listRigs(this.sql);
17741799
return Promise.all(
@@ -1792,7 +1817,10 @@ export class TownDO extends DurableObject<Env> {
17921817
message: string,
17931818
_model?: string,
17941819
uiContext?: string
1795-
): Promise<{ agentId: string; sessionStatus: 'idle' | 'active' | 'starting' }> {
1820+
): Promise<{
1821+
agentId: string;
1822+
sessionStatus: 'idle' | 'active' | 'starting';
1823+
}> {
17961824
const townId = this.townId;
17971825

17981826
let mayor = agents.listAgents(this.sql, { role: 'mayor' })[0] ?? null;
@@ -1876,7 +1904,10 @@ export class TownDO extends DurableObject<Env> {
18761904
* Called eagerly on page load so the terminal is available immediately
18771905
* without requiring the user to send a message first.
18781906
*/
1879-
async ensureMayor(): Promise<{ agentId: string; sessionStatus: 'idle' | 'active' | 'starting' }> {
1907+
async ensureMayor(): Promise<{
1908+
agentId: string;
1909+
sessionStatus: 'idle' | 'active' | 'starting';
1910+
}> {
18801911
const townId = this.townId;
18811912

18821913
let mayor = agents.listAgents(this.sql, { role: 'mayor' })[0] ?? null;
@@ -2250,7 +2281,10 @@ export class TownDO extends DurableObject<Env> {
22502281
tasks: Array<{ title: string; body?: string; depends_on?: number[] }>;
22512282
merge_mode?: 'review-then-land' | 'review-and-merge';
22522283
staged?: boolean;
2253-
}): Promise<{ convoy: ConvoyEntry; beads: Array<{ bead: Bead; agent: Agent | null }> }> {
2284+
}): Promise<{
2285+
convoy: ConvoyEntry;
2286+
beads: Array<{ bead: Bead; agent: Agent | null }>;
2287+
}> {
22542288
// Resolve staged: explicit request wins, otherwise fall back to town config default.
22552289
const townConfig = await this.getTownConfig();
22562290
const isStaged = input.staged ?? townConfig.staged_convoys_default;
@@ -2449,9 +2483,10 @@ export class TownDO extends DurableObject<Env> {
24492483
/**
24502484
* Transition a staged convoy to active: hook agents and begin dispatch.
24512485
*/
2452-
async startConvoy(
2453-
convoyId: string
2454-
): Promise<{ convoy: ConvoyEntry; beads: Array<{ bead: Bead; agent: Agent | null }> }> {
2486+
async startConvoy(convoyId: string): Promise<{
2487+
convoy: ConvoyEntry;
2488+
beads: Array<{ bead: Bead; agent: Agent | null }>;
2489+
}> {
24552490
const convoy = this.getConvoy(convoyId);
24562491
if (!convoy) throw new Error(`Convoy not found: ${convoyId}`);
24572492
if (!convoy.staged) throw new Error(`Convoy is not staged: ${convoyId}`);
@@ -2994,9 +3029,14 @@ export class TownDO extends DurableObject<Env> {
29943029
const violations = reconciler.checkInvariants(this.sql);
29953030
metrics.invariantViolations = violations.length;
29963031
if (violations.length > 0) {
2997-
console.error(
2998-
`${TOWN_LOG} [reconciler:invariants] town=${townId} ${violations.length} violation(s): ${JSON.stringify(violations)}`
2999-
);
3032+
// Emit as an analytics event for observability dashboards instead
3033+
// of console.error (which spams Workers logs every 5s per town).
3034+
this.emitEvent({
3035+
event: 'reconciler.invariant_violations',
3036+
townId,
3037+
label: violations.map(v => `[${v.invariant}] ${v.message}`).join('; '),
3038+
value: violations.length,
3039+
});
30003040
}
30013041
} catch (err) {
30023042
console.warn(`${TOWN_LOG} [reconciler:invariants] town=${townId} check failed`, err);
@@ -3575,7 +3615,13 @@ export class TownDO extends DurableObject<Env> {
35753615
[]
35763616
),
35773617
];
3578-
const beadCounts = { open: 0, inProgress: 0, inReview: 0, failed: 0, triageRequests: 0 };
3618+
const beadCounts = {
3619+
open: 0,
3620+
inProgress: 0,
3621+
inReview: 0,
3622+
failed: 0,
3623+
triageRequests: 0,
3624+
};
35793625
for (const row of beadRows) {
35803626
const s = `${row.status as string}`;
35813627
const c = Number(row.cnt);

cloudflare-gastown/src/dos/town/actions.ts

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -576,7 +576,11 @@ export function applyAction(ctx: ApplyActionContext, action: Action): (() => Pro
576576
}
577577

578578
case 'send_nudge': {
579-
// Insert nudge record synchronously
579+
// Insert nudge record synchronously.
580+
// Explicitly set created_at to ISO 8601 so it matches the format used
581+
// by hasRecentNudge's cutoff comparison (#1412). SQLite's default
582+
// datetime('now') produces 'YYYY-MM-DD HH:MM:SS' (space separator)
583+
// which compares incorrectly against JS toISOString().
580584
const nudgeId = crypto.randomUUID();
581585
query(
582586
sql,
@@ -588,10 +592,18 @@ export function applyAction(ctx: ApplyActionContext, action: Action): (() => Pro
588592
${agent_nudges.columns.mode},
589593
${agent_nudges.columns.priority},
590594
${agent_nudges.columns.source},
595+
${agent_nudges.columns.created_at},
591596
${agent_nudges.columns.expires_at}
592-
) VALUES (?, ?, ?, 'immediate', 'urgent', ?, ?)
597+
) VALUES (?, ?, ?, 'immediate', 'urgent', ?, ?, ?)
593598
`,
594-
[nudgeId, action.agent_id, action.message, `reconciler:${action.tier}`, null]
599+
[
600+
nudgeId,
601+
action.agent_id,
602+
action.message,
603+
`reconciler:${action.tier}`,
604+
new Date().toISOString(),
605+
null,
606+
]
595607
);
596608

597609
return async () => {

cloudflare-gastown/src/dos/town/agents.ts

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -546,11 +546,20 @@ export function touchAgent(
546546
activeTools?: string[];
547547
}
548548
): void {
549+
// A heartbeat is proof the agent is alive in the container.
550+
// If the agent's status is 'idle' (e.g. due to a dispatch timeout
551+
// race — see #1358), restore it to 'working'. This prevents the
552+
// reconciler from treating the agent as lost while it's actively
553+
// sending heartbeats.
549554
query(
550555
sql,
551556
/* sql */ `
552557
UPDATE ${agent_metadata}
553558
SET ${agent_metadata.columns.last_activity_at} = ?,
559+
${agent_metadata.columns.status} = CASE
560+
WHEN ${agent_metadata.columns.status} = 'idle' THEN 'working'
561+
ELSE ${agent_metadata.columns.status}
562+
END,
554563
${agent_metadata.columns.last_event_type} = COALESCE(?, ${agent_metadata.columns.last_event_type}),
555564
${agent_metadata.columns.last_event_at} = COALESCE(?, ${agent_metadata.columns.last_event_at}),
556565
${agent_metadata.columns.active_tools} = COALESCE(?, ${agent_metadata.columns.active_tools})

cloudflare-gastown/src/dos/town/reconciler.ts

Lines changed: 48 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -249,7 +249,18 @@ export function applyEvent(sql: SqlStorage, event: TownEventRecord): void {
249249
const agent = agents.getAgent(sql, event.agent_id);
250250
if (!agent) return;
251251

252-
// Only act on working/stalled agents whose container has stopped
252+
// Only act on working/stalled agents whose container has stopped.
253+
// For 'not_found': skip if the agent was dispatched recently (#1358).
254+
// During a cold start the container may 404 on /agents/:id/status
255+
// because the agent hasn't registered in the process manager yet.
256+
// The 3-minute grace period covers the 60s HTTP timeout plus
257+
// typical cold start time (git clone + worktree). Truly dead
258+
// agents are caught by reconcileAgents after 90s of no heartbeats.
259+
if (containerStatus === 'not_found' && agent.last_activity_at) {
260+
const ageSec = (Date.now() - new Date(agent.last_activity_at).getTime()) / 1000;
261+
if (ageSec < 180) return; // 3-minute grace for cold starts
262+
}
263+
253264
if (
254265
(agent.status === 'working' || agent.status === 'stalled') &&
255266
(containerStatus === 'exited' || containerStatus === 'not_found')
@@ -340,6 +351,9 @@ export function reconcileAgents(sql: SqlStorage): Action[] {
340351
]);
341352

342353
for (const agent of workingAgents) {
354+
// Mayors are always working with no hook — skip them
355+
if (agent.role === 'mayor') continue;
356+
343357
if (!agent.last_activity_at) {
344358
// No heartbeat ever received — container may have failed to start
345359
actions.push({
@@ -357,6 +371,18 @@ export function reconcileAgents(sql: SqlStorage): Action[] {
357371
to: 'idle',
358372
reason: 'heartbeat lost (3 missed cycles)',
359373
});
374+
} else if (!agent.current_hook_bead_id) {
375+
// Agent is working with fresh heartbeat but no hook — it's running
376+
// in the container but has no bead to work on (gt_done already ran,
377+
// or the hook was cleared by another code path). Set to idle so
378+
// processReviewQueue / schedulePendingWork can use it.
379+
actions.push({
380+
type: 'transition_agent',
381+
agent_id: agent.bead_id,
382+
from: 'working',
383+
to: 'idle',
384+
reason: 'working agent has no hook (gt_done already completed)',
385+
});
360386
}
361387
}
362388

@@ -599,18 +625,24 @@ export function reconcileBeads(sql: SqlStorage): Action[] {
599625
for (const bead of staleInProgress) {
600626
if (!staleMs(bead.updated_at, STALE_IN_PROGRESS_TIMEOUT_MS)) continue;
601627

602-
// Check if any agent is hooked AND working/stalled
628+
// Check if any agent is hooked AND (working/stalled OR has a recent
629+
// heartbeat). The heartbeat check is defense-in-depth for #1358: if
630+
// the agent's status is wrong (e.g. stuck on 'idle' due to a dispatch
631+
// timeout race), a fresh heartbeat proves the agent is alive.
603632
const hookedAgent = z
604-
.object({ status: z.string() })
633+
.object({ status: z.string(), last_activity_at: z.string().nullable() })
605634
.array()
606635
.parse([
607636
...query(
608637
sql,
609638
/* sql */ `
610-
SELECT ${agent_metadata.status}
639+
SELECT ${agent_metadata.status}, ${agent_metadata.last_activity_at}
611640
FROM ${agent_metadata}
612641
WHERE ${agent_metadata.current_hook_bead_id} = ?
613-
AND ${agent_metadata.status} IN ('working', 'stalled')
642+
AND (
643+
${agent_metadata.status} IN ('working', 'stalled')
644+
OR ${agent_metadata.last_activity_at} > strftime('%Y-%m-%dT%H:%M:%fZ', 'now', '-90 seconds')
645+
)
614646
`,
615647
[bead.bead_id]
616648
),
@@ -991,7 +1023,11 @@ export function reconcileReviewQueue(sql: SqlStorage): Action[] {
9911023
}
9921024

9931025
const mrRows = z
994-
.object({ status: z.string(), type: z.string(), rig_id: z.string().nullable() })
1026+
.object({
1027+
status: z.string(),
1028+
type: z.string(),
1029+
rig_id: z.string().nullable(),
1030+
})
9951031
.array()
9961032
.parse([
9971033
...query(
@@ -1231,6 +1267,7 @@ export function reconcileGUPP(sql: SqlStorage): Action[] {
12311267
FROM ${agent_metadata}
12321268
LEFT JOIN ${beads} b ON b.${beads.columns.bead_id} = ${agent_metadata.bead_id}
12331269
WHERE ${agent_metadata.status} IN ('working', 'stalled')
1270+
AND ${agent_metadata.role} != 'mayor'
12341271
`,
12351272
[]
12361273
),
@@ -1420,18 +1457,20 @@ function getIdleAgentHookedTo(sql: SqlStorage, beadId: string): string | null {
14201457
function hasRecentNudge(sql: SqlStorage, agentId: string, tier: string): boolean {
14211458
// Check if a nudge with this exact tier source was created in the last 60 min.
14221459
// The source is set to `reconciler:${tier}` by applyAction('send_nudge').
1423-
const cutoff = new Date(Date.now() - 60 * 60_000).toISOString();
1460+
// Use SQLite's datetime() for the cutoff so the comparison works regardless
1461+
// of whether created_at was stored in SQLite's native 'YYYY-MM-DD HH:MM:SS'
1462+
// format (old rows) or ISO 8601 'YYYY-MM-DDTHH:MM:SS.000Z' (new rows).
14241463
const rows = [
14251464
...query(
14261465
sql,
14271466
/* sql */ `
14281467
SELECT 1 FROM ${agent_nudges}
14291468
WHERE ${agent_nudges.agent_bead_id} = ?
14301469
AND ${agent_nudges.source} = ?
1431-
AND ${agent_nudges.created_at} > ?
1470+
AND datetime(${agent_nudges.created_at}) > datetime('now', '-60 minutes')
14321471
LIMIT 1
14331472
`,
1434-
[agentId, `reconciler:${tier}`, cutoff]
1473+
[agentId, `reconciler:${tier}`]
14351474
),
14361475
];
14371476
return rows.length > 0;

0 commit comments

Comments
 (0)