Skip to content

Commit be905be

Browse files
authored
Merge pull request #1214 from getlarge/issue-1176-make-submit-tool-call-a
[codex] Make submit output part of the promise contract
2 parents e187543 + 12b282a commit be905be

14 files changed

Lines changed: 305 additions & 25 deletions

File tree

apps/agent-daemon/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -289,7 +289,7 @@ pnpm exec tsx tools/src/tasks/create-task.ts \
289289
> structured `FulfillBriefOutput` JSON
290290
> (`{ branch, commits, pullRequestUrl, diaryEntryIds, summary }`) as its
291291
> final message. A "just reply 'ok'" brief, however short, fails validation
292-
> with `output_missing` even when the runtime worked correctly. Pick a task
292+
> with `submit_output_missing` even when the runtime worked correctly. Pick a task
293293
> that fits the shape.
294294
295295
Watch the daemon logs and the diary:

apps/agent-daemon/e2e/daemon.e2e.test.ts

Lines changed: 41 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,21 @@ const silentLogger: AgentRuntimeLogger = {
5252
child: () => silentLogger,
5353
};
5454

55+
function buildProducerVerification(inputCid: string) {
56+
return {
57+
inputCid,
58+
results: [
59+
{
60+
id: 'submit-output',
61+
kind: 'gate' as const,
62+
status: 'pass' as const,
63+
detail: 'submit tool criterion satisfied in daemon e2e stub',
64+
},
65+
],
66+
passed: true,
67+
};
68+
}
69+
5570
/**
5671
* The realistic local-daemon scenario is "one agent, one team, one
5772
* daemon" — the same agent imposes a task and runs the daemon that
@@ -317,6 +332,7 @@ describe('Agent daemon (e2e)', () => {
317332
recipeParams: {},
318333
summary:
319334
'e2e stub curation summary, two sentences satisfy minLength.',
335+
verification: buildProducerVerification(claimedTask.task.inputCid),
320336
};
321337
const output = {
322338
taskId: claimedTask.task.id,
@@ -975,11 +991,31 @@ async function buildStubbedTaskOutput(
975991
totalTokens: 10,
976992
durationMs: 100,
977993
traceparent: '00-aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa-bbbbbbbbbbbbbbbb-01',
978-
verification: {
979-
inputCid: 'bafye2eeval',
980-
results: [],
981-
passed: true,
982-
},
994+
verification: buildProducerVerification(claimedTask.task.inputCid),
995+
};
996+
case 'fulfill_brief':
997+
return {
998+
branch: executionPlan.worktreeBranch ?? 'feat/daemon-e2e',
999+
commits: [],
1000+
pullRequestUrl: null,
1001+
diaryEntryIds: [],
1002+
summary: `stubbed daemon slot e2e output for ${claimedTask.task.id}`,
1003+
verification: buildProducerVerification(claimedTask.task.inputCid),
1004+
};
1005+
case 'curate_pack':
1006+
return {
1007+
packId: '00000000-0000-4000-8000-000000000001',
1008+
packCid: 'bafyreidlnv7nu7y4kdxkxv5e2onbpoq5o3i6gw7r6xkk7d3w5b3xrylkqe',
1009+
entries: [
1010+
{
1011+
entryId: '00000000-0000-4000-8000-000000000002',
1012+
rank: 1,
1013+
rationale: 'e2e stub entry',
1014+
},
1015+
],
1016+
recipeParams: {},
1017+
summary: `stubbed daemon slot e2e output for ${claimedTask.task.id}`,
1018+
verification: buildProducerVerification(claimedTask.task.inputCid),
9831019
};
9841020
case 'judge_eval_attempt':
9851021
return {

apps/rest-api/e2e/tasks.e2e.test.ts

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,21 @@ describe('Tasks API', () => {
145145
return { executorFingerprint, executorSignature };
146146
}
147147

148+
function buildProducerVerification(inputCid = 'bafy-e2e-input') {
149+
return {
150+
inputCid,
151+
results: [
152+
{
153+
id: 'submit-output',
154+
kind: 'gate' as const,
155+
status: 'pass' as const,
156+
detail: 'submit tool criterion satisfied in e2e fixture',
157+
},
158+
],
159+
passed: true,
160+
};
161+
}
162+
148163
// ── Auth ─────────────────────────────────────────────────────────────────────
149164

150165
describe('auth', () => {
@@ -531,6 +546,7 @@ describe('Tasks API', () => {
531546
],
532547
recipeParams: { recipe: 'topic-focused-v1' },
533548
summary: 'Created a pack receipt for the curated diary entries.',
549+
verification: buildProducerVerification(),
534550
};
535551
const outputCid = await computeJsonCid(output);
536552

@@ -726,6 +742,7 @@ describe('Tasks API', () => {
726742
],
727743
recipeParams: { recipe: 'topic-focused-v1' },
728744
summary: 'heartbeat-then-complete should succeed',
745+
verification: buildProducerVerification(),
729746
};
730747
const outputCid = await computeJsonCid(output);
731748

@@ -843,6 +860,7 @@ describe('Tasks API', () => {
843860
],
844861
recipeParams: { recipe: 'executor-trust-v1' },
845862
summary: 'Completed with signed executor manifest.',
863+
verification: buildProducerVerification(),
846864
};
847865
const outputCid = await computeJsonCid(output);
848866
const completeAttestation = await signedExecutorComplete(

docs/understand/agent-runtime.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,15 @@ A task is a small JSON document in a diary-scoped queue that says "someone wants
1616

1717
Every task lives inside a diary. Whoever can read the diary can see the task; whoever can write the diary can claim it. Pack-like artifacts (rendered packs, context packs) flow through the same queue as judgments and reviews — the type is how you tell them apart.
1818

19+
For producer-style task types (`fulfill_brief`, `curate_pack`, `render_pack`,
20+
`run_eval`), the server normalizes the stored `input` before computing the
21+
task's `inputCid`. If the caller did not provide `input.successCriteria`, the
22+
server creates it and injects a built-in `submit-output` gate. That gate says,
23+
in effect: "call `submit_<task_type>_output` exactly once with valid structured
24+
output." This matters because the submit-tool call is part of the promise body,
25+
not an executor-only implementation detail. The stored input, the prompt the
26+
claimant reads, and the later audit trail all describe the same contract.
27+
1928
### Imposer vs claimant boundary
2029

2130
The runtime model depends on keeping the two roles cleanly separated.
@@ -195,6 +204,10 @@ The guarantees are worth naming, because they shape everything else:
195204

196205
- **Claims are agent-initiated.** The queue never pushes. Agents that want work call `claim()`; agents that don't, don't. `task.claim` requires a Keto permit — capability without obligation.
197206
- **Promises are content-addressed.** The imposer's brief is pinned by an `input_cid`; the claimant's output is pinned by an `output_cid` and optionally signed. Both sides have cryptographic proof of what was promised and what was delivered.
207+
- **Basic completion gates live inside the promise.** For producer task types,
208+
"did I submit the structured output?" is represented as a built-in
209+
`successCriteria.gates[]` item, so the claimant self-assesses it like any
210+
other criterion instead of the substrate pretending it can coerce the action.
198211
- **Abandonment is benign.** A crashed or timed-out claimant loses the lease; the task returns to the queue. Nothing is recorded as a failure on the agent's identity — the promise simply wasn't kept, and someone else can pick it up.
199212
- **Cancellation is asymmetric.** The claimant can walk away (withdraw consent to finish); a diary writer can also take the task back (withdraw the offer). Both are state transitions, not blame.
200213
- **The runtime has no retry logic.** Retries happen at the queue level, as fresh claims by whoever's next. There's no catching and re-dispatching inside the executor — one attempt, one outcome, the workflow decides what's next.

libs/agent-runtime/src/prompts/final-output.test.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,7 @@ describe('buildFinalOutputBlock', () => {
167167
expect(block).toMatch(/submit_fulfill_brief_output/);
168168
expect(block).toMatch(/FulfillBriefOutput/);
169169
expect(block).toMatch(/Do NOT emit the output as plain assistant text/);
170+
expect(block).toMatch(/promised submit-output criterion/);
170171
expect(block).not.toMatch(/Fallback/);
171172
expect(block).not.toMatch(/single JSON object matching/);
172173
});

libs/agent-runtime/src/prompts/final-output.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,8 @@ export function buildFinalOutputBlock(opts: FinalOutputBlockOptions): string {
4848
`The runtime captures the validated arguments and ends the session.`,
4949
`Do NOT emit the output as plain assistant text. Do NOT rely on a`,
5050
`JSON-in-message fallback. If you do not call \`${submitTool}\`, the`,
51-
`attempt fails even if the underlying work succeeded.`,
51+
`attempt is recorded as failing the promised submit-output criterion`,
52+
`even if the underlying work succeeded.`,
5253
'',
5354
`Your final assistant text before that tool call may explain your work,`,
5455
`but the submit-tool call itself must be your VERY LAST action.`,

libs/agent-runtime/src/prompts/run-eval.test.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ describe('buildRunEvalUserPrompt', () => {
4040
ctx,
4141
);
4242
expect(out).toContain('## Self-verification');
43+
expect(out).toContain('part of the promise you made when you claimed');
4344
expect(out).toContain('`verification` MUST be a JSON object');
4445
expect(out).toContain('Minimal valid example:');
4546
});

libs/agent-runtime/src/prompts/self-verification.ts

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,11 @@ export function buildSelfVerificationBlock(
88
'## Self-verification',
99
'',
1010
`If \`input.${criteriaField}\` is set on this task, your final output MUST`,
11-
'include a `verification` block. **The runtime/server rejects task',
12-
`submission without \`verification\` when \`${criteriaField}\` is present**`,
13-
'the request fails validation and the attempt is discarded, even if the',
14-
'underlying work succeeded. Do not call the submit tool until you have',
15-
'computed the verification payload.',
11+
'include a `verification` block. Treat every item in those criteria as',
12+
'part of the promise you made when you claimed the task. That includes',
13+
'the built-in submit-output gate when present. Do not call the submit',
14+
'tool until you have computed the verification payload you can honestly',
15+
'stand behind.',
1616
'',
1717
`Call \`moltnet_get_task\` with task id \`${taskId}\` and read \`input.${criteriaField}\`.`,
1818
'',

libs/pi-extension/src/runtime/execute-pi-task.ts

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -946,10 +946,10 @@ export async function executePiTask(
946946
}
947947
} else if (submitToolHandle) {
948948
parseError = {
949-
code: 'output_missing',
949+
code: 'submit_output_missing',
950950
message:
951-
'Agent did not submit output through the task submit tool. ' +
952-
'A valid submit tool call is required to complete this task type.',
951+
'Agent did not satisfy the promised submit-output criterion: ' +
952+
'no valid task submit tool call was captured before the session ended.',
953953
};
954954
await emit('error', {
955955
message: parseError.message,

libs/task-service/src/task.service.test.ts

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import { computeJsonCid } from '@moltnet/crypto-service';
12
import type { Task as DbTask, TransactionRunner } from '@moltnet/database';
23
import { initTaskTypeRegistry } from '@moltnet/tasks';
34
import { FormatRegistry } from '@sinclair/typebox';
@@ -363,6 +364,21 @@ function judgeCreateInput() {
363364
};
364365
}
365366

367+
function fulfillCreateInput() {
368+
return {
369+
taskType: 'fulfill_brief',
370+
teamId: TEAM_ID,
371+
diaryId: DIARY_ID,
372+
inputPayload: {
373+
brief: 'Implement the feature.',
374+
title: 'Feature work',
375+
},
376+
callerId: AGENT_ID,
377+
callerNs: 'agent' as const,
378+
callerIsAgent: true,
379+
};
380+
}
381+
366382
beforeAll(async () => {
367383
FormatRegistry.Set('uuid', (v: string) =>
368384
/^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i.test(v),
@@ -488,3 +504,44 @@ describe('createTaskService.create — judge_eval_attempt flow', () => {
488504
);
489505
});
490506
});
507+
508+
describe('createTaskService.create — producer input normalization', () => {
509+
let mocks: Mocks;
510+
let service: ReturnType<typeof createTaskService>;
511+
512+
beforeEach(() => {
513+
mocks = makeMocks();
514+
service = createTaskService(
515+
mocks as unknown as Parameters<typeof createTaskService>[0],
516+
);
517+
});
518+
519+
afterEach(() => {
520+
vi.restoreAllMocks();
521+
});
522+
523+
it('stores normalized producer input and hashes the normalized promise body', async () => {
524+
await service.create(fulfillCreateInput() as never);
525+
526+
expect(mocks.taskRepository.create).toHaveBeenCalledOnce();
527+
const newTask = mocks.taskRepository.create.mock.calls[0][0] as {
528+
input: Record<string, unknown>;
529+
inputCid: string;
530+
};
531+
expect(newTask.input).toMatchObject({
532+
brief: 'Implement the feature.',
533+
title: 'Feature work',
534+
successCriteria: {
535+
version: 1,
536+
gates: [
537+
expect.objectContaining({
538+
id: 'submit-output',
539+
kind: 'submit-tool-call',
540+
required: true,
541+
}),
542+
],
543+
},
544+
});
545+
expect(newTask.inputCid).toBe(await computeJsonCid(newTask.input));
546+
});
547+
});

0 commit comments

Comments
 (0)