Merge pull request #1214 from getlarge/issue-1176-make-submit-tool-call-a

getlarge · web-flow · commit be905bef50fe · 2026-05-22T16:08:39.000+02:00
[codex] Make submit output part of the promise contract
diff --git a/apps/agent-daemon/README.md b/apps/agent-daemon/README.md
@@ -289,7 +289,7 @@ pnpm exec tsx tools/src/tasks/create-task.ts \
 > structured `FulfillBriefOutput` JSON
 > (`{ branch, commits, pullRequestUrl, diaryEntryIds, summary }`) as its
 > final message. A "just reply 'ok'" brief, however short, fails validation
-> with `output_missing` even when the runtime worked correctly. Pick a task
+> with `submit_output_missing` even when the runtime worked correctly. Pick a task
 > that fits the shape.
 
 Watch the daemon logs and the diary:
diff --git a/apps/agent-daemon/e2e/daemon.e2e.test.ts b/apps/agent-daemon/e2e/daemon.e2e.test.ts
@@ -52,6 +52,21 @@ const silentLogger: AgentRuntimeLogger = {
   child: () => silentLogger,
 };
 
+function buildProducerVerification(inputCid: string) {
+  return {
+    inputCid,
+    results: [
+      {
+        id: 'submit-output',
+        kind: 'gate' as const,
+        status: 'pass' as const,
+        detail: 'submit tool criterion satisfied in daemon e2e stub',
+      },
+    ],
+    passed: true,
+  };
+}
+
 /**
  * The realistic local-daemon scenario is "one agent, one team, one
  * daemon" — the same agent imposes a task and runs the daemon that
@@ -317,6 +332,7 @@ describe('Agent daemon (e2e)', () => {
           recipeParams: {},
           summary:
             'e2e stub curation summary, two sentences satisfy minLength.',
+          verification: buildProducerVerification(claimedTask.task.inputCid),
         };
         const output = {
           taskId: claimedTask.task.id,
@@ -975,11 +991,31 @@ async function buildStubbedTaskOutput(
         totalTokens: 10,
         durationMs: 100,
         traceparent: '00-aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa-bbbbbbbbbbbbbbbb-01',
-        verification: {
-          inputCid: 'bafye2eeval',
-          results: [],
-          passed: true,
-        },
+        verification: buildProducerVerification(claimedTask.task.inputCid),
+      };
+    case 'fulfill_brief':
+      return {
+        branch: executionPlan.worktreeBranch ?? 'feat/daemon-e2e',
+        commits: [],
+        pullRequestUrl: null,
+        diaryEntryIds: [],
+        summary: `stubbed daemon slot e2e output for ${claimedTask.task.id}`,
+        verification: buildProducerVerification(claimedTask.task.inputCid),
+      };
+    case 'curate_pack':
+      return {
+        packId: '00000000-0000-4000-8000-000000000001',
+        packCid: 'bafyreidlnv7nu7y4kdxkxv5e2onbpoq5o3i6gw7r6xkk7d3w5b3xrylkqe',
+        entries: [
+          {
+            entryId: '00000000-0000-4000-8000-000000000002',
+            rank: 1,
+            rationale: 'e2e stub entry',
+          },
+        ],
+        recipeParams: {},
+        summary: `stubbed daemon slot e2e output for ${claimedTask.task.id}`,
+        verification: buildProducerVerification(claimedTask.task.inputCid),
       };
     case 'judge_eval_attempt':
       return {
diff --git a/apps/rest-api/e2e/tasks.e2e.test.ts b/apps/rest-api/e2e/tasks.e2e.test.ts
@@ -145,6 +145,21 @@ describe('Tasks API', () => {
     return { executorFingerprint, executorSignature };
   }
 
+  function buildProducerVerification(inputCid = 'bafy-e2e-input') {
+    return {
+      inputCid,
+      results: [
+        {
+          id: 'submit-output',
+          kind: 'gate' as const,
+          status: 'pass' as const,
+          detail: 'submit tool criterion satisfied in e2e fixture',
+        },
+      ],
+      passed: true,
+    };
+  }
+
   // ── Auth ─────────────────────────────────────────────────────────────────────
 
   describe('auth', () => {
@@ -531,6 +546,7 @@ describe('Tasks API', () => {
         ],
         recipeParams: { recipe: 'topic-focused-v1' },
         summary: 'Created a pack receipt for the curated diary entries.',
+        verification: buildProducerVerification(),
       };
       const outputCid = await computeJsonCid(output);
 
@@ -726,6 +742,7 @@ describe('Tasks API', () => {
         ],
         recipeParams: { recipe: 'topic-focused-v1' },
         summary: 'heartbeat-then-complete should succeed',
+        verification: buildProducerVerification(),
       };
       const outputCid = await computeJsonCid(output);
 
@@ -843,6 +860,7 @@ describe('Tasks API', () => {
         ],
         recipeParams: { recipe: 'executor-trust-v1' },
         summary: 'Completed with signed executor manifest.',
+        verification: buildProducerVerification(),
       };
       const outputCid = await computeJsonCid(output);
       const completeAttestation = await signedExecutorComplete(
diff --git a/docs/understand/agent-runtime.md b/docs/understand/agent-runtime.md
@@ -16,6 +16,15 @@ A task is a small JSON document in a diary-scoped queue that says "someone wants
 
 Every task lives inside a diary. Whoever can read the diary can see the task; whoever can write the diary can claim it. Pack-like artifacts (rendered packs, context packs) flow through the same queue as judgments and reviews — the type is how you tell them apart.
 
+For producer-style task types (`fulfill_brief`, `curate_pack`, `render_pack`,
+`run_eval`), the server normalizes `input` before storing it and computing the
+task's `inputCid`. If the caller did not provide `input.successCriteria`, the
+server creates it and injects a built-in `submit-output` gate. That gate says,
+in effect: "call `submit_<task_type>_output` exactly once with valid structured
+output." This matters because the submit-tool call is part of the promise body,
+not an executor-only implementation detail. The stored input, the prompt the
+claimant reads, and the later audit trail all describe the same contract.
+
 ### Imposer vs claimant boundary
 
 The runtime model depends on keeping the two roles cleanly separated.
@@ -195,6 +204,10 @@ The guarantees are worth naming, because they shape everything else:
 
 - **Claims are agent-initiated.** The queue never pushes. Agents that want work call `claim()`; agents that don't, don't. `task.claim` requires a Keto permit — capability without obligation.
 - **Promises are content-addressed.** The imposer's brief is pinned by an `input_cid`; the claimant's output is pinned by an `output_cid` and optionally signed. Both sides have cryptographic proof of what was promised and what was delivered.
+- **Basic completion gates live inside the promise.** For producer task types,
+  "did I submit the structured output?" is represented as a built-in
+  `successCriteria.gates[]` item, so the claimant self-assesses it like any
+  other criterion instead of the substrate pretending it can coerce the action.
 - **Abandonment is benign.** A crashed or timed-out claimant loses the lease; the task returns to the queue. Nothing is recorded as a failure on the agent's identity — the promise simply wasn't kept, and someone else can pick it up.
 - **Cancellation is asymmetric.** The claimant can walk away (withdraw consent to finish); a diary writer can also take the task back (withdraw the offer). Both are state transitions, not blame.
 - **The runtime has no retry logic.** Retries happen at the queue level, as fresh claims by whoever's next. There's no catching and re-dispatching inside the executor — one attempt, one outcome, the workflow decides what's next.
diff --git a/libs/agent-runtime/src/prompts/final-output.test.ts b/libs/agent-runtime/src/prompts/final-output.test.ts
@@ -167,6 +167,7 @@ describe('buildFinalOutputBlock', () => {
     expect(block).toMatch(/submit_fulfill_brief_output/);
     expect(block).toMatch(/FulfillBriefOutput/);
     expect(block).toMatch(/Do NOT emit the output as plain assistant text/);
+    expect(block).toMatch(/promised submit-output criterion/);
     expect(block).not.toMatch(/Fallback/);
     expect(block).not.toMatch(/single JSON object matching/);
   });
diff --git a/libs/agent-runtime/src/prompts/final-output.ts b/libs/agent-runtime/src/prompts/final-output.ts
@@ -48,7 +48,8 @@ export function buildFinalOutputBlock(opts: FinalOutputBlockOptions): string {
     `The runtime captures the validated arguments and ends the session.`,
     `Do NOT emit the output as plain assistant text. Do NOT rely on a`,
     `JSON-in-message fallback. If you do not call \`${submitTool}\`, the`,
-    `attempt fails even if the underlying work succeeded.`,
+    `attempt is recorded as failing the promised submit-output criterion`,
+    `even if the underlying work succeeded.`,
     '',
     `Your final assistant text before that tool call may explain your work,`,
     `but the submit-tool call itself must be your VERY LAST action.`,
diff --git a/libs/agent-runtime/src/prompts/run-eval.test.ts b/libs/agent-runtime/src/prompts/run-eval.test.ts
@@ -40,6 +40,7 @@ describe('buildRunEvalUserPrompt', () => {
       ctx,
     );
     expect(out).toContain('## Self-verification');
+    expect(out).toContain('part of the promise you made when you claimed');
     expect(out).toContain('`verification` MUST be a JSON object');
     expect(out).toContain('Minimal valid example:');
   });
diff --git a/libs/agent-runtime/src/prompts/self-verification.ts b/libs/agent-runtime/src/prompts/self-verification.ts
@@ -8,11 +8,11 @@ export function buildSelfVerificationBlock(
     '## Self-verification',
     '',
     `If \`input.${criteriaField}\` is set on this task, your final output MUST`,
-    'include a `verification` block. **The runtime/server rejects task',
-    `submission without \`verification\` when \`${criteriaField}\` is present**`,
-    '— the request fails validation and the attempt is discarded, even if the',
-    'underlying work succeeded. Do not call the submit tool until you have',
-    'computed the verification payload.',
+    'include a `verification` block. Treat every item in those criteria as',
+    'part of the promise you made when you claimed the task. That includes',
+    'the built-in submit-output gate when present. Do not call the submit',
+    'tool until you have computed the verification payload you can honestly',
+    'stand behind.',
     '',
     `Call \`moltnet_get_task\` with task id \`${taskId}\` and read \`input.${criteriaField}\`.`,
     '',
diff --git a/libs/pi-extension/src/runtime/execute-pi-task.ts b/libs/pi-extension/src/runtime/execute-pi-task.ts
@@ -946,10 +946,10 @@ export async function executePiTask(
         }
       } else if (submitToolHandle) {
         parseError = {
-          code: 'output_missing',
+          code: 'submit_output_missing',
           message:
-            'Agent did not submit output through the task submit tool. ' +
-            'A valid submit tool call is required to complete this task type.',
+            'Agent did not satisfy the promised submit-output criterion: ' +
+            'no valid task submit tool call was captured before the session ended.',
         };
         await emit('error', {
           message: parseError.message,
diff --git a/libs/task-service/src/task.service.test.ts b/libs/task-service/src/task.service.test.ts
@@ -1,3 +1,4 @@
+import { computeJsonCid } from '@moltnet/crypto-service';
 import type { Task as DbTask, TransactionRunner } from '@moltnet/database';
 import { initTaskTypeRegistry } from '@moltnet/tasks';
 import { FormatRegistry } from '@sinclair/typebox';
@@ -363,6 +364,21 @@ function judgeCreateInput() {
   };
 }
 
+function fulfillCreateInput() {
+  return {
+    taskType: 'fulfill_brief',
+    teamId: TEAM_ID,
+    diaryId: DIARY_ID,
+    inputPayload: {
+      brief: 'Implement the feature.',
+      title: 'Feature work',
+    },
+    callerId: AGENT_ID,
+    callerNs: 'agent' as const,
+    callerIsAgent: true,
+  };
+}
+
 beforeAll(async () => {
   FormatRegistry.Set('uuid', (v: string) =>
     /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i.test(v),
@@ -488,3 +504,44 @@ describe('createTaskService.create — judge_eval_attempt flow', () => {
     );
   });
 });
+
+describe('createTaskService.create — producer input normalization', () => {
+  let mocks: Mocks;
+  let service: ReturnType<typeof createTaskService>;
+
+  beforeEach(() => {
+    mocks = makeMocks();
+    service = createTaskService(
+      mocks as unknown as Parameters<typeof createTaskService>[0],
+    );
+  });
+
+  afterEach(() => {
+    vi.restoreAllMocks();
+  });
+
+  it('stores normalized producer input and hashes the normalized promise body', async () => {
+    await service.create(fulfillCreateInput() as never);
+
+    expect(mocks.taskRepository.create).toHaveBeenCalledOnce();
+    const newTask = mocks.taskRepository.create.mock.calls[0][0] as {
+      input: Record<string, unknown>;
+      inputCid: string;
+    };
+    expect(newTask.input).toMatchObject({
+      brief: 'Implement the feature.',
+      title: 'Feature work',
+      successCriteria: {
+        version: 1,
+        gates: [
+          expect.objectContaining({
+            id: 'submit-output',
+            kind: 'submit-tool-call',
+            required: true,
+          }),
+        ],
+      },
+    });
+    expect(newTask.inputCid).toBe(await computeJsonCid(newTask.input));
+  });
+});
diff --git a/libs/task-service/src/task.service.ts b/libs/task-service/src/task.service.ts
@@ -33,6 +33,7 @@ import {
   type ExecutorTrustLevel as WireExecutorTrustLevel,
   getTaskCreateSideEffects,
   getTaskTypeRegistry,
+  normalizeTaskInputForCreate,
   type OutputKind,
   type ResolvedContextPack,
   type ResolvedRenderedPack,
@@ -397,9 +398,13 @@ export function createTaskService(deps: TaskServiceDeps) {
 
   return {
     async create(input: CreateTaskInput): Promise<Task> {
+      const normalizedInput = normalizeTaskInputForCreate(
+        input.taskType,
+        input.inputPayload,
+      ) as Record<string, unknown>;
       const createErrors = validateTaskCreateRequest({
         taskType: input.taskType,
-        input: input.inputPayload,
+        input: normalizedInput,
         references: input.references as Task['references'] | undefined,
       });
       if (createErrors.length > 0) {
@@ -445,7 +450,7 @@ export function createTaskService(deps: TaskServiceDeps) {
       );
       const asyncErrors = await validateTaskInputAsync(
         input.taskType,
-        input.inputPayload,
+        normalizedInput,
         asyncCtx,
       );
       if (asyncErrors.length > 0) {
@@ -520,7 +525,7 @@ export function createTaskService(deps: TaskServiceDeps) {
           ],
         );
       }
-      const inputCid = await computeJsonCid(input.inputPayload);
+      const inputCid = await computeJsonCid(normalizedInput);
 
       const expiresAt = input.expiresInSec
         ? new Date(Date.now() + input.expiresInSec * 1000)
@@ -531,7 +536,7 @@ export function createTaskService(deps: TaskServiceDeps) {
         teamId: input.teamId,
         diaryId: input.diaryId,
         outputKind: taskTypeDef.outputKind,
-        input: input.inputPayload,
+        input: normalizedInput,
         inputSchemaCid,
         inputCid,
         taskRefs: (input.references ?? []) as NewTask['taskRefs'],
@@ -556,7 +561,7 @@ export function createTaskService(deps: TaskServiceDeps) {
       // effects are pure data; applying them is what the tx wraps.
       const sideEffects = await getTaskCreateSideEffects(
         input.taskType,
-        input.inputPayload,
+        normalizedInput,
         asyncCtx,
       );
 
diff --git a/libs/tasks/src/success-criteria.ts b/libs/tasks/src/success-criteria.ts
@@ -12,7 +12,7 @@
  * attaches to any task type. It has four orthogonal sections — pick
  * whichever apply per task type:
  *
- *   - `gates`        Deterministic structural checks (CID/schema match)
+ *   - `gates`        Promise-level structural/process checks
  *   - `assertions`   Declarative claims about output JSON
  *   - `rubric`       Weighted-criteria scoring instrument, reused
  *                    verbatim from `./rubric.ts`.
@@ -57,10 +57,11 @@ import { type Static, Type } from '@sinclair/typebox';
 import { Rubric } from './rubric.js';
 
 // ---------------------------------------------------------------------------
-// Gates — pure JSON evaluation, server-re-verifiable. v1 is intentionally
-// narrow: `schema-check` and `cid-equals` only. `http`/`shell` are
-// deferred (SSRF design and executor-sandbox capability declarations
-// needed first).
+// Gates — structural or process checks that belong to the task's promise
+// body. Some are server-re-verifiable (`schema-check`, `cid-equals`);
+// others are producer self-reported (`submit-tool-call`). Keep the shape
+// explicit so the same `successCriteria` document tells the agent what it
+// promised and later tells auditors what was checked.
 // ---------------------------------------------------------------------------
 
 const SchemaCheckSpec = Type.Object(
@@ -87,8 +88,25 @@ const CidEqualsSpec = Type.Object(
   { additionalProperties: false },
 );
 
+export const SubmitToolCallGate = Type.Object(
+  {
+    id: Type.String({ minLength: 1 }),
+    kind: Type.Literal('submit-tool-call'),
+    /**
+     * Human-readable contract text shown to the producer when it fetches
+     * `input.successCriteria`. This is a promise-level gate rather than a
+     * transport-level runtime hint.
+     */
+    description: Type.String({ minLength: 1 }),
+    required: Type.Boolean(),
+  },
+  { additionalProperties: false },
+);
+export type SubmitToolCallGate = Static<typeof SubmitToolCallGate>;
+
 export const Gate = Type.Union(
   [
+    SubmitToolCallGate,
     Type.Object(
       {
         id: Type.String({ minLength: 1 }),
diff --git a/libs/tasks/src/validation.test.ts b/libs/tasks/src/validation.test.ts
diff --git a/libs/tasks/src/validation.ts b/libs/tasks/src/validation.ts