Merge pull request #1221 from getlarge/issue-1175-run-eval-prompt-shape

getlarge · web-flow · commit 5f87d8bb94f5 · 2026-05-22T17:43:29.000+02:00
refactor(agent-runtime): tighten run_eval producer prompt shape
diff --git a/libs/agent-runtime/src/prompts/__snapshots__/snapshot.test.ts.snap b/libs/agent-runtime/src/prompts/__snapshots__/snapshot.test.ts.snap
@@ -1142,12 +1142,6 @@ exports[`prompt snapshots (assembler refactor pin) > run_eval — baseline (no c
 You are running an evaluation scenario as variant \`baseline\`.
 Task id: \`t-1\`
 
-## Execution mode
-
-Mode: \`vitro\`
-Workspace: \`none\`
-You are running in a scratch workspace with no repository checkout mounted. Do not assume git history or repo files are present unless the scenario provided them explicitly.
-
 ## Scenario
 
 List the top 3 risks in this code.
@@ -1197,30 +1191,21 @@ exports[`prompt snapshots (assembler refactor pin) > run_eval — with-context +
 You are running an evaluation scenario as variant \`with-pack\`.
 Task id: \`t-1\`
 
-## Correlation
-
-This task carries correlationId \`corr-abc\`. It joins
-this variant to its sibling \`run_eval\` tasks (other variants of the
-same scenario and to any later \`judge_eval_attempt\` tasks created
-against those variants. You do not need to act on it directly — it
-is recorded for cross-variant aggregation at query time.
-
-## Execution mode
-
-Mode: \`vivo\`
-Workspace: \`dedicated_worktree\`
-You are running in a dedicated disposable git worktree isolated from the daemon shared checkout.
-
-## Injected context discipline
-
-This task includes extra injected context from the task creator.
-You MUST inspect and use that context BEFORE you write solution
-files or draft your final answer.
-Do not solve first and only review the context afterward.
-For \`context_inline\`, your FIRST content-inspection step should be a \`read\` of \`/workspace/context-pack.md\` before your first \`write\` call. The same content is also mirrored in \`/workspace/AGENTS.md\` and may be referenced from \`/workspace/.claude/CLAUDE.md\`.
-If \`/workspace/context-pack.md\` exists and you skip reading it before writing solution files, you are not following the task instructions.
-If the injected context contains repo- or workflow-specific rules,
-those rules override your generic instincts.
+## Injected Task Context
+
+This task includes Injected Task Context supplied by the task
+creator. You MUST inspect it BEFORE you write solution files or
+draft your final answer — not after.
+
+Reconcile every constraint from that context **into the code path
+itself**: function bodies, control flow, transaction boundaries,
+guard clauses. Quoting a constraint back in a comment, a
+\`// note:\` line, the task summary, or the \`verification\` field is
+NOT following the task. If the constraint affects behavior, it
+must affect behavior.
+For \`context_inline\`, your FIRST content-inspection step is a \`read\` of \`/workspace/context-pack.md\` before your first \`write\` call. The same content is also mirrored in \`/workspace/AGENTS.md\` and may be referenced from \`/workspace/.claude/CLAUDE.md\`.
+If the Injected Task Context contains repo- or workflow-specific
+rules, those rules override your generic instincts.
 
 ## Scenario
 
diff --git a/libs/agent-runtime/src/prompts/run-eval.test.ts b/libs/agent-runtime/src/prompts/run-eval.test.ts
@@ -68,19 +68,11 @@ describe('buildRunEvalUserPrompt', () => {
     expect(out).toContain('must be an object, never a string');
   });
 
-  it('describes the requested execution mode and workspace', () => {
-    const out = render(baseInput);
-    expect(out).toContain('## Execution mode');
-    expect(out).toContain('Mode: `vitro`');
-    expect(out).toContain('Workspace: `none`');
-    expect(out).toContain('no repository checkout mounted');
+  it('omits the discipline section when no task context exists', () => {
+    expect(render(baseInput)).not.toContain('## Injected Task Context');
   });
 
-  it('omits injected-context discipline when no task context exists', () => {
-    expect(render(baseInput)).not.toContain('## Injected context discipline');
-  });
-
-  it('requires inspecting context before solving when task context exists', () => {
+  it('requires reconciling injected context INTO the code (not into comments)', () => {
     const out = render({
       ...baseInput,
       context: [
@@ -91,32 +83,37 @@ describe('buildRunEvalUserPrompt', () => {
         },
       ],
     });
-    expect(out).toContain('## Injected context discipline');
-    expect(out).toContain(
-      'MUST inspect and use that context BEFORE you write solution',
-    );
-    expect(out).toContain(
-      'Do not solve first and only review the context afterward.',
-    );
-    expect(out).toContain(
-      'your FIRST content-inspection step should be a `read` of `/workspace/context-pack.md` before your first `write` call',
-    );
-    expect(out).toContain('skip reading it before writing solution files');
+    // Section uses the exact phrase "Injected Task Context" so weaker
+    // models see one anchor repeated between this heading and the
+    // materialized context block header.
+    expect(out).toContain('## Injected Task Context');
+    expect(out).toContain('MUST inspect it BEFORE you write solution files');
+    // The reconciliation rule is explicit: code, not comments.
+    expect(out).toContain('Reconcile every constraint from that context');
+    expect(out).toContain('into the code path');
+    expect(out).toContain('Quoting a constraint back in a comment');
+    expect(out).toContain('NOT following the task');
+    // Inline-context path still names the materialized files.
     expect(out).toContain('/workspace/context-pack.md');
     expect(out).toContain('/workspace/AGENTS.md');
   });
 
-  it('omits the correlation section when correlationId is null/absent', () => {
-    expect(render(baseInput)).not.toContain('## Correlation');
-    expect(render(baseInput, { ...ctx, correlationId: null })).not.toContain(
-      '## Correlation',
+  it('does NOT leak the judge rubric or judge-only sections', () => {
+    // RunEvalSuccessCriteria intentionally excludes `rubric` so the
+    // producer cannot see the judge's answer key. Assert the rendered
+    // prompt and trace reflect that contract.
+    const assembled = buildRunEvalUserPrompt(
+      { ...baseInput, successCriteria: { version: 1 as const } },
+      ctx,
     );
-  });
-
-  it('emits the correlation section when correlationId is set', () => {
-    const out = render(baseInput, { ...ctx, correlationId: 'corr-abc' });
-    expect(out).toContain('## Correlation');
-    expect(out).toContain('corr-abc');
+    const out = assembled.text;
+    expect(out).not.toContain('## Rubric');
+    expect(out).not.toContain('## Criteria');
+    expect(out).not.toMatch(/\| Criterion \| Weight \|/);
+    expect(out).not.toContain('Composite arithmetic');
+    expect(
+      assembled.trace.find((t) => t.source === 'rubric_judge'),
+    ).toBeUndefined();
   });
 
   it('exposes a structured per-section trace alongside the text', () => {
@@ -126,10 +123,9 @@ describe('buildRunEvalUserPrompt', () => {
     expect(ids).toContain('run_eval.header');
     expect(ids).toContain('run_eval.scenario');
     expect(ids).toContain('run_eval.final_output');
-    // Absent optional sections are kept in the trace with char_count 0.
-    const correlation = assembled.trace.find(
-      (t) => t.id === 'run_eval.correlation',
-    );
-    expect(correlation?.char_count).toBe(0);
+    // Dropped sections must not appear in the trace either — replay
+    // tooling treats "absent from trace" as "never rendered".
+    expect(ids).not.toContain('run_eval.correlation');
+    expect(ids).not.toContain('run_eval.execution_mode');
   });
 });
diff --git a/libs/agent-runtime/src/prompts/run-eval.ts b/libs/agent-runtime/src/prompts/run-eval.ts
@@ -13,9 +13,9 @@ interface Ctx {
   /** Task id — the agent must report it in its final structured output. */
   taskId: string;
   /**
-   * MoltNet correlationId. For eval scenarios this groups the N variant
-   * `run_eval` tasks plus any eventual `judge_eval_attempt` tasks under a
-   * single id. May be null for ad-hoc single-variant runs.
+   * MoltNet correlationId. Recorded on the task object so attempt events
+   * can group sibling variants of an eval scenario; intentionally NOT
+   * surfaced in the user prompt — the producer never acts on it.
    */
   correlationId?: string | null;
 }
@@ -33,12 +33,26 @@ interface Ctx {
  * the body, `skill` items are persisted at the runtime's skill path,
  * and `user_inline` items are appended to the first user message. This
  * builder does NOT inline `input.context[]` itself.
+ *
+ * Prompt-shape notes (issue #1175, area 1):
+ * - No `Correlation` section: the agent never acts on it. The id is
+ *   still carried on attempt event metadata for cross-variant queries.
+ * - No `Execution mode` section: the workspace already reflects the
+ *   chosen mode by its shape (scratch/shared mount/dedicated worktree).
+ *   Restating it as text adds noise without changing model behavior.
+ * - The "Injected Task Context" phrase is used identically in this
+ *   prompt's discipline section and in the materialized context block
+ *   header (see context-bindings.ts) so weaker models see one repeated
+ *   anchor.
+ * - The discipline copy demands the model encode injected constraints
+ *   into the code path itself, not into comments or the verification
+ *   field. Quoting the constraint back is not following the task.
  */
 export function buildRunEvalUserPrompt(
   input: RunEvalInput,
   ctx: Ctx,
 ): AssembledPrompt {
-  const { scenario, variantLabel, execution, successCriteria } = input;
+  const { scenario, variantLabel, successCriteria } = input;
   const hasContext = input.context.length > 0;
   const hasInlineContext = input.context.some(
     (entry) => entry.binding === 'context_inline',
@@ -49,40 +63,23 @@ export function buildRunEvalUserPrompt(
     `You are running an evaluation scenario as variant \`${variantLabel}\`.\n` +
     `Task id: \`${ctx.taskId}\``;
 
-  const correlation = ctx.correlationId
-    ? [
-        `This task carries correlationId \`${ctx.correlationId}\`. It joins`,
-        'this variant to its sibling `run_eval` tasks (other variants of the',
-        'same scenario and to any later `judge_eval_attempt` tasks created',
-        'against those variants. You do not need to act on it directly — it',
-        'is recorded for cross-variant aggregation at query time.',
-      ].join('\n')
-    : '';
-
-  const executionMode = [
-    `Mode: \`${execution.mode}\``,
-    `Workspace: \`${execution.workspace}\``,
-    execution.workspace === 'none'
-      ? 'You are running in a scratch workspace with no repository checkout mounted. Do not assume git history or repo files are present unless the scenario provided them explicitly.'
-      : execution.workspace === 'shared_mount'
-        ? 'You are running against the daemon shared mount. Treat any repository mutations as affecting the mounted checkout directly.'
-        : 'You are running in a dedicated disposable git worktree isolated from the daemon shared checkout.',
-  ].join('\n');
-
   const contextDiscipline = hasContext
     ? [
-        'This task includes extra injected context from the task creator.',
-        'You MUST inspect and use that context BEFORE you write solution',
-        'files or draft your final answer.',
-        'Do not solve first and only review the context afterward.',
-        hasInlineContext
-          ? 'For `context_inline`, your FIRST content-inspection step should be a `read` of `/workspace/context-pack.md` before your first `write` call. The same content is also mirrored in `/workspace/AGENTS.md` and may be referenced from `/workspace/.claude/CLAUDE.md`.'
-          : 'If injected context was provided as a skill, inspect that task-injected context before solving.',
+        'This task includes Injected Task Context supplied by the task',
+        'creator. You MUST inspect it BEFORE you write solution files or',
+        'draft your final answer — not after.',
+        '',
+        'Reconcile every constraint from that context **into the code path',
+        'itself**: function bodies, control flow, transaction boundaries,',
+        'guard clauses. Quoting a constraint back in a comment, a',
+        '`// note:` line, the task summary, or the `verification` field is',
+        'NOT following the task. If the constraint affects behavior, it',
+        'must affect behavior.',
         hasInlineContext
-          ? 'If `/workspace/context-pack.md` exists and you skip reading it before writing solution files, you are not following the task instructions.'
-          : 'Do not rely on memory alone when task-injected context is available; inspect it first.',
-        'If the injected context contains repo- or workflow-specific rules,',
-        'those rules override your generic instincts.',
+          ? 'For `context_inline`, your FIRST content-inspection step is a `read` of `/workspace/context-pack.md` before your first `write` call. The same content is also mirrored in `/workspace/AGENTS.md` and may be referenced from `/workspace/.claude/CLAUDE.md`.'
+          : 'When the context is delivered as a skill, inspect it before solving.',
+        'If the Injected Task Context contains repo- or workflow-specific',
+        'rules, those rules override your generic instincts.',
       ].join('\n')
     : '';
 
@@ -117,22 +114,10 @@ export function buildRunEvalUserPrompt(
 
   const sections: PromptSection[] = [
     { id: 'run_eval.header', source: 'header', body: header },
-    {
-      id: 'run_eval.correlation',
-      source: 'task_input',
-      header: 'Correlation',
-      body: correlation,
-    },
-    {
-      id: 'run_eval.execution_mode',
-      source: 'task_input',
-      header: 'Execution mode',
-      body: executionMode,
-    },
     {
       id: 'run_eval.context_discipline',
       source: 'discipline',
-      header: 'Injected context discipline',
+      header: 'Injected Task Context',
       body: contextDiscipline,
     },
     {