Skip to content

Commit 5f87d8b

Browse files
authored
Merge pull request #1221 from getlarge/issue-1175-run-eval-prompt-shape
refactor(agent-runtime): tighten run_eval producer prompt shape
2 parents: f0de7da + c597840 — commit 5f87d8b

3 files changed

Lines changed: 81 additions & 115 deletions

File tree

libs/agent-runtime/src/prompts/__snapshots__/snapshot.test.ts.snap

Lines changed: 15 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -1142,12 +1142,6 @@ exports[`prompt snapshots (assembler refactor pin) > run_eval — baseline (no c
11421142
You are running an evaluation scenario as variant \`baseline\`.
11431143
Task id: \`t-1\`
11441144
1145-
## Execution mode
1146-
1147-
Mode: \`vitro\`
1148-
Workspace: \`none\`
1149-
You are running in a scratch workspace with no repository checkout mounted. Do not assume git history or repo files are present unless the scenario provided them explicitly.
1150-
11511145
## Scenario
11521146
11531147
List the top 3 risks in this code.
@@ -1197,30 +1191,21 @@ exports[`prompt snapshots (assembler refactor pin) > run_eval — with-context +
11971191
You are running an evaluation scenario as variant \`with-pack\`.
11981192
Task id: \`t-1\`
11991193
1200-
## Correlation
1201-
1202-
This task carries correlationId \`corr-abc\`. It joins
1203-
this variant to its sibling \`run_eval\` tasks (other variants of the
1204-
same scenario and to any later \`judge_eval_attempt\` tasks created
1205-
against those variants. You do not need to act on it directly — it
1206-
is recorded for cross-variant aggregation at query time.
1207-
1208-
## Execution mode
1209-
1210-
Mode: \`vivo\`
1211-
Workspace: \`dedicated_worktree\`
1212-
You are running in a dedicated disposable git worktree isolated from the daemon shared checkout.
1213-
1214-
## Injected context discipline
1215-
1216-
This task includes extra injected context from the task creator.
1217-
You MUST inspect and use that context BEFORE you write solution
1218-
files or draft your final answer.
1219-
Do not solve first and only review the context afterward.
1220-
For \`context_inline\`, your FIRST content-inspection step should be a \`read\` of \`/workspace/context-pack.md\` before your first \`write\` call. The same content is also mirrored in \`/workspace/AGENTS.md\` and may be referenced from \`/workspace/.claude/CLAUDE.md\`.
1221-
If \`/workspace/context-pack.md\` exists and you skip reading it before writing solution files, you are not following the task instructions.
1222-
If the injected context contains repo- or workflow-specific rules,
1223-
those rules override your generic instincts.
1194+
## Injected Task Context
1195+
1196+
This task includes Injected Task Context supplied by the task
1197+
creator. You MUST inspect it BEFORE you write solution files or
1198+
draft your final answer — not after.
1199+
1200+
Reconcile every constraint from that context **into the code path
1201+
itself**: function bodies, control flow, transaction boundaries,
1202+
guard clauses. Quoting a constraint back in a comment, a
1203+
\`// note:\` line, the task summary, or the \`verification\` field is
1204+
NOT following the task. If the constraint affects behavior, it
1205+
must affect behavior.
1206+
For \`context_inline\`, your FIRST content-inspection step is a \`read\` of \`/workspace/context-pack.md\` before your first \`write\` call. The same content is also mirrored in \`/workspace/AGENTS.md\` and may be referenced from \`/workspace/.claude/CLAUDE.md\`.
1207+
If the Injected Task Context contains repo- or workflow-specific
1208+
rules, those rules override your generic instincts.
12241209
12251210
## Scenario
12261211

libs/agent-runtime/src/prompts/run-eval.test.ts

Lines changed: 33 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -68,19 +68,11 @@ describe('buildRunEvalUserPrompt', () => {
6868
expect(out).toContain('must be an object, never a string');
6969
});
7070

71-
it('describes the requested execution mode and workspace', () => {
72-
const out = render(baseInput);
73-
expect(out).toContain('## Execution mode');
74-
expect(out).toContain('Mode: `vitro`');
75-
expect(out).toContain('Workspace: `none`');
76-
expect(out).toContain('no repository checkout mounted');
71+
it('omits the discipline section when no task context exists', () => {
72+
expect(render(baseInput)).not.toContain('## Injected Task Context');
7773
});
7874

79-
it('omits injected-context discipline when no task context exists', () => {
80-
expect(render(baseInput)).not.toContain('## Injected context discipline');
81-
});
82-
83-
it('requires inspecting context before solving when task context exists', () => {
75+
it('requires reconciling injected context INTO the code (not into comments)', () => {
8476
const out = render({
8577
...baseInput,
8678
context: [
@@ -91,32 +83,37 @@ describe('buildRunEvalUserPrompt', () => {
9183
},
9284
],
9385
});
94-
expect(out).toContain('## Injected context discipline');
95-
expect(out).toContain(
96-
'MUST inspect and use that context BEFORE you write solution',
97-
);
98-
expect(out).toContain(
99-
'Do not solve first and only review the context afterward.',
100-
);
101-
expect(out).toContain(
102-
'your FIRST content-inspection step should be a `read` of `/workspace/context-pack.md` before your first `write` call',
103-
);
104-
expect(out).toContain('skip reading it before writing solution files');
86+
// Section uses the exact phrase "Injected Task Context" so weaker
87+
// models see one anchor repeated between this heading and the
88+
// materialized context block header.
89+
expect(out).toContain('## Injected Task Context');
90+
expect(out).toContain('MUST inspect it BEFORE you write solution files');
91+
// The reconciliation rule is explicit: code, not comments.
92+
expect(out).toContain('Reconcile every constraint from that context');
93+
expect(out).toContain('into the code path');
94+
expect(out).toContain('Quoting a constraint back in a comment');
95+
expect(out).toContain('NOT following the task');
96+
// Inline-context path still names the materialized files.
10597
expect(out).toContain('/workspace/context-pack.md');
10698
expect(out).toContain('/workspace/AGENTS.md');
10799
});
108100

109-
it('omits the correlation section when correlationId is null/absent', () => {
110-
expect(render(baseInput)).not.toContain('## Correlation');
111-
expect(render(baseInput, { ...ctx, correlationId: null })).not.toContain(
112-
'## Correlation',
101+
it('does NOT leak the judge rubric or judge-only sections', () => {
102+
// RunEvalSuccessCriteria intentionally excludes `rubric` so the
103+
// producer cannot see the judge's answer key. Assert the rendered
104+
// prompt and trace reflect that contract.
105+
const assembled = buildRunEvalUserPrompt(
106+
{ ...baseInput, successCriteria: { version: 1 as const } },
107+
ctx,
113108
);
114-
});
115-
116-
it('emits the correlation section when correlationId is set', () => {
117-
const out = render(baseInput, { ...ctx, correlationId: 'corr-abc' });
118-
expect(out).toContain('## Correlation');
119-
expect(out).toContain('corr-abc');
109+
const out = assembled.text;
110+
expect(out).not.toContain('## Rubric');
111+
expect(out).not.toContain('## Criteria');
112+
expect(out).not.toMatch(/\| Criterion \| Weight \|/);
113+
expect(out).not.toContain('Composite arithmetic');
114+
expect(
115+
assembled.trace.find((t) => t.source === 'rubric_judge'),
116+
).toBeUndefined();
120117
});
121118

122119
it('exposes a structured per-section trace alongside the text', () => {
@@ -126,10 +123,9 @@ describe('buildRunEvalUserPrompt', () => {
126123
expect(ids).toContain('run_eval.header');
127124
expect(ids).toContain('run_eval.scenario');
128125
expect(ids).toContain('run_eval.final_output');
129-
// Absent optional sections are kept in the trace with char_count 0.
130-
const correlation = assembled.trace.find(
131-
(t) => t.id === 'run_eval.correlation',
132-
);
133-
expect(correlation?.char_count).toBe(0);
126+
// Dropped sections must not appear in the trace either — replay
127+
// tooling treats "absent from trace" as "never rendered".
128+
expect(ids).not.toContain('run_eval.correlation');
129+
expect(ids).not.toContain('run_eval.execution_mode');
134130
});
135131
});

libs/agent-runtime/src/prompts/run-eval.ts

Lines changed: 33 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,9 @@ interface Ctx {
1313
/** Task id — the agent must report it in its final structured output. */
1414
taskId: string;
1515
/**
16-
* MoltNet correlationId. For eval scenarios this groups the N variant
17-
* `run_eval` tasks plus any eventual `judge_eval_attempt` tasks under a
18-
* single id. May be null for ad-hoc single-variant runs.
16+
* MoltNet correlationId. Recorded on the task object so attempt events
17+
* can group sibling variants of an eval scenario; intentionally NOT
18+
* surfaced in the user prompt — the producer never acts on it.
1919
*/
2020
correlationId?: string | null;
2121
}
@@ -33,12 +33,26 @@ interface Ctx {
3333
* the body, `skill` items are persisted at the runtime's skill path,
3434
* and `user_inline` items are appended to the first user message. This
3535
* builder does NOT inline `input.context[]` itself.
36+
*
37+
* Prompt-shape notes (issue #1175, area 1):
38+
* - No `Correlation` section: the agent never acts on it. The id is
39+
* still carried on attempt event metadata for cross-variant queries.
40+
* - No `Execution mode` section: the workspace already reflects the
41+
* chosen mode by its shape (scratch/shared mount/dedicated worktree).
42+
* Restating it as text adds noise without changing model behavior.
43+
* - The "Injected Task Context" phrase is used identically in this
44+
* prompt's discipline section and in the materialized context block
45+
* header (see context-bindings.ts) so weaker models see one repeated
46+
* anchor.
47+
* - The discipline copy demands the model encode injected constraints
48+
* into the code path itself, not into comments or the verification
49+
* field. Quoting the constraint back is not following the task.
3650
*/
3751
export function buildRunEvalUserPrompt(
3852
input: RunEvalInput,
3953
ctx: Ctx,
4054
): AssembledPrompt {
41-
const { scenario, variantLabel, execution, successCriteria } = input;
55+
const { scenario, variantLabel, successCriteria } = input;
4256
const hasContext = input.context.length > 0;
4357
const hasInlineContext = input.context.some(
4458
(entry) => entry.binding === 'context_inline',
@@ -49,40 +63,23 @@ export function buildRunEvalUserPrompt(
4963
`You are running an evaluation scenario as variant \`${variantLabel}\`.\n` +
5064
`Task id: \`${ctx.taskId}\``;
5165

52-
const correlation = ctx.correlationId
53-
? [
54-
`This task carries correlationId \`${ctx.correlationId}\`. It joins`,
55-
'this variant to its sibling `run_eval` tasks (other variants of the',
56-
'same scenario and to any later `judge_eval_attempt` tasks created',
57-
'against those variants. You do not need to act on it directly — it',
58-
'is recorded for cross-variant aggregation at query time.',
59-
].join('\n')
60-
: '';
61-
62-
const executionMode = [
63-
`Mode: \`${execution.mode}\``,
64-
`Workspace: \`${execution.workspace}\``,
65-
execution.workspace === 'none'
66-
? 'You are running in a scratch workspace with no repository checkout mounted. Do not assume git history or repo files are present unless the scenario provided them explicitly.'
67-
: execution.workspace === 'shared_mount'
68-
? 'You are running against the daemon shared mount. Treat any repository mutations as affecting the mounted checkout directly.'
69-
: 'You are running in a dedicated disposable git worktree isolated from the daemon shared checkout.',
70-
].join('\n');
71-
7266
const contextDiscipline = hasContext
7367
? [
74-
'This task includes extra injected context from the task creator.',
75-
'You MUST inspect and use that context BEFORE you write solution',
76-
'files or draft your final answer.',
77-
'Do not solve first and only review the context afterward.',
78-
hasInlineContext
79-
? 'For `context_inline`, your FIRST content-inspection step should be a `read` of `/workspace/context-pack.md` before your first `write` call. The same content is also mirrored in `/workspace/AGENTS.md` and may be referenced from `/workspace/.claude/CLAUDE.md`.'
80-
: 'If injected context was provided as a skill, inspect that task-injected context before solving.',
68+
'This task includes Injected Task Context supplied by the task',
69+
'creator. You MUST inspect it BEFORE you write solution files or',
70+
'draft your final answer — not after.',
71+
'',
72+
'Reconcile every constraint from that context **into the code path',
73+
'itself**: function bodies, control flow, transaction boundaries,',
74+
'guard clauses. Quoting a constraint back in a comment, a',
75+
'`// note:` line, the task summary, or the `verification` field is',
76+
'NOT following the task. If the constraint affects behavior, it',
77+
'must affect behavior.',
8178
hasInlineContext
82-
? 'If `/workspace/context-pack.md` exists and you skip reading it before writing solution files, you are not following the task instructions.'
83-
: 'Do not rely on memory alone when task-injected context is available; inspect it first.',
84-
'If the injected context contains repo- or workflow-specific rules,',
85-
'those rules override your generic instincts.',
79+
? 'For `context_inline`, your FIRST content-inspection step is a `read` of `/workspace/context-pack.md` before your first `write` call. The same content is also mirrored in `/workspace/AGENTS.md` and may be referenced from `/workspace/.claude/CLAUDE.md`.'
80+
: 'When the context is delivered as a skill, inspect it before solving.',
81+
'If the Injected Task Context contains repo- or workflow-specific',
82+
'rules, those rules override your generic instincts.',
8683
].join('\n')
8784
: '';
8885

@@ -117,22 +114,10 @@ export function buildRunEvalUserPrompt(
117114

118115
const sections: PromptSection[] = [
119116
{ id: 'run_eval.header', source: 'header', body: header },
120-
{
121-
id: 'run_eval.correlation',
122-
source: 'task_input',
123-
header: 'Correlation',
124-
body: correlation,
125-
},
126-
{
127-
id: 'run_eval.execution_mode',
128-
source: 'task_input',
129-
header: 'Execution mode',
130-
body: executionMode,
131-
},
132117
{
133118
id: 'run_eval.context_discipline',
134119
source: 'discipline',
135-
header: 'Injected context discipline',
120+
header: 'Injected Task Context',
136121
body: contextDiscipline,
137122
},
138123
{

0 commit comments

Comments
 (0)