@@ -103,7 +103,30 @@ const depthInstructions = {
103103 deep: 'Provide exhaustive assessment. Cover: execution quality, output completeness and correctness, artifact quality and structure, step-to-step handoff integrity, error handling, performance signals, architecture implications, edge cases.'
104104};
105105
106- const analysisPrompt = ` PURPOSE : Analyze the output of workflow step " ${step.name}" (step ${stepIdx + 1 }/ ${state .steps .length }) to assess quality, identify issues, and evaluate handoff readiness for the next step.
// Build the test-requirements section of the analysis prompt (the evaluation
// baseline). When the step carries `test_requirements`, its fields are listed
// together with a scoring rubric; otherwise a short note tells the analyzer to
// fall back to general quality signals.
const testReqs = step.test_requirements;

// Renders one requirement field for the prompt. Accepts an array or a single
// value; a missing or empty field renders as 'not specified' so the prompt
// never contains the literal text "undefined".
const formatRequirement = (value) => {
  const items = Array.isArray(value) ? value : value ? [value] : [];
  return items.join(', ') || 'not specified';
};

let testReqSection = '';
if (testReqs) {
  testReqSection = `
TEST REQUIREMENTS (Acceptance Criteria — use these as the PRIMARY evaluation baseline):
Pass Criteria: ${formatRequirement(testReqs.pass_criteria)}
Expected Outputs: ${formatRequirement(testReqs.expected_outputs)}
Content Signals (patterns that indicate success): ${formatRequirement(testReqs.content_signals)}
Quality Thresholds: ${formatRequirement(testReqs.quality_thresholds)}
Fail Signals (patterns that indicate failure): ${formatRequirement(testReqs.fail_signals)}
Handoff Contract (what next step needs): ${formatRequirement(testReqs.handoff_contract)}

IMPORTANT: Score quality_score based on how well the actual output matches these test requirements.
- 90-100: All pass_criteria met, all expected_outputs present, content_signals found, no fail_signals
- 70-89: Most criteria met, minor gaps
- 50-69: Partial match, significant gaps
- 0-49: Fail — fail_signals present or pass_criteria not met`;
} else {
  testReqSection = `
NOTE: No pre-generated test requirements for this step. Evaluate based on general quality signals and workflow context.`;
}

128+
129+ const analysisPrompt = ` PURPOSE : Evaluate workflow step " ${step.name}" (step ${stepIdx + 1 }/ ${state .steps .length }) against its acceptance criteria . Judge whether the command execution met the pre- defined test requirements.
107130
108131WORKFLOW CONTEXT :
109132Name: ${state .workflow_name }
@@ -116,6 +139,7 @@ Name: ${step.name}
116139Type: ${step .type }
117140Command: ${step .command }
118141${step .success_criteria ? ` Success Criteria: ${ step .success_criteria } ` : ' ' }
142+ ${testReqSection}
119143
120144EXECUTION RESULT :
121145${execSummary}
@@ -129,15 +153,25 @@ ANALYSIS DEPTH: ${state.analysis_depth}
129153${depthInstructions[state .analysis_depth ]}
130154
131155TASK :
132- 1. Assess step execution quality (did it succeed ? complete output ? )
133- 2. Evaluate artifact quality (content correctness, completeness, format)
134- 3. Check handoff readiness (can the next step consume this output ? )
135- 4. Identify issues, risks, or optimization opportunities
136- 5. Rate overall step quality 0 - 100
156+ 1. ** Requirement Matching ** : Compare actual output against test requirements (pass_criteria, expected_outputs, content_signals )
157+ 2. ** Fail Signal Detection ** : Check for any fail_signals in the output
158+ 3. ** Handoff Contract Verification ** : Does the output satisfy handoff_contract for the next step ?
159+ 4. ** Gap Analysis ** : What ' s missing between actual output and requirements?
160+ 5. **Quality Score**: Rate 0-100 based on requirement fulfillment (NOT general quality)
137161
138162EXPECTED OUTPUT (strict JSON, no markdown):
139163{
140164 "quality_score": <0-100>,
165+ "requirement_match": {
166+ "pass": <true|false>,
167+ "criteria_met": ["<which pass_criteria were satisfied>"],
168+ "criteria_missed": ["<which pass_criteria were NOT satisfied>"],
169+ "expected_outputs_found": ["<expected files that exist>"],
170+ "expected_outputs_missing": ["<expected files that are absent>"],
171+ "content_signals_found": ["<success patterns detected in output>"],
172+ "content_signals_missing": ["<success patterns NOT found>"],
173+ "fail_signals_detected": ["<failure patterns found, if any>"]
174+ },
141175 "execution_assessment": {
142176 "success": <true|false>,
143177 "completeness": "<complete|partial|failed>",
@@ -151,6 +185,7 @@ EXPECTED OUTPUT (strict JSON, no markdown):
151185 },
152186 "handoff_assessment": {
153187 "ready": <true|false>,
188+ "contract_satisfied": <true|false|null>,
154189 "next_step_compatible": <true|false|null>,
155190 "handoff_notes": "<what next step should know>"
156191 },
@@ -163,7 +198,7 @@ EXPECTED OUTPUT (strict JSON, no markdown):
163198 "step_summary": "<1-2 sentence summary for process log>"
164199}
165200
166- CONSTRAINTS : Be specific, reference artifact content where possible, output ONLY JSON ` ;
201+ CONSTRAINTS: Be specific, reference artifact content where possible, score against requirements not general quality, output ONLY JSON`;
167202```
168203
169204### Step 3.4: Execute via ccw cli Gemini with Resume
@@ -231,11 +266,36 @@ if (jsonMatch) {
231266}
232267
233268// Write step analysis file
// Render the "Requirement Match" section of the step analysis report from the
// analyzer's `requirement_match` object. The section is an empty string when
// the analysis carries no `requirement_match`.
const reqMatch = analysis.requirement_match;

// `analysis` is parsed from model output, so a list field may be missing or
// may not be an array. Falsy values become an empty list (same as the former
// `value || []`); a lone truthy value is wrapped instead of throwing.
const toList = (value) => (Array.isArray(value) ? value : value ? [value] : []);

// One Markdown bullet per item, or `emptyText` when there are no items.
const bulletList = (value, mark, emptyText) =>
  toList(value).map((item) => `- ${mark} ${item}`).join('\n') || emptyText;

// Comma-separated items on one line, or 'None' when there are no items.
const inlineList = (value) => toList(value).join(', ') || 'None';

const reqMatchSection = reqMatch ? `
## Requirement Match — ${reqMatch.pass ? 'PASS ✓' : 'FAIL ✗'}

### Criteria Met
${bulletList(reqMatch.criteria_met, '✓', '- None')}

### Criteria Missed
${bulletList(reqMatch.criteria_missed, '✗', '- None')}

### Expected Outputs
- Found: ${inlineList(reqMatch.expected_outputs_found)}
- Missing: ${inlineList(reqMatch.expected_outputs_missing)}

### Content Signals
- Detected: ${inlineList(reqMatch.content_signals_found)}
- Missing: ${inlineList(reqMatch.content_signals_missing)}

### Fail Signals
${bulletList(reqMatch.fail_signals_detected, '⚠', '- None detected')}
` : '';

292+
234293const stepAnalysisReport = `# Step ${stepIdx + 1} Analysis: ${step.name}
235294
236295**Quality Score**: ${analysis.quality_score}/100
296+ **Requirement Match**: ${reqMatch ? (reqMatch.pass ? 'PASS' : 'FAIL') : 'N/A (no test requirements)'}
237297**Date**: ${new Date().toISOString()}
238-
298+ ${reqMatchSection}
239299## Execution
240300- Success: ${analysis.execution_assessment?.success}
241301- Completeness: ${analysis.execution_assessment?.completeness}
@@ -249,6 +309,7 @@ const stepAnalysisReport = `# Step ${stepIdx + 1} Analysis: ${step.name}
249309
250310## Handoff Readiness
251311- Ready: ${analysis.handoff_assessment?.ready}
312+ - Contract Satisfied: ${analysis.handoff_assessment?.contract_satisfied}
252313- Next Step Compatible: ${analysis.handoff_assessment?.next_step_compatible}
253314- Notes: ${analysis.handoff_assessment?.handoff_notes}
254315
@@ -262,14 +323,16 @@ ${(analysis.optimization_opportunities || []).map(o => `- [${o.impact}] ${o.area
262323Write(`${stepDir}/step-${stepIdx + 1}-analysis.md`, stepAnalysisReport);
263324
264325// Append to process log
// Requirement verdict shown in the process log: PASS/FAIL when the analysis
// carries a requirement_match object, N/A otherwise.
const reqPassStr = reqMatch ? (reqMatch.pass ? 'PASS' : 'FAIL') : 'N/A';

// Handoff line for the process log. An explicitly failed contract (or a step
// flagged not ready) must never be logged as "Ready" just because the analyzer
// left handoff_notes empty.
const handoffNotes = analysis.handoff_assessment?.handoff_notes;
const contractSatisfied = analysis.handoff_assessment?.contract_satisfied;
let handoffSummary;
if (contractSatisfied) {
  handoffSummary = 'Contract satisfied';
} else if (contractSatisfied === false) {
  handoffSummary = handoffNotes
    ? `Contract NOT satisfied — ${handoffNotes}`
    : 'Contract NOT satisfied';
} else {
  // contract_satisfied is null/undefined: fall back to the notes, then to the
  // analyzer's own readiness flag.
  handoffSummary = handoffNotes
    || (analysis.handoff_assessment?.ready === false ? 'Not ready' : 'Ready');
}

// Markdown entry appended to the process log for this step.
const processLogEntry = `
## Step ${stepIdx + 1}: ${step.name} — Score: ${analysis.quality_score}/100 | Req: ${reqPassStr}

**Command**: \` ${step.command}\`
**Result**: ${analysis.execution_assessment?.completeness || 'unknown'} | ${analysis.artifact_assessment?.count || 0} artifacts
**Requirement Match**: ${reqPassStr}${reqMatch ? ` — Met: ${(reqMatch.criteria_met || []).length}, Missed: ${(reqMatch.criteria_missed || []).length}, Fail Signals: ${(reqMatch.fail_signals_detected || []).length}` : ''}
**Summary**: ${analysis.step_summary || 'No summary'}
**Issues**: ${(analysis.issues || []).filter(i => i.severity === 'high').map(i => i.description).join('; ') || 'None critical'}
**Handoff**: ${handoffSummary}

---
`;
@@ -281,8 +344,13 @@ Write(`${state.work_dir}/process-log.md`, currentLog + processLogEntry);
281344// Update state
282345state.steps[stepIdx].analysis = {
283346 quality_score: analysis.quality_score,
347+ requirement_pass: reqMatch?.pass ?? null,
348+ criteria_met_count: (reqMatch?.criteria_met || []).length,
349+ criteria_missed_count: (reqMatch?.criteria_missed || []).length,
350+ fail_signals_count: (reqMatch?.fail_signals_detected || []).length,
284351 key_outputs: analysis.artifact_assessment?.key_outputs || [],
285352 handoff_notes: analysis.handoff_assessment?.handoff_notes || '',
353+ contract_satisfied: analysis.handoff_assessment?.contract_satisfied ?? null,
286354 issue_count: (analysis.issues || []).length,
287355 high_issues: (analysis.issues || []).filter(i => i.severity === 'high').length,
288356 optimization_count: (analysis.optimization_opportunities || []).length,
0 commit comments