Skip to content

Commit 051d208

Browse files
authored
feat(agent-docs-audit): policy, L1 scanner, L2+L3 SDK script, weekly workflow (#3296)
Three-layer audit modeled on risk-assess.mjs:

- L1 (.github/scripts/agent-docs-l1.mjs): deterministic scan. Walks agent-context docs, counts lines, classifies AGENTS/CLAUDE pairs, detects broken @imports, broken path refs (with context-aware resolution), and unresolved pnpm commands. No model calls.
- L2 (.github/scripts/agent-docs-audit.mjs, Haiku triage): given an L1-flagged doc + policy, decides via tool-use whether the doc needs deep review. Cheap (~$0.01).
- L3 (same file, Sonnet via claude-agent-sdk): reads the doc, uses Read/Glob/Grep/Bash to verify concrete claims (paths, identifiers, commands, architecture). Emits structured KEEP/TRIM/MOVE/UPDATE/INVESTIGATE findings. ~$0.20/doc.

Workflow (.github/workflows/agent-docs-audit.yml): triggers on PR doc-path changes, weekly Monday cron, and workflow_dispatch. Skips AI layers gracefully if ANTHROPIC_API_KEY is missing (fork PRs). Warning-only: uploads /tmp/agent-docs-audit.json and /tmp/agent-docs-audit-summary.md as artifacts plus a Step Summary; no PR comments and no failing CI yet.

Policy (agent-docs-policy.md, 91 lines): codifies size budgets, placement rules, write/do-not-write criteria, verifiable claims standard, and the five finding labels.

Manual prototype run against current main: 5 of 9 L1-flagged docs passed Haiku triage to Sonnet review, and 15 concrete findings were produced for $1.19 total. Notable find that the deterministic scanner alone cannot catch: the `blockIdToEntry` identifier in packages/layout-engine/AGENTS.md does not exist in renderer.ts (stale symbol).
1 parent 4b8f2fd commit 051d208

4 files changed

Lines changed: 883 additions & 0 deletions

File tree

Lines changed: 241 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,241 @@
1+
#!/usr/bin/env node
2+
/**
3+
* Agent-docs semantic audit.
4+
*
5+
* Three layers, modeled on risk-assess.mjs:
6+
* L1: deterministic scan (sizes, paths, symlinks, broken refs) - free
7+
* L2: Haiku triage per doc - needs review? - ~$0.01/doc
8+
* L3: Sonnet deep analysis on flagged docs - structured findings - ~$0.20/doc
9+
*
10+
* Usage:
11+
* node agent-docs-audit.mjs # audit all flagged docs (L1+L2+L3)
12+
* node agent-docs-audit.mjs --only <relpath> # audit one specific doc
13+
* node agent-docs-audit.mjs --skip-ai # L1 only, no API calls
14+
* node agent-docs-audit.mjs --dry-run # all layers stubbed
15+
*
16+
* Env:
17+
* ANTHROPIC_API_KEY required for L2/L3; if missing the script auto-falls back to L1-only
18+
* REPO_ROOT target repo path (default: cwd)
19+
* POLICY_FILE path to agent-docs-policy.md (default: <REPO_ROOT>/agent-docs-policy.md)
20+
*
21+
* Output: Markdown report to stdout, structured JSON to /tmp/agent-docs-audit.json,
22+
* L1 markdown to /tmp/agent-docs-audit-l1.md.
23+
*/
24+
25+
import { existsSync, readFileSync, writeFileSync } from 'node:fs';
26+
import { join, resolve } from 'node:path';
27+
import { runL1Scan, flaggedForReview, renderL1Markdown } from './agent-docs-l1.mjs';
28+
29+
// Remove the CLAUDECODE marker from this process's environment before any
// model tooling is loaded.
// NOTE(review): presumably this keeps the agent SDK used for L3 from treating
// the run as nested inside a Claude Code session - confirm against the SDK docs.
delete process.env.CLAUDECODE;

// Repository under audit; REPO_ROOT env var wins, otherwise the current directory.
const REPO_ROOT = resolve(process.env.REPO_ROOT ?? process.cwd());
const POLICY_FILE_DEFAULT = join(REPO_ROOT, 'agent-docs-policy.md');
const POLICY_FILE = process.env.POLICY_FILE ?? POLICY_FILE_DEFAULT;

const args = process.argv.slice(2);
const DRY_RUN = args.includes('--dry-run');
// AI layers are skipped on request, or automatically when no API key is
// available - unless this is a dry run, which never calls the API anyway.
const SKIP_AI = args.includes('--skip-ai') || (!DRY_RUN && !process.env.ANTHROPIC_API_KEY);

/**
 * Extract the value of `--only <relpath>` from the CLI arguments.
 *
 * @param {string[]} argv - CLI arguments (without the node/script prefix).
 * @returns {string|null} The requested doc path, or null when `--only` is absent.
 * @throws {Error} When `--only` is present but not followed by a path. Without
 *   this check a bare `--only` silently audited every flagged doc, and
 *   `--only --skip-ai` used the next flag as the path and matched nothing.
 */
function parseOnlyFlag(argv) {
  const flagIndex = argv.indexOf('--only');
  if (flagIndex === -1) return null;
  const value = argv[flagIndex + 1];
  if (!value || value.startsWith('--')) {
    throw new Error('--only requires a <relpath> argument');
  }
  return value;
}

const ONLY = parseOnlyFlag(args);
39+
40+
// ── Layer 2: Haiku triage ──
41+
42+
// Per-token list prices (USD) for the Haiku model used below:
// $1 per million input tokens, $5 per million output tokens.
const HAIKU_INPUT_USD_PER_TOKEN = 0.000001;
const HAIKU_OUTPUT_USD_PER_TOKEN = 0.000005;

/**
 * Layer 2: ask Haiku whether an L1-flagged doc deserves a deep (L3) review.
 *
 * The model is forced to answer through the `triage` tool so the reply is
 * structured. Only the first 2000 characters of the policy are sent.
 *
 * @param {{path: string, lines: number, reasons: string[]}} doc - Flagged doc;
 *   `path` is resolved against REPO_ROOT and read from disk.
 * @param {string} policyText - Full policy text.
 * @returns {Promise<{decision: string, reason: string, cost: number}>} The
 *   tool input returned by the model plus the estimated USD cost of the call.
 *   In dry-run mode a fixed "review" decision is returned with zero cost.
 * @throws {Error} When the response contains no `tool_use` block.
 */
async function haikuTriage(doc, policyText) {
  if (DRY_RUN) return { decision: 'review', reason: 'dry-run forces review', cost: 0 };
  const Anthropic = (await import('@anthropic-ai/sdk')).default;
  const client = new Anthropic({ apiKey: process.env.ANTHROPIC_API_KEY });
  const docContent = readFileSync(join(REPO_ROOT, doc.path), 'utf-8');
  const prompt = `You are triaging an agent-context doc for review.

Doc path: ${doc.path}
Doc size: ${doc.lines} lines
Deterministic flags: ${doc.reasons.join('; ')}

Doc content:
\`\`\`markdown
${docContent}
\`\`\`

Policy excerpt:
${policyText.slice(0, 2000)}

Decide whether this doc needs deep review against the policy. Respond using the triage tool. Bias toward "review" only when the deterministic flags suggest real issues (broken refs likely stale; large size likely contains content that could be trimmed/moved).`;

  const result = await client.messages.create({
    model: 'claude-haiku-4-5-20251001',
    max_tokens: 256,
    tools: [{
      name: 'triage',
      description: 'Decide whether the doc needs deep review.',
      input_schema: {
        type: 'object',
        properties: {
          decision: { type: 'string', enum: ['review', 'skip'] },
          reason: { type: 'string', description: 'One sentence' },
        },
        required: ['decision', 'reason'],
      },
    }],
    tool_choice: { type: 'tool', name: 'triage' },
    messages: [{ role: 'user', content: prompt }],
  });

  const toolUse = result.content.find((b) => b.type === 'tool_use');
  if (!toolUse) {
    // Fail with a message that names the doc instead of a bare TypeError.
    throw new Error(`Haiku triage returned no tool_use block for ${doc.path} (stop_reason: ${result.stop_reason})`);
  }
  const cost = (result.usage.input_tokens * HAIKU_INPUT_USD_PER_TOKEN)
    + (result.usage.output_tokens * HAIKU_OUTPUT_USD_PER_TOKEN);
  return { ...toolUse.input, cost };
}
85+
86+
// ── Layer 3: Sonnet deep analysis ──
87+
88+
/**
 * Find the `}` that closes the JSON object opening at `start`.
 * Braces inside JSON string literals are ignored.
 *
 * @param {string} text - Text containing the object.
 * @param {number} start - Index of the opening `{`.
 * @returns {number} Index of the matching `}`, or -1 if the object never closes.
 */
function findJsonObjectEnd(text, start) {
  let depth = 0;
  let inString = false;
  let escaped = false;
  for (let i = start; i < text.length; i += 1) {
    const ch = text[i];
    if (inString) {
      if (escaped) escaped = false;
      else if (ch === '\\') escaped = true;
      else if (ch === '"') inString = false;
      continue;
    }
    if (ch === '"') inString = true;
    else if (ch === '{') depth += 1;
    else if (ch === '}') {
      depth -= 1;
      if (depth === 0) return i;
    }
  }
  return -1;
}

/**
 * Extract the `{"findings": [...]}` object from the model's final text.
 *
 * Candidates starting with `{"findings":` are tried from last to first, each
 * cut at its own matching brace, so braces in the surrounding prose or a
 * trailing code fence cannot corrupt the parse. As a fallback the widest
 * `{ ... "findings" ... }` span is tried, which also accepts objects where
 * `findings` is not the first key.
 *
 * @param {string} text - Final assistant text.
 * @returns {object|null} Parsed object with an array `findings`, or null.
 */
function extractFindings(text) {
  const starts = [...text.matchAll(/\{\s*"findings"\s*:/g)].map((m) => m.index);
  for (const start of starts.reverse()) {
    const end = findJsonObjectEnd(text, start);
    if (end === -1) continue;
    try {
      const parsed = JSON.parse(text.slice(start, end + 1));
      if (Array.isArray(parsed?.findings)) return parsed;
    } catch {
      // Not valid JSON from this offset; deliberately try the next candidate.
    }
  }
  const widest = text.match(/\{[\s\S]*"findings"[\s\S]*\}/);
  if (widest) {
    try {
      const parsed = JSON.parse(widest[0]);
      if (Array.isArray(parsed?.findings)) return parsed;
    } catch {
      // Fall through: the caller reports the failure with the text tail.
    }
  }
  return null;
}

/**
 * Layer 3: run a tool-using Sonnet session (claude-agent-sdk `query`) that
 * audits one doc against the policy and returns structured findings.
 *
 * @param {{path: string, lines: number, reasons: string[], brokenRefs: string[]}} doc -
 *   Flagged doc; `path` is resolved against REPO_ROOT and read from disk.
 * @param {string} policyText - Full policy text, embedded verbatim in the prompt.
 * @returns {Promise<object>} The parsed findings object spread together with
 *   `cost` (USD, from the SDK result message), `durationMs`, and `toolCalls`
 *   (one truncated "name: input" string per tool invocation). In dry-run mode
 *   a single placeholder INVESTIGATE finding is returned with zero cost.
 * @throws {Error} When the final message contains no parseable findings JSON.
 */
async function sonnetDeep(doc, policyText) {
  if (DRY_RUN) {
    return { findings: [{ label: 'INVESTIGATE', section: '(dry-run)', claim: '', reason: 'no API call', evidence: '', suggested_action: 'rerun without --dry-run' }], cost: 0, durationMs: 0, toolCalls: [] };
  }
  const { query } = await import('@anthropic-ai/claude-agent-sdk');
  const docContent = readFileSync(join(REPO_ROOT, doc.path), 'utf-8');

  const prompt = `You are auditing an agent-context doc against a policy. Identify sections that are stale, incorrect, redundant, or misplaced.

Doc path: ${doc.path}
Doc size: ${doc.lines} lines
Deterministic flags: ${doc.reasons.join('; ')}
Deterministic broken refs (if any): ${doc.brokenRefs.join(', ') || 'none'}

Doc content:
\`\`\`markdown
${docContent}
\`\`\`

Policy:
${policyText}

Scope your investigation tightly:
1. Verify each deterministic broken-ref flag (real miss or just shorthand?). Emit UPDATE or INVESTIGATE.
2. Look at the 2 largest H2 sections. Decide if either is a MOVE candidate per the policy (duplicates content elsewhere, package-specific, etc.). Cite the destination doc.
3. Sample 2-3 specific concrete claims (a command, path, function name). Verify them.

Do not pad the report. Most sections will produce no finding - that is correct. Prefer "drop the hardcoded value" over "update to the current value" when the value is likely to drift.

Limit yourself to 8 tool calls total. Use Grep for identifiers, Read for short files, Glob for existence, Bash sparingly (git log, complex rg).

End your response with this JSON. No markdown fences:

{"findings":[{"label":"KEEP|TRIM|MOVE|UPDATE|INVESTIGATE","section":"H2 or H3 header text","claim":"verbatim excerpt or paraphrase","reason":"why this label","evidence":"what you checked and what you found","suggested_action":"concrete next step"}]}

Only emit KEEP when explaining why a flagged section should remain. The default for verified content is silence.`;

  let resultText = '';
  let cost = 0;
  let durationMs = 0;
  const toolCalls = [];

  for await (const msg of query({
    prompt,
    options: {
      // allowedTools is not a strict allowlist under bypassPermissions; Bash
      // gets through. Listing what we expect to use; relying on the prompt
      // budget ("8 tool calls total") and maxTurns to constrain cost.
      allowedTools: ['Read', 'Glob', 'Grep', 'Bash'],
      disallowedTools: ['Edit', 'Write', 'Task', 'WebFetch', 'WebSearch', 'mcp__*'],
      permissionMode: 'bypassPermissions',
      maxTurns: 15,
      cwd: REPO_ROOT,
    },
  })) {
    if (msg.type === 'assistant' && msg.message?.content) {
      for (const block of msg.message.content) {
        // Keep only the most recent text block: the findings JSON is expected
        // at the end of the final assistant message.
        if (block.type === 'text') resultText = block.text;
        if (block.type === 'tool_use') {
          // JSON.stringify returns undefined for an undefined input.
          toolCalls.push(`${block.name}: ${(JSON.stringify(block.input) ?? '').slice(0, 100)}`);
        }
      }
    }
    if (msg.type === 'result') {
      cost = msg.total_cost_usd ?? 0;
      durationMs = msg.duration_api_ms ?? msg.duration_ms ?? 0;
    }
  }

  const parsed = extractFindings(resultText);
  if (!parsed) throw new Error(`Sonnet did not produce findings JSON. Tail: ${resultText.slice(-300)}`);
  return { ...parsed, cost, durationMs, toolCalls };
}
162+
163+
// ── Output ──
164+
165+
/**
 * Make a value safe to place inside a single Markdown table cell.
 * Pipes are escaped and line breaks are collapsed to spaces, because either
 * one would otherwise split the row. Null/undefined become an empty cell.
 *
 * @param {unknown} value - Raw cell content (typically model-produced text).
 * @returns {string} Single-line text with `|` escaped.
 */
function escapeTableCell(value) {
  return String(value ?? '')
    .replace(/\r?\n/g, ' ')
    .replace(/\|/g, '\\|')
    .trim();
}

/**
 * Render the full audit report (L1 flags, L2 triage, L3 findings) as Markdown.
 *
 * @param {{totalCost: number, docs: Array<object>}} report - Each doc carries
 *   `path`, `lines`, `reasons`, an optional `triage` ({decision, reason}) and
 *   an optional `deep` ({findings}).
 * @returns {string} Markdown document: a header, then one section per doc
 *   with a findings table when the doc was deep-reviewed.
 */
function renderMarkdown(report) {
  const lines = [`# Agent docs audit\n`, `Target: \`${REPO_ROOT}\`\n`];
  if (DRY_RUN) lines.push('**DRY RUN** - no API calls were made.\n');
  lines.push(`Total cost: $${report.totalCost.toFixed(4)} (${report.docs.length} docs reviewed)\n`);

  for (const d of report.docs) {
    lines.push(`## \`${d.path}\` (${d.lines} lines)\n`);
    lines.push(`Reasons flagged: ${d.reasons.join('; ')}`);
    lines.push(`Triage: ${d.triage?.decision ?? 'n/a'} - ${d.triage?.reason ?? ''}`);

    // No deep review ran for this doc (triage said skip, or L1-only entry).
    if (!d.deep) {
      lines.push('');
      continue;
    }

    // Treat a malformed findings payload as "no findings" instead of crashing.
    const findings = Array.isArray(d.deep.findings) ? d.deep.findings : [];
    if (findings.length === 0) {
      lines.push('No findings.\n');
      continue;
    }

    lines.push('| Label | Section | Reason | Suggested action |');
    lines.push('|---|---|---|---|');
    for (const f of findings) {
      const cells = [f.label, f.section, f.reason, f.suggested_action].map(escapeTableCell);
      lines.push(`| ${cells.join(' | ')} |`);
    }
    lines.push('');
  }
  return lines.join('\n');
}
187+
188+
// ── Main ──
189+
190+
/**
 * Run the audit end to end.
 *
 * Always performs the L1 scan and writes its Markdown to
 * /tmp/agent-docs-audit-l1.md. When SKIP_AI is set (explicit --skip-ai, or no
 * ANTHROPIC_API_KEY outside a dry run) it writes an L1-only JSON report and
 * prints the L1 Markdown. Otherwise every flagged doc is triaged (L2) and,
 * when triage says "review", deep-analysed (L3); the combined report goes to
 * /tmp/agent-docs-audit.json and its Markdown rendering to stdout.
 * Progress messages are written to stderr.
 *
 * @returns {Promise<void>}
 */
async function main() {
  // Fall back to a built-in policy summary when no policy file is present.
  let policyText;
  if (existsSync(POLICY_FILE)) {
    policyText = readFileSync(POLICY_FILE, 'utf-8');
  } else {
    policyText = '# Default policy\n\n(No policy file found; using built-in defaults: root <= 120 lines, nested <= 200, label findings KEEP/TRIM/MOVE/UPDATE/INVESTIGATE.)';
  }

  console.error('[L1] running deterministic scan...');
  const scan = runL1Scan(REPO_ROOT);
  writeFileSync('/tmp/agent-docs-audit-l1.md', renderL1Markdown(scan));

  // Reshape scanner records into the doc shape used by the L2/L3 layers.
  const toDoc = ({ relPath, lineCount, isSymlink, brokenPathRefs, reasons }) => ({
    path: relPath,
    lines: lineCount,
    kind: isSymlink ? 'symlink' : 'file',
    brokenRefs: brokenPathRefs,
    reasons,
  });
  const candidates = flaggedForReview(scan).map((f) => toDoc(f));
  const flagged = ONLY ? candidates.filter((d) => d.path === ONLY) : candidates;
  const onlyNote = ONLY ? ` (filtered to --only ${ONLY})` : '';
  console.error(`[L1] ${scan.files.length} doc(s) inventoried, ${flagged.length} flagged for review${onlyNote}`);

  if (SKIP_AI) {
    console.error('[L2/L3] skipped (no ANTHROPIC_API_KEY or --skip-ai). Writing L1 report only.');
    const l1Report = {
      docs: flagged.map((d) => ({ ...d, triage: null, deep: null })),
      totalCost: 0,
      l1Only: true,
    };
    writeFileSync('/tmp/agent-docs-audit.json', JSON.stringify(l1Report, null, 2));
    console.log(renderL1Markdown(scan));
    return;
  }

  const report = { docs: [], totalCost: 0 };
  for (const doc of flagged) {
    console.error(`[L2] triage: ${doc.path}`);
    const triage = await haikuTriage(doc, policyText);
    report.totalCost += triage.cost;

    // Deep analysis only runs for docs that triage marked for review.
    let deep = null;
    if (triage.decision === 'review') {
      console.error(`[L3] deep analysis: ${doc.path}`);
      deep = await sonnetDeep(doc, policyText);
      report.totalCost += deep.cost;
    }
    report.docs.push({ ...doc, triage, deep });
  }

  writeFileSync('/tmp/agent-docs-audit.json', JSON.stringify(report, null, 2));
  console.log(renderMarkdown(report));
}
237+
238+
// Script entry: run the audit and turn any failure into a non-zero exit code.
main().catch((err) => {
  // A rejection value is not guaranteed to be an Error (or even non-null), so
  // fall back to its string form rather than printing "undefined" or throwing
  // inside the handler.
  const detail = err?.stack || err?.message || String(err);
  console.error(`audit failed: ${detail}`);
  process.exit(1);
});

0 commit comments

Comments
 (0)