-
Notifications
You must be signed in to change notification settings - Fork 144
Expand file tree
/
Copy pathagent-docs-audit.mjs
More file actions
executable file
· 241 lines (205 loc) · 9.72 KB
/
agent-docs-audit.mjs
File metadata and controls
executable file
· 241 lines (205 loc) · 9.72 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
#!/usr/bin/env node
/**
* Agent-docs semantic audit.
*
* Three layers, modeled on risk-assess.mjs:
* L1: deterministic scan (sizes, paths, symlinks, broken refs) - free
* L2: Haiku triage per doc - needs review? - ~$0.01/doc
* L3: Sonnet deep analysis on flagged docs - structured findings - ~$0.10/doc
*
* Usage:
* node agent-docs-audit.mjs # audit all flagged docs (L1+L2+L3)
* node agent-docs-audit.mjs --only <relpath> # audit one specific doc
* node agent-docs-audit.mjs --skip-ai # L1 only, no API calls
* node agent-docs-audit.mjs --dry-run # all layers stubbed
*
* Env:
* ANTHROPIC_API_KEY required for L2/L3; if missing the script auto-falls back to L1-only
* REPO_ROOT target repo path (default: cwd)
* POLICY_FILE path to agent-docs-policy.md (default: <REPO_ROOT>/agent-docs-policy.md)
*
* Output: Markdown report to stdout, structured JSON to /tmp/agent-docs-audit.json,
* L1 markdown to /tmp/agent-docs-audit-l1.md.
*/
import { existsSync, readFileSync, writeFileSync } from 'node:fs';
import { join, resolve } from 'node:path';
import { runL1Scan, flaggedForReview, renderL1Markdown } from './agent-docs-l1.mjs';
delete process.env.CLAUDECODE;
const REPO_ROOT = resolve(process.env.REPO_ROOT ?? process.cwd());
const POLICY_FILE_DEFAULT = join(REPO_ROOT, 'agent-docs-policy.md');
const POLICY_FILE = process.env.POLICY_FILE ?? POLICY_FILE_DEFAULT;
const args = process.argv.slice(2);
const DRY_RUN = args.includes('--dry-run');
const SKIP_AI = args.includes('--skip-ai') || (!DRY_RUN && !process.env.ANTHROPIC_API_KEY);
const ONLY = args.includes('--only') ? args[args.indexOf('--only') + 1] : null;
// ── Layer 2: Haiku triage ──
async function haikuTriage(doc, policyText) {
if (DRY_RUN) return { decision: 'review', reason: 'dry-run forces review', cost: 0 };
const Anthropic = (await import('@anthropic-ai/sdk')).default;
const client = new Anthropic({ apiKey: process.env.ANTHROPIC_API_KEY });
const docContent = readFileSync(join(REPO_ROOT, doc.path), 'utf-8');
const prompt = `You are triaging an agent-context doc for review.
Doc path: ${doc.path}
Doc size: ${doc.lines} lines
Deterministic flags: ${doc.reasons.join('; ')}
Doc content:
\`\`\`markdown
${docContent}
\`\`\`
Policy excerpt:
${policyText.slice(0, 2000)}
Decide whether this doc needs deep review against the policy. Respond using the triage tool. Bias toward "review" only when the deterministic flags suggest real issues (broken refs likely stale; large size likely contains content that could be trimmed/moved).`;
const result = await client.messages.create({
model: 'claude-haiku-4-5-20251001',
max_tokens: 256,
tools: [{
name: 'triage',
description: 'Decide whether the doc needs deep review.',
input_schema: {
type: 'object',
properties: {
decision: { type: 'string', enum: ['review', 'skip'] },
reason: { type: 'string', description: 'One sentence' },
},
required: ['decision', 'reason'],
},
}],
tool_choice: { type: 'tool', name: 'triage' },
messages: [{ role: 'user', content: prompt }],
});
const toolUse = result.content.find((b) => b.type === 'tool_use');
const cost = (result.usage.input_tokens * 0.0000008) + (result.usage.output_tokens * 0.000004);
return { ...toolUse.input, cost };
}
// ── Layer 3: Sonnet deep analysis ──
/**
 * Layer 3: deep analysis of one flagged doc via the Claude Agent SDK, with
 * read-only repo tool access. Streams the agent run and extracts a structured
 * findings JSON from the final assistant text.
 *
 * @param {{path: string, lines: number, reasons: string[], brokenRefs: string[]}} doc
 *   Flagged entry from the L1 scan; `path` is relative to REPO_ROOT.
 * @param {string} policyText - Full policy text, embedded verbatim in the prompt.
 * @returns {Promise<{findings: object[], cost: number, durationMs: number, toolCalls: string[]}>}
 * @throws {Error} If the agent's final text contains no findings JSON.
 */
async function sonnetDeep(doc, policyText) {
  if (DRY_RUN) {
    // Stub finding so dry runs still exercise the report-rendering path.
    return { findings: [{ label: 'INVESTIGATE', section: '(dry-run)', claim: '', reason: 'no API call', evidence: '', suggested_action: 'rerun without --dry-run' }], cost: 0, durationMs: 0, toolCalls: [] };
  }
  // Lazy import so --skip-ai / dry-run paths never need the SDK installed.
  const { query } = await import('@anthropic-ai/claude-agent-sdk');
  const docContent = readFileSync(join(REPO_ROOT, doc.path), 'utf-8');
  const prompt = `You are auditing an agent-context doc against a policy. Identify sections that are stale, incorrect, redundant, or misplaced.
Doc path: ${doc.path}
Doc size: ${doc.lines} lines
Deterministic flags: ${doc.reasons.join('; ')}
Deterministic broken refs (if any): ${doc.brokenRefs.join(', ') || 'none'}
Doc content:
\`\`\`markdown
${docContent}
\`\`\`
Policy:
${policyText}
Scope your investigation tightly:
1. Verify each deterministic broken-ref flag (real miss or just shorthand?). Emit UPDATE or INVESTIGATE.
2. Look at the 2 largest H2 sections. Decide if either is a MOVE candidate per the policy (duplicates content elsewhere, package-specific, etc.). Cite the destination doc.
3. Sample 2-3 specific concrete claims (a command, path, function name). Verify them.
Do not pad the report. Most sections will produce no finding - that is correct. Prefer "drop the hardcoded value" over "update to the current value" when the value is likely to drift.
Limit yourself to 8 tool calls total. Use Grep for identifiers, Read for short files, Glob for existence, Bash sparingly (git log, complex rg).
End your response with this JSON. No markdown fences:
{"findings":[{"label":"KEEP|TRIM|MOVE|UPDATE|INVESTIGATE","section":"H2 or H3 header text","claim":"verbatim excerpt or paraphrase","reason":"why this label","evidence":"what you checked and what you found","suggested_action":"concrete next step"}]}
Only emit KEEP when explaining why a flagged section should remain. The default for verified content is silence.`;
  let resultText = '';
  let cost = 0;
  let durationMs = 0;
  const toolCalls = [];
  for await (const msg of query({
    prompt,
    options: {
      // allowedTools is not a strict allowlist under bypassPermissions; Bash
      // gets through. Listing what we expect to use; relying on the prompt
      // budget ("6-8 tool calls") and maxTurns to constrain cost.
      allowedTools: ['Read', 'Glob', 'Grep', 'Bash'],
      disallowedTools: ['Edit', 'Write', 'Task', 'WebFetch', 'WebSearch', 'mcp__*'],
      permissionMode: 'bypassPermissions',
      maxTurns: 15,
      cwd: REPO_ROOT,
    },
  })) {
    if (msg.type === 'assistant' && msg.message?.content) {
      for (const block of msg.message.content) {
        // Overwrite (not append): only the LAST text block is kept, because
        // the prompt instructs the model to END with the findings JSON.
        if (block.type === 'text') resultText = block.text;
        if (block.type === 'tool_use') {
          // Truncated tool-call trace, surfaced in the JSON report.
          toolCalls.push(`${block.name}: ${JSON.stringify(block.input).slice(0, 100)}`);
        }
      }
    }
    if (msg.type === 'result') {
      // Final result message carries the run's cost/latency telemetry.
      cost = msg.total_cost_usd ?? 0;
      durationMs = msg.duration_api_ms ?? msg.duration_ms ?? 0;
    }
  }
  // Greedy match from the first "{" to the last "}" surrounding a "findings"
  // key, tolerating any prose the model emitted around the JSON.
  const jsonMatch = resultText.match(/\{[\s\S]*"findings"[\s\S]*\}/);
  if (!jsonMatch) throw new Error(`Sonnet did not produce findings JSON. Tail: ${resultText.slice(-300)}`);
  const parsed = JSON.parse(jsonMatch[0]);
  return { ...parsed, cost, durationMs, toolCalls };
}
// ── Output ──
/**
 * Render the final audit report as Markdown: a header with total cost, then
 * one section per doc with its triage verdict and a table of deep findings.
 *
 * @param {{docs: object[], totalCost: number}} report - Accumulated audit results.
 * @returns {string} Markdown document.
 */
function renderMarkdown(report) {
  // Table cells are model-generated text; escape pipes and flatten newlines
  // so a stray "|" or line break cannot break the Markdown table layout.
  const cell = (v) => String(v ?? '').replaceAll('|', '\\|').replaceAll('\n', ' ');
  const lines = [`# Agent docs audit\n`, `Target: \`${REPO_ROOT}\`\n`];
  if (DRY_RUN) lines.push('**DRY RUN** - no API calls were made.\n');
  lines.push(`Total cost: $${report.totalCost.toFixed(4)} (${report.docs.length} docs reviewed)\n`);
  for (const d of report.docs) {
    lines.push(`## \`${d.path}\` (${d.lines} lines)\n`);
    lines.push(`Reasons flagged: ${d.reasons.join('; ')}`);
    lines.push(`Triage: ${d.triage?.decision ?? 'n/a'} - ${d.triage?.reason ?? ''}`);
    if (!d.deep) { lines.push(''); continue; }
    // Guard: the findings array comes from model-emitted JSON and may be absent.
    const findings = d.deep.findings ?? [];
    if (findings.length === 0) {
      lines.push('No findings.\n');
      continue;
    }
    lines.push('| Label | Section | Reason | Suggested action |');
    lines.push('|---|---|---|---|');
    for (const f of findings) {
      lines.push(`| ${cell(f.label)} | ${cell(f.section)} | ${cell(f.reason)} | ${cell(f.suggested_action)} |`);
    }
    lines.push('');
  }
  return lines.join('\n');
}
// ── Main ──
/**
 * Orchestrate the three audit layers.
 *
 * Side effects: writes /tmp/agent-docs-audit-l1.md and
 * /tmp/agent-docs-audit.json, prints the report (Markdown) to stdout and
 * progress messages to stderr.
 */
async function main() {
  // SKIP_AI is set automatically if ANTHROPIC_API_KEY is missing, or via --skip-ai.
  // In that case the script runs L1 only and the workflow uploads an L1-only report.
  const policyText = existsSync(POLICY_FILE)
    ? readFileSync(POLICY_FILE, 'utf-8')
    : '# Default policy\n\n(No policy file found; using built-in defaults: root <= 120 lines, nested <= 200, label findings KEEP/TRIM/MOVE/UPDATE/INVESTIGATE.)';
  console.error('[L1] running deterministic scan...');
  const scan = runL1Scan(REPO_ROOT);
  writeFileSync('/tmp/agent-docs-audit-l1.md', renderL1Markdown(scan));
  // Reshape L1 scan entries into the fields the L2/L3 prompts reference.
  let flagged = flaggedForReview(scan).map((f) => ({
    path: f.relPath,
    lines: f.lineCount,
    kind: f.isSymlink ? 'symlink' : 'file',
    brokenRefs: f.brokenPathRefs,
    reasons: f.reasons,
  }));
  if (ONLY) flagged = flagged.filter((d) => d.path === ONLY);
  console.error(`[L1] ${scan.files.length} doc(s) inventoried, ${flagged.length} flagged for review${ONLY ? ` (filtered to --only ${ONLY})` : ''}`);
  if (SKIP_AI) {
    console.error('[L2/L3] skipped (no ANTHROPIC_API_KEY or --skip-ai). Writing L1 report only.');
    // Same JSON shape as the full report (triage/deep nulled) so consumers
    // can parse either variant; l1Only marks the degraded run.
    const stub = { docs: flagged.map((d) => ({ ...d, triage: null, deep: null })), totalCost: 0, l1Only: true };
    writeFileSync('/tmp/agent-docs-audit.json', JSON.stringify(stub, null, 2));
    console.log(renderL1Markdown(scan));
    return;
  }
  const report = { docs: [], totalCost: 0 };
  // Docs are processed one at a time; each triage verdict gates whether the
  // doc also incurs the more expensive L3 pass.
  for (const doc of flagged) {
    console.error(`[L2] triage: ${doc.path}`);
    const triage = await haikuTriage(doc, policyText);
    report.totalCost += triage.cost;
    const entry = { ...doc, triage, deep: null };
    if (triage.decision === 'review') {
      console.error(`[L3] deep analysis: ${doc.path}`);
      const deep = await sonnetDeep(doc, policyText);
      report.totalCost += deep.cost;
      entry.deep = deep;
    }
    report.docs.push(entry);
  }
  writeFileSync('/tmp/agent-docs-audit.json', JSON.stringify(report, null, 2));
  console.log(renderMarkdown(report));
}
// Entry point: run the audit; any failure prints the stack to stderr and
// exits non-zero so CI treats the run as failed.
(async () => {
  try {
    await main();
  } catch (err) {
    console.error(`audit failed: ${err.stack || err.message}`);
    process.exit(1);
  }
})();