Skip to content

Commit d861142

Browse files
committed
test(media-use): resolve tests + eval harness
12 resolve engine tests + eval against 7 real registry blocks.
1 parent 1de2de4 commit d861142

2 files changed

Lines changed: 550 additions & 0 deletions

File tree

skills/media-use/scripts/eval.mjs

Lines changed: 303 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,303 @@
1+
#!/usr/bin/env node
2+
3+
/**
4+
* media-use eval — compare baseline (no media-use) vs. with media-use
5+
* on real registry blocks. Produces an HTML report.
6+
*/
7+
8+
import { mkdtempSync, cpSync, rmSync, readFileSync, readdirSync, existsSync, writeFileSync } from "node:fs";
import { join, basename, resolve, dirname } from "node:path";
import { execFileSync, execSync } from "node:child_process";
import { tmpdir } from "node:os";
import { fileURLToPath } from "node:url";
13+
14+
// Directory that holds this script; sibling scripts are located relative to it.
const SCRIPT_DIR = dirname(fileURLToPath(import.meta.url));
// Three levels above the script directory.
const REPO_ROOT = resolve(SCRIPT_DIR, "../../..");
// The resolver CLI invoked by the eval; it lives next to this script.
const RESOLVE_SCRIPT = resolve(SCRIPT_DIR, "resolve.mjs");

// Registry blocks evaluated by this harness, as paths relative to REPO_ROOT.
const TEST_BLOCKS = [
  "nyc-paris-flight",
  "macos-tahoe-liquid-glass",
  "blue-sweater-intro-video",
  "vpn-youtube-spot",
  "apple-money-count",
  "liquid-glass-notification",
  "instagram-follow",
].map((blockName) => `registry/blocks/${blockName}`);
27+
28+
/**
 * Run a command line synchronously and report the outcome without throwing.
 *
 * @param {string} cmd - Command line handed to `execSync` (interpreted by a shell).
 * @param {object} [opts] - Extra `execSync` options; they override the defaults
 *   (utf8 encoding, 15 s timeout, piped stdio).
 * @returns {{ ok: boolean, output: string, code?: number | null }}
 *   On success `output` is the trimmed stdout. On failure `output` is the trimmed
 *   stdout followed by stderr, or the error message when the child printed
 *   nothing, and `code` is the child's exit status as reported on the error.
 */
function run(cmd, opts = {}) {
  const options = { encoding: "utf8", timeout: 15000, stdio: "pipe", ...opts };
  try {
    return { ok: true, output: execSync(cmd, options).trim() };
  } catch (err) {
    // Trim like the success path so callers see the same shape either way.
    const captured = `${err.stdout ?? ""}${err.stderr ?? ""}`.trim();
    // If the child printed nothing, keep the error message so the failure
    // reason is not silently dropped.
    const output = captured !== "" ? captured : String(err.message ?? "");
    return { ok: false, output, code: err.status };
  }
}
35+
36+
/**
 * List every file below `<dir>/assets`, recursing into subdirectories.
 *
 * @param {string} dir - Directory expected to contain an `assets` folder.
 * @returns {{ count: number, files: string[] }} File paths relative to the
 *   `assets` folder, joined with "/"; an empty result when the folder is absent.
 */
function countAssetFiles(dir) {
  const root = join(dir, "assets");
  if (!existsSync(root)) {
    return { count: 0, files: [] };
  }

  // Depth-first listing: a directory entry expands in place into its contents.
  const collect = (absDir, prefix) =>
    readdirSync(absDir, { withFileTypes: true }).flatMap((entry) => {
      const relPath = prefix ? `${prefix}/${entry.name}` : entry.name;
      return entry.isDirectory() ? collect(join(absDir, entry.name), relPath) : [relPath];
    });

  const files = collect(root, "");
  return { count: files.length, files };
}
50+
51+
// References that do not point at a file inside the block: inline data,
// remote URLs, and protocol-relative URLs.
const EXTERNAL_REF = /^(?:data:|https?:\/\/|\/\/)/i;

/** Parse `text` as JSON and return it only if it is an object or array; otherwise null. */
function parseJsonObject(text) {
  try {
    const value = JSON.parse(text);
    return value !== null && typeof value === "object" ? value : null;
  } catch {
    return null;
  }
}

/** Collect local `src="…"` and `url(…)` references found in one composition's HTML. */
function extractAssetRefs(html, composition) {
  const refs = [];
  for (const [, ref] of html.matchAll(/src=["']([^"']+?)["']/g)) {
    if (!EXTERNAL_REF.test(ref)) refs.push({ composition, ref });
  }
  for (const [, ref] of html.matchAll(/url\(["']?([^"')]+?)["']?\)/g)) {
    // "#…" targets are in-document fragments, not files.
    if (!EXTERNAL_REF.test(ref) && !ref.startsWith("#")) refs.push({ composition, ref });
  }
  return refs;
}

/** Drop any query/fragment suffix and leading "./" so equivalent spellings of a path compare equal. */
function normalizeAssetPath(ref) {
  return ref.replace(/[?#].*$/, "").replace(/^(?:\.\/)+/, "");
}

/**
 * Evaluate one registry block on a throwaway copy: record the baseline view
 * (asset files and composition references), run the resolver with `--adopt`,
 * read the generated index and manifest, then probe one lookup that should
 * hit the cache and one that should miss.
 *
 * @param {string} blockPath - Block directory, relative to REPO_ROOT.
 * @returns {object | null} Result record consumed by generateReport, or null
 *   when the block directory does not exist.
 */
function evalBlock(blockPath) {
  const fullPath = join(REPO_ROOT, blockPath);
  if (!existsSync(fullPath)) return null;

  const name = basename(blockPath);
  const tmp = mkdtempSync(join(tmpdir(), `mu-eval-${name}-`));

  try {
    cpSync(fullPath, tmp, { recursive: true });

    // Baseline: what is visible before the resolver has touched the copy.
    const baseline = countAssetFiles(tmp);
    const htmlFiles = readdirSync(tmp).filter((f) => f.endsWith(".html"));
    const assetRefs = htmlFiles.flatMap((hf) => extractAssetRefs(readFileSync(join(tmp, hf), "utf8"), hf));

    // With media-use: run --adopt on the copy.
    const adoptResult = run(`node "${RESOLVE_SCRIPT}" --adopt --project "${tmp}" --json`);
    const adopted = (adoptResult.ok && parseJsonObject(adoptResult.output)) || { ok: false, adopted: 0, assets: [] };

    const indexPath = join(tmp, ".media", "index.md");
    const indexContent = existsSync(indexPath) ? readFileSync(indexPath, "utf8") : "(no index generated)";

    // One JSON record per line; unparsable or non-object lines are skipped.
    const manifestPath = join(tmp, ".media", "manifest.jsonl");
    const manifest = existsSync(manifestPath)
      ? readFileSync(manifestPath, "utf8")
          .split(/\r?\n/)
          .map((line) => parseJsonObject(line))
          .filter(Boolean)
      : [];

    // Cache-hit probe: ask for the first adopted asset again. The type and
    // intent come from manifest data, so they are passed as an argument vector
    // (no shell) and reach the resolver verbatim whatever characters they hold.
    let resolveTest = null;
    const [first] = manifest;
    const prompt = first?.provenance?.prompt || first?.description;
    if (first && first.type != null && typeof prompt === "string" && prompt !== "") {
      try {
        const stdout = execFileSync(
          "node",
          [RESOLVE_SCRIPT, "--type", String(first.type), "--intent", prompt, "--project", tmp, "--json"],
          { encoding: "utf8", timeout: 15000, stdio: "pipe" },
        );
        resolveTest = parseJsonObject(stdout.trim());
      } catch {
        // A failing probe is a valid eval outcome: resolveTest stays null and
        // the report shows it as "no hit".
      }
    }

    // Miss probe: a query that should match nothing. The payload is parsed
    // whatever the exit status; the report judges it by its `ok` field.
    const missResult = run(`node "${RESOLVE_SCRIPT}" --type bgm --intent "nonexistent query xyz" --project "${tmp}" --json`);
    const resolveMiss = parseJsonObject(missResult.output);

    // Coverage: a reference counts as covered when the manifest lists its path,
    // either exactly as written or after normalisation.
    // NOTE(review): assumes manifest `path` values are relative to the project
    // root, like the references in the compositions — confirm against resolve.mjs.
    const manifestPaths = new Set();
    for (const record of manifest) {
      if (typeof record.path !== "string") continue;
      manifestPaths.add(record.path);
      manifestPaths.add(normalizeAssetPath(record.path));
    }
    const coverage = assetRefs.map((r) => ({
      ...r,
      covered: manifestPaths.has(r.ref) || manifestPaths.has(normalizeAssetPath(r.ref)),
    }));

    return {
      name,
      baseline: { fileCount: baseline.count, files: baseline.files, htmlCount: htmlFiles.length },
      compositions: htmlFiles,
      assetRefs: coverage,
      adopted: {
        // Always a number so report totals cannot turn into NaN.
        count: Number.isFinite(adopted.adopted) ? adopted.adopted : 0,
        assets: Array.isArray(adopted.assets) ? adopted.assets : [],
      },
      index: indexContent,
      manifest,
      resolveTest,
      resolveMiss,
    };
  } finally {
    rmSync(tmp, { recursive: true, force: true });
  }
}
140+
141+
// Stylesheet embedded in the generated report.
const REPORT_STYLES = `:root { --bg: #fafaf7; --text: #1b1b18; --muted: #7a756a; --accent: #0d7377; --good: #1a7a3a; --warn: #b45309; --fail: #dc2626; --border: #e8e5df; --surface: #fff; --mono: ui-monospace, 'SF Mono', Menlo, Consolas, monospace; --sans: system-ui, -apple-system, sans-serif; --serif: Georgia, serif }
* { box-sizing: border-box; margin: 0 } body { background: var(--bg); color: var(--text); font-family: var(--serif); line-height: 1.6; font-size: 15px; padding: 40px 24px }
.wrap { max-width: 1100px; margin: 0 auto }
h1 { font-family: var(--sans); font-size: 28px; font-weight: 700; margin-bottom: 8px; letter-spacing: -.02em }
h2 { font-family: var(--sans); font-size: 20px; font-weight: 650; margin: 32px 0 12px; letter-spacing: -.01em }
h3 { font-family: var(--sans); font-size: 17px; font-weight: 650; margin: 24px 0 8px }
h4 { font-family: var(--sans); font-size: 14px; font-weight: 600; margin: 16px 0 6px; color: var(--muted) }
p { margin-bottom: 10px }
.meta { font-family: var(--mono); font-size: 12px; color: var(--muted); margin-bottom: 24px }
.summary { display: flex; gap: 16px; margin: 16px 0; flex-wrap: wrap }
.stat { background: var(--surface); border: 1px solid var(--border); border-radius: 8px; padding: 14px 18px; flex: 1; min-width: 140px }
.stat .num { font-family: var(--sans); font-size: 28px; font-weight: 700; color: var(--accent) }
.stat .label { font-family: var(--mono); font-size: 11px; color: var(--muted); text-transform: uppercase; letter-spacing: .1em }
table { width: 100%; border-collapse: collapse; font-size: 13px; font-family: var(--sans); margin: 8px 0 }
th { text-align: left; font-family: var(--mono); font-size: 10px; letter-spacing: .08em; text-transform: uppercase; color: var(--muted); border-bottom: 2px solid var(--border); padding: 6px 8px; font-weight: 700 }
td { border-bottom: 1px solid var(--border); padding: 7px 8px; vertical-align: top }
td.path { font-family: var(--mono); font-size: 12px; color: var(--muted); max-width: 300px; overflow: hidden; text-overflow: ellipsis }
.pass { color: var(--good); font-weight: 600 } .warn { color: var(--warn); font-weight: 600 } .fail { color: var(--fail); font-weight: 600 }
.comparison { display: grid; grid-template-columns: 1fr 1fr; gap: 16px; margin: 12px 0 }
@media(max-width:700px) { .comparison { grid-template-columns: 1fr } }
.col { background: var(--surface); border: 1px solid var(--border); border-radius: 8px; padding: 14px 16px }
.col h4 { margin-top: 0 }
pre { font-family: var(--mono); font-size: 12px; background: #1b1b18; color: #d4d0c8; border-radius: 6px; padding: 12px 14px; overflow-x: auto; margin: 6px 0; line-height: 1.5 }
pre.file-list { background: var(--bg); color: var(--muted); border: 1px solid var(--border) }
pre.index { white-space: pre; }
.block-detail { border-top: 1px solid var(--border); padding-top: 20px; margin-top: 20px }
.verdict { margin-top: 24px; padding: 16px 20px; border-radius: 8px; font-family: var(--sans); font-size: 15px }
.verdict.ship { background: #edfbf0; border: 1px solid #1a7a3a; color: #1a7a3a }
.verdict.wait { background: #fff3ec; border: 1px solid #d94f04; color: #d94f04 }`;

/**
 * Render the eval results as a self-contained HTML report.
 *
 * Every value that originates from block data (names, paths, ids, descriptions,
 * references, index text) is HTML-escaped before interpolation, and the verdict
 * line states the measured counts instead of asserting success.
 *
 * @param {Array<object | null>} results - One record per block as produced by
 *   evalBlock; null entries (skipped blocks) are ignored.
 * @returns {string} The complete HTML document.
 */
function generateReport(results) {
  // Coerce to string first so numeric or missing fields never break escaping.
  const esc = (value) => escapeHtml(String(value ?? ""));
  const plural = (n) => (n === 1 ? "" : "s");
  const pct = (part, total) => Math.round((part / total) * 100);
  const adoptedCount = (r) => Number(r.adopted.count) || 0;
  const hasMetadata = (r) => r.manifest.some((m) => m.duration || m.width);
  const isCacheHit = (r) => r.resolveTest?._source === "cached";
  const isMissHandled = (r) => r.resolveMiss?.ok === false;

  const all = results.filter(Boolean);
  const passed = all.filter((r) => adoptedCount(r) > 0);
  const withMetadata = all.filter(hasMetadata);
  const cacheHits = all.filter(isCacheHit);
  const missesHandled = all.filter(isMissHandled);
  const totalAdopted = all.reduce((sum, r) => sum + adoptedCount(r), 0);

  const allRefs = all.flatMap((r) => r.assetRefs);
  const overallCoverage = allRefs.length > 0 ? `${pct(allRefs.filter((c) => c.covered).length, allRefs.length)}%` : "—";

  const rows = all
    .map((r) => `<tr>
<td><strong>${esc(r.name)}</strong></td>
<td>${esc(r.baseline.fileCount)} files, ${esc(r.baseline.htmlCount)} comp${plural(r.baseline.htmlCount)}</td>
<td>${adoptedCount(r)} adopted</td>
<td>${hasMetadata(r) ? "<span class='pass'>with metadata</span>" : "<span class='warn'>no metadata</span>"}</td>
<td>${isCacheHit(r) ? "<span class='pass'>cache hit</span>" : "<span class='warn'>no hit</span>"}</td>
<td>${isMissHandled(r) ? "<span class='pass'>handled</span>" : "<span class='fail'>unexpected</span>"}</td>
</tr>`)
    .join("\n");

  const details = passed
    .map((r) => {
      const assetRows = r.manifest
        .map((m) => {
          const dur = m.duration != null ? `${esc(m.duration)}s` : "—";
          const dims = m.width && m.height ? `${esc(m.width)}×${esc(m.height)}` : "—";
          return `<tr><td>${esc(m.id)}</td><td>${esc(m.type)}</td><td>${dur}</td><td>${dims}</td><td class="path">${esc(m.path)}</td><td>${esc(m.description)}</td></tr>`;
        })
        .join("\n");

      const coveredCount = r.assetRefs.filter((c) => c.covered).length;
      const totalRefs = r.assetRefs.length;
      const coveragePct = totalRefs > 0 ? pct(coveredCount, totalRefs) : 100;

      const refRows = r.assetRefs
        .map((c) => `<tr><td class="path">${esc(c.composition)}</td><td class="path">${esc(c.ref)}</td><td>${c.covered ? "<span class='pass'>covered</span>" : "<span class='warn'>not in manifest</span>"}</td></tr>`)
        .join("\n");

      const coverageSection = totalRefs > 0
        ? `<h4>Composition → asset coverage <span class="${coveragePct === 100 ? "pass" : "warn"}">${coveragePct}%</span> (${coveredCount}/${totalRefs} refs)</h4>
<table class="manifest">
<thead><tr><th>composition</th><th>asset reference</th><th>in manifest?</th></tr></thead>
<tbody>${refRows}</tbody>
</table>`
        : "";

      return `<div class="block-detail">
<h3>${esc(r.name)}</h3>
<p style="font-size:13px;color:var(--muted)">${r.compositions.length} composition${plural(r.compositions.length)}: ${esc(r.compositions.join(", "))}</p>

<div class="comparison">
<div class="col">
<h4>Baseline (no media-use)</h4>
<p>Agent sees: ${esc(r.baseline.fileCount)} raw files in assets/<br>No metadata, no type info, no relationship to compositions.</p>
<pre class="file-list">${esc(r.baseline.files.join("\n")) || "(no assets)"}</pre>
</div>
<div class="col">
<h4>With media-use (after --adopt)</h4>
<p>Agent reads index.md — structured, typed, with metadata:</p>
<pre class="index">${esc(r.index)}</pre>
</div>
</div>

${coverageSection}

<h4>Manifest records</h4>
<table class="manifest">
<thead><tr><th>id</th><th>type</th><th>dur</th><th>dims</th><th>path</th><th>description</th></tr></thead>
<tbody>${assetRows}</tbody>
</table>
</div>`;
    })
    .join("\n");

  const ship = passed.length >= 3;
  // The verdict reports what was measured rather than claiming every check passed.
  const verdict = ship
    ? `<strong>Ship it.</strong> ${passed.length}/${all.length} blocks adopted assets (${withMetadata.length} with metadata). Cache hits: ${cacheHits.length}/${all.length}. Misses handled: ${missesHandled.length}/${all.length}.`
    : `<strong>Needs work.</strong> Only ${passed.length} block${plural(passed.length)} adopted. Check the failures above.`;

  // The doctype and charset keep the non-ASCII glyphs used in the report
  // readable when the file is opened straight from disk.
  return `<!doctype html>
<meta charset="utf-8">
<title>media-use eval report</title>
<style>
${REPORT_STYLES}
</style>
<div class="wrap">
<h1>media-use eval report</h1>
<p class="meta">${new Date().toISOString().slice(0, 10)} · ${all.length} blocks evaluated · baseline vs. media-use --adopt</p>

<div class="summary">
<div class="stat"><div class="num">${all.length}</div><div class="label">blocks tested</div></div>
<div class="stat"><div class="num">${passed.length}</div><div class="label">with assets</div></div>
<div class="stat"><div class="num">${totalAdopted}</div><div class="label">assets adopted</div></div>
<div class="stat"><div class="num">${withMetadata.length}</div><div class="label">with ffprobe metadata</div></div>
<div class="stat"><div class="num">${overallCoverage}</div><div class="label">composition coverage</div></div>
</div>

<h2>Results matrix</h2>
<table>
<thead><tr><th>Block</th><th>Baseline</th><th>Adopted</th><th>Metadata</th><th>Cache hit</th><th>Miss handling</th></tr></thead>
<tbody>${rows}</tbody>
</table>

<h2>Before / after comparisons</h2>
${details}

<div class="verdict ${ship ? "ship" : "wait"}">
${verdict}
</div>
</div>`;
}
275+
276+
/**
 * Escape text for safe inclusion in HTML element content or quoted attributes.
 *
 * @param {*} str - Value to escape; null and undefined become an empty string,
 *   anything else is converted with String().
 * @returns {string} The text with &, <, >, " and ' replaced by entities.
 */
function escapeHtml(str) {
  return String(str ?? "")
    .replace(/&/g, "&amp;") // must run first so later entities are not double-escaped
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&#39;");
}
279+
280+
// Driver: evaluate each configured block, print one progress line per block,
// then write the HTML report next to the scripts directory.
console.log("media-use eval · running against registry blocks...\n");

const results = [];
for (const block of TEST_BLOCKS) {
  const label = basename(block);
  if (!existsSync(join(REPO_ROOT, block))) {
    console.log(` skip ${label} (not found)`);
    results.push(null);
    continue;
  }

  process.stdout.write(` ${label}...`);
  let result = null;
  let failure = "";
  try {
    result = evalBlock(block);
  } catch (err) {
    // One broken block must not abort the whole eval: note the reason and
    // carry on so the remaining blocks are still evaluated and reported.
    failure = ` (${err?.message ?? err})`;
  }

  if (result) {
    const withMetadata = result.manifest.filter((m) => m.duration || m.width).length;
    console.log(` ${result.adopted.count} adopted, ${withMetadata} with metadata`);
  } else {
    console.log(` failed${failure}`);
  }
  results.push(result);
}

const report = generateReport(results);
const outPath = join(SCRIPT_DIR, "..", "eval-report.html");
writeFileSync(outPath, report);
console.log(`\nReport: ${outPath}`);

0 commit comments

Comments
 (0)