|
| 1 | +/** |
| 2 | + * STAGE-7 HONEST evaluator — the "intelligent" L5d (fine-tuned Apache-2.0 |
| 3 | + * distil-mBERT) vs the working baseline (v3 L5a + bert-tiny L5b + Stage-6 |
| 4 | + * from-scratch l5c), model-only on the SAME seeded leakage-free split, plus |
| 5 | + * the SYSTEM-level P4RS measurement (Spotlighting: obfuscation is owned by |
| 6 | + * L1 normalize + L4 fence which PRECEDE the model — the Stage-6 wrong-bar |
| 7 | + * was model-only-on-raw). NO TUNING here: pure measurement at the SAME |
| 8 | + * 0.445 model threshold every prior stage used (0.5 for the P4RS guard, |
| 9 | + * 0.8 for the product cascade). |
| 10 | + * |
| 11 | + * Held-out slices NEVER trained/tuned/early-stopped on: |
| 12 | + * • FROZEN test (training/.real_cache/test.jsonl) |
| 13 | + * • UNSEEN-SOURCE (deepset/prompt-injections + Lakera/gandalf_sum) |
| 14 | + * • POEM-UNSEEN (whole disjoint synthetic-poem source) |
| 15 | + * • OOD pooled (training/.ood_cache/ood.jsonl attack rows) |
| 16 | + * • P4RS3LT0NGV3 battery |
| 17 | + * Leakage-free benign FP on the split's ood_benign_not_in_train.json. |
| 18 | + * |
| 19 | + * Run: npm run build && node scripts/eval-l5d.mjs |
| 20 | + */ |
| 21 | +import fs from "node:fs"; |
| 22 | +import crypto from "node:crypto"; |
| 23 | +import { createRequire } from "node:module"; |
| 24 | +import { fileURLToPath } from "node:url"; |
| 25 | +import { dirname, join } from "node:path"; |
| 26 | +const require = createRequire(import.meta.url); |
| 27 | +const { createPromptPurify } = require("../dist/index.cjs"); |
| 28 | +const { |
| 29 | + createL5aClassifier, |
| 30 | + createL5bRunner, |
| 31 | + createL5cRunner, |
| 32 | + createL5dRunner, |
| 33 | + createL5Cascade, |
| 34 | +} = require("../dist/l5/index.cjs"); |
| 35 | +import { buildPositives } from "./corpus-l5a.mjs"; |
| 36 | + |
// Score threshold applied to the per-model scores in the recall / FP tables.
const T = 0.445;
const __dir = dirname(fileURLToPath(import.meta.url));
// Cache directory holding test.jsonl and the unseen benches read below.
const C = join(__dir, "..", "training", ".real_cache");

/**
 * Build an optional runner.
 *
 * @param {string} label - Name used in the stderr message on failure.
 * @param {Function} factory - Called with no arguments; its result is awaited.
 * @returns {Promise<object|null>} The runner, or null when the factory throws
 *   or rejects (the error's message is logged to stderr in that case).
 */
async function loadOptionalRunner(label, factory) {
  try {
    return await factory();
  } catch (e) {
    console.error(`${label} unavailable:`, e.message);
    return null;
  }
}

const reg = createPromptPurify({ profile: "balanced" }); // L1+L4, NO model
const l5a = createL5aClassifier();
// L5b and L5c are best-effort: the rest of the script checks them for null.
const l5b = await loadOptionalRunner("L5b", createL5bRunner);
const l5c = await loadOptionalRunner("L5c", createL5cRunner);
// No fallback here: a failure to create L5d aborts the script.
const l5d = await createL5dRunner(); // intelligent (must exist)
| 56 | + |
/**
 * Read a JSON-Lines file and return one parsed value per non-blank line.
 *
 * Lines are split on LF or CRLF and trimmed individually, so blank lines that
 * contain only whitespace or a stray carriage return are skipped instead of
 * being handed to JSON.parse (which would throw on them).
 *
 * @param {string} p - Path of the .jsonl file.
 * @returns {Array<*>} Parsed records in file order; [] for an empty file.
 * @throws {SyntaxError} When a non-blank line is not valid JSON.
 */
const parse = (p) =>
  fs
    .readFileSync(p, "utf8")
    .split(/\r?\n/)
    .map((line) => line.trim())
    .filter(Boolean)
    .map((line) => JSON.parse(line));
| 61 | + |
// OOD pool, partitioned by its `y` label (0 -> benign, 1 -> attack). Rows with
// any other label end up in neither subset.
const ood = parse(join(__dir, "..", "training", ".ood_cache", "ood.jsonl"));
const withLabel = (rows, label) => rows.filter(({ y }) => y === label);
const benign = withLabel(ood, 0);
const attack = withLabel(ood, 1);

// Benches read from the .real_cache directory.
const [test, unseen, poemUnseen] = [
  "test.jsonl",
  "unseen_bench.jsonl",
  "poem_unseen_bench.jsonl",
].map((name) => parse(join(C, name)));
| 68 | + |
// Coerce to a string (falsy input becomes "") and keep the first 4000 UTF-16
// code units; both helpers below feed the registry the same clipped text.
const clip = (t) => String(t || "").slice(0, 4000);

/** Run the model-free registry's sanitize() on the clipped text. */
const san = (t) => reg.sanitize(clip(t), { sink: "untrusted_data" });

/** True when the registry's inspect() verdict is anything but "clean-structural". */
const regBad = (t) => {
  const { verdict } = reg.inspect(clip(t), { sink: "untrusted_data" });
  return verdict !== "clean-structural";
};
| 74 | + |
/**
 * Score every row with each classifier, one row at a time.
 *
 * The models receive the sanitized text; the `l4` flag is computed from the
 * raw text via regBad(). Optional models (l5b / l5c) that failed to load
 * contribute NaN.
 *
 * @param {Array<{text: *, y: *}>} rows - Labelled rows.
 * @returns {Promise<Array<{y: *, l4: boolean, l5a: number, l5b: number,
 *   l5c: number, l5d: number}>>} One result per input row, in input order.
 */
async function scoreModels(rows) {
  const scored = [];
  for (const { text, y } of rows) {
    const cleaned = san(text);
    const l5aResult = await l5a(cleaned, { sink: "untrusted_data" });
    const l5bScore = l5b ? await l5b.score(cleaned) : NaN;
    const l5cScore = l5c ? await l5c.score(cleaned) : NaN;
    const l5dScore = await l5d.score(cleaned);
    scored.push({
      y,
      l4: regBad(text),
      l5a: l5aResult.score,
      l5b: l5bScore,
      l5c: l5cScore,
      l5d: l5dScore,
    });
  }
  return scored;
}
| 87 | + |
/**
 * Format n/d as a percentage with two decimals, e.g. pct(1, 4) -> "25.00%".
 * A zero (or otherwise falsy) denominator yields plain "n/a" -- the percent
 * sign is only attached to an actual number.
 *
 * @param {number} n - Numerator.
 * @param {number} d - Denominator.
 * @returns {string} Formatted percentage or "n/a".
 */
const pct = (n, d) => (d ? `${((100 * n) / d).toFixed(2)}%` : "n/a");
/**
 * Fraction of rows whose `key` score is at or above the threshold.
 *
 * Returns NaN ("no measurement") for an empty set and also when every row's
 * score is NaN, which is what scoreModels records for a model that failed to
 * load. Without that second case an absent model would be reported as a
 * measured rate of 0. JSON.stringify serialises NaN as null.
 *
 * @param {Array<object>} set - Scored rows.
 * @param {string} key - Score property to read from each row.
 * @param {number} [thr=T] - Inclusive threshold.
 * @returns {number} Rate in [0, 1], or NaN when nothing was measured.
 */
const rate = (set, key, thr = T) => {
  if (set.length === 0 || set.every((r) => Number.isNaN(r[key]))) {
    return NaN;
  }
  return set.filter((r) => r[key] >= thr).length / set.length;
};
| 91 | + |
/**
 * SHA-1 hex digest of a normalised form of the text: NFKC, lower-cased, every
 * character that is not a letter, number, underscore or whitespace replaced by
 * a space, whitespace runs collapsed to one space, then trimmed.
 *
 * The input is coerced with String(s || "") -- the same coercion san() and
 * regBad() apply -- so a non-string `text` field (e.g. a number) is hashed
 * instead of throwing on `.normalize`.
 *
 * NOTE(review): the digests are compared against the `hashes` list in
 * ood_benign_not_in_train.json, so the normalisation steps presumably have to
 * match whatever produced that file -- confirm before changing them.
 *
 * @param {*} s - Text to hash; falsy values hash as the empty string.
 * @returns {string} 40-character lowercase hex SHA-1 digest.
 */
const normHash = (s) => {
  const folded = String(s || "").normalize("NFKC").toLowerCase();
  const collapsed = folded
    .replace(/[^\p{L}\p{N}_\s]/gu, " ")
    .replace(/\s+/gu, " ")
    .trim();
  return crypto.createHash("sha1").update(collapsed, "utf8").digest("hex");
};
// Optional hash list used to build the leakage-free benign subset. When the
// file is missing, lfSet is null and that subset ends up empty.
const lfPath = join(C, "ood_benign_not_in_train.json");
const readHashes = (file) =>
  new Set(JSON.parse(fs.readFileSync(file, "utf8")).hashes);
const lfSet = fs.existsSync(lfPath) ? readHashes(lfPath) : null;
| 102 | + |
// Score each slice in turn (sequentially, in this order).
const B = await scoreModels(benign);
const A = await scoreModels(attack);
const TST = await scoreModels(test);
const UNS = await scoreModels(unseen);
const PUN = await scoreModels(poemUnseen);

// Recall is measured on the rows labelled y === 1 only.
const positives = (scored) => scored.filter(({ y }) => y === 1);
const tstPos = positives(TST);
const unsPos = positives(UNS);
const punPos = positives(PUN);
| 112 | + |
// ---- P4RS3LT0NGV3 battery: model-only AND SYSTEM-level ----------------------
// Hit counters per layer; `n` is the number of battery texts seen.
const par = { l4: 0, l5a: 0, l5b: 0, l5c: 0, l5d: 0, sys: 0, n: 0 };
const ppSys = createPromptPurify({
  classifier: createL5Cascade({ enableL5b: true, l5dIntelligent: true }),
});
for (const text of buildPositives()) {
  const cleaned = san(text);
  par.n += 1;
  if (regBad(text)) {
    par.l4 += 1;
  }

  // Model-only: each model scores the sanitized text, counted at >= 0.5.
  const l5aResult = await l5a(cleaned, { sink: "untrusted_data" });
  if (l5aResult.score >= 0.5) {
    par.l5a += 1;
  }
  if (l5b) {
    const l5bScore = await l5b.score(cleaned);
    if (l5bScore >= 0.5) {
      par.l5b += 1;
    }
  }
  if (l5c) {
    const l5cScore = await l5c.score(cleaned);
    if (l5cScore >= 0.5) {
      par.l5c += 1;
    }
  }
  const l5dScore = await l5d.score(cleaned);
  if (l5dScore >= 0.5) {
    par.l5d += 1;
  }

  // SYSTEM-level: raw text through the FULL pipeline (L1 normalize + L4
  // fence + opt-in l5d cascade) — Spotlighting bar, the honest one.
  const { verdict } = await ppSys.inspectAsync(String(text).slice(0, 4000), {
    sink: "untrusted_data",
  });
  // flagged|blocked = caught
  if (verdict !== "clean-structural") {
    par.sys += 1;
  }
}
| 134 | + |
// Benign OOD rows whose normalised-text hash appears in the leakage-free hash
// list. B is index-aligned with `benign`, so benign[i] supplies the raw text
// for scored row i. Empty when the hash list could not be loaded.
const Bclean = lfSet
  ? B.filter((_, i) => lfSet.has(normHash(benign[i].text)))
  : [];

/**
 * Leakage-free benign false-positive rate for one model at threshold T.
 *
 * Returns NaN for an empty subset and also when every score in the column is
 * NaN (the value recorded for a model that failed to load), so an absent
 * model is not reported as a measured rate of 0.
 *
 * @param {string} key - Score property to read from each row.
 * @returns {number} Rate in [0, 1], or NaN when nothing was measured.
 */
const lf = (key) => {
  if (Bclean.length === 0 || Bclean.every((r) => Number.isNaN(r[key]))) {
    return NaN;
  }
  return Bclean.filter((r) => r[key] >= T).length / Bclean.length;
};
| 142 | + |
// Report header: model versions and slice sizes.
console.log("\n========= STAGE-7 INTELLIGENT L5d EVAL =========");
// Every other call site awaits l5a(); do the same here so `.version` is read
// from the classifier's result rather than from a pending Promise. Awaiting is
// harmless if the classifier happens to be synchronous.
console.log(`L5a: ${(await l5a("x", { sink: "untrusted_data" })).version}`);
console.log(`L5b: ${l5b ? l5b.version : "UNAVAILABLE"}`);
console.log(`L5c: ${l5c ? l5c.version : "UNAVAILABLE"}`);
console.log(`L5d: ${l5d.version} (PRETRAINED, FINE-TUNED)`);
console.log(
  `n: benignOOD=${benign.length} attackOOD=${attack.length} ` +
    `frozenTESTpos=${tstPos.length} UNSEENpos=${unsPos.length} ` +
    `POEMUNSEEN=${punPos.length}\n`,
);
| 153 | + |
/**
 * Print one recall line for a model across the four positive slices.
 *
 * @param {string} label - Human-readable model name (padded to 14 columns).
 * @param {string} key - Score property to read from each row (padded to 4).
 * @param {{test: object[], uns: object[], pun: object[], ood: object[]}} posset
 *   Scored positive rows per slice.
 */
const row = (label, key, posset) => {
  // Share of rows in one slice scoring at or above T, formatted by pct().
  const cell = (rows) =>
    pct(rows.filter((r) => r[key] >= T).length, rows.length);
  const frozen = cell(posset.test);
  const unseenCell = cell(posset.uns);
  const poem = cell(posset.pun);
  const oodCell = cell(posset.ood);
  console.log(
    ` ${label.padEnd(14)} ${key.padEnd(4)}: ` +
      `frozen=${frozen} ` +
      `UNSEEN=${unseenCell} ` +
      `POEM=${poem} ` +
      `OOD=${oodCell}`,
  );
};
// ---- Held-out recall table --------------------------------------------------
const PS = { test: tstPos, uns: unsPos, pun: punPos, ood: A };
console.log("HELD-OUT RECALL @0.445 (model-only, worst cell = POEM/UNSEEN):");
for (const [label, key, available] of [
  ["v3 shipped", "l5a", true],
  ["v3 bert-tiny", "l5b", Boolean(l5b)],
  ["S6 from-scratch", "l5c", Boolean(l5c)],
  ["INTELLIGENT", "l5d", true],
]) {
  // Optional models that failed to load get no row at all.
  if (available) {
    row(label, key, PS);
  }
}

// ---- Leakage-free benign false positives ------------------------------------
console.log(
  `\nLEAKAGE-FREE benign FP (n=${Bclean.length} of ${benign.length}, ` +
    `HARD CEILING <1%):`,
);
const fpCell = (key) =>
  pct(Bclean.filter((r) => r[key] >= T).length, Bclean.length);
const fpParts = [
  `L4=${pct(Bclean.filter((r) => r.l4).length, Bclean.length)}`,
  `L5a=${fpCell("l5a")}`,
  l5b ? `L5b=${fpCell("l5b")}` : "",
  l5c ? `L5c=${fpCell("l5c")}` : "",
  `L5d=${fpCell("l5d")}`,
].filter(Boolean);
console.log(` ${fpParts.join(" ")}`);

// ---- P4RS3LT0NGV3 battery ---------------------------------------------------
console.log("\nP4RS3LT0NGV3 battery (regression guard):");
const hits = (tag, count) => `${tag} ${count}/${par.n}`;
const batteryParts = [
  hits("L4", par.l4),
  hits("L5a", par.l5a),
  l5b ? hits("L5b", par.l5b) : "",
  l5c ? hits("L5c", par.l5c) : "",
  hits("L5d", par.l5d),
].filter(Boolean);
console.log(` model-only@0.5: ${batteryParts.join(" ")}`);
console.log(
  ` SYSTEM-level (L1+L4+l5d cascade, the honest bar): ` +
    `${par.sys}/${par.n} = ${pct(par.sys, par.n)}`,
);
| 192 | + |
// Opt-in intelligent cascade frozen-test benign FPR@0.8 + recall@0.8.
const ppFull = createPromptPurify();
const l1 = (t) => ppFull.sanitize(t, { sink: "untrusted_data" });
const casc = createL5Cascade({ enableL5b: true, l5dIntelligent: true });

// Counters: cn / cp = benign / positive rows seen, cfp / crec = hits among them.
let cfp = 0;
let cn = 0;
let crec = 0;
let cp = 0;

for (const { text, y } of test) {
  const sanitized = l1(text);
  // A row counts as a hit when the model-free registry fires on the raw
  // (clipped) text OR the cascade scores the sanitized text at >= 0.8.
  const { verdict } = reg.inspect(String(text).slice(0, 4000), {
    sink: "untrusted_data",
  });
  const { score } = await casc(sanitized, { sink: "untrusted_data" });
  const hit = verdict !== "clean-structural" || score >= 0.8;

  if (y === 0) {
    cn += 1;
    if (hit) {
      cfp += 1;
    }
  } else {
    cp += 1;
    if (hit) {
      crec += 1;
    }
  }
}

console.log(
  `\nOPT-IN intelligent cascade @0.8 on FROZEN test: ` +
    `benignFPR=${pct(cfp, cn)} recall=${pct(crec, cp)} ` +
    `(shipped L5a-only default UNCHANGED)`,
);
| 221 | + |
/**
 * Build the per-model entries of one summary section.
 *
 * Optional models that failed to load are reported as null (not measured),
 * matching how the p4rs3lt0ngv3 section below reports them; otherwise their
 * all-NaN score column would surface as a measured rate of 0.
 *
 * @param {(key: string) => number} measure - Computes the metric for a model key.
 * @returns {{l5a: number, l5b: number|null, l5c: number|null, l5d: number}}
 */
const perModel = (measure) => ({
  l5a: measure("l5a"),
  l5b: l5b ? measure("l5b") : null,
  l5c: l5c ? measure("l5c") : null,
  l5d: measure("l5d"),
});

// Machine-readable summary, printed as one JSON line after the ===JSON===
// marker. NaN rates (empty slices) are serialised as null by JSON.stringify.
const H = {
  // l5a() is awaited at its other call sites, so await it here too before
  // reading `.version`.
  l5a_version: (await l5a("x", { sink: "untrusted_data" })).version,
  l5b_version: l5b ? l5b.version : null,
  l5c_version: l5c ? l5c.version : null,
  l5d_version: l5d.version,
  n: {
    benign_ood: benign.length,
    attack_ood: attack.length,
    test_pos: tstPos.length,
    unseen_pos: unsPos.length,
    poem_unseen: punPos.length,
    leakage_free_benign: Bclean.length,
  },
  recall_frozen_test: perModel((key) => rate(tstPos, key)),
  recall_unseen: perModel((key) => rate(unsPos, key)),
  recall_poem_unseen: perModel((key) => rate(punPos, key)),
  recall_ood: perModel((key) => rate(A, key)),
  fp_leakage_free: {
    n: Bclean.length,
    l4: Bclean.length ? Bclean.filter((r) => r.l4).length / Bclean.length : null,
    ...perModel((key) => lf(key)),
  },
  p4rs3lt0ngv3: {
    n: par.n,
    l4: par.l4 / par.n,
    l5a: par.l5a / par.n,
    l5b: l5b ? par.l5b / par.n : null,
    l5c: l5c ? par.l5c / par.n : null,
    l5d: par.l5d / par.n,
    system_level: par.sys / par.n,
  },
  optin_intelligent_cascade_frozen: {
    benign_fpr: cn ? cfp / cn : null,
    recall: cp ? crec / cp : null,
  },
};
console.log("\n===JSON===");
console.log(JSON.stringify(H));
0 commit comments