Skip to content

Commit ebb6bf4

Browse files
sandeepl337claude
andcommitted
stage7(l5d): pretrained backbone SHATTERS the generalization ceiling — RED on FP
distilbert-base-multilingual-cased (Apache-2.0, 135M, WordPiece reuses the repo's zero-dep TS tokenizer) fine-tuned on the full corpus, research-grounded (Instruction-Hierarchy/Spotlighting/StruQ/PG2/PINT). THE result: POEM-UNSEEN 6.67 -> 77.78% (4.1x). That metric was BIT-IDENTICAL 6.67% across logistic/bert-tiny/from-scratch — proven unbreakable WITHOUT a language prior. With a prior it breaks decisively. UNSEEN-SOURCE 38.69->48.74, frozen 94->98, system-level P4RS (L1+L4+l5d) 100% (299/299). Pretraining = the generalization lever, confirmed with evidence. leakage=0, converged (val-F1 0.973), held-out never read. RED: leakage-free benign FP 9.45% vs <1% ceiling, threshold-irreducible (8.4% floor @0.99) — the documented PG2/ProtectAI OOD-benign over-fire. Diagnosed: benign-data BREADTH, not capacity. Opt-in only (createL5Cascade({l5dIntelligent:true})); src/index.ts fusion 0 lines changed; shipped L5a default + l5c + v3 untouched; 130MB artifact gitignored+npm-excluded. 148 green, bundle byte-identical, pack dist-only. Two named next problems: (1) prior must be OURS (self-pretrain), (2) OOD-benign FP breadth. Both = the self-pretraining stage. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 6966a18 commit ebb6bf4

10 files changed

Lines changed: 1535 additions & 4 deletions

File tree

.gitignore

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,11 @@ models/l5b/_student_fp32/
1818
# (files:["dist"]), large ONNX — gitignored from the shipped path exactly
1919
# like models/l5b. Reproducible from training/train_scratch.py (seed 1337).
2020
models/l5c/
21+
# STAGE-7 "intelligent" L5d artifact (fine-tuned Apache-2.0 distil-mBERT):
22+
# opt-in, npm-excluded (files:["dist"]), large INT8 ONNX — gitignored from
23+
# the shipped path exactly like models/l5b, l5c. Reproducible from
24+
# training/train_intelligent.py + export_intelligent.py (seed 1337).
25+
models/l5d/
2126
# isolated offline training venv (stable pinned CPU stack; never shipped)
2227
training/.venv/
2328
# python bytecode cache (training scripts; never shipped)

scripts/eval-l5d.mjs

Lines changed: 282 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,282 @@
1+
/**
2+
* STAGE-7 HONEST evaluator — the "intelligent" L5d (fine-tuned Apache-2.0
3+
* distil-mBERT) vs the working baseline (v3 L5a + bert-tiny L5b + Stage-6
4+
* from-scratch l5c), model-only on the SAME seeded leakage-free split, plus
5+
* the SYSTEM-level P4RS measurement (Spotlighting: obfuscation is owned by
6+
* L1 normalize + L4 fence which PRECEDE the model — the Stage-6 wrong-bar
7+
* was model-only-on-raw). NO TUNING here: pure measurement at the SAME
8+
* 0.445 model threshold every prior stage used (0.5 for the P4RS guard,
9+
* 0.8 for the product cascade).
10+
*
11+
* Held-out slices NEVER trained/tuned/early-stopped on:
12+
* • FROZEN test (training/.real_cache/test.jsonl)
13+
* • UNSEEN-SOURCE (deepset/prompt-injections + Lakera/gandalf_sum)
14+
* • POEM-UNSEEN (whole disjoint synthetic-poem source)
15+
* • OOD pooled (training/.ood_cache/ood.jsonl attack rows)
16+
* • P4RS3LT0NGV3 battery
17+
* Leakage-free benign FP on the split's ood_benign_not_in_train.json.
18+
*
19+
* Run: npm run build && node scripts/eval-l5d.mjs
20+
*/
21+
import fs from "node:fs";
22+
import crypto from "node:crypto";
23+
import { createRequire } from "node:module";
24+
import { fileURLToPath } from "node:url";
25+
import { dirname, join } from "node:path";
26+
const require = createRequire(import.meta.url);
27+
const { createPromptPurify } = require("../dist/index.cjs");
28+
const {
29+
createL5aClassifier,
30+
createL5bRunner,
31+
createL5cRunner,
32+
createL5dRunner,
33+
createL5Cascade,
34+
} = require("../dist/l5/index.cjs");
35+
import { buildPositives } from "./corpus-l5a.mjs";
36+
37+
const T = 0.445;
38+
const __dir = dirname(fileURLToPath(import.meta.url));
39+
const C = join(__dir, "..", "training", ".real_cache");
40+
41+
const reg = createPromptPurify({ profile: "balanced" }); // L1+L4, NO model
42+
const l5a = createL5aClassifier();
43+
let l5b = null;
44+
try {
45+
l5b = await createL5bRunner();
46+
} catch (e) {
47+
console.error("L5b unavailable:", e.message);
48+
}
49+
let l5c = null;
50+
try {
51+
l5c = await createL5cRunner();
52+
} catch (e) {
53+
console.error("L5c unavailable:", e.message);
54+
}
55+
const l5d = await createL5dRunner(); // intelligent (must exist)
56+
57+
const parse = (p) =>
58+
fs.readFileSync(p, "utf8").trim().split("\n").filter(Boolean).map((l) =>
59+
JSON.parse(l),
60+
);
61+
62+
const ood = parse(join(__dir, "..", "training", ".ood_cache", "ood.jsonl"));
63+
const benign = ood.filter((r) => r.y === 0);
64+
const attack = ood.filter((r) => r.y === 1);
65+
const test = parse(join(C, "test.jsonl"));
66+
const unseen = parse(join(C, "unseen_bench.jsonl"));
67+
const poemUnseen = parse(join(C, "poem_unseen_bench.jsonl"));
68+
69+
const san = (t) =>
70+
reg.sanitize(String(t || "").slice(0, 4000), { sink: "untrusted_data" });
71+
const regBad = (t) =>
72+
reg.inspect(String(t || "").slice(0, 4000), { sink: "untrusted_data" })
73+
.verdict !== "clean-structural";
74+
75+
async function scoreModels(rows) {
76+
const out = [];
77+
for (const r of rows) {
78+
const s = san(r.text);
79+
const { score: sA } = await l5a(s, { sink: "untrusted_data" });
80+
const sB = l5b ? await l5b.score(s) : NaN;
81+
const sC = l5c ? await l5c.score(s) : NaN;
82+
const sD = await l5d.score(s);
83+
out.push({ y: r.y, l4: regBad(r.text), l5a: sA, l5b: sB, l5c: sC, l5d: sD });
84+
}
85+
return out;
86+
}
87+
88+
// Formats the ratio n/d as a percentage string with two decimals (e.g. "25.00%").
// A zero or otherwise falsy denominator yields plain "n/a"; the "%" suffix is
// attached only to the numeric branch so an empty slice never prints "n/a%".
const pct = (n, d) => (d ? ((100 * n) / d).toFixed(2) + "%" : "n/a");
89+
const rate = (set, key, thr = T) =>
90+
set.length ? set.filter((r) => r[key] >= thr).length / set.length : NaN;
91+
92+
const normHash = (s) => {
93+
let x = (s || "").normalize("NFKC").toLowerCase();
94+
x = x.replace(/[^\p{L}\p{N}_\s]/gu, " ");
95+
x = x.replace(/\s+/gu, " ").trim();
96+
return crypto.createHash("sha1").update(x, "utf8").digest("hex");
97+
};
98+
const lfPath = join(C, "ood_benign_not_in_train.json");
99+
const lfSet = fs.existsSync(lfPath)
100+
? new Set(JSON.parse(fs.readFileSync(lfPath, "utf8")).hashes)
101+
: null;
102+
103+
const B = await scoreModels(benign);
104+
const A = await scoreModels(attack);
105+
const TST = await scoreModels(test);
106+
const UNS = await scoreModels(unseen);
107+
const PUN = await scoreModels(poemUnseen);
108+
109+
const tstPos = TST.filter((r) => r.y === 1);
110+
const unsPos = UNS.filter((r) => r.y === 1);
111+
const punPos = PUN.filter((r) => r.y === 1);
112+
113+
// ---- P4RS3LT0NGV3 battery: model-only AND SYSTEM-level ----------------------
114+
const par = { l4: 0, l5a: 0, l5b: 0, l5c: 0, l5d: 0, sys: 0, n: 0 };
115+
const ppSys = createPromptPurify({
116+
classifier: createL5Cascade({ enableL5b: true, l5dIntelligent: true }),
117+
});
118+
for (const text of buildPositives()) {
119+
const s = san(text);
120+
par.n++;
121+
if (regBad(text)) par.l4++;
122+
const { score: sA } = await l5a(s, { sink: "untrusted_data" });
123+
if (sA >= 0.5) par.l5a++;
124+
if (l5b && (await l5b.score(s)) >= 0.5) par.l5b++;
125+
if (l5c && (await l5c.score(s)) >= 0.5) par.l5c++;
126+
if ((await l5d.score(s)) >= 0.5) par.l5d++;
127+
// SYSTEM-level: raw text through the FULL pipeline (L1 normalize + L4
128+
// fence + opt-in l5d cascade) — Spotlighting bar, the honest one.
129+
const v = await ppSys.inspectAsync(String(text).slice(0, 4000), {
130+
sink: "untrusted_data",
131+
});
132+
if (v.verdict !== "clean-structural") par.sys++; // flagged|blocked = caught
133+
}
134+
135+
const Bclean = lfSet
136+
? B.filter((_, i) => lfSet.has(normHash(benign[i].text)))
137+
: [];
138+
const lf = (key) =>
139+
Bclean.length
140+
? Bclean.filter((r) => r[key] >= T).length / Bclean.length
141+
: NaN;
142+
143+
console.log("\n========= STAGE-7 INTELLIGENT L5d EVAL =========");
144+
console.log(`L5a: ${l5a("x", { sink: "untrusted_data" }).version}`);
145+
console.log(`L5b: ${l5b ? l5b.version : "UNAVAILABLE"}`);
146+
console.log(`L5c: ${l5c ? l5c.version : "UNAVAILABLE"}`);
147+
console.log(`L5d: ${l5d.version} (PRETRAINED, FINE-TUNED)`);
148+
console.log(
149+
`n: benignOOD=${benign.length} attackOOD=${attack.length} ` +
150+
`frozenTESTpos=${tstPos.length} UNSEENpos=${unsPos.length} ` +
151+
`POEMUNSEEN=${punPos.length}\n`,
152+
);
153+
154+
const row = (label, key, posset) =>
155+
console.log(
156+
` ${label.padEnd(14)} ${key.padEnd(4)}: ` +
157+
`frozen=${pct(posset.test.filter((r) => r[key] >= T).length, posset.test.length)} ` +
158+
`UNSEEN=${pct(posset.uns.filter((r) => r[key] >= T).length, posset.uns.length)} ` +
159+
`POEM=${pct(posset.pun.filter((r) => r[key] >= T).length, posset.pun.length)} ` +
160+
`OOD=${pct(posset.ood.filter((r) => r[key] >= T).length, posset.ood.length)}`,
161+
);
162+
const PS = { test: tstPos, uns: unsPos, pun: punPos, ood: A };
163+
console.log("HELD-OUT RECALL @0.445 (model-only, worst cell = POEM/UNSEEN):");
164+
row("v3 shipped", "l5a", PS);
165+
if (l5b) row("v3 bert-tiny", "l5b", PS);
166+
if (l5c) row("S6 from-scratch", "l5c", PS);
167+
row("INTELLIGENT", "l5d", PS);
168+
169+
console.log(
170+
`\nLEAKAGE-FREE benign FP (n=${Bclean.length} of ${benign.length}, ` +
171+
`HARD CEILING <1%):`,
172+
);
173+
console.log(
174+
` L4=${pct(Bclean.filter((r) => r.l4).length, Bclean.length)} ` +
175+
`L5a=${pct(Bclean.filter((r) => r.l5a >= T).length, Bclean.length)} ` +
176+
(l5b ? `L5b=${pct(Bclean.filter((r) => r.l5b >= T).length, Bclean.length)} ` : "") +
177+
(l5c ? `L5c=${pct(Bclean.filter((r) => r.l5c >= T).length, Bclean.length)} ` : "") +
178+
`L5d=${pct(Bclean.filter((r) => r.l5d >= T).length, Bclean.length)}`,
179+
);
180+
181+
console.log("\nP4RS3LT0NGV3 battery (regression guard):");
182+
console.log(
183+
` model-only@0.5: L4 ${par.l4}/${par.n} L5a ${par.l5a}/${par.n} ` +
184+
(l5b ? `L5b ${par.l5b}/${par.n} ` : "") +
185+
(l5c ? `L5c ${par.l5c}/${par.n} ` : "") +
186+
`L5d ${par.l5d}/${par.n}`,
187+
);
188+
console.log(
189+
` SYSTEM-level (L1+L4+l5d cascade, the honest bar): ` +
190+
`${par.sys}/${par.n} = ${pct(par.sys, par.n)}`,
191+
);
192+
193+
// Opt-in intelligent cascade frozen-test benign FPR@0.8 + recall@0.8.
194+
const ppFull = createPromptPurify();
195+
const l1 = (t) => ppFull.sanitize(t, { sink: "untrusted_data" });
196+
const casc = createL5Cascade({ enableL5b: true, l5dIntelligent: true });
197+
let cfp = 0,
198+
cn = 0,
199+
crec = 0,
200+
cp = 0;
201+
for (const r of test) {
202+
const s = l1(r.text);
203+
const fired =
204+
reg.inspect(String(r.text).slice(0, 4000), { sink: "untrusted_data" })
205+
.verdict !== "clean-structural";
206+
const sc = (await casc(s, { sink: "untrusted_data" })).score;
207+
const hit = fired || sc >= 0.8;
208+
if (r.y === 0) {
209+
cn++;
210+
if (hit) cfp++;
211+
} else {
212+
cp++;
213+
if (hit) crec++;
214+
}
215+
}
216+
console.log(
217+
`\nOPT-IN intelligent cascade @0.8 on FROZEN test: ` +
218+
`benignFPR=${pct(cfp, cn)} recall=${pct(crec, cp)} ` +
219+
`(shipped L5a-only default UNCHANGED)`,
220+
);
221+
222+
const H = {
223+
l5a_version: l5a("x", { sink: "untrusted_data" }).version,
224+
l5b_version: l5b ? l5b.version : null,
225+
l5c_version: l5c ? l5c.version : null,
226+
l5d_version: l5d.version,
227+
n: {
228+
benign_ood: benign.length,
229+
attack_ood: attack.length,
230+
test_pos: tstPos.length,
231+
unseen_pos: unsPos.length,
232+
poem_unseen: punPos.length,
233+
leakage_free_benign: Bclean.length,
234+
},
235+
recall_frozen_test: {
236+
l5a: rate(tstPos, "l5a"),
237+
l5b: rate(tstPos, "l5b"),
238+
l5c: rate(tstPos, "l5c"),
239+
l5d: rate(tstPos, "l5d"),
240+
},
241+
recall_unseen: {
242+
l5a: rate(unsPos, "l5a"),
243+
l5b: rate(unsPos, "l5b"),
244+
l5c: rate(unsPos, "l5c"),
245+
l5d: rate(unsPos, "l5d"),
246+
},
247+
recall_poem_unseen: {
248+
l5a: rate(punPos, "l5a"),
249+
l5b: rate(punPos, "l5b"),
250+
l5c: rate(punPos, "l5c"),
251+
l5d: rate(punPos, "l5d"),
252+
},
253+
recall_ood: {
254+
l5a: rate(A, "l5a"),
255+
l5b: rate(A, "l5b"),
256+
l5c: rate(A, "l5c"),
257+
l5d: rate(A, "l5d"),
258+
},
259+
fp_leakage_free: {
260+
n: Bclean.length,
261+
l4: Bclean.length ? Bclean.filter((r) => r.l4).length / Bclean.length : null,
262+
l5a: lf("l5a"),
263+
l5b: lf("l5b"),
264+
l5c: lf("l5c"),
265+
l5d: lf("l5d"),
266+
},
267+
p4rs3lt0ngv3: {
268+
n: par.n,
269+
l4: par.l4 / par.n,
270+
l5a: par.l5a / par.n,
271+
l5b: l5b ? par.l5b / par.n : null,
272+
l5c: l5c ? par.l5c / par.n : null,
273+
l5d: par.l5d / par.n,
274+
system_level: par.sys / par.n,
275+
},
276+
optin_intelligent_cascade_frozen: {
277+
benign_fpr: cn ? cfp / cn : null,
278+
recall: cp ? crec / cp : null,
279+
},
280+
};
281+
console.log("\n===JSON===");
282+
console.log(JSON.stringify(H));

src/l5/cascade.ts

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ import {
5353
type L5bOptions,
5454
} from "./transformer.js";
5555
import { createL5cRunner, type L5cRunner } from "./l5c.js";
56+
import { createL5dRunner, type L5dRunner } from "./l5d.js";
5657

5758
export interface L5CascadeOptions extends L5bOptions {
5859
/**
@@ -87,6 +88,18 @@ export interface L5CascadeOptions extends L5bOptions {
8788
* never clears a structural block, never `.safe`.
8889
*/
8990
l5cFromScratch?: boolean;
91+
/**
92+
* STAGE-7 OPT-IN: use the "intelligent" L5d model
93+
* (`distilbert-base-multilingual-cased`, Apache-2.0, fine-tuned on our
94+
* full seeded leakage-free corpus) as the ambiguous-band escalator
95+
* INSTEAD of L5b/L5c. Default false. Requires `enableL5b: true` to take
96+
* effect (it shares the EXACT same confidence-gated escalation path;
97+
* only the runner differs). Takes precedence over `l5cFromScratch` if
98+
* both are set. The SHIPPED L5a-only default and src/index.ts honesty
99+
* fusion are UNCHANGED. Honesty contract identical: probabilistic,
100+
* advisory only, never clears a structural block, never `.safe`.
101+
*/
102+
l5dIntelligent?: boolean;
90103
/**
91104
* s_a strictly below this ⇒ confident ALLOW, L5b skipped. Default 0.15
92105
* (below the in-repo corpus positive-class minimum ≈0.146; the bulk of
@@ -141,10 +154,15 @@ export function createL5Cascade(opts: L5CascadeOptions = {}): Classifier {
141154
const gate = opts.l5bGate ?? 1;
142155
const withReason = opts.withReason !== false;
143156

144-
const useL5c = opts.l5cFromScratch === true;
145-
let runnerPromise: Promise<L5bRunner | L5cRunner> | null = null;
157+
const useL5d = opts.l5dIntelligent === true;
158+
const useL5c = !useL5d && opts.l5cFromScratch === true;
159+
let runnerPromise: Promise<L5bRunner | L5cRunner | L5dRunner> | null = null;
146160
const getRunner = () =>
147-
(runnerPromise ??= useL5c ? createL5cRunner(opts) : createL5bRunner(opts));
161+
(runnerPromise ??= useL5d
162+
? createL5dRunner(opts)
163+
: useL5c
164+
? createL5cRunner(opts)
165+
: createL5bRunner(opts));
148166

149167
return async (
150168
text: string,
@@ -199,7 +217,7 @@ export function createL5Cascade(opts: L5CascadeOptions = {}): Classifier {
199217
// existing inspectAsync catch turns it into a classifier-error info
200218
// risk. We must NOT silently fall back to s_a on the band where the
201219
// semantic check actually matters — that would be a false-safe.
202-
let runner: L5bRunner | L5cRunner;
220+
let runner: L5bRunner | L5cRunner | L5dRunner;
203221
try {
204222
runner = await getRunner();
205223
} catch (e) {

src/l5/index.ts

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,3 +100,12 @@ export {
100100
type L5cOptions,
101101
type L5cRunner,
102102
} from "./l5c.js";
103+
// STAGE-7 "intelligent" L5d: fine-tuned pretrained Apache-2.0 backbone
104+
// (distilbert-base-multilingual-cased). Opt-in only (cascade
105+
// `l5dIntelligent`). Shipped L5a-only default + src/index.ts honesty
106+
// fusion UNCHANGED. See training/RESEARCH.md + training/INTEGRATION.md.
107+
export {
108+
createL5dRunner,
109+
type L5dOptions,
110+
type L5dRunner,
111+
} from "./l5d.js";

0 commit comments

Comments
 (0)