Skip to content

Commit cbf7ef7

Browse files
sandeepl337 and claude committed
demo(sample-app): model-driven default, NO regex rules (per direction)
Review outcome: the sample WAS still running the full L4 regex tripwire stack by default (instruction-override/role-reassignment/etc. verified firing live). Flipped: the default /chat path is now model + L1 input-hygiene + nonce-fence ONLY; the deterministic regex stack is opt-in via WITH_REGEX=1. No L4 rule names appear in the verdict anymore. The l5d (distilbert-backbone reference) artifact was deleted by the concurrent Stage-8 cleanup; it is NOT ours and was only the thesis-proof reference — it is kept as a documented version in V9_RESULTS, NOT regenerated. The model-only path uses the available offline l5a as a placeholder until OUR OWN pretrained model lands. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent fdc988b commit cbf7ef7

1 file changed

Lines changed: 23 additions & 18 deletions

File tree

examples/sample-app/server.mjs

Lines changed: 23 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -153,29 +153,32 @@ const pp = createPromptPurify({
153153
classifier: l5dClassifier,
154154
});
155155

156-
// EXPERIMENT: NO_REGEX=1 drops the L4 deterministic tripwires entirely —
157-
// pipeline becomes L1 normalize + our offline L5a model + nonce-fence only.
158-
// Reversible toggle so we can measure leaks AND false positives both ways.
159-
const NO_REGEX = process.env.NO_REGEX === "1";
160-
const l5a = createL5aClassifier();
156+
// MODEL-DRIVEN BY DEFAULT (per project direction: no rule-based filters).
157+
// The default /chat path is the model (currently the offline l5a placeholder) + L1 input-cleaning
158+
// (hygiene only — strips disguise so the model sees real text; NOT a rule
159+
// blocklist) + nonce-fence. NO L4 regex tripwires, NO confusable/mixed-
160+
// script code rules in the verdict. Set WITH_REGEX=1 to restore the full
161+
// deterministic stack for side-by-side comparison.
162+
const WITH_REGEX = process.env.WITH_REGEX === "1";
161163

162-
/** Regexless inspect: L1 sanitize + model verdict, NO L4 tripwire rules. */
163-
async function inspectNoRegex(input) {
164-
const text = pp.sanitize(input, { sink: "untrusted_data" }); // L1 + defuse
165-
const { score } = await l5a(text, { sink: "untrusted_data" });
164+
/** Model-only inspect: L1 hygiene + model verdict (currently the offline l5a placeholder). NO rule filters. */
165+
const l5aFallback = createL5aClassifier(); // placeholder until OUR pretrained model lands (l5d ref kept in V9_RESULTS)
166+
async function inspectModelOnly(input) {
167+
const text = pp.sanitize(input, { sink: "untrusted_data" }); // L1 hygiene
168+
const { score } = await l5aFallback(text, { sink: "untrusted_data" });
166169
const verdict =
167-
score >= 0.95 ? "blocked" : score >= 0.445 ? "flagged" : "clean-structural";
170+
score >= 0.95 ? "blocked" : score >= 0.5 ? "flagged" : "clean-structural";
168171
return {
169172
text,
170173
verdict,
171-
risks: score >= 0.445 ? ["semantic-jailbreak(model-only)"] : [],
174+
risks: score >= 0.5 ? ["semantic-jailbreak(model-only)"] : [],
172175
modified: text !== input,
173176
};
174177
}
175178
console.log(
176-
NO_REGEX
177-
? "MODE = NO REGEX (L1 + offline L5a model + fence only)"
178-
: "MODE = full (L1 + L4 regex + offline L5a + fence)",
179+
WITH_REGEX
180+
? "MODE = full deterministic stack (L1 + L4 regex rules + l5d + fence)"
181+
: "MODE = MODEL-DRIVEN (L1 hygiene + l5d model + fence; NO regex rules)",
179182
);
180183

181184
// Path to the library's browser IIFE bundle, served to the page.
@@ -245,10 +248,12 @@ const server = createServer(async (req, res) => {
245248
const { message } = JSON.parse((await readBody(req)) || "{}");
246249
const userInput = String(message ?? "");
247250

248-
// Server-side authoritative firewall (full, or regexless experiment).
249-
const r = NO_REGEX
250-
? await inspectNoRegex(userInput)
251-
: await pp.inspectAsync(userInput, { sink: "untrusted_data" });
251+
// Server-side authoritative firewall. DEFAULT = model-driven, NO
252+
// regex rules (project direction). WITH_REGEX=1 restores the full
253+
// deterministic stack for comparison.
254+
const r = WITH_REGEX
255+
? await pp.inspectAsync(userInput, { sink: "untrusted_data" })
256+
: await inspectModelOnly(userInput);
252257
const meta = {
253258
verdict: r.verdict,
254259
risks: r.risks.map((x) => x.rule),

0 commit comments

Comments
 (0)