demo(sample-app): model-driven default, NO regex rules (per direction)

sandeepl337 · claude · sandeepl337 · commit cbf7ef7e5f41 · 2026-05-17T10:46:58.000-05:00
Review outcome: the sample WAS still running the full L4 regex tripwire
stack by default (instruction-override/role-reassignment/etc. verified
firing live). Flipped: default /chat path is now model + L1 input-
hygiene + nonce-fence ONLY; the deterministic regex stack is opt-in via
WITH_REGEX=1. No L4 rule names in the verdict anymore.

l5d (distilbert-backbone reference) artifact was deleted by the
concurrent Stage-8 cleanup; it is NOT ours and was only the
thesis-proof reference — kept as a documented version in V9_RESULTS,
NOT regenerated. Model-only path uses the available offline l5a as a
placeholder until OUR OWN pretrained model lands.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/examples/sample-app/server.mjs b/examples/sample-app/server.mjs
@@ -153,29 +153,32 @@ const pp = createPromptPurify({
   classifier: l5dClassifier,
 });
 
-// EXPERIMENT: NO_REGEX=1 drops the L4 deterministic tripwires entirely —
-// pipeline becomes L1 normalize + our offline L5a model + nonce-fence only.
-// Reversible toggle so we can measure leaks AND false positives both ways.
-const NO_REGEX = process.env.NO_REGEX === "1";
-const l5a = createL5aClassifier();
+// MODEL-DRIVEN BY DEFAULT (per project direction: no rule-based filters).
+// The default /chat path is the offline l5a placeholder model + L1 input-
+// cleaning (hygiene only — strips disguise so the model sees real text; NOT
+// a rule blocklist) + nonce-fence. NO L4 regex tripwires, NO confusable/
+// mixed-script code rules in the verdict. Set WITH_REGEX=1 to restore the
+// full deterministic stack for side-by-side comparison.
+const WITH_REGEX = process.env.WITH_REGEX === "1";
 
-/** Regexless inspect: L1 sanitize + model verdict, NO L4 tripwire rules. */
-async function inspectNoRegex(input) {
-  const text = pp.sanitize(input, { sink: "untrusted_data" }); // L1 + defuse
-  const { score } = await l5a(text, { sink: "untrusted_data" });
+/** Model-only inspect: L1 hygiene + l5a placeholder model verdict. NO rule filters. */
+const l5aFallback = createL5aClassifier(); // placeholder until OUR pretrained model lands (l5d ref kept in V9_RESULTS)
+async function inspectModelOnly(input) {
+  const text = pp.sanitize(input, { sink: "untrusted_data" }); // L1 hygiene
+  const { score } = await l5aFallback(text, { sink: "untrusted_data" });
   const verdict =
-    score >= 0.95 ? "blocked" : score >= 0.445 ? "flagged" : "clean-structural";
+    score >= 0.95 ? "blocked" : score >= 0.5 ? "flagged" : "clean-structural";
   return {
     text,
     verdict,
-    risks: score >= 0.445 ? ["semantic-jailbreak(model-only)"] : [],
+    risks: score >= 0.5 ? ["semantic-jailbreak(model-only)"] : [],
     modified: text !== input,
   };
 }
 console.log(
-  NO_REGEX
-    ? "MODE = NO REGEX (L1 + offline L5a model + fence only)"
-    : "MODE = full (L1 + L4 regex + offline L5a + fence)",
+  WITH_REGEX
+    ? "MODE = full deterministic stack (L1 + L4 regex rules + l5d + fence)"
+    : "MODE = MODEL-DRIVEN (L1 hygiene + l5d model + fence; NO regex rules)",
 );
 
 // Path to the library's browser IIFE bundle, served to the page.
@@ -245,10 +248,12 @@ const server = createServer(async (req, res) => {
       const { message } = JSON.parse((await readBody(req)) || "{}");
       const userInput = String(message ?? "");
 
-      // Server-side authoritative firewall (full, or regexless experiment).
-      const r = NO_REGEX
-        ? await inspectNoRegex(userInput)
-        : await pp.inspectAsync(userInput, { sink: "untrusted_data" });
+      // Server-side authoritative firewall. DEFAULT = model-driven, NO
+      // regex rules (project direction). WITH_REGEX=1 restores the full
+      // deterministic stack for comparison.
+      const r = WITH_REGEX
+        ? await pp.inspectAsync(userInput, { sink: "untrusted_data" })
+        : await inspectModelOnly(userInput);
       const meta = {
         verdict: r.verdict,
         risks: r.risks.map((x) => x.rule),