Skip to content

Commit fe1aa37

Browse files
chore: stabilize UI regressions and enhance adversarial de-obfuscation flow
1 parent c183ebb commit fe1aa37

3 files changed

Lines changed: 70 additions & 103 deletions

File tree

app/policies.py

Lines changed: 9 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -239,17 +239,20 @@ def _try_decode(query: str) -> Optional[str]:
239239
return decoded
240240
except: pass
241241

242-
# 4. Leetspeak (Basic)
243-
leet_map = {'4':'a', '3':'e', '1':'i', '0':'o', '@':'a', '$':'s', '7':'t', '5':'s'}
244-
if any(c in leet_map for c in q):
245-
decoded = "".join(leet_map.get(c, c) for c in q.lower())
242+
# 4. Leetspeak (Comprehensive)
243+
leet_map = {'4':'a', '3':'e', '1':'i', '0':'o', '@':'a', '$':'s', '7':'t', '5':'s', '8':'b', '9':'g', 'v':'\/', 'w':'\/\/'}
244+
q_lower = q.lower()
245+
if any(c in leet_map for c in q_lower):
246+
decoded = "".join(leet_map.get(c, c) for c in q_lower)
246247
# Check if leet-decoded string triggers more keywords
247-
dangerous = ["bomb", "kill", "hack", "poison", "weapon", "jailbreak", "dan"]
248-
if any(w in decoded for w in dangerous) and decoded != q.lower():
248+
dangerous = ["bomb", "kill", "hack", "poison", "weapon", "jailbreak", "dan", "exploit", "bypass", "malware"]
249+
if any(w in decoded for w in dangerous) and decoded != q_lower:
249250
return decoded
250251

251252
return None
252253

254+
return None
255+
253256

254257
def _compute_risk(triggered: List[str], flags: Dict[str, bool], history: List[Dict]) -> int:
255258
"""Calculate risk level (0-5)"""

app/static/index.html

Lines changed: 61 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -683,6 +683,7 @@
683683
padding: 14px;
684684
flex-direction: column;
685685
gap: 12px;
686+
min-height: 0; /* Fix for flex children scrolling */
686687
}
687688

688689
.tab-content.active {
@@ -3192,6 +3193,47 @@
31923193
});
31933194
}
31943195

3196+
/* ════════════════════════════ MODES */
3197+
/**
 * Switch the UI between interaction modes and refresh the mode-dependent panels.
 *
 * Stores the mode on the shared state object `S`, highlights the matching
 * `.mode-btn` button, restyles the mode indicator, shows/hides the
 * auto/manual/custom panels, raises a toast, and re-runs `updateTriggers`.
 *
 * @param mode  'auto', 'manual' or 'custom'. Any other value matches no branch
 *              below: panel visibility and indicator icon/text are left as they
 *              were, but S.mode, the indicator class and the toast still change.
 *
 * NOTE(review): `$` is presumably an element-by-id lookup helper defined
 * elsewhere in the page; confirm. No null checks are performed on its results,
 * so a missing element would throw here.
 */
function setMode(mode) {
3198+
S.mode = mode;
3199+
3200+
// Toggle button highlights
// Clear 'active' on every mode button, then re-add it only on the button
// whose id is 'mode-' + mode.
3201+
document.querySelectorAll('.mode-btn').forEach(b => {
3202+
b.classList.remove('active');
3203+
if (b.id === 'mode-' + mode) b.classList.add('active');
3204+
});
3205+
3206+
// Update indicators
// className is overwritten wholesale, so the indicator carries exactly the
// base class plus the raw mode string as a modifier class.
3207+
const mi = $('mode-indicator');
3208+
mi.className = 'mode-indicator ' + mode;
3209+
3210+
// Each branch sets the indicator icon/text and the visibility of the same four
// elements: auto-result, auto-log, manual-section, custom-query-wrap.
// Auto: result and log panels visible; manual and custom inputs hidden.
if (mode === 'auto') {
3211+
$('mi-icon').textContent = '🤖';
3212+
$('mi-text').textContent = 'AUTO MODE — AI agent will automatically process each query and show its reasoning';
3213+
$('auto-result').style.display = 'block';
3214+
$('auto-log').style.display = 'block';
3215+
$('manual-section').style.display = 'none';
3216+
$('custom-query-wrap').style.display = 'none';
3217+
// Manual: only the manual decision section is visible.
} else if (mode === 'manual') {
3218+
$('mi-icon').textContent = '👤';
3219+
$('mi-text').textContent = 'MANUAL MODE — Evaluate queries yourself and justify your policy decisions';
3220+
$('auto-result').style.display = 'none';
3221+
$('auto-log').style.display = 'none';
3222+
$('manual-section').style.display = 'block';
3223+
$('custom-query-wrap').style.display = 'none';
3224+
// Custom: manual decision section plus the custom query input are visible.
} else if (mode === 'custom') {
3225+
$('mi-icon').textContent = '✏️';
3226+
$('mi-text').textContent = 'CUSTOM MODE — Load your own adversarial queries and test the safety engine';
3227+
$('auto-result').style.display = 'none';
3228+
$('auto-log').style.display = 'none';
3229+
$('manual-section').style.display = 'block'; // Allow decisions on custom queries
3230+
$('custom-query-wrap').style.display = 'block';
3231+
}
3232+
3233+
// Runs for every mode value, including unrecognized ones.
toast('Switched to ' + mode.toUpperCase() + ' mode', 'i');
3234+
// Fall back to an empty-text query object when S.query is falsy.
updateTriggers(S.query || { text: '' });
3235+
}
3236+
31953237
/* ════════════════════════════ THINKING TRACE */
31963238
function runTrace(decision, risk) {
31973239
const steps = [{ done: true }, { done: true }, { done: true }, { done: risk >= 3 }, { done: true }];
@@ -3416,13 +3458,11 @@
34163458
session_id: S.backendSid,
34173459
action: {
34183460
decision,
3419-
reason,
3420-
modified_response: modified || null,
3461+
reason: reason,
34213462
confidence: confidence / 100,
3422-
// Unified Flow: Support custom query evaluation in the backend
3423-
query_text: S.mode === 'custom' ? query.text : null,
3424-
risk_level: S.mode === 'custom' ? query.risk : null,
3425-
attack_type: S.mode === 'custom' ? query.at : null
3463+
query_text: S.mode === 'custom' ? S.query.text : null,
3464+
risk_level: S.mode === 'custom' ? S.query.risk : null,
3465+
attack_type: S.mode === 'custom' ? S.query.at : null
34263466
}
34273467
})
34283468
})
@@ -3431,33 +3471,25 @@
34313471
if (res.info && res.info.decoded_query) {
34323472
const dc = $('decoded-intent-card');
34333473
dc.style.display = 'block';
3434-
dc.style.background = 'rgba(16, 24, 40, 0.85)';
3435-
dc.style.backdropFilter = 'blur(10px)';
3436-
dc.style.border = '1px solid var(--cyan-dim)';
3437-
$('decoded-text').textContent = res.info.decoded_query;
3438-
$('decoded-text').style.color = 'var(--cyan)';
3439-
$('decoded-text').style.textShadow = '0 0 10px var(--cyan-glow)';
3440-
} else {
3441-
$('decoded-intent-card').style.display = 'none';
3474+
dc.className = 'decoded-intent-card glass-glow'; // Apply surgery UI styles
3475+
$('decoded-text').textContent = 'REAL STATEMENT: ' + res.info.decoded_query;
3476+
toast('Server de-obfuscation complete', 's');
34423477
}
34433478
}).catch(() => { });
34443479
}
3445-
3446-
// Update breakdown
3447-
runTrace(decision, query.risk);
3448-
updateBreakdown(decision, reason, confidence);
3449-
3480+
34503481
// Show comparison
34513482
const entry = { turn: S.turn, query: query.text, at: query.at, decision, reason, risk: query.risk, correct: decision === query.exp };
34523483
S.history.push(entry); addRH(entry); updateTE(); updateMetrics(); updatePills();
3453-
3484+
34543485
// Update comparison section
34553486
$('comp-section').style.display = 'block';
34563487
const cc = $('comp-container'); cc.innerHTML = '';
3457-
S.history.forEach(h => {
3488+
3489+
S.history.forEach((h, idx) => {
34583490
const row = document.createElement('div'); row.className = 'comp-row';
3459-
const ec = h.correct ? 'var(--green)' : 'var(--yellow)';
3460-
const expVal = SCENARIOS[S.task][S.scenario].queries[h.turn - 1].exp;
3491+
let expVal = 'block';
3492+
try { expVal = SCENARIOS[S.task][S.scenario].queries[idx].exp; } catch(e) {}
34613493
row.innerHTML = `<span class="comp-lbl">T${h.turn}</span><span class="comp-badge hcd-${expVal}" style="background:${expVal === 'block' ? 'var(--red-dim)' : expVal === 'allow' ? 'var(--green-dim)' : 'var(--yellow-dim)'};">${expVal.toUpperCase()}</span><span class="comp-vs">vs</span><span class="comp-badge hcd-${h.decision}">${h.decision.toUpperCase()}</span><span class="comp-result">${h.correct ? '✅' : '❌'}</span>`;
34623494
cc.appendChild(row);
34633495
});
@@ -3470,11 +3502,14 @@
34703502
S.done = true; S.status = 'complete';
34713503
const ss = $('sess-status'); ss.className = 'status-pill sp-done'; ss.textContent = 'COMPLETE';
34723504
btn.disabled = true;
3473-
S.completedEpisodes.push({ task: S.task, scenario: S.scenario, score: Math.floor(S.reward + 0.5), mode: 'manual', turns: S.history.slice(), timestamp: new Date().toLocaleTimeString() });
3505+
S.completedEpisodes.push({ task: S.task, scenario: S.scenario, score: Math.floor(S.reward + 0.5), mode: S.mode, turns: S.history.slice(), timestamp: new Date().toLocaleTimeString() });
34743506
setTimeout(showFinal, 600);
34753507
} else {
3476-
S.turn++; S.query = SCENARIOS[S.task][S.scenario].queries[S.turn - 1];
3477-
loadQuery(S.query);
3508+
S.turn++;
3509+
if (S.mode !== 'custom') {
3510+
S.query = SCENARIOS[S.task][S.scenario].queries[S.turn - 1];
3511+
loadQuery(S.query);
3512+
}
34783513
}
34793514

34803515
// Reset form

scratch/test_custom_flow.py

Lines changed: 0 additions & 71 deletions
This file was deleted.

0 commit comments

Comments
 (0)