Skip to content

Commit bf4a203

Browse files
Jazzcort and claude committed
feat: implement session-based tracking for malicious behavior
Add BehaviorRecord and BehaviorRecordManager to track gatekeeper verdicts per session. A single MALICIOUS verdict sets a temporary warning that forces human confirmation on the next script; the warning clears once the human approves. If a session accumulates 4 total or 3 consecutive malicious verdicts, it is permanently flagged and all subsequent scripts require confirmation. Integrate behavior tracking into validate_script, run_script, run_script_interactive, run_script_with_confirmation, and execute_script. Emit malicious_activity_warning through the RunScriptInteractiveResult for the mcp-app UI. Add documentation for session-based behavior tracking in guarded-command-execution.md. Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 867d26b commit bf4a203

5 files changed

Lines changed: 170 additions & 19 deletions

File tree

docs/guarded-command-execution.md

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,28 @@ The scores give an approximate sense of the capability of the model acting as a
6666
actual performance in real-world situations may vary.
6767
Smaller models than those listed above are *not recommended*.
6868

69+
## Session-Based Behavior Tracking
70+
71+
In addition to the per-script gatekeeper check,
72+
linux-mcp-server tracks gatekeeper verdicts across each session
73+
using a `BehaviorRecord`.
74+
This detects patterns of malicious activity
75+
that might not be obvious from any single script.
76+
77+
Each session maintains counters for consecutive and total malicious actions.
78+
A session is escalated to require confirmation for **all** subsequent scripts if either threshold is met:
79+
80+
* **3** consecutive scripts flagged as `MALICIOUS` by the gatekeeper.
81+
* **4** total scripts flagged as `MALICIOUS` across the session.
82+
83+
Once a session is permanently flagged, the flag cannot be cleared.
84+
85+
A single `MALICIOUS` verdict triggers a **temporary warning**
86+
that forces the next script to require human confirmation,
87+
even if it would otherwise run without approval.
88+
The temporary warning is cleared once the human approves execution.
89+
A non-malicious verdict resets the consecutive counter but not the total counter.
90+
6991
## Human In The Loop
7092

7193
To provide a better experience for the human approving the call:

mcp-app/src/global.css

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,10 @@
111111
.execution-state-tag {
112112
@apply border border-app-border-primary rounded-md w-fit px-2 text-sm
113113
}
114+
115+
.security-warning {
116+
@apply bg-red-200 text-red-600 rounded-lg p-4 mb-4
117+
}
114118
}
115119

116120
* {

mcp-app/src/run-script-app.tsx

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -251,6 +251,15 @@ function RunScriptAppInner({
251251
<div className="app-container">
252252
<div className="script-main-box">
253253
<div className="mb-4">
254+
{validatedToolResult.maliciousActivityWarning && (
255+
<p className="security-warning">
256+
Suspicious activity detected - your chat client may be under
257+
attack. Please examine previous tool calls in detail, and if you
258+
have any doubts, do not approve this command and terminate this
259+
chat session.
260+
</p>
261+
)}
262+
254263
{/* TODO: we can dynamically inject the platform that users are using here */}
255264
<p>
256265
Goose wants to perform the following action on{" "}

mcp-app/src/types.ts

Lines changed: 21 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -33,19 +33,27 @@ export const ExecuteScriptResultSchema = z.object({
3333

3434
export type ExecuteScriptResult = z.infer<typeof ExecuteScriptResultSchema>;
3535

36-
export const McpAppToolResultSchema = z.object({
37-
status: z.enum([
38-
"OK",
39-
"BAD_DESCRIPTION",
40-
"POLICY",
41-
"MODIFIES_SYSTEM",
42-
"UNCLEAR",
43-
"DANGEROUS",
44-
"MALICIOUS",
45-
]),
46-
detail: z.string(),
47-
id: z.string(),
48-
});
36+
// Every verdict the gatekeeper can attach to a tool result.
const mcpAppToolResultStatus = z.enum([
  "OK",
  "BAD_DESCRIPTION",
  "POLICY",
  "MODIFIES_SYSTEM",
  "UNCLEAR",
  "DANGEROUS",
  "MALICIOUS",
]);

/**
 * Schema for the structured tool result consumed by the MCP app.
 *
 * The wire format carries the warning flag in snake_case
 * (`malicious_activity_warning`); parsing re-exposes it to the UI in
 * camelCase as `maliciousActivityWarning`. The other fields pass through
 * unchanged.
 */
export const McpAppToolResultSchema = z
  .object({
    status: mcpAppToolResultStatus,
    detail: z.string(),
    id: z.string(),
    malicious_activity_warning: z.boolean(),
  })
  .transform(({ status, detail, id, malicious_activity_warning }) => ({
    status,
    detail,
    id,
    maliciousActivityWarning: malicious_activity_warning,
  }));
4957

5058
export type McpAppToolResult = z.infer<typeof McpAppToolResultSchema>;
5159

src/linux_mcp_server/tools/run_script.py

Lines changed: 114 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -154,8 +154,84 @@ def set_script_state(self, id: str, new_state: ExecutionState):
154154

155155

156156
script_store = ScriptStore()
157+
MAX_CONSECUTIVE_MALICIOUS_ACTIONS = 3
158+
MAX_TOTAL_MALICIOUS_ACTIONS = 4
157159

158160

161+
class BehaviorRecord:
    """
    Per-session tally of gatekeeper verdicts, used to spot malicious behavior.

    Two kinds of warning state are kept:

    * A temporary warning, raised by every MALICIOUS verdict and lowered by
      remove_temporary_warning() (typically after a human approves execution).
    * A permanent flag, raised when the session reaches
      MAX_TOTAL_MALICIOUS_ACTIONS malicious verdicts overall or
      MAX_CONSECUTIVE_MALICIOUS_ACTIONS malicious verdicts in a row. Nothing in
      this class ever lowers the permanent flag again.

    A verdict other than MALICIOUS zeroes the consecutive count and leaves the
    total count untouched.
    """

    def __init__(self):
        # Session-lifetime number of MALICIOUS verdicts; never reset.
        self._total_malicious_action_counts = 0
        # Length of the current unbroken run of MALICIOUS verdicts.
        self._consecutive_malicious_action_counts = 0
        # Permanent flag: set once a threshold is reached, never cleared.
        self._is_malicious = False
        # Temporary flag: set by a MALICIOUS verdict, cleared on approval.
        self._has_temporary_warning = False

    def add_record(self, status: GatekeeperStatus):
        """
        Fold one gatekeeper verdict into the session's tracking state.

        MALICIOUS bumps both counters and raises the temporary warning; every
        other verdict only resets the consecutive counter. Afterwards the
        session is permanently flagged if either threshold has been reached.
        Does nothing once the session is already permanently flagged.
        """
        # A permanently flagged session has nothing left to track.
        if self._is_malicious:
            return

        if status != GatekeeperStatus.MALICIOUS:
            # The streak is broken; the running total is deliberately kept.
            self._consecutive_malicious_action_counts = 0
        else:
            self._has_temporary_warning = True
            self._total_malicious_action_counts += 1
            self._consecutive_malicious_action_counts += 1

        # Escalate to the permanent flag when either limit is hit.
        total_reached = self._total_malicious_action_counts >= MAX_TOTAL_MALICIOUS_ACTIONS
        streak_reached = self._consecutive_malicious_action_counts >= MAX_CONSECUTIVE_MALICIOUS_ACTIONS
        if total_reached or streak_reached:
            self._is_malicious = True

    def remove_temporary_warning(self):
        """Lower the temporary warning flag, typically after a human approves execution."""
        self._has_temporary_warning = False

    @property
    def malicious_activity_warning(self) -> bool:
        """Whether the session is permanently flagged or a temporary warning is still pending."""
        return self._is_malicious or self._has_temporary_warning
218+
219+
class BehaviorRecordManager:
    """Registry of BehaviorRecord objects keyed by session ID; records are created lazily."""

    def __init__(self):
        # session_id -> the BehaviorRecord tracking that session
        self._records: dict[str, BehaviorRecord] = {}

    def get_record_by_session_id(self, session_id: str) -> BehaviorRecord:
        """Look up the session's BehaviorRecord, creating and storing a fresh one on first use."""
        record = self._records.get(session_id)
        if record is None:
            # First time this session is seen: start it with a clean record.
            record = BehaviorRecord()
            self._records[session_id] = record

        return record
231+
232+
233+
behavior_record_manager = BehaviorRecordManager()
234+
159235
BASH_STRICT_PREAMBLE = "set -euo pipefail; "
160236

161237
SYSTEMD_RUN_ARGS = [
@@ -209,6 +285,7 @@ class RunScriptInteractiveResult(BaseModel):
209285
id: str
210286
status: GatekeeperStatus
211287
detail: str
288+
malicious_activity_warning: bool
212289

213290

214291
# class UserInfo(BaseModel):
@@ -241,13 +318,18 @@ class ExecuteScriptResult:
241318
@log_tool_call
242319
@disallow_local_execution_in_containers
243320
async def execute_script(
321+
ctx: Context,
244322
id: t.Annotated[str, Field(description="The associated ID of the script to be executed")],
245323
) -> ToolResult:
246324
script_details = script_store.get_script_details(id)
247325
command = _wrap_script(script_details.script_type, script_details.script)
248326
script_store.set_script_state(id, "executing")
249327
content: list[ContentBlock] = []
250328

329+
# Clear temporary warning flag by human approval
330+
behavior_record = behavior_record_manager.get_record_by_session_id(ctx.session_id)
331+
behavior_record.remove_temporary_warning()
332+
251333
try:
252334
returncode, stdout, stderr = await execute_command(command, host=script_details.host)
253335
except Exception:
@@ -314,9 +396,12 @@ async def run_script_interactive(
314396
host: Host = None,
315397
) -> ToolResult:
316398
script_details = script_store.get_script_details(token)
399+
behavior_record = behavior_record_manager.get_record_by_session_id(ctx.session_id)
400+
401+
needs_confirmation = script_details.needs_confirmation or behavior_record.malicious_activity_warning
317402

318403
# Verify that this script requires confirmation
319-
if not script_details.needs_confirmation:
404+
if not needs_confirmation:
320405
raise ToolError("This script does not require confirmation. Use run_script instead of run_script_interactive.")
321406

322407
# Check if the passed parameters match the stored script details
@@ -338,6 +423,8 @@ async def run_script_interactive(
338423
(BASH_STRICT_PREAMBLE + script) if script_type == SCRIPT_TYPE_BASH else script,
339424
readonly=readonly,
340425
)
426+
behavior_record.add_record(gatekeeper_result.status)
427+
341428
if gatekeeper_result.status != GatekeeperStatus.OK:
342429
script_store.set_script_state(token, "rejected-gatekeeper")
343430
raise ToolError(gatekeeper_result.description)
@@ -352,7 +439,12 @@ async def run_script_interactive(
352439
)
353440
]
354441

355-
structured_content_obj = RunScriptInteractiveResult(id=result_id, status=GatekeeperStatus.OK, detail="")
442+
structured_content_obj = RunScriptInteractiveResult(
443+
id=result_id,
444+
status=GatekeeperStatus.OK,
445+
detail="",
446+
malicious_activity_warning=behavior_record.malicious_activity_warning,
447+
)
356448

357449
return ToolResult(content=content, structured_content=structured_content_obj.model_dump())
358450

@@ -413,23 +505,28 @@ async def validate_script(
413505
readonly=readonly,
414506
)
415507

508+
behavior_record = behavior_record_manager.get_record_by_session_id(ctx.session_id)
509+
behavior_record.add_record(gatekeeper_result.status)
510+
416511
id = script_store.add_script(description, script, script_type, host, readonly)
417512
script_details = script_store.get_script_details(id)
418513

419514
if gatekeeper_result.status != GatekeeperStatus.OK:
420515
script_store.set_script_state(id, "rejected-gatekeeper")
421516
raise ToolError(gatekeeper_result.description)
422517

518+
needs_confirmation = script_details.needs_confirmation or behavior_record.malicious_activity_warning
519+
423520
result = ToolResult(
424521
content=[
425522
TextContent(
426523
type="text",
427-
text=f"Script passed gatekeeper validation and is stored with ID {id}. Please use {_pick_execution_tool(script_details.needs_confirmation)} to execute the validated script.",
524+
text=f"Script passed gatekeeper validation and is stored with ID {id}. Please use {_pick_execution_tool(needs_confirmation)} to execute the validated script.",
428525
)
429526
],
430527
structured_content={
431528
"token": id,
432-
"needs_confirmation": script_details.needs_confirmation,
529+
"needs_confirmation": needs_confirmation,
433530
},
434531
)
435532
return result
@@ -448,9 +545,12 @@ async def run_script(
448545
token: t.Annotated[str, Field(description="The token returned by the validate_script tool.")],
449546
) -> str:
450547
script_details = script_store.get_script_details(token)
548+
behavior_record = behavior_record_manager.get_record_by_session_id(ctx.session_id)
549+
550+
needs_confirmation = script_details.needs_confirmation or behavior_record.malicious_activity_warning
451551

452552
# Verify that this script doesn't require confirmation
453-
if script_details.needs_confirmation:
553+
if needs_confirmation:
454554
raise ToolError(f"This script requires confirmation. Use {_pick_execution_tool(True)} instead of run_script.")
455555

456556
script_store.set_script_state(token, "executing")
@@ -498,13 +598,19 @@ async def run_script_with_confirmation(
498598
host: Host = None,
499599
) -> str:
500600
script_details = script_store.get_script_details(token)
601+
behavior_record = behavior_record_manager.get_record_by_session_id(ctx.session_id)
602+
603+
needs_confirmation = script_details.needs_confirmation or behavior_record.malicious_activity_warning
501604

502605
# Verify that this script requires confirmation
503-
if not script_details.needs_confirmation:
606+
if not needs_confirmation:
504607
raise ToolError(
505608
"This script does not require confirmation. Use run_script instead of run_script_with_confirmation."
506609
)
507610

611+
# Clear temporary warning flag by human approval
612+
behavior_record.remove_temporary_warning()
613+
508614
# Verify the retrieved script details match the incoming parameters
509615
new_details = ScriptDetails(
510616
state="waiting-approval",
@@ -527,6 +633,8 @@ async def run_script_with_confirmation(
527633
(BASH_STRICT_PREAMBLE + script) if script_type == SCRIPT_TYPE_BASH else script,
528634
readonly=readonly,
529635
)
636+
behavior_record.add_record(gatekeeper_result.status)
637+
530638
if gatekeeper_result.status != GatekeeperStatus.OK:
531639
script_store.set_script_state(token, "rejected-gatekeeper")
532640
raise ToolError(gatekeeper_result.description)

0 commit comments

Comments
 (0)