Skip to content

Commit b462c47

Browse files
authored
Merge pull request #66 from xoxruns/agent_iteration : pre-release 0.1.4
pre-release 0.1.4: Multiple changes have been made to reduce the performance issues caused by certain architectural decisions and to simplify agentic iteration over the agent's memory. - [x] Switched to SQLite and removed pgvector, which was not usable for fast multi-agent structures. We may still need to check whether pgvector is still being installed and, if so, remove that step; needs cleanup. - [x] Added an agentic virtual filesystem (AVFS) with tooling to read, list, grep, and write. grep is now backed by ripgrep; needs testing. - [x] The AVFS can have different workspaces to work with. The idea today is that we have a memory workspace and a root workspace for future codebase analysis. - [x] Switched the simple Python sandbox to a more usable component, replacing the server-like transport to the WebAssembly sandbox. We still need to check that the sandbox runs correctly when built. We should also remove the download of the former implementation. - [x] Added a validation process with numerous exit strategies: pattern-based, LLM-as-judge, tool-backed, or hybrid. - [x] Added report generation at the end of each successfully found vulnerability.
2 parents 41ca4b7 + abd1ea3 commit b462c47

112 files changed

Lines changed: 6687 additions & 2761 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/workflows/docker-build.yml

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,18 @@
1+
# Pushes to Docker Hub only on: push to main/release_* (not on pull_request), or workflow_dispatch.
2+
# PRs build the image for CI but intentionally do not push (no registry credentials / avoid polluting tags).
3+
14
name: Build and Push Deadend Docker Image
25

36
on:
47
push:
58
branches: [ main, release_* ]
69
paths:
7-
- 'setup/images/kalilinux.Dockerfile'
10+
- 'environments/images/kalilinux.Dockerfile'
811
- '.github/workflows/docker-build.yml'
912
pull_request:
1013
branches: [ main ]
1114
paths:
12-
- 'setup/images/kalilinux.Dockerfile'
15+
- 'environments/images/kalilinux.Dockerfile'
1316
- '.github/workflows/docker-build.yml'
1417
workflow_dispatch:
1518
inputs:
@@ -20,7 +23,7 @@ on:
2023

2124
env:
2225
REGISTRY: docker.io
23-
IMAGE_NAME: bargacy/deadend-pentest
26+
IMAGE_NAME: xoxruns/sandboxed_kali
2427

2528
jobs:
2629
build-and-push:
@@ -36,6 +39,22 @@ jobs:
3639
- name: Set up Docker Buildx
3740
uses: docker/setup-buildx-action@v3
3841

42+
- name: Explain no push on pull requests
43+
if: github.event_name == 'pull_request'
44+
run: |
45+
echo "PR workflow: image is built only (push=false). After merge to main or release_*, a push event will build and push to Docker Hub."
46+
47+
- name: Require Docker Hub secrets when pushing
48+
if: github.event_name != 'pull_request'
49+
env:
50+
DOCKER_USER: ${{ secrets.DOCKER_USERNAME }}
51+
DOCKER_PASS: ${{ secrets.DOCKER_TOKEN }}
52+
run: |
53+
if [ -z "$DOCKER_USER" ] || [ -z "$DOCKER_PASS" ]; then
54+
echo "::error::Add repository Actions secrets DOCKER_USERNAME and DOCKER_TOKEN (Docker Hub access token). Without them, the job cannot log in and nothing is pushed to Docker Hub."
55+
exit 1
56+
fi
57+
3958
- name: Log in to Docker Hub
4059
if: github.event_name != 'pull_request'
4160
uses: docker/login-action@v3
@@ -58,10 +77,11 @@ jobs:
5877
type=raw,value=${{ github.event.inputs.tag }},enable=${{ github.event.inputs.tag != '' }}
5978
6079
- name: Build and push Docker image
80+
id: build
6181
uses: docker/build-push-action@v5
6282
with:
6383
context: .
64-
file: ./setup/images/kalilinux.Dockerfile
84+
file: ./environments/images/kalilinux.Dockerfile
6585
push: ${{ github.event_name != 'pull_request' }}
6686
tags: ${{ steps.meta.outputs.tags }}
6787
labels: ${{ steps.meta.outputs.labels }}

.gitmodules

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,6 @@
44
[submodule "benchmarks/xbow/validation-benchmarks-xbow"]
55
path = benchmarks/xbow/validation-benchmarks-xbow
66
url = https://github.com/xoxruns/validation-benchmarks-xbow.git
7+
[submodule "deadend_cli/simple-python-interpreter-sandbox"]
8+
path = deadend_cli/simple-python-interpreter-sandbox
9+
url = https://github.com/xoxruns/simple-python-interpreter-sandbox.git

benchmarks/run_xbow_benchmark.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -248,7 +248,7 @@ echo "[+] Launching eval agent with uv run"
248248
echo "[+] Logging uv run output to: $LOG_FILE"
249249
(
250250
cd "$REPO_ROOT/deadend_cli/src/deadend_cli" && \
251-
uv run main.py eval-agent --eval-metadata-file "$META_FILE" --llm-providers azure_ai
251+
uv run main.py eval-agent --eval-metadata-file "$META_FILE" --provider azure_ai --model-name Kimi-K2.5
252252
) 2>&1 | tee "$LOG_FILE"
253253

254254
echo "[+] Stopping benchmark services with make stop"

cli/deadend_cli/components/ComponentHealth.tsx

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ export interface ComponentHealthProps {
2222
// Short display names for components
2323
const COMPONENT_NAMES: Record<string, string> = {
2424
docker: "Docker",
25-
pgvector: "pgvector",
25+
rag: "RAG",
2626
config: "Config",
2727
python_sandbox: "Python",
2828
shell_sandbox: "Shell",
@@ -31,8 +31,8 @@ const COMPONENT_NAMES: Record<string, string> = {
3131
// Order to display components
3232
const COMPONENT_ORDER = [
3333
"docker",
34-
"pgvector",
3534
"config",
35+
"rag",
3636
"python_sandbox",
3737
"shell_sandbox",
3838
];

cli/deadend_cli/hooks/useComponentHealth.ts

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ import type { DeadEndRpcClient } from "../runtime/deadend-rpc-client.ts";
44

55
export type ComponentName =
66
| "docker"
7-
| "pgvector"
7+
| "rag"
88
| "config"
99
| "python_sandbox"
1010
| "shell_sandbox";
@@ -104,8 +104,8 @@ export function useComponentHealth(
104104
case "docker":
105105
result = await rpcClient.initDocker();
106106
break;
107-
case "pgvector":
108-
result = await rpcClient.initPgvector();
107+
case "rag":
108+
result = await rpcClient.initRag();
109109
break;
110110
case "config":
111111
result = await rpcClient.initConfig();
@@ -154,8 +154,8 @@ export function useComponentHealth(
154154

155155
const components: ComponentName[] = [
156156
"docker",
157-
"pgvector",
158157
"config",
158+
"rag",
159159
"python_sandbox",
160160
"shell_sandbox",
161161
];

cli/deadend_cli/main.tsx

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ function App({ cliArgs }: AppProps) {
109109

110110
// Initialize all components at once using init_all
111111
setInitStatus("Initializing all components...");
112-
// We can wait longer here for pgvector and the sandbox
112+
// Allow time for sandboxes and Playwright during init_all
113113
const initResult = await client.initAll(300000);
114114

115115
// Store component results for display
@@ -125,7 +125,7 @@ function App({ cliArgs }: AppProps) {
125125
}
126126

127127
// Check for critical failures (all components required for task execution)
128-
const criticalComponents = ["docker", "config", "model_registry", "pgvector", "shell_sandbox"];
128+
const criticalComponents = ["docker", "config", "model_registry", "rag", "shell_sandbox"];
129129
const criticalFailures = initResult.failed_components.filter(
130130
(c) => criticalComponents.includes(c)
131131
);

cli/deadend_cli/runtime/deadend-rpc-client.ts

Lines changed: 17 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
* │ ┌─────────────────────────────────────────────────────────────────────────┐│
2525
* │ │ DeadEndRpcClient ││
2626
* │ │ - runTask() / runTaskWithCallbacks() ││
27-
* │ │ - healthAll() / initDocker() / initPgvector() / ... ││
27+
* │ │ - healthAll() / initDocker() / initRag() / ... ││
2828
* │ │ - subscribeEvents() / interrupt() / approve() ││
2929
* │ └────────────────────────────────────────────────────────────────────────┘│
3030
* │ │ │
@@ -67,7 +67,7 @@
6767
*
6868
* // Initialize components
6969
* await client.initDocker();
70-
* await client.initPgvector();
70+
* await client.initRag();
7171
* await client.initShellSandbox();
7272
*
7373
* // Run security testing task
@@ -301,8 +301,8 @@ export interface DeadEndRpcClientOptions extends StdioRpcClientOptions {
301301
*
302302
* Before running tasks, components must be initialized in order:
303303
* 1. `initDocker()` - Docker daemon connection (required)
304-
* 2. `initPgvector()` - Vector database for RAG (optional)
305-
* 3. `initConfig()` - Load LLM API keys and settings
304+
* 2. `initConfig()` - Load LLM API keys and settings
305+
* 3. `initRag()` - SQLite-backed RAG session manager (after config, for storage paths)
306306
* 4. `initShellSandbox()` - Prepare Kali container for shell commands
307307
* 5. `initPythonSandbox()` - Start Python interpreter sandbox
308308
* 6. `initPlaywright()` - Browser automation (optional)
@@ -443,7 +443,7 @@ export class DeadEndRpcClient {
443443
*
444444
* Returns a comprehensive health report including:
445445
* - Docker daemon connectivity
446-
* - pgvector database status
446+
* - RAG (SQLite) session manager status
447447
* - Python sandbox process status
448448
* - Shell sandbox readiness
449449
* - Playwright browser status
@@ -466,12 +466,12 @@ export class DeadEndRpcClient {
466466
}
467467

468468
/**
469-
* Checks pgvector database health.
469+
* Checks RAG (SQLite) session manager health.
470470
*
471-
* @returns Promise resolving to HealthResult for pgvector
471+
* @returns Promise resolving to HealthResult for RAG
472472
*/
473-
async healthPgvector(): Promise<HealthResult> {
474-
const result = await this.client.call("health_pgvector");
473+
async healthRag(): Promise<HealthResult> {
474+
const result = await this.client.call("health_rag");
475475
return result as HealthResult;
476476
}
477477

@@ -523,17 +523,14 @@ export class DeadEndRpcClient {
523523
}
524524

525525
/**
526-
* Initializes the pgvector database container.
526+
* Initializes the SQLite-backed RAG session manager.
527527
*
528-
* Starts the pgvector container if not running and verifies
529-
* database connectivity. Used for RAG (retrieval-augmented generation).
530-
*
531-
* Requires: initDocker() must be called first
528+
* Prefer calling after `initConfig()` so storage paths from config apply.
532529
*
533530
* @returns Promise resolving to InitResult with success status
534531
*/
535-
async initPgvector(): Promise<InitResult> {
536-
const result = await this.client.call("init_pgvector");
532+
async initRag(): Promise<InitResult> {
533+
const result = await this.client.call("init_rag");
537534
return result as InitResult;
538535
}
539536

@@ -611,9 +608,9 @@ export class DeadEndRpcClient {
611608
* proper dependency order and provides a comprehensive result.
612609
*
613610
* Initialization order:
614-
* 1. Docker (required by pgvector and shell_sandbox)
615-
* 2. Config (required by model_registry)
616-
* 3. pgvector (requires Docker)
611+
* 1. Docker (required by shell_sandbox)
612+
* 2. Config (required by model_registry and RAG paths)
613+
* 3. RAG session manager (SQLite, no Docker)
617614
* 4. Model Registry (requires Config)
618615
* 5. Python sandbox (standalone)
619616
* 6. Shell sandbox (requires Docker)
@@ -1093,8 +1090,7 @@ export class DeadEndRpcClient {
10931090
* - Playwright browser
10941091
* - Python sandbox process
10951092
* - Shell sandbox containers
1096-
* - pgvector database (optional)
1097-
* - RAG connector
1093+
* - RAG session manager (SQLite)
10981094
*
10991095
* @returns Promise resolving to shutdown status for each component
11001096
*/

cli/deadend_cli/types/rpc.ts

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -501,12 +501,10 @@ export interface AgentErrorData {
501501
*
502502
* @property thought - The full reasoning text
503503
* @property summary - Condensed version of the thought
504-
* @property relevance - How relevant this thought is to the task (0.0-1.0)
505504
*/
506505
export interface AgentThoughtData {
507506
thought: string;
508507
summary?: string;
509-
relevance: number;
510508
}
511509

512510
/**
@@ -647,7 +645,7 @@ export type ComponentStatus =
647645
*
648646
* Returned by health_* RPC methods to report component status.
649647
*
650-
* @property component - Name of the component (docker, pgvector, etc.)
648+
* @property component - Name of the component (docker, rag, config, etc.)
651649
* @property healthy - Whether the component is functioning correctly
652650
* @property status - Current lifecycle state of the component
653651
* @property message - Human-readable status message

0 commit comments

Comments
 (0)