Skip to content

Commit affd507

Browse files
Add baseline evaluation infrastructure
Co-authored-by: Cursor <cursoragent@cursor.com>
1 parent 923d8ea commit affd507

10 files changed

Lines changed: 659 additions & 0 deletions

File tree

README.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ The source checkout alone is training-ready; the published RC2/v1.1 release arti
4141
- [Model Architecture](#model-architecture)
4242
- [Prompt and Output Format](#prompt-and-output-format)
4343
- [Integration Status](#integration-status)
44+
- [Evaluation and Baselines](#evaluation-and-baselines)
4445
- [Architecture](#architecture)
4546
- [Why Nullsec S1 exists](#why-nullsec-s1-exists)
4647
- [Core system components](#core-system-components)
@@ -205,6 +206,15 @@ This verifies that local public claims match the downloaded adapter, benchmark r
205206

206207
See [`docs/PROMPT_FORMAT.md`](docs/PROMPT_FORMAT.md) for the canonical system prompt, user prompt shape, safe-code response format, JSON schema, taxonomy categories, and guidance for reducing false positives/false negatives.
207208

209+
## Evaluation and Baselines
210+
211+
Baseline comparison infrastructure is documented in [`docs/EVALS.md`](docs/EVALS.md). It supports reproducible comparisons against:
212+
213+
- the base `Qwen/Qwen2.5-Coder-7B-Instruct` model without the Nullsec adapter;
214+
- Semgrep with local benchmark rules and explicit coverage limitations.
215+
216+
No baseline numbers are listed here until reports are generated by the scripts. Do not hand-enter comparison metrics.
217+
208218
## Integration Status
209219

210220
| Integration | Status |

benchmarks/README.md

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,28 @@ python benchmarks/run_all.py --mode replay --replay captured.jsonl
4141

4242
Reports are written to `benchmarks/reports/` (git-ignored). Every report carries a `provenance` block: model name, version, fingerprint, taxonomy version, run mode, adapter, dataset, and UTC timestamp — so any number can be traced to the exact Nullsec-1 release and dataset that produced it.
4343

44+
## Baselines
45+
46+
Baseline runners live under `benchmarks/baselines/`:
47+
48+
```bash
49+
# Base model, no Nullsec adapter (GPU required)
50+
python benchmarks/baselines/base_qwen.py --mode model
51+
52+
# Semgrep static-analysis baseline (CPU; requires semgrep)
53+
python -m pip install semgrep
54+
python benchmarks/baselines/semgrep_baseline.py
55+
56+
# Markdown comparison from generated reports
57+
python benchmarks/compare_baselines.py \
58+
--nullsec benchmarks/reports/SUITE.json \
59+
--base benchmarks/reports/baselines/qwen2_5_coder_7b/SUITE.json \
60+
--semgrep benchmarks/reports/baselines/semgrep/SUITE.json
61+
```
62+
63+
Generated baseline reports are written under `benchmarks/reports/baselines/` and
64+
are not committed by default. See [`docs/EVALS.md`](../docs/EVALS.md).
65+
4466
## Datasets
4567

4668
`datasets/detection.json` is the labeled corpus: each case has `code`, `expected_categories`, `expected_min_severity`, `expected_production_ready`, and (where applicable) an `owasp` tag. It is part of the initial seed and is meant to grow with the ingestion pipeline; benchmark strength scales with corpus breadth.

benchmarks/baselines/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
"""Baseline runners for comparing Nullsec-S1 against other systems.
2+
3+
Generated reports are written under benchmarks/reports/baselines/ and are not
4+
committed by default.
5+
"""

benchmarks/baselines/base_qwen.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
#!/usr/bin/env python3
2+
"""Run the 111-case security benchmark against the base Qwen model.
3+
4+
This baseline uses the same prompt, raw-output finalization, and metrics as
5+
Nullsec-S1, but loads Qwen/Qwen2.5-Coder-7B-Instruct without the Nullsec adapter.
6+
Malformed/non-JSON output is counted honestly as a miss.
7+
"""
8+
from __future__ import annotations
9+
10+
import argparse
11+
import sys
12+
from pathlib import Path
13+
14+
sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
15+
16+
from benchmarks.baselines.common import write_baseline_report
17+
from benchmarks.harness import OutputProvider, load_dataset
18+
from benchmarks.runners._common import collect_verdicts
19+
from nullsec.core.version import TRAINING_BASE
20+
21+
22+
def main() -> int:
    """Run the base-Qwen baseline from the command line and write its report.

    Parses ``--mode``, ``--dataset`` and ``--replay``, loads the dataset,
    collects verdicts through an ``OutputProvider`` created with
    ``adapter=None``, and passes them to ``write_baseline_report``.

    Returns:
        ``0`` after the report path has been printed.
    """
    parser = argparse.ArgumentParser(description="Base Qwen2.5-Coder benchmark baseline")
    parser.add_argument("--mode", choices=["model", "replay"], default="model")
    parser.add_argument("--dataset", default="detection.json")
    parser.add_argument("--replay", default=None, help="captured raw outputs JSONL for replay mode")
    opts = parser.parse_args()

    dataset = load_dataset(opts.dataset)

    # A missing or empty --replay value is forwarded to the provider as None.
    replay_path = None
    if opts.replay:
        replay_path = Path(opts.replay)

    provider = OutputProvider(mode=opts.mode, adapter=None, replay_path=replay_path)
    verdicts = collect_verdicts(dataset, provider)

    # Provenance records how the run was produced; no adapter is involved.
    run_provenance = {"run_mode": opts.mode, "base_model": TRAINING_BASE, "adapter": None}
    report_path = write_baseline_report(
        system_id="qwen2_5_coder_7b",
        system_name="Qwen2.5-Coder-7B-Instruct (base, no Nullsec adapter)",
        dataset=opts.dataset,
        cases=dataset["cases"],
        verdicts=verdicts,
        provenance=run_provenance,
    )

    print(f"base Qwen baseline report -> {report_path}")
    print("Generated report is under benchmarks/reports/ and should not be committed unless explicitly approved.")
    return 0
43+
44+
45+
if __name__ == "__main__":
46+
raise SystemExit(main())
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
"""Semgrep rule-id to Nullsec category mapping for baseline evaluation.
2+
3+
The mapping is intentionally conservative. Unsupported categories remain
4+
unsupported rather than being forced into misleading coverage.
5+
"""
6+
from __future__ import annotations
7+
8+
# Semgrep rule id -> Nullsec taxonomy category. Several rules may map to the
# same category.
RULE_TO_CATEGORY = {
    "nullsec.exposed-secret": "EXPOSED_SECRET",
    "nullsec.sql-injection": "SQL_INJECTION",
    "nullsec.xss-dangerously-set-html": "XSS",
    "nullsec.ssrf-user-controlled-fetch": "SSRF",
    "nullsec.command-injection-shell": "COMMAND_INJECTION",
    "nullsec.dangerous-shell-curl-pipe": "DANGEROUS_SHELL_COMMAND",
    "nullsec.dangerous-shell-chmod-777": "DANGEROUS_SHELL_COMMAND",
    "nullsec.unsafe-admin-route": "UNSAFE_ADMIN_ROUTE",
    "nullsec.missing-rate-limit": "MISSING_RATE_LIMIT",
    "nullsec.unsafe-file-upload": "UNSAFE_FILE_UPLOAD",
    "nullsec.dependency-risk-postinstall": "DEPENDENCY_RISK",
    "nullsec.env-exposure": "ENVIRONMENT_EXPOSURE",
}

# Distinct categories that at least one rule above maps to, sorted by name.
SUPPORTED_CATEGORIES = sorted(set(RULE_TO_CATEGORY.values()))

# Categories that no rule above maps to.
PARTIAL_OR_UNSUPPORTED_CATEGORIES = [
    "BROKEN_AUTH",
    "MCP_TOOL_ABUSE",
    "PROMPT_INJECTION",
    "SMART_CONTRACT_RISK",
    "WALLET_TRANSACTION_RISK",
]


def category_for_rule(rule_id: str) -> str | None:
    """Return the Nullsec category for a Semgrep rule id, or ``None``.

    An exact match against ``RULE_TO_CATEGORY`` is tried first. If that
    fails, the id is also accepted when it ends with ``"." + known_id``.
    Semgrep can report a local rule's ``check_id`` with the rule file's
    dotted path prepended (for example ``rules.nullsec.sql-injection``), and
    an exact-only lookup would silently map every such finding to ``None``.

    Args:
        rule_id: Rule identifier as reported by Semgrep.

    Returns:
        The mapped category name, or ``None`` when the id is unknown or is
        not a string.
    """
    category = RULE_TO_CATEGORY.get(rule_id)
    if category is not None:
        return category
    if not isinstance(rule_id, str):
        return None
    # Require a "." boundary so that e.g. "xnullsec.sql-injection" is not
    # mistaken for a path-prefixed "nullsec.sql-injection".
    for known_id, known_category in RULE_TO_CATEGORY.items():
        if rule_id.endswith("." + known_id):
            return known_category
    return None

benchmarks/baselines/common.py

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
"""Shared baseline evaluation helpers.
2+
3+
Baselines emit the same `verdicts` shape consumed by `benchmarks.metrics`:
4+
5+
dict[case_id] = (Verdict | None, aligned_bool)
6+
7+
This lets base-model and tool baselines use the same scoring functions as
8+
Nullsec-S1 without changing labels or metric semantics.
9+
"""
10+
from __future__ import annotations
11+
12+
import json
13+
from datetime import datetime, timezone
14+
from pathlib import Path
15+
16+
from benchmarks.metrics import (
17+
debug_report,
18+
detection_over,
19+
failed_case_ids,
20+
false_safe_rate,
21+
hallucination_rate,
22+
owasp_coverage,
23+
patch_correctness,
24+
per_category_recall,
25+
secure_generation_score,
26+
)
27+
28+
ROOT = Path(__file__).resolve().parents[2]
29+
REPORT_ROOT = ROOT / "benchmarks" / "reports" / "baselines"
30+
31+
FAMILIES = {
32+
"detection_accuracy": detection_over,
33+
"false_safe_rate": false_safe_rate,
34+
"hallucination_rate": hallucination_rate,
35+
"owasp_coverage": owasp_coverage,
36+
"patch_correctness": patch_correctness,
37+
"secure_generation": secure_generation_score,
38+
"per_category_recall": per_category_recall,
39+
"failed_cases": failed_case_ids,
40+
}
41+
42+
43+
def build_suite(cases: list[dict], verdicts: dict, *, include_patch_metrics: bool = True) -> dict:
    """Compute every metric family for a baseline and assemble the suite dict.

    Each entry of ``FAMILIES`` is called as ``fn(cases, verdicts)`` and stored
    under its family name. A ``"debug"`` entry and a flat ``"summary"`` of
    headline values are added afterwards.

    Args:
        cases: Benchmark cases; each is indexed by ``"id"`` here.
        verdicts: Mapping from case id to a two-element entry. The module
            docstring describes it as ``(Verdict | None, aligned_bool)``.
        include_patch_metrics: When false, ``patch_correctness`` and
            ``secure_generation`` are not computed and are replaced by a
            ``not_applicable`` placeholder.

    Returns:
        The suite dict: one key per metric family, plus ``"debug"`` and
        ``"summary"``.
    """
    patch_families = ("patch_correctness", "secure_generation")

    results: dict = {}
    for family, metric in FAMILIES.items():
        if include_patch_metrics or family not in patch_families:
            results[family] = metric(cases, verdicts)
        else:
            results[family] = {
                "not_applicable": True,
                "note": "baseline does not generate secure patches",
            }

    results["debug"] = debug_report(cases, verdicts, failed_only=True)

    # Count the cases whose verdict entry has a truthy second element; cases
    # with no entry fall back to (None, False) and are not counted.
    absent = (None, False)
    output_count = 0
    for case in cases:
        if verdicts.get(case["id"], absent)[1]:
            output_count += 1

    detection = results["detection_accuracy"]
    failed_entries = results["failed_cases"].get("failed", [])
    # With include_patch_metrics=False the placeholder dicts lack the metric
    # keys, so the two patch-related summary values come out as None.
    results["summary"] = {
        "total_cases": len(cases),
        "total_outputs": output_count,
        "detection_f1": detection.get("detection_f1"),
        "detection_precision": detection.get("detection_precision"),
        "detection_recall": detection.get("detection_recall"),
        "false_safe_rate": results["false_safe_rate"].get("false_safe_rate"),
        "hallucination_rate": results["hallucination_rate"].get("hallucination_rate"),
        "patch_correctness": results["patch_correctness"].get("patch_correctness"),
        "secure_generation_score": results["secure_generation"].get("secure_generation_score"),
        "owasp_coverage": results["owasp_coverage"].get("coverage"),
        "categories_below_full_recall": results["per_category_recall"].get("categories_below_full_recall"),
        "failed_case_ids": [entry["id"] for entry in failed_entries],
    }
    return results
72+
73+
74+
def write_baseline_report(
    *,
    system_id: str,
    system_name: str,
    dataset: str,
    cases: list[dict],
    verdicts: dict,
    provenance: dict,
    include_patch_metrics: bool = True,
    coverage_limits: dict | None = None,
) -> Path:
    """Score a baseline run and write it to ``<REPORT_ROOT>/<system_id>/SUITE.json``.

    Args:
        system_id: Identifier used as the report sub-directory name and
            recorded in the provenance block.
        system_name: Display name recorded as ``"system"`` and in provenance.
        dataset: Dataset name recorded in provenance.
        cases: Benchmark cases forwarded to ``build_suite``.
        verdicts: Per-case verdicts forwarded to ``build_suite``.
        provenance: Extra provenance entries. They are merged last, so a key
            that collides with a built-in entry replaces it.
        include_patch_metrics: Forwarded to ``build_suite``.
        coverage_limits: Optional dict stored under
            ``results["coverage_limits"]``; skipped when empty or ``None``.

    Returns:
        Path of the JSON file that was written.
    """
    results = build_suite(cases, verdicts, include_patch_metrics=include_patch_metrics)
    if coverage_limits:
        results["coverage_limits"] = coverage_limits

    provenance_block = {
        "system_id": system_id,
        "system_name": system_name,
        "dataset": dataset,
        "generated_at": datetime.now(timezone.utc).isoformat(),
    }
    # Caller-supplied entries are applied after the built-in ones.
    provenance_block.update(provenance)

    payload = {
        "benchmark": "SUITE",
        "system": system_name,
        "provenance": provenance_block,
        "results": results,
    }

    target = REPORT_ROOT / system_id / "SUITE.json"
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(json.dumps(payload, indent=2), encoding="utf-8")
    return target

0 commit comments

Comments
 (0)