Skip to content

Commit 3eeb84d

Browse files
Add hosted model baseline runners
Co-authored-by: Cursor <cursoragent@cursor.com>
1 parent a589df2 commit 3eeb84d

6 files changed

Lines changed: 348 additions & 1 deletion

File tree

.env.example

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,11 @@
44
NULLSEC_ADAPTER_PATH=outputs/nullsec-s1-qlora
55
NULLSEC_BASE_MODEL=Qwen/Qwen2.5-Coder-7B-Instruct
66
NULLSEC_MAX_NEW_TOKENS=1536
7+
8+
# Optional hosted-model baseline evaluation.
9+
# No defaults are provided because exact provider model IDs must be recorded in
10+
# generated reports.
11+
ANTHROPIC_API_KEY=
12+
ANTHROPIC_MODEL=
13+
OPENAI_API_KEY=
14+
OPENAI_MODEL=

benchmarks/baselines/api_common.py

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
"""Shared helpers for hosted API model baselines.
2+
3+
API baselines are comparison-only. They use the same prompt and scoring pipeline
4+
as Nullsec-S1, write raw responses incrementally for resumability, and never
5+
invent metrics.
6+
"""
7+
from __future__ import annotations
8+
9+
import json
10+
import time
11+
from pathlib import Path
12+
13+
from benchmarks.baselines.common import write_baseline_report
14+
from benchmarks.harness import finalize_raw, load_dataset
15+
from nullsec.core.prompts import ANALYZE_TEMPLATE, SYSTEM_PROMPT
16+
17+
REPORT_ROOT = Path(__file__).resolve().parents[2] / "benchmarks" / "reports" / "baselines"
18+
19+
20+
def user_prompt(case: dict) -> str:
    """Render the analysis prompt for a single benchmark case.

    ``filename`` defaults to ``"input"`` and ``lang`` to an empty string when
    the case omits them. ``code`` is mandatory: a case without it raises
    ``KeyError``.
    """
    template_fields = {
        "filename": case.get("filename", "input"),
        "lang": case.get("lang", ""),
        "code": case["code"],
    }
    return ANALYZE_TEMPLATE.format(**template_fields)
26+
27+
28+
def load_raw_cache(path: Path) -> dict[str, dict]:
    """Load cached raw responses from a JSONL file, keyed by case id.

    Args:
        path: JSONL file with one JSON object per line, each carrying an
            ``"id"`` key.

    Returns:
        Mapping of case id to the full cached record. Returns an empty dict
        when ``path`` does not exist. When an id appears more than once, the
        last record in the file wins.

    Blank lines, lines that are not valid JSON, and records that are not
    objects with an ``"id"`` key are skipped rather than raising, so a cache
    left behind by an interrupted run can still be resumed; the skipped cases
    are simply treated as not cached.
    """
    if not path.exists():
        return {}
    out: dict[str, dict] = {}
    for line in path.read_text(encoding="utf-8").splitlines():
        if not line.strip():
            continue
        try:
            obj = json.loads(line)
        except json.JSONDecodeError:
            # Typically a half-written final line from a killed process.
            continue
        if not isinstance(obj, dict) or "id" not in obj:
            continue
        out[obj["id"]] = obj
    return out
38+
39+
40+
def append_raw(path: Path, obj: dict) -> None:
    """Append ``obj`` to ``path`` as a single JSON line.

    Missing parent directories are created first. The file is opened in
    append mode, so existing lines are preserved.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open(mode="a", encoding="utf-8") as sink:
        serialized = json.dumps(obj)
        sink.write(f"{serialized}\n")
44+
45+
46+
def run_api_baseline(
    *,
    provider_id: str,
    system_name: str,
    dataset_name: str,
    model: str,
    limit: int | None,
    resume: bool,
    sleep_s: float,
    call_model,
    provenance: dict,
) -> Path:
    """Run a hosted-model baseline over a dataset and write its report.

    Args:
        provider_id: Short provider key; used as the report sub-directory
            under ``REPORT_ROOT`` and as the report's system id.
        system_name: Human-readable system name passed to the report writer.
        dataset_name: Dataset identifier passed to ``load_dataset``.
        model: Provider model id; stored with every raw response and in the
            report provenance.
        limit: Run only the first ``limit`` cases. A falsy value (``None`` or
            ``0``) runs every case.
        resume: When true, reuse responses already stored in
            ``raw_outputs.jsonl`` instead of calling the API again.
        sleep_s: Seconds to sleep after each live request; falsy disables it.
        call_model: Callable invoked as ``call_model(system_prompt,
            user_prompt)`` whose return value is stored as the raw response
            and passed to ``finalize_raw``.
        provenance: Extra provenance fields; these override the defaults
            (``provider``, ``model``, ``limit``, ``raw_outputs``) on key clash.

    Returns:
        Whatever ``write_baseline_report`` returns (printed as the report
        location).
    """
    ds = load_dataset(dataset_name)
    cases = ds["cases"][:limit] if limit else ds["cases"]
    out_dir = REPORT_ROOT / provider_id
    raw_path = out_dir / "raw_outputs.jsonl"
    cached = load_raw_cache(raw_path) if resume else {}
    verdicts: dict = {}

    print(f"{provider_id}: {len(cases)} case(s), model={model}, resume={resume}")
    for i, case in enumerate(cases, start=1):
        cid = case["id"]
        entry = cached.get(cid)
        # Only reuse a cached response that was produced by the model this run
        # reports on; otherwise the report would attribute another model's
        # outputs to `model`. Entries without a raw payload are re-requested.
        reusable = entry is not None and entry.get("model") == model and "raw" in entry
        if reusable:
            raw = entry["raw"]
            print(f"[{i}/{len(cases)}] {cid}: cached")
        else:
            print(f"[{i}/{len(cases)}] {cid}: requesting")
            raw = call_model(SYSTEM_PROMPT, user_prompt(case))
            # Persist immediately so an interrupted run can be resumed. The
            # file is append-only; on reload the last record per id wins.
            append_raw(raw_path, {"id": cid, "raw": raw, "provider": provider_id, "model": model})
            if sleep_s:
                time.sleep(sleep_s)
        verdicts[cid] = finalize_raw(raw)

    report = write_baseline_report(
        system_id=provider_id,
        system_name=system_name,
        dataset=dataset_name,
        cases=cases,
        verdicts=verdicts,
        provenance={
            "provider": provider_id,
            "model": model,
            "limit": limit,
            "raw_outputs": str(raw_path),
            **provenance,
        },
    )
    print(f"{provider_id} baseline report -> {report}")
    print("Generated report/cache are under benchmarks/reports/ and should not be committed unless explicitly approved.")
    return report

benchmarks/baselines/claude_api.py

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
#!/usr/bin/env python3
2+
"""Claude API baseline for the Nullsec-S1 benchmark.
3+
4+
Requires ANTHROPIC_API_KEY and ANTHROPIC_MODEL (or --model). Generated reports
5+
and raw outputs are written under benchmarks/reports/baselines/claude/ and are
6+
not committed by default.
7+
"""
8+
from __future__ import annotations
9+
10+
import argparse
11+
import os
12+
import sys
13+
from pathlib import Path
14+
15+
sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
16+
17+
from benchmarks.baselines.api_common import run_api_baseline
18+
19+
20+
def require_env(name: str) -> str:
    """Return the value of environment variable ``name`` or exit.

    Surrounding whitespace is stripped, and a value that is unset, empty, or
    whitespace-only is treated as missing. This catches blank ``KEY=`` entries
    copied from ``.env`` files here instead of as a provider auth error later.

    Raises:
        SystemExit: If the variable is missing or blank.
    """
    value = os.environ.get(name, "").strip()
    if not value:
        raise SystemExit(f"{name} is required. Export {name}=... before running this baseline.")
    return value
25+
26+
27+
def main() -> int:
    """Parse CLI options and run the Claude baseline.

    Exits via ``SystemExit`` when ``ANTHROPIC_API_KEY`` is missing, when no
    model id is given through ``--model`` or ``ANTHROPIC_MODEL``, or when the
    ``anthropic`` package cannot be imported. Returns ``0`` after
    ``run_api_baseline`` completes.
    """
    parser = argparse.ArgumentParser(description="Claude API baseline for the Nullsec-S1 benchmark")
    parser.add_argument("--dataset", default="detection.json")
    parser.add_argument("--limit", type=int, default=None, help="run only the first N cases (smoke test)")
    parser.add_argument("--model", default=os.environ.get("ANTHROPIC_MODEL"))
    parser.add_argument("--resume", action="store_true", help="reuse cached raw_outputs.jsonl entries")
    parser.add_argument("--sleep", type=float, default=0.0, help="seconds to sleep between requests")
    parser.add_argument("--max-tokens", type=int, default=1536)
    opts = parser.parse_args()

    api_key = require_env("ANTHROPIC_API_KEY")
    if not opts.model:
        raise SystemExit("ANTHROPIC_MODEL is required (or pass --model).")

    # Imported lazily so a missing SDK yields an install hint, not a traceback.
    try:
        import anthropic
    except ImportError as e:
        raise SystemExit("anthropic package is required. Install with: python -m pip install anthropic") from e

    client = anthropic.Anthropic(api_key=api_key)

    def ask_claude(system_prompt: str, prompt: str) -> str:
        """Send one request and return the joined text blocks of the reply."""
        reply = client.messages.create(
            model=opts.model,
            max_tokens=opts.max_tokens,
            temperature=0,
            system=system_prompt,
            messages=[{"role": "user", "content": prompt}],
        )
        # Keep only blocks typed "text"; other block types are ignored.
        text_blocks = [
            block.text
            for block in reply.content
            if getattr(block, "type", None) == "text"
        ]
        return "\n".join(text_blocks).strip()

    run_api_baseline(
        provider_id="claude",
        system_name=f"Claude API baseline ({opts.model})",
        dataset_name=opts.dataset,
        model=opts.model,
        limit=opts.limit,
        resume=opts.resume,
        sleep_s=opts.sleep,
        call_model=ask_claude,
        provenance={"api": "anthropic_messages", "max_tokens": opts.max_tokens},
    )
    return 0
74+
75+
76+
if __name__ == "__main__":
77+
raise SystemExit(main())

benchmarks/baselines/openai_api.py

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
#!/usr/bin/env python3
2+
"""OpenAI/Codex API baseline for the Nullsec-S1 benchmark.
3+
4+
Requires OPENAI_API_KEY and OPENAI_MODEL (or --model). Generated reports and raw
5+
outputs are written under benchmarks/reports/baselines/openai/ and are not
6+
committed by default.
7+
"""
8+
from __future__ import annotations
9+
10+
import argparse
11+
import os
12+
import sys
13+
from pathlib import Path
14+
15+
sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
16+
17+
from benchmarks.baselines.api_common import run_api_baseline
18+
19+
20+
def require_env(name: str) -> str:
    """Return the value of environment variable ``name`` or exit.

    Surrounding whitespace is stripped, and a value that is unset, empty, or
    whitespace-only is treated as missing. This catches blank ``KEY=`` entries
    copied from ``.env`` files here instead of as a provider auth error later.

    Raises:
        SystemExit: If the variable is missing or blank.
    """
    value = os.environ.get(name, "").strip()
    if not value:
        raise SystemExit(f"{name} is required. Export {name}=... before running this baseline.")
    return value
25+
26+
27+
def _extract_response_text(resp) -> str:
28+
text = getattr(resp, "output_text", None)
29+
if text:
30+
return text.strip()
31+
chunks = []
32+
for item in getattr(resp, "output", []) or []:
33+
for content in getattr(item, "content", []) or []:
34+
if getattr(content, "type", None) in {"output_text", "text"}:
35+
chunks.append(getattr(content, "text", ""))
36+
return "\n".join(chunks).strip()
37+
38+
39+
def main() -> int:
    """Parse CLI options and run the OpenAI/Codex baseline.

    Exits via ``SystemExit`` when ``OPENAI_API_KEY`` is missing, when no model
    id is given through ``--model`` or ``OPENAI_MODEL``, or when the ``openai``
    package cannot be imported. Returns ``0`` after ``run_api_baseline``
    completes.
    """
    ap = argparse.ArgumentParser(description="OpenAI/Codex API baseline for the Nullsec-S1 benchmark")
    ap.add_argument("--dataset", default="detection.json")
    ap.add_argument("--limit", type=int, default=None, help="run only the first N cases (smoke test)")
    ap.add_argument("--model", default=os.environ.get("OPENAI_MODEL"))
    ap.add_argument("--resume", action="store_true", help="reuse cached raw_outputs.jsonl entries")
    ap.add_argument("--sleep", type=float, default=0.0, help="seconds to sleep between requests")
    ap.add_argument("--max-tokens", type=int, default=1536)
    args = ap.parse_args()

    api_key = require_env("OPENAI_API_KEY")
    if not args.model:
        raise SystemExit("OPENAI_MODEL is required (or pass --model).")

    # Imported lazily so a missing SDK yields an install hint, not a traceback.
    try:
        from openai import OpenAI
    except ImportError as e:
        raise SystemExit("openai package is required. Install with: python -m pip install openai") from e

    client = OpenAI(api_key=api_key)
    # Compatibility with older SDKs that only expose chat.completions. The
    # capability is probed once here rather than by catching AttributeError
    # around the request, so an unrelated AttributeError can never trigger a
    # second, separately billed request for the same case.
    responses_api = getattr(client, "responses", None)

    def call(system_prompt: str, prompt: str) -> str:
        """Send one request and return the reply text, stripped."""
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ]
        if responses_api is not None:
            resp = responses_api.create(
                model=args.model,
                temperature=0,
                max_output_tokens=args.max_tokens,
                input=messages,
            )
            return _extract_response_text(resp)
        resp = client.chat.completions.create(
            model=args.model,
            temperature=0,
            max_tokens=args.max_tokens,
            messages=messages,
        )
        # A message can carry no text content; record that as an empty
        # response instead of crashing on None.strip().
        content = resp.choices[0].message.content
        return (content or "").strip()

    run_api_baseline(
        provider_id="openai",
        system_name=f"OpenAI/Codex API baseline ({args.model})",
        dataset_name=args.dataset,
        model=args.model,
        limit=args.limit,
        resume=args.resume,
        sleep_s=args.sleep,
        call_model=call,
        provenance={"api": "openai_responses_or_chat", "max_tokens": args.max_tokens},
    )
    return 0
97+
98+
99+
if __name__ == "__main__":
100+
raise SystemExit(main())

benchmarks/compare_baselines.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,8 @@ def main() -> int:
6969
ap.add_argument("--nullsec", help="Nullsec-S1 SUITE.json")
7070
ap.add_argument("--base", help="base Qwen SUITE.json")
7171
ap.add_argument("--semgrep", help="Semgrep SUITE.json")
72+
ap.add_argument("--claude", help="Claude API baseline SUITE.json")
73+
ap.add_argument("--openai", help="OpenAI/Codex API baseline SUITE.json")
7274
ap.add_argument("--out", default=None, help="optional Markdown output path")
7375
args = ap.parse_args()
7476

@@ -77,6 +79,10 @@ def main() -> int:
7779
row("Base Qwen", load_report(args.base, "base Qwen"), "base model, no Nullsec adapter"),
7880
row("Semgrep", load_report(args.semgrep, "Semgrep"), "static rules; partial category coverage"),
7981
]
82+
if args.claude:
83+
reports.append(row("Claude", load_report(args.claude, "Claude"), "hosted API baseline; model id/date in report"))
84+
if args.openai:
85+
reports.append(row("OpenAI/Codex", load_report(args.openai, "OpenAI/Codex"), "hosted API baseline; model id/date in report"))
8086
md = render(reports)
8187
if args.out:
8288
p = Path(args.out)

docs/EVALS.md

Lines changed: 62 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,65 @@ specialized rules). Unsupported categories are documented in the report.
8484

8585
Docker fallback for Semgrep is a future enhancement, not implemented today.
8686

87+
### Claude API
88+
89+
Claude comparisons are optional hosted-model baselines. They require an API key
90+
and an explicit model id; no default model is hardcoded because provider model
91+
IDs and dates must be recorded in the report.
92+
93+
Smoke test:
94+
95+
```bash
96+
export ANTHROPIC_API_KEY=...
97+
export ANTHROPIC_MODEL=...
98+
python benchmarks/baselines/claude_api.py --limit 5 --sleep 1
99+
```
100+
101+
Full run (costs money; run intentionally):
102+
103+
```bash
104+
python benchmarks/baselines/claude_api.py --sleep 1
105+
```
106+
107+
Report and raw cache:
108+
109+
```text
110+
benchmarks/reports/baselines/claude/SUITE.json
111+
benchmarks/reports/baselines/claude/raw_outputs.jsonl
112+
```
113+
114+
Use `--resume` to skip already-cached case ids if a run is interrupted.
115+
116+
### OpenAI / Codex API
117+
118+
OpenAI/Codex comparisons are optional hosted-model baselines. They require an API
119+
key and an explicit model id via `OPENAI_MODEL` or `--model`.
120+
121+
Smoke test:
122+
123+
```bash
124+
export OPENAI_API_KEY=...
125+
export OPENAI_MODEL=...
126+
python benchmarks/baselines/openai_api.py --limit 5 --sleep 1
127+
```
128+
129+
Full run (costs money; run intentionally):
130+
131+
```bash
132+
python benchmarks/baselines/openai_api.py --sleep 1
133+
```
134+
135+
Report and raw cache:
136+
137+
```text
138+
benchmarks/reports/baselines/openai/SUITE.json
139+
benchmarks/reports/baselines/openai/raw_outputs.jsonl
140+
```
141+
142+
Provider models can change over time. Reports record the exact provider, model
143+
id, run date, dataset, and raw-output cache path. Do not compare hosted-model
144+
results without those fields.
145+
87146
## Comparison table
88147

89148
Generate a Markdown comparison from existing reports:
@@ -93,6 +152,8 @@ python benchmarks/compare_baselines.py \
93152
--nullsec benchmarks/reports/SUITE.json \
94153
--base benchmarks/reports/baselines/qwen2_5_coder_7b/SUITE.json \
95154
--semgrep benchmarks/reports/baselines/semgrep/SUITE.json \
155+
--claude benchmarks/reports/baselines/claude/SUITE.json \
156+
--openai benchmarks/reports/baselines/openai/SUITE.json \
96157
--out benchmarks/reports/baselines/COMPARISON.md
97158
```
98159

@@ -140,7 +201,7 @@ output was not alignable for scoring, so the comparison table shows `110`.
140201
- Semgrep is not expected to cover all categories and should be interpreted as a
141202
static-analysis baseline, not a security LLM.
142203
- Frontier/API model baselines such as Claude, GPT, or other hosted models are
143-
not included yet.
204+
optional and must be generated from scripts with exact model IDs recorded.
144205
- This comparison does not prove universal vulnerability detection performance.
145206
- Do not claim Nullsec-S1 beats another model or tool unless the comparison
146207
script output proves it.

0 commit comments

Comments
 (0)