trynullsec
diff --git a/README.md
Lines changed: 10 additions & 0 deletions b/README.md
Lines changed: 10 additions & 0 deletions
diff --git a/benchmarks/README.md
Lines changed: 22 additions & 0 deletions b/benchmarks/README.md
Lines changed: 22 additions & 0 deletions
diff --git a/benchmarks/baselines/__init__.py
Lines changed: 5 additions & 0 deletions b/benchmarks/baselines/__init__.py
Lines changed: 5 additions & 0 deletions
diff --git a/benchmarks/baselines/base_qwen.py
Lines changed: 46 additions & 0 deletions b/benchmarks/baselines/base_qwen.py
Lines changed: 46 additions & 0 deletions
diff --git a/benchmarks/baselines/category_map.py
Lines changed: 34 additions & 0 deletions b/benchmarks/baselines/category_map.py
Lines changed: 34 additions & 0 deletions
diff --git a/benchmarks/baselines/common.py
Lines changed: 104 additions & 0 deletions b/benchmarks/baselines/common.py
Lines changed: 104 additions & 0 deletions
@@ -41,6 +41,7 @@ The source checkout alone is training-ready; the published RC2/v1.1 release arti
 - [Model Architecture](#model-architecture)
 - [Prompt and Output Format](#prompt-and-output-format)
 - [Integration Status](#integration-status)
+- [Evaluation and Baselines](#evaluation-and-baselines)
 - [Architecture](#architecture)
 - [Why Nullsec S1 exists](#why-nullsec-s1-exists)
 - [Core system components](#core-system-components)
@@ -205,6 +206,15 @@ This verifies that local public claims match the downloaded adapter, benchmark r
 
 See [`docs/PROMPT_FORMAT.md`](docs/PROMPT_FORMAT.md) for the canonical system prompt, user prompt shape, safe-code response format, JSON schema, taxonomy categories, and guidance for reducing false positives/false negatives.
 
+## Evaluation and Baselines
+
+Baseline comparison infrastructure lives in `benchmarks/baselines/` and is documented in [`docs/EVALS.md`](docs/EVALS.md). It supports reproducible comparisons against:
+
+- the base `Qwen/Qwen2.5-Coder-7B-Instruct` model without the Nullsec adapter;
+- Semgrep with local benchmark rules and explicit coverage limitations.
+
+No baseline numbers are listed here until reports are generated by the scripts. Do not hand-enter comparison metrics.
+
 ## Integration Status
 
 | Integration | Status |
 
@@ -41,6 +41,28 @@ python benchmarks/run_all.py --mode replay --replay captured.jsonl
 
 Reports are written to `benchmarks/reports/` (git-ignored). Every report carries a `provenance` block: model name, version, fingerprint, taxonomy version, run mode, adapter, dataset, and UTC timestamp — so any number can be traced to the exact Nullsec-1 release and dataset that produced it.
 
+## Baselines
+
+Baseline runners live under `benchmarks/baselines/`:
+
+```bash
+# Base model, no Nullsec adapter (GPU required)
+python benchmarks/baselines/base_qwen.py --mode model
+
+# Semgrep static-analysis baseline (CPU; requires semgrep)
+python -m pip install semgrep
+python benchmarks/baselines/semgrep_baseline.py
+
+# Markdown comparison from generated reports
+python benchmarks/compare_baselines.py \
+  --nullsec benchmarks/reports/SUITE.json \
+  --base benchmarks/reports/baselines/qwen2_5_coder_7b/SUITE.json \
+  --semgrep benchmarks/reports/baselines/semgrep/SUITE.json
+```
+
+Generated baseline reports are written under `benchmarks/reports/baselines/` and
+are not committed by default. See [`docs/EVALS.md`](../docs/EVALS.md).
+
 ## Datasets
 
 `datasets/detection.json` is the labeled corpus: each case has `code`, `expected_categories`, `expected_min_severity`, `expected_production_ready`, and (where applicable) an `owasp` tag. It is part of the initial seed and is meant to grow with the ingestion pipeline; benchmark strength scales with corpus breadth.
@@ -0,0 +1,5 @@
+"""Baseline runners for comparing Nullsec-S1 against other systems.
+
+Generated reports are written under benchmarks/reports/baselines/ and are not
+committed by default.
+"""
@@ -0,0 +1,46 @@
+#!/usr/bin/env python3
+"""Run the 111-case security benchmark against the base Qwen model.
+
+This baseline uses the same prompt, raw-output finalization, and metrics as
+Nullsec-S1, but loads Qwen/Qwen2.5-Coder-7B-Instruct without the Nullsec adapter.
+Malformed/non-JSON output is counted honestly as a miss.
+"""
+from __future__ import annotations
+
+import argparse
+import sys
+from pathlib import Path
+
+sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
+
+from benchmarks.baselines.common import write_baseline_report
+from benchmarks.harness import OutputProvider, load_dataset
+from benchmarks.runners._common import collect_verdicts
+from nullsec.core.version import TRAINING_BASE
+
+
+def main() -> int:
+    """Score the base Qwen model on the benchmark dataset and write a report.
+
+    Reads ``--mode``, ``--dataset`` and ``--replay`` from the command line,
+    loads the dataset, collects verdicts through an ``OutputProvider`` created
+    with ``adapter=None``, and passes them to ``write_baseline_report``.
+
+    Returns:
+        ``0`` after the report path has been printed.
+    """
+    parser = argparse.ArgumentParser(description="Base Qwen2.5-Coder benchmark baseline")
+    parser.add_argument("--mode", choices=["model", "replay"], default="model")
+    parser.add_argument("--dataset", default="detection.json")
+    parser.add_argument("--replay", default=None, help="captured raw outputs JSONL for replay mode")
+    opts = parser.parse_args()
+
+    dataset = load_dataset(opts.dataset)
+
+    # An omitted (or empty) --replay value is forwarded to the provider as None.
+    replay_path = Path(opts.replay) if opts.replay else None
+    provider = OutputProvider(mode=opts.mode, adapter=None, replay_path=replay_path)
+    verdicts = collect_verdicts(dataset, provider)
+
+    report_path = write_baseline_report(
+        system_id="qwen2_5_coder_7b",
+        system_name="Qwen2.5-Coder-7B-Instruct (base, no Nullsec adapter)",
+        dataset=opts.dataset,
+        cases=dataset["cases"],
+        verdicts=verdicts,
+        provenance={"run_mode": opts.mode, "base_model": TRAINING_BASE, "adapter": None},
+    )
+
+    print(f"base Qwen baseline report -> {report_path}")
+    print("Generated report is under benchmarks/reports/ and should not be committed unless explicitly approved.")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
@@ -0,0 +1,34 @@
+"""Semgrep rule-id to Nullsec category mapping for baseline evaluation.
+
+The mapping is intentionally conservative. Unsupported categories remain
+unsupported rather than being forced into misleading coverage.
+"""
+from __future__ import annotations
+
+# Local Semgrep rule id -> Nullsec taxonomy category. Several rules may map to
+# the same category (both dangerous-shell rules do).
+RULE_TO_CATEGORY = {
+    "nullsec.exposed-secret": "EXPOSED_SECRET",
+    "nullsec.sql-injection": "SQL_INJECTION",
+    "nullsec.xss-dangerously-set-html": "XSS",
+    "nullsec.ssrf-user-controlled-fetch": "SSRF",
+    "nullsec.command-injection-shell": "COMMAND_INJECTION",
+    "nullsec.dangerous-shell-curl-pipe": "DANGEROUS_SHELL_COMMAND",
+    "nullsec.dangerous-shell-chmod-777": "DANGEROUS_SHELL_COMMAND",
+    "nullsec.unsafe-admin-route": "UNSAFE_ADMIN_ROUTE",
+    "nullsec.missing-rate-limit": "MISSING_RATE_LIMIT",
+    "nullsec.unsafe-file-upload": "UNSAFE_FILE_UPLOAD",
+    "nullsec.dependency-risk-postinstall": "DEPENDENCY_RISK",
+    "nullsec.env-exposure": "ENVIRONMENT_EXPOSURE",
+}
+
+# De-duplicated, alphabetically sorted categories that at least one rule covers.
+SUPPORTED_CATEGORIES = sorted(set(RULE_TO_CATEGORY.values()))
+# Categories with no rule in RULE_TO_CATEGORY; listed so the gap stays explicit.
+PARTIAL_OR_UNSUPPORTED_CATEGORIES = [
+    "BROKEN_AUTH",
+    "MCP_TOOL_ABUSE",
+    "PROMPT_INJECTION",
+    "SMART_CONTRACT_RISK",
+    "WALLET_TRANSACTION_RISK",
+]
+
+
+def category_for_rule(rule_id: str) -> str | None:
+    """Return the Nullsec category for a Semgrep rule id, or ``None``.
+
+    An exact match against ``RULE_TO_CATEGORY`` is tried first. If that fails,
+    the id is accepted when it ends with ``"." + <known rule id>``, because
+    Semgrep may report a locally configured rule with the rule file's dotted
+    path in front of the id (for example
+    ``benchmarks.baselines.rules.nullsec.sql-injection``).
+
+    Args:
+        rule_id: Rule identifier as reported by Semgrep.
+
+    Returns:
+        The mapped category name, or ``None`` when the id is not a string or
+        does not correspond to any known rule.
+    """
+    if not isinstance(rule_id, str):
+        return None
+
+    category = RULE_TO_CATEGORY.get(rule_id)
+    if category is not None:
+        return category
+
+    # Require a "." boundary so an unrelated id that merely shares a tail
+    # (e.g. "xnullsec.sql-injection") is not mapped.
+    for known_id, known_category in RULE_TO_CATEGORY.items():
+        if rule_id.endswith("." + known_id):
+            return known_category
+    return None
@@ -0,0 +1,104 @@
+"""Shared baseline evaluation helpers.
+
+Baselines emit the same `verdicts` shape consumed by `benchmarks.metrics`:
+
+    dict[case_id] = (Verdict | None, aligned_bool)
+
+This lets base-model and tool baselines use the same scoring functions as
+Nullsec-S1 without changing labels or metric semantics.
+"""
+from __future__ import annotations
+
+import json
+from datetime import datetime, timezone
+from pathlib import Path
+
+from benchmarks.metrics import (
+    debug_report,
+    detection_over,
+    failed_case_ids,
+    false_safe_rate,
+    hallucination_rate,
+    owasp_coverage,
+    patch_correctness,
+    per_category_recall,
+    secure_generation_score,
+)
+
+# Repository root: this module lives at benchmarks/baselines/common.py, so the
+# root is parents[2] of the resolved file path.
+ROOT = Path(__file__).resolve().parents[2]
+# Directory under which write_baseline_report creates one sub-directory per system.
+REPORT_ROOT = ROOT / "benchmarks" / "reports" / "baselines"
+
+# Report section name -> metric function from benchmarks.metrics. build_suite
+# calls each one as fn(cases, verdicts) and stores the result under the section
+# name, iterating in the order written here.
+FAMILIES = {
+    "detection_accuracy": detection_over,
+    "false_safe_rate": false_safe_rate,
+    "hallucination_rate": hallucination_rate,
+    "owasp_coverage": owasp_coverage,
+    "patch_correctness": patch_correctness,
+    "secure_generation": secure_generation_score,
+    "per_category_recall": per_category_recall,
+    "failed_cases": failed_case_ids,
+}
+
+
+def build_suite(cases: list[dict], verdicts: dict, *, include_patch_metrics: bool = True) -> dict:
+    """Run every metric family over the verdicts and assemble the suite dict.
+
+    Args:
+        cases: Benchmark cases; each must have an ``"id"`` key.
+        verdicts: Mapping of case id to a ``(verdict, aligned)`` pair.
+        include_patch_metrics: When false, the ``patch_correctness`` and
+            ``secure_generation`` sections are not computed and are replaced
+            by a ``not_applicable`` placeholder.
+
+    Returns:
+        A dict with one entry per ``FAMILIES`` key, plus ``"debug"`` (the
+        failed-only debug report) and ``"summary"`` (headline values pulled
+        from the individual sections).
+    """
+    patch_sections = {"patch_correctness", "secure_generation"}
+    results = {}
+    for section, metric in FAMILIES.items():
+        if include_patch_metrics or section not in patch_sections:
+            results[section] = metric(cases, verdicts)
+        else:
+            results[section] = {
+                "not_applicable": True,
+                "note": "baseline does not generate secure patches",
+            }
+
+    results["debug"] = debug_report(cases, verdicts, failed_only=True)
+
+    # Count cases whose verdict pair has a truthy second element; a case with
+    # no entry in `verdicts` counts as (None, False).
+    flagged = [case for case in cases if verdicts.get(case["id"], (None, False))[1]]
+
+    detection = results["detection_accuracy"]
+    results["summary"] = {
+        "total_cases": len(cases),
+        "total_outputs": len(flagged),
+        "detection_f1": detection.get("detection_f1"),
+        "detection_precision": detection.get("detection_precision"),
+        "detection_recall": detection.get("detection_recall"),
+        "false_safe_rate": results["false_safe_rate"].get("false_safe_rate"),
+        "hallucination_rate": results["hallucination_rate"].get("hallucination_rate"),
+        # These two are None when the sections hold the not_applicable placeholder.
+        "patch_correctness": results["patch_correctness"].get("patch_correctness"),
+        "secure_generation_score": results["secure_generation"].get("secure_generation_score"),
+        "owasp_coverage": results["owasp_coverage"].get("coverage"),
+        "categories_below_full_recall": results["per_category_recall"].get("categories_below_full_recall"),
+        "failed_case_ids": [entry["id"] for entry in results["failed_cases"].get("failed", [])],
+    }
+    return results
+
+
+def write_baseline_report(
+    *,
+    system_id: str,
+    system_name: str,
+    dataset: str,
+    cases: list[dict],
+    verdicts: dict,
+    provenance: dict,
+    include_patch_metrics: bool = True,
+    coverage_limits: dict | None = None,
+) -> Path:
+    """Build the suite for one baseline system and write it to ``SUITE.json``.
+
+    Args:
+        system_id: Identifier used as the report sub-directory name under
+            ``REPORT_ROOT`` and recorded in the provenance block.
+        system_name: Human-readable system name stored in the report.
+        dataset: Dataset name recorded in the provenance block.
+        cases: Benchmark cases passed through to ``build_suite``.
+        verdicts: Per-case verdicts passed through to ``build_suite``.
+        provenance: Extra provenance fields. A key that matches a built-in
+            field (``system_id``, ``system_name``, ``dataset``,
+            ``generated_at``) replaces that field's value.
+        include_patch_metrics: Forwarded to ``build_suite``.
+        coverage_limits: Optional dict stored under ``"coverage_limits"`` in
+            the results; omitted when ``None`` or empty.
+
+    Returns:
+        Path of the written ``SUITE.json`` file.
+    """
+    results = build_suite(cases, verdicts, include_patch_metrics=include_patch_metrics)
+    if coverage_limits:
+        results["coverage_limits"] = coverage_limits
+
+    provenance_block = {
+        "system_id": system_id,
+        "system_name": system_name,
+        "dataset": dataset,
+        "generated_at": datetime.now(timezone.utc).isoformat(),
+    }
+    # Caller-supplied fields are merged last, so they win on key collisions.
+    provenance_block.update(provenance)
+
+    payload = {
+        "benchmark": "SUITE",
+        "system": system_name,
+        "provenance": provenance_block,
+        "results": results,
+    }
+
+    target = REPORT_ROOT / system_id / "SUITE.json"
+    target.parent.mkdir(parents=True, exist_ok=True)
+    target.write_text(json.dumps(payload, indent=2), encoding="utf-8")
+    return target