Skip to content

Commit 4284073

Browse files
committed
Normalize security findings taxonomy
1 parent c1a1b4e commit 4284073

4 files changed

Lines changed: 406 additions & 6 deletions

File tree

Lines changed: 311 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,311 @@
1+
"""Deterministic response normalization for common security sink confusions.
2+
3+
This layer runs after model/rule output is aligned and before API responses are
4+
returned. It does not replace model analysis; it corrects high-confidence sink
5+
classification mistakes and removes a few known false positives.
6+
"""
7+
from __future__ import annotations
8+
9+
import copy
10+
import re
11+
from typing import Any
12+
13+
Severity = str
14+
15+
_RISK_BY_SEVERITY = {
16+
"CRITICAL": 90,
17+
"HIGH": 70,
18+
"MEDIUM": 40,
19+
"LOW": 20,
20+
"INFO": 0,
21+
}
22+
_SEVERITY_ORDER = {"INFO": 0, "LOW": 1, "MEDIUM": 2, "HIGH": 3, "CRITICAL": 4}
23+
24+
_UNTRUSTED = r"(?:req\.(?:body|query|params)|params\b|request\.(?:body|query|params))"
25+
_FILESYSTEM_SINK_RE = re.compile(
26+
rf"(?:fs\.(?:readFileSync|readFile|createReadStream)\s*\(\s*{_UNTRUSTED}"
27+
rf"|(?:^|[^\w])open\s*\(\s*{_UNTRUSTED}"
28+
rf"|path\.(?:join|resolve|normalize)\s*\([^)]*{_UNTRUSTED})",
29+
re.IGNORECASE | re.DOTALL,
30+
)
31+
_EVAL_RCE_RE = re.compile(
32+
rf"(?:\beval\s*\(\s*{_UNTRUSTED}"
33+
rf"|\bnew\s+Function\s*\(\s*{_UNTRUSTED}"
34+
rf"|\bFunction\s*\(\s*{_UNTRUSTED}"
35+
rf"|\bvm\.runInNewContext\s*\(\s*{_UNTRUSTED})",
36+
re.IGNORECASE | re.DOTALL,
37+
)
38+
_COMMAND_INJECTION_RE = re.compile(
39+
rf"(?:\bexec(?:Sync)?\s*\(\s*{_UNTRUSTED}"
40+
rf"|\bspawn\s*\([^)]*{_UNTRUSTED}"
41+
rf"|\bchild_process\.(?:exec|execSync|spawn)\s*\([^)]*{_UNTRUSTED})",
42+
re.IGNORECASE | re.DOTALL,
43+
)
44+
_STATIC_EXEC_RE = re.compile(
45+
r"\bexec\s*\(\s*(['\"])(?P<cmd>(?:npm run build|git status))\1\s*\)",
46+
re.IGNORECASE,
47+
)
48+
_HARDCODED_SECRET_RE = re.compile(
49+
r"(?:sk-[A-Za-z0-9][A-Za-z0-9_\-]{8,}|ghp_[A-Za-z0-9_]{8,}|xoxb-[A-Za-z0-9\-]{8,}|-----BEGIN [A-Z ]*PRIVATE KEY-----)",
50+
re.IGNORECASE,
51+
)
52+
_ENV_SECRET_RE = re.compile(r"process\.env\.[A-Z0-9_]*(?:SECRET|TOKEN|API_KEY|PASSWORD|PRIVATE_KEY)[A-Z0-9_]*")
53+
_ENV_SECRET_RESPONSE_RE = re.compile(
54+
r"(?:res\.json|response\.json|Response\.json)\s*\([^)]*process\.env\.[A-Z0-9_]+",
55+
re.IGNORECASE | re.DOTALL,
56+
)
57+
_CLIENT_ENV_SECRET_RE = re.compile(
58+
r"(?:['\"]use client['\"]|NEXT_PUBLIC_|PUBLIC_)[\s\S]{0,200}process\.env\.[A-Z0-9_]+",
59+
re.IGNORECASE,
60+
)
61+
_SQLI_RE = re.compile(
62+
rf"(?:\b(?:db\.)?(?:query|execute|raw)\s*\(\s*`[^`]*\$\{{\s*{_UNTRUSTED}[\s\S]*?`"
63+
rf"|\b(?:db\.)?(?:query|execute|raw)\s*\(\s*['\"][^'\"]*(?:SELECT|UPDATE|DELETE|INSERT)[^'\"]*['\"]\s*\+[^)]*{_UNTRUSTED})",
64+
re.IGNORECASE | re.DOTALL,
65+
)
66+
_XSS_RE = re.compile(r"dangerouslySetInnerHTML\s*=\s*\{\{[^}]*__html\s*:\s*req\.", re.IGNORECASE | re.DOTALL)
67+
_SSRF_RE = re.compile(r"\bfetch\s*\(\s*(?:req\.(?:body|query|params)|params\b)", re.IGNORECASE | re.DOTALL)
68+
69+
70+
def _line_for(content: str, index: int) -> int:
71+
return content[:index].count("\n") + 1
72+
73+
74+
def _finding(
75+
*,
76+
title: str,
77+
category: str,
78+
severity: Severity,
79+
cwe: str,
80+
file: str,
81+
line: int,
82+
description: str,
83+
recommendation: str,
84+
) -> dict[str, Any]:
85+
return {
86+
"category": category,
87+
"title": title,
88+
"severity": severity,
89+
"confidence": "HIGH",
90+
"file": file,
91+
"line": line,
92+
"description": description,
93+
"cwe": cwe,
94+
"exploit_scenario": description,
95+
"recommended_fix": recommendation,
96+
"recommendation": recommendation,
97+
"secure_patch": "",
98+
}
99+
100+
101+
def _rule_findings(file_content: str, filename: str) -> list[dict[str, Any]]:
    """Run every deterministic sink rule against ``file_content``.

    Each rule contributes at most one finding, located at the line of the
    first regex match. The static-exec rule is only consulted when no
    user-controlled command execution was found.

    Args:
        file_content: Source text to scan.
        filename: Value stored in each finding's ``file`` field.

    Returns:
        Findings in fixed rule order: path traversal, code execution, command
        execution (or static exec), exposed secret, SQL injection, XSS, SSRF.
    """
    findings: list[dict[str, Any]] = []

    if match := _FILESYSTEM_SINK_RE.search(file_content):
        findings.append(
            _finding(
                title="PATH_TRAVERSAL",
                category="UNSAFE_FILE_UPLOAD",
                severity="HIGH",
                cwe="CWE-22",
                file=filename,
                line=_line_for(file_content, match.start()),
                description="Untrusted user-controlled file path reaches filesystem access.",
                recommendation="Normalize and restrict paths to an allowed base directory; reject ../ and absolute paths.",
            )
        )

    if match := _EVAL_RCE_RE.search(file_content):
        findings.append(
            _finding(
                title="REMOTE_CODE_EXECUTION",
                category="COMMAND_INJECTION",
                severity="CRITICAL",
                cwe="CWE-94",
                file=filename,
                line=_line_for(file_content, match.start()),
                description="User-controlled input reaches dynamic code execution.",
                recommendation="Never execute user-controlled code. Use a sandboxed allowlisted interpreter if absolutely required.",
            )
        )

    if match := _COMMAND_INJECTION_RE.search(file_content):
        findings.append(
            _finding(
                title="COMMAND_INJECTION",
                category="COMMAND_INJECTION",
                severity="CRITICAL",
                cwe="CWE-78",
                file=filename,
                line=_line_for(file_content, match.start()),
                description="User-controlled input reaches OS command execution.",
                recommendation="Do not pass user-controlled input to shell commands; use fixed commands with validated argument arrays.",
            )
        )
    elif match := _STATIC_EXEC_RE.search(file_content):
        findings.append(
            _finding(
                title="UNSAFE_PROCESS_EXECUTION",
                category="DANGEROUS_SHELL_COMMAND",
                severity="LOW",
                cwe="CWE-78",
                file=filename,
                line=_line_for(file_content, match.start()),
                description="Static shell command execution detected. Lower risk because no user-controlled input is used, but prefer spawnFile/spawn with arg arrays.",
                recommendation="Prefer spawnFile/spawn with fixed executable and argument arrays; avoid shell parsing where possible.",
            )
        )

    # Evaluate the secret patterns once, in priority order; the first one that
    # matches supplies the reported line (match objects are always truthy).
    if match := (
        _HARDCODED_SECRET_RE.search(file_content)
        or _ENV_SECRET_RESPONSE_RE.search(file_content)
        or _CLIENT_ENV_SECRET_RE.search(file_content)
    ):
        findings.append(
            _finding(
                title="EXPOSED_SECRET",
                category="EXPOSED_SECRET",
                severity="CRITICAL",
                cwe="CWE-798",
                file=filename,
                line=_line_for(file_content, match.start()),
                description="A secret or environment credential is exposed to client-visible output.",
                recommendation="Keep secrets server-side only; never return secret environment values to clients.",
            )
        )

    if match := _SQLI_RE.search(file_content):
        findings.append(
            _finding(
                title="SQL_INJECTION",
                category="SQL_INJECTION",
                severity="CRITICAL",
                cwe="CWE-89",
                file=filename,
                line=_line_for(file_content, match.start()),
                description="User-controlled input is interpolated into a SQL query.",
                recommendation="Use parameterized queries or query builders that bind values separately from SQL text.",
            )
        )

    if match := _XSS_RE.search(file_content):
        findings.append(
            _finding(
                title="XSS",
                category="XSS",
                severity="HIGH",
                cwe="CWE-79",
                file=filename,
                line=_line_for(file_content, match.start()),
                description="User-controlled HTML reaches dangerouslySetInnerHTML.",
                recommendation="Avoid raw HTML sinks; sanitize with a trusted sanitizer and prefer escaped rendering.",
            )
        )

    if match := _SSRF_RE.search(file_content):
        findings.append(
            _finding(
                title="SSRF",
                category="SSRF",
                severity="HIGH",
                cwe="CWE-918",
                file=filename,
                line=_line_for(file_content, match.start()),
                description="User-controlled URL reaches a server-side fetch call.",
                recommendation="Allowlist outbound destinations and reject private, loopback, and metadata IP ranges.",
            )
        )

    return findings
217+
218+
219+
def _signature(finding: dict[str, Any]) -> tuple[str, str, int | None]:
220+
label = str(finding.get("title") or finding.get("category") or "")
221+
return label, str(finding.get("file") or ""), finding.get("line")
222+
223+
224+
def _is_env_secret_false_positive(finding: dict[str, Any], file_content: str) -> bool:
    """Tell whether an EXPOSED_SECRET finding is only a server-side env read.

    True only when the finding's label (title, else category) is
    ``EXPOSED_SECRET``, the file reads a secret-looking ``process.env``
    variable, and none of the hard-coded, response-exposure or client-exposure
    patterns match.
    """
    raw_label = finding.get("title") or finding.get("category") or ""
    is_secret_finding = str(raw_label).upper() == "EXPOSED_SECRET"
    if not is_secret_finding or not _ENV_SECRET_RE.search(file_content):
        return False

    exposure_patterns = (
        _HARDCODED_SECRET_RE,
        _ENV_SECRET_RESPONSE_RE,
        _CLIENT_ENV_SECRET_RE,
    )
    return not any(pattern.search(file_content) for pattern in exposure_patterns)
235+
236+
237+
def normalizeFinding(finding: dict[str, Any], fileContent: str, filename: str = "input") -> dict[str, Any] | None:  # noqa: N802
    """Normalize one model finding against file content.

    Returns None when a finding is a deterministic false positive, currently for
    env-secret reads that are not exposed or hardcoded.

    Otherwise the finding is deep-copied, given default ``title`` and
    ``recommendation`` fields, and compared with the deterministic rule
    findings for the same file. A rule whose title equals the finding's label
    replaces it; failing that, a rule the label is commonly confused with
    replaces it. With no applicable rule the normalized copy is returned.

    Args:
        finding: Model-produced finding; not mutated.
        fileContent: Source text the finding refers to.
        filename: Fallback file name when the finding has no ``file`` value.
    """
    if _is_env_secret_false_positive(finding, fileContent):
        return None

    normalized = copy.deepcopy(finding)
    normalized.setdefault("title", normalized.get("category", "Security finding"))
    normalized.setdefault("recommendation", normalized.get("recommended_fix", ""))

    rule_findings = _rule_findings(fileContent, str(normalized.get("file") or filename))
    if not rule_findings:
        return normalized

    current_label = str(normalized.get("title") or normalized.get("category") or "").upper()

    # An exact label match always wins. Checking aliases in the same pass would
    # let an earlier rule (e.g. REMOTE_CODE_EXECUTION) replace a finding whose
    # own rule (e.g. COMMAND_INJECTION) also fired on this file.
    for rule in rule_findings:
        if rule["title"] == current_label:
            return rule

    # Rule titles that the current label is commonly mistaken for.
    confused_with: set[str] = set()
    if current_label in {"DANGEROUS_SHELL_COMMAND", "UNSAFE_FILE_UPLOAD"}:
        confused_with.add("PATH_TRAVERSAL")
    if current_label in {"COMMAND_INJECTION", "DANGEROUS_SHELL_COMMAND"}:
        confused_with.update({"REMOTE_CODE_EXECUTION", "UNSAFE_PROCESS_EXECUTION"})

    # Keep rule order so the first applicable correction is chosen.
    for rule in rule_findings:
        if rule["title"] in confused_with:
            return rule
    return normalized
264+
265+
266+
def _dedupe(findings: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Drop findings whose signature repeats an earlier one.

    The first finding for each ``_signature`` value is kept and the original
    relative order is preserved.
    """
    first_by_signature: dict[tuple[str, str, int | None], dict[str, Any]] = {}
    for candidate in findings:
        # setdefault stores only the first finding seen for a given signature.
        first_by_signature.setdefault(_signature(candidate), candidate)
    return list(first_by_signature.values())
276+
277+
278+
def _recalculate_risk(payload: dict[str, Any]) -> None:
    """Recompute payload-level severity, risk score and readiness in place.

    With no findings the payload is marked ``INFO`` with risk 0 and production
    ready, and any ``_safety_layer`` dict is cleared of blocking reasons.
    Otherwise severity and risk score follow the most severe finding, and a
    HIGH or CRITICAL finding marks the payload (and ``_safety_layer``, when it
    is a dict) as not production ready. Lower severities leave the existing
    ``production_ready`` values untouched.

    Args:
        payload: Verdict payload to update; mutated in place.
    """
    findings = payload.get("findings") or []
    safety_layer = payload.get("_safety_layer")

    if not findings:
        payload["risk_score"] = 0
        payload["severity"] = "INFO"
        payload["production_ready"] = True
        if isinstance(safety_layer, dict):
            safety_layer["production_ready"] = True
            safety_layer["blocking_reasons"] = []
        return

    def _known_severity(finding: dict[str, Any]) -> str:
        # Missing, None or unrecognised severities count as INFO so that a
        # label such as "NONE" or "BOGUS" can never become the payload severity.
        label = str(finding.get("severity") or "INFO").strip().upper()
        return label if label in _SEVERITY_ORDER else "INFO"

    max_sev = max((_known_severity(f) for f in findings), key=_SEVERITY_ORDER.__getitem__)
    payload["severity"] = max_sev
    payload["risk_score"] = _RISK_BY_SEVERITY.get(max_sev, 0)
    if _SEVERITY_ORDER[max_sev] >= _SEVERITY_ORDER["HIGH"]:
        payload["production_ready"] = False
        if isinstance(safety_layer, dict):
            safety_layer["production_ready"] = False
296+
297+
298+
def normalize_verdict_payload(payload: dict[str, Any], file_content: str, filename: str = "input") -> dict[str, Any]:
    """Return a normalized deep copy of a verdict payload.

    Each existing finding goes through ``normalizeFinding`` (findings it
    rejects are dropped), the deterministic rule findings for the file are
    appended, duplicates are removed, and ``affected_files`` plus the risk
    fields are recomputed. The input payload is not modified.
    """
    result = copy.deepcopy(payload)

    kept = [
        cleaned
        for raw in (result.get("findings") or [])
        if (cleaned := normalizeFinding(raw, file_content, filename)) is not None
    ]
    rule_based = _rule_findings(file_content, filename)
    merged = _dedupe(kept + rule_based)

    result["findings"] = merged
    result["affected_files"] = sorted({str(item.get("file") or filename) for item in merged})
    _recalculate_risk(result)
    return result

serving/server.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
model_info,
4545
)
4646
from nullsec.safety import VerdictParseError
47+
from nullsec.safety.finding_normalization import normalize_verdict_payload
4748

4849
logging.basicConfig(level=os.environ.get("NULLSEC_LOG_LEVEL", "INFO"))
4950
LOGGER = logging.getLogger("nullsec.s1")
@@ -263,7 +264,9 @@ def _finding_from_verdict(finding: dict, fallback_file: str) -> dict:
263264
severity = str(finding.get("severity", "INFO")).lower()
264265
return {
265266
"severity": severity,
266-
"title": finding.get("category", "Security finding"),
267+
"title": finding.get("title") or finding.get("category", "Security finding"),
268+
"category": finding.get("category", "Security finding"),
269+
"cwe": finding.get("cwe"),
267270
"file": finding.get("file") or fallback_file,
268271
"line": finding.get("line"),
269272
"description": finding.get("description") or finding.get("summary") or "",
@@ -307,7 +310,8 @@ def analyze(req: AnalyzeRequest) -> dict:
307310
raise _load_error_response(e)
308311
except VerdictParseError as e:
309312
raise HTTPException(status_code=502, detail=f"Nullsec-1 output could not be aligned: {e}")
310-
return {"ok": True, "verdict": json.loads(pipeline.to_json(result))}
313+
verdict = normalize_verdict_payload(json.loads(pipeline.to_json(result)), req.code, req.filename)
314+
return {"ok": True, "verdict": verdict}
311315

312316

313317
@app.post("/v1/arena/raw")
@@ -374,7 +378,7 @@ def scan(req: RepoScanRequest) -> dict:
374378
try:
375379
for scan_file in req.files:
376380
result = pipeline.analyze(scan_file.path, scan_file.content, scan_file.language or "")
377-
verdict = json.loads(pipeline.to_json(result))
381+
verdict = normalize_verdict_payload(json.loads(pipeline.to_json(result)), scan_file.content, scan_file.path)
378382
max_risk = max(max_risk, int(verdict.get("risk_score", 0)))
379383
for finding in verdict.get("findings", []):
380384
findings.append(_finding_from_verdict(finding, scan_file.path))
@@ -431,7 +435,7 @@ def event_gen():
431435
raw = "".join(buffer)
432436
try:
433437
result = pipeline.finalize(raw)
434-
final = json.loads(pipeline.to_json(result))
438+
final = normalize_verdict_payload(json.loads(pipeline.to_json(result)), req.code, req.filename)
435439
yield f"event: verdict\ndata: {json.dumps(final)}\n\n"
436440
except VerdictParseError as e:
437441
yield f"event: error\ndata: {json.dumps({'error': str(e)})}\n\n"

0 commit comments

Comments
 (0)