|
| 1 | +"""Deterministic response normalization for common security sink confusions. |
| 2 | +
|
| 3 | +This layer runs after model/rule output is aligned and before API responses are |
| 4 | +returned. It does not replace model analysis; it corrects high-confidence sink |
| 5 | +classification mistakes and removes a few known false positives. |
| 6 | +""" |
| 7 | +from __future__ import annotations |
| 8 | + |
| 9 | +import copy |
| 10 | +import re |
| 11 | +from typing import Any |
| 12 | + |
| 13 | +Severity = str |
| 14 | + |
| 15 | +_RISK_BY_SEVERITY = { |
| 16 | + "CRITICAL": 90, |
| 17 | + "HIGH": 70, |
| 18 | + "MEDIUM": 40, |
| 19 | + "LOW": 20, |
| 20 | + "INFO": 0, |
| 21 | +} |
| 22 | +_SEVERITY_ORDER = {"INFO": 0, "LOW": 1, "MEDIUM": 2, "HIGH": 3, "CRITICAL": 4} |
| 23 | + |
| 24 | +_UNTRUSTED = r"(?:req\.(?:body|query|params)|params\b|request\.(?:body|query|params))" |
| 25 | +_FILESYSTEM_SINK_RE = re.compile( |
| 26 | + rf"(?:fs\.(?:readFileSync|readFile|createReadStream)\s*\(\s*{_UNTRUSTED}" |
| 27 | + rf"|(?:^|[^\w])open\s*\(\s*{_UNTRUSTED}" |
| 28 | + rf"|path\.(?:join|resolve|normalize)\s*\([^)]*{_UNTRUSTED})", |
| 29 | + re.IGNORECASE | re.DOTALL, |
| 30 | +) |
| 31 | +_EVAL_RCE_RE = re.compile( |
| 32 | + rf"(?:\beval\s*\(\s*{_UNTRUSTED}" |
| 33 | + rf"|\bnew\s+Function\s*\(\s*{_UNTRUSTED}" |
| 34 | + rf"|\bFunction\s*\(\s*{_UNTRUSTED}" |
| 35 | + rf"|\bvm\.runInNewContext\s*\(\s*{_UNTRUSTED})", |
| 36 | + re.IGNORECASE | re.DOTALL, |
| 37 | +) |
| 38 | +_COMMAND_INJECTION_RE = re.compile( |
| 39 | + rf"(?:\bexec(?:Sync)?\s*\(\s*{_UNTRUSTED}" |
| 40 | + rf"|\bspawn\s*\([^)]*{_UNTRUSTED}" |
| 41 | + rf"|\bchild_process\.(?:exec|execSync|spawn)\s*\([^)]*{_UNTRUSTED})", |
| 42 | + re.IGNORECASE | re.DOTALL, |
| 43 | +) |
| 44 | +_STATIC_EXEC_RE = re.compile( |
| 45 | + r"\bexec\s*\(\s*(['\"])(?P<cmd>(?:npm run build|git status))\1\s*\)", |
| 46 | + re.IGNORECASE, |
| 47 | +) |
| 48 | +_HARDCODED_SECRET_RE = re.compile( |
| 49 | + r"(?:sk-[A-Za-z0-9][A-Za-z0-9_\-]{8,}|ghp_[A-Za-z0-9_]{8,}|xoxb-[A-Za-z0-9\-]{8,}|-----BEGIN [A-Z ]*PRIVATE KEY-----)", |
| 50 | + re.IGNORECASE, |
| 51 | +) |
| 52 | +_ENV_SECRET_RE = re.compile(r"process\.env\.[A-Z0-9_]*(?:SECRET|TOKEN|API_KEY|PASSWORD|PRIVATE_KEY)[A-Z0-9_]*") |
| 53 | +_ENV_SECRET_RESPONSE_RE = re.compile( |
| 54 | + r"(?:res\.json|response\.json|Response\.json)\s*\([^)]*process\.env\.[A-Z0-9_]+", |
| 55 | + re.IGNORECASE | re.DOTALL, |
| 56 | +) |
| 57 | +_CLIENT_ENV_SECRET_RE = re.compile( |
| 58 | + r"(?:['\"]use client['\"]|NEXT_PUBLIC_|PUBLIC_)[\s\S]{0,200}process\.env\.[A-Z0-9_]+", |
| 59 | + re.IGNORECASE, |
| 60 | +) |
| 61 | +_SQLI_RE = re.compile( |
| 62 | + rf"(?:\b(?:db\.)?(?:query|execute|raw)\s*\(\s*`[^`]*\$\{{\s*{_UNTRUSTED}[\s\S]*?`" |
| 63 | + rf"|\b(?:db\.)?(?:query|execute|raw)\s*\(\s*['\"][^'\"]*(?:SELECT|UPDATE|DELETE|INSERT)[^'\"]*['\"]\s*\+[^)]*{_UNTRUSTED})", |
| 64 | + re.IGNORECASE | re.DOTALL, |
| 65 | +) |
| 66 | +_XSS_RE = re.compile(r"dangerouslySetInnerHTML\s*=\s*\{\{[^}]*__html\s*:\s*req\.", re.IGNORECASE | re.DOTALL) |
| 67 | +_SSRF_RE = re.compile(r"\bfetch\s*\(\s*(?:req\.(?:body|query|params)|params\b)", re.IGNORECASE | re.DOTALL) |
| 68 | + |
| 69 | + |
| 70 | +def _line_for(content: str, index: int) -> int: |
| 71 | + return content[:index].count("\n") + 1 |
| 72 | + |
| 73 | + |
| 74 | +def _finding( |
| 75 | + *, |
| 76 | + title: str, |
| 77 | + category: str, |
| 78 | + severity: Severity, |
| 79 | + cwe: str, |
| 80 | + file: str, |
| 81 | + line: int, |
| 82 | + description: str, |
| 83 | + recommendation: str, |
| 84 | +) -> dict[str, Any]: |
| 85 | + return { |
| 86 | + "category": category, |
| 87 | + "title": title, |
| 88 | + "severity": severity, |
| 89 | + "confidence": "HIGH", |
| 90 | + "file": file, |
| 91 | + "line": line, |
| 92 | + "description": description, |
| 93 | + "cwe": cwe, |
| 94 | + "exploit_scenario": description, |
| 95 | + "recommended_fix": recommendation, |
| 96 | + "recommendation": recommendation, |
| 97 | + "secure_patch": "", |
| 98 | + } |
| 99 | + |
| 100 | + |
def _rule_findings(file_content: str, filename: str) -> list[dict[str, Any]]:
    """Run every deterministic sink rule against *file_content*.

    Each rule contributes at most one finding, anchored at the line of its
    first regex match. Findings are returned in a fixed rule order: path
    traversal, code execution, command execution, exposed secret, SQL
    injection, XSS, SSRF.

    Args:
        file_content: Source text to scan.
        filename: Value stored in each finding's ``file`` field.

    Returns:
        A list of finding dicts built by ``_finding``; empty when no rule fires.
    """
    findings: list[dict[str, Any]] = []

    def report(match: re.Match[str], **fields: str) -> None:
        # Every rule reports the same way: locate the match, then append.
        findings.append(
            _finding(
                file=filename,
                line=_line_for(file_content, match.start()),
                **fields,
            )
        )

    if match := _FILESYSTEM_SINK_RE.search(file_content):
        report(
            match,
            title="PATH_TRAVERSAL",
            category="UNSAFE_FILE_UPLOAD",
            severity="HIGH",
            cwe="CWE-22",
            description="Untrusted user-controlled file path reaches filesystem access.",
            recommendation="Normalize and restrict paths to an allowed base directory; reject ../ and absolute paths.",
        )

    if match := _EVAL_RCE_RE.search(file_content):
        report(
            match,
            title="REMOTE_CODE_EXECUTION",
            category="COMMAND_INJECTION",
            severity="CRITICAL",
            cwe="CWE-94",
            description="User-controlled input reaches dynamic code execution.",
            recommendation="Never execute user-controlled code. Use a sandboxed allowlisted interpreter if absolutely required.",
        )

    # A static exec is only reported when no tainted command execution exists.
    if match := _COMMAND_INJECTION_RE.search(file_content):
        report(
            match,
            title="COMMAND_INJECTION",
            category="COMMAND_INJECTION",
            severity="CRITICAL",
            cwe="CWE-78",
            description="User-controlled input reaches OS command execution.",
            recommendation="Do not pass user-controlled input to shell commands; use fixed commands with validated argument arrays.",
        )
    elif match := _STATIC_EXEC_RE.search(file_content):
        report(
            match,
            title="UNSAFE_PROCESS_EXECUTION",
            category="DANGEROUS_SHELL_COMMAND",
            severity="LOW",
            cwe="CWE-78",
            description="Static shell command execution detected. Lower risk because no user-controlled input is used, but prefer spawnFile/spawn with arg arrays.",
            recommendation="Prefer spawnFile/spawn with fixed executable and argument arrays; avoid shell parsing where possible.",
        )

    # Search each secret pattern once; the first pattern that hits (hardcoded,
    # then response leak, then client-side read) supplies the reported line.
    if match := (
        _HARDCODED_SECRET_RE.search(file_content)
        or _ENV_SECRET_RESPONSE_RE.search(file_content)
        or _CLIENT_ENV_SECRET_RE.search(file_content)
    ):
        report(
            match,
            title="EXPOSED_SECRET",
            category="EXPOSED_SECRET",
            severity="CRITICAL",
            cwe="CWE-798",
            description="A secret or environment credential is exposed to client-visible output.",
            recommendation="Keep secrets server-side only; never return secret environment values to clients.",
        )

    if match := _SQLI_RE.search(file_content):
        report(
            match,
            title="SQL_INJECTION",
            category="SQL_INJECTION",
            severity="CRITICAL",
            cwe="CWE-89",
            description="User-controlled input is interpolated into a SQL query.",
            recommendation="Use parameterized queries or query builders that bind values separately from SQL text.",
        )

    if match := _XSS_RE.search(file_content):
        report(
            match,
            title="XSS",
            category="XSS",
            severity="HIGH",
            cwe="CWE-79",
            description="User-controlled HTML reaches dangerouslySetInnerHTML.",
            recommendation="Avoid raw HTML sinks; sanitize with a trusted sanitizer and prefer escaped rendering.",
        )

    if match := _SSRF_RE.search(file_content):
        report(
            match,
            title="SSRF",
            category="SSRF",
            severity="HIGH",
            cwe="CWE-918",
            description="User-controlled URL reaches a server-side fetch call.",
            recommendation="Allowlist outbound destinations and reject private, loopback, and metadata IP ranges.",
        )

    return findings
| 217 | + |
| 218 | + |
| 219 | +def _signature(finding: dict[str, Any]) -> tuple[str, str, int | None]: |
| 220 | + label = str(finding.get("title") or finding.get("category") or "") |
| 221 | + return label, str(finding.get("file") or ""), finding.get("line") |
| 222 | + |
| 223 | + |
def _is_env_secret_false_positive(finding: dict[str, Any], file_content: str) -> bool:
    """Tell whether an EXPOSED_SECRET finding only reflects a plain env read.

    Returns True when the finding's label (title, else category) is
    ``EXPOSED_SECRET``, the file reads a secret-looking ``process.env``
    variable, and none of the exposure patterns (hardcoded secret, secret in
    a JSON response, client-side env read) match the file.
    """
    raw_label = finding.get("title") or finding.get("category") or ""
    if str(raw_label).upper() != "EXPOSED_SECRET":
        return False

    if _ENV_SECRET_RE.search(file_content) is None:
        return False

    exposure_patterns = (
        _HARDCODED_SECRET_RE,
        _ENV_SECRET_RESPONSE_RE,
        _CLIENT_ENV_SECRET_RE,
    )
    # Any real exposure in the file means the finding must be kept.
    return all(pattern.search(file_content) is None for pattern in exposure_patterns)
| 235 | + |
| 236 | + |
def normalizeFinding(finding: dict[str, Any], fileContent: str, filename: str = "input") -> dict[str, Any] | None:  # noqa: N802
    """Normalize one model finding against file content.

    The input finding is never mutated; a deep copy is normalized instead.

    Args:
        finding: Finding dict produced by the model or an upstream rule.
        fileContent: Source text the finding refers to.
        filename: Fallback file name used when the finding has no ``file``.

    Returns:
        ``None`` when the finding is a deterministic false positive (currently
        env-secret reads that are neither exposed nor hardcoded). Otherwise
        the deterministic rule finding that matches the finding's label, or
        failing that the copied finding with ``title`` and ``recommendation``
        defaults filled in.
    """
    if _is_env_secret_false_positive(finding, fileContent):
        return None

    normalized = copy.deepcopy(finding)
    normalized.setdefault("title", normalized.get("category", "Security finding"))
    normalized.setdefault("recommendation", normalized.get("recommended_fix", ""))

    rule_findings = _rule_findings(fileContent, str(normalized.get("file") or filename))
    if not rule_findings:
        return normalized

    current_label = str(normalized.get("title") or normalized.get("category") or "").upper()

    # An exact label match wins over any alias remap. Otherwise a correctly
    # labelled COMMAND_INJECTION finding would be rewritten to
    # REMOTE_CODE_EXECUTION whenever both sinks are present, just because the
    # eval rule is listed first.
    for rule in rule_findings:
        if rule["title"] == current_label:
            return rule

    # Labels the model commonly confuses with the deterministic sink titles.
    aliases: set[str] = set()
    if current_label in {"DANGEROUS_SHELL_COMMAND", "UNSAFE_FILE_UPLOAD"}:
        aliases.add("PATH_TRAVERSAL")
    if current_label in {"COMMAND_INJECTION", "DANGEROUS_SHELL_COMMAND"}:
        aliases.update(("REMOTE_CODE_EXECUTION", "UNSAFE_PROCESS_EXECUTION"))

    # Among alias candidates, the first rule in rule order is used.
    for rule in rule_findings:
        if rule["title"] in aliases:
            return rule
    return normalized
| 264 | + |
| 265 | + |
def _dedupe(findings: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Drop findings whose signature repeats an earlier one, keeping order.

    Two findings are duplicates when ``_signature`` returns the same
    ``(label, file, line)`` tuple; the first occurrence is the one kept.
    """
    first_by_signature: dict[tuple[str, str, int | None], dict[str, Any]] = {}
    for candidate in findings:
        # setdefault only stores the first finding seen for each signature.
        first_by_signature.setdefault(_signature(candidate), candidate)
    return list(first_by_signature.values())
| 276 | + |
| 277 | + |
def _recalculate_risk(payload: dict[str, Any]) -> None:
    """Recompute payload-level severity and risk score from its findings, in place.

    With no findings the payload is marked ``INFO`` / risk 0 / production
    ready, and a dict ``_safety_layer`` gets ``production_ready=True`` and an
    empty ``blocking_reasons`` list. Otherwise severity and risk score follow
    the most severe finding, and a HIGH or CRITICAL result sets
    ``production_ready`` to False on the payload and on a dict
    ``_safety_layer``.
    """
    safety_layer = payload.get("_safety_layer")
    findings = payload.get("findings") or []

    if not findings:
        payload.update(risk_score=0, severity="INFO", production_ready=True)
        if isinstance(safety_layer, dict):
            safety_layer.update(production_ready=True, blocking_reasons=[])
        return

    severities = [str(item.get("severity", "INFO")).upper() for item in findings]
    # Unknown severity names rank lowest (0) and score 0.
    worst = max(severities, key=lambda name: _SEVERITY_ORDER.get(name, 0))
    payload["severity"] = worst
    payload["risk_score"] = _RISK_BY_SEVERITY.get(worst, 0)

    # Below HIGH, any existing production_ready value is deliberately left alone.
    if _SEVERITY_ORDER.get(worst, 0) < _SEVERITY_ORDER["HIGH"]:
        return

    payload["production_ready"] = False
    if isinstance(safety_layer, dict):
        safety_layer["production_ready"] = False
| 297 | + |
def normalize_verdict_payload(payload: dict[str, Any], file_content: str, filename: str = "input") -> dict[str, Any]:
    """Return a normalized copy of *payload* with risk fields recalculated.

    The input payload is not modified. Each existing finding goes through
    ``normalizeFinding`` (false positives are dropped), the deterministic rule
    findings for *file_content* are appended, duplicates are removed, and
    ``affected_files`` plus the risk fields are rebuilt from the result.
    """
    result = copy.deepcopy(payload)

    model_findings = [
        cleaned
        for raw in result.get("findings") or []
        if (cleaned := normalizeFinding(raw, file_content, filename)) is not None
    ]
    rule_based = _rule_findings(file_content, filename)

    merged = _dedupe(model_findings + rule_based)
    result["findings"] = merged
    # Findings without a file are attributed to the fallback filename.
    result["affected_files"] = sorted({str(item.get("file") or filename) for item in merged})

    _recalculate_risk(result)
    return result
0 commit comments