feat(eval): add error-handling and footgun fixture packs

haasonsaas · haasonsaas · commit 4b7bcfd7ea7c · 2026-03-14T14:14:26.000-07:00
Add deep review benchmark packs for error-handling regressions and language-specific semantic traps, plus checked-in loader tests that assert the new bug.error-handling.* and bug.lang.* rule IDs.
diff --git a/eval/fixtures/deep_review_suite/review_depth_error_handling.json b/eval/fixtures/deep_review_suite/review_depth_error_handling.json
@@ -0,0 +1,164 @@
+{
+  "name": "review-depth-error-handling",
+  "author": "diffscope",
+  "version": "1.0.0",
+  "description": "Error-handling and panic-safety benchmark pack for deeper live review runs.",
+  "languages": [
+    "rust",
+    "go",
+    "python",
+    "typescript"
+  ],
+  "categories": [
+    "bug"
+  ],
+  "thresholds": {
+    "min_precision": 0.45,
+    "min_recall": 0.35,
+    "min_f1": 0.4,
+    "max_false_positive_rate": 0.45,
+    "min_weighted_score": 0.45
+  },
+  "metadata": {
+    "purpose": "live-regression-eval",
+    "family": "review-depth"
+  },
+  "fixtures": [
+    {
+      "name": "rust-unwrap-in-request-handler",
+      "category": "bug",
+      "language": "rust",
+      "difficulty": "Easy",
+      "diff_content": "diff --git a/src/http.rs b/src/http.rs\nindex 1111111..2222222 100644\n--- a/src/http.rs\n+++ b/src/http.rs\n@@ -1,4 +1,4 @@\n async fn get_user(Query(params): Query<HashMap<String, String>>) -> Result<Json<User>, StatusCode> {\n-    let user_id = params.get(\"user_id\").and_then(|value| value.parse::<u64>().ok()).ok_or(StatusCode::BAD_REQUEST)?;\n+    let user_id = params[\"user_id\"].parse::<u64>().unwrap();\n     Ok(Json(load_user(user_id).await?))\n }\n",
+      "expected_findings": [
+        {
+          "description": "Request handler unwraps user-controlled input and can panic instead of returning a bad-request error.",
+          "severity": "Warning",
+          "category": "Bug",
+          "file_pattern": "src/http.rs",
+          "line_hint": 2,
+          "contains_any": [
+            "unwrap on user input",
+            "panic in request handler",
+            "bad request should be handled gracefully",
+            "parse::<u64>().unwrap",
+            "user-controlled input can panic"
+          ],
+          "rule_id": "bug.error-handling.unwrap-request"
+        }
+      ],
+      "negative_findings": [
+        {
+          "description": "Avoid style-only comments.",
+          "contains": "style"
+        }
+      ],
+      "min_total": 1,
+      "max_total": 8,
+      "description": "Request paths should return validation failures instead of panicking on malformed user input.",
+      "source": "deep-review-suite"
+    },
+    {
+      "name": "go-ignored-error-return",
+      "category": "bug",
+      "language": "go",
+      "difficulty": "Easy",
+      "diff_content": "diff --git a/internal/store/users.go b/internal/store/users.go\nindex 1111111..2222222 100644\n--- a/internal/store/users.go\n+++ b/internal/store/users.go\n@@ -1,6 +1,4 @@\n func DeleteUser(ctx context.Context, db *sql.DB, id int64) error {\n-    if _, err := db.ExecContext(ctx, \"DELETE FROM users WHERE id = ?\", id); err != nil {\n-        return err\n-    }\n+    _, _ = db.ExecContext(ctx, \"DELETE FROM users WHERE id = ?\", id)\n     return nil\n }\n",
+      "expected_findings": [
+        {
+          "description": "Database error is explicitly ignored, so delete failures are silently dropped.",
+          "severity": "Warning",
+          "category": "Bug",
+          "file_pattern": "internal/store/users.go",
+          "line_hint": 2,
+          "contains_any": [
+            "ignored error",
+            "discarded error return",
+            "db.ExecContext error is dropped",
+            "silent failure",
+            "_, _ = db.ExecContext"
+          ],
+          "rule_id": "bug.error-handling.ignored-error"
+        }
+      ],
+      "negative_findings": [
+        {
+          "description": "Avoid style-only comments.",
+          "contains": "style"
+        }
+      ],
+      "min_total": 1,
+      "max_total": 8,
+      "description": "Go code should not discard important error returns from database operations.",
+      "source": "deep-review-suite"
+    },
+    {
+      "name": "python-bare-except-silences-error",
+      "category": "bug",
+      "language": "python",
+      "difficulty": "Easy",
+      "diff_content": "diff --git a/app/profile.py b/app/profile.py\nindex 1111111..2222222 100644\n--- a/app/profile.py\n+++ b/app/profile.py\n@@ -1,5 +1,6 @@\n def load_profile(path: str) -> dict:\n     try:\n         return json.loads(Path(path).read_text())\n-    except json.JSONDecodeError:\n-        return {}\n+    except:\n+        pass\n+    return {}\n",
+      "expected_findings": [
+        {
+          "description": "Bare except swallows every exception, including interrupts and unrelated runtime failures.",
+          "severity": "Warning",
+          "category": "Bug",
+          "file_pattern": "app/profile.py",
+          "line_hint": 4,
+          "contains_any": [
+            "bare except",
+            "swallows all exceptions",
+            "except: catches KeyboardInterrupt",
+            "silences unrelated runtime errors",
+            "should catch a specific exception"
+          ],
+          "rule_id": "bug.error-handling.bare-except"
+        }
+      ],
+      "negative_findings": [
+        {
+          "description": "Avoid style-only comments.",
+          "contains": "style"
+        }
+      ],
+      "min_total": 1,
+      "max_total": 8,
+      "description": "Python exception handlers should avoid bare except blocks that hide real failures.",
+      "source": "deep-review-suite"
+    },
+    {
+      "name": "ts-unhandled-promise-rejection",
+      "category": "bug",
+      "language": "typescript",
+      "difficulty": "Medium",
+      "diff_content": "diff --git a/src/jobs.ts b/src/jobs.ts\nindex 1111111..2222222 100644\n--- a/src/jobs.ts\n+++ b/src/jobs.ts\n@@ -1,4 +1,4 @@\n export async function handleJob(job: Job) {\n-  await sendWebhook(job);\n+  sendWebhook(job);\n   job.status = \"done\";\n }\n",
+      "expected_findings": [
+        {
+          "description": "Async call is started without await or catch, so rejections can become unhandled and the job is marked done too early.",
+          "severity": "Warning",
+          "category": "Bug",
+          "file_pattern": "src/jobs.ts",
+          "line_hint": 2,
+          "contains_any": [
+            "unhandled promise rejection",
+            "missing await",
+            "promise is neither awaited nor caught",
+            "job may be marked done before sendWebhook finishes",
+            "should await sendWebhook"
+          ],
+          "rule_id": "bug.error-handling.unhandled-promise"
+        }
+      ],
+      "negative_findings": [
+        {
+          "description": "Avoid style-only comments.",
+          "contains": "style"
+        }
+      ],
+      "min_total": 1,
+      "max_total": 8,
+      "description": "Promise-returning work should be awaited or handled so failures are observable.",
+      "source": "deep-review-suite"
+    }
+  ]
+}
diff --git a/eval/fixtures/deep_review_suite/review_depth_language_footguns.json b/eval/fixtures/deep_review_suite/review_depth_language_footguns.json
@@ -0,0 +1,164 @@
+{
+  "name": "review-depth-language-footguns",
+  "author": "diffscope",
+  "version": "1.0.0",
+  "description": "Language-specific semantic footgun benchmark pack for deeper live review runs.",
+  "languages": [
+    "go",
+    "python",
+    "rust",
+    "typescript"
+  ],
+  "categories": [
+    "bug"
+  ],
+  "thresholds": {
+    "min_precision": 0.45,
+    "min_recall": 0.35,
+    "min_f1": 0.4,
+    "max_false_positive_rate": 0.45,
+    "min_weighted_score": 0.45
+  },
+  "metadata": {
+    "purpose": "live-regression-eval",
+    "family": "review-depth"
+  },
+  "fixtures": [
+    {
+      "name": "go-nil-interface-comparison",
+      "category": "bug",
+      "language": "go",
+      "difficulty": "Hard",
+      "diff_content": "diff --git a/internal/validate/error.go b/internal/validate/error.go\nindex 1111111..2222222 100644\n--- a/internal/validate/error.go\n+++ b/internal/validate/error.go\n@@ -1,3 +1,4 @@\n func ValidateConfig(raw []byte) error {\n-    return nil\n+    var err *ConfigError = nil\n+    return err\n }\n",
+      "expected_findings": [
+        {
+          "description": "Typed nil pointer is returned as an error interface, so callers can observe a non-nil error even though the concrete pointer is nil.",
+          "severity": "Warning",
+          "category": "Bug",
+          "file_pattern": "internal/validate/error.go",
+          "line_hint": 3,
+          "contains_any": [
+            "typed nil error",
+            "non-nil interface",
+            "nil interface gotcha",
+            "returning *ConfigError as error",
+            "if err != nil can misfire"
+          ],
+          "rule_id": "bug.lang.nil-interface"
+        }
+      ],
+      "negative_findings": [
+        {
+          "description": "Avoid style-only comments.",
+          "contains": "style"
+        }
+      ],
+      "min_total": 1,
+      "max_total": 8,
+      "description": "Go reviewers should catch typed-nil values that become non-nil when stored in interfaces.",
+      "source": "deep-review-suite"
+    },
+    {
+      "name": "python-mutable-default-arg",
+      "category": "bug",
+      "language": "python",
+      "difficulty": "Easy",
+      "diff_content": "diff --git a/app/collector.py b/app/collector.py\nindex 1111111..2222222 100644\n--- a/app/collector.py\n+++ b/app/collector.py\n@@ -1,4 +1,3 @@\n-def record_event(event, seen=None):\n-    seen = seen or []\n+def record_event(event, seen=[]):\n     seen.append(event)\n     return seen\n",
+      "expected_findings": [
+        {
+          "description": "Mutable default list is shared across calls and will accumulate state between invocations.",
+          "severity": "Warning",
+          "category": "Bug",
+          "file_pattern": "app/collector.py",
+          "line_hint": 1,
+          "contains_any": [
+            "mutable default argument",
+            "shared list across calls",
+            "default [] is reused",
+            "state leaks between invocations",
+            "use None and initialize inside"
+          ],
+          "rule_id": "bug.lang.mutable-default"
+        }
+      ],
+      "negative_findings": [
+        {
+          "description": "Avoid style-only comments.",
+          "contains": "style"
+        }
+      ],
+      "min_total": 1,
+      "max_total": 8,
+      "description": "Python reviewers should catch mutable default arguments that silently share state.",
+      "source": "deep-review-suite"
+    },
+    {
+      "name": "rust-lifetime-dangling-ref",
+      "category": "bug",
+      "language": "rust",
+      "difficulty": "Hard",
+      "diff_content": "diff --git a/src/labels.rs b/src/labels.rs\nindex 1111111..2222222 100644\n--- a/src/labels.rs\n+++ b/src/labels.rs\n@@ -1,3 +1,4 @@\n pub fn default_label() -> &'static str {\n-    \"worker\"\n+    let label = String::from(\"worker\");\n+    unsafe { std::mem::transmute::<&str, &'static str>(label.as_str()) }\n }\n",
+      "expected_findings": [
+        {
+          "description": "Unsafe transmute extends the lifetime of a reference to a local String, creating a dangling reference and undefined behavior.",
+          "severity": "Error",
+          "category": "Bug",
+          "file_pattern": "src/labels.rs",
+          "line_hint": 3,
+          "contains_any": [
+            "dangling reference",
+            "unsafe transmute",
+            "reference to local string escapes",
+            "undefined behavior",
+            "invalid lifetime extension"
+          ],
+          "rule_id": "bug.lang.dangling-reference"
+        }
+      ],
+      "negative_findings": [
+        {
+          "description": "Avoid style-only comments.",
+          "contains": "style"
+        }
+      ],
+      "min_total": 1,
+      "max_total": 8,
+      "description": "Rust reviewers should detect unsafe lifetime extension that returns references to heap data freed when a local String is dropped.",
+      "source": "deep-review-suite"
+    },
+    {
+      "name": "ts-equality-coercion-trap",
+      "category": "bug",
+      "language": "typescript",
+      "difficulty": "Easy",
+      "diff_content": "diff --git a/src/filter.ts b/src/filter.ts\nindex 1111111..2222222 100644\n--- a/src/filter.ts\n+++ b/src/filter.ts\n@@ -1,3 +1,3 @@\n export function isZeroish(value: string | number | boolean) {\n-  return value === 0;\n+  return value == 0;\n }\n",
+      "expected_findings": [
+        {
+          "description": "Loose equality allows coercion so values like empty strings or false can compare equal to 0 unexpectedly.",
+          "severity": "Warning",
+          "category": "Bug",
+          "file_pattern": "src/filter.ts",
+          "line_hint": 2,
+          "contains_any": [
+            "loose equality",
+            "type coercion",
+            "== 0 can match \"\" or false",
+            "use strict equality",
+            "=== instead of =="
+          ],
+          "rule_id": "bug.lang.loose-equality"
+        }
+      ],
+      "negative_findings": [
+        {
+          "description": "Avoid style-only comments.",
+          "contains": "style"
+        }
+      ],
+      "min_total": 1,
+      "max_total": 8,
+      "description": "TypeScript reviewers should flag loose equality checks that rely on coercive semantics.",
+      "source": "deep-review-suite"
+    }
+  ]
+}
diff --git a/src/commands/eval/fixtures.rs b/src/commands/eval/fixtures.rs