fix: reduce cognitive complexity in benchmarks and scripts (SonarCloud S3776)

nikolay-e · nikolay-e · commit de2ba0b6954e · 2026-04-18T12:10:54.000+02:00
diff --git a/QA.md b/QA.md
@@ -5,8 +5,6 @@
 - CI matrix: Linux/macOS/Windows × Python 3.10-3.14
   (15 test matrices + 3 lint/arch jobs)
 - Windows jobs slowest — typically 7-10min vs 2-4min for Linux/macOS
-- Windows runners occasionally hang indefinitely (>1hr) —
-  cancel and rerun with `gh run rerun <id> --failed`
 - All jobs must pass; no flaky CI tolerance
 
 ## SonarCloud
@@ -20,18 +18,23 @@
   merge with `or` when bodies identical
 - Test YAML fixture "Password" triggers false positive VULNERABILITY
   (yaml:S2068) — expected
-- `dataclasses.replace()` return type: mypy may flag as OK or error
-  depending on version — check before adding `# type: ignore`
-- Lambda capturing loop variable in immediate-use context (e.g. `max()`)
-  — SonarCloud flags it; suppress with `# noqa: B023` if mypy rejects
-  the default-arg workaround
+- `dataclasses.replace()` return type: modern mypy infers it correctly —
+  do NOT add `cast(T, replace(...))`; mypy will flag it as `redundant-cast`.
+  Remove the cast and let type inference work
+- Lambda capturing loop variable (S1515): pass the bound `d.__getitem__`
+  instead of `lambda k: d[k]` — simpler and avoids the flag
+- S3776 cognitive complexity: SonarCloud counts boolean operators (`and`,
+  `or`) as separate increments — extracting complex conditions into named
+  helpers reduces complexity even without deep nesting changes
+- `cast` import removal: after removing casts, remove the `typing.cast`
+  import too or ruff/mypy will flag unused imports
 
 ## Test Suite
 
 - Run `python -m pytest --tb=no -q` for quick status
 - test_graph.py separate from test_yaml_diff.py — check both
-- 87 xfails currently — all strict=False, bidirectional discovery
-  precision tradeoff
+- Many xfails (strict=False) reflect the bidirectional discovery precision
+  tradeoff — watch the trend in their count, not the absolute number
 
 ## Code Review
 
diff --git a/benchmarks/contextbench_diffctx.py b/benchmarks/contextbench_diffctx.py
@@ -216,6 +216,32 @@ def line_overlap(
     }
 
 
+def _collect_instance_diagnostics(
+    frag_count: int,
+    lo_all: dict,
+    file_recall: float,
+    gf: set,
+    sel_files: set,
+    nontrivial_recall: float,
+    output: dict,
+) -> list[str]:
+    diagnostics: list[str] = []
+    if frag_count == 0:
+        diagnostics.append("WARN: diffctx returned 0 fragments")
+    if lo_all["line_recall"] < 1e-9 and frag_count > 0:
+        diagnostics.append("DIAG: line_recall=0 with fragments>0 — possible line parse bug or no file overlap")
+    if file_recall < 1e-9 and frag_count > 0:
+        diagnostics.append("DIAG: file_recall=0 with fragments>0 — selected files don't overlap gold at all")
+        diagnostics.append(f"  gold_files: {sorted(gf)[:5]}")
+        diagnostics.append(f"  selected:   {sorted(sel_files)[:5]}")
+    if nontrivial_recall < 1e-9 and frag_count > 5:
+        diagnostics.append("DIAG: nontrivial_recall=0 — diffctx may only be selecting patch-adjacent files")
+    unparsed = sum(1 for f in output.get("fragments", []) if parse_lines_field(f.get("lines", "")) is None)
+    if unparsed:
+        diagnostics.append(f"DIAG: {unparsed}/{frag_count} fragments have unparseable 'lines' field")
+    return diagnostics
+
+
 def evaluate_instance(
     inst: dict,
     budget: int = 8000,
@@ -301,26 +327,7 @@ def evaluate_instance(
         },
     }
 
-    diagnostics = []
-
-    if frag_count == 0:
-        diagnostics.append("WARN: diffctx returned 0 fragments")
-
-    if lo_all["line_recall"] < 1e-9 and frag_count > 0:
-        diagnostics.append("DIAG: line_recall=0 with fragments>0 — possible line parse bug or no file overlap")
-
-    if file_recall < 1e-9 and frag_count > 0:
-        diagnostics.append("DIAG: file_recall=0 with fragments>0 — selected files don't overlap gold at all")
-        diagnostics.append(f"  gold_files: {sorted(gf)[:5]}")
-        diagnostics.append(f"  selected:   {sorted(sel_files)[:5]}")
-
-    if nontrivial_recall < 1e-9 and frag_count > 5:
-        diagnostics.append("DIAG: nontrivial_recall=0 — diffctx may only be selecting patch-adjacent files")
-
-    unparsed = sum(1 for f in output.get("fragments", []) if parse_lines_field(f.get("lines", "")) is None)
-    if unparsed:
-        diagnostics.append(f"DIAG: {unparsed}/{frag_count} fragments have unparseable 'lines' field")
-
+    diagnostics = _collect_instance_diagnostics(frag_count, lo_all, file_recall, gf, sel_files, nontrivial_recall, output)
     result["diagnostics"] = diagnostics
 
     print(f"Fragments: {frag_count} | Time: {elapsed:.1f}s")
@@ -333,6 +340,28 @@ def evaluate_instance(
     return result
 
 
+def _print_per_language_breakdown(ok: list[dict], by_lang: dict) -> None:
+    if len(by_lang) <= 1:
+        return
+    print("\nPer-language breakdown:")
+    for lang in sorted(by_lang):
+        lr = by_lang[lang]
+        avg_fr = sum(r["file_recall"] for r in lr) / len(lr)
+        avg_ntr = sum(r["nontrivial_file_recall"] for r in lr) / len(lr)
+        avg_lr = sum(r["line_recall"] for r in lr) / len(lr)
+        print(f"  {lang:12s} (n={len(lr):3d}): file_recall={avg_fr:.3f}  nontrivial={avg_ntr:.3f}  line_recall={avg_lr:.3f}")
+
+
+def _print_per_repo_breakdown(ok: list[dict], by_repo: dict) -> None:
+    if len(by_repo) <= 1:
+        return
+    print("\nPer-repo breakdown:")
+    for repo in sorted(by_repo, key=lambda r: -len(by_repo[r])):
+        rr = by_repo[repo]
+        avg_ntr = sum(r["nontrivial_file_recall"] for r in rr) / len(rr)
+        print(f"  {repo:30s} (n={len(rr):3d}): nontrivial_recall={avg_ntr:.3f}")
+
+
 def aggregate(results: list[dict]) -> None:
     ok = [r for r in results if r["status"] == "ok"]
     if not ok:
@@ -360,26 +389,12 @@ def aggregate(results: list[dict]) -> None:
     by_lang: dict[str, list[dict]] = defaultdict(list)
     for r in ok:
         by_lang[r["language"]].append(r)
-
-    if len(by_lang) > 1:
-        print("\nPer-language breakdown:")
-        for lang in sorted(by_lang):
-            lr = by_lang[lang]
-            avg_fr = sum(r["file_recall"] for r in lr) / len(lr)
-            avg_ntr = sum(r["nontrivial_file_recall"] for r in lr) / len(lr)
-            avg_lr = sum(r["line_recall"] for r in lr) / len(lr)
-            print(f"  {lang:12s} (n={len(lr):3d}): file_recall={avg_fr:.3f}  nontrivial={avg_ntr:.3f}  line_recall={avg_lr:.3f}")
+    _print_per_language_breakdown(ok, by_lang)
 
     by_repo: dict[str, list[dict]] = defaultdict(list)
     for r in ok:
         by_repo[r["repo"]].append(r)
-
-    if len(by_repo) > 1:
-        print("\nPer-repo breakdown:")
-        for repo in sorted(by_repo, key=lambda r: -len(by_repo[r])):
-            rr = by_repo[repo]
-            avg_ntr = sum(r["nontrivial_file_recall"] for r in rr) / len(rr)
-            print(f"  {repo:30s} (n={len(rr):3d}): nontrivial_recall={avg_ntr:.3f}")
+    _print_per_repo_breakdown(ok, by_repo)
 
     zero_frag = sum(1 for r in ok if r["fragments"] == 0)
     zero_line = sum(1 for r in ok if r["line_recall"] < 1e-9 and r["fragments"] > 0)
diff --git a/benchmarks/forensic_contextbench.py b/benchmarks/forensic_contextbench.py
@@ -219,6 +219,22 @@ def _print_nontrivial_report(
         print(f"    HIT: {f}  max_ppr={max_score:.6f}")
 
 
+def _classify_nontrivial_stages(
+    nontrivial: set,
+    selected: set,
+    sel_dump: set,
+    fragmented: set,
+    universe: set,
+) -> dict[str, str]:
+    stage_per_file: dict[str, str] = {}
+    for f in nontrivial:
+        if f in selected:
+            stage_per_file[f] = "selected"
+        else:
+            stage_per_file[f] = _classify_failure_stage(f, sel_dump, fragmented, universe)
+    return stage_per_file
+
+
 def evaluate_one(inst: dict, budget: int) -> dict:
     iid = inst["instance_id"]
     print("\n" + "=" * 78)
@@ -293,12 +309,7 @@ def evaluate_one(inst: dict, budget: int) -> dict:
     nt_recall = len(nontrivial_hits) / len(nontrivial) if nontrivial else 0.0
     patch_coverage = len(p_set & selected) / len(p_set) if p_set else 0.0
 
-    stage_per_file: dict[str, str] = {}
-    for f in nontrivial:
-        if f in selected:
-            stage_per_file[f] = "selected"
-        else:
-            stage_per_file[f] = _classify_failure_stage(f, sel_dump, fragmented, universe)
+    stage_per_file = _classify_nontrivial_stages(nontrivial, selected, sel_dump, fragmented, universe)
 
     return {
         "id": iid,
@@ -328,6 +339,40 @@ def _print_threshold_sanity_check():
     print(f"diffctx _LOW_RELEVANCE_THRESHOLD = {v}", file=sys.stderr)
 
 
+def _filter_nontrivial_instances(insts: list) -> list:
+    kept = []
+    for i in insts:
+        gb = json.loads(i["gold_context"]) if isinstance(i["gold_context"], str) else i["gold_context"]
+        gold = {normalize_gold_path(g["file"]) for g in gb}
+        added, deleted, modified = patch_files_detailed(i["patch"])
+        if gold - (added | deleted | modified):
+            kept.append(i)
+    return kept
+
+
+def _print_ok_summary(ok: list[dict]) -> None:
+    print(f"\nAvg patch_coverage: {sum(r['patch_coverage'] for r in ok)/len(ok):.3f}")
+    print(f"Avg file_recall:    {sum(r['file_recall'] for r in ok)/len(ok):.3f}")
+    print(f"Avg nontrivial:     {sum(r['nt_recall'] for r in ok)/len(ok):.3f}")
+    total_deleted = sum(r["n_deleted_in_patch"] for r in ok)
+    print(f"Total deleted files across all instances: {total_deleted}")
+
+    stages: dict[str, int] = {}
+    total_nt = 0
+    for r in ok:
+        for _f, stage in r.get("stage_per_file", {}).items():
+            stages[stage] = stages.get(stage, 0) + 1
+            total_nt += 1
+    if total_nt:
+        print(f"\nStage-wise breakdown ({total_nt} nontrivial gold files):")
+        for stage in sorted(stages, key=lambda s: -stages[s]):
+            pct = 100 * stages[stage] / total_nt
+            print(f"  {stage:50s}: {stages[stage]:4d} ({pct:5.1f}%)")
+
+    print("\nIf patch_coverage < 0.95: BUG — diffctx is losing files from its own diff input.")
+    print("If patch_coverage > 0.95: not a patch-loss bug, look elsewhere.")
+
+
 def main():
     _print_threshold_sanity_check()
 
@@ -342,14 +387,7 @@ def main():
     ds = load_dataset("Contextbench/ContextBench", "contextbench_verified", split="train")
     insts = list(ds)
     if args.nontrivial_only:
-        kept = []
-        for i in insts:
-            gb = json.loads(i["gold_context"]) if isinstance(i["gold_context"], str) else i["gold_context"]
-            gold = {normalize_gold_path(g["file"]) for g in gb}
-            added, deleted, modified = patch_files_detailed(i["patch"])
-            if gold - (added | deleted | modified):
-                kept.append(i)
-        insts = kept
+        insts = _filter_nontrivial_instances(insts)
     insts = insts[: args.limit]
 
     print(f"Diagnosing {len(insts)} nontrivial instances at budget={args.budget}\n")
@@ -370,26 +408,7 @@ def main():
     for r in fail:
         print(f"  FAIL [{r['status']}]: {r['id']}")
     if ok:
-        print(f"\nAvg patch_coverage: {sum(r['patch_coverage'] for r in ok)/len(ok):.3f}")
-        print(f"Avg file_recall:    {sum(r['file_recall'] for r in ok)/len(ok):.3f}")
-        print(f"Avg nontrivial:     {sum(r['nt_recall'] for r in ok)/len(ok):.3f}")
-        total_deleted = sum(r["n_deleted_in_patch"] for r in ok)
-        print(f"Total deleted files across all instances: {total_deleted}")
-
-        stages: dict[str, int] = {}
-        total_nt = 0
-        for r in ok:
-            for _f, stage in r.get("stage_per_file", {}).items():
-                stages[stage] = stages.get(stage, 0) + 1
-                total_nt += 1
-        if total_nt:
-            print(f"\nStage-wise breakdown ({total_nt} nontrivial gold files):")
-            for stage in sorted(stages, key=lambda s: -stages[s]):
-                pct = 100 * stages[stage] / total_nt
-                print(f"  {stage:50s}: {stages[stage]:4d} ({pct:5.1f}%)")
-
-        print("\nIf patch_coverage < 0.95: BUG — diffctx is losing files from its own diff input.")
-        print("If patch_coverage > 0.95: not a patch-loss bug, look elsewhere.")
+        _print_ok_summary(ok)
 
 
 if __name__ == "__main__":
diff --git a/benchmarks/loo_swebench.py b/benchmarks/loo_swebench.py
@@ -26,11 +26,11 @@
 def strip_file_from_patch(patch_text: str, file_to_hide: str) -> str:
     import re
 
-    pattern = re.compile(r"^diff --git\s.+?(?=(?:^diff --git\s)|\Z)", re.MULTILINE | re.DOTALL)
     hidden_markers = {f"a/{file_to_hide}", f"b/{file_to_hide}"}
     kept = []
-    for m in pattern.finditer(patch_text):
-        block = m.group()
+    for block in re.split(r"(?=^diff --git\s)", patch_text, flags=re.MULTILINE):
+        if not block.startswith("diff --git"):
+            continue
         first_line = block.split("\n", 1)[0]
         parts = first_line.split()
         if not any(p.strip('"') in hidden_markers for p in parts[2:]):
@@ -168,6 +168,38 @@ def _timeout_handler(_sig: int, _frame: object) -> None:
         return []
 
 
+def _filter_multi_file(insts: list) -> list:
+    return [
+        i
+        for i in insts
+        if len(patch_files(i["patch"])) >= 2
+        and not is_mechanical_change(i["patch"])
+        and not any(is_vendor_or_generated(f) for f in patch_files(i["patch"]))
+    ]
+
+
+def _print_loo_breakdowns(all_results: list[dict]) -> None:
+    by_repo: dict[str, list[dict]] = defaultdict(list)
+    for r in all_results:
+        by_repo[r["repo"]].append(r)
+
+    print("Per-repo breakdown:")
+    for repo in sorted(by_repo, key=lambda r: len(by_repo[r]), reverse=True):
+        trials = by_repo[repo]
+        h = sum(1 for t in trials if t["found"])
+        print(f"  {repo:40s} {h}/{len(trials):3d} ({100 * h / len(trials):.0f}%)")
+
+    by_lang: dict[str, list[dict]] = defaultdict(list)
+    for r in all_results:
+        by_lang[r["language"]].append(r)
+
+    print("\nPer-language breakdown:")
+    for lang in sorted(by_lang, key=lambda la: len(by_lang[la]), reverse=True):
+        trials = by_lang[lang]
+        h = sum(1 for t in trials if t["found"])
+        print(f"  {lang:20s} {h}/{len(trials):3d} ({100 * h / len(trials):.0f}%)")
+
+
 def main():
     ap = argparse.ArgumentParser()
     ap.add_argument("--limit", type=int, default=50)
@@ -187,13 +219,7 @@ def main():
     ds = load_dataset(args.dataset, args.split, split="train")
     insts = list(ds)
 
-    multi_file = [
-        i
-        for i in insts
-        if len(patch_files(i["patch"])) >= 2
-        and not is_mechanical_change(i["patch"])
-        and not any(is_vendor_or_generated(f) for f in patch_files(i["patch"]))
-    ]
+    multi_file = _filter_multi_file(insts)
     print(f"Total instances: {len(insts)}, multi-file (filtered): {len(multi_file)}")
 
     warm_cache(multi_file)
@@ -235,25 +261,7 @@ def main():
             print(f"Found distractor:  {distractor_found}/{distractor_total} ({100 * distractor_found / distractor_total:.1f}%)")
         print()
 
-        by_repo: dict[str, list[dict]] = defaultdict(list)
-        for r in all_results:
-            by_repo[r["repo"]].append(r)
-
-        print("Per-repo breakdown:")
-        for repo in sorted(by_repo, key=lambda r: len(by_repo[r]), reverse=True):
-            trials = by_repo[repo]
-            h = sum(1 for t in trials if t["found"])
-            print(f"  {repo:40s} {h}/{len(trials):3d} ({100 * h / len(trials):.0f}%)")
-
-        by_lang: dict[str, list[dict]] = defaultdict(list)
-        for r in all_results:
-            by_lang[r["language"]].append(r)
-
-        print("\nPer-language breakdown:")
-        for lang in sorted(by_lang, key=lambda la: len(by_lang[la]), reverse=True):
-            trials = by_lang[lang]
-            h = sum(1 for t in trials if t["found"])
-            print(f"  {lang:20s} {h}/{len(trials):3d} ({100 * h / len(trials):.0f}%)")
+        _print_loo_breakdowns(all_results)
 
         if len(seeds) == 1:
             tag = f"loo_{args.scoring}_n{args.limit}_b{args.budget}"
diff --git a/scripts/migrate_to_v3.py b/scripts/migrate_to_v3.py
diff --git a/src/treemapper/diffctx/utility.py b/src/treemapper/diffctx/utility.py