Skip to content

Commit 6b03bf1

Browse files
committed
fix(qa): unblock CI/bench Docker, repair tests, address SonarCloud
- Dockerfile.bench: copy diffctx/tests so cargo manifest parses - ci.yml: split rust tests — unit in dev, yaml_cases in release (panic=abort) - runner.py: extract _run_serial/_run_parallel (S3516 BLOCKER, S3776) - build_splits.py: drop redundant int return (S3516 BLOCKER) - test_benchmark_runner: align resume contract with replay-for-aggregation - _idents.py: extract _emit_query_idents helper (S3776) - tokens.py: validate encoding instead of accepting unused arg (S1172) - aider_subprocess: docstring stubs + del silent (S1186, S1172) - splits.py: extract _TWO_COL_DIVIDER constant (S1192) - pin_revisions.py: flatten nested ternary (S3358) - sensitivity_check.py: regex {2} quantifier (S6326) - merge implicit string concats across bm25/run_final_eval/multi_swebench/tests (S5799) - tests: pytest.approx for float equality assertions (S1244)
1 parent f5f64ef commit 6b03bf1

16 files changed

Lines changed: 111 additions & 73 deletions

.github/workflows/ci.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,8 +104,9 @@ jobs:
104104
env:
105105
DIFFCTX_YAML_CASES_LIMIT: "20"
106106
run: |
107-
cargo test --release
107+
cargo test --lib
108108
cargo build --release
109+
cargo test --release --test yaml_cases
109110
110111
- name: Run diffctx YAML test suite
111112
working-directory: diffctx

Dockerfile.bench

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ WORKDIR /build/diffctx
5353
COPY diffctx/Cargo.toml diffctx/Cargo.lock diffctx/pyproject.toml ./
5454
COPY diffctx/src ./src
5555
COPY diffctx/python ./python
56+
COPY diffctx/tests ./tests
5657

5758
RUN set -eux; \
5859
TGT=$(cat /target_triple); \

benchmarks/adapters/multi_swebench.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ def _load_raw(self) -> Iterator[dict]:
9090
break
9191
except (DatasetGenerationError, TypeError, ValueError) as e:
9292
print(
93-
f"[WARN] {self.name}: stream stopped early at row {n} " f"({type(e).__name__}: {str(e)[:200]})",
93+
f"[WARN] {self.name}: stream stopped early at row {n} ({type(e).__name__}: {str(e)[:200]})",
9494
file=sys.stderr,
9595
)
9696
break

benchmarks/adapters/runner.py

Lines changed: 30 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -163,17 +163,36 @@ def _record(r: EvalResult) -> None:
163163
if checkpoint_path is not None:
164164
append_checkpoint(checkpoint_path, r)
165165

166-
if not pending:
167-
return results
166+
if pending:
167+
if workers <= 1 or len(pending) <= 1:
168+
_run_serial(pending, eval_fn, params, _record)
169+
else:
170+
_run_parallel(pending, eval_fn, params, workers, timeout_per_instance, _record)
168171

169-
if workers <= 1 or len(pending) <= 1:
170-
for inst in pending:
171-
try:
172-
_record(eval_fn(inst, params))
173-
except Exception as e:
174-
_record(_failure_result(inst, params, "error", f"{type(e).__name__}: {e}"))
175-
return results
172+
return results
173+
174+
175+
def _run_serial(
176+
pending: list[BenchmarkInstance],
177+
eval_fn: EvalFn,
178+
params: RunParams,
179+
record: Callable[[EvalResult], None],
180+
) -> None:
181+
for inst in pending:
182+
try:
183+
record(eval_fn(inst, params))
184+
except Exception as e:
185+
record(_failure_result(inst, params, "error", f"{type(e).__name__}: {e}"))
176186

187+
188+
def _run_parallel(
189+
pending: list[BenchmarkInstance],
190+
eval_fn: EvalFn,
191+
params: RunParams,
192+
workers: int,
193+
timeout_per_instance: float,
194+
record: Callable[[EvalResult], None],
195+
) -> None:
177196
from concurrent.futures import (
178197
ThreadPoolExecutor,
179198
as_completed,
@@ -184,8 +203,6 @@ def _record(r: EvalResult) -> None:
184203

185204
with ThreadPoolExecutor(max_workers=workers) as pool:
186205
futures = {pool.submit(eval_fn, inst, params): inst for inst in pending}
187-
# Generous outer deadline: timeout * ceil(len/workers) covers the
188-
# serialised case if workers all hang together.
189206
outer_deadline = time.monotonic() + timeout_per_instance * max(1, (len(pending) + workers - 1) // workers)
190207
completed: set[str] = set()
191208
try:
@@ -198,12 +215,9 @@ def _record(r: EvalResult) -> None:
198215
except Exception as e:
199216
r = _failure_result(inst, params, "error", f"{type(e).__name__}: {e}")
200217
completed.add(inst.instance_id)
201-
_record(r)
218+
record(r)
202219
except FuturesTimeoutError:
203220
for inst in futures.values():
204221
if inst.instance_id not in completed:
205-
_record(_failure_result(inst, params, "timeout", "exceeded global deadline"))
206-
# Cancel any pending futures; running threads are abandoned.
222+
record(_failure_result(inst, params, "timeout", "exceeded global deadline"))
207223
pool.shutdown(wait=False, cancel_futures=True)
208-
209-
return results

benchmarks/adapters/splits.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99
from benchmarks.adapters.base import BenchmarkAdapter, BenchmarkInstance
1010
from benchmarks.adapters.contamination import ContaminationDetector
1111

12+
_TWO_COL_DIVIDER = "|---|---|"
13+
1214

1315
@dataclass(frozen=True)
1416
class SplitConfig:
@@ -152,15 +154,15 @@ def render_split_report(config: SplitConfig, result: SplitResult, today: str = "
152154
lines.append("## Totals")
153155
lines.append("")
154156
lines.append("| Split | Count |")
155-
lines.append("|---|---|")
157+
lines.append(_TWO_COL_DIVIDER)
156158
lines.append(f"| Test | {result.stats.test_total} |")
157159
lines.append(f"| Validation | {result.stats.validation_total} |")
158160
lines.append(f"| Calibration | {result.stats.calibration_total} |")
159161
lines.append("")
160162
lines.append("## Test set per benchmark")
161163
lines.append("")
162164
lines.append("| Benchmark | Count |")
163-
lines.append("|---|---|")
165+
lines.append(_TWO_COL_DIVIDER)
164166
for name in sorted(result.stats.test_per_benchmark):
165167
lines.append(f"| {name} | {result.stats.test_per_benchmark[name]} |")
166168
lines.append("")
@@ -181,7 +183,7 @@ def render_split_report(config: SplitConfig, result: SplitResult, today: str = "
181183
lines.append("## Pinned dataset revisions")
182184
lines.append("")
183185
lines.append("| Adapter | Revision |")
184-
lines.append("|---|---|")
186+
lines.append(_TWO_COL_DIVIDER)
185187
for name in sorted(result.stats.dataset_revisions):
186188
lines.append(f"| {name} | `{result.stats.dataset_revisions[name]}` |")
187189
lines.append("")

benchmarks/baselines/_idents.py

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,18 @@
8484
)
8585

8686

87+
def _emit_query_idents(tok: str, out: set[str]) -> None:
88+
if len(tok) < 3 or tok.isdigit() or tok.lower() in _KEYWORDS:
89+
return
90+
out.add(tok.lower())
91+
for part in _CAMEL_SPLIT_RE.split(tok):
92+
if len(part) >= 3 and part.lower() not in _KEYWORDS:
93+
out.add(part.lower())
94+
for part in tok.split("_"):
95+
if len(part) >= 3 and part.lower() not in _KEYWORDS:
96+
out.add(part.lower())
97+
98+
8799
def extract_idents_from_patch(patch: str) -> set[str]:
88100
"""Extract retrieval-query identifiers from a unified diff.
89101
@@ -100,15 +112,7 @@ def extract_idents_from_patch(patch: str) -> set[str]:
100112
continue
101113
body = raw[1:] if raw[:1] in ("+", "-", " ") else raw
102114
for tok in _TOKEN_RE.findall(body):
103-
if len(tok) < 3 or tok.isdigit() or tok.lower() in _KEYWORDS:
104-
continue
105-
idents.add(tok.lower())
106-
for part in _CAMEL_SPLIT_RE.split(tok):
107-
if len(part) >= 3 and part.lower() not in _KEYWORDS:
108-
idents.add(part.lower())
109-
for part in tok.split("_"):
110-
if len(part) >= 3 and part.lower() not in _KEYWORDS:
111-
idents.add(part.lower())
115+
_emit_query_idents(tok, idents)
112116
return idents
113117

114118

benchmarks/baselines/aider_subprocess.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -51,20 +51,23 @@ class _SilentIO:
5151
"""Duck-typed `io` substitute for RepoMap. No prompts, no console."""
5252

5353
def read_text(self, fname, silent=False):
54+
# `silent` is part of Aider's IO contract; ignored here because we
55+
# never log anyway.
56+
del silent
5457
try:
5558
with open(fname, encoding="utf-8", errors="replace") as f:
5659
return f.read()
5760
except Exception:
5861
return None
5962

6063
def tool_output(self, *args, **kwargs):
61-
pass
64+
"""No-op: stub that swallows Aider tool output to keep the bench quiet."""
6265

6366
def tool_warning(self, *args, **kwargs):
64-
pass
67+
"""No-op: stub that swallows Aider tool warnings."""
6568

6669
def tool_error(self, *args, **kwargs):
67-
pass
70+
"""No-op: stub that swallows Aider tool errors."""
6871

6972

7073
_CODE_EXT_RE = __import__("re").compile(

benchmarks/baselines/bm25_baseline.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ def make_bm25_eval_fn(repos_dir: Path):
4848
from rank_bm25 import BM25Okapi
4949
except ImportError as e:
5050
raise RuntimeError(
51-
"rank-bm25 not installed; expected to be in requirements-bench.lock. " "Run: pip install rank-bm25"
51+
"rank-bm25 not installed; expected to be in requirements-bench.lock. Run: pip install rank-bm25"
5252
) from e
5353

5454
evaluator = UniversalEvaluator()

benchmarks/build_splits.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ def default_calibration_pool_adapters() -> tuple[BenchmarkAdapter, ...]:
6666
)
6767

6868

69-
def main() -> int:
69+
def main() -> None:
7070
p = argparse.ArgumentParser(description=__doc__)
7171
p.add_argument(
7272
"--out",
@@ -108,7 +108,7 @@ def main() -> int:
108108
print("=== SPLIT_REPORT.md (dry-run, NOT written) ===\n")
109109
print(report)
110110
print("\n--dry-run: no manifests written. Re-run without the flag to freeze.")
111-
return 0
111+
return
112112

113113
written = write_manifests(result, args.out)
114114
report_path = args.out / "SPLIT_REPORT.md"
@@ -117,8 +117,7 @@ def main() -> int:
117117
for name, path in sorted(written.items()):
118118
print(f" {name}: {path}")
119119
print(f"Report: {report_path}")
120-
return 0
121120

122121

123122
if __name__ == "__main__":
124-
raise SystemExit(main())
123+
main()

benchmarks/pin_revisions.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,8 @@ def main() -> int:
7272
"fetched_at": now,
7373
"hf_path": hf_path,
7474
}
75-
marker = "(unchanged)" if prev == sha else f"(was {prev[:12] if prev else 'main'})"
75+
prev_marker = f"was {prev[:12]}" if prev else "was main"
76+
marker = "(unchanged)" if prev == sha else f"({prev_marker})"
7677
print(f" OK {hf_path:<45} {sha[:12]} {marker}")
7778

7879
save_pins(pins)

0 commit comments

Comments (0)