Skip to content

Commit 6b03bf1

Browse files
committed
fix(qa): unblock CI/bench Docker, repair tests, address SonarCloud
- Dockerfile.bench: copy diffctx/tests so cargo manifest parses - ci.yml: split rust tests — unit in dev, yaml_cases in release (panic=abort) - runner.py: extract _run_serial/_run_parallel (S3516 BLOCKER, S3776) - build_splits.py: drop redundant int return (S3516 BLOCKER) - test_benchmark_runner: align resume contract with replay-for-aggregation - _idents.py: extract _emit_query_idents helper (S3776) - tokens.py: validate encoding instead of accepting unused arg (S1172) - aider_subprocess: docstring stubs + del silent (S1186, S1172) - splits.py: extract _TWO_COL_DIVIDER constant (S1192) - pin_revisions.py: flatten nested ternary (S3358) - sensitivity_check.py: regex {2} quantifier (S6326) - merge implicit string concats across bm25/run_final_eval/multi_swebench/tests (S5799) - tests: pytest.approx for float equality assertions (S1244)
1 parent f5f64ef commit 6b03bf1

16 files changed

Lines changed: 111 additions & 73 deletions

.github/workflows/ci.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,8 +104,9 @@ jobs:
104104
env:
105105
DIFFCTX_YAML_CASES_LIMIT: "20"
106106
run: |
107-
cargo test --release
107+
cargo test --lib
108108
cargo build --release
109+
cargo test --release --test yaml_cases
109110
110111
- name: Run diffctx YAML test suite
111112
working-directory: diffctx

Dockerfile.bench

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ WORKDIR /build/diffctx
5353
COPY diffctx/Cargo.toml diffctx/Cargo.lock diffctx/pyproject.toml ./
5454
COPY diffctx/src ./src
5555
COPY diffctx/python ./python
56+
COPY diffctx/tests ./tests
5657

5758
RUN set -eux; \
5859
TGT=$(cat /target_triple); \

benchmarks/adapters/multi_swebench.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ def _load_raw(self) -> Iterator[dict]:
9090
break
9191
except (DatasetGenerationError, TypeError, ValueError) as e:
9292
print(
93-
f"[WARN] {self.name}: stream stopped early at row {n} " f"({type(e).__name__}: {str(e)[:200]})",
93+
f"[WARN] {self.name}: stream stopped early at row {n} ({type(e).__name__}: {str(e)[:200]})",
9494
file=sys.stderr,
9595
)
9696
break

benchmarks/adapters/runner.py

Lines changed: 30 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -163,17 +163,36 @@ def _record(r: EvalResult) -> None:
163163
if checkpoint_path is not None:
164164
append_checkpoint(checkpoint_path, r)
165165

166-
if not pending:
167-
return results
166+
if pending:
167+
if workers <= 1 or len(pending) <= 1:
168+
_run_serial(pending, eval_fn, params, _record)
169+
else:
170+
_run_parallel(pending, eval_fn, params, workers, timeout_per_instance, _record)
168171

169-
if workers <= 1 or len(pending) <= 1:
170-
for inst in pending:
171-
try:
172-
_record(eval_fn(inst, params))
173-
except Exception as e:
174-
_record(_failure_result(inst, params, "error", f"{type(e).__name__}: {e}"))
175-
return results
172+
return results
173+
174+
175+
def _run_serial(
176+
pending: list[BenchmarkInstance],
177+
eval_fn: EvalFn,
178+
params: RunParams,
179+
record: Callable[[EvalResult], None],
180+
) -> None:
181+
for inst in pending:
182+
try:
183+
record(eval_fn(inst, params))
184+
except Exception as e:
185+
record(_failure_result(inst, params, "error", f"{type(e).__name__}: {e}"))
176186

187+
188+
def _run_parallel(
189+
pending: list[BenchmarkInstance],
190+
eval_fn: EvalFn,
191+
params: RunParams,
192+
workers: int,
193+
timeout_per_instance: float,
194+
record: Callable[[EvalResult], None],
195+
) -> None:
177196
from concurrent.futures import (
178197
ThreadPoolExecutor,
179198
as_completed,
@@ -184,8 +203,6 @@ def _record(r: EvalResult) -> None:
184203

185204
with ThreadPoolExecutor(max_workers=workers) as pool:
186205
futures = {pool.submit(eval_fn, inst, params): inst for inst in pending}
187-
# Generous outer deadline: timeout * ceil(len/workers) covers the
188-
# serialised case if workers all hang together.
189206
outer_deadline = time.monotonic() + timeout_per_instance * max(1, (len(pending) + workers - 1) // workers)
190207
completed: set[str] = set()
191208
try:
@@ -198,12 +215,9 @@ def _record(r: EvalResult) -> None:
198215
except Exception as e:
199216
r = _failure_result(inst, params, "error", f"{type(e).__name__}: {e}")
200217
completed.add(inst.instance_id)
201-
_record(r)
218+
record(r)
202219
except FuturesTimeoutError:
203220
for inst in futures.values():
204221
if inst.instance_id not in completed:
205-
_record(_failure_result(inst, params, "timeout", "exceeded global deadline"))
206-
# Cancel any pending futures; running threads are abandoned.
222+
record(_failure_result(inst, params, "timeout", "exceeded global deadline"))
207223
pool.shutdown(wait=False, cancel_futures=True)
208-
209-
return results

benchmarks/adapters/splits.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99
from benchmarks.adapters.base import BenchmarkAdapter, BenchmarkInstance
1010
from benchmarks.adapters.contamination import ContaminationDetector
1111

12+
_TWO_COL_DIVIDER = "|---|---|"
13+
1214

1315
@dataclass(frozen=True)
1416
class SplitConfig:
@@ -152,15 +154,15 @@ def render_split_report(config: SplitConfig, result: SplitResult, today: str = "
152154
lines.append("## Totals")
153155
lines.append("")
154156
lines.append("| Split | Count |")
155-
lines.append("|---|---|")
157+
lines.append(_TWO_COL_DIVIDER)
156158
lines.append(f"| Test | {result.stats.test_total} |")
157159
lines.append(f"| Validation | {result.stats.validation_total} |")
158160
lines.append(f"| Calibration | {result.stats.calibration_total} |")
159161
lines.append("")
160162
lines.append("## Test set per benchmark")
161163
lines.append("")
162164
lines.append("| Benchmark | Count |")
163-
lines.append("|---|---|")
165+
lines.append(_TWO_COL_DIVIDER)
164166
for name in sorted(result.stats.test_per_benchmark):
165167
lines.append(f"| {name} | {result.stats.test_per_benchmark[name]} |")
166168
lines.append("")
@@ -181,7 +183,7 @@ def render_split_report(config: SplitConfig, result: SplitResult, today: str = "
181183
lines.append("## Pinned dataset revisions")
182184
lines.append("")
183185
lines.append("| Adapter | Revision |")
184-
lines.append("|---|---|")
186+
lines.append(_TWO_COL_DIVIDER)
185187
for name in sorted(result.stats.dataset_revisions):
186188
lines.append(f"| {name} | `{result.stats.dataset_revisions[name]}` |")
187189
lines.append("")

benchmarks/baselines/_idents.py

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,18 @@
8484
)
8585

8686

87+
def _emit_query_idents(tok: str, out: set[str]) -> None:
88+
if len(tok) < 3 or tok.isdigit() or tok.lower() in _KEYWORDS:
89+
return
90+
out.add(tok.lower())
91+
for part in _CAMEL_SPLIT_RE.split(tok):
92+
if len(part) >= 3 and part.lower() not in _KEYWORDS:
93+
out.add(part.lower())
94+
for part in tok.split("_"):
95+
if len(part) >= 3 and part.lower() not in _KEYWORDS:
96+
out.add(part.lower())
97+
98+
8799
def extract_idents_from_patch(patch: str) -> set[str]:
88100
"""Extract retrieval-query identifiers from a unified diff.
89101
@@ -100,15 +112,7 @@ def extract_idents_from_patch(patch: str) -> set[str]:
100112
continue
101113
body = raw[1:] if raw[:1] in ("+", "-", " ") else raw
102114
for tok in _TOKEN_RE.findall(body):
103-
if len(tok) < 3 or tok.isdigit() or tok.lower() in _KEYWORDS:
104-
continue
105-
idents.add(tok.lower())
106-
for part in _CAMEL_SPLIT_RE.split(tok):
107-
if len(part) >= 3 and part.lower() not in _KEYWORDS:
108-
idents.add(part.lower())
109-
for part in tok.split("_"):
110-
if len(part) >= 3 and part.lower() not in _KEYWORDS:
111-
idents.add(part.lower())
115+
_emit_query_idents(tok, idents)
112116
return idents
113117

114118

benchmarks/baselines/aider_subprocess.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -51,20 +51,23 @@ class _SilentIO:
5151
"""Duck-typed `io` substitute for RepoMap. No prompts, no console."""
5252

5353
def read_text(self, fname, silent=False):
54+
# `silent` is part of Aider's IO contract; ignored here because we
55+
# never log anyway.
56+
del silent
5457
try:
5558
with open(fname, encoding="utf-8", errors="replace") as f:
5659
return f.read()
5760
except Exception:
5861
return None
5962

6063
def tool_output(self, *args, **kwargs):
61-
pass
64+
"""No-op: stub that swallows Aider tool output to keep the bench quiet."""
6265

6366
def tool_warning(self, *args, **kwargs):
64-
pass
67+
"""No-op: stub that swallows Aider tool warnings."""
6568

6669
def tool_error(self, *args, **kwargs):
67-
pass
70+
"""No-op: stub that swallows Aider tool errors."""
6871

6972

7073
_CODE_EXT_RE = __import__("re").compile(

benchmarks/baselines/bm25_baseline.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ def make_bm25_eval_fn(repos_dir: Path):
4848
from rank_bm25 import BM25Okapi
4949
except ImportError as e:
5050
raise RuntimeError(
51-
"rank-bm25 not installed; expected to be in requirements-bench.lock. " "Run: pip install rank-bm25"
51+
"rank-bm25 not installed; expected to be in requirements-bench.lock. Run: pip install rank-bm25"
5252
) from e
5353

5454
evaluator = UniversalEvaluator()

benchmarks/build_splits.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ def default_calibration_pool_adapters() -> tuple[BenchmarkAdapter, ...]:
6666
)
6767

6868

69-
def main() -> int:
69+
def main() -> None:
7070
p = argparse.ArgumentParser(description=__doc__)
7171
p.add_argument(
7272
"--out",
@@ -108,7 +108,7 @@ def main() -> int:
108108
print("=== SPLIT_REPORT.md (dry-run, NOT written) ===\n")
109109
print(report)
110110
print("\n--dry-run: no manifests written. Re-run without the flag to freeze.")
111-
return 0
111+
return
112112

113113
written = write_manifests(result, args.out)
114114
report_path = args.out / "SPLIT_REPORT.md"
@@ -117,8 +117,7 @@ def main() -> int:
117117
for name, path in sorted(written.items()):
118118
print(f" {name}: {path}")
119119
print(f"Report: {report_path}")
120-
return 0
121120

122121

123122
if __name__ == "__main__":
124-
raise SystemExit(main())
123+
main()

benchmarks/pin_revisions.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,8 @@ def main() -> int:
7272
"fetched_at": now,
7373
"hf_path": hf_path,
7474
}
75-
marker = "(unchanged)" if prev == sha else f"(was {prev[:12] if prev else 'main'})"
75+
prev_marker = f"was {prev[:12]}" if prev else "was main"
76+
marker = "(unchanged)" if prev == sha else f"({prev_marker})"
7677
print(f" OK {hf_path:<45} {sha[:12]} {marker}")
7778

7879
save_pins(pins)

0 commit comments

Comments (0)