Skip to content

Commit bdb0533

Browse files
authored
Mutation testing, cost visibility, CI, and docs (#40)
* Mutation testing, cost visibility, CI, and docs - storage_json: tests for refresh_summary, get_event_stats (empty/single/by_repo), prune boundary - docs/mutation-testing.md: how to run, equivalent mutants, pre-push vs CI, file globs - CI: mutation job on *storage_json* with timeout and missed-count baseline (15) - README/CLAUDE: pre-push vs CI mutation note; optional --no-verify - Analytics: show est. cost (reviews) from useEventStats when available - web: cost.test.ts for formatCost, estimateCost, totalCost Made-with: Cursor * Publish Docker image on merge to main; review job skips when image unavailable - publish-image.yml: on push to main, build and push ghcr.io/evalops/diffscope:latest and :sha-<sha> - diffscope.yml: check image available before Run DiffScope; skip gracefully with notice if pull fails (job no longer fails) Made-with: Cursor * diffscope review: continue-on-error on Run DiffScope so image pull failure never fails job Made-with: Cursor * DiffScope Review: use docker run instead of uses docker:// to avoid pre-pull failing job GitHub pre-pulls container actions before any steps run; that pull was failing the job. Run DiffScope via 'docker run' only when image is already available from Check image step. Made-with: Cursor * Fix prune boundary test race (Sentry feedback): use prune_at(now) for single timestamp - Extract prune_at(max_age_secs, max_count, now_secs) so test and production share logic - prune() calls prune_at(..., SystemTime::now()); test calls prune_at(..., now_ts()) once Made-with: Cursor
1 parent 2f33f20 commit bdb0533

File tree

9 files changed

+403
-17
lines changed

9 files changed

+403
-17
lines changed

.github/workflows/ci.yml

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,3 +66,27 @@ jobs:
6666
- uses: Swatinem/rust-cache@v2
6767
- name: Test
6868
run: cargo test
69+
70+
mutation:
71+
runs-on: ubuntu-latest
72+
steps:
73+
- uses: actions/checkout@v5
74+
- uses: dtolnay/rust-toolchain@1.88.0
75+
- uses: Swatinem/rust-cache@v2
76+
- name: Install cargo-mutants
77+
run: cargo install cargo-mutants --locked
78+
- name: Mutation test (storage_json)
79+
run: |
80+
timeout 900 cargo mutants -f '*storage_json*' 2>&1 | tee mutation.log || true
81+
MISSED=$(grep -E '[0-9]+ missed' mutation.log | tail -1 | grep -oE '[0-9]+' | head -1 || echo "0")
82+
echo "missed=$MISSED" >> "$GITHUB_OUTPUT"
83+
id: mutation
84+
- name: Check mutation baseline
85+
run: |
86+
MISSED="${{ steps.mutation.outputs.missed }}"
87+
BASELINE=15
88+
if [ -n "$MISSED" ] && [ "$MISSED" -gt "$BASELINE" ]; then
89+
echo "Mutation missed count $MISSED exceeds baseline $BASELINE. Update docs/mutation-testing.md and this baseline if intentional."
90+
exit 1
91+
fi
92+
if: always() && steps.mutation.outputs.missed != ''

.github/workflows/diffscope.yml

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -37,16 +37,26 @@ jobs:
3737
echo "skip=false" >> "$GITHUB_OUTPUT"
3838
fi
3939
40+
- name: Check image available
41+
id: check_image
42+
run: |
43+
docker pull ghcr.io/evalops/diffscope:latest 2>/dev/null && echo "available=true" >> "$GITHUB_OUTPUT" || echo "available=false" >> "$GITHUB_OUTPUT"
44+
45+
- name: Notice image unavailable
46+
if: steps.check_image.outputs.available == 'false'
47+
run: echo "::notice::DiffScope review skipped — image ghcr.io/evalops/diffscope:latest not available (merge to main publishes it)"
48+
4049
- name: Run DiffScope
41-
if: steps.check_key.outputs.skip != 'true'
42-
uses: docker://ghcr.io/evalops/diffscope:latest
43-
env:
44-
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
45-
with:
46-
args: review --diff pr.diff --output-format json --output comments.json
50+
if: steps.check_key.outputs.skip != 'true' && steps.check_image.outputs.available == 'true'
51+
run: |
52+
docker run --rm \
53+
-e OPENAI_API_KEY="${{ secrets.OPENAI_API_KEY }}" \
54+
-v "$PWD":/workspace -w /workspace \
55+
ghcr.io/evalops/diffscope:latest \
56+
review --diff pr.diff --output-format json --output comments.json
4757
4858
- name: Post comments
49-
if: steps.check_key.outputs.skip != 'true'
59+
if: steps.check_key.outputs.skip != 'true' && steps.check_image.outputs.available == 'true'
5060
uses: actions/github-script@v7
5161
with:
5262
script: |
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
# Build and push Docker image to ghcr.io on every merge to main.
2+
# Keeps ghcr.io/evalops/diffscope:latest available for the DiffScope Review workflow.
3+
name: Publish image
4+
5+
on:
6+
push:
7+
branches:
8+
- main
9+
10+
concurrency:
11+
group: publish-image-main
12+
cancel-in-progress: false
13+
14+
jobs:
15+
build-and-push:
16+
name: Build and push Docker image
17+
runs-on: ubuntu-latest
18+
permissions:
19+
contents: read
20+
packages: write
21+
steps:
22+
- name: Checkout code
23+
uses: actions/checkout@v5
24+
25+
- name: Set up Docker Buildx
26+
uses: docker/setup-buildx-action@v3
27+
28+
- name: Login to GitHub Container Registry
29+
uses: docker/login-action@v3
30+
with:
31+
registry: ghcr.io
32+
username: ${{ github.actor }}
33+
password: ${{ secrets.GITHUB_TOKEN }}
34+
35+
- name: Build and push Docker image
36+
id: build-and-push
37+
uses: docker/build-push-action@v6
38+
with:
39+
context: .
40+
platforms: linux/amd64
41+
push: true
42+
cache-from: type=gha
43+
cache-to: type=gha,mode=max
44+
tags: |
45+
ghcr.io/evalops/diffscope:latest
46+
ghcr.io/evalops/diffscope:sha-${{ github.sha }}

CLAUDE.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ cargo run -- --help # CLI usage
2222
- `examples/` — Usage examples
2323

2424
## Development
25-
- Run `bash scripts/install-hooks.sh` once to install pre-commit and pre-push hooks (merge conflict check, trailing newline, conditional Rust/web checks, shellcheck; pre-push adds version sync, cargo audit, full web build+test).
25+
- Run `bash scripts/install-hooks.sh` once to install pre-commit and pre-push hooks (merge conflict check, trailing newline, conditional Rust/web checks, shellcheck; pre-push adds version sync, cargo audit, full web build+test). Mutation testing runs in CI only; see `docs/mutation-testing.md`.
2626

2727
## Conventions
2828
- Use frontier models for reviews — never default to smaller models

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1017,7 +1017,7 @@ bash scripts/install-hooks.sh
10171017
- When `web/` is staged: `npm run lint`, `tsc -b`, and `npm run test` in `web/`
10181018
- When `scripts/` or `.githooks/` change: `shellcheck` (if installed)
10191019

1020-
**Pre-push** runs the full gate: workflow check, version sync (`Cargo.toml` vs git tags), `cargo fmt`, `cargo clippy`, `cargo audit` (if installed), `npm ci && npm run build && npm run test` in `web/`, and `cargo test`. The first push after clone may take longer due to `npm ci`.
1020+
**Pre-push** runs the full gate: workflow check, version sync (`Cargo.toml` vs git tags), `cargo fmt`, `cargo clippy`, `cargo audit` (if installed), `npm ci && npm run build && npm run test` in `web/`, and `cargo test`. The first push after clone may take longer due to `npm ci`. Mutation testing runs in CI only (see `docs/mutation-testing.md`), not on pre-push, to keep pushes fast. For a quick push without full checks use `git push --no-verify` (use sparingly).
10211021

10221022
## Supported Platforms
10231023

docs/mutation-testing.md

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
# Mutation testing
2+
3+
We use [cargo-mutants](https://github.com/sourcefrog/cargo-mutants) to find gaps in test coverage. Mutants are small code changes (e.g. `*``/`, `>``>=`); if tests still pass, the mutant is "missed" and the behavior is under-tested.
4+
5+
## Running
6+
7+
```bash
8+
# All mutants (slow; 6000+)
9+
cargo mutants
10+
11+
# One file / path (faster; glob)
12+
cargo mutants -f '*storage_json*'
13+
cargo mutants -f '*state*'
14+
cargo mutants -f '*storage_pg*'
15+
cargo mutants -f '*cost*'
16+
```
17+
18+
CI runs mutation on `storage_json` with a timeout; see [CI job](#ci) below.
19+
20+
## Known equivalent / accepted mutants
21+
22+
These mutants are either equivalent (same behavior) or accepted by design. CI does not fail on them.
23+
24+
### cost.rs
25+
26+
| Location | Mutation | Reason |
27+
|----------|----------|--------|
28+
| ~line 43 | `* FALLBACK_PRICE_PER_M``/ FALLBACK_PRICE_PER_M` | With `FALLBACK_PRICE_PER_M = 1.0`, `* 1` and `/ 1` are equivalent. No test can distinguish without changing the constant. |
29+
30+
### storage_json.rs
31+
32+
After adding targeted tests (refresh_summary, get_event_stats exact aggregates/single-event/by_repo, prune boundary), many previously missed mutants are now killed. Any remaining misses in these areas are documented here after a run:
33+
34+
- **refresh_summary**: Condition `summary.is_some() \|\| !comments.is_empty()` — tests now assert synthesized summary when comments exist and no summary.
35+
- **get_event_stats**: Percentile index, avg formulas, by_model/by_repo — tests assert exact values (e.g. p50/p95/p99, avg_score).
36+
- **prune**: Boundary `now - started_at > max_age_secs` — test asserts review exactly at boundary is not pruned; max_count test asserts oldest removed.
37+
38+
If new mutants appear in these regions, add assertions that would fail on the wrong operator/formula, or add them to this table with a one-line rationale.
39+
40+
## Pre-push vs CI
41+
42+
- **Pre-push** (`.githooks/pre-push`): Runs unit tests, `cargo audit`, web build+test. Does *not* run mutation (too slow for every push).
43+
- **CI mutation job**: Runs `cargo mutants -f storage_json` (or one crate) on a schedule or on PRs to main. Fails if "missed" count increases beyond the baseline in this doc.
44+
- For a quick local push without full checks: `git push --no-verify` (use sparingly; CI will still run).
45+
46+
## CI
47+
48+
The `mutation` job in `.github/workflows/ci.yml` runs mutation on the `storage_json` crate with a timeout. Update the "allowed missed" baseline in the job when you intentionally accept new equivalent mutants (and add them to the table above).

src/server/storage_json.rs

Lines changed: 186 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -390,16 +390,28 @@ impl StorageBackend for JsonStorageBackend {
390390
}
391391

392392
async fn prune(&self, max_age_secs: i64, max_count: usize) -> anyhow::Result<usize> {
393+
let now = std::time::SystemTime::now()
394+
.duration_since(std::time::UNIX_EPOCH)
395+
.unwrap_or_default()
396+
.as_secs() as i64;
397+
self.prune_at(max_age_secs, max_count, now).await
398+
}
399+
}
400+
401+
impl JsonStorageBackend {
402+
/// Prune with a fixed "now" timestamp. Used by the trait implementation and by tests to avoid race conditions.
403+
pub(crate) async fn prune_at(
404+
&self,
405+
max_age_secs: i64,
406+
max_count: usize,
407+
now_secs: i64,
408+
) -> anyhow::Result<usize> {
393409
let removed = {
394410
let mut reviews = self.reviews.write().await;
395-
let now = std::time::SystemTime::now()
396-
.duration_since(std::time::UNIX_EPOCH)
397-
.unwrap_or_default()
398-
.as_secs() as i64;
399411

400412
let expired: Vec<String> = reviews
401413
.iter()
402-
.filter(|(_, r)| now - r.started_at > max_age_secs)
414+
.filter(|(_, r)| now_secs - r.started_at > max_age_secs)
403415
.map(|(id, _)| id.clone())
404416
.collect();
405417
let mut removed = expired.len();
@@ -1099,6 +1111,79 @@ mod tests {
10991111
assert!(stats.avg_score.is_none());
11001112
assert_eq!(stats.error_rate, 0.0);
11011113
assert_eq!(stats.p50_latency_ms, 0);
1114+
assert_eq!(stats.p95_latency_ms, 0);
1115+
assert_eq!(stats.p99_latency_ms, 0);
1116+
assert!(stats.by_model.is_empty());
1117+
assert!(stats.by_source.is_empty());
1118+
assert!(stats.by_repo.is_empty());
1119+
assert!(stats.daily_counts.is_empty());
1120+
assert_eq!(stats.total_cost_estimate, 0.0);
1121+
}
1122+
1123+
/// Single event: percentiles and avg must equal that single value (catches percentile/division mutants).
1124+
#[tokio::test]
1125+
async fn test_get_event_stats_single_event_exact_values() {
1126+
let dir = tempfile::tempdir().unwrap();
1127+
let backend = JsonStorageBackend::new(&dir.path().join("reviews.json"));
1128+
1129+
let mut s = make_session_with_event(
1130+
"r1",
1131+
now_ts(),
1132+
ReviewStatus::Complete,
1133+
"review.completed",
1134+
"gpt-4o",
1135+
"head",
1136+
150,
1137+
);
1138+
s.event.as_mut().unwrap().overall_score = Some(7.0);
1139+
backend.save_review(&s).await.unwrap();
1140+
1141+
let stats = backend
1142+
.get_event_stats(&EventFilters::default())
1143+
.await
1144+
.unwrap();
1145+
1146+
assert_eq!(stats.total_reviews, 1);
1147+
assert_eq!(stats.avg_duration_ms, 150.0);
1148+
assert_eq!(stats.p50_latency_ms, 150, "single value is p50/p95/p99");
1149+
assert_eq!(stats.p95_latency_ms, 150);
1150+
assert_eq!(stats.p99_latency_ms, 150);
1151+
assert_eq!(stats.avg_score, Some(7.0));
1152+
}
1153+
1154+
/// By-repo avg_score: two events same repo (4.0 + 6.0) / 2 = 5.0.
1155+
#[tokio::test]
1156+
async fn test_get_event_stats_by_repo_avg_score_exact() {
1157+
let dir = tempfile::tempdir().unwrap();
1158+
let backend = JsonStorageBackend::new(&dir.path().join("reviews.json"));
1159+
1160+
let now = now_ts();
1161+
for (i, &score) in [4.0_f32, 6.0_f32].iter().enumerate() {
1162+
let mut s = make_session_with_event(
1163+
&format!("r{}", i),
1164+
now + i as i64,
1165+
ReviewStatus::Complete,
1166+
"review.completed",
1167+
"gpt-4o",
1168+
"head",
1169+
100,
1170+
);
1171+
if let Some(ref mut e) = s.event {
1172+
e.github_repo = Some("org/same-repo".to_string());
1173+
e.overall_score = Some(score);
1174+
}
1175+
backend.save_review(&s).await.unwrap();
1176+
}
1177+
1178+
let stats = backend
1179+
.get_event_stats(&EventFilters::default())
1180+
.await
1181+
.unwrap();
1182+
1183+
assert_eq!(stats.by_repo.len(), 1);
1184+
assert_eq!(stats.by_repo[0].repo, "org/same-repo");
1185+
assert_eq!(stats.by_repo[0].count, 2);
1186+
assert_eq!(stats.by_repo[0].avg_score, Some(5.0), " (4+6)/2 ");
11021187
}
11031188

11041189
#[tokio::test]
@@ -1346,6 +1431,102 @@ mod tests {
13461431
assert_eq!(stats.by_model[0].avg_duration_ms, 200.0);
13471432
}
13481433

1434+
/// refresh_summary: get_review with comments but no summary produces synthesized summary.
1435+
#[tokio::test]
1436+
async fn test_get_review_refreshes_summary_when_comments_no_summary() {
1437+
let dir = tempfile::tempdir().unwrap();
1438+
let backend = JsonStorageBackend::new(&dir.path().join("reviews.json"));
1439+
1440+
let mut session = make_session("ref-sum", now_ts(), ReviewStatus::Complete);
1441+
session.comments = vec![
1442+
make_comment("c1", "src/a.rs"),
1443+
make_comment("c2", "src/b.rs"),
1444+
];
1445+
session.summary = None;
1446+
backend.save_review(&session).await.unwrap();
1447+
1448+
let loaded = backend.get_review("ref-sum").await.unwrap().unwrap();
1449+
assert!(
1450+
loaded.summary.is_some(),
1451+
"refresh_summary should synthesize summary when comments exist"
1452+
);
1453+
let sum = loaded.summary.unwrap();
1454+
assert_eq!(sum.total_comments, 2);
1455+
}
1456+
1457+
/// refresh_summary: list_reviews returns sessions with refreshed summary when comments exist.
1458+
#[tokio::test]
1459+
async fn test_list_reviews_refreshes_summary_for_sessions_with_comments() {
1460+
let dir = tempfile::tempdir().unwrap();
1461+
let backend = JsonStorageBackend::new(&dir.path().join("reviews.json"));
1462+
1463+
let mut with_comments = make_session("with-c", now_ts(), ReviewStatus::Complete);
1464+
with_comments.comments = vec![make_comment("c1", "src/main.rs")];
1465+
with_comments.summary = None;
1466+
backend.save_review(&with_comments).await.unwrap();
1467+
1468+
backend
1469+
.save_review(&make_session("no-c", now_ts() - 1, ReviewStatus::Complete))
1470+
.await
1471+
.unwrap();
1472+
1473+
let list = backend.list_reviews(10, 0).await.unwrap();
1474+
let with_c = list.iter().find(|s| s.id == "with-c").unwrap();
1475+
assert!(
1476+
with_c.summary.is_some(),
1477+
"list_reviews should refresh summary for session with comments"
1478+
);
1479+
}
1480+
1481+
/// Prune age boundary: review with started_at exactly (now - max_age_secs) must NOT be expired (> not >=).
1482+
/// Uses prune_at with a single timestamp to avoid race (Sentry feedback).
1483+
#[tokio::test]
1484+
async fn test_prune_age_boundary_not_expired() {
1485+
let dir = tempfile::tempdir().unwrap();
1486+
let backend = JsonStorageBackend::new(&dir.path().join("reviews.json"));
1487+
1488+
let now = now_ts();
1489+
let max_age = 100_i64;
1490+
let session = make_session("boundary", now - max_age, ReviewStatus::Complete);
1491+
backend.save_review(&session).await.unwrap();
1492+
1493+
let removed = backend.prune_at(max_age, 1000, now).await.unwrap();
1494+
assert_eq!(
1495+
removed, 0,
1496+
"exactly at boundary (now - max_age) should not be pruned"
1497+
);
1498+
1499+
let list = backend.list_reviews(10, 0).await.unwrap();
1500+
assert_eq!(list.len(), 1);
1501+
assert_eq!(list[0].id, "boundary");
1502+
}
1503+
1504+
/// Prune max_count: keep newest max_count completed, remove oldest.
1505+
#[tokio::test]
1506+
async fn test_prune_max_count_removes_oldest() {
1507+
let dir = tempfile::tempdir().unwrap();
1508+
let backend = JsonStorageBackend::new(&dir.path().join("reviews.json"));
1509+
1510+
let base = now_ts() - 10_000;
1511+
for i in 0..5 {
1512+
let s = make_session(&format!("r{}", i), base + i as i64, ReviewStatus::Complete);
1513+
backend.save_review(&s).await.unwrap();
1514+
}
1515+
1516+
let removed = backend.prune(1_000_000, 3).await.unwrap();
1517+
assert_eq!(removed, 2, "5 - 3 = 2 removed by count limit");
1518+
1519+
let list = backend.list_reviews(10, 0).await.unwrap();
1520+
assert_eq!(list.len(), 3);
1521+
let ids: Vec<_> = list.iter().map(|s| s.id.as_str()).collect();
1522+
let want = ["r2", "r3", "r4"];
1523+
assert!(
1524+
want.iter().all(|w| ids.contains(w)),
1525+
"newest 3 (r2,r3,r4) kept; r0,r1 removed, got {:?}",
1526+
ids
1527+
);
1528+
}
1529+
13491530
/// time_to filter: event with created_at == time_to must be included (<=).
13501531
#[tokio::test]
13511532
async fn test_list_events_time_to_inclusive() {

0 commit comments

Comments
 (0)