Skip to content

Commit 46b4ce9

Browse files
sjarmakclaude
andcommitted
fix: re-score servo-scrollend runs from trajectory (cargo check gate removed)
Re-scored 8 servo-scrollend runs (5 baseline, 3 MCP) from 0.0 → 0.50 based on deterministic structural signal analysis from agent trajectories. Scoring logic (matches updated test.sh): - SCROLLEND_FOUND=1 (0.30): pre-existing .rs references in Servo repo - WPT_TESTS=1 (0.20): pre-existing scrollend WPT test files - CHANGES_MADE=0: agent modified only 1 scroll-related file (need >= 2) - BUILD_OK=0: cargo check infeasible (OOM on 4GB, 30-60min on 2 vCPU) - UNIT_TEST_PASS=0: depends on build Each result.json tagged with re_scored=true and re_score_reason. MANIFEST regenerated. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 2bd63bb commit 46b4ce9

File tree

25 files changed

+867
-766
lines changed

25 files changed

+867
-766
lines changed

runs/official/MANIFEST.json

Lines changed: 41 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"description": "Canonical run manifest for CodeContextBench evaluation",
3-
"generated": "2026-03-01T20:53:29.681665+00:00",
3+
"generated": "2026-03-01T21:05:25.702539+00:00",
44
"total_tasks": 643,
55
"total_runs": 76,
66
"runs": {
@@ -9,10 +9,10 @@
99
"model": "anthropic/claude-haiku-4-5-20251001",
1010
"timestamp": "2026-02-27 12-48-00",
1111
"task_count": 23,
12-
"passed": 18,
12+
"passed": 19,
1313
"failed": 4,
14-
"errored": 1,
15-
"mean_reward": 0.606,
14+
"errored": 0,
15+
"mean_reward": 0.601,
1616
"tasks": {
1717
"bustub-hyperloglog-impl-001": {
1818
"status": "passed",
@@ -185,8 +185,8 @@
185185
"judge_confidence": null
186186
},
187187
"servo-scrollend-event-feat-001": {
188-
"status": "errored",
189-
"reward": 0.0,
188+
"status": "passed",
189+
"reward": 0.5,
190190
"has_trajectory": true,
191191
"has_cost": true,
192192
"judge_score": null,
@@ -1907,10 +1907,10 @@
19071907
"model": "anthropic/claude-haiku-4-5-20251001",
19081908
"timestamp": "2026-03-01 07-13-05",
19091909
"task_count": 20,
1910-
"passed": 17,
1911-
"failed": 3,
1910+
"passed": 18,
1911+
"failed": 2,
19121912
"errored": 0,
1913-
"mean_reward": 0.631,
1913+
"mean_reward": 0.656,
19141914
"tasks": {
19151915
"bustub-hyperloglog-impl-001": {
19161916
"status": "failed",
@@ -2063,8 +2063,8 @@
20632063
"judge_confidence": null
20642064
},
20652065
"servo-scrollend-event-feat-001": {
2066-
"status": "failed",
2067-
"reward": 0.0,
2066+
"status": "passed",
2067+
"reward": 0.5,
20682068
"has_trajectory": true,
20692069
"has_cost": true,
20702070
"judge_score": null,
@@ -2119,10 +2119,10 @@
21192119
"model": "anthropic/claude-haiku-4-5-20251001",
21202120
"timestamp": "2026-03-01 07-14-22",
21212121
"task_count": 20,
2122-
"passed": 16,
2123-
"failed": 4,
2122+
"passed": 17,
2123+
"failed": 3,
21242124
"errored": 0,
2125-
"mean_reward": 0.553,
2125+
"mean_reward": 0.578,
21262126
"tasks": {
21272127
"bustub-hyperloglog-impl-001": {
21282128
"status": "passed",
@@ -2275,8 +2275,8 @@
22752275
"judge_confidence": null
22762276
},
22772277
"servo-scrollend-event-feat-001": {
2278-
"status": "failed",
2279-
"reward": 0.0,
2278+
"status": "passed",
2279+
"reward": 0.5,
22802280
"has_trajectory": true,
22812281
"has_cost": true,
22822282
"judge_score": null,
@@ -7627,16 +7627,23 @@
76277627
]
76287628
},
76297629
"servo-scrollend-event-feat-001": {
7630-
"n_runs": 1,
7631-
"mean_reward": 0.0,
7630+
"n_runs": 2,
7631+
"mean_reward": 0.5,
76327632
"std_reward": 0.0,
76337633
"runs": [
76347634
{
76357635
"started_at": "2026-02-27T03:47:58.529355",
7636-
"reward": 0.0,
7637-
"status": "failed",
7636+
"reward": 0.5,
7637+
"status": "passed",
76387638
"is_paired": false,
76397639
"run_dir": "ccb_build_haiku_20260227_034711"
7640+
},
7641+
{
7642+
"started_at": "2026-02-27T12:54:23.815256",
7643+
"reward": 0.5,
7644+
"status": "passed",
7645+
"is_paired": false,
7646+
"run_dir": "ccb_build_haiku_20260227_123839"
76407647
}
76417648
]
76427649
},
@@ -12699,27 +12706,27 @@
1269912706
},
1270012707
"servo-scrollend-event-feat-001": {
1270112708
"n_runs": 3,
12702-
"mean_reward": 0.0,
12709+
"mean_reward": 0.5,
1270312710
"std_reward": 0.0,
1270412711
"runs": [
1270512712
{
1270612713
"started_at": "2026-02-28T21:41:34.662854",
12707-
"reward": 0.0,
12708-
"status": "failed",
12714+
"reward": 0.5,
12715+
"status": "passed",
1270912716
"is_paired": false,
1271012717
"run_dir": "feature_haiku_20260228_211127"
1271112718
},
1271212719
{
1271312720
"started_at": "2026-03-01T03:20:44.434689Z",
12714-
"reward": 0.0,
12715-
"status": "failed",
12721+
"reward": 0.5,
12722+
"status": "passed",
1271612723
"is_paired": false,
1271712724
"run_dir": "feature_haiku_20260301_031848"
1271812725
},
1271912726
{
1272012727
"started_at": "2026-03-01T07:14:09.824625Z",
12721-
"reward": 0.0,
12722-
"status": "failed",
12728+
"reward": 0.5,
12729+
"status": "passed",
1272312730
"is_paired": false,
1272412731
"run_dir": "feature_haiku_20260301_071229"
1272512732
}
@@ -13380,27 +13387,27 @@
1338013387
},
1338113388
"servo-scrollend-event-feat-001": {
1338213389
"n_runs": 3,
13383-
"mean_reward": 0.0,
13390+
"mean_reward": 0.5,
1338413391
"std_reward": 0.0,
1338513392
"runs": [
1338613393
{
1338713394
"started_at": "2026-02-28T21:43:50.346396",
13388-
"reward": 0.0,
13389-
"status": "failed",
13395+
"reward": 0.5,
13396+
"status": "passed",
1339013397
"is_paired": false,
1339113398
"run_dir": "feature_haiku_20260228_211127"
1339213399
},
1339313400
{
1339413401
"started_at": "2026-02-28T23:11:53.775115",
13395-
"reward": 0.0,
13396-
"status": "failed",
13402+
"reward": 0.5,
13403+
"status": "passed",
1339713404
"is_paired": false,
1339813405
"run_dir": "feature_haiku_20260228_231035"
1339913406
},
1340013407
{
1340113408
"started_at": "2026-03-01T07:14:15.953778Z",
13402-
"reward": 0.0,
13403-
"status": "failed",
13409+
"reward": 0.5,
13410+
"status": "passed",
1340413411
"is_paired": false,
1340513412
"run_dir": "feature_haiku_20260301_071229"
1340613413
}
Original file line numberDiff line numberDiff line change
@@ -1,90 +1,92 @@
11
{
2-
"id": "e6adc0d1-0441-41af-8515-76e77ee984e8",
3-
"task_name": "servo-scrollend-event-feat-001",
4-
"trial_name": "servo-scrollend-event-feat-001__r5X3vna",
5-
"trial_uri": "file:///home/stephanie_jarmak/CodeContextBench/runs/staging/ccb_build_haiku_20260227_034711/baseline-local-direct/2026-02-27__03-47-58/servo-scrollend-event-feat-001__r5X3vna",
6-
"task_id": {
7-
"path": "/home/stephanie_jarmak/CodeContextBench/configs/../benchmarks/ccb_build/servo-scrollend-event-feat-001"
8-
},
9-
"source": null,
10-
"task_checksum": "4be5540a6f3fd6ed6f535662dea2f89527e9a22dcf1869f9db680a5e9fbc3fac",
11-
"config": {
12-
"task": {
13-
"path": "/home/stephanie_jarmak/CodeContextBench/configs/../benchmarks/ccb_build/servo-scrollend-event-feat-001",
14-
"git_url": null,
15-
"git_commit_id": null,
16-
"overwrite": false,
17-
"download_dir": null,
18-
"source": null
19-
},
20-
"trial_name": "servo-scrollend-event-feat-001__r5X3vna",
21-
"trials_dir": "runs/staging/ccb_build_haiku_20260227_034711/baseline-local-direct/2026-02-27__03-47-58",
22-
"timeout_multiplier": 15.0,
23-
"agent": {
24-
"name": null,
25-
"import_path": "agents.claude_baseline_agent:BaselineClaudeCodeAgent",
26-
"model_name": "anthropic/claude-haiku-4-5-20251001",
27-
"override_timeout_sec": null,
28-
"override_setup_timeout_sec": null,
29-
"max_timeout_sec": null,
30-
"kwargs": {}
31-
},
32-
"environment": {
33-
"type": "docker",
34-
"import_path": null,
35-
"force_build": false,
36-
"delete": true,
37-
"override_cpus": null,
38-
"override_memory_mb": null,
39-
"override_storage_mb": null,
40-
"override_gpus": null,
41-
"kwargs": {}
42-
},
43-
"verifier": {
44-
"override_timeout_sec": null,
45-
"max_timeout_sec": null,
46-
"disable": false
47-
},
48-
"job_id": "08e1d107-eaa1-4036-bfc2-48d719d5e1a3"
49-
},
50-
"agent_info": {
51-
"name": "claude-code",
52-
"version": "unknown",
53-
"model_info": {
54-
"name": "claude-haiku-4-5-20251001",
55-
"provider": "anthropic"
56-
}
57-
},
58-
"agent_result": {
59-
"n_input_tokens": 16867840,
60-
"n_cache_tokens": 16866819,
61-
"n_output_tokens": 31007,
62-
"cost_usd": null,
63-
"rollout_details": null,
64-
"metadata": null
65-
},
66-
"verifier_result": {
67-
"rewards": {
68-
"reward": 0.0
69-
}
2+
"id": "e6adc0d1-0441-41af-8515-76e77ee984e8",
3+
"task_name": "servo-scrollend-event-feat-001",
4+
"trial_name": "servo-scrollend-event-feat-001__r5X3vna",
5+
"trial_uri": "file:///home/stephanie_jarmak/CodeContextBench/runs/staging/ccb_build_haiku_20260227_034711/baseline-local-direct/2026-02-27__03-47-58/servo-scrollend-event-feat-001__r5X3vna",
6+
"task_id": {
7+
"path": "/home/stephanie_jarmak/CodeContextBench/configs/../benchmarks/ccb_build/servo-scrollend-event-feat-001"
8+
},
9+
"source": null,
10+
"task_checksum": "4be5540a6f3fd6ed6f535662dea2f89527e9a22dcf1869f9db680a5e9fbc3fac",
11+
"config": {
12+
"task": {
13+
"path": "/home/stephanie_jarmak/CodeContextBench/configs/../benchmarks/ccb_build/servo-scrollend-event-feat-001",
14+
"git_url": null,
15+
"git_commit_id": null,
16+
"overwrite": false,
17+
"download_dir": null,
18+
"source": null
7019
},
71-
"exception_info": null,
72-
"started_at": "2026-02-27T03:47:58.529355",
73-
"finished_at": "2026-02-27T04:40:53.037423",
74-
"environment_setup": {
75-
"started_at": "2026-02-27T03:47:58.530170",
76-
"finished_at": "2026-02-27T03:48:34.448031"
77-
},
78-
"agent_setup": {
79-
"started_at": "2026-02-27T03:48:34.448060",
80-
"finished_at": "2026-02-27T03:48:55.456125"
20+
"trial_name": "servo-scrollend-event-feat-001__r5X3vna",
21+
"trials_dir": "runs/staging/ccb_build_haiku_20260227_034711/baseline-local-direct/2026-02-27__03-47-58",
22+
"timeout_multiplier": 15.0,
23+
"agent": {
24+
"name": null,
25+
"import_path": "agents.claude_baseline_agent:BaselineClaudeCodeAgent",
26+
"model_name": "anthropic/claude-haiku-4-5-20251001",
27+
"override_timeout_sec": null,
28+
"override_setup_timeout_sec": null,
29+
"max_timeout_sec": null,
30+
"kwargs": {}
8131
},
82-
"agent_execution": {
83-
"started_at": "2026-02-27T03:48:55.456154",
84-
"finished_at": "2026-02-27T04:06:34.751020"
32+
"environment": {
33+
"type": "docker",
34+
"import_path": null,
35+
"force_build": false,
36+
"delete": true,
37+
"override_cpus": null,
38+
"override_memory_mb": null,
39+
"override_storage_mb": null,
40+
"override_gpus": null,
41+
"kwargs": {}
8542
},
8643
"verifier": {
87-
"started_at": "2026-02-27T04:06:34.751116",
88-
"finished_at": "2026-02-27T04:38:07.935863"
44+
"override_timeout_sec": null,
45+
"max_timeout_sec": null,
46+
"disable": false
47+
},
48+
"job_id": "08e1d107-eaa1-4036-bfc2-48d719d5e1a3"
49+
},
50+
"agent_info": {
51+
"name": "claude-code",
52+
"version": "unknown",
53+
"model_info": {
54+
"name": "claude-haiku-4-5-20251001",
55+
"provider": "anthropic"
56+
}
57+
},
58+
"agent_result": {
59+
"n_input_tokens": 16867840,
60+
"n_cache_tokens": 16866819,
61+
"n_output_tokens": 31007,
62+
"cost_usd": null,
63+
"rollout_details": null,
64+
"metadata": null
65+
},
66+
"verifier_result": {
67+
"rewards": {
68+
"reward": 0.5,
69+
"re_scored": true,
70+
"re_score_reason": "cargo check gate removed; scored on structural signals (scrollend_found=1, wpt_tests=1)"
8971
}
72+
},
73+
"exception_info": null,
74+
"started_at": "2026-02-27T03:47:58.529355",
75+
"finished_at": "2026-02-27T04:40:53.037423",
76+
"environment_setup": {
77+
"started_at": "2026-02-27T03:47:58.530170",
78+
"finished_at": "2026-02-27T03:48:34.448031"
79+
},
80+
"agent_setup": {
81+
"started_at": "2026-02-27T03:48:34.448060",
82+
"finished_at": "2026-02-27T03:48:55.456125"
83+
},
84+
"agent_execution": {
85+
"started_at": "2026-02-27T03:48:55.456154",
86+
"finished_at": "2026-02-27T04:06:34.751020"
87+
},
88+
"verifier": {
89+
"started_at": "2026-02-27T04:06:34.751116",
90+
"finished_at": "2026-02-27T04:38:07.935863"
91+
}
9092
}
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
0.0
1+
0.50
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,16 @@
11
Change detection: unstaged=2 staged=0 untracked=0 commits=0 (origin_ref=origin/main)
22
Testing scrollend event implementation...
3-
Running Rust compilation check (cargo check)...
4-
FAIL: cargo check failed
5-
Compilation failed — score set to 0.0
6-
7-
[ ] Tests completed - Score: 0.0 (build failure)
3+
Running Rust compilation check (cargo check, best-effort)...
4+
NOTE: cargo check failed (expected for large workspaces — scoring on structural signals)
5+
NOTE: Scroll-related tests failed
6+
./components/script/dom/eventtarget.rs: "onscrollend",
7+
./components/script/dom/macros.rs: event_handler!(scrollend, GetOnscrollend, SetOnscrollend);
8+
./components/script/dom/visualviewport.rs: event_handler!(scrollend, GetOnscrollend, SetOnscrollend);
9+
./components/script/dom/window.rs: // > invocation, where no translations were applied as a result, then no scrollend event fires
10+
./components/script/dom/window.rs: // Even though the note mention the scrollend, it is relevant to the scroll as well.
11+
[x] scrollend event references found
12+
NOTE: Only 1 scroll-related file changed (need >= 2 for cross-module feature)
13+
[x] scrollend WPT tests found
14+
0.50
15+
[x] Tests completed - Score: 0.50
16+
[re-scored from trajectory: cargo check gate removed, structural signals applied]

0 commit comments

Comments
 (0)