Skip to content

Commit b067d06

Browse files
sjarmak and claude committed
Add 9 curator ground truths + harden runner timeouts (148/160 SDLC)
New GT: qutebrowser-darkmode, qutebrowser-url, bustub-hyperloglog, tensorrt-mxfp4, element-web-unread, django-repo-scoped, grpcurl-vuln, calcom-code-review, kafka-build-orient.

Runner fixes:
- Python-level thread timeout on exec_cmd (shutdown(wait=False))
- OS-level timeout wrapper on git clone and curator CLI
- kernel.org → github.com/torvalds/linux mirror for curator clones
- Skip 4 linux kernel tasks (repo too large for Daytona sandbox)
- Bump SANDBOX_TIMEOUT_SEC 900→1500, SIGALRM 840→1440

12 tasks remain: 4 kernel (need manual GT), 8 curator timeout (need parallel=1 or dedicated account).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 11cd2a7 commit b067d06

File tree

19 files changed

+645
-49
lines changed

19 files changed

+645
-49
lines changed
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
{
2+
"files": [
3+
"qutebrowser/browser/webengine/darkmode.py",
4+
"tests/unit/browser/webengine/test_darkmode.py",
5+
"qutebrowser/utils/version.py"
6+
],
7+
"symbols": [
8+
{
9+
"file": "qutebrowser/browser/webengine/darkmode.py",
10+
"symbol": "Variant",
11+
"repo": null
12+
},
13+
{
14+
"file": "qutebrowser/browser/webengine/darkmode.py",
15+
"symbol": "_variant",
16+
"repo": null
17+
},
18+
{
19+
"file": "qutebrowser/browser/webengine/darkmode.py",
20+
"symbol": "_DEFINITIONS",
21+
"repo": null
22+
},
23+
{
24+
"file": "qutebrowser/browser/webengine/darkmode.py",
25+
"symbol": "_Definition",
26+
"repo": null
27+
},
28+
{
29+
"file": "qutebrowser/browser/webengine/darkmode.py",
30+
"symbol": "_Setting",
31+
"repo": null
32+
},
33+
{
34+
"file": "qutebrowser/browser/webengine/darkmode.py",
35+
"symbol": "copy_replace_setting",
36+
"repo": null
37+
},
38+
{
39+
"file": "qutebrowser/browser/webengine/darkmode.py",
40+
"symbol": "settings",
41+
"repo": null
42+
},
43+
{
44+
"file": "qutebrowser/utils/version.py",
45+
"symbol": "WebEngineVersions",
46+
"repo": null
47+
}
48+
]
49+
}

benchmarks/csb_sdlc_debug/qutebrowser-darkmode-threshold-regression-prove-001/tests/ground_truth_meta.json

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,12 @@
77
"curator_agent_version": "2.0",
88
"model": "claude-opus-4-6",
99
"backend": "hybrid",
10-
"timestamp": "2026-03-03T21:34:52Z",
11-
"files_count": 4,
10+
"timestamp": "2026-03-06T14:49:26Z",
11+
"files_count": 3,
1212
"edit_files_count": 0,
1313
"chunks_count": 0,
14-
"symbols_count": 9,
15-
"cost_usd": 3.1905410000000005,
16-
"elapsed_sec": 625.6,
17-
"exploration_notes": "The bug is in qutebrowser/browser/webengine/darkmode.py. Qt 6.4's Chromium renamed 'TextBrightnessThreshold' to 'ForegroundBrightnessThreshold'. The fix (commit 50efac08f) added: (1) a Variant.qt_64 enum member, (2) a copy_replace_setting method on _Definition, (3) a Qt 6.4 entry in _DEFINITIONS that maps threshold.text to ForegroundBrightnessThreshold, and (4) version detection in _variant() for Qt >= 6.4. Before the fix, Qt 6.4 was treated as Qt 6.3 and used the old TextBrightnessThreshold key"
14+
"symbols_count": 8,
15+
"cost_usd": 1.6713492500000002,
16+
"elapsed_sec": 361.8,
17+
"exploration_notes": "The bug is in darkmode.py. Before the fix (commit 50efac08f), there was no Variant.qt_64 enum value. The _variant() function mapped Qt 6.4 to Variant.qt_63 (since 6.4 >= 6.3). The Qt 6.3 definition uses 'TextBrightnessThreshold' as the Chromium key for the threshold.text setting. However, Qt 6.4's Chromium (99.0.4785.0) renamed this to 'ForegroundBrightnessThreshold'. Since the old key name is not recognized by Qt 6.4's Chromium, the threshold.text setting was silently ignored. The fix adds a ne"
1818
}
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
{
2+
"files": [
3+
"qutebrowser/utils/urlutils.py",
4+
"tests/unit/utils/test_urlutils.py",
5+
"qutebrowser/browser/navigate.py"
6+
],
7+
"symbols": [
8+
{
9+
"file": "qutebrowser/utils/urlutils.py",
10+
"symbol": "incdec_number",
11+
"repo": null
12+
},
13+
{
14+
"file": "qutebrowser/utils/urlutils.py",
15+
"symbol": "_get_incdec_value",
16+
"repo": null
17+
},
18+
{
19+
"file": "qutebrowser/utils/urlutils.py",
20+
"symbol": "_URL_SEGMENTS",
21+
"repo": null
22+
},
23+
{
24+
"file": "qutebrowser/utils/urlutils.py",
25+
"symbol": "IncDecError",
26+
"repo": null
27+
}
28+
]
29+
}

benchmarks/csb_sdlc_debug/qutebrowser-url-regression-prove-001/tests/ground_truth_meta.json

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,12 @@
77
"curator_agent_version": "2.0",
88
"model": "claude-opus-4-6",
99
"backend": "hybrid",
10-
"timestamp": "2026-03-03T18:06:21Z",
10+
"timestamp": "2026-03-06T16:00:54Z",
1111
"files_count": 3,
1212
"edit_files_count": 0,
1313
"chunks_count": 0,
1414
"symbols_count": 4,
15-
"cost_usd": 2.5924995,
16-
"elapsed_sec": 592.7,
17-
"exploration_notes": "The root cause is in urlutils.py lines 559-624. The _URL_SEGMENTS table uses QUrl.PrettyDecoded for getters and QUrl.StrictMode for setters. PrettyDecoded decodes percent-encoded characters that don't need encoding in their context (e.g., %20\u2192space). When the modified string is written back via StrictMode, the decoded characters (like literal spaces) make the URL invalid. The regression test at /workspace/regression_test.py demonstrates: (1) %20 in path/query causes URLs to become invalid after "
15+
"cost_usd": 0.97326375,
16+
"elapsed_sec": 325.9,
17+
"exploration_notes": "The bugs are in qutebrowser/utils/urlutils.py. The regex at line 618 (r'(.*\\D|^)(?<!%)(?<!%.)(0*)(\\d+)(.*)') has an over-aggressive lookbehind that blocks legitimate digits after percent-encoded sequences (e.g., '5' in '%235' which is '#5'). Additionally, the _URL_SEGMENTS getters at lines 559-580 use QUrl.PrettyDecoded which decodes certain percent-encoded chars (like %3A\u2192':'), and the setters use QUrl.StrictMode which doesn't re-encode them, causing encoding loss. The regression test at /works"
1818
}
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
{
2+
"files": [
3+
"src/include/primer/hyperloglog.h",
4+
"src/primer/hyperloglog.cpp",
5+
"src/include/primer/hyperloglog_presto.h",
6+
"src/primer/hyperloglog_presto.cpp",
7+
"src/include/common/util/hash_util.h",
8+
"test/primer/hyperloglog_test.cpp"
9+
],
10+
"symbols": [
11+
{
12+
"file": "src/include/primer/hyperloglog.h",
13+
"symbol": "HyperLogLog",
14+
"repo": null
15+
},
16+
{
17+
"file": "src/include/primer/hyperloglog.h",
18+
"symbol": "AddElem",
19+
"repo": null
20+
},
21+
{
22+
"file": "src/include/primer/hyperloglog.h",
23+
"symbol": "ComputeCardinality",
24+
"repo": null
25+
},
26+
{
27+
"file": "src/include/primer/hyperloglog.h",
28+
"symbol": "ComputeBinary",
29+
"repo": null
30+
},
31+
{
32+
"file": "src/include/primer/hyperloglog.h",
33+
"symbol": "PositionOfLeftmostOne",
34+
"repo": null
35+
},
36+
{
37+
"file": "src/include/primer/hyperloglog.h",
38+
"symbol": "CalculateHash",
39+
"repo": null
40+
},
41+
{
42+
"file": "src/include/primer/hyperloglog_presto.h",
43+
"symbol": "HyperLogLogPresto",
44+
"repo": null
45+
},
46+
{
47+
"file": "src/include/primer/hyperloglog_presto.h",
48+
"symbol": "GetDenseBucket",
49+
"repo": null
50+
},
51+
{
52+
"file": "src/include/primer/hyperloglog_presto.h",
53+
"symbol": "GetOverflowBucketofIndex",
54+
"repo": null
55+
},
56+
{
57+
"file": "src/include/common/util/hash_util.h",
58+
"symbol": "HashBytes",
59+
"repo": null
60+
},
61+
{
62+
"file": "src/include/common/util/hash_util.h",
63+
"symbol": "HashValue",
64+
"repo": null
65+
}
66+
]
67+
}
Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,18 @@
11
{
2-
"model": "manual",
3-
"backend": "instruction-extraction",
4-
"prompt_version": "manual",
5-
"cost_usd": 0.0,
6-
"elapsed_sec": 0,
7-
"timestamp": "2026-03-03T22:22:15Z",
8-
"tool_calls": 0,
9-
"generator": "manual_instruction_extraction"
2+
"has_ground_truth": true,
3+
"has_chunk_ground_truth": false,
4+
"ground_truth_source": "curator_agent",
5+
"ground_truth_confidence": "medium",
6+
"task_name": "bustub-hyperloglog-impl-001",
7+
"curator_agent_version": "2.0",
8+
"model": "claude-opus-4-6",
9+
"backend": "hybrid",
10+
"timestamp": "2026-03-06T14:25:14Z",
11+
"files_count": 6,
12+
"edit_files_count": 0,
13+
"chunks_count": 0,
14+
"symbols_count": 11,
15+
"cost_usd": 1.1280160000000001,
16+
"elapsed_sec": 442.5,
17+
"exploration_notes": "The task requires implementing HyperLogLog cardinality estimation across 4 files plus a fix to hash_util.h. The test file reveals two variants:\n\n**Basic HyperLogLog** (hyperloglog.h/.cpp): Uses `n_bits` top bits of a 64-bit hash for bucket indexing (m=2^n_bits buckets). `ComputeBinary` converts hash to bitset<64>. `PositionOfLeftmostOne` finds the 1-indexed position of the highest set bit scanning from MSB (bit 63). `AddElem` hashes the value, extracts bucket index from top n_bits, zeros those b"
1018
}
Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
{
2+
"files": [
3+
"tensorrt_llm/quantization/mode.py",
4+
"cpp/include/tensorrt_llm/common/quantization.h",
5+
"tensorrt_llm/quantization/utils/fp4_utils.py",
6+
"tensorrt_llm/_torch/model_config.py",
7+
"cpp/tensorrt_llm/thop/fp4Gemm.cpp",
8+
"cpp/tensorrt_llm/thop/moeOp.cpp",
9+
"cpp/tensorrt_llm/kernels/cutlass_kernels/include/fp4_gemm.h",
10+
"cpp/tensorrt_llm/kernels/internal_cutlass_kernels/include/fp4_gemm.h",
11+
"cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/fp4_gemm_template.h",
12+
"cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/fp4_gemm_fp32.cu",
13+
"cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/fp4_gemm_fp16.cu",
14+
"cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/fp4_gemm_bf16.cu",
15+
"tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py",
16+
"tensorrt_llm/_torch/modules/fused_moe/fused_moe_trtllm_gen.py",
17+
"tensorrt_llm/_torch/modules/fused_moe/fused_moe_triton.py",
18+
"tensorrt_llm/_torch/modules/fused_moe/quantization.py",
19+
"tensorrt_llm/_torch/modules/linear.py",
20+
"tensorrt_llm/_torch/modules/triton_linear.py",
21+
"tensorrt_llm/_torch/custom_ops/torch_custom_ops.py",
22+
"tests/unittest/_torch/modules/test_fused_moe.py",
23+
"tests/unittest/_torch/modules/test_triton_linear.py",
24+
"tests/unittest/_torch/modules/moe/test_moe_backend.py",
25+
"tests/unittest/_torch/modules/moe/quantize_utils.py"
26+
],
27+
"symbols": [
28+
{
29+
"file": "tensorrt_llm/quantization/mode.py",
30+
"symbol": "QuantAlgo",
31+
"repo": null
32+
},
33+
{
34+
"file": "tensorrt_llm/quantization/mode.py",
35+
"symbol": "QuantMode",
36+
"repo": null
37+
},
38+
{
39+
"file": "cpp/include/tensorrt_llm/common/quantization.h",
40+
"symbol": "QuantMode",
41+
"repo": null
42+
},
43+
{
44+
"file": "cpp/tensorrt_llm/kernels/cutlass_kernels/include/fp4_gemm.h",
45+
"symbol": "FP4GemmType",
46+
"repo": null
47+
},
48+
{
49+
"file": "cpp/tensorrt_llm/kernels/internal_cutlass_kernels/include/fp4_gemm.h",
50+
"symbol": "FP4GemmType",
51+
"repo": null
52+
},
53+
{
54+
"file": "tensorrt_llm/quantization/utils/fp4_utils.py",
55+
"symbol": "FP4GemmType",
56+
"repo": null
57+
},
58+
{
59+
"file": "tensorrt_llm/_torch/model_config.py",
60+
"symbol": "ModelConfig.get_mxfp4_quant_algo",
61+
"repo": null
62+
},
63+
{
64+
"file": "tensorrt_llm/_torch/model_config.py",
65+
"symbol": "ModelConfig.override_quant_algo",
66+
"repo": null
67+
},
68+
{
69+
"file": "tensorrt_llm/_torch/modules/fused_moe/quantization.py",
70+
"symbol": "W4A8MXFP4FP8CutlassFusedMoEMethod",
71+
"repo": null
72+
},
73+
{
74+
"file": "tensorrt_llm/_torch/modules/fused_moe/quantization.py",
75+
"symbol": "W4A8MXFP4MXFP8CutlassFusedMoEMethod",
76+
"repo": null
77+
},
78+
{
79+
"file": "tensorrt_llm/_torch/modules/fused_moe/quantization.py",
80+
"symbol": "W4A8MXFP4FP8TRTLLMGenFusedMoEMethod",
81+
"repo": null
82+
},
83+
{
84+
"file": "tensorrt_llm/_torch/modules/fused_moe/quantization.py",
85+
"symbol": "W4A8MXFP4MXFP8TRTLLMGenFusedMoEMethod",
86+
"repo": null
87+
},
88+
{
89+
"file": "tensorrt_llm/_torch/modules/fused_moe/quantization.py",
90+
"symbol": "FusedMoEQuantScalesW4A8MXFP4FP8",
91+
"repo": null
92+
},
93+
{
94+
"file": "tensorrt_llm/_torch/modules/fused_moe/quantization.py",
95+
"symbol": "FusedMoEQuantScalesW4A8MXFP4MXFP8",
96+
"repo": null
97+
}
98+
]
99+
}
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
{
2+
"has_ground_truth": true,
3+
"has_chunk_ground_truth": false,
4+
"ground_truth_source": "curator_agent",
5+
"ground_truth_confidence": "medium",
6+
"task_name": "tensorrt-mxfp4-quant-feat-001",
7+
"curator_agent_version": "2.0",
8+
"model": "claude-opus-4-6",
9+
"backend": "hybrid",
10+
"timestamp": "2026-03-06T14:19:49Z",
11+
"files_count": 23,
12+
"edit_files_count": 0,
13+
"chunks_count": 0,
14+
"symbols_count": 14,
15+
"cost_usd": 1.0432487499999998,
16+
"elapsed_sec": 115.5,
17+
"exploration_notes": "The W4A8_MXFP4_INT8 mode needs to be added following the pattern of W4A8_MXFP4_FP8 and W4A8_MXFP4_MXFP8. Key changes span:\n\n**Python enums & mode mapping**: `tensorrt_llm/quantization/mode.py` (QuantAlgo enum + QuantMode IntFlag + from_quant_algo/from_description). The Python FP4GemmType in `tensorrt_llm/quantization/utils/fp4_utils.py` may need a new variant.\n\n**C++ enums & mode mapping**: `cpp/include/tensorrt_llm/common/quantization.h` (QuantMode bit flag + hasW4a8Mxfp4Int8 + fromDescription "
18+
}
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
{
2+
"files": [
3+
"apps/web/src/Unread.ts",
4+
"apps/web/test/unit-tests/Unread-test.ts",
5+
"apps/web/src/RoomNotifs.ts",
6+
"apps/web/src/shouldHideEvent.ts",
7+
"apps/web/src/events/EventTileFactory.tsx"
8+
],
9+
"symbols": [
10+
{
11+
"file": "apps/web/src/Unread.ts",
12+
"symbol": "doesTimelineHaveUnreadMessages",
13+
"repo": null
14+
},
15+
{
16+
"file": "apps/web/src/Unread.ts",
17+
"symbol": "doesRoomHaveUnreadMessages",
18+
"repo": null
19+
},
20+
{
21+
"file": "apps/web/src/Unread.ts",
22+
"symbol": "doesRoomHaveUnreadThreads",
23+
"repo": null
24+
},
25+
{
26+
"file": "apps/web/src/Unread.ts",
27+
"symbol": "doesRoomOrThreadHaveUnreadMessages",
28+
"repo": null
29+
},
30+
{
31+
"file": "apps/web/src/Unread.ts",
32+
"symbol": "eventTriggersUnreadCount",
33+
"repo": null
34+
},
35+
{
36+
"file": "apps/web/src/Unread.ts",
37+
"symbol": "isRelevantEvent",
38+
"repo": null
39+
},
40+
{
41+
"file": "apps/web/src/Unread.ts",
42+
"symbol": "findLatestRelevantEvent",
43+
"repo": null
44+
},
45+
{
46+
"file": "apps/web/src/RoomNotifs.ts",
47+
"symbol": "determineUnreadState",
48+
"repo": null
49+
}
50+
]
51+
}

0 commit comments

Comments
 (0)