|
1 | 1 | { |
2 | | - "generated": "2026-03-05", |
| 2 | + "generated": "2026-03-09", |
3 | 3 | "stats": { |
4 | | - "total": 198, |
5 | | - "data": 50, |
| 4 | + "total": 201, |
| 5 | + "data": 51, |
6 | 6 | "methods": 128, |
7 | | - "analysis": 21 |
| 7 | + "analysis": 23 |
8 | 8 | }, |
9 | 9 | "papers": [ |
| 10 | + { |
| 11 | + "id": 201, |
| 12 | + "short_name": "SWE-Atlas", |
| 13 | + "title": "SWE-Atlas", |
| 14 | + "authors": "Scale AI", |
| 15 | + "venue": "-", |
| 16 | + "month": "2026-03", |
| 17 | + "category": "evaluation_datasets", |
| 18 | + "links": { |
| 19 | + "website": "https://labs.scale.com/leaderboard/sweatlas-qna" |
| 20 | + } |
| 21 | + }, |
| 22 | + { |
| 23 | + "id": 200, |
| 24 | + "short_name": "SWE-CI", |
| 25 | + "title": "SWE-CI: Evaluating Agent Capabilities in Maintaining Codebases via Continuous Integration", |
| 26 | + "authors": "Jialong Chen, Xander Xu, Hu Wei, Chuan Chen, Bing Zhao", |
| 27 | + "venue": "arXiv preprint arXiv:2603.03823", |
| 28 | + "month": "2026-03", |
| 29 | + "category": "evaluation_datasets", |
| 30 | + "links": { |
| 31 | + "arxiv": "https://arxiv.org/abs/2603.03823", |
| 32 | + "github": "https://github.com/SKYLENAGE-AI/SWE-CI", |
| 33 | + "huggingface": "https://huggingface.co/datasets/skylenage/SWE-CI" |
| 34 | + } |
| 35 | + }, |
10 | 36 | { |
11 | 37 | "id": 196, |
12 | 38 | "short_name": "BeyondSWE", |
|
34 | 60 | "arxiv": "https://arxiv.org/abs/2603.01327" |
35 | 61 | } |
36 | 62 | }, |
| 63 | + { |
| 64 | + "id": 202, |
| 65 | + "short_name": "ContextBench", |
| 66 | + "title": "ContextBench: A Benchmark for Context Retrieval in Coding Agents", |
| 67 | + "authors": "Han Li, Letian Zhu, Bohan Zhang, Rili Feng, Jiaming Wang, Yue Pan, Earl T. Barr, Federica Sarro, Zhaoyang Chu, He Ye", |
| 68 | + "venue": "arXiv preprint arXiv:2602.05892", |
| 69 | + "month": "2026-02", |
| 70 | + "category": "methods_analysis", |
| 71 | + "links": { |
| 72 | + "arxiv": "https://arxiv.org/abs/2602.05892", |
| 73 | + "github": "https://github.com/EuniAI/ContextBench", |
| 74 | + "huggingface": "https://huggingface.co/datasets/Contextbench/ContextBench", |
| 75 | + "website": "https://contextbench.github.io/" |
| 76 | + } |
| 77 | + }, |
37 | 78 | { |
38 | 79 | "id": 199, |
39 | 80 | "short_name": "SWE-Bench Mobile", |
|
493 | 534 | "arxiv": "https://arxiv.org/abs/2512.10218" |
494 | 535 | } |
495 | 536 | }, |
| 537 | + { |
| 538 | + "id": 203, |
| 539 | + "short_name": "Test Overfitting on SWE-bench", |
| 540 | + "title": "Investigating Test Overfitting on SWE-bench", |
| 541 | + "authors": "Toufique Ahmed, Jatin Ganhotra, Avraham Shinnar, Martin Hirzel", |
| 542 | + "venue": "arXiv preprint arXiv:2511.16858", |
| 543 | + "month": "2025-11", |
| 544 | + "category": "data_analysis", |
| 545 | + "links": { |
| 546 | + "arxiv": "https://arxiv.org/abs/2511.16858" |
| 547 | + } |
| 548 | + }, |
496 | 549 | { |
497 | 550 | "id": 173, |
498 | 551 | "short_name": "InfCode", |
|
1547 | 1600 | "arxiv": "https://arxiv.org/abs/2505.23932" |
1548 | 1601 | } |
1549 | 1602 | }, |
1550 | | - { |
1551 | | - "id": 11, |
1552 | | - "short_name": "RepoLaunch", |
1553 | | - "title": "SWE-bench Goes Live!", |
1554 | | - "authors": "Linghao Zhang, Shilin He, Chaoyun Zhang, Yu Kang, Bowen Li, Chengxing Xie, Junhao Wang et al.", |
1555 | | - "venue": "The Thirty-ninth Annual Conference on Neural Information Processing Systems Datasets and Benchmarks Track 2025", |
1556 | | - "month": "2025-05", |
1557 | | - "category": "data_collection", |
1558 | | - "links": { |
1559 | | - "arxiv": "https://arxiv.org/abs/2505.23419v2", |
1560 | | - "openreview": "https://openreview.net/forum?id=OGWkr7gXka" |
1561 | | - } |
1562 | | - }, |
1563 | 1603 | { |
1564 | 1604 | "id": 10, |
1565 | 1605 | "short_name": "SWE-rebench", |
|
0 commit comments