|
1 | 1 | # Data Analysis |
2 | | -# Auto-generated from papers_raw/taxonomy.tex and papers_raw/main.bib |
| 2 | +# Auto-generated from taxonomy.tex and BibTeX file |
3 | 3 |
|
4 | | -- short_name: "SWE-bench Verified" |
5 | | - title: "Introducing SWE-bench Verified | OpenAI" |
6 | | - authors: "OpenAI" |
7 | | - venue: "arXiv 2024" |
8 | | - year: "2024" |
| 4 | +- short_name: SWE-bench Verified |
| 5 | + title: Introducing SWE-bench Verified | OpenAI |
| 6 | + authors: OpenAI |
| 7 | + year: '2024' |
| 8 | + venue: '2024' |
| 9 | +- short_name: Patch Correctness |
| 10 | + title: Are "Solved Issues" in SWE-bench Really Solved Correctly? An Empirical Study |
| 11 | + authors: You Wang, Michael Pradel, Zhongxin Liu |
| 12 | + year: '2025' |
| 13 | + venue: arXiv preprint arXiv:2503.15223 2025 |
9 | 14 | links: |
10 | | - arxiv: "https://openai.com/index/introducing-swe-bench-verified/" |
11 | | - |
12 | | -- short_name: "SWE-Bench+" |
13 | | - title: "SWE-Bench+: Enhanced Coding Benchmark for LLMs" |
14 | | - authors: "Reem Aleithan, Haoran Xue, Mohammad Mahdi Mohajer, Elijah Nnorom, Gias Uddin, Song Wang" |
15 | | - venue: "arXiv 2024" |
16 | | - year: "2024" |
| 15 | + arxiv: https://arxiv.org/abs/2503.15223 |
| 16 | +- short_name: UTBoost |
| 17 | + title: 'UTBoost: Rigorous Evaluation of Coding Agents on SWE-Bench' |
| 18 | + authors: Boxi Yu, Yuxuan Zhu, Pinjia He, Daniel Kang |
| 19 | + year: '2025' |
| 20 | + venue: arXiv preprint arXiv:2506.09289 2025 |
17 | 21 | links: |
18 | | - arxiv: "https://arxiv.org/abs/2410.06992" |
19 | | - |
20 | | -- short_name: "Patch Correctness" |
21 | | - title: "Are \"Solved Issues\" in SWE-bench Really Solved Correctly? An Empirical Study" |
22 | | - authors: "You Wang, Michael Pradel, Zhongxin Liu" |
23 | | - venue: "arXiv 2025" |
24 | | - year: "2025" |
| 22 | + arxiv: https://arxiv.org/abs/2506.09289 |
| 23 | +- short_name: Trustworthiness |
| 24 | + title: Is Your Automated Software Engineer Trustworthy? |
| 25 | + authors: Noble Saji Mathews, Meiyappan Nagappan |
| 26 | + year: '2025' |
| 27 | + venue: arXiv preprint arXiv:2506.17812 2025 |
25 | 28 | links: |
26 | | - arxiv: "http://arxiv.org/abs/2503.15223" |
27 | | - |
28 | | -- short_name: "UTBoost" |
29 | | - title: "UTBoost: Rigorous Evaluation of Coding Agents on SWE-Bench" |
30 | | - authors: "Boxi Yu, Yuxuan Zhu, Pinjia He, Daniel Kang" |
31 | | - venue: "arXiv 2025" |
32 | | - year: "2025" |
| 29 | + arxiv: https://arxiv.org/abs/2506.17812 |
| 30 | +- short_name: Rigorous agentic benchmarks |
| 31 | + title: Establishing Best Practices for Building Rigorous Agentic Benchmarks |
| 32 | + authors: Yuxuan Zhu, Tengjun Jin, Yada Pruksachatkun, Andy Zhang, Shu Liu, Sasha |
| 33 | + Cui, Sayash Kapoor et al. |
| 34 | + year: '2025' |
| 35 | + venue: arXiv preprint arXiv:2507.02825 2025 |
33 | 36 | links: |
34 | | - arxiv: "https://arxiv.org/abs/2506.09289" |
35 | | - |
36 | | -- short_name: "Trustworthiness" |
37 | | - title: "Is Your Automated Software Engineer Trustworthy?" |
38 | | - authors: "Noble Saji Mathews, Meiyappan Nagappan" |
39 | | - venue: "arXiv 2025" |
40 | | - year: "2025" |
| 37 | + arxiv: https://arxiv.org/abs/2507.02825 |
| 38 | +- short_name: The SWE-Bench Illusion |
| 39 | + title: 'The SWE-Bench Illusion: When State-of-the-Art LLMs Remember Instead of Reason' |
| 40 | + authors: Shanchao Liang, Spandan Garg, Roshanak Zilouchian Moghaddam |
| 41 | + year: '2025' |
| 42 | + venue: arXiv preprint arXiv:2506.12286 2025 |
41 | 43 | links: |
42 | | - arxiv: "https://arxiv.org/abs/2506.17812" |
43 | | - |
44 | | -- short_name: "Rigorous agentic benchmarks" |
45 | | - title: "Establishing Best Practices for Building Rigorous Agentic Benchmarks" |
46 | | - authors: "Yuxuan Zhu, Tengjun Jin, Yada Pruksachatkun, Andy Zhang, Shu Liu, Sasha Cui, Sayash Kapoor, Shayne Longpre, Kevin Meng, Rebecca Weiss, Fazl Barez, Rahul Gupta, Jwala Dhamala, Jacob Merizian, Mario Giulianelli, Harry Coppock, Cozmin Ududec, Jasjeet Sekhon, Jacob Steinhardt, Antony Kellermann, Sarah Schwettmann, Matei Zaharia, Ion Stoica, Percy Liang, Daniel Kang" |
47 | | - venue: "arXiv 2025" |
48 | | - year: "2025" |
| 44 | + arxiv: https://arxiv.org/abs/2506.12286 |
| 45 | +- short_name: Revisiting SWE-Bench |
| 46 | + title: 'Revisiting SWE-Bench: On the Importance of Data Quality for LLM-Based Code |
| 47 | + Models' |
| 48 | + authors: Aleithan, Reem |
| 49 | + year: '2025' |
| 50 | + venue: '2025 IEEE/ACM 47th International Conference on Software Engineering: Companion |
| 51 | + Proceedings (ICSE-Companion) 2025' |
49 | 52 | links: |
50 | | - arxiv: "https://arxiv.org/abs/2507.02825" |
51 | | - |
52 | | -- short_name: "The SWE-Bench Illusion" |
53 | | - title: "The SWE-Bench Illusion: When State-of-the-Art LLMs Remember Instead of Reason" |
54 | | - authors: "Shanchao Liang, Spandan Garg, Roshanak Zilouchian Moghaddam" |
55 | | - venue: "arXiv 2025" |
56 | | - year: "2025" |
| 53 | + doi: http://dx.doi.org/10.1109/ICSE-Companion66252.2025.00075 |
| 54 | +- short_name: SPICE |
| 55 | + title: "SPICE: An Automated SWE-Bench Labeling Pipeline for Issue Clarity,\n \ |
| 56 | + \ Test Coverage, and Effort Estimation" |
| 57 | + authors: Gustavo A. Oliva, Gopi Krishnan Rajbahadur, Aaditya Bhatia, Haoxiang Zhang, |
| 58 | + Yihao Chen, Zhilong Chen, Arthur Leung et al. |
| 59 | + year: '2025' |
| 60 | + venue: ASE 2025 |
| 61 | +- short_name: Data contamination |
| 62 | + title: Does SWE-Bench-Verified Test Agent Ability or Model Memory? |
| 63 | + authors: Thanosan Prathifkumar, Noble Saji Mathews, Meiyappan Nagappan |
| 64 | + year: '2025' |
| 65 | + venue: arXiv preprint arXiv:2512.10218 2025 |
57 | 66 | links: |
58 | | - arxiv: "https://arxiv.org/abs/2506.12286" |
59 | | - |
60 | | -- short_name: "Revisiting SWE-Bench" |
61 | | - title: "Revisiting SWE-Bench: On the Importance of Data Quality for LLM-Based Code Models" |
62 | | - authors: "Reem Aleithan" |
63 | | - venue: "2025 IEEE/ACM 47th International Conference on Software Engineering: Companion Proceedings (ICSE-Companion) 2025" |
64 | | - year: "2025" |
65 | | - links: |
66 | | - |
67 | | -- short_name: "SPICE" |
68 | | - title: "SPICE: An Automated SWE-Bench Labeling Pipeline for Issue Clarity, Test Coverage, and Effort Estimation" |
69 | | - authors: "Gustavo A. Oliva, Gopi Krishnan Rajbahadur, Aaditya Bhatia, Haoxiang Zhang, Yihao Chen, Zhilong Chen, Arthur Leung, Dayi Lin, Boyuan Chen, Ahmed E. Hassan" |
70 | | - venue: "arXiv 2025" |
71 | | - year: "2025" |
72 | | - links: |
73 | | - arxiv: "https://arxiv.org/abs/2507.09108" |
74 | | - |
75 | | -- short_name: "Data contamination" |
76 | | - title: "Does SWE-Bench-Verified Test Agent Ability or Model Memory?" |
77 | | - authors: "Thanosan Prathifkumar, Noble Saji Mathews, Meiyappan Nagappan" |
78 | | - venue: "arXiv 2025" |
79 | | - year: "2025" |
80 | | - links: |
81 | | - arxiv: "https://arxiv.org/abs/2512.10218" |
82 | | - |
| 67 | + arxiv: https://arxiv.org/abs/2512.10218 |
0 commit comments