File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change 6060 env :
6161 GITHUB_TOKEN : ${{ secrets.GITHUB_TOKEN }}
6262 run : npm run agent:copilot
63+ continue-on-error : true
6364
6465 - name : Upload Copilot eval results
6566 if : always()
9192 env :
9293 GEMINI_API_KEY : ${{ secrets.GEMINI_API_KEY }}
9394 run : npm run agent:gemini
95+ continue-on-error : true
9496
9597 - name : Upload Gemini eval results
9698 if : always()
Original file line number Diff line number Diff line change 1313 "command" : " gemini" ,
1414 "installCommand" : " npm install -g @google/gemini-cli" ,
1515 "promptArgs" : [" -p" ],
16- "autoApproveArgs" : [" --sandbox " ],
16+ "autoApproveArgs" : [" --yolo " ],
1717 "envAuth" : " GEMINI_API_KEY" ,
1818 "description" : " Google Gemini CLI (requires GEMINI_API_KEY)"
1919 }
Original file line number Diff line number Diff line change @@ -147,7 +147,7 @@ run_agent_task() {
147147 fi
148148 CMD_ARGS+=(" $FULL_PROMPT " )
149149
150- # Add auto-approve args (e.g., --yes, --sandbox )
150+ # Add auto-approve args (e.g., --yes for copilot , --yolo for gemini )
151151 if [ -n " $AGENT_APPROVE_ARGS " ]; then
152152 read -ra _APPROVE_PARTS <<< " $AGENT_APPROVE_ARGS"
153153 CMD_ARGS+=(" ${_APPROVE_PARTS[@]} " )
@@ -256,6 +256,12 @@ run_task() {
256256}
257257EOF
258258
259+ # In agent mode, a low score is a measurement result, NOT a script error.
260+ # Only propagate the grader exit code in validate mode (where failure means
261+ # the reference solution itself is broken).
262+ if [ " $MODE " = " agent" ]; then
263+ return 0
264+ fi
259265 return " $GRADER_EXIT "
260266}
261267
@@ -345,7 +351,8 @@ run_task_trials() {
345351}
346352EOF
347353
348- [ " $PASS_AT_K " -eq 1 ] && return 0 || return 1
354+ # Agent eval scores are measurements, not pass/fail gates — always succeed.
355+ return 0
349356}
350357
351358# --- main ------------------------------------------------------------------ #
You can’t perform that action at this time.
0 commit comments