Skip to content

Commit 695c0d5

Browse files
sjarmak and claude committed
feat: output OpenHands runs in official directory structure
- openhands_2config.sh now writes results to the official layout:
  {jobs_base}/openhands/{csb_sdlc|csb_org}/{model_dir}/{suite}/{config}/
  instead of the flat {jobs_base}/{config}/timestamp/task/
- Config names are mapped to their official equivalents:
  baseline-local-direct -> baseline, mcp-remote-direct -> sourcegraph_full
- Added MODEL_DIR (dotted format, e.g. sonnet-4.6) for directory naming
- Store benchmark/suite per task for correct placement
- validate_task_run.py discover_task_dirs() now falls back to a recursive
  os.walk when the 2-level glob finds nothing (supports the official layout)

Also promoted 25 MCP tasks from staging run 142143 to official.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 63484ac commit 695c0d5

File tree

2 files changed

+60
-10
lines changed

2 files changed

+60
-10
lines changed

configs/openhands_2config.sh

Lines changed: 36 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -151,12 +151,15 @@ if [ ${#TASK_ROWS[@]} -eq 0 ]; then
151151
fi
152152

153153
# Index the selected task rows.
#   TASK_IDS          - task ids, in the order the rows were given
#   TASK_PATH_BY_ID   - task id -> second tab-separated field of its row
#   TASK_SUITE_BY_ID  - task id -> third tab-separated field of its row
declare -A TASK_PATH_BY_ID TASK_SUITE_BY_ID
TASK_IDS=()
for row in "${TASK_ROWS[@]}"; do
  # Each row is split on tabs by cut: field 1 = id, 2 = path, 3 = benchmark.
  task_id=$(cut -f1 <<<"$row")
  task_path=$(cut -f2 <<<"$row")
  benchmark=$(cut -f3 <<<"$row")
  TASK_PATH_BY_ID["$task_id"]="$task_path"
  TASK_SUITE_BY_ID["$task_id"]="$benchmark"
  TASK_IDS+=("$task_id")
done
161164

162165
if [ -z "${PARALLEL_JOBS:-}" ] || [ "$PARALLEL_JOBS" -lt 1 ] 2>/dev/null; then
@@ -180,6 +183,17 @@ case "$_model_lower" in
180183
*) MODEL_SHORT=$(echo "$_model_lower" | tr -d '-' | tr -d '_' | cut -c1-12) ;;
181184
esac
182185

186+
# Pick MODEL_DIR, the model directory name used in the official results
# layout. Known model families get a dotted version (e.g. sonnet-4.6);
# anything else reuses MODEL_SHORT. Branches are tested in the order listed,
# so the first matching family wins.
if [[ "$_model_lower" == *sonnet-4-6* || "$_model_lower" == *sonnet46* ]]; then
  MODEL_DIR="sonnet-4.6"
elif [[ "$_model_lower" == *sonnet-4-5* || "$_model_lower" == *sonnet45* ]]; then
  MODEL_DIR="sonnet-4.5"
elif [[ "$_model_lower" == *opus-4-6* || "$_model_lower" == *opus46* ]]; then
  MODEL_DIR="opus-4.6"
elif [[ "$_model_lower" == *haiku-4-5* || "$_model_lower" == *haiku45* ]]; then
  MODEL_DIR="haiku-4.5"
elif [[ "$_model_lower" == *gpt-5* || "$_model_lower" == *gpt5* ]]; then
  MODEL_DIR="gpt-5"
elif [[ "$_model_lower" == *gpt-4o* || "$_model_lower" == *gpt4o* ]]; then
  MODEL_DIR="gpt-4o"
else
  MODEL_DIR="$MODEL_SHORT"
fi
196+
183197
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
184198
JOBS_BASE="runs/${CATEGORY}/openhands_${MODEL_SHORT}_${TIMESTAMP}"
185199
mkdir -p "$JOBS_BASE"
@@ -208,9 +222,27 @@ _openhands_run_single() {
208222
local config=${3:-baseline-local-direct}
209223
local mcp_type=${4:-none}
210224
local jobs_base=${5:-$JOBS_BASE}
211-
local jobs_subdir="${jobs_base}/${config}"
212225
local task_path="${TASK_PATH_BY_ID[$task_id]}"
213226

227+
# Map harness config name to official config dir name
228+
local official_config
229+
case "$config" in
230+
baseline-local-direct) official_config="baseline" ;;
231+
mcp-remote-direct) official_config="sourcegraph_full" ;;
232+
*) official_config="$config" ;;
233+
esac
234+
235+
# Build official-structure jobs dir:
236+
# {jobs_base}/openhands/{csb_sdlc|csb_org}/{model_dir}/{suite}/{official_config}
237+
local suite="${TASK_SUITE_BY_ID[$task_id]}"
238+
local top_level
239+
if [[ "$suite" == csb_sdlc_* ]]; then
240+
top_level="csb_sdlc"
241+
else
242+
top_level="csb_org"
243+
fi
244+
local jobs_subdir="${jobs_base}/openhands/${top_level}/${MODEL_DIR}/${suite}/${official_config}"
245+
214246
# Extract ANTHROPIC_API_KEY from this account's OAuth credentials.
215247
# run_tasks_parallel sets HOME=$_task_home for account rotation.
216248
if [ "$USE_SUBSCRIPTION" = "true" ]; then
@@ -278,23 +310,20 @@ run_mode() {
278310
local mode=$1
279311
local mcp_type=$2
280312

281-
jobs_subdir="${JOBS_BASE}/${mode}"
282-
mkdir -p "$jobs_subdir"
283-
284313
_mode_dispatch() {
285314
_openhands_run_single "$1" "$2" "$mode" "$mcp_type" "$JOBS_BASE"
286315
}
287316

288317
run_tasks_parallel TASK_IDS _mode_dispatch || true
289-
validate_and_report "$jobs_subdir" "$mode"
318+
validate_and_report "$JOBS_BASE" "$mode"
290319
}
291320

292321
if [ "$PAIRED_MODE" = true ] && [ "$RUN_BASELINE" = true ] && [ "$RUN_FULL" = true ]; then
293322
# Run baseline + MCP simultaneously per task (interleaved, not sequential)
294323
export FULL_CONFIG="mcp-remote-direct"
295324
run_paired_configs TASK_IDS _openhands_run_single "$JOBS_BASE"
296-
validate_and_report "${JOBS_BASE}/baseline-local-direct" "baseline-local-direct"
297-
validate_and_report "${JOBS_BASE}/mcp-remote-direct" "mcp-remote-direct"
325+
validate_and_report "$JOBS_BASE" "baseline"
326+
validate_and_report "$JOBS_BASE" "sourcegraph_full"
298327
else
299328
# Sequential mode (--baseline-only, --full-only, or --sequential)
300329
if [ "$RUN_BASELINE" = true ]; then

scripts/validate_task_run.py

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -191,15 +191,36 @@ def flag(rule: str, severity: str, message: str):
191191

192192

193193
def _is_run_level_result(result_path: str) -> bool:
    """Return True when the result.json at result_path is run-level metadata.

    A file counts as run-level metadata when its top-level JSON value is an
    object containing the key "n_total_trials". Files that cannot be read or
    decoded are reported as False, so the caller keeps treating their
    directory as a task directory.
    """
    try:
        # Context manager closes the handle; JSON is read as UTF-8.
        with open(result_path, encoding="utf-8") as fh:
            data = json.load(fh)
    except (OSError, ValueError):
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        return False
    # Guard against scalar top-level values (null, numbers, booleans), for
    # which a bare `in` test would raise TypeError.
    return isinstance(data, dict) and "n_total_trials" in data


def discover_task_dirs(jobs_dir: str) -> List[str]:
    """Find task directories containing result.json or task_metrics.json.

    Tries the legacy 2-level glob (jobs_dir/*/*/) first, then falls back
    to recursive discovery for the official directory layout.

    Args:
        jobs_dir: Root directory to search.

    Returns:
        Sorted list of directory paths, each ending with a path separator.
        Empty when nothing matches (including when jobs_dir does not exist).
    """
    # Legacy: jobs_dir/timestamp/task_dir/
    pattern = os.path.join(jobs_dir, "*", "*", "")
    found = [
        d for d in sorted(glob.glob(pattern))
        if os.path.isfile(os.path.join(d, "task_metrics.json"))
        or os.path.isfile(os.path.join(d, "result.json"))
    ]
    if found:
        return found

    # Recursive: walk the tree for any dir containing a marker file.
    found = []
    for root, _subdirs, files in os.walk(jobs_dir):
        if "result.json" not in files and "task_metrics.json" not in files:
            continue
        result_path = os.path.join(root, "result.json")
        if os.path.isfile(result_path) and _is_run_level_result(result_path):
            continue  # run-level metadata, not a task
        # Trailing separator matches the shape of the legacy glob results.
        found.append(root + os.sep)
    return sorted(found)
203224

204225

205226
def print_summary(all_flags: List[Flag]):

0 commit comments

Comments
 (0)