From 69372fd71d1452d8a688f53ec201ac75473a2f03 Mon Sep 17 00:00:00 2001 From: richardzhuang0412 Date: Sun, 5 Apr 2026 20:07:21 +0000 Subject: [PATCH 1/3] Migrate eval system to cluster-agnostic git-syncable architecture MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Restructure the eval infrastructure so that `git pull` on any cluster brings everything up to date. No more manual file copying between clusters. Key changes: - Move cluster-agnostic scripts from eval/MBZ/ to eval/ (listener, sbatch, check_progress, snapshot_download, precreate_snapshots) - Consolidate harbor eval configs into eval/configs/ with no hardcoded jobs_dir (injected at runtime via `--jobs-dir` CLI flag) - Remove all hardcoded Jupiter/MBZ fallback paths from sbatch — fail clearly with error messages if not launched via listener - Add eval/clusters/ YAML configs for M2, MBZ/M1, Jupiter with ${USER} parameterization for multi-user support - Add eval/lists/ for shared model priority lists - Add eval/docs/ for all documentation (onboarding, workflow, tutorials) - Move legacy v4 scripts to eval/legacy/ (frozen, M2 backward compat) - Add native vLLM data-parallel support (--dp-size flag) with proper port planning for pack-jobs and TORCHDYNAMO handling - Add ${USER}/${HOME} expansion in cluster config YAML loader - Parameterize dotenv files (m2.env, mbz.env) with $USER - Update .gitignore for eval/local/, logs, secrets - Add comprehensive cluster onboarding guide (eval/docs/CLUSTER_ONBOARDING.md) Workflow after merge: git pull source ~/secrets.env python eval/unified_eval_listener.py \ --cluster-config eval/clusters/.yaml \ --preset v2 ... 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .gitignore | 14 + database/unified_db/utils.py | 12 +- eval/baseline_model_configs.yaml | 103 +- eval/build_vllm_cmd.sh | 11 +- eval/check_progress.py | 627 +++ eval/clusters/jupiter.yaml | 41 + eval/clusters/m2.yaml | 40 + eval/clusters/mbz.yaml | 39 + eval/configs/dcagent_eval_config.yaml | 47 + .../dcagent_eval_config_no_override.yaml | 47 + eval/docs/CLUSTER_ONBOARDING.md | 344 ++ eval/docs/CLUSTER_SETUP_GUIDE.md | 294 ++ eval/docs/EVAL_WORKFLOW.md | 1431 +++++++ eval/docs/HF_REUPLOAD_GUIDE.md | 192 + eval/docs/LISTENER_TUTORIAL.md | 266 ++ eval/docs/OTAGENT2_SETUP_TUTORIAL.md | 249 ++ eval/docs/OVERLONG_EVAL_RUNBOOK.md | 135 + eval/docs/V6_MIGRATION.md | 111 + eval/jupiter/V6_MIGRATION.md | 111 + eval/jupiter/dcagent_eval_config.yaml | 2 +- .../dcagent_eval_config_no_override.yaml | 46 + eval/jupiter/snapshot_download.py | 45 +- eval/legacy/README.md | 11 + eval/legacy/unified_eval_harbor_v4.sbatch | 663 +++ eval/legacy/unified_eval_harbor_v6_mbz.sbatch | 713 ++++ eval/legacy/unified_eval_listener_v4.py | 2890 +++++++++++++ eval/lists/16x_32b_lc_baseline.txt | 8 + eval/lists/a1_models.txt | 77 + eval/lists/a1_nl2bash.txt | 1 + eval/lists/a1_retrained.txt | 4 + eval/lists/alfworld_131k.txt | 1 + .../lists/architecture_invalid_test_model.txt | 1 + eval/lists/baseline_swe.txt | 10 + eval/lists/custom_force_run.txt | 1 + eval/lists/dp_test_model.txt | 1 + eval/lists/exp_tas_qwen35.txt | 1 + eval/lists/glm46_131k.txt | 1 + eval/lists/glm47_flash.txt | 1 + eval/lists/inactive_models_latest.txt | 457 ++ eval/lists/kept_models_names.txt | 221 + eval/lists/laion_latest.txt | 28 + .../latest_sort_by_release_eval_prio.txt | 28 + eval/lists/lists/16x_32b_lc_baseline.txt | 8 + eval/lists/lists/a1_models.txt | 77 + eval/lists/lists/a1_nl2bash.txt | 1 + eval/lists/lists/a1_retrained.txt | 4 + eval/lists/lists/alfworld_131k.txt | 1 + .../lists/architecture_invalid_test_model.txt | 1 + eval/lists/lists/baseline_swe.txt | 10 + 
eval/lists/lists/bfcl_rerun_failed.txt | 22 + eval/lists/lists/bfcl_rerun_models.txt | 74 + eval/lists/lists/custom_force_run.txt | 1 + eval/lists/lists/dp_test_model.txt | 1 + eval/lists/lists/dsv2_rerun_models.txt | 230 + eval/lists/lists/exp_tas_qwen35.txt | 1 + eval/lists/lists/glm46_131k.txt | 1 + eval/lists/lists/glm47_flash.txt | 1 + eval/lists/lists/inactive_models_latest.txt | 457 ++ eval/lists/lists/kept_models_names.txt | 221 + eval/lists/lists/laion_latest.txt | 28 + .../latest_sort_by_release_eval_prio.txt | 28 + eval/lists/lists/missing_dev_set_v2.txt | 189 + .../missing_dev_set_v2_inactive_laion.txt | 24 + ...g_swebench_verified_random_100_folders.txt | 69 + eval/lists/lists/missing_terminal_bench_2.txt | 96 + eval/lists/lists/models_131k.txt | 35 + eval/lists/lists/models_32b.txt | 31 + eval/lists/lists/nemotron_nano.txt | 1 + eval/lists/lists/no_eval_models_latest.txt | 30 + eval/lists/lists/pipeline_exp_prio.txt | 19 + eval/lists/lists/priority_131k_test.txt | 1 + eval/lists/lists/priority_batch2.txt | 3 + eval/lists/lists/priority_batch_evalorg.txt | 3 + eval/lists/lists/priority_obiwan.txt | 1 + eval/lists/lists/priority_qwen35.txt | 1 + eval/lists/lists/priority_rl_test.txt | 1 + eval/lists/lists/pruned_models_names.txt | 364 ++ eval/lists/lists/pyme_v3_40.txt | 1 + eval/lists/lists/qwen35_27b.txt | 1 + eval/lists/lists/qwen35_9b.txt | 1 + eval/lists/lists/richard_base_model.txt | 1 + eval/lists/lists/richard_test_model.txt | 2 + eval/lists/lists/rope_step_batch.txt | 3 + eval/lists/lists/sera_14b.txt | 1 + eval/lists/lists/swesmith_fixthink_45.txt | 1 + eval/lists/lists/syh_32b.txt | 1 + eval/lists/lists/tb2_richard_test_model.txt | 1 + eval/lists/lists/v2_richard_test_model.txt | 1 + eval/lists/missing_dev_set_v2.txt | 189 + .../missing_dev_set_v2_inactive_laion.txt | 24 + ...g_swebench_verified_random_100_folders.txt | 69 + eval/lists/missing_terminal_bench_2.txt | 96 + eval/lists/nemotron_nano.txt | 1 + eval/lists/no_eval_models_latest.txt 
| 30 + eval/lists/pipeline_exp_prio.txt | 19 + eval/lists/priority_131k_test.txt | 1 + eval/lists/priority_batch2.txt | 3 + eval/lists/priority_batch_evalorg.txt | 3 + eval/lists/priority_obiwan.txt | 1 + eval/lists/priority_qwen35.txt | 1 + eval/lists/priority_rl_test.txt | 1 + eval/lists/pruned_models_names.txt | 364 ++ eval/lists/pyme_v3_40.txt | 1 + eval/lists/qwen35_27b.txt | 1 + eval/lists/qwen35_9b.txt | 1 + eval/lists/richard_base_model.txt | 1 + eval/lists/richard_test_model.txt | 2 + eval/lists/rope_step_batch.txt | 3 + eval/lists/sera_14b.txt | 1 + eval/lists/swesmith_fixthink_45.txt | 1 + eval/lists/syh_32b.txt | 1 + eval/lists/tb2_richard_test_model.txt | 1 + eval/lists/v2_richard_test_model.txt | 1 + eval/precreate_snapshots.py | 268 ++ eval/secret.env.template | 7 + eval/snapshot_download.py | 150 + eval/test_dp_eval.sh | 64 + eval/unified_eval_harbor.sbatch | 1014 +++++ eval/unified_eval_harbor_dp.sbatch | 992 +++++ eval/unified_eval_listener.py | 3756 +++++++++++++++++ hpc/dotenv/m2.env | 27 + hpc/dotenv/mbz.env | 29 + hpc/launch_utils.py | 2 + 123 files changed, 18443 insertions(+), 56 deletions(-) create mode 100644 eval/check_progress.py create mode 100644 eval/clusters/jupiter.yaml create mode 100644 eval/clusters/m2.yaml create mode 100644 eval/clusters/mbz.yaml create mode 100644 eval/configs/dcagent_eval_config.yaml create mode 100644 eval/configs/dcagent_eval_config_no_override.yaml create mode 100644 eval/docs/CLUSTER_ONBOARDING.md create mode 100644 eval/docs/CLUSTER_SETUP_GUIDE.md create mode 100644 eval/docs/EVAL_WORKFLOW.md create mode 100644 eval/docs/HF_REUPLOAD_GUIDE.md create mode 100644 eval/docs/LISTENER_TUTORIAL.md create mode 100644 eval/docs/OTAGENT2_SETUP_TUTORIAL.md create mode 100644 eval/docs/OVERLONG_EVAL_RUNBOOK.md create mode 100644 eval/docs/V6_MIGRATION.md create mode 100644 eval/jupiter/V6_MIGRATION.md create mode 100644 eval/jupiter/dcagent_eval_config_no_override.yaml create mode 100644 eval/legacy/README.md create 
mode 100644 eval/legacy/unified_eval_harbor_v4.sbatch create mode 100644 eval/legacy/unified_eval_harbor_v6_mbz.sbatch create mode 100644 eval/legacy/unified_eval_listener_v4.py create mode 100644 eval/lists/16x_32b_lc_baseline.txt create mode 100644 eval/lists/a1_models.txt create mode 100644 eval/lists/a1_nl2bash.txt create mode 100644 eval/lists/a1_retrained.txt create mode 100644 eval/lists/alfworld_131k.txt create mode 100644 eval/lists/architecture_invalid_test_model.txt create mode 100644 eval/lists/baseline_swe.txt create mode 100644 eval/lists/custom_force_run.txt create mode 100644 eval/lists/dp_test_model.txt create mode 100644 eval/lists/exp_tas_qwen35.txt create mode 100644 eval/lists/glm46_131k.txt create mode 100644 eval/lists/glm47_flash.txt create mode 100644 eval/lists/inactive_models_latest.txt create mode 100644 eval/lists/kept_models_names.txt create mode 100644 eval/lists/laion_latest.txt create mode 100644 eval/lists/latest_sort_by_release_eval_prio.txt create mode 100644 eval/lists/lists/16x_32b_lc_baseline.txt create mode 100644 eval/lists/lists/a1_models.txt create mode 100644 eval/lists/lists/a1_nl2bash.txt create mode 100644 eval/lists/lists/a1_retrained.txt create mode 100644 eval/lists/lists/alfworld_131k.txt create mode 100644 eval/lists/lists/architecture_invalid_test_model.txt create mode 100644 eval/lists/lists/baseline_swe.txt create mode 100644 eval/lists/lists/bfcl_rerun_failed.txt create mode 100644 eval/lists/lists/bfcl_rerun_models.txt create mode 100644 eval/lists/lists/custom_force_run.txt create mode 100644 eval/lists/lists/dp_test_model.txt create mode 100644 eval/lists/lists/dsv2_rerun_models.txt create mode 100644 eval/lists/lists/exp_tas_qwen35.txt create mode 100644 eval/lists/lists/glm46_131k.txt create mode 100644 eval/lists/lists/glm47_flash.txt create mode 100644 eval/lists/lists/inactive_models_latest.txt create mode 100644 eval/lists/lists/kept_models_names.txt create mode 100644 
eval/lists/lists/laion_latest.txt create mode 100644 eval/lists/lists/latest_sort_by_release_eval_prio.txt create mode 100644 eval/lists/lists/missing_dev_set_v2.txt create mode 100644 eval/lists/lists/missing_dev_set_v2_inactive_laion.txt create mode 100644 eval/lists/lists/missing_swebench_verified_random_100_folders.txt create mode 100644 eval/lists/lists/missing_terminal_bench_2.txt create mode 100644 eval/lists/lists/models_131k.txt create mode 100644 eval/lists/lists/models_32b.txt create mode 100644 eval/lists/lists/nemotron_nano.txt create mode 100644 eval/lists/lists/no_eval_models_latest.txt create mode 100644 eval/lists/lists/pipeline_exp_prio.txt create mode 100644 eval/lists/lists/priority_131k_test.txt create mode 100644 eval/lists/lists/priority_batch2.txt create mode 100644 eval/lists/lists/priority_batch_evalorg.txt create mode 100644 eval/lists/lists/priority_obiwan.txt create mode 100644 eval/lists/lists/priority_qwen35.txt create mode 100644 eval/lists/lists/priority_rl_test.txt create mode 100644 eval/lists/lists/pruned_models_names.txt create mode 100644 eval/lists/lists/pyme_v3_40.txt create mode 100644 eval/lists/lists/qwen35_27b.txt create mode 100644 eval/lists/lists/qwen35_9b.txt create mode 100644 eval/lists/lists/richard_base_model.txt create mode 100644 eval/lists/lists/richard_test_model.txt create mode 100644 eval/lists/lists/rope_step_batch.txt create mode 100644 eval/lists/lists/sera_14b.txt create mode 100644 eval/lists/lists/swesmith_fixthink_45.txt create mode 100644 eval/lists/lists/syh_32b.txt create mode 100644 eval/lists/lists/tb2_richard_test_model.txt create mode 100644 eval/lists/lists/v2_richard_test_model.txt create mode 100644 eval/lists/missing_dev_set_v2.txt create mode 100644 eval/lists/missing_dev_set_v2_inactive_laion.txt create mode 100644 eval/lists/missing_swebench_verified_random_100_folders.txt create mode 100644 eval/lists/missing_terminal_bench_2.txt create mode 100644 eval/lists/nemotron_nano.txt create 
mode 100644 eval/lists/no_eval_models_latest.txt create mode 100644 eval/lists/pipeline_exp_prio.txt create mode 100644 eval/lists/priority_131k_test.txt create mode 100644 eval/lists/priority_batch2.txt create mode 100644 eval/lists/priority_batch_evalorg.txt create mode 100644 eval/lists/priority_obiwan.txt create mode 100644 eval/lists/priority_qwen35.txt create mode 100644 eval/lists/priority_rl_test.txt create mode 100644 eval/lists/pruned_models_names.txt create mode 100644 eval/lists/pyme_v3_40.txt create mode 100644 eval/lists/qwen35_27b.txt create mode 100644 eval/lists/qwen35_9b.txt create mode 100644 eval/lists/richard_base_model.txt create mode 100644 eval/lists/richard_test_model.txt create mode 100644 eval/lists/rope_step_batch.txt create mode 100644 eval/lists/sera_14b.txt create mode 100644 eval/lists/swesmith_fixthink_45.txt create mode 100644 eval/lists/syh_32b.txt create mode 100644 eval/lists/tb2_richard_test_model.txt create mode 100644 eval/lists/v2_richard_test_model.txt create mode 100644 eval/precreate_snapshots.py create mode 100644 eval/secret.env.template create mode 100644 eval/snapshot_download.py create mode 100644 eval/test_dp_eval.sh create mode 100644 eval/unified_eval_harbor.sbatch create mode 100644 eval/unified_eval_harbor_dp.sbatch create mode 100644 eval/unified_eval_listener.py create mode 100644 hpc/dotenv/m2.env create mode 100644 hpc/dotenv/mbz.env diff --git a/.gitignore b/.gitignore index 228383a3..78dcdd8c 100644 --- a/.gitignore +++ b/.gitignore @@ -410,3 +410,17 @@ train/hpc/dotenv/secret.env # Claude CLAUDE.md development_progress.md + +# Eval local runtime data (logs, tracking, job output — per-cluster) +eval/local/ +eval/MBZ/logs/ +eval/MBZ/tracking/ +eval/MBZ/__pycache__/ +eval/jupiter/logs/ +eval/*/logs/ +jobs/ + +# Secrets (never commit) +secret.env +secrets.env +*.env.local \ No newline at end of file diff --git a/database/unified_db/utils.py b/database/unified_db/utils.py index 32397611..3d81a4ce 100644 --- 
a/database/unified_db/utils.py +++ b/database/unified_db/utils.py @@ -2015,7 +2015,8 @@ def register_sandbox_job( stats: Optional[Dict[str, Any]] = None, forced_update: bool = True, hf_traces_link: Optional[str] = None, - job_status: Optional[str] = None + job_status: Optional[str] = None, + is_overlong: bool = False, ) -> Dict[str, Any]: """ Register a sandbox job with minimal auto-filling. @@ -2055,7 +2056,8 @@ def register_sandbox_job( "benchmark_id": benchmark_id, "n_rep_eval": n_rep_eval, "hf_traces_link": hf_traces_link, - "job_status": job_status + "job_status": job_status, + "is_overlong": is_overlong, } # Include job_id if provided (preserves local ID from result.json) @@ -3447,6 +3449,7 @@ def upload_job_and_trial_records( register_benchmark: bool = False, hf_dataset_url: Optional[str] = None, forced_update: bool = False, + is_overlong: bool = False, ) -> Dict[str, Any]: """ Upload job and trial records to database (with optional HF dataset URL for trials). @@ -3858,7 +3861,7 @@ def upload_job_and_trial_records( job_metadata["hf_traces_link"] = hf_dataset_url job_metadata["job_status"] = "Finished" - job_record = register_sandbox_job(**job_metadata, forced_update=forced_update) + job_record = register_sandbox_job(**job_metadata, forced_update=forced_update, is_overlong=is_overlong) if not job_record.get("success"): raise Exception(f"Job registration failed: {job_record.get('error')}") @@ -4141,6 +4144,7 @@ def upload_traces_to_hf( verbose=verbose, success_filter=success_filter, include_verifier_output=include_verifier_output, + export_subagents=export_subagents, ) logger.info(f"Extracted {len(dataset)} conversation rows from trials") except Exception as e: @@ -4301,6 +4305,7 @@ def upload_eval_results( hf_verbose: bool = False, hf_export_subagents: bool = False, forced_update: bool = False, + is_overlong: bool = False, ) -> Dict[str, Any]: """ Upload evaluation results from a job directory to HuggingFace and database. 
@@ -4434,6 +4439,7 @@ def upload_eval_results( register_benchmark=register_benchmark, hf_dataset_url=hf_dataset_url, # Will be None if HF upload failed forced_update=forced_update, + is_overlong=is_overlong, ) # Add HF-related information to result diff --git a/eval/baseline_model_configs.yaml b/eval/baseline_model_configs.yaml index 1302c370..c189b3a2 100644 --- a/eval/baseline_model_configs.yaml +++ b/eval/baseline_model_configs.yaml @@ -9,56 +9,89 @@ # The eval script will default to the node's GPU count (e.g. TP=4 on Polaris A100-40GB). # Only set it here if a model requires a specific TP size. # -# Pattern configs (under "patterns") are matched when no exact model name is found. -# Each pattern entry has a "match" regex and the same config fields as a model entry. +# Format: +# models: Per-model overrides (exact HF name -> config). +# groups: Apply one config to many models. Each entry has "models" (list) + config fields. +# Per-model entries in "models:" are merged on top of group config (override wins). +# patterns: Regex fallback when no exact/group match. First match wins. 
-models: - "NovaSky-AI/SA-SWE-32B": +# --- Groups: many models sharing the same config --- +groups: + - models: + - "nvidia/Nemotron-Terminal-32B" + - "DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-32B" + - "NovaSky-AI/SA-SWE-32B" + - "Qwen/Qwen2.5-Coder-32B-Instruct" + - "Qwen/Qwen3-32B" + - "R2E-Gym/R2EGym-32B-Agent" + - "SWE-Swiss/SWE-Swiss-32B" + - "SWE-bench/SWE-agent-LM-32B" + - "Skywork/Skywork-SWE-32B" + - "allenai/SERA-32B" + - "laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_Qwen3-32B" + - "laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_adam-beta1_0-91_Qwen3-32B" + - "laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_adam-beta1_0-93_Qwen3-32B" + - "laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_adam-beta1_0-95_Qwen3-32B" + - "laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_global-batch-size_32_Qwen3-32B" + - "laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_global-batch-size_64_Qwen3-32B" + - "laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_learning-rate_1e-06_Qwen3-32B" + - "laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_lr_1e-5_Qwen3-32B" + - "laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_num-train-epochs_4.0_Qwen3-32B" + - "laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_num-train-epochs_6.0_Qwen3-32B" + - "laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_num-train-epochs_7.0_Qwen3-32B" + - "laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_num-train-epochs_8-0_Qwen3-32B" + - "laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_warmup-ratio_0-01_Qwen3-32B" + - "laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_warmup-ratio_0-05_Qwen3-32B" + - "laion/Qwen3-32B-NL2Bash-31step" + - "laion/Qwen3-32B-R2EGYM-256-3epochs" + - "laion/Qwen3-32B-SweSmith-20step" + - 
"laion/open-thoughts-4-code-qwen3-32b-annotated" + - "laion/rl__40GPU_base_32b__exp_rpt_codeelo-v2__sft_GLM-4-7-swesmith" + - "laion/rl__40GPU_base_32b__exp_rpt_nemotron-bash__sft_GLM-4-7-swesmith" + - "laion/sft_GLM-4-7-swesmith-sandboxes-with_tests-oracle_verified_120s-maxeps-131k_Qwen3-32B" + - "laion/sft_r2egym-nl2bash-stackoverflow-inferredbugs-32B_Qwen3-32B" + - "laion/syh-r2eg-askl-glm_4-7_trac_jupi_-gfi-swes-rand-filt-10K_glm_4-7_trac_jupi_32B" + - "laion/rl__48GPU_shaped_32b__swe_rebench_patched_oracle__Qwen3-32B-45" + tensor_parallel_size: 4 max_model_len: 32768 swap_space: 32 trust_remote_code: true tool_call_parser: hermes - reasoning_parser: qwen3 extra_args: "--dtype bfloat16 --block-size 16 --enable-chunked-prefill --max-num-partial-prefills 1 --enable-prefix-caching" - "Qwen/Qwen2.5-Coder-32B-Instruct": - max_model_len: 32768 - swap_space: 32 - trust_remote_code: true - tool_call_parser: hermes - extra_args: "--dtype bfloat16 --block-size 16 --enable-chunked-prefill --max-num-partial-prefills 1 --enable-prefix-caching" +# --- Per-model overrides (merged on top of group config) --- +models: + # Adds reasoning_parser on top of the group defaults + "NovaSky-AI/SA-SWE-32B": + reasoning_parser: qwen3 - "R2E-Gym/R2EGym-32B-Agent": - max_model_len: 32768 - swap_space: 32 - trust_remote_code: true - tool_call_parser: hermes - extra_args: "--dtype bfloat16 --block-size 16 --enable-chunked-prefill --max-num-partial-prefills 1 --enable-prefix-caching" + # Override: lower swap_space (group default is 32) + "allenai/SERA-32B": + swap_space: 12 - "SWE-Swiss/SWE-Swiss-32B": - max_model_len: 32768 - swap_space: 32 +# Pattern-based configs: matched by regex when no exact/group match is found. +# Checked in order; first match wins. 
+patterns: + - match: "(?i)qwen3\\.5" + conda_env: otagent2 trust_remote_code: true - tool_call_parser: hermes - extra_args: "--dtype bfloat16 --block-size 16 --enable-chunked-prefill --max-num-partial-prefills 1 --enable-prefix-caching" - - "Skywork/Skywork-SWE-32B": + tensor_parallel_size: 4 max_model_len: 32768 swap_space: 32 - trust_remote_code: true - tool_call_parser: hermes - extra_args: "--dtype bfloat16 --block-size 16 --enable-chunked-prefill --max-num-partial-prefills 1 --enable-prefix-caching" + extra_args: "--enforce-eager" - "allenai/SERA-32B": - max_model_len: 32768 + - match: "(?i)(?:32b.*(131k|-lc)|(131k|-lc).*32b)" + tensor_parallel_size: 4 + max_model_len: 131072 swap_space: 12 - tool_call_parser: hermes - extra_args: "--dtype bfloat16 --block-size 16 --enable-chunked-prefill --max-num-partial-prefills 1 --enable-prefix-caching" + hf_overrides: '{"rope_scaling":{"rope_type":"yarn","factor":4.0,"original_max_position_embeddings":32768}}' + + - match: "(?i)32[Bb]" + tensor_parallel_size: 4 + max_model_len: 32768 + swap_space: 32 -# Pattern-based configs: matched by regex when no exact model name is found. -# Checked in order; first match wins. 
-patterns: - match: "131k|-lc$" max_model_len: 131072 swap_space: 12 - extra_args: "--hf-overrides '{\"rope_scaling\":{\"rope_type\":\"yarn\",\"factor\":4.0,\"original_max_position_embeddings\":32768}}'" + hf_overrides: '{"rope_scaling":{"rope_type":"yarn","factor":4.0,"original_max_position_embeddings":32768}}' diff --git a/eval/build_vllm_cmd.sh b/eval/build_vllm_cmd.sh index d027ff2d..a731eaee 100644 --- a/eval/build_vllm_cmd.sh +++ b/eval/build_vllm_cmd.sh @@ -21,6 +21,7 @@ # EVAL_VLLM_REASONING_PARSER (default: unset) # EVAL_VLLM_DATA_PARALLEL_SIZE (default: unset; vLLM v0.8+ only) # EVAL_VLLM_EXTRA_ARGS (default: unset; space-separated string) +# EVAL_VLLM_HF_OVERRIDES (default: unset; JSON string for --hf-overrides) # ============================================================================== build_vllm_cmd() { @@ -31,18 +32,19 @@ build_vllm_cmd() { # Read overrides from env (set by listener via sbatch --export) local tp="${EVAL_VLLM_TENSOR_PARALLEL_SIZE:-4}" local dp="${EVAL_VLLM_DATA_PARALLEL_SIZE:-}" - local max_model_len="${EVAL_VLLM_MAX_MODEL_LEN:-}" + local max_model_len="${EVAL_VLLM_MAX_MODEL_LEN:-32768}" local swap_space="${EVAL_VLLM_SWAP_SPACE:-4}" local trust_remote_code="${EVAL_VLLM_TRUST_REMOTE_CODE:-}" local tool_call_parser="${EVAL_VLLM_TOOL_CALL_PARSER:-}" local reasoning_parser="${EVAL_VLLM_REASONING_PARSER:-}" local extra_args="${EVAL_VLLM_EXTRA_ARGS:-}" + local hf_overrides="${EVAL_VLLM_HF_OVERRIDES:-}" # Build command array VLLM_CMD=( "$python_bin" -m vllm.entrypoints.openai.api_server --model "$model" - --host 0.0.0.0 --port 8000 + --host 0.0.0.0 --port "${VLLM_PORT:-8000}" --served-model-name "$model" --tensor-parallel-size "$tp" --gpu-memory-utilization "$gpu_mem_util" @@ -70,6 +72,11 @@ build_vllm_cmd() { VLLM_CMD+=(--reasoning-parser "$reasoning_parser") fi + # HF model config overrides (JSON string, properly quoted) + if [ -n "$hf_overrides" ]; then + VLLM_CMD+=(--hf-overrides "$hf_overrides") + fi + # Append extra args 
(space-separated string) if [ -n "$extra_args" ]; then # shellcheck disable=SC2206 diff --git a/eval/check_progress.py b/eval/check_progress.py new file mode 100644 index 00000000..61abb0a6 --- /dev/null +++ b/eval/check_progress.py @@ -0,0 +1,627 @@ +#!/usr/bin/env python3 +"""Check progress of all running eval jobs on Jupiter. + +Usage: + python check_progress.py # grouped text output (default) + python check_progress.py --live # rich live dashboard + python check_progress.py --live -i 3 # live with 3s refresh + python check_progress.py --sort elapsed + python check_progress.py --jobs-dir /path/to/other/jobs # override jobs dir +""" + +import argparse +import subprocess +import json +import os +import sys +import time +from collections import defaultdict +from datetime import datetime +from pathlib import Path + +REPO_DIR = Path(__file__).resolve().parent.parent.parent +LOGS_DIR = REPO_DIR / "eval" / "MBZ" / "logs" +DEFAULT_JOBS_DIR = REPO_DIR / "jobs" + + +# --------------------------------------------------------------------------- +# Data collection helpers (unchanged logic, refactored for reuse) +# --------------------------------------------------------------------------- + +def get_running_jobs(): + result = subprocess.run( + ["squeue", "-u", os.environ["USER"], "--format=%.10i %.15j %.8T %.20S", "--noheader"], + capture_output=True, text=True, + ) + jobs = [] + for line in result.stdout.strip().split("\n"): + parts = line.split() + if len(parts) >= 4: + jobs.append((parts[0].strip(), parts[1].strip(), parts[2].strip(), parts[3].strip())) + return jobs + + +def parse_eval_log(jid, job_name): + """Try multiple log naming patterns. 
SLURM %x is captured at submit time, + so renamed jobs (eval_dp -> eval_dp_v2) need fallback to original name.""" + candidates = [ + LOGS_DIR / f"{job_name}_{jid}.out", + LOGS_DIR / f"eval_{jid}.out", + ] + if job_name.startswith("eval_dp_"): + candidates.insert(1, LOGS_DIR / f"eval_dp_{jid}.out") + elif job_name.startswith("eval_"): + candidates.insert(1, LOGS_DIR / f"eval_{jid}.out") + if job_name.startswith("res_dp_"): + candidates.insert(1, LOGS_DIR / f"res_dp_{jid}.out") + elif job_name.startswith("res_"): + candidates.insert(1, LOGS_DIR / f"res_{jid}.out") + model = bench = run_tag = None + num_shards = 0 + for log in candidates: + if log.exists(): + with open(log) as f: + for line in f: + if line.startswith("Model: "): + model = line.strip()[7:] + elif line.startswith("Dataset: "): + bench = line.strip()[9:] + elif line.startswith("Run tag: "): + run_tag = line.strip()[9:] + elif "total shards)" in line: + try: + num_shards = int(line.split("total shards")[0].split(",")[-1].strip()) + except (ValueError, IndexError): + pass + break + return model, bench, run_tag, num_shards + + +VALID_ERROR_TYPES = { + "AgentTimeoutError", "ContextLengthExceededError", + "SummarizationTimeout", "SummarizationTimeoutError", "BadRequestError", +} + + +def get_progress_single(run_tag, jobs_dir): + """Get progress from a single (non-DP) job dir.""" + if not run_tag: + return None, None, None, None, None, None + job_dir = jobs_dir / run_tag + rf = job_dir / "result.json" + if not rf.exists(): + return None, None, None, None, None, None + try: + d = json.load(open(rf)) + completed = d.get("stats", {}).get("n_trials", None) + total = d.get("n_total_trials", None) + finished = d.get("finished_at") is not None + invalid_trials = set() + evals = d.get("stats", {}).get("evals", {}) + # Extract accuracy from metrics + accuracy = None + for eval_data in evals.values(): + for error_type, trial_names in eval_data.get("exception_stats", {}).items(): + if error_type not in VALID_ERROR_TYPES 
and isinstance(trial_names, list): + invalid_trials.update(trial_names) + metrics = eval_data.get("metrics", []) + if metrics and isinstance(metrics, list): + mr = metrics[0].get("mean_reward") + if mr is not None: + accuracy = mr + # Use reward_stats to count completed trials (avoids expensive dir scan) + n_on_disk = None + for eval_data in evals.values(): + rs = eval_data.get("reward_stats", {}).get("reward", {}) + if rs: + n_on_disk = sum(len(v) for v in rs.values() if isinstance(v, list)) + break + if n_on_disk is None: + # Fallback: count from stats + n_on_disk = completed + return completed, total, len(invalid_trials), finished, n_on_disk, accuracy + except Exception: + return None, None, None, None, None, None + + +def get_progress(run_tag, num_shards, jobs_dir): + """Get progress, aggregating across shards for DP jobs.""" + if not run_tag: + return None, None, None, None, None, None + if num_shards > 1: + total_completed = 0 + total_total = 0 + total_errors = 0 + total_on_disk = 0 + all_finished = True + found_any = False + all_accuracies = [] + for shard_idx in range(num_shards): + shard_tag = f"{run_tag}_shard{shard_idx}" + c, t, e, fin, od, acc = get_progress_single(shard_tag, jobs_dir) + if c is not None: + found_any = True + total_completed += c + total_total += (t or 0) + total_errors += (e or 0) + total_on_disk += (od or 0) + if acc is not None: + all_accuracies.append(acc) + if not fin: + all_finished = False + if found_any: + avg_acc = (sum(all_accuracies) / len(all_accuracies)) if all_accuracies else None + return total_completed, total_total, total_errors, all_finished, total_on_disk, avg_acc + return None, None, None, None, None, None + else: + return get_progress_single(run_tag, jobs_dir) + + +def format_elapsed(start_str): + try: + if start_str == "N/A": + return "-", 0 + st = datetime.fromisoformat(start_str.replace("T", " ")) + delta = datetime.now() - st + secs = int(delta.total_seconds()) + hours, rem = divmod(secs, 3600) + mins, _ = 
divmod(rem, 60) + return f"{hours}h{mins:02d}m", secs + except Exception: + return "?", 0 + + +# --------------------------------------------------------------------------- +# Unified data collection +# --------------------------------------------------------------------------- + +def collect_job_data(jobs_dir): + """Collect all job data into structured dicts. + + Returns (running_data: list[dict], pending_jobs: list[tuple]). + """ + all_jobs = get_running_jobs() + if not all_jobs: + return [], [] + + running_raw = [(jid, name, state, start) for jid, name, state, start in all_jobs if state == "RUNNING"] + pending = [(jid, name, state, start) for jid, name, state, start in all_jobs if state == "PENDING"] + + running_data = [] + for jid, name, state, start_time in running_raw: + model, bench, run_tag, num_shards = parse_eval_log(jid, name) + completed, total, invalid_errors, finished, n_on_disk, accuracy = get_progress(run_tag, num_shards, jobs_dir) + + if finished: + status = "done" + elif completed is not None and total and completed >= total: + status = "retry" + else: + status = "active" + + elapsed, elapsed_secs = format_elapsed(start_time) + m_short = model.split("/")[-1] if model and "/" in model else (model or "?") + b_short = bench.split("/")[-1] if bench and "/" in bench else (bench or "?") + + is_resume = name.startswith("res_") + tags = [] + if is_resume: + tags.append("RES") + if num_shards > 1: + tags.append(f"{num_shards}x DP") + + # Progress percentage (based on disk for ground truth) + if n_on_disk is not None and total and total > 0: + progress_pct = n_on_disk / total + elif completed is not None and total and total > 0: + progress_pct = completed / total + else: + progress_pct = 0.0 + + running_data.append({ + "jid": jid, + "job_name": name, + "model": m_short, + "model_full": model or "?", + "bench": b_short, + "bench_full": bench or "?", + "run_tag": run_tag, + "num_shards": num_shards, + "completed": completed, + "total": total, + "n_on_disk": 
n_on_disk, + "n_invalid_errors": invalid_errors, + "accuracy": accuracy, + "finished": finished, + "elapsed": elapsed, + "elapsed_secs": elapsed_secs, + "status": status, + "tags": tags, + "progress_pct": progress_pct, + "is_resume": is_resume, + }) + + return running_data, pending + + +def group_by_benchmark(jobs, sort_key="progress"): + """Group jobs by benchmark, sort groups alphabetically, sort jobs within groups. + + sort_key: 'progress' (default), 'elapsed', 'model', 'errors' + """ + groups = defaultdict(list) + for job in jobs: + groups[job["bench"]].append(job) + + # Sort within each group + for bench in groups: + if sort_key == "elapsed": + groups[bench].sort(key=lambda j: j["elapsed_secs"], reverse=True) + elif sort_key == "model": + groups[bench].sort(key=lambda j: j["model"].lower()) + elif sort_key == "errors": + groups[bench].sort(key=lambda j: (j["n_invalid_errors"] or 0), reverse=True) + else: # progress (default) + groups[bench].sort(key=lambda j: j["progress_pct"], reverse=True) + + # Return as OrderedDict-like sorted by benchmark name + return dict(sorted(groups.items())) + + +def get_pending_reasons(): + """Get pending job reasons from squeue.""" + result = subprocess.run( + ["squeue", "-u", os.environ["USER"], "--format=%.10i %.8T %R", "--noheader"], + capture_output=True, text=True, + ) + reasons = {} + for line in result.stdout.strip().split("\n"): + parts = line.split(None, 2) + if len(parts) >= 3 and parts[1].strip() == "PENDING": + reasons[parts[0].strip()] = parts[2].strip() + return reasons + + +# --------------------------------------------------------------------------- +# Default text mode (enhanced with grouping) +# --------------------------------------------------------------------------- + +def print_default(running_data, pending, sort_key="progress"): + if not running_data and not pending: + print("No jobs in queue.") + return + + if running_data: + groups = group_by_benchmark(running_data, sort_key) + + # Count statuses + 
status_counts = defaultdict(int) + for j in running_data: + status_counts[j["status"]] += 1 + + print(f"\nRUNNING ({len(running_data)}):") + + header = f" {'JID':>8s} {'Progress':>10s} {'On Disk':>8s} {'Acc':>6s} {'Errors':>6s} {'Status':>6s} {'Elapsed':>8s} Model" + sep = " " + "-" * (len(header) - 2) + + for bench_name, jobs in groups.items(): + # Per-benchmark summary + n_active = sum(1 for j in jobs if j["status"] == "active") + n_retry = sum(1 for j in jobs if j["status"] == "retry") + n_done = sum(1 for j in jobs if j["status"] == "done") + parts = [] + if n_active: + parts.append(f"{n_active} active") + if n_retry: + parts.append(f"{n_retry} retry") + if n_done: + parts.append(f"{n_done} done") + status_str = ", ".join(parts) if parts else "0 jobs" + + print(f"\n === {bench_name} ({len(jobs)} jobs: {status_str}) ===") + print(header) + print(sep) + + for j in jobs: + progress = f"{j['completed']}/{j['total']}" if j["completed"] is not None else "-" + on_disk = f"{j['n_on_disk']}/{j['total']}" if j["n_on_disk"] is not None and j["total"] else "-" + acc = f"{j['accuracy']:.1%}" if j["accuracy"] is not None else "-" + errors = str(j["n_invalid_errors"]) if j["n_invalid_errors"] is not None else "-" + tag_str = f" [{', '.join(j['tags'])}]" if j["tags"] else "" + print(f" {j['jid']:>8s} {progress:>10s} {on_disk:>8s} {acc:>6s} {errors:>6s} {j['status']:>6s} {j['elapsed']:>8s} {j['model']}{tag_str}") + + if pending: + reasons = get_pending_reasons() + print(f"\n PENDING ({len(pending)}):") + print(f" {'JID':>8s} Reason") + print(" " + "-" * 38) + for jid, name, state, start in pending: + reason = reasons.get(jid, "?") + print(f" {jid:>8s} {reason}") + + # Summary line + status_counts = defaultdict(int) + for j in running_data: + status_counts[j["status"]] += 1 + parts = [] + for s in ["active", "retry", "done"]: + if status_counts[s]: + parts.append(f"{status_counts[s]} {s}") + status_detail = f" ({', '.join(parts)})" if parts else "" + print(f"\n Total: 
{len(running_data)} running{status_detail}, {len(pending)} pending\n") + + +# --------------------------------------------------------------------------- +# Rich Live dashboard +# --------------------------------------------------------------------------- + +def run_live_dashboard(jobs_dir, interval, sort_key="progress", compact=False, + benchmark_filter=None, page_mode=False): + from rich.console import Console, Group + from rich.table import Table + from rich.live import Live + from rich.text import Text + from rich.panel import Panel + from rich.progress_bar import ProgressBar + + console = Console() + page_idx = [0] # mutable for closure + + STATUS_STYLES = { + "active": "green", + "active_res": "purple", + "retry": "yellow", + "done": "dim", + } + + def _job_style(j): + """Get style key for a job: distinguish resume (cyan) from fresh (green) for active jobs.""" + if j["status"] == "active" and j["is_resume"]: + return "active_res" + return j["status"] + + def make_progress_bar(pct, style_key): + """Create a colored progress bar.""" + color = STATUS_STYLES.get(style_key, "white") + bar = ProgressBar(total=100, completed=int(pct * 100), width=10, + complete_style=color, finished_style=color) + return bar + + def render(): + running_data, pending = collect_job_data(jobs_dir) + now = datetime.now().strftime("%H:%M:%S") + + renderables = [] + + # Header + n_fresh = sum(1 for j in running_data if j["status"] == "active" and not j["is_resume"]) + n_resume = sum(1 for j in running_data if j["status"] == "active" and j["is_resume"]) + n_retry = sum(1 for j in running_data if j["status"] == "retry") + n_done = sum(1 for j in running_data if j["status"] == "done") + header_parts = [ + f"[bold]EVAL DASHBOARD[/bold]", + f"[green]{n_fresh} fresh[/green]" if n_fresh else None, + f"[purple]{n_resume} resume[/purple]" if n_resume else None, + f"[yellow]{n_retry} retry[/yellow]" if n_retry else None, + f"[dim]{n_done} done[/dim]" if n_done else None, + f"{len(pending)} pending" 
if pending else None, + f"[dim]Updated {now}[/dim]", + ] + header_text = " | ".join(p for p in header_parts if p) + renderables.append(Text.from_markup(header_text)) + renderables.append(Text("")) + + if not running_data and not pending: + renderables.append(Text("No jobs in queue.", style="dim")) + return Group(*renderables) + + # Group by benchmark + groups = group_by_benchmark(running_data, sort_key) + + # Apply benchmark filter + if benchmark_filter: + groups = {k: v for k, v in groups.items() + if benchmark_filter.lower() in k.lower()} + + # Page mode: show one benchmark at a time, rotating each refresh + if page_mode and groups: + bench_names = list(groups.keys()) + idx = page_idx[0] % len(bench_names) + selected = bench_names[idx] + groups = {selected: groups[selected]} + page_idx[0] += 1 + renderables.append(Text.from_markup( + f"[dim]Page {idx + 1}/{len(bench_names)}[/dim]")) + renderables.append(Text("")) + + for bench_name, jobs in groups.items(): + # Per-benchmark stats + b_fresh = sum(1 for j in jobs if j["status"] == "active" and not j["is_resume"]) + b_resume = sum(1 for j in jobs if j["status"] == "active" and j["is_resume"]) + b_retry = sum(1 for j in jobs if j["status"] == "retry") + b_done = sum(1 for j in jobs if j["status"] == "done") + b_errors = sum(j["n_invalid_errors"] or 0 for j in jobs) + avg_pct = sum(j["progress_pct"] for j in jobs) / len(jobs) if jobs else 0 + + status_parts = [] + if b_fresh: + status_parts.append(f"[green]{b_fresh} fresh[/green]") + if b_resume: + status_parts.append(f"[purple]{b_resume} resume[/purple]") + if b_retry: + status_parts.append(f"[yellow]{b_retry} retry[/yellow]") + if b_done: + status_parts.append(f"[dim]{b_done} done[/dim]") + error_str = f" [red]{b_errors} err[/red]" if b_errors > 10 else (f" {b_errors} err" if b_errors else "") + + title = (f"[bold cyan]{bench_name}[/bold cyan] " + f"{len(jobs)} jobs | {' | '.join(status_parts)}{error_str} | " + f"avg {avg_pct:.0%}") + + # In compact mode, only show 
active jobs in the table + display_jobs = [j for j in jobs if j["status"] == "active"] if compact else jobs + n_hidden = len(jobs) - len(display_jobs) + + table = Table( + show_header=True, header_style="bold", box=None, + padding=(0, 1), expand=True, + ) + table.add_column("JID", style="dim", width=7, justify="right") + table.add_column("Progress", width=8, justify="right") + if not compact: + table.add_column("Bar", width=12) + table.add_column("Disk", width=8, justify="right") + table.add_column("Acc", width=6, justify="right") + table.add_column("Err", width=4, justify="right") + table.add_column("Status", width=6, justify="right") + table.add_column("Elapsed", width=6, justify="right") + table.add_column("Model", no_wrap=False, ratio=1) + + for j in display_jobs: + skey = _job_style(j) + style = STATUS_STYLES.get(skey, "") + progress = f"{j['completed']}/{j['total']}" if j["completed"] is not None else "-" + on_disk = f"{j['n_on_disk']}/{j['total']}" if j["n_on_disk"] is not None and j["total"] else "-" + acc = f"{j['accuracy']:.1%}" if j["accuracy"] is not None else "-" + errors = str(j["n_invalid_errors"]) if j["n_invalid_errors"] is not None else "-" + error_style = "red bold" if (j["n_invalid_errors"] or 0) > 10 else "" + tag_str = f" [{', '.join(j['tags'])}]" if j["tags"] else "" + model_display = j["model_full"] + + row = [ + j["jid"], + Text(progress, style=style), + ] + if not compact: + row.append(make_progress_bar(j["progress_pct"], skey)) + row.extend([ + Text(on_disk, style=style), + Text(acc, style="cyan" if j["accuracy"] is not None else style), + Text(errors, style=error_style or style), + Text("resume" if skey == "active_res" else j["status"], style=style), + Text(j["elapsed"], style=style), + Text(f"{model_display}{tag_str}", style=style), + ]) + table.add_row(*row) + + renderables.append(Text.from_markup(title)) + if display_jobs: + renderables.append(table) + if n_hidden: + renderables.append(Text.from_markup( + f" [dim]... 
{n_hidden} done/retry job(s) hidden (use without --compact to show)[/dim]")) + renderables.append(Text("")) + + # Pending section + if pending: + reasons = get_pending_reasons() + pending_lines = [] + for jid, name, state, start in pending: + reason = reasons.get(jid, "?") + pending_lines.append(f" {jid} {reason}") + pending_text = "\n".join(pending_lines[:10]) + if len(pending) > 10: + pending_text += f"\n ... and {len(pending) - 10} more" + renderables.append(Panel( + pending_text, + title=f"[dim]PENDING ({len(pending)})[/dim]", + border_style="dim", + expand=False, + )) + + # Footer + tags = [] + if compact: + tags.append("compact") + if page_mode: + tags.append("paging") + if benchmark_filter: + tags.append(f"filter: {benchmark_filter}") + extra = f" | {', '.join(tags)}" if tags else "" + renderables.append(Text.from_markup( + f"[dim]Ctrl+C to exit | Refreshing every {interval}s | " + f"Sort: {sort_key}{extra}[/dim]" + )) + + return Group(*renderables) + + try: + with Live(render(), console=console, refresh_per_second=1, + vertical_overflow="ellipsis") as live: + while True: + time.sleep(interval) + live.update(render()) + except KeyboardInterrupt: + console.print("\n[dim]Dashboard stopped.[/dim]") + + +# --------------------------------------------------------------------------- +# CLI +# --------------------------------------------------------------------------- + +def parse_args(): + parser = argparse.ArgumentParser( + description="Check progress of running eval jobs on Jupiter.", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog="""\ +examples: + %(prog)s # grouped text output + %(prog)s --live # rich live dashboard + %(prog)s --live -i 3 # 3s refresh interval + %(prog)s --sort elapsed # sort by elapsed time + %(prog)s --sort errors # sort by error count + %(prog)s --live --compact # live, hide done/retry + no bar (fits more) + %(prog)s --live -b tb2 # live, only terminal_bench_2 + %(prog)s --live --page # live, rotate one benchmark per 
refresh +""", + ) + parser.add_argument( + "--live", "-w", action="store_true", + help="Launch Rich live dashboard (auto-refreshing)", + ) + parser.add_argument( + "--interval", "-i", type=int, default=5, + help="Refresh interval in seconds for --live mode (default: 5, min: 3)", + ) + parser.add_argument( + "--sort", "-s", choices=["progress", "elapsed", "model", "errors"], + default="progress", + help="Sort order within benchmark groups (default: progress)", + ) + parser.add_argument( + "--compact", "-c", action="store_true", + help="In --live mode, hide done/retry jobs and progress bar (fits more rows)", + ) + parser.add_argument( + "--benchmark", "-b", type=str, default=None, + help="Filter to benchmarks matching this substring (e.g. 'tb2', 'dev_set')", + ) + parser.add_argument( + "--page", "-p", action="store_true", + help="In --live mode, show one benchmark per page, rotating each refresh", + ) + parser.add_argument( + "--jobs-dir", type=Path, default=DEFAULT_JOBS_DIR, + help=f"Path to eval jobs directory (default: {DEFAULT_JOBS_DIR})", + ) + return parser.parse_args() + + +def main(): + args = parse_args() + + if args.live: + interval = max(3, args.interval) + run_live_dashboard(args.jobs_dir, interval, sort_key=args.sort, + compact=args.compact, benchmark_filter=args.benchmark, + page_mode=args.page) + else: + running_data, pending = collect_job_data(args.jobs_dir) + if args.benchmark: + running_data = [j for j in running_data + if args.benchmark.lower() in j["bench"].lower()] + print_default(running_data, pending, sort_key=args.sort) + + +if __name__ == "__main__": + main() diff --git a/eval/clusters/jupiter.yaml b/eval/clusters/jupiter.yaml new file mode 100644 index 00000000..c0a317d8 --- /dev/null +++ b/eval/clusters/jupiter.yaml @@ -0,0 +1,41 @@ +# Cluster configuration for Jupiter (JSC GH200 ARM nodes) +# Used by unified_eval_listener_v6.py --cluster-config eval/clusters/jupiter.yaml + +cluster_name: jupiter + +# SLURM +slurm_partition: booster 
+slurm_account: "reformo" +slurm_time: "12:00:00" + +# Conda environments: name → prefix directory (passed as OTAGENT_DIR to sbatch) +conda_envs: + otagent: /e/scratch/jureap59/feuer1/miniforge3/envs/otagent + otagent2: /e/scratch/jureap59/zhuang1/conda/envs/otagent2 + +# Paths +paths: + project_root: /e/scratch/jureap59/zhuang1/OpenThoughts-Agent + hf_cache: /e/data1/datasets/playground/ot/hf_hub + eval_jobs_dir: /e/data1/datasets/playground/mmlaion/shared/zhuang1_eval_jobs + eval_logs_dir: eval/local/jupiter/logs + listener_logs_dir: experiments/listener_logs + sbatch_script: eval/unified_eval_harbor.sbatch + dp_sbatch_script: eval/unified_eval_harbor_dp.sbatch + harbor_src: /e/scratch/jureap59/feuer1/harbor/src + datasets_dirs: + - /e/data1/datasets/playground/ot/datasets + - /e/scratch/jureap59/${USER}/datasets + secrets_file: ~/secrets.env + +# Proxy (Jupiter compute nodes have no internet) +proxy: + enabled: true + login_node: jpbl-s01-02 + proxychains_bin: /e/scratch/jureap59/feuer1/proxychains-ng-aarch64/bin/proxychains4 + +# Hardware +hardware: + gpus_per_node: 4 + arch: aarch64 + cuda_home: /e/software/default/stages/2026/software/CUDA/13 diff --git a/eval/clusters/m2.yaml b/eval/clusters/m2.yaml new file mode 100644 index 00000000..6e6d3429 --- /dev/null +++ b/eval/clusters/m2.yaml @@ -0,0 +1,40 @@ +# Cluster configuration for M2 (MBZ H200 cluster, 8×GPU per node) +# Used by: python eval/unified_eval_listener.py --cluster-config eval/clusters/m2.yaml + +cluster_name: m2 + +# SLURM +slurm_partition: main +slurm_account: "" +slurm_time: "24:00:00" + +# Conda environments: name → prefix directory +conda_envs: + otagent: /mnt/weka/home/${USER}/miniconda3/envs/otagent + otagent2: /mnt/weka/home/${USER}/miniconda3/envs/otagent2 + +# Paths (parameterized with $USER) +paths: + project_root: /mnt/weka/home/${USER}/OpenThoughts-Agent + hf_cache: /mnt/weka/home/${USER}/.cache/huggingface/hub + eval_jobs_dir: /mnt/weka/home/${USER}/OpenThoughts-Agent/jobs + 
eval_logs_dir: eval/local/m2/logs + listener_logs_dir: experiments/listener_logs + sbatch_script: eval/unified_eval_harbor.sbatch + dp_sbatch_script: eval/unified_eval_harbor_dp.sbatch + harbor_src: /mnt/weka/home/${USER}/harbor/src + datasets_dirs: + - /mnt/weka/home/${USER}/.cache/huggingface/hub + secrets_file: ~/secrets.env + +# Proxy (MBZ compute nodes have internet) +proxy: + enabled: false + +# Hardware +hardware: + gpus_per_node: 8 + cpus_per_node: 128 + mem_per_node_mb: 1612647 + arch: x86_64 + cuda_home: /usr/local/cuda-12.8 diff --git a/eval/clusters/mbz.yaml b/eval/clusters/mbz.yaml new file mode 100644 index 00000000..7ec7bafa --- /dev/null +++ b/eval/clusters/mbz.yaml @@ -0,0 +1,39 @@ +# Cluster configuration for MBZ/M1 (x86_64 nodes, 8×GPU) +# Used by: python eval/unified_eval_listener.py --cluster-config eval/clusters/mbz.yaml + +cluster_name: mbz + +# SLURM +slurm_partition: main +slurm_account: "" +slurm_time: "24:00:00" + +# Conda environments: name → prefix directory +conda_envs: + otagent: /lustrefs/users/${USER}/miniconda3/envs/otagent + +# Paths (parameterized with $USER where possible) +paths: + project_root: /lustrefs/users/${USER}/OpenThoughts-Agent + hf_cache: /lustrefs/users/${USER}/.cache/huggingface/hub + eval_jobs_dir: /lustrefs/users/${USER}/OpenThoughts-Agent/jobs + eval_logs_dir: eval/local/mbz/logs + listener_logs_dir: experiments/listener_logs + sbatch_script: eval/unified_eval_harbor.sbatch + dp_sbatch_script: eval/unified_eval_harbor_dp.sbatch + harbor_src: /lustrefs/users/${USER}/harbor/src + datasets_dirs: + - /lustrefs/users/${USER}/.cache/huggingface/hub + secrets_file: ~/secrets.env + +# Proxy (MBZ compute nodes have internet) +proxy: + enabled: false + +# Hardware +hardware: + gpus_per_node: 8 + cpus_per_node: 96 + mem_per_node_mb: 1860000 + arch: x86_64 + cuda_home: /usr/local/cuda-12.8 diff --git a/eval/configs/dcagent_eval_config.yaml b/eval/configs/dcagent_eval_config.yaml new file mode 100644 index 00000000..414188fe 
--- /dev/null +++ b/eval/configs/dcagent_eval_config.yaml @@ -0,0 +1,47 @@ +# Harbor eval config (default) +# NOTE: jobs_dir is NOT set here — it's passed via `--jobs-dir` CLI flag +# from the sbatch, which reads EVAL_JOBS_DIR from the cluster config. +n_attempts: 3 +timeout_multiplier: 1.0 +orchestrator: + type: local + n_concurrent_trials: 16 + quiet: false + plain_output: true + retry: + max_retries: 3 + exclude_exceptions: + - AgentTimeoutError + - AgentEnvironmentTimeoutError + - BadRequestError + - VerifierTimeoutError + - SummarizationTimeout + - SummarizationTimeoutError + - ContextLengthExceededError + wait_multiplier: 1.0 + min_wait_sec: 1.0 + max_wait_sec: 60.0 +environment: + type: daytona + force_build: false +agents: + - name: terminus-2 + max_timeout_sec: 7200 + trajectory_config: + raw_content: true + linear_history: true + kwargs: + record_terminal_session: false + enable_episode_logging: false + enable_pane_logging: false + collect_rollout_details: false + collect_engine_metrics: false + metrics_endpoint: https://replace-with-vllm-host/metrics + metrics_timeout_sec: 10 + model_info: + max_input_tokens: 32768 + max_output_tokens: 8192 + input_cost_per_token: 0 + output_cost_per_token: 0 +datasets: + - path: examples/tasks diff --git a/eval/configs/dcagent_eval_config_no_override.yaml b/eval/configs/dcagent_eval_config_no_override.yaml new file mode 100644 index 00000000..b3df7d86 --- /dev/null +++ b/eval/configs/dcagent_eval_config_no_override.yaml @@ -0,0 +1,47 @@ +# Harbor eval config (no-override variant for swebench/tb2) +# NOTE: jobs_dir is NOT set here — it's passed via `--jobs-dir` CLI flag. 
+n_attempts: 3 +timeout_multiplier: 1.0 +orchestrator: + type: local + n_concurrent_trials: 4 + quiet: false + plain_output: true + retry: + max_retries: 3 + exclude_exceptions: + - AgentTimeoutError + - AgentEnvironmentTimeoutError + - BadRequestError + - VerifierTimeoutError + - SummarizationTimeout + - SummarizationTimeoutError + - ContextLengthExceededError + wait_multiplier: 1.0 + min_wait_sec: 1.0 + max_wait_sec: 60.0 +environment: + type: daytona + force_build: true + delete: false +agents: + - name: terminus-2 + max_timeout_sec: 7200 + trajectory_config: + raw_content: true + linear_history: true + kwargs: + record_terminal_session: false + enable_episode_logging: false + enable_pane_logging: false + collect_rollout_details: false + collect_engine_metrics: false + metrics_endpoint: https://replace-with-vllm-host/metrics + metrics_timeout_sec: 10 + model_info: + max_input_tokens: 32768 + max_output_tokens: 8192 + input_cost_per_token: 0 + output_cost_per_token: 0 +datasets: + - path: examples/tasks diff --git a/eval/docs/CLUSTER_ONBOARDING.md b/eval/docs/CLUSTER_ONBOARDING.md new file mode 100644 index 00000000..0ba2af42 --- /dev/null +++ b/eval/docs/CLUSTER_ONBOARDING.md @@ -0,0 +1,344 @@ +# Eval System — New Cluster Onboarding Guide + +This guide walks you through setting up the eval system on a new HPC cluster. After following these steps, you can run evals with a single `git pull` + listener command. 
+ +## Step 0: Gather Cluster Information + +Before creating any config files, collect the following about your cluster: + +### Hardware (run on the cluster) +```bash +# GPU type, count per node, total nodes +sinfo -N --format="%.30N %.6t %.5c %.10G %.10m" | head -5 + +# Partitions +sinfo --format="%P %D %G" --noheader + +# Architecture +uname -m # x86_64 or aarch64 + +# CUDA version +nvidia-smi | head -3 +ls /usr/local/cuda* +``` + +**Record these values:** +| Field | Example | Your cluster | +|-------|---------|-------------| +| GPUs per node | 8 | | +| GPU type | H200 141GB | | +| CPUs per node | 128 | | +| Memory per node (MB) | 1612647 | | +| Architecture | x86_64 | | +| CUDA path | /usr/local/cuda-12.8 | | +| SLURM partition | main | | +| SLURM account | (empty or required) | | +| Max wall time | 24:00:00 | | +| Internet on compute? | yes / no | | + +### Paths (decide these) +| Path | Purpose | Example | +|------|---------|---------| +| Project root | Where OpenThoughts-Agent is cloned | `/home/$USER/OpenThoughts-Agent` | +| Harbor source | Where harbor repo is cloned | `/home/$USER/harbor` | +| HF cache | HuggingFace hub cache for models/datasets | `/home/$USER/.cache/huggingface/hub` | +| Jobs dir | Where harbor writes eval trial output | `$PROJECT_ROOT/jobs` | +| Conda prefix | Conda environment prefix directory | `/home/$USER/miniconda3/envs/otagent` | + +### Network +- **Internet on compute nodes?** If no, you need a proxy setup (proxychains, SSH tunnel). See the Jupiter cluster config for an example. +- **HuggingFace Hub access?** Test with `curl -s https://huggingface.co/api/models | head -c 100` + +--- + +## Step 1: Create the Cluster Config YAML + +Create `eval/clusters/.yaml`. This is the single source of truth for all cluster-specific settings. 
+ +```yaml +# eval/clusters/mycluster.yaml +cluster_name: mycluster + +# SLURM +slurm_partition: gpu +slurm_account: "" # empty = no --account flag; set if required +slurm_time: "24:00:00" # max wall time + +# Conda environments: name → prefix directory +# The listener passes this as OTAGENT_DIR to the sbatch +conda_envs: + otagent: /path/to/miniconda3/envs/otagent + +# Paths — use ${USER} for multi-user support +paths: + project_root: /path/to/${USER}/OpenThoughts-Agent + hf_cache: /path/to/${USER}/.cache/huggingface/hub + eval_jobs_dir: /path/to/${USER}/OpenThoughts-Agent/jobs + eval_logs_dir: eval/local/mycluster/logs # relative to project_root + listener_logs_dir: experiments/listener_logs # relative to project_root + sbatch_script: eval/unified_eval_harbor.sbatch # DON'T CHANGE (cluster-agnostic) + dp_sbatch_script: eval/unified_eval_harbor_dp.sbatch + harbor_src: /path/to/${USER}/harbor/src + datasets_dirs: + - /path/to/${USER}/.cache/huggingface/hub + secrets_file: ~/secrets.env + +# Proxy (set enabled: true for no-internet compute nodes) +proxy: + enabled: false + # Only needed if enabled: true + # login_node: login01 + # proxychains_bin: /path/to/proxychains4 + +# Hardware (critical for job packing and resource requests) +hardware: + gpus_per_node: 8 + cpus_per_node: 128 + mem_per_node_mb: 1612647 # from `sinfo --format="%.10m"` or `free -m` + arch: x86_64 # x86_64 or aarch64 + cuda_home: /usr/local/cuda-12.8 +``` + +**Commit this file** — it contains no secrets, only paths and hardware specs. + +--- + +## Step 2: Create the Cluster Dotenv + +Create `hpc/dotenv/.env`. This is sourced by the sbatch on compute nodes. 
+ +```bash +# hpc/dotenv/mycluster.env +export SCRATCH="/path/to/$USER" +export DCFT="$SCRATCH/OpenThoughts-Agent" +export DC_AGENT="$DCFT" +export DC_AGENT_SECRET_ENV=~/secrets.env +export HF_CACHE_DIR="$SCRATCH/.cache/huggingface" +export HF_HUB_CACHE="$HF_CACHE_DIR/hub" +export HF_HOME="$HF_CACHE_DIR" +export DATASETS_DIR="$HF_HUB_CACHE" +export MODELS_DIR="$HF_HUB_CACHE" +export VLLM_CACHE_ROOT="$SCRATCH/.cache/vllm" +export TRITON_CACHE_DIR="$SCRATCH/.cache/triton" +export FLASHINFER_CACHE_DIR="$SCRATCH/.cache/flashinfer" +export HF_HUB_ENABLE_HF_TRANSFER=1 +export WANDB_PROJECT="OpenThoughts-Agent" +export PYTHONPATH="${DCFT}${PYTHONPATH:+:$PYTHONPATH}" +export DCFT_CONDA="$SCRATCH/miniconda3" +export PYTORCH_CUDA_ALLOW_TF32=1 +export PYTORCH_CUDNN_ALLOW_TF32=1 +``` + +**Commit this file** — it contains no secrets (secrets come from `~/secrets.env`). + +--- + +## Step 3: Set Up Secrets (Local Only — Never Commit) + +Create `~/secrets.env` on the cluster: + +```bash +export DAYTONA_API_KEY="dtn_..." +export DAYTONA_TARGET="" # empty = default region +export HF_TOKEN="hf_..." +export SUPABASE_URL="https://..." +export SUPABASE_ANON_KEY="..." +export SUPABASE_SERVICE_ROLE_KEY="..." +``` + +**Never commit this file.** It's gitignored. + +--- + +## Step 4: Set Up Conda Environment + +```bash +# Create the conda env +conda create -n otagent python=3.12 -y +conda activate otagent + +# Install core dependencies +pip install vllm # or specific version for your GPU arch +pip install hf_transfer # fast HF downloads + +# Install harbor (pin to known-good commit) +cd /path/to/harbor +git checkout 6fdb92e7f5707c2b01214933f1622771784e6f67 +pip install -e . + +# Install the repo itself (for database utils) +cd /path/to/OpenThoughts-Agent +pip install -e . 
+``` + +### Architecture-specific notes + +| GPU arch | vLLM install | Notes | +|----------|-------------|-------| +| x86_64 (H100/H200/A100) | `pip install vllm` | Standard install | +| aarch64 (GH200) | Build from source or use pre-built wheel | See Jupiter setup | + +### Optional: otagent2 env (for newer models) + +Some models (Qwen3.5, GLM-4.7) require newer vLLM (≥0.17). Create a second env: + +```bash +conda create -n otagent2 python=3.12 -y +conda activate otagent2 +pip install vllm>=0.17 hf_transfer +pip install -e /path/to/harbor +``` + +Add it to your cluster YAML: +```yaml +conda_envs: + otagent: /path/to/envs/otagent + otagent2: /path/to/envs/otagent2 +``` + +--- + +## Step 5: Pre-download Datasets + +Do this on the login node (which has internet). This avoids race conditions when multiple sbatch jobs try to download simultaneously. + +```bash +source ~/secrets.env +python eval/snapshot_download.py DCAgent/dev_set_v2 +python eval/snapshot_download.py DCAgent2/terminal_bench_2 +python eval/snapshot_download.py DCAgent2/swebench-verified-random-100-folders +``` + +For no-internet clusters, also pre-download models you plan to eval: +```bash +python -c "from huggingface_hub import snapshot_download; snapshot_download('DCAgent/a1-nl2bash')" +``` + +--- + +## Step 6: Create Log Directories + +```bash +mkdir -p eval/local//logs +mkdir -p experiments/listener_logs +``` + +--- + +## Step 7: Verify Setup (Dry Run) + +```bash +source ~/secrets.env +PYTHONPATH=$PWD python eval/unified_eval_listener.py \ + --cluster-config eval/clusters/.yaml \ + --preset v2 \ + --priority-file eval/lists/a1_nl2bash.txt \ + --require-priority-list \ + --baseline-model-config eval/baseline_model_configs.yaml \ + --timeout-multiplier 2.0 \ + --tp-size 2 \ + --enable-thinking \ + --slurm-time 24:00:00 \ + --dry-run --once --verbose +``` + +Check the output for: +- `[v6] Cluster config: ` — cluster detected +- `Sbatch params: ...` — parameters look correct +- `[DRY RUN] Would submit 
...` — job would be submitted +- No `ERROR:` lines about missing paths + +--- + +## Step 8: Run for Real + +```bash +source ~/secrets.env +PYTHONPATH=$PWD python eval/unified_eval_listener.py \ + --cluster-config eval/clusters/.yaml \ + --preset v2 \ + --priority-file eval/lists/.txt \ + --require-priority-list \ + --baseline-model-config eval/baseline_model_configs.yaml \ + --timeout-multiplier 2.0 \ + --tp-size 2 \ + --enable-thinking \ + --slurm-partition \ + --slurm-time \ + --once +``` + +### Common flag combinations + +```bash +# 8B models: TP=1, DP=4 (fast, 4 replicas) +--tp-size 1 --dp-size 4 --conda-env otagent2 + +# 32B models: TP=4 (one replica across 4 GPUs) +--tp-size 4 + +# Packed jobs (multiple models per node) +--pack-jobs --stagger-delay 1 --chain-batch-size 10 + +# Batch with sliding window (max 32 SLURM jobs active) +--max-jobs-submitted 32 --batch-size 32 +``` + +### Available presets + +| Preset | Dataset | auto_snapshot | Typical use | +|--------|---------|---------------|-------------| +| `v2` | DCAgent/dev_set_v2 | true | Dev eval (100 tasks) | +| `tb2` | DCAgent2/terminal_bench_2 | true | Terminal benchmark | +| `swebench` | DCAgent2/swebench-verified-random-100-folders | false | SWE-bench | +| `aider` | DCAgent2/aider_polyglot | false | Aider benchmark | +| `bfcl` | DCAgent2/bfcl-parity | false | BFCL benchmark | + +--- + +## Monitoring + +```bash +# Check job status +squeue -u $USER --format="%.10i %.40j %.8T %.12M" + +# Tail job output +tail -f eval/local//logs/data_.out + +# Tail vLLM log +tail -f eval/local//logs/vllm_.log + +# Progress dashboard +PYTHONPATH=$PWD python eval/check_progress.py --live +``` + +--- + +## Existing Cluster Configs (Reference) + +| Cluster | YAML | GPUs/node | Arch | Internet | Partition | Account | +|---------|------|-----------|------|----------|-----------|---------| +| M2 (MBZ) | `eval/clusters/m2.yaml` | 8× H200 | x86_64 | yes | main | — | +| MBZ/M1 | `eval/clusters/mbz.yaml` | 8× H200 | x86_64 | yes | 
main | — | +| Jupiter | `eval/clusters/jupiter.yaml` | 4× GH200 | aarch64 | **no** | booster | reformo | + +--- + +## Troubleshooting + +### "ERROR: EVAL_PROJECT_ROOT not set" +You're running the sbatch directly without the listener. Use the listener with `--cluster-config`, or set the env vars manually. + +### "ERROR: OTAGENT_DIR not set" +The conda env wasn't found. Check that `conda_envs` in your cluster YAML has the correct prefix path. + +### Port collision (vLLM "Port X already in use" spam) +- **Without `--pack-jobs`**: The sbatch derives a unique port from `SLURM_JOB_ID`. If another user's job is on the same node, ports may collide. The sbatch will increment until it finds a free port. +- **With `--pack-jobs`**: The listener centrally assigns ports. Check the log for `Pack: (GPUs X/8, port XXXXX)`. +- **With DP > 1**: Do NOT export `VLLM_PORT` as an environment variable — vLLM's DP subprocesses all read it and try to bind the same port. The sbatch passes it via `--port` CLI flag only. + +### No internet on compute nodes +Set `proxy.enabled: true` in your cluster YAML and provide `proxy.login_node` and `proxy.proxychains_bin`. The sbatch will auto-configure SSH tunnel + proxychains. Pre-download all models/datasets on the login node before submitting jobs. + +### Model needs newer vLLM (Qwen3.5, GLM-4.7, etc.) +Use `--conda-env otagent2` with a second conda env that has vLLM ≥0.17. diff --git a/eval/docs/CLUSTER_SETUP_GUIDE.md b/eval/docs/CLUSTER_SETUP_GUIDE.md new file mode 100644 index 00000000..f0503183 --- /dev/null +++ b/eval/docs/CLUSTER_SETUP_GUIDE.md @@ -0,0 +1,294 @@ +# Setting Up OpenThoughts-Agent Eval on a New MBZUAI Cluster + +## 1. Transfer Code to the New Cluster + +Since we don't want to push uncommitted changes to GitHub, use `git bundle` to create a single portable file containing the full repo + all local changes. 
+ +### On the source cluster (current MBZ H200) + +```bash +cd /mnt/weka/home/richard.zhuang/OpenThoughts-Agent + +# Stage everything (including untracked eval/MBZ files) +git add -A + +# Create a stash commit (doesn't affect your branch) +git stash + +# Create bundle of the entire repo (all branches + tags) +git bundle create /tmp/openthoughts-agent.bundle --all + +# Pop stash back +git stash pop + +# Also bundle the uncommitted/untracked files separately +# (git bundle only contains committed objects) +tar czf /tmp/openthoughts-uncommitted.tar.gz \ + --exclude='jobs/' \ + --exclude='__pycache__' \ + --exclude='*.pyc' \ + --exclude='.eggs' \ + eval/MBZ/ \ + hpc/dotenv/mbz.env \ + database/unified_db/utils.py \ + .gitignore + +# Transfer both files to new cluster +scp /tmp/openthoughts-agent.bundle :/path/to/scratch/ +scp /tmp/openthoughts-uncommitted.tar.gz :/path/to/scratch/ +``` + +**Alternative: if clusters share a filesystem (same Weka mount)**, just reference the same path — no transfer needed. + +**Alternative: rsync** (if you prefer a live copy instead of git bundle): +```bash +rsync -avz --exclude='jobs/' --exclude='__pycache__' --exclude='*.pyc' \ + /mnt/weka/home/richard.zhuang/OpenThoughts-Agent/ \ + :/path/to/scratch/OpenThoughts-Agent/ +``` + +### On the destination cluster + +```bash +SCRATCH="/path/to/your/scratch" # e.g., /mnt/weka/home/ +cd "$SCRATCH" + +# Clone from bundle +git clone /path/to/openthoughts-agent.bundle OpenThoughts-Agent +cd OpenThoughts-Agent + +# Checkout working branch +git checkout penfever/working + +# Apply uncommitted files on top +tar xzf /path/to/openthoughts-uncommitted.tar.gz + +# Set the real remote (so future git pull works) +git remote set-url origin https://github.com//OpenThoughts-Agent.git +``` + +--- + +## 2. 
Install Miniconda (if not present) + +```bash +SCRATCH="/path/to/your/scratch" + +# Download and install +wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda.sh +bash /tmp/miniconda.sh -b -p "$SCRATCH/miniconda3" +eval "$($SCRATCH/miniconda3/bin/conda shell.bash hook)" +conda init bash +``` + +--- + +## 3. Create the `otagent` Conda Environment + +```bash +SCRATCH="/path/to/your/scratch" +DCFT="$SCRATCH/OpenThoughts-Agent" + +eval "$(conda shell.bash hook 2>/dev/null)" || source "$SCRATCH/miniconda3/etc/profile.d/conda.sh" + +# Create Python 3.12 env +conda create -n otagent python=3.12 -y +conda activate otagent + +# Install uv (fast pip replacement) +pip install uv + +# Install PyTorch (CUDA 12.8 — adjust for your cluster's CUDA version) +uv pip install torch==2.9.0 torchvision==0.24.0 torchaudio==2.9.0 \ + --index-url https://download.pytorch.org/whl/cu128 + +# Install vLLM 0.13.0 (the version used by otagent for existing model evals) +uv pip install "vllm==0.13.0" + +# Install the project in editable mode (pulls core deps from pyproject.toml) +cd "$DCFT" +uv pip install -e . 
--no-deps + +# Install eval infrastructure packages +uv pip install \ + "pydantic>=2.0.0,<3.0.0" \ + pyyaml \ + omegaconf \ + wandb \ + bs4 \ + "numpy<=2.26.0" \ + "huggingface_hub>=0.20.0,<1.0.0" \ + "datasets>=2.0.0" \ + "supabase>=2.22.3" \ + "python-dotenv>=1.0.0" \ + "google-cloud-storage" \ + h5py \ + certifi \ + rapidfuzz \ + "uv>=0.4.17" \ + socksio \ + "litellm>=1.80.0" \ + "ray[default]>=2.50.0" \ + "hydra-core>=1.3.2" \ + aiohttp-socks \ + Jinja2 \ + "transformers==4.57.3" \ + "accelerate==1.12.0" + +# Install harbor (with Daytona support) +uv pip install "harbor[daytona] @ git+https://github.com/laude-institute/harbor.git@penfever/temp-override" + +# Install dynamic-semaphore +uv pip install "dynamic-semaphore @ git+https://github.com/penfever/dynamic-semaphore" +``` + +### Verify + +```bash +python -c " +import torch; print(f'torch: {torch.__version__} CUDA: {torch.version.cuda}') +import vllm; print(f'vllm: {vllm.__version__}') +import transformers; print(f'transformers: {transformers.__version__}') +import ray; print(f'ray: {ray.__version__}') +import harbor; print(f'harbor: {harbor.__version__}') +import litellm; print('litellm: OK') +import daytona; print('daytona: OK') +from database.unified_db.utils import upload_eval_results; print('unified_db: OK') +" +``` + +Expected output: +``` +torch: 2.9.0 CUDA: 12.8 +vllm: 0.13.0 +transformers: 4.57.3 +ray: 2.54.0 +harbor: 0.1.45 +litellm: OK +daytona: OK +unified_db: OK +``` + +--- + +## 4. (Optional) Create `otagent2` for Newer Models + +If you need to evaluate models like Qwen3.5 that require newer vLLM: + +```bash +bash eval/MBZ/setup_eval_env.sh +``` + +This creates `otagent2` with vLLM 0.17.1 + transformers from source. + +--- + +## 5. 
Configure Secrets + +```bash +# Copy the template and fill in your keys +cp eval/MBZ/secret.env.template ~/secrets.env +# Edit ~/secrets.env with real values: +# DAYTONA_API_KEY, DAYTONA_TARGET, HF_TOKEN, +# SUPABASE_URL, SUPABASE_ANON_KEY, SUPABASE_SERVICE_ROLE_KEY + +# For RL org (if needed): +cp eval/MBZ/secret.env.template ~/secrets_rl_org.env +# Edit with RL org keys (DAYTONA_TARGET='RL') +``` + +--- + +## 6. Adapt Cluster-Specific Paths + +Files that need path updates for the new cluster: + +| File | What to change | +|------|---------------| +| `eval/MBZ/unified_eval_harbor_v4.sbatch` | `SCRATCH=` path (line 92), SLURM partition/QoS (`#SBATCH -p`, `#SBATCH --qos`) | +| `eval/MBZ/unified_eval_listener_v4.py` | Default paths if launching from new cluster | +| `eval/MBZ/reupload_hf.py` | `SCRATCH=`, `DEFAULT_EVAL_JOBS_DIR`, `DEFAULT_LOG_DIR` | +| `hpc/dotenv/mbz.env` | Cluster-specific env vars | + +### Key path pattern + +The sbatch uses `SCRATCH` as the root for everything: +```bash +SCRATCH="/path/to/your/scratch" # <-- CHANGE THIS +DCFT="$SCRATCH/OpenThoughts-Agent" +``` + +All other paths (`VLLM_CACHE_ROOT`, `HF_HUB_CACHE`, etc.) derive from `$SCRATCH`. + +--- + +## 7. Download Datasets + +```bash +conda activate otagent + +# Download eval datasets to HF cache +python eval/MBZ/snapshot_download.py DCAgent/dev_set_v2 +python eval/MBZ/snapshot_download.py DCAgent2/terminal_bench_2 +``` + +--- + +## 8. Pre-download Model Weights + +Compute nodes typically have no internet. Pre-download on login node: + +```bash +python -c " +from huggingface_hub import snapshot_download +snapshot_download('your-org/model-name', cache_dir='$SCRATCH/.cache/huggingface/hub') +" +``` + +--- + +## 9. 
Test a Single Job
+
+```bash
+conda activate otagent
+
+# Manual single-job test (no listener)
+sbatch eval/MBZ/unified_eval_harbor_v4.sbatch \
+  your-org/model-name \
+  DCAgent/dev_set_v2
+```
+
+Check logs:
+```bash
+tail -f experiments/logs/terminal_<jobid>.out
+tail -f experiments/logs/vllm_<jobid>.log
+```
+
+---
+
+## 10. Start the Listener
+
+```bash
+# Basic listener (scans DB for pending evals)
+python eval/MBZ/unified_eval_listener_v4.py \
+  --datasets DCAgent/dev_set_v2 DCAgent2/terminal_bench_2 \
+  --priority-file eval/MBZ/lists/priority_models.txt \
+  --secrets-file ~/secrets.env \
+  --verbose
+
+# For newer models (Qwen3.5+), add:
+#   --conda-env otagent2
+```
+
+---
+
+## Troubleshooting
+
+| Issue | Fix |
+|-------|-----|
+| `RequestsDependencyWarning: chardet` | Harmless warning, ignore |
+| vLLM fails with `qwen3_5` not recognized | Use `otagent2` env (vLLM 0.17.1) |
+| HF upload: XET permission denied | Ensure `HF_XET_CACHE` is set (already in v4 sbatch) |
+| HF upload: subagent traces bloating dataset | Fixed in `database/unified_db/utils.py` (export_subagents forwarding) |
+| Snapshot quota exceeded | Daytona org limited to 40 snapshots; request quota increase |
+| CUDA version mismatch | Change `--index-url` in torch install to match your CUDA (cu118, cu121, cu124, cu128) |
diff --git a/eval/docs/EVAL_WORKFLOW.md b/eval/docs/EVAL_WORKFLOW.md
new file mode 100644
index 00000000..78ac05a9
--- /dev/null
+++ b/eval/docs/EVAL_WORKFLOW.md
@@ -0,0 +1,1431 @@
+# Harbor Evaluation Workflow — Jupiter HPC
+
+End-to-end guide for running Harbor agent evaluations on the Jupiter cluster (JSC GH200).
+
+---
+
+## Table of Contents
+
+1. [Overview & Architecture](#1-overview--architecture)
+2. [Dataset Preparation](#2-dataset-preparation)
+3. [Snapshot Pre-creation](#3-snapshot-pre-creation)
+4. [Launching Evaluations](#4-launching-evaluations)
+5. [Monitoring Jobs](#5-monitoring-jobs)
+6. [Uploading Results](#6-uploading-results)
+7. 
[Database Schema](#7-database-schema) +8. [Common Pitfalls & Troubleshooting](#8-common-pitfalls--troubleshooting) +9. [Key File Paths Reference](#9-key-file-paths-reference) + +--- + +## 1. Overview & Architecture + +Harbor is a framework for evaluating AI agents against benchmark tasks. The evaluation pipeline has four main components: + +``` +┌──────────────┐ ┌──────────────────┐ ┌───────────┐ ┌──────────┐ +│ Harbor │────>│ Daytona Cloud │────>│ Verifier │────>│ Results │ +│ Agent │ │ Sandbox (Docker) │ │ (tests/) │ │ Upload │ +│ (terminus-2)│ │ │ │ │ │ HF + DB │ +└──────────────┘ └──────────────────┘ └───────────┘ └──────────┘ + │ │ + │ │ + vLLM Server Daytona API + (GPU node) (cloud sandboxes) +``` + +- **Agent**: `terminus-2` — the standard eval agent. Takes task instructions, executes in a sandbox, writes output. +- **Daytona**: Cloud sandbox provider. Each task runs in an isolated Docker container. Snapshots pre-built from `environment/Dockerfile`. +- **Verifier**: Runs `tests/test.sh` inside the container after the agent finishes. Writes reward (0 or 1) to `/logs/verifier/reward.txt`. +- **Supabase DB**: Stores benchmarks, tasks, jobs, trials, and model usage records. +- **HuggingFace Hub**: Stores agent traces (conversation logs) as HF datasets. + +### Execution Flow (Slurm) + +1. Sbatch starts vLLM server on GPU node (TP=4 across 4 GH200 GPUs) +2. SSH tunnel provides internet to compute node via SOCKS5 proxy +3. Harbor orchestrator runs N trials concurrently against Daytona sandboxes +4. Each trial: create sandbox → agent runs → verifier checks → record result +5. After all trials: check DaytonaError count → upload traces to HF → upload records to DB + +--- + +## 2. 
Dataset Preparation + +### Task Directory Structure + +Each task is a directory with this layout: + +``` +/ +├── task.toml # Configuration (timeouts, resources, metadata) +├── instruction.md # Natural language task description for the agent +├── environment/ # Docker build context +│ ├── Dockerfile # Container definition (required) +│ └── workspace/ # Optional files copied into container +├── tests/ # Verification scripts +│ ├── test.sh # Main test runner (writes reward to /logs/verifier/reward.txt) +│ └── ... # Test files, expected answers, etc. +└── solution/ # Optional reference solution + └── solve.sh +``` + +### task.toml Format + +```toml +version = "1.0" + +[metadata] +author_name = "Your Name" +author_email = "you@example.com" +difficulty = "medium" +category = "reasoning" +tags = ["qa", "web-search"] +source = "gaia-benchmark/GAIA" + +[verifier] +timeout_sec = 300 + +[verifier.env] +# Optional: env vars passed to verifier (e.g., for LLM judges) +# Values with ${VAR} are resolved from the host environment at runtime +OPENAI_API_KEY = "${OPENAI_API_KEY}" +MODEL_NAME = "openai/gpt-5-2025-08-07" + +[agent] +timeout_sec = 600 + +[environment] +build_timeout_sec = 600 +cpus = 1 +memory_mb = 2048 +storage_mb = 10240 +``` + +### Three Verifier Patterns + +**1. String Match (GAIA)** +```bash +# tests/test.sh +AGENT_ANSWER=$(cat /app/answer.txt | tr '[:upper:]' '[:lower:]' | xargs) +EXPECTED=$(cat /tests/expected_answer.txt | tr '[:upper:]' '[:lower:]' | xargs) +if [ "$AGENT_ANSWER" = "$EXPECTED" ]; then + echo 1 > /logs/verifier/reward.txt +else + echo 0 > /logs/verifier/reward.txt +fi +``` + +**2. Code Tests (Aider Polyglot)** +```bash +# tests/test.sh — runs language-specific test suite +# C++: cmake + ctest, Python: pytest, Go: go test, etc. +# Passes → reward=1, Fails → reward=0 +``` + +**3. 
LLM Judge (FinanceAgent)** +```bash +# tests/test.sh — calls run_test.py which uses LiteLLM +# Sends agent answer + expected answer to gpt-5 for semantic comparison +# Writes judgment to /logs/verifier/judgment.json +# Writes reward (0 or 1) to /logs/verifier/reward.txt +``` + +The `[verifier.env]` section in task.toml supplies API keys to the LLM judge. + +### Environment Hashing + +Harbor uses `environment_dir_hash_truncated()` to compute a 12-character hex hash of the entire `environment/` directory. This hash determines the snapshot name used by Daytona. + +```python +# From harbor/utils/container_cache.py +def environment_dir_hash(env_dir: Path) -> str: + h = hashlib.sha256() + for file_path in sorted(env_dir.rglob("*")): + if file_path.is_file(): + rel = str(file_path.relative_to(env_dir)) + h.update(rel.encode("utf-8")) + h.update(file_path.read_bytes()) + return h.hexdigest() + +def environment_dir_hash_truncated(env_dir: Path, truncate: int = 12) -> str: + return environment_dir_hash(env_dir)[:truncate] +``` + +Key details: +- Hashes **all files** in `environment/` (not just Dockerfile), including workspace files +- Files processed in sorted order for determinism +- Both relative path and file content contribute to the hash +- Tasks with identical `environment/` directories share a snapshot + +### Pre-downloaded Datasets + +These datasets are already downloaded and ready to use on the shared filesystem: + +| Dataset | Tasks | Local Path | +|---------|------:|------------| +| Aider Polyglot | 225 | `/e/data1/.../guha1/datasets/DCAgent2_aider_polyglot` | +| BFCL Parity | 123 | `/e/data1/.../guha1/datasets/DCAgent2_bfcl-parity` | +| Terminal Bench v2 | 89 | `/e/data1/.../guha1/datasets/DCAgent2_terminal_bench_2` | +| Dev Set 71 | 70 | `/e/data1/.../guha1/datasets/DCAgent_dev_set_71_tasks` | +| Dev Set v2 | 100 | `/e/data1/.../guha1/datasets/DCAgent_dev_set_v2` | +| FinanceAgent | 50 | `/e/data1/.../guha1/datasets/financeagent` | +| FinanceAgent Terminal | 50 
| `/e/data1/.../guha1/datasets/financeagent_terminal` | +| FinanceAgent Terminal v2 | 50 | `/e/data1/.../guha1/datasets/financeagent_terminal_v2` | +| FinanceAgent Terminal+Keys | 50 | `/e/data1/.../guha1/datasets/financeagent_terminal_withkeys` | +| GAIA (full) | 165 | `/e/data1/.../guha1/datasets/gaia` | +| GAIA 127 | 127 | `/e/data1/.../guha1/datasets/gaia_127` | +| GAIA 127 +Tools | 127 | `/e/data1/.../guha1/datasets/gaia_127_withtools` | +| MedAgentBench | 300 | `/e/data1/.../guha1/datasets/medagentbench` | + +All paths above expand to `/e/data1/datasets/playground/mmlaion/shared/guha1/datasets/`. + +### Downloading a Dataset from HuggingFace + +Datasets are hosted as HF dataset repos (e.g., `DCAgent/dev_set_v2`). Download with `snapshot_download.py`: + +```bash +PYTHON="/e/scratch/jureap59/feuer1/miniforge3/envs/otagent/bin/python" +DCFT="/e/scratch/jureap59/guha1/OpenThoughts-Agent" +REPO_ID="DCAgent2/aider_polyglot" + +# Download to a local directory (real files, no symlinks — Daytona needs real files) +LOCAL_DIR="/e/data1/datasets/playground/mmlaion/shared/guha1/datasets/DCAgent2_aider_polyglot" +$PYTHON "$DCFT/eval/jupiter/snapshot_download.py" "$REPO_ID" --local-dir "$LOCAL_DIR" + +# Verify +ls "$LOCAL_DIR" | head -5 +ls "$LOCAL_DIR"/$(ls "$LOCAL_DIR" | head -1)/ # Should show task.toml, instruction.md, environment/, tests/ +``` + +The script uses `huggingface_hub.snapshot_download()` with `local_dir` to get real files (not symlinks). If the local dir already has valid task directories it skips re-downloading. + +For local paths, no download needed — just point directly: +```bash +DATASET_PATH="/e/data1/datasets/playground/mmlaion/shared/guha1/datasets/gaia_127" +``` + +### Checking Unique Dockerfiles / Environment Hashes + +Before pre-creating snapshots, you need to know how many unique environments exist in your dataset. This tells you exactly which snapshots to create. 
+ +```bash +PYTHON="/e/scratch/jureap59/feuer1/miniforge3/envs/otagent/bin/python" +export PYTHONPATH="/e/scratch/jureap59/etash/harbor/src" + +# Quick check: count unique environment hashes and list them +$PYTHON -c " +from pathlib import Path +from harbor.utils.container_cache import analyze_task_dockerfiles, get_task_environment_hash + +dataset = Path('/e/data1/datasets/playground/mmlaion/shared/guha1/datasets/gaia_127') +task_dirs = sorted([d for d in dataset.iterdir() if d.is_dir() and (d / 'instruction.md').exists()]) + +stats = analyze_task_dockerfiles(task_dirs) +print(f'Total tasks: {stats.total_tasks}') +print(f'Tasks with Dockerfile: {stats.tasks_with_dockerfile}') +print(f'Without Dockerfile: {stats.tasks_without_dockerfile}') +print(f'Unique env hashes: {stats.unique_hashes}') +print() +print('Hash distribution (hash → task count):') +for h, count in stats.hash_counts.most_common(): + # Pick one representative task for this hash (needed for Dockerfile path later) + for d in task_dirs: + if get_task_environment_hash(d) == h: + print(f' harbor__{h}__snapshot ({count} tasks) e.g. {d.name}') + break +" +``` + +Example output for GAIA-127: +``` +Total tasks: 127 +Tasks with Dockerfile: 127 +Without Dockerfile: 0 +Unique env hashes: 1 +Hash distribution: + harbor__92ea7b6dd33f__snapshot (127 tasks) e.g. 0383a3ee-47a7-41a4-b493-519bdefe0488 +``` + +Example output for Aider Polyglot (many unique environments): +``` +Total tasks: 225 +Tasks with Dockerfile: 225 +Without Dockerfile: 0 +Unique env hashes: 225 +Hash distribution: + harbor__a1b2c3d4e5f6__snapshot (1 tasks) e.g. polyglot_cpp_allergies + harbor__b2c3d4e5f6a7__snapshot (1 tasks) e.g. polyglot_go_bowling + ... +``` + +### Creating a New Dataset from Scratch + +```bash +# 1. Create dataset directory structure +mkdir -p my_dataset/task_001/{environment,tests,solution} + +# 2. Write task.toml, instruction.md, Dockerfile, test.sh, solve.sh + +# 3. 
Verify hash for a single task +$PYTHON -c " +from pathlib import Path +from harbor.utils.container_cache import environment_dir_hash_truncated +h = environment_dir_hash_truncated(Path('my_dataset/task_001/environment')) +print(f'Hash: {h}') +print(f'Snapshot name: harbor__{h}__snapshot') +" + +# 4. Check unique snapshots across entire dataset (see section above) +``` + +--- + +## 3. Snapshot Pre-creation + +### Why Pre-create + +Without pre-creation, each unique environment hash triggers a Docker build inside Daytona on the first trial that uses it. This adds minutes of latency and can fail under load. Pre-creating snapshots ensures they're in `ACTIVE` state before the eval starts. + +### Snapshot Naming Convention + +``` +harbor__{hash}__snapshot # Regular evals +harbor__{hash}__RL__snapshot # RL training evals (DAYTONA_TARGET=RL) +``` + +Where `{hash}` is the 12-char truncated SHA256 from `environment_dir_hash_truncated()`. + +### Daytona Keys and Orgs + +Three Daytona orgs are used. Keys are stored in `~/secrets.env` — **never hardcode them in scripts**. + +| Key Name | Env Var | Use | +|----------|---------|-----| +| org1 | `DAYTONA_KEY_ORG1` | Data + eval | +| org2 | `DAYTONA_KEY_ORG2` | Eval only (more quota) | +| RL key | `DAYTONA_KEY_RL` | RL training only | + +Snapshots must be pre-created on **both org1 and org2** for regular evals (the sbatch script randomly selects one with 3:1 weighting toward org2). For RL evals, pre-create on the RL org only. 
+ +### End-to-End: Compute Hashes → Pre-create on Both Orgs + +Here's the full workflow from dataset to ready-to-eval: + +**Step 1: Compute unique environment hashes** (see "Checking Unique Dockerfiles" above) + +```bash +PYTHON="/e/scratch/jureap59/feuer1/miniforge3/envs/otagent/bin/python" +export PYTHONPATH="/e/scratch/jureap59/etash/harbor/src" +DATASET="/e/data1/datasets/playground/mmlaion/shared/guha1/datasets/gaia_127" + +# Get unique hashes and a representative Dockerfile path for each +$PYTHON -c " +from pathlib import Path +from harbor.utils.container_cache import analyze_task_dockerfiles, get_task_environment_hash + +dataset = Path('$DATASET') +task_dirs = sorted([d for d in dataset.iterdir() if d.is_dir() and (d / 'instruction.md').exists()]) +stats = analyze_task_dockerfiles(task_dirs) + +print(f'Unique snapshots needed: {stats.unique_hashes}') +for h, count in stats.hash_counts.most_common(): + # Find one representative task for this hash + for d in task_dirs: + if get_task_environment_hash(d) == h: + dockerfile = d / 'environment' / 'Dockerfile' + print(f' harbor__{h}__snapshot -> {dockerfile} ({count} tasks)') + break +" +``` + +**Step 2: Pre-create snapshots on both orgs** + +The pre-creation script reads keys from `~/secrets.env` and creates snapshots on both Daytona orgs: + +```python +#!/usr/bin/env python3 +"""Pre-create Daytona snapshots for eval datasets on both orgs. + +Keys are read from ~/secrets.env (DAYTONA_KEY_ORG1, DAYTONA_KEY_ORG2). +Edit the SNAPSHOTS dict below with output from step 1. 
+""" +import asyncio +import os + +# Load secrets +with open(os.path.expanduser("~/secrets.env")) as f: + for line in f: + line = line.strip() + if line and not line.startswith("#") and "=" in line: + if line.startswith("export "): + line = line[7:] + k, v = line.split("=", 1) + os.environ[k.strip()] = v.strip().strip('"').strip("'") + +import sys +sys.path.insert(0, "/e/scratch/jureap59/etash/harbor/src") +sys.path.insert(0, "/e/scratch/jureap59/guha1/pip_packages") +sys.path.insert(0, "/e/scratch/jureap59/etash/pip_extras") + +from daytona import AsyncDaytona, DaytonaConfig, CreateSnapshotParams, Image, Resources +from daytona._async.snapshot import SnapshotState + +# --- EDIT THIS: snapshot name → representative Dockerfile path --- +SNAPSHOTS = { + "harbor__92ea7b6dd33f__snapshot": "/e/data1/.../gaia/task_abc/environment/Dockerfile", + "harbor__bfc3340ef3c7__snapshot": "/e/data1/.../financeagent/task_0/environment/Dockerfile", +} + +# Keys from environment (loaded from ~/secrets.env) +DAYTONA_KEYS = { + "org1": os.environ["DAYTONA_KEY_ORG1"], + "org2": os.environ["DAYTONA_KEY_ORG2"], +} + + +async def create_snapshot(client, name, dockerfile_path, org_name): + """Create a single snapshot, handling already-exists gracefully.""" + try: + snap = await client.snapshot.get(name) + if snap.state == SnapshotState.ACTIVE: + print(f" [{org_name}] {name}: already ACTIVE, skipping") + return True + elif snap.state == SnapshotState.ERROR: + print(f" [{org_name}] {name}: ERROR state, deleting and recreating...") + await client.snapshot.delete(snap) + except Exception: + pass # Doesn't exist yet + + print(f" [{org_name}] Creating {name} from {dockerfile_path}...") + try: + await client.snapshot.create( + CreateSnapshotParams( + name=name, + image=Image.from_dockerfile(dockerfile_path), + resources=Resources(cpu=1, memory=1, disk=3), + ) + ) + except Exception as e: + if "already exists" in str(e).lower(): + print(f" [{org_name}] {name}: already exists (global), OK") + return 
True + print(f" [{org_name}] {name}: create FAILED: {e}") + return False + + # Poll for ACTIVE state (up to 10 minutes) + for i in range(120): + await asyncio.sleep(5) + try: + snap = await client.snapshot.get(name) + if snap.state == SnapshotState.ACTIVE: + print(f" [{org_name}] {name}: ACTIVE (took ~{i*5}s)") + return True + elif snap.state == SnapshotState.ERROR: + print(f" [{org_name}] {name}: entered ERROR state") + return False + except Exception: + pass + print(f" [{org_name}] {name}: TIMEOUT waiting for ACTIVE") + return False + + +async def main(): + for org_name, api_key in DAYTONA_KEYS.items(): + print(f"\n=== {org_name} ({api_key[:12]}...) ===") + client = AsyncDaytona(DaytonaConfig(api_key=api_key, target="us")) + try: + for snap_name, dockerfile in SNAPSHOTS.items(): + await create_snapshot(client, snap_name, dockerfile, org_name) + finally: + await client.close() + print("\nDone!") + + +if __name__ == "__main__": + asyncio.run(main()) +``` + +**Step 3: Run it** (must use Python 3.12, login node only): + +```bash +/e/scratch/jureap59/feuer1/miniforge3/envs/otagent/bin/python precreate_snapshots.py +``` + +Expected output: +``` +=== org1 (dtn_17868a19...) === + [org1] harbor__92ea7b6dd33f__snapshot: already ACTIVE, skipping + [org1] harbor__bfc3340ef3c7__snapshot: ACTIVE (took ~30s) + +=== org2 (dtn_ecfb7592...) === + [org2] harbor__92ea7b6dd33f__snapshot: already ACTIVE, skipping + [org2] harbor__bfc3340ef3c7__snapshot: ACTIVE (took ~25s) + +Done! +``` + +For datasets with many unique snapshots (like Aider Polyglot with 225), this can take a while. The snapshots are global — once created on an org, they stay cached. 
+ +### Automated Hash → Snapshot Script + +For large datasets, you can combine hash computation + pre-creation into one script: + +```bash +PYTHON="/e/scratch/jureap59/feuer1/miniforge3/envs/otagent/bin/python" +export PYTHONPATH="/e/scratch/jureap59/etash/harbor/src:/e/scratch/jureap59/guha1/pip_packages:/e/scratch/jureap59/etash/pip_extras" +DATASET="/e/data1/datasets/playground/mmlaion/shared/guha1/datasets/DCAgent2_aider_polyglot" + +# Generate snapshot dict and pre-create in one go +$PYTHON -c " +import asyncio, os, sys + +# Load secrets +with open(os.path.expanduser('~/secrets.env')) as f: + for line in f: + line = line.strip() + if line and not line.startswith('#') and '=' in line: + if line.startswith('export '): line = line[7:] + k, v = line.split('=', 1) + os.environ[k.strip()] = v.strip().strip('\"').strip(\"'\") + +from pathlib import Path +from harbor.utils.container_cache import get_task_environment_hash +from daytona import AsyncDaytona, DaytonaConfig, CreateSnapshotParams, Image, Resources +from daytona._async.snapshot import SnapshotState + +dataset = Path('$DATASET') +task_dirs = sorted([d for d in dataset.iterdir() if d.is_dir() and (d / 'instruction.md').exists()]) + +# Build unique hash → Dockerfile mapping +snapshots = {} +for d in task_dirs: + h = get_task_environment_hash(d) + if h: + snap_name = f'harbor__{h}__snapshot' + if snap_name not in snapshots: + snapshots[snap_name] = str(d / 'environment' / 'Dockerfile') + +print(f'Found {len(snapshots)} unique snapshots to pre-create') + +async def precreate_all(): + for org_name, key_env in [('org1', 'DAYTONA_KEY_ORG1'), ('org2', 'DAYTONA_KEY_ORG2')]: + api_key = os.environ[key_env] + print(f'\n=== {org_name} ({api_key[:12]}...) 
===') + client = AsyncDaytona(DaytonaConfig(api_key=api_key, target='us')) + try: + for name, dockerfile in snapshots.items(): + try: + snap = await client.snapshot.get(name) + if snap.state == SnapshotState.ACTIVE: + print(f' [{org_name}] {name}: ACTIVE') + continue + except Exception: + pass + print(f' [{org_name}] Creating {name}...') + try: + await client.snapshot.create(CreateSnapshotParams( + name=name, image=Image.from_dockerfile(dockerfile), + resources=Resources(cpu=1, memory=1, disk=3))) + except Exception as e: + if 'already exists' in str(e).lower(): + print(f' [{org_name}] {name}: already exists') + continue + print(f' [{org_name}] {name}: FAILED: {e}') + continue + # Poll for ACTIVE + for i in range(120): + await asyncio.sleep(5) + snap = await client.snapshot.get(name) + if snap.state == SnapshotState.ACTIVE: + print(f' [{org_name}] {name}: ACTIVE ({i*5}s)') + break + elif snap.state == SnapshotState.ERROR: + print(f' [{org_name}] {name}: ERROR') + break + finally: + await client.close() + +asyncio.run(precreate_all()) +print('Done!') +" +``` + +Run with Python 3.12 (login node Python 3.9 is too old): +```bash +/e/scratch/jureap59/feuer1/miniforge3/envs/otagent/bin/python precreate_snapshots.py +``` + +### RL Snapshot Pre-creation + +For RL jobs, snapshot names include `__RL__`: +``` +harbor__{hash}__RL__snapshot +``` + +Use the RL Daytona key and `DaytonaConfig(api_key=RL_KEY, target="RL")`. + +--- + +## 4. Launching Evaluations + +### 4a. Slurm Jobs (GPU Models via sbatch) + +Use `unified_eval_harbor.sbatch` for models that need a local vLLM server. 
+ +**Positional Arguments:** +``` +$1 = MODEL # HF model name (e.g., mlfoundations-dev/some_model) +$2 = REPO_ID # HF dataset repo or local path (starts with / for local) +$3 = BENCHMARK_ID # Optional: DB benchmark UUID +$4 = RUN_TAG_ARG # Optional: override run tag +``` + +**Environment Variables:** +```bash +EVAL_N_CONCURRENT=128 # Concurrent trials (default: 128) +EVAL_GPU_MEMORY_UTIL=0.95 # vLLM GPU memory utilization (default: 0.95) +EVAL_DAYTONA_THRESHOLD=3 # Max DaytonaErrors before skipping upload (default: 3) +EVAL_SNAPSHOT_NAME=... # Force a specific snapshot template +EVAL_TIMEOUT_MULTIPLIER=... # Scale agent timeout +EVAL_OVERRIDE_MEMORY_MB=... # Override container memory +``` + +**Example Submission:** +```bash +sbatch --job-name="eval_mymodel_gaia" \ + eval/jupiter/unified_eval_harbor.sbatch \ + "mlfoundations-dev/my_model" \ + "/e/data1/datasets/playground/mmlaion/shared/guha1/datasets/gaia_127" +``` + +**What the sbatch script does:** + +1. **Environment setup**: Sources `jupiter.env`, `secrets.env`, sets PYTHONPATH, creates ld-linux wrappers (ARM GH200 needs system loader since conda has no exec perms on shared filesystem) +2. **Daytona key selection**: Randomly picks org1 (25%) or org2 (75%) +3. **SSH tunnel + proxychains**: Compute nodes have no internet; creates SOCKS5 tunnel to login node +4. **vLLM server**: Starts on port 8000 with TP=4, waits up to ~33 minutes for ready +5. **Dataset download**: Via proxychains (HF) or direct local path +6. **Start vs Resume logic**: If `$EVAL_JOBS_DIR/$RUN_TAG/config.json` exists, resumes (retries DaytonaError and EnvironmentStartTimeoutError trials). Otherwise starts fresh. +7. **DaytonaError check**: If error count > threshold, skips upload +8. **Upload**: Traces to HF, records to Supabase DB + +**Auto-resume in sbatch:** + +The sbatch script has built-in auto-resume. 
If you submit the same model+dataset combination and a previous job directory (`$EVAL_JOBS_DIR/$RUN_TAG/config.json`) exists, it automatically switches to `harbor jobs resume` instead of `harbor jobs start`. This means: + +- **Re-submitting the same sbatch is safe** — it picks up where it left off +- Only trials with transient errors (DaytonaError, EnvironmentStartTimeoutError, DaytonaRateLimitError) are retried +- Completed trials are skipped +- You can resubmit after a Slurm timeout or OOM without losing progress + +The run tag is derived from `${SAFE_REPO}_${SAFE_MODEL}` (e.g., `gaia_127_mlfoundations-dev_my_model`), so the same model+dataset always maps to the same job directory. + +```bash +# First run: starts fresh +sbatch eval/jupiter/unified_eval_harbor.sbatch "my-org/my-model" "/path/to/dataset" + +# Job times out or gets OOM-killed. Resubmit: +sbatch eval/jupiter/unified_eval_harbor.sbatch "my-org/my-model" "/path/to/dataset" +# ^ Automatically resumes, retrying only failed trials +``` + +**Manual harbor commands (for reference):** +```bash +# New job +harbor jobs start -p $DATASET_PATH --n-concurrent 128 --agent terminus-2 \ + --model "hosted_vllm/$MODEL" --env daytona \ + --agent-kwarg "api_base=http://localhost:8000/v1" \ + --agent-kwarg "key=fake_key" --n-attempts 3 \ + --job-name "$RUN_TAG" --config eval/jupiter/dcagent_eval_config.yaml + +# Resume (retry transient errors only) +harbor jobs resume -p "$EXISTING_JOB_DIR" \ + --filter-error-type EnvironmentStartTimeoutError \ + --filter-error-type DaytonaError \ + --filter-error-type DaytonaRateLimitError +``` + +### 4b. Commercial Models (No GPU, Login Node / tmux) + +Commercial API models (OpenAI, Anthropic) don't need vLLM — run directly on the login node inside a tmux session. + +**Concrete commands to run commercial evals:** + +```bash +# 1. Start a tmux session +tmux new-session -s commercial_evals + +# 2. 
Set up environment +HARBOR_PYTHON="/e/scratch/jureap59/feuer1/miniforge3/envs/otagent/bin/python" +export PYTHONPATH="/e/scratch/jureap59/etash/harbor/src:/e/scratch/jureap59/guha1/pip_packages:/e/scratch/jureap59/etash/pip_extras" +EVAL_JOBS_DIR="/e/data1/datasets/playground/mmlaion/shared/guha1/eval_jobs" + +source ~/secrets.env +# DAYTONA_API_KEY, OPENAI_API_KEY, HF_TOKEN loaded from secrets.env +export DAYTONA_TARGET="us" + +# 3. Run a single model on a single dataset +$HARBOR_PYTHON -m harbor.cli.main jobs start \ + -p "/e/data1/datasets/playground/mmlaion/shared/guha1/datasets/gaia_127" \ + --n-concurrent 32 \ + --agent terminus-2 \ + --model "openai/gpt-5-mini" \ + --env daytona \ + --ek auto_snapshot=true \ + --no-force-build \ + --n-attempts 1 \ + --job-name "gaia_127_openai_gpt-5-mini" \ + --jobs-dir "$EVAL_JOBS_DIR" + +# 4. Resume a failed/partial run +$HARBOR_PYTHON -m harbor.cli.main jobs resume \ + -p "$EVAL_JOBS_DIR/gaia_127_openai_gpt-5-mini" \ + --filter-error-type EnvironmentStartTimeoutError \ + --filter-error-type DaytonaError +``` + +**Batch script for multiple models x datasets:** + +The script at `/e/scratch/jureap59/etash/run_commercial_evals.sh` loops over models and datasets with auto-skip (if `result.json` exists) and auto-resume (if `config.json` exists): + +```bash +#!/bin/bash +set -eo pipefail + +HARBOR_PYTHON="/e/scratch/jureap59/feuer1/miniforge3/envs/otagent/bin/python" +export PYTHONPATH="/e/scratch/jureap59/etash/harbor/src:/e/scratch/jureap59/guha1/pip_packages:/e/scratch/jureap59/etash/pip_extras" +EVAL_JOBS_DIR="/e/data1/datasets/playground/mmlaion/shared/guha1/eval_jobs" + +source ~/secrets.env +# Keys loaded: DAYTONA_API_KEY, OPENAI_API_KEY +export DAYTONA_TARGET="us" + +MODELS=("openai/gpt-5-mini" "openai/gpt-5-nano" "openai/gpt-5") + +DATASETS=( + "gaia_127:/e/data1/datasets/playground/mmlaion/shared/guha1/datasets/gaia_127" + "financeagent:/e/data1/datasets/playground/mmlaion/shared/guha1/datasets/financeagent" +) + 
+N_CONCURRENT=32 + +harbor_cmd() { $HARBOR_PYTHON -m harbor.cli.main "$@"; } + +for ds_entry in "${DATASETS[@]}"; do + DS_NAME="${ds_entry%%:*}" + DS_PATH="${ds_entry##*:}" + + for MODEL in "${MODELS[@]}"; do + SAFE_MODEL=$(echo "$MODEL" | tr '/:' '_') + JOB_NAME="${DS_NAME}_${SAFE_MODEL}" + JOB_DIR="${EVAL_JOBS_DIR}/${JOB_NAME}" + + # Skip completed + if [ -f "$JOB_DIR/result.json" ]; then + echo "SKIP: $JOB_NAME already done" + continue + fi + + # Resume or start + if [ -d "$JOB_DIR" ] && [ -f "$JOB_DIR/config.json" ]; then + echo "RESUME: $JOB_NAME" + harbor_cmd jobs resume -p "$JOB_DIR" \ + --filter-error-type EnvironmentStartTimeoutError \ + --filter-error-type DaytonaError + else + echo "START: $JOB_NAME" + harbor_cmd jobs start -p "$DS_PATH" \ + --n-concurrent "$N_CONCURRENT" --agent terminus-2 \ + --model "$MODEL" --env daytona \ + --ek auto_snapshot=true --no-force-build \ + --n-attempts 1 --job-name "$JOB_NAME" \ + --jobs-dir "$EVAL_JOBS_DIR" + fi + done +done +``` + +Run in tmux: +```bash +tmux new-session -s commercial_evals +bash /e/scratch/jureap59/etash/run_commercial_evals.sh 2>&1 | tee /e/scratch/jureap59/etash/commercial_evals.log +# Ctrl-b d to detach, tmux attach -t commercial_evals to reattach +``` + +Key differences from Slurm: +- `--n-attempts 1` (commercial APIs are reliable, no retries needed) +- `--n-concurrent 32` (lower than Slurm's 128 — login node has limited resources) +- No vLLM, no SSH tunnel, no proxychains (login nodes have direct internet) +- Model specified directly (e.g., `openai/gpt-5-mini`), not as `hosted_vllm/...` + +### 4c. Automated Eval Listener (Queue Management + Dependencies) + +The `unified_eval_listener.py` is a long-running daemon that polls the Supabase DB for new models and auto-submits Slurm eval jobs. It handles queue management, deduplication, stale job detection, and Slurm dependency chains. + +**How it works:** +1. Polls DB for recently registered models (configurable lookback window) +2. 
For each (model, dataset) pair, checks if a job already exists (Pending/Started/Finished) +3. Skips finished jobs, detects and auto-cancels stale jobs +4. Creates a "Pending" DB entry, then submits `sbatch` +5. Supports **sliding-window batch-size** to limit concurrent Slurm jobs + +**Sliding-Window Dependencies (`--batch-size`):** + +With `--batch-size N`, at most N jobs run concurrently. Job `i` depends on job `i-N` finishing (using Slurm's `afterany` dependency). As one job finishes, the next starts immediately — no waiting for entire waves. + +``` +--batch-size 4 with 10 jobs: + +Job 0 ─────────────┐ +Job 1 ─────────────┤ (run immediately, first 4) +Job 2 ─────────────┤ +Job 3 ─────────────┤ +Job 4 ──afterany:0──┤ (starts when job 0 finishes) +Job 5 ──afterany:1──┤ (starts when job 1 finishes) +Job 6 ──afterany:2──┤ +... +``` + +**Example usage:** + +```bash +PYTHON="/e/scratch/jureap59/feuer1/miniforge3/envs/otagent/bin/python" + +# Dry run: see what would be submitted +$PYTHON eval/jupiter/unified_eval_listener.py \ + --preset dev --dry-run --verbose --once + +# Submit with batch-size (max 4 concurrent Slurm jobs) +$PYTHON eval/jupiter/unified_eval_listener.py \ + --preset dev --once --verbose --batch-size 4 + +# Submit with a priority file (only eval specific models) +$PYTHON eval/jupiter/unified_eval_listener.py \ + --preset aider --once --verbose \ + --priority-file eval/jupiter/priority_models.txt \ + --batch-size 4 + +# Long-running daemon (checks every 4 hours) +$PYTHON eval/jupiter/unified_eval_listener.py \ + --preset dev --verbose --check-hours 4 + +# Custom datasets (no preset) +$PYTHON eval/jupiter/unified_eval_listener.py \ + --datasets "DCAgent/dev_set_v2,DCAgent2/aider_polyglot" \ + --once --verbose --batch-size 2 + +# Add an explicit dependency (e.g., wait for another job first) +$PYTHON eval/jupiter/unified_eval_listener.py \ + --preset dev --once --dependency "afterany:12345" +``` + +**Available Presets:** + +| Preset | Datasets | 
N_Concurrent | Description | +|--------|----------|-------------|-------------| +| `dev` | DCAgent/dev_set_71_tasks | 128 | Dev set (71 tasks) | +| `v2` | DCAgent/dev_set_v2 | 128 | Dev set v2 | +| `bfcl` | DCAgent2/bfcl-parity | 16 | Berkeley Function Calling | +| `aider` | DCAgent2/aider_polyglot | 64 | Aider Polyglot | +| `swebench` | DCAgent/swebench_verified_eval_set | 32 | SWE-bench verified | +| `tb2` | DCAgent/terminal_bench_v2 | 64 | Terminal Bench v2 | + +**Manual sbatch with Slurm dependencies** (without the listener): + +```bash +# Submit 3 jobs, max 2 concurrent: +JOB1=$(sbatch --parsable eval/jupiter/unified_eval_harbor.sbatch "model-A" "/path/to/dataset") +JOB2=$(sbatch --parsable eval/jupiter/unified_eval_harbor.sbatch "model-B" "/path/to/dataset") +JOB3=$(sbatch --parsable --dependency=afterany:$JOB1 eval/jupiter/unified_eval_harbor.sbatch "model-C" "/path/to/dataset") +# JOB3 starts only after JOB1 finishes (success or failure) + +echo "Submitted: $JOB1, $JOB2, $JOB3" +squeue -u $USER # Verify +``` + +**Run tag & auto-resume interaction:** + +When using the listener or resubmitting manually, the run tag determines whether auto-resume kicks in: +- Run tag = `${SAFE_REPO}_${SAFE_MODEL}` (e.g., `gaia_127_mlfoundations-dev_my_model`) +- If `$EVAL_JOBS_DIR/$RUN_TAG/config.json` exists → auto-resumes +- If not → starts fresh + +This means resubmitting the same (model, dataset) is always safe. The listener uses this to handle Slurm timeouts: if a job times out, the next poll detects it as stale and resubmits, and the sbatch auto-resumes. + +--- + +## 5. 
Monitoring Jobs + +### Slurm Jobs + +```bash +# Check job status +squeue -u $USER + +# View live log +tail -f eval/jupiter/logs/eval_.out + +# Check vLLM log +tail -f eval/jupiter/logs/vllm_.log +``` + +### Job-Level result.json + +Located at `$EVAL_JOBS_DIR/$RUN_TAG/result.json` (written when job completes): + +```json +{ + "stats": { + "n_trials": 127, + "n_errors": 3, + "evals": { + "task_001__0": { + "reward_stats": {"mean": 1.0, "std": 0.0}, + "exception_stats": {} + }, + "task_002__0": { + "reward_stats": {"mean": 0.0, "std": 0.0}, + "exception_stats": {"DaytonaError": ["trial_id_1"]} + } + } + }, + "trial_results": [...] +} +``` + +### Trial-Level result.json + +Each trial directory (`$RUN_TAG/task_name__attempt/result.json`) has: + +```json +{ + "task_checksum": "abc123...", + "task_name": "task_001", + "verifier_result": { + "reward": 1.0 + }, + "exception_info": null +} +``` + +When a trial fails: +```json +{ + "exception_info": { + "exception_type": "DaytonaError", + "exception_message": "Sandbox creation timed out" + }, + "verifier_result": null +} +``` + +### Error Classification + +| Error Type | Retryable | Description | +|-----------|-----------|-------------| +| `DaytonaError` | Yes | Sandbox creation/connection failures | +| `DaytonaRateLimitError` | Yes | API rate limits | +| `EnvironmentStartTimeoutError` | Yes | Container took too long to start | +| `AgentTimeoutError` | No | Agent exceeded its timeout | +| `ContextLength` / `LLMError` | No | Model issues (context overflow, API error) | +| `VerifierTimeoutError` | No | Verifier timed out | +| `SandboxBuildFailedError` | No | Dockerfile build failed | + +### When to Retry a Job + +**Retry (resume) when:** +- **DaytonaError / DaytonaRateLimitError / EnvironmentStartTimeoutError** — these are transient infrastructure failures (sandbox creation hiccups, rate limits, slow container starts). Retrying usually succeeds. +- **Slurm timeout or OOM kill** — the job was interrupted, not failed. 
Resubmit the same sbatch command; auto-resume picks up where it left off. +- **SSH tunnel failure** — if the proxy died mid-job, Daytona calls fail. Retry after ensuring SSH key is configured. +- **A few DaytonaErrors but most trials succeeded** — resume to fill in the gaps. + +**Don't retry when:** +- **AgentTimeoutError** — the model genuinely couldn't solve the task in time. Retrying gives the same result. Consider increasing `EVAL_TIMEOUT_MULTIPLIER` if the timeout is too aggressive. +- **ContextLength / LLMError** — the model hit its context limit or returned invalid output. This is a model limitation, not infrastructure. Retrying won't help. +- **SandboxBuildFailedError** — the Dockerfile itself is broken. Fix the Dockerfile first. +- **VerifierTimeoutError** — the test suite is too slow. Fix the tests or increase verifier timeout in `task.toml`. +- **All trials have non-retryable errors** — the eval is done, the errors reflect real model/task behavior. + +**Rule of thumb**: check the error distribution first. If most errors are DaytonaError, resume. If most are AgentTimeoutError, the results are final. + +### Resuming Failed Trials + +```bash +# Resume only retryable errors +harbor jobs resume -p "$JOB_DIR" \ + --filter-error-type DaytonaError \ + --filter-error-type EnvironmentStartTimeoutError \ + --filter-error-type DaytonaRateLimitError +``` + +`harbor jobs resume` skips completed trials and only retries trials matching the filter. 
+ +For Slurm jobs, just resubmit the same sbatch — auto-resume is built in: +```bash +sbatch eval/jupiter/unified_eval_harbor.sbatch "my-org/my-model" "/path/to/dataset" +``` + +### Quick Error Distribution Check + +Read the job-level `result.json` (not individual trial dirs): + +```bash +python3 -c " +import json +from pathlib import Path +from collections import Counter + +result_path = Path('$EVAL_JOBS_DIR/$RUN_TAG/result.json') +data = json.loads(result_path.read_text()) +stats = data.get('stats', {}) + +print(f'Total trials: {stats.get(\"n_trials\", \"?\")}') +print(f'Total errors: {stats.get(\"n_errors\", \"?\")}') + +errors = Counter() +for eval_key, eval_data in stats.get('evals', {}).items(): + for exc_type, ids in eval_data.get('exception_stats', {}).items(): + errors[exc_type] += len(ids) if isinstance(ids, list) else 1 + +print(f'Error breakdown: {dict(errors)}') +print(f'Success: {stats.get(\"n_trials\", 0) - sum(errors.values())}') +" +``` + +--- + +## 6. Uploading Results + +### upload_eval_results() — Full Reference + +```python +upload_eval_results( + job_dir, # Path to job directory (required) + username="guha1", # Username for DB records (required) + error_mode="skip_on_error", # "skip_on_error" or "rollback_on_error" (required) + + # Auto-detected if not provided: + agent_name=None, # Agent name (from trial config) + agent_version=None, # Agent version (from trial config) + model_name=None, # Model name (from trial config) + benchmark_name="gaia_127", # Benchmark name + benchmark_version_hash="abc...", # SHA256 hash (64 chars) + + # Benchmark/task auto-registration: + register_benchmark=True, # Auto-register benchmark + tasks if not in DB + + # HuggingFace trace upload: + hf_repo_id="DCAgent2/run_tag", # HF dataset repo for traces + hf_token=os.environ["HF_TOKEN"], # HF auth token + hf_private=False, # Public by default + hf_episodes="last", # "last" or "all" + + # Other: + git_commit_id=None, # Optional git SHA + forced_update=False, # Allow 
overwriting existing DB records +) +``` + +### Two-Stage Upload + +**Stage 1: HF Traces Upload** +- Exports agent conversation logs (trajectories) as a HuggingFace dataset +- Creates/updates a repo at `hf_repo_id` (e.g., `DCAgent2/gaia_127_openai_gpt-5-mini`) +- Returns the HF dataset URL + +**Stage 2: DB Records Upload** +- Registers/finds: agent, model, benchmark in DB +- Creates `sandbox_jobs` entry with job metadata +- Creates `sandbox_trials` entries for each trial with timing, reward, exception info +- Creates `sandbox_trial_model_usage` entries for token tracking +- Links HF dataset URL to the job record + +### register_benchmark=True + +When `register_benchmark=True`, the upload function calls `register_benchmark_and_tasks_from_job()` which: + +1. Registers the benchmark in `benchmarks` table if not found +2. Scans all trial directories for `result.json` → extracts `task_checksum` +3. Deduplicates tasks (important when `n_attempts > 1`) +4. Registers each unique task in `sandbox_tasks` table (with 3x retry) +5. Links tasks to benchmark in `sandbox_benchmark_tasks` table + +This is critical for new benchmarks. Without it, trial uploads fail with FK constraint errors because the referenced `task_checksum` doesn't exist in `sandbox_tasks`. 
+ +### Error Modes + +**`skip_on_error`** (recommended for most uploads): +- Continues uploading even if individual trials fail +- Job record always kept in DB +- Returns list of failed trials for debugging + +**`rollback_on_error`** (atomic uploads): +- Deletes ALL job/trial/usage records on any error +- All-or-nothing semantics +- If HF upload fails, entire process aborts + +### Example Upload Script + +```python +#!/usr/bin/env python3 +"""Upload eval results to HF + DB.""" +import os, sys, hashlib + +sys.path.insert(0, "eval/jupiter/dcagents-leaderboard") +from unified_db.utils import upload_eval_results + +run_dir = "/e/data1/datasets/playground/mmlaion/shared/guha1/eval_jobs/gaia_127_openai_gpt-5-mini" +dataset_hf = "gaia_127" + +# Stable benchmark version hash +benchmark_version_hash = hashlib.sha256(dataset_hf.encode()).hexdigest() + +# HF repo ID (sanitized) +hf_repo_id = f"DCAgent2/gaia_127_openai_gpt-5-mini" + +result = upload_eval_results( + run_dir, + username="guha1", + error_mode="skip_on_error", + hf_token=os.environ["HF_TOKEN"], + hf_repo_id=hf_repo_id, + register_benchmark=True, + benchmark_name=dataset_hf, + benchmark_version_hash=benchmark_version_hash, +) + +print(f"Success: {result['success']}") +print(f"Job ID: {result.get('job_id')}") +print(f"Trials uploaded: {result.get('n_trials_uploaded')}") +if result.get('hf_dataset_url'): + print(f"HF URL: {result['hf_dataset_url']}") +``` + +### Name Resolution: How Upload Finds the Right DB Records + +The upload needs to match three things in the DB by **exact name**: agent, model, and benchmark. Getting the names wrong means it either fails or creates a duplicate entry. 
+ +**Benchmark name** — derived from `REPO_ID`: + +``` +REPO_ID → benchmark_name +───────────────────────────────────────────────────── +"DCAgent/dev_set_v2" → "dev_set_v2" (split on "/", take last) +"/e/.../datasets/gaia_127" → "gaia_127" (split on "/", take last) +"/e/.../datasets/DCAgent_dev_set_v2" → "DCAgent_dev_set_v2" ← WRONG if DB has "dev_set_v2" +``` + +The sbatch derives `benchmark_name = dataset_hf.split("/")[-1]`. So: +- HF repo IDs like `DCAgent/dev_set_v2` → `dev_set_v2` (correct) +- Local paths use the directory basename → could differ from what's in the DB + +**If the name doesn't match an existing benchmark** and `register_benchmark=True`, it creates a **new** benchmark with that name. This is how you end up with both `dev_set_v2` and `DCAgent_dev_set_v2` in the DB. + +**How to ensure you upload to the right benchmark:** + +1. **Always pass `benchmark_name` explicitly** in manual uploads: + ```python + upload_eval_results( + ..., + benchmark_name="dev_set_v2", # Must match what's in the DB + benchmark_version_hash=hashlib.sha256("DCAgent/dev_set_v2".encode()).hexdigest(), + ) + ``` + +2. **Check what's in the DB** before uploading: + ```python + from unified_db.utils import get_benchmark_by_name + # Try the name you think it should be + b = get_benchmark_by_name("dev_set_v2") + print(b) # If None, it doesn't exist yet + ``` + +3. **For the sbatch script**, the `REPO_ID` (`$2`) controls the name. 
Use consistent `REPO_ID` values: + - First run used `DCAgent/dev_set_v2` → benchmark "dev_set_v2" created + - Later run must also use `DCAgent/dev_set_v2` (or a local path whose basename is `dev_set_v2`) + - Using `/e/.../datasets/DCAgent_dev_set_v2` would create a separate "DCAgent_dev_set_v2" benchmark + +**Model name** — auto-detected from trial config, with fallback chain: + +``` +Trial config model_name → DB lookup +────────────────────────────────────────────── +"hosted_vllm/my-org/my-model" → try "hosted_vllm/my-org/my-model", then strip to "my-org/my-model" +"openai/gpt-5-mini" → try "openai/gpt-5-mini" directly +``` + +The upload strips the `hosted_vllm/` prefix automatically. If the model still isn't found, it tries to auto-register from HuggingFace `run_summary.json`. + +**HF repo ID for traces** — the `sanitize_hf_repo_id()` function in the sbatch cleans the run tag for use as a HuggingFace dataset name: +- Replaces special characters with hyphens +- Collapses consecutive hyphens/dots +- Truncates long names with SHA1 suffix + +--- + +## 7. 
Database Schema + +### Entity Relationship Diagram + +``` +benchmarks ←──┐ + │ benchmark_id +sandbox_benchmark_tasks ──→ sandbox_tasks (PK: checksum) + ↑ + │ task_checksum +agents ←─────── sandbox_jobs ──→ sandbox_trials +models ←─────── sandbox_jobs │ + ↑ │ + │ job_id │ trial_id + └──────────────┘ + │ + sandbox_trial_model_usage +``` + +### Key Tables + +**`benchmarks`** +| Column | Type | Description | +|--------|------|-------------| +| id | UUID (PK) | Auto-generated | +| name | TEXT | Benchmark name (e.g., "gaia_127") | +| benchmark_version_hash | CHAR(64) | SHA256 of benchmark content | +| is_external | BOOLEAN | Whether externally hosted | + +**`sandbox_tasks`** +| Column | Type | Description | +|--------|------|-------------| +| checksum | TEXT (PK) | SHA256 content hash (deduplication key) | +| source | TEXT | Benchmark name that registered this task | +| name | TEXT | Task name | +| instruction | TEXT | Task instruction text | +| agent_timeout_sec | NUMERIC | Agent execution timeout | +| verifier_timeout_sec | NUMERIC | Verifier timeout | +| path | TEXT | Task directory path | + +**`sandbox_jobs`** +| Column | Type | Description | +|--------|------|-------------| +| id | UUID (PK) | Auto-generated | +| job_name | TEXT | Run tag / job name | +| username | TEXT | Who ran the eval | +| agent_id | UUID (FK→agents) | Agent used | +| model_id | UUID (FK→models) | Model used | +| benchmark_id | UUID (FK→benchmarks) | Benchmark evaluated | +| n_trials | INTEGER | Number of trials | +| metrics | JSONB | Aggregate metrics | +| stats | JSONB | Detailed statistics | +| hf_traces_link | TEXT | HuggingFace dataset URL | +| job_status | ENUM | 'Pending', 'Started', 'Finished' | +| UNIQUE | | (agent_id, model_id, benchmark_id) | + +**`sandbox_trials`** +| Column | Type | Description | +|--------|------|-------------| +| id | UUID (PK) | Auto-generated | +| trial_name | TEXT | Trial identifier | +| job_id | UUID (FK→sandbox_jobs) | Parent job | +| task_checksum | TEXT 
(FK→sandbox_tasks) | Task that was executed | +| reward | NUMERIC | Score (0 or 1) | +| started_at / ended_at | TIMESTAMP | Trial timing | +| environment_setup_* | TIMESTAMP | Sandbox setup timing | +| agent_execution_* | TIMESTAMP | Agent run timing | +| verifier_* | TIMESTAMP | Verification timing | +| exception_info | JSONB | Error details if failed | + +**`sandbox_trial_model_usage`** +| Column | Type | Description | +|--------|------|-------------| +| trial_id | UUID (FK→sandbox_trials) | Trial | +| model_id | UUID (FK→models) | Model used | +| model_provider | TEXT | Provider (openai, anthropic, etc.) | +| n_input_tokens | INTEGER | Input tokens consumed | +| n_output_tokens | INTEGER | Output tokens consumed | +| PK | | (trial_id, model_id, model_provider) | + +### Common DB Queries + +```python +from unified_db.utils import get_supabase_client + +client = get_supabase_client() + +# Get all jobs for a benchmark +jobs = client.table('sandbox_jobs') \ + .select('*, benchmarks(name)') \ + .eq('benchmark_id', benchmark_uuid) \ + .execute() + +# Get trial results for a job +trials = client.table('sandbox_trials') \ + .select('trial_name, reward, exception_info') \ + .eq('job_id', job_uuid) \ + .execute() + +# Compute pass rate +rewards = [t['reward'] for t in trials.data if t['reward'] is not None] +pass_rate = sum(r > 0 for r in rewards) / len(rewards) +``` + +--- + +## 8. Common Pitfalls & Troubleshooting + +### Python 3.9 on Login Node + +The login node has Python 3.9 which is too old for Harbor and the Daytona SDK. Always use: +```bash +/e/scratch/jureap59/feuer1/miniforge3/envs/otagent/bin/python # Python 3.12 +``` + +For RL snapshot work specifically: +```bash +/e/scratch/jureap59/feuer1/OpenThoughts-Agent/envs/rl/bin/python3 # Python 3.12 +``` + +### Missing SUPABASE Environment Variables + +Upload will fail silently or error if `SUPABASE_URL`, `SUPABASE_ANON_KEY`, or `SUPABASE_SERVICE_ROLE_KEY` are not set. 
These are loaded from `~/secrets.env`:
+```bash
+source ~/secrets.env
+# Or set KEYS=/path/to/secrets.env for auto-loading
+```
+
+### FK Constraint Errors (Missing Tasks)
+
+If upload fails with errors like:
+```
+insert or update on table "sandbox_trials" violates foreign key constraint
+"sandbox_trials_task_checksum_fkey"
+```
+
+The task checksums referenced by trials don't exist in `sandbox_tasks`. Fix: set `register_benchmark=True` in `upload_eval_results()` — this auto-registers all tasks from trial results.
+
+### Stale Jobs That Look Running
+
+If `squeue` shows a job but it crashed (e.g., OOM kill), check the log:
+```bash
+tail -50 eval/jupiter/logs/eval_<JOB_ID>.out
+sacct -j <JOB_ID> --format=JobID,State,ExitCode,MaxRSS
+```
+
+The job directory may have partial results. Use `harbor jobs resume` to retry.
+
+### RL Key vs Org Key Confusion
+
+**Symptom**: `"Region not found"` error during RL snapshot operations.
+
+**Cause**: Used org1/org2 key with `DAYTONA_TARGET=RL`. Only the RL key has an RL region.
+
+**Fix**: Ensure RL sbatch scripts load the RL key from `~/secrets.env` (never commit the literal key value to the repo):
+```bash
+export DAYTONA_API_KEY="$DAYTONA_RL_API_KEY"  # RL-region key, sourced from ~/secrets.env
+```
+
+### SSH Tunnel Failures
+
+**Symptom**: `[proxy] Connectivity test failed` or Daytona timeouts on compute nodes.
+
+**Fix**: Ensure `SSH_KEY` env var is set in `secrets.env`. The tunnel needs passwordless SSH to the login node (`jpbl-s01-02`).
+
+### vLLM Server Won't Start
+
+**Symptoms**: Health check loop times out after ~33 minutes.
+
+Common causes:
+- Model not in cache (`HF_HUB_CACHE`): pre-download via proxychains
+- OOM: reduce `EVAL_GPU_MEMORY_UTIL` or use fewer GPUs
+- Incompatible model: check `eval/jupiter/logs/vllm_<JOB_ID>.log`
+
+### Snapshot Not Found Warnings
+
+**Symptom**: Log shows `"not found (not global)"` for snapshots.
+
+**Cause**: Snapshots weren't pre-created on the Daytona org being used.
+
+**Fix**: Run `precreate_snapshots.py` on both orgs before launching evals.
+ +### PYTHONPATH Pollution + +The sbatch script explicitly `unset PYTHONPATH` before setting its own. If you source `jupiter.env` in your shell, it may set a PYTHONPATH with incompatible packages (e.g., numpy 2.4 breaking numba/vLLM). The sbatch handles this, but manual testing should be careful. + +--- + +## 9. Key File Paths Reference + +### Scripts + +| Path | Description | +|------|-------------| +| `eval/jupiter/unified_eval_harbor.sbatch` | Main Slurm eval script | +| `eval/jupiter/dcagent_eval_config.yaml` | Harbor job config (jobs_dir, retry, agent settings) | +| `eval/jupiter/snapshot_download.py` | HF dataset download helper | +| `eval/jupiter/unified_eval_listener.py` | Auto-submit daemon (polls DB for new models) | +| `/e/scratch/jureap59/etash/run_commercial_evals.sh` | Commercial model eval (GAIA+FinanceAgent) | +| `/e/scratch/jureap59/etash/run_commercial_aider.sh` | Commercial model eval (Aider Polyglot) | +| `/e/scratch/jureap59/etash/precreate_snapshots.py` | Snapshot pre-creation script | + +### Datasets + +| Path | Description | +|------|-------------| +| `/e/data1/.../guha1/datasets/gaia_127` | GAIA 127-task subset | +| `/e/data1/.../guha1/datasets/gaia` | Full GAIA dataset | +| `/e/data1/.../guha1/datasets/financeagent` | FinanceAgent (50 tasks) | +| `/e/data1/.../guha1/datasets/DCAgent2_aider_polyglot` | Aider Polyglot (225 tasks) | +| `/e/data1/.../guha1/datasets/medagentbench` | MedAgentBench | + +### Job Output + +| Path | Description | +|------|-------------| +| `/e/data1/.../guha1/eval_jobs/` | All eval job directories | +| `eval/jupiter/logs/` | Slurm job logs and vLLM logs | +| `eval/jupiter/logs/upload_.log` | Upload logs | + +### Infrastructure + +| Path | Description | +|------|-------------| +| `/e/scratch/jureap59/feuer1/miniforge3/envs/otagent/` | Python 3.12 environment | +| `/e/scratch/jureap59/etash/harbor/src/` | Harbor source (editable install) | +| `/e/scratch/jureap59/guha1/pip_packages/` | Additional Python packages | +| 
`/e/data1/.../guha1/hub/` | Pre-downloaded HuggingFace models | +| `eval/jupiter/dcagents-leaderboard/` | DB utilities (unified_db/) | +| `~/secrets.env` | API keys (Daytona, OpenAI, HF, Supabase) | + +### Harbor Config (dcagent_eval_config.yaml) + +```yaml +jobs_dir: /e/data1/datasets/playground/mmlaion/shared/guha1/eval_jobs +n_attempts: 3 +timeout_multiplier: 1.0 +orchestrator: + type: local + n_concurrent_trials: 16 + quiet: false + retry: + max_retries: 3 + exclude_exceptions: + - AgentTimeoutError + - EnvironmentStartTimeoutError + - SandboxBuildFailedError + - VerifierTimeoutError + wait_multiplier: 1.0 + min_wait_sec: 1.0 + max_wait_sec: 60.0 +environment: + type: daytona + force_build: false + delete: false + kwargs: + auto_snapshot: true +agents: + - name: terminus-2 + trajectory_config: + raw_content: true + linear_history: true +``` + +Note: `n_concurrent_trials: 16` in the config is overridden by `--n-concurrent` on the CLI (128 for Slurm, 32 for commercial). The config's `exclude_exceptions` list prevents retries for non-transient errors. diff --git a/eval/docs/HF_REUPLOAD_GUIDE.md b/eval/docs/HF_REUPLOAD_GUIDE.md new file mode 100644 index 00000000..44b91d3f --- /dev/null +++ b/eval/docs/HF_REUPLOAD_GUIDE.md @@ -0,0 +1,192 @@ +# HF Trace Re-upload & Subagent Trace Fix + +Guide for porting the HF upload fix and re-upload tooling to another cluster. + +--- + +## Problem 1: HF uploads fail with XET permission denied + +### Symptom +All HuggingFace result uploads fail with: +``` +Permission denied (os error 13) at path "/xet/.../staging/shard-session/.tmpXXX" +``` +DB uploads succeed (the pipeline uses `skip_on_error` mode), but HF trace repos are never created. + +### Root cause +The sbatch sets `HF_HOME` to a shared read-only cache (for model/dataset downloads). HuggingFace's XET upload layer derives its staging directory from `HF_HOME`, landing at `/xet/`. If another user created that `xet/` subtree, your user can't write temp files there. 
+ +### Fix +Set `HF_XET_CACHE` to a **user-writable** location, separate from `HF_HOME`. Add this to your sbatch **after** the `HF_HOME`/`HF_HUB_CACHE` exports: + +```bash +# XET staging cache — must be user-writable (shared HF_HOME/xet/ has permission issues) +export HF_XET_CACHE="/your/scratch/${USER}/hf_xet_cache" +mkdir -p "$HF_XET_CACHE" +``` + +**On Jupiter**, the change was made in `eval/jupiter/unified_eval_harbor.sbatch` at line ~138, right after: +```bash +export HF_HUB_CACHE="/e/data1/datasets/playground/ot/hf_hub" +export HF_HOME="/e/data1/datasets/playground/ot/hf_hub" +export HF_CACHE_DIR="$HF_HUB_CACHE" +``` + +Already-running jobs are NOT affected by this fix — they need the re-upload script below. + +--- + +## Problem 2: Subagent traces were being uploaded (bloated HF datasets) + +### Symptom +A job with 300 trials produces thousands of HF dataset rows instead of 300. The `Map:` progress lines show many batches of varying sizes (300, 238, 239, 240, 230, ...) instead of a single batch of ~300. + +### Root cause +The upload call chain is: +``` +sbatch upload section + → upload_eval_results() [database/unified_db/utils.py:4166] + → upload_traces_to_hf() [database/unified_db/utils.py:3909] + → export_traces() [harbor/utils/traces_utils.py:1422] +``` + +`upload_eval_results` has `hf_export_subagents: bool = False` (correct default). It passes this to `upload_traces_to_hf(export_subagents=...)`. **But** `upload_traces_to_hf` was NOT forwarding `export_subagents` to `export_traces()` — the parameter was silently dropped. So `export_traces()` always used its own default of `export_subagents=True`. 
+ +### Fix +In `database/unified_db/utils.py`, in the `upload_traces_to_hf` function (~line 4014), add `export_subagents` to the `export_traces()` call: + +```python +# BEFORE (broken): +dataset = export_traces( + root=job_dir, + recursive=True, + episodes=episodes, + to_sharegpt=False, + repo_id=None, + push=False, + verbose=verbose, + success_filter=success_filter, + include_verifier_output=include_verifier_output, +) + +# AFTER (fixed): +dataset = export_traces( + root=job_dir, + recursive=True, + episodes=episodes, + to_sharegpt=False, + repo_id=None, + push=False, + verbose=verbose, + success_filter=success_filter, + include_verifier_output=include_verifier_output, + export_subagents=export_subagents, # <-- THIS WAS MISSING +) +``` + +The `upload_traces_to_hf` function already accepts `export_subagents` as a parameter (line 3918) — it just wasn't passing it through. No signature changes needed. + +### Verification +After the fix, a 300-trial job should produce exactly ~300 rows in the HF dataset (one per trial, last episode only), not thousands. + +--- + +## Re-upload script: `eval/jupiter/reupload_hf.py` + +This script re-uploads HF traces for jobs that already completed but failed the HF upload step. It also patches the Supabase DB record with the new HF URL. + +### What it does (per job) +1. Reads `meta.env` from the job's run directory to get `RUN_TAG`, `DB_JOB_ID`, `MODEL`, etc. +2. Skips early if no `DB_JOB_ID` (no DB record to patch). +3. Calls `harbor.utils.traces_utils.export_traces()` **directly** (not through `upload_traces_to_hf`) with: + - `episodes="last"` — only the final episode per trial + - `export_subagents=False` — no subagent traces + - `include_verifier_output=True` +4. Creates HF repo via `huggingface_hub.create_repo()` and pushes via `dataset.push_to_hub()`. +5. Patches `sandbox_jobs.hf_traces_link` in Supabase with the new HF URL. 
+
+### HF repo naming
+- Format: `DCAgent2/<sanitized-name>-<random-suffix>`
+- The random suffix prevents overwriting previous uploads for the same model/benchmark.
+- Name part is sanitized to fit HF's 96-char limit (special chars → hyphens, truncation with sha1 tail if needed).
+
+### Key dependencies / imports
+```python
+from database.unified_db.utils import get_supabase_client, load_supabase_keys
+from harbor.utils.traces_utils import export_traces
+from huggingface_hub import create_repo
+```
+
+### Environment setup
+The script auto-configures these if not already set:
+- `HF_XET_CACHE` → `/<scratch>/<user>/hf_xet_cache` (the permission fix)
+- `DC_AGENT_SECRET_ENV` → `~/secrets.env` (so `load_supabase_keys()` can find credentials)
+- `SUPABASE_KEY` → aliased to `SUPABASE_ANON_KEY` (Jupiter's secrets.env uses `SUPABASE_KEY`)
+
+### Usage
+```bash
+PYTHON="/path/to/otagent/bin/python"
+
+# Single job by SLURM ID
+$PYTHON eval/jupiter/reupload_hf.py --job-ids 279050
+
+# Multiple jobs
+$PYTHON eval/jupiter/reupload_hf.py --job-ids 279050 279051 279052
+
+# From a file of SLURM IDs
+$PYTHON eval/jupiter/reupload_hf.py --job-ids-file eval/jupiter/lists/failed_uploads.txt
+
+# Direct run directory paths
+$PYTHON eval/jupiter/reupload_hf.py --run-dirs /path/to/eval_jobs/run_dir_name
+
+# Scan ALL run dirs for missing HF uploads (skips dirs without DB_JOB_ID)
+$PYTHON eval/jupiter/reupload_hf.py --scan-all
+
+# Dry run (preview without uploading)
+$PYTHON eval/jupiter/reupload_hf.py --scan-all --dry-run
+```
+
+### How it finds run directories from SLURM job IDs
+1. **Strategy 1**: Parses `eval/jupiter/logs/eval_<SLURM_ID>.out` for the `Run dir: <path>` line.
+2. **Strategy 2**: Scans all `<eval_jobs_dir>/*/meta.env` files for matching `SLURM_JOB_ID`.
+
+### Supabase details
+- **Table**: `sandbox_jobs` (NOT `eval_jobs` — that doesn't exist)
+- **Column**: `hf_traces_link` (string, stores the full HF dataset URL)
+- **Client**: `get_supabase_client()` from `database.unified_db.utils` (NOT `_get_supabase_client`)
+- **Credentials loader**: `load_supabase_keys()` — reads from file at `DC_AGENT_SECRET_ENV`
+- Required env vars after loading: `SUPABASE_URL`, `SUPABASE_ANON_KEY`, `SUPABASE_SERVICE_ROLE_KEY`
+
+### Job directory structure (what the script reads)
+```
+<eval_jobs_dir>/<run_tag>/
+├── meta.env # MODEL, REPO_ID, DB_JOB_ID, RUN_TAG, BENCHMARK_NAME, etc.
+├── config.json # Harbor job config
+├── result.json # Job result (status, metrics, exception_stats)
+└── <trial_name>/ # One per trial
+ ├── config.json
+ ├── result.json
+ └── agent/
+ ├── episode-0/
+ ├── episode-1/ # export_traces with episodes="last" takes only the highest-numbered
+ └── trajectory.json
+```
+
+### Porting to another cluster
+To adapt `reupload_hf.py` for a different cluster, change:
+1. `DEFAULT_EVAL_JOBS_DIR` — path to your eval jobs output directory
+2. `DEFAULT_LOG_DIR` — path to your SLURM log directory
+3. `SECRETS_ENV` — path to your secrets file
+4. `HARBOR_SRC` — path to harbor source (for `export_traces` import)
+5. The `HF_XET_CACHE` default path in `main()` — your user-writable scratch
+6.
HF org in `reupload_single()` — currently hardcoded as `DCAgent2/` + +### Gotchas discovered during development +| Issue | Wrong | Correct | +|-------|-------|---------| +| Supabase table name | `eval_jobs` | `sandbox_jobs` | +| Supabase client function | `_get_supabase_client()` | `get_supabase_client()` | +| Credentials env var | Not set → warning + empty client | Set `DC_AGENT_SECRET_ENV=~/secrets.env` before calling `load_supabase_keys()` | +| Supabase key name alias | `SUPABASE_KEY` in secrets.env | Must alias to `SUPABASE_ANON_KEY` if secrets.env uses the short name | +| `export_subagents` not forwarded | `upload_traces_to_hf` silently ignored it | Must pass `export_subagents=export_subagents` to `export_traces()` | +| HF repo name length | No limit enforced → API rejects | Cap name part at 96 chars (HF limit), truncate with sha1 tail | diff --git a/eval/docs/LISTENER_TUTORIAL.md b/eval/docs/LISTENER_TUTORIAL.md new file mode 100644 index 00000000..3611c653 --- /dev/null +++ b/eval/docs/LISTENER_TUTORIAL.md @@ -0,0 +1,266 @@ +# Eval Listener Tutorial + +How to run the unified eval listener to submit model evaluation jobs on an HPC cluster. + +## Overview + +The listener (`eval/MBZ/unified_eval_listener_v4.py`) polls Supabase for pending evals, submits SLURM jobs, which then: +1. Start a vLLM server to serve the model +2. Run Harbor agent trials against a benchmark dataset +3. Upload results to Supabase + HuggingFace + +## Prerequisites + +### 1. Conda Environment + +You need the `otagent` conda env. Set it up from the existing setup script or replicate these key packages: +- Python 3.12 +- vLLM 0.13+ (use `otagent2` with vLLM 0.17+ for newer model architectures like Qwen3.5) +- Harbor installed from `git+https://github.com/laude-institute/harbor.git@penfever/temp-override` +- torch, transformers, huggingface_hub, supabase + +### 2. Secrets File + +Create `~/secrets.env` with these keys: +```bash +export DAYTONA_API_KEY='dtn_...' 
# Daytona sandbox API key
+export DAYTONA_TARGET='us' # Daytona region (us, eu, RL)
+export HF_TOKEN='hf_...' # HuggingFace token for uploads
+export SUPABASE_URL='https://...' # Supabase project URL
+export SUPABASE_ANON_KEY='...' # Supabase anon key
+export SUPABASE_SERVICE_ROLE_KEY='...' # Supabase service role key
+```
+
+### 3. Directory Structure
+
+```
+OpenThoughts-Agent/
+├── eval/MBZ/
+│ ├── unified_eval_listener_v4.py # The listener
+│ ├── unified_eval_harbor_v4.sbatch # SLURM job script
+│ ├── lists/ # Priority/blacklist files
+│ └── dcagent_eval_config_no_override.yaml
+├── experiments/
+│ ├── logs/ # SLURM stdout logs (terminal_*.out, vllm_*.log)
+│ └── listener_logs/ # Listener daemon logs
+└── database/unified_db/utils.py # Supabase client
+```
+
+## Quick Start
+
+### Run a One-Time Eval for Specific Models
+
+**Step 1:** Create a priority file listing models (one per line):
+```bash
+cat > eval/MBZ/lists/my_models.txt << 'EOF'
+Qwen/Qwen3-8B
+deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
+EOF
+```
+
+**Step 2:** Dry run to verify (no actual submission):
+```bash
+conda activate otagent
+python eval/MBZ/unified_eval_listener_v4.py \
+ --preset v2 \
+ --priority-file eval/MBZ/lists/my_models.txt \
+ --secrets-file ~/secrets.env \
+ --enable-thinking \
+ --tp-size 2 \
+ --once --dry-run --verbose
+```
+
+**Step 3:** Submit for real:
+```bash
+python eval/MBZ/unified_eval_listener_v4.py \
+ --preset v2 \
+ --priority-file eval/MBZ/lists/my_models.txt \
+ --secrets-file ~/secrets.env \
+ --enable-thinking \
+ --tp-size 2 \
+ --once --verbose
+```
+
+**Step 4:** Monitor:
+```bash
+# Check SLURM queue
+squeue -u $USER
+
+# Watch job logs (replace JOB_ID)
+tail -f experiments/logs/terminal_<JOB_ID>.out
+tail -f experiments/logs/vllm_<JOB_ID>.log
+```
+
+## Presets
+
+Presets bundle dataset + tuned defaults. **Always use a preset** as the starting point. 
+
+| Preset | Dataset | Time Limit | Notes |
+|--------|---------|------------|-------|
+| `v2` | `DCAgent/dev_set_v2` | 24h | Primary dev benchmark (~90 tasks) |
+| `tb2` | `DCAgent2/terminal_bench_2` | 48h | Large terminal benchmark (~180 tasks) |
+| `swebench` | `DCAgent2/swebench-verified-random-100-folders` | 24h | SWE-Bench, uses XML parser |
+| `aider` | `DCAgent2/aider_polyglot` | 24h | Aider polyglot benchmark |
+| `bfcl` | `DCAgent2/bfcl-parity` | 24h | BFCL function calling |
+| `v1` | `DCAgent/dev_set_71_tasks` | 24h | Legacy dev set |
+
+## Key Flags
+
+### Required for most runs
+| Flag | Description |
+|------|-------------|
+| `--preset <preset>` | Benchmark preset (v2, tb2, swebench, etc.) |
+| `--priority-file <path>` | File listing models to evaluate (one per line) |
+| `--secrets-file <path>` | Path to secrets.env with Daytona/Supabase/HF keys |
+
+### Execution control
+| Flag | Description |
+|------|-------------|
+| `--once` | Run one iteration then exit (for one-off submissions) |
+| `--dry-run` | Print what would be submitted without actually submitting |
+| `--verbose` | Extra logging |
+
+### Model/GPU configuration
+| Flag | Default | Description |
+|------|---------|-------------|
+| `--tp-size <n>` | 1 | Tensor parallel size (GPUs per job). Use 2 for 7-14B models, 4 for 32B+ |
+| `--conda-env <name>` | otagent | Conda env for the SLURM job. Use `otagent2` for Qwen3.5+ |
+| `--enable-thinking` | off | Enable thinking/reasoning blocks. 
Most presets set this automatically |
+| `--gpu-memory-util <fraction>` | 0.9 | vLLM GPU memory utilization (0.0-1.0) |
+
+### Job management
+| Flag | Default | Description |
+|------|---------|-------------|
+| `--max-jobs-submitted <n>` | 50 | Max concurrent SLURM jobs from this listener |
+| `--batch-size <n>` | 20 | First N jobs run immediately, rest chain one-by-one |
+| `--timeout-multiplier <x>` | 1.0 | Multiply per-task timeouts (use 2.0 for slow models) |
+| `--blacklist-file <path>` | none | File listing models to never evaluate |
+| `--slurm-partition <name>` | main | SLURM partition to submit to |
+
+### Daytona/snapshot control
+| Flag | Default | Description |
+|------|---------|-------------|
+| `--auto-snapshot` | off | Auto-create Daytona snapshots per environment (quota: 40/org) |
+
+## Common Patterns
+
+### One-time batch of specific models on dev_set_v2
+```bash
+python eval/MBZ/unified_eval_listener_v4.py \
+ --preset v2 \
+ --priority-file eval/MBZ/lists/my_models.txt \
+ --blacklist-file eval/MBZ/lists/pruned_models_names.txt \
+ --enable-thinking \
+ --timeout-multiplier 2.0 \
+ --tp-size 2 \
+ --secrets-file ~/secrets.env \
+ --once --verbose
+```
+
+### Same models on terminal_bench_2
+```bash
+python eval/MBZ/unified_eval_listener_v4.py \
+ --preset tb2 \
+ --priority-file eval/MBZ/lists/my_models.txt \
+ --blacklist-file eval/MBZ/lists/pruned_models_names.txt \
+ --enable-thinking \
+ --timeout-multiplier 2.0 \
+ --tp-size 2 \
+ --secrets-file ~/secrets.env \
+ --once --verbose
+```
+
+### Long-running daemon (polls every 2h)
+```bash
+nohup python eval/MBZ/unified_eval_listener_v4.py \
+ --preset v2 \
+ --priority-file eval/MBZ/lists/my_models.txt \
+ --blacklist-file eval/MBZ/lists/pruned_models_names.txt \
+ --enable-thinking \
+ --timeout-multiplier 2.0 \
+ --tp-size 2 \
+ --max-jobs-submitted 50 \
+ --secrets-file ~/secrets.env \
+ --verbose &
+```
+
+### Using otagent2 for newer models (Qwen3.5+)
+```bash
+python eval/MBZ/unified_eval_listener_v4.py \
+ 
--preset v2 \
+ --priority-file eval/MBZ/lists/qwen35_models.txt \
+ --conda-env otagent2 \
+ --enable-thinking \
+ --tp-size 2 \
+ --secrets-file ~/secrets.env \
+ --once --verbose
+```
+
+## How It Works (End-to-End Flow)
+
+```
+1. Listener reads priority file → fetches model list
+2. Checks Supabase `sandbox_jobs` table for existing jobs (dedup)
+3. For each new (model, dataset) pair:
+ a. Creates "Pending" DB entry
+ b. Runs: sbatch --partition <partition> --gres gpu:<tp_size> unified_eval_harbor_v4.sbatch
+ c. Updates DB entry with SLURM job ID
+4. SLURM job (sbatch script):
+ a. Sources secrets.env → activates conda env
+ b. Pre-flight: validates model architecture is supported by vLLM
+ c. Downloads model weights if not cached
+ d. Starts vLLM server (with retries)
+ e. Runs: harbor jobs start -p <config.yaml> --model <model> --env daytona ...
+ f. Harbor creates Daytona sandboxes, runs agent trials
+ g. Checks error threshold → uploads results to Supabase + HuggingFace
+ h. Updates DB entry to "Finished"
+```
+
+## Deduplication Logic
+
+The listener skips models that already have jobs in the DB:
+- **Finished** → skip (already done)
+- **Started** < 24h ago → skip (in progress)
+- **Started** > 24h ago → restart (stale)
+- **Pending** > 48h ago → restart + scancel old job
+- **No entry** → submit new job
+
+## Troubleshooting
+
+### Job fails immediately
+Check `experiments/logs/terminal_<JOB_ID>.out` for:
+- `Unsupported model architecture` → need newer vLLM (use `--conda-env otagent2`)
+- `CUDA out of memory` → increase `--tp-size`
+- `Invalid qos specification` → check partition supports the QOS
+
+### vLLM won't start
+Check `experiments/logs/vllm_<JOB_ID>.log`:
+- Model download failures → pre-download with `huggingface_hub.snapshot_download()`
+- Architecture not in ModelRegistry → upgrade vLLM
+
+### Listener skips a model
+- Already has a job in DB → check with:
+ ```python
+ from database.unified_db.utils import get_supabase_client, load_supabase_keys
+ load_supabase_keys()
+ c = get_supabase_client()
+ 
r = c.table('sandbox_jobs').select('*').like('job_name', '%ModelName%').execute() + for j in r.data: print(j['job_name'], j['status']) + ``` +- Model is in blacklist file +- Model not in priority file (filter_only mode) + +### SLURM partition issues +- `main` partition: no `--qos` needed +- `lowprio` partition: needs `--qos=lowprio` (but jobs get preempted) +- The listener auto-handles this: it only adds `--qos` for non-main partitions + +## Cluster-Specific Notes + +When setting up on a new cluster, you need to: +1. Set up conda env (otagent) with vLLM + harbor + dependencies +2. Create `~/secrets.env` with valid credentials +3. Adjust `--slurm-partition` to match your cluster's partition names +4. Verify the sbatch script paths (`DCFT`, `HF_HOME`, cache dirs) match your filesystem +5. Pre-download model weights before submitting (avoids download during eval) +6. Test with `--dry-run --once` first, then `--once` for a single model before running the daemon diff --git a/eval/docs/OTAGENT2_SETUP_TUTORIAL.md b/eval/docs/OTAGENT2_SETUP_TUTORIAL.md new file mode 100644 index 00000000..a39480c5 --- /dev/null +++ b/eval/docs/OTAGENT2_SETUP_TUTORIAL.md @@ -0,0 +1,249 @@ +# otagent2 Environment Setup Tutorial + +How to create the `otagent2` conda environment for evaluating newer model architectures (Qwen3.5, etc.) that require vLLM >= 0.17.1. + +## Why a Separate Environment? + +The existing `otagent` env (vLLM 0.13.0) cannot serve newer models like Qwen3.5-9B: + +1. **vLLM 0.13.0** doesn't have `Qwen3_5ForConditionalGeneration` in its model registry +2. **Upgrading transformers alone** breaks vLLM 0.13.0 (internal `ALLOWED_LAYER_TYPES` was renamed) +3. **vLLM 0.16.0** still doesn't have Qwen3.5 — need **0.17.1** minimum +4. **transformers on PyPI** (4.57.x) doesn't recognize `qwen3_5` config — need to install from GitHub source + +So we create a parallel `otagent2` env that keeps `otagent` intact for all existing models. 
+ +## Target Versions + +| Package | Version | Notes | +|---------|---------|-------| +| Python | 3.12 | | +| vLLM | 0.17.1 | First version with Qwen3.5 support | +| torch | 2.10.0+cu128 | Pulled by vLLM | +| transformers | 5.x (dev) | From GitHub main, not PyPI | +| ray | 2.54.0 | | +| harbor | 0.1.45 | From laude-institute fork | +| CUDA | 12.8 | Pulled by vLLM's torch | + +## Setup Instructions + +### Option A: Use the Setup Script (Recommended) + +The script at `eval/MBZ/setup_eval_env.sh` automates everything: + +```bash +cd /path/to/OpenThoughts-Agent +bash eval/MBZ/setup_eval_env.sh +``` + +The script accepts an optional env name argument (default: `otagent2`): +```bash +bash eval/MBZ/setup_eval_env.sh my_custom_env_name +``` + +**Before running**, edit these paths at the top of the script to match your cluster: +```bash +SCRATCH="/mnt/weka/home/richard.zhuang" # Your home/scratch directory +DCFT="$SCRATCH/OpenThoughts-Agent" # Path to this repo +CONDA_BASE="$SCRATCH/miniconda3" # Path to miniconda installation +``` + +### Option B: Manual Step-by-Step + +If you need to adapt for a different cluster or debug issues: + +#### Step 1: Create conda env +```bash +conda create -n otagent2 python=3.12 -y +conda activate otagent2 +pip install uv # Fast dependency resolver +``` + +#### Step 2: Install vLLM 0.17.1 +This pulls compatible torch, triton, and CUDA dependencies automatically: +```bash +uv pip install "vllm==0.17.1" +``` + +**CUDA note:** vLLM 0.17.1 installs torch with CUDA 12.8. If your cluster has a different CUDA version, you may need to adjust. Check `nvidia-smi` for your driver version — CUDA 12.8 requires driver >= 570.x. + +#### Step 3: Install transformers from source +PyPI's transformers (4.57.x) doesn't have Qwen3.5 config. Must install from GitHub: +```bash +uv pip install "git+https://github.com/huggingface/transformers.git" +``` + +This gives you transformers 5.x with `Qwen3_5ForConditionalGeneration` support. 
+ +#### Step 4: Install project dependencies +```bash +cd /path/to/OpenThoughts-Agent + +# Install the project itself (no-deps to avoid conflicting torch/vllm pins) +uv pip install -e . --no-deps + +# Install eval infrastructure packages +uv pip install \ + "pydantic>=2.0.0,<3.0.0" \ + pyyaml \ + omegaconf \ + wandb \ + bs4 \ + "numpy<=2.26.0" \ + "datasets>=2.0.0" \ + "supabase>=2.22.3" \ + "python-dotenv>=1.0.0" \ + "google-cloud-storage" \ + h5py \ + certifi \ + rapidfuzz \ + "uv>=0.4.17" \ + socksio \ + "litellm>=1.80.0" \ + "ray[default]>=2.50.0" \ + "hydra-core>=1.3.2" \ + aiohttp-socks \ + Jinja2 +``` + +**Important:** Use `--no-deps` for the project install. The project's `pyproject.toml` pins older vllm/torch versions that would downgrade what we just installed. + +#### Step 5: Install Harbor +```bash +uv pip install "harbor[daytona] @ git+https://github.com/laude-institute/harbor.git@penfever/temp-override" +``` + +If you have a local harbor checkout (e.g. with custom patches), use editable install instead: +```bash +uv pip install -e /path/to/harbor +``` + +#### Step 6: Install dynamic-semaphore +```bash +uv pip install "dynamic-semaphore @ git+https://github.com/penfever/dynamic-semaphore" +``` + +## Verification + +Run this after setup to confirm everything works: + +```bash +conda activate otagent2 +python -c " +import sys; print(f'Python: {sys.version}') +import torch; print(f'torch: {torch.__version__} (CUDA: {torch.version.cuda})') +import vllm; print(f'vllm: {vllm.__version__}') +import transformers; print(f'transformers: {transformers.__version__}') + +# Critical: Qwen3.5 config must be recognized +from transformers import AutoConfig +cfg = AutoConfig.from_pretrained('Qwen/Qwen3.5-9B', trust_remote_code=True) +print(f'Qwen3.5-9B config: OK (arch={cfg.architectures})') + +import ray; print(f'ray: {ray.__version__}') +import harbor; print(f'harbor: {harbor.__version__}') +import litellm; print('litellm: OK') +import supabase; print('supabase: OK') 
+import daytona; print('daytona: OK') + +from database.unified_db.utils import upload_eval_results; print('database.unified_db: OK') +from harbor.utils.traces_utils import export_traces; print('harbor traces: OK') +" +``` + +Expected output: +``` +Python: 3.12.x +torch: 2.10.0+cu128 (CUDA: 12.8) +vllm: 0.17.1 +transformers: 5.3.0.dev0 +Qwen3.5-9B config: OK (arch=['Qwen3_5ForConditionalGeneration']) +ray: 2.54.0 +harbor: 0.1.45 +litellm: OK +supabase: OK +daytona: OK +database.unified_db: OK +harbor traces: OK +``` + +## GPU Smoke Test + +After env setup, verify vLLM can actually serve the model on your GPUs: + +```bash +# Interactive SLURM job (adjust partition/qos for your cluster) +srun -p main --gres=gpu:1 --cpus-per-task=16 --time=00:30:00 --pty bash -c ' + eval "$(conda shell.bash hook 2>/dev/null)" + conda activate otagent2 + python -c "import torch; print(f\"GPU: {torch.cuda.get_device_name(0)}\"); print(f\"CUDA available: {torch.cuda.is_available()}\")" + vllm serve Qwen/Qwen3.5-9B --host 0.0.0.0 --port 8000 --enforce-eager --gpu-memory-utilization 0.9 +' +``` + +On H200 (141GB), single-GPU results: +- Model memory: 17.66 GiB +- KV cache available: 104.16 GiB +- Architecture: `Qwen3_5ForConditionalGeneration` +- Max concurrency at 262K context: 12.93x + +## Using otagent2 with the Eval Listener + +Pass `--conda-env otagent2` to the listener: + +```bash +python eval/MBZ/unified_eval_listener_v4.py \ + --preset v2 \ + --priority-file eval/MBZ/lists/my_models.txt \ + --conda-env otagent2 \ + --enable-thinking \ + --tp-size 2 \ + --secrets-file ~/secrets.env \ + --once --verbose +``` + +This sets `EVAL_CONDA_ENV=otagent2` → the sbatch script does `conda activate "$CONDA_ENV"` instead of the default `otagent`. + +## When to Use otagent2 vs otagent + +| Model | Environment | +|-------|------------| +| Qwen3-8B, Qwen3-32B, etc. 
| `otagent` (vLLM 0.13.0 supports these) | +| Qwen3.5-9B, Qwen3.5-* | `otagent2` (needs vLLM 0.17.1) | +| DeepSeek-R1-*, Nemotron-*, most 7-14B models | `otagent` | +| Any model with `Qwen3_5ForConditionalGeneration` arch | `otagent2` | +| Future new architectures not in vLLM 0.13.0 | `otagent2` (or newer) | + +**Rule of thumb:** Use `otagent` by default. Only switch to `otagent2` if vLLM fails with "Unsupported model architecture" or if the model's `config.json` has an architecture not in vLLM 0.13.0's registry. + +## Troubleshooting + +### "No module named 'transformers.models.qwen3_5'" +Transformers was installed from PyPI instead of source. Fix: +```bash +uv pip install --force-reinstall "git+https://github.com/huggingface/transformers.git" +``` + +### vLLM crashes with ALLOWED_LAYER_TYPES error +You have a transformers version that's too new for your vLLM. Ensure vLLM is 0.17.1: +```bash +python -c "import vllm; print(vllm.__version__)" +# Should print 0.17.1 +``` + +### CUDA version mismatch +If you get CUDA errors, check compatibility: +```bash +nvidia-smi # Shows driver version and max CUDA +python -c "import torch; print(torch.version.cuda)" # Should be 12.8 +``` +CUDA 12.8 requires NVIDIA driver >= 570.x. If your driver is older, you may need a different vLLM version compiled for your CUDA. + +### torch.cuda.is_available() returns False +Ensure you're on a GPU node (not login node). Run the GPU smoke test via `srun`. + +### Harbor import errors +If harbor fails to import, ensure it was installed after torch/vllm (not before): +```bash +uv pip install -e /path/to/harbor # or from git +``` diff --git a/eval/docs/OVERLONG_EVAL_RUNBOOK.md b/eval/docs/OVERLONG_EVAL_RUNBOOK.md new file mode 100644 index 00000000..82c80a04 --- /dev/null +++ b/eval/docs/OVERLONG_EVAL_RUNBOOK.md @@ -0,0 +1,135 @@ +# Overlong Eval Jobs: Diagnosis & Upload Runbook + +## Background + +Some eval jobs hit the 24-hour SLURM time limit without completing all trials. 
These are marked as `is_overlong=true` in the database so the listener doesn't perpetually resubmit them. + +### Root Cause + +Slow models are **not more verbose** — they produce ~0.5x the output tokens of fast models. The bottleneck is **tool/environment execution**: 94% of agent wall time is spent waiting on sandbox commands (compilations, builds, installs) that don't lead to solutions. Trials hit the agent timeout ceiling and waste the maximum allowed duration. + +Key metrics across 19 timed-out jobs: +- LLM inference time per trial: ~7 min (same as fast models) +- Agent execution per trial: **125 min** (vs 20 min for fast models) +- LLM % of agent execution: **6%** (vs 29% for fast models) +- 44% of trials end in `AgentTimeoutError`, consuming 88-96% of total wall time + +### Detection + +Overlong jobs are detected by: **no `finished_at` in result.json AND elapsed > 20 hours**. This catches the final timed-out attempt that was never retried. Earlier timed-out attempts show as `TIME LIMIT` in SLURM logs but were retried by the listener. + +## Files + +All files are relative to the repo root (`OpenThoughts-Agent/`). + +### Scripts + +| File | Purpose | +|------|---------| +| `scripts/database/upload_overlong_jobs.py` | Detect overlong jobs, check DB, upload with `is_overlong=true` | +| `scripts/database/manual_db_eval_push.py` | Manual single-job upload (supports `--overlong` flag) | +| `eval/MBZ/diagnose_slow_jobs.py` | Diagnostic script: timing breakdown, error distribution, per-model analysis | + +### Modified (to support `is_overlong`) + +| File | Change | +|------|--------| +| `database/unified_db/utils.py` | Added `is_overlong` param to `upload_job_and_trial_records()`, `upload_eval_results()`, `register_sandbox_job()` | +| `hpc/launch_utils.py` | Added `is_overlong` param to `sync_eval_to_database()`. 
Added trial `source` field parsing in `derive_benchmark_from_job_dir()` | +| `scripts/database/manual_db_eval_push.py` | Added `--overlong` CLI flag | + +### Reports + +| File | Purpose | +|------|---------| +| `eval/MBZ/SLOW_MODEL_DIAGNOSIS.md` | Generated diagnostic report (run `diagnose_slow_jobs.py` to regenerate) | + +## Quick Start (copy-paste for another cluster) + +### Prerequisites + +```bash +# Source environment +source ~/secrets.env # needs SUPABASE_URL, SUPABASE_ANON_KEY/SERVICE_ROLE_KEY, HF_TOKEN +conda activate otagent +cd /path/to/OpenThoughts-Agent +``` + +### 1. Dry run: see what would be uploaded + +```bash +python scripts/database/upload_overlong_jobs.py --jobs-dir /path/to/jobs +``` + +### 2. Upload all overlong jobs (with HF traces, 8 parallel workers) + +```bash +python scripts/database/upload_overlong_jobs.py \ + --upload --force --skip-db-check --parallel 8 \ + --jobs-dir /path/to/jobs +``` + +### 3. Upload with DB duplicate check (skips if model/benchmark already Finished) + +```bash +python scripts/database/upload_overlong_jobs.py \ + --upload --force --parallel 8 \ + --jobs-dir /path/to/jobs +``` + +### 4. Upload a single job manually + +```bash +python scripts/database/manual_db_eval_push.py \ + --job-dir /path/to/jobs/terminal_bench_2_model_name_20260330_014503 \ + --benchmark-name terminal_bench_2 \ + --overlong --force --forced-update --verbose +``` + +### 5. Skip HF upload (DB only) + +```bash +python scripts/database/upload_overlong_jobs.py \ + --upload --force --skip-db-check --skip-hf --parallel 8 \ + --jobs-dir /path/to/jobs +``` + +### 6. Filter to specific benchmark + +```bash +python scripts/database/upload_overlong_jobs.py \ + --upload --force --parallel 8 \ + --benchmark terminal_bench_2 \ + --jobs-dir /path/to/jobs +``` + +### 7. 
Generate diagnostic report + +```bash +python eval/MBZ/diagnose_slow_jobs.py --output eval/MBZ/SLOW_MODEL_DIAGNOSIS.md +python eval/MBZ/diagnose_slow_jobs.py --benchmark tb2 --top 20 # stdout, filtered +``` + +## DB Schema + +The `sandbox_jobs` table has: +- `is_overlong` (boolean, default false) — set to true for timed-out jobs +- `job_status` remains `"Finished"` so the listener treats them as done + +```sql +-- If is_overlong column doesn't exist yet on the target cluster: +ALTER TABLE sandbox_jobs ADD COLUMN IF NOT EXISTS is_overlong BOOLEAN DEFAULT false; +``` + +## Files to Copy to Another Cluster + +```bash +# From the repo root, these are the files needed: +scripts/database/upload_overlong_jobs.py +scripts/database/manual_db_eval_push.py +eval/MBZ/diagnose_slow_jobs.py +database/unified_db/utils.py # has is_overlong support +hpc/launch_utils.py # has is_overlong passthrough + benchmark derivation fix +``` + +Or just `git pull` if the cluster has the same repo. diff --git a/eval/docs/V6_MIGRATION.md b/eval/docs/V6_MIGRATION.md new file mode 100644 index 00000000..fe1f2e63 --- /dev/null +++ b/eval/docs/V6_MIGRATION.md @@ -0,0 +1,111 @@ +# Jupiter v6 Listener Migration + +## What's already done (in this repo, will arrive via `git pull`) + +1. **`eval/clusters/jupiter.yaml`** — updated sbatch paths to shared `eval/MBZ/unified_eval_harbor.sbatch` +2. **`eval/jupiter/dcagent_eval_config.yaml`** — updated `jobs_dir` to `zhuang1_eval_jobs` +3. **`eval/jupiter/dcagent_eval_config_no_override.yaml`** — created (swebench/tb2 variant) +4. **`eval/MBZ/unified_eval_harbor.sbatch`** — cluster-agnostic v6 sbatch (shared across clusters) +5. **`eval/MBZ/unified_eval_harbor_dp.sbatch`** — cluster-agnostic DP sbatch +6. **`eval/MBZ/unified_eval_listener_v6.py`** — shared v6 listener +7. **`eval/baseline_model_configs.yaml`** — shared model configs + +## Steps to run on Jupiter + +### 1. 
Pull latest code +```bash +source ~/.bashrc; conda activate otagent +cd /e/scratch/jureap59/zhuang1/OpenThoughts-Agent +GIT_TERMINAL_PROMPT=0 git pull +``` + +### 2. Pin harbor to known-good commit +```bash +cd /e/scratch/jureap59/feuer1/harbor +git fetch && git checkout 6fdb92e7f5707c2b01214933f1622771784e6f67 +# Reinstall in your conda env +pip install -e . +``` + +### 3. Install hf_transfer +```bash +pip install hf_transfer +``` + +### 4. Create jobs dir (if it doesn't exist) +```bash +mkdir -p /e/data1/datasets/playground/mmlaion/shared/zhuang1_eval_jobs +mkdir -p eval/jupiter/logs +``` + +### 5. Pre-download datasets +```bash +source ~/secrets.env +python eval/jupiter/snapshot_download.py DCAgent/dev_set_v2 +python eval/jupiter/snapshot_download.py DCAgent2/terminal_bench_2 +python eval/jupiter/snapshot_download.py DCAgent2/swebench-verified-random-100-folders +``` + +### 6. Verify secrets.env has all required keys +```bash +source ~/secrets.env +echo "DAYTONA_API_KEY: ${DAYTONA_API_KEY:0:12}..." +echo "SUPABASE_URL: ${SUPABASE_URL:0:20}..." +echo "HF_TOKEN: ${HF_TOKEN:0:8}..." +``` + +### 7. Dry-run +```bash +source ~/secrets.env && python eval/MBZ/unified_eval_listener_v6.py \ + --cluster-config eval/clusters/jupiter.yaml \ + --preset v2 \ + --priority-file eval/MBZ/lists/a1_retrained.txt \ + --baseline-model-config eval/baseline_model_configs.yaml \ + --timeout-multiplier 2.0 \ + --tp-size 2 \ + --enable-thinking \ + --slurm-time 12:00:00 \ + --max-jobs-submitted 32 \ + --dry-run --once --verbose +``` + +### 8. 
Real run (example) +```bash +source ~/secrets.env && python eval/MBZ/unified_eval_listener_v6.py \ + --cluster-config eval/clusters/jupiter.yaml \ + --preset swebench \ + --priority-file eval/MBZ/lists/no_eval_models_latest.txt \ + --baseline-model-config eval/baseline_model_configs.yaml \ + --timeout-multiplier 2.0 \ + --tp-size 2 \ + --enable-thinking \ + --slurm-time 12:00:00 \ + --max-jobs-submitted 32 \ + --pack-jobs \ + --stagger-delay 1 --chain-batch-size 10 \ + --no-disk-resume \ + --once +``` + +## Key differences from M2 + +| Setting | M2 | Jupiter | +|---------|-----|---------| +| Partition | `main` | `booster` | +| Account | (none) | `reformo` | +| Time limit | 24:00:00 | 12:00:00 | +| GPUs/node | 8 | 4 | +| Arch | x86_64 | aarch64 (GH200) | +| Internet on compute | yes | **no** (proxy required) | +| Conda env | otagent/otagent2 | otagent/otagent2 (different paths) | +| Harbor | local install | feuer1's shared install | +| HF cache | `~/.cache/huggingface/hub` | `/e/data1/datasets/playground/ot/hf_hub` | +| Jobs dir | `$PWD/jobs` | `/e/data1/.../zhuang1_eval_jobs` | +| Pre-download | optional (has internet) | **required** (no internet on compute) | + +## Proxy note + +Jupiter compute nodes have no internet. The v6 sbatch auto-detects proxy settings from `jupiter.yaml`: +- Uses proxychains for HF downloads on compute +- SSH tunnel via `jpbl-s01-02` login node +- `--pre-download` flag on listener pre-downloads models on login node before submission diff --git a/eval/jupiter/V6_MIGRATION.md b/eval/jupiter/V6_MIGRATION.md new file mode 100644 index 00000000..fe1f2e63 --- /dev/null +++ b/eval/jupiter/V6_MIGRATION.md @@ -0,0 +1,111 @@ +# Jupiter v6 Listener Migration + +## What's already done (in this repo, will arrive via `git pull`) + +1. **`eval/clusters/jupiter.yaml`** — updated sbatch paths to shared `eval/MBZ/unified_eval_harbor.sbatch` +2. **`eval/jupiter/dcagent_eval_config.yaml`** — updated `jobs_dir` to `zhuang1_eval_jobs` +3. 
**`eval/jupiter/dcagent_eval_config_no_override.yaml`** — created (swebench/tb2 variant) +4. **`eval/MBZ/unified_eval_harbor.sbatch`** — cluster-agnostic v6 sbatch (shared across clusters) +5. **`eval/MBZ/unified_eval_harbor_dp.sbatch`** — cluster-agnostic DP sbatch +6. **`eval/MBZ/unified_eval_listener_v6.py`** — shared v6 listener +7. **`eval/baseline_model_configs.yaml`** — shared model configs + +## Steps to run on Jupiter + +### 1. Pull latest code +```bash +source ~/.bashrc; conda activate otagent +cd /e/scratch/jureap59/zhuang1/OpenThoughts-Agent +GIT_TERMINAL_PROMPT=0 git pull +``` + +### 2. Pin harbor to known-good commit +```bash +cd /e/scratch/jureap59/feuer1/harbor +git fetch && git checkout 6fdb92e7f5707c2b01214933f1622771784e6f67 +# Reinstall in your conda env +pip install -e . +``` + +### 3. Install hf_transfer +```bash +pip install hf_transfer +``` + +### 4. Create jobs dir (if it doesn't exist) +```bash +mkdir -p /e/data1/datasets/playground/mmlaion/shared/zhuang1_eval_jobs +mkdir -p eval/jupiter/logs +``` + +### 5. Pre-download datasets +```bash +source ~/secrets.env +python eval/jupiter/snapshot_download.py DCAgent/dev_set_v2 +python eval/jupiter/snapshot_download.py DCAgent2/terminal_bench_2 +python eval/jupiter/snapshot_download.py DCAgent2/swebench-verified-random-100-folders +``` + +### 6. Verify secrets.env has all required keys +```bash +source ~/secrets.env +echo "DAYTONA_API_KEY: ${DAYTONA_API_KEY:0:12}..." +echo "SUPABASE_URL: ${SUPABASE_URL:0:20}..." +echo "HF_TOKEN: ${HF_TOKEN:0:8}..." +``` + +### 7. Dry-run +```bash +source ~/secrets.env && python eval/MBZ/unified_eval_listener_v6.py \ + --cluster-config eval/clusters/jupiter.yaml \ + --preset v2 \ + --priority-file eval/MBZ/lists/a1_retrained.txt \ + --baseline-model-config eval/baseline_model_configs.yaml \ + --timeout-multiplier 2.0 \ + --tp-size 2 \ + --enable-thinking \ + --slurm-time 12:00:00 \ + --max-jobs-submitted 32 \ + --dry-run --once --verbose +``` + +### 8. 
Real run (example) +```bash +source ~/secrets.env && python eval/MBZ/unified_eval_listener_v6.py \ + --cluster-config eval/clusters/jupiter.yaml \ + --preset swebench \ + --priority-file eval/MBZ/lists/no_eval_models_latest.txt \ + --baseline-model-config eval/baseline_model_configs.yaml \ + --timeout-multiplier 2.0 \ + --tp-size 2 \ + --enable-thinking \ + --slurm-time 12:00:00 \ + --max-jobs-submitted 32 \ + --pack-jobs \ + --stagger-delay 1 --chain-batch-size 10 \ + --no-disk-resume \ + --once +``` + +## Key differences from M2 + +| Setting | M2 | Jupiter | +|---------|-----|---------| +| Partition | `main` | `booster` | +| Account | (none) | `reformo` | +| Time limit | 24:00:00 | 12:00:00 | +| GPUs/node | 8 | 4 | +| Arch | x86_64 | aarch64 (GH200) | +| Internet on compute | yes | **no** (proxy required) | +| Conda env | otagent/otagent2 | otagent/otagent2 (different paths) | +| Harbor | local install | feuer1's shared install | +| HF cache | `~/.cache/huggingface/hub` | `/e/data1/datasets/playground/ot/hf_hub` | +| Jobs dir | `$PWD/jobs` | `/e/data1/.../zhuang1_eval_jobs` | +| Pre-download | optional (has internet) | **required** (no internet on compute) | + +## Proxy note + +Jupiter compute nodes have no internet. 
The v6 sbatch auto-detects proxy settings from `jupiter.yaml`: +- Uses proxychains for HF downloads on compute +- SSH tunnel via `jpbl-s01-02` login node +- `--pre-download` flag on listener pre-downloads models on login node before submission diff --git a/eval/jupiter/dcagent_eval_config.yaml b/eval/jupiter/dcagent_eval_config.yaml index 149e3607..1f729080 100644 --- a/eval/jupiter/dcagent_eval_config.yaml +++ b/eval/jupiter/dcagent_eval_config.yaml @@ -1,4 +1,4 @@ -jobs_dir: /e/data1/datasets/playground/mmlaion/shared/guha1/eval_jobs +jobs_dir: /e/data1/datasets/playground/mmlaion/shared/zhuang1_eval_jobs n_attempts: 3 timeout_multiplier: 1.0 orchestrator: diff --git a/eval/jupiter/dcagent_eval_config_no_override.yaml b/eval/jupiter/dcagent_eval_config_no_override.yaml new file mode 100644 index 00000000..8ec7dae0 --- /dev/null +++ b/eval/jupiter/dcagent_eval_config_no_override.yaml @@ -0,0 +1,46 @@ +jobs_dir: /e/data1/datasets/playground/mmlaion/shared/zhuang1_eval_jobs +n_attempts: 3 +timeout_multiplier: 1.0 +orchestrator: + type: local + n_concurrent_trials: 4 + quiet: false + plain_output: true + retry: + max_retries: 3 + exclude_exceptions: + - AgentTimeoutError + - AgentEnvironmentTimeoutError + - BadRequestError + - VerifierTimeoutError + - SummarizationTimeout + - SummarizationTimeoutError + - ContextLengthExceededError + wait_multiplier: 1.0 + min_wait_sec: 1.0 + max_wait_sec: 60.0 +environment: + type: daytona + force_build: true + delete: false +agents: + - name: terminus-2 + max_timeout_sec: 7200 + trajectory_config: + raw_content: true + linear_history: true + kwargs: + record_terminal_session: false + enable_episode_logging: false + enable_pane_logging: false + collect_rollout_details: false + collect_engine_metrics: false + metrics_endpoint: https://replace-with-vllm-host/metrics + metrics_timeout_sec: 10 + model_info: + max_input_tokens: 32768 + max_output_tokens: 8192 + input_cost_per_token: 0 + output_cost_per_token: 0 +datasets: + - path: 
examples/tasks diff --git a/eval/jupiter/snapshot_download.py b/eval/jupiter/snapshot_download.py index fcd402d9..882c1f85 100644 --- a/eval/jupiter/snapshot_download.py +++ b/eval/jupiter/snapshot_download.py @@ -1,6 +1,8 @@ import os import sys import argparse +import fcntl +import time from huggingface_hub import snapshot_download def is_valid_task_dir(path): @@ -112,21 +114,34 @@ def main(): path = None if args.local_dir: - # When --local-dir is specified, download real files (no symlinks) - # Check if local_dir already has valid task dirs - if os.path.isdir(args.local_dir): - task_dirs = [d for d in os.listdir(args.local_dir) - if is_valid_task_dir(os.path.join(args.local_dir, d))] - if task_dirs: - print(f"Found existing dataset at {args.local_dir} with {len(task_dirs)} tasks") - path = args.local_dir - if not path: - print("Downloading dataset to local dir (real files, no symlinks)...", file=sys.stderr) - path = download_sandboxes_dataset( - repo_id=args.repo_id, - local_dir=args.local_dir, - cache_dir=args.cache_dir - ) + # When --local-dir is specified, download real files (no symlinks). + # Use a file lock to prevent race conditions when multiple SLURM jobs + # download the same dataset concurrently. 
+ lock_path = args.local_dir.rstrip("/") + ".lock" + os.makedirs(os.path.dirname(lock_path) or ".", exist_ok=True) + lock_fd = open(lock_path, "w") + try: + print(f"Acquiring dataset lock: {lock_path}", file=sys.stderr) + fcntl.flock(lock_fd, fcntl.LOCK_EX) + print("Lock acquired.", file=sys.stderr) + + # Check if local_dir already has valid task dirs + if os.path.isdir(args.local_dir): + task_dirs = [d for d in os.listdir(args.local_dir) + if is_valid_task_dir(os.path.join(args.local_dir, d))] + if task_dirs: + print(f"Found existing dataset at {args.local_dir} with {len(task_dirs)} tasks") + path = args.local_dir + if not path: + print("Downloading dataset to local dir (real files, no symlinks)...", file=sys.stderr) + path = download_sandboxes_dataset( + repo_id=args.repo_id, + local_dir=args.local_dir, + cache_dir=args.cache_dir + ) + finally: + fcntl.flock(lock_fd, fcntl.LOCK_UN) + lock_fd.close() else: # First try to get existing cached path path = get_dataset_path(args.repo_id) diff --git a/eval/legacy/README.md b/eval/legacy/README.md new file mode 100644 index 00000000..aa1255a6 --- /dev/null +++ b/eval/legacy/README.md @@ -0,0 +1,11 @@ +# Legacy Eval Scripts + +These are frozen v4/v6-MBZ eval scripts kept for backward compatibility. 
+ +- `unified_eval_harbor_v4.sbatch` — v4 sbatch (MBZ-specific, hardcoded paths) +- `unified_eval_harbor_v6_mbz.sbatch` — old v6 sbatch (MBZ-specific, before cluster-agnostic rewrite) +- `unified_eval_listener_v4.py` — v4 listener (MBZ-specific) + +**Do not modify these files.** Use the cluster-agnostic versions in `eval/` instead: +- `eval/unified_eval_harbor.sbatch` +- `eval/unified_eval_listener.py` diff --git a/eval/legacy/unified_eval_harbor_v4.sbatch b/eval/legacy/unified_eval_harbor_v4.sbatch new file mode 100644 index 00000000..23e8569a --- /dev/null +++ b/eval/legacy/unified_eval_harbor_v4.sbatch @@ -0,0 +1,663 @@ +#!/bin/bash +#SBATCH -p main +#SBATCH --no-requeue +#SBATCH --time=24:00:00 +#SBATCH --nodes 1 +#SBATCH --ntasks-per-node 1 +#SBATCH --cpus-per-task=32 +#SBATCH --gres=gpu:1 +#SBATCH --output=experiments/logs/%x_%j.out +#SBATCH --job-name=terminal + +# ============================================================================== +# Unified Eval Harbor Script v4 — MBZ H200 Cluster +# ============================================================================== +# This script replaces all individual sbatch scripts with one parameterized version. +# Parameters are passed via environment variables (set by unified_eval_listener_v4.py). 
+# +# v3 Enhancements: +# - Enhancement 1: Unified invalid error threshold (counts all errors except AgentTimeoutError and ContextLengthExceededError) +# - Enhancement 4: Model retry tracking (appends to eval starts log) +# - Enhancement 5: Timeout multiplier support +# +# Positional Args: +# $1 = MODEL (HuggingFace model name, e.g., "org/model-name") +# $2 = REPO_ID (HuggingFace dataset repo, e.g., "DCAgent/dev_set_v2") +# $3 = BENCHMARK_ID (optional, DB benchmark UUID) +# $4 = RUN_TAG (optional, job name - if provided, updates Pending job to Started) +# +# Environment Variables (with defaults): +# EVAL_N_CONCURRENT Harbor --n-concurrent (default: 64) +# EVAL_N_ATTEMPTS Harbor --n-attempts (default: 3) +# EVAL_GPU_MEMORY_UTIL VLLM --gpu-memory-utilization (default: 0.9) +# EVAL_DAYTONA_THRESHOLD Max invalid errors before abort (default: 3) +# EVAL_VLLM_MAX_RETRIES VLLM startup retries (default: 5) +# EVAL_AGENT_PARSER Agent parser type (default: "", use "xml" for swebench) +# EVAL_SLURM_TIME SLURM time limit (read at submit time) +# EVAL_ENABLE_THINKING Enable thinking blocks (default: false) +# EVAL_AGENT_NAME Agent name for harbor and DB entries (default: "terminus-2") +# EVAL_STARTS_LOG Path to shared eval starts log file (default: "") +# EVAL_TIMEOUT_MULTIPLIER Harbor timeout multiplier (default: 1.0) +# EVAL_VLLM_TENSOR_PARALLEL_SIZE vLLM --tensor-parallel-size (default: 1) +# EVAL_SLURM_GPUS Number of GPUs to request (default: 1) +# EVAL_INVALID_ERRORS_LOG Path to invalid errors log (default: jobs/invalid_errors.log) +# ============================================================================== + +# === CONFIGURABLE PARAMETERS (with defaults) === +N_CONCURRENT="${EVAL_N_CONCURRENT:-64}" +N_ATTEMPTS="${EVAL_N_ATTEMPTS:-3}" +GPU_MEMORY_UTIL="${EVAL_GPU_MEMORY_UTIL:-0.9}" +ERROR_THRESHOLD="${EVAL_DAYTONA_THRESHOLD:-3}" +VLLM_MAX_RETRIES="${EVAL_VLLM_MAX_RETRIES:-5}" +AGENT_PARSER="${EVAL_AGENT_PARSER:-}" +ENABLE_THINKING="${EVAL_ENABLE_THINKING:-false}" 
+AGENT_NAME="${EVAL_AGENT_NAME:-terminus-2}" +EVAL_STARTS_LOG="${EVAL_STARTS_LOG:-}" +TIMEOUT_MULTIPLIER="${EVAL_TIMEOUT_MULTIPLIER:-1.0}" +TENSOR_PARALLEL_SIZE="${EVAL_VLLM_TENSOR_PARALLEL_SIZE:-1}" + +# === POSITIONAL ARGS === +MODEL="${1:-mlfoundations-dev/claude_3_7_20250219_tbench_traces_sharegptv1}" +REPO_ID="${2:-DCAgent/dev_set_v2}" +BENCHMARK_ID="${3:-}" +RUN_TAG_ARG="${4:-}" # Optional: job name from listener (if provided, Pending entry exists) + +# Create timestamp and safe names +TIMESTAMP=$(date +'%Y%m%d_%H%M%S') +SAFE_MODEL=$(echo "$MODEL" | tr '/:' '_') +SAFE_REPO=$(echo "$REPO_ID" | tr '/:' '_') + +echo "==============================================" +echo "Unified Eval Harbor v4 - Starting job (MBZ H200)" +echo "==============================================" +echo "Model: $MODEL" +echo "Repository: $REPO_ID" +echo "Benchmark ID: ${BENCHMARK_ID:-}" +echo "" +echo "Parameters:" +echo " N_CONCURRENT: $N_CONCURRENT" +echo " N_ATTEMPTS: $N_ATTEMPTS" +echo " GPU_MEMORY_UTIL: $GPU_MEMORY_UTIL" +echo " ERROR_THRESHOLD: $ERROR_THRESHOLD" +echo " VLLM_MAX_RETRIES: $VLLM_MAX_RETRIES" +echo " AGENT_PARSER: ${AGENT_PARSER:-}" +echo " ENABLE_THINKING: $ENABLE_THINKING" +echo " AGENT_NAME: $AGENT_NAME" +echo " TIMEOUT_MULTIPLIER: $TIMEOUT_MULTIPLIER" +echo " TENSOR_PARALLEL_SIZE: $TENSOR_PARALLEL_SIZE" +echo " EVAL_STARTS_LOG: ${EVAL_STARTS_LOG:-}" +echo "==============================================" + +# === MBZ ENVIRONMENT SETUP === +SCRATCH="/mnt/weka/home/richard.zhuang" +DCFT="$SCRATCH/OpenThoughts-Agent" + +# Set up environment variables +export VLLM_USE_V1=1 +export RAY_RUNTIME_ENV_HOOK=ray._private.runtime_env.uv_runtime_env_hook.hook +export RAY_ADDRESS=${RAY_ADDRESS:-} +export VLLM_CACHE_ROOT="$SCRATCH/.cache/vllm" +export VLLM_CONFIG_ROOT="$SCRATCH/.cache/vllm_config" +export TRITON_CACHE_DIR="$SCRATCH/.cache/triton" +export FLASHINFER_WORKSPACE_BASE="$SCRATCH/.cache/flashinfer" +export UV_CACHE_DIR="$SCRATCH/.cache/uv" +export HYDRA_FULL_ERROR=1 
+export HF_CACHE_DIR="$SCRATCH/.cache/huggingface" +export HF_HUB_CACHE="$HF_CACHE_DIR/hub" +# XET staging cache — must be user-writable (shared HF_HOME/xet/ has permission issues) +export HF_XET_CACHE="$SCRATCH/.cache/hf_xet_cache" +mkdir -p "$HF_XET_CACHE" +export LITELLM_LOCAL_MODEL_COST_MAP=True +export PYTHONPATH="${DCFT}${PYTHONPATH:+:$PYTHONPATH}" + +# DB/API secrets +SECRETS_FILE="${EVAL_SECRETS_FILE:-$HOME/secrets.env}" +echo "Sourcing secrets from: $SECRETS_FILE" +source "$SECRETS_FILE" + +# Conda env (configurable via EVAL_CONDA_ENV, default: otagent) +CONDA_ENV="${EVAL_CONDA_ENV:-otagent}" +eval "$(conda shell.bash hook 2>/dev/null)" || source "$SCRATCH/miniconda3/etc/profile.d/conda.sh" +conda activate "$CONDA_ENV" +echo "Conda env: $CONDA_ENV" + +# Create cache dirs +mkdir -p "$VLLM_CACHE_ROOT" "$VLLM_CONFIG_ROOT" "$TRITON_CACHE_DIR" \ + "$FLASHINFER_WORKSPACE_BASE" "$UV_CACHE_DIR" "$HF_CACHE_DIR" \ + experiments/logs + +# === PRE-FLIGHT: Validate model architecture before starting vLLM === +echo "Validating model architecture..." +python - "$MODEL" <<'PREFLIGHT' +import json, sys, os + +model = sys.argv[1] if len(sys.argv) > 1 and sys.argv[1] else "" +if not model: + print("WARNING: No MODEL set, skipping preflight check") + sys.exit(0) + +# Blocklist of architecture prefixes that vLLM cannot serve +UNSUPPORTED_PREFIXES = ["FSDP"] + +try: + from huggingface_hub import hf_hub_download + cfg_path = hf_hub_download(model, "config.json") + with open(cfg_path) as f: + cfg = json.load(f) + archs = cfg.get("architectures", []) + print(f"Model architectures: {archs}") + for arch in archs: + for prefix in UNSUPPORTED_PREFIXES: + if arch.startswith(prefix): + print(f"FATAL: Architecture '{arch}' is not supported by vLLM.", file=sys.stderr) + print(f"This model appears to be a raw training checkpoint (e.g. 
FSDP).", file=sys.stderr) + print(f"Convert it to standard HuggingFace format before evaluating.", file=sys.stderr) + sys.exit(1) +except Exception as e: + # If we can't download config.json, vLLM will also fail — let it fail there + # with its own error message rather than blocking on network issues here + print(f"WARNING: Preflight check could not validate model: {e}") + sys.exit(0) +PREFLIGHT + +if [ $? -ne 0 ]; then + echo "FATAL: Model architecture not supported. Aborting job." + exit 1 +fi + +# ============================================================================== +# Fix: Qwen3.5 VL fine-tunes missing preprocessor_config.json +# ============================================================================== +echo "Checking for Qwen3.5 preprocessor fix..." +MODEL_SNAP_DIR=$(python3 -c " +import sys +from huggingface_hub import snapshot_download +path = snapshot_download('$MODEL') +sys.stdout.write(path) +" 2>/dev/null) || true + +if [ -n "$MODEL_SNAP_DIR" ] && [ -f "$MODEL_SNAP_DIR/config.json" ]; then + IS_QWEN35_VL=$(python3 -c " +import json +with open('$MODEL_SNAP_DIR/config.json') as f: + cfg = json.load(f) +archs = cfg.get('architectures', []) +if any('Qwen3_5ForConditional' in a for a in archs): + print('yes') +" 2>/dev/null) + + if [ "$IS_QWEN35_VL" = "yes" ]; then + if [ ! 
-f "$MODEL_SNAP_DIR/preprocessor_config.json" ]; then + echo "Qwen3.5 VL fine-tune missing preprocessor_config.json — copying from base model" + BASE_PREP=$(python3 -c " +from huggingface_hub import hf_hub_download +print(hf_hub_download('Qwen/Qwen3.5-9B', 'preprocessor_config.json')) +" 2>/dev/null) + if [ -n "$BASE_PREP" ] && [ -f "$BASE_PREP" ]; then + cp "$BASE_PREP" "$MODEL_SNAP_DIR/preprocessor_config.json" + echo "Copied preprocessor_config.json to $MODEL_SNAP_DIR" + fi + else + echo "preprocessor_config.json already present in snapshot" + fi + export HF_HUB_OFFLINE=1 + echo "Set HF_HUB_OFFLINE=1 for Qwen3.5 VL model loading" + fi +fi + +# Start VLLM server +VLLM_ARGS="--host 0.0.0.0 --port 8000" +VLLM_ARGS="$VLLM_ARGS --served-model-name $MODEL" +VLLM_ARGS="$VLLM_ARGS --enforce-eager" +VLLM_ARGS="$VLLM_ARGS --gpu-memory-utilization $GPU_MEMORY_UTIL" +VLLM_ARGS="$VLLM_ARGS --tensor-parallel-size $TENSOR_PARALLEL_SIZE" + +# Baseline model config overrides (set by listener from baseline_model_configs.yaml) +if [ -n "${EVAL_VLLM_MAX_MODEL_LEN:-}" ]; then + VLLM_ARGS="$VLLM_ARGS --max-model-len $EVAL_VLLM_MAX_MODEL_LEN" + echo " Override: --max-model-len $EVAL_VLLM_MAX_MODEL_LEN" +fi +if [ -n "${EVAL_VLLM_SWAP_SPACE:-}" ]; then + VLLM_ARGS="$VLLM_ARGS --swap-space $EVAL_VLLM_SWAP_SPACE" + echo " Override: --swap-space $EVAL_VLLM_SWAP_SPACE" +fi +if [ -n "${EVAL_VLLM_TRUST_REMOTE_CODE:-}" ]; then + VLLM_ARGS="$VLLM_ARGS --trust-remote-code" + echo " Override: --trust-remote-code" +fi +if [ -n "${EVAL_VLLM_TOOL_CALL_PARSER:-}" ]; then + VLLM_ARGS="$VLLM_ARGS --tool-call-parser $EVAL_VLLM_TOOL_CALL_PARSER" + echo " Override: --tool-call-parser $EVAL_VLLM_TOOL_CALL_PARSER" +fi +if [ -n "${EVAL_VLLM_REASONING_PARSER:-}" ]; then + VLLM_ARGS="$VLLM_ARGS --reasoning-parser $EVAL_VLLM_REASONING_PARSER" + echo " Override: --reasoning-parser $EVAL_VLLM_REASONING_PARSER" +fi +if [ -n "${EVAL_VLLM_EXTRA_ARGS:-}" ]; then + VLLM_ARGS="$VLLM_ARGS $EVAL_VLLM_EXTRA_ARGS" + echo " 
Override: extra_args=$EVAL_VLLM_EXTRA_ARGS" +fi + +echo "Starting vLLM: vllm serve $MODEL $VLLM_ARGS" +vllm serve "$MODEL" \ + $VLLM_ARGS \ + > "experiments/logs/vllm_${SLURM_JOB_ID}.log" 2>&1 & +VLLM_PID=$! + +cleanup() { + echo "Cleaning up..." + kill $VLLM_PID 2>/dev/null || true + conda deactivate || true +} +trap cleanup EXIT + +# Wait for VLLM server to start with healthcheck +RETRY_INTERVAL=100 +for i in $(seq 1 $VLLM_MAX_RETRIES); do + if curl -s http://localhost:8000/v1/models > /dev/null; then + echo "VLLM server is ready" + break + fi + echo "Waiting for VLLM server to start (attempt $i/$VLLM_MAX_RETRIES)..." + sleep $RETRY_INTERVAL + if [ $i -eq $VLLM_MAX_RETRIES ]; then + echo "VLLM server failed to start" + exit 1 + fi +done + +# Restore online mode for dataset download and uploads +unset HF_HUB_OFFLINE + +# Get the dataset path using the specified repo_id +echo "Downloading/locating dataset: $REPO_ID" +DATASET_PATH=$(python "$DCFT/eval/MBZ/snapshot_download.py" "$REPO_ID" | grep DATASET_PATH | tail -n 1 | cut -d'=' -f2) +if [ -z "${DATASET_PATH:-}" ]; then + echo "Failed to get dataset path" + exit 1 +fi +echo "Using dataset path: $DATASET_PATH" + +# Construct run dir (harbor honors --job-name in jobs/) +# Use provided RUN_TAG if available (from listener), otherwise generate one +if [ -n "$RUN_TAG_ARG" ]; then + RUN_TAG="$RUN_TAG_ARG" + echo "Using RUN_TAG from listener: $RUN_TAG" +else + RUN_TAG="${SAFE_REPO}_${SAFE_MODEL}_${TIMESTAMP}" + echo "Generated new RUN_TAG: $RUN_TAG" +fi +RUN_DIR="jobs/${RUN_TAG}" +echo "Job dir: ${RUN_DIR}" + +# Update or create DB row with status='Started' BEFORE running eval +echo "Updating/Creating DB job entry..." 
+export MODEL="$MODEL" +export REPO_ID="$REPO_ID" +export RUN_TAG="$RUN_TAG" +export RUN_TAG_ARG="$RUN_TAG_ARG" +export SLURM_JOB_ID="$SLURM_JOB_ID" +export N_CONCURRENT="$N_CONCURRENT" +export N_ATTEMPTS="$N_ATTEMPTS" +export AGENT_NAME="$AGENT_NAME" +export TIMEOUT_MULTIPLIER="$TIMEOUT_MULTIPLIER" + +# Get harbor package version +HARBOR_VERSION=$(python -c "import harbor; print(harbor.__version__)" 2>/dev/null || echo "unknown") +export HARBOR_VERSION + +python - <<'PY' +import os, sys, json +from database.unified_db.utils import create_job_entry_started, update_job_status_to_started + +model_hf = os.environ["MODEL"] +dataset_hf = os.environ["REPO_ID"] +run_tag = os.environ["RUN_TAG"] +run_tag_arg = os.environ.get("RUN_TAG_ARG", "") +slurm_job_id = os.environ["SLURM_JOB_ID"] +harbor_version = os.environ.get("HARBOR_VERSION", "unknown") +n_concurrent = int(os.environ.get("N_CONCURRENT", "64")) +n_attempts = int(os.environ.get("N_ATTEMPTS", "3")) +agent_name = os.environ.get("AGENT_NAME", "terminus-2") +timeout_multiplier = float(os.environ.get("TIMEOUT_MULTIPLIER", "1.0")) + +# Build config dict with timeout_multiplier (Enhancement 5) +config = {"agent": agent_name, "env": "daytona", "timeout_multiplier": timeout_multiplier} + +# If RUN_TAG was provided from listener, a Pending entry should exist - update it +# Otherwise, create a new Started entry (backward compatibility) +# Fallback: if Pending entry not found (e.g. stale entry reused by listener), +# create a fresh Started entry instead of failing. 
+result = None +if run_tag_arg: + print(f"Updating Pending job to Started: {run_tag}") + result = update_job_status_to_started( + job_name=run_tag, + n_trials=n_concurrent, + n_rep_eval=n_attempts, + config=config, + harbor_package_version=harbor_version + ) + if not result.get("success"): + print(f"WARNING: Pending entry not found ({result.get('error')}), creating Started entry directly") + result = None # fall through to create_job_entry_started + +if result is None: + print(f"Creating new Started job entry: {run_tag}") + result = create_job_entry_started( + model_hf_name=model_hf, + benchmark_hf_name=dataset_hf, + job_name=run_tag, + username=os.environ.get("UPLOAD_USERNAME", os.environ.get("USER", "unknown")), + slurm_job_id=slurm_job_id, + harbor_package_version=harbor_version, + agent_name=agent_name, + config=config, + n_trials=n_concurrent, + n_rep_eval=n_attempts + ) + +if not result.get("success"): + print(f"ERROR: {result.get('error')}", file=sys.stderr) + sys.exit(1) + +# IMPORTANT: Save the DB job_id for later update +db_job_id = result["job"]["id"] +print(f"DB job ready with ID: {db_job_id}") +PY + +if [ $? 
-ne 0 ]; then + echo "Failed to create/update DB job entry" + exit 1 +fi + +# Enhancement 4: Append to eval starts log (model retry tracking) +if [ -n "$EVAL_STARTS_LOG" ]; then + mkdir -p "$(dirname "$EVAL_STARTS_LOG")" + echo "$(date -Iseconds) $MODEL $REPO_ID $RUN_TAG" >> "$EVAL_STARTS_LOG" + echo "Logged eval start to: $EVAL_STARTS_LOG" +fi + +# Extract the job_id from Python output and save it +DB_JOB_ID=$(python - <<'PY' +import os, sys +from database.unified_db.utils import get_latest_job_for_model_benchmark + +model_hf = os.environ["MODEL"] +dataset_hf = os.environ["REPO_ID"] + +result = get_latest_job_for_model_benchmark(model_hf, dataset_hf) +if result and result.get("id"): + print(result["id"]) +else: + sys.exit(1) +PY +) + +if [ -z "$DB_JOB_ID" ]; then + echo "Failed to get DB job ID" + exit 1 +fi + +echo "DB job entry ready: $DB_JOB_ID" + +export HARBOR_JOBS_DIR="/tmp/harbor_jobs" +mkdir -p "$HARBOR_JOBS_DIR" + +# Let Terminus2 auto-resolve model_info for hosted_vllm (includes cost fields) +export TERMINUS_MODEL_MAX_TOKENS=32768 +export TERMINUS_MODEL_MAX_OUTPUT_TOKENS=16384 + +# Build harbor command with configurable params +HARBOR_CMD="harbor jobs start" +HARBOR_CMD="$HARBOR_CMD -p \"$DATASET_PATH\"" +HARBOR_CMD="$HARBOR_CMD --n-concurrent $N_CONCURRENT" +HARBOR_CMD="$HARBOR_CMD --agent \"$AGENT_NAME\"" +HARBOR_CMD="$HARBOR_CMD --model \"hosted_vllm/$MODEL\"" +HARBOR_CMD="$HARBOR_CMD --env \"daytona\"" +AUTO_SNAPSHOT="${EVAL_AUTO_SNAPSHOT:-false}" +HARBOR_CMD="$HARBOR_CMD --environment-kwarg auto_snapshot=$AUTO_SNAPSHOT" +HARBOR_CMD="$HARBOR_CMD --agent-kwarg \"api_base=http://localhost:8000/v1\"" +HARBOR_CMD="$HARBOR_CMD --agent-kwarg \"key=fake_key\"" +HARBOR_CMD="$HARBOR_CMD --agent-kwarg \"max_tokens=16384\"" +HARBOR_CMD="$HARBOR_CMD --agent-kwarg 'model_info={\"max_output_tokens\":16384,\"max_input_tokens\":32768,\"input_cost_per_token\":0.0,\"output_cost_per_token\":0.0}'" +HARBOR_CMD="$HARBOR_CMD --n-attempts $N_ATTEMPTS" 
+HARBOR_CMD="$HARBOR_CMD --export-traces" +HARBOR_CMD="$HARBOR_CMD --job-name \"$RUN_TAG\"" +CONFIG_YAML="${EVAL_CONFIG_YAML:-dcagent_eval_config.yaml}" +HARBOR_CMD="$HARBOR_CMD --config \"$DCFT/eval/MBZ/$CONFIG_YAML\"" + +# Add parser arg if specified (e.g., for swebench) +if [ -n "$AGENT_PARSER" ]; then + HARBOR_CMD="$HARBOR_CMD --agent-kwarg \"parser=$AGENT_PARSER\"" +fi + +# Add extra_body for thinking control (only if enable_thinking is true) +if [ "$ENABLE_THINKING" = "true" ]; then + HARBOR_CMD="$HARBOR_CMD --agent-kwarg 'extra_body={\"chat_template_kwargs\":{\"enable_thinking\":true}}'" + echo "Thinking blocks enabled via extra_body" +fi + +# Enhancement 5: Add timeout multiplier if != 1.0 +if [ "$TIMEOUT_MULTIPLIER" != "1.0" ]; then + HARBOR_CMD="$HARBOR_CMD --timeout-multiplier $TIMEOUT_MULTIPLIER" + echo "Using timeout multiplier: $TIMEOUT_MULTIPLIER" +fi + +echo "Running: $HARBOR_CMD" + +# Run sandbox evaluation +set +e +eval $HARBOR_CMD +SB_EXIT=$? +set -e + +# Save originals for exact round-trip later +mkdir -p "$RUN_DIR" +{ + echo "MODEL=$MODEL" + echo "REPO_ID=$REPO_ID" + echo "TIMESTAMP=$TIMESTAMP" + echo "SLURM_JOB_ID=$SLURM_JOB_ID" + echo "DB_JOB_ID=$DB_JOB_ID" + echo "RUN_TAG=$RUN_TAG" + echo "RUN_TAG_ARG=$RUN_TAG_ARG" + echo "N_CONCURRENT=$N_CONCURRENT" + echo "N_ATTEMPTS=$N_ATTEMPTS" + echo "GPU_MEMORY_UTIL=$GPU_MEMORY_UTIL" + echo "ERROR_THRESHOLD=$ERROR_THRESHOLD" + echo "AGENT_PARSER=$AGENT_PARSER" + echo "ENABLE_THINKING=$ENABLE_THINKING" + echo "TIMEOUT_MULTIPLIER=$TIMEOUT_MULTIPLIER" +} > "$RUN_DIR/meta.env" + +# If eval failed, don't attempt upload +if [ ${SB_EXIT:-0} -ne 0 ]; then + echo "harbor run exited with non-zero status: ${SB_EXIT}. Skipping upload." + exit ${SB_EXIT} +fi + +# Ensure run dir exists; no fallback +if [ ! 
-d "$RUN_DIR" ]; then + echo "Expected run directory not found: $RUN_DIR" + exit 2 +fi + + +# ---- Enhancement 1: Check for invalid errors before upload ---- +# Count ALL error types except AgentTimeoutError and ContextLengthExceededError and SummarizationTimeout and SummarizationTimeoutError (normal timeout) toward the threshold. +RESULT_FILE="$RUN_DIR/result.json" +ERROR_LOG="${EVAL_INVALID_ERRORS_LOG:-jobs/invalid_errors.log}" + +if [ -f "$RESULT_FILE" ]; then + echo "Checking for invalid errors in $RESULT_FILE..." + + # Count invalid errors using Python (all errors except AgentTimeoutError and ContextLengthExceededError and SummarizationTimeout and SummarizationTimeoutError) + INVALID_ERROR_COUNT=$(python3 -c " +import json +import sys + +VALID_ERROR_TYPES = {'AgentTimeoutError', 'ContextLengthExceededError', 'SummarizationTimeout', 'SummarizationTimeoutError'} + +try: + with open('$RESULT_FILE', 'r') as f: + data = json.load(f) + + invalid_trials = set() + error_counts = {} + + if 'stats' in data and 'evals' in data['stats']: + for eval_key, eval_data in data['stats']['evals'].items(): + if 'exception_stats' in eval_data: + for error_type, trial_names in eval_data['exception_stats'].items(): + if error_type not in VALID_ERROR_TYPES and isinstance(trial_names, list): + invalid_trials.update(trial_names) + error_counts[error_type] = error_counts.get(error_type, 0) + len(trial_names) + + print(len(invalid_trials)) + + # Log breakdown to stderr + if invalid_trials: + print(f'Found {len(invalid_trials)} unique invalid error trial(s):', file=sys.stderr) + for etype, count in sorted(error_counts.items(), key=lambda x: -x[1]): + print(f' {etype}: {count}', file=sys.stderr) + +except Exception as e: + print(f'Error parsing result.json: {e}', file=sys.stderr) + print('0') +" 2>&1 | tail -n 1) + + echo "Invalid error count: ${INVALID_ERROR_COUNT}" + + + # If too many invalid errors, log and skip upload + if [ "${INVALID_ERROR_COUNT:-0}" -gt "$ERROR_THRESHOLD" ]; then + 
echo "Job has ${INVALID_ERROR_COUNT} invalid errors (> ${ERROR_THRESHOLD}), skipping upload" + + # Log to error file + { + echo "===============================================" + echo "Timestamp: $(date)" + echo "Job: ${RUN_TAG}" + echo "SLURM_JOB_ID: ${SLURM_JOB_ID}" + echo "Model: ${MODEL}" + echo "Repo: ${REPO_ID}" + echo "Invalid Errors: ${INVALID_ERROR_COUNT}" + echo "Error Threshold: ${ERROR_THRESHOLD}" + echo "Result file: ${RESULT_FILE}" + + # Extract error details with breakdown by type + python3 -c " +import json + +VALID_ERROR_TYPES = {'AgentTimeoutError', 'ContextLengthExceededError', 'SummarizationTimeout', 'SummarizationTimeoutError'} + +with open('$RESULT_FILE', 'r') as f: + data = json.load(f) +if 'stats' in data and 'evals' in data['stats']: + for eval_key, eval_data in data['stats']['evals'].items(): + if 'exception_stats' in eval_data: + eval_has_errors = False + for error_type, trial_names in eval_data['exception_stats'].items(): + if error_type not in VALID_ERROR_TYPES and isinstance(trial_names, list) and trial_names: + if not eval_has_errors: + print(f'Eval: {eval_key}') + eval_has_errors = True + print(f' {error_type} ({len(trial_names)}):') + for i, tid in enumerate(trial_names[:5], 1): + print(f' {i}. {tid}') + if len(trial_names) > 5: + print(f' ... 
and {len(trial_names) - 5} more') +" + echo "===============================================" + } >> "$ERROR_LOG" + + echo "Error details logged to: $ERROR_LOG" + echo "Job completed but not uploaded due to excessive invalid errors" + exit 0 # Exit successfully but skip upload + fi +else + echo "Warning: result.json not found, continuing with upload" +fi + + + +# ---- Upload results to DB ---- + +export RUN_DIR="$RUN_DIR" +export UPLOAD_USERNAME="${EVAL_UPLOAD_USERNAME:-$USER}" +export UPLOAD_MODE="${UPLOAD_MODE:-skip_on_error}" +export RUN_TAG="$RUN_TAG" +UPLOAD_LOG="experiments/logs/upload_${SLURM_JOB_ID}.log" +mkdir -p "$(dirname "$UPLOAD_LOG")" + +echo "Uploading results from: $RUN_DIR" | tee -a "$UPLOAD_LOG" +echo "Using username=${UPLOAD_USERNAME}, mode=${UPLOAD_MODE}" | tee -a "$UPLOAD_LOG" + +# Run the uploader (uses database.unified_db from the repo) +python - <<'PY' 2>&1 | tee -a "$UPLOAD_LOG" +import os, sys +from database.unified_db.utils import upload_eval_results +import re +import hashlib + + +def sanitize_hf_repo_id(repo_id: str, max_length: int = 96) -> str: + """ + Sanitize a Hugging Face repo_id to comply with naming rules. + Keeps org prefix (e.g. 'mlfoundations-dev/') and cleans up the rest. + No extra '-' before hash suffix. 
+ """ + def collapse(s: str) -> str: + prev = None + while s != prev: + prev = s + s = s.replace("--", "-").replace("..", ".") + return s + + org, name = repo_id.split("/", 1) if "/" in repo_id else (None, repo_id) + + name = re.sub(r"[^A-Za-z0-9._-]", "-", name) + name = collapse(name).strip("-.") + + if not name: + name = "repo" + + limit = max_length - (len(org) + 1 if org else 0) + if len(name) > limit: + digest = hashlib.sha1(name.encode()).hexdigest()[:8] + keep = max(1, limit - len(digest)) + base = name[:keep].rstrip("-.") + if not base: + base = "r" + name = f"{base}{digest}" # no '-' before hash + name = collapse(name).strip("-.") + + # final cleanup + name = collapse(name).strip("-.") + if name[0] in "-.": + name = "r" + name[1:] + if name[-1] in "-.": + name = name[:-1] + "0" + + return f"{org}/{name}" if org else name + + +run_dir = os.environ["RUN_DIR"] +run_tag = os.environ["RUN_TAG"] +username = os.environ.get("UPLOAD_USERNAME", os.environ.get("USER", "unknown")) +error_mode= os.environ.get("UPLOAD_MODE", "skip_on_error") +hf_repo_id = sanitize_hf_repo_id(f"DCAgent2/{run_tag}") +hf_token = os.environ["HF_TOKEN"] +print(f"[uploader] upload_eval_results(path={run_dir!r}, username={username!r}, error_mode={error_mode!r}, hf_repo_id={hf_repo_id!r})") +upload_eval_results(run_dir, username=username, error_mode=error_mode, hf_token=hf_token, hf_repo_id=hf_repo_id, register_benchmark=True) +print("[uploader] done.") +PY +UPLOAD_EXIT=${PIPESTATUS[0]} + +if [ $UPLOAD_EXIT -ne 0 ]; then + echo "Upload failed with exit code: $UPLOAD_EXIT" + exit $UPLOAD_EXIT +fi + +echo "Eval and upload finished successfully." 
diff --git a/eval/legacy/unified_eval_harbor_v6_mbz.sbatch b/eval/legacy/unified_eval_harbor_v6_mbz.sbatch new file mode 100644 index 00000000..dcd66d79 --- /dev/null +++ b/eval/legacy/unified_eval_harbor_v6_mbz.sbatch @@ -0,0 +1,713 @@ +#!/bin/bash +#SBATCH -p main +#SBATCH --no-requeue +#SBATCH --time=24:00:00 +#SBATCH --nodes 1 +#SBATCH --ntasks-per-node 1 +#SBATCH --cpus-per-task=32 +#SBATCH --gres=gpu:1 +#SBATCH --output=experiments/logs/%x_%j.out +#SBATCH --job-name=terminal + +# ============================================================================== +# Unified Eval Harbor Script v6 — MBZ H200 Cluster +# ============================================================================== +# Adapted from v4 + Jupiter sbatch improvements: +# - Resume logic: if job dir already exists, resume failed Daytona/env trials +# - build_vllm_cmd.sh: shared vLLM command builder (replaces inline args) +# - Daytona API key rotation: 3:1 weighted between two orgs +# - Benchmark name mapping: canonical DB names with timeout/memory suffixes +# +# Positional Args: +# $1 = MODEL (HuggingFace model name, e.g., "org/model-name") +# $2 = REPO_ID (HuggingFace dataset repo, e.g., "DCAgent/dev_set_v2") +# $3 = BENCHMARK_ID (optional, DB benchmark UUID) +# $4 = RUN_TAG (optional, job name - if provided, updates Pending job to Started) +# +# Environment Variables (with defaults): +# EVAL_N_CONCURRENT Harbor --n-concurrent (default: 64) +# EVAL_N_ATTEMPTS Harbor --n-attempts (default: 3) +# EVAL_GPU_MEMORY_UTIL VLLM --gpu-memory-utilization (default: 0.9) +# EVAL_DAYTONA_THRESHOLD Max invalid errors before abort (default: 3) +# EVAL_VLLM_MAX_RETRIES VLLM startup retries (default: 5) +# EVAL_AGENT_PARSER Agent parser type (default: "", use "xml" for swebench) +# EVAL_ENABLE_THINKING Enable thinking blocks (default: false) +# EVAL_AGENT_NAME Agent name for harbor and DB entries (default: "terminus-2") +# EVAL_STARTS_LOG Path to shared eval starts log file (default: "") +# 
# EVAL_TIMEOUT_MULTIPLIER          Harbor timeout multiplier (default: 1.0)
# EVAL_VLLM_TENSOR_PARALLEL_SIZE   vLLM --tensor-parallel-size (default: 1)
# EVAL_INVALID_ERRORS_LOG          Path to invalid errors log (default: jobs/invalid_errors.log)
# EVAL_VLLM_* env vars             Per-model overrides from baseline_model_configs.yaml
# ==============================================================================

set -eo pipefail
ulimit -c 0  # Disable core dumps
ulimit -n 65536 2>/dev/null || true

# === CONFIGURABLE PARAMETERS (with defaults) ===
N_CONCURRENT="${EVAL_N_CONCURRENT:-64}"
N_ATTEMPTS="${EVAL_N_ATTEMPTS:-3}"
GPU_MEMORY_UTIL="${EVAL_GPU_MEMORY_UTIL:-0.9}"
ERROR_THRESHOLD="${EVAL_DAYTONA_THRESHOLD:-3}"
VLLM_MAX_RETRIES="${EVAL_VLLM_MAX_RETRIES:-5}"
AGENT_PARSER="${EVAL_AGENT_PARSER:-}"
ENABLE_THINKING="${EVAL_ENABLE_THINKING:-false}"
AGENT_NAME="${EVAL_AGENT_NAME:-terminus-2}"
EVAL_STARTS_LOG="${EVAL_STARTS_LOG:-}"
TIMEOUT_MULTIPLIER="${EVAL_TIMEOUT_MULTIPLIER:-1.0}"
TENSOR_PARALLEL_SIZE="${EVAL_VLLM_TENSOR_PARALLEL_SIZE:-1}"

# === POSITIONAL ARGS ===
MODEL="${1:-mlfoundations-dev/claude_3_7_20250219_tbench_traces_sharegptv1}"
REPO_ID="${2:-DCAgent/dev_set_v2}"
BENCHMARK_ID="${3:-}"
RUN_TAG_ARG="${4:-}"  # Optional: job name from listener (if provided, Pending entry exists)

# Create timestamp and safe names (slashes/colons are not filesystem-safe)
TIMESTAMP=$(date +'%Y%m%d_%H%M%S')
SAFE_MODEL=$(echo "$MODEL" | tr '/:' '_')
if [[ "$REPO_ID" == /* ]]; then
    SAFE_REPO=$(basename "$REPO_ID")
else
    SAFE_REPO=$(echo "$REPO_ID" | tr '/:' '_')
fi

echo "=============================================="
echo "Unified Eval Harbor v6 - Starting job (MBZ H200)"
echo "=============================================="
echo "Model: $MODEL"
echo "Repository: $REPO_ID"
echo "Benchmark ID: ${BENCHMARK_ID:-}"
echo ""
echo "Parameters:"
echo "  N_CONCURRENT: $N_CONCURRENT"
echo "  N_ATTEMPTS: $N_ATTEMPTS"
echo "  GPU_MEMORY_UTIL: $GPU_MEMORY_UTIL"
echo "  ERROR_THRESHOLD: $ERROR_THRESHOLD"
echo "  VLLM_MAX_RETRIES: $VLLM_MAX_RETRIES"
echo "  AGENT_PARSER: ${AGENT_PARSER:-}"
echo "  ENABLE_THINKING: $ENABLE_THINKING"
echo "  AGENT_NAME: $AGENT_NAME"
echo "  TIMEOUT_MULTIPLIER: $TIMEOUT_MULTIPLIER"
echo "  TENSOR_PARALLEL_SIZE: $TENSOR_PARALLEL_SIZE"
echo "  EVAL_STARTS_LOG: ${EVAL_STARTS_LOG:-}"
echo "=============================================="

# ==============================================================================
# MBZ Environment Setup
# ==============================================================================

SCRATCH="/mnt/weka/home/richard.zhuang"
DCFT="$SCRATCH/OpenThoughts-Agent"

# Reset PYTHONPATH to avoid inheriting submitter's environment
unset PYTHONPATH

# Set up environment variables
export VLLM_USE_V1=1
export RAY_RUNTIME_ENV_HOOK=ray._private.runtime_env.uv_runtime_env_hook.hook
export RAY_ADDRESS=${RAY_ADDRESS:-}
export RAY_DEDUP_LOGS=0
export VLLM_CACHE_ROOT="$SCRATCH/.cache/vllm"
export VLLM_CONFIG_ROOT="$SCRATCH/.cache/vllm_config"
export TRITON_CACHE_DIR="$SCRATCH/.cache/triton"
export FLASHINFER_WORKSPACE_BASE="$SCRATCH/.cache/flashinfer"
export UV_CACHE_DIR="$SCRATCH/.cache/uv"
export HYDRA_FULL_ERROR=1
export HF_CACHE_DIR="$SCRATCH/.cache/huggingface"
export HF_HUB_CACHE="$HF_CACHE_DIR/hub"
export HF_HOME="$HF_CACHE_DIR"
# XET staging cache — must be user-writable (shared HF_HOME/xet/ has permission issues)
export HF_XET_CACHE="$SCRATCH/.cache/hf_xet_cache"
mkdir -p "$HF_XET_CACHE"
export LITELLM_LOCAL_MODEL_COST_MAP=True
export PYTHONPATH="${DCFT}${PYTHONPATH:+:$PYTHONPATH}"

# DB/API secrets
SECRETS_FILE="${EVAL_SECRETS_FILE:-$HOME/secrets.env}"
echo "Sourcing secrets from: $SECRETS_FILE"
source "$SECRETS_FILE"

# Daytona API key comes from secrets.env (single org)

# Conda env (configurable via EVAL_CONDA_ENV, default: otagent)
CONDA_ENV="${EVAL_CONDA_ENV:-otagent}"
eval "$(conda shell.bash hook 2>/dev/null)" || source "$SCRATCH/miniconda3/etc/profile.d/conda.sh"
conda activate "$CONDA_ENV"
echo "Conda env: $CONDA_ENV"

# Create cache dirs
mkdir -p "$VLLM_CACHE_ROOT" "$VLLM_CONFIG_ROOT" "$TRITON_CACHE_DIR" \
    "$FLASHINFER_WORKSPACE_BASE" "$UV_CACHE_DIR" "$HF_CACHE_DIR" \
    experiments/logs

# ==============================================================================
# PRE-FLIGHT: Validate model architecture before starting vLLM
# ==============================================================================
# FIX: with `set -e` active, a trailing `if [ $? -ne 0 ]` after the heredoc is
# dead code — a non-zero python exit terminates the script before the check, so
# the FATAL message could never print. Attach the failure handler to the
# command itself so it actually runs.
echo "Validating model architecture..."
python - "$MODEL" <<'PREFLIGHT' || { echo "FATAL: Model architecture not supported. Aborting job."; exit 1; }
import json, sys, os

model = sys.argv[1] if len(sys.argv) > 1 and sys.argv[1] else ""
if not model:
    print("WARNING: No MODEL set, skipping preflight check")
    sys.exit(0)

# Blocklist of architecture prefixes that vLLM cannot serve
UNSUPPORTED_PREFIXES = ["FSDP"]

try:
    from huggingface_hub import hf_hub_download
    cfg_path = hf_hub_download(model, "config.json")
    with open(cfg_path) as f:
        cfg = json.load(f)
    archs = cfg.get("architectures", [])
    print(f"Model architectures: {archs}")
    for arch in archs:
        for prefix in UNSUPPORTED_PREFIXES:
            if arch.startswith(prefix):
                print(f"FATAL: Architecture '{arch}' is not supported by vLLM.", file=sys.stderr)
                print(f"This model appears to be a raw training checkpoint (e.g. FSDP).", file=sys.stderr)
                print(f"Convert it to standard HuggingFace format before evaluating.", file=sys.stderr)
                sys.exit(1)
except Exception as e:
    # Best-effort check: network/cache failures must not block the eval
    print(f"WARNING: Preflight check could not validate model: {e}")
    sys.exit(0)
PREFLIGHT

# ==============================================================================
# Fix: Qwen3.5 VL fine-tunes missing preprocessor_config.json
# ==============================================================================
# Qwen3.5 is a VL model (Qwen3_5ForConditionalGeneration). Text-only fine-tunes
# are missing preprocessor_config.json, which vLLM's multimodal pipeline needs.
# We copy it from the base model, then force offline mode so transformers uses
# the local cache instead of re-checking the HF Hub remote file list.
echo "Checking for Qwen3.5 preprocessor fix..."
MODEL_SNAP_DIR=$(python3 -c "
import sys
from huggingface_hub import snapshot_download
path = snapshot_download('$MODEL')
sys.stdout.write(path)
" 2>/dev/null) || true

if [ -n "$MODEL_SNAP_DIR" ] && [ -f "$MODEL_SNAP_DIR/config.json" ]; then
    # Check if this is a Qwen3.5 VL architecture (needs preprocessor_config.json + offline mode)
    IS_QWEN35_VL=$(python3 -c "
import json
with open('$MODEL_SNAP_DIR/config.json') as f:
    cfg = json.load(f)
archs = cfg.get('architectures', [])
if any('Qwen3_5ForConditional' in a for a in archs):
    print('yes')
" 2>/dev/null)

    if [ "$IS_QWEN35_VL" = "yes" ]; then
        # Copy preprocessor_config.json from base model if missing
        if [ ! -f "$MODEL_SNAP_DIR/preprocessor_config.json" ]; then
            echo "Qwen3.5 VL fine-tune missing preprocessor_config.json — copying from base model"
            BASE_PREP=$(python3 -c "
from huggingface_hub import hf_hub_download
print(hf_hub_download('Qwen/Qwen3.5-9B', 'preprocessor_config.json'))
" 2>/dev/null)
            if [ -n "$BASE_PREP" ] && [ -f "$BASE_PREP" ]; then
                cp "$BASE_PREP" "$MODEL_SNAP_DIR/preprocessor_config.json"
                echo "Copied preprocessor_config.json to $MODEL_SNAP_DIR"
            fi
        else
            echo "preprocessor_config.json already present in snapshot"
        fi
        # Force offline so transformers uses local cache (doesn't re-check HF Hub file list)
        export HF_HUB_OFFLINE=1
        echo "Set HF_HUB_OFFLINE=1 for Qwen3.5 VL model loading"
    fi
fi

# ==============================================================================
# Start vLLM Server (using shared build_vllm_cmd.sh)
# ==============================================================================
echo "Building vLLM command..."
source "$DCFT/eval/build_vllm_cmd.sh"
build_vllm_cmd "python" "$MODEL" "$GPU_MEMORY_UTIL"
# MBZ: add --enforce-eager for compatibility (not in shared script)
VLLM_CMD+=(--enforce-eager)

echo "Starting vLLM server..."
env TORCHDYNAMO_DISABLE=1 \
    TMPDIR="/tmp" \
    TRITON_CACHE_DIR="$TRITON_CACHE_DIR" \
    TORCH_COMPILE_CACHE_DIR="/tmp/torch_cache_${USER}" \
    TORCHINDUCTOR_CACHE_DIR="/tmp/torchinductor_${USER}" \
    HF_HOME="$HF_HOME" \
    HF_HUB_CACHE="$HF_HUB_CACHE" \
    "${VLLM_CMD[@]}" \
    > "experiments/logs/vllm_${SLURM_JOB_ID}.log" 2>&1 &
VLLM_PID=$!

# Kill the vLLM server whenever this job exits, success or failure
cleanup() {
    echo "Cleaning up..."
    kill $VLLM_PID 2>/dev/null || true
    conda deactivate || true
}
trap cleanup EXIT

# Wait for vLLM server to start with healthcheck
RETRY_INTERVAL=100
for i in $(seq 1 $VLLM_MAX_RETRIES); do
    if curl -s http://localhost:8000/v1/models > /dev/null; then
        echo "vLLM server is ready"
        break
    fi
    echo "Waiting for vLLM server to start (attempt $i/$VLLM_MAX_RETRIES)..."
    sleep $RETRY_INTERVAL
    if [ $i -eq $VLLM_MAX_RETRIES ]; then
        echo "vLLM server failed to start"
        tail -50 "experiments/logs/vllm_${SLURM_JOB_ID}.log" 2>/dev/null || true
        exit 1
    fi
done

# Restore online mode for dataset download and uploads
unset HF_HUB_OFFLINE

# ==============================================================================
# Download/locate dataset
# ==============================================================================
if [[ "$REPO_ID" == /* ]]; then
    echo "Using local dataset path: $REPO_ID"
    DATASET_PATH="$REPO_ID"
    if [ ! -d "$DATASET_PATH" ]; then
        echo "ERROR: Local dataset path does not exist: $DATASET_PATH"
        exit 1
    fi
else
    echo "Downloading/locating dataset: $REPO_ID"
    DATASET_PATH=$(python "$DCFT/eval/MBZ/snapshot_download.py" "$REPO_ID" | grep DATASET_PATH | tail -n 1 | cut -d'=' -f2)
    if [ -z "${DATASET_PATH:-}" ]; then
        echo "Failed to get dataset path"
        exit 1
    fi
fi
echo "Using dataset path: $DATASET_PATH"

# ==============================================================================
# Construct run tag and directory
# ==============================================================================
if [ -n "$RUN_TAG_ARG" ]; then
    RUN_TAG="$RUN_TAG_ARG"
    echo "Using RUN_TAG from listener: $RUN_TAG"
else
    RUN_TAG="${SAFE_REPO}_${SAFE_MODEL}_${TIMESTAMP}"
    echo "Generated new RUN_TAG: $RUN_TAG"
fi
RUN_DIR="jobs/${RUN_TAG}"
echo "Job dir: ${RUN_DIR}"

# ==============================================================================
# Compute canonical benchmark name for DB uploads
# ==============================================================================
declare -A BENCHMARK_NAME_MAP=(
    ["DCAgent2_aider_polyglot"]="aider_polyglot"
    ["DCAgent_dev_set_v2"]="dev_set_v2"
    ["DCAgent_dev_set_71_tasks"]="dev_set_71_tasks"
    ["DCAgent2_terminal_bench_2"]="terminal_bench_2"
    ["DCAgent_swebench_verified_eval_set"]="swebench-verified-random-100-folders"
    ["DCAgent2_bfcl-parity"]="bfcl-parity"
)
BASE_NAME="${BENCHMARK_NAME_MAP[$SAFE_REPO]:-$SAFE_REPO}"

# Append suffix for non-default timeout/memory configs
BENCHMARK_SUFFIX=""
if [ -n "${EVAL_OVERRIDE_MEMORY_MB:-}" ] && [ "$EVAL_OVERRIDE_MEMORY_MB" != "1024" ]; then
    mem_gb=$(( EVAL_OVERRIDE_MEMORY_MB / 1024 ))
    BENCHMARK_SUFFIX="${BENCHMARK_SUFFIX}_${mem_gb}gb"
fi
if [ -n "${EVAL_TIMEOUT_MULTIPLIER:-}" ] && [ "$EVAL_TIMEOUT_MULTIPLIER" != "1" ] && [ "$EVAL_TIMEOUT_MULTIPLIER" != "1.0" ]; then
    BENCHMARK_SUFFIX="${BENCHMARK_SUFFIX}_${EVAL_TIMEOUT_MULTIPLIER}x"
fi
if [ -n "$BENCHMARK_SUFFIX" ]; then
    BENCHMARK_NAME="${BASE_NAME}${BENCHMARK_SUFFIX}"
else
    BENCHMARK_NAME="$BASE_NAME"
fi
export BENCHMARK_NAME
echo "Benchmark name: $BENCHMARK_NAME"

# ==============================================================================
# Update DB: Started
# ==============================================================================
echo "Updating/Creating DB job entry..."
export MODEL REPO_ID RUN_TAG RUN_TAG_ARG SLURM_JOB_ID N_CONCURRENT N_ATTEMPTS AGENT_NAME TIMEOUT_MULTIPLIER

HARBOR_VERSION=$(python -c "import harbor; print(harbor.__version__)" 2>/dev/null || echo "unknown")
export HARBOR_VERSION

# FIX: under `set -e` a post-hoc `if [ $? -ne 0 ]` is dead code (the script
# would already have exited), so the failure message never printed. Attach the
# handler to the command itself.
python - <<'PY' || { echo "Failed to create/update DB job entry"; exit 1; }
import os, sys, json
from database.unified_db.utils import create_job_entry_started, update_job_status_to_started

model_hf = os.environ["MODEL"]
dataset_hf = os.environ["REPO_ID"]
run_tag = os.environ["RUN_TAG"]
run_tag_arg = os.environ.get("RUN_TAG_ARG", "")
slurm_job_id = os.environ["SLURM_JOB_ID"]
harbor_version = os.environ.get("HARBOR_VERSION", "unknown")
n_concurrent = int(os.environ.get("N_CONCURRENT", "64"))
n_attempts = int(os.environ.get("N_ATTEMPTS", "3"))
agent_name = os.environ.get("AGENT_NAME", "terminus-2")
timeout_multiplier = float(os.environ.get("TIMEOUT_MULTIPLIER", "1.0"))

config = {"agent": agent_name, "env": "daytona", "timeout_multiplier": timeout_multiplier}

result = None
if run_tag_arg:
    # Listener pre-created a Pending entry; promote it to Started
    print(f"Updating Pending job to Started: {run_tag}")
    result = update_job_status_to_started(
        job_name=run_tag,
        n_trials=n_concurrent,
        n_rep_eval=n_attempts,
        config=config,
        harbor_package_version=harbor_version
    )
    if not result.get("success"):
        print(f"WARNING: Pending entry not found ({result.get('error')}), creating Started entry directly")
        result = None

if result is None:
    print(f"Creating new Started job entry: {run_tag}")
    result = create_job_entry_started(
        model_hf_name=model_hf,
        benchmark_hf_name=dataset_hf,
        job_name=run_tag,
        username=os.environ.get("UPLOAD_USERNAME", os.environ.get("USER", "unknown")),
        slurm_job_id=slurm_job_id,
        harbor_package_version=harbor_version,
        agent_name=agent_name,
        config=config,
        n_trials=n_concurrent,
        n_rep_eval=n_attempts
    )

if not result.get("success"):
    print(f"ERROR: {result.get('error')}", file=sys.stderr)
    sys.exit(1)

db_job_id = result["job"]["id"]
print(f"DB job ready with ID: {db_job_id}")
PY

# Enhancement 4: Append to eval starts log (model retry tracking)
if [ -n "$EVAL_STARTS_LOG" ]; then
    mkdir -p "$(dirname "$EVAL_STARTS_LOG")"
    echo "$(date -Iseconds) $MODEL $REPO_ID $RUN_TAG" >> "$EVAL_STARTS_LOG"
    echo "Logged eval start to: $EVAL_STARTS_LOG"
fi

# Extract the DB job ID
# FIX: `|| true` so a non-zero python exit does not kill the script via errexit
# before the empty-ID guard below can print its diagnostic.
DB_JOB_ID=$(python - <<'PY'
import os, sys
from database.unified_db.utils import get_latest_job_for_model_benchmark

model_hf = os.environ["MODEL"]
dataset_hf = os.environ["REPO_ID"]

result = get_latest_job_for_model_benchmark(model_hf, dataset_hf)
if result and result.get("id"):
    print(result["id"])
else:
    sys.exit(1)
PY
) || true

if [ -z "$DB_JOB_ID" ]; then
    echo "Failed to get DB job ID"
    exit 1
fi

echo "DB job entry ready: $DB_JOB_ID"

export HARBOR_JOBS_DIR="/tmp/harbor_jobs"
mkdir -p "$HARBOR_JOBS_DIR"

export TERMINUS_MODEL_MAX_TOKENS=32768
export TERMINUS_MODEL_MAX_OUTPUT_TOKENS=16384

# ==============================================================================
# Run Harbor Eval (with resume support)
# ==============================================================================

# Build harbor start command
HARBOR_CMD="harbor jobs start"
HARBOR_CMD="$HARBOR_CMD -p \"$DATASET_PATH\""
HARBOR_CMD="$HARBOR_CMD --n-concurrent $N_CONCURRENT"
HARBOR_CMD="$HARBOR_CMD --agent \"$AGENT_NAME\""
HARBOR_CMD="$HARBOR_CMD --model \"hosted_vllm/$MODEL\""
HARBOR_CMD="$HARBOR_CMD --env \"daytona\""
AUTO_SNAPSHOT="${EVAL_AUTO_SNAPSHOT:-false}"
HARBOR_CMD="$HARBOR_CMD --environment-kwarg auto_snapshot=$AUTO_SNAPSHOT"
HARBOR_CMD="$HARBOR_CMD --agent-kwarg \"api_base=http://localhost:8000/v1\""
HARBOR_CMD="$HARBOR_CMD --agent-kwarg \"key=fake_key\""
HARBOR_CMD="$HARBOR_CMD --agent-kwarg \"max_tokens=16384\""
HARBOR_CMD="$HARBOR_CMD --agent-kwarg 'model_info={\"max_output_tokens\":16384,\"max_input_tokens\":32768,\"input_cost_per_token\":0.0,\"output_cost_per_token\":0.0}'"
HARBOR_CMD="$HARBOR_CMD --n-attempts $N_ATTEMPTS"
HARBOR_CMD="$HARBOR_CMD --export-traces"
HARBOR_CMD="$HARBOR_CMD --job-name \"$RUN_TAG\""
CONFIG_YAML="${EVAL_CONFIG_YAML:-dcagent_eval_config.yaml}"
HARBOR_CMD="$HARBOR_CMD --config \"$DCFT/eval/MBZ/$CONFIG_YAML\""

if [ -n "$AGENT_PARSER" ]; then
    HARBOR_CMD="$HARBOR_CMD --agent-kwarg \"parser=$AGENT_PARSER\""
fi

if [ "$ENABLE_THINKING" = "true" ]; then
    HARBOR_CMD="$HARBOR_CMD --agent-kwarg 'extra_body={\"chat_template_kwargs\":{\"enable_thinking\":true}}'"
    echo "Thinking blocks enabled via extra_body"
fi

if [ "$TIMEOUT_MULTIPLIER" != "1.0" ]; then
    HARBOR_CMD="$HARBOR_CMD --timeout-multiplier $TIMEOUT_MULTIPLIER"
    echo "Using timeout multiplier: $TIMEOUT_MULTIPLIER"
fi

# Resume logic: if a previous job dir exists with config.json, resume instead of starting fresh
set +e
if [ -d "$RUN_DIR" ] && [ -f "$RUN_DIR/config.json" ]; then
    echo "Found existing job dir, resuming: $RUN_DIR"
    harbor jobs resume \
        -p "$RUN_DIR" \
        --filter-error-type EnvironmentStartTimeoutError \
        --filter-error-type DaytonaError \
        --filter-error-type DaytonaRateLimitError \
        --filter-error-type AgentEnvironmentTimeoutError
else
    echo "Starting new job"
    echo "Running: $HARBOR_CMD"
    eval $HARBOR_CMD
fi
SB_EXIT=$?
set -e

# Save meta.env
mkdir -p "$RUN_DIR"
{
    echo "MODEL=$MODEL"
    echo "REPO_ID=$REPO_ID"
    echo "TIMESTAMP=$TIMESTAMP"
    echo "SLURM_JOB_ID=$SLURM_JOB_ID"
    echo "DB_JOB_ID=$DB_JOB_ID"
    echo "RUN_TAG=$RUN_TAG"
    echo "RUN_TAG_ARG=$RUN_TAG_ARG"
    echo "N_CONCURRENT=$N_CONCURRENT"
    echo "N_ATTEMPTS=$N_ATTEMPTS"
    echo "GPU_MEMORY_UTIL=$GPU_MEMORY_UTIL"
    echo "ERROR_THRESHOLD=$ERROR_THRESHOLD"
    echo "AGENT_PARSER=$AGENT_PARSER"
    echo "ENABLE_THINKING=$ENABLE_THINKING"
    echo "TIMEOUT_MULTIPLIER=$TIMEOUT_MULTIPLIER"
    echo "BENCHMARK_NAME=$BENCHMARK_NAME"
} > "$RUN_DIR/meta.env"

# If eval failed, don't attempt upload
if [ ${SB_EXIT:-0} -ne 0 ]; then
    echo "harbor run exited with non-zero status: ${SB_EXIT}. Skipping upload."
    exit ${SB_EXIT}
fi

if [ ! -d "$RUN_DIR" ]; then
    echo "Expected run directory not found: $RUN_DIR"
    exit 2
fi

# ==============================================================================
# Check for invalid errors before upload
# ==============================================================================
RESULT_FILE="$RUN_DIR/result.json"
ERROR_LOG="${EVAL_INVALID_ERRORS_LOG:-jobs/invalid_errors.log}"

if [ -f "$RESULT_FILE" ]; then
    echo "Checking for invalid errors in $RESULT_FILE..."

    # FIX: do NOT merge stderr into the substitution (previously `2>&1`).
    # The helper prints the numeric count on stdout and per-error diagnostics
    # on stderr; with the streams merged, `tail -n 1` could capture a
    # diagnostic line instead of the count, breaking the integer comparison
    # below. Diagnostics now flow straight to the job log.
    INVALID_ERROR_COUNT=$(python3 -c "
import json
import sys

VALID_ERROR_TYPES = {'AgentTimeoutError', 'ContextLengthExceededError', 'SummarizationTimeout', 'SummarizationTimeoutError'}

try:
    with open('$RESULT_FILE', 'r') as f:
        data = json.load(f)

    invalid_trials = set()
    error_counts = {}

    if 'stats' in data and 'evals' in data['stats']:
        for eval_key, eval_data in data['stats']['evals'].items():
            if 'exception_stats' in eval_data:
                for error_type, trial_names in eval_data['exception_stats'].items():
                    if error_type not in VALID_ERROR_TYPES and isinstance(trial_names, list):
                        invalid_trials.update(trial_names)
                        error_counts[error_type] = error_counts.get(error_type, 0) + len(trial_names)

    print(len(invalid_trials))

    if invalid_trials:
        print(f'Found {len(invalid_trials)} unique invalid error trial(s):', file=sys.stderr)
        for etype, count in sorted(error_counts.items(), key=lambda x: -x[1]):
            print(f'  {etype}: {count}', file=sys.stderr)

except Exception as e:
    print(f'Error parsing result.json: {e}', file=sys.stderr)
    print('0')
" | tail -n 1)

    echo "Invalid error count: ${INVALID_ERROR_COUNT}"

    if [ "${INVALID_ERROR_COUNT:-0}" -gt "$ERROR_THRESHOLD" ]; then
        echo "Job has ${INVALID_ERROR_COUNT} invalid errors (> ${ERROR_THRESHOLD}), skipping upload"

        {
            echo "==============================================="
            echo "Timestamp: $(date)"
            echo "Job: ${RUN_TAG}"
            echo "SLURM_JOB_ID: ${SLURM_JOB_ID}"
            echo "Model: ${MODEL}"
            echo "Repo: ${REPO_ID}"
            echo "Invalid Errors: ${INVALID_ERROR_COUNT}"
            echo "Error Threshold: ${ERROR_THRESHOLD}"
            echo "Result file: ${RESULT_FILE}"

            python3 -c "
import json

VALID_ERROR_TYPES = {'AgentTimeoutError', 'ContextLengthExceededError', 'SummarizationTimeout', 'SummarizationTimeoutError'}

with open('$RESULT_FILE', 'r') as f:
    data = json.load(f)
if 'stats' in data and 'evals' in data['stats']:
    for eval_key, eval_data in data['stats']['evals'].items():
        if 'exception_stats' in eval_data:
            eval_has_errors = False
            for error_type, trial_names in eval_data['exception_stats'].items():
                if error_type not in VALID_ERROR_TYPES and isinstance(trial_names, list) and trial_names:
                    if not eval_has_errors:
                        print(f'Eval: {eval_key}')
                        eval_has_errors = True
                    print(f'  {error_type} ({len(trial_names)}):')
                    for i, tid in enumerate(trial_names[:5], 1):
                        print(f'    {i}. {tid}')
                    if len(trial_names) > 5:
                        print(f'    ... and {len(trial_names) - 5} more')
"
            echo "==============================================="
        } >> "$ERROR_LOG"

        echo "Error details logged to: $ERROR_LOG"
        echo "Job completed but not uploaded due to excessive invalid errors"
        exit 0
    fi
else
    echo "Warning: result.json not found, continuing with upload"
fi

# ==============================================================================
# Upload results to DB
# ==============================================================================
export RUN_DIR="$RUN_DIR"
export UPLOAD_USERNAME="${EVAL_UPLOAD_USERNAME:-$USER}"
export UPLOAD_MODE="${UPLOAD_MODE:-skip_on_error}"
export RUN_TAG="$RUN_TAG"
export BENCHMARK_NAME
UPLOAD_LOG="experiments/logs/upload_${SLURM_JOB_ID}.log"
mkdir -p "$(dirname "$UPLOAD_LOG")"

echo "Uploading results from: $RUN_DIR" | tee -a "$UPLOAD_LOG"
echo "Using username=${UPLOAD_USERNAME}, mode=${UPLOAD_MODE}" | tee -a "$UPLOAD_LOG"

# FIX: with `set -e` + `pipefail` active, a failed upload pipeline killed the
# script before `UPLOAD_EXIT=${PIPESTATUS[0]}` could run, so the "Upload
# failed" diagnostic never printed. Disable errexit around the pipeline and
# capture the python stage's status explicitly.
set +e
python - <<'PY' 2>&1 | tee -a "$UPLOAD_LOG"
import os, sys, re, hashlib

from database.unified_db.utils import upload_eval_results


def sanitize_hf_repo_id(repo_id: str, max_length: int = 96) -> str:
    """Clamp a repo id to HF naming rules: allowed charset, length limit,
    no leading/trailing or doubled separators."""
    def collapse(s: str) -> str:
        prev = None
        while s != prev:
            prev = s
            s = s.replace("--", "-").replace("..", ".")
        return s

    org, name = repo_id.split("/", 1) if "/" in repo_id else (None, repo_id)
    name = re.sub(r"[^A-Za-z0-9._-]", "-", name)
    name = collapse(name).strip("-.")
    if not name:
        name = "repo"

    limit = max_length - (len(org) + 1 if org else 0)
    if len(name) > limit:
        # Truncate but keep a short digest so distinct long names stay distinct
        digest = hashlib.sha1(name.encode()).hexdigest()[:8]
        keep = max(1, limit - len(digest))
        base = name[:keep].rstrip("-.")
        if not base:
            base = "r"
        name = f"{base}{digest}"
        name = collapse(name).strip("-.")

    name = collapse(name).strip("-.")
    if name[0] in "-.":
        name = "r" + name[1:]
    if name[-1] in "-.":
        name = name[:-1] + "0"

    return f"{org}/{name}" if org else name


run_dir = os.environ["RUN_DIR"]
run_tag = os.environ["RUN_TAG"]
username = os.environ.get("UPLOAD_USERNAME", os.environ.get("USER", "unknown"))
error_mode = os.environ.get("UPLOAD_MODE", "skip_on_error")
hf_repo_id = sanitize_hf_repo_id(f"DCAgent2/{run_tag}")
hf_token = os.environ["HF_TOKEN"]

# Use canonical benchmark name computed in shell
benchmark_name = os.environ.get("BENCHMARK_NAME", "")
if not benchmark_name:
    dataset_hf = os.environ.get("REPO_ID", "")
    benchmark_name = dataset_hf.split("/")[-1] if "/" in dataset_hf else dataset_hf

benchmark_version_hash = hashlib.sha256(benchmark_name.encode()).hexdigest()
print(f"[uploader] benchmark_name={benchmark_name!r}, version_hash={benchmark_version_hash[:16]}...")

print(f"[uploader] upload_eval_results(path={run_dir!r}, username={username!r}, "
      f"error_mode={error_mode!r}, hf_repo_id={hf_repo_id!r})")
upload_eval_results(
    run_dir,
    username=username,
    error_mode=error_mode,
    hf_token=hf_token,
    hf_repo_id=hf_repo_id,
    register_benchmark=True,
    benchmark_name=benchmark_name,
    benchmark_version_hash=benchmark_version_hash,
)
print("[uploader] done.")
PY
UPLOAD_EXIT=${PIPESTATUS[0]}
set -e

if [ $UPLOAD_EXIT -ne 0 ]; then
    echo "Upload failed with exit code: $UPLOAD_EXIT"
    exit $UPLOAD_EXIT
fi

echo "=============================================="
echo "Eval and upload finished successfully."
+echo "==============================================" diff --git a/eval/legacy/unified_eval_listener_v4.py b/eval/legacy/unified_eval_listener_v4.py new file mode 100644 index 00000000..6cc48860 --- /dev/null +++ b/eval/legacy/unified_eval_listener_v4.py @@ -0,0 +1,2890 @@ +#!/usr/bin/env python3 +""" +Unified Eval Listener v4 - Polls Supabase for models and submits SLURM eval jobs. + +Replaces all per-benchmark listener scripts with one configurable entry point. +Uses unified_eval_harbor_v6.sbatch as the SLURM job template. + + +=============================================================================== +FLAG REFERENCE +=============================================================================== + +--- Preset & Dataset Selection --- + +--preset, -p {aider,bfcl,swebench,v2,tb2,v1} + Load a named preset that bundles dataset, concurrency, error threshold, and + other defaults tuned for a specific benchmark. CLI flags override any preset + value. Almost all runs should start with a preset. + + Preset details: + aider Dataset: DCAgent2/aider_polyglot. n_concurrent=32, error_threshold=3, thinking=on. + bfcl Dataset: DCAgent2/bfcl-parity. n_concurrent=32, error_threshold=10, thinking=on, vllm_retries=20. + swebench Dataset: DCAgent2/swebench-verified-*. n_concurrent=32, error_threshold=15, thinking=on, vllm_retries=20, + agent_parser=xml, gpu_mem=0.95, config_yaml=no_override. HF existence check on. + v2 Dataset: DCAgent/dev_set_v2. n_concurrent=32, error_threshold=10, thinking=on, vllm_retries=20. + tb2 Dataset: DCAgent2/terminal_bench_2. n_concurrent=32, error_threshold=10, thinking=on, + gpu_mem=0.95, slurm_time=48h, config_yaml=no_override. + v1 Dataset: DCAgent/dev_set_71_tasks. n_concurrent=32, error_threshold=10, thinking=on, vllm_retries=20. + + Tuning: Pick the preset matching your benchmark. Override individual params + with CLI flags (e.g. --n-concurrent 64 to double concurrency). 
+ +--datasets, -d + Comma- or space-separated list of HuggingFace dataset repos. Overrides the + preset's dataset list. Use this for one-off evals against custom datasets. + Example: --datasets "DCAgent/dev_set_v2,DCAgent2/terminal_bench_2" + +--sbatch-script, -s + Path to the sbatch template. Default: unified_eval_harbor_v6.sbatch (or + whatever the preset specifies). Only change this if you have a custom sbatch. + + +--- Model Filtering --- + +--priority-file + Text file listing HuggingFace model names (org/model), one per line. + Lines starting with # are comments; blank lines are ignored. + File order = submission priority: earlier lines are submitted first. + Hot-reloaded every iteration — edit the file without restarting the listener. + + Env: EVAL_LISTENER_PRIORITY_FILE + +--priority-mode {filter_only,priority_first} [default: filter_only] + filter_only — Only evaluate models IN the priority file. All others skipped. + priority_first — Evaluate ALL models, but submit priority models first. + + Tuning: Use filter_only (default) when you have a curated list of models to + evaluate. Use priority_first when you want to evaluate everything but ensure + specific models get SLURM slots first. + + Env: EVAL_LISTENER_PRIORITY_MODE + +--require-priority-list + Safety flag. If set and no priority file is loaded (missing file or empty), + the listener skips ALL models instead of evaluating everything. Prevents + accidental mass submissions when a priority file path is misconfigured. + + Env: EVAL_LISTENER_REQUIRE_PRIORITY_LIST="1" + +--blacklist-file [v4 NEW] + Text file listing models that should NEVER be submitted, same format as + --priority-file (one model per line, # comments, blank lines ignored). + Blacklist overrides priority: if a model appears in both files, it is blocked. + Hot-reloaded every iteration, same as --priority-file. + + Tuning: Use this to permanently exclude known-bad models (e.g. 
broken + checkpoints, models that consistently OOM, duplicates you don't want to + re-evaluate). Faster than removing them from the priority file because + the blacklist is checked first — no DB queries wasted on blocked models. + + Env: EVAL_LISTENER_BLACKLIST_FILE + +--check-hf-exists + Before submitting, validate that the model actually exists on HuggingFace Hub. + Adds a network round-trip per model but prevents wasted SLURM jobs on typos + or deleted models. The swebench preset enables this by default. + + Env: EVAL_LISTENER_CHECK_HF_EXISTS="1" + + +--- Timing & Lifecycle --- + +--lookback-days [default: 1000] + How far back to query the Supabase `models` table (by creation_time). + Priority models bypass this window — they are always fetched by name + regardless of when they were added. + + Tuning: Keep this large (default 1000) to catch old models. Reduce only if + DB queries are slow and you know all target models are recent. + + Env: EVAL_LISTENER_LOOKBACK_DAYS + +--check-hours [default: 4.0] + Hours to sleep between iterations. Each iteration re-queries the DB, hot- + reloads priority/blacklist files, and submits any new jobs. + + Tuning: For active development with frequent model uploads, use 1-2h. + For stable production runs, 4-12h is fine. Ignored when --once is set. + + Env: EVAL_LISTENER_CHECK_HOURS + +--stale-hours [default: 24] + A job in "Started" status older than this is considered stale and will be + resubmitted. Covers cases where the sbatch job crashed without updating + the DB to Finished. + + Tuning: Set to at least 1.5x your SLURM time limit. If --slurm-time is + 24:00:00, keep this at 24 (default). If you use --slurm-time 48:00:00 + (like tb2), bump to 48-72. + +--stale-pending-hours [default: 48] + A job in "Pending" status older than this is considered stale. The listener + will scancel the old SLURM job (if tracked) and resubmit. + + Tuning: Should be >= --stale-hours. 
Default of 48h gives Pending jobs extra + time to get through the SLURM queue before being killed. + + +--- Sbatch / vLLM Parameters (passed to sbatch via env vars) --- + +--n-concurrent [default: 64, preset overrides] + Number of concurrent Harbor evaluation jobs inside the sbatch. Controls how + many sandbox tasks run in parallel against the vLLM server. + + Tuning: Depends on model size and GPU memory. + - 7-8B models on GH200 (96GB): 32-64 is safe. + - 32B models: 8-16 (higher causes vLLM queue buildup → AgentTimeoutError). + - 131K context models: 4-8 (KV cache fills fast at high concurrency). + If you see many AgentTimeoutErrors, reduce this. If eval is slow and vLLM + GPU utilization is low, increase it. + +--n-attempts [default: 3] + Number of retry attempts per Harbor task. If a task fails (e.g. sandbox + timeout), Harbor retries it up to this many times. + + Tuning: 3 is good for most benchmarks. Raise to 5 for flaky benchmarks. + Lowering to 1 speeds up runs but increases noise from transient failures. + +--gpu-memory-util [default: 0.9] + Fraction of GPU memory allocated to vLLM via --gpu-memory-utilization. + + Tuning: + - 0.90 (default): safe for 7-8B models on GH200 (96GB). Leaves headroom + for GPU memory variance across nodes. + - 0.95: used by swebench/tb2 presets for larger models or when you need + maximum KV cache capacity. Risk: some GH200 nodes have slightly less + available memory and will OOM at 0.95 (use --exclude in sbatch). + - Never go above 0.95. Below 0.85 wastes memory. + +--error-threshold [default: 3, preset overrides] + Maximum number of "invalid" errors allowed before the sbatch script aborts + result upload. Invalid = any error type EXCEPT AgentTimeoutError, + ContextLengthExceededError, SummarizationTimeout, SummarizationTimeoutError. + + Tuning: Controls quality gating. Low values (3) are strict — a few + DaytonaErrors or unexpected crashes abort the upload. 
Higher values (10-15) + are more tolerant, appropriate for benchmarks where some sandbox flakiness + is expected. + - aider: 3 (strict, small dataset) + - v2/tb2: 10 (moderate, larger datasets with occasional flakes) + - swebench: 15 (lenient, swebench sandboxes are flakier) + + --daytona-threshold is a backward-compatible alias for this flag. + +--vllm-max-retries [default: 5, preset overrides] + Number of times the sbatch script retries starting the vLLM server. + vLLM occasionally fails to start on first attempt (port conflicts, + CUDA initialization issues). + + Tuning: 5 is fine for quick detection of real failures. Presets like v2 + and swebench use 20 for more resilience on busy clusters. + +--agent-parser [default: "" (none)] + Parser type for Harbor agent output. Set to "xml" for swebench (which + uses XML-structured agent responses). Leave empty for all other benchmarks. + + Tuning: Only change this if you're adding a new benchmark with a custom + agent output format. The swebench preset sets this automatically. + +--slurm-time [default: "24:00:00"] + SLURM wall-clock time limit for the sbatch job. Format: HH:MM:SS. + + Tuning: 24h is enough for most benchmarks. tb2 preset uses 48h because + terminal_bench_2 tasks are longer-running. If jobs are hitting the time + limit and getting killed, increase this and also bump --stale-hours. + +--slurm-partition [default: "gh"] + SLURM partition to submit jobs to. On TACC, "gh" is the GH200 GPU partition. + +--agent-name [default: "terminus-2"] + Agent name written to DB entries and used by Harbor for evaluation config. + This determines which agent implementation Harbor uses to run the eval tasks. + +--enable-thinking + Enable thinking/reasoning blocks in vLLM model inference. Most presets + enable this by default. Only disable if the model doesn't support thinking + or you want to test non-thinking mode. + +--upload-username [default: current OS user] + Username recorded in DB entries and result uploads. 
Auto-detected from + the OS user if not specified. + + Env: EVAL_UPLOAD_USERNAME + + +--- v3 Enhancement: Per-Listener SLURM Throttle --- + +--max-jobs-submitted [default: 20] + Maximum number of active SLURM jobs this listener instance is allowed to + have running simultaneously. The listener tracks which SLURM job IDs it + submitted and checks squeue to count only those still active. + + Tuning: This is PER-LISTENER, not global. Multiple listeners can run in + parallel with independent budgets. Set based on your fair-share allocation: + - Single listener: 10-20 is typical. + - Multiple listeners: split your budget (e.g. v2=10, swebench=5). + When the limit is reached, the listener queues submissions by priority + order and drops the lowest-priority ones. + + Env: EVAL_LISTENER_MAX_JOBS + + +--- v3 Enhancement: Daytona Resource Pre-flight --- + +--check-daytona-resources + Enable Daytona API sandbox count check at startup and each iteration. + If active sandboxes are at or above the limit, the listener skips that + iteration entirely. Requires DAYTONA_API_KEY in environment. + + Tuning: Enable this in production to prevent overwhelming the Daytona + sandbox pool. Not needed for small-scale or development runs. + +--daytona-sandbox-limit [default: 2000] + Maximum expected active sandboxes. The listener skips submissions when + the active count reaches this number. + +--daytona-warning-buffer [default: 0.9] + Fraction of the sandbox limit at which a warning is logged. At 0.9 with + limit=2000, warns when active sandboxes reach 1800. + + +--- v3 Enhancement: Model Retry Tracking --- + +--track-model-retries + Enable tracking of how many times each model has been started. Models + exceeding the retry threshold are deprioritized (moved to end of the + submission queue, not blocked entirely). + + Tuning: Enable this for long-running listeners to prevent repeatedly + resubmitting models that keep failing. 
The sbatch script appends to the + shared log when transitioning a job from Pending → Started. + +--model-retry-threshold [default: 5] + Number of eval starts before a model is deprioritized. Deprioritized + models are still submitted, just last in the queue (and may be dropped + if --max-jobs-submitted truncates the list). + + Tuning: 3-5 for strict environments. Higher (10+) if transient failures + are common and you want to give models more chances. + +--eval-starts-log [default: auto-generated] + Path to the shared append-only log file where eval starts are recorded. + Auto-generated with a benchmark+timestamp suffix if not specified. + Multiple listeners using the same log file will share retry counts. + + Tuning: If you run multiple listeners for the same benchmark and want + shared retry tracking, point them at the same log file. + + +--- v3 Enhancement: Timeout-Config-Sensitive Dedup --- + +--timeout-aware + Change job dedup logic to check model + benchmark + agent + timeout_multiplier + instead of just model + benchmark. This allows running the same model with + different timeout configurations without one blocking the other. + + Tuning: Enable when running A/B experiments with different timeout settings. + When disabled (default), two listeners submitting the same model with + different --timeout-multiplier values will conflict (one sees the other's + job and skips). + +--timeout-multiplier [default: 1.0] + Harbor timeout multiplier, passed to the sbatch job and stored in the DB + job config. Values >1.0 give tasks more time; <1.0 makes them stricter. 
+ + Tuning: Use with --timeout-aware for controlled experiments: + --timeout-multiplier 0.25 (aggressive timeout, fast failures) + --timeout-multiplier 1.0 (default) + --timeout-multiplier 2.0 (lenient, for slow models) + --timeout-multiplier 4.0 (very lenient, for debugging) + + +--- Execution Mode --- + +--dry-run + Preview mode: runs one full iteration (DB queries, filtering, status checks) + but does NOT submit any sbatch jobs. Logs what WOULD be submitted. Implies + --once. Use this to verify your flags before a real run. + + Env: EVAL_LISTENER_DRY_RUN="1" + +--once + Run a single iteration and exit. Useful for cron-triggered runs or one-shot + submissions. Without this, the listener loops forever (sleeping --check-hours + between iterations). + +--verbose, -v + Enable detailed logging: shows every model skipped (with reason), priority + list contents, blacklist contents, and per-model DB status checks. + +--log-file + Explicit log file path. Default: auto-generated in experiments/listener_logs/ + with a preset+timestamp name. 
+ + Env: EVAL_LISTENER_LOG_DIR (for the directory) + + +=============================================================================== +ENVIRONMENT VARIABLES (all optional, CLI args take precedence) +=============================================================================== + + EVAL_LISTENER_LOOKBACK_DAYS Days to look back for models (default: 1000) + EVAL_LISTENER_CHECK_HOURS Hours between iterations (default: 4.0) + EVAL_LISTENER_SBATCH SBATCH script to use + EVAL_LISTENER_LOG_DIR Log directory (default: experiments/listener_logs) + EVAL_LISTENER_DATASETS Comma/space/newline list of HF dataset repos + EVAL_LISTENER_PRIORITY_FILE Path to priority models file (hot-reloaded) + EVAL_LISTENER_BLACKLIST_FILE Path to blacklist models file (hot-reloaded) [v4] + EVAL_LISTENER_DRY_RUN "1" or "true" to enable dry run mode + EVAL_LISTENER_REQUIRE_PRIORITY_LIST "1" or "true" to require priority list + EVAL_LISTENER_PRIORITY_MODE "filter_only" or "priority_first" + EVAL_LISTENER_CHECK_HF_EXISTS "1" or "true" to validate HF model existence + EVAL_LISTENER_MAX_JOBS Per-listener SLURM job limit (default: 20) + EVAL_UPLOAD_USERNAME Username for DB entries (default: OS user) + DAYTONA_API_KEY Required for --check-daytona-resources + + +=============================================================================== +QUICK START EXAMPLES +=============================================================================== + + # Most common: evaluate priority models on dev_set_v2 + python unified_eval_listener_v4.py --preset v2 \\ + --priority-file v2_priority_models_richard.txt + + # Preview what would be submitted (no actual jobs) + python unified_eval_listener_v4.py --preset v2 --dry-run --once \\ + --priority-file v2_priority_models_richard.txt --verbose + + # Block known-bad models + python unified_eval_listener_v4.py --preset v2 \\ + --priority-file v2_priority_models_richard.txt \\ + --blacklist-file bad_models.txt + + # Full v3/v4 features enabled + python 
unified_eval_listener_v4.py --preset v2 \\ + --priority-file v2_priority_models_richard.txt \\ + --blacklist-file bad_models.txt \\ + --error-threshold 10 --max-jobs-submitted 15 \\ + --check-daytona-resources \\ + --track-model-retries --model-retry-threshold 3 \\ + --timeout-aware --timeout-multiplier 2.0 + + # Two listeners with independent SLURM budgets + python unified_eval_listener_v4.py --preset v2 --max-jobs-submitted 10 & + python unified_eval_listener_v4.py --preset swebench --max-jobs-submitted 5 & + + # A/B timeout experiment (requires --timeout-aware on both) + python unified_eval_listener_v4.py --preset v2 --timeout-aware \\ + --timeout-multiplier 1.0 --max-jobs-submitted 10 & + python unified_eval_listener_v4.py --preset v2 --timeout-aware \\ + --timeout-multiplier 2.0 --max-jobs-submitted 5 & +""" + +import argparse +import getpass +import json +import os +import re +import subprocess +import sys +import time +from collections import Counter +from dataclasses import dataclass, field +from datetime import datetime, timedelta, timezone +from pathlib import Path +from typing import Dict, List, Optional, Set, Tuple + +# Add leaderboard utilities to path +# Add project root to path for database.unified_db imports +_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +_PROJECT_ROOT = os.path.abspath(os.path.join(_SCRIPT_DIR, "..", "..")) +if _PROJECT_ROOT not in sys.path: + sys.path.insert(0, _PROJECT_ROOT) + +from database.unified_db.utils import get_supabase_client, load_supabase_keys + + +# --------------------------------------------------------------------------- +# Secrets loading (Jupiter-specific: load ~/secrets.env at import time) +# --------------------------------------------------------------------------- +def _load_secrets(path: Optional[str] = None) -> None: + """Load secrets from env file, then call unified_db's load_supabase_keys.""" + path = ( + path + or os.environ.get("DC_AGENT_SECRET_ENV") + or os.environ.get("KEYS") + or 
def _load_secrets(path: Optional[str] = None) -> None:
    """Load KEY=VALUE pairs from an env-style secrets file into os.environ,
    then hand off to unified_db's load_supabase_keys().

    Resolution order for the file path: explicit argument, then the
    DC_AGENT_SECRET_ENV / KEYS environment variables, then ~/secrets.env.
    Lines may use shell syntax: comments, blank lines, ``export K=V``,
    and single/double quoted values are all accepted.
    """
    candidate = (
        path
        or os.environ.get("DC_AGENT_SECRET_ENV")
        or os.environ.get("KEYS")
        or os.path.expanduser("~/secrets.env")
    )
    if candidate:
        expanded = os.path.expanduser(candidate)
        if os.path.isfile(expanded):
            with open(expanded) as fh:
                for raw in fh:
                    entry = raw.strip()
                    if not entry or entry.startswith("#") or "=" not in entry:
                        continue
                    if entry.startswith("export "):
                        entry = entry[7:].strip()
                    key, _, value = entry.partition("=")
                    os.environ[key.strip()] = value.strip().strip("'\"")
    # Alias SUPABASE_KEY -> SUPABASE_ANON_KEY if the latter is missing
    # (some secrets.env files use the shorter name)
    if os.environ.get("SUPABASE_KEY") and not os.environ.get("SUPABASE_ANON_KEY"):
        os.environ["SUPABASE_ANON_KEY"] = os.environ["SUPABASE_KEY"]
    try:
        # Best-effort: unified_db may be unavailable outside the project env.
        load_supabase_keys()
    except Exception:
        pass
def parse_harbor_eval_config(path: Optional[str]) -> Dict:
    """Parse eval-relevant config fields from a Harbor YAML config.

    Returns dict with keys: timeout_multiplier, override_cpus,
    override_memory_mb, override_storage_mb (only if set).

    Missing paths and unparseable files yield an empty dict.
    """
    if not path or not os.path.isfile(path):
        return {}
    try:
        import yaml
        with open(path) as fh:
            raw = yaml.safe_load(fh) or {}
    except Exception as e:
        log(f"WARNING: failed to parse harbor config {path}: {e}")
        return {}

    parsed: Dict = {}
    # timeout_multiplier lives at the top level of the YAML.
    tm = raw.get("timeout_multiplier")
    if tm is not None:
        parsed["timeout_multiplier"] = float(tm)
    # Resource overrides are nested under 'environment'.
    environment = raw.get("environment") or {}
    for override_key in ("override_cpus", "override_memory_mb", "override_storage_mb"):
        value = environment.get(override_key)
        if value is not None:
            parsed[override_key] = int(value)
    return parsed
# ---------------------------------------------------------------------------
# Baseline model config mapping -- per-model vLLM overrides
# ---------------------------------------------------------------------------
_BASELINE_MODEL_CONFIGS: Optional[Dict[str, Dict]] = None
_BASELINE_MODEL_PATTERNS: Optional[List[Dict]] = None


def load_baseline_model_configs(path: Optional[str]) -> Dict[str, Dict]:
    """Load baseline model -> vLLM config mapping from YAML file.

    Returns dict mapping HF model name to vLLM serving params.
    Also loads pattern-based fallback configs (stored in _BASELINE_MODEL_PATTERNS).

    The result is cached in module globals: after the first call, the cached
    mapping is returned regardless of ``path``. A missing/empty path or a
    parse failure caches an empty mapping.
    """
    global _BASELINE_MODEL_CONFIGS, _BASELINE_MODEL_PATTERNS
    if _BASELINE_MODEL_CONFIGS is not None:
        return _BASELINE_MODEL_CONFIGS

    if not path or not os.path.isfile(path):
        _BASELINE_MODEL_CONFIGS = {}
        _BASELINE_MODEL_PATTERNS = []
        return _BASELINE_MODEL_CONFIGS

    try:
        import yaml
        with open(path) as f:
            data = yaml.safe_load(f) or {}
        _BASELINE_MODEL_CONFIGS = data.get("models", {})
        _BASELINE_MODEL_PATTERNS = data.get("patterns", [])
        log(f"Loaded {len(_BASELINE_MODEL_CONFIGS)} baseline model config(s) and "
            f"{len(_BASELINE_MODEL_PATTERNS)} pattern(s) from {path}")
    except Exception as e:
        log(f"WARNING: failed to load baseline model configs from {path}: {e}")
        _BASELINE_MODEL_CONFIGS = {}
        _BASELINE_MODEL_PATTERNS = []

    return _BASELINE_MODEL_CONFIGS


def _match_pattern_config(hf_model: str) -> Optional[Dict]:
    """Try to match a model name against pattern-based configs.

    Patterns are checked in order; first match wins.
    Each pattern has a 'match' field (regex or substring) and config fields.

    Fix: a 'match' value that is not a valid regex (e.g. a raw model name
    containing unbalanced parentheses) used to raise re.error out of this
    function; it is now treated as a plain substring, honoring the
    documented "regex or substring" contract.
    """
    if not _BASELINE_MODEL_PATTERNS:
        return None
    for pattern_entry in _BASELINE_MODEL_PATTERNS:
        pattern = pattern_entry.get("match", "")
        if not pattern:
            continue
        try:
            matched = re.search(pattern, hf_model) is not None
        except re.error:
            # Invalid regex: fall back to substring containment.
            matched = pattern in hf_model
        if matched:
            # Everything except the 'match' key is the override config.
            return {k: v for k, v in pattern_entry.items() if k != "match"}
    return None
def get_vllm_env_overrides(hf_model: str, configs: Dict[str, Dict]) -> Dict[str, str]:
    """Get vLLM env var overrides for a model from the baseline config mapping.

    Tries exact model name match first, then falls back to pattern matching.
    Returns dict of EVAL_VLLM_* env vars to pass to the eval script.
    """
    entry = configs.get(hf_model)
    if not entry:
        entry = _match_pattern_config(hf_model)
    if not entry:
        return {}

    overrides: Dict[str, str] = {}
    # Numeric knobs: forwarded as strings when explicitly set (None = unset).
    for cfg_key, env_key in (
        ("tensor_parallel_size", "EVAL_VLLM_TENSOR_PARALLEL_SIZE"),
        ("max_model_len", "EVAL_VLLM_MAX_MODEL_LEN"),
        ("swap_space", "EVAL_VLLM_SWAP_SPACE"),
    ):
        if entry.get(cfg_key) is not None:
            overrides[env_key] = str(entry[cfg_key])
    # Boolean flag: any truthy value becomes "1".
    if entry.get("trust_remote_code"):
        overrides["EVAL_VLLM_TRUST_REMOTE_CODE"] = "1"
    # String knobs: forwarded verbatim when truthy.
    for cfg_key, env_key in (
        ("tool_call_parser", "EVAL_VLLM_TOOL_CALL_PARSER"),
        ("reasoning_parser", "EVAL_VLLM_REASONING_PARSER"),
        ("extra_args", "EVAL_VLLM_EXTRA_ARGS"),
    ):
        if entry.get(cfg_key):
            overrides[env_key] = entry[cfg_key]

    return overrides
# ---------- Preset Definitions ----------
# Each preset can configure:
# - datasets: list of HF dataset repos
# - sbatch_script: sbatch script to use (default: unified_eval_harbor_v6.sbatch)
# - log_suffix: suffix for log file
# - check_hf_exists: validate model exists on HuggingFace
# - n_concurrent: Harbor --n-concurrent (default: 64)
# - n_attempts: Harbor --n-attempts (default: 3)
# - gpu_memory_util: VLLM --gpu-memory-utilization (default: 0.9)
# - error_threshold: Max invalid errors before abort (default: 3)
# - vllm_max_retries: VLLM startup retries (default: 5)
# - agent_parser: Agent parser type (default: "", use "xml" for swebench)
# - slurm_time: SLURM time limit (default: "24:00:00")
PRESETS: Dict[str, Dict] = {
    "aider": {
        "datasets": ["DCAgent2/aider_polyglot"],
        "log_suffix": "aider",
        "n_concurrent": 32,
        "error_threshold": 3,
        "enable_thinking": True,
    },
    "bfcl": {
        "datasets": ["DCAgent2/bfcl-parity"],
        "log_suffix": "bfcl",
        "n_concurrent": 32,
        "error_threshold": 10,
        "vllm_max_retries": 20,
        "enable_thinking": True,
    },
    # NOTE: swebench, v2, and tb2 use dcagent_eval_config_no_override.yaml (no model overrides)
    "swebench": {
        "datasets": ["DCAgent2/swebench-verified-random-100-folders"],
        "check_hf_exists": True,
        "log_suffix": "swebench",
        "n_concurrent": 32,
        "error_threshold": 15,
        "agent_parser": "xml",
        "gpu_memory_util": 0.95,
        "vllm_max_retries": 10,
        "enable_thinking": True,
        "config_yaml": "dcagent_eval_config_no_override.yaml",
    },
    "v2": {
        "datasets": ["DCAgent/dev_set_v2"],
        "log_suffix": "v2",
        "n_concurrent": 32,
        "error_threshold": 10,
        "vllm_max_retries": 10,
        "enable_thinking": True,
        "config_yaml": "dcagent_eval_config_no_override.yaml",
    },
    "tb2": {
        "datasets": ["DCAgent2/terminal_bench_2"],
        "log_suffix": "tb2",
        "n_concurrent": 32,
        "error_threshold": 10,
        "vllm_max_retries": 10,
        "slurm_time": "48:00:00",
        "gpu_memory_util": 0.95,
        "enable_thinking": True,
        "config_yaml": "dcagent_eval_config_no_override.yaml",
    },
    "v1": {
        "datasets": ["DCAgent/dev_set_71_tasks"],
        "log_suffix": "v1",
        "n_concurrent": 32,
        "error_threshold": 10,
        "vllm_max_retries": 20,
        "enable_thinking": True,
    },
}

# ---------- Constants ----------
# Extracts (org, repo) from a HuggingFace URL: group(1)=org, group(2)=repo.
HF_URL_RE = re.compile(r'https?://(?:www\.)?huggingface\.co/([^/\s]+)/([^/\s#?]+)')
# sandbox_jobs.job_status lifecycle values used for dedup decisions.
JOB_STATUS_PENDING = "Pending"
JOB_STATUS_STARTED = "Started"
JOB_STATUS_FINISHED = "Finished"
# Hours after which a Started/Pending job is considered abandoned.
DEFAULT_STALE_JOB_HOURS = 24
DEFAULT_STALE_PENDING_HOURS = 48
DEFAULT_LOOKBACK_DAYS = 1000
# NOTE(review): the module docstring's env-var table says the check-hours
# default is 4.0; the code default here is 2.0 — confirm which is intended.
DEFAULT_CHECK_HOURS = 2.0
DEFAULT_LOG_DIR = "experiments/listener_logs"

# Sbatch parameter defaults
DEFAULT_N_CONCURRENT = 64
DEFAULT_N_ATTEMPTS = 3
DEFAULT_GPU_MEMORY_UTIL = 0.9
DEFAULT_ERROR_THRESHOLD = 3
DEFAULT_VLLM_MAX_RETRIES = 5
DEFAULT_AGENT_PARSER = ""
DEFAULT_SLURM_TIME = "24:00:00"
DEFAULT_AGENT_NAME = "terminus-2"
DEFAULT_SLURM_PARTITION = "main"
DEFAULT_TENSOR_PARALLEL_SIZE = 1
DEFAULT_ENABLE_THINKING = False
DEFAULT_SBATCH_SCRIPT = "eval/MBZ/unified_eval_harbor_v6.sbatch"

# Enhancement 2: SLURM job submission throttle
DEFAULT_MAX_JOBS_SUBMITTED = 20

# Enhancement 3: Daytona resource pre-flight check
DEFAULT_DAYTONA_SANDBOX_LIMIT = 2000
DEFAULT_DAYTONA_WARNING_BUFFER = 0.9

# Enhancement 4: Model retry tracking
DEFAULT_MODEL_RETRY_THRESHOLD = 5

# Enhancement 5: Timeout-config-sensitive dedup
DEFAULT_TIMEOUT_MULTIPLIER = 1.0
@dataclass
class ListenerConfig:
    """Configuration for the eval listener.

    Core fields:
        datasets           HF dataset repos to evaluate against.
        sbatch_script      Path to the sbatch script to submit.
        priority_models    Ordered list of HF model names from the priority file.
                           File order = submission priority (first = highest).
        priority_file      Path to the priority file (hot-reloaded each iteration).

    Sbatch parameters (forwarded to sbatch via env vars):
        n_concurrent       Harbor --n-concurrent.
        n_attempts         Harbor --n-attempts.
        gpu_memory_util    VLLM --gpu-memory-utilization.
        error_threshold    Max invalid errors before aborting upload (v3 Enhancement 1).
                           Replaces v2's daytona_threshold. Env var kept as
                           EVAL_DAYTONA_THRESHOLD for sbatch backward compat.
        agent_name         Agent name for harbor and DB entries.
        timeout_multiplier Harbor timeout multiplier (v3 Enhancement 5).

    v3 enhancement fields:
        max_jobs_submitted      Per-listener SLURM job limit (Enhancement 2).
                                Each listener tracks its own submitted job IDs and
                                only counts those still active in squeue.
        check_daytona_resources Enable Daytona API pre-flight check (Enhancement 3).
        daytona_sandbox_limit   Max expected active sandboxes for pre-flight check.
        daytona_warning_buffer  Fraction of limit to trigger warning (e.g. 0.95).
        track_model_retries     Enable model retry tracking (Enhancement 4).
        model_retry_threshold   Starts before a model is deprioritized.
        eval_starts_log         Path to shared eval starts log. Auto-generated with
                                benchmark+time suffix if not specified.
        timeout_aware           Enable config-sensitive job dedup (Enhancement 5).
    """
    datasets: List[str]
    sbatch_script: str
    log_file: Optional[Path]
    lookback_days: int
    check_interval_hours: float
    stale_job_hours: int
    stale_pending_hours: int
    priority_file: Optional[str]
    require_priority_list: bool
    priority_models: List[str]
    check_hf_exists: bool
    dry_run: bool
    run_once: bool
    verbose: bool
    # Priority mode: "filter_only" (skip non-priority) or "priority_first" (all models, priority first)
    priority_mode: str = "filter_only"
    # Sbatch parameters (passed to sbatch via env vars)
    n_concurrent: int = DEFAULT_N_CONCURRENT
    n_attempts: int = DEFAULT_N_ATTEMPTS
    gpu_memory_util: float = DEFAULT_GPU_MEMORY_UTIL
    error_threshold: int = DEFAULT_ERROR_THRESHOLD
    vllm_max_retries: int = DEFAULT_VLLM_MAX_RETRIES
    agent_parser: str = DEFAULT_AGENT_PARSER
    slurm_time: str = DEFAULT_SLURM_TIME
    enable_thinking: bool = DEFAULT_ENABLE_THINKING
    agent_name: str = DEFAULT_AGENT_NAME
    slurm_partition: str = DEFAULT_SLURM_PARTITION
    tensor_parallel_size: int = DEFAULT_TENSOR_PARALLEL_SIZE
    # Username recorded on DB entries; falls back to the OS user when empty.
    upload_username: str = ""
    log_prefix: str = "[unified-eval-listener-v4]"
    # v3 Enhancement 2: Per-listener SLURM throttle
    max_jobs_submitted: int = DEFAULT_MAX_JOBS_SUBMITTED
    # v3 Enhancement 3: Daytona pre-flight
    check_daytona_resources: bool = False
    daytona_sandbox_limit: int = DEFAULT_DAYTONA_SANDBOX_LIMIT
    daytona_warning_buffer: float = DEFAULT_DAYTONA_WARNING_BUFFER
    # v3 Enhancement 4: Model retry tracking
    track_model_retries: bool = False
    model_retry_threshold: int = DEFAULT_MODEL_RETRY_THRESHOLD
    eval_starts_log: str = ""
    # v3 Enhancement 5: Timeout-config-sensitive dedup
    timeout_multiplier: float = DEFAULT_TIMEOUT_MULTIPLIER
    timeout_aware: bool = False
    # Config YAML for harbor (overrides vs no-overrides)
    config_yaml: str = "dcagent_eval_config.yaml"
    # Model blacklist
    blacklist_file: Optional[str] = None
    blacklisted_models: Set[str] = field(default_factory=set)
    # Per-model vLLM overrides (baseline model configs)
    baseline_model_configs: Optional[str] = None
    # Harbor config path
    harbor_config: Optional[str] = None
    # Parsed eval config from harbor YAML (for config-aware dedup)
    eval_config: Dict = field(default_factory=dict)
    # Pre-download model weights before submitting jobs
    pre_download: bool = False
    # Sliding-window batch dependencies
    batch_size: Optional[int] = None
    # Per-listener invalid errors log
    invalid_errors_log: str = ""
    # Force submission -- skip dedup checks
    force: bool = False
    # Daytona auto_snapshot
    auto_snapshot: bool = False
    # Secrets file path for sbatch jobs
    secrets_file: str = ""
    # Conda env name (default: otagent, use otagent2 for newer models like Qwen3.5)
    conda_env: str = ""

    @property
    def check_interval_seconds(self) -> int:
        # Derived: the listener's sleep interval in whole seconds.
        return int(self.check_interval_hours * 60 * 60)
# ---------- Priority Models Loading ----------
def load_priority_models(filepath: Optional[str]) -> List[str]:
    """
    Load priority models from a text file, preserving file order as rank.

    File order determines submission priority: models listed earlier are
    submitted first. When the per-listener SLURM job limit truncates the
    submission list, higher-priority (earlier) models are kept.

    File format:
    - One model per line (HuggingFace format: org/model)
    - Lines starting with # are comments
    - Blank lines are ignored

    Returns:
        Ordered list of model names (duplicates removed, order preserved).
        Empty list if file is missing or empty.
    """
    if not filepath:
        return []

    source = Path(filepath)
    if not source.exists():
        log(f"Priority file not found: {filepath}")
        return []

    ordered: List[str] = []
    already: Set[str] = set()
    try:
        with source.open("r") as fh:
            for raw in fh:
                entry = raw.strip()
                # Skip blank lines and comment lines.
                if not entry or entry.startswith("#"):
                    continue
                if entry in already:
                    continue
                already.add(entry)
                ordered.append(entry)
        log(f"Loaded {len(ordered)} model(s) from priority file: {filepath}")
        return ordered
    except Exception as e:
        log(f"ERROR reading priority file {filepath}: {e}")
        return []


# ---------- Model Blacklist Loading ----------
def load_blacklist(filepath: Optional[str]) -> Set[str]:
    """Load blacklisted models from a text file. Same format as priority file."""
    return set(load_priority_models(filepath))
# ---------- HuggingFace Utilities ----------
def check_hf_model_exists(model_name: str) -> bool:
    """
    Check if a model exists on HuggingFace Hub.

    Args:
        model_name: HF model name (e.g., "org/model-name")

    Returns:
        True if model exists and is accessible, False otherwise
    """
    if not model_name or not isinstance(model_name, str):
        return False

    try:
        from huggingface_hub import model_info
        model_info(model_name)
    except Exception as e:
        log(f"HF check failed for {model_name}: {e}")
        return False
    return True


def _parse_hf_from_str(val: Optional[str]) -> Optional[str]:
    """Parse HuggingFace model name from a string (URL or org/repo)."""
    if not isinstance(val, str):
        return None
    match = HF_URL_RE.search(val)
    return f"{match.group(1)}/{match.group(2)}" if match else None


def resolve_hf_model_name(model_row: Dict) -> Optional[str]:
    """
    Resolve HF model name from a database model row.

    Checks multiple fields in order of priority: the 'name' column first,
    then URL-bearing columns, then values nested inside training_parameters.
    """
    # The 'name' column wins when it already looks like org/repo
    # (hosted_vllm/ prefixes are runtime aliases, not HF names).
    name_field = model_row.get("name")
    if isinstance(name_field, str) and "/" in name_field and not name_field.startswith("hosted_vllm/"):
        return name_field

    # URL-bearing columns, in priority order.
    for column in ("weights_location", "training_parameters", "url", "hf_url"):
        candidate = model_row.get(column)
        if isinstance(candidate, str):
            parsed = _parse_hf_from_str(candidate)
            if parsed:
                return parsed

    # training_parameters may be a JSON object whose values embed HF URLs.
    tp = model_row.get("training_parameters")
    if isinstance(tp, str):
        try:
            decoded = json.loads(tp)
        except Exception:
            decoded = None
    else:
        decoded = tp

    if isinstance(decoded, dict):
        for nested in decoded.values():
            if isinstance(nested, str):
                parsed = _parse_hf_from_str(nested)
                if parsed:
                    return parsed

    return None
# ---------- Dataset Parsing ----------
def parse_datasets(s: str) -> List[str]:
    """
    Parse dataset list from string.

    Supports comma, space, or newline separated values.
    Normalizes HF URLs to org/repo format. Duplicates are removed
    while preserving first-seen order.
    """
    normalized: List[str] = []
    for token in re.split(r"[,\s]+", s):
        token = token.strip()
        if not token:
            continue
        m = HF_URL_RE.search(token)
        normalized.append(f"{m.group(1)}/{m.group(2)}" if m else token)
    # dict.fromkeys is an order-preserving dedup.
    return list(dict.fromkeys(normalized))


def dataset_repo_name(dataset_hf: str) -> str:
    """Convert 'org/repo' or HF URL to 'repo' (just the repo name)."""
    if not dataset_hf:
        return dataset_hf
    m = HF_URL_RE.search(dataset_hf)
    if m:
        return m.group(2)
    if "/" in dataset_hf:
        return dataset_hf.rsplit("/", 1)[-1]
    return dataset_hf


# ---------- Database Operations ----------
# Cache of repo_name -> benchmark id (None cached for missing benchmarks).
_BENCH_CACHE: Dict[str, Optional[str]] = {}


def _iso(dt: datetime) -> str:
    """Render a datetime as a UTC ISO-8601 string (naive input assumed UTC)."""
    aware = dt if dt.tzinfo is not None else dt.replace(tzinfo=timezone.utc)
    return aware.astimezone(timezone.utc).isoformat()


def _time_filters(q, since_iso: str):
    """Apply time filter to Supabase query (handles both column names)."""
    try:
        return q.gte('creation_time', since_iso)
    except Exception:
        # Older schema uses 'created_at' instead of 'creation_time'.
        return q.gte('created_at', since_iso)
def fetch_recent_models(days: int) -> List[Dict]:
    """Fetch recent models from Supabase within the lookback window.

    Filters out:
    - Models with created_by == "precomputed_hf"
    - Models with a non-empty "duplicate_of" field (v3: prevents duplicate
      eval submissions when the same HF model appears under multiple DB rows)
    """
    client = get_supabase_client()
    since = _iso(datetime.now(timezone.utc) - timedelta(days=days))
    try:
        resp = _time_filters(client.table('models').select('*'), since).execute()
        rows = list(resp.data or [])
    except Exception as e:
        log(f"ERROR: failed querying models by time: {e}")
        return []

    kept: List[Dict] = []
    skipped_dupes = 0
    for row in rows:
        if row.get("created_by") == "precomputed_hf":
            continue
        if row.get("duplicate_of"):
            skipped_dupes += 1
            continue
        kept.append(row)
    if skipped_dupes:
        log(f"Filtered out {skipped_dupes} duplicate model(s) (duplicate_of set)")
    return kept


def fetch_priority_models(priority_names: List[str]) -> List[Dict]:
    """Fetch models by name from Supabase, bypassing the lookback window.

    This ensures priority models are always evaluated even if they were
    registered long ago (outside the lookback window).

    Filters out:
    - Models with created_by == "precomputed_hf"
    - Models with a non-empty "duplicate_of" field
    """
    if not priority_names:
        return []

    client = get_supabase_client()
    try:
        resp = (
            client.table('models')
            .select('*')
            .in_('name', priority_names)
            .execute()
        )
        rows = list(resp.data or [])
    except Exception as e:
        log(f"ERROR: failed querying priority models by name: {e}")
        return []

    return [
        row for row in rows
        if row.get("created_by") != "precomputed_hf" and not row.get("duplicate_of")
    ]
def resolve_benchmark_id(dataset_hf: str) -> Optional[str]:
    """
    Look up benchmark ID from database for a given dataset.

    Results (including misses) are cached per repo name for performance.
    """
    repo_name = dataset_repo_name(dataset_hf)
    if repo_name in _BENCH_CACHE:
        return _BENCH_CACHE[repo_name]

    try:
        client = get_supabase_client()
        resp = (
            client.table('benchmarks')
            .select('id,name')
            .eq('name', repo_name)
            .limit(1)
            .execute()
        )
        found = resp.data or []
        resolved = found[0]['id'] if found else None
        _BENCH_CACHE[repo_name] = resolved
        if not resolved:
            log(f"No benchmark row found for dataset '{dataset_hf}' (wanted name='{repo_name}').")
        return resolved
    except Exception as e:
        # Query failures are NOT cached so a later retry can succeed.
        log(f"ERROR resolving benchmark id for dataset '{dataset_hf}': {e}")
        return None


def check_job_status(
    model_id: str, benchmark_id: Optional[str]
) -> Tuple[bool, Optional[str], Optional[datetime], Optional[datetime], Optional[str]]:
    """
    Check if a job exists for (model_id, benchmark_id) and its status.

    Returns:
        (job_exists, job_status, started_at, submitted_at, slurm_job_id)
    """
    if not benchmark_id:
        return (False, None, None, None, None)

    def _to_dt(value: Optional[str]) -> Optional[datetime]:
        # Supabase returns ISO strings, sometimes with a trailing 'Z'.
        if not value:
            return None
        try:
            return datetime.fromisoformat(value.replace('Z', '+00:00'))
        except Exception:
            return None

    try:
        client = get_supabase_client()
        query = (
            client.table('sandbox_jobs')
            .select('id,job_status,started_at,submitted_at,slurm_job_id')
            .eq('model_id', model_id)
            .eq('benchmark_id', benchmark_id)
            .order('created_at', desc=True)
            .limit(1)
        )
        rows = (query.execute().data) or []

        if not rows:
            return (False, None, None, None, None)

        latest = rows[0]
        return (
            True,
            latest.get('job_status'),
            _to_dt(latest.get('started_at')),
            _to_dt(latest.get('submitted_at')),
            latest.get('slurm_job_id'),
        )

    except Exception as e:
        log(f"WARNING: sandbox_jobs check failed for model_id={model_id}, benchmark_id={benchmark_id}: {e}")
        return (False, None, None, None, None)  # fail-open
# ---------- Cross-Duplicate Aggregation ----------
# Cache of "table:entity_id" -> list of IDs in the entity's duplicate group.
_DUP_GROUP_CACHE: Dict[str, List[str]] = {}


def get_duplicate_group_ids(table: str, entity_id: str) -> List[str]:
    """Get all IDs in the duplicate group for a model or benchmark.

    Given an entity_id, finds the canonical ID and all its duplicates.
    - If entity has duplicate_of set, canonical = duplicate_of
    - Otherwise canonical = entity_id
    - Then finds all rows WHERE duplicate_of = canonical_id
    - Returns [canonical_id] + [all duplicate IDs]

    Results are cached per (table, entity_id).
    """
    cache_key = f"{table}:{entity_id}"
    if cache_key in _DUP_GROUP_CACHE:
        return _DUP_GROUP_CACHE[cache_key]

    try:
        client = get_supabase_client()

        # Resolve the canonical ID for this entity.
        lookup = client.table(table).select('id,duplicate_of').eq('id', entity_id).limit(1).execute()
        found = lookup.data or []
        if not found:
            _DUP_GROUP_CACHE[cache_key] = [entity_id]
            return [entity_id]

        canonical_id = found[0].get('duplicate_of') or entity_id

        # Collect every row that points at the canonical ID.
        dups = client.table(table).select('id').eq('duplicate_of', canonical_id).execute()
        dup_ids = [row['id'] for row in (dups.data or [])]

        group = list(set([canonical_id] + dup_ids))
        # Prime the cache for every member of the group.
        for member in group:
            _DUP_GROUP_CACHE[f"{table}:{member}"] = group
        return group

    except Exception as e:
        log(f"WARNING: Failed to get duplicate group for {table}/{entity_id}: {e}")
        _DUP_GROUP_CACHE[cache_key] = [entity_id]
        return [entity_id]
# ---------- v3 Enhancement 5: Timeout-Config-Sensitive Job Dedup ----------
def check_job_status_v3(
    model_id: str,
    benchmark_id: Optional[str],
    timeout_aware: bool = False,
    agent_name: str = DEFAULT_AGENT_NAME,
    timeout_multiplier: float = DEFAULT_TIMEOUT_MULTIPLIER,
    duplicate_model_ids: Optional[List[str]] = None,
    duplicate_benchmark_ids: Optional[List[str]] = None,
) -> Tuple[bool, Optional[str], Optional[datetime], Optional[datetime], Optional[str]]:
    """
    Check if a job exists for (model_id, benchmark_id) and its status.

    When timeout_aware=True, filters to only match jobs with the same
    agent_name and timeout_multiplier in their config.

    When duplicate_model_ids/duplicate_benchmark_ids are provided, queries
    across the entire duplicate group using .in_() instead of .eq().

    Returns:
        (job_exists, job_status, started_at, submitted_at, slurm_job_id)
    """
    if not benchmark_id:
        # No benchmark to dedup against: report "no job" without touching the DB.
        return (False, None, None, None, None)

    # Determine which IDs to query
    model_ids = duplicate_model_ids if duplicate_model_ids else [model_id]
    bench_ids = duplicate_benchmark_ids if duplicate_benchmark_ids else [benchmark_id]

    if not timeout_aware and len(model_ids) == 1 and len(bench_ids) == 1:
        # Fall back to original behavior (no duplicates, no timeout awareness)
        return check_job_status(model_id, benchmark_id)

    try:
        client = get_supabase_client()
        # 'config' is selected in addition to the v2 columns so the
        # timeout-aware filter below can inspect agent/timeout settings.
        q = client.table('sandbox_jobs').select(
            'id,job_status,started_at,submitted_at,slurm_job_id,config'
        )

        # Use .in_() for duplicate groups, .eq() for singles
        if len(model_ids) == 1:
            q = q.eq('model_id', model_ids[0])
        else:
            q = q.in_('model_id', model_ids)

        if len(bench_ids) == 1:
            q = q.eq('benchmark_id', bench_ids[0])
        else:
            q = q.in_('benchmark_id', bench_ids)

        # Newest first; 50 rows bounds the scan over the duplicate group.
        q = q.order('created_at', desc=True).limit(50)
        data = (q.execute().data) or []

        if not data:
            return (False, None, None, None, None)

        # Filter to matching config if timeout_aware
        for job in data:
            if timeout_aware:
                # config may arrive as a JSON string or a dict; normalize to dict.
                config = job.get('config')
                if isinstance(config, str):
                    try:
                        config = json.loads(config)
                    except Exception:
                        config = {}
                if not isinstance(config, dict):
                    config = {}

                # Jobs with no recorded agent/timeout are treated as defaults.
                job_agent = config.get('agent', DEFAULT_AGENT_NAME)
                job_tm = config.get('timeout_multiplier', DEFAULT_TIMEOUT_MULTIPLIER)

                # Skip if agent_name or timeout_multiplier don't match
                if job_agent != agent_name or float(job_tm) != float(timeout_multiplier):
                    continue

            # First (newest) matching job wins; parse its timestamps.
            job_status = job.get('job_status')
            started_at_str = job.get('started_at')
            submitted_at_str = job.get('submitted_at')
            slurm_job_id = job.get('slurm_job_id')

            started_at = None
            if started_at_str:
                try:
                    started_at = datetime.fromisoformat(started_at_str.replace('Z', '+00:00'))
                except Exception:
                    pass

            submitted_at = None
            if submitted_at_str:
                try:
                    submitted_at = datetime.fromisoformat(submitted_at_str.replace('Z', '+00:00'))
                except Exception:
                    pass

            return (True, job_status, started_at, submitted_at, slurm_job_id)

        # No matching job found
        return (False, None, None, None, None)

    except Exception as e:
        log(f"WARNING: sandbox_jobs v3 check failed for model_id={model_id}, benchmark_id={benchmark_id}: {e}")
        return (False, None, None, None, None)  # fail-open
def is_job_stale(started_at: Optional[datetime], hours: int = DEFAULT_STALE_JOB_HOURS) -> bool:
    """Return True when a job's start time is more than ``hours`` ago.

    A missing start time is treated as stale: a job row marked 'Started'
    without a timestamp cannot be trusted. Naive datetimes are assumed UTC.
    """
    if started_at is None:
        # If started_at is null but job exists with status='Started', treat as stale
        return True
    anchor = started_at if started_at.tzinfo is not None else started_at.replace(tzinfo=timezone.utc)
    return datetime.now(timezone.utc) - anchor > timedelta(hours=hours)
+ """ + if not eval_config: + return True # no config constraints -- any existing job counts + + job_cfg = job_config or {} + job_env = job_cfg.get("environment") or {} + + # timeout_multiplier: top-level in config JSONB + if "timeout_multiplier" in eval_config: + job_tm = job_cfg.get("timeout_multiplier") + # Treat None/missing as 1.0 + job_tm = float(job_tm) if job_tm is not None else 1.0 + if float(eval_config["timeout_multiplier"]) != job_tm: + return False + + # Environment overrides: nested under config.environment + for key in ("override_cpus", "override_memory_mb", "override_storage_mb"): + if key in eval_config: + job_val = job_env.get(key) + # Treat None/missing as the default (None means no override) + job_val = int(job_val) if job_val is not None else None + eval_val = int(eval_config[key]) + if job_val != eval_val: + return False + + return True + + +def should_start_job( + model_id: str, + benchmark_id: Optional[str], + stale_hours: int = DEFAULT_STALE_JOB_HOURS, + stale_pending_hours: int = DEFAULT_STALE_PENDING_HOURS, + timeout_aware: bool = False, + agent_name: str = DEFAULT_AGENT_NAME, + timeout_multiplier: float = DEFAULT_TIMEOUT_MULTIPLIER, + duplicate_model_ids: Optional[List[str]] = None, + duplicate_benchmark_ids: Optional[List[str]] = None, + eval_config: Optional[Dict] = None, +) -> Tuple[bool, str, Optional[str]]: + """ + Determine if a job should be started based on DB status. + + When timeout_aware=True (v3 Enhancement 5), uses check_job_status_v3() + which filters jobs by agent_name and timeout_multiplier in config. This + allows running the same model with different configs without one blocking + the other. + + When duplicate_model_ids/duplicate_benchmark_ids are provided, checks + across the entire duplicate group for existing jobs. 
+ + When eval_config is provided (from harbor YAML), performs config-aware + dedup: checks that existing jobs match the current resource overrides + (timeout_multiplier, override_cpus, override_memory_mb, override_storage_mb). + + Returns: + (should_start, reason, slurm_job_id) + slurm_job_id is provided so the caller can scancel stale jobs. + """ + has_duplicates = ( + (duplicate_model_ids and len(duplicate_model_ids) > 1) + or (duplicate_benchmark_ids and len(duplicate_benchmark_ids) > 1) + ) + + if timeout_aware or has_duplicates: + job_exists, job_status, started_at, submitted_at, slurm_job_id = check_job_status_v3( + model_id, benchmark_id, + timeout_aware=timeout_aware, + agent_name=agent_name, + timeout_multiplier=timeout_multiplier, + duplicate_model_ids=duplicate_model_ids, + duplicate_benchmark_ids=duplicate_benchmark_ids, + ) + else: + job_exists, job_status, started_at, submitted_at, slurm_job_id = check_job_status( + model_id, benchmark_id + ) + + if not job_exists: + return (True, "no existing job", None) + + ec = eval_config or {} + + if job_status == JOB_STATUS_FINISHED: + if ec: + # Check if any Finished job matches our eval config + try: + client = get_supabase_client() + q = ( + client.table("sandbox_jobs") + .select("config, metrics") + .eq("model_id", model_id) + .eq("benchmark_id", benchmark_id) + .eq("job_status", "Finished") + .order("created_at", desc=True) + .limit(10) + ) + rows = (q.execute().data) or [] + matching = [r for r in rows if _config_matches_eval(r.get("config"), ec)] + if not matching: + return (True, "no finished job with matching config", slurm_job_id) + if matching[0] and not matching[0].get("metrics"): + return (True, "finished with matching config but metrics cleared", slurm_job_id) + except Exception as e: + log(f"WARNING: config-aware check failed: {e}") + return (False, "job finished", slurm_job_id) + + if job_status == JOB_STATUS_PENDING: + if ec: + try: + client = get_supabase_client() + q = ( + 
client.table("sandbox_jobs") + .select("config, slurm_job_id, created_at") + .eq("model_id", model_id) + .eq("benchmark_id", benchmark_id) + .eq("job_status", "Pending") + .order("created_at", desc=True) + .limit(5) + ) + rows = (q.execute().data) or [] + matching = [r for r in rows if _config_matches_eval(r.get("config"), ec)] + if not matching: + return (True, "no pending job with matching config", slurm_job_id) + except Exception as e: + log(f"WARNING: config-aware check failed: {e}") + # Job submitted but not yet running - check if stale using separate pending threshold + if is_job_stale(submitted_at, stale_pending_hours): + submitted_str = submitted_at.isoformat() if submitted_at else "null" + return (True, f"stale pending job (submitted_at={submitted_str})", slurm_job_id) + else: + submitted_str = submitted_at.isoformat() if submitted_at else "null" + return (False, f"job pending in SLURM queue (submitted_at={submitted_str})", slurm_job_id) + + if job_status == JOB_STATUS_STARTED: + if ec: + try: + client = get_supabase_client() + q = ( + client.table("sandbox_jobs") + .select("config, started_at") + .eq("model_id", model_id) + .eq("benchmark_id", benchmark_id) + .eq("job_status", "Started") + .order("created_at", desc=True) + .limit(5) + ) + rows = (q.execute().data) or [] + matching = [r for r in rows if _config_matches_eval(r.get("config"), ec)] + if not matching: + return (True, "no in-progress job with matching config", slurm_job_id) + except Exception as e: + log(f"WARNING: config-aware check failed: {e}") + if is_job_stale(started_at, stale_hours): + started_str = started_at.isoformat() if started_at else "null" + return (True, f"stale job (started_at={started_str})", slurm_job_id) + else: + started_str = started_at.isoformat() if started_at else "null" + return (False, f"job in progress (started_at={started_str})", slurm_job_id) + + # Unknown status - start job to be safe + return (True, f"unknown job status: {job_status}", slurm_job_id) + + +# 
# ---------- v3 Enhancement 2: Per-Listener SLURM Job Throttle ----------
def get_active_slurm_job_ids() -> Set[str]:
    """Return set of SLURM job IDs currently queued/running for this user.

    Used by EvalListener to determine which of its submitted jobs are still
    active. The listener intersects this with its internal _submitted_jobs
    set to get a per-listener active count.

    Fails open to an empty set on any squeue error.
    """
    try:
        user = getpass.getuser()
        # Fix: '--noheader' and '-h' are the same squeue flag; pass it once.
        code, out = _run(["squeue", "-u", user, "--noheader", "-o", "%i"])
        if code != 0:
            log(f"WARNING: squeue failed (exit {code}), returning empty set")
            return set()
        return {line.strip() for line in out.strip().split('\n') if line.strip()}
    except Exception as e:
        log(f"WARNING: Failed to query squeue: {e}")
        return set()


# ---------- v3 Enhancement 3: Daytona Resource Pre-flight Check ----------
def check_daytona_resources(sandbox_limit: int, warning_buffer: float) -> bool:
    """
    Check Daytona resource usage via API.

    Called at listener startup and optionally each iteration when
    --check-daytona-resources is enabled. Requires DAYTONA_API_KEY in env.

    Returns True if OK to proceed, False if active sandboxes >= sandbox_limit.
    Logs a warning when active sandboxes >= sandbox_limit * warning_buffer.
    Fails open (returns True) when the client, key, or API are unavailable.
    """
    try:
        from daytona_api_client import ApiClient, Configuration, SandboxApi
    except ImportError:
        log("WARNING: daytona_api_client not installed, skipping resource check")
        return True

    api_key = os.environ.get("DAYTONA_API_KEY")
    api_url = os.environ.get("DAYTONA_API_URL", "https://app.daytona.io/api")
    if not api_key:
        log("WARNING: DAYTONA_API_KEY not set, skipping resource check")
        return True

    # Fix: a non-positive limit would divide by zero in the ratio logging
    # below; treat it as "no limit configured" and fail open.
    if sandbox_limit <= 0:
        log(f"WARNING: invalid sandbox_limit={sandbox_limit}, skipping resource check")
        return True

    try:
        config = Configuration(host=api_url)
        client = ApiClient(config)
        client.default_headers["Authorization"] = f"Bearer {api_key}"
        api = SandboxApi(client)

        # limit=1 because only the total count is needed, not the page items.
        result = api.list_sandboxes_paginated(states=["started"], limit=1, page=1)
        active_count = result.total

        threshold = int(sandbox_limit * warning_buffer)
        if active_count >= sandbox_limit:
            log(f"ERROR: Daytona resources at limit: {active_count}/{sandbox_limit} active sandboxes "
                f"({active_count/sandbox_limit:.1%})")
            return False
        elif active_count >= threshold:
            log(f"WARNING: Daytona resources at {active_count}/{sandbox_limit} active sandboxes "
                f"({active_count/sandbox_limit:.1%}) - approaching limit!")
            return True
        else:
            log(f"Daytona resources OK: {active_count}/{sandbox_limit} active sandboxes "
                f"({active_count/sandbox_limit:.1%})")
            return True
    except Exception as e:
        log(f"WARNING: Daytona resource check failed: {e}")
        return True  # fail-open
+ """ + counts: Dict[str, int] = Counter() + try: + with open(log_path) as f: + for line in f: + parts = line.strip().split() + if len(parts) >= 2: + counts[parts[1]] += 1 # parts[1] is MODEL_HF + except FileNotFoundError: + pass + except Exception as e: + log(f"WARNING: Failed to read eval starts log {log_path}: {e}") + return dict(counts) + + +# ---------- Job Submission ---------- +@dataclass +class SbatchParams: + """Parameters passed to the sbatch script via environment variables. + + The listener converts these to EVAL_* env vars via to_env(), which the + sbatch script reads at startup. See unified_eval_harbor_v6.sbatch header + for the full env var list. + + v3 additions: + error_threshold Mapped to EVAL_DAYTONA_THRESHOLD (name kept for compat). + Controls the unified invalid error threshold. + eval_starts_log Mapped to EVAL_STARTS_LOG. Path to the shared eval + starts log for model retry tracking. + timeout_multiplier Mapped to EVAL_TIMEOUT_MULTIPLIER. Passed to harbor + --timeout-multiplier and stored in DB job config. 
+ """ + n_concurrent: int = DEFAULT_N_CONCURRENT + n_attempts: int = DEFAULT_N_ATTEMPTS + gpu_memory_util: float = DEFAULT_GPU_MEMORY_UTIL + error_threshold: int = DEFAULT_ERROR_THRESHOLD + vllm_max_retries: int = DEFAULT_VLLM_MAX_RETRIES + agent_parser: str = DEFAULT_AGENT_PARSER + slurm_time: str = DEFAULT_SLURM_TIME + enable_thinking: bool = DEFAULT_ENABLE_THINKING + agent_name: str = DEFAULT_AGENT_NAME + slurm_partition: str = DEFAULT_SLURM_PARTITION + tensor_parallel_size: int = DEFAULT_TENSOR_PARALLEL_SIZE + upload_username: str = "" + eval_starts_log: str = "" # v3 Enhancement 4 + timeout_multiplier: float = DEFAULT_TIMEOUT_MULTIPLIER # v3 Enhancement 5 + invalid_errors_log: str = "" # Per-listener invalid errors log + auto_snapshot: bool = False # Daytona auto_snapshot for environments + secrets_file: str = "" # Path to secrets.env file for sbatch + config_yaml: str = "dcagent_eval_config.yaml" + conda_env: str = "" # Conda env name (default: otagent, use otagent2 for Qwen3.5+) + + def to_env(self) -> Dict[str, str]: + """Convert to environment variables for sbatch.""" + env = { + "EVAL_N_CONCURRENT": str(self.n_concurrent), + "EVAL_N_ATTEMPTS": str(self.n_attempts), + "EVAL_GPU_MEMORY_UTIL": str(self.gpu_memory_util), + "EVAL_DAYTONA_THRESHOLD": str(self.error_threshold), + "EVAL_VLLM_MAX_RETRIES": str(self.vllm_max_retries), + "EVAL_AGENT_PARSER": self.agent_parser, + "EVAL_SLURM_TIME": self.slurm_time, + "EVAL_ENABLE_THINKING": "true" if self.enable_thinking else "false", + "EVAL_AGENT_NAME": self.agent_name, + "EVAL_VLLM_TENSOR_PARALLEL_SIZE": str(self.tensor_parallel_size), + } + if self.upload_username: + env["EVAL_UPLOAD_USERNAME"] = self.upload_username + # Enhancement 4: Pass eval starts log path + if self.eval_starts_log: + env["EVAL_STARTS_LOG"] = self.eval_starts_log + # Pass per-listener invalid errors log path + if self.invalid_errors_log: + env["EVAL_INVALID_ERRORS_LOG"] = self.invalid_errors_log + # Pass auto_snapshot setting + if 
self.auto_snapshot: + env["EVAL_AUTO_SNAPSHOT"] = "true" + # Pass secrets file path + if self.secrets_file: + env["EVAL_SECRETS_FILE"] = self.secrets_file + # Enhancement 5: Pass timeout multiplier + if self.timeout_multiplier != DEFAULT_TIMEOUT_MULTIPLIER: + env["EVAL_TIMEOUT_MULTIPLIER"] = str(self.timeout_multiplier) + # Pass config YAML (no-override for tb2/swebench) + if self.config_yaml != "dcagent_eval_config.yaml": + env["EVAL_CONFIG_YAML"] = self.config_yaml + # Pass conda env name (for newer models needing otagent2) + if self.conda_env: + env["EVAL_CONDA_ENV"] = self.conda_env + return env + + def __str__(self) -> str: + """String representation for logging.""" + parts = [ + f"n_concurrent={self.n_concurrent}", + f"n_attempts={self.n_attempts}", + f"gpu_memory_util={self.gpu_memory_util}", + f"error_threshold={self.error_threshold}", + f"vllm_max_retries={self.vllm_max_retries}", + ] + if self.agent_parser: + parts.append(f"agent_parser={self.agent_parser}") + if self.slurm_time != DEFAULT_SLURM_TIME: + parts.append(f"slurm_time={self.slurm_time}") + if self.enable_thinking: + parts.append("enable_thinking=True") + if self.agent_name != DEFAULT_AGENT_NAME: + parts.append(f"agent_name={self.agent_name}") + if self.tensor_parallel_size != DEFAULT_TENSOR_PARALLEL_SIZE: + parts.append(f"tensor_parallel_size={self.tensor_parallel_size}") + if self.slurm_partition != DEFAULT_SLURM_PARTITION: + parts.append(f"slurm_partition={self.slurm_partition}") + if self.upload_username: + parts.append(f"upload_username={self.upload_username}") + if self.eval_starts_log: + parts.append(f"eval_starts_log={self.eval_starts_log}") + if self.timeout_multiplier != DEFAULT_TIMEOUT_MULTIPLIER: + parts.append(f"timeout_multiplier={self.timeout_multiplier}") + if self.conda_env: + parts.append(f"conda_env={self.conda_env}") + return ", ".join(parts) + + +def _run(cmd: List[str], env: Optional[Dict[str, str]] = None) -> Tuple[int, str]: + """Run a command and return exit code and 
output.""" + # Merge with current environment if extra env vars provided + run_env = None + if env: + run_env = os.environ.copy() + run_env.update(env) + + proc = subprocess.Popen( + cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, env=run_env + ) + out_lines = [] + assert proc.stdout is not None + for line in proc.stdout: + out_lines.append(line.rstrip()) + code = proc.wait() + return code, "\n".join(out_lines) + + +def generate_run_tag(dataset_hf: str, model_hf: str) -> str: + """ + Generate a unique RUN_TAG for the job. + + Format: {safe_repo}_{safe_model}_{timestamp} + """ + safe_repo = dataset_repo_name(dataset_hf).replace("-", "_").replace(".", "_") + safe_model = model_hf.split("/")[-1].replace("-", "_").replace(".", "_") + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + return f"{safe_repo}_{safe_model}_{timestamp}" + + +def cancel_slurm_job(slurm_job_id: str, dry_run: bool = False) -> bool: + """Cancel a SLURM job via scancel. Returns True if successful.""" + if dry_run: + log(f"[DRY RUN] Would cancel SLURM job {slurm_job_id}") + return True + code, out = _run(["scancel", slurm_job_id]) + if code == 0: + log(f"Cancelled SLURM job {slurm_job_id}") + return True + else: + log(f"WARNING: scancel failed for job {slurm_job_id}: {out}") + return False + + +def update_pending_job_slurm_id(db_job_id: str, slurm_job_id: str) -> None: + """Update the Pending job entry with the SLURM job ID after successful sbatch.""" + try: + client = get_supabase_client() + client.table("sandbox_jobs").update( + {"slurm_job_id": slurm_job_id} + ).eq("id", db_job_id).execute() + log(f"Updated job {db_job_id} with slurm_job_id={slurm_job_id}", verbose_only=True) + except Exception as e: + log(f"WARNING: failed to update job {db_job_id} with slurm_job_id: {e}") + + +def submit_eval( + hf_model_name: str, + dataset_hf: str, + benchmark_id: Optional[str], + sbatch_script: str, + sbatch_params: Optional[SbatchParams] = None, + dry_run: bool = False, + 
def _build_job_db_config(agent: str, tm: float, eval_config: Optional[Dict]) -> Dict:
    """Build the config JSONB stored on the Pending row.

    Includes the harbor eval config fields (timeout_multiplier and the
    environment overrides) so that config-aware dedup can match on them.
    """
    config: Dict = {"agent": agent, "env": "daytona", "timeout_multiplier": tm}
    if eval_config:
        if "timeout_multiplier" in eval_config:
            config["timeout_multiplier"] = eval_config["timeout_multiplier"]
        env_overrides = {
            key: eval_config[key]
            for key in ("override_cpus", "override_memory_mb", "override_storage_mb")
            if key in eval_config
        }
        if env_overrides:
            config["environment"] = env_overrides
    return config


def _create_pending_db_entry(
    job_name: str,
    hf_model_name: str,
    dataset_hf: str,
    agent: str,
    upload_username: str,
    config: Dict,
) -> Optional[str]:
    """Best-effort creation of a Pending row in sandbox_jobs.

    Returns the DB job id on success, None on any failure (submission
    proceeds either way; a missing row only degrades dedup).
    """
    try:
        from database.unified_db.utils import create_job_entry_pending
        result = create_job_entry_pending(
            job_name=job_name,
            model_hf=hf_model_name,
            benchmark_hf=dataset_hf,
            agent_name=agent,
            slurm_job_id="pending",
            username=upload_username or "listener",
            config=config,
        )
        if result.get("success") and result.get("job"):
            db_job_id = str(result["job"].get("id"))
            log(f"Created Pending DB entry: {db_job_id}", verbose_only=True)
            return db_job_id
        log(f"WARNING: Failed to create Pending DB entry: {result.get('error')}")
    except Exception as e:
        log(f"WARNING: Exception creating Pending DB entry: {e}")
    return None


def _build_sbatch_cmd(
    sbatch_script: str,
    hf_model_name: str,
    dataset_hf: str,
    benchmark_id: Optional[str],
    job_name: str,
    sbatch_params: Optional["SbatchParams"],
    vllm_overrides: Optional[Dict[str, str]],
    dependency: Optional[str],
) -> List[str]:
    """Assemble the sbatch argv: resource flags, script path, positional args."""
    cmd = ["sbatch"]
    if sbatch_params:
        cmd.extend(["--time", sbatch_params.slurm_time])
        cmd.extend(["--partition", sbatch_params.slurm_partition])
        # main partition has no QOS; other partitions need matching QOS
        # Map partition name to QOS name (some differ, e.g. highprio -> highprio-qos)
        partition_qos = {
            "lowprio": "lowprio",
            "highprio": "highprio-qos",
        }
        if sbatch_params.slurm_partition in partition_qos:
            cmd.extend(["--qos", partition_qos[sbatch_params.slurm_partition]])
    # Set GPU count: per-model override > listener-level TP size > default 1
    n_gpus = sbatch_params.tensor_parallel_size if sbatch_params else DEFAULT_TENSOR_PARALLEL_SIZE
    if vllm_overrides and "EVAL_VLLM_TENSOR_PARALLEL_SIZE" in vllm_overrides:
        n_gpus = int(vllm_overrides["EVAL_VLLM_TENSOR_PARALLEL_SIZE"])
    cmd.extend(["--gres", f"gpu:{n_gpus}"])
    if dependency:
        cmd.append(f"--dependency={dependency}")
    cmd.append(sbatch_script)
    cmd.extend([hf_model_name, dataset_hf])
    if benchmark_id:
        cmd.append(str(benchmark_id))
    cmd.append(job_name)  # 4th arg: job_name (RUN_TAG)
    return cmd


def _build_sbatch_env(
    sbatch_params: Optional["SbatchParams"],
    db_job_id: Optional[str],
    vllm_overrides: Optional[Dict[str, str]],
    eval_config: Optional[Dict],
) -> Dict[str, str]:
    """Merge SbatchParams env vars with per-model vLLM overrides and harbor config."""
    env_vars = sbatch_params.to_env() if sbatch_params else {}
    if db_job_id:
        env_vars["EVAL_DB_JOB_ID"] = db_job_id
    if vllm_overrides:
        env_vars.update(vllm_overrides)
    # Pass harbor eval config fields as sbatch env vars
    if eval_config:
        if eval_config.get("timeout_multiplier") is not None:
            env_vars["EVAL_TIMEOUT_MULTIPLIER"] = str(eval_config["timeout_multiplier"])
        if eval_config.get("override_memory_mb") is not None:
            env_vars["EVAL_OVERRIDE_MEMORY_MB"] = str(eval_config["override_memory_mb"])
    return env_vars


def submit_eval(
    hf_model_name: str,
    dataset_hf: str,
    benchmark_id: Optional[str],
    sbatch_script: str,
    sbatch_params: Optional[SbatchParams] = None,
    dry_run: bool = False,
    upload_username: str = "",
    timeout_multiplier: float = DEFAULT_TIMEOUT_MULTIPLIER,
    vllm_overrides: Optional[Dict[str, str]] = None,
    dependency: Optional[str] = None,
    eval_config: Optional[Dict] = None,
) -> Tuple[Optional[str], Optional[str]]:
    """
    Create a Pending DB entry, then submit sbatch job and update with SLURM ID.

    sbatch positional args:
        $1 = model HF name
        $2 = dataset HF repo (org/repo)
        $3 = benchmark_id (uuid) [optional]
        $4 = job_name (RUN_TAG)

    Environment variables (from SbatchParams.to_env()):
        EVAL_N_CONCURRENT, EVAL_N_ATTEMPTS, EVAL_GPU_MEMORY_UTIL,
        EVAL_DAYTONA_THRESHOLD, EVAL_VLLM_MAX_RETRIES, EVAL_AGENT_PARSER,
        EVAL_SLURM_TIME, EVAL_ENABLE_THINKING, EVAL_AGENT_NAME,
        EVAL_STARTS_LOG (v3), EVAL_TIMEOUT_MULTIPLIER (v3)

    The Pending DB entry includes timeout_multiplier in its config dict
    so that timeout-aware dedup (Enhancement 5) can match on it.

    Args:
        vllm_overrides: Optional dict of EVAL_VLLM_* env vars from baseline
            model config. Merged into sbatch env vars.
        dependency: Optional SLURM dependency string (e.g. 'afterany:12345').
        eval_config: Optional harbor eval config dict for DB job config.

    Returns:
        (slurm_job_id, job_name) if successful, ("DRY_RUN", job_name) if dry
        run, (None, None) on failure
    """
    # Generate unique job name
    job_name = generate_run_tag(dataset_hf, hf_model_name)

    # Early return for dry-run — no DB writes, no sbatch
    if dry_run:
        log(f"[DRY RUN] Would submit: model={hf_model_name} dataset={dataset_hf} job={job_name}")
        if sbatch_params:
            log(f"[DRY RUN] With params: {sbatch_params}")
        if vllm_overrides:
            log(f"[DRY RUN] vLLM overrides: {list(vllm_overrides.keys())}", verbose_only=True)
        return ("DRY_RUN", job_name)

    # Step 1: Create Pending DB entry BEFORE sbatch submission
    agent = sbatch_params.agent_name if sbatch_params else DEFAULT_AGENT_NAME
    # sbatch_params wins over the bare timeout_multiplier argument (kept for compat).
    tm = sbatch_params.timeout_multiplier if sbatch_params else timeout_multiplier
    config = _build_job_db_config(agent, tm, eval_config)
    db_job_id = _create_pending_db_entry(
        job_name, hf_model_name, dataset_hf, agent, upload_username, config
    )

    # Step 2: Build sbatch command and environment
    cmd = _build_sbatch_cmd(
        sbatch_script, hf_model_name, dataset_hf, benchmark_id, job_name,
        sbatch_params, vllm_overrides, dependency,
    )
    env_vars = _build_sbatch_env(sbatch_params, db_job_id, vllm_overrides, eval_config)

    # Step 3: Run sbatch
    code, out = _run(cmd, env=env_vars)
    log(f"sbatch: {' '.join(cmd)}\n{out}")

    if code != 0:
        # sbatch failed; pending entry remains (will be detected as stale later)
        return (None, None)

    m = re.search(r"Submitted batch job (\d+)", out)
    slurm_job_id = m.group(1) if m else None

    if not slurm_job_id:
        log("ERROR: Could not parse SLURM job ID from sbatch output")
        return (None, None)

    # Step 4: Update pending entry with actual SLURM job ID
    if db_job_id:
        update_pending_job_slurm_id(db_job_id, slurm_job_id)

    return (slurm_job_id, job_name)
not slurm_job_id: + log("ERROR: Could not parse SLURM job ID from sbatch output") + return (None, None) + + # Step 4: Update pending entry with actual SLURM job ID + if db_job_id: + update_pending_job_slurm_id(db_job_id, slurm_job_id) + + return (slurm_job_id, job_name) + + +# ---------- Main Listener Class ---------- +class EvalListener: + """Unified eval listener v3 that handles all benchmark configurations. + + Lifecycle: + 1. run() logs config, runs Daytona pre-flight (if enabled), enters main loop + 2. Each iteration: hot-reload priority file, fetch models, filter, build + submissions list, sort by priority rank, apply retry deprioritization, + throttle to per-listener SLURM limit, submit + 3. Sleep for check_interval_hours, then repeat + + Per-listener SLURM tracking: + _submitted_jobs tracks SLURM job IDs submitted by THIS listener instance. + Each iteration, completed jobs are pruned via squeue intersection. This + allows multiple listeners to run in parallel with independent job budgets. + """ + + def __init__(self, config: ListenerConfig): + self.config = config + self._submitted_jobs: Set[str] = set() # SLURM job IDs submitted by THIS listener + set_log_file(config.log_file) + + def run_iteration(self) -> int: + """ + Run one check iteration. 
    def run_iteration(self) -> int:
        """
        Run one check iteration.

        Pipeline: hot-reload priority/blacklist files -> fetch candidate
        models -> filter (blacklist, priority mode, HF existence) -> decide
        per (model, dataset) via should_start_job -> order (priority rank,
        retry deprioritization) -> throttle to the per-listener SLURM budget
        -> submit each pair via submit_eval.

        Returns:
            Number of jobs submitted (or would submit in dry-run mode)
        """
        # Hot-reload priority models from file (enables editing during long runs)
        if self.config.priority_file:
            new_priority = load_priority_models(self.config.priority_file)
            if new_priority != self.config.priority_models:
                log(f"Priority list reloaded: {len(new_priority)} model(s)")
                self.config.priority_models = new_priority

        # Hot-reload blacklist from file
        if self.config.blacklist_file:
            new_blacklist = load_blacklist(self.config.blacklist_file)
            if new_blacklist != self.config.blacklisted_models:
                log(f"Blacklist reloaded: {len(new_blacklist)} model(s)")
                self.config.blacklisted_models = new_blacklist

        log("Checking for new models...")

        # Optimization: in filter_only mode with a priority file, skip the
        # expensive fetch_recent_models() (which returns ALL models in the
        # lookback window) and only fetch priority models by name.
        if (self.config.priority_mode == "filter_only"
                and self.config.priority_models):
            models = fetch_priority_models(self.config.priority_models)
            log(f"Fetched {len(models)} priority model(s) directly (filter_only mode, skipped full scan).")
        else:
            models = fetch_recent_models(self.config.lookback_days)
            log(f"Found {len(models)} model(s) in lookback window.")

        # Priority models bypass lookback window.
        # Fetch priority models by name regardless of creation_time, then merge.
        if self.config.priority_models:
            priority_models_from_db = fetch_priority_models(self.config.priority_models)
            seen_ids = {str(m.get("id")) for m in models}
            added = 0
            for pm in priority_models_from_db:
                if str(pm.get("id")) not in seen_ids:
                    models.append(pm)
                    seen_ids.add(str(pm.get("id")))
                    added += 1
            if added:
                log(f"Added {added} priority model(s) outside lookback window.")

        log(f"Total {len(models)} model(s) to check. Filtering...")

        # Check if we should skip all models due to require_priority_list
        if not self.config.priority_models and self.config.require_priority_list:
            log("No priority list configured and --require-priority-list is set. Skipping all models.")
            return 0

        submissions: List[Tuple[str, str, str, Optional[str], str, Optional[str]]] = []
        # (model_id, hf_model_name, dataset_hf, benchmark_id, reason, slurm_job_id)

        # Track stats
        skipped_not_in_priority = 0
        skipped_hf_not_exists = 0

        # Resolve all benchmarks up front (once per loop)
        dataset_to_bench: Dict[str, Optional[str]] = {
            ds: resolve_benchmark_id(ds) for ds in self.config.datasets
        }

        # Precompute benchmark duplicate groups for cross-duplicate aggregation
        bench_dup_groups: Dict[str, List[str]] = {}
        for ds, bench_id in dataset_to_bench.items():
            if bench_id:
                bench_dup_groups[bench_id] = get_duplicate_group_ids('benchmarks', bench_id)

        for m in models:
            model_id = str(m.get("id"))
            if not model_id:
                continue

            hf_model = resolve_hf_model_name(m)
            if not hf_model:
                if self.config.verbose:
                    log(f"Skip: cannot resolve HF model for id={model_id}, name={m.get('name')}")
                continue

            # Blacklist check (overrides priority)
            if hf_model in self.config.blacklisted_models:
                if self.config.verbose:
                    log(f"Skip: model={hf_model} is blacklisted")
                continue

            # Priority handling depends on mode
            is_priority = bool(self.config.priority_models and hf_model in self.config.priority_models)

            if self.config.priority_mode == "filter_only":
                # Only evaluate models in the priority list
                if self.config.priority_models and not is_priority:
                    skipped_not_in_priority += 1
                    continue
                # priority_first: don't skip, just track is_priority for sorting

            # HuggingFace existence check
            if self.config.check_hf_exists:
                if not check_hf_model_exists(hf_model):
                    log(f"Skip: model not found on HuggingFace: {hf_model} (model_id={model_id})")
                    skipped_hf_not_exists += 1
                    continue

            # Compute model duplicate group for cross-duplicate aggregation
            model_dup_ids = get_duplicate_group_ids('models', model_id)

            for dataset_hf in self.config.datasets:
                bench_id = dataset_to_bench.get(dataset_hf)

                # Get benchmark duplicate group (precomputed above)
                bench_dup_ids = bench_dup_groups.get(bench_id) if bench_id else None

                # Check DB status to decide if we should start
                # (Enhancement 5: timeout-aware, cross-duplicate aggregation, config-aware dedup)
                if self.config.force:
                    should_start, reason, old_slurm_job_id = True, "forced", None
                else:
                    should_start, reason, old_slurm_job_id = should_start_job(
                        model_id, bench_id, self.config.stale_job_hours,
                        stale_pending_hours=self.config.stale_pending_hours,
                        timeout_aware=self.config.timeout_aware,
                        agent_name=self.config.agent_name,
                        timeout_multiplier=self.config.timeout_multiplier,
                        duplicate_model_ids=model_dup_ids,
                        duplicate_benchmark_ids=bench_dup_ids,
                        eval_config=self.config.eval_config if self.config.eval_config else None,
                    )

                if should_start:
                    submissions.append((model_id, hf_model, dataset_hf, bench_id, reason, old_slurm_job_id))
                elif self.config.verbose:
                    log(f"Skip: model={hf_model}, dataset={dataset_hf}, reason={reason}")

        # Log filtering stats
        if self.config.priority_mode == "filter_only" and self.config.priority_models and skipped_not_in_priority > 0:
            log(f"Skipped {skipped_not_in_priority} model(s) not in priority list")
        if self.config.check_hf_exists and skipped_hf_not_exists > 0:
            log(f"Skipped {skipped_hf_not_exists} model(s) not found on HuggingFace")

        if not submissions:
            log("No eligible (model, dataset) pairs to submit.")
            return 0

        # Sort submissions by priority file order (earlier in file = higher priority).
        # Models not in the priority list get lowest rank (submitted last).
        if self.config.priority_models:
            priority_rank = {m: i for i, m in enumerate(self.config.priority_models)}
            fallback_rank = len(self.config.priority_models)
            # s[1] is hf_model_name; sort is stable so ties keep fetch order.
            submissions.sort(key=lambda s: priority_rank.get(s[1], fallback_rank))
            if self.config.priority_mode == "priority_first":
                n_priority = sum(1 for s in submissions if s[1] in priority_rank)
                n_non_priority = len(submissions) - n_priority
                log(f"Priority-first ordering: {n_priority} priority + {n_non_priority} non-priority submissions")

        # Enhancement 4: Model retry tracking - deprioritize repeat failures
        # Deprioritized models are moved after all normal-priority models,
        # but preserve relative priority order within each tier.
        if self.config.track_model_retries:
            retry_counts = load_model_retry_counts(self.config.eval_starts_log)
            normal = []
            deprioritized = []
            for sub in submissions:
                model_hf = sub[1]
                count = retry_counts.get(model_hf, 0)
                if count >= self.config.model_retry_threshold:
                    deprioritized.append(sub)
                    log(f"WARNING: Model {model_hf} has been started {count} times "
                        f"(>= threshold {self.config.model_retry_threshold}), deprioritizing")
                else:
                    normal.append(sub)
            submissions = normal + deprioritized
            if deprioritized:
                log(f"Deprioritized {len(deprioritized)} model(s) due to excessive retries")

        prefix = "[DRY RUN] Would submit" if self.config.dry_run else "Submitting"
        log(f"{prefix} {len(submissions)} eval(s)...")

        # Enhancement 2: Per-listener SLURM job submission throttle.
        # Track which SLURM job IDs this listener submitted. Prune finished ones
        # via squeue, then cap new submissions at remaining slots.
        if not self.config.dry_run:
            active_ids = get_active_slurm_job_ids()
            # Prune jobs that are no longer in squeue (finished/failed/cancelled)
            still_active = self._submitted_jobs & active_ids
            finished = len(self._submitted_jobs) - len(still_active)
            self._submitted_jobs = still_active
            active_count = len(self._submitted_jobs)
            remaining_slots = self.config.max_jobs_submitted - active_count
            log(f"Listener SLURM jobs: {active_count} active "
                f"({finished} finished since last check), "
                f"{remaining_slots} slots available (max {self.config.max_jobs_submitted})")
            if remaining_slots <= 0:
                log(f"WARNING: At per-listener job limit "
                    f"({active_count}/{self.config.max_jobs_submitted}), "
                    f"skipping all submissions this iteration")
                return 0
            if len(submissions) > remaining_slots:
                log(f"Capping submissions from {len(submissions)} to {remaining_slots} "
                    f"(per-listener limit: {self.config.max_jobs_submitted})")
                submissions = submissions[:remaining_slots]

        # Create sbatch params from config
        sbatch_params = SbatchParams(
            n_concurrent=self.config.n_concurrent,
            n_attempts=self.config.n_attempts,
            gpu_memory_util=self.config.gpu_memory_util,
            error_threshold=self.config.error_threshold,
            vllm_max_retries=self.config.vllm_max_retries,
            agent_parser=self.config.agent_parser,
            slurm_time=self.config.slurm_time,
            enable_thinking=self.config.enable_thinking,
            agent_name=self.config.agent_name,
            slurm_partition=self.config.slurm_partition,
            tensor_parallel_size=self.config.tensor_parallel_size,
            upload_username=self.config.upload_username,
            eval_starts_log=self.config.eval_starts_log if self.config.track_model_retries else "",
            timeout_multiplier=self.config.timeout_multiplier,
            config_yaml=self.config.config_yaml,
            invalid_errors_log=self.config.invalid_errors_log,
            auto_snapshot=self.config.auto_snapshot,
            secrets_file=self.config.secrets_file,
            conda_env=self.config.conda_env,
        )

        # Load baseline model configs for per-model vLLM overrides
        baseline_configs = load_baseline_model_configs(self.config.baseline_model_configs)

        # Add harbor config env vars to sbatch params
        if self.config.harbor_config:
            # Will be merged in submit_eval via eval_config, but also pass path
            pass  # harbor config fields are passed via eval_config to submit_eval

        # Pre-download setup (for no-internet compute nodes)
        # NOTE(review): local import keeps huggingface_hub optional when
        # pre_download is disabled.
        if self.config.pre_download:
            from huggingface_hub import snapshot_download
            downloaded_models: set = set()

        # Sliding-window dependency tracking
        batch_size = self.config.batch_size
        all_job_ids: List[str] = []
        if batch_size and batch_size > 0:
            log(f"Using sliding-window batch-size={batch_size}: "
                f"first {batch_size} run immediately, rest chain one-by-one")

        submitted = 0
        for idx, (mid, hf_model, dataset_hf, bench_id, reason, old_slurm_job_id) in enumerate(submissions):

            # Pre-download this model before submitting (download-then-submit per model)
            # Uses the shared HF cache so compute nodes (no internet) find it via HF_HUB_OFFLINE=1
            if self.config.pre_download and hf_model not in downloaded_models:
                hf_cache = os.environ.get("HF_HUB_CACHE", os.path.expanduser("~/.cache/huggingface/hub"))
                log(f" Pre-downloading model {hf_model} to {hf_cache}...")
                try:
                    path = snapshot_download(repo_id=hf_model, repo_type="model", cache_dir=hf_cache)
                    log(f" Cached at {path}")
                except Exception as e:
                    # Best-effort: the sbatch job may still find the model elsewhere.
                    log(f" WARNING: Failed to download {hf_model}: {e}")
                # Marked attempted either way so a failing repo is not retried per-dataset.
                downloaded_models.add(hf_model)

            dry_prefix = "[DRY RUN] " if self.config.dry_run else ""
            prio_tag = " [PRIORITY]" if (self.config.priority_mode == "priority_first"
                                         and self.config.priority_models
                                         and hf_model in self.config.priority_models) else ""
            log(f"{dry_prefix}Submitting [{idx+1}/{len(submissions)}]: model={hf_model}, dataset={dataset_hf}, reason={reason}{prio_tag}")

            # Cancel stale Pending SLURM job before resubmission
            if reason.startswith("stale pending") and old_slurm_job_id:
                cancel_slurm_job(old_slurm_job_id, dry_run=self.config.dry_run)

            # Per-model vLLM overrides from baseline config mapping
            vllm_overrides = get_vllm_env_overrides(hf_model, baseline_configs)
            if vllm_overrides:
                log(f" Applying baseline model vLLM overrides: {list(vllm_overrides.keys())}", verbose_only=True)

            # Build sliding-window dependency: job N depends on job N-batch_size
            job_dependency: Optional[str] = None
            if batch_size and batch_size > 0 and idx >= batch_size:
                dep_job = all_job_ids[idx - batch_size]
                # Failed/dry-run placeholders cannot be depended on.
                if not dep_job.startswith("FAILED_") and not dep_job.startswith("DRY_"):
                    job_dependency = f"afterany:{dep_job}"
                    log(f" Depends on job {dep_job} (slot {idx - batch_size + 1})", verbose_only=True)

            slurm_job_id, job_name = submit_eval(
                hf_model,
                dataset_hf,
                bench_id,
                self.config.sbatch_script,
                sbatch_params=sbatch_params,
                dry_run=self.config.dry_run,
                upload_username=self.config.upload_username,
                timeout_multiplier=self.config.timeout_multiplier,
                vllm_overrides=vllm_overrides if vllm_overrides else None,
                dependency=job_dependency,
                eval_config=self.config.eval_config if self.config.eval_config else None,
            )

            if slurm_job_id:
                if self.config.dry_run:
                    log(f" -> Would submit as SLURM job (job_name={job_name})")
                    all_job_ids.append(f"DRY_{idx}")
                else:
                    log(f" -> Submitted as SLURM job {slurm_job_id} (job_name={job_name})")
                    self._submitted_jobs.add(slurm_job_id)
                    all_job_ids.append(slurm_job_id)
                # Counts dry-run would-submits too, per the docstring contract.
                submitted += 1
            else:
                log(f" -> Submission failed")
                all_job_ids.append(f"FAILED_{idx}")

            if not self.config.dry_run:
                # Brief pause between sbatch calls to avoid hammering the scheduler.
                time.sleep(1)

        return submitted
{self.config.stale_job_hours}h ago, " + f"restart+scancel if 'Pending' and submitted_at > {self.config.stale_pending_hours}h ago, " + f"skip if 'Finished'" + ) + log(f"Dry run mode: {self.config.dry_run}") + log(f"Run once mode: {self.config.run_once}") + log(f"Check HF exists: {self.config.check_hf_exists}") + log(f"Require priority list: {self.config.require_priority_list}") + + if self.config.priority_models: + mode_desc = "filter_only (skip non-priority)" if self.config.priority_mode == "filter_only" else "priority_first (all models, priority first)" + log(f"Priority mode: {mode_desc}, {len(self.config.priority_models)} model(s) in list") + if self.config.priority_file: + log(f"Priority file: {self.config.priority_file} (hot-reloaded each iteration)") + if self.config.verbose: + for m in sorted(self.config.priority_models): + log(f" - {m}") + else: + log("Priority: disabled (no priority file or empty)") + + if self.config.blacklisted_models: + log(f"Blacklist: {len(self.config.blacklisted_models)} model(s) from {self.config.blacklist_file}") + if self.config.verbose: + for m in sorted(self.config.blacklisted_models): + log(f" - {m}") + else: + log("Blacklist: disabled (no blacklist file or empty)") + + # Log sbatch parameters + sbatch_params = SbatchParams( + n_concurrent=self.config.n_concurrent, + n_attempts=self.config.n_attempts, + gpu_memory_util=self.config.gpu_memory_util, + error_threshold=self.config.error_threshold, + vllm_max_retries=self.config.vllm_max_retries, + agent_parser=self.config.agent_parser, + slurm_time=self.config.slurm_time, + enable_thinking=self.config.enable_thinking, + agent_name=self.config.agent_name, + slurm_partition=self.config.slurm_partition, + timeout_multiplier=self.config.timeout_multiplier, + config_yaml=self.config.config_yaml, + conda_env=self.config.conda_env, + ) + log(f"Sbatch params: {sbatch_params}") + + # Log v3 enhancement status + log(f"[v3] Max SLURM jobs per listener: {self.config.max_jobs_submitted}") + 
log(f"[v3] Daytona resource check: {'enabled' if self.config.check_daytona_resources else 'disabled'}") + log(f"[v3] Model retry tracking: {'enabled' if self.config.track_model_retries else 'disabled'}") + if self.config.track_model_retries: + log(f"[v3] Model retry threshold: {self.config.model_retry_threshold}") + log(f"[v3] Eval starts log: {self.config.eval_starts_log}") + log(f"[v3] Timeout-aware dedup: {'enabled' if self.config.timeout_aware else 'disabled'}") + if self.config.invalid_errors_log: + log(f"Invalid errors log: {self.config.invalid_errors_log}") + if self.config.timeout_multiplier != DEFAULT_TIMEOUT_MULTIPLIER: + log(f"[v3] Timeout multiplier: {self.config.timeout_multiplier}") + + # Enhancement 3: Daytona resource pre-flight check at startup + if self.config.check_daytona_resources: + ok = check_daytona_resources( + self.config.daytona_sandbox_limit, + self.config.daytona_warning_buffer, + ) + if not ok: + log("ERROR: Daytona resources at limit. Exiting.") + sys.exit(1) + + while True: + try: + # Enhancement 3: Optional per-iteration Daytona resource check + if self.config.check_daytona_resources: + ok = check_daytona_resources( + self.config.daytona_sandbox_limit, + self.config.daytona_warning_buffer, + ) + if not ok: + log("WARNING: Daytona resources at limit, skipping this iteration") + if self.config.run_once or self.config.dry_run: + break + hours = self.config.check_interval_hours + log(f"Sleeping for {hours} hours...\n") + time.sleep(self.config.check_interval_seconds) + continue + + self.run_iteration() + + # Exit after one iteration if requested + if self.config.run_once or self.config.dry_run: + mode = "DRY RUN" if self.config.dry_run else "ONCE" + log(f"[{mode}] Complete. Exiting after one iteration.") + break + + hours = self.config.check_interval_hours + log(f"Sleeping for {hours} hours...\n") + time.sleep(self.config.check_interval_seconds) + + except KeyboardInterrupt: + log("Interrupted by user. 
Exiting.") + sys.exit(0) + except Exception as e: + log(f"ERROR in main loop: {e}. Backing off 30s.") + time.sleep(30) + + +# ---------- CLI Argument Parsing ---------- +def parse_args() -> argparse.Namespace: + """Parse command line arguments.""" + parser = argparse.ArgumentParser( + description="Unified Eval Listener v4 - Run models on benchmark datasets", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +See the module docstring (top of file) for detailed flag reference with tuning +guidance. Quick summary below. + +Presets: aider, bfcl, swebench, v2, tb2, v1 + +v4 new: --blacklist-file PATH Block models from eval (overrides priority list) +v3 opt-in enhancements (all backward compatible): + --error-threshold N Unified invalid error threshold + --max-jobs-submitted N Per-listener SLURM job limit + --check-daytona-resources Daytona sandbox pre-flight check + --track-model-retries Deprioritize repeatedly-started models + --timeout-aware Dedup by model+benchmark+timeout_multiplier + +Examples: + python unified_eval_listener_v4.py --preset v2 \\ + --priority-file priority_models.txt + + python unified_eval_listener_v4.py --preset v2 \\ + --priority-file priority_models.txt \\ + --blacklist-file bad_models.txt + + python unified_eval_listener_v4.py --preset v2 --dry-run --once --verbose + """, + ) + + # Preset configuration + parser.add_argument( + "--preset", "-p", + choices=list(PRESETS.keys()), + help="Use a preset configuration (aider, bfcl, swebench, v2, tb2, v1)", + ) + + # Dataset configuration + parser.add_argument( + "--datasets", "-d", + help="Comma/space separated HF dataset repos (overrides preset)", + ) + parser.add_argument( + "--sbatch-script", "-s", + help="SBATCH script to use (overrides preset)", + ) + parser.add_argument( + "--log-file", + help="Log file path (default: auto-generated based on preset)", + ) + + # Timing configuration + parser.add_argument( + "--lookback-days", + type=int, + help=f"Days to look back for models 
(default: {DEFAULT_LOOKBACK_DAYS})", + ) + parser.add_argument( + "--check-hours", + type=float, + help=f"Hours between iterations (default: {DEFAULT_CHECK_HOURS})", + ) + parser.add_argument( + "--stale-hours", + type=int, + help=f"Hours before 'Started' job is stale (default: {DEFAULT_STALE_JOB_HOURS})", + ) + parser.add_argument( + "--stale-pending-hours", + type=int, + help=f"Hours before 'Pending' job is stale (default: {DEFAULT_STALE_PENDING_HOURS})", + ) + + # Priority filtering + parser.add_argument( + "--priority-file", + help="Path to priority models file (one model per line)", + ) + parser.add_argument( + "--require-priority-list", + action="store_true", + help="Skip all models when priority list is empty/missing", + ) + parser.add_argument( + "--blacklist-file", + help="Path to blacklisted models file (one model per line). " + "Models in this file are never submitted. Overrides priority list.", + ) + parser.add_argument( + "--priority-mode", + choices=["filter_only", "priority_first"], + help='Priority mode: "filter_only" (default) only evaluates priority models; ' + '"priority_first" evaluates all models but submits priority ones first', + ) + + # Validation options + parser.add_argument( + "--check-hf-exists", + action="store_true", + help="Validate model exists on HuggingFace before submit", + ) + + # Eval parameters (passed to sbatch via env vars) + parser.add_argument( + "--n-concurrent", + type=int, + help=f"Harbor concurrent jobs (default: {DEFAULT_N_CONCURRENT}, preset overrides)", + ) + parser.add_argument( + "--n-attempts", + type=int, + help=f"Retry attempts per task (default: {DEFAULT_N_ATTEMPTS})", + ) + parser.add_argument( + "--gpu-memory-util", + type=float, + help=f"VLLM GPU memory fraction (default: {DEFAULT_GPU_MEMORY_UTIL})", + ) + # Enhancement 1: Unified error threshold (with backward-compat alias) + parser.add_argument( + "--error-threshold", + type=int, + dest="error_threshold", + help=f"Max invalid errors before abort upload 
(default: {DEFAULT_ERROR_THRESHOLD})", + ) + parser.add_argument( + "--daytona-threshold", + type=int, + dest="error_threshold_compat", + help=f"Alias for --error-threshold (backward compat, default: {DEFAULT_ERROR_THRESHOLD})", + ) + parser.add_argument( + "--vllm-max-retries", + type=int, + help=f"VLLM startup retries (default: {DEFAULT_VLLM_MAX_RETRIES})", + ) + parser.add_argument( + "--agent-parser", + help=f"Agent parser type (default: \"{DEFAULT_AGENT_PARSER}\", use \"xml\" for swebench)", + ) + parser.add_argument( + "--slurm-time", + help=f"SLURM time limit (default: \"{DEFAULT_SLURM_TIME}\")", + ) + parser.add_argument( + "--agent-name", + help=f"Agent name for harbor and DB entries (default: \"{DEFAULT_AGENT_NAME}\")", + ) + parser.add_argument( + "--slurm-partition", + help=f"SLURM partition (default: \"{DEFAULT_SLURM_PARTITION}\")", + ) + parser.add_argument( + "--tensor-parallel-size", "--tp-size", + type=int, + dest="tensor_parallel_size", + help=f"vLLM tensor parallel size for all models. Also sets GPU count per job " + f"(default: {DEFAULT_TENSOR_PARALLEL_SIZE}). Per-model overrides from " + f"--baseline-model-configs take precedence.", + ) + parser.add_argument( + "--enable-thinking", + action="store_true", + help="Enable thinking blocks for model inference (default: False)", + ) + parser.add_argument( + "--upload-username", + help="Username for DB entries and result uploads (default: current OS user)", + ) + + # v3 Enhancement 2: Per-listener SLURM job throttle + parser.add_argument( + "--max-jobs-submitted", + type=int, + help=f"Per-listener SLURM job limit. Each listener tracks its own " + f"submitted jobs independently (default: {DEFAULT_MAX_JOBS_SUBMITTED})", + ) + + # v3 Enhancement 3: Daytona resource pre-flight check + parser.add_argument( + "--check-daytona-resources", + action="store_true", + help="Query Daytona API for active sandbox count; skip if at limit. 
" + "Requires DAYTONA_API_KEY in env", + ) + parser.add_argument( + "--daytona-sandbox-limit", + type=int, + help=f"Max expected active sandboxes (default: {DEFAULT_DAYTONA_SANDBOX_LIMIT})", + ) + parser.add_argument( + "--daytona-warning-buffer", + type=float, + help=f"Warn when active sandboxes reach this fraction of limit " + f"(default: {DEFAULT_DAYTONA_WARNING_BUFFER})", + ) + + # v3 Enhancement 4: Model retry tracking + parser.add_argument( + "--track-model-retries", + action="store_true", + help="Deprioritize models that have been started too many times " + "(moved to end of submission queue)", + ) + parser.add_argument( + "--model-retry-threshold", + type=int, + help=f"Starts before a model is deprioritized (default: {DEFAULT_MODEL_RETRY_THRESHOLD})", + ) + parser.add_argument( + "--eval-starts-log", + help="Path to shared eval starts log. Sbatch appends on start, listener " + "reads to count retries (default: auto-generated with benchmark+time suffix)", + ) + + # v3 Enhancement 5: Timeout-config-sensitive dedup + parser.add_argument( + "--timeout-multiplier", + type=float, + help=f"Harbor timeout multiplier, stored in DB job config " + f"(default: {DEFAULT_TIMEOUT_MULTIPLIER})", + ) + parser.add_argument( + "--timeout-aware", + action="store_true", + help="Dedup jobs by model+benchmark+agent+timeout_multiplier instead " + "of just model+benchmark. 
Allows same model with different configs", + ) + + # Baseline model configs (per-model vLLM overrides) + parser.add_argument( + "--baseline-model-configs", + help="Path to YAML mapping baseline models to vLLM serving params " + "(e.g., eval/baseline_model_configs.yaml)", + ) + + # Harbor config + parser.add_argument( + "--harbor-config", + help="Path to Harbor YAML config (parsed for timeout_multiplier, " + "resource overrides; passed as EVAL_HARBOR_CONFIG to sbatch)", + ) + + # Pre-download model weights + parser.add_argument( + "--pre-download", + action="store_true", + help="Pre-download all model weights on login node before submitting jobs. " + "Essential for no-internet compute nodes (Leonardo, Jupiter).", + ) + + # Sliding-window batch dependencies + parser.add_argument( + "--batch-size", + type=int, + help="Max concurrent jobs via sliding-window SLURM dependencies. " + "Job N depends on job N-batch_size finishing (afterany), " + "so at most batch-size jobs run at once.", + ) + + parser.add_argument( + "--auto-snapshot", + action="store_true", + help="Enable Daytona auto_snapshot for environments (default: disabled)", + ) + parser.add_argument( + "--secrets-file", + type=str, + default="", + help="Path to secrets.env file for sbatch jobs (default: ~/secrets.env)", + ) + parser.add_argument( + "--conda-env", + type=str, + default="", + help="Conda env name for sbatch (default: otagent, use otagent2 for Qwen3.5+)", + ) + + # Execution mode + parser.add_argument( + "--dry-run", + action="store_true", + help="Preview mode, no actual submission (implies --once)", + ) + parser.add_argument( + "--once", + action="store_true", + help="Run single iteration and exit", + ) + parser.add_argument( + "--verbose", "-v", + action="store_true", + help="Enable verbose logging", + ) + parser.add_argument( + "--force", + action="store_true", + help="Skip dedup checks — submit even if model+benchmark combo already exists in DB", + ) + + return parser.parse_args() + + +def 
_env_bool(name: str) -> bool: + """Get boolean from environment variable.""" + return os.getenv(name, "").lower() in ("1", "true", "yes") + + +def build_config(args: argparse.Namespace) -> ListenerConfig: + """Build configuration from args, env vars, and preset defaults.""" + + # Start with preset if specified + preset_config: Dict = {} + if args.preset: + preset_config = PRESETS.get(args.preset, {}) + + # Resolve datasets: CLI > ENV > Preset + datasets_str = args.datasets or os.getenv("EVAL_LISTENER_DATASETS") or "" + if datasets_str: + datasets = parse_datasets(datasets_str) + else: + datasets = preset_config.get("datasets", []) + + if not datasets: + print("ERROR: No datasets specified. Use --datasets, EVAL_LISTENER_DATASETS, or --preset") + sys.exit(2) + + # Resolve sbatch script: CLI > ENV > Preset > Default + sbatch_script = ( + args.sbatch_script + or os.getenv("EVAL_LISTENER_SBATCH") + or preset_config.get("sbatch_script") + or DEFAULT_SBATCH_SCRIPT + ) + + # Resolve timing: CLI > ENV > Default + lookback_days = ( + args.lookback_days + if args.lookback_days is not None + else int(os.getenv("EVAL_LISTENER_LOOKBACK_DAYS", str(DEFAULT_LOOKBACK_DAYS))) + ) + check_hours = ( + args.check_hours + if args.check_hours is not None + else float(os.getenv("EVAL_LISTENER_CHECK_HOURS", str(DEFAULT_CHECK_HOURS))) + ) + stale_hours = args.stale_hours if args.stale_hours is not None else DEFAULT_STALE_JOB_HOURS + stale_pending_hours = args.stale_pending_hours if args.stale_pending_hours is not None else DEFAULT_STALE_PENDING_HOURS + + # Resolve log file + log_dir = Path(os.getenv("EVAL_LISTENER_LOG_DIR", DEFAULT_LOG_DIR)) + log_dir.mkdir(parents=True, exist_ok=True) + + suffix = preset_config.get("log_suffix", "unified") + current_time = datetime.now().strftime("%Y%m%d_%H%M%S") + + if args.log_file: + log_file = Path(args.log_file) + else: + log_file = log_dir / f"{suffix}_eval_listener_v4_{current_time}.log" + + # Resolve priority file: CLI > ENV + priority_file = 
args.priority_file or os.getenv("EVAL_LISTENER_PRIORITY_FILE") + priority_models = load_priority_models(priority_file) + + # Resolve blacklist file: CLI > ENV + blacklist_file = args.blacklist_file or os.getenv("EVAL_LISTENER_BLACKLIST_FILE") + blacklisted_models = load_blacklist(blacklist_file) + + # Resolve priority mode: CLI > ENV > default + priority_mode = ( + args.priority_mode + or os.getenv("EVAL_LISTENER_PRIORITY_MODE") + or "filter_only" + ) + + # Resolve boolean flags: CLI > ENV > Preset + require_priority = args.require_priority_list or _env_bool("EVAL_LISTENER_REQUIRE_PRIORITY_LIST") + dry_run = args.dry_run or _env_bool("EVAL_LISTENER_DRY_RUN") + check_hf_exists = ( + args.check_hf_exists + or _env_bool("EVAL_LISTENER_CHECK_HF_EXISTS") + or preset_config.get("check_hf_exists", False) + ) + + # Resolve sbatch parameters: CLI > Preset > Default + # Helper to get value with priority: CLI arg > Preset > Default + def _resolve(cli_val, preset_key: str, default): + if cli_val is not None: + return cli_val + return preset_config.get(preset_key, default) + + n_concurrent = _resolve(args.n_concurrent, "n_concurrent", DEFAULT_N_CONCURRENT) + n_attempts = _resolve(args.n_attempts, "n_attempts", DEFAULT_N_ATTEMPTS) + gpu_memory_util = _resolve(args.gpu_memory_util, "gpu_memory_util", DEFAULT_GPU_MEMORY_UTIL) + + # Enhancement 1: Resolve error_threshold with backward compat + # Priority: --error-threshold > --daytona-threshold > preset (error_threshold key) > default + error_threshold_cli = args.error_threshold + if error_threshold_cli is None: + error_threshold_cli = getattr(args, 'error_threshold_compat', None) + error_threshold = _resolve(error_threshold_cli, "error_threshold", DEFAULT_ERROR_THRESHOLD) + + vllm_max_retries = _resolve(args.vllm_max_retries, "vllm_max_retries", DEFAULT_VLLM_MAX_RETRIES) + agent_parser = _resolve(args.agent_parser, "agent_parser", DEFAULT_AGENT_PARSER) + slurm_time = _resolve(args.slurm_time, "slurm_time", DEFAULT_SLURM_TIME) + 
agent_name = _resolve(args.agent_name, "agent_name", DEFAULT_AGENT_NAME) + slurm_partition = _resolve(args.slurm_partition, "slurm_partition", DEFAULT_SLURM_PARTITION) + tensor_parallel_size = _resolve(args.tensor_parallel_size, "tensor_parallel_size", DEFAULT_TENSOR_PARALLEL_SIZE) + # enable_thinking: CLI flag > Preset > Default (CLI is action="store_true" so check explicitly) + enable_thinking = args.enable_thinking or preset_config.get("enable_thinking", DEFAULT_ENABLE_THINKING) + + # Resolve upload_username: CLI > ENV > current OS user + upload_username = ( + args.upload_username + or os.getenv("EVAL_UPLOAD_USERNAME") + or getpass.getuser() + ) + + # Enhancement 2: SLURM throttle + max_jobs_submitted = ( + args.max_jobs_submitted + if args.max_jobs_submitted is not None + else int(os.getenv("EVAL_LISTENER_MAX_JOBS", str(DEFAULT_MAX_JOBS_SUBMITTED))) + ) + + # Enhancement 3: Daytona resource check + check_daytona = args.check_daytona_resources + daytona_sandbox_limit = ( + args.daytona_sandbox_limit + if args.daytona_sandbox_limit is not None + else DEFAULT_DAYTONA_SANDBOX_LIMIT + ) + daytona_warning_buffer = ( + args.daytona_warning_buffer + if args.daytona_warning_buffer is not None + else DEFAULT_DAYTONA_WARNING_BUFFER + ) + + # Enhancement 4: Model retry tracking + track_model_retries = args.track_model_retries + model_retry_threshold = ( + args.model_retry_threshold + if args.model_retry_threshold is not None + else DEFAULT_MODEL_RETRY_THRESHOLD + ) + if args.eval_starts_log: + eval_starts_log = args.eval_starts_log + else: + # Default: benchmark+time specific, matching listener log naming pattern + eval_starts_log = str(log_dir / f"{suffix}_eval_starts_{current_time}.log") + + # Per-listener invalid errors log (matching listener log naming pattern) + invalid_errors_log = str(log_dir / f"{suffix}_invalid_errors_{current_time}.log") + + # Enhancement 5: Timeout-config-sensitive dedup + timeout_multiplier = ( + args.timeout_multiplier + if 
args.timeout_multiplier is not None + else DEFAULT_TIMEOUT_MULTIPLIER + ) + timeout_aware = args.timeout_aware + + # Config YAML: Preset > Default + config_yaml = preset_config.get("config_yaml", "dcagent_eval_config.yaml") + + # Harbor config (parse eval-relevant fields for config-aware dedup) + harbor_config = args.harbor_config or preset_config.get("harbor_config") + eval_config = parse_harbor_eval_config(harbor_config) + + # Baseline model configs for per-model vLLM overrides + baseline_model_configs_path = args.baseline_model_configs + + # Pre-download model weights + pre_download = args.pre_download + + # Sliding-window batch dependencies + batch_size = args.batch_size + + # Auto snapshot + auto_snapshot = args.auto_snapshot or preset_config.get("auto_snapshot", False) + + # Secrets file + secrets_file = args.secrets_file or preset_config.get("secrets_file", "") + + # Conda env + conda_env = args.conda_env or preset_config.get("conda_env", "") + + return ListenerConfig( + datasets=datasets, + sbatch_script=sbatch_script, + log_file=log_file, + lookback_days=lookback_days, + check_interval_hours=check_hours, + stale_job_hours=stale_hours, + stale_pending_hours=stale_pending_hours, + priority_file=priority_file, + require_priority_list=require_priority, + priority_models=priority_models, + priority_mode=priority_mode, + check_hf_exists=check_hf_exists, + dry_run=dry_run, + run_once=args.once, + verbose=args.verbose, + # Sbatch parameters + n_concurrent=n_concurrent, + n_attempts=n_attempts, + gpu_memory_util=gpu_memory_util, + error_threshold=error_threshold, + vllm_max_retries=vllm_max_retries, + agent_parser=agent_parser, + slurm_time=slurm_time, + enable_thinking=enable_thinking, + agent_name=agent_name, + slurm_partition=slurm_partition, + tensor_parallel_size=tensor_parallel_size, + upload_username=upload_username, + # Enhancement 2 + max_jobs_submitted=max_jobs_submitted, + # Enhancement 3 + check_daytona_resources=check_daytona, + 
daytona_sandbox_limit=daytona_sandbox_limit, + daytona_warning_buffer=daytona_warning_buffer, + # Enhancement 4 + track_model_retries=track_model_retries, + model_retry_threshold=model_retry_threshold, + eval_starts_log=eval_starts_log, + # Enhancement 5 + timeout_multiplier=timeout_multiplier, + timeout_aware=timeout_aware, + config_yaml=config_yaml, + blacklist_file=blacklist_file, + blacklisted_models=blacklisted_models, + # New features + baseline_model_configs=baseline_model_configs_path, + harbor_config=harbor_config, + eval_config=eval_config, + pre_download=pre_download, + batch_size=batch_size, + invalid_errors_log=invalid_errors_log, + auto_snapshot=auto_snapshot, + secrets_file=secrets_file, + conda_env=conda_env, + force=args.force, + ) + + +# ---------- Main ---------- +def main() -> None: + global _VERBOSE + _load_secrets() + args = parse_args() + config = build_config(args) + _VERBOSE = config.verbose + listener = EvalListener(config) + listener.run() + + +if __name__ == "__main__": + main() diff --git a/eval/lists/16x_32b_lc_baseline.txt b/eval/lists/16x_32b_lc_baseline.txt new file mode 100644 index 00000000..b70e4288 --- /dev/null +++ b/eval/lists/16x_32b_lc_baseline.txt @@ -0,0 +1,8 @@ +nvidia/Nemotron-Terminal-32B +allenai/SERA-32B +laion/rl__40GPU_base_32b__exp_rpt_nemotron-bash__sft_GLM-4-7-swesmith +laion/sft_r2egym-nl2bash-stackoverflow-inferredbugs-32B_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_adam-beta1_0-95_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_warmup-ratio_0-01_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_num-train-epochs_7.0_Qwen3-32B +laion/alfworld-swesmith-r2egym-swegym-131k-32B-lc \ No newline at end of file diff --git a/eval/lists/a1_models.txt b/eval/lists/a1_models.txt new file mode 100644 index 00000000..8fd5976a --- /dev/null +++ b/eval/lists/a1_models.txt @@ -0,0 +1,77 @@ +DCAgent/a1-stack_rspec +DCAgent/a1-stack_go 
+DCAgent/a1-self_instruct_naive +DCAgent/a1-nemotron_bash_withtests_gpt5mini +DCAgent/a1-nemotron_bash_withtests +DCAgent/a1-inferredbugs +DCAgent/a1-codeforces +DCAgent/a1-code_contests +DCAgent/a1-bash_textbook +DCAgent/a1-swegym_openhands +DCAgent/a1-stack_bash_withtests_gpt5mini +DCAgent/a1-orca_agentinstruct +DCAgent/a1-nnetnav_live +DCAgent/a1-nebius_swe_agent +DCAgent/a1-mind2web +DCAgent/a1-go_browse_wa +DCAgent/a1-codeactinstruct +DCAgent/a1-agenttuning_alfworld +DCAgent/a1-stack_pytest_gpt5mini +DCAgent/a1-nemotron_pytest +DCAgent/a1-glaive_code_assistant +DCAgent/a1-wizardlm_orca +DCAgent/a1-nemo_prism_math +DCAgent/a1-tulu3_sft_personas_math +DCAgent/a1-swesmith +DCAgent/a1-r2egym +DCAgent/a1-magicoder +DCAgent/a1-ghactions +DCAgent/a1-freelancer +DCAgent/a1-codeelo +DCAgent/a1-bugswarm +DCAgent/a1-stackexchange_unix +DCAgent/a1-stackexchange_tor +DCAgent/a1-stackexchange_superuser +DCAgent/a1-stack_pytest_withtests +DCAgent/a1-stack_pytest_synthetic_gpt5nano +DCAgent/a1-stack_phpunit +DCAgent/a1-pymethods2test +DCAgent/a1-defects4j +DCAgent/a1-curriculum_medium +DCAgent/a1-curriculum_hard +DCAgent/a1-curriculum_easy +DCAgent/a1-code_feedback +DCAgent/a1-agenttuning_webshop +DCAgent/a1-agenttuning_os +DCAgent/a1-agenttuning_mind2web +DCAgent/a1-agenttuning_db +DCAgent/a1-agenttuning_kg +DCAgent/a1-taskmaster2 +DCAgent/a1-stack_bash +DCAgent/a1-repo_scaffold +DCAgent/a1-pr_mining +DCAgent/a1-nemotron_junit +DCAgent/a1-nemotron_cpp +DCAgent/a1-nemotron_bash +DCAgent/a1-manybugs +DCAgent/a1-issue_tasks +DCAgent/a1-codenet_python +DCAgent/a1-bugsinpy +DCAgent/a1-multifile_composition +DCAgent/a1-exercism_python +DCAgent/a1-crosscodeeval_typescript +DCAgent/a1-crosscodeeval_python +DCAgent/a1-crosscodeeval_java +DCAgent/a1-taco +DCAgent/a1-staqc +DCAgent/a1-stackexchange_tezos +DCAgent/a1-stackexchange_overflow +DCAgent/a1-stack_rust +DCAgent/a1-stack_ruby +DCAgent/a1-stack_pytest +DCAgent/a1-stack_junit +DCAgent/a1-stack_jest +DCAgent/a1-stack_csharp 
+DCAgent/a1-stack_cpp +DCAgent/a1-stack_bash_withtests +DCAgent/a1-crosscodeeval_csharp \ No newline at end of file diff --git a/eval/lists/a1_nl2bash.txt b/eval/lists/a1_nl2bash.txt new file mode 100644 index 00000000..1069eaa1 --- /dev/null +++ b/eval/lists/a1_nl2bash.txt @@ -0,0 +1 @@ +DCAgent/a1-nl2bash diff --git a/eval/lists/a1_retrained.txt b/eval/lists/a1_retrained.txt new file mode 100644 index 00000000..be82d222 --- /dev/null +++ b/eval/lists/a1_retrained.txt @@ -0,0 +1,4 @@ +DCAgent/a1-bugswarm +DCAgent/a1-codeelo +DCAgent/a1-ghactions +DCAgent/a1-magicoder diff --git a/eval/lists/alfworld_131k.txt b/eval/lists/alfworld_131k.txt new file mode 100644 index 00000000..2f77176c --- /dev/null +++ b/eval/lists/alfworld_131k.txt @@ -0,0 +1 @@ +laion/alfworld-swesmith-r2egym-swegym-131k-lc diff --git a/eval/lists/architecture_invalid_test_model.txt b/eval/lists/architecture_invalid_test_model.txt new file mode 100644 index 00000000..13a9cb14 --- /dev/null +++ b/eval/lists/architecture_invalid_test_model.txt @@ -0,0 +1 @@ +DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step30 \ No newline at end of file diff --git a/eval/lists/baseline_swe.txt b/eval/lists/baseline_swe.txt new file mode 100644 index 00000000..6df89642 --- /dev/null +++ b/eval/lists/baseline_swe.txt @@ -0,0 +1,10 @@ +open-thoughts/OpenThinker-Agent-v1 +camel-ai/seta-rl-qwen3-8b +allenai/SERA-14B +nvidia/Nemotron-Terminal-32B +nvidia/Nemotron-Terminal-14B +nvidia/Nemotron-Terminal-8B +obiwan96/qwen3-8b-openthinker-sft-endless-terminals +nvidia/Llama-3.1-Nemotron-Nano-8B-v1 +deepseek-ai/DeepSeek-R1-Distill-Qwen-7B +open-thoughts/OpenThinker3-7B \ No newline at end of file diff --git a/eval/lists/custom_force_run.txt b/eval/lists/custom_force_run.txt new file mode 100644 index 00000000..15acb74c --- /dev/null +++ b/eval/lists/custom_force_run.txt @@ -0,0 +1 @@ +nvidia/Llama-3.1-Nemotron-Nano-8B-v1 diff --git a/eval/lists/dp_test_model.txt 
b/eval/lists/dp_test_model.txt new file mode 100644 index 00000000..01b63ba7 --- /dev/null +++ b/eval/lists/dp_test_model.txt @@ -0,0 +1 @@ +DCAgent/a1-stack_jest \ No newline at end of file diff --git a/eval/lists/exp_tas_qwen35.txt b/eval/lists/exp_tas_qwen35.txt new file mode 100644 index 00000000..3ddf9ad1 --- /dev/null +++ b/eval/lists/exp_tas_qwen35.txt @@ -0,0 +1 @@ +laion/exp_tas_optimal_combined_traces-Qwen3.5-9B diff --git a/eval/lists/glm46_131k.txt b/eval/lists/glm46_131k.txt new file mode 100644 index 00000000..8f37fa74 --- /dev/null +++ b/eval/lists/glm46_131k.txt @@ -0,0 +1 @@ +laion/glm46-swesmith-maxeps-131k-lc diff --git a/eval/lists/glm47_flash.txt b/eval/lists/glm47_flash.txt new file mode 100644 index 00000000..eae1439b --- /dev/null +++ b/eval/lists/glm47_flash.txt @@ -0,0 +1 @@ +zai-org/GLM-4.7-Flash diff --git a/eval/lists/inactive_models_latest.txt b/eval/lists/inactive_models_latest.txt new file mode 100644 index 00000000..05e4667d --- /dev/null +++ b/eval/lists/inactive_models_latest.txt @@ -0,0 +1,457 @@ +allenai/SERA-14B +allenai/SERA-32B +allenai/SERA-8B +bespokelabs/Qwen3-8B-ot_step100 +bespokelabs/Qwen3-8B-ot_step60_high +camel-ai/seta-rl-qwen3-8b +claude-haiku-4-5-20251001 +DCAgent/All_Puzzles_5k_new_context +DCAgent/All_Puzzles_5k_New_Context_GPT4o-mini_new_context +DCAgent/All_Puzzles_5k_OG_5k_New_Context_GPT4o-mini_new_context +DCAgent/all-puzzles-sandboxes-traces-terminus-2-with-gpt-4o-mini-judgments-correct +DCAgent/bash_textbook_tasks_traces +DCAgent/code_contests_10k_OG_10k_New_Questions_GPT5-mini_Qwen3-8B +DCAgent/code_contests_new_questions_gpt-5-mini_Qwen3-8B +DCAgent/code_contests-GLM-4.6-traces_Qwen3-8B +DCAgent/code_contests-Qwen3-Coder-480B-traces +DCAgent/code-contests-sandboxes-traces-terminus-2_new_hparams_11_10_25_Qwen3-8B +DCAgent/codeforces-gptoss120b-traces +DCAgent/exp_tas_max_tokens_1024_traces +DCAgent/freelancer-askllm-filtered-sandboxes-traces-terminus-2_Qwen3-8B 
+DCAgent/freelancer-embedding-mean-instruction-filter_Qwen3-8B +DCAgent/freelancer-long-instruction-filter_Qwen3-8B +DCAgent/freelancer-projects-0-1k-traces +DCAgent/freelancer-projects-0-3k-traces +DCAgent/freelancer-projects-100k-traces_Qwen3-8B +DCAgent/freelancer-projects-10k-traces +DCAgent/freelancer-projects-1k-traces +DCAgent/freelancer-projects-3k-traces +DCAgent/freelancer-projects-gpt5_Qwen3-8B +DCAgent/freelancer-projects-gpt5mini +DCAgent/freelancer-random-instruction-filter-traces-terminus-2_Qwen3-8B +DCAgent/freelancer-short-instruction-filter_Qwen3-8B +DCAgent/freelancer-t1024s-32ep_Qwen3-8B +DCAgent/freelancer-t2048s-32ep_Qwen3-8B +DCAgent/freelancer-t256s-32ep_Qwen3-8B +DCAgent/freelancer-t512s-32ep_Qwen3-8B +DCAgent/hr1_code-contests-sandboxes-with-tests-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step_60 +DCAgent/hr1_wfc_nl2bash-bs_Q3-8B-mE32-aT-dS-120325hbr_step_20 +DCAgent/hr1_wfc_nl2bash-bs_Q3-8B-mE32-aT-dS-120325hbr_step_40 +DCAgent/hr1_wfc_nl2bash-bs_Q3-8B-mE32-aT-dS-120325hbr_step_64 +DCAgent/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112725harbor_step20 +DCAgent/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112725harbor_step40 +DCAgent/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112725harbor_step64 +DCAgent/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step20 +DCAgent/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step40 +DCAgent/neulab-codeactinstruct-sandboxes-traces-terminus-2_Qwen3-8B +DCAgent/neulab-mind2web-sandboxes-traces-terminus-2_Qwen3-8B +DCAgent/neulab-synatra-sandboxes-traces-terminus-2_Qwen3-8B +DCAgent/nl2bash-GLM-4.6-traces_Qwen3-8B +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step25 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step50 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step73 
+DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step_73 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step20 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step40 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120225harbor_step_25 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120225harbor_step_50 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120225harbor_step_73 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120325harbor_step_25 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120325harbor_step_50 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120325harbor_step_73 +DCAgent/nl2bash-Qwen3-Coder-480B-traces_Qwen3-8B +DCAgent/nl2bashG5CP-nl2bash-bs_Q3-8B-mE32-aT-dS-120325hbr_step_20 +DCAgent/nl2bashG5CP-nl2bash-bs_Q3-8B-mE32-aT-dS-120325hbr_step_34 +DCAgent/nl2bashGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step20 +DCAgent/nl2bashGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step34 +DCAgent/nl2bashGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step20 +DCAgent/r2egymG5CP-nl2bash-bs_Q3-8B-mE24-aT-dS-120325hbr_step_25 +DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step_55 +DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step_80 +DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step20 +DCAgent/staqc-ot3-100k-code-subset-traces-terminus-2_Qwen3-8B +DCAgent/staqc-ot3-100k-math-subset-traces-terminus-2_save-strategy_steps_Qwen3-8B +DCAgent/staqc-ot3-100k-science-subset-traces-terminus-2_Qwen3-8B +DCAgent/staqc-ot3-100k-traces-terminus-2_Qwen3-8B +DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-1-7B +DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-14B +DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-32B 
+DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-4B +DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-4B-Instruct-2507 +DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-4B-Thinking-2507 +DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-8B-Base +DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-Coder-30B-A3B-Instruct +DCAgent/taskmaster2-0-1k-traces +DCAgent/taskmaster2-0-3k-traces +DCAgent/taskmaster2-10k-traces +DCAgent/taskmaster2-16ep +DCAgent/taskmaster2-1ep +DCAgent/taskmaster2-1k-traces +DCAgent/taskmaster2-2ep +DCAgent/taskmaster2-32ep +DCAgent/taskmaster2-3k-traces +DCAgent/taskmaster2-4ep +DCAgent/taskmaster2-64ep +DCAgent/taskmaster2-8ep +DCAgent/taskmaster2-banana +DCAgent/taskmaster2-gpt5mini +DCAgent/taskmaster2-gpt5mini_global-batch-size_16 +DCAgent/tbench_oracle_solutions_terminus +DCAgent/test_sft_qwen3_32k_base_NO_EVAL_Qwen3-8B +DCAgent/wikitable_format_conversion-qwen3-coder-480b-a35b-instruct-awq-traces +DCAgent2/bugs-nl2bashseq +DCAgent2/bugs-stack-nl2bashseq +DCAgent2/bugs-swesmith-over5050 +DCAgent2/codeforces-GLM-4_6-traces-32ep-32k-1-2-4-dv +DCAgent2/freelancer-projects-100k-traces +DCAgent2/freelancer-projects-31k-traces +DCAgent2/freelancer-t256s-32ep_hub-model-id_freelancer-t256s-32ep-restore-hp_Qwen3-8B +DCAgent2/gemini25flash-stackexchange-overflow-32ep-512k-v3-traces +DCAgent2/GLM-4_6-codeforces-32eps-32k-fixeps +DCAgent2/glm-4_6-freelancer-traces +DCAgent2/glm-4_6-freelancer-traces-pm +DCAgent2/GLM-4_6-nl2bash-verified-32eps-32k-fixeps +DCAgent2/GLM-4_6-stackexchange-overflow-sandboxes-32eps-32k +DCAgent2/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112725harbor +DCAgent2/inferredbugs-GLM-4_6-32ep-32k +DCAgent2/inferredbugs-GLM-4_6-32ep-65k +DCAgent2/neulab-code-feedback-sandboxes-traces-terminus-2_hub-model-id_neulab-code-feedback-restore-hp_Qw +DCAgent2/neulab-mind2web-sandboxes-traces-terminus-2_hub-model-id_neulab-mind2web-restore-hp_Qwen3-8B 
+DCAgent2/neulab-synatra-sandboxes-traces-terminus-2_hub-model-id_neulab-synatra-restore-hp_Qwen3-8B +DCAgent2/nl2bash-bugs-over5050 +DCAgent2/nl2bash-bugs-undr3070 +DCAgent2/nl2bash-bugsseq +DCAgent2/nl2bash-bugsshuffle +DCAgent2/nl2bash-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor +DCAgent2/nl2bash-stack-bugs-over333 +DCAgent2/nl2bash-stack-bugs-undr203050 +DCAgent2/nl2bash-stack-bugs-undr503020 +DCAgent2/nl2bash-stack-over5050 +DCAgent2/nl2bash-stack-undr3070 +DCAgent2/nl2bash-stack-undr7030 +DCAgent2/nl2bash-stackseq +DCAgent2/nl2bash-stackshuffle +DCAgent2/nl2bash-swesmith-reason +DCAgent2/nl2bash-swesmithseq +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-5epochs +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-5epochs-lr1e-3 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-5epochs-lr1e-4 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-5epochs-lr1e-5 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-5epochs-lr1e-6 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-7epochs +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-8epochs +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-ab2095 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-ab2098 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-ab2099 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-ab20998 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-mgn5e2 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wd0 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wd002 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wd004 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wd006 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wd008 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wr0 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wr0005 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wr0010 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wr0150 +DCAgent2/nl2bash-verified-GLM-4.6-traces-32ep-32k_global-batch-size_128_Qwen3-8B 
+DCAgent2/nl2bash-verified-GLM-4.6-traces-32ep-32k_global-batch-size_32_Qwen3-8B +DCAgent2/nl2bash-verified-GLM-4.6-traces-32ep-32k_global-batch-size_64_Qwen3-8B +DCAgent2/nl2bash-verified-GLM-4.6-traces-32ep-32k-ab1085_Qwen3-8B +DCAgent2/nl2bash-verified-GLM-4.6-traces-32ep-32k-ab1089_Qwen3-8B +DCAgent2/nl2bash-verified-GLM-4.6-traces-32ep-32k-ab1093_Qwen3-8B +DCAgent2/nl2bash-verified-GLM-4.6-traces-32ep-32k-ab1097_Qwen3-8B +DCAgent2/nl2bashGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor +DCAgent2/stack-bugs-over5050 +DCAgent2/stack-bugs-undr3070 +DCAgent2/stack-bugs-undr7030 +DCAgent2/stack-bugsseq +DCAgent2/stack-bugsshuffle +DCAgent2/stack-nl2bashseq +DCAgent2/stack-swesmithseq +DCAgent2/swesmith-nl2bashseq +DCAgent2/swesmith-stack-undr7030 +DCAgent2/taskmaster2-1ep_hub-model-id_taskmaster2-1ep-2stage-restore-hp_freelancer-projects-sandboxes-tra +DCAgent2/taskmaster2-8ep_hub-model-id_taskmaster2-8ep-2stage-restore-hp_freelancer-projects-sandboxes-tra +DCAgent2/taskmaster2-GLM-4_6-32ep-32k +DCAgent2/test2-tbench-dev-71-qwen3-8b-8nodes-sync +deepseek-ai/DeepSeek-R1-Distill-Qwen-7B +gemini-2.5-flash +gpt-5-2025-08-07 +gpt-5-mini-2025-08-07 +gpt-5-nano-2025-08-07 +laion/bugs-nl2bashseq_Qwen3-8B +laion/claude-4-5-sonnet-thinking-stackexchange-overflow-32ep-32k-traces +laion/exp_tas_baseline_traces +laion/exp_tas_frequency_penalty_0_25_traces +laion/exp_tas_frequency_penalty_0_5_traces +laion/exp_tas_frequency_penalty_1_0_traces +laion/exp_tas_high_diversity_traces +laion/exp_tas_linear_history_off_traces +laion/exp_tas_low_diversity_traces +laion/exp_tas_max_tokens_2048_traces +laion/exp_tas_max_tokens_4096_traces +laion/exp_tas_min_p_0_05_traces +laion/exp_tas_optimal_combined_traces +laion/exp_tas_parser_xml_traces +laion/exp_tas_raw_content_off_traces +laion/exp_tas_repetition_penalty_1_2_traces +laion/exp_tas_temp_0_5_traces +laion/exp_tas_top_k_128_traces +laion/exp_tas_top_k_16_traces +laion/exp_tas_top_p_0_8_traces 
+laion/exp-gfi-swesmith-short-response-filtered-10K_glm_4_7_traces_jupiter +laion/exp-psu-stackoverflow-1K_glm_4_7_traces +laion/exp-psu-stackoverflow-316_glm_4_7_traces +laion/exp-psu-stackoverflow-31K_glm_4_7_traces +laion/exp-psu-swesmith-10K_glm_4_7_traces_jupiter +laion/exp-psu-swesmith-31K_glm_4_7_traces_jupiter +laion/exp-psu-swesmith-3K_glm_4_7_traces_jupiter +laion/exp-swd-r2egym-wo-docker_glm_4_7_traces +laion/exp-syh-tezos-askllm-hardened_glm_4_7_traces_jupiter_cleaned +laion/exp-syh-tezos-stackoverflow-mixed_glm_4_7_traces_jupiter_cleaned +laion/exp-uns-r2egym-33_6x_glm_4_7_traces_jupiter_cleaned +laion/exp-uns-r2egym-4_2x_glm_4_7_traces_jupiter +laion/exp-uns-r2egym-8_4x_glm_4_7_traces_jupiter +laion/exp-uns-tezos-128unique_glm_4_7_traces_jupiter_cleaned +laion/exp-uns-tezos-1unique_glm_4_7_traces_jupiter +laion/glm-4_6-all-puzzles-32ep-131k +laion/glm-4_6-dclm-baseline-terminal-traces-32ep-131k +laion/glm-4_6-freelancer-32ep-131k-torch +laion/GLM-4_6-freelancer-32eps-131k +laion/GLM-4_6-inferredbugs-32ep-65k-reasoning +laion/glm-4_6-nemo-prism +laion/GLM-4_6-nl2bash-verified-32ep-32k-reasoning +laion/glm-4_6-r2egym-32ep-32k +laion/GLM-4_6-selfinstruct-naive-2-32ep-32k +laion/glm-4_6-stack-overflow-32ep-131k-summtrc +laion/GLM-4_6-stackexchange-superuser-32ep-32k +laion/glm-4_6-staqc-32ep-131k +laion/GLM-4_6-swesmith-32ep-131k-nosumm +laion/GLM-4_6-swesmith-32ep-131k-nosumm-reasoning +laion/GLM-4_7-inferredbugs-sandboxes-maxeps-131k +laion/GLM-4_7-r2egym_sandboxes-maxeps-131k +laion/GLM-4_7-swesmith-sandboxes-with_tests-oracle_verified_120s-maxeps-131k +laion/GLM-4_7-swesmith-sandboxes-with_tests-oracle_verified_120s-maxeps-131k-fixthink +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_learning-rate_1e-06_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_num-train-epochs_8-0_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_warmup-ratio_0-01_Qwen3-32B 
+laion/GLM-4.6-stackoverflow-32eps-65k-fixeps_Qwen3-8B +laion/glm46-defects4j-32ep-131k +laion/glm46-glaive-code-assistant-sandboxes-maxeps-131k +laion/glm46-Magicoder-Evol-Instruct-110K-sandboxes-1-traces +laion/glm46-neulab-agenttuning-alfworld-sandboxes-maxeps-131k +laion/glm46-neulab-synatra-32ep-131k +laion/glm46-qasper-maxeps-131k +laion/glm46-stackexchange-tezos-maxeps-131k +laion/glm46-swegym-tasks-maxeps-131k +laion/glm46-swesmith-maxeps-131k +laion/GPT-OSS-120B-codeforces-fixeps_Qwen3-8B +laion/gpt-oss-120B-stack-overflow-32ep-131k-summtrc +laion/gpt-oss-120B-stack-overflow-32ep-131k-summtrc-fixthink1 +laion/kimi-k2-r2egym_sandboxes-maxeps-32k +laion/kimi-k2-swegym-tasks-maxeps-32k +laion/kimi-k2t-freelancer-32ep-32k +laion/Kimi-K2T-neulab-agenttuning-kg-sandboxes-maxeps-32k +laion/Kimi-K2T-neulab-agenttuning-mind2web-sandboxes-maxeps-32k +laion/Kimi-K2T-neulab-agenttuning-webshop-sandboxes-maxeps-32k +laion/kimi-k2t-neulab-synatra-32ep-131k +laion/Kimi-K2T-swesmith-32ep-131k +laion/MiniMax-M2-freelancer-32ep-32k +laion/MiniMax-M2-freelancer-32ep-32k-reasoning +laion/minimax-m2-stack-overflow-32ep-131k-summtrc +laion/nl2bash-bugs-undr7030_Qwen3-8B +laion/nl2bash-bugsseq_Qwen3-8B +laion/nl2bash-verified-GLM-4.6-traces-32ep-32k-mgn1e3_Qwen3-8B +laion/nl2bash-verified-GLM-4.6-traces-32ep-32k-mgn1e5_Qwen3-8B +laion/nl2bash-verified-GLM-4.6-traces-32ep-32k-mgn5e4_Qwen3-8B +laion/open-thoughts-4-code-qwen3-32b-annotated +laion/Qwen3-8B_exp_tas_summarize_threshold_4096_traces_save-strategy_steps +laion/Qwen3-8B_exp_tas_temp_0.25_traces_save-strategy_steps +laion/Qwen3-8B_exp_tas_temp_0.5_traces_save-strategy_steps +laion/Qwen3-8B_exp_tas_tmux_large_traces_save-strategy_steps +laion/Qwen3-8B_exp_tas_top_k_32_traces_save-strategy_steps +laion/Qwen3-8B_exp_tas_trajectory_minimal_traces_save-strategy_steps +laion/Qwen3-8B_exp-swd-r2egym-standard_glm_4.7_traces_locetash_save-strategy_steps 
+laion/Qwen3-8B_exp-swd-swesmith-wo-docker_glm_4.7_traces_locetash_save-strategy_steps +laion/Qwen3-8B_perturbed-docker-exp-taskmaster2-tasks_glm_4.7_traces_locetash_save-strategy_steps +laion/Qwen3-Coder-480B-codeforces-fixeps_Qwen3-8B +laion/Qwen3-Coder-480B-nl2bash-fixeps_Qwen3-8B +laion/qwen3-coder-480B-stack-overflow-32ep-131k-summtrc +laion/qwen3base-GLM-4_7-swesmith-sandboxes-with_tests-oracle_verified_120s-maxeps-131k +laion/r2egym-bugsseq +laion/r2egym-gpt5-codex-160ep-1M +laion/r2egym-nl2bash-bugsseq +laion/r2egym-nl2bash-stack-bugsseq +laion/r2egym-nl2bash-stack-bugsseq_lr3e-5_exp_rpt_stack-php-v2_step20 +laion/r2egym-nl2bash-stack-bugsseq-bash-withtests +laion/r2egym-nl2bash-stack-bugsseq-cpp +laion/r2egym-nl2bash-stack-bugsseq-fixthink +laion/r2egym-nl2bash-stack-bugsseq-fixthink-again +laion/r2egym-nl2bash-stack-bugsseq-fixthink-exercism-python +laion/r2egym-nl2bash-stack-bugsseq-fixthink-methods2test-v2 +laion/r2egym-nl2bash-stack-bugsseq-fixthink-stack-csharp +laion/r2egym-nl2bash-stack-bugsseq-junit +laion/r2egym-nl2bash-stack-bugsseq-pytest-v2 +laion/r2egym-nl2bash-stack-bugsseq-rl-stack-bash-withtests +laion/r2egym-nl2bash-stack-bugsseq-stack-php-v2 +laion/r2egym-nl2bash-stackseq +laion/r2egym-stack-bugsseq +laion/rl_bs128-gs16-rloo-n-code-contests-900s-noreg-15 +laion/rl_r2egym-nl2bash-stack-bugsseq_lr3e-5_stack-php-v2 +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink_lr1e-5_taco +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink_lr3e-5_bigcodebench-v2 +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_curriculum-medium +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_postmortem +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_pr-postmortem +laion/rl_rl-conf_24GP_base_noth-yaml_mode-path_r2eg-nl2b-stac-bugs_trai-data_exp_rpt_e2eg-larg-60 +laion/rl_rl-conf_24GP_base_noth-yaml_mode-path_r2eg-nl2b-stac-bugs_trai-data_exp_rpt_pyme-larg-60 
+laion/rl_rl-conf_24GP_base-yaml_mode-path_exp_tas_opti_comb_trac_trai-data_exp_rpt_unit-pyth-v3 +laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp_rpt_pyme-larg-90 +laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp_rpt_stac-dock-v2 +laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp_rpt_stac-self-larg-70 +laion/rl_rl-conf_qwen_8b_ll_lr1e-5_bs64-yaml_mode-path_r2eg-nl2b-stac-bugs_trai-data_exp_rpt_cros-type +laion/rl_think_npfg-code-contests-900s-45 +laion/rl_tp4s64_8x_2skill +laion/rl_tp4s64_8x_exercism-python +laion/rl_tp4s64_8x_flat25_baseline +laion/rl_tp4s64_8x_github_issue +laion/rl_tp4s64_8x_heavy_padding +laion/rl_tp4s64_8x_minimal_instructions +laion/rl_tp4s64_8x_nemotron-cpp +laion/rl_tp4s64_8x_nemotron-junit +laion/rl_tp4s64_8x_proportional +laion/rl_tp4s64_8x_structural_debug +laion/rl_v1_tp4s64_8x_nemotron-junit +laion/sft__Kimi-2-5-inferredbugs-sandboxes-maxeps-32k__Qwen3-8B +laion/sft_GLM-4-7-swesmith-sandboxes-with_tests-oracle_verified_120s-maxeps-131k_Qwen3-32B +laion/stackexchange-tezos-sandboxes_glm_4_6_traces_locetash +laion/stackexchange-tezos-sandboxes_glm_4_6_traces_locetash_again +laion/stackexchange-tezos-sandboxes_glm_4_6_traces_together +laion/stackexchange-tezos-sandboxes_glm_4_6_traces_together_again +mistralai/Devstral-Small-2507 +mlfoundations-dev/all-puzzles-sandboxes-traces-terminus-2 +mlfoundations-dev/code-contests-sandboxes-traces-terminus-2 +mlfoundations-dev/code-contests-sandboxes-traces-terminus-2_global-batch-size_16 +mlfoundations-dev/code-contests-sandboxes-traces-terminus-2_seed_43 +mlfoundations-dev/code-contests-sandboxes-traces-terminus-2_seed_44 +mlfoundations-dev/code-contests-sandboxes-traces-terminus-2_seed_45 +mlfoundations-dev/code-contests-sandboxes-traces-terminus-2_seed_46 +mlfoundations-dev/codeforces-sandboxes-traces-terminus-2 +mlfoundations-dev/defects4j-sandboxes-traces-terminus-2 
+mlfoundations-dev/freelancer-projects-sandboxes-traces-terminus-2 +mlfoundations-dev/inferredbugs-sandboxes-traces-terminus-2 +mlfoundations-dev/nemo-prism-math-sandboxes-traces-terminus-2 +mlfoundations-dev/qasper-sandboxes-traces-terminus-2 +mlfoundations-dev/stackexchange-overflow-sandboxes-traces-terminus-2 +mlfoundations-dev/stackexchange-tezos-sandboxes-traces-terminus-2 +mlfoundations-dev/staqc-sandboxes-traces-terminus-2 +mlfoundations-dev/swesmith_with_plain_docker-sandboxes-traces-terminus-2 +mlfoundations-dev/taskmaster2-sandboxes-traces-terminus-2 +moonshotai/Kimi-Dev-72B +moonshotai/Kimi-K2.5 +NovaSky-AI/SA-SWE-32B +nvidia/AceReason-Nemotron-7B +nvidia/Llama-3.1-Nemotron-Nano-8B-v1 +nvidia/Nemotron-Terminal-14B +nvidia/Nemotron-Terminal-32B +nvidia/Nemotron-Terminal-8B +nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16 +o4-mini +obiwan96/qwen-2.5-7b-instruct-endless-terminals +obiwan96/qwen3-8b-openthinker-sft-endless-terminals +open-r1/OpenR1-Distill-7B +open-thoughts/OpenThinker-Agent-v1 +open-thoughts/OpenThinker-Agent-v1-SFT +open-thoughts/OpenThinker3-7B +openai/gpt-5 +openai/gpt-5-mini +openai/gpt-5-nano +penfever/freelancer-t1024s-32ep-restore-hp +penfever/freelancer-t2048s-32ep-restore-hp +penfever/freelancer-t512s-32ep-restore-hp +penfever/GLM-4_6-codeforces-32ep-32k-restore-hp +penfever/GLM-4_6-gemini25flash-stackexchange-overflow-32ep-512k-fixeps +penfever/GLM-4_6-inferredbugs-32eps-65k-fixeps +penfever/kimi-k2-swesmith_with_plain_docker-sandboxes-maxeps-32k +penfever/neulab-codeactinstruct-restore-hp +penfever/nl2bash_gpt-5-nano-traces-8ep-restore-hp +penfever/nl2bash_verified_gpt-5-nano-traces-restore-hp +penfever/nl2bash-0-1k-traces-restore-hp +penfever/nl2bash-0-3k-traces-restore-hp +penfever/nl2bash-16ep-restore-hp +penfever/nl2bash-1ep-restore-hp +penfever/nl2bash-1k-traces-restore-hp +penfever/nl2bash-2ep-restore-hp +penfever/nl2bash-32ep-restore-hp +penfever/nl2bash-3k-traces-restore-hp +penfever/nl2bash-4ep-restore-hp 
+penfever/nl2bash-8ep-restore-hp +penfever/nl2bash-GLM-4_6-traces-newhparams +penfever/nl2bash-verified-GLM-4_6-traces-32ep-32k-dft +penfever/nl2bash-verified-GLM-4_6-traces-32ep-32k-restore-hp +penfever/rl_bs128_gs16_ruby-30 +penfever/rl_rl-conf_20GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp_rpt_code-v2-25 +penfever/rl_rl-conf_20GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp_rpt_stac-pyte-v2-25 +penfever/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp_rpt_soft-v2-45 +penfever/selfinstruct-naive-sandboxes-2-traces-restore-hp +penfever/swesmith-2stage-restore-hp +penfever/taskmaster2-4ep-2stage-restore-hp +Qwen/Qwen2.5-7B-Instruct +Qwen/Qwen2.5-Coder-32B-Instruct +Qwen/Qwen3-1.7B +Qwen/Qwen3-14B +Qwen/Qwen3-235B-A22B-Instruct-2507-tput +Qwen/Qwen3-32B +Qwen/Qwen3-4B +Qwen/Qwen3-4B-Instruct-2507 +Qwen/Qwen3-4B-Thinking-2507 +Qwen/Qwen3-8B +Qwen/Qwen3-8B-Base +Qwen/Qwen3-Coder-30B-A3B-Instruct +Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8 +Qwen/Qwen3.5-0.8B +Qwen/Qwen3.5-27B +Qwen/Qwen3.5-2B +Qwen/Qwen3.5-35B-A3B +Qwen/Qwen3.5-4B +Qwen/Qwen3.5-9B +R2E-Gym/R2EGym-32B-Agent +Skywork/Skywork-OR1-7B +Skywork/Skywork-SWE-32B +SWE-bench/SWE-agent-LM-32B +SWE-bench/SWE-agent-LM-7B +SWE-Swiss/SWE-Swiss-32B +zai-org/GLM-4.7 +DCAgent/hr1_code-contests-sandboxes-with-tests-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step_60 +DCAgent/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112725harbor_step20 +DCAgent/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112725harbor_step40 +DCAgent/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112725harbor_step64 +DCAgent/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step20 +DCAgent/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step40 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step25 
+DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step50 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step73 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step20 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step40 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step_73 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120225harbor_step_25 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120225harbor_step_50 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120225harbor_step_73 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120325harbor_step_25 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120325harbor_step_50 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120325harbor_step_73 +DCAgent/nl2bashGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step20 +DCAgent/nl2bashGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step34 +DCAgent/nl2bashGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step20 +DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step30 +DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step45 +DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step20 +DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step40 +DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step_55 +DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step_80 +DCAgent/tbench-dev-71-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step70 +DCAgent2/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112725harbor +DCAgent2/nl2bash-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor 
+DCAgent2/nl2bashGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor +DCAgent2/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor diff --git a/eval/lists/kept_models_names.txt b/eval/lists/kept_models_names.txt new file mode 100644 index 00000000..f98dffb7 --- /dev/null +++ b/eval/lists/kept_models_names.txt @@ -0,0 +1,221 @@ +/e/data1/datasets/playground/ot/hf_hub/models--laion--r2egym-nl2bash-stack-bugsseq-fixthink-again/snapshots/2f4f59f076583f8c084bbca8308d5f80bfc7def5 +/e/data1/datasets/playground/ot/hf_hub/models--laion--r2egym-nl2bash-stack-bugsseq-fixthink/snapshots/53ccb94616c4fb83ee5c138f334ed1b99c681272 +/leonardo_scratch/fast/AIFAC_5C0_290/dc-agent-shared/hf_hub/models--open-thoughts--OpenThinker-Agent-v1/snapshots/899181e51a920db4b7b580fc50ca1f6d99fbb0f5 +DCAgent/exp_rpt_crosscodeeval-csharp_20260219 +DCAgent/exp_tas_max_episodes_32_traces +DCAgent/exp_tas_max_episodes_512_traces +DCAgent/exp_tas_max_tokens_8192_traces +DCAgent/exp_tas_presence_penalty_0_25_traces +DCAgent/exp_tas_presence_penalty_1_0_traces +DCAgent/exp_tas_repetition_penalty_1_05_traces +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step40 +DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step30 +DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step45 +DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step40 +DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step40 +DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step40 +DCAgent/tbench-dev-71-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step70 +DCAgent2/bs64_rloo_n_noct_stri_micr_auto_conv_pref_model_r2e-120 +DCAgent2/bugs-swesmith-reason +DCAgent2/bugs-swesmith-undr7030 +DCAgent2/nl2bash-stack-bugsseq +DCAgent2/nl2bash-stack-bugsshuffle +DCAgent2/nl2bash-swesmith-over5050 +DCAgent2/nl2bash-swesmith-undr7030 
+DCAgent2/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor +DCAgent2/swesmith-bugsseq +DCAgent2/swesmith-stack-over5050 +DCAgent2/swesmith-stack-reason +DCAgent2/swesmith-stackseq +NovaSky-AI/SA-SWE-32B +Qwen/Qwen2.5-Coder-32B-Instruct +Qwen/Qwen3-1.7B +Qwen/Qwen3-14B +Qwen/Qwen3-235B-A22B-Instruct-2507-tput +Qwen/Qwen3-32B +Qwen/Qwen3-4B +Qwen/Qwen3-4B-Instruct-2507 +Qwen/Qwen3-4B-Thinking-2507 +Qwen/Qwen3-8B +Qwen/Qwen3-8B-Base +Qwen/Qwen3-Coder-30B-A3B-Instruct +Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8 +R2E-Gym/R2EGym-32B-Agent +SWE-Swiss/SWE-Swiss-32B +SWE-bench/SWE-agent-LM-32B +SWE-bench/SWE-agent-LM-7B +Skywork/Skywork-SWE-32B +allenai/SERA-32B +allenai/SERA-8B +claude-haiku-4-5-20251001 +gemini-2.5-flash +gpt-5-2025-08-07 +gpt-5-mini-2025-08-07 +gpt-5-nano-2025-08-07 +hosted_vllm/mlfoundations-dev/stackexchange-codereview-sandboxes-traces-terminus-2_overwrite-output-dir_True +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_adam-beta1_0-91_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_adam-beta1_0-93_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_adam-beta1_0-95_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_global-batch-size_32_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_global-batch-size_64_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_lr_1e-5_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_num-train-epochs_4.0_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_num-train-epochs_6.0_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_num-train-epochs_7.0_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_warmup-ratio_0-05_Qwen3-32B 
+laion/GLM-4_6-stackexchange-overflow-sandboxes-32eps-65k-reasoning +laion/GLM-4_7-r2egym_sandboxes-maxeps-131k-lc +laion/GLM-4_7-stackexchange-tezos-sandboxes-maxeps-131k +laion/Kimi-2-5-r2egym_sandboxes-maxeps-32k__Qwen3-8B +laion/Kimi-K2T-ling-coder-sft-sandboxes-1-maxeps-32k +laion/Qwen3-32B-NL2Bash-31step +laion/Qwen3-32B-R2EGYM-256-3epochs +laion/Qwen3-32B-SweSmith-20step +laion/Qwen3-8B_exp_tas_temp_2.0_traces_save-strategy_steps +laion/alfworld-swesmith-r2egym-swegym-131k-lc +laion/bugs-r2egym-stackseq +laion/dev_set_part1_10k_glm_4_7_traces_jupiter +laion/dev_set_part1_10k_glm_4_7_traces_jupiter_cleaned +laion/dev_set_part1_10k_glm_4_7_traces_locetash +laion/exp-gfi-staqc-askllm-filtered-10K_glm_4_7_traces_jupiter +laion/exp-gfi-staqc-askllm-filtered-10K_glm_4_7_traces_jupiter_cleaned +laion/exp-gfi-staqc-embedding-mean-filtered-10K_glm_4_7_traces_jupiter +laion/exp-gfi-staqc-short-response-filtered-10K_glm_4_7_traces_locetash +laion/exp-gfi-swesmith-random-filtered-10K_glm_4_7_traces_jupiter +laion/exp-gfi-swesmith-random-filtered-10K_glm_4_7_traces_jupiter_cleaned +laion/exp-psu-stackoverflow-10K_glm_4_7_traces +laion/exp-psu-stackoverflow-3K_glm_4_7_traces +laion/exp-psu-swesmith-1K_glm_4-7_traces_jupiter__Qwen3-8B +laion/exp-psu-swesmith-316_glm_4-7_traces_jupiter__Qwen3-8B +laion/exp-syh-r2egym-askllm-constrained_glm_4_7_traces_jupiter +laion/exp-syh-r2egym-askllm-constrained_glm_4_7_traces_jupiter_cleaned +laion/exp-syh-r2egym-askllm-hardened_glm_4_7_traces_jupiter +laion/exp-syh-r2egym-swesmith-mixed_glm_4_7_traces_jupiter +laion/exp-syh-r2egym-swesmith-mixed_glm_4_7_traces_jupiter_cleaned +laion/exp-syh-r2egym-swesmith-mixed_glm_4_7_traces_locetash +laion/exp-syh-tezos-askllm-constrained_glm_4_7_traces_jupiter +laion/exp-syh-tezos-askllm-hardened_glm_4_7_traces_jupiter +laion/exp-syh-tezos-stackoverflow-mixed_glm_4_7_traces_jupiter +laion/exp-uns-r2egym-16_8x_glm_4_7_traces_jupiter +laion/exp-uns-r2egym-16_8x_glm_4_7_traces_jupiter_cleaned 
+laion/exp-uns-r2egym-2_1x_glm_4_7_traces_jupiter +laion/exp-uns-r2egym-2_1x_glm_4_7_traces_jupiter_cleaned +laion/exp-uns-r2egym-2_1x_glm_4_7_traces_locetash +laion/exp-uns-r2egym-33_6x_glm_4_7_traces_jupiter +laion/exp-uns-tezos-10x_glm_4_7_traces_jupiter +laion/exp-uns-tezos-10x_glm_4_7_traces_jupiter_cleaned +laion/exp-uns-tezos-128unique_glm_4_7_traces_jupiter +laion/exp-uns-tezos-160x_glm_4_7_traces_jupiter +laion/exp-uns-tezos-160x_glm_4_7_traces_jupiter_cleaned +laion/exp-uns-tezos-40x_glm_4_7_traces_jupiter +laion/exp-uns-tezos-80x_glm_4_7_traces_jupiter_cleaned +laion/exp_rpt_stack-bash-withtests_glm_4_7_traces_jupiter +laion/exp_rpt_stack-csharp_10k_glm_4-7_traces_jupiter__Qwen3-8B +laion/exp_rpt_stack-rust_10k_glm_4_7_traces_jupiter +laion/exp_tas_full_thinking_traces +laion/exp_tas_interleaved_thinking_on_traces +laion/exp_tas_min_p_0_01_traces +laion/exp_tas_min_p_0_1_traces +laion/exp_tas_optimal_combined_traces +laion/exp_tas_summarize_off_traces +laion/exp_tas_summarize_threshold_16384_traces +laion/exp_tas_summarize_threshold_2048_traces +laion/exp_tas_timeout_multiplier_0_25_traces +laion/exp_tas_timeout_multiplier_1_0_traces +laion/exp_tas_timeout_multiplier_4_0_traces +laion/exp_tas_timeout_multiplier_8_0_traces +laion/exp_tas_top_k_64_traces +laion/exp_tas_top_p_0_95_traces +laion/exp_tas_top_p_0_9_traces +laion/glm-4_6-stackexchange-tezos-32ep-131k +laion/glm46-Toolscale-tasks-traces +laion/glm46-r2egym_sandboxes-maxeps-131k +laion/glm46-swegym-tasks-maxeps-131k-lc +laion/glm46-swesmith-maxeps-131k-fixthink +laion/glm46-swesmith-maxeps-131k-lc +laion/nl2bash-swesmith-stack-bugsseq +laion/perturbed-docker-exp-freelancer-tasks_glm_4_7_traces +laion/r2egym-nl2bash-stack-bugsseq +laion/r2egym-nl2bash-stack-bugsseq-crosscodeeval-python-v2 +laion/r2egym-nl2bash-stack-bugsseq-fixthink-again +laion/r2egym-nl2bash-stack-bugsseq-fixthink-stack-pytest-large +laion/r2egym-nl2bash-stack-bugsseq-rl-crosscodeeval-csharp +laion/r2egym-nl2bashseq 
+laion/rl__24GPU_base__exp_rpt_codeelo-v2__Qwen3-8B +laion/rl__24GPU_base__exp_rpt_curriculum-hard__r2egym-nl2bash-stack +laion/rl__24GPU_base__exp_rpt_issue__Qwen3-8B-60 +laion/rl__24GPU_base__exp_rpt_multifile__Qwen3-8B-60 +laion/rl__24GPU_base__exp_rpt_scaffold__Qwen3-8B-60 +laion/rl__24GPU_base__exp_rpt_stack-bash__Qwen3-8B-55 +laion/rl__24GPU_base__mix_h2_language_balanced__r2egym-nl2bash-stack +laion/rl__24GPU_base__swe_rebench_patched_oracle__r2egym-nl2bash-stack +laion/rl__40GPU_base_32b__exp_rpt_codeelo-v2__sft_GLM-4-7-swesmith +laion/rl__40GPU_base_32b__exp_rpt_nemotron-bash__sft_GLM-4-7-swesmith +laion/rl_base-code-contests-900s-160 +laion/rl_base-code-contests-900s-reg-140 +laion/rl_base-code-contests-900s-reg-lr1e-5-140 +laion/rl_base-exp_rpt_stack_bash-90 +laion/rl_base-exp_rpt_stack_bash_with_gpt5-90 +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_curator-hard +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_curriculum-medium +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_pr +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_pr-postmortem +laion/rl_rl-conf_24GP_base-yaml_mode-path_exp_tas_opti_comb_trac_trai-data_exp_rpt_defe-v3 +laion/rl_rl-conf_24GP_base-yaml_mode-path_exp_tas_opti_comb_trac_trai-data_exp_rpt_pyme-v3-40 +laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt-agai_trai-data_exp_rpt_nemo-bash +laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt-agai_trai-data_exp_rpt_soft-larg +laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt-agai_trai-data_exp_rpt_stac-pyte-synt- +laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt-agai_trai-data_exp_rpt_stac-self-gpt5 +laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp_rpt_stac-php-larg-75 +laion/rl_rl-conf_24GP_base_noth-yaml_mode-path_r2eg-nl2b-stac-bugs_trai-data_exp_rpt_meth-larg-60 
+laion/rl_rl-conf_24GP_base_noth-yaml_mode-path_r2eg-nl2b-stac-bugs_trai-data_exp_rpt_stac-self-larg-60 +laion/rl_rl-conf_qwen_8b_ll_lr1e-5_bs64-yaml_mode-path_r2eg-nl2b-stac-bugs_trai-data_exp_rpt_cros-java +laion/rl_rl-conf_qwen_8b_ll_lr1e-5_bs64-yaml_mode-path_r2eg-nl2b-stac-bugs_trai-data_exp_rpt_stac-bash-wi +laion/rl_rl-conf_qwen_8b_ll_lr1e-5_bs64_8x_tp4_seqs-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp +laion/rl_rl-config_24GPU_base-yaml_model-path_Qwen3-8B_train-data_exp_rpt_codeelo-v2 +laion/rl_rl-config_24GPU_base-yaml_model-path_Qwen3-8B_train-data_exp_rpt_nemotron-bash-60 +laion/rl_rl-config_24GPU_base-yaml_model-path_Qwen3-8B_train-data_exp_rpt_pymethods2test-large-50 +laion/rl_swesmith-fixthink-pymethods2test-45 +laion/rl_tp4s64_8x_curated +laion/rl_tp4s64_8x_detailed +laion/rl_tp4s64_8x_error_report +laion/rl_tp4s64_8x_expert +laion/rl_tp4s64_8x_moderate_padding +laion/rl_tp4s64_8x_partial_ambiguity +laion/rl_tp4s64_8x_stack-jest-v2 +laion/rl_tp4s64_8x_stack-selfdoc-v2 +laion/rl_v1_tp4s64_8x_exercism-python +laion/rl_v1_tp4s64_8x_stack-jest-large +laion/rl_v1_tp4s64_8x_stack-pytest-large +laion/rl_v1_tp4s64_8x_structural_debug +laion/rl_v3_tp4s64_8x_exercism-python +laion/sft_GLM-4-7-swesmith-sandboxes-with_tests-oracle_verified_120s-maxeps-131k_Qwen3-32B +laion/sft__Kimi-2-5-inferredbugs-sandboxes-maxeps-32k__Qwen3-8B +laion/sft_r2egym-nl2bash-stackoverflow-inferredbugs-32B_Qwen3-32B +laion/stackexchange-tezos-sandboxes_glm_4_7_traces_locetash +laion/swesmith-nl2bash-stack-bugsseq +laion/swesmith-sandboxes-with_tests-gpt-5-mini-passed_glm_4_7_traces +laion/syh-r2eg-askl-glm_4-7_trac_jupi_-gfi-swes-rand-filt-10K_glm_4-7_trac_jupi_32B +mistralai/Devstral-Small-2507 +moonshotai/Kimi-Dev-72B +moonshotai/Kimi-K2.5 +nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16 +o4-mini +openai/gpt-5 +openai/gpt-5-mini +openai/gpt-5-nano +penfever/GLM-4_6-taskmaster2-32eps-32k-fixeps +penfever/bs64_rloo_n_noct_stri_micr_auto_tis_model_r2e-100 
+penfever/bs64_rloo_n_noct_stri_micr_model_noconv_r2eg_nl2_140 +penfever/bs64_rloo_n_noct_stri_micr_model_r2eg_nl2_160 +penfever/rl__24GPU_base__exp_rpt_curriculum-hard__r2egym-nl2bash-stack-15 +penfever/rl__24GPU_base__exp_rpt_pymethods2test-large__Qwen3-8B-Base-65 +penfever/rl__24GPU_base__exp_rpt_pymethods2test-large__qwen3base-GLM-4_7-sw-70 +penfever/rl_rl-conf_24GP_base_noth-yaml_mode-path_r2eg-nl2b-stac-bugs_trai-data_exp_rpt_stac-bash-110 +r2egym-nl2bash-stack-bugsseq-bash-withtests +r2egym-nl2bash-stack-bugsseq-cpp +r2egym-nl2bash-stack-bugsseq-junit +r2egym-nl2bash-stack-bugsseq-pytest-v2 +rl__24GPU_base__swe_rebench_patched_oracle__r2egym-nl2bash-stack +zai-org/GLM-4.7 diff --git a/eval/lists/laion_latest.txt b/eval/lists/laion_latest.txt new file mode 100644 index 00000000..864a34e1 --- /dev/null +++ b/eval/lists/laion_latest.txt @@ -0,0 +1,28 @@ +laion/rl__24GPU_base__llm-verifier-freelancer__r2egym-nl2bash-stack +laion/rl__24GPU_shaped__swe_rebench_patched_oracle__r2egym-nl2bash-stack +laion/SweSmith-8B-SFT-Rope-step62 +laion/SweSmith-8B-SFT-NoRope-step58 +laion/rl__24GPU_base__mix_h2_language_proportional__r2egym-nl2bash-stack +laion/rl__24GPU_base__swe_rebench_patched_oracle__r2egym-nl2bash-stack +laion/exp-psu-swesmith-31K_glm_4_7_traces_jupiter +laion/exp-psu-swesmith-3K_glm_4_7_traces_jupiter +penfever/rl__24GPU_base__exp_rpt_curriculum-hard__r2egym-nl2bash-stack-15 +penfever/rl__24GPU_base__exp_rpt_pymethods2test-large__Qwen3-8B-Base-65 +penfever/rl__24GPU_base__exp_rpt_pymethods2test-large__qwen3base-GLM-4_7-sw-70 +laion/rl_v3_tp4s64_8x_exercism-python +laion/rl_rl-config_24GPU_base-yaml_model-path_Qwen3-8B_train-data_exp_rpt_nemotron-bash-60 +laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp_rpt_stac-dock-v2 +laion/rl_rl-conf_24GP_base-yaml_mode-path_exp_tas_opti_comb_trac_trai-data_exp_rpt_unit-pyth-v3 +laion/rl_v1_tp4s64_8x_exercism-python +laion/rl_v1_tp4s64_8x_stack-pytest-large 
+laion/rl_v1_tp4s64_8x_structural_debug +laion/rl_v1_tp4s64_8x_stack-jest-large +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_postmortem +laion/rl_v1_tp4s64_8x_nemotron-junit +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink_lr3e-5_bigcodebench-v2 +laion/rl_r2egym-nl2bash-stack-bugsseq_lr3e-5_stack-php-v2 +laion/r2egym-nl2bash-stack-bugsseq-fixthink-exercism-python +laion/exp-uns-r2egym-33_6x_glm_4_7_traces_jupiter_cleaned +laion/exp-syh-tezos-stackoverflow-mixed_glm_4_7_traces_jupiter_cleaned +laion/exp-syh-tezos-askllm-hardened_glm_4_7_traces_jupiter_cleaned +laion/exp-uns-tezos-128unique_glm_4_7_traces_jupiter_cleaned diff --git a/eval/lists/latest_sort_by_release_eval_prio.txt b/eval/lists/latest_sort_by_release_eval_prio.txt new file mode 100644 index 00000000..b6f1047e --- /dev/null +++ b/eval/lists/latest_sort_by_release_eval_prio.txt @@ -0,0 +1,28 @@ +laion/rl__24GPU_base__llm-verifier-freelancer__r2egym-nl2bash-stack +laion/rl__24GPU_shaped__swe_rebench_patched_oracle__r2egym-nl2bash-stack +laion/SweSmith-8B-SFT-Rope-step62 +laion/SweSmith-8B-SFT-NoRope-step58 +laion/rl__24GPU_base__mix_h2_language_proportional__r2egym-nl2bash-stack +laion/rl__24GPU_base__swe_rebench_patched_oracle__r2egym-nl2bash-stack +laion/exp-psu-swesmith-31K_glm_4_7_traces_jupiter +laion/exp-psu-swesmith-3K_glm_4_7_traces_jupiter +penfever/rl__24GPU_base__exp_rpt_curriculum-hard__r2egym-nl2bash-stack-15 +penfever/rl__24GPU_base__exp_rpt_pymethods2test-large__Qwen3-8B-Base-65 +penfever/rl__24GPU_base__exp_rpt_pymethods2test-large__qwen3base-GLM-4_7-sw-70 +laion/rl_v3_tp4s64_8x_exercism-python +laion/rl_rl-config_24GPU_base-yaml_model-path_Qwen3-8B_train-data_exp_rpt_nemotron-bash-60 +laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp_rpt_stac-dock-v2 +laion/rl_rl-conf_24GP_base-yaml_mode-path_exp_tas_opti_comb_trac_trai-data_exp_rpt_unit-pyth-v3 +laion/rl_v1_tp4s64_8x_exercism-python +laion/rl_v1_tp4s64_8x_stack-pytest-large
+laion/rl_v1_tp4s64_8x_structural_debug +laion/rl_v1_tp4s64_8x_stack-jest-large +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_postmortem +laion/rl_v1_tp4s64_8x_nemotron-junit +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink_lr3e-5_bigcodebench-v2 +laion/rl_r2egym-nl2bash-stack-bugsseq_lr3e-5_stack-php-v2 +laion/r2egym-nl2bash-stack-bugsseq-fixthink-exercism-python +laion/exp-uns-r2egym-33_6x_glm_4_7_traces_jupiter_cleaned +laion/exp-syh-tezos-stackoverflow-mixed_glm_4_7_traces_jupiter_cleaned +laion/exp-syh-tezos-askllm-hardened_glm_4_7_traces_jupiter_cleaned +laion/exp-uns-tezos-128unique_glm_4_7_traces_jupiter_cleaned diff --git a/eval/lists/lists/16x_32b_lc_baseline.txt b/eval/lists/lists/16x_32b_lc_baseline.txt new file mode 100644 index 00000000..b70e4288 --- /dev/null +++ b/eval/lists/lists/16x_32b_lc_baseline.txt @@ -0,0 +1,8 @@ +nvidia/Nemotron-Terminal-32B +allenai/SERA-32B +laion/rl__40GPU_base_32b__exp_rpt_nemotron-bash__sft_GLM-4-7-swesmith +laion/sft_r2egym-nl2bash-stackoverflow-inferredbugs-32B_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_adam-beta1_0-95_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_warmup-ratio_0-01_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_num-train-epochs_7.0_Qwen3-32B +laion/alfworld-swesmith-r2egym-swegym-131k-32B-lc \ No newline at end of file diff --git a/eval/lists/lists/a1_models.txt b/eval/lists/lists/a1_models.txt new file mode 100644 index 00000000..8fd5976a --- /dev/null +++ b/eval/lists/lists/a1_models.txt @@ -0,0 +1,77 @@ +DCAgent/a1-stack_rspec +DCAgent/a1-stack_go +DCAgent/a1-self_instruct_naive +DCAgent/a1-nemotron_bash_withtests_gpt5mini +DCAgent/a1-nemotron_bash_withtests +DCAgent/a1-inferredbugs +DCAgent/a1-codeforces +DCAgent/a1-code_contests +DCAgent/a1-bash_textbook +DCAgent/a1-swegym_openhands +DCAgent/a1-stack_bash_withtests_gpt5mini +DCAgent/a1-orca_agentinstruct +DCAgent/a1-nnetnav_live 
+DCAgent/a1-nebius_swe_agent +DCAgent/a1-mind2web +DCAgent/a1-go_browse_wa +DCAgent/a1-codeactinstruct +DCAgent/a1-agenttuning_alfworld +DCAgent/a1-stack_pytest_gpt5mini +DCAgent/a1-nemotron_pytest +DCAgent/a1-glaive_code_assistant +DCAgent/a1-wizardlm_orca +DCAgent/a1-nemo_prism_math +DCAgent/a1-tulu3_sft_personas_math +DCAgent/a1-swesmith +DCAgent/a1-r2egym +DCAgent/a1-magicoder +DCAgent/a1-ghactions +DCAgent/a1-freelancer +DCAgent/a1-codeelo +DCAgent/a1-bugswarm +DCAgent/a1-stackexchange_unix +DCAgent/a1-stackexchange_tor +DCAgent/a1-stackexchange_superuser +DCAgent/a1-stack_pytest_withtests +DCAgent/a1-stack_pytest_synthetic_gpt5nano +DCAgent/a1-stack_phpunit +DCAgent/a1-pymethods2test +DCAgent/a1-defects4j +DCAgent/a1-curriculum_medium +DCAgent/a1-curriculum_hard +DCAgent/a1-curriculum_easy +DCAgent/a1-code_feedback +DCAgent/a1-agenttuning_webshop +DCAgent/a1-agenttuning_os +DCAgent/a1-agenttuning_mind2web +DCAgent/a1-agenttuning_db +DCAgent/a1-agenttuning_kg +DCAgent/a1-taskmaster2 +DCAgent/a1-stack_bash +DCAgent/a1-repo_scaffold +DCAgent/a1-pr_mining +DCAgent/a1-nemotron_junit +DCAgent/a1-nemotron_cpp +DCAgent/a1-nemotron_bash +DCAgent/a1-manybugs +DCAgent/a1-issue_tasks +DCAgent/a1-codenet_python +DCAgent/a1-bugsinpy +DCAgent/a1-multifile_composition +DCAgent/a1-exercism_python +DCAgent/a1-crosscodeeval_typescript +DCAgent/a1-crosscodeeval_python +DCAgent/a1-crosscodeeval_java +DCAgent/a1-taco +DCAgent/a1-staqc +DCAgent/a1-stackexchange_tezos +DCAgent/a1-stackexchange_overflow +DCAgent/a1-stack_rust +DCAgent/a1-stack_ruby +DCAgent/a1-stack_pytest +DCAgent/a1-stack_junit +DCAgent/a1-stack_jest +DCAgent/a1-stack_csharp +DCAgent/a1-stack_cpp +DCAgent/a1-stack_bash_withtests +DCAgent/a1-crosscodeeval_csharp \ No newline at end of file diff --git a/eval/lists/lists/a1_nl2bash.txt b/eval/lists/lists/a1_nl2bash.txt new file mode 100644 index 00000000..1069eaa1 --- /dev/null +++ b/eval/lists/lists/a1_nl2bash.txt @@ -0,0 +1 @@ +DCAgent/a1-nl2bash diff --git 
a/eval/lists/lists/a1_retrained.txt b/eval/lists/lists/a1_retrained.txt new file mode 100644 index 00000000..be82d222 --- /dev/null +++ b/eval/lists/lists/a1_retrained.txt @@ -0,0 +1,4 @@ +DCAgent/a1-bugswarm +DCAgent/a1-codeelo +DCAgent/a1-ghactions +DCAgent/a1-magicoder diff --git a/eval/lists/lists/alfworld_131k.txt b/eval/lists/lists/alfworld_131k.txt new file mode 100644 index 00000000..2f77176c --- /dev/null +++ b/eval/lists/lists/alfworld_131k.txt @@ -0,0 +1 @@ +laion/alfworld-swesmith-r2egym-swegym-131k-lc diff --git a/eval/lists/lists/architecture_invalid_test_model.txt b/eval/lists/lists/architecture_invalid_test_model.txt new file mode 100644 index 00000000..13a9cb14 --- /dev/null +++ b/eval/lists/lists/architecture_invalid_test_model.txt @@ -0,0 +1 @@ +DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step30 \ No newline at end of file diff --git a/eval/lists/lists/baseline_swe.txt b/eval/lists/lists/baseline_swe.txt new file mode 100644 index 00000000..6df89642 --- /dev/null +++ b/eval/lists/lists/baseline_swe.txt @@ -0,0 +1,10 @@ +open-thoughts/OpenThinker-Agent-v1 +camel-ai/seta-rl-qwen3-8b +allenai/SERA-14B +nvidia/Nemotron-Terminal-32B +nvidia/Nemotron-Terminal-14B +nvidia/Nemotron-Terminal-8B +obiwan96/qwen3-8b-openthinker-sft-endless-terminals +nvidia/Llama-3.1-Nemotron-Nano-8B-v1 +deepseek-ai/DeepSeek-R1-Distill-Qwen-7B +open-thoughts/OpenThinker3-7B \ No newline at end of file diff --git a/eval/lists/lists/bfcl_rerun_failed.txt b/eval/lists/lists/bfcl_rerun_failed.txt new file mode 100644 index 00000000..fa394720 --- /dev/null +++ b/eval/lists/lists/bfcl_rerun_failed.txt @@ -0,0 +1,22 @@ +DCAgent2/nl2bash-swesmithseq +DCAgent2/stack-bugsshuffle +DCAgent2/stack-bugs-undr7030 +DCAgent2/swesmith-nl2bashseq +DCAgent/exp_tas_max_episodes_512_traces +DCAgent/exp_tas_max_tokens_1024_traces +DCAgent/exp_tas_presence_penalty_1_0_traces +DCAgent/exp_tas_repetition_penalty_1_05_traces 
+DCAgent/hr1_code-contests-sandboxes-with-tests-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step_60 +DCAgent/nl2bashGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step20 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step50 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step20 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step40 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step_73 +DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step40 +DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step_55 +laion/exp_tas_baseline_traces +laion/exp-uns-r2egym-2_1x_glm_4_7_traces_jupiter_cleaned +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_adam-beta1_0-95_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_num-train-epochs_6.0_Qwen3-32B +laion/Qwen3-8B_exp_tas_temp_0.25_traces_save-strategy_steps +laion/Qwen3-8B_perturbed-docker-exp-taskmaster2-tasks_glm_4.7_traces_locetash_save-strategy_steps diff --git a/eval/lists/lists/bfcl_rerun_models.txt b/eval/lists/lists/bfcl_rerun_models.txt new file mode 100644 index 00000000..e8af30d2 --- /dev/null +++ b/eval/lists/lists/bfcl_rerun_models.txt @@ -0,0 +1,74 @@ +DCAgent/All_Puzzles_5k_new_context +DCAgent/code-contests-sandboxes-traces-terminus-2_new_hparams_11_10_25_Qwen3-8B +DCAgent/code_contests_new_questions_gpt-5-mini_Qwen3-8B +DCAgent/freelancer-random-instruction-filter-traces-terminus-2_Qwen3-8B +DCAgent/neulab-codeactinstruct-sandboxes-traces-terminus-2_Qwen3-8B +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step20 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step40 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120225harbor_step_50 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120325harbor_step_73 
+DCAgent/nl2bashG5CP-nl2bash-bs_Q3-8B-mE32-aT-dS-120325hbr_step_20 +DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step40 +DCAgent/taskmaster2-64ep +DCAgent/taskmaster2-banana +DCAgent2/freelancer-t256s-32ep_hub-model-id_freelancer-t256s-32ep-restore-hp_Qwen3-8B +DCAgent2/neulab-code-feedback-sandboxes-traces-terminus-2_hub-model-id_neulab-code-feedback-restore-hp_Qw +DCAgent2/neulab-mind2web-sandboxes-traces-terminus-2_hub-model-id_neulab-mind2web-restore-hp_Qwen3-8B +DCAgent2/neulab-synatra-sandboxes-traces-terminus-2_hub-model-id_neulab-synatra-restore-hp_Qwen3-8B +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-5epochs +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-5epochs-lr1e-3 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-5epochs-lr1e-5 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wd004 +DCAgent2/taskmaster2-8ep_hub-model-id_taskmaster2-8ep-2stage-restore-hp_freelancer-projects-sandboxes-tra +Qwen/Qwen3-8B-Base +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_global-batch-size_64_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_learning-rate_1e-06_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_num-train-epochs_6.0_Qwen3-32B +laion/GLM-4.6-stackoverflow-32eps-65k-fixeps_Qwen3-8B +laion/Qwen3-8B_exp_tas_temp_0.5_traces_save-strategy_steps +laion/dev_set_part1_10k_glm_4_7_traces_locetash +laion/exp-gfi-staqc-askllm-filtered-10K_glm_4_7_traces_jupiter +laion/exp-gfi-staqc-askllm-filtered-10K_glm_4_7_traces_jupiter_cleaned +laion/exp-gfi-swesmith-short-response-filtered-10K_glm_4_7_traces_jupiter +laion/exp-syh-r2egym-askllm-constrained_glm_4_7_traces_jupiter_cleaned +laion/exp-syh-r2egym-askllm-hardened_glm_4_7_traces_jupiter +laion/exp-syh-tezos-askllm-constrained_glm_4_7_traces_jupiter +laion/exp-syh-tezos-stackoverflow-mixed_glm_4_7_traces_jupiter 
+laion/exp-uns-r2egym-16_8x_glm_4_7_traces_jupiter_cleaned +laion/exp-uns-r2egym-33_6x_glm_4_7_traces_jupiter +laion/exp-uns-r2egym-33_6x_glm_4_7_traces_jupiter_cleaned +laion/exp-uns-tezos-1unique_glm_4_7_traces_jupiter +laion/exp-uns-tezos-40x_glm_4_7_traces_jupiter +laion/exp_tas_frequency_penalty_0_5_traces +laion/exp_tas_high_diversity_traces +laion/glm-4_6-stackexchange-tezos-32ep-131k +laion/glm46-neulab-agenttuning-alfworld-sandboxes-maxeps-131k +laion/qwen3base-GLM-4_7-swesmith-sandboxes-with_tests-oracle_verified_120s-maxeps-131k +laion/r2egym-nl2bash-stack-bugsseq-bash-withtests +laion/r2egym-nl2bash-stack-bugsseq-cpp +laion/r2egym-nl2bash-stack-bugsseq-fixthink-again +laion/r2egym-nl2bash-stack-bugsseq-fixthink-exercism-python +laion/r2egym-nl2bash-stack-bugsseq-fixthink-methods2test-v2 +laion/r2egym-nl2bash-stack-bugsseq-fixthink-stack-csharp +laion/r2egym-nl2bash-stack-bugsseq-fixthink-stack-pytest-large +laion/rl_tp4s64_8x_curated +mlfoundations-dev/all-puzzles-sandboxes-traces-terminus-2 +mlfoundations-dev/code-contests-sandboxes-traces-terminus-2 +mlfoundations-dev/code-contests-sandboxes-traces-terminus-2_global-batch-size_16 +mlfoundations-dev/code-contests-sandboxes-traces-terminus-2_seed_44 +mlfoundations-dev/code-contests-sandboxes-traces-terminus-2_seed_45 +mlfoundations-dev/code-contests-sandboxes-traces-terminus-2_seed_46 +mlfoundations-dev/codeforces-sandboxes-traces-terminus-2 +mlfoundations-dev/defects4j-sandboxes-traces-terminus-2 +mlfoundations-dev/freelancer-projects-sandboxes-traces-terminus-2 +mlfoundations-dev/inferredbugs-sandboxes-traces-terminus-2 +mlfoundations-dev/nemo-prism-math-sandboxes-traces-terminus-2 +mlfoundations-dev/qasper-sandboxes-traces-terminus-2 +mlfoundations-dev/stackexchange-overflow-sandboxes-traces-terminus-2 +mlfoundations-dev/stackexchange-tezos-sandboxes-traces-terminus-2 +mlfoundations-dev/staqc-sandboxes-traces-terminus-2 +mlfoundations-dev/swesmith_with_plain_docker-sandboxes-traces-terminus-2 
+mlfoundations-dev/taskmaster2-sandboxes-traces-terminus-2 +penfever/GLM-4_6-inferredbugs-32eps-65k-fixeps +penfever/nl2bash-GLM-4_6-traces-newhparams diff --git a/eval/lists/lists/custom_force_run.txt b/eval/lists/lists/custom_force_run.txt new file mode 100644 index 00000000..15acb74c --- /dev/null +++ b/eval/lists/lists/custom_force_run.txt @@ -0,0 +1 @@ +nvidia/Llama-3.1-Nemotron-Nano-8B-v1 diff --git a/eval/lists/lists/dp_test_model.txt b/eval/lists/lists/dp_test_model.txt new file mode 100644 index 00000000..01b63ba7 --- /dev/null +++ b/eval/lists/lists/dp_test_model.txt @@ -0,0 +1 @@ +DCAgent/a1-stack_jest \ No newline at end of file diff --git a/eval/lists/lists/dsv2_rerun_models.txt b/eval/lists/lists/dsv2_rerun_models.txt new file mode 100644 index 00000000..317130e0 --- /dev/null +++ b/eval/lists/lists/dsv2_rerun_models.txt @@ -0,0 +1,230 @@ +DCAgent/All_Puzzles_5k_New_Context_GPT4o-mini_new_context +DCAgent/All_Puzzles_5k_OG_5k_New_Context_GPT4o-mini_new_context +DCAgent/All_Puzzles_5k_new_context +DCAgent/all-puzzles-sandboxes-traces-terminus-2-with-gpt-4o-mini-judgments-correct +DCAgent/code-contests-sandboxes-traces-terminus-2_new_hparams_11_10_25_Qwen3-8B +DCAgent/code_contests-GLM-4.6-traces_Qwen3-8B +DCAgent/code_contests-Qwen3-Coder-480B-traces +DCAgent/code_contests_new_questions_gpt-5-mini_Qwen3-8B +DCAgent/exp_tas_max_episodes_32_traces +DCAgent/exp_tas_max_episodes_512_traces +DCAgent/exp_tas_max_tokens_1024_traces +DCAgent/exp_tas_max_tokens_8192_traces +DCAgent/exp_tas_presence_penalty_0_25_traces +DCAgent/exp_tas_presence_penalty_1_0_traces +DCAgent/exp_tas_repetition_penalty_1_05_traces +DCAgent/freelancer-embedding-mean-instruction-filter_Qwen3-8B +DCAgent/freelancer-projects-0-1k-traces +DCAgent/freelancer-projects-10k-traces +DCAgent/freelancer-projects-gpt5_Qwen3-8B +DCAgent/freelancer-random-instruction-filter-traces-terminus-2_Qwen3-8B 
+DCAgent/hr1_code-contests-sandboxes-with-tests-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step_60 +DCAgent/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112725harbor_step20 +DCAgent/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step20 +DCAgent/neulab-codeactinstruct-sandboxes-traces-terminus-2_Qwen3-8B +DCAgent/nl2bash-GLM-4.6-traces_Qwen3-8B +DCAgent/nl2bash-Qwen3-Coder-480B-traces_Qwen3-8B +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step25 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step50 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step73 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step40 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step_73 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120225harbor_step_50 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120225harbor_step_73 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120325harbor_step_50 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120325harbor_step_73 +DCAgent/nl2bashG5CP-nl2bash-bs_Q3-8B-mE32-aT-dS-120325hbr_step_20 +DCAgent/nl2bashGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step20 +DCAgent/r2egymG5CP-nl2bash-bs_Q3-8B-mE24-aT-dS-120325hbr_step_25 +DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step40 +DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step_55 +DCAgent/staqc-ot3-100k-science-subset-traces-terminus-2_Qwen3-8B +DCAgent/staqc-ot3-100k-traces-terminus-2_Qwen3-8B +DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-32B +DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-4B-Instruct-2507 +DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-4B-Thinking-2507 +DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-8B-Base +DCAgent/taskmaster2-0-1k-traces 
+DCAgent/taskmaster2-0-3k-traces +DCAgent/taskmaster2-1ep +DCAgent/taskmaster2-1k-traces +DCAgent/taskmaster2-32ep +DCAgent/taskmaster2-4ep +DCAgent/taskmaster2-banana +DCAgent/taskmaster2-gpt5mini +DCAgent/taskmaster2-gpt5mini_global-batch-size_16 +DCAgent/test_sft_qwen3_32k_base_NO_EVAL_Qwen3-8B +DCAgent/wikitable_format_conversion-qwen3-coder-480b-a35b-instruct-awq-traces +DCAgent2/GLM-4_6-codeforces-32eps-32k-fixeps +DCAgent2/GLM-4_6-nl2bash-verified-32eps-32k-fixeps +DCAgent2/GLM-4_6-stackexchange-overflow-sandboxes-32eps-32k +DCAgent2/bugs-stack-nl2bashseq +DCAgent2/bugs-swesmith-over5050 +DCAgent2/bugs-swesmith-undr7030 +DCAgent2/codeforces-GLM-4_6-traces-32ep-32k-1-2-4-dv +DCAgent2/freelancer-projects-31k-traces +DCAgent2/freelancer-t256s-32ep_hub-model-id_freelancer-t256s-32ep-restore-hp_Qwen3-8B +DCAgent2/gemini25flash-stackexchange-overflow-32ep-512k-v3-traces +DCAgent2/glm-4_6-freelancer-traces +DCAgent2/inferredbugs-GLM-4_6-32ep-32k +DCAgent2/inferredbugs-GLM-4_6-32ep-65k +DCAgent2/neulab-code-feedback-sandboxes-traces-terminus-2_hub-model-id_neulab-code-feedback-restore-hp_Qw +DCAgent2/neulab-mind2web-sandboxes-traces-terminus-2_hub-model-id_neulab-mind2web-restore-hp_Qwen3-8B +DCAgent2/neulab-synatra-sandboxes-traces-terminus-2_hub-model-id_neulab-synatra-restore-hp_Qwen3-8B +DCAgent2/nl2bash-bugs-over5050 +DCAgent2/nl2bash-bugsshuffle +DCAgent2/nl2bash-stack-bugs-undr503020 +DCAgent2/nl2bash-stack-bugsseq +DCAgent2/nl2bash-stack-bugsshuffle +DCAgent2/nl2bash-stack-undr3070 +DCAgent2/nl2bash-stack-undr7030 +DCAgent2/nl2bash-stackshuffle +DCAgent2/nl2bash-swesmith-reason +DCAgent2/nl2bash-swesmith-undr7030 +DCAgent2/nl2bash-swesmithseq +DCAgent2/nl2bash-verified-GLM-4.6-traces-32ep-32k-ab1085_Qwen3-8B +DCAgent2/nl2bash-verified-GLM-4.6-traces-32ep-32k-ab1089_Qwen3-8B +DCAgent2/nl2bash-verified-GLM-4.6-traces-32ep-32k-ab1093_Qwen3-8B +DCAgent2/nl2bash-verified-GLM-4.6-traces-32ep-32k_global-batch-size_128_Qwen3-8B 
+DCAgent2/nl2bash-verified-GLM-4.6-traces-32ep-32k_global-batch-size_32_Qwen3-8B +DCAgent2/nl2bash-verified-GLM-4.6-traces-32ep-32k_global-batch-size_64_Qwen3-8B +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-5epochs-lr1e-3 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-5epochs-lr1e-4 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-5epochs-lr1e-5 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-5epochs-lr1e-6 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-7epochs +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-8epochs +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-ab20998 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wd0 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wd002 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wd006 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wd008 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wr0 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wr0005 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wr0010 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wr0150 +DCAgent2/stack-bugs-over5050 +DCAgent2/stack-bugs-undr7030 +DCAgent2/stack-bugsseq +DCAgent2/stack-bugsshuffle +DCAgent2/stack-swesmithseq +DCAgent2/swesmith-bugsseq +DCAgent2/swesmith-nl2bashseq +DCAgent2/swesmith-stack-over5050 +DCAgent2/swesmith-stack-reason +DCAgent2/taskmaster2-1ep_hub-model-id_taskmaster2-1ep-2stage-restore-hp_freelancer-projects-sandboxes-tra +DCAgent2/taskmaster2-8ep_hub-model-id_taskmaster2-8ep-2stage-restore-hp_freelancer-projects-sandboxes-tra +DCAgent2/taskmaster2-GLM-4_6-32ep-32k +NovaSky-AI/SA-SWE-32B +Qwen/Qwen3-1.7B +Qwen/Qwen3-14B +Qwen/Qwen3-32B +Qwen/Qwen3-4B +Qwen/Qwen3-4B-Thinking-2507 +Qwen/Qwen3-8B-Base +Qwen/Qwen3-Coder-30B-A3B-Instruct +R2E-Gym/R2EGym-32B-Agent +SWE-bench/SWE-agent-LM-7B +bespokelabs/Qwen3-8B-ot_step100 +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_adam-beta1_0-95_Qwen3-32B 
+laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_learning-rate_1e-06_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_lr_1e-5_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_num-train-epochs_6.0_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_num-train-epochs_7.0_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_warmup-ratio_0-05_Qwen3-32B +laion/GLM-4_6-freelancer-32eps-131k +laion/GLM-4_6-stackexchange-overflow-sandboxes-32eps-65k-reasoning +laion/GLM-4_6-stackexchange-superuser-32ep-32k +laion/GLM-4_6-swesmith-32ep-131k-nosumm +laion/GLM-4_7-swesmith-sandboxes-with_tests-oracle_verified_120s-maxeps-131k-fixthink +laion/Qwen3-8B_exp-swd-swesmith-wo-docker_glm_4.7_traces_locetash_save-strategy_steps +laion/Qwen3-8B_exp_tas_temp_0.25_traces_save-strategy_steps +laion/Qwen3-8B_exp_tas_temp_0.5_traces_save-strategy_steps +laion/Qwen3-8B_perturbed-docker-exp-taskmaster2-tasks_glm_4.7_traces_locetash_save-strategy_steps +laion/bugs-nl2bashseq_Qwen3-8B +laion/dev_set_part1_10k_glm_4_7_traces_jupiter_cleaned +laion/exp-gfi-staqc-askllm-filtered-10K_glm_4_7_traces_jupiter_cleaned +laion/exp-psu-stackoverflow-1K_glm_4_7_traces +laion/exp-syh-r2egym-askllm-constrained_glm_4_7_traces_jupiter_cleaned +laion/exp-syh-r2egym-swesmith-mixed_glm_4_7_traces_jupiter +laion/exp-syh-tezos-stackoverflow-mixed_glm_4_7_traces_jupiter_cleaned +laion/exp-uns-r2egym-16_8x_glm_4_7_traces_jupiter_cleaned +laion/exp-uns-r2egym-2_1x_glm_4_7_traces_jupiter_cleaned +laion/exp-uns-tezos-10x_glm_4_7_traces_jupiter_cleaned +laion/exp-uns-tezos-128unique_glm_4_7_traces_jupiter_cleaned +laion/exp-uns-tezos-160x_glm_4_7_traces_jupiter_cleaned +laion/exp_tas_baseline_traces +laion/exp_tas_frequency_penalty_0_25_traces +laion/exp_tas_frequency_penalty_0_5_traces +laion/exp_tas_frequency_penalty_1_0_traces +laion/exp_tas_high_diversity_traces 
+laion/exp_tas_linear_history_off_traces +laion/exp_tas_max_tokens_2048_traces +laion/exp_tas_min_p_0_01_traces +laion/exp_tas_min_p_0_05_traces +laion/exp_tas_repetition_penalty_1_2_traces +laion/exp_tas_summarize_off_traces +laion/exp_tas_summarize_threshold_2048_traces +laion/exp_tas_temp_0_5_traces +laion/exp_tas_top_k_128_traces +laion/exp_tas_top_k_16_traces +laion/exp_tas_top_p_0_8_traces +laion/exp_tas_top_p_0_95_traces +laion/exp_tas_top_p_0_9_traces +laion/glm-4_6-dclm-baseline-terminal-traces-32ep-131k +laion/glm-4_6-freelancer-32ep-131k-torch +laion/glm-4_6-r2egym-32ep-32k +laion/glm-4_6-stackexchange-tezos-32ep-131k +laion/glm46-Magicoder-Evol-Instruct-110K-sandboxes-1-traces +laion/glm46-defects4j-32ep-131k +laion/glm46-glaive-code-assistant-sandboxes-maxeps-131k +laion/glm46-neulab-agenttuning-alfworld-sandboxes-maxeps-131k +laion/glm46-stackexchange-tezos-maxeps-131k +laion/gpt-oss-120B-stack-overflow-32ep-131k-summtrc +laion/gpt-oss-120B-stack-overflow-32ep-131k-summtrc-fixthink1 +laion/kimi-k2-r2egym_sandboxes-maxeps-32k +laion/kimi-k2t-neulab-synatra-32ep-131k +laion/minimax-m2-stack-overflow-32ep-131k-summtrc +laion/nl2bash-bugsseq_Qwen3-8B +laion/nl2bash-verified-GLM-4.6-traces-32ep-32k-mgn1e5_Qwen3-8B +laion/nl2bash-verified-GLM-4.6-traces-32ep-32k-mgn5e4_Qwen3-8B +laion/open-thoughts-4-code-qwen3-32b-annotated +laion/qwen3-coder-480B-stack-overflow-32ep-131k-summtrc +laion/r2egym-bugsseq +laion/r2egym-gpt5-codex-160ep-1M +laion/r2egym-nl2bash-bugsseq +laion/r2egym-nl2bash-stack-bugsseq-fixthink-exercism-python +laion/r2egym-nl2bash-stack-bugsseq-fixthink-stack-pytest-large +laion/rl_tp4s64_8x_exercism-python +laion/rl_tp4s64_8x_nemotron-junit +laion/sft_GLM-4-7-swesmith-sandboxes-with_tests-oracle_verified_120s-maxeps-131k_Qwen3-32B +laion/stackexchange-tezos-sandboxes_glm_4_6_traces_locetash +laion/stackexchange-tezos-sandboxes_glm_4_6_traces_locetash_again +laion/stackexchange-tezos-sandboxes_glm_4_6_traces_together_again 
+laion/stackexchange-tezos-sandboxes_glm_4_7_traces_locetash +mlfoundations-dev/all-puzzles-sandboxes-traces-terminus-2 +mlfoundations-dev/code-contests-sandboxes-traces-terminus-2_global-batch-size_16 +mlfoundations-dev/code-contests-sandboxes-traces-terminus-2_seed_44 +mlfoundations-dev/code-contests-sandboxes-traces-terminus-2_seed_45 +mlfoundations-dev/codeforces-sandboxes-traces-terminus-2 +mlfoundations-dev/freelancer-projects-sandboxes-traces-terminus-2 +mlfoundations-dev/inferredbugs-sandboxes-traces-terminus-2 +mlfoundations-dev/nemo-prism-math-sandboxes-traces-terminus-2 +mlfoundations-dev/qasper-sandboxes-traces-terminus-2 +mlfoundations-dev/stackexchange-overflow-sandboxes-traces-terminus-2 +mlfoundations-dev/stackexchange-tezos-sandboxes-traces-terminus-2 +mlfoundations-dev/staqc-sandboxes-traces-terminus-2 +mlfoundations-dev/swesmith_with_plain_docker-sandboxes-traces-terminus-2 +mlfoundations-dev/taskmaster2-sandboxes-traces-terminus-2 +penfever/GLM-4_6-codeforces-32ep-32k-restore-hp +penfever/nl2bash-0-1k-traces-restore-hp +penfever/nl2bash-0-3k-traces-restore-hp +penfever/nl2bash-1k-traces-restore-hp +penfever/nl2bash-2ep-restore-hp +penfever/nl2bash-32ep-restore-hp +penfever/nl2bash-3k-traces-restore-hp +penfever/nl2bash-4ep-restore-hp +penfever/nl2bash-8ep-restore-hp +penfever/nl2bash-GLM-4_6-traces-newhparams +penfever/nl2bash-verified-GLM-4_6-traces-32ep-32k-dft +penfever/nl2bash-verified-GLM-4_6-traces-32ep-32k-restore-hp diff --git a/eval/lists/lists/exp_tas_qwen35.txt b/eval/lists/lists/exp_tas_qwen35.txt new file mode 100644 index 00000000..3ddf9ad1 --- /dev/null +++ b/eval/lists/lists/exp_tas_qwen35.txt @@ -0,0 +1 @@ +laion/exp_tas_optimal_combined_traces-Qwen3.5-9B diff --git a/eval/lists/lists/glm46_131k.txt b/eval/lists/lists/glm46_131k.txt new file mode 100644 index 00000000..8f37fa74 --- /dev/null +++ b/eval/lists/lists/glm46_131k.txt @@ -0,0 +1 @@ +laion/glm46-swesmith-maxeps-131k-lc diff --git a/eval/lists/lists/glm47_flash.txt 
b/eval/lists/lists/glm47_flash.txt new file mode 100644 index 00000000..eae1439b --- /dev/null +++ b/eval/lists/lists/glm47_flash.txt @@ -0,0 +1 @@ +zai-org/GLM-4.7-Flash diff --git a/eval/lists/lists/inactive_models_latest.txt b/eval/lists/lists/inactive_models_latest.txt new file mode 100644 index 00000000..05e4667d --- /dev/null +++ b/eval/lists/lists/inactive_models_latest.txt @@ -0,0 +1,457 @@ +allenai/SERA-14B +allenai/SERA-32B +allenai/SERA-8B +bespokelabs/Qwen3-8B-ot_step100 +bespokelabs/Qwen3-8B-ot_step60_high +camel-ai/seta-rl-qwen3-8b +claude-haiku-4-5-20251001 +DCAgent/All_Puzzles_5k_new_context +DCAgent/All_Puzzles_5k_New_Context_GPT4o-mini_new_context +DCAgent/All_Puzzles_5k_OG_5k_New_Context_GPT4o-mini_new_context +DCAgent/all-puzzles-sandboxes-traces-terminus-2-with-gpt-4o-mini-judgments-correct +DCAgent/bash_textbook_tasks_traces +DCAgent/code_contests_10k_OG_10k_New_Questions_GPT5-mini_Qwen3-8B +DCAgent/code_contests_new_questions_gpt-5-mini_Qwen3-8B +DCAgent/code_contests-GLM-4.6-traces_Qwen3-8B +DCAgent/code_contests-Qwen3-Coder-480B-traces +DCAgent/code-contests-sandboxes-traces-terminus-2_new_hparams_11_10_25_Qwen3-8B +DCAgent/codeforces-gptoss120b-traces +DCAgent/exp_tas_max_tokens_1024_traces +DCAgent/freelancer-askllm-filtered-sandboxes-traces-terminus-2_Qwen3-8B +DCAgent/freelancer-embedding-mean-instruction-filter_Qwen3-8B +DCAgent/freelancer-long-instruction-filter_Qwen3-8B +DCAgent/freelancer-projects-0-1k-traces +DCAgent/freelancer-projects-0-3k-traces +DCAgent/freelancer-projects-100k-traces_Qwen3-8B +DCAgent/freelancer-projects-10k-traces +DCAgent/freelancer-projects-1k-traces +DCAgent/freelancer-projects-3k-traces +DCAgent/freelancer-projects-gpt5_Qwen3-8B +DCAgent/freelancer-projects-gpt5mini +DCAgent/freelancer-random-instruction-filter-traces-terminus-2_Qwen3-8B +DCAgent/freelancer-short-instruction-filter_Qwen3-8B +DCAgent/freelancer-t1024s-32ep_Qwen3-8B +DCAgent/freelancer-t2048s-32ep_Qwen3-8B 
+DCAgent/freelancer-t256s-32ep_Qwen3-8B +DCAgent/freelancer-t512s-32ep_Qwen3-8B +DCAgent/hr1_code-contests-sandboxes-with-tests-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step_60 +DCAgent/hr1_wfc_nl2bash-bs_Q3-8B-mE32-aT-dS-120325hbr_step_20 +DCAgent/hr1_wfc_nl2bash-bs_Q3-8B-mE32-aT-dS-120325hbr_step_40 +DCAgent/hr1_wfc_nl2bash-bs_Q3-8B-mE32-aT-dS-120325hbr_step_64 +DCAgent/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112725harbor_step20 +DCAgent/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112725harbor_step40 +DCAgent/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112725harbor_step64 +DCAgent/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step20 +DCAgent/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step40 +DCAgent/neulab-codeactinstruct-sandboxes-traces-terminus-2_Qwen3-8B +DCAgent/neulab-mind2web-sandboxes-traces-terminus-2_Qwen3-8B +DCAgent/neulab-synatra-sandboxes-traces-terminus-2_Qwen3-8B +DCAgent/nl2bash-GLM-4.6-traces_Qwen3-8B +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step25 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step50 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step73 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step_73 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step20 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step40 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120225harbor_step_25 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120225harbor_step_50 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120225harbor_step_73 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120325harbor_step_25 
+DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120325harbor_step_50 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120325harbor_step_73 +DCAgent/nl2bash-Qwen3-Coder-480B-traces_Qwen3-8B +DCAgent/nl2bashG5CP-nl2bash-bs_Q3-8B-mE32-aT-dS-120325hbr_step_20 +DCAgent/nl2bashG5CP-nl2bash-bs_Q3-8B-mE32-aT-dS-120325hbr_step_34 +DCAgent/nl2bashGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step20 +DCAgent/nl2bashGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step34 +DCAgent/nl2bashGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step20 +DCAgent/r2egymG5CP-nl2bash-bs_Q3-8B-mE24-aT-dS-120325hbr_step_25 +DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step_55 +DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step_80 +DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step20 +DCAgent/staqc-ot3-100k-code-subset-traces-terminus-2_Qwen3-8B +DCAgent/staqc-ot3-100k-math-subset-traces-terminus-2_save-strategy_steps_Qwen3-8B +DCAgent/staqc-ot3-100k-science-subset-traces-terminus-2_Qwen3-8B +DCAgent/staqc-ot3-100k-traces-terminus-2_Qwen3-8B +DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-1-7B +DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-14B +DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-32B +DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-4B +DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-4B-Instruct-2507 +DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-4B-Thinking-2507 +DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-8B-Base +DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-Coder-30B-A3B-Instruct +DCAgent/taskmaster2-0-1k-traces +DCAgent/taskmaster2-0-3k-traces +DCAgent/taskmaster2-10k-traces +DCAgent/taskmaster2-16ep +DCAgent/taskmaster2-1ep +DCAgent/taskmaster2-1k-traces +DCAgent/taskmaster2-2ep +DCAgent/taskmaster2-32ep +DCAgent/taskmaster2-3k-traces +DCAgent/taskmaster2-4ep 
+DCAgent/taskmaster2-64ep +DCAgent/taskmaster2-8ep +DCAgent/taskmaster2-banana +DCAgent/taskmaster2-gpt5mini +DCAgent/taskmaster2-gpt5mini_global-batch-size_16 +DCAgent/tbench_oracle_solutions_terminus +DCAgent/test_sft_qwen3_32k_base_NO_EVAL_Qwen3-8B +DCAgent/wikitable_format_conversion-qwen3-coder-480b-a35b-instruct-awq-traces +DCAgent2/bugs-nl2bashseq +DCAgent2/bugs-stack-nl2bashseq +DCAgent2/bugs-swesmith-over5050 +DCAgent2/codeforces-GLM-4_6-traces-32ep-32k-1-2-4-dv +DCAgent2/freelancer-projects-100k-traces +DCAgent2/freelancer-projects-31k-traces +DCAgent2/freelancer-t256s-32ep_hub-model-id_freelancer-t256s-32ep-restore-hp_Qwen3-8B +DCAgent2/gemini25flash-stackexchange-overflow-32ep-512k-v3-traces +DCAgent2/GLM-4_6-codeforces-32eps-32k-fixeps +DCAgent2/glm-4_6-freelancer-traces +DCAgent2/glm-4_6-freelancer-traces-pm +DCAgent2/GLM-4_6-nl2bash-verified-32eps-32k-fixeps +DCAgent2/GLM-4_6-stackexchange-overflow-sandboxes-32eps-32k +DCAgent2/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112725harbor +DCAgent2/inferredbugs-GLM-4_6-32ep-32k +DCAgent2/inferredbugs-GLM-4_6-32ep-65k +DCAgent2/neulab-code-feedback-sandboxes-traces-terminus-2_hub-model-id_neulab-code-feedback-restore-hp_Qw +DCAgent2/neulab-mind2web-sandboxes-traces-terminus-2_hub-model-id_neulab-mind2web-restore-hp_Qwen3-8B +DCAgent2/neulab-synatra-sandboxes-traces-terminus-2_hub-model-id_neulab-synatra-restore-hp_Qwen3-8B +DCAgent2/nl2bash-bugs-over5050 +DCAgent2/nl2bash-bugs-undr3070 +DCAgent2/nl2bash-bugsseq +DCAgent2/nl2bash-bugsshuffle +DCAgent2/nl2bash-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor +DCAgent2/nl2bash-stack-bugs-over333 +DCAgent2/nl2bash-stack-bugs-undr203050 +DCAgent2/nl2bash-stack-bugs-undr503020 +DCAgent2/nl2bash-stack-over5050 +DCAgent2/nl2bash-stack-undr3070 +DCAgent2/nl2bash-stack-undr7030 +DCAgent2/nl2bash-stackseq +DCAgent2/nl2bash-stackshuffle +DCAgent2/nl2bash-swesmith-reason +DCAgent2/nl2bash-swesmithseq 
+DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-5epochs +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-5epochs-lr1e-3 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-5epochs-lr1e-4 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-5epochs-lr1e-5 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-5epochs-lr1e-6 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-7epochs +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-8epochs +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-ab2095 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-ab2098 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-ab2099 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-ab20998 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-mgn5e2 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wd0 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wd002 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wd004 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wd006 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wd008 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wr0 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wr0005 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wr0010 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wr0150 +DCAgent2/nl2bash-verified-GLM-4.6-traces-32ep-32k_global-batch-size_128_Qwen3-8B +DCAgent2/nl2bash-verified-GLM-4.6-traces-32ep-32k_global-batch-size_32_Qwen3-8B +DCAgent2/nl2bash-verified-GLM-4.6-traces-32ep-32k_global-batch-size_64_Qwen3-8B +DCAgent2/nl2bash-verified-GLM-4.6-traces-32ep-32k-ab1085_Qwen3-8B +DCAgent2/nl2bash-verified-GLM-4.6-traces-32ep-32k-ab1089_Qwen3-8B +DCAgent2/nl2bash-verified-GLM-4.6-traces-32ep-32k-ab1093_Qwen3-8B +DCAgent2/nl2bash-verified-GLM-4.6-traces-32ep-32k-ab1097_Qwen3-8B +DCAgent2/nl2bashGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor +DCAgent2/stack-bugs-over5050 +DCAgent2/stack-bugs-undr3070 +DCAgent2/stack-bugs-undr7030 +DCAgent2/stack-bugsseq +DCAgent2/stack-bugsshuffle 
+DCAgent2/stack-nl2bashseq +DCAgent2/stack-swesmithseq +DCAgent2/swesmith-nl2bashseq +DCAgent2/swesmith-stack-undr7030 +DCAgent2/taskmaster2-1ep_hub-model-id_taskmaster2-1ep-2stage-restore-hp_freelancer-projects-sandboxes-tra +DCAgent2/taskmaster2-8ep_hub-model-id_taskmaster2-8ep-2stage-restore-hp_freelancer-projects-sandboxes-tra +DCAgent2/taskmaster2-GLM-4_6-32ep-32k +DCAgent2/test2-tbench-dev-71-qwen3-8b-8nodes-sync +deepseek-ai/DeepSeek-R1-Distill-Qwen-7B +gemini-2.5-flash +gpt-5-2025-08-07 +gpt-5-mini-2025-08-07 +gpt-5-nano-2025-08-07 +laion/bugs-nl2bashseq_Qwen3-8B +laion/claude-4-5-sonnet-thinking-stackexchange-overflow-32ep-32k-traces +laion/exp_tas_baseline_traces +laion/exp_tas_frequency_penalty_0_25_traces +laion/exp_tas_frequency_penalty_0_5_traces +laion/exp_tas_frequency_penalty_1_0_traces +laion/exp_tas_high_diversity_traces +laion/exp_tas_linear_history_off_traces +laion/exp_tas_low_diversity_traces +laion/exp_tas_max_tokens_2048_traces +laion/exp_tas_max_tokens_4096_traces +laion/exp_tas_min_p_0_05_traces +laion/exp_tas_optimal_combined_traces +laion/exp_tas_parser_xml_traces +laion/exp_tas_raw_content_off_traces +laion/exp_tas_repetition_penalty_1_2_traces +laion/exp_tas_temp_0_5_traces +laion/exp_tas_top_k_128_traces +laion/exp_tas_top_k_16_traces +laion/exp_tas_top_p_0_8_traces +laion/exp-gfi-swesmith-short-response-filtered-10K_glm_4_7_traces_jupiter +laion/exp-psu-stackoverflow-1K_glm_4_7_traces +laion/exp-psu-stackoverflow-316_glm_4_7_traces +laion/exp-psu-stackoverflow-31K_glm_4_7_traces +laion/exp-psu-swesmith-10K_glm_4_7_traces_jupiter +laion/exp-psu-swesmith-31K_glm_4_7_traces_jupiter +laion/exp-psu-swesmith-3K_glm_4_7_traces_jupiter +laion/exp-swd-r2egym-wo-docker_glm_4_7_traces +laion/exp-syh-tezos-askllm-hardened_glm_4_7_traces_jupiter_cleaned +laion/exp-syh-tezos-stackoverflow-mixed_glm_4_7_traces_jupiter_cleaned +laion/exp-uns-r2egym-33_6x_glm_4_7_traces_jupiter_cleaned +laion/exp-uns-r2egym-4_2x_glm_4_7_traces_jupiter 
+laion/exp-uns-r2egym-8_4x_glm_4_7_traces_jupiter +laion/exp-uns-tezos-128unique_glm_4_7_traces_jupiter_cleaned +laion/exp-uns-tezos-1unique_glm_4_7_traces_jupiter +laion/glm-4_6-all-puzzles-32ep-131k +laion/glm-4_6-dclm-baseline-terminal-traces-32ep-131k +laion/glm-4_6-freelancer-32ep-131k-torch +laion/GLM-4_6-freelancer-32eps-131k +laion/GLM-4_6-inferredbugs-32ep-65k-reasoning +laion/glm-4_6-nemo-prism +laion/GLM-4_6-nl2bash-verified-32ep-32k-reasoning +laion/glm-4_6-r2egym-32ep-32k +laion/GLM-4_6-selfinstruct-naive-2-32ep-32k +laion/glm-4_6-stack-overflow-32ep-131k-summtrc +laion/GLM-4_6-stackexchange-superuser-32ep-32k +laion/glm-4_6-staqc-32ep-131k +laion/GLM-4_6-swesmith-32ep-131k-nosumm +laion/GLM-4_6-swesmith-32ep-131k-nosumm-reasoning +laion/GLM-4_7-inferredbugs-sandboxes-maxeps-131k +laion/GLM-4_7-r2egym_sandboxes-maxeps-131k +laion/GLM-4_7-swesmith-sandboxes-with_tests-oracle_verified_120s-maxeps-131k +laion/GLM-4_7-swesmith-sandboxes-with_tests-oracle_verified_120s-maxeps-131k-fixthink +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_learning-rate_1e-06_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_num-train-epochs_8-0_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_warmup-ratio_0-01_Qwen3-32B +laion/GLM-4.6-stackoverflow-32eps-65k-fixeps_Qwen3-8B +laion/glm46-defects4j-32ep-131k +laion/glm46-glaive-code-assistant-sandboxes-maxeps-131k +laion/glm46-Magicoder-Evol-Instruct-110K-sandboxes-1-traces +laion/glm46-neulab-agenttuning-alfworld-sandboxes-maxeps-131k +laion/glm46-neulab-synatra-32ep-131k +laion/glm46-qasper-maxeps-131k +laion/glm46-stackexchange-tezos-maxeps-131k +laion/glm46-swegym-tasks-maxeps-131k +laion/glm46-swesmith-maxeps-131k +laion/GPT-OSS-120B-codeforces-fixeps_Qwen3-8B +laion/gpt-oss-120B-stack-overflow-32ep-131k-summtrc +laion/gpt-oss-120B-stack-overflow-32ep-131k-summtrc-fixthink1 +laion/kimi-k2-r2egym_sandboxes-maxeps-32k 
+laion/kimi-k2-swegym-tasks-maxeps-32k +laion/kimi-k2t-freelancer-32ep-32k +laion/Kimi-K2T-neulab-agenttuning-kg-sandboxes-maxeps-32k +laion/Kimi-K2T-neulab-agenttuning-mind2web-sandboxes-maxeps-32k +laion/Kimi-K2T-neulab-agenttuning-webshop-sandboxes-maxeps-32k +laion/kimi-k2t-neulab-synatra-32ep-131k +laion/Kimi-K2T-swesmith-32ep-131k +laion/MiniMax-M2-freelancer-32ep-32k +laion/MiniMax-M2-freelancer-32ep-32k-reasoning +laion/minimax-m2-stack-overflow-32ep-131k-summtrc +laion/nl2bash-bugs-undr7030_Qwen3-8B +laion/nl2bash-bugsseq_Qwen3-8B +laion/nl2bash-verified-GLM-4.6-traces-32ep-32k-mgn1e3_Qwen3-8B +laion/nl2bash-verified-GLM-4.6-traces-32ep-32k-mgn1e5_Qwen3-8B +laion/nl2bash-verified-GLM-4.6-traces-32ep-32k-mgn5e4_Qwen3-8B +laion/open-thoughts-4-code-qwen3-32b-annotated +laion/Qwen3-8B_exp_tas_summarize_threshold_4096_traces_save-strategy_steps +laion/Qwen3-8B_exp_tas_temp_0.25_traces_save-strategy_steps +laion/Qwen3-8B_exp_tas_temp_0.5_traces_save-strategy_steps +laion/Qwen3-8B_exp_tas_tmux_large_traces_save-strategy_steps +laion/Qwen3-8B_exp_tas_top_k_32_traces_save-strategy_steps +laion/Qwen3-8B_exp_tas_trajectory_minimal_traces_save-strategy_steps +laion/Qwen3-8B_exp-swd-r2egym-standard_glm_4.7_traces_locetash_save-strategy_steps +laion/Qwen3-8B_exp-swd-swesmith-wo-docker_glm_4.7_traces_locetash_save-strategy_steps +laion/Qwen3-8B_perturbed-docker-exp-taskmaster2-tasks_glm_4.7_traces_locetash_save-strategy_steps +laion/Qwen3-Coder-480B-codeforces-fixeps_Qwen3-8B +laion/Qwen3-Coder-480B-nl2bash-fixeps_Qwen3-8B +laion/qwen3-coder-480B-stack-overflow-32ep-131k-summtrc +laion/qwen3base-GLM-4_7-swesmith-sandboxes-with_tests-oracle_verified_120s-maxeps-131k +laion/r2egym-bugsseq +laion/r2egym-gpt5-codex-160ep-1M +laion/r2egym-nl2bash-bugsseq +laion/r2egym-nl2bash-stack-bugsseq +laion/r2egym-nl2bash-stack-bugsseq_lr3e-5_exp_rpt_stack-php-v2_step20 +laion/r2egym-nl2bash-stack-bugsseq-bash-withtests +laion/r2egym-nl2bash-stack-bugsseq-cpp 
+laion/r2egym-nl2bash-stack-bugsseq-fixthink +laion/r2egym-nl2bash-stack-bugsseq-fixthink-again +laion/r2egym-nl2bash-stack-bugsseq-fixthink-exercism-python +laion/r2egym-nl2bash-stack-bugsseq-fixthink-methods2test-v2 +laion/r2egym-nl2bash-stack-bugsseq-fixthink-stack-csharp +laion/r2egym-nl2bash-stack-bugsseq-junit +laion/r2egym-nl2bash-stack-bugsseq-pytest-v2 +laion/r2egym-nl2bash-stack-bugsseq-rl-stack-bash-withtests +laion/r2egym-nl2bash-stack-bugsseq-stack-php-v2 +laion/r2egym-nl2bash-stackseq +laion/r2egym-stack-bugsseq +laion/rl_bs128-gs16-rloo-n-code-contests-900s-noreg-15 +laion/rl_r2egym-nl2bash-stack-bugsseq_lr3e-5_stack-php-v2 +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink_lr1e-5_taco +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink_lr3e-5_bigcodebench-v2 +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_curriculum-medium +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_postmortem +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_pr-postmortem +laion/rl_rl-conf_24GP_base_noth-yaml_mode-path_r2eg-nl2b-stac-bugs_trai-data_exp_rpt_e2eg-larg-60 +laion/rl_rl-conf_24GP_base_noth-yaml_mode-path_r2eg-nl2b-stac-bugs_trai-data_exp_rpt_pyme-larg-60 +laion/rl_rl-conf_24GP_base-yaml_mode-path_exp_tas_opti_comb_trac_trai-data_exp_rpt_unit-pyth-v3 +laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp_rpt_pyme-larg-90 +laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp_rpt_stac-dock-v2 +laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp_rpt_stac-self-larg-70 +laion/rl_rl-conf_qwen_8b_ll_lr1e-5_bs64-yaml_mode-path_r2eg-nl2b-stac-bugs_trai-data_exp_rpt_cros-type +laion/rl_think_npfg-code-contests-900s-45 +laion/rl_tp4s64_8x_2skill +laion/rl_tp4s64_8x_exercism-python +laion/rl_tp4s64_8x_flat25_baseline +laion/rl_tp4s64_8x_github_issue +laion/rl_tp4s64_8x_heavy_padding +laion/rl_tp4s64_8x_minimal_instructions +laion/rl_tp4s64_8x_nemotron-cpp 
+laion/rl_tp4s64_8x_nemotron-junit +laion/rl_tp4s64_8x_proportional +laion/rl_tp4s64_8x_structural_debug +laion/rl_v1_tp4s64_8x_nemotron-junit +laion/sft__Kimi-2-5-inferredbugs-sandboxes-maxeps-32k__Qwen3-8B +laion/sft_GLM-4-7-swesmith-sandboxes-with_tests-oracle_verified_120s-maxeps-131k_Qwen3-32B +laion/stackexchange-tezos-sandboxes_glm_4_6_traces_locetash +laion/stackexchange-tezos-sandboxes_glm_4_6_traces_locetash_again +laion/stackexchange-tezos-sandboxes_glm_4_6_traces_together +laion/stackexchange-tezos-sandboxes_glm_4_6_traces_together_again +mistralai/Devstral-Small-2507 +mlfoundations-dev/all-puzzles-sandboxes-traces-terminus-2 +mlfoundations-dev/code-contests-sandboxes-traces-terminus-2 +mlfoundations-dev/code-contests-sandboxes-traces-terminus-2_global-batch-size_16 +mlfoundations-dev/code-contests-sandboxes-traces-terminus-2_seed_43 +mlfoundations-dev/code-contests-sandboxes-traces-terminus-2_seed_44 +mlfoundations-dev/code-contests-sandboxes-traces-terminus-2_seed_45 +mlfoundations-dev/code-contests-sandboxes-traces-terminus-2_seed_46 +mlfoundations-dev/codeforces-sandboxes-traces-terminus-2 +mlfoundations-dev/defects4j-sandboxes-traces-terminus-2 +mlfoundations-dev/freelancer-projects-sandboxes-traces-terminus-2 +mlfoundations-dev/inferredbugs-sandboxes-traces-terminus-2 +mlfoundations-dev/nemo-prism-math-sandboxes-traces-terminus-2 +mlfoundations-dev/qasper-sandboxes-traces-terminus-2 +mlfoundations-dev/stackexchange-overflow-sandboxes-traces-terminus-2 +mlfoundations-dev/stackexchange-tezos-sandboxes-traces-terminus-2 +mlfoundations-dev/staqc-sandboxes-traces-terminus-2 +mlfoundations-dev/swesmith_with_plain_docker-sandboxes-traces-terminus-2 +mlfoundations-dev/taskmaster2-sandboxes-traces-terminus-2 +moonshotai/Kimi-Dev-72B +moonshotai/Kimi-K2.5 +NovaSky-AI/SA-SWE-32B +nvidia/AceReason-Nemotron-7B +nvidia/Llama-3.1-Nemotron-Nano-8B-v1 +nvidia/Nemotron-Terminal-14B +nvidia/Nemotron-Terminal-32B +nvidia/Nemotron-Terminal-8B 
+nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16 +o4-mini +obiwan96/qwen-2.5-7b-instruct-endless-terminals +obiwan96/qwen3-8b-openthinker-sft-endless-terminals +open-r1/OpenR1-Distill-7B +open-thoughts/OpenThinker-Agent-v1 +open-thoughts/OpenThinker-Agent-v1-SFT +open-thoughts/OpenThinker3-7B +openai/gpt-5 +openai/gpt-5-mini +openai/gpt-5-nano +penfever/freelancer-t1024s-32ep-restore-hp +penfever/freelancer-t2048s-32ep-restore-hp +penfever/freelancer-t512s-32ep-restore-hp +penfever/GLM-4_6-codeforces-32ep-32k-restore-hp +penfever/GLM-4_6-gemini25flash-stackexchange-overflow-32ep-512k-fixeps +penfever/GLM-4_6-inferredbugs-32eps-65k-fixeps +penfever/kimi-k2-swesmith_with_plain_docker-sandboxes-maxeps-32k +penfever/neulab-codeactinstruct-restore-hp +penfever/nl2bash_gpt-5-nano-traces-8ep-restore-hp +penfever/nl2bash_verified_gpt-5-nano-traces-restore-hp +penfever/nl2bash-0-1k-traces-restore-hp +penfever/nl2bash-0-3k-traces-restore-hp +penfever/nl2bash-16ep-restore-hp +penfever/nl2bash-1ep-restore-hp +penfever/nl2bash-1k-traces-restore-hp +penfever/nl2bash-2ep-restore-hp +penfever/nl2bash-32ep-restore-hp +penfever/nl2bash-3k-traces-restore-hp +penfever/nl2bash-4ep-restore-hp +penfever/nl2bash-8ep-restore-hp +penfever/nl2bash-GLM-4_6-traces-newhparams +penfever/nl2bash-verified-GLM-4_6-traces-32ep-32k-dft +penfever/nl2bash-verified-GLM-4_6-traces-32ep-32k-restore-hp +penfever/rl_bs128_gs16_ruby-30 +penfever/rl_rl-conf_20GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp_rpt_code-v2-25 +penfever/rl_rl-conf_20GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp_rpt_stac-pyte-v2-25 +penfever/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp_rpt_soft-v2-45 +penfever/selfinstruct-naive-sandboxes-2-traces-restore-hp +penfever/swesmith-2stage-restore-hp +penfever/taskmaster2-4ep-2stage-restore-hp +Qwen/Qwen2.5-7B-Instruct +Qwen/Qwen2.5-Coder-32B-Instruct +Qwen/Qwen3-1.7B +Qwen/Qwen3-14B +Qwen/Qwen3-235B-A22B-Instruct-2507-tput 
+Qwen/Qwen3-32B +Qwen/Qwen3-4B +Qwen/Qwen3-4B-Instruct-2507 +Qwen/Qwen3-4B-Thinking-2507 +Qwen/Qwen3-8B +Qwen/Qwen3-8B-Base +Qwen/Qwen3-Coder-30B-A3B-Instruct +Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8 +Qwen/Qwen3.5-0.8B +Qwen/Qwen3.5-27B +Qwen/Qwen3.5-2B +Qwen/Qwen3.5-35B-A3B +Qwen/Qwen3.5-4B +Qwen/Qwen3.5-9B +R2E-Gym/R2EGym-32B-Agent +Skywork/Skywork-OR1-7B +Skywork/Skywork-SWE-32B +SWE-bench/SWE-agent-LM-32B +SWE-bench/SWE-agent-LM-7B +SWE-Swiss/SWE-Swiss-32B +zai-org/GLM-4.7 +DCAgent/hr1_code-contests-sandboxes-with-tests-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step_60 +DCAgent/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112725harbor_step20 +DCAgent/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112725harbor_step40 +DCAgent/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112725harbor_step64 +DCAgent/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step20 +DCAgent/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step40 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step25 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step50 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step73 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step20 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step40 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step_73 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120225harbor_step_25 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120225harbor_step_50 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120225harbor_step_73 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120325harbor_step_25 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120325harbor_step_50 
+DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120325harbor_step_73 +DCAgent/nl2bashGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step20 +DCAgent/nl2bashGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step34 +DCAgent/nl2bashGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step20 +DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step30 +DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step45 +DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step20 +DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step40 +DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step_55 +DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step_80 +DCAgent/tbench-dev-71-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step70 +DCAgent2/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112725harbor +DCAgent2/nl2bash-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor +DCAgent2/nl2bashGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor +DCAgent2/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor diff --git a/eval/lists/lists/kept_models_names.txt b/eval/lists/lists/kept_models_names.txt new file mode 100644 index 00000000..f98dffb7 --- /dev/null +++ b/eval/lists/lists/kept_models_names.txt @@ -0,0 +1,221 @@ +/e/data1/datasets/playground/ot/hf_hub/models--laion--r2egym-nl2bash-stack-bugsseq-fixthink-again/snapshots/2f4f59f076583f8c084bbca8308d5f80bfc7def5 +/e/data1/datasets/playground/ot/hf_hub/models--laion--r2egym-nl2bash-stack-bugsseq-fixthink/snapshots/53ccb94616c4fb83ee5c138f334ed1b99c681272 +/leonardo_scratch/fast/AIFAC_5C0_290/dc-agent-shared/hf_hub/models--open-thoughts--OpenThinker-Agent-v1/snapshots/899181e51a920db4b7b580fc50ca1f6d99fbb0f5 +DCAgent/exp_rpt_crosscodeeval-csharp_20260219 
+DCAgent/exp_tas_max_episodes_32_traces +DCAgent/exp_tas_max_episodes_512_traces +DCAgent/exp_tas_max_tokens_8192_traces +DCAgent/exp_tas_presence_penalty_0_25_traces +DCAgent/exp_tas_presence_penalty_1_0_traces +DCAgent/exp_tas_repetition_penalty_1_05_traces +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step40 +DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step30 +DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step45 +DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step40 +DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step40 +DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step40 +DCAgent/tbench-dev-71-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step70 +DCAgent2/bs64_rloo_n_noct_stri_micr_auto_conv_pref_model_r2e-120 +DCAgent2/bugs-swesmith-reason +DCAgent2/bugs-swesmith-undr7030 +DCAgent2/nl2bash-stack-bugsseq +DCAgent2/nl2bash-stack-bugsshuffle +DCAgent2/nl2bash-swesmith-over5050 +DCAgent2/nl2bash-swesmith-undr7030 +DCAgent2/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor +DCAgent2/swesmith-bugsseq +DCAgent2/swesmith-stack-over5050 +DCAgent2/swesmith-stack-reason +DCAgent2/swesmith-stackseq +NovaSky-AI/SA-SWE-32B +Qwen/Qwen2.5-Coder-32B-Instruct +Qwen/Qwen3-1.7B +Qwen/Qwen3-14B +Qwen/Qwen3-235B-A22B-Instruct-2507-tput +Qwen/Qwen3-32B +Qwen/Qwen3-4B +Qwen/Qwen3-4B-Instruct-2507 +Qwen/Qwen3-4B-Thinking-2507 +Qwen/Qwen3-8B +Qwen/Qwen3-8B-Base +Qwen/Qwen3-Coder-30B-A3B-Instruct +Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8 +R2E-Gym/R2EGym-32B-Agent +SWE-Swiss/SWE-Swiss-32B +SWE-bench/SWE-agent-LM-32B +SWE-bench/SWE-agent-LM-7B +Skywork/Skywork-SWE-32B +allenai/SERA-32B +allenai/SERA-8B +claude-haiku-4-5-20251001 +gemini-2.5-flash +gpt-5-2025-08-07 +gpt-5-mini-2025-08-07 +gpt-5-nano-2025-08-07 
+hosted_vllm/mlfoundations-dev/stackexchange-codereview-sandboxes-traces-terminus-2_overwrite-output-dir_True +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_adam-beta1_0-91_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_adam-beta1_0-93_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_adam-beta1_0-95_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_global-batch-size_32_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_global-batch-size_64_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_lr_1e-5_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_num-train-epochs_4.0_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_num-train-epochs_6.0_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_num-train-epochs_7.0_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_warmup-ratio_0-05_Qwen3-32B +laion/GLM-4_6-stackexchange-overflow-sandboxes-32eps-65k-reasoning +laion/GLM-4_7-r2egym_sandboxes-maxeps-131k-lc +laion/GLM-4_7-stackexchange-tezos-sandboxes-maxeps-131k +laion/Kimi-2-5-r2egym_sandboxes-maxeps-32k__Qwen3-8B +laion/Kimi-K2T-ling-coder-sft-sandboxes-1-maxeps-32k +laion/Qwen3-32B-NL2Bash-31step +laion/Qwen3-32B-R2EGYM-256-3epochs +laion/Qwen3-32B-SweSmith-20step +laion/Qwen3-8B_exp_tas_temp_2.0_traces_save-strategy_steps +laion/alfworld-swesmith-r2egym-swegym-131k-lc +laion/bugs-r2egym-stackseq +laion/dev_set_part1_10k_glm_4_7_traces_jupiter +laion/dev_set_part1_10k_glm_4_7_traces_jupiter_cleaned +laion/dev_set_part1_10k_glm_4_7_traces_locetash +laion/exp-gfi-staqc-askllm-filtered-10K_glm_4_7_traces_jupiter +laion/exp-gfi-staqc-askllm-filtered-10K_glm_4_7_traces_jupiter_cleaned 
+laion/exp-gfi-staqc-embedding-mean-filtered-10K_glm_4_7_traces_jupiter +laion/exp-gfi-staqc-short-response-filtered-10K_glm_4_7_traces_locetash +laion/exp-gfi-swesmith-random-filtered-10K_glm_4_7_traces_jupiter +laion/exp-gfi-swesmith-random-filtered-10K_glm_4_7_traces_jupiter_cleaned +laion/exp-psu-stackoverflow-10K_glm_4_7_traces +laion/exp-psu-stackoverflow-3K_glm_4_7_traces +laion/exp-psu-swesmith-1K_glm_4-7_traces_jupiter__Qwen3-8B +laion/exp-psu-swesmith-316_glm_4-7_traces_jupiter__Qwen3-8B +laion/exp-syh-r2egym-askllm-constrained_glm_4_7_traces_jupiter +laion/exp-syh-r2egym-askllm-constrained_glm_4_7_traces_jupiter_cleaned +laion/exp-syh-r2egym-askllm-hardened_glm_4_7_traces_jupiter +laion/exp-syh-r2egym-swesmith-mixed_glm_4_7_traces_jupiter +laion/exp-syh-r2egym-swesmith-mixed_glm_4_7_traces_jupiter_cleaned +laion/exp-syh-r2egym-swesmith-mixed_glm_4_7_traces_locetash +laion/exp-syh-tezos-askllm-constrained_glm_4_7_traces_jupiter +laion/exp-syh-tezos-askllm-hardened_glm_4_7_traces_jupiter +laion/exp-syh-tezos-stackoverflow-mixed_glm_4_7_traces_jupiter +laion/exp-uns-r2egym-16_8x_glm_4_7_traces_jupiter +laion/exp-uns-r2egym-16_8x_glm_4_7_traces_jupiter_cleaned +laion/exp-uns-r2egym-2_1x_glm_4_7_traces_jupiter +laion/exp-uns-r2egym-2_1x_glm_4_7_traces_jupiter_cleaned +laion/exp-uns-r2egym-2_1x_glm_4_7_traces_locetash +laion/exp-uns-r2egym-33_6x_glm_4_7_traces_jupiter +laion/exp-uns-tezos-10x_glm_4_7_traces_jupiter +laion/exp-uns-tezos-10x_glm_4_7_traces_jupiter_cleaned +laion/exp-uns-tezos-128unique_glm_4_7_traces_jupiter +laion/exp-uns-tezos-160x_glm_4_7_traces_jupiter +laion/exp-uns-tezos-160x_glm_4_7_traces_jupiter_cleaned +laion/exp-uns-tezos-40x_glm_4_7_traces_jupiter +laion/exp-uns-tezos-80x_glm_4_7_traces_jupiter_cleaned +laion/exp_rpt_stack-bash-withtests_glm_4_7_traces_jupiter +laion/exp_rpt_stack-csharp_10k_glm_4-7_traces_jupiter__Qwen3-8B +laion/exp_rpt_stack-rust_10k_glm_4_7_traces_jupiter +laion/exp_tas_full_thinking_traces 
+laion/exp_tas_interleaved_thinking_on_traces +laion/exp_tas_min_p_0_01_traces +laion/exp_tas_min_p_0_1_traces +laion/exp_tas_optimal_combined_traces +laion/exp_tas_summarize_off_traces +laion/exp_tas_summarize_threshold_16384_traces +laion/exp_tas_summarize_threshold_2048_traces +laion/exp_tas_timeout_multiplier_0_25_traces +laion/exp_tas_timeout_multiplier_1_0_traces +laion/exp_tas_timeout_multiplier_4_0_traces +laion/exp_tas_timeout_multiplier_8_0_traces +laion/exp_tas_top_k_64_traces +laion/exp_tas_top_p_0_95_traces +laion/exp_tas_top_p_0_9_traces +laion/glm-4_6-stackexchange-tezos-32ep-131k +laion/glm46-Toolscale-tasks-traces +laion/glm46-r2egym_sandboxes-maxeps-131k +laion/glm46-swegym-tasks-maxeps-131k-lc +laion/glm46-swesmith-maxeps-131k-fixthink +laion/glm46-swesmith-maxeps-131k-lc +laion/nl2bash-swesmith-stack-bugsseq +laion/perturbed-docker-exp-freelancer-tasks_glm_4_7_traces +laion/r2egym-nl2bash-stack-bugsseq +laion/r2egym-nl2bash-stack-bugsseq-crosscodeeval-python-v2 +laion/r2egym-nl2bash-stack-bugsseq-fixthink-again +laion/r2egym-nl2bash-stack-bugsseq-fixthink-stack-pytest-large +laion/r2egym-nl2bash-stack-bugsseq-rl-crosscodeeval-csharp +laion/r2egym-nl2bashseq +laion/rl__24GPU_base__exp_rpt_codeelo-v2__Qwen3-8B +laion/rl__24GPU_base__exp_rpt_curriculum-hard__r2egym-nl2bash-stack +laion/rl__24GPU_base__exp_rpt_issue__Qwen3-8B-60 +laion/rl__24GPU_base__exp_rpt_multifile__Qwen3-8B-60 +laion/rl__24GPU_base__exp_rpt_scaffold__Qwen3-8B-60 +laion/rl__24GPU_base__exp_rpt_stack-bash__Qwen3-8B-55 +laion/rl__24GPU_base__mix_h2_language_balanced__r2egym-nl2bash-stack +laion/rl__24GPU_base__swe_rebench_patched_oracle__r2egym-nl2bash-stack +laion/rl__40GPU_base_32b__exp_rpt_codeelo-v2__sft_GLM-4-7-swesmith +laion/rl__40GPU_base_32b__exp_rpt_nemotron-bash__sft_GLM-4-7-swesmith +laion/rl_base-code-contests-900s-160 +laion/rl_base-code-contests-900s-reg-140 +laion/rl_base-code-contests-900s-reg-lr1e-5-140 +laion/rl_base-exp_rpt_stack_bash-90 
+laion/rl_base-exp_rpt_stack_bash_with_gpt5-90 +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_curator-hard +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_curriculum-medium +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_pr +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_pr-postmortem +laion/rl_rl-conf_24GP_base-yaml_mode-path_exp_tas_opti_comb_trac_trai-data_exp_rpt_defe-v3 +laion/rl_rl-conf_24GP_base-yaml_mode-path_exp_tas_opti_comb_trac_trai-data_exp_rpt_pyme-v3-40 +laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt-agai_trai-data_exp_rpt_nemo-bash +laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt-agai_trai-data_exp_rpt_soft-larg +laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt-agai_trai-data_exp_rpt_stac-pyte-synt- +laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt-agai_trai-data_exp_rpt_stac-self-gpt5 +laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp_rpt_stac-php-larg-75 +laion/rl_rl-conf_24GP_base_noth-yaml_mode-path_r2eg-nl2b-stac-bugs_trai-data_exp_rpt_meth-larg-60 +laion/rl_rl-conf_24GP_base_noth-yaml_mode-path_r2eg-nl2b-stac-bugs_trai-data_exp_rpt_stac-self-larg-60 +laion/rl_rl-conf_qwen_8b_ll_lr1e-5_bs64-yaml_mode-path_r2eg-nl2b-stac-bugs_trai-data_exp_rpt_cros-java +laion/rl_rl-conf_qwen_8b_ll_lr1e-5_bs64-yaml_mode-path_r2eg-nl2b-stac-bugs_trai-data_exp_rpt_stac-bash-wi +laion/rl_rl-conf_qwen_8b_ll_lr1e-5_bs64_8x_tp4_seqs-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp +laion/rl_rl-config_24GPU_base-yaml_model-path_Qwen3-8B_train-data_exp_rpt_codeelo-v2 +laion/rl_rl-config_24GPU_base-yaml_model-path_Qwen3-8B_train-data_exp_rpt_nemotron-bash-60 +laion/rl_rl-config_24GPU_base-yaml_model-path_Qwen3-8B_train-data_exp_rpt_pymethods2test-large-50 +laion/rl_swesmith-fixthink-pymethods2test-45 +laion/rl_tp4s64_8x_curated +laion/rl_tp4s64_8x_detailed +laion/rl_tp4s64_8x_error_report 
+laion/rl_tp4s64_8x_expert +laion/rl_tp4s64_8x_moderate_padding +laion/rl_tp4s64_8x_partial_ambiguity +laion/rl_tp4s64_8x_stack-jest-v2 +laion/rl_tp4s64_8x_stack-selfdoc-v2 +laion/rl_v1_tp4s64_8x_exercism-python +laion/rl_v1_tp4s64_8x_stack-jest-large +laion/rl_v1_tp4s64_8x_stack-pytest-large +laion/rl_v1_tp4s64_8x_structural_debug +laion/rl_v3_tp4s64_8x_exercism-python +laion/sft_GLM-4-7-swesmith-sandboxes-with_tests-oracle_verified_120s-maxeps-131k_Qwen3-32B +laion/sft__Kimi-2-5-inferredbugs-sandboxes-maxeps-32k__Qwen3-8B +laion/sft_r2egym-nl2bash-stackoverflow-inferredbugs-32B_Qwen3-32B +laion/stackexchange-tezos-sandboxes_glm_4_7_traces_locetash +laion/swesmith-nl2bash-stack-bugsseq +laion/swesmith-sandboxes-with_tests-gpt-5-mini-passed_glm_4_7_traces +laion/syh-r2eg-askl-glm_4-7_trac_jupi_-gfi-swes-rand-filt-10K_glm_4-7_trac_jupi_32B +mistralai/Devstral-Small-2507 +moonshotai/Kimi-Dev-72B +moonshotai/Kimi-K2.5 +nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16 +o4-mini +openai/gpt-5 +openai/gpt-5-mini +openai/gpt-5-nano +penfever/GLM-4_6-taskmaster2-32eps-32k-fixeps +penfever/bs64_rloo_n_noct_stri_micr_auto_tis_model_r2e-100 +penfever/bs64_rloo_n_noct_stri_micr_model_noconv_r2eg_nl2_140 +penfever/bs64_rloo_n_noct_stri_micr_model_r2eg_nl2_160 +penfever/rl__24GPU_base__exp_rpt_curriculum-hard__r2egym-nl2bash-stack-15 +penfever/rl__24GPU_base__exp_rpt_pymethods2test-large__Qwen3-8B-Base-65 +penfever/rl__24GPU_base__exp_rpt_pymethods2test-large__qwen3base-GLM-4_7-sw-70 +penfever/rl_rl-conf_24GP_base_noth-yaml_mode-path_r2eg-nl2b-stac-bugs_trai-data_exp_rpt_stac-bash-110 +r2egym-nl2bash-stack-bugsseq-bash-withtests +r2egym-nl2bash-stack-bugsseq-cpp +r2egym-nl2bash-stack-bugsseq-junit +r2egym-nl2bash-stack-bugsseq-pytest-v2 +rl__24GPU_base__swe_rebench_patched_oracle__r2egym-nl2bash-stack +zai-org/GLM-4.7 diff --git a/eval/lists/lists/laion_latest.txt b/eval/lists/lists/laion_latest.txt new file mode 100644 index 00000000..864a34e1 --- /dev/null +++ 
b/eval/lists/lists/laion_latest.txt @@ -0,0 +1,28 @@ +laion/rl__24GPU_base__llm-verifier-freelancer__r2egym-nl2bash-stack +laion/rl__24GPU_shaped__swe_rebench_patched_oracle__r2egym-nl2bash-stack +laion/SweSmith-8B-SFT-Rope-step62 +laion/SweSmith-8B-SFT-NoRope-step58 +laion/rl__24GPU_base__mix_h2_language_proportional__r2egym-nl2bash-stack +laion/rl__24GPU_base__swe_rebench_patched_oracle__r2egym-nl2bash-stack +laion/exp-psu-swesmith-31K_glm_4_7_traces_jupiter +laion/exp-psu-swesmith-3K_glm_4_7_traces_jupiter +penfever/rl__24GPU_base__exp_rpt_curriculum-hard__r2egym-nl2bash-stack-15 +penfever/rl__24GPU_base__exp_rpt_pymethods2test-large__Qwen3-8B-Base-65 +penfever/rl__24GPU_base__exp_rpt_pymethods2test-large__qwen3base-GLM-4_7-sw-70 +laion/rl_v3_tp4s64_8x_exercism-python +laion/rl_rl-config_24GPU_base-yaml_model-path_Qwen3-8B_train-data_exp_rpt_nemotron-bash-60 +laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp_rpt_stac-dock-v2 +laion/rl_rl-conf_24GP_base-yaml_mode-path_exp_tas_opti_comb_trac_trai-data_exp_rpt_unit-pyth-v3 +laion/rl_v1_tp4s64_8x_exercism-python +laion/rl_v1_tp4s64_8x_stack-pytest-large +laion/rl_v1_tp4s64_8x_structural_debug +laion/rl_v1_tp4s64_8x_stack-jest-large +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_postmortem +laion/rl_v1_tp4s64_8x_nemotron-junit +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink_lr3e-5_bigcodebench-v2 +laion/rl_r2egym-nl2bash-stack-bugsseq_lr3e-5_stack-php-v2 +laion/r2egym-nl2bash-stack-bugsseq-fixthink-exercism-python +laion/exp-uns-r2egym-33_6x_glm_4_7_traces_jupiter_cleaned +laion/exp-syh-tezos-stackoverflow-mixed_glm_4_7_traces_jupiter_cleaned +laion/exp-syh-tezos-askllm-hardened_glm_4_7_traces_jupiter_cleaned +laion/exp-uns-tezos-128unique_glm_4_7_traces_jupiter_cleaned diff --git a/eval/lists/lists/latest_sort_by_release_eval_prio.txt b/eval/lists/lists/latest_sort_by_release_eval_prio.txt new file mode 100644 index 00000000..b6f1047e --- /dev/null +++ 
b/eval/lists/lists/latest_sort_by_release_eval_prio.txt @@ -0,0 +1,28 @@ +laion/rl__24GPU_base__llm-verifier-freelancer__r2egym-nl2bash-stack +laion/rl__24GPU_shaped__swe_rebench_patched_oracle__r2egym-nl2bash-stack +laion/SweSmith-8B-SFT-Rope-step62 +laion/SweSmith-8B-SFT-NoRope-step58 +laion/rl__24GPU_base__mix_h2_language_proportional__r2egym-nl2bash-stack +rl__24GPU_base__swe_rebench_patched_oracle__r2egym-nl2bash-stack +laion/exp-psu-swesmith-31K_glm_4_7_traces_jupiter +laion/exp-psu-swesmith-3K_glm_4_7_traces_jupiter +penfever/rl__24GPU_base__exp_rpt_curriculum-hard__r2egym-nl2bash-stack-15 +penfever/rl__24GPU_base__exp_rpt_pymethods2test-large__Qwen3-8B-Base-65 +penfever/rl__24GPU_base__exp_rpt_pymethods2test-large__qwen3base-GLM-4_7-sw-70 +laion/rl_v3_tp4s64_8x_exercism-python +laion/rl_rl-config_24GPU_base-yaml_model-path_Qwen3-8B_train-data_exp_rpt_nemotron-bash-60 +laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp_rpt_stac-dock-v2 +laion/rl_rl-conf_24GP_base-yaml_mode-path_exp_tas_opti_comb_trac_trai-data_exp_rpt_unit-pyth-v3 +laion/rl_v1_tp4s64_8x_exercism-python +laion/rl_v1_tp4s64_8x_stack-pytest-large +laion/rl_v1_tp4s64_8x_structural_debug +laion/rl_v1_tp4s64_8x_stack-jest-large +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_postmortem +laion/rl_v1_tp4s64_8x_nemotron-junit +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink_lr3e-5_bigcodebench-v2 +laion/rl_r2egym-nl2bash-stack-bugsseq_lr3e-5_stack-php-v2 +laion/r2egym-nl2bash-stack-bugsseq-fixthink-exercism-python +laion/exp-uns-r2egym-33_6x_glm_4_7_traces_jupiter_cleaned +laion/exp-syh-tezos-stackoverflow-mixed_glm_4_7_traces_jupiter_cleaned +laion/exp-syh-tezos-askllm-hardened_glm_4_7_traces_jupiter_cleaned +laion/exp-uns-tezos-128unique_glm_4_7_traces_jupiter_cleaned diff --git a/eval/lists/lists/missing_dev_set_v2.txt b/eval/lists/lists/missing_dev_set_v2.txt new file mode 100644 index 00000000..7d0b84ba --- /dev/null +++ 
b/eval/lists/lists/missing_dev_set_v2.txt @@ -0,0 +1,189 @@ +laion/r2egym-nl2bash-stack-bugsseq-fixthink-again +laion/r2egym-nl2bash-stack-bugsseq-fixthink +open-thoughts/OpenThinker-Agent-v1 +DCAgent/All_Puzzles_5k_New_Context_GPT4o-mini_new_context +DCAgent/All_Puzzles_5k_new_context +DCAgent/all-puzzles-sandboxes-traces-terminus-2-with-gpt-4o-mini-judgments-correct +DCAgent/code-contests-sandboxes-traces-terminus-2_new_hparams_11_10_25_Qwen3-8B +DCAgent/code_contests-GLM-4.6-traces_Qwen3-8B +DCAgent/code_contests-Qwen3-Coder-480B-traces +DCAgent/code_contests_10k_OG_10k_New_Questions_GPT5-mini_Qwen3-8B +DCAgent/code_contests_new_questions_gpt-5-mini_Qwen3-8B +DCAgent/exp_tas_max_episodes_512_traces +DCAgent/exp_tas_max_tokens_1024_traces +DCAgent/exp_tas_max_tokens_8192_traces +DCAgent/exp_tas_presence_penalty_1_0_traces +DCAgent/exp_tas_repetition_penalty_1_05_traces +DCAgent/freelancer-embedding-mean-instruction-filter_Qwen3-8B +DCAgent/freelancer-long-instruction-filter_Qwen3-8B +DCAgent/freelancer-projects-0-1k-traces +DCAgent/freelancer-projects-0-3k-traces +DCAgent/freelancer-projects-10k-traces +DCAgent/freelancer-projects-1k-traces +DCAgent/freelancer-projects-gpt5_Qwen3-8B +DCAgent/freelancer-random-instruction-filter-traces-terminus-2_Qwen3-8B +DCAgent/freelancer-short-instruction-filter_Qwen3-8B +DCAgent/neulab-codeactinstruct-sandboxes-traces-terminus-2_Qwen3-8B +DCAgent/nl2bash-GLM-4.6-traces_Qwen3-8B +DCAgent/staqc-ot3-100k-science-subset-traces-terminus-2_Qwen3-8B +DCAgent/staqc-ot3-100k-traces-terminus-2_Qwen3-8B +DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-1-7B +DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-32B +DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-8B-Base +DCAgent/taskmaster2-0-1k-traces +DCAgent/taskmaster2-0-3k-traces +DCAgent/taskmaster2-1k-traces +DCAgent/taskmaster2-banana +DCAgent2/bugs-swesmith-reason +DCAgent2/freelancer-t256s-32ep_hub-model-id_freelancer-t256s-32ep-restore-hp_Qwen3-8B 
+DCAgent2/glm-4_6-freelancer-traces +DCAgent2/inferredbugs-GLM-4_6-32ep-65k +DCAgent2/nl2bash-bugs-over5050 +DCAgent2/nl2bash-bugsshuffle +DCAgent2/nl2bash-stack-bugs-undr203050 +DCAgent2/nl2bash-stack-undr3070 +DCAgent2/nl2bash-stack-undr7030 +DCAgent2/nl2bash-stackshuffle +DCAgent2/nl2bash-verified-GLM-4.6-traces-32ep-32k-ab1093_Qwen3-8B +DCAgent2/nl2bash-verified-GLM-4.6-traces-32ep-32k_global-batch-size_32_Qwen3-8B +DCAgent2/nl2bash-verified-GLM-4.6-traces-32ep-32k_global-batch-size_64_Qwen3-8B +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-5epochs +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-5epochs-lr1e-3 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-5epochs-lr1e-5 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-ab2098 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-ab2099 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-mgn5e2 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wd002 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wd004 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wd008 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wr0 +DCAgent2/stack-bugs-undr7030 +DCAgent2/swesmith-stack-undr7030 +Qwen/Qwen2.5-Coder-32B-Instruct +Qwen/Qwen3-4B-Thinking-2507 +Qwen/Qwen3-8B-Base +SWE-bench/SWE-agent-LM-32B +mlfoundations-dev/stackexchange-codereview-sandboxes-traces-terminus-2_overwrite-output-dir_True +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_num-train-epochs_8-0_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_warmup-ratio_0-01_Qwen3-32B +laion/GLM-4.6-stackoverflow-32eps-65k-fixeps_Qwen3-8B +laion/GLM-4_6-stackexchange-superuser-32ep-32k +laion/Kimi-2-5-r2egym_sandboxes-maxeps-32k__Qwen3-8B +laion/Qwen3-32B-NL2Bash-31step +laion/Qwen3-32B-R2EGYM-256-3epochs +laion/Qwen3-32B-SweSmith-20step +laion/Qwen3-8B_exp-swd-r2egym-standard_glm_4.7_traces_locetash_save-strategy_steps +laion/Qwen3-8B_exp_tas_temp_2.0_traces_save-strategy_steps 
+laion/Qwen3-8B_exp_tas_top_k_32_traces_save-strategy_steps +laion/dev_set_part1_10k_glm_4_7_traces_jupiter_cleaned +laion/exp-gfi-staqc-askllm-filtered-10K_glm_4_7_traces_jupiter_cleaned +laion/exp-gfi-swesmith-random-filtered-10K_glm_4_7_traces_jupiter_cleaned +laion/exp-psu-swesmith-10K_glm_4_7_traces_jupiter +laion/exp-psu-swesmith-1K_glm_4-7_traces_jupiter__Qwen3-8B +laion/exp-psu-swesmith-316_glm_4-7_traces_jupiter__Qwen3-8B +laion/exp-psu-swesmith-31K_glm_4_7_traces_jupiter +laion/exp-psu-swesmith-3K_glm_4_7_traces_jupiter +laion/exp-syh-r2egym-askllm-constrained_glm_4_7_traces_jupiter_cleaned +laion/exp-syh-r2egym-swesmith-mixed_glm_4_7_traces_jupiter +laion/exp-syh-tezos-askllm-hardened_glm_4_7_traces_jupiter_cleaned +laion/exp-syh-tezos-stackoverflow-mixed_glm_4_7_traces_jupiter_cleaned +laion/exp-uns-r2egym-16_8x_glm_4_7_traces_jupiter_cleaned +laion/exp-uns-r2egym-2_1x_glm_4_7_traces_jupiter_cleaned +laion/exp-uns-r2egym-33_6x_glm_4_7_traces_jupiter_cleaned +laion/exp-uns-tezos-10x_glm_4_7_traces_jupiter_cleaned +laion/exp-uns-tezos-128unique_glm_4_7_traces_jupiter_cleaned +laion/exp-uns-tezos-160x_glm_4_7_traces_jupiter_cleaned +laion/exp-uns-tezos-80x_glm_4_7_traces_jupiter_cleaned +laion/exp_rpt_stack-bash-withtests_glm_4_7_traces_jupiter +laion/exp_rpt_stack-csharp_10k_glm_4-7_traces_jupiter__Qwen3-8B +laion/exp_rpt_stack-rust_10k_glm_4_7_traces_jupiter +laion/exp_tas_baseline_traces +laion/exp_tas_frequency_penalty_0_25_traces +laion/exp_tas_frequency_penalty_0_5_traces +laion/exp_tas_interleaved_thinking_on_traces +laion/exp_tas_max_tokens_4096_traces +laion/exp_tas_min_p_0_05_traces +laion/exp_tas_raw_content_off_traces +laion/exp_tas_temp_0_5_traces +laion/exp_tas_top_p_0_8_traces +laion/glm46-Magicoder-Evol-Instruct-110K-sandboxes-1-traces +laion/glm46-glaive-code-assistant-sandboxes-maxeps-131k +laion/kimi-k2t-neulab-synatra-32ep-131k +laion/nl2bash-bugs-undr7030_Qwen3-8B +laion/nl2bash-bugsseq_Qwen3-8B 
+laion/nl2bash-verified-GLM-4.6-traces-32ep-32k-mgn1e5_Qwen3-8B +laion/nl2bash-verified-GLM-4.6-traces-32ep-32k-mgn5e4_Qwen3-8B +laion/r2egym-nl2bash-stack-bugsseq-fixthink-exercism-python +laion/r2egym-nl2bash-stack-bugsseq-fixthink-stack-pytest-large +laion/rl__24GPU_base__exp_rpt_codeelo-v2__Qwen3-8B +laion/rl__24GPU_base__exp_rpt_curriculum-hard__r2egym-nl2bash-stack +laion/rl__24GPU_base__exp_rpt_issue__Qwen3-8B-60 +laion/rl__24GPU_base__exp_rpt_multifile__Qwen3-8B-60 +laion/rl__24GPU_base__exp_rpt_scaffold__Qwen3-8B-60 +laion/rl__24GPU_base__exp_rpt_stack-bash__Qwen3-8B-55 +laion/rl__24GPU_base__mix_h2_language_balanced__r2egym-nl2bash-stack +laion/rl__24GPU_base__swe_rebench_patched_oracle__r2egym-nl2bash-stack +laion/rl__40GPU_base_32b__exp_rpt_codeelo-v2__sft_GLM-4-7-swesmith +laion/rl__40GPU_base_32b__exp_rpt_nemotron-bash__sft_GLM-4-7-swesmith +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_curator-hard +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_curriculum-medium +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_postmortem +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_pr +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_pr-postmortem +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink_lr3e-5_bigcodebench-v2 +laion/rl_r2egym-nl2bash-stack-bugsseq_lr3e-5_stack-php-v2 +laion/rl_rl-conf_24GP_base-yaml_mode-path_exp_tas_opti_comb_trac_trai-data_exp_rpt_defe-v3 +laion/rl_rl-conf_24GP_base-yaml_mode-path_exp_tas_opti_comb_trac_trai-data_exp_rpt_unit-pyth-v3 +laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt-agai_trai-data_exp_rpt_nemo-bash +laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt-agai_trai-data_exp_rpt_soft-larg +laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt-agai_trai-data_exp_rpt_stac-pyte-synt- +laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt-agai_trai-data_exp_rpt_stac-self-gpt5 
+laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp_rpt_stac-dock-v2 +laion/rl_rl-conf_qwen_8b_ll_lr1e-5_bs64-yaml_mode-path_r2eg-nl2b-stac-bugs_trai-data_exp_rpt_stac-bash-wi +laion/rl_rl-conf_qwen_8b_ll_lr1e-5_bs64_8x_tp4_seqs-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp +laion/rl_rl-config_24GPU_base-yaml_model-path_Qwen3-8B_train-data_exp_rpt_codeelo-v2 +laion/rl_rl-config_24GPU_base-yaml_model-path_Qwen3-8B_train-data_exp_rpt_nemotron-bash-60 +laion/rl_rl-config_24GPU_base-yaml_model-path_Qwen3-8B_train-data_exp_rpt_pymethods2test-large-50 +laion/rl_v1_tp4s64_8x_exercism-python +laion/rl_v1_tp4s64_8x_nemotron-junit +laion/rl_v1_tp4s64_8x_stack-jest-large +laion/rl_v1_tp4s64_8x_stack-pytest-large +laion/rl_v1_tp4s64_8x_structural_debug +laion/rl_v3_tp4s64_8x_exercism-python +laion/sft__Kimi-2-5-inferredbugs-sandboxes-maxeps-32k__Qwen3-8B +laion/sft_r2egym-nl2bash-stackoverflow-inferredbugs-32B_Qwen3-32B +laion/syh-r2eg-askl-glm_4-7_trac_jupi_-gfi-swes-rand-filt-10K_glm_4-7_trac_jupi_32B +mlfoundations-dev/all-puzzles-sandboxes-traces-terminus-2 +mlfoundations-dev/code-contests-sandboxes-traces-terminus-2_global-batch-size_16 +mlfoundations-dev/code-contests-sandboxes-traces-terminus-2_seed_44 +mlfoundations-dev/code-contests-sandboxes-traces-terminus-2_seed_45 +mlfoundations-dev/codeforces-sandboxes-traces-terminus-2 +mlfoundations-dev/defects4j-sandboxes-traces-terminus-2 +mlfoundations-dev/freelancer-projects-sandboxes-traces-terminus-2 +mlfoundations-dev/nemo-prism-math-sandboxes-traces-terminus-2 +mlfoundations-dev/qasper-sandboxes-traces-terminus-2 +mlfoundations-dev/stackexchange-overflow-sandboxes-traces-terminus-2 +mlfoundations-dev/stackexchange-tezos-sandboxes-traces-terminus-2 +mlfoundations-dev/staqc-sandboxes-traces-terminus-2 +mlfoundations-dev/swesmith_with_plain_docker-sandboxes-traces-terminus-2 +moonshotai/Kimi-Dev-72B +penfever/GLM-4_6-codeforces-32ep-32k-restore-hp 
+penfever/bs64_rloo_n_noct_stri_micr_model_noconv_r2eg_nl2_140 +penfever/nl2bash-0-1k-traces-restore-hp +penfever/nl2bash-0-3k-traces-restore-hp +penfever/nl2bash-1k-traces-restore-hp +penfever/nl2bash-2ep-restore-hp +penfever/nl2bash-32ep-restore-hp +penfever/nl2bash-3k-traces-restore-hp +penfever/nl2bash-4ep-restore-hp +penfever/nl2bash-8ep-restore-hp +penfever/nl2bash-verified-GLM-4_6-traces-32ep-32k-dft +penfever/rl__24GPU_base__exp_rpt_curriculum-hard__r2egym-nl2bash-stack-15 +penfever/rl__24GPU_base__exp_rpt_pymethods2test-large__Qwen3-8B-Base-65 +penfever/rl__24GPU_base__exp_rpt_pymethods2test-large__qwen3base-GLM-4_7-sw-70 +penfever/swesmith-2stage-restore-hp +laion/r2egym-nl2bash-stack-bugsseq-bash-withtests +laion/r2egym-nl2bash-stack-bugsseq-cpp +laion/r2egym-nl2bash-stack-bugsseq-junit +laion/r2egym-nl2bash-stack-bugsseq-pytest-v2 +laion/rl__24GPU_base__swe_rebench_patched_oracle__r2egym-nl2bash-stack diff --git a/eval/lists/lists/missing_dev_set_v2_inactive_laion.txt b/eval/lists/lists/missing_dev_set_v2_inactive_laion.txt new file mode 100644 index 00000000..730ec8cc --- /dev/null +++ b/eval/lists/lists/missing_dev_set_v2_inactive_laion.txt @@ -0,0 +1,24 @@ +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_num-train-epochs_8-0_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_warmup-ratio_0-01_Qwen3-32B +laion/GLM-4.6-stackoverflow-32eps-65k-fixeps_Qwen3-8B +laion/GLM-4_6-stackexchange-superuser-32ep-32k +laion/Qwen3-8B_exp-swd-r2egym-standard_glm_4.7_traces_locetash_save-strategy_steps +laion/Qwen3-8B_exp_tas_top_k_32_traces_save-strategy_steps +laion/exp_tas_baseline_traces +laion/exp_tas_frequency_penalty_0_25_traces +laion/exp_tas_frequency_penalty_0_5_traces +laion/exp_tas_max_tokens_4096_traces +laion/exp_tas_min_p_0_05_traces +laion/exp_tas_raw_content_off_traces +laion/exp_tas_temp_0_5_traces +laion/exp_tas_top_p_0_8_traces +laion/glm46-Magicoder-Evol-Instruct-110K-sandboxes-1-traces 
+laion/nl2bash-bugs-undr7030_Qwen3-8B +laion/nl2bash-bugsseq_Qwen3-8B +laion/nl2bash-verified-GLM-4.6-traces-32ep-32k-mgn1e5_Qwen3-8B +laion/nl2bash-verified-GLM-4.6-traces-32ep-32k-mgn5e4_Qwen3-8B +laion/r2egym-nl2bash-stack-bugsseq-fixthink-exercism-python +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink_lr3e-5_bigcodebench-v2 +laion/rl_r2egym-nl2bash-stack-bugsseq_lr3e-5_stack-php-v2 +laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp_rpt_stac-dock-v2 +laion/sft__Kimi-2-5-inferredbugs-sandboxes-maxeps-32k__Qwen3-8B diff --git a/eval/lists/lists/missing_swebench_verified_random_100_folders.txt b/eval/lists/lists/missing_swebench_verified_random_100_folders.txt new file mode 100644 index 00000000..a07b9104 --- /dev/null +++ b/eval/lists/lists/missing_swebench_verified_random_100_folders.txt @@ -0,0 +1,69 @@ +DCAgent/All_Puzzles_5k_new_context +DCAgent/exp_rpt_crosscodeeval-csharp_20260219 +DCAgent/freelancer-projects-100k-traces_Qwen3-8B +DCAgent/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112725harbor_step64 +DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-14B +DCAgent2/glm-4_6-freelancer-traces-pm +laion/GLM-4_6-freelancer-32eps-131k +laion/GLM-4_6-stackexchange-superuser-32ep-32k +laion/GLM-4_6-swesmith-32ep-131k-nosumm-reasoning +laion/Kimi-2-5-r2egym_sandboxes-maxeps-32k__Qwen3-8B +laion/Qwen3-32B-NL2Bash-31step +laion/Qwen3-32B-R2EGYM-256-3epochs +laion/Qwen3-32B-SweSmith-20step +laion/exp-psu-swesmith-10K_glm_4_7_traces_jupiter +laion/exp-psu-swesmith-1K_glm_4-7_traces_jupiter__Qwen3-8B +laion/exp-psu-swesmith-316_glm_4-7_traces_jupiter__Qwen3-8B +laion/exp-psu-swesmith-31K_glm_4_7_traces_jupiter +laion/exp-psu-swesmith-3K_glm_4_7_traces_jupiter +laion/exp_rpt_stack-bash-withtests_glm_4_7_traces_jupiter +laion/exp_rpt_stack-csharp_10k_glm_4-7_traces_jupiter__Qwen3-8B +laion/exp_rpt_stack-rust_10k_glm_4_7_traces_jupiter +laion/r2egym-nl2bash-stack-bugsseq-rl-stack-bash-withtests 
+laion/rl__24GPU_base__exp_rpt_codeelo-v2__Qwen3-8B +laion/rl__24GPU_base__exp_rpt_curriculum-hard__r2egym-nl2bash-stack +laion/rl__24GPU_base__exp_rpt_issue__Qwen3-8B-60 +laion/rl__24GPU_base__exp_rpt_multifile__Qwen3-8B-60 +laion/rl__24GPU_base__exp_rpt_scaffold__Qwen3-8B-60 +laion/rl__24GPU_base__exp_rpt_stack-bash__Qwen3-8B-55 +laion/rl__24GPU_base__mix_h2_language_balanced__r2egym-nl2bash-stack +laion/rl__24GPU_base__swe_rebench_patched_oracle__r2egym-nl2bash-stack +laion/rl__40GPU_base_32b__exp_rpt_codeelo-v2__sft_GLM-4-7-swesmith +laion/rl__40GPU_base_32b__exp_rpt_nemotron-bash__sft_GLM-4-7-swesmith +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_curator-hard +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_curriculum-medium +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_postmortem +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_pr +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_pr-postmortem +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink_lr3e-5_bigcodebench-v2 +laion/rl_r2egym-nl2bash-stack-bugsseq_lr3e-5_stack-php-v2 +laion/rl_rl-conf_24GP_base-yaml_mode-path_exp_tas_opti_comb_trac_trai-data_exp_rpt_defe-v3 +laion/rl_rl-conf_24GP_base-yaml_mode-path_exp_tas_opti_comb_trac_trai-data_exp_rpt_unit-pyth-v3 +laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt-agai_trai-data_exp_rpt_nemo-bash +laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt-agai_trai-data_exp_rpt_soft-larg +laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt-agai_trai-data_exp_rpt_stac-pyte-synt- +laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt-agai_trai-data_exp_rpt_stac-self-gpt5 +laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp_rpt_stac-dock-v2 +laion/rl_rl-conf_qwen_8b_ll_lr1e-5_bs64-yaml_mode-path_r2eg-nl2b-stac-bugs_trai-data_exp_rpt_stac-bash-wi 
+laion/rl_rl-conf_qwen_8b_ll_lr1e-5_bs64_8x_tp4_seqs-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp +laion/rl_rl-config_24GPU_base-yaml_model-path_Qwen3-8B_train-data_exp_rpt_codeelo-v2 +laion/rl_rl-config_24GPU_base-yaml_model-path_Qwen3-8B_train-data_exp_rpt_nemotron-bash-60 +laion/rl_rl-config_24GPU_base-yaml_model-path_Qwen3-8B_train-data_exp_rpt_pymethods2test-large-50 +laion/rl_v1_tp4s64_8x_exercism-python +laion/rl_v1_tp4s64_8x_stack-jest-large +laion/rl_v1_tp4s64_8x_stack-pytest-large +laion/rl_v1_tp4s64_8x_structural_debug +laion/rl_v3_tp4s64_8x_exercism-python +laion/sft__Kimi-2-5-inferredbugs-sandboxes-maxeps-32k__Qwen3-8B +laion/sft_r2egym-nl2bash-stackoverflow-inferredbugs-32B_Qwen3-32B +laion/syh-r2eg-askl-glm_4-7_trac_jupi_-gfi-swes-rand-filt-10K_glm_4-7_trac_jupi_32B +penfever/nl2bash-2ep-restore-hp +penfever/nl2bash-3k-traces-restore-hp +penfever/rl__24GPU_base__exp_rpt_curriculum-hard__r2egym-nl2bash-stack-15 +penfever/rl__24GPU_base__exp_rpt_pymethods2test-large__Qwen3-8B-Base-65 +penfever/rl__24GPU_base__exp_rpt_pymethods2test-large__qwen3base-GLM-4_7-sw-70 +r2egym-nl2bash-stack-bugsseq-bash-withtests +r2egym-nl2bash-stack-bugsseq-cpp +r2egym-nl2bash-stack-bugsseq-junit +r2egym-nl2bash-stack-bugsseq-pytest-v2 +rl__24GPU_base__swe_rebench_patched_oracle__r2egym-nl2bash-stack diff --git a/eval/lists/lists/missing_terminal_bench_2.txt b/eval/lists/lists/missing_terminal_bench_2.txt new file mode 100644 index 00000000..64dd4971 --- /dev/null +++ b/eval/lists/lists/missing_terminal_bench_2.txt @@ -0,0 +1,96 @@ + +DCAgent/exp_rpt_crosscodeeval-csharp_20260219 +DCAgent/hr1_wfc_nl2bash-bs_Q3-8B-mE32-aT-dS-120325hbr_step_64 +DCAgent/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112725harbor_step64 +DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step30 +DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step45 +DCAgent/taskmaster2-banana 
+DCAgent/tbench-dev-71-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step70 +DCAgent2/nl2bash-stack-bugs-undr203050 +DCAgent2/nl2bash-swesmithseq +DCAgent2/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor +Qwen/Qwen2.5-Coder-32B-Instruct +hosted_vllm/mlfoundations-dev/stackexchange-codereview-sandboxes-traces-terminus-2_overwrite-output-dir_True +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_adam-beta1_0-91_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_adam-beta1_0-93_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_adam-beta1_0-95_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_lr_1e-5_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_num-train-epochs_8-0_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_warmup-ratio_0-01_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_warmup-ratio_0-05_Qwen3-32B +laion/Kimi-2-5-r2egym_sandboxes-maxeps-32k__Qwen3-8B +laion/Qwen3-32B-NL2Bash-31step +laion/Qwen3-32B-R2EGYM-256-3epochs +laion/Qwen3-32B-SweSmith-20step +laion/Qwen3-8B_exp-swd-r2egym-standard_glm_4.7_traces_locetash_save-strategy_steps +laion/Qwen3-8B_exp-swd-swesmith-wo-docker_glm_4.7_traces_locetash_save-strategy_steps +laion/Qwen3-8B_exp_tas_temp_0.25_traces_save-strategy_steps +laion/Qwen3-8B_exp_tas_temp_0.5_traces_save-strategy_steps +laion/Qwen3-8B_exp_tas_temp_2.0_traces_save-strategy_steps +laion/Qwen3-8B_exp_tas_tmux_large_traces_save-strategy_steps +laion/Qwen3-8B_exp_tas_top_k_32_traces_save-strategy_steps +laion/Qwen3-8B_perturbed-docker-exp-taskmaster2-tasks_glm_4.7_traces_locetash_save-strategy_steps +laion/claude-4-5-sonnet-thinking-stackexchange-overflow-32ep-32k-traces +laion/exp-psu-swesmith-10K_glm_4_7_traces_jupiter +laion/exp-psu-swesmith-1K_glm_4-7_traces_jupiter__Qwen3-8B 
+laion/exp-psu-swesmith-316_glm_4-7_traces_jupiter__Qwen3-8B +laion/exp-psu-swesmith-31K_glm_4_7_traces_jupiter +laion/exp-psu-swesmith-3K_glm_4_7_traces_jupiter +laion/exp_rpt_stack-bash-withtests_glm_4_7_traces_jupiter +laion/exp_rpt_stack-csharp_10k_glm_4-7_traces_jupiter__Qwen3-8B +laion/exp_rpt_stack-rust_10k_glm_4_7_traces_jupiter +laion/glm46-swegym-tasks-maxeps-131k +laion/glm46-swesmith-maxeps-131k-lc +laion/r2egym-nl2bash-stack-bugsseq-rl-stack-bash-withtests +laion/rl__24GPU_base__exp_rpt_codeelo-v2__Qwen3-8B +laion/rl__24GPU_base__exp_rpt_curriculum-hard__r2egym-nl2bash-stack +laion/rl__24GPU_base__exp_rpt_issue__Qwen3-8B-60 +laion/rl__24GPU_base__exp_rpt_multifile__Qwen3-8B-60 +laion/rl__24GPU_base__exp_rpt_scaffold__Qwen3-8B-60 +laion/rl__24GPU_base__exp_rpt_stack-bash__Qwen3-8B-55 +laion/rl__24GPU_base__mix_h2_language_balanced__r2egym-nl2bash-stack +laion/rl__24GPU_base__swe_rebench_patched_oracle__r2egym-nl2bash-stack +laion/rl__40GPU_base_32b__exp_rpt_codeelo-v2__sft_GLM-4-7-swesmith +laion/rl__40GPU_base_32b__exp_rpt_nemotron-bash__sft_GLM-4-7-swesmith +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_curator-hard +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_pr +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_pr-postmortem +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink_lr3e-5_bigcodebench-v2 +laion/rl_r2egym-nl2bash-stack-bugsseq_lr3e-5_stack-php-v2 +laion/rl_rl-conf_24GP_base-yaml_mode-path_exp_tas_opti_comb_trac_trai-data_exp_rpt_defe-v3 +laion/rl_rl-conf_24GP_base-yaml_mode-path_exp_tas_opti_comb_trac_trai-data_exp_rpt_pyme-v3-40 +laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt-agai_trai-data_exp_rpt_nemo-bash +laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt-agai_trai-data_exp_rpt_soft-larg +laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt-agai_trai-data_exp_rpt_stac-pyte-synt- 
+laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt-agai_trai-data_exp_rpt_stac-self-gpt5 +laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp_rpt_pyme-larg-90 +laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp_rpt_stac-php-larg-75 +laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp_rpt_stac-self-larg-70 +laion/rl_rl-conf_24GP_base_noth-yaml_mode-path_r2eg-nl2b-stac-bugs_trai-data_exp_rpt_e2eg-larg-60 +laion/rl_rl-conf_24GP_base_noth-yaml_mode-path_r2eg-nl2b-stac-bugs_trai-data_exp_rpt_meth-larg-60 +laion/rl_rl-conf_24GP_base_noth-yaml_mode-path_r2eg-nl2b-stac-bugs_trai-data_exp_rpt_pyme-larg-60 +laion/rl_rl-conf_24GP_base_noth-yaml_mode-path_r2eg-nl2b-stac-bugs_trai-data_exp_rpt_stac-self-larg-60 +laion/rl_rl-conf_qwen_8b_ll_lr1e-5_bs64-yaml_mode-path_r2eg-nl2b-stac-bugs_trai-data_exp_rpt_stac-bash-wi +laion/rl_rl-conf_qwen_8b_ll_lr1e-5_bs64_8x_tp4_seqs-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp +laion/rl_rl-config_24GPU_base-yaml_model-path_Qwen3-8B_train-data_exp_rpt_codeelo-v2 +laion/rl_rl-config_24GPU_base-yaml_model-path_Qwen3-8B_train-data_exp_rpt_nemotron-bash-60 +laion/rl_rl-config_24GPU_base-yaml_model-path_Qwen3-8B_train-data_exp_rpt_pymethods2test-large-50 +laion/rl_swesmith-fixthink-pymethods2test-45 +laion/rl_v1_tp4s64_8x_exercism-python +laion/rl_v1_tp4s64_8x_nemotron-junit +laion/rl_v1_tp4s64_8x_stack-jest-large +laion/rl_v1_tp4s64_8x_stack-pytest-large +laion/rl_v1_tp4s64_8x_structural_debug +laion/rl_v3_tp4s64_8x_exercism-python +laion/sft__Kimi-2-5-inferredbugs-sandboxes-maxeps-32k__Qwen3-8B +laion/sft_r2egym-nl2bash-stackoverflow-inferredbugs-32B_Qwen3-32B +laion/stackexchange-tezos-sandboxes_glm_4_6_traces_locetash_again +laion/syh-r2eg-askl-glm_4-7_trac_jupi_-gfi-swes-rand-filt-10K_glm_4-7_trac_jupi_32B +penfever/rl__24GPU_base__exp_rpt_curriculum-hard__r2egym-nl2bash-stack-15 
+penfever/rl__24GPU_base__exp_rpt_pymethods2test-large__Qwen3-8B-Base-65 +penfever/rl__24GPU_base__exp_rpt_pymethods2test-large__qwen3base-GLM-4_7-sw-70 +r2egym-nl2bash-stack-bugsseq-bash-withtests +r2egym-nl2bash-stack-bugsseq-cpp +r2egym-nl2bash-stack-bugsseq-junit +r2egym-nl2bash-stack-bugsseq-pytest-v2 +rl__24GPU_base__swe_rebench_patched_oracle__r2egym-nl2bash-stack diff --git a/eval/lists/lists/models_131k.txt b/eval/lists/lists/models_131k.txt new file mode 100644 index 00000000..b049ad34 --- /dev/null +++ b/eval/lists/lists/models_131k.txt @@ -0,0 +1,35 @@ +laion/GLM-4_6-freelancer-32eps-131k +laion/GLM-4_6-swesmith-32ep-131k-nosumm +laion/GLM-4_6-swesmith-32ep-131k-nosumm-reasoning +laion/GLM-4_7-inferredbugs-sandboxes-maxeps-131k +laion/GLM-4_7-r2egym_sandboxes-maxeps-131k +laion/GLM-4_7-r2egym_sandboxes-maxeps-131k-lc +laion/GLM-4_7-stackexchange-tezos-sandboxes-maxeps-131k +laion/GLM-4_7-swesmith-sandboxes-with_tests-oracle_verified_120s-maxeps-131k +laion/GLM-4_7-swesmith-sandboxes-with_tests-oracle_verified_120s-maxeps-131k-fixthink +laion/Kimi-K2T-swesmith-32ep-131k +laion/glm-4_6-all-puzzles-32ep-131k +laion/glm-4_6-dclm-baseline-terminal-traces-32ep-131k +laion/glm-4_6-freelancer-32ep-131k-torch +laion/glm-4_6-stack-overflow-32ep-131k-summtrc +laion/glm-4_6-stackexchange-tezos-32ep-131k +laion/glm-4_6-staqc-32ep-131k +laion/glm46-defects4j-32ep-131k +laion/glm46-glaive-code-assistant-sandboxes-maxeps-131k +laion/glm46-neulab-agenttuning-alfworld-sandboxes-maxeps-131k +laion/glm46-neulab-synatra-32ep-131k +laion/glm46-qasper-maxeps-131k +laion/glm46-r2egym_sandboxes-maxeps-131k +laion/glm46-stackexchange-tezos-maxeps-131k +laion/glm46-swegym-tasks-maxeps-131k +laion/glm46-swegym-tasks-maxeps-131k-lc +laion/glm46-swesmith-maxeps-131k +laion/glm46-swesmith-maxeps-131k-fixthink +laion/glm46-swesmith-maxeps-131k-lc +laion/gpt-oss-120B-stack-overflow-32ep-131k-summtrc +laion/gpt-oss-120B-stack-overflow-32ep-131k-summtrc-fixthink1 
+laion/kimi-k2t-neulab-synatra-32ep-131k +laion/minimax-m2-stack-overflow-32ep-131k-summtrc +laion/qwen3-coder-480B-stack-overflow-32ep-131k-summtrc +laion/qwen3base-GLM-4_7-swesmith-sandboxes-with_tests-oracle_verified_120s-maxeps-131k +laion/sft_GLM-4-7-swesmith-sandboxes-with_tests-oracle_verified_120s-maxeps-131k_Qwen3-32B diff --git a/eval/lists/lists/models_32b.txt b/eval/lists/lists/models_32b.txt new file mode 100644 index 00000000..2c5a7967 --- /dev/null +++ b/eval/lists/lists/models_32b.txt @@ -0,0 +1,31 @@ +DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-32B +NovaSky-AI/SA-SWE-32B +Qwen/Qwen2.5-Coder-32B-Instruct +Qwen/Qwen3-32B +R2E-Gym/R2EGym-32B-Agent +SWE-Swiss/SWE-Swiss-32B +SWE-bench/SWE-agent-LM-32B +Skywork/Skywork-SWE-32B +allenai/SERA-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_adam-beta1_0-91_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_adam-beta1_0-93_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_adam-beta1_0-95_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_global-batch-size_32_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_global-batch-size_64_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_learning-rate_1e-06_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_lr_1e-5_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_num-train-epochs_4.0_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_num-train-epochs_6.0_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_num-train-epochs_7.0_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_num-train-epochs_8-0_Qwen3-32B 
+laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_warmup-ratio_0-01_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_warmup-ratio_0-05_Qwen3-32B +laion/Qwen3-32B-R2EGYM-256-3epochs +laion/Qwen3-32B-SweSmith-20step +laion/open-thoughts-4-code-qwen3-32b-annotated +laion/rl__40GPU_base_32b__exp_rpt_codeelo-v2__sft_GLM-4-7-swesmith +laion/rl__40GPU_base_32b__exp_rpt_nemotron-bash__sft_GLM-4-7-swesmith +laion/sft_GLM-4-7-swesmith-sandboxes-with_tests-oracle_verified_120s-maxeps-131k_Qwen3-32B +laion/sft_r2egym-nl2bash-stackoverflow-inferredbugs-32B_Qwen3-32B +laion/syh-r2eg-askl-glm_4-7_trac_jupi_-gfi-swes-rand-filt-10K_glm_4-7_trac_jupi_32B diff --git a/eval/lists/lists/nemotron_nano.txt b/eval/lists/lists/nemotron_nano.txt new file mode 100644 index 00000000..15acb74c --- /dev/null +++ b/eval/lists/lists/nemotron_nano.txt @@ -0,0 +1 @@ +nvidia/Llama-3.1-Nemotron-Nano-8B-v1 diff --git a/eval/lists/lists/no_eval_models_latest.txt b/eval/lists/lists/no_eval_models_latest.txt new file mode 100644 index 00000000..8d5ac20f --- /dev/null +++ b/eval/lists/lists/no_eval_models_latest.txt @@ -0,0 +1,30 @@ +rl__nemotron-bash_fp8_terminus-2_step48 +laion/rl__64GPU_base_32b__nl2bash-tasks-cleaned-oracle__syh-r2eg-askl-glm_4__40-0 +DCAgent/a1-quixbugs +laion/rl__24GPU_shaped__stackexchange-overflow__exp_tas_optimal_comb-25 +DCAgent/a1-e2egit +DCAgent/a1-nemotron_rspec +laion/allenai-sera-unified-100000-opt100k__Qwen3-8B +laion/allenai-sera-unified-31600-opt100k__Qwen3-8B +laion/coderforge-100000-opt100k__Qwen3-8B +laion/sera-10000__Qwen3-8B +laion/coderforge-3160__Qwen3-8B +laion/swesmith-100000-opt100k__Qwen3-8B +laion/coderforge-31600-opt100k__Qwen3-8B +laion/sera-1000-opt1k__Qwen3-8B +laion/r2egym-100000-opt100k__Qwen3-8B +laion/coderforge-1000-opt1k__Qwen3-8B +laion/sera-316-opt1k__Qwen3-8B +laion/r2egym-316-opt1k__Qwen3-8B +laion/r2egym-1000-opt1k__Qwen3-8B +laion/coderforge-316-opt1k__Qwen3-8B 
+laion/rl__24GPU_shaped__nemotron-math-oracle-filtered__exp_tas_optimal_comb__40-0-30 +laion/coderforge-10000__Qwen3-8B +laion/r2egym-10000__Qwen3-8B +laion/sera-3160__Qwen3-8B +DCAgent/a1-ghactions +laion/swesmith-unified-10000__Qwen3-8B +laion/r2egym-unified-3160__Qwen3-8B +laion/rl__24GPU_shaped__exp_rpt_pymethods2test-large__GLM-4_7-swesmith-san-30 +laion/100k_epochs4__Qwen3-8B +laion/exp-psu-swesmith-1K_glm_4-7_traces_jupiter__0-05__Qwen3-8B \ No newline at end of file diff --git a/eval/lists/lists/pipeline_exp_prio.txt b/eval/lists/lists/pipeline_exp_prio.txt new file mode 100644 index 00000000..75105143 --- /dev/null +++ b/eval/lists/lists/pipeline_exp_prio.txt @@ -0,0 +1,19 @@ +DCAgent/a1-taskmaster2 +DCAgent/a1-stack_bash +DCAgent/a1-repo_scaffold +DCAgent/a1-pr_mining +DCAgent/a1-nemotron_junit +DCAgent/a1-nemotron_cpp +DCAgent/a1-nemotron_bash +DCAgent/a1-manybugs +DCAgent/a1-issue_tasks +DCAgent/a1-codenet_python +DCAgent/a1-bugsinpy +DCAgent/a1-multifile_composition +DCAgent/a1-exercism_python +DCAgent/a1-crosscodeeval_typescript +DCAgent/a1-crosscodeeval_python +DCAgent/a1-crosscodeeval_java +DCAgent/a1-taco +DCAgent/a1-staqc +DCAgent/a1-stackexchange_tezos diff --git a/eval/lists/lists/priority_131k_test.txt b/eval/lists/lists/priority_131k_test.txt new file mode 100644 index 00000000..2f77176c --- /dev/null +++ b/eval/lists/lists/priority_131k_test.txt @@ -0,0 +1 @@ +laion/alfworld-swesmith-r2egym-swegym-131k-lc diff --git a/eval/lists/lists/priority_batch2.txt b/eval/lists/lists/priority_batch2.txt new file mode 100644 index 00000000..c15c9b04 --- /dev/null +++ b/eval/lists/lists/priority_batch2.txt @@ -0,0 +1,3 @@ +open-thoughts/OpenThinker3-7B +deepseek-ai/DeepSeek-R1-Distill-Qwen-7B +camel-ai/seta-rl-qwen3-8b diff --git a/eval/lists/lists/priority_batch_evalorg.txt b/eval/lists/lists/priority_batch_evalorg.txt new file mode 100644 index 00000000..f32a0dd6 --- /dev/null +++ b/eval/lists/lists/priority_batch_evalorg.txt @@ -0,0 +1,3 @@ 
+obiwan96/qwen3-8b-openthinker-sft-endless-terminals +nvidia/Nemotron-Terminal-8B +nvidia/Nemotron-Terminal-14B diff --git a/eval/lists/lists/priority_obiwan.txt b/eval/lists/lists/priority_obiwan.txt new file mode 100644 index 00000000..ab57457f --- /dev/null +++ b/eval/lists/lists/priority_obiwan.txt @@ -0,0 +1 @@ +obiwan96/qwen3-8b-openthinker-sft-endless-terminals diff --git a/eval/lists/lists/priority_qwen35.txt b/eval/lists/lists/priority_qwen35.txt new file mode 100644 index 00000000..14ef350e --- /dev/null +++ b/eval/lists/lists/priority_qwen35.txt @@ -0,0 +1 @@ +Qwen/Qwen3.5-9B diff --git a/eval/lists/lists/priority_rl_test.txt b/eval/lists/lists/priority_rl_test.txt new file mode 100644 index 00000000..4aab785f --- /dev/null +++ b/eval/lists/lists/priority_rl_test.txt @@ -0,0 +1 @@ +laion/sft__Kimi-2-5-inferredbugs-sandboxes-maxeps-32k__Qwen3-8B diff --git a/eval/lists/lists/pruned_models_names.txt b/eval/lists/lists/pruned_models_names.txt new file mode 100644 index 00000000..328d8ec8 --- /dev/null +++ b/eval/lists/lists/pruned_models_names.txt @@ -0,0 +1,364 @@ +DCAgent/All_Puzzles_5k_New_Context_GPT4o-mini_new_context +DCAgent/All_Puzzles_5k_OG_5k_New_Context_GPT4o-mini_new_context +DCAgent/All_Puzzles_5k_new_context +DCAgent/all-puzzles-sandboxes-traces-terminus-2-with-gpt-4o-mini-judgments-correct +DCAgent/bash_textbook_tasks_traces +DCAgent/code-contests-sandboxes-traces-terminus-2_new_hparams_11_10_25_Qwen3-8B +DCAgent/code_contests-GLM-4.6-traces_Qwen3-8B +DCAgent/code_contests-Qwen3-Coder-480B-traces +DCAgent/code_contests_10k_OG_10k_New_Questions_GPT5-mini_Qwen3-8B +DCAgent/code_contests_new_questions_gpt-5-mini_Qwen3-8B +DCAgent/codeforces-gptoss120b-traces +DCAgent/exp_tas_max_tokens_1024_traces +DCAgent/freelancer-askllm-filtered-sandboxes-traces-terminus-2_Qwen3-8B +DCAgent/freelancer-embedding-mean-instruction-filter_Qwen3-8B +DCAgent/freelancer-long-instruction-filter_Qwen3-8B +DCAgent/freelancer-projects-0-1k-traces 
+DCAgent/freelancer-projects-0-3k-traces +DCAgent/freelancer-projects-100k-traces_Qwen3-8B +DCAgent/freelancer-projects-10k-traces +DCAgent/freelancer-projects-1k-traces +DCAgent/freelancer-projects-3k-traces +DCAgent/freelancer-projects-gpt5_Qwen3-8B +DCAgent/freelancer-projects-gpt5mini +DCAgent/freelancer-random-instruction-filter-traces-terminus-2_Qwen3-8B +DCAgent/freelancer-short-instruction-filter_Qwen3-8B +DCAgent/freelancer-t1024s-32ep_Qwen3-8B +DCAgent/freelancer-t2048s-32ep_Qwen3-8B +DCAgent/freelancer-t256s-32ep_Qwen3-8B +DCAgent/freelancer-t512s-32ep_Qwen3-8B +DCAgent/hr1_code-contests-sandboxes-with-tests-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step_60 +DCAgent/hr1_wfc_nl2bash-bs_Q3-8B-mE32-aT-dS-120325hbr_step_20 +DCAgent/hr1_wfc_nl2bash-bs_Q3-8B-mE32-aT-dS-120325hbr_step_40 +DCAgent/hr1_wfc_nl2bash-bs_Q3-8B-mE32-aT-dS-120325hbr_step_64 +DCAgent/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112725harbor_step20 +DCAgent/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112725harbor_step40 +DCAgent/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112725harbor_step64 +DCAgent/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step20 +DCAgent/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step40 +DCAgent/neulab-codeactinstruct-sandboxes-traces-terminus-2_Qwen3-8B +DCAgent/neulab-mind2web-sandboxes-traces-terminus-2_Qwen3-8B +DCAgent/neulab-synatra-sandboxes-traces-terminus-2_Qwen3-8B +DCAgent/nl2bash-GLM-4.6-traces_Qwen3-8B +DCAgent/nl2bash-Qwen3-Coder-480B-traces_Qwen3-8B +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step25 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step50 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step73 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step20 
+DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step20 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step40 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step_73 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120225harbor_step_25 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120225harbor_step_50 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120225harbor_step_73 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120325harbor_step_25 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120325harbor_step_50 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120325harbor_step_73 +DCAgent/nl2bashG5CP-nl2bash-bs_Q3-8B-mE32-aT-dS-120325hbr_step_20 +DCAgent/nl2bashG5CP-nl2bash-bs_Q3-8B-mE32-aT-dS-120325hbr_step_34 +DCAgent/nl2bashGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step20 +DCAgent/nl2bashGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step34 +DCAgent/nl2bashGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step20 +DCAgent/r2egymG5CP-nl2bash-bs_Q3-8B-mE24-aT-dS-120325hbr_step_25 +DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step20 +DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step_55 +DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step_80 +DCAgent/staqc-ot3-100k-code-subset-traces-terminus-2_Qwen3-8B +DCAgent/staqc-ot3-100k-math-subset-traces-terminus-2_save-strategy_steps_Qwen3-8B +DCAgent/staqc-ot3-100k-science-subset-traces-terminus-2_Qwen3-8B +DCAgent/staqc-ot3-100k-traces-terminus-2_Qwen3-8B +DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-1-7B +DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-14B +DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-32B +DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-4B 
+DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-4B-Instruct-2507 +DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-4B-Thinking-2507 +DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-8B-Base +DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-Coder-30B-A3B-Instruct +DCAgent/taskmaster2-0-1k-traces +DCAgent/taskmaster2-0-3k-traces +DCAgent/taskmaster2-10k-traces +DCAgent/taskmaster2-16ep +DCAgent/taskmaster2-1ep +DCAgent/taskmaster2-1k-traces +DCAgent/taskmaster2-2ep +DCAgent/taskmaster2-32ep +DCAgent/taskmaster2-3k-traces +DCAgent/taskmaster2-4ep +DCAgent/taskmaster2-64ep +DCAgent/taskmaster2-8ep +DCAgent/taskmaster2-banana +DCAgent/taskmaster2-gpt5mini +DCAgent/taskmaster2-gpt5mini_global-batch-size_16 +DCAgent/tbench_oracle_solutions_terminus +DCAgent/test_sft_qwen3_32k_base_NO_EVAL_Qwen3-8B +DCAgent/wikitable_format_conversion-qwen3-coder-480b-a35b-instruct-awq-traces +DCAgent2/GLM-4_6-codeforces-32eps-32k-fixeps +DCAgent2/GLM-4_6-nl2bash-verified-32eps-32k-fixeps +DCAgent2/GLM-4_6-stackexchange-overflow-sandboxes-32eps-32k +DCAgent2/bugs-nl2bashseq +DCAgent2/bugs-stack-nl2bashseq +DCAgent2/bugs-swesmith-over5050 +DCAgent2/codeforces-GLM-4_6-traces-32ep-32k-1-2-4-dv +DCAgent2/freelancer-projects-100k-traces +DCAgent2/freelancer-projects-31k-traces +DCAgent2/freelancer-t256s-32ep_hub-model-id_freelancer-t256s-32ep-restore-hp_Qwen3-8B +DCAgent2/gemini25flash-stackexchange-overflow-32ep-512k-v3-traces +DCAgent2/glm-4_6-freelancer-traces +DCAgent2/glm-4_6-freelancer-traces-pm +DCAgent2/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112725harbor +DCAgent2/inferredbugs-GLM-4_6-32ep-32k +DCAgent2/inferredbugs-GLM-4_6-32ep-65k +DCAgent2/neulab-code-feedback-sandboxes-traces-terminus-2_hub-model-id_neulab-code-feedback-restore-hp_Qw +DCAgent2/neulab-mind2web-sandboxes-traces-terminus-2_hub-model-id_neulab-mind2web-restore-hp_Qwen3-8B +DCAgent2/neulab-synatra-sandboxes-traces-terminus-2_hub-model-id_neulab-synatra-restore-hp_Qwen3-8B 
+DCAgent2/nl2bash-bugs-over5050 +DCAgent2/nl2bash-bugs-undr3070 +DCAgent2/nl2bash-bugsseq +DCAgent2/nl2bash-bugsshuffle +DCAgent2/nl2bash-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor +DCAgent2/nl2bash-stack-bugs-over333 +DCAgent2/nl2bash-stack-bugs-undr203050 +DCAgent2/nl2bash-stack-bugs-undr503020 +DCAgent2/nl2bash-stack-over5050 +DCAgent2/nl2bash-stack-undr3070 +DCAgent2/nl2bash-stack-undr7030 +DCAgent2/nl2bash-stackseq +DCAgent2/nl2bash-stackshuffle +DCAgent2/nl2bash-swesmith-reason +DCAgent2/nl2bash-swesmithseq +DCAgent2/nl2bash-verified-GLM-4.6-traces-32ep-32k-ab1085_Qwen3-8B +DCAgent2/nl2bash-verified-GLM-4.6-traces-32ep-32k-ab1089_Qwen3-8B +DCAgent2/nl2bash-verified-GLM-4.6-traces-32ep-32k-ab1093_Qwen3-8B +DCAgent2/nl2bash-verified-GLM-4.6-traces-32ep-32k-ab1097_Qwen3-8B +DCAgent2/nl2bash-verified-GLM-4.6-traces-32ep-32k_global-batch-size_128_Qwen3-8B +DCAgent2/nl2bash-verified-GLM-4.6-traces-32ep-32k_global-batch-size_32_Qwen3-8B +DCAgent2/nl2bash-verified-GLM-4.6-traces-32ep-32k_global-batch-size_64_Qwen3-8B +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-5epochs +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-5epochs-lr1e-3 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-5epochs-lr1e-4 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-5epochs-lr1e-5 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-5epochs-lr1e-6 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-7epochs +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-8epochs +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-ab2095 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-ab2098 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-ab2099 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-ab20998 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-mgn5e2 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wd0 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wd002 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wd004 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wd006 
+DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wd008 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wr0 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wr0005 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wr0010 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wr0150 +DCAgent2/nl2bashGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor +DCAgent2/stack-bugs-over5050 +DCAgent2/stack-bugs-undr3070 +DCAgent2/stack-bugs-undr7030 +DCAgent2/stack-bugsseq +DCAgent2/stack-bugsshuffle +DCAgent2/stack-nl2bashseq +DCAgent2/stack-swesmithseq +DCAgent2/swesmith-nl2bashseq +DCAgent2/swesmith-stack-undr7030 +DCAgent2/taskmaster2-1ep_hub-model-id_taskmaster2-1ep-2stage-restore-hp_freelancer-projects-sandboxes-tra +DCAgent2/taskmaster2-8ep_hub-model-id_taskmaster2-8ep-2stage-restore-hp_freelancer-projects-sandboxes-tra +DCAgent2/taskmaster2-GLM-4_6-32ep-32k +DCAgent2/test2-tbench-dev-71-qwen3-8b-8nodes-sync +bespokelabs/Qwen3-8B-ot_step100 +bespokelabs/Qwen3-8B-ot_step60_high +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_learning-rate_1e-06_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_num-train-epochs_8-0_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_warmup-ratio_0-01_Qwen3-32B +laion/GLM-4.6-stackoverflow-32eps-65k-fixeps_Qwen3-8B +laion/GLM-4_6-freelancer-32eps-131k +laion/GLM-4_6-inferredbugs-32ep-65k-reasoning +laion/GLM-4_6-nl2bash-verified-32ep-32k-reasoning +laion/GLM-4_6-selfinstruct-naive-2-32ep-32k +laion/GLM-4_6-stackexchange-superuser-32ep-32k +laion/GLM-4_6-swesmith-32ep-131k-nosumm +laion/GLM-4_6-swesmith-32ep-131k-nosumm-reasoning +laion/GLM-4_7-inferredbugs-sandboxes-maxeps-131k +laion/GLM-4_7-r2egym_sandboxes-maxeps-131k +laion/GLM-4_7-swesmith-sandboxes-with_tests-oracle_verified_120s-maxeps-131k +laion/GLM-4_7-swesmith-sandboxes-with_tests-oracle_verified_120s-maxeps-131k-fixthink +laion/GPT-OSS-120B-codeforces-fixeps_Qwen3-8B 
+laion/Kimi-K2T-neulab-agenttuning-kg-sandboxes-maxeps-32k +laion/Kimi-K2T-neulab-agenttuning-mind2web-sandboxes-maxeps-32k +laion/Kimi-K2T-neulab-agenttuning-webshop-sandboxes-maxeps-32k +laion/Kimi-K2T-swesmith-32ep-131k +laion/MiniMax-M2-freelancer-32ep-32k +laion/MiniMax-M2-freelancer-32ep-32k-reasoning +laion/Qwen3-8B_exp-swd-r2egym-standard_glm_4.7_traces_locetash_save-strategy_steps +laion/Qwen3-8B_exp-swd-swesmith-wo-docker_glm_4.7_traces_locetash_save-strategy_steps +laion/Qwen3-8B_exp_tas_summarize_threshold_4096_traces_save-strategy_steps +laion/Qwen3-8B_exp_tas_temp_0.25_traces_save-strategy_steps +laion/Qwen3-8B_exp_tas_temp_0.5_traces_save-strategy_steps +laion/Qwen3-8B_exp_tas_tmux_large_traces_save-strategy_steps +laion/Qwen3-8B_exp_tas_top_k_32_traces_save-strategy_steps +laion/Qwen3-8B_exp_tas_trajectory_minimal_traces_save-strategy_steps +laion/Qwen3-8B_perturbed-docker-exp-taskmaster2-tasks_glm_4.7_traces_locetash_save-strategy_steps +laion/Qwen3-Coder-480B-codeforces-fixeps_Qwen3-8B +laion/Qwen3-Coder-480B-nl2bash-fixeps_Qwen3-8B +laion/bugs-nl2bashseq_Qwen3-8B +laion/claude-4-5-sonnet-thinking-stackexchange-overflow-32ep-32k-traces +laion/exp-gfi-swesmith-short-response-filtered-10K_glm_4_7_traces_jupiter +laion/exp-psu-stackoverflow-1K_glm_4_7_traces +laion/exp-psu-stackoverflow-316_glm_4_7_traces +laion/exp-psu-stackoverflow-31K_glm_4_7_traces +laion/exp-psu-swesmith-10K_glm_4_7_traces_jupiter +laion/exp-psu-swesmith-31K_glm_4_7_traces_jupiter +laion/exp-psu-swesmith-3K_glm_4_7_traces_jupiter +laion/exp-swd-r2egym-wo-docker_glm_4_7_traces +laion/exp-syh-tezos-askllm-hardened_glm_4_7_traces_jupiter_cleaned +laion/exp-syh-tezos-stackoverflow-mixed_glm_4_7_traces_jupiter_cleaned +laion/exp-uns-r2egym-33_6x_glm_4_7_traces_jupiter_cleaned +laion/exp-uns-r2egym-4_2x_glm_4_7_traces_jupiter +laion/exp-uns-r2egym-8_4x_glm_4_7_traces_jupiter +laion/exp-uns-tezos-128unique_glm_4_7_traces_jupiter_cleaned 
+laion/exp-uns-tezos-1unique_glm_4_7_traces_jupiter +laion/exp_tas_baseline_traces +laion/exp_tas_frequency_penalty_0_25_traces +laion/exp_tas_frequency_penalty_0_5_traces +laion/exp_tas_frequency_penalty_1_0_traces +laion/exp_tas_high_diversity_traces +laion/exp_tas_linear_history_off_traces +laion/exp_tas_low_diversity_traces +laion/exp_tas_max_tokens_2048_traces +laion/exp_tas_max_tokens_4096_traces +laion/exp_tas_min_p_0_05_traces +laion/exp_tas_parser_xml_traces +laion/exp_tas_raw_content_off_traces +laion/exp_tas_repetition_penalty_1_2_traces +laion/exp_tas_temp_0_5_traces +laion/exp_tas_top_k_128_traces +laion/exp_tas_top_k_16_traces +laion/exp_tas_top_p_0_8_traces +laion/glm-4_6-all-puzzles-32ep-131k +laion/glm-4_6-dclm-baseline-terminal-traces-32ep-131k +laion/glm-4_6-freelancer-32ep-131k-torch +laion/glm-4_6-nemo-prism +laion/glm-4_6-r2egym-32ep-32k +laion/glm-4_6-stack-overflow-32ep-131k-summtrc +laion/glm-4_6-staqc-32ep-131k +laion/glm46-Magicoder-Evol-Instruct-110K-sandboxes-1-traces +laion/glm46-defects4j-32ep-131k +laion/glm46-glaive-code-assistant-sandboxes-maxeps-131k +laion/glm46-neulab-agenttuning-alfworld-sandboxes-maxeps-131k +laion/glm46-neulab-synatra-32ep-131k +laion/glm46-qasper-maxeps-131k +laion/glm46-stackexchange-tezos-maxeps-131k +laion/glm46-swegym-tasks-maxeps-131k +laion/glm46-swesmith-maxeps-131k +laion/gpt-oss-120B-stack-overflow-32ep-131k-summtrc +laion/gpt-oss-120B-stack-overflow-32ep-131k-summtrc-fixthink1 +laion/kimi-k2-r2egym_sandboxes-maxeps-32k +laion/kimi-k2-swegym-tasks-maxeps-32k +laion/kimi-k2t-freelancer-32ep-32k +laion/kimi-k2t-neulab-synatra-32ep-131k +laion/minimax-m2-stack-overflow-32ep-131k-summtrc +laion/nl2bash-bugs-undr7030_Qwen3-8B +laion/nl2bash-bugsseq_Qwen3-8B +laion/nl2bash-verified-GLM-4.6-traces-32ep-32k-mgn1e3_Qwen3-8B +laion/nl2bash-verified-GLM-4.6-traces-32ep-32k-mgn1e5_Qwen3-8B +laion/nl2bash-verified-GLM-4.6-traces-32ep-32k-mgn5e4_Qwen3-8B +laion/open-thoughts-4-code-qwen3-32b-annotated 
+laion/qwen3-coder-480B-stack-overflow-32ep-131k-summtrc +laion/qwen3base-GLM-4_7-swesmith-sandboxes-with_tests-oracle_verified_120s-maxeps-131k +laion/r2egym-bugsseq +laion/r2egym-gpt5-codex-160ep-1M +laion/r2egym-nl2bash-bugsseq +laion/r2egym-nl2bash-stack-bugsseq-bash-withtests +laion/r2egym-nl2bash-stack-bugsseq-cpp +laion/r2egym-nl2bash-stack-bugsseq-fixthink +laion/r2egym-nl2bash-stack-bugsseq-fixthink-exercism-python +laion/r2egym-nl2bash-stack-bugsseq-fixthink-methods2test-v2 +laion/r2egym-nl2bash-stack-bugsseq-fixthink-stack-csharp +laion/r2egym-nl2bash-stack-bugsseq-junit +laion/r2egym-nl2bash-stack-bugsseq-pytest-v2 +laion/r2egym-nl2bash-stack-bugsseq-rl-stack-bash-withtests +laion/r2egym-nl2bash-stack-bugsseq-stack-php-v2 +laion/r2egym-nl2bash-stack-bugsseq_lr3e-5_exp_rpt_stack-php-v2_step20 +laion/r2egym-nl2bash-stackseq +laion/r2egym-stack-bugsseq +laion/rl_bs128-gs16-rloo-n-code-contests-900s-noreg-15 +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_postmortem +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink_lr1e-5_taco +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink_lr3e-5_bigcodebench-v2 +laion/rl_r2egym-nl2bash-stack-bugsseq_lr3e-5_stack-php-v2 +laion/rl_rl-conf_24GP_base-yaml_mode-path_exp_tas_opti_comb_trac_trai-data_exp_rpt_unit-pyth-v3 +laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp_rpt_pyme-larg-90 +laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp_rpt_stac-dock-v2 +laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp_rpt_stac-self-larg-70 +laion/rl_rl-conf_24GP_base_noth-yaml_mode-path_r2eg-nl2b-stac-bugs_trai-data_exp_rpt_e2eg-larg-60 +laion/rl_rl-conf_24GP_base_noth-yaml_mode-path_r2eg-nl2b-stac-bugs_trai-data_exp_rpt_pyme-larg-60 +laion/rl_rl-conf_qwen_8b_ll_lr1e-5_bs64-yaml_mode-path_r2eg-nl2b-stac-bugs_trai-data_exp_rpt_cros-type +laion/rl_think_npfg-code-contests-900s-45 +laion/rl_tp4s64_8x_2skill +laion/rl_tp4s64_8x_exercism-python 
+laion/rl_tp4s64_8x_flat25_baseline +laion/rl_tp4s64_8x_github_issue +laion/rl_tp4s64_8x_heavy_padding +laion/rl_tp4s64_8x_minimal_instructions +laion/rl_tp4s64_8x_nemotron-cpp +laion/rl_tp4s64_8x_nemotron-junit +laion/rl_tp4s64_8x_proportional +laion/rl_tp4s64_8x_structural_debug +laion/rl_v1_tp4s64_8x_nemotron-junit +laion/stackexchange-tezos-sandboxes_glm_4_6_traces_locetash +laion/stackexchange-tezos-sandboxes_glm_4_6_traces_locetash_again +laion/stackexchange-tezos-sandboxes_glm_4_6_traces_together +laion/stackexchange-tezos-sandboxes_glm_4_6_traces_together_again +mlfoundations-dev/all-puzzles-sandboxes-traces-terminus-2 +mlfoundations-dev/code-contests-sandboxes-traces-terminus-2 +mlfoundations-dev/code-contests-sandboxes-traces-terminus-2_global-batch-size_16 +mlfoundations-dev/code-contests-sandboxes-traces-terminus-2_seed_43 +mlfoundations-dev/code-contests-sandboxes-traces-terminus-2_seed_44 +mlfoundations-dev/code-contests-sandboxes-traces-terminus-2_seed_45 +mlfoundations-dev/code-contests-sandboxes-traces-terminus-2_seed_46 +mlfoundations-dev/codeforces-sandboxes-traces-terminus-2 +mlfoundations-dev/defects4j-sandboxes-traces-terminus-2 +mlfoundations-dev/freelancer-projects-sandboxes-traces-terminus-2 +mlfoundations-dev/inferredbugs-sandboxes-traces-terminus-2 +mlfoundations-dev/nemo-prism-math-sandboxes-traces-terminus-2 +mlfoundations-dev/qasper-sandboxes-traces-terminus-2 +mlfoundations-dev/stackexchange-overflow-sandboxes-traces-terminus-2 +mlfoundations-dev/stackexchange-tezos-sandboxes-traces-terminus-2 +mlfoundations-dev/staqc-sandboxes-traces-terminus-2 +mlfoundations-dev/swesmith_with_plain_docker-sandboxes-traces-terminus-2 +mlfoundations-dev/taskmaster2-sandboxes-traces-terminus-2 +open-thoughts/OpenThinker-Agent-v1-SFT +penfever/GLM-4_6-codeforces-32ep-32k-restore-hp +penfever/GLM-4_6-gemini25flash-stackexchange-overflow-32ep-512k-fixeps +penfever/GLM-4_6-inferredbugs-32eps-65k-fixeps +penfever/freelancer-t1024s-32ep-restore-hp 
+penfever/freelancer-t2048s-32ep-restore-hp +penfever/freelancer-t512s-32ep-restore-hp +penfever/kimi-k2-swesmith_with_plain_docker-sandboxes-maxeps-32k +penfever/neulab-codeactinstruct-restore-hp +penfever/nl2bash-0-1k-traces-restore-hp +penfever/nl2bash-0-3k-traces-restore-hp +penfever/nl2bash-16ep-restore-hp +penfever/nl2bash-1ep-restore-hp +penfever/nl2bash-1k-traces-restore-hp +penfever/nl2bash-2ep-restore-hp +penfever/nl2bash-32ep-restore-hp +penfever/nl2bash-3k-traces-restore-hp +penfever/nl2bash-4ep-restore-hp +penfever/nl2bash-8ep-restore-hp +penfever/nl2bash-GLM-4_6-traces-newhparams +penfever/nl2bash-verified-GLM-4_6-traces-32ep-32k-dft +penfever/nl2bash-verified-GLM-4_6-traces-32ep-32k-restore-hp +penfever/nl2bash_gpt-5-nano-traces-8ep-restore-hp +penfever/nl2bash_verified_gpt-5-nano-traces-restore-hp +penfever/rl_bs128_gs16_ruby-30 +penfever/rl_rl-conf_20GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp_rpt_code-v2-25 +penfever/rl_rl-conf_20GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp_rpt_stac-pyte-v2-25 +penfever/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp_rpt_soft-v2-45 +penfever/selfinstruct-naive-sandboxes-2-traces-restore-hp +penfever/swesmith-2stage-restore-hp +penfever/taskmaster2-4ep-2stage-restore-hp diff --git a/eval/lists/lists/pyme_v3_40.txt b/eval/lists/lists/pyme_v3_40.txt new file mode 100644 index 00000000..b67069dc --- /dev/null +++ b/eval/lists/lists/pyme_v3_40.txt @@ -0,0 +1 @@ +laion/rl_rl-conf_24GP_base-yaml_mode-path_exp_tas_opti_comb_trac_trai-data_exp_rpt_pyme-v3-40 diff --git a/eval/lists/lists/qwen35_27b.txt b/eval/lists/lists/qwen35_27b.txt new file mode 100644 index 00000000..1e3117a0 --- /dev/null +++ b/eval/lists/lists/qwen35_27b.txt @@ -0,0 +1 @@ +Qwen/Qwen3.5-27B diff --git a/eval/lists/lists/qwen35_9b.txt b/eval/lists/lists/qwen35_9b.txt new file mode 100644 index 00000000..14ef350e --- /dev/null +++ b/eval/lists/lists/qwen35_9b.txt @@ -0,0 +1 @@ 
+Qwen/Qwen3.5-9B diff --git a/eval/lists/lists/richard_base_model.txt b/eval/lists/lists/richard_base_model.txt new file mode 100644 index 00000000..497c9a05 --- /dev/null +++ b/eval/lists/lists/richard_base_model.txt @@ -0,0 +1 @@ +Qwen/Qwen3.5-9B \ No newline at end of file diff --git a/eval/lists/lists/richard_test_model.txt b/eval/lists/lists/richard_test_model.txt new file mode 100644 index 00000000..bd8a4f31 --- /dev/null +++ b/eval/lists/lists/richard_test_model.txt @@ -0,0 +1,2 @@ +# laion/exp-psu-swesmith-3K_glm_4_7_traces_jupiter +laion/exp-psu-swesmith-31K_glm_4_7_traces_jupiter \ No newline at end of file diff --git a/eval/lists/lists/rope_step_batch.txt b/eval/lists/lists/rope_step_batch.txt new file mode 100644 index 00000000..2c54a448 --- /dev/null +++ b/eval/lists/lists/rope_step_batch.txt @@ -0,0 +1,3 @@ +laion/swesmith_8b_rope_65k-step37 +laion/r2egym_8b_rope_65k-step17 +laion/swesmith_8b-step35 diff --git a/eval/lists/lists/sera_14b.txt b/eval/lists/lists/sera_14b.txt new file mode 100644 index 00000000..8835118a --- /dev/null +++ b/eval/lists/lists/sera_14b.txt @@ -0,0 +1 @@ +allenai/SERA-14B diff --git a/eval/lists/lists/swesmith_fixthink_45.txt b/eval/lists/lists/swesmith_fixthink_45.txt new file mode 100644 index 00000000..c4287878 --- /dev/null +++ b/eval/lists/lists/swesmith_fixthink_45.txt @@ -0,0 +1 @@ +laion/rl_swesmith-fixthink-pymethods2test-45 diff --git a/eval/lists/lists/syh_32b.txt b/eval/lists/lists/syh_32b.txt new file mode 100644 index 00000000..492f1c93 --- /dev/null +++ b/eval/lists/lists/syh_32b.txt @@ -0,0 +1 @@ +laion/syh-r2eg-askl-glm_4-7_trac_jupi_-gfi-swes-rand-filt-10K_glm_4-7_trac_jupi_32B diff --git a/eval/lists/lists/tb2_richard_test_model.txt b/eval/lists/lists/tb2_richard_test_model.txt new file mode 100644 index 00000000..8eddee63 --- /dev/null +++ b/eval/lists/lists/tb2_richard_test_model.txt @@ -0,0 +1 @@ +laion/exp-psu-swesmith-10K_glm_4_7_traces_jupiter \ No newline at end of file diff --git 
a/eval/lists/lists/v2_richard_test_model.txt b/eval/lists/lists/v2_richard_test_model.txt new file mode 100644 index 00000000..83e4b4f6 --- /dev/null +++ b/eval/lists/lists/v2_richard_test_model.txt @@ -0,0 +1 @@ +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_pr \ No newline at end of file diff --git a/eval/lists/missing_dev_set_v2.txt b/eval/lists/missing_dev_set_v2.txt new file mode 100644 index 00000000..7d0b84ba --- /dev/null +++ b/eval/lists/missing_dev_set_v2.txt @@ -0,0 +1,189 @@ +laion/r2egym-nl2bash-stack-bugsseq-fixthink-again +laion/r2egym-nl2bash-stack-bugsseq-fixthink +open-thoughts/OpenThinker-Agent-v1 +DCAgent/All_Puzzles_5k_New_Context_GPT4o-mini_new_context +DCAgent/All_Puzzles_5k_new_context +DCAgent/all-puzzles-sandboxes-traces-terminus-2-with-gpt-4o-mini-judgments-correct +DCAgent/code-contests-sandboxes-traces-terminus-2_new_hparams_11_10_25_Qwen3-8B +DCAgent/code_contests-GLM-4.6-traces_Qwen3-8B +DCAgent/code_contests-Qwen3-Coder-480B-traces +DCAgent/code_contests_10k_OG_10k_New_Questions_GPT5-mini_Qwen3-8B +DCAgent/code_contests_new_questions_gpt-5-mini_Qwen3-8B +DCAgent/exp_tas_max_episodes_512_traces +DCAgent/exp_tas_max_tokens_1024_traces +DCAgent/exp_tas_max_tokens_8192_traces +DCAgent/exp_tas_presence_penalty_1_0_traces +DCAgent/exp_tas_repetition_penalty_1_05_traces +DCAgent/freelancer-embedding-mean-instruction-filter_Qwen3-8B +DCAgent/freelancer-long-instruction-filter_Qwen3-8B +DCAgent/freelancer-projects-0-1k-traces +DCAgent/freelancer-projects-0-3k-traces +DCAgent/freelancer-projects-10k-traces +DCAgent/freelancer-projects-1k-traces +DCAgent/freelancer-projects-gpt5_Qwen3-8B +DCAgent/freelancer-random-instruction-filter-traces-terminus-2_Qwen3-8B +DCAgent/freelancer-short-instruction-filter_Qwen3-8B +DCAgent/neulab-codeactinstruct-sandboxes-traces-terminus-2_Qwen3-8B +DCAgent/nl2bash-GLM-4.6-traces_Qwen3-8B +DCAgent/staqc-ot3-100k-science-subset-traces-terminus-2_Qwen3-8B 
+DCAgent/staqc-ot3-100k-traces-terminus-2_Qwen3-8B +DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-1-7B +DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-32B +DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-8B-Base +DCAgent/taskmaster2-0-1k-traces +DCAgent/taskmaster2-0-3k-traces +DCAgent/taskmaster2-1k-traces +DCAgent/taskmaster2-banana +DCAgent2/bugs-swesmith-reason +DCAgent2/freelancer-t256s-32ep_hub-model-id_freelancer-t256s-32ep-restore-hp_Qwen3-8B +DCAgent2/glm-4_6-freelancer-traces +DCAgent2/inferredbugs-GLM-4_6-32ep-65k +DCAgent2/nl2bash-bugs-over5050 +DCAgent2/nl2bash-bugsshuffle +DCAgent2/nl2bash-stack-bugs-undr203050 +DCAgent2/nl2bash-stack-undr3070 +DCAgent2/nl2bash-stack-undr7030 +DCAgent2/nl2bash-stackshuffle +DCAgent2/nl2bash-verified-GLM-4.6-traces-32ep-32k-ab1093_Qwen3-8B +DCAgent2/nl2bash-verified-GLM-4.6-traces-32ep-32k_global-batch-size_32_Qwen3-8B +DCAgent2/nl2bash-verified-GLM-4.6-traces-32ep-32k_global-batch-size_64_Qwen3-8B +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-5epochs +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-5epochs-lr1e-3 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-5epochs-lr1e-5 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-ab2098 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-ab2099 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-mgn5e2 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wd002 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wd004 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wd008 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wr0 +DCAgent2/stack-bugs-undr7030 +DCAgent2/swesmith-stack-undr7030 +Qwen/Qwen2.5-Coder-32B-Instruct +Qwen/Qwen3-4B-Thinking-2507 +Qwen/Qwen3-8B-Base +SWE-bench/SWE-agent-LM-32B +mlfoundations-dev/stackexchange-codereview-sandboxes-traces-terminus-2_overwrite-output-dir_True +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_num-train-epochs_8-0_Qwen3-32B 
+laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_warmup-ratio_0-01_Qwen3-32B +laion/GLM-4.6-stackoverflow-32eps-65k-fixeps_Qwen3-8B +laion/GLM-4_6-stackexchange-superuser-32ep-32k +laion/Kimi-2-5-r2egym_sandboxes-maxeps-32k__Qwen3-8B +laion/Qwen3-32B-NL2Bash-31step +laion/Qwen3-32B-R2EGYM-256-3epochs +laion/Qwen3-32B-SweSmith-20step +laion/Qwen3-8B_exp-swd-r2egym-standard_glm_4.7_traces_locetash_save-strategy_steps +laion/Qwen3-8B_exp_tas_temp_2.0_traces_save-strategy_steps +laion/Qwen3-8B_exp_tas_top_k_32_traces_save-strategy_steps +laion/dev_set_part1_10k_glm_4_7_traces_jupiter_cleaned +laion/exp-gfi-staqc-askllm-filtered-10K_glm_4_7_traces_jupiter_cleaned +laion/exp-gfi-swesmith-random-filtered-10K_glm_4_7_traces_jupiter_cleaned +laion/exp-psu-swesmith-10K_glm_4_7_traces_jupiter +laion/exp-psu-swesmith-1K_glm_4-7_traces_jupiter__Qwen3-8B +laion/exp-psu-swesmith-316_glm_4-7_traces_jupiter__Qwen3-8B +laion/exp-psu-swesmith-31K_glm_4_7_traces_jupiter +laion/exp-psu-swesmith-3K_glm_4_7_traces_jupiter +laion/exp-syh-r2egym-askllm-constrained_glm_4_7_traces_jupiter_cleaned +laion/exp-syh-r2egym-swesmith-mixed_glm_4_7_traces_jupiter +laion/exp-syh-tezos-askllm-hardened_glm_4_7_traces_jupiter_cleaned +laion/exp-syh-tezos-stackoverflow-mixed_glm_4_7_traces_jupiter_cleaned +laion/exp-uns-r2egym-16_8x_glm_4_7_traces_jupiter_cleaned +laion/exp-uns-r2egym-2_1x_glm_4_7_traces_jupiter_cleaned +laion/exp-uns-r2egym-33_6x_glm_4_7_traces_jupiter_cleaned +laion/exp-uns-tezos-10x_glm_4_7_traces_jupiter_cleaned +laion/exp-uns-tezos-128unique_glm_4_7_traces_jupiter_cleaned +laion/exp-uns-tezos-160x_glm_4_7_traces_jupiter_cleaned +laion/exp-uns-tezos-80x_glm_4_7_traces_jupiter_cleaned +laion/exp_rpt_stack-bash-withtests_glm_4_7_traces_jupiter +laion/exp_rpt_stack-csharp_10k_glm_4-7_traces_jupiter__Qwen3-8B +laion/exp_rpt_stack-rust_10k_glm_4_7_traces_jupiter +laion/exp_tas_baseline_traces +laion/exp_tas_frequency_penalty_0_25_traces 
+laion/exp_tas_frequency_penalty_0_5_traces +laion/exp_tas_interleaved_thinking_on_traces +laion/exp_tas_max_tokens_4096_traces +laion/exp_tas_min_p_0_05_traces +laion/exp_tas_raw_content_off_traces +laion/exp_tas_temp_0_5_traces +laion/exp_tas_top_p_0_8_traces +laion/glm46-Magicoder-Evol-Instruct-110K-sandboxes-1-traces +laion/glm46-glaive-code-assistant-sandboxes-maxeps-131k +laion/kimi-k2t-neulab-synatra-32ep-131k +laion/nl2bash-bugs-undr7030_Qwen3-8B +laion/nl2bash-bugsseq_Qwen3-8B +laion/nl2bash-verified-GLM-4.6-traces-32ep-32k-mgn1e5_Qwen3-8B +laion/nl2bash-verified-GLM-4.6-traces-32ep-32k-mgn5e4_Qwen3-8B +laion/r2egym-nl2bash-stack-bugsseq-fixthink-exercism-python +laion/r2egym-nl2bash-stack-bugsseq-fixthink-stack-pytest-large +laion/rl__24GPU_base__exp_rpt_codeelo-v2__Qwen3-8B +laion/rl__24GPU_base__exp_rpt_curriculum-hard__r2egym-nl2bash-stack +laion/rl__24GPU_base__exp_rpt_issue__Qwen3-8B-60 +laion/rl__24GPU_base__exp_rpt_multifile__Qwen3-8B-60 +laion/rl__24GPU_base__exp_rpt_scaffold__Qwen3-8B-60 +laion/rl__24GPU_base__exp_rpt_stack-bash__Qwen3-8B-55 +laion/rl__24GPU_base__mix_h2_language_balanced__r2egym-nl2bash-stack +laion/rl__24GPU_base__swe_rebench_patched_oracle__r2egym-nl2bash-stack +laion/rl__40GPU_base_32b__exp_rpt_codeelo-v2__sft_GLM-4-7-swesmith +laion/rl__40GPU_base_32b__exp_rpt_nemotron-bash__sft_GLM-4-7-swesmith +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_curator-hard +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_curriculum-medium +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_postmortem +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_pr +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_pr-postmortem +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink_lr3e-5_bigcodebench-v2 +laion/rl_r2egym-nl2bash-stack-bugsseq_lr3e-5_stack-php-v2 +laion/rl_rl-conf_24GP_base-yaml_mode-path_exp_tas_opti_comb_trac_trai-data_exp_rpt_defe-v3 
+laion/rl_rl-conf_24GP_base-yaml_mode-path_exp_tas_opti_comb_trac_trai-data_exp_rpt_unit-pyth-v3 +laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt-agai_trai-data_exp_rpt_nemo-bash +laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt-agai_trai-data_exp_rpt_soft-larg +laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt-agai_trai-data_exp_rpt_stac-pyte-synt- +laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt-agai_trai-data_exp_rpt_stac-self-gpt5 +laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp_rpt_stac-dock-v2 +laion/rl_rl-conf_qwen_8b_ll_lr1e-5_bs64-yaml_mode-path_r2eg-nl2b-stac-bugs_trai-data_exp_rpt_stac-bash-wi +laion/rl_rl-conf_qwen_8b_ll_lr1e-5_bs64_8x_tp4_seqs-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp +laion/rl_rl-config_24GPU_base-yaml_model-path_Qwen3-8B_train-data_exp_rpt_codeelo-v2 +laion/rl_rl-config_24GPU_base-yaml_model-path_Qwen3-8B_train-data_exp_rpt_nemotron-bash-60 +laion/rl_rl-config_24GPU_base-yaml_model-path_Qwen3-8B_train-data_exp_rpt_pymethods2test-large-50 +laion/rl_v1_tp4s64_8x_exercism-python +laion/rl_v1_tp4s64_8x_nemotron-junit +laion/rl_v1_tp4s64_8x_stack-jest-large +laion/rl_v1_tp4s64_8x_stack-pytest-large +laion/rl_v1_tp4s64_8x_structural_debug +laion/rl_v3_tp4s64_8x_exercism-python +laion/sft__Kimi-2-5-inferredbugs-sandboxes-maxeps-32k__Qwen3-8B +laion/sft_r2egym-nl2bash-stackoverflow-inferredbugs-32B_Qwen3-32B +laion/syh-r2eg-askl-glm_4-7_trac_jupi_-gfi-swes-rand-filt-10K_glm_4-7_trac_jupi_32B +mlfoundations-dev/all-puzzles-sandboxes-traces-terminus-2 +mlfoundations-dev/code-contests-sandboxes-traces-terminus-2_global-batch-size_16 +mlfoundations-dev/code-contests-sandboxes-traces-terminus-2_seed_44 +mlfoundations-dev/code-contests-sandboxes-traces-terminus-2_seed_45 +mlfoundations-dev/codeforces-sandboxes-traces-terminus-2 +mlfoundations-dev/defects4j-sandboxes-traces-terminus-2 
+mlfoundations-dev/freelancer-projects-sandboxes-traces-terminus-2 +mlfoundations-dev/nemo-prism-math-sandboxes-traces-terminus-2 +mlfoundations-dev/qasper-sandboxes-traces-terminus-2 +mlfoundations-dev/stackexchange-overflow-sandboxes-traces-terminus-2 +mlfoundations-dev/stackexchange-tezos-sandboxes-traces-terminus-2 +mlfoundations-dev/staqc-sandboxes-traces-terminus-2 +mlfoundations-dev/swesmith_with_plain_docker-sandboxes-traces-terminus-2 +moonshotai/Kimi-Dev-72B +penfever/GLM-4_6-codeforces-32ep-32k-restore-hp +penfever/bs64_rloo_n_noct_stri_micr_model_noconv_r2eg_nl2_140 +penfever/nl2bash-0-1k-traces-restore-hp +penfever/nl2bash-0-3k-traces-restore-hp +penfever/nl2bash-1k-traces-restore-hp +penfever/nl2bash-2ep-restore-hp +penfever/nl2bash-32ep-restore-hp +penfever/nl2bash-3k-traces-restore-hp +penfever/nl2bash-4ep-restore-hp +penfever/nl2bash-8ep-restore-hp +penfever/nl2bash-verified-GLM-4_6-traces-32ep-32k-dft +penfever/rl__24GPU_base__exp_rpt_curriculum-hard__r2egym-nl2bash-stack-15 +penfever/rl__24GPU_base__exp_rpt_pymethods2test-large__Qwen3-8B-Base-65 +penfever/rl__24GPU_base__exp_rpt_pymethods2test-large__qwen3base-GLM-4_7-sw-70 +penfever/swesmith-2stage-restore-hp +laion/r2egym-nl2bash-stack-bugsseq-bash-withtests +laion/r2egym-nl2bash-stack-bugsseq-cpp +laion/r2egym-nl2bash-stack-bugsseq-junit +laion/r2egym-nl2bash-stack-bugsseq-pytest-v2 +laion/rl__24GPU_base__swe_rebench_patched_oracle__r2egym-nl2bash-stack diff --git a/eval/lists/missing_dev_set_v2_inactive_laion.txt b/eval/lists/missing_dev_set_v2_inactive_laion.txt new file mode 100644 index 00000000..730ec8cc --- /dev/null +++ b/eval/lists/missing_dev_set_v2_inactive_laion.txt @@ -0,0 +1,24 @@ +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_num-train-epochs_8-0_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_warmup-ratio_0-01_Qwen3-32B +laion/GLM-4.6-stackoverflow-32eps-65k-fixeps_Qwen3-8B +laion/GLM-4_6-stackexchange-superuser-32ep-32k 
+laion/Qwen3-8B_exp-swd-r2egym-standard_glm_4.7_traces_locetash_save-strategy_steps +laion/Qwen3-8B_exp_tas_top_k_32_traces_save-strategy_steps +laion/exp_tas_baseline_traces +laion/exp_tas_frequency_penalty_0_25_traces +laion/exp_tas_frequency_penalty_0_5_traces +laion/exp_tas_max_tokens_4096_traces +laion/exp_tas_min_p_0_05_traces +laion/exp_tas_raw_content_off_traces +laion/exp_tas_temp_0_5_traces +laion/exp_tas_top_p_0_8_traces +laion/glm46-Magicoder-Evol-Instruct-110K-sandboxes-1-traces +laion/nl2bash-bugs-undr7030_Qwen3-8B +laion/nl2bash-bugsseq_Qwen3-8B +laion/nl2bash-verified-GLM-4.6-traces-32ep-32k-mgn1e5_Qwen3-8B +laion/nl2bash-verified-GLM-4.6-traces-32ep-32k-mgn5e4_Qwen3-8B +laion/r2egym-nl2bash-stack-bugsseq-fixthink-exercism-python +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink_lr3e-5_bigcodebench-v2 +laion/rl_r2egym-nl2bash-stack-bugsseq_lr3e-5_stack-php-v2 +laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp_rpt_stac-dock-v2 +laion/sft__Kimi-2-5-inferredbugs-sandboxes-maxeps-32k__Qwen3-8B diff --git a/eval/lists/missing_swebench_verified_random_100_folders.txt b/eval/lists/missing_swebench_verified_random_100_folders.txt new file mode 100644 index 00000000..a07b9104 --- /dev/null +++ b/eval/lists/missing_swebench_verified_random_100_folders.txt @@ -0,0 +1,69 @@ +DCAgent/All_Puzzles_5k_new_context +DCAgent/exp_rpt_crosscodeeval-csharp_20260219 +DCAgent/freelancer-projects-100k-traces_Qwen3-8B +DCAgent/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112725harbor_step64 +DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-14B +DCAgent2/glm-4_6-freelancer-traces-pm +laion/GLM-4_6-freelancer-32eps-131k +laion/GLM-4_6-stackexchange-superuser-32ep-32k +laion/GLM-4_6-swesmith-32ep-131k-nosumm-reasoning +laion/Kimi-2-5-r2egym_sandboxes-maxeps-32k__Qwen3-8B +laion/Qwen3-32B-NL2Bash-31step +laion/Qwen3-32B-R2EGYM-256-3epochs +laion/Qwen3-32B-SweSmith-20step +laion/exp-psu-swesmith-10K_glm_4_7_traces_jupiter 
+laion/exp-psu-swesmith-1K_glm_4-7_traces_jupiter__Qwen3-8B +laion/exp-psu-swesmith-316_glm_4-7_traces_jupiter__Qwen3-8B +laion/exp-psu-swesmith-31K_glm_4_7_traces_jupiter +laion/exp-psu-swesmith-3K_glm_4_7_traces_jupiter +laion/exp_rpt_stack-bash-withtests_glm_4_7_traces_jupiter +laion/exp_rpt_stack-csharp_10k_glm_4-7_traces_jupiter__Qwen3-8B +laion/exp_rpt_stack-rust_10k_glm_4_7_traces_jupiter +laion/r2egym-nl2bash-stack-bugsseq-rl-stack-bash-withtests +laion/rl__24GPU_base__exp_rpt_codeelo-v2__Qwen3-8B +laion/rl__24GPU_base__exp_rpt_curriculum-hard__r2egym-nl2bash-stack +laion/rl__24GPU_base__exp_rpt_issue__Qwen3-8B-60 +laion/rl__24GPU_base__exp_rpt_multifile__Qwen3-8B-60 +laion/rl__24GPU_base__exp_rpt_scaffold__Qwen3-8B-60 +laion/rl__24GPU_base__exp_rpt_stack-bash__Qwen3-8B-55 +laion/rl__24GPU_base__mix_h2_language_balanced__r2egym-nl2bash-stack +laion/rl__24GPU_base__swe_rebench_patched_oracle__r2egym-nl2bash-stack +laion/rl__40GPU_base_32b__exp_rpt_codeelo-v2__sft_GLM-4-7-swesmith +laion/rl__40GPU_base_32b__exp_rpt_nemotron-bash__sft_GLM-4-7-swesmith +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_curator-hard +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_curriculum-medium +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_postmortem +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_pr +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_pr-postmortem +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink_lr3e-5_bigcodebench-v2 +laion/rl_r2egym-nl2bash-stack-bugsseq_lr3e-5_stack-php-v2 +laion/rl_rl-conf_24GP_base-yaml_mode-path_exp_tas_opti_comb_trac_trai-data_exp_rpt_defe-v3 +laion/rl_rl-conf_24GP_base-yaml_mode-path_exp_tas_opti_comb_trac_trai-data_exp_rpt_unit-pyth-v3 +laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt-agai_trai-data_exp_rpt_nemo-bash +laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt-agai_trai-data_exp_rpt_soft-larg 
+laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt-agai_trai-data_exp_rpt_stac-pyte-synt- +laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt-agai_trai-data_exp_rpt_stac-self-gpt5 +laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp_rpt_stac-dock-v2 +laion/rl_rl-conf_qwen_8b_ll_lr1e-5_bs64-yaml_mode-path_r2eg-nl2b-stac-bugs_trai-data_exp_rpt_stac-bash-wi +laion/rl_rl-conf_qwen_8b_ll_lr1e-5_bs64_8x_tp4_seqs-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp +laion/rl_rl-config_24GPU_base-yaml_model-path_Qwen3-8B_train-data_exp_rpt_codeelo-v2 +laion/rl_rl-config_24GPU_base-yaml_model-path_Qwen3-8B_train-data_exp_rpt_nemotron-bash-60 +laion/rl_rl-config_24GPU_base-yaml_model-path_Qwen3-8B_train-data_exp_rpt_pymethods2test-large-50 +laion/rl_v1_tp4s64_8x_exercism-python +laion/rl_v1_tp4s64_8x_stack-jest-large +laion/rl_v1_tp4s64_8x_stack-pytest-large +laion/rl_v1_tp4s64_8x_structural_debug +laion/rl_v3_tp4s64_8x_exercism-python +laion/sft__Kimi-2-5-inferredbugs-sandboxes-maxeps-32k__Qwen3-8B +laion/sft_r2egym-nl2bash-stackoverflow-inferredbugs-32B_Qwen3-32B +laion/syh-r2eg-askl-glm_4-7_trac_jupi_-gfi-swes-rand-filt-10K_glm_4-7_trac_jupi_32B +penfever/nl2bash-2ep-restore-hp +penfever/nl2bash-3k-traces-restore-hp +penfever/rl__24GPU_base__exp_rpt_curriculum-hard__r2egym-nl2bash-stack-15 +penfever/rl__24GPU_base__exp_rpt_pymethods2test-large__Qwen3-8B-Base-65 +penfever/rl__24GPU_base__exp_rpt_pymethods2test-large__qwen3base-GLM-4_7-sw-70 +r2egym-nl2bash-stack-bugsseq-bash-withtests +r2egym-nl2bash-stack-bugsseq-cpp +r2egym-nl2bash-stack-bugsseq-junit +r2egym-nl2bash-stack-bugsseq-pytest-v2 +rl__24GPU_base__swe_rebench_patched_oracle__r2egym-nl2bash-stack diff --git a/eval/lists/missing_terminal_bench_2.txt b/eval/lists/missing_terminal_bench_2.txt new file mode 100644 index 00000000..64dd4971 --- /dev/null +++ b/eval/lists/missing_terminal_bench_2.txt @@ -0,0 +1,96 @@ + 
+DCAgent/exp_rpt_crosscodeeval-csharp_20260219 +DCAgent/hr1_wfc_nl2bash-bs_Q3-8B-mE32-aT-dS-120325hbr_step_64 +DCAgent/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112725harbor_step64 +DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step30 +DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step45 +DCAgent/taskmaster2-banana +DCAgent/tbench-dev-71-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step70 +DCAgent2/nl2bash-stack-bugs-undr203050 +DCAgent2/nl2bash-swesmithseq +DCAgent2/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor +Qwen/Qwen2.5-Coder-32B-Instruct +hosted_vllm/mlfoundations-dev/stackexchange-codereview-sandboxes-traces-terminus-2_overwrite-output-dir_True +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_adam-beta1_0-91_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_adam-beta1_0-93_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_adam-beta1_0-95_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_lr_1e-5_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_num-train-epochs_8-0_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_warmup-ratio_0-01_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_warmup-ratio_0-05_Qwen3-32B +laion/Kimi-2-5-r2egym_sandboxes-maxeps-32k__Qwen3-8B +laion/Qwen3-32B-NL2Bash-31step +laion/Qwen3-32B-R2EGYM-256-3epochs +laion/Qwen3-32B-SweSmith-20step +laion/Qwen3-8B_exp-swd-r2egym-standard_glm_4.7_traces_locetash_save-strategy_steps +laion/Qwen3-8B_exp-swd-swesmith-wo-docker_glm_4.7_traces_locetash_save-strategy_steps +laion/Qwen3-8B_exp_tas_temp_0.25_traces_save-strategy_steps +laion/Qwen3-8B_exp_tas_temp_0.5_traces_save-strategy_steps +laion/Qwen3-8B_exp_tas_temp_2.0_traces_save-strategy_steps 
+laion/Qwen3-8B_exp_tas_tmux_large_traces_save-strategy_steps +laion/Qwen3-8B_exp_tas_top_k_32_traces_save-strategy_steps +laion/Qwen3-8B_perturbed-docker-exp-taskmaster2-tasks_glm_4.7_traces_locetash_save-strategy_steps +laion/claude-4-5-sonnet-thinking-stackexchange-overflow-32ep-32k-traces +laion/exp-psu-swesmith-10K_glm_4_7_traces_jupiter +laion/exp-psu-swesmith-1K_glm_4-7_traces_jupiter__Qwen3-8B +laion/exp-psu-swesmith-316_glm_4-7_traces_jupiter__Qwen3-8B +laion/exp-psu-swesmith-31K_glm_4_7_traces_jupiter +laion/exp-psu-swesmith-3K_glm_4_7_traces_jupiter +laion/exp_rpt_stack-bash-withtests_glm_4_7_traces_jupiter +laion/exp_rpt_stack-csharp_10k_glm_4-7_traces_jupiter__Qwen3-8B +laion/exp_rpt_stack-rust_10k_glm_4_7_traces_jupiter +laion/glm46-swegym-tasks-maxeps-131k +laion/glm46-swesmith-maxeps-131k-lc +laion/r2egym-nl2bash-stack-bugsseq-rl-stack-bash-withtests +laion/rl__24GPU_base__exp_rpt_codeelo-v2__Qwen3-8B +laion/rl__24GPU_base__exp_rpt_curriculum-hard__r2egym-nl2bash-stack +laion/rl__24GPU_base__exp_rpt_issue__Qwen3-8B-60 +laion/rl__24GPU_base__exp_rpt_multifile__Qwen3-8B-60 +laion/rl__24GPU_base__exp_rpt_scaffold__Qwen3-8B-60 +laion/rl__24GPU_base__exp_rpt_stack-bash__Qwen3-8B-55 +laion/rl__24GPU_base__mix_h2_language_balanced__r2egym-nl2bash-stack +laion/rl__24GPU_base__swe_rebench_patched_oracle__r2egym-nl2bash-stack +laion/rl__40GPU_base_32b__exp_rpt_codeelo-v2__sft_GLM-4-7-swesmith +laion/rl__40GPU_base_32b__exp_rpt_nemotron-bash__sft_GLM-4-7-swesmith +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_curator-hard +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_pr +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_pr-postmortem +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink_lr3e-5_bigcodebench-v2 +laion/rl_r2egym-nl2bash-stack-bugsseq_lr3e-5_stack-php-v2 +laion/rl_rl-conf_24GP_base-yaml_mode-path_exp_tas_opti_comb_trac_trai-data_exp_rpt_defe-v3 
+laion/rl_rl-conf_24GP_base-yaml_mode-path_exp_tas_opti_comb_trac_trai-data_exp_rpt_pyme-v3-40 +laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt-agai_trai-data_exp_rpt_nemo-bash +laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt-agai_trai-data_exp_rpt_soft-larg +laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt-agai_trai-data_exp_rpt_stac-pyte-synt- +laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt-agai_trai-data_exp_rpt_stac-self-gpt5 +laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp_rpt_pyme-larg-90 +laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp_rpt_stac-php-larg-75 +laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp_rpt_stac-self-larg-70 +laion/rl_rl-conf_24GP_base_noth-yaml_mode-path_r2eg-nl2b-stac-bugs_trai-data_exp_rpt_e2eg-larg-60 +laion/rl_rl-conf_24GP_base_noth-yaml_mode-path_r2eg-nl2b-stac-bugs_trai-data_exp_rpt_meth-larg-60 +laion/rl_rl-conf_24GP_base_noth-yaml_mode-path_r2eg-nl2b-stac-bugs_trai-data_exp_rpt_pyme-larg-60 +laion/rl_rl-conf_24GP_base_noth-yaml_mode-path_r2eg-nl2b-stac-bugs_trai-data_exp_rpt_stac-self-larg-60 +laion/rl_rl-conf_qwen_8b_ll_lr1e-5_bs64-yaml_mode-path_r2eg-nl2b-stac-bugs_trai-data_exp_rpt_stac-bash-wi +laion/rl_rl-conf_qwen_8b_ll_lr1e-5_bs64_8x_tp4_seqs-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp +laion/rl_rl-config_24GPU_base-yaml_model-path_Qwen3-8B_train-data_exp_rpt_codeelo-v2 +laion/rl_rl-config_24GPU_base-yaml_model-path_Qwen3-8B_train-data_exp_rpt_nemotron-bash-60 +laion/rl_rl-config_24GPU_base-yaml_model-path_Qwen3-8B_train-data_exp_rpt_pymethods2test-large-50 +laion/rl_swesmith-fixthink-pymethods2test-45 +laion/rl_v1_tp4s64_8x_exercism-python +laion/rl_v1_tp4s64_8x_nemotron-junit +laion/rl_v1_tp4s64_8x_stack-jest-large +laion/rl_v1_tp4s64_8x_stack-pytest-large +laion/rl_v1_tp4s64_8x_structural_debug +laion/rl_v3_tp4s64_8x_exercism-python 
+laion/sft__Kimi-2-5-inferredbugs-sandboxes-maxeps-32k__Qwen3-8B +laion/sft_r2egym-nl2bash-stackoverflow-inferredbugs-32B_Qwen3-32B +laion/stackexchange-tezos-sandboxes_glm_4_6_traces_locetash_again +laion/syh-r2eg-askl-glm_4-7_trac_jupi_-gfi-swes-rand-filt-10K_glm_4-7_trac_jupi_32B +penfever/rl__24GPU_base__exp_rpt_curriculum-hard__r2egym-nl2bash-stack-15 +penfever/rl__24GPU_base__exp_rpt_pymethods2test-large__Qwen3-8B-Base-65 +penfever/rl__24GPU_base__exp_rpt_pymethods2test-large__qwen3base-GLM-4_7-sw-70 +r2egym-nl2bash-stack-bugsseq-bash-withtests +r2egym-nl2bash-stack-bugsseq-cpp +r2egym-nl2bash-stack-bugsseq-junit +r2egym-nl2bash-stack-bugsseq-pytest-v2 +rl__24GPU_base__swe_rebench_patched_oracle__r2egym-nl2bash-stack diff --git a/eval/lists/nemotron_nano.txt b/eval/lists/nemotron_nano.txt new file mode 100644 index 00000000..15acb74c --- /dev/null +++ b/eval/lists/nemotron_nano.txt @@ -0,0 +1 @@ +nvidia/Llama-3.1-Nemotron-Nano-8B-v1 diff --git a/eval/lists/no_eval_models_latest.txt b/eval/lists/no_eval_models_latest.txt new file mode 100644 index 00000000..8d5ac20f --- /dev/null +++ b/eval/lists/no_eval_models_latest.txt @@ -0,0 +1,30 @@ +rl__nemotron-bash_fp8_terminus-2_step48 +laion/rl__64GPU_base_32b__nl2bash-tasks-cleaned-oracle__syh-r2eg-askl-glm_4__40-0 +DCAgent/a1-quixbugs +laion/rl__24GPU_shaped__stackexchange-overflow__exp_tas_optimal_comb-25 +DCAgent/a1-e2egit +DCAgent/a1-nemotron_rspec +laion/allenai-sera-unified-100000-opt100k__Qwen3-8B +laion/allenai-sera-unified-31600-opt100k__Qwen3-8B +laion/coderforge-100000-opt100k__Qwen3-8B +laion/sera-10000__Qwen3-8B +laion/coderforge-3160__Qwen3-8B +laion/swesmith-100000-opt100k__Qwen3-8B +laion/coderforge-31600-opt100k__Qwen3-8B +laion/sera-1000-opt1k__Qwen3-8B +laion/r2egym-100000-opt100k__Qwen3-8B +laion/coderforge-1000-opt1k__Qwen3-8B +laion/sera-316-opt1k__Qwen3-8B +laion/r2egym-316-opt1k__Qwen3-8B +laion/r2egym-1000-opt1k__Qwen3-8B +laion/coderforge-316-opt1k__Qwen3-8B 
+laion/rl__24GPU_shaped__nemotron-math-oracle-filtered__exp_tas_optimal_comb__40-0-30 +laion/coderforge-10000__Qwen3-8B +laion/r2egym-10000__Qwen3-8B +laion/sera-3160__Qwen3-8B +DCAgent/a1-ghactions +laion/swesmith-unified-10000__Qwen3-8B +laion/r2egym-unified-3160__Qwen3-8B +laion/rl__24GPU_shaped__exp_rpt_pymethods2test-large__GLM-4_7-swesmith-san-30 +laion/100k_epochs4__Qwen3-8B +laion/exp-psu-swesmith-1K_glm_4-7_traces_jupiter__0-05__Qwen3-8B \ No newline at end of file diff --git a/eval/lists/pipeline_exp_prio.txt b/eval/lists/pipeline_exp_prio.txt new file mode 100644 index 00000000..75105143 --- /dev/null +++ b/eval/lists/pipeline_exp_prio.txt @@ -0,0 +1,19 @@ +DCAgent/a1-taskmaster2 +DCAgent/a1-stack_bash +DCAgent/a1-repo_scaffold +DCAgent/a1-pr_mining +DCAgent/a1-nemotron_junit +DCAgent/a1-nemotron_cpp +DCAgent/a1-nemotron_bash +DCAgent/a1-manybugs +DCAgent/a1-issue_tasks +DCAgent/a1-codenet_python +DCAgent/a1-bugsinpy +DCAgent/a1-multifile_composition +DCAgent/a1-exercism_python +DCAgent/a1-crosscodeeval_typescript +DCAgent/a1-crosscodeeval_python +DCAgent/a1-crosscodeeval_java +DCAgent/a1-taco +DCAgent/a1-staqc +DCAgent/a1-stackexchange_tezos diff --git a/eval/lists/priority_131k_test.txt b/eval/lists/priority_131k_test.txt new file mode 100644 index 00000000..2f77176c --- /dev/null +++ b/eval/lists/priority_131k_test.txt @@ -0,0 +1 @@ +laion/alfworld-swesmith-r2egym-swegym-131k-lc diff --git a/eval/lists/priority_batch2.txt b/eval/lists/priority_batch2.txt new file mode 100644 index 00000000..c15c9b04 --- /dev/null +++ b/eval/lists/priority_batch2.txt @@ -0,0 +1,3 @@ +open-thoughts/OpenThinker3-7B +deepseek-ai/DeepSeek-R1-Distill-Qwen-7B +camel-ai/seta-rl-qwen3-8b diff --git a/eval/lists/priority_batch_evalorg.txt b/eval/lists/priority_batch_evalorg.txt new file mode 100644 index 00000000..f32a0dd6 --- /dev/null +++ b/eval/lists/priority_batch_evalorg.txt @@ -0,0 +1,3 @@ +obiwan96/qwen3-8b-openthinker-sft-endless-terminals +nvidia/Nemotron-Terminal-8B 
+nvidia/Nemotron-Terminal-14B diff --git a/eval/lists/priority_obiwan.txt b/eval/lists/priority_obiwan.txt new file mode 100644 index 00000000..ab57457f --- /dev/null +++ b/eval/lists/priority_obiwan.txt @@ -0,0 +1 @@ +obiwan96/qwen3-8b-openthinker-sft-endless-terminals diff --git a/eval/lists/priority_qwen35.txt b/eval/lists/priority_qwen35.txt new file mode 100644 index 00000000..14ef350e --- /dev/null +++ b/eval/lists/priority_qwen35.txt @@ -0,0 +1 @@ +Qwen/Qwen3.5-9B diff --git a/eval/lists/priority_rl_test.txt b/eval/lists/priority_rl_test.txt new file mode 100644 index 00000000..4aab785f --- /dev/null +++ b/eval/lists/priority_rl_test.txt @@ -0,0 +1 @@ +laion/sft__Kimi-2-5-inferredbugs-sandboxes-maxeps-32k__Qwen3-8B diff --git a/eval/lists/pruned_models_names.txt b/eval/lists/pruned_models_names.txt new file mode 100644 index 00000000..328d8ec8 --- /dev/null +++ b/eval/lists/pruned_models_names.txt @@ -0,0 +1,364 @@ +DCAgent/All_Puzzles_5k_New_Context_GPT4o-mini_new_context +DCAgent/All_Puzzles_5k_OG_5k_New_Context_GPT4o-mini_new_context +DCAgent/All_Puzzles_5k_new_context +DCAgent/all-puzzles-sandboxes-traces-terminus-2-with-gpt-4o-mini-judgments-correct +DCAgent/bash_textbook_tasks_traces +DCAgent/code-contests-sandboxes-traces-terminus-2_new_hparams_11_10_25_Qwen3-8B +DCAgent/code_contests-GLM-4.6-traces_Qwen3-8B +DCAgent/code_contests-Qwen3-Coder-480B-traces +DCAgent/code_contests_10k_OG_10k_New_Questions_GPT5-mini_Qwen3-8B +DCAgent/code_contests_new_questions_gpt-5-mini_Qwen3-8B +DCAgent/codeforces-gptoss120b-traces +DCAgent/exp_tas_max_tokens_1024_traces +DCAgent/freelancer-askllm-filtered-sandboxes-traces-terminus-2_Qwen3-8B +DCAgent/freelancer-embedding-mean-instruction-filter_Qwen3-8B +DCAgent/freelancer-long-instruction-filter_Qwen3-8B +DCAgent/freelancer-projects-0-1k-traces +DCAgent/freelancer-projects-0-3k-traces +DCAgent/freelancer-projects-100k-traces_Qwen3-8B +DCAgent/freelancer-projects-10k-traces +DCAgent/freelancer-projects-1k-traces 
+DCAgent/freelancer-projects-3k-traces +DCAgent/freelancer-projects-gpt5_Qwen3-8B +DCAgent/freelancer-projects-gpt5mini +DCAgent/freelancer-random-instruction-filter-traces-terminus-2_Qwen3-8B +DCAgent/freelancer-short-instruction-filter_Qwen3-8B +DCAgent/freelancer-t1024s-32ep_Qwen3-8B +DCAgent/freelancer-t2048s-32ep_Qwen3-8B +DCAgent/freelancer-t256s-32ep_Qwen3-8B +DCAgent/freelancer-t512s-32ep_Qwen3-8B +DCAgent/hr1_code-contests-sandboxes-with-tests-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step_60 +DCAgent/hr1_wfc_nl2bash-bs_Q3-8B-mE32-aT-dS-120325hbr_step_20 +DCAgent/hr1_wfc_nl2bash-bs_Q3-8B-mE32-aT-dS-120325hbr_step_40 +DCAgent/hr1_wfc_nl2bash-bs_Q3-8B-mE32-aT-dS-120325hbr_step_64 +DCAgent/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112725harbor_step20 +DCAgent/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112725harbor_step40 +DCAgent/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112725harbor_step64 +DCAgent/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step20 +DCAgent/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step40 +DCAgent/neulab-codeactinstruct-sandboxes-traces-terminus-2_Qwen3-8B +DCAgent/neulab-mind2web-sandboxes-traces-terminus-2_Qwen3-8B +DCAgent/neulab-synatra-sandboxes-traces-terminus-2_Qwen3-8B +DCAgent/nl2bash-GLM-4.6-traces_Qwen3-8B +DCAgent/nl2bash-Qwen3-Coder-480B-traces_Qwen3-8B +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step25 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step50 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step73 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step20 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step20 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step40 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step_73 
+DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120225harbor_step_25 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120225harbor_step_50 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120225harbor_step_73 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120325harbor_step_25 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120325harbor_step_50 +DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120325harbor_step_73 +DCAgent/nl2bashG5CP-nl2bash-bs_Q3-8B-mE32-aT-dS-120325hbr_step_20 +DCAgent/nl2bashG5CP-nl2bash-bs_Q3-8B-mE32-aT-dS-120325hbr_step_34 +DCAgent/nl2bashGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step20 +DCAgent/nl2bashGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step34 +DCAgent/nl2bashGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step20 +DCAgent/r2egymG5CP-nl2bash-bs_Q3-8B-mE24-aT-dS-120325hbr_step_25 +DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step20 +DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step_55 +DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step_80 +DCAgent/staqc-ot3-100k-code-subset-traces-terminus-2_Qwen3-8B +DCAgent/staqc-ot3-100k-math-subset-traces-terminus-2_save-strategy_steps_Qwen3-8B +DCAgent/staqc-ot3-100k-science-subset-traces-terminus-2_Qwen3-8B +DCAgent/staqc-ot3-100k-traces-terminus-2_Qwen3-8B +DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-1-7B +DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-14B +DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-32B +DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-4B +DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-4B-Instruct-2507 +DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-4B-Thinking-2507 +DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-8B-Base 
+DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-Coder-30B-A3B-Instruct +DCAgent/taskmaster2-0-1k-traces +DCAgent/taskmaster2-0-3k-traces +DCAgent/taskmaster2-10k-traces +DCAgent/taskmaster2-16ep +DCAgent/taskmaster2-1ep +DCAgent/taskmaster2-1k-traces +DCAgent/taskmaster2-2ep +DCAgent/taskmaster2-32ep +DCAgent/taskmaster2-3k-traces +DCAgent/taskmaster2-4ep +DCAgent/taskmaster2-64ep +DCAgent/taskmaster2-8ep +DCAgent/taskmaster2-banana +DCAgent/taskmaster2-gpt5mini +DCAgent/taskmaster2-gpt5mini_global-batch-size_16 +DCAgent/tbench_oracle_solutions_terminus +DCAgent/test_sft_qwen3_32k_base_NO_EVAL_Qwen3-8B +DCAgent/wikitable_format_conversion-qwen3-coder-480b-a35b-instruct-awq-traces +DCAgent2/GLM-4_6-codeforces-32eps-32k-fixeps +DCAgent2/GLM-4_6-nl2bash-verified-32eps-32k-fixeps +DCAgent2/GLM-4_6-stackexchange-overflow-sandboxes-32eps-32k +DCAgent2/bugs-nl2bashseq +DCAgent2/bugs-stack-nl2bashseq +DCAgent2/bugs-swesmith-over5050 +DCAgent2/codeforces-GLM-4_6-traces-32ep-32k-1-2-4-dv +DCAgent2/freelancer-projects-100k-traces +DCAgent2/freelancer-projects-31k-traces +DCAgent2/freelancer-t256s-32ep_hub-model-id_freelancer-t256s-32ep-restore-hp_Qwen3-8B +DCAgent2/gemini25flash-stackexchange-overflow-32ep-512k-v3-traces +DCAgent2/glm-4_6-freelancer-traces +DCAgent2/glm-4_6-freelancer-traces-pm +DCAgent2/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112725harbor +DCAgent2/inferredbugs-GLM-4_6-32ep-32k +DCAgent2/inferredbugs-GLM-4_6-32ep-65k +DCAgent2/neulab-code-feedback-sandboxes-traces-terminus-2_hub-model-id_neulab-code-feedback-restore-hp_Qw +DCAgent2/neulab-mind2web-sandboxes-traces-terminus-2_hub-model-id_neulab-mind2web-restore-hp_Qwen3-8B +DCAgent2/neulab-synatra-sandboxes-traces-terminus-2_hub-model-id_neulab-synatra-restore-hp_Qwen3-8B +DCAgent2/nl2bash-bugs-over5050 +DCAgent2/nl2bash-bugs-undr3070 +DCAgent2/nl2bash-bugsseq +DCAgent2/nl2bash-bugsshuffle +DCAgent2/nl2bash-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor 
+DCAgent2/nl2bash-stack-bugs-over333 +DCAgent2/nl2bash-stack-bugs-undr203050 +DCAgent2/nl2bash-stack-bugs-undr503020 +DCAgent2/nl2bash-stack-over5050 +DCAgent2/nl2bash-stack-undr3070 +DCAgent2/nl2bash-stack-undr7030 +DCAgent2/nl2bash-stackseq +DCAgent2/nl2bash-stackshuffle +DCAgent2/nl2bash-swesmith-reason +DCAgent2/nl2bash-swesmithseq +DCAgent2/nl2bash-verified-GLM-4.6-traces-32ep-32k-ab1085_Qwen3-8B +DCAgent2/nl2bash-verified-GLM-4.6-traces-32ep-32k-ab1089_Qwen3-8B +DCAgent2/nl2bash-verified-GLM-4.6-traces-32ep-32k-ab1093_Qwen3-8B +DCAgent2/nl2bash-verified-GLM-4.6-traces-32ep-32k-ab1097_Qwen3-8B +DCAgent2/nl2bash-verified-GLM-4.6-traces-32ep-32k_global-batch-size_128_Qwen3-8B +DCAgent2/nl2bash-verified-GLM-4.6-traces-32ep-32k_global-batch-size_32_Qwen3-8B +DCAgent2/nl2bash-verified-GLM-4.6-traces-32ep-32k_global-batch-size_64_Qwen3-8B +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-5epochs +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-5epochs-lr1e-3 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-5epochs-lr1e-4 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-5epochs-lr1e-5 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-5epochs-lr1e-6 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-7epochs +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-8epochs +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-ab2095 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-ab2098 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-ab2099 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-ab20998 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-mgn5e2 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wd0 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wd002 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wd004 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wd006 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wd008 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wr0 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wr0005 
+DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wr0010 +DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wr0150 +DCAgent2/nl2bashGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor +DCAgent2/stack-bugs-over5050 +DCAgent2/stack-bugs-undr3070 +DCAgent2/stack-bugs-undr7030 +DCAgent2/stack-bugsseq +DCAgent2/stack-bugsshuffle +DCAgent2/stack-nl2bashseq +DCAgent2/stack-swesmithseq +DCAgent2/swesmith-nl2bashseq +DCAgent2/swesmith-stack-undr7030 +DCAgent2/taskmaster2-1ep_hub-model-id_taskmaster2-1ep-2stage-restore-hp_freelancer-projects-sandboxes-tra +DCAgent2/taskmaster2-8ep_hub-model-id_taskmaster2-8ep-2stage-restore-hp_freelancer-projects-sandboxes-tra +DCAgent2/taskmaster2-GLM-4_6-32ep-32k +DCAgent2/test2-tbench-dev-71-qwen3-8b-8nodes-sync +bespokelabs/Qwen3-8B-ot_step100 +bespokelabs/Qwen3-8B-ot_step60_high +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_learning-rate_1e-06_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_num-train-epochs_8-0_Qwen3-32B +laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_warmup-ratio_0-01_Qwen3-32B +laion/GLM-4.6-stackoverflow-32eps-65k-fixeps_Qwen3-8B +laion/GLM-4_6-freelancer-32eps-131k +laion/GLM-4_6-inferredbugs-32ep-65k-reasoning +laion/GLM-4_6-nl2bash-verified-32ep-32k-reasoning +laion/GLM-4_6-selfinstruct-naive-2-32ep-32k +laion/GLM-4_6-stackexchange-superuser-32ep-32k +laion/GLM-4_6-swesmith-32ep-131k-nosumm +laion/GLM-4_6-swesmith-32ep-131k-nosumm-reasoning +laion/GLM-4_7-inferredbugs-sandboxes-maxeps-131k +laion/GLM-4_7-r2egym_sandboxes-maxeps-131k +laion/GLM-4_7-swesmith-sandboxes-with_tests-oracle_verified_120s-maxeps-131k +laion/GLM-4_7-swesmith-sandboxes-with_tests-oracle_verified_120s-maxeps-131k-fixthink +laion/GPT-OSS-120B-codeforces-fixeps_Qwen3-8B +laion/Kimi-K2T-neulab-agenttuning-kg-sandboxes-maxeps-32k +laion/Kimi-K2T-neulab-agenttuning-mind2web-sandboxes-maxeps-32k +laion/Kimi-K2T-neulab-agenttuning-webshop-sandboxes-maxeps-32k 
+laion/Kimi-K2T-swesmith-32ep-131k +laion/MiniMax-M2-freelancer-32ep-32k +laion/MiniMax-M2-freelancer-32ep-32k-reasoning +laion/Qwen3-8B_exp-swd-r2egym-standard_glm_4.7_traces_locetash_save-strategy_steps +laion/Qwen3-8B_exp-swd-swesmith-wo-docker_glm_4.7_traces_locetash_save-strategy_steps +laion/Qwen3-8B_exp_tas_summarize_threshold_4096_traces_save-strategy_steps +laion/Qwen3-8B_exp_tas_temp_0.25_traces_save-strategy_steps +laion/Qwen3-8B_exp_tas_temp_0.5_traces_save-strategy_steps +laion/Qwen3-8B_exp_tas_tmux_large_traces_save-strategy_steps +laion/Qwen3-8B_exp_tas_top_k_32_traces_save-strategy_steps +laion/Qwen3-8B_exp_tas_trajectory_minimal_traces_save-strategy_steps +laion/Qwen3-8B_perturbed-docker-exp-taskmaster2-tasks_glm_4.7_traces_locetash_save-strategy_steps +laion/Qwen3-Coder-480B-codeforces-fixeps_Qwen3-8B +laion/Qwen3-Coder-480B-nl2bash-fixeps_Qwen3-8B +laion/bugs-nl2bashseq_Qwen3-8B +laion/claude-4-5-sonnet-thinking-stackexchange-overflow-32ep-32k-traces +laion/exp-gfi-swesmith-short-response-filtered-10K_glm_4_7_traces_jupiter +laion/exp-psu-stackoverflow-1K_glm_4_7_traces +laion/exp-psu-stackoverflow-316_glm_4_7_traces +laion/exp-psu-stackoverflow-31K_glm_4_7_traces +laion/exp-psu-swesmith-10K_glm_4_7_traces_jupiter +laion/exp-psu-swesmith-31K_glm_4_7_traces_jupiter +laion/exp-psu-swesmith-3K_glm_4_7_traces_jupiter +laion/exp-swd-r2egym-wo-docker_glm_4_7_traces +laion/exp-syh-tezos-askllm-hardened_glm_4_7_traces_jupiter_cleaned +laion/exp-syh-tezos-stackoverflow-mixed_glm_4_7_traces_jupiter_cleaned +laion/exp-uns-r2egym-33_6x_glm_4_7_traces_jupiter_cleaned +laion/exp-uns-r2egym-4_2x_glm_4_7_traces_jupiter +laion/exp-uns-r2egym-8_4x_glm_4_7_traces_jupiter +laion/exp-uns-tezos-128unique_glm_4_7_traces_jupiter_cleaned +laion/exp-uns-tezos-1unique_glm_4_7_traces_jupiter +laion/exp_tas_baseline_traces +laion/exp_tas_frequency_penalty_0_25_traces +laion/exp_tas_frequency_penalty_0_5_traces +laion/exp_tas_frequency_penalty_1_0_traces 
+laion/exp_tas_high_diversity_traces +laion/exp_tas_linear_history_off_traces +laion/exp_tas_low_diversity_traces +laion/exp_tas_max_tokens_2048_traces +laion/exp_tas_max_tokens_4096_traces +laion/exp_tas_min_p_0_05_traces +laion/exp_tas_parser_xml_traces +laion/exp_tas_raw_content_off_traces +laion/exp_tas_repetition_penalty_1_2_traces +laion/exp_tas_temp_0_5_traces +laion/exp_tas_top_k_128_traces +laion/exp_tas_top_k_16_traces +laion/exp_tas_top_p_0_8_traces +laion/glm-4_6-all-puzzles-32ep-131k +laion/glm-4_6-dclm-baseline-terminal-traces-32ep-131k +laion/glm-4_6-freelancer-32ep-131k-torch +laion/glm-4_6-nemo-prism +laion/glm-4_6-r2egym-32ep-32k +laion/glm-4_6-stack-overflow-32ep-131k-summtrc +laion/glm-4_6-staqc-32ep-131k +laion/glm46-Magicoder-Evol-Instruct-110K-sandboxes-1-traces +laion/glm46-defects4j-32ep-131k +laion/glm46-glaive-code-assistant-sandboxes-maxeps-131k +laion/glm46-neulab-agenttuning-alfworld-sandboxes-maxeps-131k +laion/glm46-neulab-synatra-32ep-131k +laion/glm46-qasper-maxeps-131k +laion/glm46-stackexchange-tezos-maxeps-131k +laion/glm46-swegym-tasks-maxeps-131k +laion/glm46-swesmith-maxeps-131k +laion/gpt-oss-120B-stack-overflow-32ep-131k-summtrc +laion/gpt-oss-120B-stack-overflow-32ep-131k-summtrc-fixthink1 +laion/kimi-k2-r2egym_sandboxes-maxeps-32k +laion/kimi-k2-swegym-tasks-maxeps-32k +laion/kimi-k2t-freelancer-32ep-32k +laion/kimi-k2t-neulab-synatra-32ep-131k +laion/minimax-m2-stack-overflow-32ep-131k-summtrc +laion/nl2bash-bugs-undr7030_Qwen3-8B +laion/nl2bash-bugsseq_Qwen3-8B +laion/nl2bash-verified-GLM-4.6-traces-32ep-32k-mgn1e3_Qwen3-8B +laion/nl2bash-verified-GLM-4.6-traces-32ep-32k-mgn1e5_Qwen3-8B +laion/nl2bash-verified-GLM-4.6-traces-32ep-32k-mgn5e4_Qwen3-8B +laion/open-thoughts-4-code-qwen3-32b-annotated +laion/qwen3-coder-480B-stack-overflow-32ep-131k-summtrc +laion/qwen3base-GLM-4_7-swesmith-sandboxes-with_tests-oracle_verified_120s-maxeps-131k +laion/r2egym-bugsseq +laion/r2egym-gpt5-codex-160ep-1M 
+laion/r2egym-nl2bash-bugsseq +laion/r2egym-nl2bash-stack-bugsseq-bash-withtests +laion/r2egym-nl2bash-stack-bugsseq-cpp +laion/r2egym-nl2bash-stack-bugsseq-fixthink +laion/r2egym-nl2bash-stack-bugsseq-fixthink-exercism-python +laion/r2egym-nl2bash-stack-bugsseq-fixthink-methods2test-v2 +laion/r2egym-nl2bash-stack-bugsseq-fixthink-stack-csharp +laion/r2egym-nl2bash-stack-bugsseq-junit +laion/r2egym-nl2bash-stack-bugsseq-pytest-v2 +laion/r2egym-nl2bash-stack-bugsseq-rl-stack-bash-withtests +laion/r2egym-nl2bash-stack-bugsseq-stack-php-v2 +laion/r2egym-nl2bash-stack-bugsseq_lr3e-5_exp_rpt_stack-php-v2_step20 +laion/r2egym-nl2bash-stackseq +laion/r2egym-stack-bugsseq +laion/rl_bs128-gs16-rloo-n-code-contests-900s-noreg-15 +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_postmortem +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink_lr1e-5_taco +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink_lr3e-5_bigcodebench-v2 +laion/rl_r2egym-nl2bash-stack-bugsseq_lr3e-5_stack-php-v2 +laion/rl_rl-conf_24GP_base-yaml_mode-path_exp_tas_opti_comb_trac_trai-data_exp_rpt_unit-pyth-v3 +laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp_rpt_pyme-larg-90 +laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp_rpt_stac-dock-v2 +laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp_rpt_stac-self-larg-70 +laion/rl_rl-conf_24GP_base_noth-yaml_mode-path_r2eg-nl2b-stac-bugs_trai-data_exp_rpt_e2eg-larg-60 +laion/rl_rl-conf_24GP_base_noth-yaml_mode-path_r2eg-nl2b-stac-bugs_trai-data_exp_rpt_pyme-larg-60 +laion/rl_rl-conf_qwen_8b_ll_lr1e-5_bs64-yaml_mode-path_r2eg-nl2b-stac-bugs_trai-data_exp_rpt_cros-type +laion/rl_think_npfg-code-contests-900s-45 +laion/rl_tp4s64_8x_2skill +laion/rl_tp4s64_8x_exercism-python +laion/rl_tp4s64_8x_flat25_baseline +laion/rl_tp4s64_8x_github_issue +laion/rl_tp4s64_8x_heavy_padding +laion/rl_tp4s64_8x_minimal_instructions +laion/rl_tp4s64_8x_nemotron-cpp 
+laion/rl_tp4s64_8x_nemotron-junit +laion/rl_tp4s64_8x_proportional +laion/rl_tp4s64_8x_structural_debug +laion/rl_v1_tp4s64_8x_nemotron-junit +laion/stackexchange-tezos-sandboxes_glm_4_6_traces_locetash +laion/stackexchange-tezos-sandboxes_glm_4_6_traces_locetash_again +laion/stackexchange-tezos-sandboxes_glm_4_6_traces_together +laion/stackexchange-tezos-sandboxes_glm_4_6_traces_together_again +mlfoundations-dev/all-puzzles-sandboxes-traces-terminus-2 +mlfoundations-dev/code-contests-sandboxes-traces-terminus-2 +mlfoundations-dev/code-contests-sandboxes-traces-terminus-2_global-batch-size_16 +mlfoundations-dev/code-contests-sandboxes-traces-terminus-2_seed_43 +mlfoundations-dev/code-contests-sandboxes-traces-terminus-2_seed_44 +mlfoundations-dev/code-contests-sandboxes-traces-terminus-2_seed_45 +mlfoundations-dev/code-contests-sandboxes-traces-terminus-2_seed_46 +mlfoundations-dev/codeforces-sandboxes-traces-terminus-2 +mlfoundations-dev/defects4j-sandboxes-traces-terminus-2 +mlfoundations-dev/freelancer-projects-sandboxes-traces-terminus-2 +mlfoundations-dev/inferredbugs-sandboxes-traces-terminus-2 +mlfoundations-dev/nemo-prism-math-sandboxes-traces-terminus-2 +mlfoundations-dev/qasper-sandboxes-traces-terminus-2 +mlfoundations-dev/stackexchange-overflow-sandboxes-traces-terminus-2 +mlfoundations-dev/stackexchange-tezos-sandboxes-traces-terminus-2 +mlfoundations-dev/staqc-sandboxes-traces-terminus-2 +mlfoundations-dev/swesmith_with_plain_docker-sandboxes-traces-terminus-2 +mlfoundations-dev/taskmaster2-sandboxes-traces-terminus-2 +open-thoughts/OpenThinker-Agent-v1-SFT +penfever/GLM-4_6-codeforces-32ep-32k-restore-hp +penfever/GLM-4_6-gemini25flash-stackexchange-overflow-32ep-512k-fixeps +penfever/GLM-4_6-inferredbugs-32eps-65k-fixeps +penfever/freelancer-t1024s-32ep-restore-hp +penfever/freelancer-t2048s-32ep-restore-hp +penfever/freelancer-t512s-32ep-restore-hp +penfever/kimi-k2-swesmith_with_plain_docker-sandboxes-maxeps-32k 
+penfever/neulab-codeactinstruct-restore-hp +penfever/nl2bash-0-1k-traces-restore-hp +penfever/nl2bash-0-3k-traces-restore-hp +penfever/nl2bash-16ep-restore-hp +penfever/nl2bash-1ep-restore-hp +penfever/nl2bash-1k-traces-restore-hp +penfever/nl2bash-2ep-restore-hp +penfever/nl2bash-32ep-restore-hp +penfever/nl2bash-3k-traces-restore-hp +penfever/nl2bash-4ep-restore-hp +penfever/nl2bash-8ep-restore-hp +penfever/nl2bash-GLM-4_6-traces-newhparams +penfever/nl2bash-verified-GLM-4_6-traces-32ep-32k-dft +penfever/nl2bash-verified-GLM-4_6-traces-32ep-32k-restore-hp +penfever/nl2bash_gpt-5-nano-traces-8ep-restore-hp +penfever/nl2bash_verified_gpt-5-nano-traces-restore-hp +penfever/rl_bs128_gs16_ruby-30 +penfever/rl_rl-conf_20GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp_rpt_code-v2-25 +penfever/rl_rl-conf_20GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp_rpt_stac-pyte-v2-25 +penfever/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp_rpt_soft-v2-45 +penfever/selfinstruct-naive-sandboxes-2-traces-restore-hp +penfever/swesmith-2stage-restore-hp +penfever/taskmaster2-4ep-2stage-restore-hp diff --git a/eval/lists/pyme_v3_40.txt b/eval/lists/pyme_v3_40.txt new file mode 100644 index 00000000..b67069dc --- /dev/null +++ b/eval/lists/pyme_v3_40.txt @@ -0,0 +1 @@ +laion/rl_rl-conf_24GP_base-yaml_mode-path_exp_tas_opti_comb_trac_trai-data_exp_rpt_pyme-v3-40 diff --git a/eval/lists/qwen35_27b.txt b/eval/lists/qwen35_27b.txt new file mode 100644 index 00000000..1e3117a0 --- /dev/null +++ b/eval/lists/qwen35_27b.txt @@ -0,0 +1 @@ +Qwen/Qwen3.5-27B diff --git a/eval/lists/qwen35_9b.txt b/eval/lists/qwen35_9b.txt new file mode 100644 index 00000000..14ef350e --- /dev/null +++ b/eval/lists/qwen35_9b.txt @@ -0,0 +1 @@ +Qwen/Qwen3.5-9B diff --git a/eval/lists/richard_base_model.txt b/eval/lists/richard_base_model.txt new file mode 100644 index 00000000..497c9a05 --- /dev/null +++ b/eval/lists/richard_base_model.txt @@ -0,0 +1 @@ 
+Qwen/Qwen3.5-9B \ No newline at end of file diff --git a/eval/lists/richard_test_model.txt b/eval/lists/richard_test_model.txt new file mode 100644 index 00000000..bd8a4f31 --- /dev/null +++ b/eval/lists/richard_test_model.txt @@ -0,0 +1,2 @@ +# laion/exp-psu-swesmith-3K_glm_4_7_traces_jupiter +laion/exp-psu-swesmith-31K_glm_4_7_traces_jupiter \ No newline at end of file diff --git a/eval/lists/rope_step_batch.txt b/eval/lists/rope_step_batch.txt new file mode 100644 index 00000000..2c54a448 --- /dev/null +++ b/eval/lists/rope_step_batch.txt @@ -0,0 +1,3 @@ +laion/swesmith_8b_rope_65k-step37 +laion/r2egym_8b_rope_65k-step17 +laion/swesmith_8b-step35 diff --git a/eval/lists/sera_14b.txt b/eval/lists/sera_14b.txt new file mode 100644 index 00000000..8835118a --- /dev/null +++ b/eval/lists/sera_14b.txt @@ -0,0 +1 @@ +allenai/SERA-14B diff --git a/eval/lists/swesmith_fixthink_45.txt b/eval/lists/swesmith_fixthink_45.txt new file mode 100644 index 00000000..c4287878 --- /dev/null +++ b/eval/lists/swesmith_fixthink_45.txt @@ -0,0 +1 @@ +laion/rl_swesmith-fixthink-pymethods2test-45 diff --git a/eval/lists/syh_32b.txt b/eval/lists/syh_32b.txt new file mode 100644 index 00000000..492f1c93 --- /dev/null +++ b/eval/lists/syh_32b.txt @@ -0,0 +1 @@ +laion/syh-r2eg-askl-glm_4-7_trac_jupi_-gfi-swes-rand-filt-10K_glm_4-7_trac_jupi_32B diff --git a/eval/lists/tb2_richard_test_model.txt b/eval/lists/tb2_richard_test_model.txt new file mode 100644 index 00000000..8eddee63 --- /dev/null +++ b/eval/lists/tb2_richard_test_model.txt @@ -0,0 +1 @@ +laion/exp-psu-swesmith-10K_glm_4_7_traces_jupiter \ No newline at end of file diff --git a/eval/lists/v2_richard_test_model.txt b/eval/lists/v2_richard_test_model.txt new file mode 100644 index 00000000..83e4b4f6 --- /dev/null +++ b/eval/lists/v2_richard_test_model.txt @@ -0,0 +1 @@ +laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_pr \ No newline at end of file diff --git a/eval/precreate_snapshots.py b/eval/precreate_snapshots.py 
new file mode 100644
index 00000000..377b29f0
--- /dev/null
+++ b/eval/precreate_snapshots.py
@@ -0,0 +1,268 @@
+#!/usr/bin/env python3
+"""Pre-create Daytona snapshots for eval datasets.
+
+Computes unique environment hashes per dataset, then creates snapshots
+on the Daytona org specified by the secrets file. Handles RL region routing
+(builds via "us" but registers for "RL").
+
+Usage:
+    python eval/precreate_snapshots.py \
+        --dataset DCAgent/dev_set_v2 \
+        --dataset DCAgent2/terminal_bench_2 \
+        --secrets-file ~/secrets.env
+
+    python eval/precreate_snapshots.py \
+        --dataset DCAgent/dev_set_v2 \
+        --dataset DCAgent2/terminal_bench_2 \
+        --secrets-file ~/secrets_rl_org.env
+"""
+
+import argparse
+import asyncio
+import os
+import sys
+from pathlib import Path
+
+# Ensure harbor is importable (FIXME(review): file moved from eval/MBZ/ to eval/, so the repo root is now .parent.parent — the third .parent below resolves one level ABOVE the repo, breaking HARBOR_SRC)
+REPO_ROOT = Path(__file__).resolve().parent.parent.parent
+HARBOR_SRC = REPO_ROOT / "harbor" / "src"
+if str(HARBOR_SRC) not in sys.path:
+    sys.path.insert(0, str(HARBOR_SRC))
+
+
+def load_secrets(secrets_file: str) -> None:
+    """Parse 'export KEY=value' lines from a secrets file into os.environ."""
+    with open(os.path.expanduser(secrets_file)) as f:
+        for line in f:
+            line = line.strip()
+            if not line or line.startswith("#") or "=" not in line:
+                continue
+            if line.startswith("export "):
+                line = line[7:]
+            k, v = line.split("=", 1)
+            os.environ[k.strip()] = v.strip().strip('"').strip("'")
+
+
+def resolve_dataset_path(repo_id: str) -> Path:
+    """Resolve a HF repo ID to the local cached snapshot path."""
+    hf_cache = os.getenv("HF_HUB_CACHE", os.path.expanduser("~/.cache/huggingface/hub"))
+    dataset_dir = Path(hf_cache) / f"datasets--{repo_id.replace('/', '--')}"
+    snapshots_dir = dataset_dir / "snapshots"
+
+    if not snapshots_dir.exists():
+        raise FileNotFoundError(
+            f"Dataset {repo_id} not found in HF cache at {snapshots_dir}. "
+            "Download it first with snapshot_download.py."
+ ) + + snapshots = sorted( + [d for d in snapshots_dir.iterdir() if d.is_dir()], + key=lambda p: p.name, + ) + if not snapshots: + raise FileNotFoundError(f"No snapshots found in {snapshots_dir}") + + return snapshots[-1] + + +def get_task_dirs(dataset_path: Path) -> list[Path]: + """Find task directories (those containing instruction.md).""" + return sorted([ + d for d in dataset_path.iterdir() + if d.is_dir() + and not d.name.startswith(".") + and (d / "instruction.md").exists() + ]) + + +def compute_unique_snapshots( + datasets: list[tuple[str, Path]], + target: str | None, +) -> dict[str, str]: + """Compute unique snapshot names and a representative Dockerfile for each. + + Returns: + Dict of snapshot_name -> representative Dockerfile path (as string). + """ + from harbor.utils.container_cache import get_task_environment_hash + + snapshots: dict[str, str] = {} # snapshot_name -> dockerfile_path + + for repo_id, dataset_path in datasets: + task_dirs = get_task_dirs(dataset_path) + print(f"\n=== {repo_id} ({len(task_dirs)} tasks) ===") + + hash_counts: dict[str, int] = {} + hash_example: dict[str, Path] = {} + + for task_dir in task_dirs: + env_hash = get_task_environment_hash(task_dir) + if env_hash is None: + continue + hash_counts[env_hash] = hash_counts.get(env_hash, 0) + 1 + if env_hash not in hash_example: + hash_example[env_hash] = task_dir + + print(f"Unique environment hashes: {len(hash_counts)}") + + for env_hash, count in sorted(hash_counts.items(), key=lambda x: -x[1]): + if target: + snap_name = f"harbor__{env_hash}__{target}__snapshot" + else: + snap_name = f"harbor__{env_hash}__snapshot" + + example_task = hash_example[env_hash] + dockerfile = example_task / "environment" / "Dockerfile" + + print(f" {snap_name} ({count} tasks) e.g. 
{example_task.name}") + snapshots[snap_name] = str(dockerfile) + + return snapshots + + +async def create_snapshot( + client, + name: str, + dockerfile_path: str, + target: str | None, + label: str, +) -> bool: + """Create a single snapshot, handling already-exists gracefully.""" + from daytona import AsyncDaytona, DaytonaConfig, CreateSnapshotParams, Image, Resources + from daytona._async.snapshot import SnapshotState + + # Check existing state + try: + snap = await client.snapshot.get(name) + if snap.state == SnapshotState.ACTIVE: + print(f" [{label}] {name}: already ACTIVE, skipping") + return True + elif snap.state == SnapshotState.ERROR: + print(f" [{label}] {name}: ERROR state, deleting and recreating...") + try: + await client.snapshot.delete(snap) + except Exception as e: + print(f" [{label}] {name}: failed to delete ERROR snapshot: {e}") + except Exception: + pass # Doesn't exist yet + + # RL region has no build runners — build via "us", register for RL + build_client = client + if target and target.upper() == "RL": + api_key = os.environ.get("DAYTONA_API_KEY", "") + build_client = AsyncDaytona(DaytonaConfig(api_key=api_key, target="us")) + + print(f" [{label}] Creating {name} from {dockerfile_path}...") + try: + await build_client.snapshot.create( + CreateSnapshotParams( + name=name, + image=Image.from_dockerfile(dockerfile_path), + resources=Resources(cpu=1, memory=1, disk=3), + region_id=target if target else None, + ) + ) + except Exception as e: + error_msg = str(e).lower() + if "already exists" in error_msg or "conflict" in error_msg: + print(f" [{label}] {name}: already exists (global), OK") + return True + print(f" [{label}] {name}: create FAILED: {e}") + return False + finally: + if build_client is not client: + await build_client.close() + + # Poll for ACTIVE state (up to 10 minutes, 5s intervals) + for i in range(120): + await asyncio.sleep(5) + try: + snap = await client.snapshot.get(name) + if snap.state == SnapshotState.ACTIVE: + print(f" 
[{label}] {name}: ACTIVE (took ~{(i+1)*5}s)") + return True + elif snap.state == SnapshotState.ERROR: + reason = getattr(snap, "error_reason", "unknown") + print(f" [{label}] {name}: entered ERROR state: {reason}") + return False + except Exception: + pass + + print(f" [{label}] {name}: TIMEOUT waiting for ACTIVE (600s)") + return False + + +async def main(): + parser = argparse.ArgumentParser( + description="Pre-create Daytona snapshots for eval datasets" + ) + parser.add_argument( + "--dataset", + action="append", + required=True, + help="HF repo ID (e.g. DCAgent/dev_set_v2). Can be repeated.", + ) + parser.add_argument( + "--secrets-file", + required=True, + help="Path to secrets env file (must contain DAYTONA_API_KEY and DAYTONA_TARGET)", + ) + args = parser.parse_args() + + # Load secrets into environment + load_secrets(args.secrets_file) + + api_key = os.environ.get("DAYTONA_API_KEY") + target = os.environ.get("DAYTONA_TARGET") + + if not api_key: + print("ERROR: DAYTONA_API_KEY not found in secrets file", file=sys.stderr) + sys.exit(1) + + label = target or "default" + print(f"Daytona target: {label}") + print(f"API key: {api_key[:12]}...") + + # Resolve dataset paths + datasets: list[tuple[str, Path]] = [] + for repo_id in args.dataset: + path = resolve_dataset_path(repo_id) + print(f"Dataset {repo_id} -> {path}") + datasets.append((repo_id, path)) + + # Compute unique snapshots + snapshots = compute_unique_snapshots(datasets, target) + + if not snapshots: + print("\nNo snapshots to create (no tasks with Dockerfiles found).") + return + + print(f"\n--- Creating {len(snapshots)} unique snapshots on '{label}' ---\n") + + # Create snapshots + from daytona import AsyncDaytona, DaytonaConfig + + client = AsyncDaytona(DaytonaConfig(api_key=api_key, target=target or "us")) + try: + results = {} + for snap_name, dockerfile in snapshots.items(): + ok = await create_snapshot(client, snap_name, dockerfile, target, label) + results[snap_name] = ok + finally: + await 
client.close() + + # Summary + succeeded = sum(1 for v in results.values() if v) + failed = sum(1 for v in results.values() if not v) + print(f"\n=== Done: {succeeded} succeeded, {failed} failed ===") + + if failed: + print("\nFailed snapshots:") + for name, ok in results.items(): + if not ok: + print(f" - {name}") + sys.exit(1) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/eval/secret.env.template b/eval/secret.env.template new file mode 100644 index 00000000..720a1ca7 --- /dev/null +++ b/eval/secret.env.template @@ -0,0 +1,7 @@ +export DAYTONA_API_KEY='YOUR_DAYTONA_API_KEY' +export DAYTONA_TARGET='us' +export HF_TOKEN='YOUR_HF_TOKEN' +export SUPABASE_URL='YOUR_SUPABASE_URL' +export SUPABASE_ANON_KEY='YOUR_SUPABASE_ANON_KEY' +export SUPABASE_SERVICE_ROLE_KEY='YOUR_SUPABASE_SERVICE_ROLE_KEY' +export SSH_KEY="$HOME/.ssh/id_ed25519_jsc" diff --git a/eval/snapshot_download.py b/eval/snapshot_download.py new file mode 100644 index 00000000..fcd402d9 --- /dev/null +++ b/eval/snapshot_download.py @@ -0,0 +1,150 @@ +import os +import sys +import argparse +from huggingface_hub import snapshot_download + +def is_valid_task_dir(path): + """ + Check if a directory is a valid task directory by verifying it has instruction.md + and excludes Git-related files. 
+ """ + # Skip Git-related files and directories + basename = os.path.basename(path) + if basename.startswith('.git') or basename in {'.gitignore', '.gitattributes', '.github'}: + return False + + # Check if it's a directory and has instruction.md + return (os.path.isdir(path) and + os.path.isfile(os.path.join(path, 'instruction.md'))) + +def get_dataset_path(repo_id): + """ + Get the path to the dataset without downloading + Returns the path if it exists, None otherwise + + Args: + repo_id (str): The repository ID to look for + """ + + # Construct the path using HF_HUB_CACHE environment variable + hf_cache = os.getenv('HF_HUB_CACHE', os.path.expanduser('~/.cache/huggingface/hub')) + dataset_path = os.path.join(hf_cache, f"datasets--{repo_id.replace('/', '--')}") + + # Find the latest snapshot + snapshots_dir = os.path.join(dataset_path, 'snapshots') + if os.path.exists(snapshots_dir): + try: + snapshots = [d for d in os.listdir(snapshots_dir) if os.path.isdir(os.path.join(snapshots_dir, d))] + except OSError as e: + print(f"Could not read snapshots directory {snapshots_dir}: {e}", file=sys.stderr) + snapshots = [] + if snapshots: + latest_snapshot = sorted(snapshots)[-1] # Get the latest snapshot + snapshot_path = os.path.join(snapshots_dir, latest_snapshot) + + # Verify we have valid task directories + if os.path.exists(snapshot_path): + task_dirs = [d for d in os.listdir(snapshot_path) + if is_valid_task_dir(os.path.join(snapshot_path, d))] + + if task_dirs: + print(f"Found dataset at {snapshot_path}") + print(f"Found {len(task_dirs)} valid task directories") + return snapshot_path + else: + print("No valid task directories found in snapshot") + return None + + print("Dataset not found, downloading...") + return None + +def download_sandboxes_dataset(repo_id, local_dir=None, cache_dir=None): + """ + Download the dataset using snapshot_download + + Args: + repo_id (str): The repository ID (e.g., 'mlfoundations-dev/sandboxes-tasks') + local_dir (str, optional): 
Local directory to save the dataset + cache_dir (str, optional): Cache directory for huggingface hub + """ + + try: + print(f"Starting download of {repo_id}...") + + # Download the entire dataset repository + local_path = snapshot_download( + repo_id=repo_id, + repo_type="dataset", + local_dir=local_dir, + cache_dir=cache_dir, + ) + + # Remove .gitattributes file if it exists + gitattributes_path = os.path.join(local_path, '.gitattributes') + if os.path.exists(gitattributes_path): + os.remove(gitattributes_path) + print("Removed .gitattributes file") + + # Verify we have valid task directories + task_dirs = [d for d in os.listdir(local_path) + if is_valid_task_dir(os.path.join(local_path, d))] + + if task_dirs: + print(f"DATASET_PATH={local_path}") + print(f"Found {len(task_dirs)} valid task directories") + return local_path + else: + print("No valid task directories found in downloaded dataset") + return None + + except Exception as e: + print(f"Error downloading dataset: {e}") + return None + + +def main(): + parser = argparse.ArgumentParser(description='Download or locate a Hugging Face dataset') + parser.add_argument('repo_id', help='Repository ID (e.g., mlfoundations-dev/clean-sandboxes-tasks)') + parser.add_argument('--local-dir', help='Local directory to save the dataset') + parser.add_argument('--cache-dir', help='Cache directory for huggingface hub') + + args = parser.parse_args() + + path = None + if args.local_dir: + # When --local-dir is specified, download real files (no symlinks) + # Check if local_dir already has valid task dirs + if os.path.isdir(args.local_dir): + task_dirs = [d for d in os.listdir(args.local_dir) + if is_valid_task_dir(os.path.join(args.local_dir, d))] + if task_dirs: + print(f"Found existing dataset at {args.local_dir} with {len(task_dirs)} tasks") + path = args.local_dir + if not path: + print("Downloading dataset to local dir (real files, no symlinks)...", file=sys.stderr) + path = download_sandboxes_dataset( + 
repo_id=args.repo_id, + local_dir=args.local_dir, + cache_dir=args.cache_dir + ) + else: + # First try to get existing cached path + path = get_dataset_path(args.repo_id) + if not path: + # If not found, download it + print("Dataset not found, downloading...", file=sys.stderr) + path = download_sandboxes_dataset( + repo_id=args.repo_id, + local_dir=args.local_dir, + cache_dir=args.cache_dir + ) + + if path: + print(f"DATASET_PATH={path}") + return 0 + else: + print("Failed to download dataset", file=sys.stderr) + return 1 + +if __name__ == "__main__": + sys.exit(main()) diff --git a/eval/test_dp_eval.sh b/eval/test_dp_eval.sh new file mode 100644 index 00000000..28ec1d71 --- /dev/null +++ b/eval/test_dp_eval.sh @@ -0,0 +1,64 @@ +#!/bin/bash +# ============================================================================== +# Test: vLLM native Data Parallel eval (DP=4, TP=1) on single node +# +# Uses vLLM's --data-parallel-size flag to run 4 model replicas (one per GPU), +# each with TP=1. vLLM load-balances requests across replicas internally. +# Harbor sees a single endpoint — no sharding needed. 
+#
+# Usage:
+#   bash eval/test_dp_eval.sh
+#
+# To clean up DB entries from failed tests:
+#   python -c "
+#     from database.unified_db.utils import delete_sandbox_job_by_id, load_supabase_keys
+#     load_supabase_keys()
+#     delete_sandbox_job_by_id('<job_id>')
+#   "
+# ==============================================================================
+
+set -euo pipefail
+
+MODEL="DCAgent/a1-nl2bash"
+DATASET="DCAgent/dev_set_v2"
+BENCHMARK_ID="b94dfab2-c438-4c32-ba29-23e46d566763"
+
+# DP config: 4 replicas, TP=1 each, 16 concurrent harbor trials per replica = 64 total
+DP_SIZE=4
+TP_SIZE=1
+N_CONCURRENT=64  # 16 per DP replica × 4 replicas
+GPU_MEMORY_UTIL=0.9
+TIMEOUT_MULTIPLIER=2.0
+
+echo "=== vLLM Native DP Test ==="
+echo "Model: $MODEL"
+echo "Dataset: $DATASET"
+echo "DP=$DP_SIZE, TP=$TP_SIZE, N_CONCURRENT=$N_CONCURRENT"
+echo ""
+
+# Submit using the v6 sbatch with DP env vars (NOTE(review): the eval/MBZ/ sbatch and log paths below predate the eval/ migration — confirm they still exist)
+EVAL_VLLM_TENSOR_PARALLEL_SIZE=$TP_SIZE \
+EVAL_VLLM_DATA_PARALLEL_SIZE=$DP_SIZE \
+EVAL_N_CONCURRENT=$N_CONCURRENT \
+EVAL_GPU_MEMORY_UTIL=$GPU_MEMORY_UTIL \
+EVAL_ENABLE_THINKING=true \
+EVAL_TIMEOUT_MULTIPLIER=$TIMEOUT_MULTIPLIER \
+EVAL_VLLM_MAX_RETRIES=10 \
+EVAL_DAYTONA_THRESHOLD=10 \
+EVAL_CONDA_ENV=otagent2 \
+EVAL_AUTO_SNAPSHOT=true \
+EVAL_CONFIG_YAML=dcagent_eval_config_no_override.yaml \
+sbatch \
+  --time 24:00:00 \
+  --partition main \
+  --gres gpu:4 \
+  --cpus-per-task=32 \
+  --job-name data_dp_test \
+  --output eval/MBZ/logs/data_dp_test_%j.out \
+  eval/MBZ/unified_eval_harbor_v6.sbatch \
+  "$MODEL" "$DATASET" "$BENCHMARK_ID" ""
+
+echo "Submitted!
Monitor with:" +echo " squeue -u \$USER" +echo " tail -f eval/MBZ/logs/data_dp_test_*.out" +echo " tail -f experiments/logs/vllm_*.log" diff --git a/eval/unified_eval_harbor.sbatch b/eval/unified_eval_harbor.sbatch new file mode 100644 index 00000000..c076c32a --- /dev/null +++ b/eval/unified_eval_harbor.sbatch @@ -0,0 +1,1014 @@ +#!/bin/bash +#SBATCH -p booster +#SBATCH --time=12:00:00 +#SBATCH --signal=B:TERM@120 +#SBATCH --nodes 1 +#SBATCH --ntasks-per-node 1 +#SBATCH --cpus-per-task=24 +#SBATCH --gres=gpu:2 +#SBATCH --output=eval/local/logs/%x_%j.out +#SBATCH --job-name=eval + +# ============================================================================== +# Unified Eval Harbor v4 — Jupiter Cluster (JSC GH200) +# +# Starts vLLM, runs Harbor eval, checks errors, uploads results. +# Merged from Jupiter v1 env setup + TACC v4 features (configurable params, +# Pending→Started DB flow, unified error checking, thinking, export-traces). +# +# Positional args: +# $1 = MODEL (HF model name, e.g. mlfoundations-dev/some_model) +# $2 = REPO_ID (HF dataset repo, e.g. 
DCAgent/dev_set_v2) +# $3 = BENCHMARK_ID (optional, DB benchmark UUID) +# $4 = RUN_TAG_ARG (optional, override run tag) +# +# Env vars from listener (v4 SbatchParams.to_env()): +# EVAL_N_CONCURRENT (default: 128) +# EVAL_N_ATTEMPTS (default: 3) +# EVAL_GPU_MEMORY_UTIL (default: 0.95) +# EVAL_DAYTONA_THRESHOLD (default: 999999) +# EVAL_VLLM_MAX_RETRIES (default: 20) +# EVAL_AGENT_PARSER (default: "") +# EVAL_ENABLE_THINKING (default: "false") +# EVAL_AGENT_NAME (default: "terminus-2") +# EVAL_STARTS_LOG (optional, shared eval starts log) +# EVAL_TIMEOUT_MULTIPLIER (default: 1.0) +# EVAL_CONFIG_YAML (default: dcagent_eval_config.yaml) +# EVAL_DB_JOB_ID (optional, pre-created Pending job ID) +# EVAL_UPLOAD_USERNAME (optional, override upload username) +# EVAL_OVERRIDE_MEMORY_MB (optional, memory override for harbor) +# EVAL_AUTO_SNAPSHOT (optional, "true"/"false" override for auto_snapshot) +# EVAL_SNAPSHOT_NAME (optional, Daytona snapshot template) +# ============================================================================== + +set -eo pipefail +ulimit -c 0 # Disable core dumps +ulimit -n 65536 2>/dev/null || true + +TIMESTAMP=$(date +'%Y%m%d_%H%M%S') + +# --- Parse positional args --- +MODEL="${1:-mlfoundations-dev/claude_3_7_20250219_tbench_traces_sharegptv1}" +REPO_ID="${2:-DCAgent/dev_set_71_tasks}" +BENCHMARK_ID="${3:-}" +RUN_TAG_ARG="${4:-}" + +# --- Read env vars from listener (with defaults) --- +N_CONCURRENT="${EVAL_N_CONCURRENT:-128}" +N_ATTEMPTS="${EVAL_N_ATTEMPTS:-3}" +GPU_MEMORY_UTIL="${EVAL_GPU_MEMORY_UTIL:-0.95}" +ERROR_THRESHOLD="${EVAL_DAYTONA_THRESHOLD:-999999}" +VLLM_MAX_RETRIES="${EVAL_VLLM_MAX_RETRIES:-20}" +AGENT_PARSER="${EVAL_AGENT_PARSER:-}" +ENABLE_THINKING="${EVAL_ENABLE_THINKING:-false}" +AGENT_NAME="${EVAL_AGENT_NAME:-terminus-2}" +EVAL_STARTS_LOG="${EVAL_STARTS_LOG:-}" +TIMEOUT_MULTIPLIER="${EVAL_TIMEOUT_MULTIPLIER:-1.0}" +CONFIG_YAML="${EVAL_CONFIG_YAML:-dcagent_eval_config.yaml}" +DB_JOB_ID="${EVAL_DB_JOB_ID:-}" 
+UPLOAD_USERNAME="${EVAL_UPLOAD_USERNAME:-$USER}" + +# Strip slashes and special chars for file-safe names +SAFE_MODEL=$(echo "$MODEL" | tr '/:' '_') +if [[ "$REPO_ID" == /* ]]; then + # Local path: use basename as the safe repo name + SAFE_REPO=$(basename "$REPO_ID") +else + SAFE_REPO=$(echo "$REPO_ID" | tr '/:' '_') +fi + +# Derive benchmark shorthand for SLURM job name (for squeue readability) +# Listener passes --job-name to override, but for manual runs derive from REPO_ID. +declare -A BENCH_SHORT=( + ["DCAgent_dev_set_v2"]="v2" + ["DCAgent2_swebench-verified-random-100-folders"]="swe" + ["DCAgent2_terminal_bench_2"]="tb2" + ["DCAgent2_aider_polyglot"]="aider" + ["DCAgent2_bfcl-parity"]="bfcl" + ["DCAgent_dev_set_71_tasks"]="v1" +) +BENCH_TAG="${BENCH_SHORT[$SAFE_REPO]:-${SAFE_REPO:0:12}}" +# Only rename if still the default "eval" (i.e. listener didn't override) +if [ "$SLURM_JOB_NAME" = "eval" ]; then + scontrol update JobId="$SLURM_JOB_ID" JobName="eval_${BENCH_TAG}" +fi + +echo "==============================================" +echo "Jupiter Eval Harbor v4" +echo "==============================================" +echo "Model: $MODEL" +echo "Dataset: $REPO_ID" +echo "Benchmark ID: ${BENCHMARK_ID:-}" +echo "N concurrent: $N_CONCURRENT" +echo "N attempts: $N_ATTEMPTS" +echo "GPU memory util: $GPU_MEMORY_UTIL" +echo "Error threshold: $ERROR_THRESHOLD" +echo "vLLM max retries: $VLLM_MAX_RETRIES" +echo "Agent: $AGENT_NAME" +echo "Agent parser: ${AGENT_PARSER:-}" +echo "Thinking: $ENABLE_THINKING" +echo "Timeout multiplier: $TIMEOUT_MULTIPLIER" +echo "Config YAML: $CONFIG_YAML" +echo "DB Job ID (pending): ${DB_JOB_ID:-}" +echo "Upload username: $UPLOAD_USERNAME" +echo "==============================================" + +# ============================================================================== +# Cluster-Agnostic Environment Setup +# ============================================================================== + +DCFT="${EVAL_PROJECT_ROOT:-${DCFT:?ERROR: 
EVAL_PROJECT_ROOT not set. Launch via the listener with --cluster-config.}}" +CLUSTER_NAME="${EVAL_CLUSTER_NAME:?ERROR: EVAL_CLUSTER_NAME not set. Launch via the listener with --cluster-config.}" + +# Reset PYTHONPATH to avoid inheriting submitter's environment (e.g. numpy 2.4 +# from guha1/site-packages which breaks numba/vLLM). +unset PYTHONPATH + +# Preserve listener-provided overrides before dotenv clobbers them +_SAVED_SECRET_ENV="${DC_AGENT_SECRET_ENV:-}" + +# Source cluster-specific dotenv (e.g. hpc/dotenv/jupiter.env or hpc/dotenv/mbz.env) +DOTENV_FILE="$DCFT/hpc/dotenv/${CLUSTER_NAME}.env" +if [ -f "$DOTENV_FILE" ]; then + source "$DOTENV_FILE" + echo "Sourced dotenv: $DOTENV_FILE" +else + echo "WARNING: dotenv not found: $DOTENV_FILE" +fi + +# Source secrets (listener override takes precedence over dotenv) +DC_AGENT_SECRET_ENV="${_SAVED_SECRET_ENV:-${DC_AGENT_SECRET_ENV:-$HOME/secrets.env}}" +if [ -f "$DC_AGENT_SECRET_ENV" ]; then + source "$DC_AGENT_SECRET_ENV" +fi + +# --- Daytona API key (sourced from secrets.env above) --- +if [ -z "${DAYTONA_API_KEY:-}" ]; then + echo "WARNING: DAYTONA_API_KEY not set. Check $DC_AGENT_SECRET_ENV" +fi +echo "Daytona API key: ${DAYTONA_API_KEY:0:12}..." + +# --- vLLM / Ray / Triton env vars --- +export VLLM_USE_V1=1 +export RAY_RUNTIME_ENV_HOOK=ray._private.runtime_env.uv_runtime_env_hook.hook +export VLLM_CONFIG_ROOT="${VLLM_CACHE_ROOT:-/tmp/vllm_config_${USER}}" +export TRITON_CACHE_DIR="${TRITON_CACHE_DIR:-/tmp/triton_cache_${USER}}" +export FLASHINFER_WORKSPACE_BASE="${FLASHINFER_CACHE_DIR:-/tmp/flashinfer_cache_${USER}}" +export UV_CACHE_DIR="${UV_CACHE_DIR:-/tmp/uv_cache_${USER}}" +export HYDRA_FULL_ERROR=1 +# Use naive all2all backend for DP (pplx_kernels requires separate install) +export VLLM_ALL2ALL_BACKEND="${VLLM_ALL2ALL_BACKEND:-naive}" +# Model/dataset cache — overridden by EVAL_HF_CACHE from listener +export HF_HUB_CACHE="${EVAL_HF_CACHE:-${HF_HUB_CACHE:?ERROR: HF_HUB_CACHE not set. 
Source your cluster dotenv or use --cluster-config.}}" +export HF_HOME="${HF_HUB_CACHE}" +export HF_CACHE_DIR="$HF_HUB_CACHE" +# XET staging cache — must be user-writable +export HF_XET_CACHE="${HF_XET_CACHE:-/tmp/hf_xet_cache_${USER}}" +mkdir -p "$HF_XET_CACHE" + +# Harbor and DB operations +HARBOR_SRC="${EVAL_HARBOR_SRC:-}" +export PYTHONPATH="${HARBOR_SRC}:${DCFT}:${PYTHONPATH:-}" + +# ============================================================================== +# LD_LOADER Wrappers (needed on aarch64 Jupiter where shared conda lacks exec perms) +# ============================================================================== + +OTAGENT_DIR="${OTAGENT_DIR:?ERROR: OTAGENT_DIR not set. Use --conda-env with the listener.}" +export PATH="$OTAGENT_DIR/bin:$PATH" +export CONDA_PREFIX="$OTAGENT_DIR" + +PYTHON_REAL="$OTAGENT_DIR/bin/python3.12" + +# LD_LOADER Wrappers: required on aarch64 (Jupiter GH200) where shared conda +# lacks execute permissions. Skipped on x86_64 where the conda env is directly usable. +if [ "$(uname -m)" = "aarch64" ] && [ -f /lib/ld-linux-aarch64.so.1 ]; then + LD_LOADER="/lib/ld-linux-aarch64.so.1" + WRAPPER_DIR="/tmp/eval_wrappers_${SLURM_JOB_ID}" + mkdir -p "$WRAPPER_DIR" + + cat > "$WRAPPER_DIR/python3" < "$WRAPPER_DIR/python" < "$WRAPPER_DIR/harbor" </dev/null +export RAY_DEDUP_LOGS=0 + +# Set CUDA_HOME for flashinfer JIT compilation (needs nvcc) +# The CUDA module provides nvcc at this path on Jupiter GH200 nodes. 
+_CUDA_HOME="${EVAL_CUDA_HOME:-/e/software/default/stages/2026/software/CUDA/13}" +if [ -n "${_CUDA_HOME}" ] && [ -d "${_CUDA_HOME}" ]; then + export CUDA_HOME="${_CUDA_HOME}" + export PATH="$CUDA_HOME/bin:$PATH" +fi + + +# ============================================================================== +# SSH Tunnel + Proxychains Setup (no internet on compute nodes) +# ============================================================================== + +NODE_HOST=$(hostname -s) +TUNNEL_PORT=7003 +LOGIN_NODE="${EVAL_LOGIN_NODE:-jpbl-s01-02}" +PROXYCHAINS_BIN="${EVAL_PROXYCHAINS_BIN:-}" +TUNNEL_PID="" + +setup_proxy() { + echo "[proxy] Setting up SSH tunnel to $LOGIN_NODE..." + + if [ -z "${SSH_KEY:-}" ]; then + echo "[proxy] WARNING: SSH_KEY not set, skipping tunnel setup" + echo "[proxy] External connectivity (Daytona, HF) will fail" + return 1 + fi + + # Get node IP for multi-node access + NODE_IP=$(nslookup "$NODE_HOST" 2>/dev/null | grep 'Address' | tail -n1 | awk '{print $2}') + NODE_IP="${NODE_IP:-127.0.0.1}" + + # Create SSH tunnel with SOCKS5 proxy + ssh -g -f -N -D ${TUNNEL_PORT} \ + -o StrictHostKeyChecking=no \ + -o ConnectTimeout=1000 \ + -o ServerAliveInterval=10 \ + -o ServerAliveCountMax=30 \ + -o TCPKeepAlive=yes \ + -o ExitOnForwardFailure=yes \ + -o BatchMode=yes \ + -i "${SSH_KEY}" \ + "${USER}@${LOGIN_NODE}" + + sleep 5 + + # Verify tunnel + if pgrep -f "ssh.*-D.*${TUNNEL_PORT}" > /dev/null; then + echo "[proxy] SSH tunnel started successfully" + TUNNEL_PID=$(pgrep -f "ssh.*-D.*${TUNNEL_PORT}" | head -1) + else + echo "[proxy] ERROR: SSH tunnel failed to start" + return 1 + fi + + # Generate proxychains config + CFG_PATH="$HOME/.proxychains/proxychains_${SLURM_JOB_ID}.conf" + mkdir -p "$HOME/.proxychains" + + cat > "$CFG_PATH" </dev/null; then + echo "[proxy] HuggingFace connectivity test passed" + else + echo "[proxy] WARNING: HuggingFace connectivity test failed" + fi + fi + + # Test Daytona API connectivity + if [ -n "${DAYTONA_API_KEY:-}" ]; then 
+ echo "[daytona] Testing API connectivity..." + DAYTONA_HTTP_CODE=$("$PROXYCHAINS_BIN" -f "$CFG_PATH" curl -s --connect-timeout 15 \ + -H "Authorization: Bearer $DAYTONA_API_KEY" \ + "${DAYTONA_API_URL:-https://app.daytona.io/api}/health" \ + -o /tmp/daytona_health_$$.json -w "%{http_code}" 2>/dev/null) || DAYTONA_HTTP_CODE="FAIL" + echo "[daytona] Health check HTTP code: $DAYTONA_HTTP_CODE" + if [ -f /tmp/daytona_health_$$.json ]; then + echo "[daytona] Response: $(cat /tmp/daytona_health_$$.json | head -c 200)" + rm -f /tmp/daytona_health_$$.json + fi + if [ "$DAYTONA_HTTP_CODE" = "200" ]; then + echo "[daytona] API connectivity OK" + else + echo "[daytona] WARNING: API returned $DAYTONA_HTTP_CODE — sandbox creation may fail" + fi + fi + + echo "[proxy] Setup complete" + return 0 +} + +# Helper: run command through proxychains +proxied() { + if [ -x "$PROXYCHAINS_BIN" ] && [ -n "${PROXYCHAINS_CONF_FILE:-}" ]; then + "$PROXYCHAINS_BIN" -f "$PROXYCHAINS_CONF_FILE" "$@" + else + "$@" + fi +} + +# Setup proxy (skip if EVAL_PROXY_ENABLED=false, e.g. clusters with direct internet) +if [ "${EVAL_PROXY_ENABLED:-true}" = "true" ]; then + setup_proxy || echo "[proxy] Continuing without proxy (some operations may fail)" +else + echo "[proxy] Proxy disabled (EVAL_PROXY_ENABLED=false), assuming direct internet access" +fi + +# ============================================================================== +# Cleanup trap +# ============================================================================== +cleanup() { + echo "Cleaning up..." 
+  # Kill vLLM
+  if [ -n "${VLLM_PID:-}" ]; then
+    kill "$VLLM_PID" 2>/dev/null || true
+  fi
+  # Kill SSH tunnel
+  if [ -n "${TUNNEL_PID:-}" ]; then
+    kill "$TUNNEL_PID" 2>/dev/null || true
+  fi
+  # Remove wrappers and per-job cache
+  [ -n "${WRAPPER_DIR:-}" ] && rm -rf "$WRAPPER_DIR" 2>/dev/null || true
+  [ -n "${_JOB_CACHE:-}" ] && rm -rf "$_JOB_CACHE" 2>/dev/null || true
+  # Remove proxychains config
+  rm -f "${PROXYCHAINS_CONF_FILE:-}" 2>/dev/null || true
+  echo "Cleanup done."
+}
+trap cleanup EXIT
+
+# ==============================================================================
+# Start vLLM Server
+# ==============================================================================
+EVAL_LOGS_DIR="${EVAL_LOGS_DIR:-eval/local/logs}"
+mkdir -p "$EVAL_LOGS_DIR"
+
+# Derive unique vLLM port. When the listener packs jobs (EVAL_VLLM_PORT is set),
+# use the listener-assigned port directly — it centrally plans non-overlapping
+# port ranges across all jobs on a node. Otherwise fall back to job-ID-based port.
+# IMPORTANT: Do NOT export VLLM_PORT as an env var — vLLM's _get_open_port()
+# reads the VLLM_PORT env var and all DP subprocess children would try to bind
+# the same port, causing an infinite port-collision loop.
+_PHYSICAL_GPUS="${SLURM_JOB_GPUS:-0}"
+FIRST_GPU=$(echo "$_PHYSICAL_GPUS" | cut -d',' -f1)
+if [ -n "${EVAL_VLLM_PORT:-}" ]; then
+  VLLM_PORT="$EVAL_VLLM_PORT"
+else
+  VLLM_PORT=$(( 10000 + (SLURM_JOB_ID % 50000) ))
+fi
+export -n VLLM_PORT # ensure no VLLM_PORT env var leaks to subprocesses
+echo "Physical GPUs: $_PHYSICAL_GPUS (CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES), vLLM port: $VLLM_PORT"
+
+# Pre-download model via proxychains (compute nodes have no direct internet)
+echo "Pre-downloading model: $MODEL"
+proxied $PYTHON_BIN -c "
+from huggingface_hub import snapshot_download
+import os
+cache = os.environ.get('HF_HUB_CACHE')
+path = snapshot_download('$MODEL', cache_dir=cache)
+print(f'Model cached at: {path}')
+"
+if [ $?
-ne 0 ]; then + echo "ERROR: Model pre-download failed for $MODEL. Exiting." + exit 1 +fi + +echo "Starting vLLM server for model: $MODEL" +source "$DCFT/eval/build_vllm_cmd.sh" +build_vllm_cmd "$PYTHON_BIN" "$MODEL" "$GPU_MEMORY_UTIL" + +# Per-job cache dirs to avoid JIT compilation conflicts when packing +_JOB_CACHE="/tmp/${USER}_${SLURM_JOB_ID}" +mkdir -p "$_JOB_CACHE" + +# With DP > 1, vLLM needs torch.compile (TORCHDYNAMO_DISABLE must be unset) +_DP_SIZE="${EVAL_VLLM_DATA_PARALLEL_SIZE:-1}" +_DYNAMO_DISABLE=1 +if [ "$_DP_SIZE" -gt 1 ] 2>/dev/null; then + _DYNAMO_DISABLE=0 +fi + +env TORCHDYNAMO_DISABLE=$_DYNAMO_DISABLE \ + TMPDIR="$_JOB_CACHE" \ + TRITON_CACHE_DIR="$_JOB_CACHE/triton" \ + TORCH_COMPILE_CACHE_DIR="$_JOB_CACHE/torch" \ + TORCHINDUCTOR_CACHE_DIR="$_JOB_CACHE/torchinductor" \ + RAY_TMPDIR="$_JOB_CACHE/ray" \ + HF_HOME="$HF_HOME" \ + HF_HUB_CACHE="$HF_HUB_CACHE" \ + HF_HUB_OFFLINE=1 \ +"${VLLM_CMD[@]}" \ + > "$EVAL_LOGS_DIR/vllm_${SLURM_JOB_ID}.log" 2>&1 & +VLLM_PID=$! + +# Health check loop (use configurable retries) +MAX_RETRIES=$VLLM_MAX_RETRIES +RETRY_INTERVAL=100 +for i in $(seq 1 $MAX_RETRIES); do + if curl -s http://localhost:${VLLM_PORT}/v1/models > /dev/null 2>&1; then + echo "vLLM server is ready" + break + fi + echo "Waiting for vLLM server (attempt $i/$MAX_RETRIES)..." + sleep $RETRY_INTERVAL + if [ $i -eq $MAX_RETRIES ]; then + echo "ERROR: vLLM server failed to start" + # Dump last 50 lines of vLLM log + tail -50 "$EVAL_LOGS_DIR/vllm_${SLURM_JOB_ID}.log" 2>/dev/null || true + exit 1 + fi +done + +# ============================================================================== +# Download dataset (via proxychains for HF access) +# Use --local-dir to get real files instead of symlinks (Daytona needs real files) +# Supports local paths: if REPO_ID starts with /, use it directly. 
+# ============================================================================== +if [[ "$REPO_ID" == /* ]]; then + # Local dataset path + echo "Using local dataset path: $REPO_ID" + DATASET_PATH="$REPO_ID" + if [ ! -d "$DATASET_PATH" ]; then + echo "ERROR: Local dataset path does not exist: $DATASET_PATH" + exit 1 + fi + TASK_COUNT=$(ls -d "$DATASET_PATH"/*/instruction.md 2>/dev/null | wc -l) + echo "[DEBUG] Found $TASK_COUNT tasks with instruction.md" +else + echo "Downloading/locating dataset: $REPO_ID" + # Check dataset directories (from EVAL_DATASETS_DIRS or fallback) + _DS_DIRS="${EVAL_DATASETS_DIRS:?ERROR: EVAL_DATASETS_DIRS not set. Set datasets_dirs in cluster config YAML.}" + DATASET_LOCAL_DIR="" + IFS=':' read -ra _DS_DIRS_ARR <<< "$_DS_DIRS" + for _ds_dir in "${_DS_DIRS_ARR[@]}"; do + if [ -d "${_ds_dir}/${SAFE_REPO}" ]; then + DATASET_LOCAL_DIR="${_ds_dir}/${SAFE_REPO}" + break + fi + done + # Default to first dir if nothing found (snapshot_download will create it) + DATASET_LOCAL_DIR="${DATASET_LOCAL_DIR:-${_DS_DIRS_ARR[0]}/${SAFE_REPO}}" + DATASET_LOCAL_DIR_ALT="" # no longer needed; absorbed into EVAL_DATASETS_DIRS + echo "[DEBUG] DATASET_LOCAL_DIR=$DATASET_LOCAL_DIR" + echo "[DEBUG] Local dir exists: $([ -d "$DATASET_LOCAL_DIR" ] && echo yes || echo no)" + if [ -d "$DATASET_LOCAL_DIR" ]; then + TASK_COUNT=$(ls -d "$DATASET_LOCAL_DIR"/*/instruction.md 2>/dev/null | wc -l) + echo "[DEBUG] Found $TASK_COUNT tasks with instruction.md in local dir" + echo "[DEBUG] Sample files:" + file "$DATASET_LOCAL_DIR"/$(ls "$DATASET_LOCAL_DIR" | head -1)/environment/Dockerfile 2>/dev/null || echo "[DEBUG] No Dockerfile found" + fi + DOWNLOAD_LOG=$(mktemp /tmp/download_XXXXXX.log) + proxied $PYTHON_BIN "$DCFT/eval/snapshot_download.py" "$REPO_ID" --local-dir "$DATASET_LOCAL_DIR" > "$DOWNLOAD_LOG" 2>&1 + DOWNLOAD_EXIT=$? 
+  cat "$DOWNLOAD_LOG"
+  DATASET_PATH=$(grep DATASET_PATH "$DOWNLOAD_LOG" | tail -n 1 | cut -d'=' -f2-)
+  rm -f "$DOWNLOAD_LOG"
+  if [ $DOWNLOAD_EXIT -ne 0 ] || [ -z "${DATASET_PATH:-}" ]; then
+    echo "ERROR: Failed to get dataset path (exit code: $DOWNLOAD_EXIT)"
+    exit 1
+  fi
+fi
+echo "Using dataset path: $DATASET_PATH"
+echo "[DEBUG] Dataset source: $([ -L "$DATASET_PATH/$(ls "$DATASET_PATH" | head -1)/environment/Dockerfile" ] && echo 'SYMLINK' || echo 'REAL FILE')"
+echo "[DEBUG] Task count in dataset: $(ls -d "$DATASET_PATH"/*/instruction.md 2>/dev/null | wc -l)"
+
+# ==============================================================================
+# Construct run tag and directory
+# ==============================================================================
+if [ -n "$RUN_TAG_ARG" ]; then
+  RUN_TAG="$RUN_TAG_ARG"
+else
+  RUN_TAG="${SAFE_REPO}_${SAFE_MODEL}"
+fi
+EVAL_JOBS_DIR="${EVAL_JOBS_DIR:?ERROR: EVAL_JOBS_DIR not set. Set eval_jobs_dir in cluster config YAML.}"
+RUN_DIR="${EVAL_JOBS_DIR}/${RUN_TAG}"
+mkdir -p "$RUN_DIR"
+
+echo "Run tag: $RUN_TAG"
+echo "Run dir: $RUN_DIR"
+
+# --- Compute canonical benchmark name for DB uploads ---
+declare -A BENCHMARK_NAME_MAP=(
+  ["DCAgent2_aider_polyglot"]="aider_polyglot"
+  ["DCAgent_dev_set_v2"]="dev_set_v2"
+  ["DCAgent_dev_set_71_tasks"]="dev_set_71_tasks"
+  ["DCAgent2_terminal_bench_2"]="terminal_bench_2"
+  ["DCAgent_swebench_verified_eval_set"]="swebench-verified-random-100-folders"
+  ["DCAgent2_bfcl-parity"]="bfcl-parity"
+)
+BASE_NAME="${BENCHMARK_NAME_MAP[$SAFE_REPO]:-$SAFE_REPO}"
+
+# Append suffix for non-default timeout/memory configs.
+BENCHMARK_SUFFIX="" +if [ -n "${EVAL_OVERRIDE_MEMORY_MB:-}" ] && [ "$EVAL_OVERRIDE_MEMORY_MB" != "1024" ]; then + mem_gb=$(( EVAL_OVERRIDE_MEMORY_MB / 1024 )) + BENCHMARK_SUFFIX="${BENCHMARK_SUFFIX}_${mem_gb}gb" +fi +if [ "$TIMEOUT_MULTIPLIER" != "1" ] && [ "$TIMEOUT_MULTIPLIER" != "1.0" ]; then + BENCHMARK_SUFFIX="${BENCHMARK_SUFFIX}_${TIMEOUT_MULTIPLIER}x" +fi +if [ -n "$BENCHMARK_SUFFIX" ]; then + BENCHMARK_NAME="${BASE_NAME}${BENCHMARK_SUFFIX}" +else + BENCHMARK_NAME="$BASE_NAME" +fi +export BENCHMARK_NAME +echo "Benchmark name: $BENCHMARK_NAME" + +# ============================================================================== +# Update DB: Pending → Started (v4 flow) +# ============================================================================== +echo "Creating/updating DB job entry..." +export MODEL REPO_ID RUN_TAG SLURM_JOB_ID BENCHMARK_NAME + +HARBOR_VERSION=$($PYTHON_BIN -c "import harbor; print(harbor.__version__)" 2>/dev/null || echo "unknown") +export HARBOR_VERSION + +if [ -n "$DB_JOB_ID" ]; then + # v4 flow: listener already created a Pending entry — transition to Started + echo "Updating Pending job $DB_JOB_ID → Started (run_tag=$RUN_TAG)" + proxied $PYTHON_BIN - <<'PY' || true +import os, sys, json +from database.unified_db.utils import update_job_status_to_started + +run_tag = os.environ["RUN_TAG"] +agent_name = os.environ.get("EVAL_AGENT_NAME", "terminus-2") +n_concurrent = int(os.environ.get("EVAL_N_CONCURRENT", "128")) +n_attempts = int(os.environ.get("EVAL_N_ATTEMPTS", "3")) +timeout_multiplier = float(os.environ.get("EVAL_TIMEOUT_MULTIPLIER", "1.0")) +harbor_version = os.environ.get("HARBOR_VERSION", "unknown") + +config = { + "agent": agent_name, + "env": "daytona", + "timeout_multiplier": timeout_multiplier, +} + +result = update_job_status_to_started( + job_name=run_tag, + n_trials=n_concurrent, + n_rep_eval=n_attempts, + config=config, + harbor_package_version=harbor_version, +) +if not result.get("success"): + print(f"WARNING: 
Pending→Started update failed: {result.get('error')}", file=sys.stderr) +else: + print(f"DB job {run_tag} → Started") +PY +else + # Fallback: create Started entry directly (backward compat / manual runs) + echo "No EVAL_DB_JOB_ID — creating Started entry directly" + proxied $PYTHON_BIN - <<'PY' || true +import os, sys + +from database.unified_db.utils import create_job_entry_started + +model_hf = os.environ["MODEL"] +dataset_hf = os.environ["REPO_ID"] +run_tag = os.environ["RUN_TAG"] +slurm_job_id = os.environ["SLURM_JOB_ID"] +harbor_version = os.environ.get("HARBOR_VERSION", "unknown") +agent_name = os.environ.get("EVAL_AGENT_NAME", "terminus-2") +n_concurrent = int(os.environ.get("EVAL_N_CONCURRENT", "128")) +n_attempts = int(os.environ.get("EVAL_N_ATTEMPTS", "3")) + +result = create_job_entry_started( + model_hf_name=model_hf, + benchmark_hf_name=dataset_hf, + job_name=run_tag, + username=os.environ.get("USER", "jupiter"), + slurm_job_id=slurm_job_id, + harbor_package_version=harbor_version, + agent_name=agent_name, + config={"agent": agent_name, "env": "daytona"}, + n_trials=n_concurrent, + n_rep_eval=n_attempts +) + +if not result.get("success"): + print(f"WARNING: DB create failed: {result.get('error')}", file=sys.stderr) + sys.exit(0) + +db_job_id = result["job"]["id"] +print(f"DB job created with ID: {db_job_id}") +PY + + # Get the DB job ID for later use + DB_JOB_ID=$(proxied $PYTHON_BIN - <<'PY' || true +import os, sys + +from database.unified_db.utils import get_latest_job_for_model_benchmark + +model_hf = os.environ["MODEL"] +dataset_hf = os.environ["REPO_ID"] + +try: + result = get_latest_job_for_model_benchmark(model_hf, dataset_hf) + if result and result.get("id"): + print(result["id"]) +except Exception as e: + print(f"WARNING: DB lookup failed: {e}", file=sys.stderr) +PY +) +fi + +if [ -z "${DB_JOB_ID:-}" ]; then + echo "WARNING: Failed to get DB job ID" +else + echo "DB job entry: $DB_JOB_ID" +fi + +# 
============================================================================== +# Eval Starts Log (v4 — for model retry tracking) +# ============================================================================== +if [ -n "$EVAL_STARTS_LOG" ]; then + echo "${TIMESTAMP} ${MODEL} ${REPO_ID} ${SLURM_JOB_ID} ${RUN_TAG}" >> "$EVAL_STARTS_LOG" + echo "Logged eval start to: $EVAL_STARTS_LOG" +fi + +# ============================================================================== +# Run Harbor Eval (via proxychains for Daytona access) +# ============================================================================== +set +e + +# Resolve config path: check eval/configs/ (canonical), then legacy per-cluster dirs +if [[ "$CONFIG_YAML" != /* ]]; then + if [ -f "$DCFT/eval/configs/$CONFIG_YAML" ]; then + HARBOR_CONFIG="$DCFT/eval/configs/$CONFIG_YAML" + elif [ -f "$DCFT/eval/${CLUSTER_NAME}/$CONFIG_YAML" ]; then + HARBOR_CONFIG="$DCFT/eval/${CLUSTER_NAME}/$CONFIG_YAML" + elif [ -f "$DCFT/eval/MBZ/$CONFIG_YAML" ]; then + HARBOR_CONFIG="$DCFT/eval/MBZ/$CONFIG_YAML" + else + echo "ERROR: Harbor config not found: $CONFIG_YAML" + exit 1 + fi +else + HARBOR_CONFIG="$CONFIG_YAML" +fi +echo "Resolved harbor config: $HARBOR_CONFIG" + +# Build extra harbor args +EXTRA_HARBOR_ARGS="" +if [ -n "${EVAL_SNAPSHOT_NAME:-}" ]; then + echo "Using Daytona snapshot: $EVAL_SNAPSHOT_NAME" + EXTRA_HARBOR_ARGS="--environment-kwarg snapshot_template_name=$EVAL_SNAPSHOT_NAME --no-force-build" +fi +if [ "$TIMEOUT_MULTIPLIER" != "1" ] && [ "$TIMEOUT_MULTIPLIER" != "1.0" ]; then + echo "Using timeout multiplier: $TIMEOUT_MULTIPLIER" + EXTRA_HARBOR_ARGS="$EXTRA_HARBOR_ARGS --timeout-multiplier $TIMEOUT_MULTIPLIER" +fi +if [ -n "${EVAL_OVERRIDE_MEMORY_MB:-}" ]; then + echo "Using memory override: ${EVAL_OVERRIDE_MEMORY_MB}MB" + EXTRA_HARBOR_ARGS="$EXTRA_HARBOR_ARGS --override-memory-mb $EVAL_OVERRIDE_MEMORY_MB" +fi +# auto_snapshot override from listener (overrides YAML config value) +if [ -n 
"${EVAL_AUTO_SNAPSHOT:-}" ]; then + echo "Auto snapshot override: $EVAL_AUTO_SNAPSHOT" + EXTRA_HARBOR_ARGS="$EXTRA_HARBOR_ARGS --environment-kwarg auto_snapshot=$EVAL_AUTO_SNAPSHOT" +fi + +# Build thinking args +THINKING_ARGS="" +if [ "$ENABLE_THINKING" = "true" ]; then + echo "Thinking enabled" + THINKING_ARGS='--agent-kwarg "enable_thinking=true"' +fi + +# Build agent-parser args +PARSER_ARGS="" +if [ -n "$AGENT_PARSER" ]; then + echo "Agent parser: $AGENT_PARSER" + PARSER_ARGS="--agent-kwarg \"parser=$AGENT_PARSER\"" +fi + +echo "[DEBUG] EXTRA_HARBOR_ARGS=$EXTRA_HARBOR_ARGS" +echo "[DEBUG] DAYTONA_API_KEY=${DAYTONA_API_KEY:0:20}..." + +# Check if a previous job dir exists for this run tag (for resume) +# Skip resume if EVAL_FORCE_FRESH=true +EXISTING_JOB_DIR="${EVAL_JOBS_DIR}/${RUN_TAG}" +if [ "${EVAL_FORCE_FRESH:-false}" != "true" ] && [ -d "$EXISTING_JOB_DIR" ] && [ -f "$EXISTING_JOB_DIR/config.json" ]; then + echo "Found existing job dir, resuming: $EXISTING_JOB_DIR" + proxied $HARBOR_BIN jobs resume \ + -p "$EXISTING_JOB_DIR" \ + --filter-error-type EnvironmentStartTimeoutError \ + --filter-error-type DaytonaError \ + --filter-error-type DaytonaRateLimitError \ + --filter-error-type CancelledError \ + --filter-error-type AgentEnvironmentTimeoutError \ + --filter-error-type SandboxBuildFailedError +else + echo "Starting new job" + # Build command as array to handle quoting properly + HARBOR_CMD=( + proxied $HARBOR_BIN jobs start + -p "$DATASET_PATH" + --n-concurrent "$N_CONCURRENT" + --agent "$AGENT_NAME" + --model "hosted_vllm/$MODEL" + --env "daytona" + --agent-kwarg "api_base=http://localhost:${VLLM_PORT}/v1" + --agent-kwarg "key=fake_key" + --agent-kwarg "max_tokens=16384" + --agent-kwarg "model_info={\"max_output_tokens\":16384,\"max_input_tokens\":32768,\"input_cost_per_token\":0.0,\"output_cost_per_token\":0.0}" + --n-attempts "$N_ATTEMPTS" + --job-name "$RUN_TAG" + --export-traces + --config "$HARBOR_CONFIG" + --jobs-dir "$EVAL_JOBS_DIR" + --debug + ) 
+ # Add optional args + if [ "$ENABLE_THINKING" = "true" ]; then + HARBOR_CMD+=(--agent-kwarg "enable_thinking=true") + fi + if [ -n "$AGENT_PARSER" ]; then + HARBOR_CMD+=(--agent-kwarg "parser=$AGENT_PARSER") + fi + + echo "[DEBUG] Full harbor command:" + echo " ${HARBOR_CMD[*]} $EXTRA_HARBOR_ARGS" + + "${HARBOR_CMD[@]}" $EXTRA_HARBOR_ARGS +fi +SB_EXIT=$? +set -e + +# ============================================================================== +# Save meta.env (v4 — extended fields) +# ============================================================================== +# Track resume count +RESUME_COUNT=0 +if [ -f "$RUN_DIR/meta.env" ]; then + OLD_RESUME_COUNT=$(grep -oP 'RESUME_COUNT=\K[0-9]+' "$RUN_DIR/meta.env" 2>/dev/null || echo "0") + RESUME_COUNT=$((OLD_RESUME_COUNT + 1)) + echo "Resume count: $RESUME_COUNT (was $OLD_RESUME_COUNT)" +fi + +mkdir -p "$RUN_DIR" +{ + echo "MODEL=$MODEL" + echo "REPO_ID=$REPO_ID" + echo "TIMESTAMP=$TIMESTAMP" + echo "SLURM_JOB_ID=$SLURM_JOB_ID" + echo "DB_JOB_ID=${DB_JOB_ID:-}" + echo "BENCHMARK_ID=${BENCHMARK_ID:-}" + echo "N_CONCURRENT=$N_CONCURRENT" + echo "N_ATTEMPTS=$N_ATTEMPTS" + echo "GPU_MEMORY_UTIL=$GPU_MEMORY_UTIL" + echo "AGENT_NAME=$AGENT_NAME" + echo "AGENT_PARSER=${AGENT_PARSER:-}" + echo "ENABLE_THINKING=$ENABLE_THINKING" + echo "TIMEOUT_MULTIPLIER=$TIMEOUT_MULTIPLIER" + echo "BENCHMARK_NAME=$BENCHMARK_NAME" + echo "RESUME_COUNT=$RESUME_COUNT" +} > "$RUN_DIR/meta.env" + +# If eval failed, don't attempt upload +if [ ${SB_EXIT:-0} -ne 0 ]; then + echo "Harbor eval exited with non-zero status: ${SB_EXIT}. Skipping upload." + exit ${SB_EXIT} +fi + +# Ensure run dir exists +if [ ! 
-d "$RUN_DIR" ]; then + echo "Expected run directory not found: $RUN_DIR" + exit 2 +fi + +# ============================================================================== +# Check for errors before upload (v4 unified error checking) +# Counts ALL errors except known-benign types: +# AgentTimeoutError, BadRequestError, ContextLengthExceededError, +# SummarizationTimeout, SummarizationTimeoutError +# NOTE: AgentEnvironmentTimeoutError is NOT benign (infra issue, not model's fault) +# but is excluded from retry (retrying won't help) +# ============================================================================== +RESULT_FILE="$RUN_DIR/result.json" +ERROR_LOG="${EVAL_JOBS_DIR}/invalid_errors_${SLURM_JOB_ID}.log" + +if [ -f "$RESULT_FILE" ]; then + echo "Checking for invalid errors in $RESULT_FILE..." + + INVALID_COUNT=$($PYTHON_BIN -c " +import json, sys +try: + with open('$RESULT_FILE', 'r') as f: + data = json.load(f) + + BENIGN_ERRORS = { + 'AgentTimeoutError', + 'BadRequestError', + 'ContextLengthExceededError', + 'SummarizationTimeout', + 'SummarizationTimeoutError', + } + + total_invalid = 0 + if 'stats' in data and 'evals' in data['stats']: + for eval_key, eval_data in data['stats']['evals'].items(): + if 'exception_stats' in eval_data: + for err_type, err_ids in eval_data['exception_stats'].items(): + if err_type not in BENIGN_ERRORS: + if isinstance(err_ids, list): + total_invalid += len(err_ids) + print(total_invalid) +except Exception as e: + print(f'Error parsing result.json: {e}', file=sys.stderr) + print('0') +" 2>&1 | tail -n 1) + + echo "Invalid error count: ${INVALID_COUNT} (threshold: ${ERROR_THRESHOLD})" + + if [ "${INVALID_COUNT:-0}" -gt "$ERROR_THRESHOLD" ]; then + echo "Job has ${INVALID_COUNT} invalid errors (> ${ERROR_THRESHOLD}), skipping upload" + + { + echo "===============================================" + echo "Timestamp: $(date)" + echo "Job: ${RUN_TAG}" + echo "SLURM_JOB_ID: ${SLURM_JOB_ID}" + echo "Model: ${MODEL}" + echo "Repo: 
${REPO_ID}"
+      echo "Invalid errors: ${INVALID_COUNT}"
+      echo "Result file: ${RESULT_FILE}"
+      $PYTHON_BIN -c "
+import json
+BENIGN = {'AgentTimeoutError', 'BadRequestError', 'ContextLengthExceededError', 'SummarizationTimeout', 'SummarizationTimeoutError'}
+with open('$RESULT_FILE', 'r') as f:
+    data = json.load(f)
+if 'stats' in data and 'evals' in data['stats']:
+    for eval_key, eval_data in data['stats']['evals'].items():
+        if 'exception_stats' in eval_data:
+            for err_type, err_ids in eval_data['exception_stats'].items():
+                if err_type not in BENIGN and err_ids:
+                    print(f'Eval: {eval_key} / {err_type}')
+                    for i, error_id in enumerate(err_ids[:10], 1):
+                        print(f'  {i}. {error_id}')
+                    if len(err_ids) > 10:
+                        print(f'  ... and {len(err_ids) - 10} more')
+" 2>/dev/null || true
+      echo "==============================================="
+    } >> "$ERROR_LOG"
+
+    echo "Error details logged to: $ERROR_LOG"
+    echo "Job completed but not uploaded due to excessive invalid errors"
+    exit 0
+  fi
+else
+  echo "Warning: result.json not found, continuing with upload"
+fi
+
+# ==============================================================================
+# Upload results to DB (via proxychains for HF upload)
+# ==============================================================================
+export RUN_DIR
+export UPLOAD_USERNAME
+export UPLOAD_MODE="${UPLOAD_MODE:-skip_on_error}"
+export RUN_TAG
+
+UPLOAD_LOG="${EVAL_LOGS_DIR:-eval/local/logs}/upload_${SLURM_JOB_ID}.log"
+mkdir -p "$(dirname "$UPLOAD_LOG")"
+
+echo "Uploading results from: $RUN_DIR" | tee -a "$UPLOAD_LOG"
+echo "Using username=${UPLOAD_USERNAME}, mode=${UPLOAD_MODE}" | tee -a "$UPLOAD_LOG"
+
+proxied $PYTHON_BIN - <<'PY' 2>&1 | tee -a "$UPLOAD_LOG"
+import os, sys, re, hashlib
+
+from database.unified_db.utils import upload_eval_results
+
+
+def sanitize_hf_repo_id(repo_id: str, max_length: int = 96) -> str:
+    def collapse(s: str) -> str:
+        prev = None
+        while s != prev:
+            prev = s
+            s = s.replace("--", "-").replace("..", ".")
+        return s
+ + org, name = repo_id.split("/", 1) if "/" in repo_id else (None, repo_id) + name = re.sub(r"[^A-Za-z0-9._-]", "-", name) + name = collapse(name).strip("-.") + if not name: + name = "repo" + + limit = max_length - (len(org) + 1 if org else 0) + if len(name) > limit: + digest = hashlib.sha1(name.encode()).hexdigest()[:8] + keep = max(1, limit - len(digest)) + base = name[:keep].rstrip("-.") + if not base: + base = "r" + name = f"{base}{digest}" + name = collapse(name).strip("-.") + + name = collapse(name).strip("-.") + if name[0] in "-.": + name = "r" + name[1:] + if name[-1] in "-.": + name = name[:-1] + "0" + + return f"{org}/{name}" if org else name + + +run_dir = os.environ["RUN_DIR"] +run_tag = os.environ["RUN_TAG"] +username = os.environ.get("UPLOAD_USERNAME", os.environ.get("USER", "jupiter")) +error_mode = os.environ.get("UPLOAD_MODE", "skip_on_error") +hf_repo_id = sanitize_hf_repo_id(f"DCAgent2/{run_tag}") +hf_token = os.environ["HF_TOKEN"] + +print(f"[uploader] upload_eval_results(path={run_dir!r}, username={username!r}, " + f"error_mode={error_mode!r}, hf_repo_id={hf_repo_id!r})") +dataset_hf = os.environ.get("REPO_ID", "") +# Use the canonical benchmark name computed in the shell section, +# which handles local paths and timeout/memory suffixes correctly. 
+benchmark_name = os.environ.get("BENCHMARK_NAME", "") +if not benchmark_name: + # Fallback: derive from REPO_ID (legacy behavior) + benchmark_name = dataset_hf.split("/")[-1] if "/" in dataset_hf else dataset_hf + +# Compute a stable benchmark_version_hash from the benchmark name +import hashlib +benchmark_version_hash = hashlib.sha256(benchmark_name.encode()).hexdigest() +print(f"[uploader] benchmark_name={benchmark_name!r}, version_hash={benchmark_version_hash[:16]}...") + +upload_eval_results( + run_dir, + username=username, + error_mode=error_mode, + hf_token=hf_token, + hf_repo_id=hf_repo_id, + register_benchmark=True, + benchmark_name=benchmark_name, + benchmark_version_hash=benchmark_version_hash, +) +print("[uploader] done.") +PY +UPLOAD_EXIT=${PIPESTATUS[0]} + +if [ $UPLOAD_EXIT -ne 0 ]; then + echo "Upload failed with exit code: $UPLOAD_EXIT" + exit $UPLOAD_EXIT +fi + +echo "==============================================" +echo "Eval and upload finished successfully." +echo "==============================================" diff --git a/eval/unified_eval_harbor_dp.sbatch b/eval/unified_eval_harbor_dp.sbatch new file mode 100644 index 00000000..e224a951 --- /dev/null +++ b/eval/unified_eval_harbor_dp.sbatch @@ -0,0 +1,992 @@ +#!/bin/bash +#SBATCH -p booster +#SBATCH --time=12:00:00 +#SBATCH --signal=B:TERM@120 +#SBATCH --ntasks-per-node 1 +#SBATCH --cpus-per-task=72 +#SBATCH --gres=gpu:4 +#SBATCH --output=eval/MBZ/logs/%x_%j.out +#SBATCH --job-name=eval_dp + +# ============================================================================== +# Unified Eval Harbor DP — Jupiter Cluster (JSC GH200) +# +# Multi-node data-parallel version of unified_eval_harbor.sbatch. +# Runs N vLLM replicas (one per node/shard) with Ray, shards the dataset, +# and runs N parallel harbor jobs — one per shard. +# +# IMPORTANT: Must pass --nodes N to sbatch (or set via EVAL_NUM_SHARDS). 
+# +# Positional args (same as single-node): +# $1 = MODEL (HF model name) +# $2 = REPO_ID (HF dataset repo or local path) +# $3 = BENCHMARK_ID (optional, DB benchmark UUID) +# $4 = RUN_TAG_ARG (optional, override run tag) +# +# Extra env vars (DP-specific): +# EVAL_NUM_SHARDS (default: $SLURM_JOB_NUM_NODES) +# EVAL_NODES_PER_SHARD (default: 1) +# EVAL_VLLM_PIPELINE_PARALLEL_SIZE (default: 1) +# +# All other env vars from unified_eval_harbor.sbatch are supported: +# EVAL_N_CONCURRENT, EVAL_N_ATTEMPTS, EVAL_GPU_MEMORY_UTIL, etc. +# ============================================================================== + +set -eo pipefail +ulimit -c 0 +ulimit -n 65536 2>/dev/null || true + +TIMESTAMP=$(date +'%Y%m%d_%H%M%S') + +# --- Parse positional args --- +MODEL="${1:-mlfoundations-dev/claude_3_7_20250219_tbench_traces_sharegptv1}" +REPO_ID="${2:-DCAgent/dev_set_71_tasks}" +BENCHMARK_ID="${3:-}" +RUN_TAG_ARG="${4:-}" + +# --- Read env vars from listener (with defaults) --- +N_CONCURRENT="${EVAL_N_CONCURRENT:-128}" +N_ATTEMPTS="${EVAL_N_ATTEMPTS:-3}" +GPU_MEMORY_UTIL="${EVAL_GPU_MEMORY_UTIL:-0.95}" +ERROR_THRESHOLD="${EVAL_DAYTONA_THRESHOLD:-999999}" +VLLM_MAX_RETRIES="${EVAL_VLLM_MAX_RETRIES:-20}" +AGENT_PARSER="${EVAL_AGENT_PARSER:-}" +ENABLE_THINKING="${EVAL_ENABLE_THINKING:-false}" +AGENT_NAME="${EVAL_AGENT_NAME:-terminus-2}" +EVAL_STARTS_LOG="${EVAL_STARTS_LOG:-}" +TIMEOUT_MULTIPLIER="${EVAL_TIMEOUT_MULTIPLIER:-1.0}" +CONFIG_YAML="${EVAL_CONFIG_YAML:-dcagent_eval_config.yaml}" +DB_JOB_ID="${EVAL_DB_JOB_ID:-}" +UPLOAD_USERNAME="${EVAL_UPLOAD_USERNAME:-$USER}" + +# --- DP-specific env vars --- +GPUS_PER_NODE="${EVAL_GPUS_PER_NODE:-4}" # From cluster config (Jupiter=4, MBZ=8) +TP_SIZE="${EVAL_VLLM_TENSOR_PARALLEL_SIZE:-2}" +PP_SIZE="${EVAL_VLLM_PIPELINE_PARALLEL_SIZE:-1}" +NODES_PER_SHARD="${EVAL_NODES_PER_SHARD:-1}" +# Shards per node: how many vLLM replicas fit on one node (default: 4/TP) +SHARDS_PER_NODE="${EVAL_SHARDS_PER_NODE:-$((GPUS_PER_NODE / TP_SIZE))}" 
+GPUS_PER_SHARD=$((TP_SIZE * PP_SIZE)) +# Total shards (default: nodes * shards_per_node) +NUM_SHARDS="${EVAL_NUM_SHARDS:-$((SLURM_JOB_NUM_NODES * SHARDS_PER_NODE))}" + +# Strip slashes and special chars for file-safe names +SAFE_MODEL=$(echo "$MODEL" | tr '/:' '_') +if [[ "$REPO_ID" == /* ]]; then + SAFE_REPO=$(basename "$REPO_ID") +else + SAFE_REPO=$(echo "$REPO_ID" | tr '/:' '_') +fi + +# Benchmark shorthand for squeue +declare -A BENCH_SHORT=( + ["DCAgent_dev_set_v2"]="v2" + ["DCAgent2_swebench-verified-random-100-folders"]="swe" + ["DCAgent2_terminal_bench_2"]="tb2" + ["DCAgent2_aider_polyglot"]="aider" + ["DCAgent2_bfcl-parity"]="bfcl" + ["DCAgent_dev_set_71_tasks"]="v1" +) +BENCH_TAG="${BENCH_SHORT[$SAFE_REPO]:-${SAFE_REPO:0:12}}" +if [ "$SLURM_JOB_NAME" = "eval_dp" ]; then + scontrol update JobId="$SLURM_JOB_ID" JobName="eval_dp_${BENCH_TAG}" +fi + +echo "==============================================" +echo "Jupiter Eval Harbor DP (${NUM_SHARDS}-way)" +echo "==============================================" +echo "Model: $MODEL" +echo "Dataset: $REPO_ID" +echo "Benchmark ID: ${BENCHMARK_ID:-}" +echo "Nodes: $SLURM_JOB_NUM_NODES ($SHARDS_PER_NODE shards/node, $NUM_SHARDS total shards)" +echo "GPUs: TP=$TP_SIZE, PP=$PP_SIZE, $GPUS_PER_SHARD GPUs/shard" +echo "N concurrent (per shard): $N_CONCURRENT" +echo "N attempts: $N_ATTEMPTS" +echo "GPU memory util: $GPU_MEMORY_UTIL" +echo "Error threshold: $ERROR_THRESHOLD" +echo "vLLM max retries: $VLLM_MAX_RETRIES" +echo "Agent: $AGENT_NAME" +echo "Thinking: $ENABLE_THINKING" +echo "Timeout multiplier: $TIMEOUT_MULTIPLIER" +echo "Pipeline parallel: $PP_SIZE" +echo "Config YAML: $CONFIG_YAML" +echo "DB Job ID (pending): ${DB_JOB_ID:-}" +echo "==============================================" + +# ============================================================================== +# Cluster-Agnostic Environment Setup (same as single-node sbatch) +# ============================================================================== + 
+DCFT="${EVAL_PROJECT_ROOT:-${DCFT:-/e/scratch/jureap59/$USER/OpenThoughts-Agent}}" +CLUSTER_NAME="${EVAL_CLUSTER_NAME:-jupiter}" +unset PYTHONPATH + +# Source cluster-specific dotenv +DOTENV_FILE="$DCFT/hpc/dotenv/${CLUSTER_NAME}.env" +if [ -f "$DOTENV_FILE" ]; then + source "$DOTENV_FILE" + echo "Sourced dotenv: $DOTENV_FILE" +else + echo "WARNING: dotenv not found: $DOTENV_FILE" +fi + +DC_AGENT_SECRET_ENV="${DC_AGENT_SECRET_ENV:-$HOME/secrets.env}" +if [ -f "$DC_AGENT_SECRET_ENV" ]; then + source "$DC_AGENT_SECRET_ENV" +fi + +if [ -z "${DAYTONA_API_KEY:-}" ]; then + echo "WARNING: DAYTONA_API_KEY not set. Check $DC_AGENT_SECRET_ENV" +fi +echo "Daytona API key: ${DAYTONA_API_KEY:0:12}..." + +# vLLM / Ray / Triton env vars +export VLLM_USE_V1=1 +export RAY_RUNTIME_ENV_HOOK=ray._private.runtime_env.uv_runtime_env_hook.hook +export RAY_CGRAPH_get_timeout=900 +export VLLM_CONFIG_ROOT="${VLLM_CACHE_ROOT:-/tmp/vllm_config_${USER}}" +export TRITON_CACHE_DIR="${TRITON_CACHE_DIR:-/tmp/triton_cache_${USER}}" +export FLASHINFER_WORKSPACE_BASE="${FLASHINFER_CACHE_DIR:-/tmp/flashinfer_cache_${USER}}" +export UV_CACHE_DIR="${UV_CACHE_DIR:-/tmp/uv_cache_${USER}}" +export HYDRA_FULL_ERROR=1 +export HF_HUB_CACHE="${EVAL_HF_CACHE:-${HF_HUB_CACHE:-/e/data1/datasets/playground/ot/hf_hub}}" +export HF_HOME="${HF_HUB_CACHE}" +export HF_CACHE_DIR="$HF_HUB_CACHE" +export HF_XET_CACHE="${HF_XET_CACHE:-/tmp/hf_xet_cache_${USER}}" +mkdir -p "$HF_XET_CACHE" + +HARBOR_SRC="${EVAL_HARBOR_SRC:-/e/scratch/jureap59/feuer1/harbor/src}" +export PYTHONPATH="${HARBOR_SRC}:${DCFT}:${PYTHONPATH:-}" + +# ============================================================================== +# LD_LOADER Wrappers (needed on aarch64 Jupiter where shared conda lacks exec perms) +# ============================================================================== + +OTAGENT_DIR="${OTAGENT_DIR:-/e/scratch/jureap59/feuer1/miniforge3/envs/otagent}" +export PATH="$OTAGENT_DIR/bin:$PATH" +export CONDA_PREFIX="$OTAGENT_DIR" 
+ +PYTHON_REAL="$OTAGENT_DIR/bin/python3.12" + +# LD_LOADER Wrappers: required on aarch64 (Jupiter GH200) where shared conda +# lacks execute permissions. Skipped on x86_64 where the conda env is directly usable. +if [ "$(uname -m)" = "aarch64" ] && [ -f /lib/ld-linux-aarch64.so.1 ]; then + LD_LOADER="/lib/ld-linux-aarch64.so.1" + # Use shared filesystem (not /tmp) so all nodes can see the wrappers + WRAPPER_DIR="${DCFT}/eval/${CLUSTER_NAME}/.wrappers/${SLURM_JOB_ID}" + mkdir -p "$WRAPPER_DIR" + + cat > "$WRAPPER_DIR/python3" < "$WRAPPER_DIR/python" < "$WRAPPER_DIR/ray" < "$WRAPPER_DIR/harbor" </dev/null +export RAY_DEDUP_LOGS=0 + +# Set CUDA_HOME for flashinfer JIT compilation (needs nvcc) +_CUDA_HOME="${EVAL_CUDA_HOME:-/e/software/default/stages/2026/software/CUDA/13}" +if [ -n "${_CUDA_HOME}" ] && [ -d "${_CUDA_HOME}" ]; then + export CUDA_HOME="${_CUDA_HOME}" + export PATH="$CUDA_HOME/bin:$PATH" +fi + + +# Get all nodes +ALL_NODES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") +ALL_NODES_ARRAY=($ALL_NODES) +echo "All nodes: ${ALL_NODES_ARRAY[@]}" + +# Env vars to export to srun workers +export SRUN_EXPORT_ENV="ALL,LD_LIBRARY_PATH=$LD_LIBRARY_PATH,PATH=$PATH,HF_TOKEN=$HF_TOKEN" +RAY_ENV_VARS="LD_LIBRARY_PATH=$LD_LIBRARY_PATH PATH=$PATH HF_TOKEN=$HF_TOKEN" + +# ============================================================================== +# SSH Tunnel + Proxychains Setup (from head node = first node) +# ============================================================================== + +NODE_HOST=$(hostname -s) +TUNNEL_PORT=7003 +LOGIN_NODE="${EVAL_LOGIN_NODE:-jpbl-s01-02}" +PROXYCHAINS_BIN="${EVAL_PROXYCHAINS_BIN:-/e/scratch/jureap59/feuer1/proxychains-ng-aarch64/bin/proxychains4}" +TUNNEL_PID="" + +setup_proxy() { + echo "[proxy] Setting up SSH tunnel to $LOGIN_NODE..." 
+ if [ -z "${SSH_KEY:-}" ]; then + echo "[proxy] WARNING: SSH_KEY not set, skipping tunnel setup" + return 1 + fi + NODE_IP=$(nslookup "$NODE_HOST" 2>/dev/null | grep 'Address' | tail -n1 | awk '{print $2}') + NODE_IP="${NODE_IP:-127.0.0.1}" + ssh -g -f -N -D ${TUNNEL_PORT} \ + -o StrictHostKeyChecking=no \ + -o ConnectTimeout=1000 \ + -o ServerAliveInterval=10 \ + -o ServerAliveCountMax=30 \ + -o TCPKeepAlive=yes \ + -o ExitOnForwardFailure=yes \ + -o BatchMode=yes \ + -i "${SSH_KEY}" \ + "${USER}@${LOGIN_NODE}" + sleep 5 + if pgrep -f "ssh.*-D.*${TUNNEL_PORT}" > /dev/null; then + echo "[proxy] SSH tunnel started successfully" + TUNNEL_PID=$(pgrep -f "ssh.*-D.*${TUNNEL_PORT}" | head -1) + else + echo "[proxy] ERROR: SSH tunnel failed to start" + return 1 + fi + CFG_PATH="$HOME/.proxychains/proxychains_${SLURM_JOB_ID}.conf" + mkdir -p "$HOME/.proxychains" + cat > "$CFG_PATH" </dev/null \ + && echo "[proxy] HuggingFace connectivity OK" \ + || echo "[proxy] WARNING: HuggingFace connectivity test failed" + fi + + if [ -n "${DAYTONA_API_KEY:-}" ]; then + echo "[daytona] Testing API connectivity..." 
+ DAYTONA_HTTP_CODE=$("$PROXYCHAINS_BIN" -f "$CFG_PATH" curl -s --connect-timeout 15 \ + -H "Authorization: Bearer $DAYTONA_API_KEY" \ + "${DAYTONA_API_URL:-https://app.daytona.io/api}/health" \ + -o /dev/null -w "%{http_code}" 2>/dev/null) || DAYTONA_HTTP_CODE="FAIL" + echo "[daytona] Health check: $DAYTONA_HTTP_CODE" + fi + echo "[proxy] Setup complete" + return 0 +} + +proxied() { + if [ -x "$PROXYCHAINS_BIN" ] && [ -n "${PROXYCHAINS_CONF_FILE:-}" ]; then + "$PROXYCHAINS_BIN" -f "$PROXYCHAINS_CONF_FILE" "$@" + else + "$@" + fi +} + +if [ "${EVAL_PROXY_ENABLED:-true}" = "true" ]; then + setup_proxy || echo "[proxy] Continuing without proxy" +else + echo "[proxy] Proxy disabled (EVAL_PROXY_ENABLED=false), assuming direct internet access" +fi + +# ============================================================================== +# Cleanup trap (DP: kill all vLLM + Ray + tunnel) +# ============================================================================== +cleanup() { + echo "Cleaning up DP resources..." + # Kill vLLM processes + for shard_idx in $(seq 0 $((NUM_SHARDS - 1))); do + if [ -n "${VLLM_PIDS[$shard_idx]:-}" ]; then + kill "${VLLM_PIDS[$shard_idx]}" 2>/dev/null || true + fi + done + # Stop Ray on all nodes + for node in "${ALL_NODES_ARRAY[@]}"; do + srun --nodes=1 --ntasks=1 --overlap -w "$node" "$RAY_BIN" stop --force 2>/dev/null & + done + wait + # Kill SSH tunnel + if [ -n "${TUNNEL_PID:-}" ]; then + kill "$TUNNEL_PID" 2>/dev/null || true + fi + pkill -f "ssh.*-D.*${TUNNEL_PORT}" 2>/dev/null || true + # Remove wrappers + proxychains config + [ -n "${WRAPPER_DIR:-}" ] && rm -rf "$WRAPPER_DIR" 2>/dev/null || true + rm -f "${PROXYCHAINS_CONF_FILE:-}" 2>/dev/null || true + echo "Cleanup done." 
+} +trap cleanup EXIT + +# ============================================================================== +# Pre-download model (compute nodes have no internet) +# ============================================================================== +EVAL_LOGS_DIR="${EVAL_LOGS_DIR:-eval/jupiter/logs}" +mkdir -p "$EVAL_LOGS_DIR" + +echo "Pre-downloading model: $MODEL" +proxied $PYTHON_BIN -c " +from huggingface_hub import snapshot_download +import os +cache = os.environ.get('HF_HUB_CACHE') +path = snapshot_download('$MODEL', cache_dir=cache) +print(f'Model cached at: {path}') +" +if [ $? -ne 0 ]; then + echo "ERROR: Model pre-download failed. Exiting." + exit 1 +fi + +# ============================================================================== +# Build vLLM command args (reuse build_vllm_cmd.sh) +# ============================================================================== +source "$DCFT/eval/build_vllm_cmd.sh" +build_vllm_cmd "$PYTHON_BIN" "$MODEL" "$GPU_MEMORY_UTIL" +# VLLM_CMD includes --host 0.0.0.0 --port 8000 by default. +# For DP, we strip --port and --host from VLLM_CMD and re-add per shard. +VLLM_CMD_BASE=() +skip_next=false +for arg in "${VLLM_CMD[@]}"; do + if $skip_next; then + skip_next=false + continue + fi + if [ "$arg" = "--port" ] || [ "$arg" = "--host" ]; then + skip_next=true + continue + fi + VLLM_CMD_BASE+=("$arg") +done + +echo "vLLM DP config: TP=$TP_SIZE, PP=$PP_SIZE, shards=$NUM_SHARDS, shards_per_node=$SHARDS_PER_NODE" + +# ============================================================================== +# Start Ray Clusters + vLLM (supports multiple shards per node) +# +# Topology: +# - Each node has GPUS_PER_NODE GPUs (4 on Jupiter GH200) +# - Each shard uses TP_SIZE GPUs (via CUDA_VISIBLE_DEVICES) +# - SHARDS_PER_NODE = GPUS_PER_NODE / TP_SIZE (e.g. 
4/2=2) +# - Each shard gets its own Ray cluster (separate port), vLLM (separate port) +# +# Example: 1 node, TP=2, 2 shards/node +# Shard 0: GPUs 0,1, Ray port 6379, API port 8000 +# Shard 1: GPUs 2,3, Ray port 6479, API port 8001 +# +# Example: 2 nodes, TP=2, 2 shards/node = 4 total shards +# Node 0: Shard 0 (GPUs 0,1, port 8000), Shard 1 (GPUs 2,3, port 8001) +# Node 1: Shard 2 (GPUs 0,1, port 8000), Shard 3 (GPUs 2,3, port 8001) +# ============================================================================== +echo "" +echo "=== Starting $NUM_SHARDS shards ($SHARDS_PER_NODE per node, TP=$TP_SIZE) ===" + +declare -a VLLM_PIDS +declare -a SHARD_NODES # which node each shard runs on +declare -a SHARD_GPU_IDS # CUDA_VISIBLE_DEVICES for each shard +declare -a API_PORTS +declare -a RAY_PORTS +declare -a HEAD_IPS + +BASE_RAY_PORT=6379 +BASE_API_PORT=8000 + +# Compute shard → node mapping and GPU assignment +for shard_idx in $(seq 0 $((NUM_SHARDS - 1))); do + node_idx=$((shard_idx / SHARDS_PER_NODE)) + local_shard=$((shard_idx % SHARDS_PER_NODE)) + gpu_start=$((local_shard * TP_SIZE)) + + # Build CUDA_VISIBLE_DEVICES string (e.g. "0,1" or "2,3") + gpu_ids="" + for ((g = gpu_start; g < gpu_start + TP_SIZE; g++)); do + [ -n "$gpu_ids" ] && gpu_ids="${gpu_ids}," + gpu_ids="${gpu_ids}${g}" + done + + SHARD_NODES[$shard_idx]="${ALL_NODES_ARRAY[$node_idx]}" + SHARD_GPU_IDS[$shard_idx]="$gpu_ids" + # Use different Ray ports per shard (even on same node) + RAY_PORTS[$shard_idx]=$((BASE_RAY_PORT + shard_idx * 100)) + # API ports: offset by local_shard so shards on same node don't collide + API_PORTS[$shard_idx]=$((BASE_API_PORT + local_shard)) + + echo " Shard $shard_idx: node=${SHARD_NODES[$shard_idx]}, GPUs=${gpu_ids}, Ray port=${RAY_PORTS[$shard_idx]}, API port=${API_PORTS[$shard_idx]}" +done + +# --- Step 3a: Clean up existing Ray processes on all nodes --- +echo "" +echo "Cleaning up existing Ray processes..." 
+for node in "${ALL_NODES_ARRAY[@]}"; do + srun --nodes=1 --ntasks=1 --overlap -w "$node" bash -c \ + "pkill -9 -u $USER -f 'ray::' 2>/dev/null; $RAY_BIN stop --force 2>/dev/null; rm -rf /tmp/ray_${USER}_shard* 2>/dev/null" || true & +done +wait +sleep 2 + +# --- Step 3b: Start Ray clusters (one per shard) --- +echo "" +echo "Starting Ray clusters..." + +for shard_idx in $(seq 0 $((NUM_SHARDS - 1))); do + node="${SHARD_NODES[$shard_idx]}" + gpu_ids="${SHARD_GPU_IDS[$shard_idx]}" + ray_port="${RAY_PORTS[$shard_idx]}" + ray_temp_dir="/tmp/ray_${USER}_shard${shard_idx}" + + # Get node IP via InfiniBand + head_ip="" + if srun --nodes=1 --ntasks=1 --overlap -w "$node" ip -o -4 addr show ib0 >/dev/null 2>&1; then + head_ip=$(srun --nodes=1 --ntasks=1 --overlap -w "$node" ip -o -4 addr show ib0 | awk '{print $4}' | cut -d/ -f1) + else + head_ip=$(srun --nodes=1 --ntasks=1 --overlap -w "$node" hostname --ip-address) + head_ip=${head_ip%% *} + fi + HEAD_IPS[$shard_idx]="$head_ip" + + echo " Shard $shard_idx: Starting Ray head on $node ($head_ip:$ray_port), GPUs=$gpu_ids" + + # Start Ray head with only this shard's GPUs visible + srun --export="$SRUN_EXPORT_ENV" --nodes=1 --ntasks=1 --overlap -w "$node" bash -c \ + "env $RAY_ENV_VARS CUDA_VISIBLE_DEVICES=$gpu_ids \ + $RAY_BIN start --head --node-ip-address=${head_ip} --port=${ray_port} \ + --num-gpus=${TP_SIZE} --num-cpus=$((72 / SHARDS_PER_NODE)) \ + --temp-dir=${ray_temp_dir}" & + sleep 3 +done + +echo "Waiting for Ray clusters to stabilize..." 
+sleep 15 + +# --- Step 3c: Start vLLM on each shard --- +echo "" +echo "=== Starting $NUM_SHARDS vLLM Servers ===" + +for shard_idx in $(seq 0 $((NUM_SHARDS - 1))); do + node="${SHARD_NODES[$shard_idx]}" + gpu_ids="${SHARD_GPU_IDS[$shard_idx]}" + head_ip="${HEAD_IPS[$shard_idx]}" + ray_port="${RAY_PORTS[$shard_idx]}" + api_port="${API_PORTS[$shard_idx]}" + ray_address="${head_ip}:${ray_port}" + vllm_log="$EVAL_LOGS_DIR/vllm_dp_shard${shard_idx}_${SLURM_JOB_ID}.log" + + echo " Shard $shard_idx: vLLM on $node, GPUs=$gpu_ids, port=$api_port" + + srun --export="$SRUN_EXPORT_ENV" --nodes=1 --ntasks=1 --overlap -w "$node" \ + env LD_LIBRARY_PATH="$LD_LIBRARY_PATH" \ + PATH="$PATH" \ + PYTHONPATH="$PYTHONPATH" \ + HF_HOME="$HF_HOME" \ + HF_HUB_CACHE="$HF_HUB_CACHE" \ + HF_HUB_OFFLINE="1" \ + HF_TOKEN="$HF_TOKEN" \ + CUDA_VISIBLE_DEVICES="$gpu_ids" \ + RAY_ADDRESS="$ray_address" \ + VLLM_USE_V1="$VLLM_USE_V1" \ + VLLM_ALL2ALL_BACKEND="pplx" \ + TORCHDYNAMO_DISABLE="1" \ + TMPDIR="/tmp" \ + TRITON_CACHE_DIR="/tmp/triton_cache_${USER}" \ + TORCH_COMPILE_CACHE_DIR="/tmp/torch_cache_${USER}" \ + TORCHINDUCTOR_CACHE_DIR="/tmp/torchinductor_${USER}" \ + "${VLLM_CMD_BASE[@]}" \ + --host "0.0.0.0" \ + --port "$api_port" \ + --pipeline-parallel-size "$PP_SIZE" \ + >> "$vllm_log" 2>&1 & + + VLLM_PIDS[$shard_idx]=$! + echo " PID: ${VLLM_PIDS[$shard_idx]}, Log: $vllm_log" +done + +# --- Step 3d: Health check all shards --- +echo "" +echo "Waiting for all vLLM servers to become healthy..." + +for shard_idx in $(seq 0 $((NUM_SHARDS - 1))); do + node="${SHARD_NODES[$shard_idx]}" + api_port="${API_PORTS[$shard_idx]}" + health_url="http://127.0.0.1:${api_port}/v1/models" + + echo " Checking shard $shard_idx (port $api_port on $node)..." + for i in $(seq 1 $VLLM_MAX_RETRIES); do + if srun --export="$SRUN_EXPORT_ENV" --nodes=1 --ntasks=1 --overlap -w "$node" \ + curl -s "$health_url" > /dev/null 2>&1; then + echo " Shard $shard_idx is healthy!" 
+ break + fi + if [ "$i" -eq "$VLLM_MAX_RETRIES" ]; then + echo "ERROR: Shard $shard_idx failed health check after $VLLM_MAX_RETRIES attempts" + tail -50 "$EVAL_LOGS_DIR/vllm_dp_shard${shard_idx}_${SLURM_JOB_ID}.log" 2>/dev/null || true + exit 1 + fi + sleep 100 + done +done + +echo "All vLLM servers are ready!" + +# ============================================================================== +# Download and shard dataset +# ============================================================================== +echo "" +echo "=== Downloading and sharding dataset ===" + +# Download/locate dataset (same logic as single-node sbatch) +if [[ "$REPO_ID" == /* ]]; then + echo "Using local dataset path: $REPO_ID" + DATASET_PATH="$REPO_ID" + if [ ! -d "$DATASET_PATH" ]; then + echo "ERROR: Local dataset path does not exist: $DATASET_PATH" + exit 1 + fi +else + echo "Downloading/locating dataset: $REPO_ID" + # Check dataset directories (from EVAL_DATASETS_DIRS or fallback) + _DS_DIRS="${EVAL_DATASETS_DIRS:-/e/data1/datasets/playground/ot/datasets:/e/scratch/jureap59/${USER}/datasets}" + DATASET_LOCAL_DIR="" + IFS=':' read -ra _DS_DIRS_ARR <<< "$_DS_DIRS" + for _ds_dir in "${_DS_DIRS_ARR[@]}"; do + if [ -d "${_ds_dir}/${SAFE_REPO}" ]; then + DATASET_LOCAL_DIR="${_ds_dir}/${SAFE_REPO}" + break + fi + done + DATASET_LOCAL_DIR="${DATASET_LOCAL_DIR:-${_DS_DIRS_ARR[0]}/${SAFE_REPO}}" + DOWNLOAD_LOG=$(mktemp /tmp/download_XXXXXX.log) + proxied $PYTHON_BIN "$DCFT/eval/jupiter/snapshot_download.py" "$REPO_ID" --local-dir "$DATASET_LOCAL_DIR" > "$DOWNLOAD_LOG" 2>&1 + DOWNLOAD_EXIT=$? 
+ cat "$DOWNLOAD_LOG" + DATASET_PATH=$(grep DATASET_PATH "$DOWNLOAD_LOG" | tail -n 1 | cut -d'=' -f2) + rm -f "$DOWNLOAD_LOG" + if [ $DOWNLOAD_EXIT -ne 0 ] || [ -z "${DATASET_PATH:-}" ]; then + echo "ERROR: Failed to get dataset path (exit code: $DOWNLOAD_EXIT)" + exit 1 + fi +fi +echo "Dataset path: $DATASET_PATH" + +# Shard dataset into N parts (simple round-robin symlink) +SHARD_DIR="/tmp/eval_shards_${SLURM_JOB_ID}" +mkdir -p "$SHARD_DIR" + +echo "Sharding dataset into $NUM_SHARDS parts..." +$PYTHON_BIN -c " +import os, sys +from pathlib import Path + +dataset_path = Path('$DATASET_PATH') +num_shards = $NUM_SHARDS +shard_dir = Path('$SHARD_DIR') + +task_dirs = sorted([d for d in dataset_path.iterdir() if d.is_dir()]) +print(f'Found {len(task_dirs)} tasks to shard into {num_shards} parts') + +for shard_idx in range(num_shards): + shard_path = shard_dir / f'shard_{shard_idx}' + shard_path.mkdir(exist_ok=True) + shard_tasks = task_dirs[shard_idx::num_shards] + print(f'Shard {shard_idx}: {len(shard_tasks)} tasks') + for task_dir in shard_tasks: + dest = shard_path / task_dir.name + if not dest.exists(): + os.symlink(task_dir, dest) +" +if [ $? 
-ne 0 ]; then + echo "ERROR: Dataset sharding failed" + exit 1 +fi + +TASK_COUNT=$(ls -d "$DATASET_PATH"/*/instruction.md 2>/dev/null | wc -l) +echo "Total tasks: $TASK_COUNT, shards: $NUM_SHARDS (~$((TASK_COUNT / NUM_SHARDS)) tasks/shard)" + +# ============================================================================== +# Construct run tag and directory +# ============================================================================== +if [ -n "$RUN_TAG_ARG" ]; then + RUN_TAG="$RUN_TAG_ARG" +else + RUN_TAG="${SAFE_REPO}_${SAFE_MODEL}" +fi +EVAL_JOBS_DIR="${EVAL_JOBS_DIR:-/e/data1/datasets/playground/mmlaion/shared/zhuang1_eval_jobs}" # fallback; listener sets EVAL_JOBS_DIR from cluster config + +echo "Run tag: $RUN_TAG" + +# Benchmark name (same logic as single-node) +declare -A BENCHMARK_NAME_MAP=( + ["DCAgent2_aider_polyglot"]="aider_polyglot" + ["DCAgent_dev_set_v2"]="dev_set_v2" + ["DCAgent_dev_set_71_tasks"]="dev_set_71_tasks" + ["DCAgent2_terminal_bench_2"]="terminal_bench_2" + ["DCAgent_swebench_verified_eval_set"]="swebench-verified-random-100-folders" + ["DCAgent2_bfcl-parity"]="bfcl-parity" +) +BASE_NAME="${BENCHMARK_NAME_MAP[$SAFE_REPO]:-$SAFE_REPO}" +BENCHMARK_SUFFIX="" +if [ -n "${EVAL_OVERRIDE_MEMORY_MB:-}" ] && [ "$EVAL_OVERRIDE_MEMORY_MB" != "1024" ]; then + mem_gb=$(( EVAL_OVERRIDE_MEMORY_MB / 1024 )) + BENCHMARK_SUFFIX="${BENCHMARK_SUFFIX}_${mem_gb}gb" +fi +if [ "$TIMEOUT_MULTIPLIER" != "1" ] && [ "$TIMEOUT_MULTIPLIER" != "1.0" ]; then + BENCHMARK_SUFFIX="${BENCHMARK_SUFFIX}_${TIMEOUT_MULTIPLIER}x" +fi +BENCHMARK_NAME="${BASE_NAME}${BENCHMARK_SUFFIX}" +export BENCHMARK_NAME MODEL REPO_ID SLURM_JOB_ID +echo "Benchmark name: $BENCHMARK_NAME" + +# ============================================================================== +# Update DB: Pending → Started +# ============================================================================== +echo "Creating/updating DB job entry..." 
+HARBOR_VERSION=$($PYTHON_BIN -c "import harbor; print(harbor.__version__)" 2>/dev/null || echo "unknown") +export HARBOR_VERSION RUN_TAG + +if [ -n "$DB_JOB_ID" ]; then + echo "Updating Pending job $DB_JOB_ID → Started" + proxied $PYTHON_BIN - <<'PY' || true +import os, sys +from database.unified_db.utils import update_job_status_to_started +run_tag = os.environ["RUN_TAG"] +agent_name = os.environ.get("EVAL_AGENT_NAME", "terminus-2") +timeout_multiplier = float(os.environ.get("EVAL_TIMEOUT_MULTIPLIER", "1.0")) +harbor_version = os.environ.get("HARBOR_VERSION", "unknown") +config = {"agent": agent_name, "env": "daytona", "timeout_multiplier": timeout_multiplier} +result = update_job_status_to_started( + job_name=run_tag, + n_trials=int(os.environ.get("EVAL_N_CONCURRENT", "128")), + n_rep_eval=int(os.environ.get("EVAL_N_ATTEMPTS", "3")), + config=config, + harbor_package_version=harbor_version, +) +if not result.get("success"): + print(f"WARNING: Pending→Started update failed: {result.get('error')}", file=sys.stderr) +else: + print(f"DB job {run_tag} → Started") +PY +fi + +# ============================================================================== +# Resolve harbor config +# ============================================================================== +if [[ "$CONFIG_YAML" != /* ]]; then + if [ -f "$DCFT/eval/${CLUSTER_NAME}/$CONFIG_YAML" ]; then + HARBOR_CONFIG="$DCFT/eval/${CLUSTER_NAME}/$CONFIG_YAML" + elif [ -f "$DCFT/eval/MBZ/$CONFIG_YAML" ]; then + HARBOR_CONFIG="$DCFT/eval/MBZ/$CONFIG_YAML" + else + HARBOR_CONFIG="$DCFT/eval/jupiter/$CONFIG_YAML" + fi +else + HARBOR_CONFIG="$CONFIG_YAML" +fi + +# Build extra harbor args +EXTRA_HARBOR_ARGS="" +if [ -n "${EVAL_SNAPSHOT_NAME:-}" ]; then + EXTRA_HARBOR_ARGS="--environment-kwarg snapshot_template_name=$EVAL_SNAPSHOT_NAME --no-force-build" +fi +if [ "$TIMEOUT_MULTIPLIER" != "1" ] && [ "$TIMEOUT_MULTIPLIER" != "1.0" ]; then + EXTRA_HARBOR_ARGS="$EXTRA_HARBOR_ARGS --timeout-multiplier $TIMEOUT_MULTIPLIER" +fi 
+if [ -n "${EVAL_OVERRIDE_MEMORY_MB:-}" ]; then + EXTRA_HARBOR_ARGS="$EXTRA_HARBOR_ARGS --override-memory-mb $EVAL_OVERRIDE_MEMORY_MB" +fi +if [ -n "${EVAL_AUTO_SNAPSHOT:-}" ]; then + EXTRA_HARBOR_ARGS="$EXTRA_HARBOR_ARGS --environment-kwarg auto_snapshot=$EVAL_AUTO_SNAPSHOT" +fi + +# ============================================================================== +# Run Harbor Jobs in Parallel (one per shard) +# ============================================================================== +echo "" +echo "=== Running $NUM_SHARDS Harbor Jobs in Parallel ===" +set +e + +declare -a HARBOR_PIDS +declare -a HARBOR_LOGS +declare -a SHARD_RUN_DIRS + +run_harbor_for_shard() { + local shard_idx=$1 + local head_node="${SHARD_NODES[$shard_idx]}" + local api_port=${API_PORTS[$shard_idx]} + local shard_dataset="$SHARD_DIR/shard_${shard_idx}" + local shard_run_tag="${RUN_TAG}_shard${shard_idx}" + local harbor_log="$EVAL_LOGS_DIR/harbor_dp_shard${shard_idx}_${SLURM_JOB_ID}.log" + local shard_run_dir="${EVAL_JOBS_DIR}/${shard_run_tag}" + + HARBOR_LOGS[$shard_idx]="$harbor_log" + SHARD_RUN_DIRS[$shard_idx]="$shard_run_dir" + + echo " Shard $shard_idx: dataset=$shard_dataset, port=$api_port, run_tag=$shard_run_tag" + + # Check for resume (existing job dir for this shard) + local existing_job_dir="$shard_run_dir" + local harbor_cmd + + if [ -d "$existing_job_dir" ] && [ -f "$existing_job_dir/config.json" ]; then + echo " Resuming existing job at $existing_job_dir" + harbor_cmd="harbor jobs resume \ + --job-path \"$existing_job_dir\" \ + --filter-error-type EnvironmentStartTimeoutError \ + --filter-error-type DaytonaError \ + --filter-error-type DaytonaRateLimitError \ + --filter-error-type CancelledError \ + --filter-error-type AgentEnvironmentTimeoutError \ + --filter-error-type SandboxBuildFailedError" + else + echo " Starting new job" + harbor_cmd="harbor jobs start \ + -p \"$shard_dataset\" \ + --jobs-dir \"$EVAL_JOBS_DIR\" \ + --n-concurrent \"$N_CONCURRENT\" \ + --agent 
\"$AGENT_NAME\" \ + --model \"hosted_vllm/$MODEL\" \ + --env \"daytona\" \ + --agent-kwarg \"api_base=http://localhost:${api_port}/v1\" \ + --agent-kwarg \"key=fake_key\" \ + --agent-kwarg \"max_tokens=16384\" \ + --agent-kwarg 'model_info={\"max_output_tokens\":16384,\"max_input_tokens\":32768,\"input_cost_per_token\":0.0,\"output_cost_per_token\":0.0}' \ + --n-attempts \"$N_ATTEMPTS\" \ + --job-name \"$shard_run_tag\" \ + --export-traces \ + --config \"$HARBOR_CONFIG\" \ + --debug" + + if [ "$ENABLE_THINKING" = "true" ]; then + harbor_cmd="$harbor_cmd --agent-kwarg \"enable_thinking=true\"" + fi + if [ -n "$AGENT_PARSER" ]; then + harbor_cmd="$harbor_cmd --agent-kwarg \"parser=$AGENT_PARSER\"" + fi + if [ -n "$EXTRA_HARBOR_ARGS" ]; then + harbor_cmd="$harbor_cmd $EXTRA_HARBOR_ARGS" + fi + fi + + # Run harbor on the shard's head node (via proxychains for Daytona) + if [ -x "$PROXYCHAINS_BIN" ] && [ -n "${PROXYCHAINS_CONF_FILE:-}" ]; then + srun --export="$SRUN_EXPORT_ENV" --nodes=1 --ntasks=1 --overlap -w "$head_node" \ + "$PROXYCHAINS_BIN" -f "$PROXYCHAINS_CONF_FILE" \ + bash -c "$harbor_cmd" \ + >> "$harbor_log" 2>&1 & + else + srun --export="$SRUN_EXPORT_ENV" --nodes=1 --ntasks=1 --overlap -w "$head_node" \ + bash -c "$harbor_cmd" \ + >> "$harbor_log" 2>&1 & + fi + + HARBOR_PIDS[$shard_idx]=$! + echo " PID: ${HARBOR_PIDS[$shard_idx]}, Log: $harbor_log" +} + +for shard_idx in $(seq 0 $((NUM_SHARDS - 1))); do + run_harbor_for_shard $shard_idx +done + +echo "" +echo "All Harbor jobs started. Waiting for completion..." 
+ +FAILED=0 +for shard_idx in $(seq 0 $((NUM_SHARDS - 1))); do + wait ${HARBOR_PIDS[$shard_idx]} || FAILED=$((FAILED + 1)) + echo " Shard $shard_idx completed (exit: $?)" +done + +set -e + +# ============================================================================== +# Save meta.env (per shard) +# ============================================================================== +for shard_idx in $(seq 0 $((NUM_SHARDS - 1))); do + shard_run_dir="${SHARD_RUN_DIRS[$shard_idx]}" + mkdir -p "$shard_run_dir" + { + echo "MODEL=$MODEL" + echo "REPO_ID=$REPO_ID" + echo "TIMESTAMP=$TIMESTAMP" + echo "SLURM_JOB_ID=$SLURM_JOB_ID" + echo "DB_JOB_ID=${DB_JOB_ID:-}" + echo "BENCHMARK_ID=${BENCHMARK_ID:-}" + echo "N_CONCURRENT=$N_CONCURRENT" + echo "N_ATTEMPTS=$N_ATTEMPTS" + echo "AGENT_NAME=$AGENT_NAME" + echo "ENABLE_THINKING=$ENABLE_THINKING" + echo "TIMEOUT_MULTIPLIER=$TIMEOUT_MULTIPLIER" + echo "BENCHMARK_NAME=$BENCHMARK_NAME" + echo "NUM_SHARDS=$NUM_SHARDS" + echo "SHARD_IDX=$shard_idx" + echo "DP_MODE=true" + } > "$shard_run_dir/meta.env" +done + +# ============================================================================== +# Error check per shard + aggregate +# ============================================================================== +echo "" +echo "=== Checking errors across all shards ===" + +TOTAL_INVALID=0 +for shard_idx in $(seq 0 $((NUM_SHARDS - 1))); do + shard_run_dir="${SHARD_RUN_DIRS[$shard_idx]}" + result_file="$shard_run_dir/result.json" + if [ -f "$result_file" ]; then + SHARD_INVALID=$($PYTHON_BIN -c " +import json +try: + with open('$result_file') as f: + data = json.load(f) + BENIGN = {'AgentTimeoutError','ContextLengthExceededError','SummarizationTimeout','SummarizationTimeoutError'} + total = 0 + if 'stats' in data and 'evals' in data['stats']: + for ek, ev in data['stats']['evals'].items(): + if 'exception_stats' in ev: + for et, ids in ev['exception_stats'].items(): + if et not in BENIGN and isinstance(ids, list): + total += len(ids) + 
print(total) +except: print('0') +" 2>&1 | tail -1) + echo " Shard $shard_idx: $SHARD_INVALID invalid errors" + TOTAL_INVALID=$((TOTAL_INVALID + SHARD_INVALID)) + else + echo " Shard $shard_idx: no result.json" + fi +done + +echo "Total invalid errors across all shards: $TOTAL_INVALID (threshold: $ERROR_THRESHOLD)" + +if [ "$TOTAL_INVALID" -gt "$ERROR_THRESHOLD" ]; then + echo "Too many invalid errors ($TOTAL_INVALID > $ERROR_THRESHOLD), skipping upload" + exit 0 +fi + +# ============================================================================== +# Upload results (merge all shard dirs) +# ============================================================================== +echo "" +echo "=== Uploading results ===" + +# Collect all shard job dirs +JOB_DIRS="" +for shard_idx in $(seq 0 $((NUM_SHARDS - 1))); do + shard_run_dir="${SHARD_RUN_DIRS[$shard_idx]}" + if [ -d "$shard_run_dir" ]; then + JOB_DIRS="$JOB_DIRS $shard_run_dir" + fi +done + +echo "Uploading from shard dirs: $JOB_DIRS" + +# Use upload_traces.py to merge and upload all shard results +export PYTHONPATH="${DCFT}:${DCFT}/data/sbatches/teacher_scripts:${PYTHONPATH:-}" +proxied $PYTHON_BIN -c " +import os, sys, re, hashlib + +from database.unified_db.utils import upload_eval_results + + +def sanitize_hf_repo_id(repo_id, max_length=96): + def collapse(s): + prev = None + while s != prev: + prev = s + s = s.replace('--', '-').replace('..', '.') + return s + org, name = repo_id.split('/', 1) if '/' in repo_id else (None, repo_id) + name = re.sub(r'[^A-Za-z0-9._-]', '-', name) + name = collapse(name).strip('-.') + if not name: name = 'repo' + limit = max_length - (len(org) + 1 if org else 0) + if len(name) > limit: + digest = hashlib.sha1(name.encode()).hexdigest()[:8] + keep = max(1, limit - len(digest)) + base = name[:keep].rstrip('-.') + if not base: base = 'r' + name = f'{base}{digest}' + name = collapse(name).strip('-.') + if name[0] in '-.': name = 'r' + name[1:] + if name[-1] in '-.': name = name[:-1] + 
'0' + return f'{org}/{name}' if org else name + + +# Upload each shard separately +shard_dirs = '$JOB_DIRS'.split() +run_tag = os.environ['RUN_TAG'] +username = os.environ.get('UPLOAD_USERNAME', os.environ.get('USER', 'jupiter')) +hf_token = os.environ['HF_TOKEN'] +benchmark_name = os.environ.get('BENCHMARK_NAME', '') + +for shard_dir in shard_dirs: + shard_name = os.path.basename(shard_dir) + hf_repo_id = sanitize_hf_repo_id(f'DCAgent2/{shard_name}') + print(f'[uploader] Uploading shard {shard_name} to {hf_repo_id}') + + benchmark_version_hash = hashlib.sha256(benchmark_name.encode()).hexdigest() + + try: + upload_eval_results( + shard_dir, + username=username, + error_mode='skip_on_error', + hf_token=hf_token, + hf_repo_id=hf_repo_id, + register_benchmark=True, + benchmark_name=benchmark_name, + benchmark_version_hash=benchmark_version_hash, + ) + print(f'[uploader] {shard_name} done.') + except Exception as e: + print(f'[uploader] WARNING: {shard_name} upload failed: {e}') +" 2>&1 +UPLOAD_EXIT=$? + +# Clean up shard directory +rm -rf "$SHARD_DIR" 2>/dev/null || true + +echo "==============================================" +if [ $FAILED -eq 0 ] && [ $UPLOAD_EXIT -eq 0 ]; then + echo "DP Eval COMPLETED ($NUM_SHARDS shards)" +else + echo "DP Eval completed with issues (failed shards: $FAILED, upload exit: $UPLOAD_EXIT)" +fi +echo "==============================================" diff --git a/eval/unified_eval_listener.py b/eval/unified_eval_listener.py new file mode 100644 index 00000000..195d3f55 --- /dev/null +++ b/eval/unified_eval_listener.py @@ -0,0 +1,3756 @@ +#!/usr/bin/env python3 +""" +Unified Eval Listener v6 - Polls Supabase for models and submits SLURM eval jobs. + +Based on v5, with disk-based resume replacing the v5 DB-based DaytonaError resume. +Scans the eval jobs directory for incomplete/error-heavy jobs and resubmits them +with the same run_tag so harbor auto-resumes (skips completed trials, retries failed ones). 
+ +Key v6 features over v5: + - Disk-based resume: scans --jobs-dir for incomplete/errored job dirs + - Persistent sliding-window batch_size across iterations + - hf_overrides support in baseline model configs + - Supabase queries wrapped in try/except for resilience +Uses unified_eval_harbor.sbatch as the SLURM job template. + + +=============================================================================== +FLAG REFERENCE +=============================================================================== + +--- Preset & Dataset Selection --- + +--preset, -p {aider,bfcl,swebench,v2,tb2,v1} + Load a named preset that bundles dataset, concurrency, error threshold, and + other defaults tuned for a specific benchmark. CLI flags override any preset + value. Almost all runs should start with a preset. + + Preset details: + aider Dataset: DCAgent2/aider_polyglot. n_concurrent=32, error_threshold=10, thinking=on. + bfcl Dataset: DCAgent2/bfcl-parity. n_concurrent=32, error_threshold=10, thinking=on, vllm_retries=20. + swebench Dataset: DCAgent2/swebench-verified-*. n_concurrent=32, error_threshold=15, thinking=on, vllm_retries=20, + agent_parser=xml, gpu_mem=0.95, config_yaml=no_override. HF existence check on. + v2 Dataset: DCAgent/dev_set_v2. n_concurrent=32, error_threshold=10, thinking=on, vllm_retries=20. + tb2 Dataset: DCAgent2/terminal_bench_2. n_concurrent=32, error_threshold=10, thinking=on, + gpu_mem=0.95, slurm_time=48h, config_yaml=no_override. + v1 Dataset: DCAgent/dev_set_71_tasks. n_concurrent=32, error_threshold=10, thinking=on, vllm_retries=20. + + Tuning: Pick the preset matching your benchmark. Override individual params + with CLI flags (e.g. --n-concurrent 64 to double concurrency). + +--datasets, -d + Comma- or space-separated list of HuggingFace dataset repos. Overrides the + preset's dataset list. Use this for one-off evals against custom datasets. 
+ Example: --datasets "DCAgent/dev_set_v2,DCAgent2/terminal_bench_2" + +--sbatch-script, -s + Path to the sbatch template. Default: unified_eval_harbor_v4.sbatch (or + whatever the preset specifies). Only change this if you have a custom sbatch. + + +--- Model Filtering --- + +--priority-file + Text file listing HuggingFace model names (org/model), one per line. + Lines starting with # are comments; blank lines are ignored. + File order = submission priority: earlier lines are submitted first. + Hot-reloaded every iteration — edit the file without restarting the listener. + + Env: EVAL_LISTENER_PRIORITY_FILE + +--priority-mode {filter_only,priority_first} [default: filter_only] + filter_only — Only evaluate models IN the priority file. All others skipped. + priority_first — Evaluate ALL models, but submit priority models first. + + Tuning: Use filter_only (default) when you have a curated list of models to + evaluate. Use priority_first when you want to evaluate everything but ensure + specific models get SLURM slots first. + + Env: EVAL_LISTENER_PRIORITY_MODE + +--require-priority-list + Safety flag. If set and no priority file is loaded (missing file or empty), + the listener skips ALL models instead of evaluating everything. Prevents + accidental mass submissions when a priority file path is misconfigured. + + Env: EVAL_LISTENER_REQUIRE_PRIORITY_LIST="1" + +--blacklist-file [v4 NEW] + Text file listing models that should NEVER be submitted, same format as + --priority-file (one model per line, # comments, blank lines ignored). + Blacklist overrides priority: if a model appears in both files, it is blocked. + Hot-reloaded every iteration, same as --priority-file. + + Tuning: Use this to permanently exclude known-bad models (e.g. broken + checkpoints, models that consistently OOM, duplicates you don't want to + re-evaluate). Faster than removing them from the priority file because + the blacklist is checked first — no DB queries wasted on blocked models. 
+ + Env: EVAL_LISTENER_BLACKLIST_FILE + +--check-hf-exists + Before submitting, validate that the model actually exists on HuggingFace Hub. + Adds a network round-trip per model but prevents wasted SLURM jobs on typos + or deleted models. The swebench preset enables this by default. + + Env: EVAL_LISTENER_CHECK_HF_EXISTS="1" + + +--- Timing & Lifecycle --- + +--lookback-days [default: 1000] + How far back to query the Supabase `models` table (by creation_time). + Priority models bypass this window — they are always fetched by name + regardless of when they were added. + + Tuning: Keep this large (default 1000) to catch old models. Reduce only if + DB queries are slow and you know all target models are recent. + + Env: EVAL_LISTENER_LOOKBACK_DAYS + +--check-hours [default: 4.0] + Hours to sleep between iterations. Each iteration re-queries the DB, hot- + reloads priority/blacklist files, and submits any new jobs. + + Tuning: For active development with frequent model uploads, use 1-2h. + For stable production runs, 4-12h is fine. Ignored when --once is set. + + Env: EVAL_LISTENER_CHECK_HOURS + +--stale-hours [default: 24] + A job in "Started" status older than this is considered stale and will be + resubmitted. Covers cases where the sbatch job crashed without updating + the DB to Finished. + + Tuning: Set to at least 1.5x your SLURM time limit. If --slurm-time is + 24:00:00, keep this at 24 (default). If you use --slurm-time 48:00:00 + (like tb2), bump to 48-72. + +--stale-pending-hours [default: 48] + A job in "Pending" status older than this is considered stale. The listener + will scancel the old SLURM job (if tracked) and resubmit. + + Tuning: Should be >= --stale-hours. Default of 48h gives Pending jobs extra + time to get through the SLURM queue before being killed. + + +--- Sbatch / vLLM Parameters (passed to sbatch via env vars) --- + +--n-concurrent [default: 64, preset overrides] + Number of concurrent Harbor evaluation jobs inside the sbatch. 
Controls how
+    many sandbox tasks run in parallel against the vLLM server.
+
+    Tuning: Depends on model size and GPU memory.
+    - 7-8B models on GH200 (96GB): 32-64 is safe.
+    - 32B models: 8-16 (higher causes vLLM queue buildup → AgentTimeoutError).
+    - 131K context models: 4-8 (KV cache fills fast at high concurrency).
+    If you see many AgentTimeoutErrors, reduce this. If eval is slow and vLLM
+    GPU utilization is low, increase it.
+
+--n-attempts [default: 3]
+    Number of retry attempts per Harbor task. If a task fails (e.g. sandbox
+    timeout), Harbor retries it up to this many times.
+
+    Tuning: 3 is good for most benchmarks. Raise to 5 for flaky benchmarks.
+    Lowering to 1 speeds up runs but increases noise from transient failures.
+
+--gpu-memory-util [default: 0.9]
+    Fraction of GPU memory allocated to vLLM via --gpu-memory-utilization.
+
+    Tuning:
+    - 0.90 (default): safe for 7-8B models on GH200 (96GB). Leaves headroom
+      for GPU memory variance across nodes.
+    - 0.95: useful for larger models or when you need maximum KV cache
+      capacity. Risk: some GH200 nodes have slightly less available memory
+      and will OOM at 0.95 (use --exclude in sbatch).
+    - Never go above 0.95. Below 0.85 wastes memory.
+
+--error-threshold [default: 10, preset overrides]
+    Maximum number of "invalid" errors allowed before the sbatch script aborts
+    result upload. Invalid = any error type EXCEPT AgentTimeoutError,
+    ContextLengthExceededError, SummarizationTimeout, SummarizationTimeoutError.
+
+    Tuning: Controls quality gating. Low values (e.g. 3) are strict — a few
+    DaytonaErrors or unexpected crashes abort the upload. Higher values (10-15)
+    are more tolerant, appropriate for benchmarks where some sandbox flakiness
+    is expected. All bundled presets currently use 10.
+
+    --daytona-threshold is a backward-compatible alias for this flag.
+
+--vllm-max-retries [default: 5, preset overrides]
+    Number of times the sbatch script retries starting the vLLM server.
+    vLLM occasionally fails to start on first attempt (port conflicts,
+    CUDA initialization issues).
+
+    Tuning: 5 is fine for quick detection of real failures. The bfcl preset
+    uses 20 for extra resilience on busy clusters; most other presets use 10.
+
+--agent-parser [default: "" (none)]
+    Parser type for Harbor agent output. Set to "xml" for swebench (which
+    uses XML-structured agent responses). Leave empty for all other benchmarks.
+
+    Tuning: Only change this if you're adding a new benchmark with a custom
+    agent output format. The swebench preset sets this automatically.
+
+--slurm-time [default: "12:00:00"]
+    SLURM wall-clock time limit for the sbatch job. Format: HH:MM:SS.
+
+    Tuning: 12-24h is enough for most benchmarks. Long-running benchmarks such
+    as terminal_bench_2 may need 48h. If jobs are hitting the time
+    limit and getting killed, increase this and also bump --stale-hours.
+
+--slurm-partition [default: "booster"]
+    SLURM partition to submit jobs to. (On TACC, "gh" is the GH200 GPU partition.)
+
+--agent-name [default: "terminus-2"]
+    Agent name written to DB entries and used by Harbor for evaluation config.
+    This determines which agent implementation Harbor uses to run the eval tasks.
+
+--enable-thinking
+    Enable thinking/reasoning blocks in vLLM model inference. Most presets
+    enable this by default. Only disable if the model doesn't support thinking
+    or you want to test non-thinking mode.
+
+--upload-username [default: current OS user]
+    Username recorded in DB entries and result uploads. Auto-detected from
+    the OS user if not specified.
+
+    Env: EVAL_UPLOAD_USERNAME
+
+
+--- v3 Enhancement: Per-Listener SLURM Throttle ---
+
+--max-jobs-submitted [default: 20]
+    Maximum number of active SLURM jobs this listener instance is allowed to
+    have running simultaneously.
The listener tracks which SLURM job IDs it + submitted and checks squeue to count only those still active. + + Tuning: This is PER-LISTENER, not global. Multiple listeners can run in + parallel with independent budgets. Set based on your fair-share allocation: + - Single listener: 10-20 is typical. + - Multiple listeners: split your budget (e.g. v2=10, swebench=5). + When the limit is reached, the listener queues submissions by priority + order and drops the lowest-priority ones. + + Env: EVAL_LISTENER_MAX_JOBS + + +--- v3 Enhancement: Daytona Resource Pre-flight --- + +--check-daytona-resources + Enable Daytona API sandbox count check at startup and each iteration. + If active sandboxes are at or above the limit, the listener skips that + iteration entirely. Requires DAYTONA_API_KEY in environment. + + Tuning: Enable this in production to prevent overwhelming the Daytona + sandbox pool. Not needed for small-scale or development runs. + +--daytona-sandbox-limit [default: 2000] + Maximum expected active sandboxes. The listener skips submissions when + the active count reaches this number. + +--daytona-warning-buffer [default: 0.9] + Fraction of the sandbox limit at which a warning is logged. At 0.9 with + limit=2000, warns when active sandboxes reach 1800. + + +--- v3 Enhancement: Model Retry Tracking --- + +--track-model-retries + Enable tracking of how many times each model has been started. Models + exceeding the retry threshold are deprioritized (moved to end of the + submission queue, not blocked entirely). + + Tuning: Enable this for long-running listeners to prevent repeatedly + resubmitting models that keep failing. The sbatch script appends to the + shared log when transitioning a job from Pending → Started. + +--model-retry-threshold [default: 5] + Number of eval starts before a model is deprioritized. Deprioritized + models are still submitted, just last in the queue (and may be dropped + if --max-jobs-submitted truncates the list). 
+ + Tuning: 3-5 for strict environments. Higher (10+) if transient failures + are common and you want to give models more chances. + +--eval-starts-log [default: auto-generated] + Path to the shared append-only log file where eval starts are recorded. + Auto-generated with a benchmark+timestamp suffix if not specified. + Multiple listeners using the same log file will share retry counts. + + Tuning: If you run multiple listeners for the same benchmark and want + shared retry tracking, point them at the same log file. + + +--- v3 Enhancement: Timeout-Config-Sensitive Dedup --- + +--timeout-aware + Change job dedup logic to check model + benchmark + agent + timeout_multiplier + instead of just model + benchmark. This allows running the same model with + different timeout configurations without one blocking the other. + + Tuning: Enable when running A/B experiments with different timeout settings. + When disabled (default), two listeners submitting the same model with + different --timeout-multiplier values will conflict (one sees the other's + job and skips). + +--timeout-multiplier [default: 1.0] + Harbor timeout multiplier, passed to the sbatch job and stored in the DB + job config. Values >1.0 give tasks more time; <1.0 makes them stricter. + + Tuning: Use with --timeout-aware for controlled experiments: + --timeout-multiplier 0.25 (aggressive timeout, fast failures) + --timeout-multiplier 1.0 (default) + --timeout-multiplier 2.0 (lenient, for slow models) + --timeout-multiplier 4.0 (very lenient, for debugging) + + +--- Execution Mode --- + +--dry-run + Preview mode: runs one full iteration (DB queries, filtering, status checks) + but does NOT submit any sbatch jobs. Logs what WOULD be submitted. Implies + --once. Use this to verify your flags before a real run. + + Env: EVAL_LISTENER_DRY_RUN="1" + +--once + Run a single iteration and exit. Useful for cron-triggered runs or one-shot + submissions. 
Without this, the listener loops forever (sleeping --check-hours + between iterations). + +--verbose, -v + Enable detailed logging: shows every model skipped (with reason), priority + list contents, blacklist contents, and per-model DB status checks. + +--log-file + Explicit log file path. Default: auto-generated in experiments/listener_logs/ + with a preset+timestamp name. + + Env: EVAL_LISTENER_LOG_DIR (for the directory) + + +=============================================================================== +ENVIRONMENT VARIABLES (all optional, CLI args take precedence) +=============================================================================== + + EVAL_LISTENER_LOOKBACK_DAYS Days to look back for models (default: 1000) + EVAL_LISTENER_CHECK_HOURS Hours between iterations (default: 4.0) + EVAL_LISTENER_SBATCH SBATCH script to use + EVAL_LISTENER_LOG_DIR Log directory (default: experiments/listener_logs) + EVAL_LISTENER_DATASETS Comma/space/newline list of HF dataset repos + EVAL_LISTENER_PRIORITY_FILE Path to priority models file (hot-reloaded) + EVAL_LISTENER_BLACKLIST_FILE Path to blacklist models file (hot-reloaded) [v4] + EVAL_LISTENER_DRY_RUN "1" or "true" to enable dry run mode + EVAL_LISTENER_REQUIRE_PRIORITY_LIST "1" or "true" to require priority list + EVAL_LISTENER_PRIORITY_MODE "filter_only" or "priority_first" + EVAL_LISTENER_CHECK_HF_EXISTS "1" or "true" to validate HF model existence + EVAL_LISTENER_MAX_JOBS Per-listener SLURM job limit (default: 20) + EVAL_UPLOAD_USERNAME Username for DB entries (default: OS user) + DAYTONA_API_KEY Required for --check-daytona-resources + + +=============================================================================== +QUICK START EXAMPLES +=============================================================================== + + # Most common: evaluate priority models on dev_set_v2 + python unified_eval_listener_v4.py --preset v2 \\ + --priority-file v2_priority_models_richard.txt + + # Preview what would be 
submitted (no actual jobs) + python unified_eval_listener_v4.py --preset v2 --dry-run --once \\ + --priority-file v2_priority_models_richard.txt --verbose + + # Block known-bad models + python unified_eval_listener_v4.py --preset v2 \\ + --priority-file v2_priority_models_richard.txt \\ + --blacklist-file bad_models.txt + + # Full v3/v4 features enabled + python unified_eval_listener_v4.py --preset v2 \\ + --priority-file v2_priority_models_richard.txt \\ + --blacklist-file bad_models.txt \\ + --error-threshold 10 --max-jobs-submitted 15 \\ + --check-daytona-resources \\ + --track-model-retries --model-retry-threshold 3 \\ + --timeout-aware --timeout-multiplier 2.0 + + # Two listeners with independent SLURM budgets + python unified_eval_listener_v4.py --preset v2 --max-jobs-submitted 10 & + python unified_eval_listener_v4.py --preset swebench --max-jobs-submitted 5 & + + # A/B timeout experiment (requires --timeout-aware on both) + python unified_eval_listener_v4.py --preset v2 --timeout-aware \\ + --timeout-multiplier 1.0 --max-jobs-submitted 10 & + python unified_eval_listener_v4.py --preset v2 --timeout-aware \\ + --timeout-multiplier 2.0 --max-jobs-submitted 5 & +""" + +import argparse +import getpass +import json +import os +import re +import subprocess +import sys +import time +from collections import Counter +from dataclasses import dataclass, field +from datetime import datetime, timedelta, timezone +from pathlib import Path +from typing import Any, Dict, List, Optional, Set, Tuple + +import yaml + +# Add leaderboard utilities to path +# Add project root to path for database.unified_db imports +_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +_PROJECT_ROOT = os.path.abspath(os.path.join(_SCRIPT_DIR, "..", "..")) +if _PROJECT_ROOT not in sys.path: + sys.path.insert(0, _PROJECT_ROOT) + +from database.unified_db.utils import get_supabase_client, load_supabase_keys + + +# --------------------------------------------------------------------------- +# 
Secrets loading (Jupiter-specific: load ~/secrets.env at import time) +# --------------------------------------------------------------------------- +def _load_secrets(path: Optional[str] = None) -> None: + """Load secrets from env file, then call unified_db's load_supabase_keys.""" + path = ( + path + or os.environ.get("DC_AGENT_SECRET_ENV") + or os.environ.get("KEYS") + or os.path.expanduser("~/secrets.env") + ) + if path and os.path.isfile(os.path.expanduser(path)): + with open(os.path.expanduser(path)) as f: + for line in f: + line = line.strip() + if not line or line.startswith("#") or "=" not in line: + continue + if line.startswith("export "): + line = line[7:].strip() + k, v = line.split("=", 1) + os.environ[k.strip()] = v.strip().strip("'\"") + # Alias SUPABASE_KEY -> SUPABASE_ANON_KEY if the latter is missing + # (some secrets.env files use the shorter name) + if os.environ.get("SUPABASE_KEY") and not os.environ.get("SUPABASE_ANON_KEY"): + os.environ["SUPABASE_ANON_KEY"] = os.environ["SUPABASE_KEY"] + try: + load_supabase_keys() + except Exception: + pass + + +# --------------------------------------------------------------------------- +# Harbor config parsing -- extract eval config fields for dedup +# --------------------------------------------------------------------------- +def parse_harbor_eval_config(path: Optional[str]) -> Dict: + """Parse eval-relevant config fields from a Harbor YAML config. + + Returns dict with keys: timeout_multiplier, override_cpus, + override_memory_mb, override_storage_mb (only if set). 
+ """ + if not path or not os.path.isfile(path): + return {} + try: + import yaml + with open(path) as f: + cfg = yaml.safe_load(f) or {} + except Exception as e: + log(f"WARNING: failed to parse harbor config {path}: {e}") + return {} + result: Dict = {} + if cfg.get("timeout_multiplier") is not None: + result["timeout_multiplier"] = float(cfg["timeout_multiplier"]) + env_cfg = cfg.get("environment") or {} + for key in ("override_cpus", "override_memory_mb", "override_storage_mb"): + if env_cfg.get(key) is not None: + result[key] = int(env_cfg[key]) + return result + + +# --------------------------------------------------------------------------- +# Baseline model config mapping -- per-model vLLM overrides +# --------------------------------------------------------------------------- +_BASELINE_MODEL_CONFIGS: Optional[Dict[str, Dict]] = None +_BASELINE_MODEL_PATTERNS: Optional[List[Dict]] = None + + +def load_baseline_model_configs(path: Optional[str]) -> Dict[str, Dict]: + """Load baseline model -> vLLM config mapping from YAML file. + + Returns dict mapping HF model name to vLLM serving params. + Also loads pattern-based fallback configs (stored in _BASELINE_MODEL_PATTERNS). + """ + global _BASELINE_MODEL_CONFIGS, _BASELINE_MODEL_PATTERNS + if _BASELINE_MODEL_CONFIGS is not None: + return _BASELINE_MODEL_CONFIGS + + if not path or not os.path.isfile(path): + _BASELINE_MODEL_CONFIGS = {} + _BASELINE_MODEL_PATTERNS = [] + return _BASELINE_MODEL_CONFIGS + + try: + import yaml + with open(path) as f: + data = yaml.safe_load(f) or {} + + # Start with per-model entries + per_model = data.get("models", {}) + + # Expand groups: each group has a "models" list + shared config fields. + # Group config is the base; per-model entries are merged on top (override wins). 
+ expanded: Dict[str, Dict] = {} + for group in data.get("groups", []): + model_names = group.get("models", []) + shared_cfg = {k: v for k, v in group.items() if k != "models"} + for name in model_names: + expanded[name] = dict(shared_cfg) # copy so mutations are isolated + + # Merge per-model overrides on top of group defaults + for name, overrides in per_model.items(): + if name in expanded: + expanded[name].update(overrides) + else: + expanded[name] = dict(overrides) + + _BASELINE_MODEL_CONFIGS = expanded + _BASELINE_MODEL_PATTERNS = data.get("patterns", []) + n_groups = len(data.get("groups", [])) + log(f"Loaded {len(_BASELINE_MODEL_CONFIGS)} baseline model config(s) " + f"({n_groups} group(s), {len(per_model)} override(s)) and " + f"{len(_BASELINE_MODEL_PATTERNS)} pattern(s) from {path}") + except Exception as e: + log(f"WARNING: failed to load baseline model configs from {path}: {e}") + _BASELINE_MODEL_CONFIGS = {} + _BASELINE_MODEL_PATTERNS = [] + + return _BASELINE_MODEL_CONFIGS + + +def _match_pattern_config(hf_model: str) -> Optional[Dict]: + """Try to match a model name against pattern-based configs. + + Patterns are checked in order; first match wins. + Each pattern has a 'match' field (regex or substring) and config fields. + """ + if not _BASELINE_MODEL_PATTERNS: + return None + for pattern_entry in _BASELINE_MODEL_PATTERNS: + pattern = pattern_entry.get("match", "") + if not pattern: + continue + if re.search(pattern, hf_model): + return {k: v for k, v in pattern_entry.items() if k != "match"} + return None + + +def get_vllm_env_overrides(hf_model: str, configs: Dict[str, Dict]) -> Dict[str, str]: + """Get vLLM env var overrides for a model from the baseline config mapping. + + Tries exact model name match first, then falls back to pattern matching. + Returns dict of EVAL_VLLM_* env vars to pass to the eval script. 
+ """ + match_source = None + cfg = configs.get(hf_model) + if cfg: + match_source = "exact/group" + else: + cfg = _match_pattern_config(hf_model) + if cfg: + match_source = "pattern" + if not cfg: + return {} + + log(f" Baseline config [{match_source}] for {hf_model}: {cfg}") + + env: Dict[str, str] = {} + if cfg.get("tensor_parallel_size") is not None: + env["EVAL_VLLM_TENSOR_PARALLEL_SIZE"] = str(cfg["tensor_parallel_size"]) + if cfg.get("max_model_len") is not None: + env["EVAL_VLLM_MAX_MODEL_LEN"] = str(cfg["max_model_len"]) + if cfg.get("swap_space") is not None: + env["EVAL_VLLM_SWAP_SPACE"] = str(cfg["swap_space"]) + if cfg.get("trust_remote_code"): + env["EVAL_VLLM_TRUST_REMOTE_CODE"] = "1" + if cfg.get("tool_call_parser"): + env["EVAL_VLLM_TOOL_CALL_PARSER"] = cfg["tool_call_parser"] + if cfg.get("reasoning_parser"): + env["EVAL_VLLM_REASONING_PARSER"] = cfg["reasoning_parser"] + if cfg.get("extra_args"): + env["EVAL_VLLM_EXTRA_ARGS"] = cfg["extra_args"] + if cfg.get("hf_overrides"): + env["EVAL_VLLM_HF_OVERRIDES"] = cfg["hf_overrides"] + + return env + + +def get_conda_env_override(hf_model: str, configs: Dict[str, Dict]) -> Optional[str]: + """Get conda_env override for a model from the baseline config mapping. + + Tries exact/group match first, then pattern match. Returns the conda_env + string (e.g. "otagent2") or None if no override is configured. + """ + cfg = configs.get(hf_model) + if not cfg: + cfg = _match_pattern_config(hf_model) + if cfg and cfg.get("conda_env"): + return cfg["conda_env"] + return None + + +# ---------- v6: Disk-Based Resume Scanner ---------- + +# Infrastructure errors that harbor's resume filters will retry +INFRA_ERROR_TYPES = { + "DaytonaError", + "EnvironmentStartTimeoutError", + "DaytonaRateLimitError", + "CancelledError", + "SandboxBuildFailedError", + "AgentEnvironmentTimeoutError", +} + + +def _parse_job_dir(job_dir: Path) -> Optional[Dict]: + """Parse a harbor job directory, extracting model/dataset/progress info. 
+ + Returns dict with keys: run_tag, hf_model, dataset, n_completed, n_total, + finished_at, infra_errors, total_errors, db_job_id, slurm_job_id, resume_count. + Returns None if dir is not a valid harbor job dir. + """ + config_path = job_dir / "config.json" + if not config_path.exists(): + return None + + run_tag = job_dir.name + info: Dict = { + "run_tag": run_tag, + "hf_model": None, + "dataset": None, + "n_completed": 0, + "n_total": 0, + "finished_at": None, + "infra_errors": 0, + "total_errors": 0, + "db_job_id": None, + "slurm_job_id": None, + "resume_count": 0, + } + + # Parse config.json for model and dataset + try: + import json as _json + config = _json.loads(config_path.read_text()) + agents = config.get("agents", []) + if agents and isinstance(agents, list): + model_name = agents[0].get("model_name", "") + # Strip "hosted_vllm/" prefix + if model_name.startswith("hosted_vllm/"): + model_name = model_name[len("hosted_vllm/"):] + info["hf_model"] = model_name or None + datasets = config.get("datasets", []) + if datasets and isinstance(datasets, list): + ds_path = datasets[0].get("path", "") + if ds_path: + # Extract dataset name from path: /e/.../DCAgent_dev_set_v2 → DCAgent/dev_set_v2 + ds_name = Path(ds_path).name # e.g., "DCAgent_dev_set_v2" + # Convert first underscore to slash (org/name convention) + parts = ds_name.split("_", 1) + if len(parts) == 2: + info["dataset"] = f"{parts[0]}/{parts[1]}" + else: + info["dataset"] = ds_name + except Exception: + pass + + # Parse meta.env for DB_JOB_ID, SLURM_JOB_ID, RESUME_COUNT + meta_path = job_dir / "meta.env" + if meta_path.exists(): + try: + for line in meta_path.read_text().splitlines(): + if line.startswith("DB_JOB_ID="): + info["db_job_id"] = line.split("=", 1)[1].strip() or None + elif line.startswith("SLURM_JOB_ID="): + info["slurm_job_id"] = line.split("=", 1)[1].strip() or None + elif line.startswith("RESUME_COUNT="): + try: + info["resume_count"] = int(line.split("=", 1)[1].strip()) + except 
ValueError: + pass + elif line.startswith("MODEL=") and not info["hf_model"]: + info["hf_model"] = line.split("=", 1)[1].strip() or None + except Exception: + pass + + # Parse result.json for progress + result_path = job_dir / "result.json" + if result_path.exists(): + try: + import json as _json + result = _json.loads(result_path.read_text()) + info["n_total"] = result.get("n_total_trials", 0) + stats = result.get("stats", {}) + info["n_completed"] = stats.get("n_trials", 0) + info["finished_at"] = result.get("finished_at") + + # Count infrastructure errors + infra_count = 0 + total_err_count = 0 + for eval_data in stats.get("evals", {}).values(): + for exc_type, ids in eval_data.get("exception_stats", {}).items(): + n = len(ids) if isinstance(ids, list) else 1 + total_err_count += n + if exc_type in INFRA_ERROR_TYPES: + infra_count += n + info["infra_errors"] = infra_count + info["total_errors"] = total_err_count + except Exception: + pass + + return info + + +def scan_jobs_dir_for_resume( + jobs_dir: str, + dataset_prefixes: List[str], + active_slurm_ids: Set[str], + infra_error_threshold: int = 3, + max_resume_count: int = 5, +) -> List[Dict]: + """Scan eval jobs directory for jobs that need to be resumed. 
+ + Args: + jobs_dir: Path to the eval jobs directory + dataset_prefixes: List of dataset name prefixes to filter (e.g., ["dev_set_v2"]) + active_slurm_ids: Set of SLURM job IDs currently in squeue + infra_error_threshold: Min infra errors to trigger resume for PARTIAL jobs + max_resume_count: Skip dirs with RESUME_COUNT >= this (prevent infinite loops) + + Returns: + List of dicts with keys: hf_model, dataset, run_tag, reason, db_job_id + """ + jobs_path = Path(jobs_dir) + if not jobs_path.is_dir(): + log(f"[v6-resume] Jobs dir not found: {jobs_dir}") + return [] + + # Build prefix patterns from dataset names + # "DCAgent/dev_set_v2" → "dev_set_v2_" + # Must normalize hyphens/dots to underscores to match generate_run_tag() output + dir_prefixes = [] + for ds in dataset_prefixes: + # Dataset format: "DCAgent/dev_set_v2" or "DCAgent2/terminal_bench_2" + ds_short = ds.split("/")[-1] if "/" in ds else ds + ds_safe = ds_short.replace("-", "_").replace(".", "_") + dir_prefixes.append(f"{ds_safe}_") + + candidates = [] + scanned = 0 + skipped_active = 0 + skipped_done = 0 + skipped_resume_limit = 0 + + for entry in sorted(jobs_path.iterdir()): + if not entry.is_dir(): + continue + + # Filter by dataset prefix + if not any(entry.name.startswith(p) for p in dir_prefixes): + continue + + info = _parse_job_dir(entry) + if info is None: + continue + scanned += 1 + + # Skip if SLURM job still running + if info["slurm_job_id"] and info["slurm_job_id"] in active_slurm_ids: + skipped_active += 1 + continue + + # Skip if resume count too high + if info["resume_count"] >= max_resume_count: + skipped_resume_limit += 1 + continue + + # Classify job state + n_completed = info["n_completed"] + n_total = info["n_total"] + finished_at = info["finished_at"] + infra_errors = info["infra_errors"] + + reason = None + + if n_total == 0 and not (jobs_path / entry.name / "result.json").exists(): + # EARLY_KILL: killed before any trial completed + reason = f"early_kill (no result.json, resume 
#{info['resume_count']+1})" + elif n_completed < n_total and finished_at is None: + # INCOMPLETE: SLURM killed mid-run + reason = f"incomplete ({n_completed}/{n_total} trials, resume #{info['resume_count']+1})" + elif n_completed < n_total and finished_at is not None: + # PARTIAL: harbor finished but some trials failed + if infra_errors > infra_error_threshold: + reason = f"partial ({n_completed}/{n_total}, {infra_errors} infra errors, resume #{info['resume_count']+1})" + elif n_completed == n_total: + # DONE: all trials completed + if infra_errors > infra_error_threshold: + reason = f"done_with_errors ({n_completed}/{n_total}, {infra_errors} infra errors, resume #{info['resume_count']+1})" + else: + skipped_done += 1 + continue + else: + continue + + if reason and info["hf_model"]: + candidates.append({ + "hf_model": info["hf_model"], + "dataset": info["dataset"], + "run_tag": info["run_tag"], + "reason": f"v6_resume: {reason}", + "db_job_id": info["db_job_id"], + }) + + log(f"[v6-resume] Scanned {scanned} job dirs: " + f"{len(candidates)} resume candidates, " + f"{skipped_active} still running, " + f"{skipped_done} completed, " + f"{skipped_resume_limit} at resume limit") + + return candidates + + +# ---------- Preset Definitions ---------- +# Each preset can configure: +# - datasets: list of HF dataset repos +# - sbatch_script: sbatch script to use (default: unified_eval_harbor_v4.sbatch) +# - log_suffix: suffix for log file +# - check_hf_exists: validate model exists on HuggingFace +# - n_concurrent: Harbor --n-concurrent (default: 64) +# - n_attempts: Harbor --n-attempts (default: 3) +# - gpu_memory_util: VLLM --gpu-memory-utilization (default: 0.9) +# - error_threshold: Max invalid errors before abort (default: 3) +# - vllm_max_retries: VLLM startup retries (default: 5) +# - agent_parser: Agent parser type (default: "", use "xml" for swebench) +# - slurm_time: SLURM time limit (default: "24:00:00") +PRESETS: Dict[str, Dict] = { + "aider": { + "datasets": 
["DCAgent2/aider_polyglot"], + "log_suffix": "aider", + "n_concurrent": 32, + "error_threshold": 10, + "enable_thinking": True, + }, + "bfcl": { + "datasets": ["DCAgent2/bfcl-parity"], + "log_suffix": "bfcl", + "n_concurrent": 32, + "error_threshold": 10, + "vllm_max_retries": 20, + "enable_thinking": True, + }, + # NOTE: swebench and tb2 use dcagent_eval_config_no_override.yaml (no model overrides) + "swebench": { + "datasets": ["DCAgent2/swebench-verified-random-100-folders"], + "log_suffix": "swebench", + "n_concurrent": 32, + "error_threshold": 10, + "agent_parser": "xml", + "vllm_max_retries": 10, + "enable_thinking": True, + "config_yaml": "dcagent_eval_config_no_override.yaml", + }, + "v2": { + "datasets": ["DCAgent/dev_set_v2"], + "log_suffix": "v2", + "n_concurrent": 32, + "error_threshold": 10, + "vllm_max_retries": 10, + "enable_thinking": True, + "config_yaml": "dcagent_eval_config_no_override.yaml", + "auto_snapshot": True, + }, + "tb2": { + "datasets": ["DCAgent2/terminal_bench_2"], + "log_suffix": "tb2", + "n_concurrent": 32, + "error_threshold": 10, + "enable_thinking": True, + "vllm_max_retries": 10, + "config_yaml": "dcagent_eval_config_no_override.yaml", + "auto_snapshot": True, + }, + "v1": { + "datasets": ["DCAgent/dev_set_71_tasks"], + "log_suffix": "v1", + "n_concurrent": 32, + "error_threshold": 10, + "vllm_max_retries": 10, + "enable_thinking": True, + }, +} + +# ---------- Cluster Config ---------- +_CLUSTER_CONFIG_REQUIRED_KEYS = ["cluster_name", "slurm_partition", "paths"] +_CLUSTER_CONFIG_REQUIRED_PATHS = ["eval_jobs_dir", "sbatch_script"] + +# Global cluster config (set by --cluster-config, None = use hardcoded defaults) +_CLUSTER_CONFIG: Optional[Dict[str, Any]] = None + + +def load_cluster_config(path: str) -> Dict[str, Any]: + """Load and validate a cluster config YAML. + + Returns the parsed config dict. Raises SystemExit on validation failure. 
    """
    # Expand ~ in the config *path* itself before checking existence.
    path = os.path.expanduser(path)
    if not os.path.isfile(path):
        print(f"ERROR: Cluster config not found: {path}")
        sys.exit(2)

    with open(path) as f:
        cfg = yaml.safe_load(f)

    # An empty file or a YAML scalar/list is a user error — fail loudly with exit 2.
    if not isinstance(cfg, dict):
        print(f"ERROR: Cluster config must be a YAML mapping, got {type(cfg).__name__}")
        sys.exit(2)

    # Expand $USER / ${USER} and ~ in all string values (paths, conda env dirs, etc.)
    # Applied recursively so nested mappings and lists are covered too.
    def _expand(obj):
        if isinstance(obj, str):
            return os.path.expandvars(os.path.expanduser(obj))
        elif isinstance(obj, dict):
            return {k: _expand(v) for k, v in obj.items()}
        elif isinstance(obj, list):
            return [_expand(v) for v in obj]
        return obj
    cfg = _expand(cfg)

    # Schema validation: required top-level keys first ...
    for key in _CLUSTER_CONFIG_REQUIRED_KEYS:
        if key not in cfg:
            print(f"ERROR: Cluster config missing required key: {key}")
            sys.exit(2)

    # ... then required entries under the "paths" mapping.
    paths = cfg.get("paths", {})
    for key in _CLUSTER_CONFIG_REQUIRED_PATHS:
        if key not in paths:
            print(f"ERROR: Cluster config paths.{key} is required")
            sys.exit(2)

    return cfg


def _cc_get(key: str, default: Any = None) -> Any:
    """Get a top-level key from the cluster config, or *default* if not loaded."""
    if _CLUSTER_CONFIG is None:
        return default
    return _CLUSTER_CONFIG.get(key, default)


def _cc_path(key: str, default: Any = None) -> Any:
    """Get a paths.* key from the cluster config, or *default*."""
    if _CLUSTER_CONFIG is None:
        return default
    return _CLUSTER_CONFIG.get("paths", {}).get(key, default)


# ---------- Constants ----------
# Matches HF model/dataset URLs and captures (org, repo); the repo group stops
# at '/', whitespace, '#' or '?' so fragments/queries are excluded.
HF_URL_RE = re.compile(r'https?://(?:www\.)?huggingface\.co/([^/\s]+)/([^/\s#?]+)')
# Canonical sandbox_jobs.job_status values used throughout the dedup logic.
JOB_STATUS_PENDING = "Pending"
JOB_STATUS_STARTED = "Started"
JOB_STATUS_FINISHED = "Finished"
JOB_STATUS_FAILED = "Failed"
DEFAULT_STALE_JOB_HOURS = 24       # Started jobs older than this are re-submittable
DEFAULT_STALE_PENDING_HOURS = 48   # Pending jobs get a longer grace period
DEFAULT_LOOKBACK_DAYS = 1000       # effectively "all history" for the models query
DEFAULT_CHECK_HOURS = 4.0          # listener polling interval
DEFAULT_LOG_DIR = "experiments/listener_logs"

# Sbatch parameter defaults
DEFAULT_N_CONCURRENT = 64
DEFAULT_N_ATTEMPTS = 3
DEFAULT_GPU_MEMORY_UTIL = 0.9
DEFAULT_ERROR_THRESHOLD = 10
DEFAULT_VLLM_MAX_RETRIES = 5
DEFAULT_AGENT_PARSER = ""
DEFAULT_SLURM_TIME = "12:00:00"
DEFAULT_AGENT_NAME = "terminus-2"
DEFAULT_SLURM_PARTITION = "booster"
DEFAULT_SLURM_ACCOUNT = ""  # empty = use sbatch header default
DEFAULT_ENABLE_THINKING = False
DEFAULT_TP_SIZE = 1
DEFAULT_SBATCH_SCRIPT = "eval/unified_eval_harbor.sbatch"

# Fallback defaults (used when no --cluster-config is provided).
# Empty strings force explicit cluster config — no implicit Jupiter defaults.
_FALLBACK_EVAL_JOBS_DIR = ""
_FALLBACK_HF_CACHE = ""
_FALLBACK_EVAL_LOGS_DIR = "eval/local/logs"

# Conda env paths: env name → prefix directory (passed as OTAGENT_DIR to sbatch)
# Overridden by cluster_config["conda_envs"] when --cluster-config is used.
CONDA_ENV_PATHS: Dict[str, str] = {}

# Dataset repo name → short benchmark tag for SLURM job names (squeue readability)
_BENCH_SHORT: Dict[str, str] = {
    "dev_set_v2": "v2",
    "swebench-verified-random-100-folders": "swe",
    "terminal_bench_2": "tb2",
    "aider_polyglot": "aider",
    "bfcl-parity": "bfcl",
    "dev_set_71_tasks": "v1",
}

# Enhancement 2: SLURM job submission throttle
DEFAULT_MAX_JOBS_SUBMITTED = 20

# Enhancement 3: Daytona resource pre-flight check
DEFAULT_DAYTONA_SANDBOX_LIMIT = 2000
DEFAULT_DAYTONA_WARNING_BUFFER = 0.9

# Enhancement 5: Timeout-config-sensitive dedup
DEFAULT_TIMEOUT_MULTIPLIER = 1.0


# ---------- Configuration ----------
@dataclass
class ListenerConfig:
    """Configuration for the eval listener.

    Core fields:
        datasets             HF dataset repos to evaluate against.
        sbatch_script        Path to the sbatch script to submit.
        priority_models      Ordered list of HF model names from the priority file.
                             File order = submission priority (first = highest).
        priority_file        Path to the priority file (hot-reloaded each iteration).

    Sbatch parameters (forwarded to sbatch via env vars):
        n_concurrent         Harbor --n-concurrent.
        n_attempts           Harbor --n-attempts.
        gpu_memory_util      VLLM --gpu-memory-utilization.
        error_threshold      Max invalid errors before aborting upload (v3 Enhancement 1).
                             Replaces v2's daytona_threshold. Env var kept as
                             EVAL_DAYTONA_THRESHOLD for sbatch backward compat.
        agent_name           Agent name for harbor and DB entries.
        timeout_multiplier   Harbor timeout multiplier (v3 Enhancement 5).

    v3 enhancement fields:
        max_jobs_submitted   Per-listener SLURM job limit (Enhancement 2).
                             Each listener tracks its own submitted job IDs and
                             only counts those still active in squeue.
        check_daytona_resources  Enable Daytona API pre-flight check (Enhancement 3).
        daytona_sandbox_limit    Max expected active sandboxes for pre-flight check.
        daytona_warning_buffer   Fraction of limit to trigger warning (e.g. 0.95).
        timeout_aware        Enable config-sensitive job dedup (Enhancement 5).
    """
    datasets: List[str]
    sbatch_script: str
    log_file: Optional[Path]
    lookback_days: int
    check_interval_hours: float
    stale_job_hours: int
    stale_pending_hours: int
    priority_file: Optional[str]
    require_priority_list: bool
    priority_models: List[str]
    check_hf_exists: bool
    dry_run: bool
    run_once: bool
    verbose: bool
    # Priority mode: "filter_only" (skip non-priority) or "priority_first" (all models, priority first)
    priority_mode: str = "filter_only"
    # Sbatch parameters (passed to sbatch via env vars)
    n_concurrent: int = DEFAULT_N_CONCURRENT
    n_attempts: int = DEFAULT_N_ATTEMPTS
    gpu_memory_util: float = DEFAULT_GPU_MEMORY_UTIL
    error_threshold: int = DEFAULT_ERROR_THRESHOLD
    vllm_max_retries: int = DEFAULT_VLLM_MAX_RETRIES
    agent_parser: str = DEFAULT_AGENT_PARSER
    slurm_time: str = DEFAULT_SLURM_TIME
    enable_thinking: bool = DEFAULT_ENABLE_THINKING
    agent_name: str = DEFAULT_AGENT_NAME
    slurm_partition: str = DEFAULT_SLURM_PARTITION
    slurm_account: str = DEFAULT_SLURM_ACCOUNT
    tp_size: int = DEFAULT_TP_SIZE
    dp_size: int = 1  # vLLM native data-parallel replicas (total GPUs = tp_size * dp_size)
    upload_username: str = ""
    log_prefix: str = "[unified-eval-listener-v6]"
    # v3 Enhancement 2: Per-listener SLURM throttle
    max_jobs_submitted: int = DEFAULT_MAX_JOBS_SUBMITTED
    # v3 Enhancement 3: Daytona pre-flight
    check_daytona_resources: bool = False
    daytona_sandbox_limit: int = DEFAULT_DAYTONA_SANDBOX_LIMIT
    daytona_warning_buffer: float = DEFAULT_DAYTONA_WARNING_BUFFER
    # v3 Enhancement 5: Timeout-config-sensitive dedup
    timeout_multiplier: float = DEFAULT_TIMEOUT_MULTIPLIER
    timeout_aware: bool = False
    # Config YAML for harbor (overrides vs no-overrides)
    config_yaml: str = "dcagent_eval_config.yaml"
    # Model blacklist
    blacklist_file: Optional[str] = None
    blacklisted_models: Set[str] = field(default_factory=set)
    # Daytona auto_snapshot: None = use YAML config default, True/False = override
    auto_snapshot: Optional[bool] = None
    # Per-model vLLM overrides (baseline model configs)
    baseline_model_configs: Optional[str] = None
    # Harbor config path
    harbor_config: Optional[str] = None
    # Parsed eval config from harbor YAML (for config-aware dedup)
    eval_config: Dict = field(default_factory=dict)
    # Pre-download model weights before submitting jobs
    pre_download: bool = False
    # Sliding-window batch dependencies
    batch_size: Optional[int] = None
    # Conda env selector (otagent / otagent2)
    conda_env: str = "otagent"
    # v6: Disk-based resume
    jobs_dirs: List[str] = field(default_factory=list)  # Set from CLI or EVAL_JOBS_DIR env var
    enable_disk_resume: bool = True
    resume_infra_error_threshold: int = 10
    max_resume_count: int = 5
    force_reeval: bool = False  # Bypass DB status check (submit even if Finished/Started)
    resume_only: bool = False  # Only submit resume jobs, skip fresh submissions
    submission_delay: float = 1.0  # Seconds to sleep between sbatch submissions
    stagger_delay: int = 0  # Minutes between job starts via SLURM after: dependency chain (0 = disabled)
    chain_batch_size: int = 1  # Jobs per stagger batch (1 = every job waits, 10 = fire 10 then wait)
    pack_jobs: bool = False  # Pack multiple jobs onto same node via --nodelist
    # DP: data-parallel multi-node eval
    dp_nodes: int = 0  # 0 = single-node (default), >0 = use DP sbatch with N nodes
    dp_sbatch_script: str = "eval/unified_eval_harbor_dp.sbatch"
    # Inherit: seed _submitted_jobs from previous listener logs
    inherit_log: Optional[List[str]] = None
    # Cluster config (loaded from --cluster-config YAML)
    cluster_config: Optional[Dict[str, Any]] = None

    @property
    def check_interval_seconds(self) -> int:
        # Derived: polling interval in whole seconds (fractional hours truncate).
        return int(self.check_interval_hours * 60 * 60)


# ---------- Logging ----------
# Module-level log sink shared by log()/set_log_file(); set once at startup.
_LOG_FILE: Optional[Path] = None
_VERBOSE: bool = False


def set_log_file(path: Optional[Path]) -> None:
    """Set (or clear, with None) the file that log() appends to."""
    global _LOG_FILE
    _LOG_FILE = path


def log(msg: str, prefix: str = "[unified-eval-listener-v6]", verbose_only: bool = False) -> None:
    """Log a message to stdout and optionally to file.

    If verbose_only=True, the message is only emitted when _VERBOSE is set.
    """
    if verbose_only and not _VERBOSE:
        return
    ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    line = f"{prefix} {ts} {msg}"
    print(line, flush=True)
    if _LOG_FILE:
        try:
            # Best-effort file logging: never let a disk/permission error kill the listener.
            with _LOG_FILE.open("a") as f:
                f.write(line + "\n")
        except Exception:
            pass


# ---------- Priority Models Loading ----------
def load_priority_models(filepath: Optional[str]) -> List[str]:
    """
    Load priority models from a text file, preserving file order as rank.

    File order determines submission priority: models listed earlier are
    submitted first. When the per-listener SLURM job limit truncates the
    submission list, higher-priority (earlier) models are kept.

    File format:
        - One model per line (HuggingFace format: org/model)
        - Lines starting with # are comments
        - Blank lines are ignored

    Returns:
        Ordered list of model names (duplicates removed, order preserved).
        Empty list if file is missing or empty.
    """
    if not filepath:
        return []

    path = Path(filepath)
    if not path.exists():
        log(f"Priority file not found: {filepath}")
        return []

    models: List[str] = []
    seen: Set[str] = set()  # dedup guard; `models` keeps first-seen order
    try:
        with path.open("r") as f:
            for line in f:
                line = line.strip()
                # Skip empty lines and comments
                if not line or line.startswith("#"):
                    continue
                if line not in seen:
                    seen.add(line)
                    models.append(line)
        log(f"Loaded {len(models)} model(s) from priority file: {filepath}")
        return models
    except Exception as e:
        # Best-effort: a malformed/unreadable file degrades to "no priority list".
        log(f"ERROR reading priority file {filepath}: {e}")
        return []


# ---------- Model Blacklist Loading ----------
def load_blacklist(filepath: Optional[str]) -> Set[str]:
    """Load blacklisted models from a text file. Same format as priority file."""
    return set(load_priority_models(filepath))


# ---------- HuggingFace Utilities ----------
def check_hf_model_exists(model_name: str) -> bool:
    """
    Check if a model exists on HuggingFace Hub.

    Args:
        model_name: HF model name (e.g., "org/model-name")

    Returns:
        True if model exists and is accessible, False otherwise
    """
    if not model_name or not isinstance(model_name, str):
        return False

    try:
        # Imported lazily so the listener can run without huggingface_hub
        # when HF existence checks are disabled.
        from huggingface_hub import model_info
        model_info(model_name)
        return True
    except Exception as e:
        # NOTE(review): network errors and 404s are indistinguishable here —
        # both report "model does not exist".
        log(f"HF check failed for {model_name}: {e}")
        return False


def _parse_hf_from_str(val: Optional[str]) -> Optional[str]:
    """Parse HuggingFace model name from a string (URL or org/repo)."""
    if not isinstance(val, str):
        return None
    m = HF_URL_RE.search(val)
    if m:
        return f"{m.group(1)}/{m.group(2)}"
    return None


def resolve_hf_model_name(model_row: Dict) -> Optional[str]:
    """
    Resolve HF model name from a database model row.

    Checks multiple fields in order of priority.
    """
    # Check name field first
    v = model_row.get("name")
    if isinstance(v, str) and "/" in v and not v.startswith("hosted_vllm/"):
        return v

    # Check other URL fields
    # NOTE(review): the loop variable shadows dataclasses.field imported at
    # module level; harmless here but worth renaming in a follow-up.
    for field in ("weights_location", "training_parameters", "url", "hf_url"):
        vv = model_row.get(field)
        if isinstance(vv, str):
            name = _parse_hf_from_str(vv)
            if name:
                return name

    # Check training_parameters as JSON
    vv = model_row.get("training_parameters")
    if isinstance(vv, str):
        try:
            obj = json.loads(vv)
        except Exception:
            obj = None
    else:
        obj = vv  # may already be a decoded dict from the DB driver

    if isinstance(obj, dict):
        # Scan all top-level string values for an embedded HF URL.
        for sval in obj.values():
            if isinstance(sval, str):
                name = _parse_hf_from_str(sval)
                if name:
                    return name

    return None


# ---------- Dataset Parsing ----------
def parse_datasets(s: str) -> List[str]:
    """
    Parse dataset list from string.

    Supports comma, space, or newline separated values.
    Normalizes HF URLs to org/repo format.
    """
    parts = [p.strip() for p in re.split(r"[,\s]+", s) if p.strip()]
    out = []
    for p in parts:
        m = HF_URL_RE.search(p)
        out.append(f"{m.group(1)}/{m.group(2)}" if m else p)

    # Dedup while preserving order
    seen: Set[str] = set()
    uniq: List[str] = []
    for d in out:
        if d not in seen:
            seen.add(d)
            uniq.append(d)
    return uniq


def dataset_repo_name(dataset_hf: str) -> str:
    """Convert 'org/repo' or HF URL to 'repo' (just the repo name)."""
    if not dataset_hf:
        return dataset_hf
    m = HF_URL_RE.search(dataset_hf)
    if m:
        return m.group(2)
    if "/" in dataset_hf:
        return dataset_hf.rsplit("/", 1)[-1]
    return dataset_hf


# ---------- Database Operations ----------
# Cache: benchmark repo name -> benchmark id (None cached for "not found").
_BENCH_CACHE: Dict[str, Optional[str]] = {}


def _iso(dt: datetime) -> str:
    """Convert datetime to ISO format string."""
    # Naive datetimes are assumed to be UTC before conversion.
    if dt.tzinfo is None:
        dt = dt.replace(tzinfo=timezone.utc)
    return dt.astimezone(timezone.utc).isoformat()


def _time_filters(q, since_iso: str):
    """Apply time filter to Supabase query (handles both column
    names)."""
    # NOTE(review): supabase-py query builders generally defer errors until
    # .execute(), so the except branch here may never fire at build time —
    # confirm before relying on the created_at fallback.
    try:
        return q.gte('creation_time', since_iso)
    except Exception:
        return q.gte('created_at', since_iso)


def fetch_recent_models(days: int) -> List[Dict]:
    """Fetch recent models from Supabase within the lookback window.

    Filters out:
    - Models with created_by == "precomputed_hf"
    - Models with a non-empty "duplicate_of" field (v3: prevents duplicate
      eval submissions when the same HF model appears under multiple DB rows)
    """
    client = get_supabase_client()
    since = _iso(datetime.now(timezone.utc) - timedelta(days=days))
    try:
        resp = _time_filters(client.table('models').select('*'), since).execute()
        rows = list(resp.data or [])
    except Exception as e:
        # Fail-soft: a DB outage yields an empty batch, not a listener crash.
        log(f"ERROR: failed querying models by time: {e}")
        return []

    # Filter out precomputed models and duplicates
    out: List[Dict] = []
    skipped_dupes = 0
    for r in rows:
        if r.get("created_by") == "precomputed_hf":
            continue
        if r.get("duplicate_of"):
            skipped_dupes += 1
            continue
        out.append(r)
    if skipped_dupes:
        log(f"Filtered out {skipped_dupes} duplicate model(s) (duplicate_of set)")
    return out


def fetch_priority_models(priority_names: List[str]) -> List[Dict]:
    """Fetch models by name from Supabase, bypassing the lookback window.

    This ensures priority models are always evaluated even if they were
    registered long ago (outside the lookback window).

    Filters out:
    - Models with created_by == "precomputed_hf"
    - Models with a non-empty "duplicate_of" field
    """
    if not priority_names:
        return []

    client = get_supabase_client()
    try:
        resp = (
            client.table('models')
            .select('*')
            .in_('name', priority_names)
            .execute()
        )
        rows = list(resp.data or [])
    except Exception as e:
        log(f"ERROR: failed querying priority models by name: {e}")
        return []

    # Same precomputed/duplicate filtering as fetch_recent_models().
    out: List[Dict] = []
    for r in rows:
        if r.get("created_by") == "precomputed_hf":
            continue
        if r.get("duplicate_of"):
            continue
        out.append(r)
    return out


def resolve_benchmark_id(dataset_hf: str) -> Optional[str]:
    """
    Look up benchmark ID from database for a given dataset.

    Caches results for performance.
    """
    repo_name = dataset_repo_name(dataset_hf)
    if repo_name in _BENCH_CACHE:
        return _BENCH_CACHE[repo_name]

    try:
        client = get_supabase_client()
        resp = (
            client.table('benchmarks')
            .select('id,name')
            .eq('name', repo_name)
            .limit(1)
            .execute()
        )
        rows = resp.data or []
        bench_id = rows[0]['id'] if rows else None
        # "Not found" (None) is cached too, so we don't re-query every iteration.
        _BENCH_CACHE[repo_name] = bench_id
        if not bench_id:
            log(f"No benchmark row found for dataset '{dataset_hf}' (wanted name='{repo_name}').")
        return bench_id
    except Exception as e:
        # Query failures are NOT cached — a later retry may succeed.
        log(f"ERROR resolving benchmark id for dataset '{dataset_hf}': {e}")
        return None


def check_job_status(
    model_id: str, benchmark_id: Optional[str]
) -> Tuple[bool, Optional[str], Optional[datetime], Optional[datetime], Optional[str]]:
    """Check if a job exists for (model_id, benchmark_id) and its status.

    Delegates to check_job_status_v3 (single-ID, non-timeout-aware path).
    Kept as a thin wrapper for backward compatibility with callers that don't
    need timeout-aware or duplicate-group queries.
    """
    return check_job_status_v3(model_id, benchmark_id)


# ---------- Cross-Duplicate Aggregation ----------
# Cache: "table:entity_id" -> list of all IDs in that entity's duplicate group.
_DUP_GROUP_CACHE: Dict[str, List[str]] = {}


def get_duplicate_group_ids(table: str, entity_id: str) -> List[str]:
    """Get all IDs in the duplicate group for a model or benchmark.

    Given an entity_id, finds the canonical ID and all its duplicates.
    - If entity has duplicate_of set, canonical = duplicate_of
    - Otherwise canonical = entity_id
    - Then finds all rows WHERE duplicate_of = canonical_id
    - Returns [canonical_id] + [all duplicate IDs]

    Results are cached per (table, entity_id).
    """
    cache_key = f"{table}:{entity_id}"
    if cache_key in _DUP_GROUP_CACHE:
        return _DUP_GROUP_CACHE[cache_key]

    try:
        client = get_supabase_client()

        # Step 1: Find the canonical ID
        resp = client.table(table).select('id,duplicate_of').eq('id', entity_id).limit(1).execute()
        rows = resp.data or []
        if not rows:
            # Unknown entity: degrade to a singleton group.
            _DUP_GROUP_CACHE[cache_key] = [entity_id]
            return [entity_id]

        canonical_id = rows[0].get('duplicate_of') or entity_id

        # Step 2: Find all duplicates of the canonical
        resp2 = client.table(table).select('id').eq('duplicate_of', canonical_id).execute()
        dup_ids = [r['id'] for r in (resp2.data or [])]

        # NOTE(review): set() makes the group order nondeterministic across
        # runs; callers only do membership-style queries so this is benign.
        group = list(set([canonical_id] + dup_ids))
        # Cache for all members of the group
        for gid in group:
            _DUP_GROUP_CACHE[f"{table}:{gid}"] = group
        return group

    except Exception as e:
        # Fail-open with a singleton group so dedup still works on the exact ID.
        log(f"WARNING: Failed to get duplicate group for {table}/{entity_id}: {e}")
        _DUP_GROUP_CACHE[cache_key] = [entity_id]
        return [entity_id]


# ---------- v3 Enhancement 5: Timeout-Config-Sensitive Job Dedup ----------
def check_job_status_v3(
    model_id: str,
    benchmark_id: Optional[str],
    timeout_aware: bool = False,
    agent_name: str = DEFAULT_AGENT_NAME,
    timeout_multiplier: float = DEFAULT_TIMEOUT_MULTIPLIER,
    duplicate_model_ids: Optional[List[str]] = None,
    duplicate_benchmark_ids: Optional[List[str]] = None,
) -> Tuple[bool, Optional[str], Optional[datetime], Optional[datetime], Optional[str]]:
    """
    Check if a job exists for (model_id, benchmark_id) and its status.

    When timeout_aware=True, filters to only match jobs with the same
    agent_name and timeout_multiplier in their config.

    When duplicate_model_ids/duplicate_benchmark_ids are provided, queries
    across the entire duplicate group using .in_() instead of .eq().

    Returns:
        (job_exists, job_status, started_at, submitted_at, slurm_job_id)
    """
    if not benchmark_id:
        return (False, None, None, None, None)

    # Determine which IDs to query
    model_ids = duplicate_model_ids if duplicate_model_ids else [model_id]
    bench_ids = duplicate_benchmark_ids if duplicate_benchmark_ids else [benchmark_id]

    try:
        client = get_supabase_client()
        q = client.table('sandbox_jobs').select(
            'id,job_status,started_at,submitted_at,slurm_job_id,config'
        )

        # Use .in_() for duplicate groups, .eq() for singles
        if len(model_ids) == 1:
            q = q.eq('model_id', model_ids[0])
        else:
            q = q.in_('model_id', model_ids)

        if len(bench_ids) == 1:
            q = q.eq('benchmark_id', bench_ids[0])
        else:
            q = q.in_('benchmark_id', bench_ids)

        # Newest first; only the 50 most recent jobs are considered.
        q = q.order('created_at', desc=True).limit(50)
        data = (q.execute().data) or []

        if not data:
            return (False, None, None, None, None)

        # Filter to matching config if timeout_aware
        for job in data:
            if timeout_aware:
                config = job.get('config')
                if isinstance(config, str):
                    try:
                        config = json.loads(config)
                    except Exception:
                        config = {}
                if not isinstance(config, dict):
                    config = {}

                # Missing config fields are treated as the listener defaults.
                job_agent = config.get('agent', DEFAULT_AGENT_NAME)
                job_tm = config.get('timeout_multiplier', DEFAULT_TIMEOUT_MULTIPLIER)

                # Skip if agent_name or timeout_multiplier don't match
                if job_agent != agent_name or float(job_tm) != float(timeout_multiplier):
                    continue

            # First (most recent) matching job wins.
            job_status = job.get('job_status')
            started_at_str = job.get('started_at')
            submitted_at_str = job.get('submitted_at')
            slurm_job_id = job.get('slurm_job_id')

            started_at = None
            if started_at_str:
                try:
                    # fromisoformat() (pre-3.11) can't parse a trailing 'Z'.
                    started_at = datetime.fromisoformat(started_at_str.replace('Z', '+00:00'))
                except Exception:
                    pass

            submitted_at = None
            if submitted_at_str:
                try:
                    submitted_at = datetime.fromisoformat(submitted_at_str.replace('Z', '+00:00'))
                except Exception:
                    pass

            return (True, job_status, started_at, submitted_at, slurm_job_id)

        # No matching job found
        return (False, None, None, None, None)

    except Exception as e:
        log(f"WARNING: sandbox_jobs v3 check failed for model_id={model_id}, benchmark_id={benchmark_id}: {e}")
        return (False, None, None, None, None)  # fail-open


def is_job_stale(started_at: Optional[datetime], hours: int = DEFAULT_STALE_JOB_HOURS) -> bool:
    """Check if a job started more than the specified hours ago."""
    if not started_at:
        # If started_at is null but job exists with status='Started', treat as stale
        return True
    now = datetime.now(timezone.utc)
    if started_at.tzinfo is None:
        started_at = started_at.replace(tzinfo=timezone.utc)
    age = now - started_at
    return age > timedelta(hours=hours)


def _config_matches_eval(job_config: Optional[Dict], eval_config: Dict) -> bool:
    """Check if a DB job's config JSONB matches the current eval config fields.

    Compares: timeout_multiplier, override_cpus, override_memory_mb, override_storage_mb.
    A job with no config is treated as defaults (timeout=1.0, no overrides).
    If eval_config is empty (no harbor config), any job config matches (backwards compat).
    """
    if not eval_config:
        return True  # no config constraints -- any existing job counts

    job_cfg = job_config or {}
    job_env = job_cfg.get("environment") or {}

    # timeout_multiplier: top-level in config JSONB
    if "timeout_multiplier" in eval_config:
        job_tm = job_cfg.get("timeout_multiplier")
        # Treat None/missing as 1.0
        job_tm = float(job_tm) if job_tm is not None else 1.0
        if float(eval_config["timeout_multiplier"]) != job_tm:
            return False

    # Environment overrides: nested under config.environment
    for key in ("override_cpus", "override_memory_mb", "override_storage_mb"):
        if key in eval_config:
            job_val = job_env.get(key)
            # Treat None/missing as the default (None means no override)
            job_val = int(job_val) if job_val is not None else None
            eval_val = int(eval_config[key])
            if job_val != eval_val:
                return False

    return True


def should_start_job(
    model_id: str,
    benchmark_id: Optional[str],
    stale_hours: int = DEFAULT_STALE_JOB_HOURS,
    stale_pending_hours: int = DEFAULT_STALE_PENDING_HOURS,
    timeout_aware: bool = False,
    agent_name: str = DEFAULT_AGENT_NAME,
    timeout_multiplier: float = DEFAULT_TIMEOUT_MULTIPLIER,
    duplicate_model_ids: Optional[List[str]] = None,
    duplicate_benchmark_ids: Optional[List[str]] = None,
    eval_config: Optional[Dict] = None,
) -> Tuple[bool, str, Optional[str]]:
    """
    Determine if a job should be started based on DB status.

    When timeout_aware=True (v3 Enhancement 5), uses check_job_status_v3()
    which filters jobs by agent_name and timeout_multiplier in config. This
    allows running the same model with different configs without one blocking
    the other.

    When duplicate_model_ids/duplicate_benchmark_ids are provided, checks
    across the entire duplicate group for existing jobs.

    When eval_config is provided (from harbor YAML), performs config-aware
    dedup: checks that existing jobs match the current resource overrides
    (timeout_multiplier, override_cpus, override_memory_mb, override_storage_mb).

    Returns:
        (should_start, reason, slurm_job_id)
        slurm_job_id is provided so the caller can scancel stale jobs.
    """
    job_exists, job_status, started_at, submitted_at, slurm_job_id = check_job_status_v3(
        model_id, benchmark_id,
        timeout_aware=timeout_aware,
        agent_name=agent_name,
        timeout_multiplier=timeout_multiplier,
        duplicate_model_ids=duplicate_model_ids,
        duplicate_benchmark_ids=duplicate_benchmark_ids,
    )

    if not job_exists:
        return (True, "no existing job", None)

    ec = eval_config or {}

    # NOTE(review): the config-aware re-queries below use the single
    # model_id/benchmark_id even when duplicate groups were supplied —
    # jobs recorded under a duplicate row are invisible to this check.
    if job_status == JOB_STATUS_FINISHED:
        if ec:
            # Check if any Finished job matches our eval config
            try:
                client = get_supabase_client()
                q = (
                    client.table("sandbox_jobs")
                    .select("config, metrics")
                    .eq("model_id", model_id)
                    .eq("benchmark_id", benchmark_id)
                    .eq("job_status", "Finished")
                    .order("created_at", desc=True)
                    .limit(10)
                )
                rows = (q.execute().data) or []
                matching = [r for r in rows if _config_matches_eval(r.get("config"), ec)]
                if not matching:
                    return (True, "no finished job with matching config", slurm_job_id)
                if matching[0] and not matching[0].get("metrics"):
                    return (True, "finished with matching config but metrics cleared", slurm_job_id)
            except Exception as e:
                log(f"WARNING: config-aware check failed: {e}")
        # v6: DaytonaError resume is now handled by disk-based scanning in
        # scan_jobs_dir_for_resume(), not here. This avoids the circular dependency
        # where DB stats only exist after upload, but upload is skipped on error.

        return (False, "job finished", slurm_job_id)

    if job_status == JOB_STATUS_PENDING:
        if ec:
            try:
                client = get_supabase_client()
                q = (
                    client.table("sandbox_jobs")
                    .select("config, slurm_job_id, created_at")
                    .eq("model_id", model_id)
                    .eq("benchmark_id", benchmark_id)
                    .eq("job_status", "Pending")
                    .order("created_at", desc=True)
                    .limit(5)
                )
                rows = (q.execute().data) or []
                matching = [r for r in rows if _config_matches_eval(r.get("config"), ec)]
                if not matching:
                    return (True, "no pending job with matching config", slurm_job_id)
            except Exception as e:
                log(f"WARNING: config-aware check failed: {e}")
        # Job submitted but not yet running - check if stale using separate pending threshold
        if is_job_stale(submitted_at, stale_pending_hours):
            submitted_str = submitted_at.isoformat() if submitted_at else "null"
            return (True, f"stale pending job (submitted_at={submitted_str})", slurm_job_id)
        else:
            submitted_str = submitted_at.isoformat() if submitted_at else "null"
            return (False, f"job pending in SLURM queue (submitted_at={submitted_str})", slurm_job_id)

    if job_status == JOB_STATUS_STARTED:
        if ec:
            try:
                client = get_supabase_client()
                q = (
                    client.table("sandbox_jobs")
                    .select("config, started_at")
                    .eq("model_id", model_id)
                    .eq("benchmark_id", benchmark_id)
                    .eq("job_status", "Started")
                    .order("created_at", desc=True)
                    .limit(5)
                )
                rows = (q.execute().data) or []
                matching = [r for r in rows if _config_matches_eval(r.get("config"), ec)]
                if not matching:
                    return (True, "no in-progress job with matching config", slurm_job_id)
            except Exception as e:
                log(f"WARNING: config-aware check failed: {e}")
        if is_job_stale(started_at, stale_hours):
            started_str = started_at.isoformat() if started_at else "null"
            return (True, f"stale job (started_at={started_str})", slurm_job_id)
        else:
            started_str = started_at.isoformat() if started_at else "null"
            return (False, f"job in progress (started_at={started_str})", slurm_job_id)

    # Unknown status - start job to be safe
    return (True, f"unknown job status: {job_status}", slurm_job_id)


# ---------- v3 Enhancement 2: Per-Listener SLURM Job Throttle ----------
def get_active_slurm_job_ids() -> Set[str]:
    """Return set of SLURM job IDs currently queued/running for this user.

    Used by EvalListener to determine which of its submitted jobs are still
    active. The listener intersects this with its internal _submitted_jobs
    set to get a per-listener active count.
    """
    try:
        user = getpass.getuser()
        # -o %i prints only the job ID; --noheader/-h suppress the header row.
        code, out = _run(["squeue", "-u", user, "--noheader", "-h", "-o", "%i"])
        if code != 0:
            log(f"WARNING: squeue failed (exit {code}), returning empty set")
            return set()
        return {line.strip() for line in out.strip().split('\n') if line.strip()}
    except Exception as e:
        log(f"WARNING: Failed to query squeue: {e}")
        return set()


def get_active_model_dataset_pairs(
    log_dir: str = "eval/local/logs",
) -> Tuple[Set[str], Set[Tuple[str, str]], Dict[str, str]]:
    """Return (active_models, active_model_dataset_pairs, active_run_tags) for all active SLURM eval jobs.

    Queries squeue for all active jobs (RUNNING/PENDING/COMPLETING), then parses
    each job's eval log file to extract the model name, dataset, and run_tag.

    Args:
        log_dir: Directory containing eval log files ({job_name}_{slurm_id}.out).

    Returns:
        active_models: Set of HF model names currently running/queued.
        active_pairs: Set of (hf_model, dataset_hf) tuples currently running/queued.
        active_run_tags: Dict mapping run_tag → slurm_job_id for active jobs.
    """
    active_models: Set[str] = set()
    active_pairs: Set[Tuple[str, str]] = set()
    active_run_tags: Dict[str, str] = {}

    try:
        user = getpass.getuser()
        # %i = job id, %j = job name; the name is needed to locate the log file.
        code, out = _run(["squeue", "-u", user, "--noheader", "-o", "%i %j"])
        if code != 0:
            log(f"WARNING: squeue failed (exit {code}), returning empty active sets")
            return active_models, active_pairs, active_run_tags
    except Exception as e:
        log(f"WARNING: Failed to query squeue: {e}")
        return active_models, active_pairs, active_run_tags

    log_path = Path(log_dir)
    for line in out.strip().split("\n"):
        line = line.strip()
        if not line:
            continue
        parts = line.split(None, 1)
        if len(parts) < 2:
            continue
        job_id, job_name = parts[0], parts[1]

        # Try to parse the eval log file for this job
        log_file = log_path / f"{job_name}_{job_id}.out"
        if not log_file.exists():
            continue

        model = dataset = run_tag = None
        try:
            with open(log_file, "r") as f:
                for i, fline in enumerate(f):
                    if i > 200:  # Only scan first 200 lines
                        break
                    # Slice offsets match the prefix lengths ("Model: " = 7, etc.).
                    if fline.startswith("Model: "):
                        model = fline.strip()[7:]
                    elif fline.startswith("Dataset: "):
                        dataset = fline.strip()[9:]
                    elif fline.startswith("Run tag: "):
                        run_tag = fline.strip()[9:]
                    if model and dataset and run_tag:
                        break
        except (OSError, IOError):
            continue

        if model:
            active_models.add(model)
        if model and dataset:
            # Normalize dataset: local path → HF name (e.g. /e/.../DCAgent_dev_set_v2 → DCAgent/dev_set_v2)
            # NOTE(review): this branch is currently a no-op — absolute local
            # paths pass through unchanged; only already-HF-format names are
            # (re-)assigned. Confirm whether path normalization was intended.
            ds_normalized = dataset
            if "/" in dataset and not dataset.startswith("/"):
                # Already HF format like DCAgent/dev_set_v2 or DCAgent2/terminal_bench_2
                ds_normalized = dataset
            active_pairs.add((model, ds_normalized))
        if run_tag:
            active_run_tags[run_tag] = job_id

    return active_models, active_pairs, active_run_tags


def _parse_job_ids_from_single_log(log_path: str) -> Set[str]:
    """Parse SLURM job IDs from a single listener log file.

    Matches two patterns:
    1. "-> Submitted as SLURM job 293324 (job_name=...)" — direct submissions
    2. "[inherit-log] Inherited jobs: 293324,293325,..." — inherited from previous log
    """
    job_ids: Set[str] = set()
    try:
        with open(log_path, "r") as f:
            for line in f:
                if "Submitted as SLURM job" in line:
                    parts = line.split("Submitted as SLURM job ")
                    if len(parts) >= 2:
                        jid = parts[1].split()[0].strip()
                        if jid.isdigit():
                            job_ids.add(jid)
                elif "[inherit-log] Inherited jobs:" in line:
                    # Parse comma-separated job IDs
                    parts = line.split("Inherited jobs:")
                    if len(parts) >= 2:
                        for jid in parts[1].strip().split(","):
                            jid = jid.strip()
                            if jid.isdigit():
                                job_ids.add(jid)
    except (OSError, IOError) as e:
        log(f"WARNING: Failed to read log {log_path}: {e}")
    return job_ids


def parse_submitted_jobs_from_logs(log_paths: List[str]) -> Set[str]:
    """Parse SLURM job IDs from one or more listener logs.

    Aggregates across all logs, then filters to jobs still active in squeue.
    """
    all_job_ids: Set[str] = set()
    for lp in log_paths:
        ids = _parse_job_ids_from_single_log(lp)
        log(f"[inherit-log] Parsed {len(ids)} job(s) from {lp}")
        all_job_ids |= ids

    # Completed/cancelled jobs are dropped; only live IDs seed the throttle.
    active_ids = get_active_slurm_job_ids()
    still_active = all_job_ids & active_ids
    log(f"[inherit-log] Total: {len(all_job_ids)} job(s) across {len(log_paths)} log(s), "
        f"{len(still_active)} still active in squeue")
    return still_active


# ---------- v3 Enhancement 3: Daytona Resource Pre-flight Check ----------
def check_daytona_resources(sandbox_limit: int, warning_buffer: float) -> bool:
    """
    Check Daytona resource usage via API.

    Called at listener startup and optionally each iteration when
    --check-daytona-resources is enabled. Requires DAYTONA_API_KEY in env.

    Returns True if OK to proceed, False if active sandboxes >= sandbox_limit.
    Logs a warning when active sandboxes >= sandbox_limit * warning_buffer.
    """
    try:
        from daytona_api_client import ApiClient, Configuration, SandboxApi
    except ImportError:
        # Optional dependency: missing client downgrades to a no-op check.
        log("WARNING: daytona_api_client not installed, skipping resource check")
        return True

    api_key = os.environ.get("DAYTONA_API_KEY")
    api_url = os.environ.get("DAYTONA_API_URL", "https://app.daytona.io/api")
    if not api_key:
        log("WARNING: DAYTONA_API_KEY not set, skipping resource check")
        return True

    try:
        config = Configuration(host=api_url)
        client = ApiClient(config)
        client.default_headers["Authorization"] = f"Bearer {api_key}"
        api = SandboxApi(client)

        # limit=1: we only need the pagination 'total', not the sandbox list.
        result = api.list_sandboxes_paginated(states=["started"], limit=1, page=1)
        active_count = result.total

        threshold = int(sandbox_limit * warning_buffer)
        if active_count >= sandbox_limit:
            log(f"ERROR: Daytona resources at limit: {active_count}/{sandbox_limit} active sandboxes "
                f"({active_count/sandbox_limit:.1%})")
            return False
        elif active_count >= threshold:
            log(f"WARNING: Daytona resources at {active_count}/{sandbox_limit} active sandboxes "
                f"({active_count/sandbox_limit:.1%}) - approaching limit!")
            return True
        else:
            log(f"Daytona resources OK: {active_count}/{sandbox_limit} active sandboxes "
                f"({active_count/sandbox_limit:.1%})")
            return True
    except Exception as e:
        log(f"WARNING: Daytona resource check failed: {e}")
        return True  # fail-open


# ---------- Job Submission ----------
@dataclass
class SbatchParams:
    """Parameters passed to the sbatch script via environment variables.

    The listener converts these to EVAL_* env vars via to_env(), which the
    sbatch script reads at startup.

    v3 additions:
        error_threshold     Mapped to EVAL_DAYTONA_THRESHOLD (name kept for compat).
                            Controls the unified invalid error threshold.
        timeout_multiplier  Mapped to EVAL_TIMEOUT_MULTIPLIER. Passed to harbor
                            --timeout-multiplier and stored in DB job config.
+ + Cluster config additions (v6): + When a cluster config YAML is loaded (--cluster-config), to_env() also + exports EVAL_PROJECT_ROOT, EVAL_HF_CACHE, EVAL_HARBOR_SRC, + EVAL_DATASETS_DIRS, EVAL_PROXY_ENABLED, EVAL_LOGIN_NODE, + EVAL_PROXYCHAINS_BIN, EVAL_CUDA_HOME, EVAL_ARCH, EVAL_GPUS_PER_NODE, + and EVAL_LOGS_DIR so sbatch scripts can be cluster-agnostic. + """ + n_concurrent: int = DEFAULT_N_CONCURRENT + n_attempts: int = DEFAULT_N_ATTEMPTS + gpu_memory_util: float = DEFAULT_GPU_MEMORY_UTIL + error_threshold: int = DEFAULT_ERROR_THRESHOLD + vllm_max_retries: int = DEFAULT_VLLM_MAX_RETRIES + agent_parser: str = DEFAULT_AGENT_PARSER + slurm_time: str = DEFAULT_SLURM_TIME + enable_thinking: bool = DEFAULT_ENABLE_THINKING + agent_name: str = DEFAULT_AGENT_NAME + slurm_partition: str = DEFAULT_SLURM_PARTITION + slurm_account: str = DEFAULT_SLURM_ACCOUNT + tp_size: int = DEFAULT_TP_SIZE + dp_size: int = 1 # vLLM native data-parallel replicas (total GPUs = tp_size * dp_size) + upload_username: str = "" + timeout_multiplier: float = DEFAULT_TIMEOUT_MULTIPLIER # v3 Enhancement 5 + config_yaml: str = "dcagent_eval_config.yaml" + auto_snapshot: Optional[bool] = None # None = use YAML default + + def to_env(self) -> Dict[str, str]: + """Convert to environment variables for sbatch.""" + env = { + "EVAL_N_CONCURRENT": str(self.n_concurrent), + "EVAL_N_ATTEMPTS": str(self.n_attempts), + "EVAL_GPU_MEMORY_UTIL": str(self.gpu_memory_util), + "EVAL_DAYTONA_THRESHOLD": str(self.error_threshold), + "EVAL_VLLM_MAX_RETRIES": str(self.vllm_max_retries), + "EVAL_AGENT_PARSER": self.agent_parser, + "EVAL_SLURM_TIME": self.slurm_time, + "EVAL_ENABLE_THINKING": "true" if self.enable_thinking else "false", + "EVAL_AGENT_NAME": self.agent_name, + } + # Always send tp_size so build_vllm_cmd.sh doesn't fall back to its own default + env["EVAL_VLLM_TENSOR_PARALLEL_SIZE"] = str(self.tp_size) + if self.dp_size > 1: + env["EVAL_VLLM_DATA_PARALLEL_SIZE"] = str(self.dp_size) + if 
self.upload_username: + env["EVAL_UPLOAD_USERNAME"] = self.upload_username + # Enhancement 5: Pass timeout multiplier + if self.timeout_multiplier != DEFAULT_TIMEOUT_MULTIPLIER: + env["EVAL_TIMEOUT_MULTIPLIER"] = str(self.timeout_multiplier) + # Pass config YAML (no-override for tb2/swebench) + if self.config_yaml != "dcagent_eval_config.yaml": + env["EVAL_CONFIG_YAML"] = self.config_yaml + # Daytona auto_snapshot override (None = use YAML default) + if self.auto_snapshot is not None: + env["EVAL_AUTO_SNAPSHOT"] = "true" if self.auto_snapshot else "false" + # Forward EVAL_JOBS_DIR to sbatch (default to user-writable location) + fallback_jobs_dir = _cc_path("eval_jobs_dir", _FALLBACK_EVAL_JOBS_DIR) + env["EVAL_JOBS_DIR"] = os.environ.get("EVAL_JOBS_DIR", fallback_jobs_dir) + + # --- Cluster config env vars (for sbatch parameterization) --- + cc = _CLUSTER_CONFIG + if cc: + paths = cc.get("paths", {}) + proxy = cc.get("proxy", {}) + hw = cc.get("hardware", {}) + + if cc.get("cluster_name"): + env["EVAL_CLUSTER_NAME"] = cc["cluster_name"] + if paths.get("project_root"): + env["EVAL_PROJECT_ROOT"] = paths["project_root"] + if paths.get("hf_cache"): + env["EVAL_HF_CACHE"] = paths["hf_cache"] + if paths.get("harbor_src"): + env["EVAL_HARBOR_SRC"] = paths["harbor_src"] + if paths.get("datasets_dirs"): + env["EVAL_DATASETS_DIRS"] = ":".join(paths["datasets_dirs"]) + if paths.get("eval_logs_dir"): + env["EVAL_LOGS_DIR"] = paths["eval_logs_dir"] + + env["EVAL_PROXY_ENABLED"] = "true" if proxy.get("enabled") else "false" + if proxy.get("login_node"): + env["EVAL_LOGIN_NODE"] = proxy["login_node"] + if proxy.get("proxychains_bin"): + env["EVAL_PROXYCHAINS_BIN"] = proxy["proxychains_bin"] + + if hw.get("cuda_home"): + env["EVAL_CUDA_HOME"] = hw["cuda_home"] + if hw.get("arch"): + env["EVAL_ARCH"] = hw["arch"] + if hw.get("gpus_per_node"): + env["EVAL_GPUS_PER_NODE"] = str(hw["gpus_per_node"]) + if hw.get("cpus_per_node"): + env["EVAL_CPUS_PER_NODE"] = str(hw["cpus_per_node"]) + 
        return env

    def __str__(self) -> str:
        """String representation for logging.

        Always includes the five core knobs; everything else is appended
        only when it differs from its default, keeping log lines short.
        """
        parts = [
            f"n_concurrent={self.n_concurrent}",
            f"n_attempts={self.n_attempts}",
            f"gpu_memory_util={self.gpu_memory_util}",
            f"error_threshold={self.error_threshold}",
            f"vllm_max_retries={self.vllm_max_retries}",
        ]
        if self.agent_parser:
            parts.append(f"agent_parser={self.agent_parser}")
        if self.slurm_time != DEFAULT_SLURM_TIME:
            parts.append(f"slurm_time={self.slurm_time}")
        if self.tp_size != DEFAULT_TP_SIZE:
            parts.append(f"tp_size={self.tp_size}")
        if self.dp_size > 1:
            parts.append(f"dp_size={self.dp_size}")
        if self.enable_thinking:
            parts.append("enable_thinking=True")
        if self.agent_name != DEFAULT_AGENT_NAME:
            parts.append(f"agent_name={self.agent_name}")
        if self.slurm_partition != DEFAULT_SLURM_PARTITION:
            parts.append(f"slurm_partition={self.slurm_partition}")
        if self.upload_username:
            parts.append(f"upload_username={self.upload_username}")
        if self.timeout_multiplier != DEFAULT_TIMEOUT_MULTIPLIER:
            parts.append(f"timeout_multiplier={self.timeout_multiplier}")
        return ", ".join(parts)


def _run(cmd: List[str], env: Optional[Dict[str, str]] = None) -> Tuple[int, str]:
    """Run a command and return exit code and output.

    stderr is redirected into stdout so callers receive a single
    interleaved transcript; lines are streamed and right-stripped.
    """
    # Merge with current environment if extra env vars provided
    run_env = None
    if env:
        run_env = os.environ.copy()
        run_env.update(env)

    proc = subprocess.Popen(
        cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, env=run_env
    )
    out_lines = []
    assert proc.stdout is not None
    for line in proc.stdout:
        out_lines.append(line.rstrip())
    code = proc.wait()
    return code, "\n".join(out_lines)


def get_idle_nodes(partition: str) -> List[str]:
    """Get list of idle nodes on a SLURM partition.

    Parses `sinfo -p <partition> -N --format="%N %t" --noheader` output and
    keeps node names whose state column is exactly "idle". Returns [] when
    sinfo fails.
    """
    code, out = _run(["sinfo", "-p", partition, "-N", "--format=%N %t", "--noheader"])
    if code != 0:
        return []
    nodes = []
    for line in out.strip().split("\n"):
        parts = line.split()
        if len(parts) >= 2 and parts[1].strip() == "idle":
            nodes.append(parts[0].strip())
    return nodes


def generate_run_tag(dataset_hf: str, model_hf: str) -> str:
    """
    Generate a unique RUN_TAG for the job.

    Format: {safe_repo}_{safe_model}_{timestamp}

    Dashes and dots are normalized to underscores; the second-resolution
    timestamp makes the tag unique per submission.
    """
    safe_repo = dataset_repo_name(dataset_hf).replace("-", "_").replace(".", "_")
    safe_model = model_hf.split("/")[-1].replace("-", "_").replace(".", "_")
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    return f"{safe_repo}_{safe_model}_{timestamp}"


def cancel_slurm_job(slurm_job_id: str, dry_run: bool = False) -> bool:
    """Cancel a SLURM job via scancel. Returns True if successful."""
    if dry_run:
        log(f"[DRY RUN] Would cancel SLURM job {slurm_job_id}")
        return True
    code, out = _run(["scancel", slurm_job_id])
    if code == 0:
        log(f"Cancelled SLURM job {slurm_job_id}")
        return True
    else:
        log(f"WARNING: scancel failed for job {slurm_job_id}: {out}")
        return False


def update_pending_job_slurm_id(db_job_id: str, slurm_job_id: str) -> None:
    """Update the Pending job entry with the SLURM job ID after successful sbatch."""
    try:
        client = get_supabase_client()
        client.table("sandbox_jobs").update(
            {"slurm_job_id": slurm_job_id}
        ).eq("id", db_job_id).execute()
        log(f"Updated job {db_job_id} with slurm_job_id={slurm_job_id}", verbose_only=True)
    except Exception as e:
        # Non-fatal: the SLURM job still runs; only the DB cross-reference is lost.
        log(f"WARNING: failed to update job {db_job_id} with slurm_job_id: {e}")


def submit_eval(
    hf_model_name: str,
    dataset_hf: str,
    benchmark_id: Optional[str],
    sbatch_script: str,
    sbatch_params: Optional[SbatchParams] = None,
    dry_run: bool = False,
    upload_username: str = "",
    timeout_multiplier: float = DEFAULT_TIMEOUT_MULTIPLIER,
    vllm_overrides: Optional[Dict[str, str]] = None,
    dependency: Optional[str] = None,
    eval_config: Optional[Dict] = None,
    conda_env: str = "otagent",
    run_tag_override: Optional[str] = None,
    dp_nodes: int = 0,
    nodelist: Optional[str] = None,
    extra_env: Optional[Dict[str, str]] = None,
) -> Tuple[Optional[str], Optional[str]]:
    """
    Create a Pending DB entry, then submit sbatch job and update with SLURM ID.

    sbatch positional args:
      $1 = model HF name
      $2 = dataset HF repo (org/repo)
      $3 = benchmark_id (uuid) [optional]
      $4 = job_name (RUN_TAG)

    Environment variables (from SbatchParams.to_env()):
      EVAL_N_CONCURRENT, EVAL_N_ATTEMPTS, EVAL_GPU_MEMORY_UTIL,
      EVAL_DAYTONA_THRESHOLD, EVAL_VLLM_MAX_RETRIES, EVAL_AGENT_PARSER,
      EVAL_SLURM_TIME, EVAL_ENABLE_THINKING, EVAL_AGENT_NAME,
      EVAL_STARTS_LOG (v3), EVAL_TIMEOUT_MULTIPLIER (v3)

    The Pending DB entry includes timeout_multiplier in its config dict
    so that timeout-aware dedup (Enhancement 5) can match on it.

    Args:
        vllm_overrides: Optional dict of EVAL_VLLM_* env vars from baseline
            model config. Merged into sbatch env vars.
        dependency: Optional SLURM dependency string (e.g. 'afterany:12345').
        eval_config: Optional harbor eval config dict for DB job config.
        run_tag_override: v5 - reuse original run_tag for resume (triggers sbatch auto-resume).
        conda_env: Key into CONDA_ENV_PATHS; resolved path is exported as OTAGENT_DIR.
        dp_nodes: When > 0, request that many nodes via --nodes (data-parallel).
        nodelist: Optional explicit --nodelist value (node packing).
        extra_env: Extra env vars merged last (e.g. EVAL_VLLM_PORT from the
            pack-jobs port planner), so they win over earlier values.

    Returns:
        (slurm_job_id, job_name) if successful, ("DRY_RUN", job_name) if dry run, (None, None) on failure
    """
    # Generate unique job name, or reuse for resume
    if run_tag_override:
        job_name = run_tag_override
        log(f" [v5] Reusing run_tag for resume: {job_name}")
    else:
        job_name = generate_run_tag(dataset_hf, hf_model_name)

    # Early return for dry-run — no DB writes, no sbatch
    if dry_run:
        log(f"[DRY RUN] Would submit: model={hf_model_name} dataset={dataset_hf} job={job_name}")
        if sbatch_params:
            log(f"[DRY RUN] With params: {sbatch_params}")
        if vllm_overrides:
            log(f"[DRY RUN] vLLM overrides: {list(vllm_overrides.keys())}", verbose_only=True)
        return ("DRY_RUN", job_name)

    # Step 1: Create or reuse DB entry BEFORE sbatch submission
    agent = sbatch_params.agent_name if sbatch_params else DEFAULT_AGENT_NAME
    tm = sbatch_params.timeout_multiplier if sbatch_params else timeout_multiplier
    config: Dict = {"agent": agent, "env": "daytona", "timeout_multiplier": tm, "run_tag": job_name}
    # Include harbor eval config fields in DB entry for config-aware dedup
    if eval_config:
        if "timeout_multiplier" in eval_config:
            config["timeout_multiplier"] = eval_config["timeout_multiplier"]
        env_overrides = {}
        for key in ("override_cpus", "override_memory_mb", "override_storage_mb"):
            if key in eval_config:
                env_overrides[key] = eval_config[key]
        if env_overrides:
            config["environment"] = env_overrides

    db_job_id: Optional[str] = None
    try:
        from database.unified_db.utils import (
            create_job_entry_pending, get_supabase_client,
            get_model_by_name, get_benchmark_by_name,
            get_job_by_model_benchmark, update_sandbox_job,
        )

        if run_tag_override:
            # v6 resume: find existing DB entry and update its job_name to match
            # the resume run_tag, so sbatch's update_job_status_to_started() can
            # find it by name. Also reset status to Pending for clean state.
            model_row = get_model_by_name(hf_model_name)
            bench_name = dataset_hf.split("/")[-1] if "/" in dataset_hf else dataset_hf
            bench_row = get_benchmark_by_name(bench_name)
            if model_row and bench_row:
                existing = get_job_by_model_benchmark(model_row['id'], bench_row['id'])
                if existing:
                    old_name = existing.get('job_name', '?')
                    old_status = existing.get('job_status', '?')
                    db_job_id = str(existing['id'])
                    # Update the existing entry: reset to Pending with new job_name
                    update_sandbox_job(db_job_id, {
                        "job_name": job_name,
                        "job_status": "Pending",
                        "slurm_job_id": "pending",
                        "config": config,
                        "submitted_at": datetime.now().isoformat(),
                        "started_at": None,
                    })
                    log(f" [v6] Reused DB entry {db_job_id}: {old_status} '{old_name}' → Pending '{job_name}'",
                        verbose_only=True)

        if not db_job_id:
            # Normal path: create new Pending entry
            result = create_job_entry_pending(
                job_name=job_name,
                model_hf=hf_model_name,
                benchmark_hf=dataset_hf,
                agent_name=agent,
                slurm_job_id="pending",
                username=upload_username or "listener",
                config=config,
            )
            if result.get("success") and result.get("job"):
                db_job_id = str(result["job"].get("id"))
                log(f"Created Pending DB entry: {db_job_id}", verbose_only=True)
            else:
                log(f"WARNING: Failed to create Pending DB entry: {result.get('error')}")
    except Exception as e:
        # DB failures never block submission; the job just won't be tracked.
        log(f"WARNING: Exception creating Pending DB entry: {e}")

    # Step 2: Build sbatch command
    cmd = ["sbatch"]
    if sbatch_params:
        cmd.extend(["--time", sbatch_params.slurm_time])
        cmd.extend(["--partition", sbatch_params.slurm_partition])
        if sbatch_params.slurm_account:
            cmd.extend(["--account", sbatch_params.slurm_account])
        # Request GPUs, CPUs, and memory proportional to total GPU count (TP × DP).
        # Per-model baseline config can override TP (e.g. 32B models need TP=4).
        effective_tp = sbatch_params.tp_size
        if vllm_overrides and "EVAL_VLLM_TENSOR_PARALLEL_SIZE" in vllm_overrides:
            effective_tp = int(vllm_overrides["EVAL_VLLM_TENSOR_PARALLEL_SIZE"])
        effective_dp = sbatch_params.dp_size
        total_gpus = effective_tp * effective_dp
        cmd.extend(["--gres", f"gpu:{total_gpus}"])
        cc = _CLUSTER_CONFIG or {}
        hw = cc.get("hardware", {})
        gpus_per_node = int(hw.get("gpus_per_node", 8))
        cpus_per_node = int(hw.get("cpus_per_node", 96))
        mem_per_node_mb = int(hw.get("mem_per_node_mb", 1860000))
        cpus_needed = (cpus_per_node * total_gpus) // gpus_per_node
        mem_needed = (mem_per_node_mb * total_gpus) // gpus_per_node
        cmd.extend(["--cpus-per-task", str(cpus_needed)])
        cmd.extend(["--mem", f"{mem_needed}M"])
    if nodelist:
        cmd.extend(["--nodelist", nodelist])
    if dependency:
        cmd.append(f"--dependency={dependency}")
    # DP: request multiple nodes
    if dp_nodes > 0:
        cmd.extend(["--nodes", str(dp_nodes)])
    # Benchmark-aware SLURM job name for squeue readability
    repo_name = dataset_repo_name(dataset_hf)
    bench_tag = _BENCH_SHORT.get(repo_name, repo_name[:12])
    if run_tag_override:
        job_prefix = "res_dp_" if dp_nodes > 0 else "res_"
    else:
        job_prefix = "eval_dp_" if dp_nodes > 0 else "eval_"
    # NOTE(review): bench_tag and job_prefix are computed above but never
    # used — the job name falls back to the literal "data". This looks like
    # leftover debug; the intended default was presumably
    # f"{job_prefix}{bench_tag}". Confirm against whatever parses squeue
    # job names (e.g. the v6 active-pair detection) before changing.
    slurm_job_name = os.environ.get("EVAL_SLURM_JOB_NAME", "data")
    cmd.extend(["--job-name", slurm_job_name])
    cmd.append(sbatch_script)
    cmd.extend([hf_model_name, dataset_hf])
    # $3 = benchmark_id (or empty placeholder), $4 = run_tag override
    cmd.append(str(benchmark_id) if benchmark_id else "")
    cmd.append(job_name)  # 4th arg: job_name (RUN_TAG)

    # Get env vars from params and merge vllm overrides + harbor config env vars
    env_vars = sbatch_params.to_env() if sbatch_params else {}
    if db_job_id:
        env_vars["EVAL_DB_JOB_ID"] = db_job_id
    if vllm_overrides:
        env_vars.update(vllm_overrides)
    # Pass harbor eval config fields as sbatch env vars
    if eval_config:
        if eval_config.get("timeout_multiplier") is not None:
            env_vars["EVAL_TIMEOUT_MULTIPLIER"] = str(eval_config["timeout_multiplier"])
        if eval_config.get("override_memory_mb") is not None:
            env_vars["EVAL_OVERRIDE_MEMORY_MB"] = str(eval_config["override_memory_mb"])

    # Pass conda env path so sbatch uses the right Python/vLLM installation
    otagent_dir = CONDA_ENV_PATHS.get(conda_env)
    if otagent_dir:
        env_vars["OTAGENT_DIR"] = otagent_dir
    # Merge extra env vars (e.g. EVAL_VLLM_PORT from pack-jobs port planner)
    if extra_env:
        env_vars.update(extra_env)
    # DP: let sbatch compute NUM_SHARDS from GPUS_PER_NODE / TP_SIZE * nodes
    # Do NOT pass EVAL_NUM_SHARDS — it would override the per-node shard calculation

    # Step 3: Run sbatch
    code, out = _run(cmd, env=env_vars)
    log(f"sbatch: {' '.join(cmd)}\n{out}")

    if code != 0:
        # sbatch failed; pending entry remains (will be detected as stale later)
        return (None, None)

    m = re.search(r"Submitted batch job (\d+)", out)
    slurm_job_id = m.group(1) if m else None

    if not slurm_job_id:
        log("ERROR: Could not parse SLURM job ID from sbatch output")
        return (None, None)

    # Step 4: Update pending entry with actual SLURM job ID
    if db_job_id:
        update_pending_job_slurm_id(db_job_id, slurm_job_id)

    return (slurm_job_id, job_name)


# ---------- Main Listener Class ----------
class EvalListener:
    """Unified eval listener v3 that handles all benchmark configurations.

    Lifecycle:
      1. run() logs config, runs Daytona pre-flight (if enabled), enters main loop
      2. Each iteration: hot-reload priority file, fetch models, filter, build
         submissions list, sort by priority rank, apply retry deprioritization,
         throttle to per-listener SLURM limit, submit
      3. Sleep for check_interval_hours, then repeat

    Per-listener SLURM tracking:
      _submitted_jobs tracks SLURM job IDs submitted by THIS listener instance.
      Each iteration, completed jobs are pruned via squeue intersection.
This + allows multiple listeners to run in parallel with independent job budgets. + """ + + def __init__(self, config: ListenerConfig): + self.config = config + self._submitted_jobs: Set[str] = set() # SLURM job IDs submitted by THIS listener + self._dep_chain: List[str] = [] # Persistent sliding-window dependency chain across iterations + self._resume_run_tags: Dict[str, str] = {} # v6: hf_model → run_tag for disk resume + set_log_file(config.log_file) + # Seed _submitted_jobs from previous listener logs (--inherit-log) + if config.inherit_log: + inherited = parse_submitted_jobs_from_logs(config.inherit_log) + self._submitted_jobs = inherited + if inherited: + # Log the inherited IDs so future --inherit-log on THIS log picks them up + log(f"[inherit-log] Inherited {len(inherited)} active job(s)") + log(f"[inherit-log] Inherited jobs: {','.join(sorted(inherited))}") + + def run_iteration(self) -> int: + """ + Run one check iteration. + + Returns: + Number of jobs submitted (or would submit in dry-run mode) + """ + # Hot-reload priority models from file (enables editing during long runs) + if self.config.priority_file: + new_priority = load_priority_models(self.config.priority_file) + if new_priority != self.config.priority_models: + log(f"Priority list reloaded: {len(new_priority)} model(s)") + self.config.priority_models = new_priority + + # Hot-reload blacklist from file + if self.config.blacklist_file: + new_blacklist = load_blacklist(self.config.blacklist_file) + if new_blacklist != self.config.blacklisted_models: + log(f"Blacklist reloaded: {len(new_blacklist)} model(s)") + self.config.blacklisted_models = new_blacklist + + # v6: Clear per-iteration resume state + self._resume_run_tags = {} + + log("Checking for new models...") + + # Optimization: in filter_only mode with a priority file, skip the + # expensive fetch_recent_models() (which returns ALL models in the + # lookback window) and only fetch priority models by name. 
+ if (self.config.priority_mode == "filter_only" + and self.config.priority_models): + models = fetch_priority_models(self.config.priority_models) + log(f"Fetched {len(models)} priority model(s) directly (filter_only mode, skipped full scan).") + else: + models = fetch_recent_models(self.config.lookback_days) + log(f"Found {len(models)} model(s) in lookback window.") + + # Priority models bypass lookback window. + # Fetch priority models by name regardless of creation_time, then merge. + if self.config.priority_models: + priority_models_from_db = fetch_priority_models(self.config.priority_models) + seen_ids = {str(m.get("id")) for m in models} + added = 0 + for pm in priority_models_from_db: + if str(pm.get("id")) not in seen_ids: + models.append(pm) + seen_ids.add(str(pm.get("id"))) + added += 1 + if added: + log(f"Added {added} priority model(s) outside lookback window.") + + log(f"Total {len(models)} model(s) to check. Filtering...") + + # Check if we should skip all models due to require_priority_list + if not self.config.priority_models and self.config.require_priority_list: + log("No priority list configured and --require-priority-list is set. Skipping all models.") + return 0 + + submissions: List[Tuple[str, str, str, Optional[str], str, Optional[str]]] = [] + # (model_id, hf_model_name, dataset_hf, benchmark_id, reason, slurm_job_id) + finished_in_db: Set[str] = set() # v6: models DB considers done (skip for resume) + + # v6: Build set of (model, dataset) pairs currently running in squeue (by parsing eval logs). + # Used to prevent both resume and fresh submissions for already-running models. 
+ active_models, active_pairs, active_run_tags = get_active_model_dataset_pairs( + log_dir=_cc_path("eval_logs_dir", _FALLBACK_EVAL_LOGS_DIR), + ) + if active_pairs: + log(f"[v6-active] Found {len(active_pairs)} (model, dataset) pair(s) currently active in squeue") + if self.config.verbose: + for m, d in sorted(active_pairs): + log(f" [v6-active] {m} on {d}") + + # Track stats + skipped_not_in_priority = 0 + skipped_hf_not_exists = 0 + + # Resolve all benchmarks up front (once per loop) + dataset_to_bench: Dict[str, Optional[str]] = { + ds: resolve_benchmark_id(ds) for ds in self.config.datasets + } + + # Precompute benchmark duplicate groups for cross-duplicate aggregation + bench_dup_groups: Dict[str, List[str]] = {} + for ds, bench_id in dataset_to_bench.items(): + if bench_id: + bench_dup_groups[bench_id] = get_duplicate_group_ids('benchmarks', bench_id) + + for m in models: + model_id = str(m.get("id")) + if not model_id: + continue + + hf_model = resolve_hf_model_name(m) + if not hf_model: + if self.config.verbose: + log(f"Skip: cannot resolve HF model for id={model_id}, name={m.get('name')}") + continue + + # Blacklist check (overrides priority) + if hf_model in self.config.blacklisted_models: + if self.config.verbose: + log(f"Skip: model={hf_model} is blacklisted") + continue + + # Priority handling depends on mode + is_priority = bool(self.config.priority_models and hf_model in self.config.priority_models) + + if self.config.priority_mode == "filter_only": + # Only evaluate models in the priority list + if self.config.priority_models and not is_priority: + skipped_not_in_priority += 1 + continue + # priority_first: don't skip, just track is_priority for sorting + + # HuggingFace existence check + if self.config.check_hf_exists: + if not check_hf_model_exists(hf_model): + log(f"Skip: model not found on HuggingFace: {hf_model} (model_id={model_id})") + skipped_hf_not_exists += 1 + continue + + # Compute model duplicate group for cross-duplicate aggregation + 
model_dup_ids = get_duplicate_group_ids('models', model_id) + + for dataset_hf in self.config.datasets: + bench_id = dataset_to_bench.get(dataset_hf) + + # Get benchmark duplicate group (precomputed above) + bench_dup_ids = bench_dup_groups.get(bench_id) if bench_id else None + + # Check DB status to decide if we should start + # (Enhancement 5: timeout-aware, cross-duplicate aggregation, config-aware dedup) + if self.config.force_reeval: + should_start, reason, old_slurm_job_id = True, "force-reeval", None + else: + should_start, reason, old_slurm_job_id = should_start_job( + model_id, bench_id, self.config.stale_job_hours, + stale_pending_hours=self.config.stale_pending_hours, + timeout_aware=self.config.timeout_aware, + agent_name=self.config.agent_name, + timeout_multiplier=self.config.timeout_multiplier, + duplicate_model_ids=model_dup_ids, + duplicate_benchmark_ids=bench_dup_ids, + eval_config=self.config.eval_config if self.config.eval_config else None, + ) + + if should_start: + # v6: Skip if (model, dataset) already running in squeue (even if DB says "no existing job", + # e.g. when DB entry was deleted but SLURM job is still active) + # Bypass this check in force-reeval mode. 
+ if not self.config.force_reeval and (hf_model, dataset_hf) in active_pairs: + if self.config.verbose: + log(f"Skip: model={hf_model}, dataset={dataset_hf}, reason=currently running in squeue") + continue + submissions.append((model_id, hf_model, dataset_hf, bench_id, reason, old_slurm_job_id)) + else: + # Track models the DB considers done (for v6 resume filtering) + if "finished" in reason: + finished_in_db.add(hf_model) + if self.config.verbose: + log(f"Skip: model={hf_model}, dataset={dataset_hf}, reason={reason}") + + # Log filtering stats + if self.config.priority_mode == "filter_only" and self.config.priority_models and skipped_not_in_priority > 0: + log(f"Skipped {skipped_not_in_priority} model(s) not in priority list") + if self.config.check_hf_exists and skipped_hf_not_exists > 0: + log(f"Skipped {skipped_hf_not_exists} model(s) not found on HuggingFace") + + # v6: Disk-based resume — scan jobs dir for incomplete/errored jobs + resume_submissions = [] + if self.config.enable_disk_resume and self.config.jobs_dirs: + # Always query squeue (even in dry-run) for accurate filtering + active_slurm = get_active_slurm_job_ids() + # Build dataset prefixes from config + ds_prefixes = [] + for ds in self.config.datasets: + ds_short = ds.split("/")[-1] if "/" in ds else ds + ds_prefixes.append(ds_short) + # Scan all configured jobs directories + resume_candidates = [] + for jdir in self.config.jobs_dirs: + resume_candidates.extend(scan_jobs_dir_for_resume( + jobs_dir=jdir, + dataset_prefixes=ds_prefixes, + active_slurm_ids=active_slurm, + infra_error_threshold=self.config.resume_infra_error_threshold, + max_resume_count=self.config.max_resume_count, + )) + # Filter resume candidates through blacklist and priority (same as normal models) + if self.config.blacklisted_models: + before = len(resume_candidates) + resume_candidates = [rc for rc in resume_candidates + if rc["hf_model"] not in self.config.blacklisted_models] + skipped_bl = before - len(resume_candidates) + 
if skipped_bl: + log(f"[v6-resume] Filtered out {skipped_bl} blacklisted resume candidate(s)") + if self.config.priority_mode == "filter_only" and self.config.priority_models: + before = len(resume_candidates) + resume_candidates = [rc for rc in resume_candidates + if rc["hf_model"] in self.config.priority_models] + skipped_prio = before - len(resume_candidates) + if skipped_prio: + log(f"[v6-resume] Filtered out {skipped_prio} non-priority resume candidate(s)") + + # v6: Filter out (model, dataset) pairs currently running in squeue. + # This prevents resuming old dirs when a job for the same model+dataset is active. + if active_pairs: + before = len(resume_candidates) + resume_candidates = [rc for rc in resume_candidates + if (rc["hf_model"], rc.get("dataset", "")) not in active_pairs] + skipped_active = before - len(resume_candidates) + if skipped_active: + log(f"[v6-resume] Filtered out {skipped_active} currently-running resume candidate(s)") + + # Filter out models that DB already considers finished (stale disk dirs + # from older runs that have been superseded by a successful resubmission) + if finished_in_db: + before = len(resume_candidates) + resume_candidates = [rc for rc in resume_candidates + if rc["hf_model"] not in finished_in_db] + skipped_fin = before - len(resume_candidates) + if skipped_fin: + log(f"[v6-resume] Filtered out {skipped_fin} already-finished-in-DB resume candidate(s)") + + # Dedup: pick the most recent dir per model (reverse so latest timestamp wins). 
+ seen_resume_models: Set[str] = set() + for rc in reversed(resume_candidates): + if rc["hf_model"] not in seen_resume_models: + seen_resume_models.add(rc["hf_model"]) + # Use a sentinel model_id since we don't have it from DB + resume_submissions.append( + ("__resume__", rc["hf_model"], rc["dataset"] or "", + None, rc["reason"], None) + ) + # Store run_tag mapping for submit_eval + self._resume_run_tags[rc["hf_model"]] = rc["run_tag"] + if resume_submissions: + log(f"[v6-resume] Adding {len(resume_submissions)} resume job(s) (priority over new models)") + + # v6: Resume takes priority — remove normal submissions for models + # that already have a resume candidate (avoid duplicate fresh + resume). + resume_model_set = {s[1] for s in resume_submissions} # s[1] = hf_model + if resume_model_set: + before = len(submissions) + submissions = [s for s in submissions if s[1] not in resume_model_set] + skipped_dup = before - len(submissions) + if skipped_dup: + log(f"[v6-resume] Suppressed {skipped_dup} fresh submission(s) in favor of resume") + + # --resume-only: drop all fresh submissions, keep only resume jobs + if self.config.resume_only: + if submissions: + log(f"[v6-resume] --resume-only: dropping {len(submissions)} fresh submission(s)") + submissions = [] + + if not submissions and not resume_submissions: + log("No eligible (model, dataset) pairs to submit.") + return 0 + + # Prepend resume submissions (higher priority than new models) + submissions = resume_submissions + submissions + + # Sort submissions by priority file order (earlier in file = higher priority). + # Models not in the priority list get lowest rank (submitted last). 
+ if self.config.priority_models: + priority_rank = {m: i for i, m in enumerate(self.config.priority_models)} + fallback_rank = len(self.config.priority_models) + submissions.sort(key=lambda s: priority_rank.get(s[1], fallback_rank)) + if self.config.priority_mode == "priority_first": + n_priority = sum(1 for s in submissions if s[1] in priority_rank) + n_non_priority = len(submissions) - n_priority + log(f"Priority-first ordering: {n_priority} priority + {n_non_priority} non-priority submissions") + + prefix = "[DRY RUN] Would submit" if self.config.dry_run else "Submitting" + log(f"{prefix} {len(submissions)} eval(s)...") + + # Enhancement 2: Per-listener SLURM job submission throttle. + # Track which SLURM job IDs this listener submitted. Prune finished ones + # via squeue, then cap new submissions at remaining slots. + if not self.config.dry_run: + active_ids = get_active_slurm_job_ids() + # Prune jobs that are no longer in squeue (finished/failed/cancelled) + still_active = self._submitted_jobs & active_ids + finished = len(self._submitted_jobs) - len(still_active) + self._submitted_jobs = still_active + active_count = len(self._submitted_jobs) + remaining_slots = self.config.max_jobs_submitted - active_count + log(f"Listener SLURM jobs: {active_count} active " + f"({finished} finished since last check), " + f"{remaining_slots} slots available (max {self.config.max_jobs_submitted})") + if remaining_slots <= 0: + log(f"WARNING: At per-listener job limit " + f"({active_count}/{self.config.max_jobs_submitted}), " + f"skipping all submissions this iteration") + return 0 + if len(submissions) > remaining_slots: + log(f"Capping submissions from {len(submissions)} to {remaining_slots} " + f"(per-listener limit: {self.config.max_jobs_submitted})") + submissions = submissions[:remaining_slots] + + # Create sbatch params from config + sbatch_params = SbatchParams( + n_concurrent=self.config.n_concurrent, + n_attempts=self.config.n_attempts, + 
gpu_memory_util=self.config.gpu_memory_util, + error_threshold=self.config.error_threshold, + vllm_max_retries=self.config.vllm_max_retries, + agent_parser=self.config.agent_parser, + slurm_time=self.config.slurm_time, + enable_thinking=self.config.enable_thinking, + agent_name=self.config.agent_name, + slurm_partition=self.config.slurm_partition, + slurm_account=self.config.slurm_account, + tp_size=self.config.tp_size, + dp_size=self.config.dp_size, + upload_username=self.config.upload_username, + timeout_multiplier=self.config.timeout_multiplier, + config_yaml=self.config.config_yaml, + auto_snapshot=self.config.auto_snapshot, + ) + + # Load baseline model configs for per-model vLLM overrides + baseline_configs = load_baseline_model_configs(self.config.baseline_model_configs) + + # Add harbor config env vars to sbatch params + if self.config.harbor_config: + # Will be merged in submit_eval via eval_config, but also pass path + pass # harbor config fields are passed via eval_config to submit_eval + + # Pre-download setup (for no-internet compute nodes) + if self.config.pre_download: + from huggingface_hub import snapshot_download + downloaded_models: set = set() + + # Sliding-window dependency tracking (persistent across iterations) + # self._dep_chain carries job IDs from previous iterations so new jobs + # respect the concurrency limit even across sleep cycles. 
+ batch_size = self.config.batch_size + if batch_size and batch_size > 0: + if not self.config.dry_run: + active_ids = get_active_slurm_job_ids() + else: + active_ids = set() + active_in_chain = sum(1 for jid in self._dep_chain if jid in active_ids) + log(f"Sliding-window batch-size={batch_size}: " + f"{active_in_chain} active jobs in dependency chain from previous iterations") + + # Node packing: query idle nodes and track GPU + port slots per node + pack_node_list: List[str] = [] + pack_gpus_per_node = 8 + pack_node_gpu_used: Dict[int, int] = {} # node_idx -> GPUs used so far + pack_node_port_next: Dict[int, int] = {} # node_idx -> next available port offset + pack_node_idx = 0 + if self.config.pack_jobs: + cc = self.config.cluster_config or {} + hw = cc.get("hardware", {}) + pack_gpus_per_node = int(hw.get("gpus_per_node", 8)) + pack_node_list = get_idle_nodes(self.config.slurm_partition) + if pack_node_list: + log(f"Pack mode: {len(pack_node_list)} idle nodes, {pack_gpus_per_node} GPUs/node") + else: + log("Pack mode: no idle nodes found, falling back to default scheduling") + + submitted = 0 + for idx, (mid, hf_model, dataset_hf, bench_id, reason, old_slurm_job_id) in enumerate(submissions): + + # Pre-download this model before submitting (download-then-submit per model) + # Uses the shared HF cache so compute nodes (no internet) find it via HF_HUB_OFFLINE=1 + if self.config.pre_download and hf_model not in downloaded_models: + hf_cache = os.environ.get("HF_HUB_CACHE", _cc_path("hf_cache", _FALLBACK_HF_CACHE)) + log(f" Pre-downloading model {hf_model} to {hf_cache}...") + try: + # Run snapshot_download in a subprocess thread with timeout + # to avoid indefinite hangs on network issues + import concurrent.futures + with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor: + future = executor.submit( + snapshot_download, repo_id=hf_model, repo_type="model", cache_dir=hf_cache + ) + path = future.result(timeout=300) # 5 minute timeout + log(f" Cached 
at {path}") + except concurrent.futures.TimeoutError: + log(f" WARNING: Pre-download of {hf_model} timed out after 300s, skipping (will retry next iteration)") + except Exception as e: + log(f" WARNING: Failed to download {hf_model}: {e}") + downloaded_models.add(hf_model) + + dry_prefix = "[DRY RUN] " if self.config.dry_run else "" + prio_tag = " [PRIORITY]" if (self.config.priority_mode == "priority_first" + and self.config.priority_models + and hf_model in self.config.priority_models) else "" + # v6: Pretty-print resume reason + display_reason = reason + log(f"{dry_prefix}Submitting [{idx+1}/{len(submissions)}]: model={hf_model}, dataset={dataset_hf}, reason={display_reason}{prio_tag}") + + # Cancel stale Pending SLURM job before resubmission + if reason.startswith("stale pending") and old_slurm_job_id: + cancel_slurm_job(old_slurm_job_id, dry_run=self.config.dry_run) + + # Per-model vLLM overrides from baseline config mapping + vllm_overrides = get_vllm_env_overrides(hf_model, baseline_configs) + if vllm_overrides: + log(f" Applying baseline model vLLM overrides: {list(vllm_overrides.keys())}", verbose_only=True) + + # Per-model conda env override (e.g. otagent2 for Qwen3.5) + model_conda_env = get_conda_env_override(hf_model, baseline_configs) or self.config.conda_env + if model_conda_env != self.config.conda_env: + log(f" Using conda env '{model_conda_env}' for {hf_model}") + + # Build sliding-window dependency using persistent chain. + # Look back batch_size positions in self._dep_chain. If that job is + # still active (running/pending in squeue), depend on it. If it already + # finished, the concurrency slot is free — no dependency needed. 
+ job_dependency: Optional[str] = None + if batch_size and batch_size > 0: + chain_pos = len(self._dep_chain) # where this job will be appended + if chain_pos >= batch_size: + dep_candidate = self._dep_chain[chain_pos - batch_size] + if not self.config.dry_run and dep_candidate in active_ids: + job_dependency = f"afterany:{dep_candidate}" + log(f" Depends on job {dep_candidate} (chain pos {chain_pos - batch_size})", verbose_only=True) + + # Stagger chain: jobs wait N minutes after the previous batch STARTS. + # Uses SLURM "after:jobid+minutes" so jobs start sequentially even when + # many nodes are idle (prevents Daytona sandbox creation burst). + # With chain_batch_size=K, K jobs fire together, then the next K wait + # for the first job of the previous batch to have started + delay. + if self.config.stagger_delay > 0 and self._dep_chain: + cbs = max(self.config.chain_batch_size, 1) + chain_len = len(self._dep_chain) + # Determine which batch this job belongs to + current_batch = chain_len // cbs + if current_batch > 0: + # Depend on the first job of the previous batch + prev_batch_first = (current_batch - 1) * cbs + prev_job = self._dep_chain[prev_batch_first] + if not prev_job.startswith(("DRY_", "FAILED_")): + stagger_dep = f"after:{prev_job}+{self.config.stagger_delay}" + if job_dependency: + job_dependency = f"{job_dependency},{stagger_dep}" + else: + job_dependency = stagger_dep + if chain_len % cbs == 0: + log(f" Stagger: batch {current_batch} boundary, wait {self.config.stagger_delay}m after job {prev_job} starts") + else: + log(f" Stagger: batch {current_batch} (pos {chain_len % cbs}/{cbs}), wait {self.config.stagger_delay}m after job {prev_job} starts", verbose_only=True) + + # v6: Extract run_tag_override for disk-based resume + resume_run_tag = self._resume_run_tags.get(hf_model) + + # DP: use DP sbatch when dp_nodes > 0 + actual_sbatch = self.config.dp_sbatch_script if self.config.dp_nodes > 0 else self.config.sbatch_script + + # Node packing: assign a 
target node and port based on GPU slots + target_node = None + pack_port = None + if pack_node_list: + # Determine effective GPU count for this model (TP × DP) + effective_tp = self.config.tp_size + if vllm_overrides and "EVAL_VLLM_TENSOR_PARALLEL_SIZE" in vllm_overrides: + effective_tp = int(vllm_overrides["EVAL_VLLM_TENSOR_PARALLEL_SIZE"]) + effective_dp = self.config.dp_size + total_gpus = effective_tp * effective_dp + # Find a node with enough free GPU slots + while pack_node_idx < len(pack_node_list): + used = pack_node_gpu_used.get(pack_node_idx, 0) + if used + total_gpus <= pack_gpus_per_node: + target_node = pack_node_list[pack_node_idx] + pack_node_gpu_used[pack_node_idx] = used + total_gpus + # Assign a non-overlapping port for this job + port_offset = pack_node_port_next.get(pack_node_idx, 0) + pack_port = 10000 + port_offset + pack_node_port_next[pack_node_idx] = port_offset + max(effective_dp, 1) + break + pack_node_idx += 1 + if target_node: + log(f" Pack: {target_node} (GPUs {pack_node_gpu_used[pack_node_idx]}/{pack_gpus_per_node}, port {pack_port})", verbose_only=True) + + # Pass listener-assigned port to sbatch when packing + extra_env = {} + if pack_port is not None: + extra_env["EVAL_VLLM_PORT"] = str(pack_port) + + slurm_job_id, job_name = submit_eval( + hf_model, + dataset_hf, + bench_id, + actual_sbatch, + sbatch_params=sbatch_params, + dry_run=self.config.dry_run, + upload_username=self.config.upload_username, + timeout_multiplier=self.config.timeout_multiplier, + vllm_overrides=vllm_overrides if vllm_overrides else None, + dependency=job_dependency, + eval_config=self.config.eval_config if self.config.eval_config else None, + conda_env=model_conda_env, + run_tag_override=resume_run_tag, + dp_nodes=self.config.dp_nodes, + nodelist=target_node, + extra_env=extra_env, + ) + + if slurm_job_id: + if self.config.dry_run: + node_str = f" on {target_node}" if target_node else "" + log(f" -> Would submit as SLURM job (job_name={job_name}){node_str}") 
+ self._dep_chain.append(f"DRY_{idx}") + else: + log(f" -> Submitted as SLURM job {slurm_job_id} (job_name={job_name})") + self._submitted_jobs.add(slurm_job_id) + self._dep_chain.append(slurm_job_id) + submitted += 1 + else: + log(f" -> Submission failed") + self._dep_chain.append(f"FAILED_{idx}") + + if not self.config.dry_run and self.config.submission_delay > 0: + time.sleep(self.config.submission_delay) + + return submitted + + def run(self) -> None: + """Main event loop.""" + # Log configuration + hdr = ( + f"lookback={self.config.lookback_days}d, " + f"every {self.config.check_interval_hours}h, " + f"sbatch={self.config.sbatch_script}" + ) + log(f"Starting listener v3 for datasets={self.config.datasets}: {hdr}") + log( + f"Job logic: restart if 'Started' and started_at > {self.config.stale_job_hours}h ago, " + f"restart+scancel if 'Pending' and submitted_at > {self.config.stale_pending_hours}h ago, " + f"skip if 'Finished'" + ) + log(f"Dry run mode: {self.config.dry_run}") + log(f"Run once mode: {self.config.run_once}") + if self.config.force_reeval: + log("WARNING: --force-reeval is ON — bypassing DB status checks, will re-submit even if Finished") + log(f"Check HF exists: {self.config.check_hf_exists}") + log(f"Require priority list: {self.config.require_priority_list}") + + if self.config.priority_models: + mode_desc = "filter_only (skip non-priority)" if self.config.priority_mode == "filter_only" else "priority_first (all models, priority first)" + log(f"Priority mode: {mode_desc}, {len(self.config.priority_models)} model(s) in list") + if self.config.priority_file: + log(f"Priority file: {self.config.priority_file} (hot-reloaded each iteration)") + if self.config.verbose: + for m in sorted(self.config.priority_models): + log(f" - {m}") + else: + log("Priority: disabled (no priority file or empty)") + + if self.config.blacklisted_models: + log(f"Blacklist: {len(self.config.blacklisted_models)} model(s) from {self.config.blacklist_file}") + if 
self.config.verbose: + for m in sorted(self.config.blacklisted_models): + log(f" - {m}") + else: + log("Blacklist: disabled (no blacklist file or empty)") + + # Log sbatch parameters + sbatch_params = SbatchParams( + n_concurrent=self.config.n_concurrent, + n_attempts=self.config.n_attempts, + gpu_memory_util=self.config.gpu_memory_util, + error_threshold=self.config.error_threshold, + vllm_max_retries=self.config.vllm_max_retries, + agent_parser=self.config.agent_parser, + slurm_time=self.config.slurm_time, + enable_thinking=self.config.enable_thinking, + agent_name=self.config.agent_name, + slurm_partition=self.config.slurm_partition, + slurm_account=self.config.slurm_account, + tp_size=self.config.tp_size, + dp_size=self.config.dp_size, + timeout_multiplier=self.config.timeout_multiplier, + config_yaml=self.config.config_yaml, + auto_snapshot=self.config.auto_snapshot, + ) + log(f"Sbatch params: {sbatch_params}") + + # Log v3 enhancement status + log(f"[v3] Max SLURM jobs per listener: {self.config.max_jobs_submitted}") + log(f"[v3] Daytona resource check: {'enabled' if self.config.check_daytona_resources else 'disabled'}") + log(f"[v3] Timeout-aware dedup: {'enabled' if self.config.timeout_aware else 'disabled'}") + if self.config.timeout_multiplier != DEFAULT_TIMEOUT_MULTIPLIER: + log(f"[v3] Timeout multiplier: {self.config.timeout_multiplier}") + if self.config.stagger_delay > 0: + log(f"[v6] Stagger delay: {self.config.stagger_delay}m between batches of {self.config.chain_batch_size} jobs (SLURM after: chain)") + + # Enhancement 3: Daytona resource pre-flight check at startup + if self.config.check_daytona_resources: + ok = check_daytona_resources( + self.config.daytona_sandbox_limit, + self.config.daytona_warning_buffer, + ) + if not ok: + log("ERROR: Daytona resources at limit. 
Exiting.") + sys.exit(1) + + while True: + try: + # Enhancement 3: Optional per-iteration Daytona resource check + if self.config.check_daytona_resources: + ok = check_daytona_resources( + self.config.daytona_sandbox_limit, + self.config.daytona_warning_buffer, + ) + if not ok: + log("WARNING: Daytona resources at limit, skipping this iteration") + if self.config.run_once or self.config.dry_run: + break + hours = self.config.check_interval_hours + log(f"Sleeping for {hours} hours...\n") + time.sleep(self.config.check_interval_seconds) + continue + + self.run_iteration() + + # Exit after one iteration if requested + if self.config.run_once or self.config.dry_run: + mode = "DRY RUN" if self.config.dry_run else "ONCE" + log(f"[{mode}] Complete. Exiting after one iteration.") + break + + hours = self.config.check_interval_hours + log(f"Sleeping for {hours} hours...\n") + time.sleep(self.config.check_interval_seconds) + + except KeyboardInterrupt: + log("Interrupted by user. Exiting.") + sys.exit(0) + except Exception as e: + log(f"ERROR in main loop: {e}. Backing off 30s.") + time.sleep(30) + + +# ---------- CLI Argument Parsing ---------- +def parse_args() -> argparse.Namespace: + """Parse command line arguments.""" + parser = argparse.ArgumentParser( + description="Unified Eval Listener v4 - Run models on benchmark datasets", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +See the module docstring (top of file) for detailed flag reference with tuning +guidance. Quick summary below. 
+ +Presets: aider, bfcl, swebench, v2, tb2, v1 + +v4 new: --blacklist-file PATH Block models from eval (overrides priority list) +v3 opt-in enhancements (all backward compatible): + --error-threshold N Unified invalid error threshold + --max-jobs-submitted N Per-listener SLURM job limit + --check-daytona-resources Daytona sandbox pre-flight check + --track-model-retries Deprioritize repeatedly-started models + --timeout-aware Dedup by model+benchmark+timeout_multiplier + +Examples: + python unified_eval_listener_v4.py --preset v2 \\ + --priority-file priority_models.txt + + python unified_eval_listener_v4.py --preset v2 \\ + --priority-file priority_models.txt \\ + --blacklist-file bad_models.txt + + python unified_eval_listener_v4.py --preset v2 --dry-run --once --verbose + """, + ) + + # Preset configuration + parser.add_argument( + "--preset", "-p", + choices=list(PRESETS.keys()), + help="Use a preset configuration (aider, bfcl, swebench, v2, tb2, v1)", + ) + + # Dataset configuration + parser.add_argument( + "--datasets", "-d", + help="Comma/space separated HF dataset repos (overrides preset)", + ) + parser.add_argument( + "--sbatch-script", "-s", + help="SBATCH script to use (overrides preset)", + ) + parser.add_argument( + "--log-file", + help="Log file path (default: auto-generated based on preset)", + ) + parser.add_argument( + "--log-dir", + help=f"Directory for listener logs (default: {DEFAULT_LOG_DIR}, env: EVAL_LISTENER_LOG_DIR)", + ) + + # Cluster configuration + parser.add_argument( + "--cluster-config", + help="Path to cluster config YAML (e.g. eval/clusters/jupiter.yaml). " + "Provides cluster-specific defaults for SLURM, paths, proxy, and hardware. 
" + "CLI flags still override cluster config values.", + ) + + # Timing configuration + parser.add_argument( + "--lookback-days", + type=int, + help=f"Days to look back for models (default: {DEFAULT_LOOKBACK_DAYS})", + ) + parser.add_argument( + "--check-hours", + type=float, + help=f"Hours between iterations (default: {DEFAULT_CHECK_HOURS})", + ) + parser.add_argument( + "--stale-hours", + type=int, + help=f"Hours before 'Started' job is stale (default: {DEFAULT_STALE_JOB_HOURS})", + ) + parser.add_argument( + "--stale-pending-hours", + type=int, + help=f"Hours before 'Pending' job is stale (default: {DEFAULT_STALE_PENDING_HOURS})", + ) + + # Priority filtering + parser.add_argument( + "--priority-file", + help="Path to priority models file (one model per line)", + ) + parser.add_argument( + "--require-priority-list", + action="store_true", + help="Skip all models when priority list is empty/missing", + ) + parser.add_argument( + "--blacklist-file", + help="Path to blacklisted models file (one model per line). " + "Models in this file are never submitted. 
Overrides priority list.", + ) + parser.add_argument( + "--priority-mode", + choices=["filter_only", "priority_first"], + help='Priority mode: "filter_only" (default) only evaluates priority models; ' + '"priority_first" evaluates all models but submits priority ones first', + ) + + # Validation options + parser.add_argument( + "--check-hf-exists", + action="store_true", + help="Validate model exists on HuggingFace before submit", + ) + + # Eval parameters (passed to sbatch via env vars) + parser.add_argument( + "--n-concurrent", + type=int, + help=f"Harbor concurrent jobs (default: {DEFAULT_N_CONCURRENT}, preset overrides)", + ) + parser.add_argument( + "--n-attempts", + type=int, + help=f"Retry attempts per task (default: {DEFAULT_N_ATTEMPTS})", + ) + parser.add_argument( + "--gpu-memory-util", + type=float, + help=f"VLLM GPU memory fraction (default: {DEFAULT_GPU_MEMORY_UTIL})", + ) + # Enhancement 1: Unified error threshold (with backward-compat alias) + parser.add_argument( + "--error-threshold", + type=int, + dest="error_threshold", + help=f"Max invalid errors before abort upload (default: {DEFAULT_ERROR_THRESHOLD})", + ) + parser.add_argument( + "--daytona-threshold", + type=int, + dest="error_threshold_compat", + help=f"Alias for --error-threshold (backward compat, default: {DEFAULT_ERROR_THRESHOLD})", + ) + parser.add_argument( + "--vllm-max-retries", + type=int, + help=f"VLLM startup retries (default: {DEFAULT_VLLM_MAX_RETRIES})", + ) + parser.add_argument( + "--agent-parser", + help=f"Agent parser type (default: \"{DEFAULT_AGENT_PARSER}\", use \"xml\" for swebench)", + ) + parser.add_argument( + "--slurm-time", + help=f"SLURM time limit (default: \"{DEFAULT_SLURM_TIME}\")", + ) + parser.add_argument( + "--agent-name", + help=f"Agent name for harbor and DB entries (default: \"{DEFAULT_AGENT_NAME}\")", + ) + parser.add_argument( + "--slurm-partition", + help=f"SLURM partition (default: \"{DEFAULT_SLURM_PARTITION}\")", + ) + parser.add_argument( + 
"--slurm-account", + help="SLURM account for job submission (e.g. 'reformo'). " + "Overrides the #SBATCH --account in the sbatch script.", + ) + parser.add_argument( + "--tp-size", + type=int, + choices=[1, 2, 4], + help=f"vLLM tensor parallel size — number of GPUs per model " + f"(default: {DEFAULT_TP_SIZE})", + ) + parser.add_argument( + "--dp-size", + type=int, + default=1, + choices=[1, 2, 4, 8], + help="vLLM native data-parallel size — number of model replicas. " + "Total GPUs = tp_size × dp_size. vLLM load-balances requests " + "across replicas internally. (default: 1)", + ) + parser.add_argument( + "--enable-thinking", + action="store_true", + help="Enable thinking blocks for model inference (default: False)", + ) + parser.add_argument( + "--upload-username", + help="Username for DB entries and result uploads (default: current OS user)", + ) + + # v3 Enhancement 2: Per-listener SLURM job throttle + parser.add_argument( + "--max-jobs-submitted", + type=int, + help=f"Per-listener SLURM job limit. Each listener tracks its own " + f"submitted jobs independently (default: {DEFAULT_MAX_JOBS_SUBMITTED})", + ) + + # v3 Enhancement 3: Daytona resource pre-flight check + parser.add_argument( + "--check-daytona-resources", + action="store_true", + help="Query Daytona API for active sandbox count; skip if at limit. 
" + "Requires DAYTONA_API_KEY in env", + ) + parser.add_argument( + "--daytona-sandbox-limit", + type=int, + help=f"Max expected active sandboxes (default: {DEFAULT_DAYTONA_SANDBOX_LIMIT})", + ) + parser.add_argument( + "--daytona-warning-buffer", + type=float, + help=f"Warn when active sandboxes reach this fraction of limit " + f"(default: {DEFAULT_DAYTONA_WARNING_BUFFER})", + ) + + # v3 Enhancement 5: Timeout-config-sensitive dedup + parser.add_argument( + "--timeout-multiplier", + type=float, + help=f"Harbor timeout multiplier, stored in DB job config " + f"(default: {DEFAULT_TIMEOUT_MULTIPLIER})", + ) + parser.add_argument( + "--timeout-aware", + action="store_true", + help="Dedup jobs by model+benchmark+agent+timeout_multiplier instead " + "of just model+benchmark. Allows same model with different configs", + ) + + # Baseline model configs (per-model vLLM overrides) + parser.add_argument( + "--baseline-model-configs", + help="Path to YAML mapping baseline models to vLLM serving params " + "(e.g., eval/baseline_model_configs.yaml)", + ) + + # Harbor config + parser.add_argument( + "--harbor-config", + help="Path to Harbor YAML config (parsed for timeout_multiplier, " + "resource overrides; passed as EVAL_HARBOR_CONFIG to sbatch)", + ) + + # Pre-download model weights + parser.add_argument( + "--pre-download", + action="store_true", + help="Pre-download all model weights on login node before submitting jobs. " + "Essential for no-internet compute nodes (Leonardo, Jupiter).", + ) + + # Sliding-window batch dependencies + parser.add_argument( + "--batch-size", + type=int, + help="Max concurrent jobs via sliding-window SLURM dependencies. " + "Job N depends on job N-batch_size finishing (afterany), " + "so at most batch-size jobs run at once.", + ) + + # Conda environment selector + parser.add_argument( + "--conda-env", + default="otagent", + help="Conda environment to use for eval jobs. 'otagent2' has vLLM 0.17+ " + "for Qwen3.5 and newer architectures. 
Available envs are defined in " + "the cluster config YAML. (default: otagent)", + ) + + # v6: Disk-based resume + parser.add_argument( + "--jobs-dir", + nargs="+", + default=None, # resolved in build_config from cluster config / env / fallback + help="Path(s) to eval jobs directories for disk-based resume scanning. " + "Can specify multiple dirs. (default: $EVAL_JOBS_DIR or cluster config paths.eval_jobs_dir)", + ) + parser.add_argument( + "--no-disk-resume", + action="store_true", + help="Disable v6 disk-based resume scanning.", + ) + parser.add_argument( + "--resume-only", + action="store_true", + help="Only submit resume jobs from disk scan, skip all fresh submissions.", + ) + parser.add_argument( + "--force-reeval", + action="store_true", + help="Force re-evaluation: bypass DB status check (submit even if Finished/Started). " + "Use with --priority-file to re-run specific models.", + ) + parser.add_argument( + "--dp-nodes", + type=int, + default=0, + help="Use DP (data-parallel) eval with N SLURM nodes. " + "0 = single-node (default). Each node runs shards_per_node vLLM replicas (4/TP).", + ) + parser.add_argument( + "--inherit-log", + nargs="+", + default=None, + help="Path(s) to previous listener log file(s). Seeds _submitted_jobs with SLURM IDs " + "still active in squeue. Supports multiple logs for chained takeovers. " + "Future --inherit-log on THIS listener's log will also pick up inherited IDs.", + ) + parser.add_argument( + "--submission-delay", + type=float, + default=1.0, + help="Seconds to sleep between sbatch submissions (default: 1.0). " + "Increase to avoid Daytona rate limits (e.g. 30 for 600 sandboxes/min).", + ) + parser.add_argument( + "--stagger-delay", + type=int, + default=0, + help="Minutes between job starts via SLURM 'after:' dependency chain (default: 0 = disabled). " + "Each batch of --chain-batch-size jobs waits N minutes after the previous batch STARTS. " + "Prevents Daytona sandbox burst when many pending jobs start simultaneously. 
" + "Minimum 1 (SLURM after: granularity is minutes).", + ) + parser.add_argument( + "--chain-batch-size", + type=int, + default=1, + help="Jobs per stagger batch (default: 1). With --stagger-delay=1 --chain-batch-size=10, " + "10 jobs fire immediately, then the next 10 wait 1 minute after the first batch starts. " + "Only meaningful when --stagger-delay > 0.", + ) + parser.add_argument( + "--pack-jobs", + action="store_true", + help="Pack multiple jobs onto the same node. Queries idle nodes and assigns " + "jobs round-robin so that GPUs_PER_NODE / TP_SIZE jobs share one node.", + ) + parser.add_argument( + "--resume-error-threshold", + type=int, + default=10, + help="Min infrastructure errors to trigger resume for completed jobs. " + "(default: 3)", + ) + parser.add_argument( + "--max-resume-count", + type=int, + default=5, + help="Max times to resume a job dir before giving up. " + "(default: 5)", + ) + + # Execution mode + snapshot_group = parser.add_mutually_exclusive_group() + snapshot_group.add_argument( + "--auto-snapshot", + action="store_true", + default=None, + dest="auto_snapshot", + help="Enable Daytona auto_snapshot (overrides YAML config)", + ) + snapshot_group.add_argument( + "--no-auto-snapshot", + action="store_false", + dest="auto_snapshot", + help="Disable Daytona auto_snapshot (overrides YAML config)", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Preview mode, no actual submission (implies --once)", + ) + parser.add_argument( + "--once", + action="store_true", + help="Run single iteration and exit", + ) + parser.add_argument( + "--verbose", "-v", + action="store_true", + help="Enable verbose logging", + ) + + return parser.parse_args() + + +def _env_bool(name: str) -> bool: + """Get boolean from environment variable.""" + return os.getenv(name, "").lower() in ("1", "true", "yes") + + +def build_config(args: argparse.Namespace) -> ListenerConfig: + """Build configuration from args, env vars, cluster config, and preset 
defaults. + + Resolution order for most fields: + CLI flag > Preset > Cluster config > Hardcoded default + """ + global _CLUSTER_CONFIG, CONDA_ENV_PATHS + + # --- Load cluster config (if provided) --- + cluster_config: Optional[Dict[str, Any]] = None + if args.cluster_config: + cluster_config = load_cluster_config(args.cluster_config) + _CLUSTER_CONFIG = cluster_config + # Override CONDA_ENV_PATHS from cluster config + if cluster_config.get("conda_envs"): + CONDA_ENV_PATHS = cluster_config["conda_envs"] + + # Helper: get cluster config value + def _cc(key: str, default: Any = None) -> Any: + if cluster_config is None: + return default + return cluster_config.get(key, default) + + def _cc_p(key: str, default: Any = None) -> Any: + if cluster_config is None: + return default + return cluster_config.get("paths", {}).get(key, default) + + # Start with preset if specified + preset_config: Dict = {} + if args.preset: + preset_config = PRESETS.get(args.preset, {}) + + # Resolve datasets: CLI > ENV > Preset + datasets_str = args.datasets or os.getenv("EVAL_LISTENER_DATASETS") or "" + if datasets_str: + datasets = parse_datasets(datasets_str) + else: + datasets = preset_config.get("datasets", []) + + if not datasets: + print("ERROR: No datasets specified. 
Use --datasets, EVAL_LISTENER_DATASETS, or --preset") + sys.exit(2) + + # Resolve sbatch script: CLI > ENV > Preset > Cluster config > Default + sbatch_script = ( + args.sbatch_script + or os.getenv("EVAL_LISTENER_SBATCH") + or preset_config.get("sbatch_script") + or _cc_p("sbatch_script", DEFAULT_SBATCH_SCRIPT) + ) + + # Resolve timing: CLI > ENV > Default + lookback_days = ( + args.lookback_days + if args.lookback_days is not None + else int(os.getenv("EVAL_LISTENER_LOOKBACK_DAYS", str(DEFAULT_LOOKBACK_DAYS))) + ) + check_hours = ( + args.check_hours + if args.check_hours is not None + else float(os.getenv("EVAL_LISTENER_CHECK_HOURS", str(DEFAULT_CHECK_HOURS))) + ) + stale_hours = args.stale_hours if args.stale_hours is not None else DEFAULT_STALE_JOB_HOURS + stale_pending_hours = args.stale_pending_hours if args.stale_pending_hours is not None else DEFAULT_STALE_PENDING_HOURS + + # Resolve log file (CLI --log-dir > ENV > Cluster config > default) + log_dir = Path( + args.log_dir + or os.getenv("EVAL_LISTENER_LOG_DIR") + or _cc_p("listener_logs_dir", DEFAULT_LOG_DIR) + ) + log_dir.mkdir(parents=True, exist_ok=True) + + suffix = preset_config.get("log_suffix", "unified") + current_time = datetime.now().strftime("%Y%m%d_%H%M%S") + + resume_tag = "_resume" if args.resume_only else "" + dryrun_tag = "_dryrun" if args.dry_run else "" + + if args.log_file: + log_file = Path(args.log_file) + else: + log_file = log_dir / f"{suffix}_eval_listener_v6{resume_tag}{dryrun_tag}_{current_time}.log" + + # Resolve priority file: CLI > ENV + priority_file = args.priority_file or os.getenv("EVAL_LISTENER_PRIORITY_FILE") + priority_models = load_priority_models(priority_file) + + # Resolve blacklist file: CLI > ENV + blacklist_file = args.blacklist_file or os.getenv("EVAL_LISTENER_BLACKLIST_FILE") + blacklisted_models = load_blacklist(blacklist_file) + + # Resolve priority mode: CLI > ENV > default + priority_mode = ( + args.priority_mode + or 
os.getenv("EVAL_LISTENER_PRIORITY_MODE") + or "filter_only" + ) + + # Resolve boolean flags: CLI > ENV > Preset + require_priority = args.require_priority_list or _env_bool("EVAL_LISTENER_REQUIRE_PRIORITY_LIST") + dry_run = args.dry_run or _env_bool("EVAL_LISTENER_DRY_RUN") + check_hf_exists = ( + args.check_hf_exists + or _env_bool("EVAL_LISTENER_CHECK_HF_EXISTS") + or preset_config.get("check_hf_exists", False) + ) + + # Resolve sbatch parameters: CLI > Preset > Cluster config > Default + def _resolve(cli_val, preset_key: str, default): + if cli_val is not None: + return cli_val + return preset_config.get(preset_key, default) + + n_concurrent = _resolve(args.n_concurrent, "n_concurrent", DEFAULT_N_CONCURRENT) + n_attempts = _resolve(args.n_attempts, "n_attempts", DEFAULT_N_ATTEMPTS) + gpu_memory_util = _resolve(args.gpu_memory_util, "gpu_memory_util", DEFAULT_GPU_MEMORY_UTIL) + + # Enhancement 1: Resolve error_threshold with backward compat + error_threshold_cli = args.error_threshold + if error_threshold_cli is None: + error_threshold_cli = getattr(args, 'error_threshold_compat', None) + error_threshold = _resolve(error_threshold_cli, "error_threshold", DEFAULT_ERROR_THRESHOLD) + + vllm_max_retries = _resolve(args.vllm_max_retries, "vllm_max_retries", DEFAULT_VLLM_MAX_RETRIES) + agent_parser = _resolve(args.agent_parser, "agent_parser", DEFAULT_AGENT_PARSER) + slurm_time = _resolve(args.slurm_time, "slurm_time", _cc("slurm_time", DEFAULT_SLURM_TIME)) + agent_name = _resolve(args.agent_name, "agent_name", DEFAULT_AGENT_NAME) + slurm_partition = _resolve(args.slurm_partition, "slurm_partition", _cc("slurm_partition", DEFAULT_SLURM_PARTITION)) + slurm_account = _resolve(args.slurm_account, "slurm_account", _cc("slurm_account", DEFAULT_SLURM_ACCOUNT)) + tp_size = _resolve(args.tp_size, "tp_size", DEFAULT_TP_SIZE) + dp_size = args.dp_size if args.dp_size else 1 + enable_thinking = args.enable_thinking or preset_config.get("enable_thinking", DEFAULT_ENABLE_THINKING) + 
+ # Resolve upload_username: CLI > ENV > current OS user + upload_username = ( + args.upload_username + or os.getenv("EVAL_UPLOAD_USERNAME") + or getpass.getuser() + ) + + # Enhancement 2: SLURM throttle + max_jobs_submitted = ( + args.max_jobs_submitted + if args.max_jobs_submitted is not None + else int(os.getenv("EVAL_LISTENER_MAX_JOBS", str(DEFAULT_MAX_JOBS_SUBMITTED))) + ) + + # Enhancement 3: Daytona resource check + check_daytona = args.check_daytona_resources + daytona_sandbox_limit = ( + args.daytona_sandbox_limit + if args.daytona_sandbox_limit is not None + else DEFAULT_DAYTONA_SANDBOX_LIMIT + ) + daytona_warning_buffer = ( + args.daytona_warning_buffer + if args.daytona_warning_buffer is not None + else DEFAULT_DAYTONA_WARNING_BUFFER + ) + + # Enhancement 5: Timeout-config-sensitive dedup + timeout_multiplier = ( + args.timeout_multiplier + if args.timeout_multiplier is not None + else DEFAULT_TIMEOUT_MULTIPLIER + ) + timeout_aware = args.timeout_aware + + # auto_snapshot: CLI > Preset > None (use YAML default) + auto_snapshot = args.auto_snapshot + if auto_snapshot is None: + auto_snapshot = preset_config.get("auto_snapshot") + + # Config YAML: Preset > Default + config_yaml = preset_config.get("config_yaml", "dcagent_eval_config.yaml") + + # Harbor config (parse eval-relevant fields for config-aware dedup) + harbor_config = args.harbor_config or preset_config.get("harbor_config") + eval_config = parse_harbor_eval_config(harbor_config) + + # Baseline model configs for per-model vLLM overrides + baseline_model_configs_path = args.baseline_model_configs + + # Pre-download model weights + pre_download = args.pre_download + + # Sliding-window batch dependencies + batch_size = args.batch_size + + # Conda env selector + conda_env = args.conda_env + + # Resolve jobs_dirs: CLI > ENV > Cluster config > Fallback + fallback_jobs_dir = _cc_p("eval_jobs_dir", _FALLBACK_EVAL_JOBS_DIR) + jobs_dirs = args.jobs_dir or [os.environ.get("EVAL_JOBS_DIR", 
fallback_jobs_dir)] + + # Resolve DP sbatch script: Cluster config > Default + dp_sbatch_script = _cc_p("dp_sbatch_script", "eval/unified_eval_harbor_dp.sbatch") + + return ListenerConfig( + datasets=datasets, + sbatch_script=sbatch_script, + log_file=log_file, + lookback_days=lookback_days, + check_interval_hours=check_hours, + stale_job_hours=stale_hours, + stale_pending_hours=stale_pending_hours, + priority_file=priority_file, + require_priority_list=require_priority, + priority_models=priority_models, + priority_mode=priority_mode, + check_hf_exists=check_hf_exists, + dry_run=dry_run, + run_once=args.once, + verbose=args.verbose, + # Sbatch parameters + n_concurrent=n_concurrent, + n_attempts=n_attempts, + gpu_memory_util=gpu_memory_util, + error_threshold=error_threshold, + vllm_max_retries=vllm_max_retries, + agent_parser=agent_parser, + slurm_time=slurm_time, + enable_thinking=enable_thinking, + agent_name=agent_name, + slurm_partition=slurm_partition, + slurm_account=slurm_account, + tp_size=tp_size, + dp_size=dp_size, + upload_username=upload_username, + # Enhancement 2 + max_jobs_submitted=max_jobs_submitted, + # Enhancement 3 + check_daytona_resources=check_daytona, + daytona_sandbox_limit=daytona_sandbox_limit, + daytona_warning_buffer=daytona_warning_buffer, + # Enhancement 5 + timeout_multiplier=timeout_multiplier, + timeout_aware=timeout_aware, + config_yaml=config_yaml, + auto_snapshot=auto_snapshot, + blacklist_file=blacklist_file, + blacklisted_models=blacklisted_models, + # New features + baseline_model_configs=baseline_model_configs_path, + harbor_config=harbor_config, + eval_config=eval_config, + pre_download=pre_download, + batch_size=batch_size, + conda_env=conda_env, + # v6: Disk-based resume + jobs_dirs=jobs_dirs, + enable_disk_resume=not args.no_disk_resume, + resume_infra_error_threshold=args.resume_error_threshold, + max_resume_count=args.max_resume_count, + force_reeval=args.force_reeval, + resume_only=args.resume_only, + 
submission_delay=args.submission_delay, + stagger_delay=max(args.stagger_delay, 0), + chain_batch_size=max(args.chain_batch_size, 1), + pack_jobs=args.pack_jobs, + dp_nodes=args.dp_nodes, + dp_sbatch_script=dp_sbatch_script, + inherit_log=args.inherit_log, + # Cluster config + cluster_config=cluster_config, + ) + + +# ---------- Main ---------- +def main() -> None: + global _VERBOSE + _load_secrets() + args = parse_args() + config = build_config(args) + _VERBOSE = config.verbose + if config.cluster_config: + log(f"[v6] Cluster config: {config.cluster_config.get('cluster_name', '?')}") + listener = EvalListener(config) + listener.run() + + +if __name__ == "__main__": + main() diff --git a/hpc/dotenv/m2.env b/hpc/dotenv/m2.env new file mode 100644 index 00000000..facd40ce --- /dev/null +++ b/hpc/dotenv/m2.env @@ -0,0 +1,27 @@ +export SCRATCH="/mnt/weka/home/$USER" +export DCFT="$SCRATCH/OpenThoughts-Agent" +export DC_AGENT="$DCFT" +export DC_AGENT_SECRET_ENV=~/secrets.env +export HF_CACHE_DIR="$SCRATCH/.cache/huggingface" +export HF_HUB_CACHE="$HF_CACHE_DIR/hub" +export DATASETS_DIR="$HF_HUB_CACHE" +export MODELS_DIR="$HF_HUB_CACHE" +export VLLM_CACHE_ROOT="$SCRATCH/.cache/vllm" +export TRITON_CACHE_DIR="$SCRATCH/.cache/triton" +export FLASHINFER_CACHE_DIR="$SCRATCH/.cache/flashinfer" +export HF_HUB_ENABLE_HF_TRANSFER=1 +export WANDB_PROJECT="OpenThoughts-Agent" +export WANDB_ENTITY="${WANDB_ENTITY:-}" +# Keep the repo importable even if Python strips the working directory from sys.path. 
+export PYTHONPATH="${DCFT}${PYTHONPATH:+:$PYTHONPATH}" +# Conda environment activation helper +export DCFT_CONDA="$SCRATCH/miniconda3" +dcft_activate() { + # shellcheck disable=SC1090 + source "$DCFT_CONDA/bin/activate" otagent + export PYTHONPATH="${DCFT}${PYTHONPATH:+:$PYTHONPATH}" +} +export DCFT_ACTIVATE_ENV=dcft_activate +# TF32 for performance +export PYTORCH_CUDA_ALLOW_TF32=1 +export PYTORCH_CUDNN_ALLOW_TF32=1 diff --git a/hpc/dotenv/mbz.env b/hpc/dotenv/mbz.env new file mode 100644 index 00000000..fd2a65ac --- /dev/null +++ b/hpc/dotenv/mbz.env @@ -0,0 +1,29 @@ +export SCRATCH="/lustrefs/users/$USER" +export DCFT="$SCRATCH/OpenThoughts-Agent" +export DC_AGENT="$DCFT" +export DC_AGENT_SECRET_ENV=~/secrets.env +export HF_CACHE_DIR="$SCRATCH/.cache/huggingface" +export HF_HUB_CACHE="$HF_CACHE_DIR/hub" +export DATASETS_DIR="$HF_HUB_CACHE" +export MODELS_DIR="$HF_HUB_CACHE" +export VLLM_CACHE_ROOT="$SCRATCH/.cache/vllm" +export TRITON_CACHE_DIR="$SCRATCH/.cache/triton" +export FLASHINFER_CACHE_DIR="$SCRATCH/.cache/flashinfer" +export HF_HUB_ENABLE_HF_TRANSFER=1 +export WANDB_PROJECT="OpenThoughts-Agent" +export WANDB_ENTITY="${WANDB_ENTITY:-}" +# Keep the repo importable even if Python strips the working directory from sys.path. 
+export PYTHONPATH="${DCFT}${PYTHONPATH:+:$PYTHONPATH}" +# Conda environment activation helper +export DCFT_CONDA="$SCRATCH/miniconda3" +dcft_activate() { + # shellcheck disable=SC1090 + source "$DCFT_CONDA/bin/activate" otagent + export PYTHONPATH="${DCFT}${PYTHONPATH:+:$PYTHONPATH}" +} +export DCFT_ACTIVATE_ENV=dcft_activate +# TF32 for performance +export PYTORCH_CUDA_ALLOW_TF32=1 +export PYTORCH_CUDNN_ALLOW_TF32=1 +# Your personal email address for notifications +export EMAIL_ADDRESS="richard.zhuang@example.com" diff --git a/hpc/launch_utils.py b/hpc/launch_utils.py index a1647f69..8b13ceec 100644 --- a/hpc/launch_utils.py +++ b/hpc/launch_utils.py @@ -1842,6 +1842,7 @@ def sync_eval_to_database( hf_token: Optional[str] = None, hf_episodes: str = "last", forced_update: bool = False, + is_overlong: bool = False, dry_run: bool = False, ) -> Dict[str, Any]: """Sync evaluation results to Supabase database (with optional HF upload). @@ -1953,6 +1954,7 @@ def sync_eval_to_database( hf_token=token, hf_episodes=hf_episodes, forced_update=forced_update, + is_overlong=is_overlong, ) uploaded = result.get("n_trials_uploaded", 0) From 0f1366a28c015be90283496b9628603c6867ea68 Mon Sep 17 00:00:00 2001 From: richardzhuang0412 Date: Sun, 5 Apr 2026 20:07:29 +0000 Subject: [PATCH 2/3] Remove duplicate nested eval/lists/lists/ directory Co-Authored-By: Claude Opus 4.6 (1M context) --- eval/lists/lists/16x_32b_lc_baseline.txt | 8 - eval/lists/lists/a1_models.txt | 77 --- eval/lists/lists/a1_nl2bash.txt | 1 - eval/lists/lists/a1_retrained.txt | 4 - eval/lists/lists/alfworld_131k.txt | 1 - .../lists/architecture_invalid_test_model.txt | 1 - eval/lists/lists/baseline_swe.txt | 10 - eval/lists/lists/bfcl_rerun_failed.txt | 22 - eval/lists/lists/bfcl_rerun_models.txt | 74 --- eval/lists/lists/custom_force_run.txt | 1 - eval/lists/lists/dp_test_model.txt | 1 - eval/lists/lists/dsv2_rerun_models.txt | 230 --------- eval/lists/lists/exp_tas_qwen35.txt | 1 - 
eval/lists/lists/glm46_131k.txt | 1 - eval/lists/lists/glm47_flash.txt | 1 - eval/lists/lists/inactive_models_latest.txt | 457 ------------------ eval/lists/lists/kept_models_names.txt | 221 --------- eval/lists/lists/laion_latest.txt | 28 -- .../latest_sort_by_release_eval_prio.txt | 28 -- eval/lists/lists/missing_dev_set_v2.txt | 189 -------- .../missing_dev_set_v2_inactive_laion.txt | 24 - ...g_swebench_verified_random_100_folders.txt | 69 --- eval/lists/lists/missing_terminal_bench_2.txt | 96 ---- eval/lists/lists/models_131k.txt | 35 -- eval/lists/lists/models_32b.txt | 31 -- eval/lists/lists/nemotron_nano.txt | 1 - eval/lists/lists/no_eval_models_latest.txt | 30 -- eval/lists/lists/pipeline_exp_prio.txt | 19 - eval/lists/lists/priority_131k_test.txt | 1 - eval/lists/lists/priority_batch2.txt | 3 - eval/lists/lists/priority_batch_evalorg.txt | 3 - eval/lists/lists/priority_obiwan.txt | 1 - eval/lists/lists/priority_qwen35.txt | 1 - eval/lists/lists/priority_rl_test.txt | 1 - eval/lists/lists/pruned_models_names.txt | 364 -------------- eval/lists/lists/pyme_v3_40.txt | 1 - eval/lists/lists/qwen35_27b.txt | 1 - eval/lists/lists/qwen35_9b.txt | 1 - eval/lists/lists/richard_base_model.txt | 1 - eval/lists/lists/richard_test_model.txt | 2 - eval/lists/lists/rope_step_batch.txt | 3 - eval/lists/lists/sera_14b.txt | 1 - eval/lists/lists/swesmith_fixthink_45.txt | 1 - eval/lists/lists/syh_32b.txt | 1 - eval/lists/lists/tb2_richard_test_model.txt | 1 - eval/lists/lists/v2_richard_test_model.txt | 1 - 46 files changed, 2049 deletions(-) delete mode 100644 eval/lists/lists/16x_32b_lc_baseline.txt delete mode 100644 eval/lists/lists/a1_models.txt delete mode 100644 eval/lists/lists/a1_nl2bash.txt delete mode 100644 eval/lists/lists/a1_retrained.txt delete mode 100644 eval/lists/lists/alfworld_131k.txt delete mode 100644 eval/lists/lists/architecture_invalid_test_model.txt delete mode 100644 eval/lists/lists/baseline_swe.txt delete mode 100644 
eval/lists/lists/bfcl_rerun_failed.txt delete mode 100644 eval/lists/lists/bfcl_rerun_models.txt delete mode 100644 eval/lists/lists/custom_force_run.txt delete mode 100644 eval/lists/lists/dp_test_model.txt delete mode 100644 eval/lists/lists/dsv2_rerun_models.txt delete mode 100644 eval/lists/lists/exp_tas_qwen35.txt delete mode 100644 eval/lists/lists/glm46_131k.txt delete mode 100644 eval/lists/lists/glm47_flash.txt delete mode 100644 eval/lists/lists/inactive_models_latest.txt delete mode 100644 eval/lists/lists/kept_models_names.txt delete mode 100644 eval/lists/lists/laion_latest.txt delete mode 100644 eval/lists/lists/latest_sort_by_release_eval_prio.txt delete mode 100644 eval/lists/lists/missing_dev_set_v2.txt delete mode 100644 eval/lists/lists/missing_dev_set_v2_inactive_laion.txt delete mode 100644 eval/lists/lists/missing_swebench_verified_random_100_folders.txt delete mode 100644 eval/lists/lists/missing_terminal_bench_2.txt delete mode 100644 eval/lists/lists/models_131k.txt delete mode 100644 eval/lists/lists/models_32b.txt delete mode 100644 eval/lists/lists/nemotron_nano.txt delete mode 100644 eval/lists/lists/no_eval_models_latest.txt delete mode 100644 eval/lists/lists/pipeline_exp_prio.txt delete mode 100644 eval/lists/lists/priority_131k_test.txt delete mode 100644 eval/lists/lists/priority_batch2.txt delete mode 100644 eval/lists/lists/priority_batch_evalorg.txt delete mode 100644 eval/lists/lists/priority_obiwan.txt delete mode 100644 eval/lists/lists/priority_qwen35.txt delete mode 100644 eval/lists/lists/priority_rl_test.txt delete mode 100644 eval/lists/lists/pruned_models_names.txt delete mode 100644 eval/lists/lists/pyme_v3_40.txt delete mode 100644 eval/lists/lists/qwen35_27b.txt delete mode 100644 eval/lists/lists/qwen35_9b.txt delete mode 100644 eval/lists/lists/richard_base_model.txt delete mode 100644 eval/lists/lists/richard_test_model.txt delete mode 100644 eval/lists/lists/rope_step_batch.txt delete mode 100644 
eval/lists/lists/sera_14b.txt delete mode 100644 eval/lists/lists/swesmith_fixthink_45.txt delete mode 100644 eval/lists/lists/syh_32b.txt delete mode 100644 eval/lists/lists/tb2_richard_test_model.txt delete mode 100644 eval/lists/lists/v2_richard_test_model.txt diff --git a/eval/lists/lists/16x_32b_lc_baseline.txt b/eval/lists/lists/16x_32b_lc_baseline.txt deleted file mode 100644 index b70e4288..00000000 --- a/eval/lists/lists/16x_32b_lc_baseline.txt +++ /dev/null @@ -1,8 +0,0 @@ -nvidia/Nemotron-Terminal-32B -allenai/SERA-32B -laion/rl__40GPU_base_32b__exp_rpt_nemotron-bash__sft_GLM-4-7-swesmith -laion/sft_r2egym-nl2bash-stackoverflow-inferredbugs-32B_Qwen3-32B -laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_adam-beta1_0-95_Qwen3-32B -laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_warmup-ratio_0-01_Qwen3-32B -laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_num-train-epochs_7.0_Qwen3-32B -laion/alfworld-swesmith-r2egym-swegym-131k-32B-lc \ No newline at end of file diff --git a/eval/lists/lists/a1_models.txt b/eval/lists/lists/a1_models.txt deleted file mode 100644 index 8fd5976a..00000000 --- a/eval/lists/lists/a1_models.txt +++ /dev/null @@ -1,77 +0,0 @@ -DCAgent/a1-stack_rspec -DCAgent/a1-stack_go -DCAgent/a1-self_instruct_naive -DCAgent/a1-nemotron_bash_withtests_gpt5mini -DCAgent/a1-nemotron_bash_withtests -DCAgent/a1-inferredbugs -DCAgent/a1-codeforces -DCAgent/a1-code_contests -DCAgent/a1-bash_textbook -DCAgent/a1-swegym_openhands -DCAgent/a1-stack_bash_withtests_gpt5mini -DCAgent/a1-orca_agentinstruct -DCAgent/a1-nnetnav_live -DCAgent/a1-nebius_swe_agent -DCAgent/a1-mind2web -DCAgent/a1-go_browse_wa -DCAgent/a1-codeactinstruct -DCAgent/a1-agenttuning_alfworld -DCAgent/a1-stack_pytest_gpt5mini -DCAgent/a1-nemotron_pytest -DCAgent/a1-glaive_code_assistant -DCAgent/a1-wizardlm_orca -DCAgent/a1-nemo_prism_math -DCAgent/a1-tulu3_sft_personas_math -DCAgent/a1-swesmith -DCAgent/a1-r2egym 
-DCAgent/a1-magicoder -DCAgent/a1-ghactions -DCAgent/a1-freelancer -DCAgent/a1-codeelo -DCAgent/a1-bugswarm -DCAgent/a1-stackexchange_unix -DCAgent/a1-stackexchange_tor -DCAgent/a1-stackexchange_superuser -DCAgent/a1-stack_pytest_withtests -DCAgent/a1-stack_pytest_synthetic_gpt5nano -DCAgent/a1-stack_phpunit -DCAgent/a1-pymethods2test -DCAgent/a1-defects4j -DCAgent/a1-curriculum_medium -DCAgent/a1-curriculum_hard -DCAgent/a1-curriculum_easy -DCAgent/a1-code_feedback -DCAgent/a1-agenttuning_webshop -DCAgent/a1-agenttuning_os -DCAgent/a1-agenttuning_mind2web -DCAgent/a1-agenttuning_db -DCAgent/a1-agenttuning_kg -DCAgent/a1-taskmaster2 -DCAgent/a1-stack_bash -DCAgent/a1-repo_scaffold -DCAgent/a1-pr_mining -DCAgent/a1-nemotron_junit -DCAgent/a1-nemotron_cpp -DCAgent/a1-nemotron_bash -DCAgent/a1-manybugs -DCAgent/a1-issue_tasks -DCAgent/a1-codenet_python -DCAgent/a1-bugsinpy -DCAgent/a1-multifile_composition -DCAgent/a1-exercism_python -DCAgent/a1-crosscodeeval_typescript -DCAgent/a1-crosscodeeval_python -DCAgent/a1-crosscodeeval_java -DCAgent/a1-taco -DCAgent/a1-staqc -DCAgent/a1-stackexchange_tezos -DCAgent/a1-stackexchange_overflow -DCAgent/a1-stack_rust -DCAgent/a1-stack_ruby -DCAgent/a1-stack_pytest -DCAgent/a1-stack_junit -DCAgent/a1-stack_jest -DCAgent/a1-stack_csharp -DCAgent/a1-stack_cpp -DCAgent/a1-stack_bash_withtests -DCAgent/a1-crosscodeeval_csharp \ No newline at end of file diff --git a/eval/lists/lists/a1_nl2bash.txt b/eval/lists/lists/a1_nl2bash.txt deleted file mode 100644 index 1069eaa1..00000000 --- a/eval/lists/lists/a1_nl2bash.txt +++ /dev/null @@ -1 +0,0 @@ -DCAgent/a1-nl2bash diff --git a/eval/lists/lists/a1_retrained.txt b/eval/lists/lists/a1_retrained.txt deleted file mode 100644 index be82d222..00000000 --- a/eval/lists/lists/a1_retrained.txt +++ /dev/null @@ -1,4 +0,0 @@ -DCAgent/a1-bugswarm -DCAgent/a1-codeelo -DCAgent/a1-ghactions -DCAgent/a1-magicoder diff --git a/eval/lists/lists/alfworld_131k.txt b/eval/lists/lists/alfworld_131k.txt 
deleted file mode 100644 index 2f77176c..00000000 --- a/eval/lists/lists/alfworld_131k.txt +++ /dev/null @@ -1 +0,0 @@ -laion/alfworld-swesmith-r2egym-swegym-131k-lc diff --git a/eval/lists/lists/architecture_invalid_test_model.txt b/eval/lists/lists/architecture_invalid_test_model.txt deleted file mode 100644 index 13a9cb14..00000000 --- a/eval/lists/lists/architecture_invalid_test_model.txt +++ /dev/null @@ -1 +0,0 @@ -DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step30 \ No newline at end of file diff --git a/eval/lists/lists/baseline_swe.txt b/eval/lists/lists/baseline_swe.txt deleted file mode 100644 index 6df89642..00000000 --- a/eval/lists/lists/baseline_swe.txt +++ /dev/null @@ -1,10 +0,0 @@ -open-thoughts/OpenThinker-Agent-v1 -camel-ai/seta-rl-qwen3-8b -allenai/SERA-14B -nvidia/Nemotron-Terminal-32B -nvidia/Nemotron-Terminal-14B -nvidia/Nemotron-Terminal-8B -obiwan96/qwen3-8b-openthinker-sft-endless-terminals -nvidia/Llama-3.1-Nemotron-Nano-8B-v1 -deepseek-ai/DeepSeek-R1-Distill-Qwen-7B -open-thoughts/OpenThinker3-7B \ No newline at end of file diff --git a/eval/lists/lists/bfcl_rerun_failed.txt b/eval/lists/lists/bfcl_rerun_failed.txt deleted file mode 100644 index fa394720..00000000 --- a/eval/lists/lists/bfcl_rerun_failed.txt +++ /dev/null @@ -1,22 +0,0 @@ -DCAgent2/nl2bash-swesmithseq -DCAgent2/stack-bugsshuffle -DCAgent2/stack-bugs-undr7030 -DCAgent2/swesmith-nl2bashseq -DCAgent/exp_tas_max_episodes_512_traces -DCAgent/exp_tas_max_tokens_1024_traces -DCAgent/exp_tas_presence_penalty_1_0_traces -DCAgent/exp_tas_repetition_penalty_1_05_traces -DCAgent/hr1_code-contests-sandboxes-with-tests-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step_60 -DCAgent/nl2bashGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step20 -DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step50 -DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step20 
-DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step40 -DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step_73 -DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step40 -DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step_55 -laion/exp_tas_baseline_traces -laion/exp-uns-r2egym-2_1x_glm_4_7_traces_jupiter_cleaned -laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_adam-beta1_0-95_Qwen3-32B -laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_num-train-epochs_6.0_Qwen3-32B -laion/Qwen3-8B_exp_tas_temp_0.25_traces_save-strategy_steps -laion/Qwen3-8B_perturbed-docker-exp-taskmaster2-tasks_glm_4.7_traces_locetash_save-strategy_steps diff --git a/eval/lists/lists/bfcl_rerun_models.txt b/eval/lists/lists/bfcl_rerun_models.txt deleted file mode 100644 index e8af30d2..00000000 --- a/eval/lists/lists/bfcl_rerun_models.txt +++ /dev/null @@ -1,74 +0,0 @@ -DCAgent/All_Puzzles_5k_new_context -DCAgent/code-contests-sandboxes-traces-terminus-2_new_hparams_11_10_25_Qwen3-8B -DCAgent/code_contests_new_questions_gpt-5-mini_Qwen3-8B -DCAgent/freelancer-random-instruction-filter-traces-terminus-2_Qwen3-8B -DCAgent/neulab-codeactinstruct-sandboxes-traces-terminus-2_Qwen3-8B -DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step20 -DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step40 -DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120225harbor_step_50 -DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120325harbor_step_73 -DCAgent/nl2bashG5CP-nl2bash-bs_Q3-8B-mE32-aT-dS-120325hbr_step_20 -DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step40 -DCAgent/taskmaster2-64ep -DCAgent/taskmaster2-banana -DCAgent2/freelancer-t256s-32ep_hub-model-id_freelancer-t256s-32ep-restore-hp_Qwen3-8B 
-DCAgent2/neulab-code-feedback-sandboxes-traces-terminus-2_hub-model-id_neulab-code-feedback-restore-hp_Qw -DCAgent2/neulab-mind2web-sandboxes-traces-terminus-2_hub-model-id_neulab-mind2web-restore-hp_Qwen3-8B -DCAgent2/neulab-synatra-sandboxes-traces-terminus-2_hub-model-id_neulab-synatra-restore-hp_Qwen3-8B -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-5epochs -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-5epochs-lr1e-3 -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-5epochs-lr1e-5 -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wd004 -DCAgent2/taskmaster2-8ep_hub-model-id_taskmaster2-8ep-2stage-restore-hp_freelancer-projects-sandboxes-tra -Qwen/Qwen3-8B-Base -laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_Qwen3-32B -laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_global-batch-size_64_Qwen3-32B -laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_learning-rate_1e-06_Qwen3-32B -laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_num-train-epochs_6.0_Qwen3-32B -laion/GLM-4.6-stackoverflow-32eps-65k-fixeps_Qwen3-8B -laion/Qwen3-8B_exp_tas_temp_0.5_traces_save-strategy_steps -laion/dev_set_part1_10k_glm_4_7_traces_locetash -laion/exp-gfi-staqc-askllm-filtered-10K_glm_4_7_traces_jupiter -laion/exp-gfi-staqc-askllm-filtered-10K_glm_4_7_traces_jupiter_cleaned -laion/exp-gfi-swesmith-short-response-filtered-10K_glm_4_7_traces_jupiter -laion/exp-syh-r2egym-askllm-constrained_glm_4_7_traces_jupiter_cleaned -laion/exp-syh-r2egym-askllm-hardened_glm_4_7_traces_jupiter -laion/exp-syh-tezos-askllm-constrained_glm_4_7_traces_jupiter -laion/exp-syh-tezos-stackoverflow-mixed_glm_4_7_traces_jupiter -laion/exp-uns-r2egym-16_8x_glm_4_7_traces_jupiter_cleaned -laion/exp-uns-r2egym-33_6x_glm_4_7_traces_jupiter -laion/exp-uns-r2egym-33_6x_glm_4_7_traces_jupiter_cleaned -laion/exp-uns-tezos-1unique_glm_4_7_traces_jupiter -laion/exp-uns-tezos-40x_glm_4_7_traces_jupiter 
-laion/exp_tas_frequency_penalty_0_5_traces -laion/exp_tas_high_diversity_traces -laion/glm-4_6-stackexchange-tezos-32ep-131k -laion/glm46-neulab-agenttuning-alfworld-sandboxes-maxeps-131k -laion/qwen3base-GLM-4_7-swesmith-sandboxes-with_tests-oracle_verified_120s-maxeps-131k -laion/r2egym-nl2bash-stack-bugsseq-bash-withtests -laion/r2egym-nl2bash-stack-bugsseq-cpp -laion/r2egym-nl2bash-stack-bugsseq-fixthink-again -laion/r2egym-nl2bash-stack-bugsseq-fixthink-exercism-python -laion/r2egym-nl2bash-stack-bugsseq-fixthink-methods2test-v2 -laion/r2egym-nl2bash-stack-bugsseq-fixthink-stack-csharp -laion/r2egym-nl2bash-stack-bugsseq-fixthink-stack-pytest-large -laion/rl_tp4s64_8x_curated -mlfoundations-dev/all-puzzles-sandboxes-traces-terminus-2 -mlfoundations-dev/code-contests-sandboxes-traces-terminus-2 -mlfoundations-dev/code-contests-sandboxes-traces-terminus-2_global-batch-size_16 -mlfoundations-dev/code-contests-sandboxes-traces-terminus-2_seed_44 -mlfoundations-dev/code-contests-sandboxes-traces-terminus-2_seed_45 -mlfoundations-dev/code-contests-sandboxes-traces-terminus-2_seed_46 -mlfoundations-dev/codeforces-sandboxes-traces-terminus-2 -mlfoundations-dev/defects4j-sandboxes-traces-terminus-2 -mlfoundations-dev/freelancer-projects-sandboxes-traces-terminus-2 -mlfoundations-dev/inferredbugs-sandboxes-traces-terminus-2 -mlfoundations-dev/nemo-prism-math-sandboxes-traces-terminus-2 -mlfoundations-dev/qasper-sandboxes-traces-terminus-2 -mlfoundations-dev/stackexchange-overflow-sandboxes-traces-terminus-2 -mlfoundations-dev/stackexchange-tezos-sandboxes-traces-terminus-2 -mlfoundations-dev/staqc-sandboxes-traces-terminus-2 -mlfoundations-dev/swesmith_with_plain_docker-sandboxes-traces-terminus-2 -mlfoundations-dev/taskmaster2-sandboxes-traces-terminus-2 -penfever/GLM-4_6-inferredbugs-32eps-65k-fixeps -penfever/nl2bash-GLM-4_6-traces-newhparams diff --git a/eval/lists/lists/custom_force_run.txt b/eval/lists/lists/custom_force_run.txt deleted file mode 100644 index 
15acb74c..00000000 --- a/eval/lists/lists/custom_force_run.txt +++ /dev/null @@ -1 +0,0 @@ -nvidia/Llama-3.1-Nemotron-Nano-8B-v1 diff --git a/eval/lists/lists/dp_test_model.txt b/eval/lists/lists/dp_test_model.txt deleted file mode 100644 index 01b63ba7..00000000 --- a/eval/lists/lists/dp_test_model.txt +++ /dev/null @@ -1 +0,0 @@ -DCAgent/a1-stack_jest \ No newline at end of file diff --git a/eval/lists/lists/dsv2_rerun_models.txt b/eval/lists/lists/dsv2_rerun_models.txt deleted file mode 100644 index 317130e0..00000000 --- a/eval/lists/lists/dsv2_rerun_models.txt +++ /dev/null @@ -1,230 +0,0 @@ -DCAgent/All_Puzzles_5k_New_Context_GPT4o-mini_new_context -DCAgent/All_Puzzles_5k_OG_5k_New_Context_GPT4o-mini_new_context -DCAgent/All_Puzzles_5k_new_context -DCAgent/all-puzzles-sandboxes-traces-terminus-2-with-gpt-4o-mini-judgments-correct -DCAgent/code-contests-sandboxes-traces-terminus-2_new_hparams_11_10_25_Qwen3-8B -DCAgent/code_contests-GLM-4.6-traces_Qwen3-8B -DCAgent/code_contests-Qwen3-Coder-480B-traces -DCAgent/code_contests_new_questions_gpt-5-mini_Qwen3-8B -DCAgent/exp_tas_max_episodes_32_traces -DCAgent/exp_tas_max_episodes_512_traces -DCAgent/exp_tas_max_tokens_1024_traces -DCAgent/exp_tas_max_tokens_8192_traces -DCAgent/exp_tas_presence_penalty_0_25_traces -DCAgent/exp_tas_presence_penalty_1_0_traces -DCAgent/exp_tas_repetition_penalty_1_05_traces -DCAgent/freelancer-embedding-mean-instruction-filter_Qwen3-8B -DCAgent/freelancer-projects-0-1k-traces -DCAgent/freelancer-projects-10k-traces -DCAgent/freelancer-projects-gpt5_Qwen3-8B -DCAgent/freelancer-random-instruction-filter-traces-terminus-2_Qwen3-8B -DCAgent/hr1_code-contests-sandboxes-with-tests-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step_60 -DCAgent/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112725harbor_step20 -DCAgent/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step20 
-DCAgent/neulab-codeactinstruct-sandboxes-traces-terminus-2_Qwen3-8B -DCAgent/nl2bash-GLM-4.6-traces_Qwen3-8B -DCAgent/nl2bash-Qwen3-Coder-480B-traces_Qwen3-8B -DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step25 -DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step50 -DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step73 -DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step40 -DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step_73 -DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120225harbor_step_50 -DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120225harbor_step_73 -DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120325harbor_step_50 -DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120325harbor_step_73 -DCAgent/nl2bashG5CP-nl2bash-bs_Q3-8B-mE32-aT-dS-120325hbr_step_20 -DCAgent/nl2bashGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step20 -DCAgent/r2egymG5CP-nl2bash-bs_Q3-8B-mE24-aT-dS-120325hbr_step_25 -DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step40 -DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step_55 -DCAgent/staqc-ot3-100k-science-subset-traces-terminus-2_Qwen3-8B -DCAgent/staqc-ot3-100k-traces-terminus-2_Qwen3-8B -DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-32B -DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-4B-Instruct-2507 -DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-4B-Thinking-2507 -DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-8B-Base -DCAgent/taskmaster2-0-1k-traces -DCAgent/taskmaster2-0-3k-traces -DCAgent/taskmaster2-1ep -DCAgent/taskmaster2-1k-traces -DCAgent/taskmaster2-32ep -DCAgent/taskmaster2-4ep -DCAgent/taskmaster2-banana -DCAgent/taskmaster2-gpt5mini -DCAgent/taskmaster2-gpt5mini_global-batch-size_16 -DCAgent/test_sft_qwen3_32k_base_NO_EVAL_Qwen3-8B 
-DCAgent/wikitable_format_conversion-qwen3-coder-480b-a35b-instruct-awq-traces -DCAgent2/GLM-4_6-codeforces-32eps-32k-fixeps -DCAgent2/GLM-4_6-nl2bash-verified-32eps-32k-fixeps -DCAgent2/GLM-4_6-stackexchange-overflow-sandboxes-32eps-32k -DCAgent2/bugs-stack-nl2bashseq -DCAgent2/bugs-swesmith-over5050 -DCAgent2/bugs-swesmith-undr7030 -DCAgent2/codeforces-GLM-4_6-traces-32ep-32k-1-2-4-dv -DCAgent2/freelancer-projects-31k-traces -DCAgent2/freelancer-t256s-32ep_hub-model-id_freelancer-t256s-32ep-restore-hp_Qwen3-8B -DCAgent2/gemini25flash-stackexchange-overflow-32ep-512k-v3-traces -DCAgent2/glm-4_6-freelancer-traces -DCAgent2/inferredbugs-GLM-4_6-32ep-32k -DCAgent2/inferredbugs-GLM-4_6-32ep-65k -DCAgent2/neulab-code-feedback-sandboxes-traces-terminus-2_hub-model-id_neulab-code-feedback-restore-hp_Qw -DCAgent2/neulab-mind2web-sandboxes-traces-terminus-2_hub-model-id_neulab-mind2web-restore-hp_Qwen3-8B -DCAgent2/neulab-synatra-sandboxes-traces-terminus-2_hub-model-id_neulab-synatra-restore-hp_Qwen3-8B -DCAgent2/nl2bash-bugs-over5050 -DCAgent2/nl2bash-bugsshuffle -DCAgent2/nl2bash-stack-bugs-undr503020 -DCAgent2/nl2bash-stack-bugsseq -DCAgent2/nl2bash-stack-bugsshuffle -DCAgent2/nl2bash-stack-undr3070 -DCAgent2/nl2bash-stack-undr7030 -DCAgent2/nl2bash-stackshuffle -DCAgent2/nl2bash-swesmith-reason -DCAgent2/nl2bash-swesmith-undr7030 -DCAgent2/nl2bash-swesmithseq -DCAgent2/nl2bash-verified-GLM-4.6-traces-32ep-32k-ab1085_Qwen3-8B -DCAgent2/nl2bash-verified-GLM-4.6-traces-32ep-32k-ab1089_Qwen3-8B -DCAgent2/nl2bash-verified-GLM-4.6-traces-32ep-32k-ab1093_Qwen3-8B -DCAgent2/nl2bash-verified-GLM-4.6-traces-32ep-32k_global-batch-size_128_Qwen3-8B -DCAgent2/nl2bash-verified-GLM-4.6-traces-32ep-32k_global-batch-size_32_Qwen3-8B -DCAgent2/nl2bash-verified-GLM-4.6-traces-32ep-32k_global-batch-size_64_Qwen3-8B -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-5epochs-lr1e-3 -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-5epochs-lr1e-4 
-DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-5epochs-lr1e-5 -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-5epochs-lr1e-6 -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-7epochs -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-8epochs -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-ab20998 -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wd0 -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wd002 -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wd006 -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wd008 -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wr0 -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wr0005 -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wr0010 -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wr0150 -DCAgent2/stack-bugs-over5050 -DCAgent2/stack-bugs-undr7030 -DCAgent2/stack-bugsseq -DCAgent2/stack-bugsshuffle -DCAgent2/stack-swesmithseq -DCAgent2/swesmith-bugsseq -DCAgent2/swesmith-nl2bashseq -DCAgent2/swesmith-stack-over5050 -DCAgent2/swesmith-stack-reason -DCAgent2/taskmaster2-1ep_hub-model-id_taskmaster2-1ep-2stage-restore-hp_freelancer-projects-sandboxes-tra -DCAgent2/taskmaster2-8ep_hub-model-id_taskmaster2-8ep-2stage-restore-hp_freelancer-projects-sandboxes-tra -DCAgent2/taskmaster2-GLM-4_6-32ep-32k -NovaSky-AI/SA-SWE-32B -Qwen/Qwen3-1.7B -Qwen/Qwen3-14B -Qwen/Qwen3-32B -Qwen/Qwen3-4B -Qwen/Qwen3-4B-Thinking-2507 -Qwen/Qwen3-8B-Base -Qwen/Qwen3-Coder-30B-A3B-Instruct -R2E-Gym/R2EGym-32B-Agent -SWE-bench/SWE-agent-LM-7B -bespokelabs/Qwen3-8B-ot_step100 -laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_adam-beta1_0-95_Qwen3-32B -laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_learning-rate_1e-06_Qwen3-32B -laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_lr_1e-5_Qwen3-32B -laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_num-train-epochs_6.0_Qwen3-32B 
-laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_num-train-epochs_7.0_Qwen3-32B -laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_warmup-ratio_0-05_Qwen3-32B -laion/GLM-4_6-freelancer-32eps-131k -laion/GLM-4_6-stackexchange-overflow-sandboxes-32eps-65k-reasoning -laion/GLM-4_6-stackexchange-superuser-32ep-32k -laion/GLM-4_6-swesmith-32ep-131k-nosumm -laion/GLM-4_7-swesmith-sandboxes-with_tests-oracle_verified_120s-maxeps-131k-fixthink -laion/Qwen3-8B_exp-swd-swesmith-wo-docker_glm_4.7_traces_locetash_save-strategy_steps -laion/Qwen3-8B_exp_tas_temp_0.25_traces_save-strategy_steps -laion/Qwen3-8B_exp_tas_temp_0.5_traces_save-strategy_steps -laion/Qwen3-8B_perturbed-docker-exp-taskmaster2-tasks_glm_4.7_traces_locetash_save-strategy_steps -laion/bugs-nl2bashseq_Qwen3-8B -laion/dev_set_part1_10k_glm_4_7_traces_jupiter_cleaned -laion/exp-gfi-staqc-askllm-filtered-10K_glm_4_7_traces_jupiter_cleaned -laion/exp-psu-stackoverflow-1K_glm_4_7_traces -laion/exp-syh-r2egym-askllm-constrained_glm_4_7_traces_jupiter_cleaned -laion/exp-syh-r2egym-swesmith-mixed_glm_4_7_traces_jupiter -laion/exp-syh-tezos-stackoverflow-mixed_glm_4_7_traces_jupiter_cleaned -laion/exp-uns-r2egym-16_8x_glm_4_7_traces_jupiter_cleaned -laion/exp-uns-r2egym-2_1x_glm_4_7_traces_jupiter_cleaned -laion/exp-uns-tezos-10x_glm_4_7_traces_jupiter_cleaned -laion/exp-uns-tezos-128unique_glm_4_7_traces_jupiter_cleaned -laion/exp-uns-tezos-160x_glm_4_7_traces_jupiter_cleaned -laion/exp_tas_baseline_traces -laion/exp_tas_frequency_penalty_0_25_traces -laion/exp_tas_frequency_penalty_0_5_traces -laion/exp_tas_frequency_penalty_1_0_traces -laion/exp_tas_high_diversity_traces -laion/exp_tas_linear_history_off_traces -laion/exp_tas_max_tokens_2048_traces -laion/exp_tas_min_p_0_01_traces -laion/exp_tas_min_p_0_05_traces -laion/exp_tas_repetition_penalty_1_2_traces -laion/exp_tas_summarize_off_traces -laion/exp_tas_summarize_threshold_2048_traces -laion/exp_tas_temp_0_5_traces 
-laion/exp_tas_top_k_128_traces -laion/exp_tas_top_k_16_traces -laion/exp_tas_top_p_0_8_traces -laion/exp_tas_top_p_0_95_traces -laion/exp_tas_top_p_0_9_traces -laion/glm-4_6-dclm-baseline-terminal-traces-32ep-131k -laion/glm-4_6-freelancer-32ep-131k-torch -laion/glm-4_6-r2egym-32ep-32k -laion/glm-4_6-stackexchange-tezos-32ep-131k -laion/glm46-Magicoder-Evol-Instruct-110K-sandboxes-1-traces -laion/glm46-defects4j-32ep-131k -laion/glm46-glaive-code-assistant-sandboxes-maxeps-131k -laion/glm46-neulab-agenttuning-alfworld-sandboxes-maxeps-131k -laion/glm46-stackexchange-tezos-maxeps-131k -laion/gpt-oss-120B-stack-overflow-32ep-131k-summtrc -laion/gpt-oss-120B-stack-overflow-32ep-131k-summtrc-fixthink1 -laion/kimi-k2-r2egym_sandboxes-maxeps-32k -laion/kimi-k2t-neulab-synatra-32ep-131k -laion/minimax-m2-stack-overflow-32ep-131k-summtrc -laion/nl2bash-bugsseq_Qwen3-8B -laion/nl2bash-verified-GLM-4.6-traces-32ep-32k-mgn1e5_Qwen3-8B -laion/nl2bash-verified-GLM-4.6-traces-32ep-32k-mgn5e4_Qwen3-8B -laion/open-thoughts-4-code-qwen3-32b-annotated -laion/qwen3-coder-480B-stack-overflow-32ep-131k-summtrc -laion/r2egym-bugsseq -laion/r2egym-gpt5-codex-160ep-1M -laion/r2egym-nl2bash-bugsseq -laion/r2egym-nl2bash-stack-bugsseq-fixthink-exercism-python -laion/r2egym-nl2bash-stack-bugsseq-fixthink-stack-pytest-large -laion/rl_tp4s64_8x_exercism-python -laion/rl_tp4s64_8x_nemotron-junit -laion/sft_GLM-4-7-swesmith-sandboxes-with_tests-oracle_verified_120s-maxeps-131k_Qwen3-32B -laion/stackexchange-tezos-sandboxes_glm_4_6_traces_locetash -laion/stackexchange-tezos-sandboxes_glm_4_6_traces_locetash_again -laion/stackexchange-tezos-sandboxes_glm_4_6_traces_together_again -laion/stackexchange-tezos-sandboxes_glm_4_7_traces_locetash -mlfoundations-dev/all-puzzles-sandboxes-traces-terminus-2 -mlfoundations-dev/code-contests-sandboxes-traces-terminus-2_global-batch-size_16 -mlfoundations-dev/code-contests-sandboxes-traces-terminus-2_seed_44 
-mlfoundations-dev/code-contests-sandboxes-traces-terminus-2_seed_45 -mlfoundations-dev/codeforces-sandboxes-traces-terminus-2 -mlfoundations-dev/freelancer-projects-sandboxes-traces-terminus-2 -mlfoundations-dev/inferredbugs-sandboxes-traces-terminus-2 -mlfoundations-dev/nemo-prism-math-sandboxes-traces-terminus-2 -mlfoundations-dev/qasper-sandboxes-traces-terminus-2 -mlfoundations-dev/stackexchange-overflow-sandboxes-traces-terminus-2 -mlfoundations-dev/stackexchange-tezos-sandboxes-traces-terminus-2 -mlfoundations-dev/staqc-sandboxes-traces-terminus-2 -mlfoundations-dev/swesmith_with_plain_docker-sandboxes-traces-terminus-2 -mlfoundations-dev/taskmaster2-sandboxes-traces-terminus-2 -penfever/GLM-4_6-codeforces-32ep-32k-restore-hp -penfever/nl2bash-0-1k-traces-restore-hp -penfever/nl2bash-0-3k-traces-restore-hp -penfever/nl2bash-1k-traces-restore-hp -penfever/nl2bash-2ep-restore-hp -penfever/nl2bash-32ep-restore-hp -penfever/nl2bash-3k-traces-restore-hp -penfever/nl2bash-4ep-restore-hp -penfever/nl2bash-8ep-restore-hp -penfever/nl2bash-GLM-4_6-traces-newhparams -penfever/nl2bash-verified-GLM-4_6-traces-32ep-32k-dft -penfever/nl2bash-verified-GLM-4_6-traces-32ep-32k-restore-hp diff --git a/eval/lists/lists/exp_tas_qwen35.txt b/eval/lists/lists/exp_tas_qwen35.txt deleted file mode 100644 index 3ddf9ad1..00000000 --- a/eval/lists/lists/exp_tas_qwen35.txt +++ /dev/null @@ -1 +0,0 @@ -laion/exp_tas_optimal_combined_traces-Qwen3.5-9B diff --git a/eval/lists/lists/glm46_131k.txt b/eval/lists/lists/glm46_131k.txt deleted file mode 100644 index 8f37fa74..00000000 --- a/eval/lists/lists/glm46_131k.txt +++ /dev/null @@ -1 +0,0 @@ -laion/glm46-swesmith-maxeps-131k-lc diff --git a/eval/lists/lists/glm47_flash.txt b/eval/lists/lists/glm47_flash.txt deleted file mode 100644 index eae1439b..00000000 --- a/eval/lists/lists/glm47_flash.txt +++ /dev/null @@ -1 +0,0 @@ -zai-org/GLM-4.7-Flash diff --git a/eval/lists/lists/inactive_models_latest.txt 
b/eval/lists/lists/inactive_models_latest.txt deleted file mode 100644 index 05e4667d..00000000 --- a/eval/lists/lists/inactive_models_latest.txt +++ /dev/null @@ -1,457 +0,0 @@ -allenai/SERA-14B -allenai/SERA-32B -allenai/SERA-8B -bespokelabs/Qwen3-8B-ot_step100 -bespokelabs/Qwen3-8B-ot_step60_high -camel-ai/seta-rl-qwen3-8b -claude-haiku-4-5-20251001 -DCAgent/All_Puzzles_5k_new_context -DCAgent/All_Puzzles_5k_New_Context_GPT4o-mini_new_context -DCAgent/All_Puzzles_5k_OG_5k_New_Context_GPT4o-mini_new_context -DCAgent/all-puzzles-sandboxes-traces-terminus-2-with-gpt-4o-mini-judgments-correct -DCAgent/bash_textbook_tasks_traces -DCAgent/code_contests_10k_OG_10k_New_Questions_GPT5-mini_Qwen3-8B -DCAgent/code_contests_new_questions_gpt-5-mini_Qwen3-8B -DCAgent/code_contests-GLM-4.6-traces_Qwen3-8B -DCAgent/code_contests-Qwen3-Coder-480B-traces -DCAgent/code-contests-sandboxes-traces-terminus-2_new_hparams_11_10_25_Qwen3-8B -DCAgent/codeforces-gptoss120b-traces -DCAgent/exp_tas_max_tokens_1024_traces -DCAgent/freelancer-askllm-filtered-sandboxes-traces-terminus-2_Qwen3-8B -DCAgent/freelancer-embedding-mean-instruction-filter_Qwen3-8B -DCAgent/freelancer-long-instruction-filter_Qwen3-8B -DCAgent/freelancer-projects-0-1k-traces -DCAgent/freelancer-projects-0-3k-traces -DCAgent/freelancer-projects-100k-traces_Qwen3-8B -DCAgent/freelancer-projects-10k-traces -DCAgent/freelancer-projects-1k-traces -DCAgent/freelancer-projects-3k-traces -DCAgent/freelancer-projects-gpt5_Qwen3-8B -DCAgent/freelancer-projects-gpt5mini -DCAgent/freelancer-random-instruction-filter-traces-terminus-2_Qwen3-8B -DCAgent/freelancer-short-instruction-filter_Qwen3-8B -DCAgent/freelancer-t1024s-32ep_Qwen3-8B -DCAgent/freelancer-t2048s-32ep_Qwen3-8B -DCAgent/freelancer-t256s-32ep_Qwen3-8B -DCAgent/freelancer-t512s-32ep_Qwen3-8B -DCAgent/hr1_code-contests-sandboxes-with-tests-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step_60 -DCAgent/hr1_wfc_nl2bash-bs_Q3-8B-mE32-aT-dS-120325hbr_step_20 
-DCAgent/hr1_wfc_nl2bash-bs_Q3-8B-mE32-aT-dS-120325hbr_step_40 -DCAgent/hr1_wfc_nl2bash-bs_Q3-8B-mE32-aT-dS-120325hbr_step_64 -DCAgent/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112725harbor_step20 -DCAgent/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112725harbor_step40 -DCAgent/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112725harbor_step64 -DCAgent/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step20 -DCAgent/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step40 -DCAgent/neulab-codeactinstruct-sandboxes-traces-terminus-2_Qwen3-8B -DCAgent/neulab-mind2web-sandboxes-traces-terminus-2_Qwen3-8B -DCAgent/neulab-synatra-sandboxes-traces-terminus-2_Qwen3-8B -DCAgent/nl2bash-GLM-4.6-traces_Qwen3-8B -DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step25 -DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step50 -DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step73 -DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step_73 -DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step20 -DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step40 -DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120225harbor_step_25 -DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120225harbor_step_50 -DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120225harbor_step_73 -DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120325harbor_step_25 -DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120325harbor_step_50 -DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120325harbor_step_73 -DCAgent/nl2bash-Qwen3-Coder-480B-traces_Qwen3-8B -DCAgent/nl2bashG5CP-nl2bash-bs_Q3-8B-mE32-aT-dS-120325hbr_step_20 
-DCAgent/nl2bashG5CP-nl2bash-bs_Q3-8B-mE32-aT-dS-120325hbr_step_34 -DCAgent/nl2bashGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step20 -DCAgent/nl2bashGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step34 -DCAgent/nl2bashGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step20 -DCAgent/r2egymG5CP-nl2bash-bs_Q3-8B-mE24-aT-dS-120325hbr_step_25 -DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step_55 -DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step_80 -DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step20 -DCAgent/staqc-ot3-100k-code-subset-traces-terminus-2_Qwen3-8B -DCAgent/staqc-ot3-100k-math-subset-traces-terminus-2_save-strategy_steps_Qwen3-8B -DCAgent/staqc-ot3-100k-science-subset-traces-terminus-2_Qwen3-8B -DCAgent/staqc-ot3-100k-traces-terminus-2_Qwen3-8B -DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-1-7B -DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-14B -DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-32B -DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-4B -DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-4B-Instruct-2507 -DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-4B-Thinking-2507 -DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-8B-Base -DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-Coder-30B-A3B-Instruct -DCAgent/taskmaster2-0-1k-traces -DCAgent/taskmaster2-0-3k-traces -DCAgent/taskmaster2-10k-traces -DCAgent/taskmaster2-16ep -DCAgent/taskmaster2-1ep -DCAgent/taskmaster2-1k-traces -DCAgent/taskmaster2-2ep -DCAgent/taskmaster2-32ep -DCAgent/taskmaster2-3k-traces -DCAgent/taskmaster2-4ep -DCAgent/taskmaster2-64ep -DCAgent/taskmaster2-8ep -DCAgent/taskmaster2-banana -DCAgent/taskmaster2-gpt5mini -DCAgent/taskmaster2-gpt5mini_global-batch-size_16 -DCAgent/tbench_oracle_solutions_terminus -DCAgent/test_sft_qwen3_32k_base_NO_EVAL_Qwen3-8B 
-DCAgent/wikitable_format_conversion-qwen3-coder-480b-a35b-instruct-awq-traces -DCAgent2/bugs-nl2bashseq -DCAgent2/bugs-stack-nl2bashseq -DCAgent2/bugs-swesmith-over5050 -DCAgent2/codeforces-GLM-4_6-traces-32ep-32k-1-2-4-dv -DCAgent2/freelancer-projects-100k-traces -DCAgent2/freelancer-projects-31k-traces -DCAgent2/freelancer-t256s-32ep_hub-model-id_freelancer-t256s-32ep-restore-hp_Qwen3-8B -DCAgent2/gemini25flash-stackexchange-overflow-32ep-512k-v3-traces -DCAgent2/GLM-4_6-codeforces-32eps-32k-fixeps -DCAgent2/glm-4_6-freelancer-traces -DCAgent2/glm-4_6-freelancer-traces-pm -DCAgent2/GLM-4_6-nl2bash-verified-32eps-32k-fixeps -DCAgent2/GLM-4_6-stackexchange-overflow-sandboxes-32eps-32k -DCAgent2/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112725harbor -DCAgent2/inferredbugs-GLM-4_6-32ep-32k -DCAgent2/inferredbugs-GLM-4_6-32ep-65k -DCAgent2/neulab-code-feedback-sandboxes-traces-terminus-2_hub-model-id_neulab-code-feedback-restore-hp_Qw -DCAgent2/neulab-mind2web-sandboxes-traces-terminus-2_hub-model-id_neulab-mind2web-restore-hp_Qwen3-8B -DCAgent2/neulab-synatra-sandboxes-traces-terminus-2_hub-model-id_neulab-synatra-restore-hp_Qwen3-8B -DCAgent2/nl2bash-bugs-over5050 -DCAgent2/nl2bash-bugs-undr3070 -DCAgent2/nl2bash-bugsseq -DCAgent2/nl2bash-bugsshuffle -DCAgent2/nl2bash-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor -DCAgent2/nl2bash-stack-bugs-over333 -DCAgent2/nl2bash-stack-bugs-undr203050 -DCAgent2/nl2bash-stack-bugs-undr503020 -DCAgent2/nl2bash-stack-over5050 -DCAgent2/nl2bash-stack-undr3070 -DCAgent2/nl2bash-stack-undr7030 -DCAgent2/nl2bash-stackseq -DCAgent2/nl2bash-stackshuffle -DCAgent2/nl2bash-swesmith-reason -DCAgent2/nl2bash-swesmithseq -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-5epochs -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-5epochs-lr1e-3 -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-5epochs-lr1e-4 -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-5epochs-lr1e-5 
-DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-5epochs-lr1e-6 -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-7epochs -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-8epochs -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-ab2095 -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-ab2098 -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-ab2099 -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-ab20998 -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-mgn5e2 -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wd0 -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wd002 -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wd004 -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wd006 -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wd008 -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wr0 -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wr0005 -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wr0010 -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wr0150 -DCAgent2/nl2bash-verified-GLM-4.6-traces-32ep-32k_global-batch-size_128_Qwen3-8B -DCAgent2/nl2bash-verified-GLM-4.6-traces-32ep-32k_global-batch-size_32_Qwen3-8B -DCAgent2/nl2bash-verified-GLM-4.6-traces-32ep-32k_global-batch-size_64_Qwen3-8B -DCAgent2/nl2bash-verified-GLM-4.6-traces-32ep-32k-ab1085_Qwen3-8B -DCAgent2/nl2bash-verified-GLM-4.6-traces-32ep-32k-ab1089_Qwen3-8B -DCAgent2/nl2bash-verified-GLM-4.6-traces-32ep-32k-ab1093_Qwen3-8B -DCAgent2/nl2bash-verified-GLM-4.6-traces-32ep-32k-ab1097_Qwen3-8B -DCAgent2/nl2bashGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor -DCAgent2/stack-bugs-over5050 -DCAgent2/stack-bugs-undr3070 -DCAgent2/stack-bugs-undr7030 -DCAgent2/stack-bugsseq -DCAgent2/stack-bugsshuffle -DCAgent2/stack-nl2bashseq -DCAgent2/stack-swesmithseq -DCAgent2/swesmith-nl2bashseq -DCAgent2/swesmith-stack-undr7030 -DCAgent2/taskmaster2-1ep_hub-model-id_taskmaster2-1ep-2stage-restore-hp_freelancer-projects-sandboxes-tra 
-DCAgent2/taskmaster2-8ep_hub-model-id_taskmaster2-8ep-2stage-restore-hp_freelancer-projects-sandboxes-tra -DCAgent2/taskmaster2-GLM-4_6-32ep-32k -DCAgent2/test2-tbench-dev-71-qwen3-8b-8nodes-sync -deepseek-ai/DeepSeek-R1-Distill-Qwen-7B -gemini-2.5-flash -gpt-5-2025-08-07 -gpt-5-mini-2025-08-07 -gpt-5-nano-2025-08-07 -laion/bugs-nl2bashseq_Qwen3-8B -laion/claude-4-5-sonnet-thinking-stackexchange-overflow-32ep-32k-traces -laion/exp_tas_baseline_traces -laion/exp_tas_frequency_penalty_0_25_traces -laion/exp_tas_frequency_penalty_0_5_traces -laion/exp_tas_frequency_penalty_1_0_traces -laion/exp_tas_high_diversity_traces -laion/exp_tas_linear_history_off_traces -laion/exp_tas_low_diversity_traces -laion/exp_tas_max_tokens_2048_traces -laion/exp_tas_max_tokens_4096_traces -laion/exp_tas_min_p_0_05_traces -laion/exp_tas_optimal_combined_traces -laion/exp_tas_parser_xml_traces -laion/exp_tas_raw_content_off_traces -laion/exp_tas_repetition_penalty_1_2_traces -laion/exp_tas_temp_0_5_traces -laion/exp_tas_top_k_128_traces -laion/exp_tas_top_k_16_traces -laion/exp_tas_top_p_0_8_traces -laion/exp-gfi-swesmith-short-response-filtered-10K_glm_4_7_traces_jupiter -laion/exp-psu-stackoverflow-1K_glm_4_7_traces -laion/exp-psu-stackoverflow-316_glm_4_7_traces -laion/exp-psu-stackoverflow-31K_glm_4_7_traces -laion/exp-psu-swesmith-10K_glm_4_7_traces_jupiter -laion/exp-psu-swesmith-31K_glm_4_7_traces_jupiter -laion/exp-psu-swesmith-3K_glm_4_7_traces_jupiter -laion/exp-swd-r2egym-wo-docker_glm_4_7_traces -laion/exp-syh-tezos-askllm-hardened_glm_4_7_traces_jupiter_cleaned -laion/exp-syh-tezos-stackoverflow-mixed_glm_4_7_traces_jupiter_cleaned -laion/exp-uns-r2egym-33_6x_glm_4_7_traces_jupiter_cleaned -laion/exp-uns-r2egym-4_2x_glm_4_7_traces_jupiter -laion/exp-uns-r2egym-8_4x_glm_4_7_traces_jupiter -laion/exp-uns-tezos-128unique_glm_4_7_traces_jupiter_cleaned -laion/exp-uns-tezos-1unique_glm_4_7_traces_jupiter -laion/glm-4_6-all-puzzles-32ep-131k 
-laion/glm-4_6-dclm-baseline-terminal-traces-32ep-131k -laion/glm-4_6-freelancer-32ep-131k-torch -laion/GLM-4_6-freelancer-32eps-131k -laion/GLM-4_6-inferredbugs-32ep-65k-reasoning -laion/glm-4_6-nemo-prism -laion/GLM-4_6-nl2bash-verified-32ep-32k-reasoning -laion/glm-4_6-r2egym-32ep-32k -laion/GLM-4_6-selfinstruct-naive-2-32ep-32k -laion/glm-4_6-stack-overflow-32ep-131k-summtrc -laion/GLM-4_6-stackexchange-superuser-32ep-32k -laion/glm-4_6-staqc-32ep-131k -laion/GLM-4_6-swesmith-32ep-131k-nosumm -laion/GLM-4_6-swesmith-32ep-131k-nosumm-reasoning -laion/GLM-4_7-inferredbugs-sandboxes-maxeps-131k -laion/GLM-4_7-r2egym_sandboxes-maxeps-131k -laion/GLM-4_7-swesmith-sandboxes-with_tests-oracle_verified_120s-maxeps-131k -laion/GLM-4_7-swesmith-sandboxes-with_tests-oracle_verified_120s-maxeps-131k-fixthink -laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_learning-rate_1e-06_Qwen3-32B -laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_num-train-epochs_8-0_Qwen3-32B -laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_warmup-ratio_0-01_Qwen3-32B -laion/GLM-4.6-stackoverflow-32eps-65k-fixeps_Qwen3-8B -laion/glm46-defects4j-32ep-131k -laion/glm46-glaive-code-assistant-sandboxes-maxeps-131k -laion/glm46-Magicoder-Evol-Instruct-110K-sandboxes-1-traces -laion/glm46-neulab-agenttuning-alfworld-sandboxes-maxeps-131k -laion/glm46-neulab-synatra-32ep-131k -laion/glm46-qasper-maxeps-131k -laion/glm46-stackexchange-tezos-maxeps-131k -laion/glm46-swegym-tasks-maxeps-131k -laion/glm46-swesmith-maxeps-131k -laion/GPT-OSS-120B-codeforces-fixeps_Qwen3-8B -laion/gpt-oss-120B-stack-overflow-32ep-131k-summtrc -laion/gpt-oss-120B-stack-overflow-32ep-131k-summtrc-fixthink1 -laion/kimi-k2-r2egym_sandboxes-maxeps-32k -laion/kimi-k2-swegym-tasks-maxeps-32k -laion/kimi-k2t-freelancer-32ep-32k -laion/Kimi-K2T-neulab-agenttuning-kg-sandboxes-maxeps-32k -laion/Kimi-K2T-neulab-agenttuning-mind2web-sandboxes-maxeps-32k 
-laion/Kimi-K2T-neulab-agenttuning-webshop-sandboxes-maxeps-32k -laion/kimi-k2t-neulab-synatra-32ep-131k -laion/Kimi-K2T-swesmith-32ep-131k -laion/MiniMax-M2-freelancer-32ep-32k -laion/MiniMax-M2-freelancer-32ep-32k-reasoning -laion/minimax-m2-stack-overflow-32ep-131k-summtrc -laion/nl2bash-bugs-undr7030_Qwen3-8B -laion/nl2bash-bugsseq_Qwen3-8B -laion/nl2bash-verified-GLM-4.6-traces-32ep-32k-mgn1e3_Qwen3-8B -laion/nl2bash-verified-GLM-4.6-traces-32ep-32k-mgn1e5_Qwen3-8B -laion/nl2bash-verified-GLM-4.6-traces-32ep-32k-mgn5e4_Qwen3-8B -laion/open-thoughts-4-code-qwen3-32b-annotated -laion/Qwen3-8B_exp_tas_summarize_threshold_4096_traces_save-strategy_steps -laion/Qwen3-8B_exp_tas_temp_0.25_traces_save-strategy_steps -laion/Qwen3-8B_exp_tas_temp_0.5_traces_save-strategy_steps -laion/Qwen3-8B_exp_tas_tmux_large_traces_save-strategy_steps -laion/Qwen3-8B_exp_tas_top_k_32_traces_save-strategy_steps -laion/Qwen3-8B_exp_tas_trajectory_minimal_traces_save-strategy_steps -laion/Qwen3-8B_exp-swd-r2egym-standard_glm_4.7_traces_locetash_save-strategy_steps -laion/Qwen3-8B_exp-swd-swesmith-wo-docker_glm_4.7_traces_locetash_save-strategy_steps -laion/Qwen3-8B_perturbed-docker-exp-taskmaster2-tasks_glm_4.7_traces_locetash_save-strategy_steps -laion/Qwen3-Coder-480B-codeforces-fixeps_Qwen3-8B -laion/Qwen3-Coder-480B-nl2bash-fixeps_Qwen3-8B -laion/qwen3-coder-480B-stack-overflow-32ep-131k-summtrc -laion/qwen3base-GLM-4_7-swesmith-sandboxes-with_tests-oracle_verified_120s-maxeps-131k -laion/r2egym-bugsseq -laion/r2egym-gpt5-codex-160ep-1M -laion/r2egym-nl2bash-bugsseq -laion/r2egym-nl2bash-stack-bugsseq -laion/r2egym-nl2bash-stack-bugsseq_lr3e-5_exp_rpt_stack-php-v2_step20 -laion/r2egym-nl2bash-stack-bugsseq-bash-withtests -laion/r2egym-nl2bash-stack-bugsseq-cpp -laion/r2egym-nl2bash-stack-bugsseq-fixthink -laion/r2egym-nl2bash-stack-bugsseq-fixthink-again -laion/r2egym-nl2bash-stack-bugsseq-fixthink-exercism-python -laion/r2egym-nl2bash-stack-bugsseq-fixthink-methods2test-v2 
-laion/r2egym-nl2bash-stack-bugsseq-fixthink-stack-csharp -laion/r2egym-nl2bash-stack-bugsseq-junit -laion/r2egym-nl2bash-stack-bugsseq-pytest-v2 -laion/r2egym-nl2bash-stack-bugsseq-rl-stack-bash-withtests -laion/r2egym-nl2bash-stack-bugsseq-stack-php-v2 -laion/r2egym-nl2bash-stackseq -laion/r2egym-stack-bugsseq -laion/rl_bs128-gs16-rloo-n-code-contests-900s-noreg-15 -laion/rl_r2egym-nl2bash-stack-bugsseq_lr3e-5_stack-php-v2 -laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink_lr1e-5_taco -laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink_lr3e-5_bigcodebench-v2 -laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_curriculum-medium -laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_postmortem -laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_pr-postmortem -laion/rl_rl-conf_24GP_base_noth-yaml_mode-path_r2eg-nl2b-stac-bugs_trai-data_exp_rpt_e2eg-larg-60 -laion/rl_rl-conf_24GP_base_noth-yaml_mode-path_r2eg-nl2b-stac-bugs_trai-data_exp_rpt_pyme-larg-60 -laion/rl_rl-conf_24GP_base-yaml_mode-path_exp_tas_opti_comb_trac_trai-data_exp_rpt_unit-pyth-v3 -laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp_rpt_pyme-larg-90 -laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp_rpt_stac-dock-v2 -laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp_rpt_stac-self-larg-70 -laion/rl_rl-conf_qwen_8b_ll_lr1e-5_bs64-yaml_mode-path_r2eg-nl2b-stac-bugs_trai-data_exp_rpt_cros-type -laion/rl_think_npfg-code-contests-900s-45 -laion/rl_tp4s64_8x_2skill -laion/rl_tp4s64_8x_exercism-python -laion/rl_tp4s64_8x_flat25_baseline -laion/rl_tp4s64_8x_github_issue -laion/rl_tp4s64_8x_heavy_padding -laion/rl_tp4s64_8x_minimal_instructions -laion/rl_tp4s64_8x_nemotron-cpp -laion/rl_tp4s64_8x_nemotron-junit -laion/rl_tp4s64_8x_proportional -laion/rl_tp4s64_8x_structural_debug -laion/rl_v1_tp4s64_8x_nemotron-junit -laion/sft__Kimi-2-5-inferredbugs-sandboxes-maxeps-32k__Qwen3-8B 
-laion/sft_GLM-4-7-swesmith-sandboxes-with_tests-oracle_verified_120s-maxeps-131k_Qwen3-32B -laion/stackexchange-tezos-sandboxes_glm_4_6_traces_locetash -laion/stackexchange-tezos-sandboxes_glm_4_6_traces_locetash_again -laion/stackexchange-tezos-sandboxes_glm_4_6_traces_together -laion/stackexchange-tezos-sandboxes_glm_4_6_traces_together_again -mistralai/Devstral-Small-2507 -mlfoundations-dev/all-puzzles-sandboxes-traces-terminus-2 -mlfoundations-dev/code-contests-sandboxes-traces-terminus-2 -mlfoundations-dev/code-contests-sandboxes-traces-terminus-2_global-batch-size_16 -mlfoundations-dev/code-contests-sandboxes-traces-terminus-2_seed_43 -mlfoundations-dev/code-contests-sandboxes-traces-terminus-2_seed_44 -mlfoundations-dev/code-contests-sandboxes-traces-terminus-2_seed_45 -mlfoundations-dev/code-contests-sandboxes-traces-terminus-2_seed_46 -mlfoundations-dev/codeforces-sandboxes-traces-terminus-2 -mlfoundations-dev/defects4j-sandboxes-traces-terminus-2 -mlfoundations-dev/freelancer-projects-sandboxes-traces-terminus-2 -mlfoundations-dev/inferredbugs-sandboxes-traces-terminus-2 -mlfoundations-dev/nemo-prism-math-sandboxes-traces-terminus-2 -mlfoundations-dev/qasper-sandboxes-traces-terminus-2 -mlfoundations-dev/stackexchange-overflow-sandboxes-traces-terminus-2 -mlfoundations-dev/stackexchange-tezos-sandboxes-traces-terminus-2 -mlfoundations-dev/staqc-sandboxes-traces-terminus-2 -mlfoundations-dev/swesmith_with_plain_docker-sandboxes-traces-terminus-2 -mlfoundations-dev/taskmaster2-sandboxes-traces-terminus-2 -moonshotai/Kimi-Dev-72B -moonshotai/Kimi-K2.5 -NovaSky-AI/SA-SWE-32B -nvidia/AceReason-Nemotron-7B -nvidia/Llama-3.1-Nemotron-Nano-8B-v1 -nvidia/Nemotron-Terminal-14B -nvidia/Nemotron-Terminal-32B -nvidia/Nemotron-Terminal-8B -nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16 -o4-mini -obiwan96/qwen-2.5-7b-instruct-endless-terminals -obiwan96/qwen3-8b-openthinker-sft-endless-terminals -open-r1/OpenR1-Distill-7B -open-thoughts/OpenThinker-Agent-v1 
-open-thoughts/OpenThinker-Agent-v1-SFT -open-thoughts/OpenThinker3-7B -openai/gpt-5 -openai/gpt-5-mini -openai/gpt-5-nano -penfever/freelancer-t1024s-32ep-restore-hp -penfever/freelancer-t2048s-32ep-restore-hp -penfever/freelancer-t512s-32ep-restore-hp -penfever/GLM-4_6-codeforces-32ep-32k-restore-hp -penfever/GLM-4_6-gemini25flash-stackexchange-overflow-32ep-512k-fixeps -penfever/GLM-4_6-inferredbugs-32eps-65k-fixeps -penfever/kimi-k2-swesmith_with_plain_docker-sandboxes-maxeps-32k -penfever/neulab-codeactinstruct-restore-hp -penfever/nl2bash_gpt-5-nano-traces-8ep-restore-hp -penfever/nl2bash_verified_gpt-5-nano-traces-restore-hp -penfever/nl2bash-0-1k-traces-restore-hp -penfever/nl2bash-0-3k-traces-restore-hp -penfever/nl2bash-16ep-restore-hp -penfever/nl2bash-1ep-restore-hp -penfever/nl2bash-1k-traces-restore-hp -penfever/nl2bash-2ep-restore-hp -penfever/nl2bash-32ep-restore-hp -penfever/nl2bash-3k-traces-restore-hp -penfever/nl2bash-4ep-restore-hp -penfever/nl2bash-8ep-restore-hp -penfever/nl2bash-GLM-4_6-traces-newhparams -penfever/nl2bash-verified-GLM-4_6-traces-32ep-32k-dft -penfever/nl2bash-verified-GLM-4_6-traces-32ep-32k-restore-hp -penfever/rl_bs128_gs16_ruby-30 -penfever/rl_rl-conf_20GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp_rpt_code-v2-25 -penfever/rl_rl-conf_20GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp_rpt_stac-pyte-v2-25 -penfever/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp_rpt_soft-v2-45 -penfever/selfinstruct-naive-sandboxes-2-traces-restore-hp -penfever/swesmith-2stage-restore-hp -penfever/taskmaster2-4ep-2stage-restore-hp -Qwen/Qwen2.5-7B-Instruct -Qwen/Qwen2.5-Coder-32B-Instruct -Qwen/Qwen3-1.7B -Qwen/Qwen3-14B -Qwen/Qwen3-235B-A22B-Instruct-2507-tput -Qwen/Qwen3-32B -Qwen/Qwen3-4B -Qwen/Qwen3-4B-Instruct-2507 -Qwen/Qwen3-4B-Thinking-2507 -Qwen/Qwen3-8B -Qwen/Qwen3-8B-Base -Qwen/Qwen3-Coder-30B-A3B-Instruct -Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8 -Qwen/Qwen3.5-0.8B 
-Qwen/Qwen3.5-27B -Qwen/Qwen3.5-2B -Qwen/Qwen3.5-35B-A3B -Qwen/Qwen3.5-4B -Qwen/Qwen3.5-9B -R2E-Gym/R2EGym-32B-Agent -Skywork/Skywork-OR1-7B -Skywork/Skywork-SWE-32B -SWE-bench/SWE-agent-LM-32B -SWE-bench/SWE-agent-LM-7B -SWE-Swiss/SWE-Swiss-32B -zai-org/GLM-4.7 -DCAgent/hr1_code-contests-sandboxes-with-tests-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step_60 -DCAgent/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112725harbor_step20 -DCAgent/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112725harbor_step40 -DCAgent/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112725harbor_step64 -DCAgent/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step20 -DCAgent/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step40 -DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step25 -DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step50 -DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step73 -DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step20 -DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step40 -DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step_73 -DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120225harbor_step_25 -DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120225harbor_step_50 -DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120225harbor_step_73 -DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120325harbor_step_25 -DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120325harbor_step_50 -DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120325harbor_step_73 -DCAgent/nl2bashGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step20 
-DCAgent/nl2bashGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step34 -DCAgent/nl2bashGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step20 -DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step30 -DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step45 -DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step20 -DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step40 -DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step_55 -DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step_80 -DCAgent/tbench-dev-71-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step70 -DCAgent2/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112725harbor -DCAgent2/nl2bash-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor -DCAgent2/nl2bashGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor -DCAgent2/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor diff --git a/eval/lists/lists/kept_models_names.txt b/eval/lists/lists/kept_models_names.txt deleted file mode 100644 index f98dffb7..00000000 --- a/eval/lists/lists/kept_models_names.txt +++ /dev/null @@ -1,221 +0,0 @@ -/e/data1/datasets/playground/ot/hf_hub/models--laion--r2egym-nl2bash-stack-bugsseq-fixthink-again/snapshots/2f4f59f076583f8c084bbca8308d5f80bfc7def5 -/e/data1/datasets/playground/ot/hf_hub/models--laion--r2egym-nl2bash-stack-bugsseq-fixthink/snapshots/53ccb94616c4fb83ee5c138f334ed1b99c681272 -/leonardo_scratch/fast/AIFAC_5C0_290/dc-agent-shared/hf_hub/models--open-thoughts--OpenThinker-Agent-v1/snapshots/899181e51a920db4b7b580fc50ca1f6d99fbb0f5 -DCAgent/exp_rpt_crosscodeeval-csharp_20260219 -DCAgent/exp_tas_max_episodes_32_traces -DCAgent/exp_tas_max_episodes_512_traces -DCAgent/exp_tas_max_tokens_8192_traces -DCAgent/exp_tas_presence_penalty_0_25_traces 
-DCAgent/exp_tas_presence_penalty_1_0_traces -DCAgent/exp_tas_repetition_penalty_1_05_traces -DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step40 -DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step30 -DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step45 -DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step40 -DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step40 -DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step40 -DCAgent/tbench-dev-71-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step70 -DCAgent2/bs64_rloo_n_noct_stri_micr_auto_conv_pref_model_r2e-120 -DCAgent2/bugs-swesmith-reason -DCAgent2/bugs-swesmith-undr7030 -DCAgent2/nl2bash-stack-bugsseq -DCAgent2/nl2bash-stack-bugsshuffle -DCAgent2/nl2bash-swesmith-over5050 -DCAgent2/nl2bash-swesmith-undr7030 -DCAgent2/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor -DCAgent2/swesmith-bugsseq -DCAgent2/swesmith-stack-over5050 -DCAgent2/swesmith-stack-reason -DCAgent2/swesmith-stackseq -NovaSky-AI/SA-SWE-32B -Qwen/Qwen2.5-Coder-32B-Instruct -Qwen/Qwen3-1.7B -Qwen/Qwen3-14B -Qwen/Qwen3-235B-A22B-Instruct-2507-tput -Qwen/Qwen3-32B -Qwen/Qwen3-4B -Qwen/Qwen3-4B-Instruct-2507 -Qwen/Qwen3-4B-Thinking-2507 -Qwen/Qwen3-8B -Qwen/Qwen3-8B-Base -Qwen/Qwen3-Coder-30B-A3B-Instruct -Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8 -R2E-Gym/R2EGym-32B-Agent -SWE-Swiss/SWE-Swiss-32B -SWE-bench/SWE-agent-LM-32B -SWE-bench/SWE-agent-LM-7B -Skywork/Skywork-SWE-32B -allenai/SERA-32B -allenai/SERA-8B -claude-haiku-4-5-20251001 -gemini-2.5-flash -gpt-5-2025-08-07 -gpt-5-mini-2025-08-07 -gpt-5-nano-2025-08-07 -hosted_vllm/mlfoundations-dev/stackexchange-codereview-sandboxes-traces-terminus-2_overwrite-output-dir_True -laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_Qwen3-32B 
-laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_adam-beta1_0-91_Qwen3-32B -laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_adam-beta1_0-93_Qwen3-32B -laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_adam-beta1_0-95_Qwen3-32B -laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_global-batch-size_32_Qwen3-32B -laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_global-batch-size_64_Qwen3-32B -laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_lr_1e-5_Qwen3-32B -laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_num-train-epochs_4.0_Qwen3-32B -laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_num-train-epochs_6.0_Qwen3-32B -laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_num-train-epochs_7.0_Qwen3-32B -laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_warmup-ratio_0-05_Qwen3-32B -laion/GLM-4_6-stackexchange-overflow-sandboxes-32eps-65k-reasoning -laion/GLM-4_7-r2egym_sandboxes-maxeps-131k-lc -laion/GLM-4_7-stackexchange-tezos-sandboxes-maxeps-131k -laion/Kimi-2-5-r2egym_sandboxes-maxeps-32k__Qwen3-8B -laion/Kimi-K2T-ling-coder-sft-sandboxes-1-maxeps-32k -laion/Qwen3-32B-NL2Bash-31step -laion/Qwen3-32B-R2EGYM-256-3epochs -laion/Qwen3-32B-SweSmith-20step -laion/Qwen3-8B_exp_tas_temp_2.0_traces_save-strategy_steps -laion/alfworld-swesmith-r2egym-swegym-131k-lc -laion/bugs-r2egym-stackseq -laion/dev_set_part1_10k_glm_4_7_traces_jupiter -laion/dev_set_part1_10k_glm_4_7_traces_jupiter_cleaned -laion/dev_set_part1_10k_glm_4_7_traces_locetash -laion/exp-gfi-staqc-askllm-filtered-10K_glm_4_7_traces_jupiter -laion/exp-gfi-staqc-askllm-filtered-10K_glm_4_7_traces_jupiter_cleaned -laion/exp-gfi-staqc-embedding-mean-filtered-10K_glm_4_7_traces_jupiter -laion/exp-gfi-staqc-short-response-filtered-10K_glm_4_7_traces_locetash -laion/exp-gfi-swesmith-random-filtered-10K_glm_4_7_traces_jupiter 
-laion/exp-gfi-swesmith-random-filtered-10K_glm_4_7_traces_jupiter_cleaned -laion/exp-psu-stackoverflow-10K_glm_4_7_traces -laion/exp-psu-stackoverflow-3K_glm_4_7_traces -laion/exp-psu-swesmith-1K_glm_4-7_traces_jupiter__Qwen3-8B -laion/exp-psu-swesmith-316_glm_4-7_traces_jupiter__Qwen3-8B -laion/exp-syh-r2egym-askllm-constrained_glm_4_7_traces_jupiter -laion/exp-syh-r2egym-askllm-constrained_glm_4_7_traces_jupiter_cleaned -laion/exp-syh-r2egym-askllm-hardened_glm_4_7_traces_jupiter -laion/exp-syh-r2egym-swesmith-mixed_glm_4_7_traces_jupiter -laion/exp-syh-r2egym-swesmith-mixed_glm_4_7_traces_jupiter_cleaned -laion/exp-syh-r2egym-swesmith-mixed_glm_4_7_traces_locetash -laion/exp-syh-tezos-askllm-constrained_glm_4_7_traces_jupiter -laion/exp-syh-tezos-askllm-hardened_glm_4_7_traces_jupiter -laion/exp-syh-tezos-stackoverflow-mixed_glm_4_7_traces_jupiter -laion/exp-uns-r2egym-16_8x_glm_4_7_traces_jupiter -laion/exp-uns-r2egym-16_8x_glm_4_7_traces_jupiter_cleaned -laion/exp-uns-r2egym-2_1x_glm_4_7_traces_jupiter -laion/exp-uns-r2egym-2_1x_glm_4_7_traces_jupiter_cleaned -laion/exp-uns-r2egym-2_1x_glm_4_7_traces_locetash -laion/exp-uns-r2egym-33_6x_glm_4_7_traces_jupiter -laion/exp-uns-tezos-10x_glm_4_7_traces_jupiter -laion/exp-uns-tezos-10x_glm_4_7_traces_jupiter_cleaned -laion/exp-uns-tezos-128unique_glm_4_7_traces_jupiter -laion/exp-uns-tezos-160x_glm_4_7_traces_jupiter -laion/exp-uns-tezos-160x_glm_4_7_traces_jupiter_cleaned -laion/exp-uns-tezos-40x_glm_4_7_traces_jupiter -laion/exp-uns-tezos-80x_glm_4_7_traces_jupiter_cleaned -laion/exp_rpt_stack-bash-withtests_glm_4_7_traces_jupiter -laion/exp_rpt_stack-csharp_10k_glm_4-7_traces_jupiter__Qwen3-8B -laion/exp_rpt_stack-rust_10k_glm_4_7_traces_jupiter -laion/exp_tas_full_thinking_traces -laion/exp_tas_interleaved_thinking_on_traces -laion/exp_tas_min_p_0_01_traces -laion/exp_tas_min_p_0_1_traces -laion/exp_tas_optimal_combined_traces -laion/exp_tas_summarize_off_traces -laion/exp_tas_summarize_threshold_16384_traces 
-laion/exp_tas_summarize_threshold_2048_traces -laion/exp_tas_timeout_multiplier_0_25_traces -laion/exp_tas_timeout_multiplier_1_0_traces -laion/exp_tas_timeout_multiplier_4_0_traces -laion/exp_tas_timeout_multiplier_8_0_traces -laion/exp_tas_top_k_64_traces -laion/exp_tas_top_p_0_95_traces -laion/exp_tas_top_p_0_9_traces -laion/glm-4_6-stackexchange-tezos-32ep-131k -laion/glm46-Toolscale-tasks-traces -laion/glm46-r2egym_sandboxes-maxeps-131k -laion/glm46-swegym-tasks-maxeps-131k-lc -laion/glm46-swesmith-maxeps-131k-fixthink -laion/glm46-swesmith-maxeps-131k-lc -laion/nl2bash-swesmith-stack-bugsseq -laion/perturbed-docker-exp-freelancer-tasks_glm_4_7_traces -laion/r2egym-nl2bash-stack-bugsseq -laion/r2egym-nl2bash-stack-bugsseq-crosscodeeval-python-v2 -laion/r2egym-nl2bash-stack-bugsseq-fixthink-again -laion/r2egym-nl2bash-stack-bugsseq-fixthink-stack-pytest-large -laion/r2egym-nl2bash-stack-bugsseq-rl-crosscodeeval-csharp -laion/r2egym-nl2bashseq -laion/rl__24GPU_base__exp_rpt_codeelo-v2__Qwen3-8B -laion/rl__24GPU_base__exp_rpt_curriculum-hard__r2egym-nl2bash-stack -laion/rl__24GPU_base__exp_rpt_issue__Qwen3-8B-60 -laion/rl__24GPU_base__exp_rpt_multifile__Qwen3-8B-60 -laion/rl__24GPU_base__exp_rpt_scaffold__Qwen3-8B-60 -laion/rl__24GPU_base__exp_rpt_stack-bash__Qwen3-8B-55 -laion/rl__24GPU_base__mix_h2_language_balanced__r2egym-nl2bash-stack -laion/rl__24GPU_base__swe_rebench_patched_oracle__r2egym-nl2bash-stack -laion/rl__40GPU_base_32b__exp_rpt_codeelo-v2__sft_GLM-4-7-swesmith -laion/rl__40GPU_base_32b__exp_rpt_nemotron-bash__sft_GLM-4-7-swesmith -laion/rl_base-code-contests-900s-160 -laion/rl_base-code-contests-900s-reg-140 -laion/rl_base-code-contests-900s-reg-lr1e-5-140 -laion/rl_base-exp_rpt_stack_bash-90 -laion/rl_base-exp_rpt_stack_bash_with_gpt5-90 -laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_curator-hard -laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_curriculum-medium 
-laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_pr -laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_pr-postmortem -laion/rl_rl-conf_24GP_base-yaml_mode-path_exp_tas_opti_comb_trac_trai-data_exp_rpt_defe-v3 -laion/rl_rl-conf_24GP_base-yaml_mode-path_exp_tas_opti_comb_trac_trai-data_exp_rpt_pyme-v3-40 -laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt-agai_trai-data_exp_rpt_nemo-bash -laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt-agai_trai-data_exp_rpt_soft-larg -laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt-agai_trai-data_exp_rpt_stac-pyte-synt- -laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt-agai_trai-data_exp_rpt_stac-self-gpt5 -laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp_rpt_stac-php-larg-75 -laion/rl_rl-conf_24GP_base_noth-yaml_mode-path_r2eg-nl2b-stac-bugs_trai-data_exp_rpt_meth-larg-60 -laion/rl_rl-conf_24GP_base_noth-yaml_mode-path_r2eg-nl2b-stac-bugs_trai-data_exp_rpt_stac-self-larg-60 -laion/rl_rl-conf_qwen_8b_ll_lr1e-5_bs64-yaml_mode-path_r2eg-nl2b-stac-bugs_trai-data_exp_rpt_cros-java -laion/rl_rl-conf_qwen_8b_ll_lr1e-5_bs64-yaml_mode-path_r2eg-nl2b-stac-bugs_trai-data_exp_rpt_stac-bash-wi -laion/rl_rl-conf_qwen_8b_ll_lr1e-5_bs64_8x_tp4_seqs-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp -laion/rl_rl-config_24GPU_base-yaml_model-path_Qwen3-8B_train-data_exp_rpt_codeelo-v2 -laion/rl_rl-config_24GPU_base-yaml_model-path_Qwen3-8B_train-data_exp_rpt_nemotron-bash-60 -laion/rl_rl-config_24GPU_base-yaml_model-path_Qwen3-8B_train-data_exp_rpt_pymethods2test-large-50 -laion/rl_swesmith-fixthink-pymethods2test-45 -laion/rl_tp4s64_8x_curated -laion/rl_tp4s64_8x_detailed -laion/rl_tp4s64_8x_error_report -laion/rl_tp4s64_8x_expert -laion/rl_tp4s64_8x_moderate_padding -laion/rl_tp4s64_8x_partial_ambiguity -laion/rl_tp4s64_8x_stack-jest-v2 -laion/rl_tp4s64_8x_stack-selfdoc-v2 -laion/rl_v1_tp4s64_8x_exercism-python 
-laion/rl_v1_tp4s64_8x_stack-jest-large -laion/rl_v1_tp4s64_8x_stack-pytest-large -laion/rl_v1_tp4s64_8x_structural_debug -laion/rl_v3_tp4s64_8x_exercism-python -laion/sft_GLM-4-7-swesmith-sandboxes-with_tests-oracle_verified_120s-maxeps-131k_Qwen3-32B -laion/sft__Kimi-2-5-inferredbugs-sandboxes-maxeps-32k__Qwen3-8B -laion/sft_r2egym-nl2bash-stackoverflow-inferredbugs-32B_Qwen3-32B -laion/stackexchange-tezos-sandboxes_glm_4_7_traces_locetash -laion/swesmith-nl2bash-stack-bugsseq -laion/swesmith-sandboxes-with_tests-gpt-5-mini-passed_glm_4_7_traces -laion/syh-r2eg-askl-glm_4-7_trac_jupi_-gfi-swes-rand-filt-10K_glm_4-7_trac_jupi_32B -mistralai/Devstral-Small-2507 -moonshotai/Kimi-Dev-72B -moonshotai/Kimi-K2.5 -nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16 -o4-mini -openai/gpt-5 -openai/gpt-5-mini -openai/gpt-5-nano -penfever/GLM-4_6-taskmaster2-32eps-32k-fixeps -penfever/bs64_rloo_n_noct_stri_micr_auto_tis_model_r2e-100 -penfever/bs64_rloo_n_noct_stri_micr_model_noconv_r2eg_nl2_140 -penfever/bs64_rloo_n_noct_stri_micr_model_r2eg_nl2_160 -penfever/rl__24GPU_base__exp_rpt_curriculum-hard__r2egym-nl2bash-stack-15 -penfever/rl__24GPU_base__exp_rpt_pymethods2test-large__Qwen3-8B-Base-65 -penfever/rl__24GPU_base__exp_rpt_pymethods2test-large__qwen3base-GLM-4_7-sw-70 -penfever/rl_rl-conf_24GP_base_noth-yaml_mode-path_r2eg-nl2b-stac-bugs_trai-data_exp_rpt_stac-bash-110 -r2egym-nl2bash-stack-bugsseq-bash-withtests -r2egym-nl2bash-stack-bugsseq-cpp -r2egym-nl2bash-stack-bugsseq-junit -r2egym-nl2bash-stack-bugsseq-pytest-v2 -rl__24GPU_base__swe_rebench_patched_oracle__r2egym-nl2bash-stack -zai-org/GLM-4.7 diff --git a/eval/lists/lists/laion_latest.txt b/eval/lists/lists/laion_latest.txt deleted file mode 100644 index 864a34e1..00000000 --- a/eval/lists/lists/laion_latest.txt +++ /dev/null @@ -1,28 +0,0 @@ -laion/rl__24GPU_base__llm-verifier-freelancer__r2egym-nl2bash-stack -laion/rl__24GPU_shaped__swe_rebench_patched_oracle__r2egym-nl2bash-stack -laion/SweSmith-8B-SFT-Rope-step62 
-laion/SweSmith-8B-SFT-NoRope-step58 -laion/rl__24GPU_base__mix_h2_language_proportional__r2egym-nl2bash-stack -laion/rl__24GPU_base__swe_rebench_patched_oracle__r2egym-nl2bash-stack -laion/exp-psu-swesmith-31K_glm_4_7_traces_jupiter -laion/exp-psu-swesmith-3K_glm_4_7_traces_jupiter -penfever/rl__24GPU_base__exp_rpt_curriculum-hard__r2egym-nl2bash-stack-15 -penfever/rl__24GPU_base__exp_rpt_pymethods2test-large__Qwen3-8B-Base-65 -penfever/rl__24GPU_base__exp_rpt_pymethods2test-large__qwen3base-GLM-4_7-sw-70 -laion/rl_v3_tp4s64_8x_exercism-python -laion/rl_rl-config_24GPU_base-yaml_model-path_Qwen3-8B_train-data_exp_rpt_nemotron-bash-60 -laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp_rpt_stac-dock-v2 -laion/rl_rl-conf_24GP_base-yaml_mode-path_exp_tas_opti_comb_trac_trai-data_exp_rpt_unit-pyth-v3 -laion/rl_v1_tp4s64_8x_exercism-python -laion/rl_v1_tp4s64_8x_stack-pytest-large -laion/rl_v1_tp4s64_8x_structural_debug -laion/rl_v1_tp4s64_8x_stack-jest-large -laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_postmortem -laion/rl_v1_tp4s64_8x_nemotron-junit -laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink_lr3e-5_bigcodebench-v2 -laion/rl_r2egym-nl2bash-stack-bugsseq_lr3e-5_stack-php-v2 -laion/r2egym-nl2bash-stack-bugsseq-fixthink-exercism-python -laion/exp-uns-r2egym-33_6x_glm_4_7_traces_jupiter_cleaned -laion/exp-syh-tezos-stackoverflow-mixed_glm_4_7_traces_jupiter_cleaned -laion/exp-syh-tezos-askllm-hardened_glm_4_7_traces_jupiter_cleaned -laion/exp-uns-tezos-128unique_glm_4_7_traces_jupiter_cleaned diff --git a/eval/lists/lists/latest_sort_by_release_eval_prio.txt b/eval/lists/lists/latest_sort_by_release_eval_prio.txt deleted file mode 100644 index b6f1047e..00000000 --- a/eval/lists/lists/latest_sort_by_release_eval_prio.txt +++ /dev/null @@ -1,28 +0,0 @@ -laion/rl__24GPU_base__llm-verifier-freelancer__r2egym-nl2bash-stack -laion/rl__24GPU_shaped__swe_rebench_patched_oracle__r2egym-nl2bash-stack 
-laion/SweSmith-8B-SFT-Rope-step62 -laion/SweSmith-8B-SFT-NoRope-step58 -laion/rl__24GPU_base__mix_h2_language_proportional__r2egym-nl2bash-stack -rl__24GPU_base__swe_rebench_patched_oracle__r2egym-nl2bash-stack -laion/exp-psu-swesmith-31K_glm_4_7_traces_jupiter -laion/exp-psu-swesmith-3K_glm_4_7_traces_jupiter -penfever/rl__24GPU_base__exp_rpt_curriculum-hard__r2egym-nl2bash-stack-15 -penfever/rl__24GPU_base__exp_rpt_pymethods2test-large__Qwen3-8B-Base-65 -penfever/rl__24GPU_base__exp_rpt_pymethods2test-large__qwen3base-GLM-4_7-sw-70 -laion/rl_v3_tp4s64_8x_exercism-python -laion/rl_rl-config_24GPU_base-yaml_model-path_Qwen3-8B_train-data_exp_rpt_nemotron-bash-60 -laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp_rpt_stac-dock-v2 -laion/rl_rl-conf_24GP_base-yaml_mode-path_exp_tas_opti_comb_trac_trai-data_exp_rpt_unit-pyth-v3 -laion/rl_v1_tp4s64_8x_exercism-python -laion/rl_v1_tp4s64_8x_stack-pytest-large -laion/rl_v1_tp4s64_8x_structural_debug -laion/rl_v1_tp4s64_8x_stack-jest-large -laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_postmortem -laion/rl_v1_tp4s64_8x_nemotron-junit -laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink_lr3e-5_bigcodebench-v2 -laion/rl_r2egym-nl2bash-stack-bugsseq_lr3e-5_stack-php-v2 -laion/r2egym-nl2bash-stack-bugsseq-fixthink-exercism-python -laion/exp-uns-r2egym-33_6x_glm_4_7_traces_jupiter_cleaned -laion/exp-syh-tezos-stackoverflow-mixed_glm_4_7_traces_jupiter_cleaned -laion/exp-syh-tezos-askllm-hardened_glm_4_7_traces_jupiter_cleaned -laion/exp-uns-tezos-128unique_glm_4_7_traces_jupiter_cleaned diff --git a/eval/lists/lists/missing_dev_set_v2.txt b/eval/lists/lists/missing_dev_set_v2.txt deleted file mode 100644 index 7d0b84ba..00000000 --- a/eval/lists/lists/missing_dev_set_v2.txt +++ /dev/null @@ -1,189 +0,0 @@ -laion/r2egym-nl2bash-stack-bugsseq-fixthink-again -laion/r2egym-nl2bash-stack-bugsseq-fixthink -open-thoughts/OpenThinker-Agent-v1 
-DCAgent/All_Puzzles_5k_New_Context_GPT4o-mini_new_context -DCAgent/All_Puzzles_5k_new_context -DCAgent/all-puzzles-sandboxes-traces-terminus-2-with-gpt-4o-mini-judgments-correct -DCAgent/code-contests-sandboxes-traces-terminus-2_new_hparams_11_10_25_Qwen3-8B -DCAgent/code_contests-GLM-4.6-traces_Qwen3-8B -DCAgent/code_contests-Qwen3-Coder-480B-traces -DCAgent/code_contests_10k_OG_10k_New_Questions_GPT5-mini_Qwen3-8B -DCAgent/code_contests_new_questions_gpt-5-mini_Qwen3-8B -DCAgent/exp_tas_max_episodes_512_traces -DCAgent/exp_tas_max_tokens_1024_traces -DCAgent/exp_tas_max_tokens_8192_traces -DCAgent/exp_tas_presence_penalty_1_0_traces -DCAgent/exp_tas_repetition_penalty_1_05_traces -DCAgent/freelancer-embedding-mean-instruction-filter_Qwen3-8B -DCAgent/freelancer-long-instruction-filter_Qwen3-8B -DCAgent/freelancer-projects-0-1k-traces -DCAgent/freelancer-projects-0-3k-traces -DCAgent/freelancer-projects-10k-traces -DCAgent/freelancer-projects-1k-traces -DCAgent/freelancer-projects-gpt5_Qwen3-8B -DCAgent/freelancer-random-instruction-filter-traces-terminus-2_Qwen3-8B -DCAgent/freelancer-short-instruction-filter_Qwen3-8B -DCAgent/neulab-codeactinstruct-sandboxes-traces-terminus-2_Qwen3-8B -DCAgent/nl2bash-GLM-4.6-traces_Qwen3-8B -DCAgent/staqc-ot3-100k-science-subset-traces-terminus-2_Qwen3-8B -DCAgent/staqc-ot3-100k-traces-terminus-2_Qwen3-8B -DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-1-7B -DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-32B -DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-8B-Base -DCAgent/taskmaster2-0-1k-traces -DCAgent/taskmaster2-0-3k-traces -DCAgent/taskmaster2-1k-traces -DCAgent/taskmaster2-banana -DCAgent2/bugs-swesmith-reason -DCAgent2/freelancer-t256s-32ep_hub-model-id_freelancer-t256s-32ep-restore-hp_Qwen3-8B -DCAgent2/glm-4_6-freelancer-traces -DCAgent2/inferredbugs-GLM-4_6-32ep-65k -DCAgent2/nl2bash-bugs-over5050 -DCAgent2/nl2bash-bugsshuffle -DCAgent2/nl2bash-stack-bugs-undr203050 -DCAgent2/nl2bash-stack-undr3070 
-DCAgent2/nl2bash-stack-undr7030 -DCAgent2/nl2bash-stackshuffle -DCAgent2/nl2bash-verified-GLM-4.6-traces-32ep-32k-ab1093_Qwen3-8B -DCAgent2/nl2bash-verified-GLM-4.6-traces-32ep-32k_global-batch-size_32_Qwen3-8B -DCAgent2/nl2bash-verified-GLM-4.6-traces-32ep-32k_global-batch-size_64_Qwen3-8B -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-5epochs -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-5epochs-lr1e-3 -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-5epochs-lr1e-5 -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-ab2098 -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-ab2099 -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-mgn5e2 -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wd002 -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wd004 -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wd008 -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wr0 -DCAgent2/stack-bugs-undr7030 -DCAgent2/swesmith-stack-undr7030 -Qwen/Qwen2.5-Coder-32B-Instruct -Qwen/Qwen3-4B-Thinking-2507 -Qwen/Qwen3-8B-Base -SWE-bench/SWE-agent-LM-32B -mlfoundations-dev/stackexchange-codereview-sandboxes-traces-terminus-2_overwrite-output-dir_True -laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_num-train-epochs_8-0_Qwen3-32B -laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_warmup-ratio_0-01_Qwen3-32B -laion/GLM-4.6-stackoverflow-32eps-65k-fixeps_Qwen3-8B -laion/GLM-4_6-stackexchange-superuser-32ep-32k -laion/Kimi-2-5-r2egym_sandboxes-maxeps-32k__Qwen3-8B -laion/Qwen3-32B-NL2Bash-31step -laion/Qwen3-32B-R2EGYM-256-3epochs -laion/Qwen3-32B-SweSmith-20step -laion/Qwen3-8B_exp-swd-r2egym-standard_glm_4.7_traces_locetash_save-strategy_steps -laion/Qwen3-8B_exp_tas_temp_2.0_traces_save-strategy_steps -laion/Qwen3-8B_exp_tas_top_k_32_traces_save-strategy_steps -laion/dev_set_part1_10k_glm_4_7_traces_jupiter_cleaned -laion/exp-gfi-staqc-askllm-filtered-10K_glm_4_7_traces_jupiter_cleaned 
-laion/exp-gfi-swesmith-random-filtered-10K_glm_4_7_traces_jupiter_cleaned -laion/exp-psu-swesmith-10K_glm_4_7_traces_jupiter -laion/exp-psu-swesmith-1K_glm_4-7_traces_jupiter__Qwen3-8B -laion/exp-psu-swesmith-316_glm_4-7_traces_jupiter__Qwen3-8B -laion/exp-psu-swesmith-31K_glm_4_7_traces_jupiter -laion/exp-psu-swesmith-3K_glm_4_7_traces_jupiter -laion/exp-syh-r2egym-askllm-constrained_glm_4_7_traces_jupiter_cleaned -laion/exp-syh-r2egym-swesmith-mixed_glm_4_7_traces_jupiter -laion/exp-syh-tezos-askllm-hardened_glm_4_7_traces_jupiter_cleaned -laion/exp-syh-tezos-stackoverflow-mixed_glm_4_7_traces_jupiter_cleaned -laion/exp-uns-r2egym-16_8x_glm_4_7_traces_jupiter_cleaned -laion/exp-uns-r2egym-2_1x_glm_4_7_traces_jupiter_cleaned -laion/exp-uns-r2egym-33_6x_glm_4_7_traces_jupiter_cleaned -laion/exp-uns-tezos-10x_glm_4_7_traces_jupiter_cleaned -laion/exp-uns-tezos-128unique_glm_4_7_traces_jupiter_cleaned -laion/exp-uns-tezos-160x_glm_4_7_traces_jupiter_cleaned -laion/exp-uns-tezos-80x_glm_4_7_traces_jupiter_cleaned -laion/exp_rpt_stack-bash-withtests_glm_4_7_traces_jupiter -laion/exp_rpt_stack-csharp_10k_glm_4-7_traces_jupiter__Qwen3-8B -laion/exp_rpt_stack-rust_10k_glm_4_7_traces_jupiter -laion/exp_tas_baseline_traces -laion/exp_tas_frequency_penalty_0_25_traces -laion/exp_tas_frequency_penalty_0_5_traces -laion/exp_tas_interleaved_thinking_on_traces -laion/exp_tas_max_tokens_4096_traces -laion/exp_tas_min_p_0_05_traces -laion/exp_tas_raw_content_off_traces -laion/exp_tas_temp_0_5_traces -laion/exp_tas_top_p_0_8_traces -laion/glm46-Magicoder-Evol-Instruct-110K-sandboxes-1-traces -laion/glm46-glaive-code-assistant-sandboxes-maxeps-131k -laion/kimi-k2t-neulab-synatra-32ep-131k -laion/nl2bash-bugs-undr7030_Qwen3-8B -laion/nl2bash-bugsseq_Qwen3-8B -laion/nl2bash-verified-GLM-4.6-traces-32ep-32k-mgn1e5_Qwen3-8B -laion/nl2bash-verified-GLM-4.6-traces-32ep-32k-mgn5e4_Qwen3-8B -laion/r2egym-nl2bash-stack-bugsseq-fixthink-exercism-python 
-laion/r2egym-nl2bash-stack-bugsseq-fixthink-stack-pytest-large -laion/rl__24GPU_base__exp_rpt_codeelo-v2__Qwen3-8B -laion/rl__24GPU_base__exp_rpt_curriculum-hard__r2egym-nl2bash-stack -laion/rl__24GPU_base__exp_rpt_issue__Qwen3-8B-60 -laion/rl__24GPU_base__exp_rpt_multifile__Qwen3-8B-60 -laion/rl__24GPU_base__exp_rpt_scaffold__Qwen3-8B-60 -laion/rl__24GPU_base__exp_rpt_stack-bash__Qwen3-8B-55 -laion/rl__24GPU_base__mix_h2_language_balanced__r2egym-nl2bash-stack -laion/rl__24GPU_base__swe_rebench_patched_oracle__r2egym-nl2bash-stack -laion/rl__40GPU_base_32b__exp_rpt_codeelo-v2__sft_GLM-4-7-swesmith -laion/rl__40GPU_base_32b__exp_rpt_nemotron-bash__sft_GLM-4-7-swesmith -laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_curator-hard -laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_curriculum-medium -laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_postmortem -laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_pr -laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_pr-postmortem -laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink_lr3e-5_bigcodebench-v2 -laion/rl_r2egym-nl2bash-stack-bugsseq_lr3e-5_stack-php-v2 -laion/rl_rl-conf_24GP_base-yaml_mode-path_exp_tas_opti_comb_trac_trai-data_exp_rpt_defe-v3 -laion/rl_rl-conf_24GP_base-yaml_mode-path_exp_tas_opti_comb_trac_trai-data_exp_rpt_unit-pyth-v3 -laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt-agai_trai-data_exp_rpt_nemo-bash -laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt-agai_trai-data_exp_rpt_soft-larg -laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt-agai_trai-data_exp_rpt_stac-pyte-synt- -laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt-agai_trai-data_exp_rpt_stac-self-gpt5 -laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp_rpt_stac-dock-v2 -laion/rl_rl-conf_qwen_8b_ll_lr1e-5_bs64-yaml_mode-path_r2eg-nl2b-stac-bugs_trai-data_exp_rpt_stac-bash-wi 
-laion/rl_rl-conf_qwen_8b_ll_lr1e-5_bs64_8x_tp4_seqs-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp -laion/rl_rl-config_24GPU_base-yaml_model-path_Qwen3-8B_train-data_exp_rpt_codeelo-v2 -laion/rl_rl-config_24GPU_base-yaml_model-path_Qwen3-8B_train-data_exp_rpt_nemotron-bash-60 -laion/rl_rl-config_24GPU_base-yaml_model-path_Qwen3-8B_train-data_exp_rpt_pymethods2test-large-50 -laion/rl_v1_tp4s64_8x_exercism-python -laion/rl_v1_tp4s64_8x_nemotron-junit -laion/rl_v1_tp4s64_8x_stack-jest-large -laion/rl_v1_tp4s64_8x_stack-pytest-large -laion/rl_v1_tp4s64_8x_structural_debug -laion/rl_v3_tp4s64_8x_exercism-python -laion/sft__Kimi-2-5-inferredbugs-sandboxes-maxeps-32k__Qwen3-8B -laion/sft_r2egym-nl2bash-stackoverflow-inferredbugs-32B_Qwen3-32B -laion/syh-r2eg-askl-glm_4-7_trac_jupi_-gfi-swes-rand-filt-10K_glm_4-7_trac_jupi_32B -mlfoundations-dev/all-puzzles-sandboxes-traces-terminus-2 -mlfoundations-dev/code-contests-sandboxes-traces-terminus-2_global-batch-size_16 -mlfoundations-dev/code-contests-sandboxes-traces-terminus-2_seed_44 -mlfoundations-dev/code-contests-sandboxes-traces-terminus-2_seed_45 -mlfoundations-dev/codeforces-sandboxes-traces-terminus-2 -mlfoundations-dev/defects4j-sandboxes-traces-terminus-2 -mlfoundations-dev/freelancer-projects-sandboxes-traces-terminus-2 -mlfoundations-dev/nemo-prism-math-sandboxes-traces-terminus-2 -mlfoundations-dev/qasper-sandboxes-traces-terminus-2 -mlfoundations-dev/stackexchange-overflow-sandboxes-traces-terminus-2 -mlfoundations-dev/stackexchange-tezos-sandboxes-traces-terminus-2 -mlfoundations-dev/staqc-sandboxes-traces-terminus-2 -mlfoundations-dev/swesmith_with_plain_docker-sandboxes-traces-terminus-2 -moonshotai/Kimi-Dev-72B -penfever/GLM-4_6-codeforces-32ep-32k-restore-hp -penfever/bs64_rloo_n_noct_stri_micr_model_noconv_r2eg_nl2_140 -penfever/nl2bash-0-1k-traces-restore-hp -penfever/nl2bash-0-3k-traces-restore-hp -penfever/nl2bash-1k-traces-restore-hp -penfever/nl2bash-2ep-restore-hp 
-penfever/nl2bash-32ep-restore-hp -penfever/nl2bash-3k-traces-restore-hp -penfever/nl2bash-4ep-restore-hp -penfever/nl2bash-8ep-restore-hp -penfever/nl2bash-verified-GLM-4_6-traces-32ep-32k-dft -penfever/rl__24GPU_base__exp_rpt_curriculum-hard__r2egym-nl2bash-stack-15 -penfever/rl__24GPU_base__exp_rpt_pymethods2test-large__Qwen3-8B-Base-65 -penfever/rl__24GPU_base__exp_rpt_pymethods2test-large__qwen3base-GLM-4_7-sw-70 -penfever/swesmith-2stage-restore-hp -laion/r2egym-nl2bash-stack-bugsseq-bash-withtests -laion/r2egym-nl2bash-stack-bugsseq-cpp -laion/r2egym-nl2bash-stack-bugsseq-junit -laion/r2egym-nl2bash-stack-bugsseq-pytest-v2 -laion/rl__24GPU_base__swe_rebench_patched_oracle__r2egym-nl2bash-stack diff --git a/eval/lists/lists/missing_dev_set_v2_inactive_laion.txt b/eval/lists/lists/missing_dev_set_v2_inactive_laion.txt deleted file mode 100644 index 730ec8cc..00000000 --- a/eval/lists/lists/missing_dev_set_v2_inactive_laion.txt +++ /dev/null @@ -1,24 +0,0 @@ -laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_num-train-epochs_8-0_Qwen3-32B -laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_warmup-ratio_0-01_Qwen3-32B -laion/GLM-4.6-stackoverflow-32eps-65k-fixeps_Qwen3-8B -laion/GLM-4_6-stackexchange-superuser-32ep-32k -laion/Qwen3-8B_exp-swd-r2egym-standard_glm_4.7_traces_locetash_save-strategy_steps -laion/Qwen3-8B_exp_tas_top_k_32_traces_save-strategy_steps -laion/exp_tas_baseline_traces -laion/exp_tas_frequency_penalty_0_25_traces -laion/exp_tas_frequency_penalty_0_5_traces -laion/exp_tas_max_tokens_4096_traces -laion/exp_tas_min_p_0_05_traces -laion/exp_tas_raw_content_off_traces -laion/exp_tas_temp_0_5_traces -laion/exp_tas_top_p_0_8_traces -laion/glm46-Magicoder-Evol-Instruct-110K-sandboxes-1-traces -laion/nl2bash-bugs-undr7030_Qwen3-8B -laion/nl2bash-bugsseq_Qwen3-8B -laion/nl2bash-verified-GLM-4.6-traces-32ep-32k-mgn1e5_Qwen3-8B -laion/nl2bash-verified-GLM-4.6-traces-32ep-32k-mgn5e4_Qwen3-8B 
-laion/r2egym-nl2bash-stack-bugsseq-fixthink-exercism-python -laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink_lr3e-5_bigcodebench-v2 -laion/rl_r2egym-nl2bash-stack-bugsseq_lr3e-5_stack-php-v2 -laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp_rpt_stac-dock-v2 -laion/sft__Kimi-2-5-inferredbugs-sandboxes-maxeps-32k__Qwen3-8B diff --git a/eval/lists/lists/missing_swebench_verified_random_100_folders.txt b/eval/lists/lists/missing_swebench_verified_random_100_folders.txt deleted file mode 100644 index a07b9104..00000000 --- a/eval/lists/lists/missing_swebench_verified_random_100_folders.txt +++ /dev/null @@ -1,69 +0,0 @@ -DCAgent/All_Puzzles_5k_new_context -DCAgent/exp_rpt_crosscodeeval-csharp_20260219 -DCAgent/freelancer-projects-100k-traces_Qwen3-8B -DCAgent/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112725harbor_step64 -DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-14B -DCAgent2/glm-4_6-freelancer-traces-pm -laion/GLM-4_6-freelancer-32eps-131k -laion/GLM-4_6-stackexchange-superuser-32ep-32k -laion/GLM-4_6-swesmith-32ep-131k-nosumm-reasoning -laion/Kimi-2-5-r2egym_sandboxes-maxeps-32k__Qwen3-8B -laion/Qwen3-32B-NL2Bash-31step -laion/Qwen3-32B-R2EGYM-256-3epochs -laion/Qwen3-32B-SweSmith-20step -laion/exp-psu-swesmith-10K_glm_4_7_traces_jupiter -laion/exp-psu-swesmith-1K_glm_4-7_traces_jupiter__Qwen3-8B -laion/exp-psu-swesmith-316_glm_4-7_traces_jupiter__Qwen3-8B -laion/exp-psu-swesmith-31K_glm_4_7_traces_jupiter -laion/exp-psu-swesmith-3K_glm_4_7_traces_jupiter -laion/exp_rpt_stack-bash-withtests_glm_4_7_traces_jupiter -laion/exp_rpt_stack-csharp_10k_glm_4-7_traces_jupiter__Qwen3-8B -laion/exp_rpt_stack-rust_10k_glm_4_7_traces_jupiter -laion/r2egym-nl2bash-stack-bugsseq-rl-stack-bash-withtests -laion/rl__24GPU_base__exp_rpt_codeelo-v2__Qwen3-8B -laion/rl__24GPU_base__exp_rpt_curriculum-hard__r2egym-nl2bash-stack -laion/rl__24GPU_base__exp_rpt_issue__Qwen3-8B-60 
-laion/rl__24GPU_base__exp_rpt_multifile__Qwen3-8B-60 -laion/rl__24GPU_base__exp_rpt_scaffold__Qwen3-8B-60 -laion/rl__24GPU_base__exp_rpt_stack-bash__Qwen3-8B-55 -laion/rl__24GPU_base__mix_h2_language_balanced__r2egym-nl2bash-stack -laion/rl__24GPU_base__swe_rebench_patched_oracle__r2egym-nl2bash-stack -laion/rl__40GPU_base_32b__exp_rpt_codeelo-v2__sft_GLM-4-7-swesmith -laion/rl__40GPU_base_32b__exp_rpt_nemotron-bash__sft_GLM-4-7-swesmith -laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_curator-hard -laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_curriculum-medium -laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_postmortem -laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_pr -laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_pr-postmortem -laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink_lr3e-5_bigcodebench-v2 -laion/rl_r2egym-nl2bash-stack-bugsseq_lr3e-5_stack-php-v2 -laion/rl_rl-conf_24GP_base-yaml_mode-path_exp_tas_opti_comb_trac_trai-data_exp_rpt_defe-v3 -laion/rl_rl-conf_24GP_base-yaml_mode-path_exp_tas_opti_comb_trac_trai-data_exp_rpt_unit-pyth-v3 -laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt-agai_trai-data_exp_rpt_nemo-bash -laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt-agai_trai-data_exp_rpt_soft-larg -laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt-agai_trai-data_exp_rpt_stac-pyte-synt- -laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt-agai_trai-data_exp_rpt_stac-self-gpt5 -laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp_rpt_stac-dock-v2 -laion/rl_rl-conf_qwen_8b_ll_lr1e-5_bs64-yaml_mode-path_r2eg-nl2b-stac-bugs_trai-data_exp_rpt_stac-bash-wi -laion/rl_rl-conf_qwen_8b_ll_lr1e-5_bs64_8x_tp4_seqs-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp -laion/rl_rl-config_24GPU_base-yaml_model-path_Qwen3-8B_train-data_exp_rpt_codeelo-v2 
-laion/rl_rl-config_24GPU_base-yaml_model-path_Qwen3-8B_train-data_exp_rpt_nemotron-bash-60 -laion/rl_rl-config_24GPU_base-yaml_model-path_Qwen3-8B_train-data_exp_rpt_pymethods2test-large-50 -laion/rl_v1_tp4s64_8x_exercism-python -laion/rl_v1_tp4s64_8x_stack-jest-large -laion/rl_v1_tp4s64_8x_stack-pytest-large -laion/rl_v1_tp4s64_8x_structural_debug -laion/rl_v3_tp4s64_8x_exercism-python -laion/sft__Kimi-2-5-inferredbugs-sandboxes-maxeps-32k__Qwen3-8B -laion/sft_r2egym-nl2bash-stackoverflow-inferredbugs-32B_Qwen3-32B -laion/syh-r2eg-askl-glm_4-7_trac_jupi_-gfi-swes-rand-filt-10K_glm_4-7_trac_jupi_32B -penfever/nl2bash-2ep-restore-hp -penfever/nl2bash-3k-traces-restore-hp -penfever/rl__24GPU_base__exp_rpt_curriculum-hard__r2egym-nl2bash-stack-15 -penfever/rl__24GPU_base__exp_rpt_pymethods2test-large__Qwen3-8B-Base-65 -penfever/rl__24GPU_base__exp_rpt_pymethods2test-large__qwen3base-GLM-4_7-sw-70 -r2egym-nl2bash-stack-bugsseq-bash-withtests -r2egym-nl2bash-stack-bugsseq-cpp -r2egym-nl2bash-stack-bugsseq-junit -r2egym-nl2bash-stack-bugsseq-pytest-v2 -rl__24GPU_base__swe_rebench_patched_oracle__r2egym-nl2bash-stack diff --git a/eval/lists/lists/missing_terminal_bench_2.txt b/eval/lists/lists/missing_terminal_bench_2.txt deleted file mode 100644 index 64dd4971..00000000 --- a/eval/lists/lists/missing_terminal_bench_2.txt +++ /dev/null @@ -1,96 +0,0 @@ - -DCAgent/exp_rpt_crosscodeeval-csharp_20260219 -DCAgent/hr1_wfc_nl2bash-bs_Q3-8B-mE32-aT-dS-120325hbr_step_64 -DCAgent/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112725harbor_step64 -DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step30 -DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step45 -DCAgent/taskmaster2-banana -DCAgent/tbench-dev-71-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step70 -DCAgent2/nl2bash-stack-bugs-undr203050 -DCAgent2/nl2bash-swesmithseq 
-DCAgent2/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor -Qwen/Qwen2.5-Coder-32B-Instruct -hosted_vllm/mlfoundations-dev/stackexchange-codereview-sandboxes-traces-terminus-2_overwrite-output-dir_True -laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_adam-beta1_0-91_Qwen3-32B -laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_adam-beta1_0-93_Qwen3-32B -laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_adam-beta1_0-95_Qwen3-32B -laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_lr_1e-5_Qwen3-32B -laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_num-train-epochs_8-0_Qwen3-32B -laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_warmup-ratio_0-01_Qwen3-32B -laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_warmup-ratio_0-05_Qwen3-32B -laion/Kimi-2-5-r2egym_sandboxes-maxeps-32k__Qwen3-8B -laion/Qwen3-32B-NL2Bash-31step -laion/Qwen3-32B-R2EGYM-256-3epochs -laion/Qwen3-32B-SweSmith-20step -laion/Qwen3-8B_exp-swd-r2egym-standard_glm_4.7_traces_locetash_save-strategy_steps -laion/Qwen3-8B_exp-swd-swesmith-wo-docker_glm_4.7_traces_locetash_save-strategy_steps -laion/Qwen3-8B_exp_tas_temp_0.25_traces_save-strategy_steps -laion/Qwen3-8B_exp_tas_temp_0.5_traces_save-strategy_steps -laion/Qwen3-8B_exp_tas_temp_2.0_traces_save-strategy_steps -laion/Qwen3-8B_exp_tas_tmux_large_traces_save-strategy_steps -laion/Qwen3-8B_exp_tas_top_k_32_traces_save-strategy_steps -laion/Qwen3-8B_perturbed-docker-exp-taskmaster2-tasks_glm_4.7_traces_locetash_save-strategy_steps -laion/claude-4-5-sonnet-thinking-stackexchange-overflow-32ep-32k-traces -laion/exp-psu-swesmith-10K_glm_4_7_traces_jupiter -laion/exp-psu-swesmith-1K_glm_4-7_traces_jupiter__Qwen3-8B -laion/exp-psu-swesmith-316_glm_4-7_traces_jupiter__Qwen3-8B -laion/exp-psu-swesmith-31K_glm_4_7_traces_jupiter -laion/exp-psu-swesmith-3K_glm_4_7_traces_jupiter 
-laion/exp_rpt_stack-bash-withtests_glm_4_7_traces_jupiter -laion/exp_rpt_stack-csharp_10k_glm_4-7_traces_jupiter__Qwen3-8B -laion/exp_rpt_stack-rust_10k_glm_4_7_traces_jupiter -laion/glm46-swegym-tasks-maxeps-131k -laion/glm46-swesmith-maxeps-131k-lc -laion/r2egym-nl2bash-stack-bugsseq-rl-stack-bash-withtests -laion/rl__24GPU_base__exp_rpt_codeelo-v2__Qwen3-8B -laion/rl__24GPU_base__exp_rpt_curriculum-hard__r2egym-nl2bash-stack -laion/rl__24GPU_base__exp_rpt_issue__Qwen3-8B-60 -laion/rl__24GPU_base__exp_rpt_multifile__Qwen3-8B-60 -laion/rl__24GPU_base__exp_rpt_scaffold__Qwen3-8B-60 -laion/rl__24GPU_base__exp_rpt_stack-bash__Qwen3-8B-55 -laion/rl__24GPU_base__mix_h2_language_balanced__r2egym-nl2bash-stack -laion/rl__24GPU_base__swe_rebench_patched_oracle__r2egym-nl2bash-stack -laion/rl__40GPU_base_32b__exp_rpt_codeelo-v2__sft_GLM-4-7-swesmith -laion/rl__40GPU_base_32b__exp_rpt_nemotron-bash__sft_GLM-4-7-swesmith -laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_curator-hard -laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_pr -laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_pr-postmortem -laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink_lr3e-5_bigcodebench-v2 -laion/rl_r2egym-nl2bash-stack-bugsseq_lr3e-5_stack-php-v2 -laion/rl_rl-conf_24GP_base-yaml_mode-path_exp_tas_opti_comb_trac_trai-data_exp_rpt_defe-v3 -laion/rl_rl-conf_24GP_base-yaml_mode-path_exp_tas_opti_comb_trac_trai-data_exp_rpt_pyme-v3-40 -laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt-agai_trai-data_exp_rpt_nemo-bash -laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt-agai_trai-data_exp_rpt_soft-larg -laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt-agai_trai-data_exp_rpt_stac-pyte-synt- -laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt-agai_trai-data_exp_rpt_stac-self-gpt5 -laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp_rpt_pyme-larg-90 
-laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp_rpt_stac-php-larg-75 -laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp_rpt_stac-self-larg-70 -laion/rl_rl-conf_24GP_base_noth-yaml_mode-path_r2eg-nl2b-stac-bugs_trai-data_exp_rpt_e2eg-larg-60 -laion/rl_rl-conf_24GP_base_noth-yaml_mode-path_r2eg-nl2b-stac-bugs_trai-data_exp_rpt_meth-larg-60 -laion/rl_rl-conf_24GP_base_noth-yaml_mode-path_r2eg-nl2b-stac-bugs_trai-data_exp_rpt_pyme-larg-60 -laion/rl_rl-conf_24GP_base_noth-yaml_mode-path_r2eg-nl2b-stac-bugs_trai-data_exp_rpt_stac-self-larg-60 -laion/rl_rl-conf_qwen_8b_ll_lr1e-5_bs64-yaml_mode-path_r2eg-nl2b-stac-bugs_trai-data_exp_rpt_stac-bash-wi -laion/rl_rl-conf_qwen_8b_ll_lr1e-5_bs64_8x_tp4_seqs-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp -laion/rl_rl-config_24GPU_base-yaml_model-path_Qwen3-8B_train-data_exp_rpt_codeelo-v2 -laion/rl_rl-config_24GPU_base-yaml_model-path_Qwen3-8B_train-data_exp_rpt_nemotron-bash-60 -laion/rl_rl-config_24GPU_base-yaml_model-path_Qwen3-8B_train-data_exp_rpt_pymethods2test-large-50 -laion/rl_swesmith-fixthink-pymethods2test-45 -laion/rl_v1_tp4s64_8x_exercism-python -laion/rl_v1_tp4s64_8x_nemotron-junit -laion/rl_v1_tp4s64_8x_stack-jest-large -laion/rl_v1_tp4s64_8x_stack-pytest-large -laion/rl_v1_tp4s64_8x_structural_debug -laion/rl_v3_tp4s64_8x_exercism-python -laion/sft__Kimi-2-5-inferredbugs-sandboxes-maxeps-32k__Qwen3-8B -laion/sft_r2egym-nl2bash-stackoverflow-inferredbugs-32B_Qwen3-32B -laion/stackexchange-tezos-sandboxes_glm_4_6_traces_locetash_again -laion/syh-r2eg-askl-glm_4-7_trac_jupi_-gfi-swes-rand-filt-10K_glm_4-7_trac_jupi_32B -penfever/rl__24GPU_base__exp_rpt_curriculum-hard__r2egym-nl2bash-stack-15 -penfever/rl__24GPU_base__exp_rpt_pymethods2test-large__Qwen3-8B-Base-65 -penfever/rl__24GPU_base__exp_rpt_pymethods2test-large__qwen3base-GLM-4_7-sw-70 -r2egym-nl2bash-stack-bugsseq-bash-withtests -r2egym-nl2bash-stack-bugsseq-cpp 
-r2egym-nl2bash-stack-bugsseq-junit -r2egym-nl2bash-stack-bugsseq-pytest-v2 -rl__24GPU_base__swe_rebench_patched_oracle__r2egym-nl2bash-stack diff --git a/eval/lists/lists/models_131k.txt b/eval/lists/lists/models_131k.txt deleted file mode 100644 index b049ad34..00000000 --- a/eval/lists/lists/models_131k.txt +++ /dev/null @@ -1,35 +0,0 @@ -laion/GLM-4_6-freelancer-32eps-131k -laion/GLM-4_6-swesmith-32ep-131k-nosumm -laion/GLM-4_6-swesmith-32ep-131k-nosumm-reasoning -laion/GLM-4_7-inferredbugs-sandboxes-maxeps-131k -laion/GLM-4_7-r2egym_sandboxes-maxeps-131k -laion/GLM-4_7-r2egym_sandboxes-maxeps-131k-lc -laion/GLM-4_7-stackexchange-tezos-sandboxes-maxeps-131k -laion/GLM-4_7-swesmith-sandboxes-with_tests-oracle_verified_120s-maxeps-131k -laion/GLM-4_7-swesmith-sandboxes-with_tests-oracle_verified_120s-maxeps-131k-fixthink -laion/Kimi-K2T-swesmith-32ep-131k -laion/glm-4_6-all-puzzles-32ep-131k -laion/glm-4_6-dclm-baseline-terminal-traces-32ep-131k -laion/glm-4_6-freelancer-32ep-131k-torch -laion/glm-4_6-stack-overflow-32ep-131k-summtrc -laion/glm-4_6-stackexchange-tezos-32ep-131k -laion/glm-4_6-staqc-32ep-131k -laion/glm46-defects4j-32ep-131k -laion/glm46-glaive-code-assistant-sandboxes-maxeps-131k -laion/glm46-neulab-agenttuning-alfworld-sandboxes-maxeps-131k -laion/glm46-neulab-synatra-32ep-131k -laion/glm46-qasper-maxeps-131k -laion/glm46-r2egym_sandboxes-maxeps-131k -laion/glm46-stackexchange-tezos-maxeps-131k -laion/glm46-swegym-tasks-maxeps-131k -laion/glm46-swegym-tasks-maxeps-131k-lc -laion/glm46-swesmith-maxeps-131k -laion/glm46-swesmith-maxeps-131k-fixthink -laion/glm46-swesmith-maxeps-131k-lc -laion/gpt-oss-120B-stack-overflow-32ep-131k-summtrc -laion/gpt-oss-120B-stack-overflow-32ep-131k-summtrc-fixthink1 -laion/kimi-k2t-neulab-synatra-32ep-131k -laion/minimax-m2-stack-overflow-32ep-131k-summtrc -laion/qwen3-coder-480B-stack-overflow-32ep-131k-summtrc -laion/qwen3base-GLM-4_7-swesmith-sandboxes-with_tests-oracle_verified_120s-maxeps-131k 
-laion/sft_GLM-4-7-swesmith-sandboxes-with_tests-oracle_verified_120s-maxeps-131k_Qwen3-32B diff --git a/eval/lists/lists/models_32b.txt b/eval/lists/lists/models_32b.txt deleted file mode 100644 index 2c5a7967..00000000 --- a/eval/lists/lists/models_32b.txt +++ /dev/null @@ -1,31 +0,0 @@ -DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-32B -NovaSky-AI/SA-SWE-32B -Qwen/Qwen2.5-Coder-32B-Instruct -Qwen/Qwen3-32B -R2E-Gym/R2EGym-32B-Agent -SWE-Swiss/SWE-Swiss-32B -SWE-bench/SWE-agent-LM-32B -Skywork/Skywork-SWE-32B -allenai/SERA-32B -laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_Qwen3-32B -laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_adam-beta1_0-91_Qwen3-32B -laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_adam-beta1_0-93_Qwen3-32B -laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_adam-beta1_0-95_Qwen3-32B -laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_global-batch-size_32_Qwen3-32B -laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_global-batch-size_64_Qwen3-32B -laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_learning-rate_1e-06_Qwen3-32B -laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_lr_1e-5_Qwen3-32B -laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_num-train-epochs_4.0_Qwen3-32B -laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_num-train-epochs_6.0_Qwen3-32B -laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_num-train-epochs_7.0_Qwen3-32B -laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_num-train-epochs_8-0_Qwen3-32B -laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_warmup-ratio_0-01_Qwen3-32B -laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_warmup-ratio_0-05_Qwen3-32B -laion/Qwen3-32B-R2EGYM-256-3epochs -laion/Qwen3-32B-SweSmith-20step -laion/open-thoughts-4-code-qwen3-32b-annotated 
-laion/rl__40GPU_base_32b__exp_rpt_codeelo-v2__sft_GLM-4-7-swesmith -laion/rl__40GPU_base_32b__exp_rpt_nemotron-bash__sft_GLM-4-7-swesmith -laion/sft_GLM-4-7-swesmith-sandboxes-with_tests-oracle_verified_120s-maxeps-131k_Qwen3-32B -laion/sft_r2egym-nl2bash-stackoverflow-inferredbugs-32B_Qwen3-32B -laion/syh-r2eg-askl-glm_4-7_trac_jupi_-gfi-swes-rand-filt-10K_glm_4-7_trac_jupi_32B diff --git a/eval/lists/lists/nemotron_nano.txt b/eval/lists/lists/nemotron_nano.txt deleted file mode 100644 index 15acb74c..00000000 --- a/eval/lists/lists/nemotron_nano.txt +++ /dev/null @@ -1 +0,0 @@ -nvidia/Llama-3.1-Nemotron-Nano-8B-v1 diff --git a/eval/lists/lists/no_eval_models_latest.txt b/eval/lists/lists/no_eval_models_latest.txt deleted file mode 100644 index 8d5ac20f..00000000 --- a/eval/lists/lists/no_eval_models_latest.txt +++ /dev/null @@ -1,30 +0,0 @@ -rl__nemotron-bash_fp8_terminus-2_step48 -laion/rl__64GPU_base_32b__nl2bash-tasks-cleaned-oracle__syh-r2eg-askl-glm_4__40-0 -DCAgent/a1-quixbugs -laion/rl__24GPU_shaped__stackexchange-overflow__exp_tas_optimal_comb-25 -DCAgent/a1-e2egit -DCAgent/a1-nemotron_rspec -laion/allenai-sera-unified-100000-opt100k__Qwen3-8B -laion/allenai-sera-unified-31600-opt100k__Qwen3-8B -laion/coderforge-100000-opt100k__Qwen3-8B -laion/sera-10000__Qwen3-8B -laion/coderforge-3160__Qwen3-8B -laion/swesmith-100000-opt100k__Qwen3-8B -laion/coderforge-31600-opt100k__Qwen3-8B -laion/sera-1000-opt1k__Qwen3-8B -laion/r2egym-100000-opt100k__Qwen3-8B -laion/coderforge-1000-opt1k__Qwen3-8B -laion/sera-316-opt1k__Qwen3-8B -laion/r2egym-316-opt1k__Qwen3-8B -laion/r2egym-1000-opt1k__Qwen3-8B -laion/coderforge-316-opt1k__Qwen3-8B -laion/rl__24GPU_shaped__nemotron-math-oracle-filtered__exp_tas_optimal_comb__40-0-30 -laion/coderforge-10000__Qwen3-8B -laion/r2egym-10000__Qwen3-8B -laion/sera-3160__Qwen3-8B -DCAgent/a1-ghactions -laion/swesmith-unified-10000__Qwen3-8B -laion/r2egym-unified-3160__Qwen3-8B 
-laion/rl__24GPU_shaped__exp_rpt_pymethods2test-large__GLM-4_7-swesmith-san-30 -laion/100k_epochs4__Qwen3-8B -laion/exp-psu-swesmith-1K_glm_4-7_traces_jupiter__0-05__Qwen3-8B \ No newline at end of file diff --git a/eval/lists/lists/pipeline_exp_prio.txt b/eval/lists/lists/pipeline_exp_prio.txt deleted file mode 100644 index 75105143..00000000 --- a/eval/lists/lists/pipeline_exp_prio.txt +++ /dev/null @@ -1,19 +0,0 @@ -DCAgent/a1-taskmaster2 -DCAgent/a1-stack_bash -DCAgent/a1-repo_scaffold -DCAgent/a1-pr_mining -DCAgent/a1-nemotron_junit -DCAgent/a1-nemotron_cpp -DCAgent/a1-nemotron_bash -DCAgent/a1-manybugs -DCAgent/a1-issue_tasks -DCAgent/a1-codenet_python -DCAgent/a1-bugsinpy -DCAgent/a1-multifile_composition -DCAgent/a1-exercism_python -DCAgent/a1-crosscodeeval_typescript -DCAgent/a1-crosscodeeval_python -DCAgent/a1-crosscodeeval_java -DCAgent/a1-taco -DCAgent/a1-staqc -DCAgent/a1-stackexchange_tezos diff --git a/eval/lists/lists/priority_131k_test.txt b/eval/lists/lists/priority_131k_test.txt deleted file mode 100644 index 2f77176c..00000000 --- a/eval/lists/lists/priority_131k_test.txt +++ /dev/null @@ -1 +0,0 @@ -laion/alfworld-swesmith-r2egym-swegym-131k-lc diff --git a/eval/lists/lists/priority_batch2.txt b/eval/lists/lists/priority_batch2.txt deleted file mode 100644 index c15c9b04..00000000 --- a/eval/lists/lists/priority_batch2.txt +++ /dev/null @@ -1,3 +0,0 @@ -open-thoughts/OpenThinker3-7B -deepseek-ai/DeepSeek-R1-Distill-Qwen-7B -camel-ai/seta-rl-qwen3-8b diff --git a/eval/lists/lists/priority_batch_evalorg.txt b/eval/lists/lists/priority_batch_evalorg.txt deleted file mode 100644 index f32a0dd6..00000000 --- a/eval/lists/lists/priority_batch_evalorg.txt +++ /dev/null @@ -1,3 +0,0 @@ -obiwan96/qwen3-8b-openthinker-sft-endless-terminals -nvidia/Nemotron-Terminal-8B -nvidia/Nemotron-Terminal-14B diff --git a/eval/lists/lists/priority_obiwan.txt b/eval/lists/lists/priority_obiwan.txt deleted file mode 100644 index ab57457f..00000000 --- 
a/eval/lists/lists/priority_obiwan.txt +++ /dev/null @@ -1 +0,0 @@ -obiwan96/qwen3-8b-openthinker-sft-endless-terminals diff --git a/eval/lists/lists/priority_qwen35.txt b/eval/lists/lists/priority_qwen35.txt deleted file mode 100644 index 14ef350e..00000000 --- a/eval/lists/lists/priority_qwen35.txt +++ /dev/null @@ -1 +0,0 @@ -Qwen/Qwen3.5-9B diff --git a/eval/lists/lists/priority_rl_test.txt b/eval/lists/lists/priority_rl_test.txt deleted file mode 100644 index 4aab785f..00000000 --- a/eval/lists/lists/priority_rl_test.txt +++ /dev/null @@ -1 +0,0 @@ -laion/sft__Kimi-2-5-inferredbugs-sandboxes-maxeps-32k__Qwen3-8B diff --git a/eval/lists/lists/pruned_models_names.txt b/eval/lists/lists/pruned_models_names.txt deleted file mode 100644 index 328d8ec8..00000000 --- a/eval/lists/lists/pruned_models_names.txt +++ /dev/null @@ -1,364 +0,0 @@ -DCAgent/All_Puzzles_5k_New_Context_GPT4o-mini_new_context -DCAgent/All_Puzzles_5k_OG_5k_New_Context_GPT4o-mini_new_context -DCAgent/All_Puzzles_5k_new_context -DCAgent/all-puzzles-sandboxes-traces-terminus-2-with-gpt-4o-mini-judgments-correct -DCAgent/bash_textbook_tasks_traces -DCAgent/code-contests-sandboxes-traces-terminus-2_new_hparams_11_10_25_Qwen3-8B -DCAgent/code_contests-GLM-4.6-traces_Qwen3-8B -DCAgent/code_contests-Qwen3-Coder-480B-traces -DCAgent/code_contests_10k_OG_10k_New_Questions_GPT5-mini_Qwen3-8B -DCAgent/code_contests_new_questions_gpt-5-mini_Qwen3-8B -DCAgent/codeforces-gptoss120b-traces -DCAgent/exp_tas_max_tokens_1024_traces -DCAgent/freelancer-askllm-filtered-sandboxes-traces-terminus-2_Qwen3-8B -DCAgent/freelancer-embedding-mean-instruction-filter_Qwen3-8B -DCAgent/freelancer-long-instruction-filter_Qwen3-8B -DCAgent/freelancer-projects-0-1k-traces -DCAgent/freelancer-projects-0-3k-traces -DCAgent/freelancer-projects-100k-traces_Qwen3-8B -DCAgent/freelancer-projects-10k-traces -DCAgent/freelancer-projects-1k-traces -DCAgent/freelancer-projects-3k-traces -DCAgent/freelancer-projects-gpt5_Qwen3-8B 
-DCAgent/freelancer-projects-gpt5mini -DCAgent/freelancer-random-instruction-filter-traces-terminus-2_Qwen3-8B -DCAgent/freelancer-short-instruction-filter_Qwen3-8B -DCAgent/freelancer-t1024s-32ep_Qwen3-8B -DCAgent/freelancer-t2048s-32ep_Qwen3-8B -DCAgent/freelancer-t256s-32ep_Qwen3-8B -DCAgent/freelancer-t512s-32ep_Qwen3-8B -DCAgent/hr1_code-contests-sandboxes-with-tests-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step_60 -DCAgent/hr1_wfc_nl2bash-bs_Q3-8B-mE32-aT-dS-120325hbr_step_20 -DCAgent/hr1_wfc_nl2bash-bs_Q3-8B-mE32-aT-dS-120325hbr_step_40 -DCAgent/hr1_wfc_nl2bash-bs_Q3-8B-mE32-aT-dS-120325hbr_step_64 -DCAgent/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112725harbor_step20 -DCAgent/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112725harbor_step40 -DCAgent/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112725harbor_step64 -DCAgent/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step20 -DCAgent/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step40 -DCAgent/neulab-codeactinstruct-sandboxes-traces-terminus-2_Qwen3-8B -DCAgent/neulab-mind2web-sandboxes-traces-terminus-2_Qwen3-8B -DCAgent/neulab-synatra-sandboxes-traces-terminus-2_Qwen3-8B -DCAgent/nl2bash-GLM-4.6-traces_Qwen3-8B -DCAgent/nl2bash-Qwen3-Coder-480B-traces_Qwen3-8B -DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step25 -DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step50 -DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step73 -DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step20 -DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step20 -DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step40 -DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step_73 -DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120225harbor_step_25 
-DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120225harbor_step_50 -DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120225harbor_step_73 -DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120325harbor_step_25 -DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120325harbor_step_50 -DCAgent/nl2bash-nl2bash-bugsseq_Qwen3-8B-maxEps32-accThink-disableSummarize-120325harbor_step_73 -DCAgent/nl2bashG5CP-nl2bash-bs_Q3-8B-mE32-aT-dS-120325hbr_step_20 -DCAgent/nl2bashG5CP-nl2bash-bs_Q3-8B-mE32-aT-dS-120325hbr_step_34 -DCAgent/nl2bashGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step20 -DCAgent/nl2bashGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor_step34 -DCAgent/nl2bashGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step20 -DCAgent/r2egymG5CP-nl2bash-bs_Q3-8B-mE24-aT-dS-120325hbr_step_25 -DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step20 -DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step_55 -DCAgent/r2egymGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-maxEps24-112925harbor_step_80 -DCAgent/staqc-ot3-100k-code-subset-traces-terminus-2_Qwen3-8B -DCAgent/staqc-ot3-100k-math-subset-traces-terminus-2_save-strategy_steps_Qwen3-8B -DCAgent/staqc-ot3-100k-science-subset-traces-terminus-2_Qwen3-8B -DCAgent/staqc-ot3-100k-traces-terminus-2_Qwen3-8B -DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-1-7B -DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-14B -DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-32B -DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-4B -DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-4B-Instruct-2507 -DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-4B-Thinking-2507 -DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-8B-Base -DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-Coder-30B-A3B-Instruct -DCAgent/taskmaster2-0-1k-traces 
-DCAgent/taskmaster2-0-3k-traces -DCAgent/taskmaster2-10k-traces -DCAgent/taskmaster2-16ep -DCAgent/taskmaster2-1ep -DCAgent/taskmaster2-1k-traces -DCAgent/taskmaster2-2ep -DCAgent/taskmaster2-32ep -DCAgent/taskmaster2-3k-traces -DCAgent/taskmaster2-4ep -DCAgent/taskmaster2-64ep -DCAgent/taskmaster2-8ep -DCAgent/taskmaster2-banana -DCAgent/taskmaster2-gpt5mini -DCAgent/taskmaster2-gpt5mini_global-batch-size_16 -DCAgent/tbench_oracle_solutions_terminus -DCAgent/test_sft_qwen3_32k_base_NO_EVAL_Qwen3-8B -DCAgent/wikitable_format_conversion-qwen3-coder-480b-a35b-instruct-awq-traces -DCAgent2/GLM-4_6-codeforces-32eps-32k-fixeps -DCAgent2/GLM-4_6-nl2bash-verified-32eps-32k-fixeps -DCAgent2/GLM-4_6-stackexchange-overflow-sandboxes-32eps-32k -DCAgent2/bugs-nl2bashseq -DCAgent2/bugs-stack-nl2bashseq -DCAgent2/bugs-swesmith-over5050 -DCAgent2/codeforces-GLM-4_6-traces-32ep-32k-1-2-4-dv -DCAgent2/freelancer-projects-100k-traces -DCAgent2/freelancer-projects-31k-traces -DCAgent2/freelancer-t256s-32ep_hub-model-id_freelancer-t256s-32ep-restore-hp_Qwen3-8B -DCAgent2/gemini25flash-stackexchange-overflow-32ep-512k-v3-traces -DCAgent2/glm-4_6-freelancer-traces -DCAgent2/glm-4_6-freelancer-traces-pm -DCAgent2/hr1_wikitable-format-conversion_nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112725harbor -DCAgent2/inferredbugs-GLM-4_6-32ep-32k -DCAgent2/inferredbugs-GLM-4_6-32ep-65k -DCAgent2/neulab-code-feedback-sandboxes-traces-terminus-2_hub-model-id_neulab-code-feedback-restore-hp_Qw -DCAgent2/neulab-mind2web-sandboxes-traces-terminus-2_hub-model-id_neulab-mind2web-restore-hp_Qwen3-8B -DCAgent2/neulab-synatra-sandboxes-traces-terminus-2_hub-model-id_neulab-synatra-restore-hp_Qwen3-8B -DCAgent2/nl2bash-bugs-over5050 -DCAgent2/nl2bash-bugs-undr3070 -DCAgent2/nl2bash-bugsseq -DCAgent2/nl2bash-bugsshuffle -DCAgent2/nl2bash-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor -DCAgent2/nl2bash-stack-bugs-over333 -DCAgent2/nl2bash-stack-bugs-undr203050 -DCAgent2/nl2bash-stack-bugs-undr503020 
-DCAgent2/nl2bash-stack-over5050 -DCAgent2/nl2bash-stack-undr3070 -DCAgent2/nl2bash-stack-undr7030 -DCAgent2/nl2bash-stackseq -DCAgent2/nl2bash-stackshuffle -DCAgent2/nl2bash-swesmith-reason -DCAgent2/nl2bash-swesmithseq -DCAgent2/nl2bash-verified-GLM-4.6-traces-32ep-32k-ab1085_Qwen3-8B -DCAgent2/nl2bash-verified-GLM-4.6-traces-32ep-32k-ab1089_Qwen3-8B -DCAgent2/nl2bash-verified-GLM-4.6-traces-32ep-32k-ab1093_Qwen3-8B -DCAgent2/nl2bash-verified-GLM-4.6-traces-32ep-32k-ab1097_Qwen3-8B -DCAgent2/nl2bash-verified-GLM-4.6-traces-32ep-32k_global-batch-size_128_Qwen3-8B -DCAgent2/nl2bash-verified-GLM-4.6-traces-32ep-32k_global-batch-size_32_Qwen3-8B -DCAgent2/nl2bash-verified-GLM-4.6-traces-32ep-32k_global-batch-size_64_Qwen3-8B -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-5epochs -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-5epochs-lr1e-3 -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-5epochs-lr1e-4 -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-5epochs-lr1e-5 -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-5epochs-lr1e-6 -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-7epochs -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-8epochs -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-ab2095 -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-ab2098 -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-ab2099 -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-ab20998 -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-mgn5e2 -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wd0 -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wd002 -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wd004 -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wd006 -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wd008 -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wr0 -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wr0005 -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wr0010 -DCAgent2/nl2bash-verified-GLM-4_6-traces-32ep-32k-wr0150 
-DCAgent2/nl2bashGPT5CodexPassed-nl2bash-bugsseq_Qwen3-8B-8nodes-sync-112625harbor -DCAgent2/stack-bugs-over5050 -DCAgent2/stack-bugs-undr3070 -DCAgent2/stack-bugs-undr7030 -DCAgent2/stack-bugsseq -DCAgent2/stack-bugsshuffle -DCAgent2/stack-nl2bashseq -DCAgent2/stack-swesmithseq -DCAgent2/swesmith-nl2bashseq -DCAgent2/swesmith-stack-undr7030 -DCAgent2/taskmaster2-1ep_hub-model-id_taskmaster2-1ep-2stage-restore-hp_freelancer-projects-sandboxes-tra -DCAgent2/taskmaster2-8ep_hub-model-id_taskmaster2-8ep-2stage-restore-hp_freelancer-projects-sandboxes-tra -DCAgent2/taskmaster2-GLM-4_6-32ep-32k -DCAgent2/test2-tbench-dev-71-qwen3-8b-8nodes-sync -bespokelabs/Qwen3-8B-ot_step100 -bespokelabs/Qwen3-8B-ot_step60_high -laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_learning-rate_1e-06_Qwen3-32B -laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_num-train-epochs_8-0_Qwen3-32B -laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_warmup-ratio_0-01_Qwen3-32B -laion/GLM-4.6-stackoverflow-32eps-65k-fixeps_Qwen3-8B -laion/GLM-4_6-freelancer-32eps-131k -laion/GLM-4_6-inferredbugs-32ep-65k-reasoning -laion/GLM-4_6-nl2bash-verified-32ep-32k-reasoning -laion/GLM-4_6-selfinstruct-naive-2-32ep-32k -laion/GLM-4_6-stackexchange-superuser-32ep-32k -laion/GLM-4_6-swesmith-32ep-131k-nosumm -laion/GLM-4_6-swesmith-32ep-131k-nosumm-reasoning -laion/GLM-4_7-inferredbugs-sandboxes-maxeps-131k -laion/GLM-4_7-r2egym_sandboxes-maxeps-131k -laion/GLM-4_7-swesmith-sandboxes-with_tests-oracle_verified_120s-maxeps-131k -laion/GLM-4_7-swesmith-sandboxes-with_tests-oracle_verified_120s-maxeps-131k-fixthink -laion/GPT-OSS-120B-codeforces-fixeps_Qwen3-8B -laion/Kimi-K2T-neulab-agenttuning-kg-sandboxes-maxeps-32k -laion/Kimi-K2T-neulab-agenttuning-mind2web-sandboxes-maxeps-32k -laion/Kimi-K2T-neulab-agenttuning-webshop-sandboxes-maxeps-32k -laion/Kimi-K2T-swesmith-32ep-131k -laion/MiniMax-M2-freelancer-32ep-32k 
-laion/MiniMax-M2-freelancer-32ep-32k-reasoning -laion/Qwen3-8B_exp-swd-r2egym-standard_glm_4.7_traces_locetash_save-strategy_steps -laion/Qwen3-8B_exp-swd-swesmith-wo-docker_glm_4.7_traces_locetash_save-strategy_steps -laion/Qwen3-8B_exp_tas_summarize_threshold_4096_traces_save-strategy_steps -laion/Qwen3-8B_exp_tas_temp_0.25_traces_save-strategy_steps -laion/Qwen3-8B_exp_tas_temp_0.5_traces_save-strategy_steps -laion/Qwen3-8B_exp_tas_tmux_large_traces_save-strategy_steps -laion/Qwen3-8B_exp_tas_top_k_32_traces_save-strategy_steps -laion/Qwen3-8B_exp_tas_trajectory_minimal_traces_save-strategy_steps -laion/Qwen3-8B_perturbed-docker-exp-taskmaster2-tasks_glm_4.7_traces_locetash_save-strategy_steps -laion/Qwen3-Coder-480B-codeforces-fixeps_Qwen3-8B -laion/Qwen3-Coder-480B-nl2bash-fixeps_Qwen3-8B -laion/bugs-nl2bashseq_Qwen3-8B -laion/claude-4-5-sonnet-thinking-stackexchange-overflow-32ep-32k-traces -laion/exp-gfi-swesmith-short-response-filtered-10K_glm_4_7_traces_jupiter -laion/exp-psu-stackoverflow-1K_glm_4_7_traces -laion/exp-psu-stackoverflow-316_glm_4_7_traces -laion/exp-psu-stackoverflow-31K_glm_4_7_traces -laion/exp-psu-swesmith-10K_glm_4_7_traces_jupiter -laion/exp-psu-swesmith-31K_glm_4_7_traces_jupiter -laion/exp-psu-swesmith-3K_glm_4_7_traces_jupiter -laion/exp-swd-r2egym-wo-docker_glm_4_7_traces -laion/exp-syh-tezos-askllm-hardened_glm_4_7_traces_jupiter_cleaned -laion/exp-syh-tezos-stackoverflow-mixed_glm_4_7_traces_jupiter_cleaned -laion/exp-uns-r2egym-33_6x_glm_4_7_traces_jupiter_cleaned -laion/exp-uns-r2egym-4_2x_glm_4_7_traces_jupiter -laion/exp-uns-r2egym-8_4x_glm_4_7_traces_jupiter -laion/exp-uns-tezos-128unique_glm_4_7_traces_jupiter_cleaned -laion/exp-uns-tezos-1unique_glm_4_7_traces_jupiter -laion/exp_tas_baseline_traces -laion/exp_tas_frequency_penalty_0_25_traces -laion/exp_tas_frequency_penalty_0_5_traces -laion/exp_tas_frequency_penalty_1_0_traces -laion/exp_tas_high_diversity_traces -laion/exp_tas_linear_history_off_traces 
-laion/exp_tas_low_diversity_traces -laion/exp_tas_max_tokens_2048_traces -laion/exp_tas_max_tokens_4096_traces -laion/exp_tas_min_p_0_05_traces -laion/exp_tas_parser_xml_traces -laion/exp_tas_raw_content_off_traces -laion/exp_tas_repetition_penalty_1_2_traces -laion/exp_tas_temp_0_5_traces -laion/exp_tas_top_k_128_traces -laion/exp_tas_top_k_16_traces -laion/exp_tas_top_p_0_8_traces -laion/glm-4_6-all-puzzles-32ep-131k -laion/glm-4_6-dclm-baseline-terminal-traces-32ep-131k -laion/glm-4_6-freelancer-32ep-131k-torch -laion/glm-4_6-nemo-prism -laion/glm-4_6-r2egym-32ep-32k -laion/glm-4_6-stack-overflow-32ep-131k-summtrc -laion/glm-4_6-staqc-32ep-131k -laion/glm46-Magicoder-Evol-Instruct-110K-sandboxes-1-traces -laion/glm46-defects4j-32ep-131k -laion/glm46-glaive-code-assistant-sandboxes-maxeps-131k -laion/glm46-neulab-agenttuning-alfworld-sandboxes-maxeps-131k -laion/glm46-neulab-synatra-32ep-131k -laion/glm46-qasper-maxeps-131k -laion/glm46-stackexchange-tezos-maxeps-131k -laion/glm46-swegym-tasks-maxeps-131k -laion/glm46-swesmith-maxeps-131k -laion/gpt-oss-120B-stack-overflow-32ep-131k-summtrc -laion/gpt-oss-120B-stack-overflow-32ep-131k-summtrc-fixthink1 -laion/kimi-k2-r2egym_sandboxes-maxeps-32k -laion/kimi-k2-swegym-tasks-maxeps-32k -laion/kimi-k2t-freelancer-32ep-32k -laion/kimi-k2t-neulab-synatra-32ep-131k -laion/minimax-m2-stack-overflow-32ep-131k-summtrc -laion/nl2bash-bugs-undr7030_Qwen3-8B -laion/nl2bash-bugsseq_Qwen3-8B -laion/nl2bash-verified-GLM-4.6-traces-32ep-32k-mgn1e3_Qwen3-8B -laion/nl2bash-verified-GLM-4.6-traces-32ep-32k-mgn1e5_Qwen3-8B -laion/nl2bash-verified-GLM-4.6-traces-32ep-32k-mgn5e4_Qwen3-8B -laion/open-thoughts-4-code-qwen3-32b-annotated -laion/qwen3-coder-480B-stack-overflow-32ep-131k-summtrc -laion/qwen3base-GLM-4_7-swesmith-sandboxes-with_tests-oracle_verified_120s-maxeps-131k -laion/r2egym-bugsseq -laion/r2egym-gpt5-codex-160ep-1M -laion/r2egym-nl2bash-bugsseq -laion/r2egym-nl2bash-stack-bugsseq-bash-withtests 
-laion/r2egym-nl2bash-stack-bugsseq-cpp -laion/r2egym-nl2bash-stack-bugsseq-fixthink -laion/r2egym-nl2bash-stack-bugsseq-fixthink-exercism-python -laion/r2egym-nl2bash-stack-bugsseq-fixthink-methods2test-v2 -laion/r2egym-nl2bash-stack-bugsseq-fixthink-stack-csharp -laion/r2egym-nl2bash-stack-bugsseq-junit -laion/r2egym-nl2bash-stack-bugsseq-pytest-v2 -laion/r2egym-nl2bash-stack-bugsseq-rl-stack-bash-withtests -laion/r2egym-nl2bash-stack-bugsseq-stack-php-v2 -laion/r2egym-nl2bash-stack-bugsseq_lr3e-5_exp_rpt_stack-php-v2_step20 -laion/r2egym-nl2bash-stackseq -laion/r2egym-stack-bugsseq -laion/rl_bs128-gs16-rloo-n-code-contests-900s-noreg-15 -laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_postmortem -laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink_lr1e-5_taco -laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink_lr3e-5_bigcodebench-v2 -laion/rl_r2egym-nl2bash-stack-bugsseq_lr3e-5_stack-php-v2 -laion/rl_rl-conf_24GP_base-yaml_mode-path_exp_tas_opti_comb_trac_trai-data_exp_rpt_unit-pyth-v3 -laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp_rpt_pyme-larg-90 -laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp_rpt_stac-dock-v2 -laion/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp_rpt_stac-self-larg-70 -laion/rl_rl-conf_24GP_base_noth-yaml_mode-path_r2eg-nl2b-stac-bugs_trai-data_exp_rpt_e2eg-larg-60 -laion/rl_rl-conf_24GP_base_noth-yaml_mode-path_r2eg-nl2b-stac-bugs_trai-data_exp_rpt_pyme-larg-60 -laion/rl_rl-conf_qwen_8b_ll_lr1e-5_bs64-yaml_mode-path_r2eg-nl2b-stac-bugs_trai-data_exp_rpt_cros-type -laion/rl_think_npfg-code-contests-900s-45 -laion/rl_tp4s64_8x_2skill -laion/rl_tp4s64_8x_exercism-python -laion/rl_tp4s64_8x_flat25_baseline -laion/rl_tp4s64_8x_github_issue -laion/rl_tp4s64_8x_heavy_padding -laion/rl_tp4s64_8x_minimal_instructions -laion/rl_tp4s64_8x_nemotron-cpp -laion/rl_tp4s64_8x_nemotron-junit -laion/rl_tp4s64_8x_proportional -laion/rl_tp4s64_8x_structural_debug 
-laion/rl_v1_tp4s64_8x_nemotron-junit -laion/stackexchange-tezos-sandboxes_glm_4_6_traces_locetash -laion/stackexchange-tezos-sandboxes_glm_4_6_traces_locetash_again -laion/stackexchange-tezos-sandboxes_glm_4_6_traces_together -laion/stackexchange-tezos-sandboxes_glm_4_6_traces_together_again -mlfoundations-dev/all-puzzles-sandboxes-traces-terminus-2 -mlfoundations-dev/code-contests-sandboxes-traces-terminus-2 -mlfoundations-dev/code-contests-sandboxes-traces-terminus-2_global-batch-size_16 -mlfoundations-dev/code-contests-sandboxes-traces-terminus-2_seed_43 -mlfoundations-dev/code-contests-sandboxes-traces-terminus-2_seed_44 -mlfoundations-dev/code-contests-sandboxes-traces-terminus-2_seed_45 -mlfoundations-dev/code-contests-sandboxes-traces-terminus-2_seed_46 -mlfoundations-dev/codeforces-sandboxes-traces-terminus-2 -mlfoundations-dev/defects4j-sandboxes-traces-terminus-2 -mlfoundations-dev/freelancer-projects-sandboxes-traces-terminus-2 -mlfoundations-dev/inferredbugs-sandboxes-traces-terminus-2 -mlfoundations-dev/nemo-prism-math-sandboxes-traces-terminus-2 -mlfoundations-dev/qasper-sandboxes-traces-terminus-2 -mlfoundations-dev/stackexchange-overflow-sandboxes-traces-terminus-2 -mlfoundations-dev/stackexchange-tezos-sandboxes-traces-terminus-2 -mlfoundations-dev/staqc-sandboxes-traces-terminus-2 -mlfoundations-dev/swesmith_with_plain_docker-sandboxes-traces-terminus-2 -mlfoundations-dev/taskmaster2-sandboxes-traces-terminus-2 -open-thoughts/OpenThinker-Agent-v1-SFT -penfever/GLM-4_6-codeforces-32ep-32k-restore-hp -penfever/GLM-4_6-gemini25flash-stackexchange-overflow-32ep-512k-fixeps -penfever/GLM-4_6-inferredbugs-32eps-65k-fixeps -penfever/freelancer-t1024s-32ep-restore-hp -penfever/freelancer-t2048s-32ep-restore-hp -penfever/freelancer-t512s-32ep-restore-hp -penfever/kimi-k2-swesmith_with_plain_docker-sandboxes-maxeps-32k -penfever/neulab-codeactinstruct-restore-hp -penfever/nl2bash-0-1k-traces-restore-hp -penfever/nl2bash-0-3k-traces-restore-hp 
-penfever/nl2bash-16ep-restore-hp -penfever/nl2bash-1ep-restore-hp -penfever/nl2bash-1k-traces-restore-hp -penfever/nl2bash-2ep-restore-hp -penfever/nl2bash-32ep-restore-hp -penfever/nl2bash-3k-traces-restore-hp -penfever/nl2bash-4ep-restore-hp -penfever/nl2bash-8ep-restore-hp -penfever/nl2bash-GLM-4_6-traces-newhparams -penfever/nl2bash-verified-GLM-4_6-traces-32ep-32k-dft -penfever/nl2bash-verified-GLM-4_6-traces-32ep-32k-restore-hp -penfever/nl2bash_gpt-5-nano-traces-8ep-restore-hp -penfever/nl2bash_verified_gpt-5-nano-traces-restore-hp -penfever/rl_bs128_gs16_ruby-30 -penfever/rl_rl-conf_20GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp_rpt_code-v2-25 -penfever/rl_rl-conf_20GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp_rpt_stac-pyte-v2-25 -penfever/rl_rl-conf_24GP_base-yaml_mode-path_r2eg-nl2b-stac-bugs-fixt_trai-data_exp_rpt_soft-v2-45 -penfever/selfinstruct-naive-sandboxes-2-traces-restore-hp -penfever/swesmith-2stage-restore-hp -penfever/taskmaster2-4ep-2stage-restore-hp diff --git a/eval/lists/lists/pyme_v3_40.txt b/eval/lists/lists/pyme_v3_40.txt deleted file mode 100644 index b67069dc..00000000 --- a/eval/lists/lists/pyme_v3_40.txt +++ /dev/null @@ -1 +0,0 @@ -laion/rl_rl-conf_24GP_base-yaml_mode-path_exp_tas_opti_comb_trac_trai-data_exp_rpt_pyme-v3-40 diff --git a/eval/lists/lists/qwen35_27b.txt b/eval/lists/lists/qwen35_27b.txt deleted file mode 100644 index 1e3117a0..00000000 --- a/eval/lists/lists/qwen35_27b.txt +++ /dev/null @@ -1 +0,0 @@ -Qwen/Qwen3.5-27B diff --git a/eval/lists/lists/qwen35_9b.txt b/eval/lists/lists/qwen35_9b.txt deleted file mode 100644 index 14ef350e..00000000 --- a/eval/lists/lists/qwen35_9b.txt +++ /dev/null @@ -1 +0,0 @@ -Qwen/Qwen3.5-9B diff --git a/eval/lists/lists/richard_base_model.txt b/eval/lists/lists/richard_base_model.txt deleted file mode 100644 index 497c9a05..00000000 --- a/eval/lists/lists/richard_base_model.txt +++ /dev/null @@ -1 +0,0 @@ -Qwen/Qwen3.5-9B \ No newline at end of file 
diff --git a/eval/lists/lists/richard_test_model.txt b/eval/lists/lists/richard_test_model.txt deleted file mode 100644 index bd8a4f31..00000000 --- a/eval/lists/lists/richard_test_model.txt +++ /dev/null @@ -1,2 +0,0 @@ -# laion/exp-psu-swesmith-3K_glm_4_7_traces_jupiter -laion/exp-psu-swesmith-31K_glm_4_7_traces_jupiter \ No newline at end of file diff --git a/eval/lists/lists/rope_step_batch.txt b/eval/lists/lists/rope_step_batch.txt deleted file mode 100644 index 2c54a448..00000000 --- a/eval/lists/lists/rope_step_batch.txt +++ /dev/null @@ -1,3 +0,0 @@ -laion/swesmith_8b_rope_65k-step37 -laion/r2egym_8b_rope_65k-step17 -laion/swesmith_8b-step35 diff --git a/eval/lists/lists/sera_14b.txt b/eval/lists/lists/sera_14b.txt deleted file mode 100644 index 8835118a..00000000 --- a/eval/lists/lists/sera_14b.txt +++ /dev/null @@ -1 +0,0 @@ -allenai/SERA-14B diff --git a/eval/lists/lists/swesmith_fixthink_45.txt b/eval/lists/lists/swesmith_fixthink_45.txt deleted file mode 100644 index c4287878..00000000 --- a/eval/lists/lists/swesmith_fixthink_45.txt +++ /dev/null @@ -1 +0,0 @@ -laion/rl_swesmith-fixthink-pymethods2test-45 diff --git a/eval/lists/lists/syh_32b.txt b/eval/lists/lists/syh_32b.txt deleted file mode 100644 index 492f1c93..00000000 --- a/eval/lists/lists/syh_32b.txt +++ /dev/null @@ -1 +0,0 @@ -laion/syh-r2eg-askl-glm_4-7_trac_jupi_-gfi-swes-rand-filt-10K_glm_4-7_trac_jupi_32B diff --git a/eval/lists/lists/tb2_richard_test_model.txt b/eval/lists/lists/tb2_richard_test_model.txt deleted file mode 100644 index 8eddee63..00000000 --- a/eval/lists/lists/tb2_richard_test_model.txt +++ /dev/null @@ -1 +0,0 @@ -laion/exp-psu-swesmith-10K_glm_4_7_traces_jupiter \ No newline at end of file diff --git a/eval/lists/lists/v2_richard_test_model.txt b/eval/lists/lists/v2_richard_test_model.txt deleted file mode 100644 index 83e4b4f6..00000000 --- a/eval/lists/lists/v2_richard_test_model.txt +++ /dev/null @@ -1 +0,0 @@ 
-laion/rl_r2egym-nl2bash-stack-bugsseq-fixthink-again_lr1e-5_pr \ No newline at end of file From 994b555269ea36ccfa16ba0c770246b8c0c85608 Mon Sep 17 00:00:00 2001 From: richardzhuang0412 Date: Sun, 5 Apr 2026 20:31:36 +0000 Subject: [PATCH 3/3] Address PR review feedback from Gemini - Fix check_progress.py: correct REPO_DIR (parent.parent not parent^3), update LOGS_DIR to eval/local/logs, add --logs-dir CLI arg, fix unclosed file handle with `with` statement - Fix snapshot_download.py: use Jupiter version with fcntl file locking to prevent race conditions during concurrent downloads - Fix test_dp_eval.sh: update sbatch path from legacy eval/MBZ/ to eval/unified_eval_harbor.sbatch - Fix V6_MIGRATION.md: update all eval/MBZ/ paths to new eval/ locations - Revert eval/jupiter/dcagent_eval_config.yaml to original (legacy file, canonical configs are now in eval/configs/) - Remove duplicate eval/jupiter/V6_MIGRATION.md (kept in eval/docs/) Co-Authored-By: Claude Opus 4.6 (1M context) --- eval/check_progress.py | 13 ++- eval/docs/V6_MIGRATION.md | 12 +-- eval/jupiter/V6_MIGRATION.md | 111 -------------------------- eval/jupiter/dcagent_eval_config.yaml | 2 +- eval/snapshot_download.py | 45 +++++++---- eval/test_dp_eval.sh | 8 +- 6 files changed, 51 insertions(+), 140 deletions(-) delete mode 100644 eval/jupiter/V6_MIGRATION.md diff --git a/eval/check_progress.py b/eval/check_progress.py index 61abb0a6..7bdf2cc3 100644 --- a/eval/check_progress.py +++ b/eval/check_progress.py @@ -19,8 +19,8 @@ from datetime import datetime from pathlib import Path -REPO_DIR = Path(__file__).resolve().parent.parent.parent -LOGS_DIR = REPO_DIR / "eval" / "MBZ" / "logs" +REPO_DIR = Path(__file__).resolve().parent.parent +LOGS_DIR = REPO_DIR / "eval" / "local" / "logs" # override with --logs-dir DEFAULT_JOBS_DIR = REPO_DIR / "jobs" @@ -92,7 +92,8 @@ def get_progress_single(run_tag, jobs_dir): if not rf.exists(): return None, None, None, None, None, None try: - d = json.load(open(rf)) + with 
open(rf) as f: + d = json.load(f) completed = d.get("stats", {}).get("n_trials", None) total = d.get("n_total_trials", None) finished = d.get("finished_at") is not None @@ -604,11 +605,17 @@ def parse_args(): "--jobs-dir", type=Path, default=DEFAULT_JOBS_DIR, help=f"Path to eval jobs directory (default: {DEFAULT_JOBS_DIR})", ) + parser.add_argument( + "--logs-dir", type=Path, default=LOGS_DIR, + help=f"Path to eval logs directory (default: {LOGS_DIR})", + ) return parser.parse_args() def main(): args = parse_args() + global LOGS_DIR + LOGS_DIR = args.logs_dir if args.live: interval = max(3, args.interval) diff --git a/eval/docs/V6_MIGRATION.md b/eval/docs/V6_MIGRATION.md index fe1f2e63..ea94e398 100644 --- a/eval/docs/V6_MIGRATION.md +++ b/eval/docs/V6_MIGRATION.md @@ -2,12 +2,12 @@ ## What's already done (in this repo, will arrive via `git pull`) -1. **`eval/clusters/jupiter.yaml`** — updated sbatch paths to shared `eval/MBZ/unified_eval_harbor.sbatch` +1. **`eval/clusters/jupiter.yaml`** — updated sbatch paths to shared `eval/unified_eval_harbor.sbatch` 2. **`eval/jupiter/dcagent_eval_config.yaml`** — updated `jobs_dir` to `zhuang1_eval_jobs` 3. **`eval/jupiter/dcagent_eval_config_no_override.yaml`** — created (swebench/tb2 variant) -4. **`eval/MBZ/unified_eval_harbor.sbatch`** — cluster-agnostic v6 sbatch (shared across clusters) -5. **`eval/MBZ/unified_eval_harbor_dp.sbatch`** — cluster-agnostic DP sbatch -6. **`eval/MBZ/unified_eval_listener_v6.py`** — shared v6 listener +4. **`eval/unified_eval_harbor.sbatch`** — cluster-agnostic v6 sbatch (shared across clusters) +5. **`eval/unified_eval_harbor_dp.sbatch`** — cluster-agnostic DP sbatch +6. **`eval/unified_eval_listener.py`** — shared v6 listener 7. **`eval/baseline_model_configs.yaml`** — shared model configs ## Steps to run on Jupiter @@ -56,7 +56,7 @@ echo "HF_TOKEN: ${HF_TOKEN:0:8}..." ### 7. 
Dry-run ```bash -source ~/secrets.env && python eval/MBZ/unified_eval_listener_v6.py \ +source ~/secrets.env && python eval/unified_eval_listener.py \ --cluster-config eval/clusters/jupiter.yaml \ --preset v2 \ --priority-file eval/MBZ/lists/a1_retrained.txt \ @@ -71,7 +71,7 @@ source ~/secrets.env && python eval/MBZ/unified_eval_listener_v6.py \ ### 8. Real run (example) ```bash -source ~/secrets.env && python eval/MBZ/unified_eval_listener_v6.py \ +source ~/secrets.env && python eval/unified_eval_listener.py \ --cluster-config eval/clusters/jupiter.yaml \ --preset swebench \ --priority-file eval/MBZ/lists/no_eval_models_latest.txt \ diff --git a/eval/jupiter/V6_MIGRATION.md b/eval/jupiter/V6_MIGRATION.md deleted file mode 100644 index fe1f2e63..00000000 --- a/eval/jupiter/V6_MIGRATION.md +++ /dev/null @@ -1,111 +0,0 @@ -# Jupiter v6 Listener Migration - -## What's already done (in this repo, will arrive via `git pull`) - -1. **`eval/clusters/jupiter.yaml`** — updated sbatch paths to shared `eval/MBZ/unified_eval_harbor.sbatch` -2. **`eval/jupiter/dcagent_eval_config.yaml`** — updated `jobs_dir` to `zhuang1_eval_jobs` -3. **`eval/jupiter/dcagent_eval_config_no_override.yaml`** — created (swebench/tb2 variant) -4. **`eval/MBZ/unified_eval_harbor.sbatch`** — cluster-agnostic v6 sbatch (shared across clusters) -5. **`eval/MBZ/unified_eval_harbor_dp.sbatch`** — cluster-agnostic DP sbatch -6. **`eval/MBZ/unified_eval_listener_v6.py`** — shared v6 listener -7. **`eval/baseline_model_configs.yaml`** — shared model configs - -## Steps to run on Jupiter - -### 1. Pull latest code -```bash -source ~/.bashrc; conda activate otagent -cd /e/scratch/jureap59/zhuang1/OpenThoughts-Agent -GIT_TERMINAL_PROMPT=0 git pull -``` - -### 2. Pin harbor to known-good commit -```bash -cd /e/scratch/jureap59/feuer1/harbor -git fetch && git checkout 6fdb92e7f5707c2b01214933f1622771784e6f67 -# Reinstall in your conda env -pip install -e . -``` - -### 3. 
Install hf_transfer -```bash -pip install hf_transfer -``` - -### 4. Create jobs dir (if it doesn't exist) -```bash -mkdir -p /e/data1/datasets/playground/mmlaion/shared/zhuang1_eval_jobs -mkdir -p eval/jupiter/logs -``` - -### 5. Pre-download datasets -```bash -source ~/secrets.env -python eval/jupiter/snapshot_download.py DCAgent/dev_set_v2 -python eval/jupiter/snapshot_download.py DCAgent2/terminal_bench_2 -python eval/jupiter/snapshot_download.py DCAgent2/swebench-verified-random-100-folders -``` - -### 6. Verify secrets.env has all required keys -```bash -source ~/secrets.env -echo "DAYTONA_API_KEY: ${DAYTONA_API_KEY:0:12}..." -echo "SUPABASE_URL: ${SUPABASE_URL:0:20}..." -echo "HF_TOKEN: ${HF_TOKEN:0:8}..." -``` - -### 7. Dry-run -```bash -source ~/secrets.env && python eval/MBZ/unified_eval_listener_v6.py \ - --cluster-config eval/clusters/jupiter.yaml \ - --preset v2 \ - --priority-file eval/MBZ/lists/a1_retrained.txt \ - --baseline-model-config eval/baseline_model_configs.yaml \ - --timeout-multiplier 2.0 \ - --tp-size 2 \ - --enable-thinking \ - --slurm-time 12:00:00 \ - --max-jobs-submitted 32 \ - --dry-run --once --verbose -``` - -### 8. 
Real run (example) -```bash -source ~/secrets.env && python eval/MBZ/unified_eval_listener_v6.py \ - --cluster-config eval/clusters/jupiter.yaml \ - --preset swebench \ - --priority-file eval/MBZ/lists/no_eval_models_latest.txt \ - --baseline-model-config eval/baseline_model_configs.yaml \ - --timeout-multiplier 2.0 \ - --tp-size 2 \ - --enable-thinking \ - --slurm-time 12:00:00 \ - --max-jobs-submitted 32 \ - --pack-jobs \ - --stagger-delay 1 --chain-batch-size 10 \ - --no-disk-resume \ - --once -``` - -## Key differences from M2 - -| Setting | M2 | Jupiter | -|---------|-----|---------| -| Partition | `main` | `booster` | -| Account | (none) | `reformo` | -| Time limit | 24:00:00 | 12:00:00 | -| GPUs/node | 8 | 4 | -| Arch | x86_64 | aarch64 (GH200) | -| Internet on compute | yes | **no** (proxy required) | -| Conda env | otagent/otagent2 | otagent/otagent2 (different paths) | -| Harbor | local install | feuer1's shared install | -| HF cache | `~/.cache/huggingface/hub` | `/e/data1/datasets/playground/ot/hf_hub` | -| Jobs dir | `$PWD/jobs` | `/e/data1/.../zhuang1_eval_jobs` | -| Pre-download | optional (has internet) | **required** (no internet on compute) | - -## Proxy note - -Jupiter compute nodes have no internet. 
The v6 sbatch auto-detects proxy settings from `jupiter.yaml`: -- Uses proxychains for HF downloads on compute -- SSH tunnel via `jpbl-s01-02` login node -- `--pre-download` flag on listener pre-downloads models on login node before submission diff --git a/eval/jupiter/dcagent_eval_config.yaml b/eval/jupiter/dcagent_eval_config.yaml index 1f729080..149e3607 100644 --- a/eval/jupiter/dcagent_eval_config.yaml +++ b/eval/jupiter/dcagent_eval_config.yaml @@ -1,4 +1,4 @@ -jobs_dir: /e/data1/datasets/playground/mmlaion/shared/zhuang1_eval_jobs +jobs_dir: /e/data1/datasets/playground/mmlaion/shared/guha1/eval_jobs n_attempts: 3 timeout_multiplier: 1.0 orchestrator: diff --git a/eval/snapshot_download.py b/eval/snapshot_download.py index fcd402d9..882c1f85 100644 --- a/eval/snapshot_download.py +++ b/eval/snapshot_download.py @@ -1,6 +1,8 @@ import os import sys import argparse +import fcntl +import time from huggingface_hub import snapshot_download def is_valid_task_dir(path): @@ -112,21 +114,34 @@ def main(): path = None if args.local_dir: - # When --local-dir is specified, download real files (no symlinks) - # Check if local_dir already has valid task dirs - if os.path.isdir(args.local_dir): - task_dirs = [d for d in os.listdir(args.local_dir) - if is_valid_task_dir(os.path.join(args.local_dir, d))] - if task_dirs: - print(f"Found existing dataset at {args.local_dir} with {len(task_dirs)} tasks") - path = args.local_dir - if not path: - print("Downloading dataset to local dir (real files, no symlinks)...", file=sys.stderr) - path = download_sandboxes_dataset( - repo_id=args.repo_id, - local_dir=args.local_dir, - cache_dir=args.cache_dir - ) + # When --local-dir is specified, download real files (no symlinks). + # Use a file lock to prevent race conditions when multiple SLURM jobs + # download the same dataset concurrently. 
+ lock_path = args.local_dir.rstrip("/") + ".lock" + os.makedirs(os.path.dirname(lock_path) or ".", exist_ok=True) + lock_fd = open(lock_path, "w") + try: + print(f"Acquiring dataset lock: {lock_path}", file=sys.stderr) + fcntl.flock(lock_fd, fcntl.LOCK_EX) + print("Lock acquired.", file=sys.stderr) + + # Check if local_dir already has valid task dirs + if os.path.isdir(args.local_dir): + task_dirs = [d for d in os.listdir(args.local_dir) + if is_valid_task_dir(os.path.join(args.local_dir, d))] + if task_dirs: + print(f"Found existing dataset at {args.local_dir} with {len(task_dirs)} tasks") + path = args.local_dir + if not path: + print("Downloading dataset to local dir (real files, no symlinks)...", file=sys.stderr) + path = download_sandboxes_dataset( + repo_id=args.repo_id, + local_dir=args.local_dir, + cache_dir=args.cache_dir + ) + finally: + fcntl.flock(lock_fd, fcntl.LOCK_UN) + lock_fd.close() else: # First try to get existing cached path path = get_dataset_path(args.repo_id) diff --git a/eval/test_dp_eval.sh b/eval/test_dp_eval.sh index 28ec1d71..e9de47c7 100644 --- a/eval/test_dp_eval.sh +++ b/eval/test_dp_eval.sh @@ -54,11 +54,11 @@ sbatch \ --gres gpu:4 \ --cpus-per-task=32 \ --job-name data_dp_test \ - --output eval/MBZ/logs/data_dp_test_%j.out \ - eval/MBZ/unified_eval_harbor_v6.sbatch \ + --output eval/local/logs/data_dp_test_%j.out \ + eval/unified_eval_harbor.sbatch \ "$MODEL" "$DATASET" "$BENCHMARK_ID" "" echo "Submitted! Monitor with:" echo " squeue -u \$USER" -echo " tail -f eval/MBZ/logs/data_dp_test_*.out" -echo " tail -f experiments/logs/vllm_*.log" +echo " tail -f eval/local/logs/data_dp_test_*.out" +echo " tail -f eval/local/logs/vllm_*.log"