Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -410,3 +410,17 @@ train/hpc/dotenv/secret.env
# Claude
CLAUDE.md
development_progress.md

# Eval local runtime data (logs, tracking, job output — per-cluster)
eval/local/
eval/MBZ/logs/
eval/MBZ/tracking/
eval/MBZ/__pycache__/
eval/jupiter/logs/
eval/*/logs/
jobs/

# Secrets (never commit)
secret.env
secrets.env
*.env.local
12 changes: 9 additions & 3 deletions database/unified_db/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2015,7 +2015,8 @@ def register_sandbox_job(
stats: Optional[Dict[str, Any]] = None,
forced_update: bool = True,
hf_traces_link: Optional[str] = None,
job_status: Optional[str] = None
job_status: Optional[str] = None,
is_overlong: bool = False,
) -> Dict[str, Any]:
"""
Register a sandbox job with minimal auto-filling.
Expand Down Expand Up @@ -2055,7 +2056,8 @@ def register_sandbox_job(
"benchmark_id": benchmark_id,
"n_rep_eval": n_rep_eval,
"hf_traces_link": hf_traces_link,
"job_status": job_status
"job_status": job_status,
"is_overlong": is_overlong,
}

# Include job_id if provided (preserves local ID from result.json)
Expand Down Expand Up @@ -3447,6 +3449,7 @@ def upload_job_and_trial_records(
register_benchmark: bool = False,
hf_dataset_url: Optional[str] = None,
forced_update: bool = False,
is_overlong: bool = False,
) -> Dict[str, Any]:
"""
Upload job and trial records to database (with optional HF dataset URL for trials).
Expand Down Expand Up @@ -3858,7 +3861,7 @@ def upload_job_and_trial_records(
job_metadata["hf_traces_link"] = hf_dataset_url
job_metadata["job_status"] = "Finished"

job_record = register_sandbox_job(**job_metadata, forced_update=forced_update)
job_record = register_sandbox_job(**job_metadata, forced_update=forced_update, is_overlong=is_overlong)

if not job_record.get("success"):
raise Exception(f"Job registration failed: {job_record.get('error')}")
Expand Down Expand Up @@ -4141,6 +4144,7 @@ def upload_traces_to_hf(
verbose=verbose,
success_filter=success_filter,
include_verifier_output=include_verifier_output,
export_subagents=export_subagents,
)
logger.info(f"Extracted {len(dataset)} conversation rows from trials")
except Exception as e:
Expand Down Expand Up @@ -4301,6 +4305,7 @@ def upload_eval_results(
hf_verbose: bool = False,
hf_export_subagents: bool = False,
forced_update: bool = False,
is_overlong: bool = False,
) -> Dict[str, Any]:
"""
Upload evaluation results from a job directory to HuggingFace and database.
Expand Down Expand Up @@ -4434,6 +4439,7 @@ def upload_eval_results(
register_benchmark=register_benchmark,
hf_dataset_url=hf_dataset_url, # Will be None if HF upload failed
forced_update=forced_update,
is_overlong=is_overlong,
)

# Add HF-related information to result
Expand Down
103 changes: 68 additions & 35 deletions eval/baseline_model_configs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,56 +9,89 @@
# The eval script will default to the node's GPU count (e.g. TP=4 on Polaris A100-40GB).
# Only set it here if a model requires a specific TP size.
#
# Pattern configs (under "patterns") are matched when no exact model name is found.
# Each pattern entry has a "match" regex and the same config fields as a model entry.
# Format:
# models: Per-model overrides (exact HF name -> config).
# groups: Apply one config to many models. Each entry has "models" (list) + config fields.
# Per-model entries in "models:" are merged on top of group config (override wins).
# patterns: Regex fallback when no exact/group match. First match wins.

models:
"NovaSky-AI/SA-SWE-32B":
# --- Groups: many models sharing the same config ---
groups:
- models:
- "nvidia/Nemotron-Terminal-32B"
- "DCAgent/staqc-sandboxes-traces-terminus-2_Qwen3-32B"
- "NovaSky-AI/SA-SWE-32B"
- "Qwen/Qwen2.5-Coder-32B-Instruct"
- "Qwen/Qwen3-32B"
- "R2E-Gym/R2EGym-32B-Agent"
- "SWE-Swiss/SWE-Swiss-32B"
- "SWE-bench/SWE-agent-LM-32B"
- "Skywork/Skywork-SWE-32B"
- "allenai/SERA-32B"
- "laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_Qwen3-32B"
- "laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_adam-beta1_0-91_Qwen3-32B"
- "laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_adam-beta1_0-93_Qwen3-32B"
- "laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_adam-beta1_0-95_Qwen3-32B"
- "laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_global-batch-size_32_Qwen3-32B"
- "laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_global-batch-size_64_Qwen3-32B"
- "laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_learning-rate_1e-06_Qwen3-32B"
- "laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_lr_1e-5_Qwen3-32B"
- "laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_num-train-epochs_4.0_Qwen3-32B"
- "laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_num-train-epochs_6.0_Qwen3-32B"
- "laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_num-train-epochs_7.0_Qwen3-32B"
- "laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_num-train-epochs_8-0_Qwen3-32B"
- "laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_warmup-ratio_0-01_Qwen3-32B"
- "laion/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning_warmup-ratio_0-05_Qwen3-32B"
- "laion/Qwen3-32B-NL2Bash-31step"
- "laion/Qwen3-32B-R2EGYM-256-3epochs"
- "laion/Qwen3-32B-SweSmith-20step"
- "laion/open-thoughts-4-code-qwen3-32b-annotated"
- "laion/rl__40GPU_base_32b__exp_rpt_codeelo-v2__sft_GLM-4-7-swesmith"
- "laion/rl__40GPU_base_32b__exp_rpt_nemotron-bash__sft_GLM-4-7-swesmith"
- "laion/sft_GLM-4-7-swesmith-sandboxes-with_tests-oracle_verified_120s-maxeps-131k_Qwen3-32B"
- "laion/sft_r2egym-nl2bash-stackoverflow-inferredbugs-32B_Qwen3-32B"
- "laion/syh-r2eg-askl-glm_4-7_trac_jupi_-gfi-swes-rand-filt-10K_glm_4-7_trac_jupi_32B"
- "laion/rl__48GPU_shaped_32b__swe_rebench_patched_oracle__Qwen3-32B-45"
tensor_parallel_size: 4
max_model_len: 32768
swap_space: 32
trust_remote_code: true
tool_call_parser: hermes
reasoning_parser: qwen3
extra_args: "--dtype bfloat16 --block-size 16 --enable-chunked-prefill --max-num-partial-prefills 1 --enable-prefix-caching"

"Qwen/Qwen2.5-Coder-32B-Instruct":
max_model_len: 32768
swap_space: 32
trust_remote_code: true
tool_call_parser: hermes
extra_args: "--dtype bfloat16 --block-size 16 --enable-chunked-prefill --max-num-partial-prefills 1 --enable-prefix-caching"
# --- Per-model overrides (merged on top of group config) ---
models:
# Adds reasoning_parser on top of the group defaults
"NovaSky-AI/SA-SWE-32B":
reasoning_parser: qwen3

"R2E-Gym/R2EGym-32B-Agent":
max_model_len: 32768
swap_space: 32
trust_remote_code: true
tool_call_parser: hermes
extra_args: "--dtype bfloat16 --block-size 16 --enable-chunked-prefill --max-num-partial-prefills 1 --enable-prefix-caching"
# Override: lower swap_space (group default is 32)
"allenai/SERA-32B":
swap_space: 12

"SWE-Swiss/SWE-Swiss-32B":
max_model_len: 32768
swap_space: 32
# Pattern-based configs: matched by regex when no exact/group match is found.
# Checked in order; first match wins.
patterns:
- match: "(?i)qwen3\\.5"
conda_env: otagent2
trust_remote_code: true
tool_call_parser: hermes
extra_args: "--dtype bfloat16 --block-size 16 --enable-chunked-prefill --max-num-partial-prefills 1 --enable-prefix-caching"

"Skywork/Skywork-SWE-32B":
tensor_parallel_size: 4
max_model_len: 32768
swap_space: 32
trust_remote_code: true
tool_call_parser: hermes
extra_args: "--dtype bfloat16 --block-size 16 --enable-chunked-prefill --max-num-partial-prefills 1 --enable-prefix-caching"
extra_args: "--enforce-eager"

"allenai/SERA-32B":
max_model_len: 32768
- match: "(?i)(?:32b.*(131k|-lc)|(131k|-lc).*32b)"
tensor_parallel_size: 4
max_model_len: 131072
swap_space: 12
tool_call_parser: hermes
extra_args: "--dtype bfloat16 --block-size 16 --enable-chunked-prefill --max-num-partial-prefills 1 --enable-prefix-caching"
hf_overrides: '{"rope_scaling":{"rope_type":"yarn","factor":4.0,"original_max_position_embeddings":32768}}'

- match: "(?i)32[Bb]"
tensor_parallel_size: 4
max_model_len: 32768
swap_space: 32

# Pattern-based configs: matched by regex when no exact model name is found.
# Checked in order; first match wins.
patterns:
- match: "131k|-lc$"
max_model_len: 131072
swap_space: 12
extra_args: "--hf-overrides '{\"rope_scaling\":{\"rope_type\":\"yarn\",\"factor\":4.0,\"original_max_position_embeddings\":32768}}'"
hf_overrides: '{"rope_scaling":{"rope_type":"yarn","factor":4.0,"original_max_position_embeddings":32768}}'
11 changes: 9 additions & 2 deletions eval/build_vllm_cmd.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
# EVAL_VLLM_REASONING_PARSER (default: unset)
# EVAL_VLLM_DATA_PARALLEL_SIZE (default: unset; vLLM v0.8+ only)
# EVAL_VLLM_EXTRA_ARGS (default: unset; space-separated string)
# EVAL_VLLM_HF_OVERRIDES (default: unset; JSON string for --hf-overrides)
# ==============================================================================

build_vllm_cmd() {
Expand All @@ -31,18 +32,19 @@ build_vllm_cmd() {
# Read overrides from env (set by listener via sbatch --export)
local tp="${EVAL_VLLM_TENSOR_PARALLEL_SIZE:-4}"
local dp="${EVAL_VLLM_DATA_PARALLEL_SIZE:-}"
local max_model_len="${EVAL_VLLM_MAX_MODEL_LEN:-}"
local max_model_len="${EVAL_VLLM_MAX_MODEL_LEN:-32768}"
local swap_space="${EVAL_VLLM_SWAP_SPACE:-4}"
local trust_remote_code="${EVAL_VLLM_TRUST_REMOTE_CODE:-}"
local tool_call_parser="${EVAL_VLLM_TOOL_CALL_PARSER:-}"
local reasoning_parser="${EVAL_VLLM_REASONING_PARSER:-}"
local extra_args="${EVAL_VLLM_EXTRA_ARGS:-}"
local hf_overrides="${EVAL_VLLM_HF_OVERRIDES:-}"

# Build command array
VLLM_CMD=(
"$python_bin" -m vllm.entrypoints.openai.api_server
--model "$model"
--host 0.0.0.0 --port 8000
--host 0.0.0.0 --port "${VLLM_PORT:-8000}"
--served-model-name "$model"
--tensor-parallel-size "$tp"
--gpu-memory-utilization "$gpu_mem_util"
Expand Down Expand Up @@ -70,6 +72,11 @@ build_vllm_cmd() {
VLLM_CMD+=(--reasoning-parser "$reasoning_parser")
fi

# HF model config overrides (JSON string, properly quoted)
if [ -n "$hf_overrides" ]; then
VLLM_CMD+=(--hf-overrides "$hf_overrides")
fi

# Append extra args (space-separated string)
if [ -n "$extra_args" ]; then
# shellcheck disable=SC2206
Expand Down
Loading