Skip to content

Commit 513cbd8

Browse files
authored
🔨 chore: update price info (#188)
1 parent 7ed3905 commit 513cbd8

3 files changed

Lines changed: 26 additions & 10 deletions

File tree

src/aggregators/aggregate_results.py

Lines changed: 16 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -131,10 +131,11 @@ def check_completeness_and_validity(
131131
if task not in run_results:
132132
missing_tasks.append(task)
133133
else:
134-
# Check for retryable errors
134+
# Check for retryable errors only if the task did not succeed
135135
meta = run_results[task]
136+
success = bool(meta.get("execution_result", {}).get("success", False))
136137
error_msg = meta.get("execution_result", {}).get("error_message", "")
137-
if error_msg and is_retryable_error(error_msg):
138+
if (not success) and error_msg and is_retryable_error(error_msg):
138139
invalid_tasks.append(f"{task}: {error_msg[:50]}...")
139140

140141
if missing_tasks:
@@ -188,6 +189,9 @@ def get_token_counts(meta: Dict[str, Any]) -> Tuple[int, int, int]:
188189
actual_model_name: Optional[str] = None
189190
# If cost info is not present in metas, leave as None
190191
per_run_cost: Optional[float] = None
192+
# Model-level flags (to be inferred from meta.json)
193+
is_open_source_model: Optional[bool] = None
194+
is_reasoning_model: Optional[bool] = None
191195

192196
# For pass@1 per-run statistics across all services
193197
pass1_rates_per_run_overall: List[float] = []
@@ -239,6 +243,12 @@ def get_token_counts(meta: Dict[str, Any]) -> Tuple[int, int, int]:
239243
if isinstance(possible_cost, (int, float)):
240244
per_run_cost = float(possible_cost)
241245

246+
# capture model flags if present
247+
if is_open_source_model is None and "is_open_source_model" in meta:
248+
is_open_source_model = bool(meta.get("is_open_source_model"))
249+
if is_reasoning_model is None and "is_reasoning_model" in meta:
250+
is_reasoning_model = bool(meta.get("is_reasoning_model"))
251+
242252
pass1_rates_per_run_overall.append(round(successes_this_run / total_tasks, 6))
243253

244254
# Compute pass@k and pass^k across tasks (overall)
@@ -303,6 +313,8 @@ def get_token_counts(meta: Dict[str, Any]) -> Tuple[int, int, int]:
303313
"per_run_output_tokens": per_run_output_tokens,
304314
"per_run_cost": computed_per_run_cost if computed_per_run_cost is not None else (per_run_cost if per_run_cost is not None else None),
305315
"actual_model_name": actual_model_name or "",
316+
"is_open_source_model": (is_open_source_model if is_open_source_model is not None else False),
317+
"is_reasoning_model": (is_reasoning_model if is_reasoning_model is not None else False),
306318
"pass@1": {
307319
"avg": round(avg_pass1, 4),
308320
"std": round(std_pass1, 4),
@@ -414,6 +426,8 @@ def get_token_counts(meta: Dict[str, Any]) -> Tuple[int, int, int]:
414426
"per_run_output_tokens": s_per_run_output_tokens,
415427
"per_run_cost": s_computed_per_run_cost if s_computed_per_run_cost is not None else (per_run_cost if per_run_cost is not None else None),
416428
"actual_model_name": actual_model_name or "",
429+
"is_open_source_model": (is_open_source_model if is_open_source_model is not None else False),
430+
"is_reasoning_model": (is_reasoning_model if is_reasoning_model is not None else False),
417431
"pass@1": {
418432
"avg": round(s_mean, 4),
419433
"std": round(s_std, 4),

src/aggregators/pricing.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -40,13 +40,15 @@
4040

4141
# Qwen
4242
"qwen3-coder-480b-a35b-instruct": {"input": 0.2, "output": 0.8},
43+
"qwen3-max-preview": {"input": 1.2, "output": 6},
4344

4445
# Xai
4546
"grok-4-0709": {"input": 3.0, "output": 15.0},
4647
"grok-code-fast-1": {"input": 0.2, "output": 1.5},
4748

4849
# Moonshot
4950
"kimi-k2-0711-preview": {"input": 0.6, "output": 2.5},
51+
"kimi-k2-0905-preview": {"input": 0.6, "output": 2.5},
5052
}
5153

5254

src/model_config.py

Lines changed: 8 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -119,15 +119,15 @@ class ModelConfig:
119119
"litellm_input_model_name": "gemini/gemini-2.5-flash",
120120
},
121121
# Moonshot models
122-
"k2": {
122+
"kimi-k2-0711": {
123123
"provider": "moonshot",
124124
"api_key_var": "MOONSHOT_API_KEY",
125125
"litellm_input_model_name": "moonshot/kimi-k2-0711-preview",
126126
},
127-
"k2-turbo": {
127+
"kimi-k2-0905": {
128128
"provider": "moonshot",
129129
"api_key_var": "MOONSHOT_API_KEY",
130-
"litellm_input_model_name": "moonshot/kimi-k2-turbo-preview",
130+
"litellm_input_model_name": "moonshot/kimi-k2-0905-preview",
131131
},
132132
# Grok models
133133
"grok-4": {
@@ -141,16 +141,16 @@ class ModelConfig:
141141
"litellm_input_model_name": "xai/grok-code-fast-1",
142142
},
143143
# Qwen models
144-
"qwen-3-coder": {
145-
"provider": "qwen",
146-
"api_key_var": "OPENROUTER_API_KEY",
147-
"litellm_input_model_name": "openrouter/qwen/qwen3-coder",
148-
},
149144
"qwen-3-coder-plus": {
150145
"provider": "qwen",
151146
"api_key_var": "DASHSCOPE_API_KEY",
152147
"litellm_input_model_name": "dashscope/qwen3-coder-plus",
153148
},
149+
"qwen-3-max": {
150+
"provider": "qwen",
151+
"api_key_var": "DASHSCOPE_API_KEY",
152+
"litellm_input_model_name": "dashscope/qwen3-max-preview",
153+
},
154154
# Zhipu
155155
"glm-4.5": {
156156
"provider": "zhipu",

0 commit comments

Comments (0)