🔨 chore: update price info (#188)

zjwu0522 · web-flow · commit 513cbd8a9ff5 · 2025-09-09T21:51:42.000+08:00
diff --git a/src/aggregators/aggregate_results.py b/src/aggregators/aggregate_results.py
@@ -131,10 +131,11 @@ def check_completeness_and_validity(
                     if task not in run_results:
                         missing_tasks.append(task)
                     else:
-                        # Check for retryable errors
+                        # Check for retryable errors only if the task did not succeed
                         meta = run_results[task]
+                        success = bool(meta.get("execution_result", {}).get("success", False))
                         error_msg = meta.get("execution_result", {}).get("error_message", "")
-                        if error_msg and is_retryable_error(error_msg):
+                        if (not success) and error_msg and is_retryable_error(error_msg):
                             invalid_tasks.append(f"{task}: {error_msg[:50]}...")
                 
                 if missing_tasks:
@@ -188,6 +189,9 @@ def get_token_counts(meta: Dict[str, Any]) -> Tuple[int, int, int]:
         actual_model_name: Optional[str] = None
         # If cost info is not present in metas, leave as None
         per_run_cost: Optional[float] = None
+        # Model-level flags (to be inferred from meta.json)
+        is_open_source_model: Optional[bool] = None
+        is_reasoning_model: Optional[bool] = None
 
         # For pass@1 per-run statistics across all services
         pass1_rates_per_run_overall: List[float] = []
@@ -239,6 +243,12 @@ def get_token_counts(meta: Dict[str, Any]) -> Tuple[int, int, int]:
                         if isinstance(possible_cost, (int, float)):
                             per_run_cost = float(possible_cost)
 
+                    # Capture each model flag from the first meta that contains it; later metas are ignored
+                    if is_open_source_model is None and "is_open_source_model" in meta:
+                        is_open_source_model = bool(meta.get("is_open_source_model"))
+                    if is_reasoning_model is None and "is_reasoning_model" in meta:
+                        is_reasoning_model = bool(meta.get("is_reasoning_model"))
+
             pass1_rates_per_run_overall.append(round(successes_this_run / total_tasks, 6))
 
         # Compute pass@k and pass^k across tasks (overall)
@@ -303,6 +313,8 @@ def get_token_counts(meta: Dict[str, Any]) -> Tuple[int, int, int]:
             "per_run_output_tokens": per_run_output_tokens,
             "per_run_cost": computed_per_run_cost if computed_per_run_cost is not None else (per_run_cost if per_run_cost is not None else None),
             "actual_model_name": actual_model_name or "",
+            "is_open_source_model": (is_open_source_model if is_open_source_model is not None else False),
+            "is_reasoning_model": (is_reasoning_model if is_reasoning_model is not None else False),
             "pass@1": {
                 "avg": round(avg_pass1, 4),
                 "std": round(std_pass1, 4),
@@ -414,6 +426,8 @@ def get_token_counts(meta: Dict[str, Any]) -> Tuple[int, int, int]:
                 "per_run_output_tokens": s_per_run_output_tokens,
                 "per_run_cost": s_computed_per_run_cost if s_computed_per_run_cost is not None else (per_run_cost if per_run_cost is not None else None),
                 "actual_model_name": actual_model_name or "",
+                "is_open_source_model": (is_open_source_model if is_open_source_model is not None else False),
+                "is_reasoning_model": (is_reasoning_model if is_reasoning_model is not None else False),
                 "pass@1": {
                     "avg": round(s_mean, 4),
                     "std": round(s_std, 4),
diff --git a/src/aggregators/pricing.py b/src/aggregators/pricing.py
@@ -40,13 +40,15 @@
 
     # Qwen
     "qwen3-coder-480b-a35b-instruct": {"input": 0.2, "output": 0.8},
+    "qwen3-max-preview": {"input": 1.2, "output": 6},
     
     # Xai
     "grok-4-0709": {"input": 3.0, "output": 15.0},
     "grok-code-fast-1": {"input": 0.2, "output": 1.5},
 
     # Moonshot
     "kimi-k2-0711-preview": {"input": 0.6, "output": 2.5},
+    "kimi-k2-0905-preview": {"input": 0.6, "output": 2.5},
 }
 
 
diff --git a/src/model_config.py b/src/model_config.py
@@ -119,15 +119,15 @@ class ModelConfig:
             "litellm_input_model_name": "gemini/gemini-2.5-flash",
         },
         # Moonshot models
-        "k2": {
+        "kimi-k2-0711": {
             "provider": "moonshot",
             "api_key_var": "MOONSHOT_API_KEY",
             "litellm_input_model_name": "moonshot/kimi-k2-0711-preview",
         },
-        "k2-turbo": {
+        "kimi-k2-0905": {
             "provider": "moonshot",
             "api_key_var": "MOONSHOT_API_KEY",
-            "litellm_input_model_name": "moonshot/kimi-k2-turbo-preview",
+            "litellm_input_model_name": "moonshot/kimi-k2-0905-preview",
         },
         # Grok models
         "grok-4": {
@@ -141,16 +141,16 @@ class ModelConfig:
             "litellm_input_model_name": "xai/grok-code-fast-1",
         },
         # Qwen models
-        "qwen-3-coder": {
-            "provider": "qwen",
-            "api_key_var": "OPENROUTER_API_KEY",
-            "litellm_input_model_name": "openrouter/qwen/qwen3-coder",
-        },
         "qwen-3-coder-plus": {
             "provider": "qwen",
             "api_key_var": "DASHSCOPE_API_KEY",
             "litellm_input_model_name": "dashscope/qwen3-coder-plus",
         },
+        "qwen-3-max": {
+            "provider": "qwen",
+            "api_key_var": "DASHSCOPE_API_KEY",
+            "litellm_input_model_name": "dashscope/qwen3-max-preview",
+        },
         # Zhipu
         "glm-4.5": {
             "provider": "zhipu",