[Move DISCO queue to core]:

arubique · arubique · commit 079ef47fae77 · 2026-03-12T09:00:35.000+01:00
- Replace all .get() calls on required fields with explicit dict lookups.
diff --git a/maseval/__init__.py b/maseval/__init__.py
@@ -49,6 +49,7 @@
     UserError,
     UserExhaustedError,
     TaskTimeoutError,
+    get_with_assert,
     validate_argument_type,
     validate_required_arguments,
     validate_no_extra_arguments,
@@ -106,6 +107,7 @@
     "ChatResponse",
     "ModelScorer",
     # Exceptions and validation
+    "get_with_assert",
     "MASEvalError",
     "AgentError",
     "EnvironmentError",
diff --git a/maseval/benchmark/mmlu/mmlu.py b/maseval/benchmark/mmlu/mmlu.py
@@ -84,14 +84,14 @@ def setup_state(self, task_data: Dict[str, Any]) -> Dict[str, Any]:
 
         Args:
             task_data: Must contain ``"query"`` (str) and ``"environment_data"``
-                (dict with optional ``"choices"``, ``"full_prompt"``, ``"use_full_prompt"``).
+                (dict with ``"choices"``, ``"full_prompt"``, ``"use_full_prompt"``).
         """
         env_data = task_data["environment_data"]
         return {
             "query": task_data["query"],
-            "choices": env_data.get("choices", DEFAULT_CHOICES),
-            "full_prompt": env_data.get("full_prompt", ""),
-            "use_full_prompt": env_data.get("use_full_prompt", False),
+            "choices": env_data["choices"],
+            "full_prompt": env_data["full_prompt"],
+            "use_full_prompt": env_data["use_full_prompt"],
         }
 
     def create_tools(self) -> Dict[str, Any]:
@@ -137,7 +137,7 @@ def __init__(
         self.task = task
         self.environment = environment
         self.gold = task.evaluation_data["gold"]
-        self.choices = task.environment_data.get("choices", DEFAULT_CHOICES)
+        self.choices = task.environment_data["choices"]
 
     def filter_traces(self, traces: Dict[str, Any]) -> Dict[str, Any]:
         """Extract relevant traces for evaluation.
@@ -175,11 +175,11 @@ def __call__(self, traces: Dict[str, Any], final_answer: Optional[str] = None) -
             "predicted": predicted,
             "gold": self.gold,
             "correct": correct,
-            "doc_id": self.task.metadata.get("doc_id"),
+            "doc_id": self.task.metadata["doc_id"],
         }
 
         # Extract logprobs from traces if available (for logprobs-based evaluation)
-        messages = traces.get("messages", [])
+        messages = traces["messages"]
         for msg in messages:
             if isinstance(msg, dict) and "logprobs" in msg:
                 result["logprobs"] = msg["logprobs"]
@@ -445,7 +445,7 @@ def precompute_all_logprobs_lmeval(self, tasks: Sequence[Task]) -> Dict[Any, Lis
         instance_map = {}  # (doc_id, choice_idx) -> position in results
 
         for task in tasks:
-            doc_id = task.metadata.get("doc_id")
+            doc_id = task.metadata["doc_id"]
             # Get prompt from task - use full_prompt from environment_data if available
             if self.use_full_prompt and "full_prompt" in task.environment_data:
                 prompt = task.environment_data["full_prompt"]
@@ -471,7 +471,7 @@ def precompute_all_logprobs_lmeval(self, tasks: Sequence[Task]) -> Dict[Any, Lis
         # Map results back to doc_ids
         doc_logprobs = {}
         for task in tasks:
-            doc_id = task.metadata.get("doc_id")
+            doc_id = task.metadata["doc_id"]
             logprobs = []
             for i in range(len(choices)):
                 pos = instance_map[(doc_id, i)]
@@ -498,20 +498,19 @@ def run_agents(
         which automatically picks single-token or multi-token scoring.
         """
         prompt = environment.get_prompt()
-        choices = environment.state.get("choices", DEFAULT_CHOICES)
-        doc_id = task.metadata.get("doc_id") if task else None
-
-        if hasattr(self, "_precomputed_logprobs") and doc_id is not None:
-            logprobs = self._precomputed_logprobs.get(doc_id)
-            if logprobs is not None:
-                best_idx = logprobs.index(max(logprobs))
-                answer = choices[best_idx]
-                environment.state["logprobs"] = logprobs
-                environment.state["predicted_idx"] = best_idx
-                agent = agents[0]
-                agent._messages.append({"role": "user", "content": prompt})
-                agent._messages.append({"role": "assistant", "content": answer, "logprobs": logprobs})
-                return answer
+        choices = environment.state["choices"]
+        doc_id = task.metadata["doc_id"]
+
+        if hasattr(self, "_precomputed_logprobs") and doc_id in self._precomputed_logprobs:
+            logprobs = self._precomputed_logprobs[doc_id]
+            best_idx = logprobs.index(max(logprobs))
+            answer = choices[best_idx]
+            environment.state["logprobs"] = logprobs
+            environment.state["predicted_idx"] = best_idx
+            agent = agents[0]
+            agent._messages.append({"role": "user", "content": prompt})
+            agent._messages.append({"role": "assistant", "content": answer, "logprobs": logprobs})
+            return answer
 
         logprobs = self._scorer.loglikelihood_choices(prompt, choices, delimiter=TARGET_DELIMITER)
 
@@ -677,14 +676,14 @@ def compute_benchmark_metrics(results: List[Dict[str, Any]]) -> Dict[str, Any]:
     acc_norm_sum = 0.0
 
     for res in results:
-        if res.get("status") != STATUS_SUCCESS:
+        if res["status"] != STATUS_SUCCESS:
             continue
 
-        evals = res.get("eval") or []
+        evals = res["eval"] or []
         for entry in evals:
-            acc_sum += entry.get("acc", 0.0)
-            acc_norm_sum += entry.get("acc_norm", 0.0)
-            if entry.get("correct", False):
+            acc_sum += entry["acc"]
+            acc_norm_sum += entry["acc_norm"]
+            if entry["correct"]:
                 correct_count += 1
 
     return {
diff --git a/maseval/core/exceptions.py b/maseval/core/exceptions.py
@@ -308,6 +308,44 @@ def __init__(
 # =============================================================================
 
 
+def get_with_assert(container: Any, key: Any, error_msg: Optional[str] = None) -> Any:
+    """Get a value from a container, raising ``KeyError`` if not found.
+
+    Use instead of ``dict.get(key, default)`` when the key is **required**.
+    A missing key means a bug — not a case to paper over with a fallback.
+
+    Supports nested access via a list of keys::
+
+        get_with_assert(task, ["metadata", "doc_id"])
+        # equivalent to: task["metadata"]["doc_id"] but with a clear error
+
+    Args:
+        container: Dictionary or other container supporting ``in`` and ``[]``.
+        key: Key to look up. Pass a non-empty list for nested access.
+        error_msg: Custom error message. If ``None``, a descriptive default
+            is generated.
+
+    Returns:
+        The value at the given key.
+
+    Raises:
+        KeyError: If the key is not found, or if ``key`` is an empty list.
+    """
+    if isinstance(key, list):
+        if not key:  # explicit raise: a bare ``assert`` is stripped under -O
+            raise KeyError("get_with_assert requires a non-empty list of keys")
+        for part in key:
+            container = get_with_assert(container, part, error_msg)
+        return container
+
+    if key not in container:
+        if error_msg is None:
+            error_msg = f'Required key "{key}" not in container: {container}'
+        raise KeyError(error_msg)
+
+    return container[key]
+
+
 def validate_argument_type(
     value: Any,
     expected_type: str,