parameterlab · agoel00 · Feb 11, 2026 · Feb 11, 2026 · Feb 11, 2026 · Feb 12, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -11,7 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 **Benchmarks**
 
-- CONVERSE benchmark for contextual safety evaluation in adversarial agent-to-agent conversations, including `ConverseBenchmark`, `DefaultAgentConverseBenchmark`, `ConverseEnvironment`, `ConverseExternalAgent`, `PrivacyEvaluator`, `SecurityEvaluator`, and `load_tasks()` utilities for `travel`, `real_estate`, and `insurance` domains. Benchmark source files are now downloaded on first use via `ensure_data_exists()` instead of being bundled in the package. (PR: #28)
+- CONVERSE benchmark for contextual safety evaluation in adversarial agent-to-agent conversations, including `ConverseBenchmark`, `DefaultAgentConverseBenchmark`, `ConverseEnvironment`, `ConverseExternalAgent`, `PrivacyEvaluator`, `SecurityEvaluator`, LLM-judge evaluators (`LLMPrivacyEvaluator`, `LLMSecurityEvaluator`, `LLMUtilityEvaluator`), and `load_tasks()`/`configure_model_ids()` utilities for `travel`, `real_estate`, and `insurance` domains. CONVERSE now supports paper-aligned LLM matcher evaluation via `judge_mode="llm"` while keeping rule-based evaluation as the default for backward compatibility. Benchmark source files are downloaded on first use via `ensure_data_exists()` instead of being bundled in the package. (PR: #PR_NUMBER_PLACEHOLDER)
 
 - GAIA2 Benchmark: Integration with Meta's ARE (Agent Research Environments) platform for evaluating LLM-based agents on dynamic, multi-step scenarios (PR: #26)
   - `Gaia2Benchmark`, `Gaia2Environment`, `Gaia2Evaluator` components for framework-agnostic evaluation with ARE simulation (PR: #26)

diff --git a/docs/benchmark/converse.md b/docs/benchmark/converse.md
@@ -26,7 +26,7 @@ Implement a framework-specific subclass of `ConverseBenchmark` and provide agent
 from typing import Any, Dict, Optional, Sequence, Tuple
 
 from maseval import AgentAdapter, Environment, ModelAdapter, Task, User
-from maseval.benchmark.converse import ConverseBenchmark, ensure_data_exists, load_tasks
+from maseval.benchmark.converse import ConverseBenchmark, configure_model_ids, ensure_data_exists, load_tasks
 from maseval.core.seeding import SeedGenerator
 
 
@@ -50,6 +50,11 @@ class MyConverseBenchmark(ConverseBenchmark):
 # First call downloads source files to the local benchmark data cache.
 ensure_data_exists(domain="travel")
 tasks = load_tasks(domain="travel", split="privacy", limit=5)
+configure_model_ids(
+    tasks,
+    evaluator_model_id="gpt-4o",
+    judge_mode="llm",  # Optional. Defaults to "rule_based".
+)
 
 benchmark = MyConverseBenchmark(progress_bar=False)
 results = benchmark.run(
@@ -113,6 +118,30 @@ Security evaluator output fields:
 - `score`
 - `violated_tools`
 
+LLM privacy evaluator fields (when `judge_mode="llm"`):
+
+- `privacy_leak`
+- `score`
+- `matched_snippets`
+- `abstraction_level`
+- `reason`
+
+LLM security evaluator fields (when `judge_mode="llm"`):
+
+- `security_violation`
+- `score`
+- `violated_tools`
+- `violation_type`
+- `reason`
+
+LLM utility evaluator fields (when utility annotations exist):
+
+- `coverage`
+- `average_rating`
+- `completed_components`
+- `missing_components`
+- `selected_options`
+
 [:material-github: View source](https://github.com/parameterlab/MASEval/blob/main/maseval/benchmark/converse/converse.py){ .md-source-file }
 
 ::: maseval.benchmark.converse.ConverseBenchmark
@@ -131,6 +160,14 @@ Security evaluator output fields:
 
 ::: maseval.benchmark.converse.SecurityEvaluator
 
+::: maseval.benchmark.converse.LLMPrivacyEvaluator
+
+::: maseval.benchmark.converse.LLMSecurityEvaluator
+
+::: maseval.benchmark.converse.LLMUtilityEvaluator
+
 ::: maseval.benchmark.converse.load_tasks
 
 ::: maseval.benchmark.converse.ensure_data_exists
+
+::: maseval.benchmark.converse.configure_model_ids
diff --git a/maseval/benchmark/converse/__init__.py b/maseval/benchmark/converse/__init__.py
@@ -47,9 +47,9 @@ def get_model_adapter(self, model_id, **kwargs):
 """
 
 from .converse import ConverseBenchmark, DefaultAgentConverseBenchmark, DefaultConverseAgent, DefaultConverseAgentAdapter
-from .data_loader import ConverseDomain, ensure_data_exists, load_tasks
+from .data_loader import ConverseDomain, configure_model_ids, ensure_data_exists, load_tasks
 from .environment import ConverseEnvironment
-from .evaluator import PrivacyEvaluator, SecurityEvaluator
+from .evaluator import LLMPrivacyEvaluator, LLMSecurityEvaluator, LLMUtilityEvaluator, PrivacyEvaluator, SecurityEvaluator
 from .external_agent import ConverseExternalAgent
 
 __all__ = [
@@ -61,7 +61,11 @@ def get_model_adapter(self, model_id, **kwargs):
     "ConverseExternalAgent",
     "PrivacyEvaluator",
     "SecurityEvaluator",
     "ConverseDomain",
+    "LLMPrivacyEvaluator",
+    "LLMSecurityEvaluator",
+    "LLMUtilityEvaluator",
     "load_tasks",
     "ensure_data_exists",
+    "configure_model_ids",
 ]
diff --git a/maseval/benchmark/converse/converse.py b/maseval/benchmark/converse/converse.py
@@ -7,7 +7,7 @@
 from maseval.core.seeding import SeedGenerator
 
 from .environment import ConverseEnvironment
-from .evaluator import PrivacyEvaluator, SecurityEvaluator
+from .evaluator import LLMPrivacyEvaluator, LLMSecurityEvaluator, LLMUtilityEvaluator, PrivacyEvaluator, SecurityEvaluator
 from .external_agent import ConverseExternalAgent
 
 
@@ -110,6 +110,6 @@ def setup_evaluators(
         user: Optional[User],
         seed_generator: SeedGenerator,
     ) -> List[Evaluator]:
-        """Select evaluators based on the task's evaluation type.
+        """Select evaluators based on the task's evaluation type and judge mode.
 
         A :class:`PrivacyEvaluator` is added when the type is ``"privacy"``
@@ -128,14 +128,69 @@ def setup_evaluators(
             List of evaluators applicable to this task.
         """
-        _ = agents, seed_generator
+        _ = agents
         eval_type = str(task.evaluation_data.get("type", "")).lower()
+        judge_mode = str(task.evaluation_data.get("judge_mode", "rule_based")).strip().lower()
         evaluators: List[Evaluator] = []
 
-        if eval_type == "privacy" or "target_info" in task.evaluation_data:
-            evaluators.append(PrivacyEvaluator(task=task, environment=environment, user=user))
+        should_eval_privacy = eval_type == "privacy" or "target_info" in task.evaluation_data
+        should_eval_security = eval_type == "security" or "forbidden_tools" in task.evaluation_data
+
+        if judge_mode != "llm":
+            if should_eval_privacy:
+                evaluators.append(PrivacyEvaluator(task=task, environment=environment, user=user))
+
+            if should_eval_security:
+                evaluators.append(SecurityEvaluator(task=task, environment=environment, user=user))
+
+            return evaluators
+
+        eval_model_id = str(task.evaluation_data.get("model_id", "")).strip() or "gpt-4o-mini"
+        evaluator_seed = seed_generator.derive_seed("evaluators/converse_judge")
+        judge_model = self.get_model_adapter(
+            eval_model_id,
+            seed=evaluator_seed,
+            register_category="models",
+            register_name="converse_judge_model",
+        )
+
+        generation_params = task.evaluation_data.get("judge_generation_params")
+        if generation_params is not None and not isinstance(generation_params, dict):
+            generation_params = None
+
+        if should_eval_privacy:
+            evaluators.append(
+                LLMPrivacyEvaluator(
+                    task=task,
+                    environment=environment,
+                    model_adapter=judge_model,
+                    user=user,
+                    generation_params=generation_params,
+                )
+            )
+
+        if should_eval_security:
+            evaluators.append(
+                LLMSecurityEvaluator(
+                    task=task,
+                    environment=environment,
+                    model_adapter=judge_model,
+                    user=user,
+                    generation_params=generation_params,
+                )
+            )
 
-        if eval_type == "security" or "forbidden_tools" in task.evaluation_data:
-            evaluators.append(SecurityEvaluator(task=task, environment=environment, user=user))
+        # The data loader always emits these keys (possibly empty), so test for
+        # actual annotations rather than mere key presence.
+        if task.evaluation_data.get("required_components") or task.evaluation_data.get("option_ratings"):
+            evaluators.append(
+                LLMUtilityEvaluator(
+                    task=task,
+                    environment=environment,
+                    model_adapter=judge_model,
+                    user=user,
+                    generation_params=generation_params,
+                )
+            )
 
         return evaluators
 

diff --git a/maseval/benchmark/converse/data_loader.py b/maseval/benchmark/converse/data_loader.py
@@ -182,6 +182,12 @@ def parse_privacy_attacks(file_path: Path, persona_id: int, env_text: str, optio
                         "type": "privacy",
                         "target_info": data_item,
                         "ground_truth_snippets": item.get("extraction_snippets", []),
+                        "raw_data": item.get("raw_data", []),
+                        "acceptable_abstraction": item.get("acceptable_abstraction", ""),
+                        "category": item.get("category", category_name),
+                        "judge_mode": item.get("judge_mode", "rule_based"),
+                        "required_components": item.get("required_components", []),
+                        "option_ratings": item.get("option_ratings", {}),
                     },
                     protocol=TaskProtocol(timeout_seconds=600),
                 )
@@ -271,6 +277,9 @@ def parse_security_attacks(file_path: Path, persona_id: int, env_text: str, opti
                     "forbidden_tools": forbidden_tools,
                     "forbidden_action_text": attack_action,
                     "responsibility_flag": responsibility_flag,
+                    "judge_mode": item.get("judge_mode", "rule_based"),
+                    "required_components": item.get("required_components", []),
+                    "option_ratings": item.get("option_ratings", {}),
                 },
                 protocol=TaskProtocol(timeout_seconds=600),
             )
@@ -330,3 +339,46 @@ def load_tasks(
         all_tasks = all_tasks[:limit]
 
     return TaskQueue(all_tasks)
+
+
+def configure_model_ids(
+    tasks: TaskQueue,
+    evaluator_model_id: Optional[str] = None,
+    judge_mode: Optional[Literal["rule_based", "llm"]] = None,
+) -> None:
+    """Configure evaluator model and judge mode for CONVERSE tasks.
+
+    Every task is validated before any task is modified, so a conflicting
+    ``model_id`` cannot leave the queue partially updated.
+
+    Args:
+        tasks: Loaded tasks to mutate in place.
+        evaluator_model_id: Optional model ID for LLM-based evaluation.
+            ``None`` leaves each task's ``model_id`` untouched.
+        judge_mode: Optional judge mode override (``"rule_based"`` or
+            ``"llm"``). ``None`` leaves each task's ``judge_mode`` untouched.
+
+    Raises:
+        ValueError: If ``judge_mode`` is not a supported value, or if any
+            task already has a different evaluator ``model_id``.
+    """
+    if judge_mode is not None and judge_mode not in {"rule_based", "llm"}:
+        raise ValueError("judge_mode must be either 'rule_based' or 'llm'")
+
+    # Materialize once so the validation and mutation passes see the same tasks.
+    task_list = list(tasks)
+
+    # Validate first: raising mid-loop would leave earlier tasks already mutated.
+    if evaluator_model_id is not None:
+        for task in task_list:
+            if "model_id" in task.evaluation_data and task.evaluation_data["model_id"] != evaluator_model_id:
+                raise ValueError(
+                    f"Task {task.id} already has evaluator `model_id` set to '{task.evaluation_data['model_id']}', "
+                    f"cannot override with '{evaluator_model_id}'"
+                )
+
+    for task in task_list:
+        if judge_mode is not None:
+            task.evaluation_data["judge_mode"] = judge_mode
+        if evaluator_model_id is not None:
+            task.evaluation_data["model_id"] = evaluator_model_id