Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

**Benchmarks**

- CONVERSE benchmark for contextual safety evaluation in adversarial agent-to-agent conversations, including `ConverseBenchmark`, `DefaultAgentConverseBenchmark`, `ConverseEnvironment`, `ConverseExternalAgent`, `PrivacyEvaluator`, `SecurityEvaluator`, and `load_tasks()` utilities for `travel`, `real_estate`, and `insurance` domains. Benchmark source files are now downloaded on first use via `ensure_data_exists()` instead of being bundled in the package. (PR: #28)
- CONVERSE benchmark for contextual safety evaluation in adversarial agent-to-agent conversations, including `ConverseBenchmark`, `DefaultAgentConverseBenchmark`, `ConverseEnvironment`, `ConverseExternalAgent`, `PrivacyEvaluator`, `SecurityEvaluator`, LLM-judge evaluators (`LLMPrivacyEvaluator`, `LLMSecurityEvaluator`, `LLMUtilityEvaluator`), and `load_tasks()`/`configure_model_ids()` utilities for `travel`, `real_estate`, and `insurance` domains. CONVERSE now supports paper-aligned LLM matcher evaluation via `judge_mode="llm"` while keeping rule-based evaluation as the default for backward compatibility. Benchmark source files are downloaded on first use via `ensure_data_exists()` instead of being bundled in the package. (PR: #PR_NUMBER_PLACEHOLDER)

- GAIA2 Benchmark: Integration with Meta's ARE (Agent Research Environments) platform for evaluating LLM-based agents on dynamic, multi-step scenarios (PR: #26)
- `Gaia2Benchmark`, `Gaia2Environment`, `Gaia2Evaluator` components for framework-agnostic evaluation with ARE simulation (PR: #26)
Expand Down
39 changes: 38 additions & 1 deletion docs/benchmark/converse.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ Implement a framework-specific subclass of `ConverseBenchmark` and provide agent
from typing import Any, Dict, Optional, Sequence, Tuple

from maseval import AgentAdapter, Environment, ModelAdapter, Task, User
from maseval.benchmark.converse import ConverseBenchmark, ensure_data_exists, load_tasks
from maseval.benchmark.converse import ConverseBenchmark, configure_model_ids, ensure_data_exists, load_tasks
from maseval.core.seeding import SeedGenerator


Expand All @@ -50,6 +50,11 @@ class MyConverseBenchmark(ConverseBenchmark):
# First call downloads source files to the local benchmark data cache.
ensure_data_exists(domain="travel")
tasks = load_tasks(domain="travel", split="privacy", limit=5)
configure_model_ids(
tasks,
evaluator_model_id="gpt-4o",
judge_mode="llm", # Optional. When omitted, each task keeps its current mode ("rule_based" unless the task data says otherwise).
)

benchmark = MyConverseBenchmark(progress_bar=False)
results = benchmark.run(
Expand Down Expand Up @@ -113,6 +118,30 @@ Security evaluator output fields:
- `score`
- `violated_tools`

LLM privacy evaluator fields (when `judge_mode="llm"`):

- `privacy_leak`
- `score`
- `matched_snippets`
- `abstraction_level`
- `reason`

LLM security evaluator fields (when `judge_mode="llm"`):

- `security_violation`
- `score`
- `violated_tools`
- `violation_type`
- `reason`

LLM utility evaluator fields (when `judge_mode="llm"` and the task's evaluation data contains `required_components` or `option_ratings`):

- `coverage`
- `average_rating`
- `completed_components`
- `missing_components`
- `selected_options`

[:material-github: View source](https://github.com/parameterlab/MASEval/blob/main/maseval/benchmark/converse/converse.py){ .md-source-file }

::: maseval.benchmark.converse.ConverseBenchmark
Expand All @@ -131,6 +160,14 @@ Security evaluator output fields:

::: maseval.benchmark.converse.SecurityEvaluator

::: maseval.benchmark.converse.LLMPrivacyEvaluator

::: maseval.benchmark.converse.LLMSecurityEvaluator

::: maseval.benchmark.converse.LLMUtilityEvaluator

::: maseval.benchmark.converse.load_tasks

::: maseval.benchmark.converse.ensure_data_exists

::: maseval.benchmark.converse.configure_model_ids
13 changes: 12 additions & 1 deletion maseval/benchmark/converse/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,13 @@ def get_model_adapter(self, model_id, **kwargs):
"""

from .converse import ConverseBenchmark, DefaultAgentConverseBenchmark, DefaultConverseAgent, DefaultConverseAgentAdapter
<<<<<<< Updated upstream
from .data_loader import ConverseDomain, ensure_data_exists, load_tasks
=======
from .data_loader import configure_model_ids, ensure_data_exists, load_tasks
>>>>>>> Stashed changes
from .environment import ConverseEnvironment
from .evaluator import PrivacyEvaluator, SecurityEvaluator
from .evaluator import LLMPrivacyEvaluator, LLMSecurityEvaluator, LLMUtilityEvaluator, PrivacyEvaluator, SecurityEvaluator
from .external_agent import ConverseExternalAgent

__all__ = [
Expand All @@ -61,7 +65,14 @@ def get_model_adapter(self, model_id, **kwargs):
"ConverseExternalAgent",
"PrivacyEvaluator",
"SecurityEvaluator",
<<<<<<< Updated upstream
"ConverseDomain",
=======
"LLMPrivacyEvaluator",
"LLMSecurityEvaluator",
"LLMUtilityEvaluator",
>>>>>>> Stashed changes
"load_tasks",
"ensure_data_exists",
"configure_model_ids",
]
67 changes: 62 additions & 5 deletions maseval/benchmark/converse/converse.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from maseval.core.seeding import SeedGenerator

from .environment import ConverseEnvironment
from .evaluator import PrivacyEvaluator, SecurityEvaluator
from .evaluator import LLMPrivacyEvaluator, LLMSecurityEvaluator, LLMUtilityEvaluator, PrivacyEvaluator, SecurityEvaluator
from .external_agent import ConverseExternalAgent


Expand Down Expand Up @@ -110,6 +110,7 @@ def setup_evaluators(
user: Optional[User],
seed_generator: SeedGenerator,
) -> List[Evaluator]:
<<<<<<< Updated upstream
"""Select evaluators based on the task's evaluation type.

A :class:`PrivacyEvaluator` is added when the type is ``"privacy"``
Expand All @@ -128,14 +129,70 @@ def setup_evaluators(
List of evaluators applicable to this task.
"""
_ = agents, seed_generator
=======
_ = agents
>>>>>>> Stashed changes
eval_type = str(task.evaluation_data.get("type", "")).lower()
judge_mode = str(task.evaluation_data.get("judge_mode", "rule_based")).strip().lower()
evaluators: List[Evaluator] = []

if eval_type == "privacy" or "target_info" in task.evaluation_data:
evaluators.append(PrivacyEvaluator(task=task, environment=environment, user=user))
should_eval_privacy = eval_type == "privacy" or "target_info" in task.evaluation_data
should_eval_security = eval_type == "security" or "forbidden_tools" in task.evaluation_data

if judge_mode != "llm":
if should_eval_privacy:
evaluators.append(PrivacyEvaluator(task=task, environment=environment, user=user))

if should_eval_security:
evaluators.append(SecurityEvaluator(task=task, environment=environment, user=user))

return evaluators

eval_model_id = str(task.evaluation_data.get("model_id", "")).strip() or "gpt-4o-mini"
evaluator_seed = seed_generator.derive_seed("evaluators/converse_judge")
judge_model = self.get_model_adapter(
eval_model_id,
seed=evaluator_seed,
register_category="models",
register_name="converse_judge_model",
)

generation_params = task.evaluation_data.get("judge_generation_params")
if generation_params is not None and not isinstance(generation_params, dict):
generation_params = None

if should_eval_privacy:
evaluators.append(
LLMPrivacyEvaluator(
task=task,
environment=environment,
model_adapter=judge_model,
user=user,
generation_params=generation_params,
)
)

if should_eval_security:
evaluators.append(
LLMSecurityEvaluator(
task=task,
environment=environment,
model_adapter=judge_model,
user=user,
generation_params=generation_params,
)
)

if eval_type == "security" or "forbidden_tools" in task.evaluation_data:
evaluators.append(SecurityEvaluator(task=task, environment=environment, user=user))
if "required_components" in task.evaluation_data or "option_ratings" in task.evaluation_data:
evaluators.append(
LLMUtilityEvaluator(
task=task,
environment=environment,
model_adapter=judge_model,
user=user,
generation_params=generation_params,
)
)

return evaluators

Expand Down
37 changes: 37 additions & 0 deletions maseval/benchmark/converse/data_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,12 @@ def parse_privacy_attacks(file_path: Path, persona_id: int, env_text: str, optio
"type": "privacy",
"target_info": data_item,
"ground_truth_snippets": item.get("extraction_snippets", []),
"raw_data": item.get("raw_data", []),
"acceptable_abstraction": item.get("acceptable_abstraction", ""),
"category": item.get("category", category_name),
"judge_mode": item.get("judge_mode", "rule_based"),
"required_components": item.get("required_components", []),
"option_ratings": item.get("option_ratings", {}),
},
protocol=TaskProtocol(timeout_seconds=600),
)
Expand Down Expand Up @@ -271,6 +277,9 @@ def parse_security_attacks(file_path: Path, persona_id: int, env_text: str, opti
"forbidden_tools": forbidden_tools,
"forbidden_action_text": attack_action,
"responsibility_flag": responsibility_flag,
"judge_mode": item.get("judge_mode", "rule_based"),
"required_components": item.get("required_components", []),
"option_ratings": item.get("option_ratings", {}),
},
protocol=TaskProtocol(timeout_seconds=600),
)
Expand Down Expand Up @@ -330,3 +339,31 @@ def load_tasks(
all_tasks = all_tasks[:limit]

return TaskQueue(all_tasks)


def configure_model_ids(
    tasks: TaskQueue,
    evaluator_model_id: Optional[str] = None,
    judge_mode: Optional[Literal["rule_based", "llm"]] = None,
) -> None:
    """Configure evaluator model and judge mode for CONVERSE tasks.

    Every task is validated before any task is modified, so a call that
    raises leaves all ``evaluation_data`` dictionaries exactly as they were.

    Args:
        tasks: Loaded tasks to mutate in place.
        evaluator_model_id: Optional model ID for LLM-based evaluation. When
            ``None``, existing ``model_id`` entries are left untouched.
        judge_mode: Optional judge mode override (``"rule_based"`` or
            ``"llm"``). When ``None``, each task keeps its current mode.

    Raises:
        ValueError: If ``judge_mode`` is not an accepted value, or if a task
            already has an evaluator ``model_id`` that differs from
            ``evaluator_model_id``.
    """
    if judge_mode is not None and judge_mode not in {"rule_based", "llm"}:
        raise ValueError("judge_mode must be either 'rule_based' or 'llm'")

    # Iterate the queue once and reuse the snapshot for both passes.
    pending = list(tasks)

    # Pass 1: detect conflicts up front so a failure cannot leave the queue
    # half-configured (some tasks updated, others not).
    if evaluator_model_id is not None:
        for task in pending:
            data = task.evaluation_data
            if "model_id" in data and data["model_id"] != evaluator_model_id:
                raise ValueError(
                    f"Task {task.id} already has evaluator `model_id` set to '{data['model_id']}', "
                    f"cannot override with '{evaluator_model_id}'"
                )

    # Pass 2: apply the requested settings.
    for task in pending:
        if judge_mode is not None:
            task.evaluation_data["judge_mode"] = judge_mode
        if evaluator_model_id is not None:
            task.evaluation_data["model_id"] = evaluator_model_id
Loading
Loading