Skip to content

Commit e79b753

Browse files
oam-mit and Omkar Masur authored
feat: Adding new set validator (#543)
Issue description: Need a validator that checks if specific tools were called during test execution without requiring a specific order, unlike the existing ToolCallValidator which enforces sequential order. This enables more vague prompts to be tested, such as `My payments-latency SLO is breaching. Can you investigate the root cause?`, and tests only whether specific tools are being called or not. Introducing this for the new tests being authored (will be part of a separate PR) for the new Change Indicators tool introduced as part of awslabs/mcp#1944. Description of changes: Added ToolPresenceValidator class to mcp-testing/evals/core/validator.py: - Validates that all expected tools are called, regardless of order - Supports filtering out file-related tools via ignore_file_tools parameter - Returns detailed validation results including missing tools, extra tools called, and the complete list of called tools - Provides clear pass/fail criteria with reasoning for test results Rollback procedure: Yes, this commit can be safely reverted. It only adds a new validator class without modifying existing functionality. No migration or cleanup steps are required. By submitting this pull request, I confirm that my contribution is made under the terms of the Apache 2.0 license. --------- Co-authored-by: Omkar Masur <omasur@amazon.com>
1 parent 067eb03 commit e79b753

1 file changed

Lines changed: 78 additions & 0 deletions

File tree

mcp-testing/evals/core/validator.py

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -316,6 +316,84 @@ async def validate(
316316
}
317317

318318

319+
class ToolPresenceValidator(Validator):
    """Validator that checks if specific tools were called, regardless of order and other tools being called."""

    def __init__(self, expected_tools: List[str], ignore_file_tools: bool = False):
        """Initialize the tool presence validator.

        Args:
            expected_tools: Tool names that must all be called (order doesn't matter;
                duplicates are collapsed). A single bare string is treated as one
                tool name rather than being split into characters.
            ignore_file_tools: If True, tools listed in PERMITTED_FILE_TOOLS are dropped
                from the called tools before validation, unless they are explicitly expected.
        """
        # set('abc') would yield {'a', 'b', 'c'}; wrap a lone string so it stays one name.
        if isinstance(expected_tools, str):
            expected_tools = [expected_tools]
        self.expected_tools = set(expected_tools)
        self.ignore_file_tools = ignore_file_tools

    def get_name(self) -> str:
        """Return validator name."""
        return 'Tool Call Set'

    async def validate(
        self,
        captured_data: Dict[str, Any],
    ) -> ValidationResult:
        """Validate that all expected tools were called.

        Args:
            captured_data: Captured run data; tool calls are read from the TOOL_CALLS
                key, and each call is indexed by its 'name' entry.

        Returns:
            A result dict with 'validator_name', 'overall_pass', a single-entry
            'criteria_results' list, and 'raw_validation_output' holding the expected,
            called, missing and extra tools plus the ignore_file_tools setting.
        """
        logger.info('Validating tool calls (order-independent)...')

        tool_calls = captured_data.get(TOOL_CALLS, [])
        called_tools = [call['name'] for call in tool_calls]

        if self.ignore_file_tools:
            # Keep explicitly expected tools even when they are file tools; filtering
            # them out would make such an expectation impossible to satisfy.
            called_tools = [
                tool
                for tool in called_tools
                if tool not in PERMITTED_FILE_TOOLS or tool in self.expected_tools
            ]

        called_tools_set = set(called_tools)
        missing_tools = self.expected_tools - called_tools_set
        extra_tools = called_tools_set - self.expected_tools
        passed = not missing_tools

        if passed:
            reasoning = f'All expected tools called: {sorted(self.expected_tools)}'
            if extra_tools:
                reasoning += f' (also called: {sorted(extra_tools)})'
        else:
            reasoning = f'Missing tools: {sorted(missing_tools)}. Called: {called_tools}'

        # Pass and fail results share one shape; only the status fields and reasoning differ.
        return {
            'validator_name': self.get_name(),
            'overall_pass': passed,
            'criteria_results': [
                {
                    'criterion': 'All expected tools called',
                    'status': 'PASS' if passed else 'FAIL',
                    'reasoning': reasoning,
                }
            ],
            'raw_validation_output': {
                'expected_tools': sorted(self.expected_tools),
                'called_tools': called_tools,
                'missing_tools': sorted(missing_tools),
                'extra_tools': sorted(extra_tools),
                'ignore_file_tools': self.ignore_file_tools,
            },
        }
395+
396+
319397
class BuildValidator(Validator):
320398
"""Validator that runs build commands and checks exit code."""
321399

0 commit comments

Comments
 (0)