Skip to content

Commit 10395ad

Browse files
authored
Merge pull request #23 from microsoft/test-optional
Make functional testing optional and require confirmation
2 parents c65c73a + 6306ae1 commit 10395ad

11 files changed

Lines changed: 125 additions & 50 deletions

File tree

README.md

Lines changed: 20 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ Use `--constraints [CODE ...]` to customize output.
2828

2929
### 🛠️ Functional testing
3030

31-
MCP servers are intended to be used by LLM agents, so we test them with an LLM agent. Using your specified LLM, the interviewer generates a test plan based on the MCP server's capabilities and then executes that plan (e.g. by calling tools), collecting statistics about observed tool behavior.
31+
MCP servers are intended to be used by LLM agents, so we can optionally test them with an LLM agent. When enabled with the `--test` flag, the interviewer uses your specified LLM to generate a test plan based on the MCP server's capabilities and then executes that plan (e.g. by calling tools), collecting statistics about observed tool behavior.
3232

3333
### 🧪 LLM evaluation
3434

@@ -68,6 +68,8 @@ Use `--reports [CODE ...]` to customize output.
6868

6969
⚠️ ***mcp-interviewer arbitrarily executes the provided MCP server command in a child process. Whenever possible, run your server in a container like in the examples below to isolate the server from your host system.***
7070

71+
🚨 ***mcp-interviewer actually invokes the server's tools. DO NOT run mcp-interviewer with admin or other elevated privileges.***
72+
7173
```bash
7274
# Command to run npx safely inside a Docker container
7375
NPX_CONTAINER="docker run -i --rm node:lts npx"
@@ -102,21 +104,30 @@ Which will generate a report like [this](./mcp-interview.md).
102104

103105
### CLI
104106

107+
**Key Flags:**
108+
- `--test`: Enable functional testing (disabled by default for faster execution); asks for confirmation before running unless `--accept-risk` is also passed
109+
- `--judge`: Enable experimental LLM evaluation of tools and tests
110+
- `--reports [CODE ...]`: Customize which report sections to include
111+
- `--constraints [CODE ...]`: Customize which constraints to check
112+
105113
```bash
106114
# Docker command to run uvx inside a container
107115
UVX_CONTAINER="docker run -i --rm ghcr.io/astral-sh/uv:python3.12-alpine uvx"
108116

109-
# Constraint checking, functional testing, default report generation
117+
# Basic constraint checking and server inspection (no functional testing)
110118
mcp-interviewer --model gpt-4o "$UVX_CONTAINER mcp-server-fetch"
111119

120+
# Constraint checking with functional testing and default report generation
121+
mcp-interviewer --model gpt-4o --test "$UVX_CONTAINER mcp-server-fetch"
122+
112123
# Constraint checking, functional testing, LLM evaluation, default report generation
113-
mcp-interviewer --model gpt-4o --judge "$UVX_CONTAINER mcp-server-fetch"
124+
mcp-interviewer --model gpt-4o --test --judge "$UVX_CONTAINER mcp-server-fetch"
114125

115-
# Constraint checking, functional testing, custom report generation
116-
mcp-interviewer --model gpt-4o --reports SI TS FT CV "$UVX_CONTAINER mcp-server-fetch"
126+
# Constraint checking with functional testing and custom report generation
127+
mcp-interviewer --model gpt-4o --test --reports SI TS FT CV "$UVX_CONTAINER mcp-server-fetch"
117128

118-
# Custom constraint checking, functional testing, report generation
119-
mcp-interviewer --model gpt-4o --select OTC ONL "$UVX_CONTAINER mcp-server-fetch"
129+
# Custom constraint checking with functional testing and report generation
130+
mcp-interviewer --model gpt-4o --test --constraints OTC ONL "$UVX_CONTAINER mcp-server-fetch"
120131

121132
# Test remote servers
122133
mcp-interviewer --model gpt-4o "https://my-mcp-server.com/sse"
@@ -173,8 +184,8 @@ params = StdioServerParameters(
173184
args=["run", "-i", "--rm", "node:lts", "npx", "-y", "@modelcontextprotocol/server-everything"]
174185
)
175186
176-
interviewer = MCPInterviewer(client, "gpt-4o")
177-
interview = await interviewer.score_server(params)
187+
interviewer = MCPInterviewer(client, "gpt-4o", should_run_functional_test=True)
188+
interview = await interviewer.interview_server(params)
178189
```
179190
180191
## Limitations

src/mcp_interviewer/cli.py

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import argparse
2+
import sys
23

34

45
def cli():
@@ -72,6 +73,16 @@ def cli():
7273
nargs="+",
7374
help="Specify which constraint violations to check (all enabled by default). Can use full names (e.g., openai-tool-count, openai-name-length) or shorthand codes (e.g., OTC, ONL, ONP, OTL, OA)",
7475
)
76+
parser.add_argument(
77+
"--test",
78+
action="store_true",
79+
help="Enable functional testing of the server",
80+
)
81+
parser.add_argument(
82+
"--accept-risk",
83+
action="store_true",
84+
help="Bypass user confirmation of functional test risk.",
85+
)
7586

7687
args = parser.parse_args()
7788

@@ -130,6 +141,22 @@ def cli():
130141

131142
params = StdioServerParameters(command=params_command, args=params_args)
132143

144+
# Handle the --judge flag which enables experimental judging operations (disabled by default)
145+
should_judge_tool = args.judge or args.judge_tools
146+
should_judge_functional_test = args.judge or args.judge_test
147+
148+
if args.test:
149+
print(
150+
"🚨 MCP Interviewer will make tool call requests to your MCP server. Depending on the server's capabilities this can lead to irreversible outcomes (e.g. deleting files)."
151+
)
152+
accept_risk = args.accept_risk
153+
while not accept_risk:
154+
input_str = input("Do you accept this risk? y|[n]: ").strip().lower()
155+
if not input_str or input_str == "n":
156+
sys.exit(1)
157+
else:
158+
accept_risk = input_str == "y"
159+
133160
import importlib
134161

135162
module, client = args.client.rsplit(".")
@@ -155,17 +182,14 @@ def cli():
155182

156183
from .main import main
157184

158-
# Handle the --judge flag which enables experimental judging operations (disabled by default)
159-
should_judge_tool = args.judge or args.judge_tools
160-
should_judge_functional_test = args.judge or args.judge_test
161-
162185
main(
163186
client,
164187
args.model,
165188
params,
166189
out_dir=args.out_dir,
167190
should_judge_tool=should_judge_tool,
168191
should_judge_functional_test=should_judge_functional_test,
192+
should_run_functional_test=args.test,
169193
custom_reports=args.reports,
170194
no_collapse=args.no_collapse,
171195
selected_constraints=args.constraints,

src/mcp_interviewer/constraints/base.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,8 @@ def test(
198198
Yields:
199199
ConstraintViolation: Violations from all tool results
200200
"""
201+
if server.functional_test_scorecard is None:
202+
return
201203
for step in server.functional_test_scorecard.steps:
202204
if isinstance(step.tool_output, CallToolResult):
203205
yield from self.test_tool_result(step.tool_output)

src/mcp_interviewer/interviewer/_interviewer.py

Lines changed: 25 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ def __init__(
6161
self,
6262
client: Client,
6363
model: str,
64+
should_run_functional_test: bool = False,
6465
should_judge_tool: bool = False,
6566
should_judge_functional_test: bool = False,
6667
):
@@ -74,6 +75,7 @@ def __init__(
7475
"""
7576
self._client = client
7677
self._model = model
78+
self._should_run_functional_test = should_run_functional_test
7779
self._should_judge_tool = should_judge_tool
7880
self._should_judge_functional_test = should_judge_functional_test
7981
self._request_counters = create_request_counters()
@@ -381,22 +383,29 @@ async def interview_server(self, params: ServerParameters) -> ServerScoreCard:
381383
tool_scorecards = []
382384

383385
# Phase 3: Functional Testing
384-
logger.info("=" * 60)
385-
logger.info("PHASE 3: Functional Testing")
386-
logger.info("=" * 60)
387-
functional_test = await self.generate_functional_test(server)
388-
389-
(
390-
functional_test_output,
391-
functional_test_step_outputs,
392-
) = await self.execute_functional_test(session, functional_test)
393-
394-
# Judge functional test
395-
functional_test_scorecard = await self.judge_functional_test(
396-
functional_test,
397-
functional_test_output,
398-
functional_test_step_outputs,
399-
)
386+
if self._should_run_functional_test:
387+
logger.info("=" * 60)
388+
logger.info("PHASE 3: Functional Testing")
389+
logger.info("=" * 60)
390+
391+
functional_test = await self.generate_functional_test(server)
392+
393+
(
394+
functional_test_output,
395+
functional_test_step_outputs,
396+
) = await self.execute_functional_test(session, functional_test)
397+
398+
# Judge functional test
399+
functional_test_scorecard = await self.judge_functional_test(
400+
functional_test,
401+
functional_test_output,
402+
functional_test_step_outputs,
403+
)
404+
else:
405+
logger.info("=" * 60)
406+
logger.info("PHASE 3: Functional Testing - SKIPPED")
407+
logger.info("=" * 60)
408+
functional_test_scorecard = None
400409

401410
# Create final scorecard
402411
logger.info("Creating final server scorecard")

src/mcp_interviewer/main.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ async def amain(
2222
out_dir=Path("."),
2323
should_judge_tool: bool = False,
2424
should_judge_functional_test: bool = False,
25+
should_run_functional_test: bool = False,
2526
custom_reports: list[str] | None = None,
2627
no_collapse: bool = False,
2728
selected_constraints: list[str] | None = None,
@@ -39,12 +40,17 @@ async def amain(
3940
out_dir: Directory to save output files (default: current directory)
4041
should_judge_tool: Whether to perform expensive experimental LLM judging of tools (default: False)
4142
should_judge_functional_test: Whether to perform expensive experimental LLM judging of functional tests (default: False)
43+
should_run_functional_test: Whether to run functional tests (default: False)
4244
custom_reports: List of specific report names to include
4345
no_collapse: If True, don't use collapsible sections in the report (default: False)
4446
selected_constraints: List of constraint names or codes to check (all if None)
4547
"""
4648
interviewer = MCPInterviewer(
47-
client, model, should_judge_tool, should_judge_functional_test
49+
client,
50+
model,
51+
should_run_functional_test,
52+
should_judge_tool,
53+
should_judge_functional_test,
4854
)
4955
interview = await interviewer.interview_server(params)
5056

@@ -105,6 +111,7 @@ def main(
105111
out_dir=Path("."),
106112
should_judge_tool: bool = False,
107113
should_judge_functional_test: bool = False,
114+
should_run_functional_test: bool = False,
108115
custom_reports: list[str] | None = None,
109116
no_collapse: bool = False,
110117
selected_constraints: list[str] | None = None,
@@ -118,6 +125,7 @@ def main(
118125
out_dir: Directory to save output files (default: current directory)
119126
should_judge_tool: Whether to perform expensive experimental LLM judging of tools (default: False)
120127
should_judge_functional_test: Whether to perform expensive experimental LLM judging of functional tests (default: False)
128+
should_run_functional_test: Whether to run functional tests (default: False)
121129
custom_reports: List of specific report names to include
122130
no_collapse: If True, don't use collapsible sections in the report (default: False)
123131
selected_constraints: List of constraint names or codes to check (all if None)
@@ -130,6 +138,7 @@ def main(
130138
out_dir,
131139
should_judge_tool,
132140
should_judge_functional_test,
141+
should_run_functional_test,
133142
custom_reports,
134143
no_collapse,
135144
selected_constraints,

src/mcp_interviewer/models.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -206,4 +206,4 @@ class Server(BaseModel):
206206
class ServerScoreCard(Server):
207207
model: str
208208
tool_scorecards: list[ToolScoreCard]
209-
functional_test_scorecard: FunctionalTestScoreCard
209+
functional_test_scorecard: FunctionalTestScoreCard | None

src/mcp_interviewer/reports/functional_test/failed_tests.py

Lines changed: 23 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
"""Failed tests report generation."""
22

3-
from ...models import ServerScoreCard
3+
from ...models import FunctionalTestScoreCard, ServerScoreCard
44
from ..base import BaseReport
55
from .test_step import TestStepReport
66

@@ -20,34 +20,40 @@ def __init__(self, scorecard: ServerScoreCard, detailed: bool = False):
2020
"""Initialize and build the failed tests report."""
2121
super().__init__(scorecard)
2222
self.detailed = detailed
23-
if self._has_failed_tests():
24-
self._build()
23+
self._build(scorecard.functional_test_scorecard)
2524

26-
def _has_failed_tests(self) -> bool:
25+
def _has_failed_tests(
26+
self, functional_test_scorecard: FunctionalTestScoreCard
27+
) -> bool:
2728
"""Check if there are any failed tests."""
28-
if not self._scorecard.functional_test_scorecard:
29-
return False
30-
31-
for step in self._scorecard.functional_test_scorecard.steps:
29+
for step in functional_test_scorecard.steps:
3230
# Check if any evaluation criteria failed
3331
for field_name in step.model_fields_set:
3432
field_value = getattr(step, field_name)
3533
if hasattr(field_value, "score") and field_value.score == "fail":
3634
return True
3735
return False
3836

39-
def _build(self):
37+
def _build(self, functional_test_scorecard: FunctionalTestScoreCard | None):
4038
"""Build the failed tests section."""
39+
if functional_test_scorecard is None:
40+
return
41+
42+
if not self._has_failed_tests(functional_test_scorecard):
43+
return
44+
4145
if self.detailed:
42-
self._add_detailed_failed_test_steps()
46+
self._add_detailed_failed_test_steps(functional_test_scorecard)
4347
else:
44-
self._add_failed_test_steps()
48+
self._add_failed_test_steps(functional_test_scorecard)
4549

46-
def _add_failed_test_steps(self) -> "FailedTestsReport":
50+
def _add_failed_test_steps(
51+
self, functional_test_scorecard: FunctionalTestScoreCard
52+
) -> "FailedTestsReport":
4753
"""Add a summary of failed test steps."""
4854
self.add_title("Failed Test Steps (🤖)", 2)
4955

50-
for i, step in enumerate(self._scorecard.functional_test_scorecard.steps):
56+
for i, step in enumerate(functional_test_scorecard.steps):
5157
has_failure = False
5258
failures = []
5359

@@ -66,11 +72,13 @@ def _add_failed_test_steps(self) -> "FailedTestsReport":
6672
self.add_blank_line()
6773
return self
6874

69-
def _add_detailed_failed_test_steps(self) -> "FailedTestsReport":
75+
def _add_detailed_failed_test_steps(
76+
self, functional_test_scorecard: FunctionalTestScoreCard
77+
) -> "FailedTestsReport":
7078
"""Add detailed information about failed test steps."""
7179
self.add_title("Failed Test Steps (🤖)", 2)
7280

73-
for i, step in enumerate(self._scorecard.functional_test_scorecard.steps):
81+
for i, step in enumerate(functional_test_scorecard.steps):
7482
# Use TestStepReport with show_only_failures=True
7583
# This will only build the report if the step has failures
7684
step_report = TestStepReport(

src/mcp_interviewer/reports/functional_test/score_summary.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,11 @@ def _build(self):
107107
]
108108
)
109109

110-
if self.detailed and self._scorecard.functional_test_scorecard.steps:
110+
if (
111+
self.detailed
112+
and self._scorecard.functional_test_scorecard
113+
and self._scorecard.functional_test_scorecard.steps
114+
):
111115
# Show individual test steps
112116
for i, step in enumerate(
113117
self._scorecard.functional_test_scorecard.steps

src/mcp_interviewer/reports/functional_test/test.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,11 +31,13 @@ def __init__(self, scorecard: ServerScoreCard, include_evaluations: bool = True)
3131

3232
def _build(self):
3333
"""Build the functional test results section."""
34+
test = self._scorecard.functional_test_scorecard
35+
if test is None:
36+
return
37+
3438
# Check if scoring was disabled
3539
self.start_collapsible("Functional Test Results", 2)
3640

37-
test = self._scorecard.functional_test_scorecard
38-
3941
# Test plan
4042
if self.include_evaluations and test.plan:
4143
self.add_text("**Test Plan (🤖):**")

src/mcp_interviewer/reports/statistics/tool_call_statistics.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,13 +62,17 @@ def add_stats_table_row(
6262

6363
def _build(self):
6464
"""Build the tool output analysis section."""
65+
functional_test_scorecard = self._scorecard.functional_test_scorecard
66+
if functional_test_scorecard is None:
67+
return
68+
6569
tokenizer = encoding_for_model("gpt-4o")
6670

6771
# Never collapse this section - always show the statistics
6872
self.add_title("Tool Call Statistics", 2)
6973
self.add_table_header(["Metric", "Total", "Average", "Min", "Max"])
7074

71-
steps = self._scorecard.functional_test_scorecard.steps
75+
steps = functional_test_scorecard.steps
7276

7377
# Analyze tool calls
7478
total_attempted = len(steps)

0 commit comments

Comments
 (0)