eval-protocol · LLiuZheng · Oct 30, 2025 · Nov 3, 2025 · Nov 3, 2025 · Nov 3, 2025
diff --git a/tests/pytest/datasets/gmail_inbox.jsonl b/tests/pytest/datasets/gmail_inbox.jsonl
diff --git a/tests/pytest/datasets/klavis_mcp_test.jsonl b/tests/pytest/datasets/klavis_mcp_test.jsonl
@@ -0,0 +1,2 @@
+{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Gmail. You have access to tools to help you find information." }, { "role": "user", "content": "Fetch the first 5 emails in my gmail inbox and get the sender. Output the sender only." } ], "ground_truth": "The response contains 5 email addresses." }
+{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Notion. You have access to tools to help you find information." }, { "role": "user", "content": "In the notion Toronto guide, help me to find a pizza restaurant which is able to takeout" } ], "ground_truth": "Pizzeria Badiali" }
diff --git a/tests/pytest/test_pytest_klavis_mcp.py b/tests/pytest/test_pytest_klavis_mcp.py
@@ -13,8 +13,9 @@ class ResponseFormat(BaseModel):
     score: float
 
 
+# For the Notion test case, first copy https://painted-tennis-ebc.notion.site/MCPMark-Source-Hub-23181626b6d7805fb3a7d59c63033819 into your own Notion workspace.
 @evaluation_test(
-    input_dataset=["tests/pytest/datasets/gmail_inbox.jsonl"],
+    input_dataset=["tests/pytest/datasets/klavis_mcp_test.jsonl"],
     rollout_processor=AgentRolloutProcessor(),
     completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"}],
     mode="pointwise",
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Gmail. You have access to tools to help you find information." }, { "role": "user", "content": "Fetch the first 5 emails in my gmail inbox and get the sender. Output the sender only." } ], "ground_truth": "The response contains 5 email addresses." }
cursor Bot · Nov 14, 2025 — Bug: Judging Descriptions, Not Content. The Gmail test's ground truth is a meta-description ("The response contains 5 email addresses.") rather than actual expected content. The LLM judge checks if the output literally contains this string, not whether the output actually has 5 email addresses. This differs from the Notion test, which uses actual expected content ("Pizzeria Badiali"), and will cause the Gmail test to fail even with correct outputs.
		{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Notion. You have access to tools to help you find information." }, { "role": "user", "content": "In the notion Toronto guide, help me to find a pizza restaurant which is able to takeout" } ], "ground_truth": "Pizzeria Badiali" }