Skip to content

Commit a620cd4

Browse files
authored
Add PostgreSQL MCP Server benchmark results (#3)
* Add PostgreSQL MCP Server benchmark results * add meta information
1 parent dcc91ea commit a620cd4

521 files changed

Lines changed: 141172 additions & 0 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
{
2+
"author": {
3+
"name": "InsForge",
4+
"url": "https://insforge.dev"
5+
},
6+
"avatar": "https://insforge.dev/favicon.ico",
7+
"description": "Official InsForge MCP Server that connects AI tools to InsForge's complete backend platform. This gives AI agents the ability to manage databases, execute SQL queries, handle authentication, manage storage buckets, deploy serverless functions, and monitor container logs. All through natural language interactions.",
8+
"homepage": "https://github.com/InsForge/insforge-mcp",
9+
"name": "InsForge"
10+
}

mcp_servers/postgres/insforge/run-1/chinook__customer_data_migration/messages.json

Lines changed: 544 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
{
2+
"task_name": "chinook__customer_data_migration",
3+
"model_name": "claude-sonnet-4.5",
4+
"litellm_run_model_name": "claude-sonnet-4-5-20250929",
5+
"reasoning_effort": "default",
6+
"mcp": "insforge",
7+
"timeout": 3600,
8+
"time": {
9+
"start": "2025-10-19T18:19:37.821320",
10+
"end": "2025-10-19T18:27:29.630157"
11+
},
12+
"agent_execution_time": 471.2874081134796,
13+
"task_execution_time": 471.80882501602173,
14+
"execution_result": {
15+
"success": true,
16+
"error_message": null,
17+
"verification_error": null,
18+
"verification_output": "============================================================\nVerifying Customer Data Migration Task\n============================================================\nLoaded 200 expected customer records\n✅ All 200 customers migrated correctly\n✅ All customers assigned to SupportRepId 3\n✅ All customers have Fax field set to NULL\n✅ Customer data sets match exactly (order-independent)\n\n🎉 Task verification: PASS\n"
19+
},
20+
"token_usage": {
21+
"input_tokens": 653390,
22+
"output_tokens": 23727,
23+
"total_tokens": 677117,
24+
"reasoning_tokens": 0
25+
},
26+
"turn_count": 22
27+
}

mcp_servers/postgres/insforge/run-1/chinook__employee_hierarchy_management/messages.json

Lines changed: 563 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
{
2+
"task_name": "chinook__employee_hierarchy_management",
3+
"model_name": "claude-sonnet-4.5",
4+
"litellm_run_model_name": "claude-sonnet-4-5-20250929",
5+
"reasoning_effort": "default",
6+
"mcp": "insforge",
7+
"timeout": 3600,
8+
"time": {
9+
"start": "2025-10-19T18:27:29.630721",
10+
"end": "2025-10-19T18:29:07.384418"
11+
},
12+
"agent_execution_time": 97.15503311157227,
13+
"task_execution_time": 97.75368189811707,
14+
"execution_result": {
15+
"success": true,
16+
"error_message": null,
17+
"verification_error": null,
18+
"verification_output": "==================================================\nVerifying Task 3: Employee Hierarchy Management\n==================================================\n✅ Employee count and title verification passed\n✅ Specific employee verification passed - all fields match exactly\n✅ Customer assignment verification passed\n✅ Employee performance table verification passed\n✅ Employee deletion and promotion verification passed\n✅ Salary column verification passed\n\n🎉 Task verification: PASS\nAll employee hierarchy management operations completed correctly!\n"
19+
},
20+
"token_usage": {
21+
"input_tokens": 242228,
22+
"output_tokens": 3814,
23+
"total_tokens": 246042,
24+
"reasoning_tokens": 0
25+
},
26+
"turn_count": 18
27+
}

mcp_servers/postgres/insforge/run-1/chinook__sales_and_music_charts/messages.json

Lines changed: 513 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
{
2+
"task_name": "chinook__sales_and_music_charts",
3+
"model_name": "claude-sonnet-4.5",
4+
"litellm_run_model_name": "claude-sonnet-4-5-20250929",
5+
"reasoning_effort": "default",
6+
"mcp": "insforge",
7+
"timeout": 3600,
8+
"time": {
9+
"start": "2025-10-19T18:29:07.385248",
10+
"end": "2025-10-19T18:30:55.001055"
11+
},
12+
"agent_execution_time": 106.8680911064148,
13+
"task_execution_time": 107.61579585075378,
14+
"execution_result": {
15+
"success": false,
16+
"error_message": null,
17+
"verification_error": "Verification failed with no error message",
18+
"verification_output": "==================================================\n❌ Monthly sales row 1 mismatch: expected ('2009-01', 6, Decimal('35.64'), 36, Decimal('5.9400000000000000'), 6), got ('2009-01', 6, Decimal('330.66'), 36, Decimal('9.19'), 6)\n❌ Monthly sales row 2 mismatch: expected ('2009-02', 7, Decimal('37.62'), 38, Decimal('5.3742857142857143'), 7), got ('2009-02', 7, Decimal('334.62'), 38, Decimal('8.81'), 7)\n❌ Monthly sales row 3 mismatch: expected ('2009-03', 7, Decimal('37.62'), 38, Decimal('5.3742857142857143'), 7), got ('2009-03', 7, Decimal('334.62'), 38, Decimal('8.81'), 7)\n❌ Monthly sales row 4 mismatch: expected ('2009-04', 7, Decimal('37.62'), 38, Decimal('5.3742857142857143'), 7), got ('2009-04', 7, Decimal('334.62'), 38, Decimal('8.81'), 7)\n❌ Monthly sales row 5 mismatch: expected ('2009-05', 7, Decimal('37.62'), 38, Decimal('5.3742857142857143'), 7), got ('2009-05', 7, Decimal('334.62'), 38, Decimal('8.81'), 7)\n❌ Total monthly sales mismatches: 60\n\n❌ Task verification: FAIL\n"
19+
},
20+
"token_usage": {
21+
"input_tokens": 260779,
22+
"output_tokens": 5055,
23+
"total_tokens": 265834,
24+
"reasoning_tokens": 0
25+
},
26+
"turn_count": 16
27+
}

mcp_servers/postgres/insforge/run-1/dvdrental__customer_analysis_fix/messages.json

Lines changed: 378 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
{
2+
"task_name": "dvdrental__customer_analysis_fix",
3+
"model_name": "claude-sonnet-4.5",
4+
"litellm_run_model_name": "claude-sonnet-4-5-20250929",
5+
"reasoning_effort": "default",
6+
"mcp": "insforge",
7+
"timeout": 3600,
8+
"time": {
9+
"start": "2025-10-19T18:30:55.001915",
10+
"end": "2025-10-19T18:32:40.592755"
11+
},
12+
"agent_execution_time": 104.66081619262695,
13+
"task_execution_time": 105.59082698822021,
14+
"execution_result": {
15+
"success": false,
16+
"error_message": null,
17+
"verification_error": "Verification failed with no error message",
18+
"verification_output": "======================================================================\nPostgreSQL Task 3 Verification: Fix Customer Analysis Query\n======================================================================\n❌ Row 3 mismatch:\n Expected: (178, 'Marion Snyder', 'Santa Brbara dOeste', 'Brazil', 39, 38, Decimal('194.61'), 'Animation', 'Gina Degeneres', Decimal('5.63892543859649123363'), 'Premium', 'Hamlet Wisdom', 4)\n Actual: (137, 'Rhonda Kennedy', 'Apeldoorn', 'Netherlands', 38, 37, Decimal('191.62'), 'Games', 'Frances Day-Lewis', Decimal('5.94959795321637426901'), 'Premium', 'Amistad Midsummer', 2)\n❌ Row 4 mismatch:\n Expected: (137, 'Rhonda Kennedy', 'Apeldoorn', 'Netherlands', 38, 37, Decimal('191.62'), 'Games', 'Frances Day-Lewis', Decimal('5.94959795321637426901'), 'Premium', 'Amistad Midsummer', 2)\n Actual: (178, 'Marion Snyder', 'Santa Brbara dOeste', 'Brazil', 38, 37, Decimal('189.62'), 'Animation', 'Gina Degeneres', Decimal('5.63892543859649123363'), 'Premium', 'Hamlet Wisdom', 4)\n❌ Row 7 mismatch:\n Expected: (181, 'Ana Bradley', 'Memphis', 'United States', 33, 31, Decimal('167.67'), 'Family', 'Christian Akroyd', Decimal('6.0310329861111111'), 'Premium', 'Island Exorcist', 5)\n Actual: (236, 'Marcia Dean', 'Tanza', 'Philippines', 38, 38, Decimal('165.62'), 'Foreign', 'Dustin Tautou', Decimal('5.25438596491228069357'), 'Premium', 'Alamo Videotape', 4)\n❌ Row 8 mismatch:\n Expected: (410, 'Curtis Irby', 'Richmond Hill', 'Canada', 38, 37, Decimal('167.62'), 'Action', 'Susan Davis', Decimal('5.34350600600600600841'), 'Premium', 'Hanky October', 3)\n Actual: (181, 'Ana Bradley', 'Memphis', 'United States', 32, 30, Decimal('164.68'), 'Family', 'Christian Akroyd', Decimal('6.0310329861111111'), 'Premium', 'Island Exorcist', 5)\n❌ Row 9 mismatch:\n Expected: (236, 'Marcia Dean', 'Tanza', 'Philippines', 39, 39, Decimal('166.61'), 'Foreign', 'Dustin Tautou', Decimal('5.25438596491228069357'), 'Premium', 'Alamo Videotape', 4)\n Actual: (410, 'Curtis Irby', 'Richmond Hill', 'Canada', 37, 36, Decimal('164.63'), 'Action', 'Susan Davis', Decimal('5.34350600600600600841'), 'Premium', 'Hanky October', 3)\n❌ Total mismatches: 571\n\n❌ Task verification: FAIL\n - The query still has issues\n - Please review the duplicate counting problem\n"
19+
},
20+
"token_usage": {
21+
"input_tokens": 216633,
22+
"output_tokens": 5407,
23+
"total_tokens": 222040,
24+
"reasoning_tokens": 0
25+
},
26+
"turn_count": 12
27+
}

mcp_servers/postgres/insforge/run-1/dvdrental__customer_analytics_optimization/messages.json

Lines changed: 267 additions & 0 deletions
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)