From bce2daf825a426ef70ee3c92257b1652b6920d60 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=E2=96=B3M=CE=9ES?= Date: Thu, 24 Apr 2025 11:45:57 +0200 Subject: [PATCH 1/9] feat: update gitignore for IDE --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index a14702c4..e06d00f8 100644 --- a/.gitignore +++ b/.gitignore @@ -32,3 +32,5 @@ report.[0-9]_.[0-9]_.[0-9]_.[0-9]_.json # Finder (MacOS) folder config .DS_Store + +.qodo \ No newline at end of file From f9bbee4308ca84f1df1967d29ee2b7f1b1b0854d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=E2=96=B3M=CE=9ES?= Date: Thu, 24 Apr 2025 12:10:36 +0200 Subject: [PATCH 2/9] feat: implement Elo-based credit assignment for grant applications --- scripts/credit-assignment-pairwise.ts | 117 ++++++++++++++++++++++++++ 1 file changed, 117 insertions(+) create mode 100644 scripts/credit-assignment-pairwise.ts diff --git a/scripts/credit-assignment-pairwise.ts b/scripts/credit-assignment-pairwise.ts new file mode 100644 index 00000000..3560f73b --- /dev/null +++ b/scripts/credit-assignment-pairwise.ts @@ -0,0 +1,117 @@ +import { + fetchModelSpecs, + getApplicationId, + loadApplicationsFromDirectory, + loadReview, + saveFile, +} from "../utils/utils"; + +import { creditAssignmentAgent } from "../agents/agents/credit-assigner"; + +// Core Elo scoring parameters +const BASE_RATING = 1000; +const K_FACTOR = 32; // Adjust this to control how volatile the scores are + +// Helper: Calculate expected score between two ratings +function expectedScore(ratingA: number, ratingB: number): number { + return 1 / (1 + Math.pow(10, (ratingB - ratingA) / 400)); +} + +// Helper: Update Elo rating after a matchup +function updateElo(rating: number, expected: number, actual: number): number { + return rating + K_FACTOR * (actual - expected); +} + +async function main() { + // Load all applications from the directory + const applications = loadApplicationsFromDirectory(); + console.log(`Processing ${applications.length} applications...`); + + // Get the available review models/agents + const modelSpecs = await fetchModelSpecs(); + + // Load all reviews for all agents + const reviewsByAgent = modelSpecs.reduce((acc, agent) => { + const agentName = agent?.name; + acc[agentName] = applications.map((app) => { + const id = getApplicationId(app); + const { score, ...review } = loadReview(id, agentName); + return { id, name: review.project?.title || id, review }; + }); + return acc; + }, {} as Record); + + // Loop through each agent's reviews + for (const [agentName, reviews] of Object.entries(reviewsByAgent)) { + console.log(`\n🎯 Running Elo tournament for agent: ${agentName}`); + + // Initialize all ratings + const ratings: Record = {}; + for (const { id } of reviews) { + ratings[id] = BASE_RATING; + } + + // Run simulated pairwise matchups (round-robin style) + for (let i = 0; i < reviews.length; i++) { + for (let j = i + 1; j < reviews.length; j++) { + const appA = reviews[i]; + const appB = reviews[j]; + + const prompt = ` +You are comparing two grant applications. + +Based on their reviews, which project deserves *more* funding? + +Respond with just "A" or "B". + +Project A: +${JSON.stringify(appA.review, null, 2)} + +Project B: +${JSON.stringify(appB.review, null, 2)} +`; + + const result = await creditAssignmentAgent.generate(prompt); + const winner = result.text.trim().toUpperCase(); + + const ratingA = ratings[appA.id]; + const ratingB = ratings[appB.id]; + const expectedA = expectedScore(ratingA, ratingB); + const expectedB = expectedScore(ratingB, ratingA); + + if (winner === "A") { + ratings[appA.id] = updateElo(ratingA, expectedA, 1); + ratings[appB.id] = updateElo(ratingB, expectedB, 0); + } else if (winner === "B") { + ratings[appA.id] = updateElo(ratingA, expectedA, 0); + ratings[appB.id] = updateElo(ratingB, expectedB, 1); + } else { + console.warn(`⚠️ Unexpected response: ${result.text}`); + } + } + } + + // Normalize scores so they sum to 1 (for funding allocation) + const totalScore = Object.values(ratings).reduce((sum, score) => sum + score, 0); + const normalized = Object.entries(ratings).map(([id, score]) => { + const name = reviews.find((r) => r.id === id)?.name || id; + return { + id, + name, + score: (score / totalScore).toFixed(6), + }; + }); + + // Prepare output CSV format + const output = ["id,name,score", ...normalized.map((r) => `${r.id},${r.name},${r.score}`)].join("\n"); + + // Save results to file + saveFile(`scores/elo-credit-assignment-${agentName}.csv`, output); + console.log(`✅ Saved results for ${agentName}`); + } +} + +main().catch((error) => { + console.error("❌ Error:", error); + process.exit(1); +}); From 1655239559b35e09785eef788b4a477a9a7531bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=E2=96=B3M=CE=9ES?= Date: Thu, 24 Apr 2025 13:38:16 +0200 Subject: [PATCH 3/9] refactor: switch credit assignment model to OpenAI and remove unused pairwise script --- agents/agents/credit-assigner.ts | 3 +- .../credit-assignment-gitcoin-communist.csv | 6 ++ ...edit-assignment-open-source-capitalist.csv | 5 ++ scores/credit-assignment-regenerator.csv | 6 ++ ...ent-gitcoin-communist-from-review-only.csv | 6 ++ ...ment-gitcoin-communist-review-and-data.csv | 6 ++ ...lo-credit-assignment-gitcoin-communist.csv | 6 ++ ...pen-source-capitalist-from-review-only.csv | 6 ++ ...open-source-capitalist-review-and-data.csv | 6 ++ ...edit-assignment-open-source-capitalist.csv | 6 ++ ...ssignment-regenerator-from-review-only.csv | 6 ++ ...assignment-regenerator-review-and-data.csv | 6 ++ scores/elo-credit-assignment-regenerator.csv | 6 ++ ...t-pairwise.ts => credit-assignment-elo.ts} | 90 ++++++++++++++----- utils/utils.ts | 9 ++ 15 files changed, 149 insertions(+), 24 deletions(-) create mode 100644 scores/credit-assignment-gitcoin-communist.csv create mode 100644 scores/credit-assignment-open-source-capitalist.csv create mode 100644 scores/credit-assignment-regenerator.csv create mode 100644 scores/elo-credit-assignment-gitcoin-communist-from-review-only.csv create mode 100644 scores/elo-credit-assignment-gitcoin-communist-review-and-data.csv create mode 100644 scores/elo-credit-assignment-gitcoin-communist.csv create mode 100644 scores/elo-credit-assignment-open-source-capitalist-from-review-only.csv create mode 100644 scores/elo-credit-assignment-open-source-capitalist-review-and-data.csv create mode 100644 scores/elo-credit-assignment-open-source-capitalist.csv create mode 100644 scores/elo-credit-assignment-regenerator-from-review-only.csv create mode 100644 scores/elo-credit-assignment-regenerator-review-and-data.csv create mode 100644 scores/elo-credit-assignment-regenerator.csv rename scripts/{credit-assignment-pairwise.ts => credit-assignment-elo.ts} (50%) diff --git a/agents/agents/credit-assigner.ts b/agents/agents/credit-assigner.ts index 895bfd61..8ac9defd 100644 --- a/agents/agents/credit-assigner.ts +++ b/agents/agents/credit-assigner.ts @@ -10,5 +10,6 @@ export const creditAssignmentAgent = new Agent({ You are given a list of applications reviews and you should assign a score between 0.00 and 1.00 to each review based on how much funding the project deserve. The total score of all reviews should be 1.00. `, - model: google("gemini-2.0-flash-thinking-exp-01-21"), + //model: google("gemini-2.0-flash-thinking-exp-01-21"), + model: openai("gpt-4.1-2025-04-14"), }); diff --git a/scores/credit-assignment-gitcoin-communist.csv b/scores/credit-assignment-gitcoin-communist.csv new file mode 100644 index 00000000..921c4c77 --- /dev/null +++ b/scores/credit-assignment-gitcoin-communist.csv @@ -0,0 +1,6 @@ +id,name,score +42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e,GainForest,0.3125 +42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca,Treegens DAO,0.2250 +42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,ÆRTH - Planetary AI,0.0875 +42161-867-0xe573019b9f23a496663f5944a83c8acdc99792bfc5f5ad603ee8f6cb0f46f9fe,Hydrapad,0.2500 +42161-865-0x9119659eb8173b32bb4423f83702ee30c1e1db49ae0c07b00263bf3ea7f4d4ef,Deep Funding,0.1250 \ No newline at end of file diff --git a/scores/credit-assignment-open-source-capitalist.csv b/scores/credit-assignment-open-source-capitalist.csv new file mode 100644 index 00000000..6fe5960c --- /dev/null +++ b/scores/credit-assignment-open-source-capitalist.csv @@ -0,0 +1,5 @@ +42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e,GainForest,0.3500 +42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca,Treegens DAO,0.2500 +42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,ÆRTH - Planetary AI,0.1000 +42161-867-0xe573019b9f23a496663f5944a83c8acdc99792bfc5f5ad603ee8f6cb0f46f9fe,Hydrapad,0.1500 +42161-865-0x9119659eb8173b32bb4423f83702ee30c1e1db49ae0c07b00263bf3ea7f4d4ef,Deep Funding,0.1500 \ No newline at end of file diff --git a/scores/credit-assignment-regenerator.csv b/scores/credit-assignment-regenerator.csv new file mode 100644 index 00000000..29d23592 --- /dev/null +++ b/scores/credit-assignment-regenerator.csv @@ -0,0 +1,6 @@ +id,name,score +42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e,GainForest,0.3800 +42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca,Treegens DAO,0.1800 +42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c0c093c3efc5a6f1b518a3792d5e8b721860,ÆRTH - Planetary AI,0.0800 +42161-867-0xe573019b9f23a496663f5944a83c8acdc99792bfc5f5ad603ee8f6cb0f46f9fe,Hydrapad,0.2300 +42161-865-0x9119659eb8173b32bb4423f83702ee30c1e1db49ae0c07b00263bf3ea7f4d4ef,Deep Funding,0.1300 \ No newline at end of file diff --git a/scores/elo-credit-assignment-gitcoin-communist-from-review-only.csv b/scores/elo-credit-assignment-gitcoin-communist-from-review-only.csv new file mode 100644 index 00000000..fae0c4f7 --- /dev/null +++ b/scores/elo-credit-assignment-gitcoin-communist-from-review-only.csv @@ -0,0 +1,6 @@ +id,name,score +42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e,42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e,0.211946 +42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca,42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca,0.206004 +42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,0.187525 +42161-867-0xe573019b9f23a496663f5944a83c8acdc99792bfc5f5ad603ee8f6cb0f46f9fe,42161-867-0xe573019b9f23a496663f5944a83c8acdc99792bfc5f5ad603ee8f6cb0f46f9fe,0.200414 +42161-865-0x9119659eb8173b32bb4423f83702ee30c1e1db49ae0c07b00263bf3ea7f4d4ef,42161-865-0x9119659eb8173b32bb4423f83702ee30c1e1db49ae0c07b00263bf3ea7f4d4ef,0.194112 \ No newline at end of file diff --git a/scores/elo-credit-assignment-gitcoin-communist-review-and-data.csv b/scores/elo-credit-assignment-gitcoin-communist-review-and-data.csv new file mode 100644 index 00000000..21baec7f --- /dev/null +++ b/scores/elo-credit-assignment-gitcoin-communist-review-and-data.csv @@ -0,0 +1,6 @@ +id,name,score +42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e,GainForest,0.211946 +42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca,Treegens DAO🌳,0.199604 +42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,AERTH,0.187817 +42161-867-0xe573019b9f23a496663f5944a83c8acdc99792bfc5f5ad603ee8f6cb0f46f9fe,Hydrapad,0.200694 +42161-865-0x9119659eb8173b32bb4423f83702ee30c1e1db49ae0c07b00263bf3ea7f4d4ef,Deep Funding,0.199940 \ No newline at end of file diff --git a/scores/elo-credit-assignment-gitcoin-communist.csv b/scores/elo-credit-assignment-gitcoin-communist.csv new file mode 100644 index 00000000..44db4cf4 --- /dev/null +++ b/scores/elo-credit-assignment-gitcoin-communist.csv @@ -0,0 +1,6 @@ +id,name,score +42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e,GainForest,0.211946 +42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca,Treegens DAO🌳,0.199604 +42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,0.187817 +42161-867-0xe573019b9f23a496663f5944a83c8acdc99792bfc5f5ad603ee8f6cb0f46f9fe,Hydrapad,0.194294 +42161-865-0x9119659eb8173b32bb4423f83702ee30c1e1db49ae0c07b00263bf3ea7f4d4ef,Deep Funding,0.206340 \ No newline at end of file diff --git a/scores/elo-credit-assignment-open-source-capitalist-from-review-only.csv b/scores/elo-credit-assignment-open-source-capitalist-from-review-only.csv new file mode 100644 index 00000000..b54ee566 --- /dev/null +++ b/scores/elo-credit-assignment-open-source-capitalist-from-review-only.csv @@ -0,0 +1,6 @@ +id,name,score +42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e,42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e,0.211946 +42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca,42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca,0.206004 +42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,0.193925 +42161-867-0xe573019b9f23a496663f5944a83c8acdc99792bfc5f5ad603ee8f6cb0f46f9fe,42161-867-0xe573019b9f23a496663f5944a83c8acdc99792bfc5f5ad603ee8f6cb0f46f9fe,0.200120 +42161-865-0x9119659eb8173b32bb4423f83702ee30c1e1db49ae0c07b00263bf3ea7f4d4ef,42161-865-0x9119659eb8173b32bb4423f83702ee30c1e1db49ae0c07b00263bf3ea7f4d4ef,0.188006 \ No newline at end of file diff --git a/scores/elo-credit-assignment-open-source-capitalist-review-and-data.csv b/scores/elo-credit-assignment-open-source-capitalist-review-and-data.csv new file mode 100644 index 00000000..21baec7f --- /dev/null +++ b/scores/elo-credit-assignment-open-source-capitalist-review-and-data.csv @@ -0,0 +1,6 @@ +id,name,score +42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e,GainForest,0.211946 +42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca,Treegens DAO🌳,0.199604 +42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,AERTH,0.187817 +42161-867-0xe573019b9f23a496663f5944a83c8acdc99792bfc5f5ad603ee8f6cb0f46f9fe,Hydrapad,0.200694 +42161-865-0x9119659eb8173b32bb4423f83702ee30c1e1db49ae0c07b00263bf3ea7f4d4ef,Deep Funding,0.199940 \ No newline at end of file diff --git a/scores/elo-credit-assignment-open-source-capitalist.csv b/scores/elo-credit-assignment-open-source-capitalist.csv new file mode 100644 index 00000000..44db4cf4 --- /dev/null +++ b/scores/elo-credit-assignment-open-source-capitalist.csv @@ -0,0 +1,6 @@ +id,name,score +42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e,GainForest,0.211946 +42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca,Treegens DAO🌳,0.199604 +42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,0.187817 +42161-867-0xe573019b9f23a496663f5944a83c8acdc99792bfc5f5ad603ee8f6cb0f46f9fe,Hydrapad,0.194294 +42161-865-0x9119659eb8173b32bb4423f83702ee30c1e1db49ae0c07b00263bf3ea7f4d4ef,Deep Funding,0.206340 \ No newline at end of file diff --git a/scores/elo-credit-assignment-regenerator-from-review-only.csv b/scores/elo-credit-assignment-regenerator-from-review-only.csv new file mode 100644 index 00000000..774afb03 --- /dev/null +++ b/scores/elo-credit-assignment-regenerator-from-review-only.csv @@ -0,0 +1,6 @@ +id,name,score +42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e,42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e,0.211946 +42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca,42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca,0.199898 +42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,0.194192 +42161-867-0xe573019b9f23a496663f5944a83c8acdc99792bfc5f5ad603ee8f6cb0f46f9fe,42161-867-0xe573019b9f23a496663f5944a83c8acdc99792bfc5f5ad603ee8f6cb0f46f9fe,0.205938 +42161-865-0x9119659eb8173b32bb4423f83702ee30c1e1db49ae0c07b00263bf3ea7f4d4ef,42161-865-0x9119659eb8173b32bb4423f83702ee30c1e1db49ae0c07b00263bf3ea7f4d4ef,0.188027 \ No newline at end of file diff --git a/scores/elo-credit-assignment-regenerator-review-and-data.csv b/scores/elo-credit-assignment-regenerator-review-and-data.csv new file mode 100644 index 00000000..4a5d7be6 --- /dev/null +++ b/scores/elo-credit-assignment-regenerator-review-and-data.csv @@ -0,0 +1,6 @@ +id,name,score +42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e,GainForest,0.211946 +42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca,Treegens DAO🌳,0.199604 +42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,AERTH, 0.187817 +42161-867-0xe573019b9f23a496663f5944a83c8acdc99792bfc5f5ad603ee8f6cb0f46f9fe,Hydrapad,0.200694 +42161-865-0x9119659eb8173b32bb4423f83702ee30c1e1db49ae0c07b00263bf3ea7f4d4ef,Deep Funding,0.199940 \ No newline at end of file diff --git a/scores/elo-credit-assignment-regenerator.csv b/scores/elo-credit-assignment-regenerator.csv new file mode 100644 index 00000000..27e2531b --- /dev/null +++ b/scores/elo-credit-assignment-regenerator.csv @@ -0,0 +1,6 @@ +id,name,score +42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e,GainForest,0.211946 +42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca,Treegens DAO🌳,0.199604 +42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,0.187817 +42161-867-0xe573019b9f23a496663f5944a83c8acdc99792bfc5f5ad603ee8f6cb0f46f9fe,Hydrapad,0.200694 +42161-865-0x9119659eb8173b32bb4423f83702ee30c1e1db49ae0c07b00263bf3ea7f4d4ef,Deep Funding,0.199940 \ No newline at end of file diff --git a/scripts/credit-assignment-pairwise.ts b/scripts/credit-assignment-elo.ts similarity index 50% rename from scripts/credit-assignment-pairwise.ts rename to scripts/credit-assignment-elo.ts index 3560f73b..fc7e48f1 100644 --- a/scripts/credit-assignment-pairwise.ts +++ b/scripts/credit-assignment-elo.ts @@ -4,6 +4,10 @@ import { loadApplicationsFromDirectory, loadReview, saveFile, + loadApplication, + loadKarmaGap, + loadResearch, + getProjectName, } from "../utils/utils"; import { creditAssignmentAgent } from "../agents/agents/credit-assigner"; @@ -27,55 +31,95 @@ async function main() { const applications = loadApplicationsFromDirectory(); console.log(`Processing ${applications.length} applications...`); + // Pre-load all necessary data for each application + console.log("Pre-loading application data (app, research, karmagap)..."); + const applicationDataMap = new Map(); + for (const app of applications) { + const id = getApplicationId(app); + const name = getProjectName(app) || id; + applicationDataMap.set(id, { + id, + name, + application: loadApplication(id), + research: loadResearch(id), + karmaGap: loadKarmaGap(id), + }); + } + console.log("Finished pre-loading data."); + // Get the available review models/agents const modelSpecs = await fetchModelSpecs(); - // Load all reviews for all agents + // Load all reviews for all agents, associating with pre-loaded data const reviewsByAgent = modelSpecs.reduce((acc, agent) => { const agentName = agent?.name; - acc[agentName] = applications.map((app) => { - const id = getApplicationId(app); - const { score, ...review } = loadReview(id, agentName); - return { id, name: review.project?.title || id, review }; - }); + acc[agentName] = applications + .map((app) => { + const id = getApplicationId(app); + const baseData = applicationDataMap.get(id); + if (!baseData) return null; + + const reviewData = loadReview(id, agentName); + if (!reviewData) return null; + + return { ...baseData, review: reviewData }; + }) + .filter(Boolean); return acc; - }, {} as Record); + }, {} as Record); // Loop through each agent's reviews - for (const [agentName, reviews] of Object.entries(reviewsByAgent)) { + for (const [agentName, agentApplicationsData] of Object.entries(reviewsByAgent)) { console.log(`\n🎯 Running Elo tournament for agent: ${agentName}`); // Initialize all ratings const ratings: Record = {}; - for (const { id } of reviews) { + for (const { id } of agentApplicationsData) { ratings[id] = BASE_RATING; } // Run simulated pairwise matchups (round-robin style) - for (let i = 0; i < reviews.length; i++) { - for (let j = i + 1; j < reviews.length; j++) { - const appA = reviews[i]; - const appB = reviews[j]; + for (let i = 0; i < agentApplicationsData.length; i++) { + for (let j = i + 1; j < agentApplicationsData.length; j++) { + const appA = agentApplicationsData[i]!; + const appB = agentApplicationsData[j]!; + + // Prepare data snippets for the prompt (avoid stringifying huge objects) + const projectAData = { + title: appA.name, + application_summary: appA.application?.project?.description, + research_summary: appA.research?.summary, + karmagap_score: appA.karmaGap?.score, + reviewer_comment: appA.review?.comments + }; + const projectBData = { + title: appB.name, + application_summary: appB.application?.project?.description, + research_summary: appB.research?.summary, + karmagap_score: appB.karmaGap?.score, + reviewer_comment: appB.review?.comments + }; const prompt = ` -You are comparing two grant applications. +You are a grant allocator reviewing two projects. Consider all available information. -Based on their reviews, which project deserves *more* funding? +Choose the one that deserves *more funding*, based on impact, clarity, roadmap, potential, and overall quality presented in the data below. -Respond with just "A" or "B". +You are essentially a judge in the tournament which gives a score based on each Agent's review, so it's important you strongly consider reviewer_comment along with the metric data provided. +Respond ONLY with "A" or "B". Do NOT explain. -Project A: -${JSON.stringify(appA.review, null, 2)} +--- Project A --- +${JSON.stringify(projectAData, null, 2)} -Project B: -${JSON.stringify(appB.review, null, 2)} +--- Project B --- +${JSON.stringify(projectBData, null, 2)} `; const result = await creditAssignmentAgent.generate(prompt); const winner = result.text.trim().toUpperCase(); - const ratingA = ratings[appA.id]; - const ratingB = ratings[appB.id]; + const ratingA = ratings[appA.id]!; + const ratingB = ratings[appB.id]!; const expectedA = expectedScore(ratingA, ratingB); const expectedB = expectedScore(ratingB, ratingA); @@ -94,7 +138,7 @@ ${JSON.stringify(appB.review, null, 2)} // Normalize scores so they sum to 1 (for funding allocation) const totalScore = Object.values(ratings).reduce((sum, score) => sum + score, 0); const normalized = Object.entries(ratings).map(([id, score]) => { - const name = reviews.find((r) => r.id === id)?.name || id; + const name = applicationDataMap.get(id)?.name || id; return { id, name, diff --git a/utils/utils.ts b/utils/utils.ts index f94498f3..fe40d89b 100644 --- a/utils/utils.ts +++ b/utils/utils.ts @@ -136,6 +136,15 @@ export function loadReview(applicationId: string, agent: string): any { return null; } } +export function loadApplication(applicationId: string): any { + try { + return JSON.parse( + readFileSync(getApplicationPath(applicationId) + "/application.json", "utf8") + ); + } catch (error) { + return null; + } +} export function loadKarmaGap(applicationId: string): any { try { return JSON.parse( From ec51cc76876207a192d86c83187f083aa7f0f384 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=E2=96=B3M=CE=9ES?= Date: Fri, 9 May 2025 14:16:38 +0200 Subject: [PATCH 4/9] refactor: standardize application ID usage in credit assignment script --- scripts/credit-assignment-elo.ts | 44 ++++++++++++++++---------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/scripts/credit-assignment-elo.ts b/scripts/credit-assignment-elo.ts index fc7e48f1..d10cf0ef 100644 --- a/scripts/credit-assignment-elo.ts +++ b/scripts/credit-assignment-elo.ts @@ -35,14 +35,14 @@ async function main() { console.log("Pre-loading application data (app, research, karmagap)..."); const applicationDataMap = new Map(); for (const app of applications) { - const id = getApplicationId(app); - const name = getProjectName(app) || id; - applicationDataMap.set(id, { - id, + const applicationId = getApplicationId(app); + const name = getProjectName(app) || applicationId; + applicationDataMap.set(applicationId, { + applicationId, name, - application: loadApplication(id), - research: loadResearch(id), - karmaGap: loadKarmaGap(id), + application: loadApplication(applicationId), + research: loadResearch(applicationId), + karmaGap: loadKarmaGap(applicationId), }); } console.log("Finished pre-loading data."); @@ -55,11 +55,11 @@ async function main() { const agentName = agent?.name; acc[agentName] = applications .map((app) => { - const id = getApplicationId(app); - const baseData = applicationDataMap.get(id); + const applicationId = getApplicationId(app); + const baseData = applicationDataMap.get(applicationId); if (!baseData) return null; - const reviewData = loadReview(id, agentName); + const reviewData = loadReview(applicationId, agentName); if (!reviewData) return null; return { ...baseData, review: reviewData }; @@ -74,8 +74,8 @@ async function main() { // Initialize all ratings const ratings: Record = {}; - for (const { id } of agentApplicationsData) { - ratings[id] = BASE_RATING; + for (const { applicationId } of agentApplicationsData) { + ratings[applicationId] = BASE_RATING; } // Run simulated pairwise matchups (round-robin style) @@ -118,17 +118,17 @@ ${JSON.stringify(projectBData, null, 2)} const result = await creditAssignmentAgent.generate(prompt); const winner = result.text.trim().toUpperCase(); - const ratingA = ratings[appA.id]!; - const ratingB = ratings[appB.id]!; + const ratingA = ratings[appA.applicationId]!; + const ratingB = ratings[appB.applicationId]!; const expectedA = expectedScore(ratingA, ratingB); const expectedB = expectedScore(ratingB, ratingA); if (winner === "A") { - ratings[appA.id] = updateElo(ratingA, expectedA, 1); - ratings[appB.id] = updateElo(ratingB, expectedB, 0); + ratings[appA.applicationId] = updateElo(ratingA, expectedA, 1); + ratings[appB.applicationId] = updateElo(ratingB, expectedB, 0); } else if (winner === "B") { - ratings[appA.id] = updateElo(ratingA, expectedA, 0); - ratings[appB.id] = updateElo(ratingB, expectedB, 1); + ratings[appA.applicationId] = updateElo(ratingA, expectedA, 0); + ratings[appB.applicationId] = updateElo(ratingB, expectedB, 1); } else { console.warn(`⚠️ Unexpected response: ${result.text}`); } @@ -137,17 +137,17 @@ ${JSON.stringify(projectBData, null, 2)} // Normalize scores so they sum to 1 (for funding allocation) const totalScore = Object.values(ratings).reduce((sum, score) => sum + score, 0); - const normalized = Object.entries(ratings).map(([id, score]) => { - const name = applicationDataMap.get(id)?.name || id; + const normalized = Object.entries(ratings).map(([applicationId, score]) => { + const name = applicationDataMap.get(applicationId)?.name || applicationId; return { - id, + applicationId, name, score: (score / totalScore).toFixed(6), }; }); // Prepare output CSV format - const output = ["id,name,score", ...normalized.map((r) => `${r.id},${r.name},${r.score}`)].join("\n"); + const output = ["applicationId,name,score", ...normalized.map((r) => `${r.applicationId},${r.name},${r.score}`)].join("\n"); // Save results to file saveFile(`scores/elo-credit-assignment-${agentName}.csv`, output); From cb51ec55e4d30d04e7012bbaffaca0bbdaa9f941 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=E2=96=B3M=CE=9ES?= Date: Tue, 29 Apr 2025 08:55:07 +0200 Subject: [PATCH 5/9] fix: update scores in Elo credit assignment CSV files for consistency --- ...unist-review-and-data-emphasize-review.csv | 6 + ...lo-credit-assignment-gitcoin-communist.csv | 10 +- ...alist-review-and-data-emphasize-review.csv | 6 + ...edit-assignment-open-source-capitalist.csv | 10 +- ...rator-review-and-data-emphasize-review.csv | 6 + scores/elo-credit-assignment-regenerator.csv | 10 +- .../credit-assignment-elo-how-much-better.ts | 199 ++++++++++++++++++ 7 files changed, 232 insertions(+), 15 deletions(-) create mode 100644 scores/elo-credit-assignment-gitcoin-communist-review-and-data-emphasize-review.csv create mode 100644 scores/elo-credit-assignment-open-source-capitalist-review-and-data-emphasize-review.csv create mode 100644 scores/elo-credit-assignment-regenerator-review-and-data-emphasize-review.csv create mode 100644 scripts/credit-assignment-elo-how-much-better.ts diff --git a/scores/elo-credit-assignment-gitcoin-communist-review-and-data-emphasize-review.csv b/scores/elo-credit-assignment-gitcoin-communist-review-and-data-emphasize-review.csv new file mode 100644 index 00000000..44db4cf4 --- /dev/null +++ b/scores/elo-credit-assignment-gitcoin-communist-review-and-data-emphasize-review.csv @@ -0,0 +1,6 @@ +id,name,score +42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e,GainForest,0.211946 +42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca,Treegens DAO🌳,0.199604 +42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,0.187817 +42161-867-0xe573019b9f23a496663f5944a83c8acdc99792bfc5f5ad603ee8f6cb0f46f9fe,Hydrapad,0.194294 +42161-865-0x9119659eb8173b32bb4423f83702ee30c1e1db49ae0c07b00263bf3ea7f4d4ef,Deep Funding,0.206340 \ No newline at end of file diff --git a/scores/elo-credit-assignment-gitcoin-communist.csv b/scores/elo-credit-assignment-gitcoin-communist.csv index 44db4cf4..9884871a 100644 --- a/scores/elo-credit-assignment-gitcoin-communist.csv +++ b/scores/elo-credit-assignment-gitcoin-communist.csv @@ -1,6 +1,6 @@ id,name,score -42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e,GainForest,0.211946 -42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca,Treegens DAO🌳,0.199604 -42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,0.187817 -42161-867-0xe573019b9f23a496663f5944a83c8acdc99792bfc5f5ad603ee8f6cb0f46f9fe,Hydrapad,0.194294 -42161-865-0x9119659eb8173b32bb4423f83702ee30c1e1db49ae0c07b00263bf3ea7f4d4ef,Deep Funding,0.206340 \ No newline at end of file +42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e,GainForest,0.201165 +42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca,Treegens DAO🌳,0.201116 +42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,0.196483 +42161-867-0xe573019b9f23a496663f5944a83c8acdc99792bfc5f5ad603ee8f6cb0f46f9fe,Hydrapad,0.200631 +42161-865-0x9119659eb8173b32bb4423f83702ee30c1e1db49ae0c07b00263bf3ea7f4d4ef,Deep Funding,0.200605 \ No newline at end of file diff --git a/scores/elo-credit-assignment-open-source-capitalist-review-and-data-emphasize-review.csv b/scores/elo-credit-assignment-open-source-capitalist-review-and-data-emphasize-review.csv new file mode 100644 index 00000000..44db4cf4 --- /dev/null +++ b/scores/elo-credit-assignment-open-source-capitalist-review-and-data-emphasize-review.csv @@ -0,0 +1,6 @@ +id,name,score +42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e,GainForest,0.211946 +42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca,Treegens DAO🌳,0.199604 +42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,0.187817 +42161-867-0xe573019b9f23a496663f5944a83c8acdc99792bfc5f5ad603ee8f6cb0f46f9fe,Hydrapad,0.194294 +42161-865-0x9119659eb8173b32bb4423f83702ee30c1e1db49ae0c07b00263bf3ea7f4d4ef,Deep Funding,0.206340 \ No newline at end of file diff --git a/scores/elo-credit-assignment-open-source-capitalist.csv b/scores/elo-credit-assignment-open-source-capitalist.csv index 44db4cf4..9884871a 100644 --- a/scores/elo-credit-assignment-open-source-capitalist.csv +++ b/scores/elo-credit-assignment-open-source-capitalist.csv @@ -1,6 +1,6 @@ id,name,score -42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e,GainForest,0.211946 -42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca,Treegens DAO🌳,0.199604 -42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,0.187817 -42161-867-0xe573019b9f23a496663f5944a83c8acdc99792bfc5f5ad603ee8f6cb0f46f9fe,Hydrapad,0.194294 -42161-865-0x9119659eb8173b32bb4423f83702ee30c1e1db49ae0c07b00263bf3ea7f4d4ef,Deep Funding,0.206340 \ No newline at end of file +42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e,GainForest,0.201165 +42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca,Treegens DAO🌳,0.201116 +42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,0.196483 +42161-867-0xe573019b9f23a496663f5944a83c8acdc99792bfc5f5ad603ee8f6cb0f46f9fe,Hydrapad,0.200631 +42161-865-0x9119659eb8173b32bb4423f83702ee30c1e1db49ae0c07b00263bf3ea7f4d4ef,Deep Funding,0.200605 \ No newline at end of file diff --git a/scores/elo-credit-assignment-regenerator-review-and-data-emphasize-review.csv b/scores/elo-credit-assignment-regenerator-review-and-data-emphasize-review.csv new file mode 100644 index 00000000..27e2531b --- /dev/null +++ b/scores/elo-credit-assignment-regenerator-review-and-data-emphasize-review.csv @@ -0,0 +1,6 @@ +id,name,score +42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e,GainForest,0.211946 +42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca,Treegens DAO🌳,0.199604 +42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,0.187817 +42161-867-0xe573019b9f23a496663f5944a83c8acdc99792bfc5f5ad603ee8f6cb0f46f9fe,Hydrapad,0.200694 +42161-865-0x9119659eb8173b32bb4423f83702ee30c1e1db49ae0c07b00263bf3ea7f4d4ef,Deep Funding,0.199940 \ No newline at end of file diff --git a/scores/elo-credit-assignment-regenerator.csv b/scores/elo-credit-assignment-regenerator.csv index 27e2531b..9884871a 100644 --- a/scores/elo-credit-assignment-regenerator.csv +++ b/scores/elo-credit-assignment-regenerator.csv @@ -1,6 +1,6 @@ id,name,score -42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e,GainForest,0.211946 -42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca,Treegens DAO🌳,0.199604 -42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,0.187817 -42161-867-0xe573019b9f23a496663f5944a83c8acdc99792bfc5f5ad603ee8f6cb0f46f9fe,Hydrapad,0.200694 -42161-865-0x9119659eb8173b32bb4423f83702ee30c1e1db49ae0c07b00263bf3ea7f4d4ef,Deep Funding,0.199940 \ No newline at end of file +42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e,GainForest,0.201165 +42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca,Treegens DAO🌳,0.201116 +42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,0.196483 +42161-867-0xe573019b9f23a496663f5944a83c8acdc99792bfc5f5ad603ee8f6cb0f46f9fe,Hydrapad,0.200631 +42161-865-0x9119659eb8173b32bb4423f83702ee30c1e1db49ae0c07b00263bf3ea7f4d4ef,Deep Funding,0.200605 \ No newline at end of file diff --git a/scripts/credit-assignment-elo-how-much-better.ts b/scripts/credit-assignment-elo-how-much-better.ts new file mode 100644 index 00000000..83abc4bb --- /dev/null +++ b/scripts/credit-assignment-elo-how-much-better.ts @@ -0,0 +1,199 @@ +import { + fetchModelSpecs, + getApplicationId, + loadApplicationsFromDirectory, + loadReview, + saveFile, + loadApplication, + loadKarmaGap, + loadResearch, + getProjectName, +} from "../utils/utils"; + +import { creditAssignmentAgent } from "../agents/agents/credit-assigner"; + +// Core Elo scoring parameters +const BASE_RATING = 1000; +const K_FACTOR = 32; // Adjust this to control how volatile the scores are + +// Helper: Calculate expected score between two ratings +function expectedScore(ratingA: number, ratingB: number): number { + return 1 / (1 + Math.pow(10, (ratingB - ratingA) / 400)); +} + +// Helper: Update Elo rating after a matchup +function updateElo(rating: number, expected: number, actual: number): number { + return rating + K_FACTOR * (actual - expected); +} +function updateScore(rating: number, score: number): number { + return rating * score; +} +async function main() { + // Load all applications from the directory + const applications = loadApplicationsFromDirectory(); + console.log(`Processing ${applications.length} applications...`); + + // Pre-load all necessary data for each application + console.log("Pre-loading application data (app, research, karmagap)..."); + const applicationDataMap = new Map(); + for (const app of applications) { + const id = getApplicationId(app); + const name = getProjectName(app) || id; + applicationDataMap.set(id, { + id, + name, + application: loadApplication(id), + research: loadResearch(id), + karmaGap: loadKarmaGap(id), + }); + } + console.log("Finished pre-loading data."); + + // Get the available review models/agents + const modelSpecs = await fetchModelSpecs(); + + // Load all reviews for all agents, associating with pre-loaded data + const reviewsByAgent = modelSpecs.reduce((acc, agent) => { + const agentName = agent?.name; + acc[agentName] = applications + .map((app) => { + const id = getApplicationId(app); + const baseData = applicationDataMap.get(id); + if (!baseData) return null; + + const reviewData = loadReview(id, agentName); + if (!reviewData) return null; + + return { ...baseData, review: reviewData }; + }) + .filter(Boolean); + return acc; + }, {} as Record); + + // Loop through each agent's reviews + for (const [agentName, agentApplicationsData] of Object.entries(reviewsByAgent)) { + console.log(`\n🎯 Running Elo tournament for agent: ${agentName}`); + + // Initialize all ratings + const ratings: Record = {}; + for (const { id } of agentApplicationsData) { + ratings[id] = BASE_RATING; + } + + // Run simulated pairwise matchups (round-robin style) + for (let i = 0; i < agentApplicationsData.length; i++) { + for (let j = i + 1; j < agentApplicationsData.length; j++) { + const appA = agentApplicationsData[i]!; + const appB = agentApplicationsData[j]!; + + // Prepare data snippets for the prompt (avoid stringifying huge objects) + const projectAData = { + title: appA.name, + application_summary: appA.application?.project?.description, + research_summary: appA.research?.summary, + karmagap_score: appA.karmaGap?.score, + reviewer_comment: appA.review?.comments + }; + const projectBData = { + title: appB.name, + application_summary: appB.application?.project?.description, + research_summary: appB.research?.summary, + karmagap_score: appB.karmaGap?.score, + reviewer_comment: appB.review?.comments + }; + + const prompt = ` +You are a grant allocator reviewing two projects. Consider all available information. + +Choose the one that deserves *more funding*, based on impact, clarity, roadmap, potential, and overall quality presented in the data below. + +Then, estimate *how much* better the winning project is on a scale from 0.5 (projects are roughly equal) to 1.0 (winner is significantly better). + +You are essentially a judge in the tournament which gives a score based on each Agent's review, so it's important you strongly consider reviewer_comment along with the metric data provided. + +Respond ONLY in the format 'W,S' where W is the winning project ('A' or 'B') and S is the score (e.g., 'A,0.8' or 'B,0.6'). Do NOT explain. + +--- Project A --- +${JSON.stringify(projectAData, null, 2)} + +--- Project B --- +${JSON.stringify(projectBData, null, 2)} +`; + + const result = await creditAssignmentAgent.generate(prompt); + + // Parse the response "W,S" + const responseParts = result.text.trim().toUpperCase().split(','); + let actualA = 0.5; // Default to draw + let actualB = 0.5; + + if (responseParts.length === 2) { + const winnerLetter = responseParts[0]; + const scoreString = responseParts[1]; + console.log("winnerLetter is ", winnerLetter); + console.log("scoreString is ", scoreString); + // Check if parts are defined before using them + if (winnerLetter !== undefined && scoreString !== undefined) { + const winMagnitude = parseFloat(scoreString); + + if ((winnerLetter === 'A' || winnerLetter === 'B') && !isNaN(winMagnitude) && winMagnitude >= 0.5 && winMagnitude <= 1.0) { + if (winnerLetter === 'A') { + actualA = winMagnitude; + actualB = 1 - winMagnitude; + } else { // Winner is B + actualB = winMagnitude; + actualA = 1 - winMagnitude; + } + } else { + console.warn(`⚠️ Unexpected response format or score out of range [0.5, 1.0]: ${result.text}. Treating as draw.`); + // Keep default actualA = 0.5, actualB = 0.5 + } + } else { + console.warn(`⚠️ Unexpected response format: ${result.text}. Expected 'W,S'. Treating as draw.`); + // Keep default actualA = 0.5, actualB = 0.5 + } + } else { + // Enhanced logging for unexpected format + console.warn( + `\x1b[31m⚠️ Unexpected response format. Expected 'W,S'. Treating as draw.\n` + + ` Raw Response: "${result.text.trim()}"\n` + + ` Parsed Parts (${responseParts.length}): ${JSON.stringify(responseParts)}\x1b[0m` // Reset color + ); + // Keep default actualA = 0.5, actualB = 0.5 + } + + const ratingA = ratings[appA.id]!; + const ratingB = ratings[appB.id]!; + const expectedA = expectedScore(ratingA, ratingB); + const expectedB = expectedScore(ratingB, ratingA); + + // Update ratings using the calculated actual scores + ratings[appA.id] = updateElo(ratingA, expectedA, actualA); + ratings[appB.id] = updateElo(ratingB, expectedB, actualB); + } + } + + // Normalize scores so they sum to 1 (for funding allocation) + const totalScore = Object.values(ratings).reduce((sum, score) => sum + score, 0); + const normalized = Object.entries(ratings).map(([id, score]) => { + const name = applicationDataMap.get(id)?.name || id; + return { + id, + name, + score: (score / totalScore).toFixed(6), + }; + }); + + // Prepare output CSV format + const output = ["id,name,score", ...normalized.map((r) => `${r.id},${r.name},${r.score}`)].join("\n"); + + // Save results to file + saveFile(`scores/elo-credit-assignment-${agentName}.csv`, output); + console.log(`✅ Saved results for ${agentName}`); + } +} + +main().catch((error) => { + console.error("❌ Error:", error); + process.exit(1); +}); From c49d0fcacdfcb53e2359f9f061644f19faf1f9c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=E2=96=B3M=CE=9ES?= Date: Tue, 29 Apr 2025 11:33:20 +0200 Subject: [PATCH 6/9] refactor: enhance Elo scoring logic and update CSV scores for improved accuracy --- ...lo-credit-assignment-gitcoin-communist.csv | 10 +- ...edit-assignment-open-source-capitalist.csv | 10 +- scores/elo-credit-assignment-regenerator.csv | 10 +- .../credit-assignment-elo-how-much-better.ts | 202 ++++++++++++++---- 4 files changed, 174 insertions(+), 58 deletions(-) diff --git a/scores/elo-credit-assignment-gitcoin-communist.csv b/scores/elo-credit-assignment-gitcoin-communist.csv index 9884871a..aac19ea9 100644 --- a/scores/elo-credit-assignment-gitcoin-communist.csv +++ b/scores/elo-credit-assignment-gitcoin-communist.csv @@ -1,6 +1,6 @@ id,name,score -42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e,GainForest,0.201165 -42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca,Treegens DAO🌳,0.201116 -42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,0.196483 -42161-867-0xe573019b9f23a496663f5944a83c8acdc99792bfc5f5ad603ee8f6cb0f46f9fe,Hydrapad,0.200631 -42161-865-0x9119659eb8173b32bb4423f83702ee30c1e1db49ae0c07b00263bf3ea7f4d4ef,Deep Funding,0.200605 \ No newline at end of file +42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e,GainForest,0.246080 +42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca,Treegens DAO🌳,0.276800 +42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,0.105280 +42161-867-0xe573019b9f23a496663f5944a83c8acdc99792bfc5f5ad603ee8f6cb0f46f9fe,Hydrapad,0.197440 +42161-865-0x9119659eb8173b32bb4423f83702ee30c1e1db49ae0c07b00263bf3ea7f4d4ef,Deep Funding,0.174400 \ No newline at end of file diff --git a/scores/elo-credit-assignment-open-source-capitalist.csv b/scores/elo-credit-assignment-open-source-capitalist.csv index 9884871a..f6bd854a 100644 --- a/scores/elo-credit-assignment-open-source-capitalist.csv +++ b/scores/elo-credit-assignment-open-source-capitalist.csv @@ -1,6 +1,6 @@ id,name,score -42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e,GainForest,0.201165 -42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca,Treegens DAO🌳,0.201116 -42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,0.196483 -42161-867-0xe573019b9f23a496663f5944a83c8acdc99792bfc5f5ad603ee8f6cb0f46f9fe,Hydrapad,0.200631 -42161-865-0x9119659eb8173b32bb4423f83702ee30c1e1db49ae0c07b00263bf3ea7f4d4ef,Deep Funding,0.200605 \ No newline at end of file +42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e,GainForest,0.246080 +42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca,Treegens DAO🌳,0.276800 +42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,0.102720 +42161-867-0xe573019b9f23a496663f5944a83c8acdc99792bfc5f5ad603ee8f6cb0f46f9fe,Hydrapad,0.200000 +42161-865-0x9119659eb8173b32bb4423f83702ee30c1e1db49ae0c07b00263bf3ea7f4d4ef,Deep Funding,0.174400 \ No newline at end of file diff --git a/scores/elo-credit-assignment-regenerator.csv b/scores/elo-credit-assignment-regenerator.csv index 9884871a..291f9d10 100644 --- a/scores/elo-credit-assignment-regenerator.csv +++ b/scores/elo-credit-assignment-regenerator.csv @@ -1,6 +1,6 @@ id,name,score -42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e,GainForest,0.201165 -42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca,Treegens DAO🌳,0.201116 -42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,0.196483 -42161-867-0xe573019b9f23a496663f5944a83c8acdc99792bfc5f5ad603ee8f6cb0f46f9fe,Hydrapad,0.200631 -42161-865-0x9119659eb8173b32bb4423f83702ee30c1e1db49ae0c07b00263bf3ea7f4d4ef,Deep Funding,0.200605 \ No newline at end of file +42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e,GainForest,0.251200 +42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca,Treegens DAO🌳,0.276800 +42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,0.102720 +42161-867-0xe573019b9f23a496663f5944a83c8acdc99792bfc5f5ad603ee8f6cb0f46f9fe,Hydrapad,0.200000 +42161-865-0x9119659eb8173b32bb4423f83702ee30c1e1db49ae0c07b00263bf3ea7f4d4ef,Deep Funding,0.169280 \ No newline at end of file diff --git a/scripts/credit-assignment-elo-how-much-better.ts b/scripts/credit-assignment-elo-how-much-better.ts index 83abc4bb..816742dc 100644 --- a/scripts/credit-assignment-elo-how-much-better.ts +++ b/scripts/credit-assignment-elo-how-much-better.ts @@ -14,20 +14,48 @@ import { creditAssignmentAgent } from "../agents/agents/credit-assigner"; // Core Elo scoring parameters const BASE_RATING = 1000; -const K_FACTOR = 32; // Adjust this to control how volatile the scores are +// INCREASE K_FACTOR SIGNIFICANTLY to amplify small magnitude differences +const K_FACTOR = 256; // Was 32, then 128 -// Helper: Calculate expected score between two ratings +// Helper: Calculate expected score between two ratings (Standard Elo - currently unused in rating updates) function expectedScore(ratingA: number, ratingB: number): number { return 1 / (1 + Math.pow(10, (ratingB - ratingA) / 400)); } -// Helper: Update Elo rating after a matchup -function updateElo(rating: number, expected: number, actual: number): number { - return rating + K_FACTOR * (actual - expected); +// Helper: Update Elo rating after a matchup (Standard Elo - currently unused) +// function updateElo(rating: number, expected: number, actual: number): number { +// return rating + K_FACTOR * (actual - expected); +// } + +// New Helper: Update rating based *only* on the actual score magnitude from the match +function updateRatingDirectly(rating: number, actual: number): number { + // actual is the score for *this* player (0 to 1, derived from magnitude 0.5-1.0) + // differenceFromNeutral will be positive for wins (>0.5), negative for losses (<0.5) + const differenceFromNeutral = actual - 0.5; + // Adjust rating based on deviation from neutral, scaled by K_FACTOR + return rating + K_FACTOR * differenceFromNeutral; } -function updateScore(rating: number, score: number): number { - return rating * score; + +// Helper to safely get the 'output' of the last research entry for a specific agent +// Assumes input is a valid array with at least one element +const getResearchOutput = (researchAgentArray: any[]): string | undefined => { + // Access the last element safely and return its output field (if it exists) + return researchAgentArray[researchAgentArray.length - 1]?.output; } + +// Helper function to process research data for a single agent type +const processResearchAgent = (appName: string, agentKey: string, agentData: any): string | undefined => { + if (Array.isArray(agentData) && agentData.length > 0) { + return getResearchOutput(agentData); + } else if (agentData !== undefined && agentData !== null) { + // Log if we get something other than undefined/null that's not a non-empty array + console.warn(`DEBUG: Unexpected research data type for ${appName} -> ${agentKey}. Expected array, got:`, typeof agentData, JSON.stringify(agentData)?.substring(0, 100) + '...'); + return undefined; + } + // Return undefined if data is null, undefined, or an empty array + return undefined; +}; + async function main() { // Load all applications from the directory const applications = loadApplicationsFromDirectory(); @@ -43,11 +71,16 @@ async function main() { id, name, application: loadApplication(id), - research: loadResearch(id), + research: loadResearch(id), // This loads the whole research object for the ID karmaGap: loadKarmaGap(id), }); } - console.log("Finished pre-loading data."); + // Add this logging temporarily AFTER the loop finishes (around line 53) + const gainForestId = "42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e"; // Fixed ID + const treegensId = "42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca"; + // console.log("DEBUG Preloaded GainForest Data:", JSON.stringify(applicationDataMap.get(gainForestId), null, 2)); + // console.log("DEBUG Preloaded Treegens Data:", JSON.stringify(applicationDataMap.get(treegensId), null, 2)); + // console.log("Finished pre-loading data."); // Get the available review models/agents const modelSpecs = await fetchModelSpecs(); @@ -62,14 +95,17 @@ async function main() { if (!baseData) return null; const reviewData = loadReview(id, agentName); + // We need review data to proceed for this agent if (!reviewData) return null; + // Add the review data to the base data return { ...baseData, review: reviewData }; }) - .filter(Boolean); + .filter(Boolean); // Filter out any apps where review data was missing for this agent return acc; }, {} as Record); + // Loop through each agent's reviews for (const [agentName, agentApplicationsData] of Object.entries(reviewsByAgent)) { console.log(`\n🎯 Running Elo tournament for agent: ${agentName}`); @@ -86,30 +122,91 @@ async function main() { const appA = agentApplicationsData[i]!; const appB = agentApplicationsData[j]!; + // Process research data using the helper function for better debugging + const academicResearchOutputA = processResearchAgent(appA.name, 'Academic_Research_Agent', appA.research?.Academic_Research_Agent); + const academicResearchOutputB = processResearchAgent(appB.name, 'Academic_Research_Agent', appB.research?.Academic_Research_Agent); + + const factCheckingOutputA = processResearchAgent(appA.name, 'Fact_Checking_Agent', appA.research?.Fact_Checking_Agent); + const factCheckingOutputB = processResearchAgent(appB.name, 'Fact_Checking_Agent', appB.research?.Fact_Checking_Agent); + + const webSearchOutputA = processResearchAgent(appA.name, 'Web_Search_Agent', appA.research?.Web_Search_Agent); + const webSearchOutputB = processResearchAgent(appB.name, 'Web_Search_Agent', appB.research?.Web_Search_Agent); + + const primaryResearchOutputA = processResearchAgent(appA.name, 'Primary_Research_Agent', appA.research?.Primary_Research_Agent); + const primaryResearchOutputB = processResearchAgent(appB.name, 'Primary_Research_Agent', appB.research?.Primary_Research_Agent); + + const dataAnalysisOutputA = processResearchAgent(appA.name, 'Data_Analysis_Agent', appA.research?.Data_Analysis_Agent); + const dataAnalysisOutputB = processResearchAgent(appB.name, 'Data_Analysis_Agent', appB.research?.Data_Analysis_Agent); + + // Prepare data snippets for the prompt (avoid stringifying huge objects) + // Note: application_summary now uses metadata.description if available const projectAData = { title: appA.name, - application_summary: appA.application?.project?.description, - research_summary: appA.research?.summary, + application_summary: appA.application?.project?.metadata?.description ?? appA.application?.project?.description, // Use metadata first karmagap_score: appA.karmaGap?.score, - reviewer_comment: appA.review?.comments + reviewer_comment: appA.review?.comments, + academic_research: academicResearchOutputA, + fact_checking: factCheckingOutputA, + web_search: webSearchOutputA, + primary_research: primaryResearchOutputA, + data_analysis: dataAnalysisOutputA }; const projectBData = { title: appB.name, - application_summary: appB.application?.project?.description, - research_summary: appB.research?.summary, + application_summary: appB.application?.project?.metadata?.description ?? appB.application?.project?.description, // Use metadata first karmagap_score: appB.karmaGap?.score, - reviewer_comment: appB.review?.comments + reviewer_comment: appB.review?.comments, + academic_research: academicResearchOutputB, + fact_checking: factCheckingOutputB, + web_search: webSearchOutputB, + primary_research: primaryResearchOutputB, + data_analysis: dataAnalysisOutputB }; + // --- DEBUG LOGGING START --- + // Log data specifically for the GainForest vs Treegens DAO comparison + const isTargetComparison = (appA.name === "GainForest" && appB.name === "Treegens DAO🌳") || (appA.name === "Treegens DAO🌳" && appB.name === "GainForest"); + if (isTargetComparison) { + // Trim potentially very long outputs for concise debug logging + const trim = (s: string | undefined) => s ? s.substring(0, 100) + '...' : undefined; + console.log(`DEBUG Data sent to agent for ${ appA.name} ${JSON.stringify({ + ...projectAData, + academic_research: trim(projectAData.academic_research), + fact_checking: trim(projectAData.fact_checking), + web_search: trim(projectAData.web_search), + primary_research: trim(projectAData.primary_research), + data_analysis: trim(projectAData.data_analysis), + application_summary: trim(projectAData.application_summary), + reviewer_comment: trim(projectAData.reviewer_comment) + }, null, 2)}`); + console.log(`DEBUG Data sent to agent for ${ appB.name} ${JSON.stringify({ + ...projectBData, + academic_research: trim(projectBData.academic_research), + fact_checking: trim(projectBData.fact_checking), + web_search: trim(projectBData.web_search), + primary_research: trim(projectBData.primary_research), + data_analysis: trim(projectBData.data_analysis), + application_summary: trim(projectBData.application_summary), + reviewer_comment: trim(projectBData.reviewer_comment) + }, null, 2)}`); + } + // --- DEBUG LOGGING END --- + + // Refined prompt with clearer scale definitions const prompt = ` -You are a grant allocator reviewing two projects. Consider all available information. +You are a grant allocator reviewing two projects. Consider all available information including application summaries, research reports, fact checks, and specific reviewer comments. -Choose the one that deserves *more funding*, based on impact, clarity, roadmap, potential, and overall quality presented in the data below. +Choose the one that deserves *more funding*, based on impact, clarity, roadmap, potential, feasibility, and overall quality presented in the data below. Then, estimate *how much* better the winning project is on a scale from 0.5 (projects are roughly equal) to 1.0 (winner is significantly better). -You are essentially a judge in the tournament which gives a score based on each Agent's review, so it's important you strongly consider reviewer_comment along with the metric data provided. +* A score of 0.5 means the projects are virtually identical in potential/quality based on the provided data. +* A score of 0.6-0.7 indicates the winner is noticeably better. +* A score of 0.8-0.9 indicates the winner is significantly better. +* A score of 1.0 means the winner is vastly superior and clearly deserves much more funding consideration relative to the other. + +Use the full range [0.5, 1.0] to reflect the true difference you perceive. Be decisive if the difference is clear. You are essentially a judge in the tournament which gives a score based on each Agent's review, so it's important you strongly consider reviewer_comment along with the metric data provided. Give weight to the research outputs (web search, fact checking, academic context) as objective inputs alongside the application summary (project's own description) and the reviewer's subjective comment. Respond ONLY in the format 'W,S' where W is the winning project ('A' or 'B') and S is the score (e.g., 'A,0.8' or 'B,0.6'). Do NOT explain. @@ -126,15 +223,19 @@ ${JSON.stringify(projectBData, null, 2)} const responseParts = result.text.trim().toUpperCase().split(','); let actualA = 0.5; // Default to draw let actualB = 0.5; + let winnerLetter = 'DRAW'; + let winMagnitude = 0.5; + if (responseParts.length === 2) { - const winnerLetter = responseParts[0]; - const scoreString = responseParts[1]; - console.log("winnerLetter is ", winnerLetter); - console.log("scoreString is ", scoreString); - // Check if parts are defined before using them - if (winnerLetter !== undefined && scoreString !== undefined) { - const winMagnitude = parseFloat(scoreString); + // Linter Fix: Assert that parts[0] and parts[1] are defined because we checked length === 2 + winnerLetter = responseParts[0]!; + const scoreString = responseParts[1]!; + // console.log("winnerLetter is ", winnerLetter); // Keep logging minimal for clarity + // console.log("scoreString is ", scoreString); + // Check if parts are defined before using them (Redundant due to length check and assertion, but safe) + // if (winnerLetter !== undefined && scoreString !== undefined) { + winMagnitude = parseFloat(scoreString); if ((winnerLetter === 'A' || winnerLetter === 'B') && !isNaN(winMagnitude) && winMagnitude >= 0.5 && winMagnitude <= 1.0) { if (winnerLetter === 'A') { @@ -145,42 +246,57 @@ ${JSON.stringify(projectBData, null, 2)} actualA = 1 - winMagnitude; } } else { - console.warn(`⚠️ Unexpected response format or score out of range [0.5, 1.0]: ${result.text}. Treating as draw.`); - // Keep default actualA = 0.5, actualB = 0.5 + console.warn(`⚠️ Invalid response format/score: ${result.text}. Draw.`); + actualA = 0.5; actualB = 0.5; winnerLetter = 'DRAW'; winMagnitude = 0.5; } - } else { - console.warn(`⚠️ Unexpected response format: ${result.text}. Expected 'W,S'. Treating as draw.`); - // Keep default actualA = 0.5, actualB = 0.5 - } + // } else { // This block becomes unreachable due to length check / assertion + // console.warn(`⚠️ Invalid response parts: ${result.text}. Draw.`); + // actualA = 0.5; actualB = 0.5; winnerLetter = 'DRAW'; winMagnitude = 0.5; + // } } else { - // Enhanced logging for unexpected format console.warn( - `\x1b[31m⚠️ Unexpected response format. Expected 'W,S'. Treating as draw.\n` + - ` Raw Response: "${result.text.trim()}"\n` + - ` Parsed Parts (${responseParts.length}): ${JSON.stringify(responseParts)}\x1b[0m` // Reset color + `\x1b[31m⚠️ Unexpected response format: "${result.text.trim()}". Draw.\x1b[0m` ); - // Keep default actualA = 0.5, actualB = 0.5 + actualA = 0.5; actualB = 0.5; winnerLetter = 'DRAW'; winMagnitude = 0.5; } + // Log the outcome of the match concisely only if not the debugged comparison + // Always log for now to see results + console.log(`Match: ${appA.name} vs ${appB.name} -> Winner: ${winnerLetter}, Mag: ${winMagnitude.toFixed(2)}`); + + const ratingA = ratings[appA.id]!; const ratingB = ratings[appB.id]!; - const expectedA = expectedScore(ratingA, ratingB); - const expectedB = expectedScore(ratingB, ratingA); + // We are no longer using expected scores for the update + // const expectedA = expectedScore(ratingA, ratingB); + // const expectedB = expectedScore(ratingB, ratingA); + + // Update ratings using the calculated actual scores directly + ratings[appA.id] = updateRatingDirectly(ratingA, actualA); + ratings[appB.id] = updateRatingDirectly(ratingB, actualB); + // Optional: Log rating changes if needed for debugging + // console.log(` Ratings: A=${ratings[appA.id]!.toFixed(1)}, B=${ratings[appB.id]!.toFixed(1)}`); - // Update ratings using the calculated actual scores - ratings[appA.id] = updateElo(ratingA, expectedA, actualA); - ratings[appB.id] = updateElo(ratingB, expectedB, actualB); } } + // Log raw final scores before normalization + console.log("Raw final ratings:", JSON.stringify(ratings, null, 2)); + + // Normalize scores so they sum to 1 (for funding allocation) const totalScore = Object.values(ratings).reduce((sum, score) => sum + score, 0); const normalized = Object.entries(ratings).map(([id, score]) => { const name = applicationDataMap.get(id)?.name || id; + // Ensure score is not negative before normalization, although unlikely with BASE_RATING=1000 and K=128 unless many losses occur + const nonNegativeScore = Math.max(0, score); + // Prevent division by zero or negative totals + const safeTotalScore = Math.max(1, totalScore); // Avoid total being 0 or negative + return { id, name, - score: (score / totalScore).toFixed(6), + score: (nonNegativeScore / safeTotalScore).toFixed(6), }; }); From 27edafa4d23cc01b2821cd3c7d85a1e16c946524 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=E2=96=B3M=CE=9ES?= Date: Thu, 22 May 2025 10:34:51 +0100 Subject: [PATCH 7/9] refactor: enhance project comparison logic by including ethics and scoring details in the review process --- ...lo-credit-assignment-gitcoin-communist.csv | 10 +- ...edit-assignment-open-source-capitalist.csv | 10 +- scores/elo-credit-assignment-regenerator.csv | 10 +- .../credit-assignment-elo-how-much-better.ts | 16 +- scripts/credit-assignment-elo-v2.ts | 363 ++++++++++++++++++ utils/utils.ts | 2 +- 6 files changed, 392 insertions(+), 19 deletions(-) create mode 100644 scripts/credit-assignment-elo-v2.ts diff --git a/scores/elo-credit-assignment-gitcoin-communist.csv b/scores/elo-credit-assignment-gitcoin-communist.csv index aac19ea9..92366d26 100644 --- a/scores/elo-credit-assignment-gitcoin-communist.csv +++ b/scores/elo-credit-assignment-gitcoin-communist.csv @@ -1,6 +1,6 @@ id,name,score -42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e,GainForest,0.246080 -42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca,Treegens DAO🌳,0.276800 -42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,0.105280 -42161-867-0xe573019b9f23a496663f5944a83c8acdc99792bfc5f5ad603ee8f6cb0f46f9fe,Hydrapad,0.197440 -42161-865-0x9119659eb8173b32bb4423f83702ee30c1e1db49ae0c07b00263bf3ea7f4d4ef,Deep Funding,0.174400 \ No newline at end of file +42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e,GainForest,0.211946 +42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca,Treegens DAO🌳,0.193779 +42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,ÆRTH - Planetary AI,0.199725 +42161-867-0xe573019b9f23a496663f5944a83c8acdc99792bfc5f5ad603ee8f6cb0f46f9fe,Hydrapad,0.188209 +42161-865-0x9119659eb8173b32bb4423f83702ee30c1e1db49ae0c07b00263bf3ea7f4d4ef,Deep Funding,0.206342 \ No newline at end of file diff --git a/scores/elo-credit-assignment-open-source-capitalist.csv b/scores/elo-credit-assignment-open-source-capitalist.csv index f6bd854a..3b3f09f0 100644 --- a/scores/elo-credit-assignment-open-source-capitalist.csv +++ b/scores/elo-credit-assignment-open-source-capitalist.csv @@ -1,6 +1,6 @@ id,name,score -42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e,GainForest,0.246080 -42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca,Treegens DAO🌳,0.276800 -42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,0.102720 -42161-867-0xe573019b9f23a496663f5944a83c8acdc99792bfc5f5ad603ee8f6cb0f46f9fe,Hydrapad,0.200000 -42161-865-0x9119659eb8173b32bb4423f83702ee30c1e1db49ae0c07b00263bf3ea7f4d4ef,Deep Funding,0.174400 \ No newline at end of file +42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e,GainForest,0.199719 +42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca,Treegens DAO🌳,0.194314 +42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,ÆRTH - Planetary AI,0.205544 +42161-867-0xe573019b9f23a496663f5944a83c8acdc99792bfc5f5ad603ee8f6cb0f46f9fe,Hydrapad,0.188485 +42161-865-0x9119659eb8173b32bb4423f83702ee30c1e1db49ae0c07b00263bf3ea7f4d4ef,Deep Funding,0.211938 \ No newline at end of file diff --git a/scores/elo-credit-assignment-regenerator.csv b/scores/elo-credit-assignment-regenerator.csv index 291f9d10..92366d26 100644 --- a/scores/elo-credit-assignment-regenerator.csv +++ b/scores/elo-credit-assignment-regenerator.csv @@ -1,6 +1,6 @@ id,name,score -42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e,GainForest,0.251200 -42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca,Treegens DAO🌳,0.276800 -42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,0.102720 -42161-867-0xe573019b9f23a496663f5944a83c8acdc99792bfc5f5ad603ee8f6cb0f46f9fe,Hydrapad,0.200000 -42161-865-0x9119659eb8173b32bb4423f83702ee30c1e1db49ae0c07b00263bf3ea7f4d4ef,Deep Funding,0.169280 \ No newline at end of file +42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e,GainForest,0.211946 +42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca,Treegens DAO🌳,0.193779 +42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,ÆRTH - Planetary AI,0.199725 +42161-867-0xe573019b9f23a496663f5944a83c8acdc99792bfc5f5ad603ee8f6cb0f46f9fe,Hydrapad,0.188209 +42161-865-0x9119659eb8173b32bb4423f83702ee30c1e1db49ae0c07b00263bf3ea7f4d4ef,Deep Funding,0.206342 \ No newline at end of file diff --git a/scripts/credit-assignment-elo-how-much-better.ts b/scripts/credit-assignment-elo-how-much-better.ts index 816742dc..4ce04d74 100644 --- a/scripts/credit-assignment-elo-how-much-better.ts +++ b/scripts/credit-assignment-elo-how-much-better.ts @@ -99,7 +99,7 @@ async function main() { if (!reviewData) return null; // Add the review data to the base data - return { ...baseData, review: reviewData }; + return { ...baseData, review: reviewData, ethics: agent.ethics, constitution: agent.constitution, scoring: agent.scoring }; }) .filter(Boolean); // Filter out any apps where review data was missing for this agent return acc; @@ -143,6 +143,9 @@ async function main() { // Note: application_summary now uses metadata.description if available const projectAData = { title: appA.name, + scoring: appA.scoring, + ethics: appA.ethics, + constitution: appA.constitution, application_summary: appA.application?.project?.metadata?.description ?? appA.application?.project?.description, // Use metadata first karmagap_score: appA.karmaGap?.score, reviewer_comment: appA.review?.comments, @@ -154,6 +157,9 @@ async function main() { }; const projectBData = { title: appB.name, + scoring: appB.scoring, + ethics: appB.ethics, + constitution: appB.constitution, application_summary: appB.application?.project?.metadata?.description ?? appB.application?.project?.description, // Use metadata first karmagap_score: appB.karmaGap?.score, reviewer_comment: appB.review?.comments, @@ -192,13 +198,17 @@ async function main() { } // --- DEBUG LOGGING END --- - + const oldPrompt = ` + + `; // Refined prompt with clearer scale definitions const prompt = ` -You are a grant allocator reviewing two projects. Consider all available information including application summaries, research reports, fact checks, and specific reviewer comments. +You are an expert grant allocator reviewing and comparing two different projects. Choose the one that deserves *more funding*, based on impact, clarity, roadmap, potential, feasibility, and overall quality presented in the data below. +Consider all available information including application summaries, research reports, fact checks and specific reviewer comments, while considering their particular scoring, ethics and consitution. + Then, estimate *how much* better the winning project is on a scale from 0.5 (projects are roughly equal) to 1.0 (winner is significantly better). * A score of 0.5 means the projects are virtually identical in potential/quality based on the provided data. diff --git a/scripts/credit-assignment-elo-v2.ts b/scripts/credit-assignment-elo-v2.ts new file mode 100644 index 00000000..eeec6e01 --- /dev/null +++ b/scripts/credit-assignment-elo-v2.ts @@ -0,0 +1,363 @@ +import { + fetchModelSpecs, + getApplicationId, + loadApplicationsFromDirectory, + loadReview, + saveFile, + loadApplication, + loadKarmaGap, + loadResearch, + getProjectName, +} from "../utils/utils"; + +import { creditAssignmentAgent } from "../agents/agents/credit-assigner"; + +// Core Elo scoring parameters +const BASE_RATING = 1000; +// INCREASE K_FACTOR SIGNIFICANTLY to amplify small magnitude differences +const K_FACTOR = 32; // Was 32, then 128 + +// Helper: Calculate expected score between two ratings (Standard Elo - currently unused in rating updates) +function expectedScore(ratingA: number, ratingB: number): number { + return 1 / (1 + Math.pow(10, (ratingB - ratingA) / 400)); +} + +// Helper: Update Elo rating after a matchup (Standard Elo - currently unused) +function updateElo(rating: number, expected: number, actual: number): number { + return rating + K_FACTOR * (actual - expected); +} + +// New Helper: Update rating based *only* on the actual score magnitude from the match +// function updateRatingDirectly(rating: number, actual: number): number { +// // actual is the score for *this* player (0 to 1, derived from magnitude 0.5-1.0) +// // differenceFromNeutral will be positive for wins (>0.5), negative for losses (<0.5) +// const differenceFromNeutral = actual - 0.5; +// // Adjust rating based on deviation from neutral, scaled by K_FACTOR +// return rating + K_FACTOR * differenceFromNeutral; +// } + +// Helper to safely get the 'output' of the last research entry for a specific agent +// Assumes input is a valid array with at least one element +const getResearchOutput = (researchAgentArray: any[]): string | undefined => { + // Access the last element safely and return its output field (if it exists) + return researchAgentArray[researchAgentArray.length - 1]?.output; +} + +// Helper function to process research data for a single agent type +const processResearchAgent = (appName: string, agentKey: string, agentData: any): string | undefined => { + if (Array.isArray(agentData) && agentData.length > 0) { + return getResearchOutput(agentData); + } else if (agentData !== undefined && agentData !== null) { + // Log if we get something other than undefined/null that's not a non-empty array + console.warn(`DEBUG: Unexpected research data type for ${appName} -> ${agentKey}. Expected array, got:`, typeof agentData, JSON.stringify(agentData)?.substring(0, 100) + '...'); + return undefined; + } + // Return undefined if data is null, undefined, or an empty array + return undefined; +}; + +// ANSI escape codes for colors +const colors = { + reset: "\x1b[0m", + bright: "\x1b[1m", + dim: "\x1b[2m", + underscore: "\x1b[4m", + blink: "\x1b[5m", + reverse: "\x1b[7m", + hidden: "\x1b[8m", + + fg: { + black: "\x1b[30m", + red: "\x1b[31m", + green: "\x1b[32m", + yellow: "\x1b[33m", + blue: "\x1b[34m", + magenta: "\x1b[35m", + cyan: "\x1b[36m", + white: "\x1b[37m", + crimson: "\x1b[38m" // Scarlet + }, + bg: { + black: "\x1b[40m", + red: "\x1b[41m", + green: "\x1b[42m", + yellow: "\x1b[43m", + blue: "\x1b[44m", + magenta: "\x1b[45m", + cyan: "\x1b[46m", + white: "\x1b[47m", + crimson: "\x1b[48m" + } +}; + +async function main() { + // Load all applications from the directory + const applications = loadApplicationsFromDirectory(); + console.log(`Processing ${colors.fg.yellow}${applications.length}${colors.reset} applications...`); + + // Pre-load all necessary data for each application + console.log("Pre-loading application data (app, research, karmagap)..."); + const applicationDataMap = new Map(); + for (const app of applications) { + const id = getApplicationId(app); + const name = getProjectName(app) || id; + applicationDataMap.set(id, { + id, + name, + application: loadApplication(id), + research: loadResearch(id), // This loads the whole research object for the ID + karmaGap: loadKarmaGap(id), + }); + } + // Add this logging temporarily AFTER the loop finishes (around line 53) + const gainForestId = "42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e"; // Fixed ID + const treegensId = "42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca"; + // console.log("DEBUG Preloaded GainForest Data:", JSON.stringify(applicationDataMap.get(gainForestId), null, 2)); + // console.log("DEBUG Preloaded Treegens Data:", JSON.stringify(applicationDataMap.get(treegensId), null, 2)); + // console.log("Finished pre-loading data."); + + // Get the available review models/agents + const modelSpecs = await fetchModelSpecs(); + + // Load all reviews for all agents, associating with pre-loaded data + const reviewsByAgent = modelSpecs.reduce((acc, agent) => { + const agentName = agent?.name; + acc[agentName] = applications + .map((app) => { + const id = getApplicationId(app); + const baseData = applicationDataMap.get(id); + if (!baseData) return null; + + const reviewData = loadReview(id, agentName); + // We need review data to proceed for this agent + if (!reviewData) return null; + + // Add the review data to the base data + return { ...baseData, review: reviewData, ethics: agent.ethics, constitution: agent.constitution, scoring: agent.scoring }; + }) + .filter(Boolean); // Filter out any apps where review data was missing for this agent + return acc; + }, {} as Record); + + + // Loop through each agent's reviews + for (const [agentName, agentApplicationsData] of Object.entries(reviewsByAgent)) { + console.log(`\n${colors.fg.magenta}🎯 Running Elo tournament for agent: ${colors.bright}${agentName}${colors.reset}`); + + // Initialize all ratings + const ratings: Record = {}; + for (const { id } of agentApplicationsData) { + ratings[id] = BASE_RATING; + } + + // Run simulated pairwise matchups (round-robin style) + for (let i = 0; i < agentApplicationsData.length; i++) { + for (let j = i + 1; j < agentApplicationsData.length; j++) { + const appA = agentApplicationsData[i]!; + const appB = agentApplicationsData[j]!; + + // Process research data using the helper function for better debugging + const academicResearchOutputA = processResearchAgent(appA.name, 'Academic_Research_Agent', appA.research?.Academic_Research_Agent); + const academicResearchOutputB = processResearchAgent(appB.name, 'Academic_Research_Agent', appB.research?.Academic_Research_Agent); + + const factCheckingOutputA = processResearchAgent(appA.name, 'Fact_Checking_Agent', appA.research?.Fact_Checking_Agent); + const factCheckingOutputB = processResearchAgent(appB.name, 'Fact_Checking_Agent', appB.research?.Fact_Checking_Agent); + + const webSearchOutputA = processResearchAgent(appA.name, 'Web_Search_Agent', appA.research?.Web_Search_Agent); + const webSearchOutputB = processResearchAgent(appB.name, 'Web_Search_Agent', appB.research?.Web_Search_Agent); + + const primaryResearchOutputA = processResearchAgent(appA.name, 'Primary_Research_Agent', appA.research?.Primary_Research_Agent); + const primaryResearchOutputB = processResearchAgent(appB.name, 'Primary_Research_Agent', appB.research?.Primary_Research_Agent); + + const dataAnalysisOutputA = processResearchAgent(appA.name, 'Data_Analysis_Agent', appA.research?.Data_Analysis_Agent); + const dataAnalysisOutputB = processResearchAgent(appB.name, 'Data_Analysis_Agent', appB.research?.Data_Analysis_Agent); + + + // Prepare data snippets for the prompt (avoid stringifying huge objects) + // Note: application_summary now uses metadata.description if available + const projectAData = { + title: appA.name, + scoring: appA.scoring, + ethics: appA.ethics, + constitution: appA.constitution, + application_summary: appA.application?.project?.metadata?.description ?? appA.application?.project?.description, // Use metadata first + karmagap_score: appA.karmaGap?.score, + reviewer_comment: appA.review?.comments, + academic_research: academicResearchOutputA, + fact_checking: factCheckingOutputA, + web_search: webSearchOutputA, + primary_research: primaryResearchOutputA, + data_analysis: dataAnalysisOutputA + }; + const projectBData = { + title: appB.name, + scoring: appB.scoring, + ethics: appB.ethics, + constitution: appB.constitution, + application_summary: appB.application?.project?.metadata?.description ?? appB.application?.project?.description, // Use metadata first + karmagap_score: appB.karmaGap?.score, + reviewer_comment: appB.review?.comments, + academic_research: academicResearchOutputB, + fact_checking: factCheckingOutputB, + web_search: webSearchOutputB, + primary_research: primaryResearchOutputB, + data_analysis: dataAnalysisOutputB + }; + // --- DEBUG LOGGING START --- + // Log data specifically for the GainForest vs Treegens DAO comparison + const isTargetComparison = (appA.name === "GainForest" && appB.name === "Treegens DAO🌳") || (appA.name === "Treegens DAO🌳" && appB.name === "GainForest"); + if (isTargetComparison) { + // Trim potentially very long outputs for concise debug logging + const trim = (s: string | undefined) => s ? s.substring(0, 100) + '...' : undefined; + console.log(`${colors.fg.blue}DEBUG Data sent to agent for ${colors.bright}${ appA.name}${colors.reset} ${JSON.stringify({ + ...projectAData, + academic_research: trim(projectAData.academic_research), + fact_checking: trim(projectAData.fact_checking), + web_search: trim(projectAData.web_search), + primary_research: trim(projectAData.primary_research), + data_analysis: trim(projectAData.data_analysis), + application_summary: trim(projectAData.application_summary), + reviewer_comment: trim(projectAData.reviewer_comment) + }, null, 2)}${colors.reset}`); + console.log(`${colors.fg.blue}DEBUG Data sent to agent for ${colors.bright}${ appB.name}${colors.reset} ${JSON.stringify({ + ...projectBData, + academic_research: trim(projectBData.academic_research), + fact_checking: trim(projectBData.fact_checking), + web_search: trim(projectBData.web_search), + primary_research: trim(projectBData.primary_research), + data_analysis: trim(projectBData.data_analysis), + application_summary: trim(projectBData.application_summary), + reviewer_comment: trim(projectBData.reviewer_comment) + }, null, 2)}${colors.reset}`); + } + // --- DEBUG LOGGING END --- + + const oldPrompt = ` + + `; + // Refined prompt with clearer scale definitions + const prompt = ` +You are an expert grant allocator reviewing and comparing two different projects. + +Choose the one that deserves *more funding*, based on impact, clarity, roadmap, potential, feasibility, and overall quality presented in the data below. + +Consider all available information including application summaries, research reports, fact checks and specific reviewer comments, while considering their particular scoring, ethics and consitution. + +You are essentially a judge in the tournament which gives a score based on each Agent's review, so it's important you strongly consider reviewer_comment along with the metric data provided. Give weight to the research outputs (web search, fact checking, academic context) as objective inputs alongside the application summary (project's own description) and the reviewer's subjective comment. + +Respond ONLY with "A" or "B". Do NOT explain. + +--- Project A --- +${JSON.stringify(projectAData, null, 2)} + +--- Project B --- +${JSON.stringify(projectBData, null, 2)} +`; + + const result = await creditAssignmentAgent.generate(prompt); + + // Parse the response "W,S" + // const responseParts = result.text.trim().toUpperCase().split(','); + // let actualA = 0.5; // Default to draw + // let actualB = 0.5; + // let winnerLetter = 'DRAW'; + // let winMagnitude = 0.5; + + + // if (responseParts.length === 2) { + // // Linter Fix: Assert that parts[0] and parts[1] are defined because we checked length === 2 + // winnerLetter = responseParts[0]!; + // const scoreString = responseParts[1]!; + // // console.log("winnerLetter is ", winnerLetter); // Keep logging minimal for clarity + // // console.log("scoreString is ", scoreString); + // // Check if parts are defined before using them (Redundant due to length check and assertion, but safe) + // // if (winnerLetter !== undefined && scoreString !== undefined) { + // winMagnitude = parseFloat(scoreString); + + // if ((winnerLetter === 'A' || winnerLetter === 'B') && !isNaN(winMagnitude) && winMagnitude >= 0.5 && winMagnitude <= 1.0) { + // if (winnerLetter === 'A') { + // actualA = winMagnitude; + // actualB = 1 - winMagnitude; + // } else { // Winner is B + // actualB = winMagnitude; + // actualA = 1 - winMagnitude; + // } + // } else { + // console.warn(`⚠️ Invalid response format/score: ${result.text}. Draw.`); + // actualA = 0.5; actualB = 0.5; winnerLetter = 'DRAW'; winMagnitude = 0.5; + // } + // // } else { // This block becomes unreachable due to length check / assertion + // // console.warn(`⚠️ Invalid response parts: ${result.text}. Draw.`); + // // actualA = 0.5; actualB = 0.5; winnerLetter = 'DRAW'; winMagnitude = 0.5; + // // } + // } else { + // console.warn( + // `⚠️ Unexpected response format: "${result.text.trim()}". Draw.` + // ); + // actualA = 0.5; actualB = 0.5; winnerLetter = 'DRAW'; winMagnitude = 0.5; + // } + + // Log the outcome of the match concisely only if not the debugged comparison + // Always log for now to see results + // console.log(`Match: ${appA.name} vs ${appB.name} -> Winner: ${winnerLetter}, Mag: ${winMagnitude.toFixed(2)}`); + + const winner = result.text.trim().toUpperCase(); + console.log(`Match: ${colors.fg.cyan}${appA.name}${colors.reset} vs ${colors.fg.cyan}${appB.name}${colors.reset} -> Winner: ${winner === "A" || winner === "B" ? colors.fg.green : colors.fg.yellow}${winner}${colors.reset}`); + + const ratingA = ratings[appA.id]!; + const ratingB = ratings[appB.id]!; + const expectedA = expectedScore(ratingA, ratingB); + const expectedB = expectedScore(ratingB, ratingA); + + if (winner === "A") { + ratings[appA.id] = updateElo(ratingA, expectedA, 1); + ratings[appB.id] = updateElo(ratingB, expectedB, 0); + } else if (winner === "B") { + ratings[appA.id] = updateElo(ratingA, expectedA, 0); + ratings[appB.id] = updateElo(ratingB, expectedB, 1); + } else { + // If the response is neither A nor B, treat it as a draw. + // Elo ratings don't change for a draw with actual scores of 0.5 vs 0.5 against expected. + ratings[appA.id] = updateElo(ratingA, expectedA, 0.5); + ratings[appB.id] = updateElo(ratingB, expectedB, 0.5); + console.warn(`⚠️ ${colors.fg.yellow}Unexpected response: "${result.text.trim()}". Treating as a draw.${colors.reset}`); + } + // Optional: Log rating changes if needed for debugging + // console.log(` Ratings: A=${ratings[appA.id]!.toFixed(1)}, B=${ratings[appB.id]!.toFixed(1)}`); + + } + } + + // Log raw final scores before normalization + console.log("Raw final ratings:", JSON.stringify(ratings, null, 2)); + + + // Normalize scores so they sum to 1 (for funding allocation) + const totalScore = Object.values(ratings).reduce((sum, score) => sum + score, 0); + const normalized = Object.entries(ratings).map(([id, score]) => { + const name = applicationDataMap.get(id)?.name || id; + // Ensure score is not negative before normalization, although unlikely with BASE_RATING=1000 and K=128 unless many losses occur + const nonNegativeScore = Math.max(0, score); + // Prevent division by zero or negative totals + const safeTotalScore = Math.max(1, totalScore); // Avoid total being 0 or negative + + return { + id, + name, + score: (nonNegativeScore / safeTotalScore).toFixed(6), + }; + }); + + // Prepare output CSV format + const output = ["id,name,score", ...normalized.map((r) => `${r.id},${r.name},${r.score}`)].join("\n"); + + // Save results to file + saveFile(`scores/elo-credit-assignment-${agentName}.csv`, output); + console.log(`${colors.fg.green}✅ Saved results for ${agentName}${colors.reset}`); + } +} + +main().catch((error) => { + console.error(`${colors.fg.red}❌ Error:${colors.reset}`, error); + process.exit(1); +}); diff --git a/utils/utils.ts b/utils/utils.ts index fe40d89b..fdb17a1c 100644 --- a/utils/utils.ts +++ b/utils/utils.ts @@ -191,7 +191,7 @@ export function normalizeProjectName(name: string) { } export async function fetchModelSpecs(): Promise< - { name: string; profileUrl: string; style: string; constitution: string }[] + { name: string; profileUrl: string; style: string; constitution: string; scoring: string, ethics: string }[] > { const baseURL = `https://api.github.com/repos/evalscience/deepgov-gg23/contents/agents`; const contentURL = `https://raw.githubusercontent.com/evalscience/deepgov-gg23/refs/heads/main`; From 27a6e4840dc5efbfdbdcbe59300390f379dac7d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=E2=96=B3M=CE=9ES?= Date: Thu, 22 May 2025 20:25:26 +0100 Subject: [PATCH 8/9] refactor: integrate hypercert data into application processing and enhance scoring logic for improved evaluation --- README.md | 8 + .../hypercerts.json | 6 +- ...lo-credit-assignment-gitcoin-communist.csv | 8 +- ...edit-assignment-open-source-capitalist.csv | 10 +- scores/elo-credit-assignment-regenerator.csv | 8 +- scripts/credit-assignment-elo-v2.ts | 363 ------------------ scripts/credit-assignment-elo.ts | 221 ++++++++--- scripts/review-applications.ts | 59 +-- utils/utils.ts | 59 +++ 9 files changed, 262 insertions(+), 480 deletions(-) delete mode 100644 scripts/credit-assignment-elo-v2.ts diff --git a/README.md b/README.md index 1a561c92..42255ccc 100644 --- a/README.md +++ b/README.md @@ -52,6 +52,14 @@ bun run review-applications Evaluates grant applications using multiple AI agents with different evaluation criteria. Each agent provides an independent review based on the project's application, research data, and historical grant information. + +#### 6. Run Elo Tournament + +```sh +bun run review-applications +``` + + ### Data Structure The scripts work with the following directory structure: diff --git a/applications/42161/867/0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e/hypercerts.json b/applications/42161/867/0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e/hypercerts.json index f4365a36..3b025825 100644 --- a/applications/42161/867/0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e/hypercerts.json +++ b/applications/42161/867/0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e/hypercerts.json @@ -25,12 +25,12 @@ }, { "hypercert_id": "42220-0x16bA53B74c234C870c61EFC04cD418B8f2865959-23479483317544753978972847912792006590464", + "attestations": { + "data": [] + }, "metadata": { "name": "GainForest - Support XPRIZE Rainforest’s “Single Most Impactful Approach/Technology”", "description": "GainForest is co-lead of BioDivX, the XPRIZE Rainforest bonus prize award winner recognized as the “single most impactful approach/technology”. GainForest has worked with over 25 communities to bring crucial biodiversity data onchain, distributing $32K in conservation data income to local communities in 2024 for their role in recording crucial forestation and wildlife information, and co-hosted 13 workshops with local communities across 3 countries. GainForest is fostering global climate action through capacity building with local communities and scientists, maintaining a public database of AI+Environment models. By purchasing this hypercert, you are supporting the teams’ work on the ground to protect the world’s rainforests and incentivize local participation in climate initiatives." - }, - "attestations": { - "data": [] } }, { diff --git a/scores/elo-credit-assignment-gitcoin-communist.csv b/scores/elo-credit-assignment-gitcoin-communist.csv index 92366d26..c5a335f6 100644 --- a/scores/elo-credit-assignment-gitcoin-communist.csv +++ b/scores/elo-credit-assignment-gitcoin-communist.csv @@ -1,6 +1,6 @@ id,name,score 42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e,GainForest,0.211946 -42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca,Treegens DAO🌳,0.193779 -42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,ÆRTH - Planetary AI,0.199725 -42161-867-0xe573019b9f23a496663f5944a83c8acdc99792bfc5f5ad603ee8f6cb0f46f9fe,Hydrapad,0.188209 -42161-865-0x9119659eb8173b32bb4423f83702ee30c1e1db49ae0c07b00263bf3ea7f4d4ef,Deep Funding,0.206342 \ No newline at end of file +42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca,Treegens DAO🌳,0.187672 +42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,ÆRTH - Planetary AI,0.199991 +42161-867-0xe573019b9f23a496663f5944a83c8acdc99792bfc5f5ad603ee8f6cb0f46f9fe,Hydrapad,0.194028 +42161-865-0x9119659eb8173b32bb4423f83702ee30c1e1db49ae0c07b00263bf3ea7f4d4ef,Deep Funding,0.206363 \ No newline at end of file diff --git a/scores/elo-credit-assignment-open-source-capitalist.csv b/scores/elo-credit-assignment-open-source-capitalist.csv index 3b3f09f0..7565a1fe 100644 --- a/scores/elo-credit-assignment-open-source-capitalist.csv +++ b/scores/elo-credit-assignment-open-source-capitalist.csv @@ -1,6 +1,6 @@ id,name,score -42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e,GainForest,0.199719 -42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca,Treegens DAO🌳,0.194314 -42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,ÆRTH - Planetary AI,0.205544 -42161-867-0xe573019b9f23a496663f5944a83c8acdc99792bfc5f5ad603ee8f6cb0f46f9fe,Hydrapad,0.188485 -42161-865-0x9119659eb8173b32bb4423f83702ee30c1e1db49ae0c07b00263bf3ea7f4d4ef,Deep Funding,0.211938 \ No newline at end of file +42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e,GainForest,0.211946 +42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca,Treegens DAO🌳,0.187672 +42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,ÆRTH - Planetary AI,0.206391 +42161-867-0xe573019b9f23a496663f5944a83c8acdc99792bfc5f5ad603ee8f6cb0f46f9fe,Hydrapad,0.193734 +42161-865-0x9119659eb8173b32bb4423f83702ee30c1e1db49ae0c07b00263bf3ea7f4d4ef,Deep Funding,0.200256 \ No newline at end of file diff --git a/scores/elo-credit-assignment-regenerator.csv b/scores/elo-credit-assignment-regenerator.csv index 92366d26..7565a1fe 100644 --- a/scores/elo-credit-assignment-regenerator.csv +++ b/scores/elo-credit-assignment-regenerator.csv @@ -1,6 +1,6 @@ id,name,score 42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e,GainForest,0.211946 -42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca,Treegens DAO🌳,0.193779 -42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,ÆRTH - Planetary AI,0.199725 -42161-867-0xe573019b9f23a496663f5944a83c8acdc99792bfc5f5ad603ee8f6cb0f46f9fe,Hydrapad,0.188209 -42161-865-0x9119659eb8173b32bb4423f83702ee30c1e1db49ae0c07b00263bf3ea7f4d4ef,Deep Funding,0.206342 \ No newline at end of file +42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca,Treegens DAO🌳,0.187672 +42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,ÆRTH - Planetary AI,0.206391 +42161-867-0xe573019b9f23a496663f5944a83c8acdc99792bfc5f5ad603ee8f6cb0f46f9fe,Hydrapad,0.193734 +42161-865-0x9119659eb8173b32bb4423f83702ee30c1e1db49ae0c07b00263bf3ea7f4d4ef,Deep Funding,0.200256 \ No newline at end of file diff --git a/scripts/credit-assignment-elo-v2.ts b/scripts/credit-assignment-elo-v2.ts deleted file mode 100644 index eeec6e01..00000000 --- a/scripts/credit-assignment-elo-v2.ts +++ /dev/null @@ -1,363 +0,0 @@ -import { - fetchModelSpecs, - getApplicationId, - loadApplicationsFromDirectory, - loadReview, - saveFile, - loadApplication, - loadKarmaGap, - loadResearch, - getProjectName, -} from "../utils/utils"; - -import { creditAssignmentAgent } from "../agents/agents/credit-assigner"; - -// Core Elo scoring parameters -const BASE_RATING = 1000; -// INCREASE K_FACTOR SIGNIFICANTLY to amplify small magnitude differences -const K_FACTOR = 32; // Was 32, then 128 - -// Helper: Calculate expected score between two ratings (Standard Elo - currently unused in rating updates) -function expectedScore(ratingA: number, ratingB: number): number { - return 1 / (1 + Math.pow(10, (ratingB - ratingA) / 400)); -} - -// Helper: Update Elo rating after a matchup (Standard Elo - currently unused) -function updateElo(rating: number, expected: number, actual: number): number { - return rating + K_FACTOR * (actual - expected); -} - -// New Helper: Update rating based *only* on the actual score magnitude from the match -// function updateRatingDirectly(rating: number, actual: number): number { -// // actual is the score for *this* player (0 to 1, derived from magnitude 0.5-1.0) -// // differenceFromNeutral will be positive for wins (>0.5), negative for losses (<0.5) -// const differenceFromNeutral = actual - 0.5; -// // Adjust rating based on deviation from neutral, scaled by K_FACTOR -// return rating + K_FACTOR * differenceFromNeutral; -// } - -// Helper to safely get the 'output' of the last research entry for a specific agent -// Assumes input is a valid array with at least one element -const getResearchOutput = (researchAgentArray: any[]): string | undefined => { - // Access the last element safely and return its output field (if it exists) - return researchAgentArray[researchAgentArray.length - 1]?.output; -} - -// Helper function to process research data for a single agent type -const processResearchAgent = (appName: string, agentKey: string, agentData: any): string | undefined => { - if (Array.isArray(agentData) && agentData.length > 0) { - return getResearchOutput(agentData); - } else if (agentData !== undefined && agentData !== null) { - // Log if we get something other than undefined/null that's not a non-empty array - console.warn(`DEBUG: Unexpected research data type for ${appName} -> ${agentKey}. Expected array, got:`, typeof agentData, JSON.stringify(agentData)?.substring(0, 100) + '...'); - return undefined; - } - // Return undefined if data is null, undefined, or an empty array - return undefined; -}; - -// ANSI escape codes for colors -const colors = { - reset: "\x1b[0m", - bright: "\x1b[1m", - dim: "\x1b[2m", - underscore: "\x1b[4m", - blink: "\x1b[5m", - reverse: "\x1b[7m", - hidden: "\x1b[8m", - - fg: { - black: "\x1b[30m", - red: "\x1b[31m", - green: "\x1b[32m", - yellow: "\x1b[33m", - blue: "\x1b[34m", - magenta: "\x1b[35m", - cyan: "\x1b[36m", - white: "\x1b[37m", - crimson: "\x1b[38m" // Scarlet - }, - bg: { - black: "\x1b[40m", - red: "\x1b[41m", - green: "\x1b[42m", - yellow: "\x1b[43m", - blue: "\x1b[44m", - magenta: "\x1b[45m", - cyan: "\x1b[46m", - white: "\x1b[47m", - crimson: "\x1b[48m" - } -}; - -async function main() { - // Load all applications from the directory - const applications = loadApplicationsFromDirectory(); - console.log(`Processing ${colors.fg.yellow}${applications.length}${colors.reset} applications...`); - - // Pre-load all necessary data for each application - console.log("Pre-loading application data (app, research, karmagap)..."); - const applicationDataMap = new Map(); - for (const app of applications) { - const id = getApplicationId(app); - const name = getProjectName(app) || id; - applicationDataMap.set(id, { - id, - name, - application: loadApplication(id), - research: loadResearch(id), // This loads the whole research object for the ID - karmaGap: loadKarmaGap(id), - }); - } - // Add this logging temporarily AFTER the loop finishes (around line 53) - const gainForestId = "42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e"; // Fixed ID - const treegensId = "42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca"; - // console.log("DEBUG Preloaded GainForest Data:", JSON.stringify(applicationDataMap.get(gainForestId), null, 2)); - // console.log("DEBUG Preloaded Treegens Data:", JSON.stringify(applicationDataMap.get(treegensId), null, 2)); - // console.log("Finished pre-loading data."); - - // Get the available review models/agents - const modelSpecs = await fetchModelSpecs(); - - // Load all reviews for all agents, associating with pre-loaded data - const reviewsByAgent = modelSpecs.reduce((acc, agent) => { - const agentName = agent?.name; - acc[agentName] = applications - .map((app) => { - const id = getApplicationId(app); - const baseData = applicationDataMap.get(id); - if (!baseData) return null; - - const reviewData = loadReview(id, agentName); - // We need review data to proceed for this agent - if (!reviewData) return null; - - // Add the review data to the base data - return { ...baseData, review: reviewData, ethics: agent.ethics, constitution: agent.constitution, scoring: agent.scoring }; - }) - .filter(Boolean); // Filter out any apps where review data was missing for this agent - return acc; - }, {} as Record); - - - // Loop through each agent's reviews - for (const [agentName, agentApplicationsData] of Object.entries(reviewsByAgent)) { - console.log(`\n${colors.fg.magenta}🎯 Running Elo tournament for agent: ${colors.bright}${agentName}${colors.reset}`); - - // Initialize all ratings - const ratings: Record = {}; - for (const { id } of agentApplicationsData) { - ratings[id] = BASE_RATING; - } - - // Run simulated pairwise matchups (round-robin style) - for (let i = 0; i < agentApplicationsData.length; i++) { - for (let j = i + 1; j < agentApplicationsData.length; j++) { - const appA = agentApplicationsData[i]!; - const appB = agentApplicationsData[j]!; - - // Process research data using the helper function for better debugging - const academicResearchOutputA = processResearchAgent(appA.name, 'Academic_Research_Agent', appA.research?.Academic_Research_Agent); - const academicResearchOutputB = processResearchAgent(appB.name, 'Academic_Research_Agent', appB.research?.Academic_Research_Agent); - - const factCheckingOutputA = processResearchAgent(appA.name, 'Fact_Checking_Agent', appA.research?.Fact_Checking_Agent); - const factCheckingOutputB = processResearchAgent(appB.name, 'Fact_Checking_Agent', appB.research?.Fact_Checking_Agent); - - const webSearchOutputA = processResearchAgent(appA.name, 'Web_Search_Agent', appA.research?.Web_Search_Agent); - const webSearchOutputB = processResearchAgent(appB.name, 'Web_Search_Agent', appB.research?.Web_Search_Agent); - - const primaryResearchOutputA = processResearchAgent(appA.name, 'Primary_Research_Agent', appA.research?.Primary_Research_Agent); - const primaryResearchOutputB = processResearchAgent(appB.name, 'Primary_Research_Agent', appB.research?.Primary_Research_Agent); - - const dataAnalysisOutputA = processResearchAgent(appA.name, 'Data_Analysis_Agent', appA.research?.Data_Analysis_Agent); - const dataAnalysisOutputB = processResearchAgent(appB.name, 'Data_Analysis_Agent', appB.research?.Data_Analysis_Agent); - - - // Prepare data snippets for the prompt (avoid stringifying huge objects) - // Note: application_summary now uses metadata.description if available - const projectAData = { - title: appA.name, - scoring: appA.scoring, - ethics: appA.ethics, - constitution: appA.constitution, - application_summary: appA.application?.project?.metadata?.description ?? appA.application?.project?.description, // Use metadata first - karmagap_score: appA.karmaGap?.score, - reviewer_comment: appA.review?.comments, - academic_research: academicResearchOutputA, - fact_checking: factCheckingOutputA, - web_search: webSearchOutputA, - primary_research: primaryResearchOutputA, - data_analysis: dataAnalysisOutputA - }; - const projectBData = { - title: appB.name, - scoring: appB.scoring, - ethics: appB.ethics, - constitution: appB.constitution, - application_summary: appB.application?.project?.metadata?.description ?? appB.application?.project?.description, // Use metadata first - karmagap_score: appB.karmaGap?.score, - reviewer_comment: appB.review?.comments, - academic_research: academicResearchOutputB, - fact_checking: factCheckingOutputB, - web_search: webSearchOutputB, - primary_research: primaryResearchOutputB, - data_analysis: dataAnalysisOutputB - }; - // --- DEBUG LOGGING START --- - // Log data specifically for the GainForest vs Treegens DAO comparison - const isTargetComparison = (appA.name === "GainForest" && appB.name === "Treegens DAO🌳") || (appA.name === "Treegens DAO🌳" && appB.name === "GainForest"); - if (isTargetComparison) { - // Trim potentially very long outputs for concise debug logging - const trim = (s: string | undefined) => s ? s.substring(0, 100) + '...' : undefined; - console.log(`${colors.fg.blue}DEBUG Data sent to agent for ${colors.bright}${ appA.name}${colors.reset} ${JSON.stringify({ - ...projectAData, - academic_research: trim(projectAData.academic_research), - fact_checking: trim(projectAData.fact_checking), - web_search: trim(projectAData.web_search), - primary_research: trim(projectAData.primary_research), - data_analysis: trim(projectAData.data_analysis), - application_summary: trim(projectAData.application_summary), - reviewer_comment: trim(projectAData.reviewer_comment) - }, null, 2)}${colors.reset}`); - console.log(`${colors.fg.blue}DEBUG Data sent to agent for ${colors.bright}${ appB.name}${colors.reset} ${JSON.stringify({ - ...projectBData, - academic_research: trim(projectBData.academic_research), - fact_checking: trim(projectBData.fact_checking), - web_search: trim(projectBData.web_search), - primary_research: trim(projectBData.primary_research), - data_analysis: trim(projectBData.data_analysis), - application_summary: trim(projectBData.application_summary), - reviewer_comment: trim(projectBData.reviewer_comment) - }, null, 2)}${colors.reset}`); - } - // --- DEBUG LOGGING END --- - - const oldPrompt = ` - - `; - // Refined prompt with clearer scale definitions - const prompt = ` -You are an expert grant allocator reviewing and comparing two different projects. - -Choose the one that deserves *more funding*, based on impact, clarity, roadmap, potential, feasibility, and overall quality presented in the data below. - -Consider all available information including application summaries, research reports, fact checks and specific reviewer comments, while considering their particular scoring, ethics and consitution. - -You are essentially a judge in the tournament which gives a score based on each Agent's review, so it's important you strongly consider reviewer_comment along with the metric data provided. Give weight to the research outputs (web search, fact checking, academic context) as objective inputs alongside the application summary (project's own description) and the reviewer's subjective comment. - -Respond ONLY with "A" or "B". Do NOT explain. - ---- Project A --- -${JSON.stringify(projectAData, null, 2)} - ---- Project B --- -${JSON.stringify(projectBData, null, 2)} -`; - - const result = await creditAssignmentAgent.generate(prompt); - - // Parse the response "W,S" - // const responseParts = result.text.trim().toUpperCase().split(','); - // let actualA = 0.5; // Default to draw - // let actualB = 0.5; - // let winnerLetter = 'DRAW'; - // let winMagnitude = 0.5; - - - // if (responseParts.length === 2) { - // // Linter Fix: Assert that parts[0] and parts[1] are defined because we checked length === 2 - // winnerLetter = responseParts[0]!; - // const scoreString = responseParts[1]!; - // // console.log("winnerLetter is ", winnerLetter); // Keep logging minimal for clarity - // // console.log("scoreString is ", scoreString); - // // Check if parts are defined before using them (Redundant due to length check and assertion, but safe) - // // if (winnerLetter !== undefined && scoreString !== undefined) { - // winMagnitude = parseFloat(scoreString); - - // if ((winnerLetter === 'A' || winnerLetter === 'B') && !isNaN(winMagnitude) && winMagnitude >= 0.5 && winMagnitude <= 1.0) { - // if (winnerLetter === 'A') { - // actualA = winMagnitude; - // actualB = 1 - winMagnitude; - // } else { // Winner is B - // actualB = winMagnitude; - // actualA = 1 - winMagnitude; - // } - // } else { - // console.warn(`⚠️ Invalid response format/score: ${result.text}. Draw.`); - // actualA = 0.5; actualB = 0.5; winnerLetter = 'DRAW'; winMagnitude = 0.5; - // } - // // } else { // This block becomes unreachable due to length check / assertion - // // console.warn(`⚠️ Invalid response parts: ${result.text}. Draw.`); - // // actualA = 0.5; actualB = 0.5; winnerLetter = 'DRAW'; winMagnitude = 0.5; - // // } - // } else { - // console.warn( - // `⚠️ Unexpected response format: "${result.text.trim()}". Draw.` - // ); - // actualA = 0.5; actualB = 0.5; winnerLetter = 'DRAW'; winMagnitude = 0.5; - // } - - // Log the outcome of the match concisely only if not the debugged comparison - // Always log for now to see results - // console.log(`Match: ${appA.name} vs ${appB.name} -> Winner: ${winnerLetter}, Mag: ${winMagnitude.toFixed(2)}`); - - const winner = result.text.trim().toUpperCase(); - console.log(`Match: ${colors.fg.cyan}${appA.name}${colors.reset} vs ${colors.fg.cyan}${appB.name}${colors.reset} -> Winner: ${winner === "A" || winner === "B" ? colors.fg.green : colors.fg.yellow}${winner}${colors.reset}`); - - const ratingA = ratings[appA.id]!; - const ratingB = ratings[appB.id]!; - const expectedA = expectedScore(ratingA, ratingB); - const expectedB = expectedScore(ratingB, ratingA); - - if (winner === "A") { - ratings[appA.id] = updateElo(ratingA, expectedA, 1); - ratings[appB.id] = updateElo(ratingB, expectedB, 0); - } else if (winner === "B") { - ratings[appA.id] = updateElo(ratingA, expectedA, 0); - ratings[appB.id] = updateElo(ratingB, expectedB, 1); - } else { - // If the response is neither A nor B, treat it as a draw. - // Elo ratings don't change for a draw with actual scores of 0.5 vs 0.5 against expected. - ratings[appA.id] = updateElo(ratingA, expectedA, 0.5); - ratings[appB.id] = updateElo(ratingB, expectedB, 0.5); - console.warn(`⚠️ ${colors.fg.yellow}Unexpected response: "${result.text.trim()}". Treating as a draw.${colors.reset}`); - } - // Optional: Log rating changes if needed for debugging - // console.log(` Ratings: A=${ratings[appA.id]!.toFixed(1)}, B=${ratings[appB.id]!.toFixed(1)}`); - - } - } - - // Log raw final scores before normalization - console.log("Raw final ratings:", JSON.stringify(ratings, null, 2)); - - - // Normalize scores so they sum to 1 (for funding allocation) - const totalScore = Object.values(ratings).reduce((sum, score) => sum + score, 0); - const normalized = Object.entries(ratings).map(([id, score]) => { - const name = applicationDataMap.get(id)?.name || id; - // Ensure score is not negative before normalization, although unlikely with BASE_RATING=1000 and K=128 unless many losses occur - const nonNegativeScore = Math.max(0, score); - // Prevent division by zero or negative totals - const safeTotalScore = Math.max(1, totalScore); // Avoid total being 0 or negative - - return { - id, - name, - score: (nonNegativeScore / safeTotalScore).toFixed(6), - }; - }); - - // Prepare output CSV format - const output = ["id,name,score", ...normalized.map((r) => `${r.id},${r.name},${r.score}`)].join("\n"); - - // Save results to file - saveFile(`scores/elo-credit-assignment-${agentName}.csv`, output); - console.log(`${colors.fg.green}✅ Saved results for ${agentName}${colors.reset}`); - } -} - -main().catch((error) => { - console.error(`${colors.fg.red}❌ Error:${colors.reset}`, error); - process.exit(1); -}); diff --git a/scripts/credit-assignment-elo.ts b/scripts/credit-assignment-elo.ts index d10cf0ef..b2d0d4a3 100644 --- a/scripts/credit-assignment-elo.ts +++ b/scripts/credit-assignment-elo.ts @@ -8,44 +8,121 @@ import { loadKarmaGap, loadResearch, getProjectName, + loadHypercerts, + parseHypercerts, } from "../utils/utils"; import { creditAssignmentAgent } from "../agents/agents/credit-assigner"; // Core Elo scoring parameters const BASE_RATING = 1000; -const K_FACTOR = 32; // Adjust this to control how volatile the scores are +// INCREASE K_FACTOR SIGNIFICANTLY to amplify small magnitude differences +const K_FACTOR = 32; // Was 32, then 128 -// Helper: Calculate expected score between two ratings +// Helper: Calculate expected score between two ratings (Standard Elo - currently unused in rating updates) function expectedScore(ratingA: number, ratingB: number): number { return 1 / (1 + Math.pow(10, (ratingB - ratingA) / 400)); } -// Helper: Update Elo rating after a matchup +// Helper: Update Elo rating after a matchup (Standard Elo - currently unused) function updateElo(rating: number, expected: number, actual: number): number { return rating + K_FACTOR * (actual - expected); } +// New Helper: Update rating based *only* on the actual score magnitude from the match +// function updateRatingDirectly(rating: number, actual: number): number { +// // actual is the score for *this* player (0 to 1, derived from magnitude 0.5-1.0) +// // differenceFromNeutral will be positive for wins (>0.5), negative for losses (<0.5) +// const differenceFromNeutral = actual - 0.5; +// // Adjust rating based on deviation from neutral, scaled by K_FACTOR +// return rating + K_FACTOR * differenceFromNeutral; +// } + +// Helper to safely get the 'output' of the last research entry for a specific agent +// Assumes input is a valid array with at least one element +const getResearchOutput = (researchAgentArray: any[]): string | undefined => { + // Access the last element safely and return its output field (if it exists) + return researchAgentArray[researchAgentArray.length - 1]?.output; +} + +// Helper function to process research data for a single agent type +const processResearchAgent = (appName: string, agentKey: string, agentData: any): string | undefined => { + if (Array.isArray(agentData) && agentData.length > 0) { + return getResearchOutput(agentData); + } else if (agentData !== undefined && agentData !== null) { + // Log if we get something other than undefined/null that's not a non-empty array + console.warn(`DEBUG: Unexpected research data type for ${appName} -> ${agentKey}. Expected array, got:`, typeof agentData, JSON.stringify(agentData)?.substring(0, 100) + '...'); + return undefined; + } + // Return undefined if data is null, undefined, or an empty array + return undefined; +}; + +// ANSI escape codes for colors +const colors = { + reset: "\x1b[0m", + bright: "\x1b[1m", + dim: "\x1b[2m", + underscore: "\x1b[4m", + blink: "\x1b[5m", + reverse: "\x1b[7m", + hidden: "\x1b[8m", + + fg: { + black: "\x1b[30m", + red: "\x1b[31m", + green: "\x1b[32m", + yellow: "\x1b[33m", + blue: "\x1b[34m", + magenta: "\x1b[35m", + cyan: "\x1b[36m", + white: "\x1b[37m", + crimson: "\x1b[38m" // Scarlet + }, + bg: { + black: "\x1b[40m", + red: "\x1b[41m", + green: "\x1b[42m", + yellow: "\x1b[43m", + blue: "\x1b[44m", + magenta: "\x1b[45m", + cyan: "\x1b[46m", + white: "\x1b[47m", + crimson: "\x1b[48m" + } +}; + async function main() { // Load all applications from the directory const applications = loadApplicationsFromDirectory(); - console.log(`Processing ${applications.length} applications...`); + console.log(`Processing ${colors.fg.yellow}${applications.length}${colors.reset} applications...`); // Pre-load all necessary data for each application console.log("Pre-loading application data (app, research, karmagap)..."); const applicationDataMap = new Map(); + + for (const app of applications) { - const applicationId = getApplicationId(app); - const name = getProjectName(app) || applicationId; - applicationDataMap.set(applicationId, { - applicationId, + const id = getApplicationId(app); + const name = getProjectName(app) || id; + const hypercerts = loadHypercerts(id); + const hypercertsData = parseHypercerts(hypercerts ?? { data: { hypercerts: { data: [] } } }); + + applicationDataMap.set(id, { + id, name, - application: loadApplication(applicationId), - research: loadResearch(applicationId), - karmaGap: loadKarmaGap(applicationId), + application: loadApplication(id), + research: loadResearch(id), // This loads the whole research object for the ID + karmaGap: loadKarmaGap(id), + hypercerts: hypercertsData, }); } - console.log("Finished pre-loading data."); + // Add this logging temporarily AFTER the loop finishes (around line 53) + const gainForestId = "42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e"; // Fixed ID + const treegensId = "42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca"; + // console.log("DEBUG Preloaded GainForest Data:", JSON.stringify(applicationDataMap.get(gainForestId), null, 2)); + // console.log("DEBUG Preloaded Treegens Data:", JSON.stringify(applicationDataMap.get(treegensId), null, 2)); + // console.log("Finished pre-loading data."); // Get the available review models/agents const modelSpecs = await fetchModelSpecs(); @@ -55,27 +132,30 @@ async function main() { const agentName = agent?.name; acc[agentName] = applications .map((app) => { - const applicationId = getApplicationId(app); - const baseData = applicationDataMap.get(applicationId); + const id = getApplicationId(app); + const baseData = applicationDataMap.get(id); if (!baseData) return null; - const reviewData = loadReview(applicationId, agentName); + const reviewData = loadReview(id, agentName); + // We need review data to proceed for this agent if (!reviewData) return null; - return { ...baseData, review: reviewData }; + // Add the review data to the base data + return { ...baseData, review: reviewData, ethics: agent.ethics, constitution: agent.constitution, scoring: agent.scoring }; }) - .filter(Boolean); + .filter(Boolean); // Filter out any apps where review data was missing for this agent return acc; }, {} as Record); + // Loop through each agent's reviews for (const [agentName, agentApplicationsData] of Object.entries(reviewsByAgent)) { - console.log(`\n🎯 Running Elo tournament for agent: ${agentName}`); + console.log(`\n${colors.fg.magenta}🎯 Running Elo tournament for agent: ${colors.bright}${agentName}${colors.reset}`); // Initialize all ratings const ratings: Record = {}; - for (const { applicationId } of agentApplicationsData) { - ratings[applicationId] = BASE_RATING; + for (const { id } of agentApplicationsData) { + ratings[id] = BASE_RATING; } // Run simulated pairwise matchups (round-robin style) @@ -84,28 +164,65 @@ async function main() { const appA = agentApplicationsData[i]!; const appB = agentApplicationsData[j]!; + // Process research data using the helper function for better debugging + const academicResearchOutputA = processResearchAgent(appA.name, 'Academic_Research_Agent', appA.research?.Academic_Research_Agent); + const academicResearchOutputB = processResearchAgent(appB.name, 'Academic_Research_Agent', appB.research?.Academic_Research_Agent); + + const factCheckingOutputA = processResearchAgent(appA.name, 'Fact_Checking_Agent', appA.research?.Fact_Checking_Agent); + const factCheckingOutputB = processResearchAgent(appB.name, 'Fact_Checking_Agent', appB.research?.Fact_Checking_Agent); + + const webSearchOutputA = processResearchAgent(appA.name, 'Web_Search_Agent', appA.research?.Web_Search_Agent); + const webSearchOutputB = processResearchAgent(appB.name, 'Web_Search_Agent', appB.research?.Web_Search_Agent); + + const primaryResearchOutputA = processResearchAgent(appA.name, 'Primary_Research_Agent', appA.research?.Primary_Research_Agent); + const primaryResearchOutputB = processResearchAgent(appB.name, 'Primary_Research_Agent', appB.research?.Primary_Research_Agent); + + const dataAnalysisOutputA = processResearchAgent(appA.name, 'Data_Analysis_Agent', appA.research?.Data_Analysis_Agent); + const dataAnalysisOutputB = processResearchAgent(appB.name, 'Data_Analysis_Agent', appB.research?.Data_Analysis_Agent); + // Prepare data snippets for the prompt (avoid stringifying huge objects) + // Note: application_summary now uses metadata.description if available const projectAData = { title: appA.name, - application_summary: appA.application?.project?.description, - research_summary: appA.research?.summary, + scoring: appA.scoring, + ethics: appA.ethics, + constitution: appA.constitution, + application_summary: appA.application?.project?.metadata?.description ?? appA.application?.project?.description, // Use metadata first karmagap_score: appA.karmaGap?.score, - reviewer_comment: appA.review?.comments + reviewer_comment: appA.review?.comments, + academic_research: academicResearchOutputA, + fact_checking: factCheckingOutputA, + web_search: webSearchOutputA, + primary_research: primaryResearchOutputA, + data_analysis: dataAnalysisOutputA, + hypercerts: appA.hypercerts }; const projectBData = { title: appB.name, - application_summary: appB.application?.project?.description, - research_summary: appB.research?.summary, + scoring: appB.scoring, + ethics: appB.ethics, + constitution: appB.constitution, + application_summary: appB.application?.project?.metadata?.description ?? appB.application?.project?.description, // Use metadata first karmagap_score: appB.karmaGap?.score, - reviewer_comment: appB.review?.comments + reviewer_comment: appB.review?.comments, + academic_research: academicResearchOutputB, + fact_checking: factCheckingOutputB, + web_search: webSearchOutputB, + primary_research: primaryResearchOutputB, + data_analysis: dataAnalysisOutputB, + hypercerts: appB.hypercerts }; + // Refined prompt with clearer scale definitions const prompt = ` -You are a grant allocator reviewing two projects. Consider all available information. +You are an expert grant allocator reviewing and comparing two different projects. + +Choose the one that deserves *more funding*, based on impact, clarity, roadmap, potential, feasibility, and overall quality presented in the data below. + +Consider all available information including application summaries, research reports, fact checks and specific reviewer comments, while considering their particular scoring, ethics and consitution. -Choose the one that deserves *more funding*, based on impact, clarity, roadmap, potential, and overall quality presented in the data below. +You are essentially a judge in the tournament which gives a score based on each Agent's review, so it's important you strongly consider reviewer_comment along with the metric data provided. Give weight to the research outputs (web search, fact checking, academic context) as objective inputs alongside the application summary (project's own description) and the reviewer's subjective comment. -You are essentially a judge in the tournament which gives a score based on each Agent's review, so it's important you strongly consider reviewer_comment along with the metric data provided. Respond ONLY with "A" or "B". Do NOT explain. --- Project A --- @@ -116,46 +233,64 @@ ${JSON.stringify(projectBData, null, 2)} `; const result = await creditAssignmentAgent.generate(prompt); + const winner = result.text.trim().toUpperCase(); + console.log(`Match: ${colors.fg.cyan}${appA.name}${colors.reset} vs ${colors.fg.cyan}${appB.name}${colors.reset} -> Winner: ${winner === "A" || winner === "B" ? colors.fg.green : colors.fg.yellow}${winner}${colors.reset}`); - const ratingA = ratings[appA.applicationId]!; - const ratingB = ratings[appB.applicationId]!; + const ratingA = ratings[appA.id]!; + const ratingB = ratings[appB.id]!; const expectedA = expectedScore(ratingA, ratingB); const expectedB = expectedScore(ratingB, ratingA); if (winner === "A") { - ratings[appA.applicationId] = updateElo(ratingA, expectedA, 1); - ratings[appB.applicationId] = updateElo(ratingB, expectedB, 0); + ratings[appA.id] = updateElo(ratingA, expectedA, 1); + ratings[appB.id] = updateElo(ratingB, expectedB, 0); } else if (winner === "B") { - ratings[appA.applicationId] = updateElo(ratingA, expectedA, 0); - ratings[appB.applicationId] = updateElo(ratingB, expectedB, 1); + ratings[appA.id] = updateElo(ratingA, expectedA, 0); + ratings[appB.id] = updateElo(ratingB, expectedB, 1); } else { - console.warn(`⚠️ Unexpected response: ${result.text}`); + // If the response is neither A nor B, treat it as a draw. + // Elo ratings don't change for a draw with actual scores of 0.5 vs 0.5 against expected. + ratings[appA.id] = updateElo(ratingA, expectedA, 0.5); + ratings[appB.id] = updateElo(ratingB, expectedB, 0.5); + console.warn(`⚠️ ${colors.fg.yellow}Unexpected response: "${result.text.trim()}". Treating as a draw.${colors.reset}`); } + // Optional: Log rating changes if needed for debugging + // console.log(` Ratings: A=${ratings[appA.id]!.toFixed(1)}, B=${ratings[appB.id]!.toFixed(1)}`); + } } + // Log raw final scores before normalization + console.log("Raw final ratings:", JSON.stringify(ratings, null, 2)); + + // Normalize scores so they sum to 1 (for funding allocation) const totalScore = Object.values(ratings).reduce((sum, score) => sum + score, 0); - const normalized = Object.entries(ratings).map(([applicationId, score]) => { - const name = applicationDataMap.get(applicationId)?.name || applicationId; + const normalized = Object.entries(ratings).map(([id, score]) => { + const name = applicationDataMap.get(id)?.name || id; + // Ensure score is not negative before normalization, although unlikely with BASE_RATING=1000 and K=128 unless many losses occur + const nonNegativeScore = Math.max(0, score); + // Prevent division by zero or negative totals + const safeTotalScore = Math.max(1, totalScore); // Avoid total being 0 or negative + return { - applicationId, + id, name, - score: (score / totalScore).toFixed(6), + score: (nonNegativeScore / safeTotalScore).toFixed(6), }; }); // Prepare output CSV format - const output = ["applicationId,name,score", ...normalized.map((r) => `${r.applicationId},${r.name},${r.score}`)].join("\n"); + const output = ["id,name,score", ...normalized.map((r) => `${r.id},${r.name},${r.score}`)].join("\n"); // Save results to file saveFile(`scores/elo-credit-assignment-${agentName}.csv`, output); - console.log(`✅ Saved results for ${agentName}`); + console.log(`${colors.fg.green}✅ Saved results for ${agentName}${colors.reset}`); } } main().catch((error) => { - console.error("❌ Error:", error); + console.error(`${colors.fg.red}❌ Error:${colors.reset}`, error); process.exit(1); }); diff --git a/scripts/review-applications.ts b/scripts/review-applications.ts index 6a212a30..cc5ba6ec 100644 --- a/scripts/review-applications.ts +++ b/scripts/review-applications.ts @@ -12,6 +12,7 @@ import { loadKarmaGap, loadResearch, loadHypercerts, + parseHypercerts, } from "../utils/utils"; import { evaluationAgent } from "../agents/agents/evaluator"; @@ -162,61 +163,3 @@ function parseKarmaGap({ grants }: KarmaGapData) { })); } -// Interfaces for Hypercerts -interface HypercertAttestation { - attester: string; - creation_block_timestamp: string; - data: { - title: string; - sources: string[]; - chain_id: number; - token_id: string; - description: string; - contract_address: string; - }; - id: string; -} - -interface HypercertData { - hypercert_id: string; - metadata: { - name: string; - description: string; - }; - attestations: { - data: HypercertAttestation[]; - }; -} - -interface HypercertsResponse { - data: { - hypercerts: { - count: number; - data: HypercertData[]; - }; - }; -} - -function parseHypercerts(hypercerts: HypercertsResponse) { - if (!hypercerts?.data?.hypercerts?.data) { - return []; - } - - return hypercerts.data.hypercerts.data.flatMap((hypercert) => { - const { hypercert_id, metadata } = hypercert; - - if (!hypercert.attestations?.data || hypercert.attestations.data.length === 0) { - return []; - } - - return hypercert.attestations.data.map((attestation) => ({ - hypercert_id, - metadata, - attester: attestation.attester, - timestamp: new Date(parseInt(attestation.creation_block_timestamp) * 1000).toLocaleDateString(), - title: attestation.data.title, - description: attestation.data.description, - sources: attestation.data.sources, - })); - }); -} diff --git a/utils/utils.ts b/utils/utils.ts index fdb17a1c..d0ed1254 100644 --- a/utils/utils.ts +++ b/utils/utils.ts @@ -219,3 +219,62 @@ export async function fetchModelSpecs(): Promise< })) ); } +// Interfaces for Hypercerts +interface HypercertAttestation { + attester: string; + creation_block_timestamp: string; + data: { + title: string; + sources: string[]; + chain_id: number; + token_id: string; + description: string; + contract_address: string; + }; + id: string; +} + +interface HypercertData { + hypercert_id: string; + metadata: { + name: string; + description: string; + }; + attestations: { + data: HypercertAttestation[]; + }; +} + + +interface HypercertsResponse { + data: { + hypercerts: { + count: number; + data: HypercertData[]; + }; + }; +} + +export function parseHypercerts(hypercerts: HypercertsResponse) { + if (!hypercerts?.data?.hypercerts?.data) { + return []; + } + + return hypercerts.data.hypercerts.data.flatMap((hypercert) => { + const { hypercert_id, metadata } = hypercert; + + if (!hypercert.attestations?.data || hypercert.attestations.data.length === 0) { + return []; + } + + return hypercert.attestations.data.map((attestation) => ({ + hypercert_id, + metadata, + attester: attestation.attester, + timestamp: new Date(parseInt(attestation.creation_block_timestamp) * 1000).toLocaleDateString(), + title: attestation.data.title, + description: attestation.data.description, + sources: attestation.data.sources, + })); + }); +} From e16a217cc97c5e3d813d7b193848fd21f38260bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=E2=96=B3M=CE=9ES?= Date: Thu, 22 May 2025 20:29:18 +0100 Subject: [PATCH 9/9] refactor: remove temporary debug logging from credit assignment script for cleaner code --- README.md | 1 + scripts/credit-assignment-elo.ts | 8 +------- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 42255ccc..0e4f16ae 100644 --- a/README.md +++ b/README.md @@ -60,6 +60,7 @@ bun run review-applications ``` + ### Data Structure The scripts work with the following directory structure: diff --git a/scripts/credit-assignment-elo.ts b/scripts/credit-assignment-elo.ts index b2d0d4a3..7fa886cc 100644 --- a/scripts/credit-assignment-elo.ts +++ b/scripts/credit-assignment-elo.ts @@ -117,13 +117,7 @@ async function main() { hypercerts: hypercertsData, }); } - // Add this logging temporarily AFTER the loop finishes (around line 53) - const gainForestId = "42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e"; // Fixed ID - const treegensId = "42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca"; - // console.log("DEBUG Preloaded GainForest Data:", JSON.stringify(applicationDataMap.get(gainForestId), null, 2)); - // console.log("DEBUG Preloaded Treegens Data:", JSON.stringify(applicationDataMap.get(treegensId), null, 2)); - // console.log("Finished pre-loading data."); - + // Get the available review models/agents const modelSpecs = await fetchModelSpecs();