diff --git a/.gitignore b/.gitignore index a14702c4..e06d00f8 100644 --- a/.gitignore +++ b/.gitignore @@ -32,3 +32,5 @@ report.[0-9]_.[0-9]_.[0-9]_.[0-9]_.json # Finder (MacOS) folder config .DS_Store + +.qodo \ No newline at end of file diff --git a/README.md b/README.md index 1a561c92..0e4f16ae 100644 --- a/README.md +++ b/README.md @@ -52,6 +52,15 @@ bun run review-applications Evaluates grant applications using multiple AI agents with different evaluation criteria. Each agent provides an independent review based on the project's application, research data, and historical grant information. + +#### 6. Run Elo Tournament + +```sh +bun run scripts/credit-assignment-elo.ts +``` + + + ### Data Structure The scripts work with the following directory structure: diff --git a/agents/agents/credit-assigner.ts b/agents/agents/credit-assigner.ts index 895bfd61..8ac9defd 100644 --- a/agents/agents/credit-assigner.ts +++ b/agents/agents/credit-assigner.ts @@ -10,5 +10,6 @@ export const creditAssignmentAgent = new Agent({ You are given a list of applications reviews and you should assign a score between 0.00 and 1.00 to each review based on how much funding the project deserve. The total score of all reviews should be 1.00. 
`, - model: google("gemini-2.0-flash-thinking-exp-01-21"), + //model: google("gemini-2.0-flash-thinking-exp-01-21"), + model: openai("gpt-4.1-2025-04-14"), }); diff --git a/applications/42161/867/0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e/hypercerts.json b/applications/42161/867/0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e/hypercerts.json index f4365a36..3b025825 100644 --- a/applications/42161/867/0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e/hypercerts.json +++ b/applications/42161/867/0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e/hypercerts.json @@ -25,12 +25,12 @@ }, { "hypercert_id": "42220-0x16bA53B74c234C870c61EFC04cD418B8f2865959-23479483317544753978972847912792006590464", + "attestations": { + "data": [] + }, "metadata": { "name": "GainForest - Support XPRIZE Rainforest’s “Single Most Impactful Approach/Technology”", "description": "GainForest is co-lead of BioDivX, the XPRIZE Rainforest bonus prize award winner recognized as the “single most impactful approach/technology”. GainForest has worked with over 25 communities to bring crucial biodiversity data onchain, distributing $32K in conservation data income to local communities in 2024 for their role in recording crucial forestation and wildlife information, and co-hosted 13 workshops with local communities across 3 countries. GainForest is fostering global climate action through capacity building with local communities and scientists, maintaining a public database of AI+Environment models. By purchasing this hypercert, you are supporting the teams’ work on the ground to protect the world’s rainforests and incentivize local participation in climate initiatives." 
- }, - "attestations": { - "data": [] } }, { diff --git a/scores/credit-assignment-gitcoin-communist.csv b/scores/credit-assignment-gitcoin-communist.csv new file mode 100644 index 00000000..921c4c77 --- /dev/null +++ b/scores/credit-assignment-gitcoin-communist.csv @@ -0,0 +1,6 @@ +id,name,score +42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e,GainForest,0.3125 +42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca,Treegens DAO,0.2250 +42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,ÆRTH - Planetary AI,0.0875 +42161-867-0xe573019b9f23a496663f5944a83c8acdc99792bfc5f5ad603ee8f6cb0f46f9fe,Hydrapad,0.2500 +42161-865-0x9119659eb8173b32bb4423f83702ee30c1e1db49ae0c07b00263bf3ea7f4d4ef,Deep Funding,0.1250 \ No newline at end of file diff --git a/scores/credit-assignment-open-source-capitalist.csv b/scores/credit-assignment-open-source-capitalist.csv new file mode 100644 index 00000000..6fe5960c --- /dev/null +++ b/scores/credit-assignment-open-source-capitalist.csv @@ -0,0 +1,5 @@ +42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e,GainForest,0.3500 +42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca,Treegens DAO,0.2500 +42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,ÆRTH - Planetary AI,0.1000 +42161-867-0xe573019b9f23a496663f5944a83c8acdc99792bfc5f5ad603ee8f6cb0f46f9fe,Hydrapad,0.1500 +42161-865-0x9119659eb8173b32bb4423f83702ee30c1e1db49ae0c07b00263bf3ea7f4d4ef,Deep Funding,0.1500 \ No newline at end of file diff --git a/scores/credit-assignment-regenerator.csv b/scores/credit-assignment-regenerator.csv new file mode 100644 index 00000000..29d23592 --- /dev/null +++ b/scores/credit-assignment-regenerator.csv @@ -0,0 +1,6 @@ +id,name,score +42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e,GainForest,0.3800 
+42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca,Treegens DAO,0.1800 +42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,ÆRTH - Planetary AI,0.0800 +42161-867-0xe573019b9f23a496663f5944a83c8acdc99792bfc5f5ad603ee8f6cb0f46f9fe,Hydrapad,0.2300 +42161-865-0x9119659eb8173b32bb4423f83702ee30c1e1db49ae0c07b00263bf3ea7f4d4ef,Deep Funding,0.1300 \ No newline at end of file diff --git a/scores/elo-credit-assignment-gitcoin-communist-from-review-only.csv b/scores/elo-credit-assignment-gitcoin-communist-from-review-only.csv new file mode 100644 index 00000000..fae0c4f7 --- /dev/null +++ b/scores/elo-credit-assignment-gitcoin-communist-from-review-only.csv @@ -0,0 +1,6 @@ +id,name,score +42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e,42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e,0.211946 +42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca,42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca,0.206004 +42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,0.187525 +42161-867-0xe573019b9f23a496663f5944a83c8acdc99792bfc5f5ad603ee8f6cb0f46f9fe,42161-867-0xe573019b9f23a496663f5944a83c8acdc99792bfc5f5ad603ee8f6cb0f46f9fe,0.200414 +42161-865-0x9119659eb8173b32bb4423f83702ee30c1e1db49ae0c07b00263bf3ea7f4d4ef,42161-865-0x9119659eb8173b32bb4423f83702ee30c1e1db49ae0c07b00263bf3ea7f4d4ef,0.194112 \ No newline at end of file diff --git a/scores/elo-credit-assignment-gitcoin-communist-review-and-data-emphasize-review.csv b/scores/elo-credit-assignment-gitcoin-communist-review-and-data-emphasize-review.csv new file mode 100644 index 00000000..44db4cf4 --- /dev/null +++ b/scores/elo-credit-assignment-gitcoin-communist-review-and-data-emphasize-review.csv @@ -0,0 +1,6 @@ +id,name,score 
+42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e,GainForest,0.211946 +42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca,Treegens DAO🌳,0.199604 +42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,0.187817 +42161-867-0xe573019b9f23a496663f5944a83c8acdc99792bfc5f5ad603ee8f6cb0f46f9fe,Hydrapad,0.194294 +42161-865-0x9119659eb8173b32bb4423f83702ee30c1e1db49ae0c07b00263bf3ea7f4d4ef,Deep Funding,0.206340 \ No newline at end of file diff --git a/scores/elo-credit-assignment-gitcoin-communist-review-and-data.csv b/scores/elo-credit-assignment-gitcoin-communist-review-and-data.csv new file mode 100644 index 00000000..21baec7f --- /dev/null +++ b/scores/elo-credit-assignment-gitcoin-communist-review-and-data.csv @@ -0,0 +1,6 @@ +id,name,score +42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e,GainForest,0.211946 +42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca,Treegens DAO🌳,0.199604 +42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,AERTH,0.187817 +42161-867-0xe573019b9f23a496663f5944a83c8acdc99792bfc5f5ad603ee8f6cb0f46f9fe,Hydrapad,0.200694 +42161-865-0x9119659eb8173b32bb4423f83702ee30c1e1db49ae0c07b00263bf3ea7f4d4ef,Deep Funding,0.199940 \ No newline at end of file diff --git a/scores/elo-credit-assignment-gitcoin-communist.csv b/scores/elo-credit-assignment-gitcoin-communist.csv new file mode 100644 index 00000000..c5a335f6 --- /dev/null +++ b/scores/elo-credit-assignment-gitcoin-communist.csv @@ -0,0 +1,6 @@ +id,name,score +42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e,GainForest,0.211946 +42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca,Treegens DAO🌳,0.187672 
+42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,ÆRTH - Planetary AI,0.199991 +42161-867-0xe573019b9f23a496663f5944a83c8acdc99792bfc5f5ad603ee8f6cb0f46f9fe,Hydrapad,0.194028 +42161-865-0x9119659eb8173b32bb4423f83702ee30c1e1db49ae0c07b00263bf3ea7f4d4ef,Deep Funding,0.206363 \ No newline at end of file diff --git a/scores/elo-credit-assignment-open-source-capitalist-from-review-only.csv b/scores/elo-credit-assignment-open-source-capitalist-from-review-only.csv new file mode 100644 index 00000000..b54ee566 --- /dev/null +++ b/scores/elo-credit-assignment-open-source-capitalist-from-review-only.csv @@ -0,0 +1,6 @@ +id,name,score +42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e,42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e,0.211946 +42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca,42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca,0.206004 +42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,0.193925 +42161-867-0xe573019b9f23a496663f5944a83c8acdc99792bfc5f5ad603ee8f6cb0f46f9fe,42161-867-0xe573019b9f23a496663f5944a83c8acdc99792bfc5f5ad603ee8f6cb0f46f9fe,0.200120 +42161-865-0x9119659eb8173b32bb4423f83702ee30c1e1db49ae0c07b00263bf3ea7f4d4ef,42161-865-0x9119659eb8173b32bb4423f83702ee30c1e1db49ae0c07b00263bf3ea7f4d4ef,0.188006 \ No newline at end of file diff --git a/scores/elo-credit-assignment-open-source-capitalist-review-and-data-emphasize-review.csv b/scores/elo-credit-assignment-open-source-capitalist-review-and-data-emphasize-review.csv new file mode 100644 index 00000000..44db4cf4 --- /dev/null +++ b/scores/elo-credit-assignment-open-source-capitalist-review-and-data-emphasize-review.csv @@ -0,0 +1,6 @@ +id,name,score +42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e,GainForest,0.211946 
+42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca,Treegens DAO🌳,0.199604 +42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,0.187817 +42161-867-0xe573019b9f23a496663f5944a83c8acdc99792bfc5f5ad603ee8f6cb0f46f9fe,Hydrapad,0.194294 +42161-865-0x9119659eb8173b32bb4423f83702ee30c1e1db49ae0c07b00263bf3ea7f4d4ef,Deep Funding,0.206340 \ No newline at end of file diff --git a/scores/elo-credit-assignment-open-source-capitalist-review-and-data.csv b/scores/elo-credit-assignment-open-source-capitalist-review-and-data.csv new file mode 100644 index 00000000..21baec7f --- /dev/null +++ b/scores/elo-credit-assignment-open-source-capitalist-review-and-data.csv @@ -0,0 +1,6 @@ +id,name,score +42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e,GainForest,0.211946 +42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca,Treegens DAO🌳,0.199604 +42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,AERTH,0.187817 +42161-867-0xe573019b9f23a496663f5944a83c8acdc99792bfc5f5ad603ee8f6cb0f46f9fe,Hydrapad,0.200694 +42161-865-0x9119659eb8173b32bb4423f83702ee30c1e1db49ae0c07b00263bf3ea7f4d4ef,Deep Funding,0.199940 \ No newline at end of file diff --git a/scores/elo-credit-assignment-open-source-capitalist.csv b/scores/elo-credit-assignment-open-source-capitalist.csv new file mode 100644 index 00000000..7565a1fe --- /dev/null +++ b/scores/elo-credit-assignment-open-source-capitalist.csv @@ -0,0 +1,6 @@ +id,name,score +42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e,GainForest,0.211946 +42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca,Treegens DAO🌳,0.187672 +42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,ÆRTH - Planetary AI,0.206391 
+42161-867-0xe573019b9f23a496663f5944a83c8acdc99792bfc5f5ad603ee8f6cb0f46f9fe,Hydrapad,0.193734 +42161-865-0x9119659eb8173b32bb4423f83702ee30c1e1db49ae0c07b00263bf3ea7f4d4ef,Deep Funding,0.200256 \ No newline at end of file diff --git a/scores/elo-credit-assignment-regenerator-from-review-only.csv b/scores/elo-credit-assignment-regenerator-from-review-only.csv new file mode 100644 index 00000000..774afb03 --- /dev/null +++ b/scores/elo-credit-assignment-regenerator-from-review-only.csv @@ -0,0 +1,6 @@ +id,name,score +42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e,42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e,0.211946 +42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca,42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca,0.199898 +42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,0.194192 +42161-867-0xe573019b9f23a496663f5944a83c8acdc99792bfc5f5ad603ee8f6cb0f46f9fe,42161-867-0xe573019b9f23a496663f5944a83c8acdc99792bfc5f5ad603ee8f6cb0f46f9fe,0.205938 +42161-865-0x9119659eb8173b32bb4423f83702ee30c1e1db49ae0c07b00263bf3ea7f4d4ef,42161-865-0x9119659eb8173b32bb4423f83702ee30c1e1db49ae0c07b00263bf3ea7f4d4ef,0.188027 \ No newline at end of file diff --git a/scores/elo-credit-assignment-regenerator-review-and-data-emphasize-review.csv b/scores/elo-credit-assignment-regenerator-review-and-data-emphasize-review.csv new file mode 100644 index 00000000..27e2531b --- /dev/null +++ b/scores/elo-credit-assignment-regenerator-review-and-data-emphasize-review.csv @@ -0,0 +1,6 @@ +id,name,score +42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e,GainForest,0.211946 +42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca,Treegens DAO🌳,0.199604 
+42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,0.187817 +42161-867-0xe573019b9f23a496663f5944a83c8acdc99792bfc5f5ad603ee8f6cb0f46f9fe,Hydrapad,0.200694 +42161-865-0x9119659eb8173b32bb4423f83702ee30c1e1db49ae0c07b00263bf3ea7f4d4ef,Deep Funding,0.199940 \ No newline at end of file diff --git a/scores/elo-credit-assignment-regenerator-review-and-data.csv b/scores/elo-credit-assignment-regenerator-review-and-data.csv new file mode 100644 index 00000000..4a5d7be6 --- /dev/null +++ b/scores/elo-credit-assignment-regenerator-review-and-data.csv @@ -0,0 +1,6 @@ +id,name,score +42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e,GainForest,0.211946 +42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca,Treegens DAO🌳,0.199604 +42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,AERTH,0.187817 +42161-867-0xe573019b9f23a496663f5944a83c8acdc99792bfc5f5ad603ee8f6cb0f46f9fe,Hydrapad,0.200694 +42161-865-0x9119659eb8173b32bb4423f83702ee30c1e1db49ae0c07b00263bf3ea7f4d4ef,Deep Funding,0.199940 \ No newline at end of file diff --git a/scores/elo-credit-assignment-regenerator.csv b/scores/elo-credit-assignment-regenerator.csv new file mode 100644 index 00000000..7565a1fe --- /dev/null +++ b/scores/elo-credit-assignment-regenerator.csv @@ -0,0 +1,6 @@ +id,name,score +42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e,GainForest,0.211946 +42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca,Treegens DAO🌳,0.187672 +42161-867-0x5a35dc4ee0fd8cf69eb9f227b626c0c093c3efc5a6f1b518a3792d5e8b721860,ÆRTH - Planetary AI,0.206391 +42161-867-0xe573019b9f23a496663f5944a83c8acdc99792bfc5f5ad603ee8f6cb0f46f9fe,Hydrapad,0.193734 
+42161-865-0x9119659eb8173b32bb4423f83702ee30c1e1db49ae0c07b00263bf3ea7f4d4ef,Deep Funding,0.200256 \ No newline at end of file diff --git a/scripts/credit-assignment-elo-how-much-better.ts b/scripts/credit-assignment-elo-how-much-better.ts new file mode 100644 index 00000000..4ce04d74 --- /dev/null +++ b/scripts/credit-assignment-elo-how-much-better.ts @@ -0,0 +1,325 @@ +import { + fetchModelSpecs, + getApplicationId, + loadApplicationsFromDirectory, + loadReview, + saveFile, + loadApplication, + loadKarmaGap, + loadResearch, + getProjectName, +} from "../utils/utils"; + +import { creditAssignmentAgent } from "../agents/agents/credit-assigner"; + +// Core Elo scoring parameters +const BASE_RATING = 1000; +// INCREASE K_FACTOR SIGNIFICANTLY to amplify small magnitude differences +const K_FACTOR = 256; // Was 32, then 128 + +// Helper: Calculate expected score between two ratings (Standard Elo - currently unused in rating updates) +function expectedScore(ratingA: number, ratingB: number): number { + return 1 / (1 + Math.pow(10, (ratingB - ratingA) / 400)); +} + +// Helper: Update Elo rating after a matchup (Standard Elo - currently unused) +// function updateElo(rating: number, expected: number, actual: number): number { +// return rating + K_FACTOR * (actual - expected); +// } + +// New Helper: Update rating based *only* on the actual score magnitude from the match +function updateRatingDirectly(rating: number, actual: number): number { + // actual is the score for *this* player (0 to 1, derived from magnitude 0.5-1.0) + // differenceFromNeutral will be positive for wins (>0.5), negative for losses (<0.5) + const differenceFromNeutral = actual - 0.5; + // Adjust rating based on deviation from neutral, scaled by K_FACTOR + return rating + K_FACTOR * differenceFromNeutral; +} + +// Helper to safely get the 'output' of the last research entry for a specific agent +// Assumes input is a valid array with at least one element +const getResearchOutput = 
(researchAgentArray: any[]): string | undefined => { + // Access the last element safely and return its output field (if it exists) + return researchAgentArray[researchAgentArray.length - 1]?.output; +} + +// Helper function to process research data for a single agent type +const processResearchAgent = (appName: string, agentKey: string, agentData: any): string | undefined => { + if (Array.isArray(agentData) && agentData.length > 0) { + return getResearchOutput(agentData); + } else if (agentData !== undefined && agentData !== null) { + // Log if we get something other than undefined/null that's not a non-empty array + console.warn(`DEBUG: Unexpected research data type for ${appName} -> ${agentKey}. Expected array, got:`, typeof agentData, JSON.stringify(agentData)?.substring(0, 100) + '...'); + return undefined; + } + // Return undefined if data is null, undefined, or an empty array + return undefined; +}; + +async function main() { + // Load all applications from the directory + const applications = loadApplicationsFromDirectory(); + console.log(`Processing ${applications.length} applications...`); + + // Pre-load all necessary data for each application + console.log("Pre-loading application data (app, research, karmagap)..."); + const applicationDataMap = new Map(); + for (const app of applications) { + const id = getApplicationId(app); + const name = getProjectName(app) || id; + applicationDataMap.set(id, { + id, + name, + application: loadApplication(id), + research: loadResearch(id), // This loads the whole research object for the ID + karmaGap: loadKarmaGap(id), + }); + } + // Add this logging temporarily AFTER the loop finishes (around line 53) + const gainForestId = "42161-867-0x62f25a11c2ae5a2af563cc5b1f772b3aebe1bd4a0a82e41a78e61e1db972ad7e"; // Fixed ID + const treegensId = "42161-867-0xd089724cd73c932413bce5c797aee7d2fbcd1ad282f24cff790977e77908fdca"; + // console.log("DEBUG Preloaded GainForest Data:", 
JSON.stringify(applicationDataMap.get(gainForestId), null, 2)); + // console.log("DEBUG Preloaded Treegens Data:", JSON.stringify(applicationDataMap.get(treegensId), null, 2)); + // console.log("Finished pre-loading data."); + + // Get the available review models/agents + const modelSpecs = await fetchModelSpecs(); + + // Load all reviews for all agents, associating with pre-loaded data + const reviewsByAgent = modelSpecs.reduce((acc, agent) => { + const agentName = agent?.name; + acc[agentName] = applications + .map((app) => { + const id = getApplicationId(app); + const baseData = applicationDataMap.get(id); + if (!baseData) return null; + + const reviewData = loadReview(id, agentName); + // We need review data to proceed for this agent + if (!reviewData) return null; + + // Add the review data to the base data + return { ...baseData, review: reviewData, ethics: agent.ethics, constitution: agent.constitution, scoring: agent.scoring }; + }) + .filter(Boolean); // Filter out any apps where review data was missing for this agent + return acc; + }, {} as Record); + + + // Loop through each agent's reviews + for (const [agentName, agentApplicationsData] of Object.entries(reviewsByAgent)) { + console.log(`\n🎯 Running Elo tournament for agent: ${agentName}`); + + // Initialize all ratings + const ratings: Record = {}; + for (const { id } of agentApplicationsData) { + ratings[id] = BASE_RATING; + } + + // Run simulated pairwise matchups (round-robin style) + for (let i = 0; i < agentApplicationsData.length; i++) { + for (let j = i + 1; j < agentApplicationsData.length; j++) { + const appA = agentApplicationsData[i]!; + const appB = agentApplicationsData[j]!; + + // Process research data using the helper function for better debugging + const academicResearchOutputA = processResearchAgent(appA.name, 'Academic_Research_Agent', appA.research?.Academic_Research_Agent); + const academicResearchOutputB = processResearchAgent(appB.name, 'Academic_Research_Agent', 
appB.research?.Academic_Research_Agent); + + const factCheckingOutputA = processResearchAgent(appA.name, 'Fact_Checking_Agent', appA.research?.Fact_Checking_Agent); + const factCheckingOutputB = processResearchAgent(appB.name, 'Fact_Checking_Agent', appB.research?.Fact_Checking_Agent); + + const webSearchOutputA = processResearchAgent(appA.name, 'Web_Search_Agent', appA.research?.Web_Search_Agent); + const webSearchOutputB = processResearchAgent(appB.name, 'Web_Search_Agent', appB.research?.Web_Search_Agent); + + const primaryResearchOutputA = processResearchAgent(appA.name, 'Primary_Research_Agent', appA.research?.Primary_Research_Agent); + const primaryResearchOutputB = processResearchAgent(appB.name, 'Primary_Research_Agent', appB.research?.Primary_Research_Agent); + + const dataAnalysisOutputA = processResearchAgent(appA.name, 'Data_Analysis_Agent', appA.research?.Data_Analysis_Agent); + const dataAnalysisOutputB = processResearchAgent(appB.name, 'Data_Analysis_Agent', appB.research?.Data_Analysis_Agent); + + + // Prepare data snippets for the prompt (avoid stringifying huge objects) + // Note: application_summary now uses metadata.description if available + const projectAData = { + title: appA.name, + scoring: appA.scoring, + ethics: appA.ethics, + constitution: appA.constitution, + application_summary: appA.application?.project?.metadata?.description ?? appA.application?.project?.description, // Use metadata first + karmagap_score: appA.karmaGap?.score, + reviewer_comment: appA.review?.comments, + academic_research: academicResearchOutputA, + fact_checking: factCheckingOutputA, + web_search: webSearchOutputA, + primary_research: primaryResearchOutputA, + data_analysis: dataAnalysisOutputA + }; + const projectBData = { + title: appB.name, + scoring: appB.scoring, + ethics: appB.ethics, + constitution: appB.constitution, + application_summary: appB.application?.project?.metadata?.description ?? 
appB.application?.project?.description, // Use metadata first + karmagap_score: appB.karmaGap?.score, + reviewer_comment: appB.review?.comments, + academic_research: academicResearchOutputB, + fact_checking: factCheckingOutputB, + web_search: webSearchOutputB, + primary_research: primaryResearchOutputB, + data_analysis: dataAnalysisOutputB + }; + // --- DEBUG LOGGING START --- + // Log data specifically for the GainForest vs Treegens DAO comparison + const isTargetComparison = (appA.name === "GainForest" && appB.name === "Treegens DAO🌳") || (appA.name === "Treegens DAO🌳" && appB.name === "GainForest"); + if (isTargetComparison) { + // Trim potentially very long outputs for concise debug logging + const trim = (s: string | undefined) => s ? s.substring(0, 100) + '...' : undefined; + console.log(`DEBUG Data sent to agent for ${ appA.name} ${JSON.stringify({ + ...projectAData, + academic_research: trim(projectAData.academic_research), + fact_checking: trim(projectAData.fact_checking), + web_search: trim(projectAData.web_search), + primary_research: trim(projectAData.primary_research), + data_analysis: trim(projectAData.data_analysis), + application_summary: trim(projectAData.application_summary), + reviewer_comment: trim(projectAData.reviewer_comment) + }, null, 2)}`); + console.log(`DEBUG Data sent to agent for ${ appB.name} ${JSON.stringify({ + ...projectBData, + academic_research: trim(projectBData.academic_research), + fact_checking: trim(projectBData.fact_checking), + web_search: trim(projectBData.web_search), + primary_research: trim(projectBData.primary_research), + data_analysis: trim(projectBData.data_analysis), + application_summary: trim(projectBData.application_summary), + reviewer_comment: trim(projectBData.reviewer_comment) + }, null, 2)}`); + } + // --- DEBUG LOGGING END --- + + const oldPrompt = ` + + `; + // Refined prompt with clearer scale definitions + const prompt = ` +You are an expert grant allocator reviewing and comparing two different 
projects. + +Choose the one that deserves *more funding*, based on impact, clarity, roadmap, potential, feasibility, and overall quality presented in the data below. + +Consider all available information including application summaries, research reports, fact checks and specific reviewer comments, while considering their particular scoring, ethics and constitution. + +Then, estimate *how much* better the winning project is on a scale from 0.5 (projects are roughly equal) to 1.0 (winner is vastly superior). + +* A score of 0.5 means the projects are virtually identical in potential/quality based on the provided data. +* A score of 0.6-0.7 indicates the winner is noticeably better. +* A score of 0.8-0.9 indicates the winner is significantly better. +* A score of 1.0 means the winner is vastly superior and clearly deserves much more funding consideration relative to the other. + +Use the full range [0.5, 1.0] to reflect the true difference you perceive. Be decisive if the difference is clear. You are essentially a judge in the tournament which gives a score based on each Agent's review, so it's important you strongly consider reviewer_comment along with the metric data provided. Give weight to the research outputs (web search, fact checking, academic context) as objective inputs alongside the application summary (project's own description) and the reviewer's subjective comment. + +Respond ONLY in the format 'W,S' where W is the winning project ('A' or 'B') and S is the score (e.g., 'A,0.8' or 'B,0.6'). Do NOT explain. 
+ +--- Project A --- +${JSON.stringify(projectAData, null, 2)} + +--- Project B --- +${JSON.stringify(projectBData, null, 2)} +`; + + const result = await creditAssignmentAgent.generate(prompt); + + // Parse the response "W,S" + const responseParts = result.text.trim().toUpperCase().split(','); + let actualA = 0.5; // Default to draw + let actualB = 0.5; + let winnerLetter = 'DRAW'; + let winMagnitude = 0.5; + + + if (responseParts.length === 2) { + // Linter Fix: Assert that parts[0] and parts[1] are defined because we checked length === 2 + winnerLetter = responseParts[0]!; + const scoreString = responseParts[1]!; + // console.log("winnerLetter is ", winnerLetter); // Keep logging minimal for clarity + // console.log("scoreString is ", scoreString); + // Check if parts are defined before using them (Redundant due to length check and assertion, but safe) + // if (winnerLetter !== undefined && scoreString !== undefined) { + winMagnitude = parseFloat(scoreString); + + if ((winnerLetter === 'A' || winnerLetter === 'B') && !isNaN(winMagnitude) && winMagnitude >= 0.5 && winMagnitude <= 1.0) { + if (winnerLetter === 'A') { + actualA = winMagnitude; + actualB = 1 - winMagnitude; + } else { // Winner is B + actualB = winMagnitude; + actualA = 1 - winMagnitude; + } + } else { + console.warn(`⚠️ Invalid response format/score: ${result.text}. Draw.`); + actualA = 0.5; actualB = 0.5; winnerLetter = 'DRAW'; winMagnitude = 0.5; + } + // } else { // This block becomes unreachable due to length check / assertion + // console.warn(`⚠️ Invalid response parts: ${result.text}. Draw.`); + // actualA = 0.5; actualB = 0.5; winnerLetter = 'DRAW'; winMagnitude = 0.5; + // } + } else { + console.warn( + `\x1b[31m⚠️ Unexpected response format: "${result.text.trim()}". 
Draw.\x1b[0m` + ); + actualA = 0.5; actualB = 0.5; winnerLetter = 'DRAW'; winMagnitude = 0.5; + } + + // Log the outcome of the match concisely only if not the debugged comparison + // Always log for now to see results + console.log(`Match: ${appA.name} vs ${appB.name} -> Winner: ${winnerLetter}, Mag: ${winMagnitude.toFixed(2)}`); + + + const ratingA = ratings[appA.id]!; + const ratingB = ratings[appB.id]!; + // We are no longer using expected scores for the update + // const expectedA = expectedScore(ratingA, ratingB); + // const expectedB = expectedScore(ratingB, ratingA); + + // Update ratings using the calculated actual scores directly + ratings[appA.id] = updateRatingDirectly(ratingA, actualA); + ratings[appB.id] = updateRatingDirectly(ratingB, actualB); + // Optional: Log rating changes if needed for debugging + // console.log(` Ratings: A=${ratings[appA.id]!.toFixed(1)}, B=${ratings[appB.id]!.toFixed(1)}`); + + } + } + + // Log raw final scores before normalization + console.log("Raw final ratings:", JSON.stringify(ratings, null, 2)); + + + // Normalize scores so they sum to 1 (for funding allocation) + const totalScore = Object.values(ratings).reduce((sum, score) => sum + score, 0); + const normalized = Object.entries(ratings).map(([id, score]) => { + const name = applicationDataMap.get(id)?.name || id; + // Ensure score is not negative before normalization, although unlikely with BASE_RATING=1000 and K=128 unless many losses occur + const nonNegativeScore = Math.max(0, score); + // Prevent division by zero or negative totals + const safeTotalScore = Math.max(1, totalScore); // Avoid total being 0 or negative + + return { + id, + name, + score: (nonNegativeScore / safeTotalScore).toFixed(6), + }; + }); + + // Prepare output CSV format + const output = ["id,name,score", ...normalized.map((r) => `${r.id},${r.name},${r.score}`)].join("\n"); + + // Save results to file + saveFile(`scores/elo-credit-assignment-${agentName}.csv`, output); + console.log(`✅ Saved 
results for ${agentName}`); + } +} + +main().catch((error) => { + console.error("❌ Error:", error); + process.exit(1); +}); diff --git a/scripts/credit-assignment-elo.ts b/scripts/credit-assignment-elo.ts new file mode 100644 index 00000000..7fa886cc --- /dev/null +++ b/scripts/credit-assignment-elo.ts @@ -0,0 +1,290 @@ +import { + fetchModelSpecs, + getApplicationId, + loadApplicationsFromDirectory, + loadReview, + saveFile, + loadApplication, + loadKarmaGap, + loadResearch, + getProjectName, + loadHypercerts, + parseHypercerts, +} from "../utils/utils"; + +import { creditAssignmentAgent } from "../agents/agents/credit-assigner"; + +// Core Elo scoring parameters +const BASE_RATING = 1000; +// INCREASE K_FACTOR SIGNIFICANTLY to amplify small magnitude differences +const K_FACTOR = 32; // Was 32, then 128 + +// Helper: Calculate expected score between two ratings (Standard Elo - currently unused in rating updates) +function expectedScore(ratingA: number, ratingB: number): number { + return 1 / (1 + Math.pow(10, (ratingB - ratingA) / 400)); +} + +// Helper: Update Elo rating after a matchup (Standard Elo - currently unused) +function updateElo(rating: number, expected: number, actual: number): number { + return rating + K_FACTOR * (actual - expected); +} + +// New Helper: Update rating based *only* on the actual score magnitude from the match +// function updateRatingDirectly(rating: number, actual: number): number { +// // actual is the score for *this* player (0 to 1, derived from magnitude 0.5-1.0) +// // differenceFromNeutral will be positive for wins (>0.5), negative for losses (<0.5) +// const differenceFromNeutral = actual - 0.5; +// // Adjust rating based on deviation from neutral, scaled by K_FACTOR +// return rating + K_FACTOR * differenceFromNeutral; +// } + +// Helper to safely get the 'output' of the last research entry for a specific agent +// Assumes input is a valid array with at least one element +const getResearchOutput = (researchAgentArray: 
any[]): string | undefined => { + // Access the last element safely and return its output field (if it exists) + return researchAgentArray[researchAgentArray.length - 1]?.output; +} + +// Helper function to process research data for a single agent type +const processResearchAgent = (appName: string, agentKey: string, agentData: any): string | undefined => { + if (Array.isArray(agentData) && agentData.length > 0) { + return getResearchOutput(agentData); + } else if (agentData !== undefined && agentData !== null) { + // Log if we get something other than undefined/null that's not a non-empty array + console.warn(`DEBUG: Unexpected research data type for ${appName} -> ${agentKey}. Expected array, got:`, typeof agentData, JSON.stringify(agentData)?.substring(0, 100) + '...'); + return undefined; + } + // Return undefined if data is null, undefined, or an empty array + return undefined; +}; + +// ANSI escape codes for colors +const colors = { + reset: "\x1b[0m", + bright: "\x1b[1m", + dim: "\x1b[2m", + underscore: "\x1b[4m", + blink: "\x1b[5m", + reverse: "\x1b[7m", + hidden: "\x1b[8m", + + fg: { + black: "\x1b[30m", + red: "\x1b[31m", + green: "\x1b[32m", + yellow: "\x1b[33m", + blue: "\x1b[34m", + magenta: "\x1b[35m", + cyan: "\x1b[36m", + white: "\x1b[37m", + crimson: "\x1b[38m" // Scarlet + }, + bg: { + black: "\x1b[40m", + red: "\x1b[41m", + green: "\x1b[42m", + yellow: "\x1b[43m", + blue: "\x1b[44m", + magenta: "\x1b[45m", + cyan: "\x1b[46m", + white: "\x1b[47m", + crimson: "\x1b[48m" + } +}; + +async function main() { + // Load all applications from the directory + const applications = loadApplicationsFromDirectory(); + console.log(`Processing ${colors.fg.yellow}${applications.length}${colors.reset} applications...`); + + // Pre-load all necessary data for each application + console.log("Pre-loading application data (app, research, karmagap)..."); + const applicationDataMap = new Map(); + + + for (const app of applications) { + const id = getApplicationId(app); + 
const name = getProjectName(app) || id; + const hypercerts = loadHypercerts(id); + const hypercertsData = parseHypercerts(hypercerts ?? { data: { hypercerts: { data: [] } } }); + + applicationDataMap.set(id, { + id, + name, + application: loadApplication(id), + research: loadResearch(id), // This loads the whole research object for the ID + karmaGap: loadKarmaGap(id), + hypercerts: hypercertsData, + }); + } + + // Get the available review models/agents + const modelSpecs = await fetchModelSpecs(); + + // Load all reviews for all agents, associating with pre-loaded data + const reviewsByAgent = modelSpecs.reduce((acc, agent) => { + const agentName = agent?.name; + acc[agentName] = applications + .map((app) => { + const id = getApplicationId(app); + const baseData = applicationDataMap.get(id); + if (!baseData) return null; + + const reviewData = loadReview(id, agentName); + // We need review data to proceed for this agent + if (!reviewData) return null; + + // Add the review data to the base data + return { ...baseData, review: reviewData, ethics: agent.ethics, constitution: agent.constitution, scoring: agent.scoring }; + }) + .filter(Boolean); // Filter out any apps where review data was missing for this agent + return acc; + }, {} as Record); + + + // Loop through each agent's reviews + for (const [agentName, agentApplicationsData] of Object.entries(reviewsByAgent)) { + console.log(`\n${colors.fg.magenta}🎯 Running Elo tournament for agent: ${colors.bright}${agentName}${colors.reset}`); + + // Initialize all ratings + const ratings: Record = {}; + for (const { id } of agentApplicationsData) { + ratings[id] = BASE_RATING; + } + + // Run simulated pairwise matchups (round-robin style) + for (let i = 0; i < agentApplicationsData.length; i++) { + for (let j = i + 1; j < agentApplicationsData.length; j++) { + const appA = agentApplicationsData[i]!; + const appB = agentApplicationsData[j]!; + + // Process research data using the helper function for better debugging + 
const academicResearchOutputA = processResearchAgent(appA.name, 'Academic_Research_Agent', appA.research?.Academic_Research_Agent); + const academicResearchOutputB = processResearchAgent(appB.name, 'Academic_Research_Agent', appB.research?.Academic_Research_Agent); + + const factCheckingOutputA = processResearchAgent(appA.name, 'Fact_Checking_Agent', appA.research?.Fact_Checking_Agent); + const factCheckingOutputB = processResearchAgent(appB.name, 'Fact_Checking_Agent', appB.research?.Fact_Checking_Agent); + + const webSearchOutputA = processResearchAgent(appA.name, 'Web_Search_Agent', appA.research?.Web_Search_Agent); + const webSearchOutputB = processResearchAgent(appB.name, 'Web_Search_Agent', appB.research?.Web_Search_Agent); + + const primaryResearchOutputA = processResearchAgent(appA.name, 'Primary_Research_Agent', appA.research?.Primary_Research_Agent); + const primaryResearchOutputB = processResearchAgent(appB.name, 'Primary_Research_Agent', appB.research?.Primary_Research_Agent); + + const dataAnalysisOutputA = processResearchAgent(appA.name, 'Data_Analysis_Agent', appA.research?.Data_Analysis_Agent); + const dataAnalysisOutputB = processResearchAgent(appB.name, 'Data_Analysis_Agent', appB.research?.Data_Analysis_Agent); + + // Prepare data snippets for the prompt (avoid stringifying huge objects) + // Note: application_summary now uses metadata.description if available + const projectAData = { + title: appA.name, + scoring: appA.scoring, + ethics: appA.ethics, + constitution: appA.constitution, + application_summary: appA.application?.project?.metadata?.description ?? 
appA.application?.project?.description, // Use metadata first + karmagap_score: appA.karmaGap?.score, + reviewer_comment: appA.review?.comments, + academic_research: academicResearchOutputA, + fact_checking: factCheckingOutputA, + web_search: webSearchOutputA, + primary_research: primaryResearchOutputA, + data_analysis: dataAnalysisOutputA, + hypercerts: appA.hypercerts + }; + const projectBData = { + title: appB.name, + scoring: appB.scoring, + ethics: appB.ethics, + constitution: appB.constitution, + application_summary: appB.application?.project?.metadata?.description ?? appB.application?.project?.description, // Use metadata first + karmagap_score: appB.karmaGap?.score, + reviewer_comment: appB.review?.comments, + academic_research: academicResearchOutputB, + fact_checking: factCheckingOutputB, + web_search: webSearchOutputB, + primary_research: primaryResearchOutputB, + data_analysis: dataAnalysisOutputB, + hypercerts: appB.hypercerts + }; + + // Refined prompt with clearer scale definitions + const prompt = ` +You are an expert grant allocator reviewing and comparing two different projects. + +Choose the one that deserves *more funding*, based on impact, clarity, roadmap, potential, feasibility, and overall quality presented in the data below. + +Consider all available information including application summaries, research reports, fact checks and specific reviewer comments, while considering their particular scoring, ethics and constitution. + +You are essentially a judge in the tournament which gives a score based on each Agent's review, so it's important you strongly consider reviewer_comment along with the metric data provided. Give weight to the research outputs (web search, fact checking, academic context) as objective inputs alongside the application summary (project's own description) and the reviewer's subjective comment. + +Respond ONLY with "A" or "B". Do NOT explain. 
+ +--- Project A --- +${JSON.stringify(projectAData, null, 2)} + +--- Project B --- +${JSON.stringify(projectBData, null, 2)} +`; + + const result = await creditAssignmentAgent.generate(prompt); + + const winner = result.text.trim().toUpperCase(); + console.log(`Match: ${colors.fg.cyan}${appA.name}${colors.reset} vs ${colors.fg.cyan}${appB.name}${colors.reset} -> Winner: ${winner === "A" || winner === "B" ? colors.fg.green : colors.fg.yellow}${winner}${colors.reset}`); + + const ratingA = ratings[appA.id]!; + const ratingB = ratings[appB.id]!; + const expectedA = expectedScore(ratingA, ratingB); + const expectedB = expectedScore(ratingB, ratingA); + + if (winner === "A") { + ratings[appA.id] = updateElo(ratingA, expectedA, 1); + ratings[appB.id] = updateElo(ratingB, expectedB, 0); + } else if (winner === "B") { + ratings[appA.id] = updateElo(ratingA, expectedA, 0); + ratings[appB.id] = updateElo(ratingB, expectedB, 1); + } else { + // If the response is neither A nor B, treat it as a draw. + // Both projects receive an actual score of 0.5, so ratings stay unchanged only when their expected scores are also 0.5 (equal ratings); otherwise they shift (presumably toward each other — updateElo is defined outside this view). + ratings[appA.id] = updateElo(ratingA, expectedA, 0.5); + ratings[appB.id] = updateElo(ratingB, expectedB, 0.5); + console.warn(`⚠️ ${colors.fg.yellow}Unexpected response: "${result.text.trim()}". 
Treating as a draw.${colors.reset}`); + } + // Optional: Log rating changes if needed for debugging + // console.log(` Ratings: A=${ratings[appA.id]!.toFixed(1)}, B=${ratings[appB.id]!.toFixed(1)}`); + + } + } + + // Log raw final scores before normalization + console.log("Raw final ratings:", JSON.stringify(ratings, null, 2)); + + + // Normalize scores so they sum to 1 (for funding allocation) + const totalScore = Object.values(ratings).reduce((sum, score) => sum + score, 0); + const normalized = Object.entries(ratings).map(([id, score]) => { + const name = applicationDataMap.get(id)?.name || id; + // Ensure score is not negative before normalization, although unlikely with BASE_RATING=1000 and K=128 unless many losses occur + const nonNegativeScore = Math.max(0, score); + // Prevent division by zero or negative totals + const safeTotalScore = Math.max(1, totalScore); // Avoid total being 0 or negative + + return { + id, + name, + score: (nonNegativeScore / safeTotalScore).toFixed(6), + }; + }); + + // Prepare output CSV format + const output = ["id,name,score", ...normalized.map((r) => `${r.id},${r.name},${r.score}`)].join("\n"); + + // Save results to file + saveFile(`scores/elo-credit-assignment-${agentName}.csv`, output); + console.log(`${colors.fg.green}✅ Saved results for ${agentName}${colors.reset}`); + } +} + +main().catch((error) => { + console.error(`${colors.fg.red}❌ Error:${colors.reset}`, error); + process.exit(1); +}); diff --git a/scripts/review-applications.ts b/scripts/review-applications.ts index 6a212a30..cc5ba6ec 100644 --- a/scripts/review-applications.ts +++ b/scripts/review-applications.ts @@ -12,6 +12,7 @@ import { loadKarmaGap, loadResearch, loadHypercerts, + parseHypercerts, } from "../utils/utils"; import { evaluationAgent } from "../agents/agents/evaluator"; @@ -162,61 +163,3 @@ function parseKarmaGap({ grants }: KarmaGapData) { })); } -// Interfaces for Hypercerts -interface HypercertAttestation { - attester: string; - 
creation_block_timestamp: string; - data: { - title: string; - sources: string[]; - chain_id: number; - token_id: string; - description: string; - contract_address: string; - }; - id: string; -} - -interface HypercertData { - hypercert_id: string; - metadata: { - name: string; - description: string; - }; - attestations: { - data: HypercertAttestation[]; - }; -} - -interface HypercertsResponse { - data: { - hypercerts: { - count: number; - data: HypercertData[]; - }; - }; -} - -function parseHypercerts(hypercerts: HypercertsResponse) { - if (!hypercerts?.data?.hypercerts?.data) { - return []; - } - - return hypercerts.data.hypercerts.data.flatMap((hypercert) => { - const { hypercert_id, metadata } = hypercert; - - if (!hypercert.attestations?.data || hypercert.attestations.data.length === 0) { - return []; - } - - return hypercert.attestations.data.map((attestation) => ({ - hypercert_id, - metadata, - attester: attestation.attester, - timestamp: new Date(parseInt(attestation.creation_block_timestamp) * 1000).toLocaleDateString(), - title: attestation.data.title, - description: attestation.data.description, - sources: attestation.data.sources, - })); - }); -} diff --git a/utils/utils.ts b/utils/utils.ts index f94498f3..d0ed1254 100644 --- a/utils/utils.ts +++ b/utils/utils.ts @@ -136,6 +136,15 @@ export function loadReview(applicationId: string, agent: string): any { return null; } } +export function loadApplication(applicationId: string): any { + try { + return JSON.parse( + readFileSync(getApplicationPath(applicationId) + "/application.json", "utf8") + ); + } catch (error) { + return null; + } +} export function loadKarmaGap(applicationId: string): any { try { return JSON.parse( @@ -182,7 +191,7 @@ export function normalizeProjectName(name: string) { } export async function fetchModelSpecs(): Promise< - { name: string; profileUrl: string; style: string; constitution: string }[] + { name: string; profileUrl: string; style: string; constitution: string; scoring: 
string, ethics: string }[] > { const baseURL = `https://api.github.com/repos/evalscience/deepgov-gg23/contents/agents`; const contentURL = `https://raw.githubusercontent.com/evalscience/deepgov-gg23/refs/heads/main`; @@ -210,3 +219,62 @@ export async function fetchModelSpecs(): Promise< })) ); } +// Interfaces for Hypercerts +interface HypercertAttestation { + attester: string; + creation_block_timestamp: string; + data: { + title: string; + sources: string[]; + chain_id: number; + token_id: string; + description: string; + contract_address: string; + }; + id: string; +} + +interface HypercertData { + hypercert_id: string; + metadata: { + name: string; + description: string; + }; + attestations: { + data: HypercertAttestation[]; + }; +} + + +interface HypercertsResponse { + data: { + hypercerts: { + count: number; + data: HypercertData[]; + }; + }; +} + +export function parseHypercerts(hypercerts: HypercertsResponse) { + if (!hypercerts?.data?.hypercerts?.data) { + return []; + } + + return hypercerts.data.hypercerts.data.flatMap((hypercert) => { + const { hypercert_id, metadata } = hypercert; + + if (!hypercert.attestations?.data || hypercert.attestations.data.length === 0) { + return []; + } + + return hypercert.attestations.data.map((attestation) => ({ + hypercert_id, + metadata, + attester: attestation.attester, + timestamp: new Date(parseInt(attestation.creation_block_timestamp) * 1000).toLocaleDateString(), + title: attestation.data.title, + description: attestation.data.description, + sources: attestation.data.sources, + })); + }); +}