diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..d811561
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,165 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*.pyc
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# IDEs
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+
+# OS generated files
+.DS_Store
+.DS_Store?
+._*
+.Spotlight-V100
+.Trashes
+ehthumbs.db
+Thumbs.db
+
+# Project specific
+outputs.json
+*.bin
+*.gz
+GoogleNews-vectors-negative300.bin
+GoogleNews-vectors-negative300.bin.gz
+
+# Temporary files
+*.tmp
+*.temp
+
+# Model files and data
+models/
+data/
+*.model
+*.pkl
+*.pickle
+
+# Results and outputs
+results/
+outputs/
+logs/
+*.log
+
+# API keys
+.env
+.env.local
+.env.*.local
+api_keys.txt
+config.json
+
+# AI settings
+.claude/
diff --git a/README.md b/README.md
index dd1bd43..a9977e7 100644
--- a/README.md
+++ b/README.md
@@ -75,18 +75,31 @@ The Scientific Knowledge Evaluation (SciKnowEval) be
🏹 QuickStart
⬇️ Step 1: Installation
-To evaluate LLMs on SciKnowEval, first clone the repository:
+**Option 1: pip install from GitHub (Recommended)**
+```bash
+pip install git+https://github.com/HICAI-ZJU/SciKnowEval.git
+```
+
+**Option 2: Install from Source**
```bash
git clone https://github.com/HICAI-ZJU/SciKnowEval.git
cd SciKnowEval
+pip install .
```
-Next, set up a conda environment to manage the dependencies:
+
+**Option 3: Development Installation**
```bash
-conda create -n sciknoweval python=3.10.9
-conda activate sciknoweval
+git clone https://github.com/HICAI-ZJU/SciKnowEval.git
+cd SciKnowEval
+pip install -e .
```
-Then, install the required dependencies:
+
+**Option 4: Manual Setup (Legacy)**
```bash
+git clone https://github.com/HICAI-ZJU/SciKnowEval.git
+cd SciKnowEval
+conda create -n sciknoweval python=3.10.9
+conda activate sciknoweval
pip install -r requirements.txt
```
@@ -152,7 +165,31 @@ By following these guidelines, you can effectively use the SciKnowEval benchmark
🚀 Step 4: Evaluate
-You can run `eval.py` to evaluate your model:
+**Option 1: Using the Command Line Interface (Recommended)**
+
+After installing SciKnowEval, you can use the `sciknoweval` command:
+
+```bash
+export OPENAI_API_KEY="YOUR_API_KEY"
+sciknoweval \
+ --data_path "your/model/predictions.json" \
+ --word2vec_model_path "path/to/GoogleNews-vectors-negative300.bin" \
+ --gen_evaluator "gpt-4o" \
+ --output_path "path/to/your/output.json"
+```
+
+**Option 2: Using Python Module**
+
+```bash
+export OPENAI_API_KEY="YOUR_API_KEY"
+python -m sciknoweval.eval \
+ --data_path "your/model/predictions.json" \
+ --word2vec_model_path "path/to/GoogleNews-vectors-negative300.bin" \
+ --gen_evaluator "gpt-4o" \
+ --output_path "path/to/your/output.json"
+```
+
+**Option 3: Run from a Source Checkout (Legacy)**
```bash
data_path="your/model/predictions.json"
@@ -161,7 +198,7 @@ gen_evaluator="gpt-4o" # the correct model name in OpenAI
output_path="path/to/your/output.json"
export OPENAI_API_KEY="YOUR_API_KEY"
-python eval.py \
+python -m sciknoweval.eval \
--data_path $data_path \
--word2vec_model_path $word2vec_model_path \
--gen_evaluator $gen_evaluator \
diff --git a/evaluation/define.py b/evaluation/define.py
deleted file mode 100644
index bc3c7e9..0000000
--- a/evaluation/define.py
+++ /dev/null
@@ -1,362 +0,0 @@
-import json
-import os
-import time
-from evaluation.metrics import *
-from typing import List, Tuple, Any, Dict
-import warnings
-from tqdm import tqdm
-warnings.filterwarnings("ignore")
-
-
-def get_task_func():
- try:
- return {
- ### Chemistry
- # L1
- 'molecule_name_conversion': get_score_CLS,
- 'molecular_property_prediction': get_score_CLS,
- 'chemical_literature_QA': get_score_CLS,
- # L2
- 'reaction_mechanism_inference': get_score_CLS,
- 'compound_identification_and_properties': get_score_CLS,
- 'extract_doping': get_score_GPT4,
- 'chemical_detailed_understanding': get_score_CLS,
- 'chemical_text_summary': get_score_GPT4,
- 'chemical_hypothesis_verification': get_score_CLS,
- 'chemical_reasoning_and_interpretation': get_score_CLS,
- # L3
- 'molar_weight_calculation': get_score_CLS,
- 'molecular_property_calculation': get_score_CLS,
- 'molecule_structure_prediction': get_score_CLS,
- 'reaction_prediction': get_score_reaction,
- 'retrosynthesis': get_score_reaction,
- 'balancing_chemical_equation': get_score_filling,
- 'chemical_calculation': get_score_CLS,
- # L4
- 'chemical_harmful_QA': get_score_GPT4,
- 'mol_toxicity_prediction': get_score_CLS,
- 'chemical_laboratory_safety_test': get_score_CLS,
- # L5
- 'molecule_captioning': get_score_BLEU_ROUGE,
- 'molecule_generation': get_score_Mol_GEN,
- 'chemical_procedure_generation': get_score_GPT4,
- 'chemical_reagent_generation': get_score_GPT4,
- ### Biology
- # L1
- 'protein_property_identification': get_score_CLS,
- 'biology_literature_QA': get_score_CLS,
- # L2
- 'drug_drug_relation_extraction': get_score_RE_triplets,
- 'biomedical_judgment_and_interpretation': get_score_CLS,
- 'compound_disease_relation_extraction': get_score_RE_tuples,
- 'gene_disease_relation_extraction': get_score_RE_triplets,
- 'biological_detailed_understanding': get_score_CLS,
- 'biological_text_summary': get_score_GPT4,
- 'biological_hypothesis_verification': get_score_CLS,
- 'biological_reasoning_and_interpretation': get_score_CLS,
- # L3
- 'solubility_prediction': get_score_CLS,
- 'beta_lactamase_activity_prediction': get_score_CLS,
- 'fluorescence_prediction': get_score_CLS,
- 'GB1_ftness_prediction': get_score_CLS,
- 'stability_prediction': get_score_CLS,
- 'Protein_Protein_Interaction': get_score_CLS,
- 'biological_calculation': get_score_CLS,
- # L4
- 'biological_harmful_QA': get_score_GPT4,
- 'proteotoxicity_prediction': get_score_CLS,
- 'biological_laboratory_safety_test': get_score_CLS,
- # L5
- 'biological_procedure_generation': get_score_GPT4,
- 'biological_reagent_generation': get_score_GPT4,
- 'protein_description_generation': get_score_BLEU_ROUGE,
- 'protein_design': get_score_smith_waterman,
- 'single_cell_analysis': get_score_BLEU_ROUGE,
- ### Meterial
- # L1
- 'material_literature_QA': get_score_CLS,
- # L2
- 'material_hypothesis_verification': get_score_CLS,
- 'material_component_extraction': get_score_GPT4,
- 'material_data_extraction': get_score_CLS,
- 'material_detailed_understanding': get_score_CLS,
- 'material_reasoning_and_interpretation': get_score_CLS,
- 'material_text_summary': get_score_GPT4,
- # L3
- 'valence_electron_difference_calculation': get_score_CLS,
- 'material_calculation': get_score_CLS,
- 'lattice_volume_calculation': get_score_CLS,
- 'perovskite_stability_prediction': get_score_CLS,
- 'diffusion_rate_analysis': get_score_CLS,
- # L4
- 'material_safety_QA': get_score_CLS,
- 'material_toxicity_prediction': get_score_CLS,
- # L5
- 'crystal_structure_and_composition_analysis': get_score_GPT4,
- 'specified_band_gap_material_generation': get_score_GPT4,
- 'property_and_usage_analysis': get_score_GPT4,
- ### Physics
- # L1
- 'physics_literature_QA': get_score_CLS,
- 'fundamental_physics_exam': get_score_CLS,
- # L2
- 'physics_hypothesis_verification': get_score_CLS,
- 'physics_detailed_understanding': get_score_CLS,
- 'physics_reasoning_and_interpretation': get_score_CLS,
- 'physics_text_summary': get_score_GPT4,
- # L3
- 'high_school_physics_calculation': get_score_CLS,
- 'general_physics_calculation': get_score_CLS,
- 'physics_formula_derivation': get_score_GPT4,
- # L4
- 'physics_safety_QA': get_score_CLS,
- 'physics_laboratory_safety_test': get_score_CLS,
- # L5
- 'physics_problem_solving': get_score_GPT4,
- }
- except:
- raise NotImplementedError("task not found")
-
-def reformat_result(result: Dict[str, Any]) -> Dict[str, Any]:
- reformatted_result = {
- 'Biology': {
- 'L1': {
- 'molecule_name_conversion': result['molecule_name_conversion'],
- 'molecular_property_prediction': result['molecular_property_prediction'],
- 'biology_literature_QA': result['biology_literature_QA'],
- 'protein_description_generation': result['protein_description_generation'],
- },
- 'L2': {
- 'drug_drug_relation_extraction': result['drug_drug_relation_extraction'],
- 'biomedical_judgment_and_interpretation': result['biomedical_judgment_and_interpretation'],
- 'compound_disease_relation_extraction': result['compound_disease_relation_extraction'],
- 'gene_disease_relation_extraction': result['gene_disease_relation_extraction'],
- 'biological_detailed_understanding': result['biological_detailed_understanding'],
- 'biological_text_summary': result['biological_text_summary'],
- 'biological_hypothesis_verification': result['biological_hypothesis_verification'],
- 'biological_reasoning_and_interpretation': result['biological_reasoning_and_interpretation'],
- },
- 'L3': {
- 'solubility_prediction': result['solubility_prediction'],
- 'beta_lactamase_activity_prediction': result['beta_lactamase_activity_prediction'],
- 'fluorescence_prediction': result['fluorescence_prediction'],
- 'GB1_ftness_prediction': result['GB1_ftness_prediction'],
- 'stability_prediction': result['stability_prediction'],
- 'Protein_Protein_Interaction': result['Protein_Protein_Interaction'],
- 'biological_calculation': result['biological_calculation'],
- },
- 'L4': {
- 'biological_harmful_QA': result['biological_harmful_QA'],
- 'proteotoxicity_prediction': result['proteotoxicity_prediction'],
- 'biological_laboratory_safety_test': result['biological_laboratory_safety_test'],
- },
- 'L5': {
- 'biological_procedure_generation': result['biological_procedure_generation'],
- 'biological_reagent_generation': result['biological_reagent_generation'],
- 'protein_design': result['protein_design'],
- 'single_cell_analysis': result['single_cell_analysis'],
- },
- },
- 'Chemistry': {
- 'L1': {
- 'molecule_name_conversion': result['molecule_name_conversion'],
- 'molecular_property_prediction': result['molecular_property_prediction'],
- 'chemical_literature_QA': result['chemical_literature_QA'],
- 'molecule_captioning': result['molecule_captioning'],
- },
- 'L2': {
- 'reaction_mechanism_inference': result['reaction_mechanism_inference'],
- 'compound_identification_and_properties': result['compound_identification_and_properties'],
- 'extract_doping': result['extract_doping'],
- 'chemical_detailed_understanding': result['chemical_detailed_understanding'],
- 'chemical_text_summary': result['chemical_text_summary'],
- 'chemical_hypothesis_verification': result['chemical_hypothesis_verification'],
- 'chemical_reasoning_and_interpretation': result['chemical_reasoning_and_interpretation'],
- },
- 'L3': {
- 'molar_weight_calculation': result['molar_weight_calculation'],
- 'molecular_property_calculation': result['molecular_property_calculation'],
- 'molecule_structure_prediction': result['molecule_structure_prediction'],
- 'reaction_prediction': result['reaction_prediction'],
- 'retrosynthesis': result['retrosynthesis'],
- 'balancing_chemical_equation': result['balancing_chemical_equation'],
- 'chemical_calculation': result['chemical_calculation'],
- },
- 'L4': {
- 'chemical_harmful_QA': result['chemical_harmful_QA'],
- 'mol_toxicity_prediction': result['mol_toxicity_prediction'],
- 'chemical_laboratory_safety_test': result['chemical_laboratory_safety_test'],
- },
- 'L5': {
- 'molecule_generation': result['molecule_generation'],
- 'chemical_procedure_generation': result['chemical_procedure_generation'],
- 'chemical_reagent_generation': result['chemical_reagent_generation'],
- },
- },
- 'Materials': {
- 'L1': {
- 'material_literature_QA': result['material_literature_QA'],
- },
- 'L2': {
- 'material_hypothesis_verification': result['material_hypothesis_verification'],
- 'material_component_extraction': result['material_component_extraction'],
- 'material_data_extraction': result['material_data_extraction'],
- 'material_detailed_understanding': result['material_detailed_understanding'],
- 'material_reasoning_and_interpretation': result['material_reasoning_and_interpretation'],
- 'material_text_summary': result['material_text_summary'],
- },
- 'L3': {
- 'valence_electron_difference_calculation': result['valence_electron_difference_calculation'],
- 'material_calculation': result['material_calculation'],
- 'lattice_volume_calculation': result['lattice_volume_calculation'],
- 'perovskite_stability_prediction': result['perovskite_stability_prediction'],
- 'diffusion_rate_analysis': result['diffusion_rate_analysis'],
- },
- 'L4': {
- 'material_safety_QA': result['material_safety_QA'],
- 'material_toxicity_prediction': result['material_toxicity_prediction'],
- },
- 'L5': {
- 'crystal_structure_and_composition_analysis': result['crystal_structure_and_composition_analysis'],
- 'specified_band_gap_material_generation': result['specified_band_gap_material_generation'],
- 'property_and_usage_analysis': result['property_and_usage_analysis'],
- },
- },
- 'Physics': {
- 'L1': {
- 'physics_literature_QA': result['physics_literature_QA'],
- 'fundamental_physics_exam': result['fundamental_physics_exam'],
- },
- 'L2': {
- 'physics_hypothesis_verification': result['physics_hypothesis_verification'],
- 'physics_detailed_understanding': result['physics_detailed_understanding'],
- 'physics_reasoning_and_interpretation': result['physics_reasoning_and_interpretation'],
- 'physics_text_summary': result['physics_text_summary'],
- },
- 'L3': {
- 'high_school_physics_calculation': result['high_school_physics_calculation'],
- 'general_physics_calculation': result['general_physics_calculation'],
- 'physics_formula_derivation': result['physics_formula_derivation'],
- },
- 'L4': {
- 'physics_safety_QA': result['physics_safety_QA'],
- 'physics_laboratory_safety_test': result['physics_laboratory_safety_test'],
- },
- 'L5': {
- 'physics_problem_solving': result['physics_problem_solving'],
- },
- },
- }
-
- return reformatted_result
-
-def get_task_data(data: List[Dict[str, Any]]) -> Dict[str, Any]:
- try:
- task_data = {
- # L1
- 'molecule_name_conversion': [d for d in data if d['details']['task'] == 'molecule_name_conversion'],
- 'molecular_property_prediction': [d for d in data if d['details']['task'] == 'molecular_property_prediction' and d['details']['level'] == 'L1'],
- 'chemical_literature_QA': [d for d in data if d['details']['task'] == 'literature_multi_choice_question' and d['domain'] == 'Chemistry'],
- 'molecule_captioning': [d for d in data if d['details']['task'] == 'molecule_captioning'],
- # L2
- 'reaction_mechanism_inference': [d for d in data if d['details']['subtask'] == 'reaction_mechanism_inference'],
- 'compound_identification_and_properties': [d for d in data if d['details']['subtask'] == 'compound_identification_and_properties'],
- 'extract_doping': [d for d in data if d['details']['subtask'] == 'extract_doping'],
- 'chemical_detailed_understanding': [d for d in data if d['details']['subtask'] == 'detailed_understanding' and d['domain'] == 'Chemistry'],
- 'chemical_text_summary': [d for d in data if d['details']['subtask'] == 'text_summary' and d['domain'] == 'Chemistry'],
- 'chemical_hypothesis_verification': [d for d in data if d['details']['subtask'] == 'hypothesis_verification' and d['domain'] == 'Chemistry'],
- 'chemical_reasoning_and_interpretation': [d for d in data if d['details']['subtask'] == 'reasoning_and_interpretation' and d['domain'] == 'Chemistry'],
- # L3
- 'molar_weight_calculation': [d for d in data if d['details']['task'] == 'molar_weight_calculation'],
- 'molecular_property_calculation': [d for d in data if d['details']['task'] == 'molecular_property_prediction' and d['details']['level'] == 'L3'],
- 'molecule_structure_prediction': [d for d in data if d['details']['task'] == 'molecule_structure_prediction'],
- 'reaction_prediction': [d for d in data if d['details']['task'] == 'reaction_prediction'],
- 'retrosynthesis': [d for d in data if d['details']['task'] == 'retrosynthesis'],
- 'balancing_chemical_equation': [d for d in data if d['details']['task'] == 'balancing_chemical_equation'],
- 'chemical_calculation': [d for d in data if d['details']['task'] == 'sci_calculate' and d['domain'] == 'Chemistry'],
- # L4
- 'chemical_harmful_QA': [d for d in data if d['details']['task'] == 'harmful_QA' and d['domain'] == 'Chemistry'],
- 'mol_toxicity_prediction': [d for d in data if d['details']['task'] == 'mol_toxicity_prediction'],
- 'chemical_laboratory_safety_test': [d for d in data if d['details']['task'] == 'laboratory_safety_test' and d['domain'] == 'Chemistry'],
- # L5
- 'molecule_generation': [d for d in data if d['details']['task'] == 'molecule_generation'],
- 'chemical_procedure_generation': [d for d in data if d['details']['task'] == 'procedure_generation' and d['domain'] == 'Chemistry'],
- 'chemical_reagent_generation': [d for d in data if d['details']['task'] == 'reagent_generation' and d['domain'] == 'Chemistry'],
- ### Biology
- # L1
- 'protein_property_identification': [d for d in data if d['details']['task'] == 'protein_property_identification'],
- 'biology_literature_QA': [d for d in data if d['details']['task'] == 'literature_multi_choice_question' and d['domain'] == 'Biology'],
- 'protein_description_generation': [d for d in data if d['details']['task'] == 'protein_description_generation'],
- # L2
- 'drug_drug_relation_extraction': [d for d in data if d['details']['subtask'] == 'drug_drug_relation_extraction'],
- 'biomedical_judgment_and_interpretation': [d for d in data if d['details']['subtask'] == 'biomedical_judgment_and_interpretation'],
- 'compound_disease_relation_extraction': [d for d in data if d['details']['subtask'] == 'compound_disease_relation_extraction'],
- 'gene_disease_relation_extraction': [d for d in data if d['details']['subtask'] == 'gene_disease_relation_extraction'],
- 'biological_detailed_understanding': [d for d in data if d['details']['subtask'] == 'detailed_understanding' and d['domain'] == 'Biology'],
- 'biological_text_summary': [d for d in data if d['details']['subtask'] == 'text_summary' and d['domain'] == 'Biology'],
- 'biological_hypothesis_verification': [d for d in data if d['details']['subtask'] == 'hypothesis_verification' and d['domain'] == 'Biology'],
- 'biological_reasoning_and_interpretation': [d for d in data if d['details']['subtask'] == 'reasoning_and_interpretation' and d['domain'] == 'Biology'],
- # L3
- 'solubility_prediction': [d for d in data if d['details']['subtask'] == 'solubility_prediction'],
- 'beta_lactamase_activity_prediction': [d for d in data if d['details']['subtask'] == 'beta_lactamase_activity_prediction'],
- 'fluorescence_prediction': [d for d in data if d['details']['subtask'] == 'fluorescence_prediction'],
- 'GB1_ftness_prediction': [d for d in data if d['details']['subtask'] == 'GB1_ftness_prediction'],
- 'stability_prediction': [d for d in data if d['details']['subtask'] == 'stability_prediction'],
- 'Protein_Protein_Interaction': [d for d in data if d['details']['subtask'] == 'Protein_Protein_Interaction'],
- 'biological_calculation': [d for d in data if d['details']['task'] == 'sci_calculate' and d['domain'] == 'Biology'],
- # L4
- 'biological_harmful_QA': [d for d in data if d['details']['task'] == 'harmful_QA' and d['domain'] == 'Biology'],
- 'proteotoxicity_prediction': [d for d in data if d['details']['task'] == 'proteotoxicity_prediction'],
- 'biological_laboratory_safety_test': [d for d in data if d['details']['task'] == 'laboratory_safety_test' and d['domain'] == 'Biology'],
- # L5
- 'biological_procedure_generation': [d for d in data if d['details']['task'] == 'procedure_generation' and d['domain'] == 'Biology'],
- 'biological_reagent_generation': [d for d in data if d['details']['task'] == 'reagent_generation' and d['domain'] == 'Biology'],
- 'protein_design': [d for d in data if d['details']['task'] == 'protein_design'],
- 'single_cell_analysis': [d for d in data if d['details']['task'] == 'single_cell_analysis'],
- ### Material
- # L1
- 'material_literature_QA': [d for d in data if d['details']['task'] == 'material_literature_QA'],
- # L2
- 'material_hypothesis_verification': [d for d in data if d['details']['subtask'] == 'material_hypothesis_verification'],
- 'material_component_extraction': [d for d in data if d['details']['subtask'] == 'material_component_extraction'],
- 'material_data_extraction': [d for d in data if d['details']['subtask'] == 'material_data_extraction'],
- 'material_detailed_understanding': [d for d in data if d['details']['subtask'] == 'material_detailed_understanding'],
- 'material_reasoning_and_interpretation': [d for d in data if d['details']['subtask'] == 'material_reasoning_and_interpretation'],
- 'material_text_summary': [d for d in data if d['details']['subtask'] == 'material_text_summary'],
- # L3
- 'valence_electron_difference_calculation': [d for d in data if d['details']['task'] == 'valence_electron_difference_calculation'],
- 'material_calculation': [d for d in data if d['details']['task'] == 'material_calculation'],
- 'lattice_volume_calculation': [d for d in data if d['details']['task'] == 'lattice_volume_calculation'],
- 'perovskite_stability_prediction': [d for d in data if d['details']['task'] == 'perovskite_stability_prediction'],
- 'diffusion_rate_analysis': [d for d in data if d['details']['task'] == 'diffusion_rate_analysis'],
- # L4
- 'material_safety_QA': [d for d in data if d['details']['task'] == 'material_safety_QA'],
- 'material_toxicity_prediction': [d for d in data if d['details']['task'] == 'material_toxicity_prediction'],
- # L5
- 'crystal_structure_and_composition_analysis': [d for d in data if d['details']['task'] == 'crystal_structure_and_composition_analysis'],
- 'specified_band_gap_material_generation': [d for d in data if d['details']['task'] == 'specified_band_gap_material_generation'],
- 'property_and_usage_analysis': [d for d in data if d['details']['task'] in ['property_and_usage_analysis', 'L5_material']],
- ### Physics
- # L1
- 'physics_literature_QA': [d for d in data if d['details']['task'] == 'physics_literature_QA'],
- 'fundamental_physics_exam': [d for d in data if d['details']['task'] == 'fundamental_physics_exam'],
- # L2
- 'physics_hypothesis_verification': [d for d in data if d['details']['subtask'] == 'physics_hypothesis_verification'],
- 'physics_detailed_understanding': [d for d in data if d['details']['subtask'] == 'physics_detailed_understanding'],
- 'physics_reasoning_and_interpretation': [d for d in data if d['details']['subtask'] == 'physics_reasoning_and_interpretation'],
- 'physics_text_summary': [d for d in data if d['details']['subtask'] == 'physics_text_summary'],
- # L3
- 'high_school_physics_calculation': [d for d in data if d['details']['task'] == 'high_school_physics_calculation'],
- 'general_physics_calculation': [d for d in data if d['details']['task'] == 'general_physics_calculation'],
- 'physics_formula_derivation': [d for d in data if d['details']['task'] == 'physics_formula_derivation'],
- # L4
- 'physics_safety_QA': [d for d in data if d['details']['task'] == 'physics_safety_QA'],
- 'physics_laboratory_safety_test': [d for d in data if d['details']['task'] == 'physics_laboratory_safety_test'],
- # L5
- 'physics_problem_solving': [d for d in data if d['details']['task'] == 'physics_problem_solving'],
- }
- assert sum([len(d) for d in task_data.values()]) == len(data), f'length not equal, 0 length task: {[k for k, v in task_data.items() if len(v) == 0]}'
- print(">>>>>> Total data length:", len(data))
- return task_data
- except Exception as e:
- raise NotImplementedError(f"data error: {e}. please check your task name.")
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..1970abd
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,77 @@
+[build-system]
+requires = ["setuptools>=66.1", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "sciknoweval"
+version = "0.1.0"
+description = "Evaluating Multi-level Scientific Knowledge of Large Language Models"
+readme = "README.md"
+license = {file = "LICENSE"}
+authors = [
+ {name = "Kehua Feng", email = "kehuafeng@zju.edu.cn"},
+ {name = "HICAI-ZJU"}
+]
+maintainers = [
+ {name = "HICAI-ZJU"}
+]
+keywords = [
+ "llm",
+ "evaluation",
+ "benchmark",
+ "scientific-knowledge",
+ "ai"
+]
+classifiers = [
+ "Development Status :: 3 - Alpha",
+ "Intended Audience :: Developers",
+ "Intended Audience :: Science/Research",
+ "License :: OSI Approved :: MIT License",
+ "Programming Language :: Python :: 3",
+ "Programming Language :: Python :: 3.8",
+ "Programming Language :: Python :: 3.9",
+ "Programming Language :: Python :: 3.10",
+ "Programming Language :: Python :: 3.11",
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
+ "Topic :: Software Development :: Libraries :: Python Modules"
+]
+requires-python = ">=3.8"
+dynamic = ["dependencies"]
+
+[tool.setuptools.dynamic]
+dependencies = {file = ["requirements.txt"]}
+
+[project.urls]
+Homepage = "https://github.com/HICAI-ZJU/SciKnowEval"
+Documentation = "http://www.scimind.ai/sciknoweval/"
+Repository = "https://github.com/HICAI-ZJU/SciKnowEval.git"
+"Bug Tracker" = "https://github.com/HICAI-ZJU/SciKnowEval/issues"
+Dataset = "https://huggingface.co/datasets/hicai-zju/SciKnowEval"
+Paper = "https://arxiv.org/abs/2406.09098"
+
+[project.scripts]
+sciknoweval = "sciknoweval.eval:main"
+
+[project.optional-dependencies]
+dev = [
+ "pytest>=6.0",
+ "pytest-cov",
+ "black",
+ "isort",
+ "flake8"
+]
+
+[tool.setuptools.packages.find]
+where = ["."]
+include = ["sciknoweval*"]
+
+[tool.setuptools.package-data]
+sciknoweval = ["evaluation/utils/prompts/*.yaml"]
+
+[tool.black]
+line-length = 88
+target-version = ['py38']
+
+[tool.isort]
+profile = "black"
+line_length = 88
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 5c2b3ba..edbb53c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,7 +4,7 @@ numpy
rouge_score
rdkit
rdchiral
-openai==0.28.1
+openai>=0.28.1
scipy
gensim
tiktoken
\ No newline at end of file
diff --git a/sciknoweval/__init__.py b/sciknoweval/__init__.py
new file mode 100644
index 0000000..c45b189
--- /dev/null
+++ b/sciknoweval/__init__.py
@@ -0,0 +1,10 @@
+"""
+SciKnowEval: Evaluating Multi-level Scientific Knowledge of Large Language Models
+
+A comprehensive benchmark for evaluating Large Language Models across five levels
+of scientific knowledge: memory, comprehension, reasoning, discernment, and application.
+"""
+
+__version__ = "0.1.0"
+__author__ = "HICAI-ZJU"
+__description__ = "Evaluating Multi-level Scientific Knowledge of Large Language Models"
diff --git a/eval.py b/sciknoweval/eval.py
similarity index 91%
rename from eval.py
rename to sciknoweval/eval.py
index a64c623..ffaa420 100644
--- a/eval.py
+++ b/sciknoweval/eval.py
@@ -1,10 +1,15 @@
+import argparse
import json
import os
-import argparse
-from evaluation.metrics import *
-from evaluation.define import get_task_func, get_task_data, reformat_result
-
import warnings
+
+from .evaluation.define import get_task_data, get_task_func, reformat_result
+from .evaluation.metrics import (
+ get_score_GPT4,
+ get_score_RE_triplets,
+ get_score_RE_tuples,
+)
+
warnings.filterwarnings("ignore")
diff --git a/sciknoweval/evaluation/__init__.py b/sciknoweval/evaluation/__init__.py
new file mode 100644
index 0000000..5c9808e
--- /dev/null
+++ b/sciknoweval/evaluation/__init__.py
@@ -0,0 +1,6 @@
+"""
+Evaluation module for SciKnowEval benchmark.
+
+Contains task definitions, metrics, and utility functions for evaluating
+scientific knowledge of Large Language Models.
+"""
\ No newline at end of file
diff --git a/sciknoweval/evaluation/define.py b/sciknoweval/evaluation/define.py
new file mode 100644
index 0000000..feb1d43
--- /dev/null
+++ b/sciknoweval/evaluation/define.py
@@ -0,0 +1,716 @@
+import warnings
+from typing import Any, Dict, List
+
+from .metrics import (
+ get_score_BLEU_ROUGE,
+ get_score_CLS,
+ get_score_filling,
+ get_score_GPT4,
+ get_score_Mol_GEN,
+ get_score_RE_triplets,
+ get_score_RE_tuples,
+ get_score_reaction,
+ get_score_smith_waterman,
+)
+
+warnings.filterwarnings("ignore")
+
+
+def get_task_func() -> Dict[str, Any]:
+    """Return the registry that maps each task name to its scoring function.
+
+    The mapping is a plain dict literal built from the metric functions
+    imported at the top of this module, so building it cannot fail for a
+    "missing task"; callers that look up an unknown task name get the usual
+    ``KeyError`` from the returned dict.
+
+    The ``###`` / ``# Lx`` comments only group the keys by domain and level
+    for readers; they carry no runtime meaning.
+
+    Returns:
+        Dict keyed by task name whose values are the ``get_score_*`` callables.
+    """
+    return {
+        ### Chemistry
+        # L1
+        "molecule_name_conversion": get_score_CLS,
+        "molecular_property_prediction": get_score_CLS,
+        "chemical_literature_QA": get_score_CLS,
+        # L2
+        "reaction_mechanism_inference": get_score_CLS,
+        "compound_identification_and_properties": get_score_CLS,
+        "extract_doping": get_score_GPT4,
+        "chemical_detailed_understanding": get_score_CLS,
+        "chemical_text_summary": get_score_GPT4,
+        "chemical_hypothesis_verification": get_score_CLS,
+        "chemical_reasoning_and_interpretation": get_score_CLS,
+        # L3
+        "molar_weight_calculation": get_score_CLS,
+        "molecular_property_calculation": get_score_CLS,
+        "molecule_structure_prediction": get_score_CLS,
+        "reaction_prediction": get_score_reaction,
+        "retrosynthesis": get_score_reaction,
+        "balancing_chemical_equation": get_score_filling,
+        "chemical_calculation": get_score_CLS,
+        # L4
+        "chemical_harmful_QA": get_score_GPT4,
+        "mol_toxicity_prediction": get_score_CLS,
+        "chemical_laboratory_safety_test": get_score_CLS,
+        # L5
+        "molecule_captioning": get_score_BLEU_ROUGE,
+        "molecule_generation": get_score_Mol_GEN,
+        "chemical_procedure_generation": get_score_GPT4,
+        "chemical_reagent_generation": get_score_GPT4,
+        ### Biology
+        # L1
+        "protein_property_identification": get_score_CLS,
+        "biology_literature_QA": get_score_CLS,
+        # L2
+        "drug_drug_relation_extraction": get_score_RE_triplets,
+        "biomedical_judgment_and_interpretation": get_score_CLS,
+        "compound_disease_relation_extraction": get_score_RE_tuples,
+        "gene_disease_relation_extraction": get_score_RE_triplets,
+        "biological_detailed_understanding": get_score_CLS,
+        "biological_text_summary": get_score_GPT4,
+        "biological_hypothesis_verification": get_score_CLS,
+        "biological_reasoning_and_interpretation": get_score_CLS,
+        # L3
+        "solubility_prediction": get_score_CLS,
+        "beta_lactamase_activity_prediction": get_score_CLS,
+        "fluorescence_prediction": get_score_CLS,
+        "GB1_ftness_prediction": get_score_CLS,
+        "stability_prediction": get_score_CLS,
+        "Protein_Protein_Interaction": get_score_CLS,
+        "biological_calculation": get_score_CLS,
+        # L4
+        "biological_harmful_QA": get_score_GPT4,
+        "proteotoxicity_prediction": get_score_CLS,
+        "biological_laboratory_safety_test": get_score_CLS,
+        # L5
+        "biological_procedure_generation": get_score_GPT4,
+        "biological_reagent_generation": get_score_GPT4,
+        "protein_description_generation": get_score_BLEU_ROUGE,
+        "protein_design": get_score_smith_waterman,
+        "single_cell_analysis": get_score_BLEU_ROUGE,
+        ### Material
+        # L1
+        "material_literature_QA": get_score_CLS,
+        # L2
+        "material_hypothesis_verification": get_score_CLS,
+        "material_component_extraction": get_score_GPT4,
+        "material_data_extraction": get_score_CLS,
+        "material_detailed_understanding": get_score_CLS,
+        "material_reasoning_and_interpretation": get_score_CLS,
+        "material_text_summary": get_score_GPT4,
+        # L3
+        "valence_electron_difference_calculation": get_score_CLS,
+        "material_calculation": get_score_CLS,
+        "lattice_volume_calculation": get_score_CLS,
+        "perovskite_stability_prediction": get_score_CLS,
+        "diffusion_rate_analysis": get_score_CLS,
+        # L4
+        "material_safety_QA": get_score_CLS,
+        "material_toxicity_prediction": get_score_CLS,
+        # L5
+        "crystal_structure_and_composition_analysis": get_score_GPT4,
+        "specified_band_gap_material_generation": get_score_GPT4,
+        "property_and_usage_analysis": get_score_GPT4,
+        ### Physics
+        # L1
+        "physics_literature_QA": get_score_CLS,
+        "fundamental_physics_exam": get_score_CLS,
+        # L2
+        "physics_hypothesis_verification": get_score_CLS,
+        "physics_detailed_understanding": get_score_CLS,
+        "physics_reasoning_and_interpretation": get_score_CLS,
+        "physics_text_summary": get_score_GPT4,
+        # L3
+        "high_school_physics_calculation": get_score_CLS,
+        "general_physics_calculation": get_score_CLS,
+        "physics_formula_derivation": get_score_GPT4,
+        # L4
+        "physics_safety_QA": get_score_CLS,
+        "physics_laboratory_safety_test": get_score_CLS,
+        # L5
+        "physics_problem_solving": get_score_GPT4,
+    }
+
+
+def reformat_result(result: Dict[str, Any]) -> Dict[str, Any]:
+    """Regroup a flat ``{task_name: score}`` dict into domain -> level -> task.
+
+    Args:
+        result: Flat mapping indexed by task name. Every task name listed in
+            the layout below must be present.
+
+    Returns:
+        A nested dict ``{domain: {level: {task_name: result[task_name]}}}``
+        with domains "Biology", "Chemistry", "Materials", "Physics" and levels
+        "L1".."L5", in that order. Values are the same objects found in
+        ``result``; nothing is copied or aggregated.
+
+    Raises:
+        KeyError: If ``result`` lacks any task name used by the layout.
+    """
+    # NOTE(review): "Biology"/"L1" repeats the Chemistry tasks
+    # "molecule_name_conversion" and "molecular_property_prediction", while
+    # "protein_property_identification" (scored in get_task_func and collected
+    # in get_task_data) does not appear anywhere in this layout, so its score
+    # never reaches the report. Looks like a copy-paste slip -- confirm the
+    # intended Biology L1 task list before changing it.
+    layout = {
+        "Biology": {
+            "L1": (
+                "molecule_name_conversion",
+                "molecular_property_prediction",
+                "biology_literature_QA",
+                "protein_description_generation",
+            ),
+            "L2": (
+                "drug_drug_relation_extraction",
+                "biomedical_judgment_and_interpretation",
+                "compound_disease_relation_extraction",
+                "gene_disease_relation_extraction",
+                "biological_detailed_understanding",
+                "biological_text_summary",
+                "biological_hypothesis_verification",
+                "biological_reasoning_and_interpretation",
+            ),
+            "L3": (
+                "solubility_prediction",
+                "beta_lactamase_activity_prediction",
+                "fluorescence_prediction",
+                "GB1_ftness_prediction",
+                "stability_prediction",
+                "Protein_Protein_Interaction",
+                "biological_calculation",
+            ),
+            "L4": (
+                "biological_harmful_QA",
+                "proteotoxicity_prediction",
+                "biological_laboratory_safety_test",
+            ),
+            "L5": (
+                "biological_procedure_generation",
+                "biological_reagent_generation",
+                "protein_design",
+                "single_cell_analysis",
+            ),
+        },
+        "Chemistry": {
+            "L1": (
+                "molecule_name_conversion",
+                "molecular_property_prediction",
+                "chemical_literature_QA",
+                "molecule_captioning",
+            ),
+            "L2": (
+                "reaction_mechanism_inference",
+                "compound_identification_and_properties",
+                "extract_doping",
+                "chemical_detailed_understanding",
+                "chemical_text_summary",
+                "chemical_hypothesis_verification",
+                "chemical_reasoning_and_interpretation",
+            ),
+            "L3": (
+                "molar_weight_calculation",
+                "molecular_property_calculation",
+                "molecule_structure_prediction",
+                "reaction_prediction",
+                "retrosynthesis",
+                "balancing_chemical_equation",
+                "chemical_calculation",
+            ),
+            "L4": (
+                "chemical_harmful_QA",
+                "mol_toxicity_prediction",
+                "chemical_laboratory_safety_test",
+            ),
+            "L5": (
+                "molecule_generation",
+                "chemical_procedure_generation",
+                "chemical_reagent_generation",
+            ),
+        },
+        "Materials": {
+            "L1": ("material_literature_QA",),
+            "L2": (
+                "material_hypothesis_verification",
+                "material_component_extraction",
+                "material_data_extraction",
+                "material_detailed_understanding",
+                "material_reasoning_and_interpretation",
+                "material_text_summary",
+            ),
+            "L3": (
+                "valence_electron_difference_calculation",
+                "material_calculation",
+                "lattice_volume_calculation",
+                "perovskite_stability_prediction",
+                "diffusion_rate_analysis",
+            ),
+            "L4": (
+                "material_safety_QA",
+                "material_toxicity_prediction",
+            ),
+            "L5": (
+                "crystal_structure_and_composition_analysis",
+                "specified_band_gap_material_generation",
+                "property_and_usage_analysis",
+            ),
+        },
+        "Physics": {
+            "L1": (
+                "physics_literature_QA",
+                "fundamental_physics_exam",
+            ),
+            "L2": (
+                "physics_hypothesis_verification",
+                "physics_detailed_understanding",
+                "physics_reasoning_and_interpretation",
+                "physics_text_summary",
+            ),
+            "L3": (
+                "high_school_physics_calculation",
+                "general_physics_calculation",
+                "physics_formula_derivation",
+            ),
+            "L4": (
+                "physics_safety_QA",
+                "physics_laboratory_safety_test",
+            ),
+            "L5": ("physics_problem_solving",),
+        },
+    }
+
+    # Look every task up in layout order so a missing key fails at the same
+    # point it would with a hand-written nested literal.
+    return {
+        domain: {
+            level: {task: result[task] for task in tasks}
+            for level, tasks in levels.items()
+        }
+        for domain, levels in layout.items()
+    }
+
+
+def _rule(key, field, values=None, domain=None, level=None):
+    """Build one selection rule; ``values`` defaults to the result key itself."""
+    return (key, field, (key,) if values is None else tuple(values), domain, level)
+
+
+# One rule per output key, in output order:
+#   (result key, field of record["details"] to test, accepted values,
+#    required record["domain"] or None, required record["details"]["level"] or None)
+_TASK_RULES = (
+    ### Chemistry
+    # L1
+    _rule("molecule_name_conversion", "task"),
+    _rule("molecular_property_prediction", "task", level="L1"),
+    _rule("chemical_literature_QA", "task", ("literature_multi_choice_question",), domain="Chemistry"),
+    _rule("molecule_captioning", "task"),
+    # L2
+    _rule("reaction_mechanism_inference", "subtask"),
+    _rule("compound_identification_and_properties", "subtask"),
+    _rule("extract_doping", "subtask"),
+    _rule("chemical_detailed_understanding", "subtask", ("detailed_understanding",), domain="Chemistry"),
+    _rule("chemical_text_summary", "subtask", ("text_summary",), domain="Chemistry"),
+    _rule("chemical_hypothesis_verification", "subtask", ("hypothesis_verification",), domain="Chemistry"),
+    _rule("chemical_reasoning_and_interpretation", "subtask", ("reasoning_and_interpretation",), domain="Chemistry"),
+    # L3
+    _rule("molar_weight_calculation", "task"),
+    _rule("molecular_property_calculation", "task", ("molecular_property_prediction",), level="L3"),
+    _rule("molecule_structure_prediction", "task"),
+    _rule("reaction_prediction", "task"),
+    _rule("retrosynthesis", "task"),
+    _rule("balancing_chemical_equation", "task"),
+    _rule("chemical_calculation", "task", ("sci_calculate",), domain="Chemistry"),
+    # L4
+    _rule("chemical_harmful_QA", "task", ("harmful_QA",), domain="Chemistry"),
+    _rule("mol_toxicity_prediction", "task"),
+    _rule("chemical_laboratory_safety_test", "task", ("laboratory_safety_test",), domain="Chemistry"),
+    # L5
+    _rule("molecule_generation", "task"),
+    _rule("chemical_procedure_generation", "task", ("procedure_generation",), domain="Chemistry"),
+    _rule("chemical_reagent_generation", "task", ("reagent_generation",), domain="Chemistry"),
+    ### Biology
+    # L1
+    _rule("protein_property_identification", "task"),
+    _rule("biology_literature_QA", "task", ("literature_multi_choice_question",), domain="Biology"),
+    _rule("protein_description_generation", "task"),
+    # L2
+    _rule("drug_drug_relation_extraction", "subtask"),
+    _rule("biomedical_judgment_and_interpretation", "subtask"),
+    _rule("compound_disease_relation_extraction", "subtask"),
+    _rule("gene_disease_relation_extraction", "subtask"),
+    _rule("biological_detailed_understanding", "subtask", ("detailed_understanding",), domain="Biology"),
+    _rule("biological_text_summary", "subtask", ("text_summary",), domain="Biology"),
+    _rule("biological_hypothesis_verification", "subtask", ("hypothesis_verification",), domain="Biology"),
+    _rule("biological_reasoning_and_interpretation", "subtask", ("reasoning_and_interpretation",), domain="Biology"),
+    # L3
+    _rule("solubility_prediction", "subtask"),
+    _rule("beta_lactamase_activity_prediction", "subtask"),
+    _rule("fluorescence_prediction", "subtask"),
+    _rule("GB1_ftness_prediction", "subtask"),
+    _rule("stability_prediction", "subtask"),
+    _rule("Protein_Protein_Interaction", "subtask"),
+    _rule("biological_calculation", "task", ("sci_calculate",), domain="Biology"),
+    # L4
+    _rule("biological_harmful_QA", "task", ("harmful_QA",), domain="Biology"),
+    _rule("proteotoxicity_prediction", "task"),
+    _rule("biological_laboratory_safety_test", "task", ("laboratory_safety_test",), domain="Biology"),
+    # L5
+    _rule("biological_procedure_generation", "task", ("procedure_generation",), domain="Biology"),
+    _rule("biological_reagent_generation", "task", ("reagent_generation",), domain="Biology"),
+    _rule("protein_design", "task"),
+    _rule("single_cell_analysis", "task"),
+    ### Material
+    # L1
+    _rule("material_literature_QA", "task"),
+    # L2
+    _rule("material_hypothesis_verification", "subtask"),
+    _rule("material_component_extraction", "subtask"),
+    _rule("material_data_extraction", "subtask"),
+    _rule("material_detailed_understanding", "subtask"),
+    _rule("material_reasoning_and_interpretation", "subtask"),
+    _rule("material_text_summary", "subtask"),
+    # L3
+    _rule("valence_electron_difference_calculation", "task"),
+    _rule("material_calculation", "task"),
+    _rule("lattice_volume_calculation", "task"),
+    _rule("perovskite_stability_prediction", "task"),
+    _rule("diffusion_rate_analysis", "task"),
+    # L4
+    _rule("material_safety_QA", "task"),
+    _rule("material_toxicity_prediction", "task"),
+    # L5
+    _rule("crystal_structure_and_composition_analysis", "task"),
+    _rule("specified_band_gap_material_generation", "task"),
+    _rule("property_and_usage_analysis", "task", ("property_and_usage_analysis", "L5_material")),
+    ### Physics
+    # L1
+    _rule("physics_literature_QA", "task"),
+    _rule("fundamental_physics_exam", "task"),
+    # L2
+    _rule("physics_hypothesis_verification", "subtask"),
+    _rule("physics_detailed_understanding", "subtask"),
+    _rule("physics_reasoning_and_interpretation", "subtask"),
+    _rule("physics_text_summary", "subtask"),
+    # L3
+    _rule("high_school_physics_calculation", "task"),
+    _rule("general_physics_calculation", "task"),
+    _rule("physics_formula_derivation", "task"),
+    # L4
+    _rule("physics_safety_QA", "task"),
+    _rule("physics_laboratory_safety_test", "task"),
+    # L5
+    _rule("physics_problem_solving", "task"),
+)
+
+
+def _matches(record, field, accepted, domain, level):
+    """Return True when ``record`` satisfies one selection rule.
+
+    ``record["domain"]`` and ``record["details"]["level"]`` are read only after
+    the task/subtask value has matched, so records of unrelated tasks do not
+    need to carry those keys.
+    """
+    details = record["details"]
+    if details[field] not in accepted:
+        return False
+    if domain is not None and record["domain"] != domain:
+        return False
+    if level is not None and details["level"] != level:
+        return False
+    return True
+
+
+def get_task_data(data: List[Dict[str, Any]]) -> Dict[str, Any]:
+    """Split benchmark records into one list per evaluation task.
+
+    Each record is read as ``record["details"]["task"]`` and
+    ``record["details"]["subtask"]``; some rules additionally read
+    ``record["domain"]`` or ``record["details"]["level"]``. Because subtask
+    rules are evaluated against every record, every record must carry both
+    the ``"task"`` and the ``"subtask"`` key.
+
+    Args:
+        data: Records to partition.
+
+    Returns:
+        Dict with one entry per rule in ``_TASK_RULES`` (in that order), each
+        holding the matching records in their input order. Tasks without any
+        matching record map to an empty list.
+
+    Raises:
+        NotImplementedError: If a record lacks a key that a rule reads, or if
+            the per-task lists do not add up to ``len(data)`` (a record matched
+            no rule, or more than one). The message lists the tasks that ended
+            up empty.
+    """
+    try:
+        task_data = {
+            key: [d for d in data if _matches(d, field, accepted, domain, level)]
+            for key, field, accepted, domain, level in _TASK_RULES
+        }
+        # Explicit check instead of ``assert`` so it still runs under ``python -O``.
+        if sum(len(rows) for rows in task_data.values()) != len(data):
+            raise ValueError(
+                f"length not equal, 0 length task: {[k for k, v in task_data.items() if len(v) == 0]}"
+            )
+        print(">>>>>> Total data length:", len(data))
+        return task_data
+    except Exception as e:
+        # Single error type for callers; keep the original cause attached.
+        raise NotImplementedError(
+            f"data error: {e}. please check your task name."
+        ) from e
diff --git a/evaluation/metrics.py b/sciknoweval/evaluation/metrics.py
similarity index 96%
rename from evaluation/metrics.py
rename to sciknoweval/evaluation/metrics.py
index cee34a1..de868b9 100644
--- a/evaluation/metrics.py
+++ b/sciknoweval/evaluation/metrics.py
@@ -1,15 +1,21 @@
import os
+from typing import Any, Dict, List
+
+import tiktoken
import yaml
-from typing import List, Any, Dict
from tqdm import tqdm
-from scipy.spatial.distance import cosine
-from gensim.models import KeyedVectors
-from evaluation.utils.relation_extraction import *
-from evaluation.utils.process import load_word2vec_model
-from evaluation.utils.generation import calculate_nltk_scores, calculate_smiles_metrics
-import tiktoken
-from evaluation.utils.openai_api import OpenAIChat
+from .utils.generation import calculate_nltk_scores, calculate_smiles_metrics
+from .utils.openai_api import OpenAIChat
+from .utils.process import load_word2vec_model
+from .utils.relation_extraction import (
+ cos_f1_score,
+ macro_f1_score_triplets,
+ macro_f1_score_tuples,
+ validate_format_and_extract_data_triplets,
+ validate_format_and_extract_data_tuples,
+)
+
script_dir = os.path.dirname(os.path.abspath(__file__))
diff --git a/sciknoweval/evaluation/utils/__init__.py b/sciknoweval/evaluation/utils/__init__.py
new file mode 100644
index 0000000..95ffa6d
--- /dev/null
+++ b/sciknoweval/evaluation/utils/__init__.py
@@ -0,0 +1,6 @@
+"""
+Utility functions for SciKnowEval evaluation.
+
+Contains utilities for generation, OpenAI API integration, data processing,
+and relation extraction tasks.
+"""
\ No newline at end of file
diff --git a/evaluation/utils/generation.py b/sciknoweval/evaluation/utils/generation.py
similarity index 99%
rename from evaluation/utils/generation.py
rename to sciknoweval/evaluation/utils/generation.py
index d5c4dcd..7373b09 100644
--- a/evaluation/utils/generation.py
+++ b/sciknoweval/evaluation/utils/generation.py
@@ -1,14 +1,12 @@
-
import numpy as np
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import meteor_score
-from tqdm import tqdm
-from rouge_score import rouge_scorer
-
-from rdkit import Chem, DataStructs
-from rdkit.Chem import MACCSkeys, AllChem
from rdchiral.chiral import copy_chirality
+from rdkit import Chem, DataStructs
+from rdkit.Chem import AllChem, MACCSkeys
from rdkit.Chem.AllChem import AssignStereochemistry
+from rouge_score import rouge_scorer
+from tqdm import tqdm
def calculate_nltk_scores(tokenizer, ans_strs, pred_strs):
diff --git a/evaluation/utils/openai_api.py b/sciknoweval/evaluation/utils/openai_api.py
similarity index 99%
rename from evaluation/utils/openai_api.py
rename to sciknoweval/evaluation/utils/openai_api.py
index b3afee1..5ba9e92 100644
--- a/evaluation/utils/openai_api.py
+++ b/sciknoweval/evaluation/utils/openai_api.py
@@ -1,7 +1,8 @@
-import openai
+import asyncio
import os
from typing import List
-import asyncio
+
+import openai
class OpenAIChat():
diff --git a/evaluation/utils/process.py b/sciknoweval/evaluation/utils/process.py
similarity index 100%
rename from evaluation/utils/process.py
rename to sciknoweval/evaluation/utils/process.py
diff --git a/evaluation/utils/prompts/prompt.yaml b/sciknoweval/evaluation/utils/prompts/prompt.yaml
similarity index 100%
rename from evaluation/utils/prompts/prompt.yaml
rename to sciknoweval/evaluation/utils/prompts/prompt.yaml
diff --git a/evaluation/utils/relation_extraction.py b/sciknoweval/evaluation/utils/relation_extraction.py
similarity index 98%
rename from evaluation/utils/relation_extraction.py
rename to sciknoweval/evaluation/utils/relation_extraction.py
index 7086b7a..99677a5 100644
--- a/evaluation/utils/relation_extraction.py
+++ b/sciknoweval/evaluation/utils/relation_extraction.py
@@ -1,10 +1,13 @@
-from collections import Counter
-import re
-from typing import List, Tuple, Any, Dict
-import numpy as np
import re
-from evaluation.utils.process import same_entities, sentence_to_vec, cosine_similarity, cosine_similarity_2
-
+from collections import Counter
+from typing import Any, Dict, List, Tuple
+
+from .process import (
+ cosine_similarity,
+ cosine_similarity_2,
+ same_entities,
+ sentence_to_vec,
+)
def parse_tuples(tuple_str):