diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d811561 --- /dev/null +++ b/.gitignore @@ -0,0 +1,165 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*.pyc +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# IDEs +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# OS generated files +.DS_Store +.DS_Store? 
+._* +.Spotlight-V100 +.Trashes +ehthumbs.db +Thumbs.db + +# Project specific +outputs.json +*.bin +*.gz +GoogleNews-vectors-negative300.bin +GoogleNews-vectors-negative300.bin.gz + +# Temporary files +*.tmp +*.temp + +# Model files and data +models/ +data/ +*.model +*.pkl +*.pickle + +# Results and outputs +results/ +outputs/ +logs/ +*.log + +# API keys +.env +.env.local +.env.*.local +api_keys.txt +config.json + +# AI settings +.claude/ diff --git a/README.md b/README.md index dd1bd43..a9977e7 100644 --- a/README.md +++ b/README.md @@ -75,18 +75,31 @@ The Scientific Knowledge Evaluation (SciKnowEval) be

🏹 QuickStart

⬇️ Step 1: Installation

-To evaluate LLMs on SciKnowEval, first clone the repository: +**Option 1: pip install from GitHub (Recommended)** +```bash +pip install "sciknoweval @ git+https://github.com/HICAI-ZJU/SciKnowEval.git" +``` + +**Option 2: Install from Source** ```bash git clone https://github.com/HICAI-ZJU/SciKnowEval.git cd SciKnowEval +pip install . ``` -Next, set up a conda environment to manage the dependencies: + +**Option 3: Development Installation** ```bash -conda create -n sciknoweval python=3.10.9 -conda activate sciknoweval +git clone https://github.com/HICAI-ZJU/SciKnowEval.git +cd SciKnowEval +pip install -e . ``` -Then, install the required dependencies: + +**Option 4: Manual Setup (Legacy)** ```bash +git clone https://github.com/HICAI-ZJU/SciKnowEval.git +cd SciKnowEval +conda create -n sciknoweval python=3.10.9 +conda activate sciknoweval pip install -r requirements.txt ``` @@ -152,7 +165,31 @@ By following these guidelines, you can effectively use the SciKnowEval benchmark

🚀 Step 4: Evaluate

-You can run `eval.py` to evaluate your model: +**Option 1: Using the Command Line Interface (Recommended)** + +After installing SciKnowEval, you can use the `sciknoweval` command: + +```bash +export OPENAI_API_KEY="YOUR_API_KEY" +sciknoweval \ + --data_path "your/model/predictions.json" \ + --word2vec_model_path "path/to/GoogleNews-vectors-negative300.bin" \ + --gen_evaluator "gpt-4o" \ + --output_path "path/to/your/output.json" +``` + +**Option 2: Using Python Module** + +```bash +export OPENAI_API_KEY="YOUR_API_KEY" +python -m sciknoweval.eval \ + --data_path "your/model/predictions.json" \ + --word2vec_model_path "path/to/GoogleNews-vectors-negative300.bin" \ + --gen_evaluator "gpt-4o" \ + --output_path "path/to/your/output.json" +``` + +**Option 3: Direct Script Execution (Legacy)** ```bash data_path="your/model/predictions.json" @@ -161,7 +198,7 @@ gen_evaluator="gpt-4o" # the correct model name in OpenAI output_path="path/to/your/output.json" export OPENAI_API_KEY="YOUR_API_KEY" -python eval.py \ +python sciknoweval/eval.py \ --data_path $data_path \ --word2vec_model_path $word2vec_model_path \ --gen_evaluator $gen_evaluator \ diff --git a/evaluation/define.py b/evaluation/define.py deleted file mode 100644 index bc3c7e9..0000000 --- a/evaluation/define.py +++ /dev/null @@ -1,362 +0,0 @@ -import json -import os -import time -from evaluation.metrics import * -from typing import List, Tuple, Any, Dict -import warnings -from tqdm import tqdm -warnings.filterwarnings("ignore") - - -def get_task_func(): - try: - return { - ### Chemistry - # L1 - 'molecule_name_conversion': get_score_CLS, - 'molecular_property_prediction': get_score_CLS, - 'chemical_literature_QA': get_score_CLS, - # L2 - 'reaction_mechanism_inference': get_score_CLS, - 'compound_identification_and_properties': get_score_CLS, - 'extract_doping': get_score_GPT4, - 'chemical_detailed_understanding': get_score_CLS, - 'chemical_text_summary': get_score_GPT4, - 'chemical_hypothesis_verification': 
get_score_CLS, - 'chemical_reasoning_and_interpretation': get_score_CLS, - # L3 - 'molar_weight_calculation': get_score_CLS, - 'molecular_property_calculation': get_score_CLS, - 'molecule_structure_prediction': get_score_CLS, - 'reaction_prediction': get_score_reaction, - 'retrosynthesis': get_score_reaction, - 'balancing_chemical_equation': get_score_filling, - 'chemical_calculation': get_score_CLS, - # L4 - 'chemical_harmful_QA': get_score_GPT4, - 'mol_toxicity_prediction': get_score_CLS, - 'chemical_laboratory_safety_test': get_score_CLS, - # L5 - 'molecule_captioning': get_score_BLEU_ROUGE, - 'molecule_generation': get_score_Mol_GEN, - 'chemical_procedure_generation': get_score_GPT4, - 'chemical_reagent_generation': get_score_GPT4, - ### Biology - # L1 - 'protein_property_identification': get_score_CLS, - 'biology_literature_QA': get_score_CLS, - # L2 - 'drug_drug_relation_extraction': get_score_RE_triplets, - 'biomedical_judgment_and_interpretation': get_score_CLS, - 'compound_disease_relation_extraction': get_score_RE_tuples, - 'gene_disease_relation_extraction': get_score_RE_triplets, - 'biological_detailed_understanding': get_score_CLS, - 'biological_text_summary': get_score_GPT4, - 'biological_hypothesis_verification': get_score_CLS, - 'biological_reasoning_and_interpretation': get_score_CLS, - # L3 - 'solubility_prediction': get_score_CLS, - 'beta_lactamase_activity_prediction': get_score_CLS, - 'fluorescence_prediction': get_score_CLS, - 'GB1_ftness_prediction': get_score_CLS, - 'stability_prediction': get_score_CLS, - 'Protein_Protein_Interaction': get_score_CLS, - 'biological_calculation': get_score_CLS, - # L4 - 'biological_harmful_QA': get_score_GPT4, - 'proteotoxicity_prediction': get_score_CLS, - 'biological_laboratory_safety_test': get_score_CLS, - # L5 - 'biological_procedure_generation': get_score_GPT4, - 'biological_reagent_generation': get_score_GPT4, - 'protein_description_generation': get_score_BLEU_ROUGE, - 'protein_design': 
get_score_smith_waterman, - 'single_cell_analysis': get_score_BLEU_ROUGE, - ### Meterial - # L1 - 'material_literature_QA': get_score_CLS, - # L2 - 'material_hypothesis_verification': get_score_CLS, - 'material_component_extraction': get_score_GPT4, - 'material_data_extraction': get_score_CLS, - 'material_detailed_understanding': get_score_CLS, - 'material_reasoning_and_interpretation': get_score_CLS, - 'material_text_summary': get_score_GPT4, - # L3 - 'valence_electron_difference_calculation': get_score_CLS, - 'material_calculation': get_score_CLS, - 'lattice_volume_calculation': get_score_CLS, - 'perovskite_stability_prediction': get_score_CLS, - 'diffusion_rate_analysis': get_score_CLS, - # L4 - 'material_safety_QA': get_score_CLS, - 'material_toxicity_prediction': get_score_CLS, - # L5 - 'crystal_structure_and_composition_analysis': get_score_GPT4, - 'specified_band_gap_material_generation': get_score_GPT4, - 'property_and_usage_analysis': get_score_GPT4, - ### Physics - # L1 - 'physics_literature_QA': get_score_CLS, - 'fundamental_physics_exam': get_score_CLS, - # L2 - 'physics_hypothesis_verification': get_score_CLS, - 'physics_detailed_understanding': get_score_CLS, - 'physics_reasoning_and_interpretation': get_score_CLS, - 'physics_text_summary': get_score_GPT4, - # L3 - 'high_school_physics_calculation': get_score_CLS, - 'general_physics_calculation': get_score_CLS, - 'physics_formula_derivation': get_score_GPT4, - # L4 - 'physics_safety_QA': get_score_CLS, - 'physics_laboratory_safety_test': get_score_CLS, - # L5 - 'physics_problem_solving': get_score_GPT4, - } - except: - raise NotImplementedError("task not found") - -def reformat_result(result: Dict[str, Any]) -> Dict[str, Any]: - reformatted_result = { - 'Biology': { - 'L1': { - 'molecule_name_conversion': result['molecule_name_conversion'], - 'molecular_property_prediction': result['molecular_property_prediction'], - 'biology_literature_QA': result['biology_literature_QA'], - 
'protein_description_generation': result['protein_description_generation'], - }, - 'L2': { - 'drug_drug_relation_extraction': result['drug_drug_relation_extraction'], - 'biomedical_judgment_and_interpretation': result['biomedical_judgment_and_interpretation'], - 'compound_disease_relation_extraction': result['compound_disease_relation_extraction'], - 'gene_disease_relation_extraction': result['gene_disease_relation_extraction'], - 'biological_detailed_understanding': result['biological_detailed_understanding'], - 'biological_text_summary': result['biological_text_summary'], - 'biological_hypothesis_verification': result['biological_hypothesis_verification'], - 'biological_reasoning_and_interpretation': result['biological_reasoning_and_interpretation'], - }, - 'L3': { - 'solubility_prediction': result['solubility_prediction'], - 'beta_lactamase_activity_prediction': result['beta_lactamase_activity_prediction'], - 'fluorescence_prediction': result['fluorescence_prediction'], - 'GB1_ftness_prediction': result['GB1_ftness_prediction'], - 'stability_prediction': result['stability_prediction'], - 'Protein_Protein_Interaction': result['Protein_Protein_Interaction'], - 'biological_calculation': result['biological_calculation'], - }, - 'L4': { - 'biological_harmful_QA': result['biological_harmful_QA'], - 'proteotoxicity_prediction': result['proteotoxicity_prediction'], - 'biological_laboratory_safety_test': result['biological_laboratory_safety_test'], - }, - 'L5': { - 'biological_procedure_generation': result['biological_procedure_generation'], - 'biological_reagent_generation': result['biological_reagent_generation'], - 'protein_design': result['protein_design'], - 'single_cell_analysis': result['single_cell_analysis'], - }, - }, - 'Chemistry': { - 'L1': { - 'molecule_name_conversion': result['molecule_name_conversion'], - 'molecular_property_prediction': result['molecular_property_prediction'], - 'chemical_literature_QA': result['chemical_literature_QA'], - 
'molecule_captioning': result['molecule_captioning'], - }, - 'L2': { - 'reaction_mechanism_inference': result['reaction_mechanism_inference'], - 'compound_identification_and_properties': result['compound_identification_and_properties'], - 'extract_doping': result['extract_doping'], - 'chemical_detailed_understanding': result['chemical_detailed_understanding'], - 'chemical_text_summary': result['chemical_text_summary'], - 'chemical_hypothesis_verification': result['chemical_hypothesis_verification'], - 'chemical_reasoning_and_interpretation': result['chemical_reasoning_and_interpretation'], - }, - 'L3': { - 'molar_weight_calculation': result['molar_weight_calculation'], - 'molecular_property_calculation': result['molecular_property_calculation'], - 'molecule_structure_prediction': result['molecule_structure_prediction'], - 'reaction_prediction': result['reaction_prediction'], - 'retrosynthesis': result['retrosynthesis'], - 'balancing_chemical_equation': result['balancing_chemical_equation'], - 'chemical_calculation': result['chemical_calculation'], - }, - 'L4': { - 'chemical_harmful_QA': result['chemical_harmful_QA'], - 'mol_toxicity_prediction': result['mol_toxicity_prediction'], - 'chemical_laboratory_safety_test': result['chemical_laboratory_safety_test'], - }, - 'L5': { - 'molecule_generation': result['molecule_generation'], - 'chemical_procedure_generation': result['chemical_procedure_generation'], - 'chemical_reagent_generation': result['chemical_reagent_generation'], - }, - }, - 'Materials': { - 'L1': { - 'material_literature_QA': result['material_literature_QA'], - }, - 'L2': { - 'material_hypothesis_verification': result['material_hypothesis_verification'], - 'material_component_extraction': result['material_component_extraction'], - 'material_data_extraction': result['material_data_extraction'], - 'material_detailed_understanding': result['material_detailed_understanding'], - 'material_reasoning_and_interpretation': 
result['material_reasoning_and_interpretation'], - 'material_text_summary': result['material_text_summary'], - }, - 'L3': { - 'valence_electron_difference_calculation': result['valence_electron_difference_calculation'], - 'material_calculation': result['material_calculation'], - 'lattice_volume_calculation': result['lattice_volume_calculation'], - 'perovskite_stability_prediction': result['perovskite_stability_prediction'], - 'diffusion_rate_analysis': result['diffusion_rate_analysis'], - }, - 'L4': { - 'material_safety_QA': result['material_safety_QA'], - 'material_toxicity_prediction': result['material_toxicity_prediction'], - }, - 'L5': { - 'crystal_structure_and_composition_analysis': result['crystal_structure_and_composition_analysis'], - 'specified_band_gap_material_generation': result['specified_band_gap_material_generation'], - 'property_and_usage_analysis': result['property_and_usage_analysis'], - }, - }, - 'Physics': { - 'L1': { - 'physics_literature_QA': result['physics_literature_QA'], - 'fundamental_physics_exam': result['fundamental_physics_exam'], - }, - 'L2': { - 'physics_hypothesis_verification': result['physics_hypothesis_verification'], - 'physics_detailed_understanding': result['physics_detailed_understanding'], - 'physics_reasoning_and_interpretation': result['physics_reasoning_and_interpretation'], - 'physics_text_summary': result['physics_text_summary'], - }, - 'L3': { - 'high_school_physics_calculation': result['high_school_physics_calculation'], - 'general_physics_calculation': result['general_physics_calculation'], - 'physics_formula_derivation': result['physics_formula_derivation'], - }, - 'L4': { - 'physics_safety_QA': result['physics_safety_QA'], - 'physics_laboratory_safety_test': result['physics_laboratory_safety_test'], - }, - 'L5': { - 'physics_problem_solving': result['physics_problem_solving'], - }, - }, - } - - return reformatted_result - -def get_task_data(data: List[Dict[str, Any]]) -> Dict[str, Any]: - try: - task_data = { - # 
L1 - 'molecule_name_conversion': [d for d in data if d['details']['task'] == 'molecule_name_conversion'], - 'molecular_property_prediction': [d for d in data if d['details']['task'] == 'molecular_property_prediction' and d['details']['level'] == 'L1'], - 'chemical_literature_QA': [d for d in data if d['details']['task'] == 'literature_multi_choice_question' and d['domain'] == 'Chemistry'], - 'molecule_captioning': [d for d in data if d['details']['task'] == 'molecule_captioning'], - # L2 - 'reaction_mechanism_inference': [d for d in data if d['details']['subtask'] == 'reaction_mechanism_inference'], - 'compound_identification_and_properties': [d for d in data if d['details']['subtask'] == 'compound_identification_and_properties'], - 'extract_doping': [d for d in data if d['details']['subtask'] == 'extract_doping'], - 'chemical_detailed_understanding': [d for d in data if d['details']['subtask'] == 'detailed_understanding' and d['domain'] == 'Chemistry'], - 'chemical_text_summary': [d for d in data if d['details']['subtask'] == 'text_summary' and d['domain'] == 'Chemistry'], - 'chemical_hypothesis_verification': [d for d in data if d['details']['subtask'] == 'hypothesis_verification' and d['domain'] == 'Chemistry'], - 'chemical_reasoning_and_interpretation': [d for d in data if d['details']['subtask'] == 'reasoning_and_interpretation' and d['domain'] == 'Chemistry'], - # L3 - 'molar_weight_calculation': [d for d in data if d['details']['task'] == 'molar_weight_calculation'], - 'molecular_property_calculation': [d for d in data if d['details']['task'] == 'molecular_property_prediction' and d['details']['level'] == 'L3'], - 'molecule_structure_prediction': [d for d in data if d['details']['task'] == 'molecule_structure_prediction'], - 'reaction_prediction': [d for d in data if d['details']['task'] == 'reaction_prediction'], - 'retrosynthesis': [d for d in data if d['details']['task'] == 'retrosynthesis'], - 'balancing_chemical_equation': [d for d in data if 
d['details']['task'] == 'balancing_chemical_equation'], - 'chemical_calculation': [d for d in data if d['details']['task'] == 'sci_calculate' and d['domain'] == 'Chemistry'], - # L4 - 'chemical_harmful_QA': [d for d in data if d['details']['task'] == 'harmful_QA' and d['domain'] == 'Chemistry'], - 'mol_toxicity_prediction': [d for d in data if d['details']['task'] == 'mol_toxicity_prediction'], - 'chemical_laboratory_safety_test': [d for d in data if d['details']['task'] == 'laboratory_safety_test' and d['domain'] == 'Chemistry'], - # L5 - 'molecule_generation': [d for d in data if d['details']['task'] == 'molecule_generation'], - 'chemical_procedure_generation': [d for d in data if d['details']['task'] == 'procedure_generation' and d['domain'] == 'Chemistry'], - 'chemical_reagent_generation': [d for d in data if d['details']['task'] == 'reagent_generation' and d['domain'] == 'Chemistry'], - ### Biology - # L1 - 'protein_property_identification': [d for d in data if d['details']['task'] == 'protein_property_identification'], - 'biology_literature_QA': [d for d in data if d['details']['task'] == 'literature_multi_choice_question' and d['domain'] == 'Biology'], - 'protein_description_generation': [d for d in data if d['details']['task'] == 'protein_description_generation'], - # L2 - 'drug_drug_relation_extraction': [d for d in data if d['details']['subtask'] == 'drug_drug_relation_extraction'], - 'biomedical_judgment_and_interpretation': [d for d in data if d['details']['subtask'] == 'biomedical_judgment_and_interpretation'], - 'compound_disease_relation_extraction': [d for d in data if d['details']['subtask'] == 'compound_disease_relation_extraction'], - 'gene_disease_relation_extraction': [d for d in data if d['details']['subtask'] == 'gene_disease_relation_extraction'], - 'biological_detailed_understanding': [d for d in data if d['details']['subtask'] == 'detailed_understanding' and d['domain'] == 'Biology'], - 'biological_text_summary': [d for d in data if 
d['details']['subtask'] == 'text_summary' and d['domain'] == 'Biology'], - 'biological_hypothesis_verification': [d for d in data if d['details']['subtask'] == 'hypothesis_verification' and d['domain'] == 'Biology'], - 'biological_reasoning_and_interpretation': [d for d in data if d['details']['subtask'] == 'reasoning_and_interpretation' and d['domain'] == 'Biology'], - # L3 - 'solubility_prediction': [d for d in data if d['details']['subtask'] == 'solubility_prediction'], - 'beta_lactamase_activity_prediction': [d for d in data if d['details']['subtask'] == 'beta_lactamase_activity_prediction'], - 'fluorescence_prediction': [d for d in data if d['details']['subtask'] == 'fluorescence_prediction'], - 'GB1_ftness_prediction': [d for d in data if d['details']['subtask'] == 'GB1_ftness_prediction'], - 'stability_prediction': [d for d in data if d['details']['subtask'] == 'stability_prediction'], - 'Protein_Protein_Interaction': [d for d in data if d['details']['subtask'] == 'Protein_Protein_Interaction'], - 'biological_calculation': [d for d in data if d['details']['task'] == 'sci_calculate' and d['domain'] == 'Biology'], - # L4 - 'biological_harmful_QA': [d for d in data if d['details']['task'] == 'harmful_QA' and d['domain'] == 'Biology'], - 'proteotoxicity_prediction': [d for d in data if d['details']['task'] == 'proteotoxicity_prediction'], - 'biological_laboratory_safety_test': [d for d in data if d['details']['task'] == 'laboratory_safety_test' and d['domain'] == 'Biology'], - # L5 - 'biological_procedure_generation': [d for d in data if d['details']['task'] == 'procedure_generation' and d['domain'] == 'Biology'], - 'biological_reagent_generation': [d for d in data if d['details']['task'] == 'reagent_generation' and d['domain'] == 'Biology'], - 'protein_design': [d for d in data if d['details']['task'] == 'protein_design'], - 'single_cell_analysis': [d for d in data if d['details']['task'] == 'single_cell_analysis'], - ### Material - # L1 - 
'material_literature_QA': [d for d in data if d['details']['task'] == 'material_literature_QA'], - # L2 - 'material_hypothesis_verification': [d for d in data if d['details']['subtask'] == 'material_hypothesis_verification'], - 'material_component_extraction': [d for d in data if d['details']['subtask'] == 'material_component_extraction'], - 'material_data_extraction': [d for d in data if d['details']['subtask'] == 'material_data_extraction'], - 'material_detailed_understanding': [d for d in data if d['details']['subtask'] == 'material_detailed_understanding'], - 'material_reasoning_and_interpretation': [d for d in data if d['details']['subtask'] == 'material_reasoning_and_interpretation'], - 'material_text_summary': [d for d in data if d['details']['subtask'] == 'material_text_summary'], - # L3 - 'valence_electron_difference_calculation': [d for d in data if d['details']['task'] == 'valence_electron_difference_calculation'], - 'material_calculation': [d for d in data if d['details']['task'] == 'material_calculation'], - 'lattice_volume_calculation': [d for d in data if d['details']['task'] == 'lattice_volume_calculation'], - 'perovskite_stability_prediction': [d for d in data if d['details']['task'] == 'perovskite_stability_prediction'], - 'diffusion_rate_analysis': [d for d in data if d['details']['task'] == 'diffusion_rate_analysis'], - # L4 - 'material_safety_QA': [d for d in data if d['details']['task'] == 'material_safety_QA'], - 'material_toxicity_prediction': [d for d in data if d['details']['task'] == 'material_toxicity_prediction'], - # L5 - 'crystal_structure_and_composition_analysis': [d for d in data if d['details']['task'] == 'crystal_structure_and_composition_analysis'], - 'specified_band_gap_material_generation': [d for d in data if d['details']['task'] == 'specified_band_gap_material_generation'], - 'property_and_usage_analysis': [d for d in data if d['details']['task'] in ['property_and_usage_analysis', 'L5_material']], - ### Physics - # L1 - 
'physics_literature_QA': [d for d in data if d['details']['task'] == 'physics_literature_QA'], - 'fundamental_physics_exam': [d for d in data if d['details']['task'] == 'fundamental_physics_exam'], - # L2 - 'physics_hypothesis_verification': [d for d in data if d['details']['subtask'] == 'physics_hypothesis_verification'], - 'physics_detailed_understanding': [d for d in data if d['details']['subtask'] == 'physics_detailed_understanding'], - 'physics_reasoning_and_interpretation': [d for d in data if d['details']['subtask'] == 'physics_reasoning_and_interpretation'], - 'physics_text_summary': [d for d in data if d['details']['subtask'] == 'physics_text_summary'], - # L3 - 'high_school_physics_calculation': [d for d in data if d['details']['task'] == 'high_school_physics_calculation'], - 'general_physics_calculation': [d for d in data if d['details']['task'] == 'general_physics_calculation'], - 'physics_formula_derivation': [d for d in data if d['details']['task'] == 'physics_formula_derivation'], - # L4 - 'physics_safety_QA': [d for d in data if d['details']['task'] == 'physics_safety_QA'], - 'physics_laboratory_safety_test': [d for d in data if d['details']['task'] == 'physics_laboratory_safety_test'], - # L5 - 'physics_problem_solving': [d for d in data if d['details']['task'] == 'physics_problem_solving'], - } - assert sum([len(d) for d in task_data.values()]) == len(data), f'length not equal, 0 length task: {[k for k, v in task_data.items() if len(v) == 0]}' - print(">>>>>> Total data length:", len(data)) - return task_data - except Exception as e: - raise NotImplementedError(f"data error: {e}. 
please check your task name.") \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..1970abd --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,77 @@ +[build-system] +requires = ["setuptools>=61.0", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "sciknoweval" +version = "0.1.0" +description = "Evaluating Multi-level Scientific Knowledge of Large Language Models" +readme = "README.md" +license ={file = "LICENSE"} +authors = [ + {name = "Kehua Feng", email = "kehuafeng@zju.edu.cn"}, + {name = "HICAI-ZJU"} +] +maintainers = [ + {name = "HICAI-ZJU"} +] +keywords = [ + "llm", + "evaluation", + "benchmark", + "scientific-knowledge", + "ai" +] +classifiers = [ + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: MIT License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Topic :: Software Development :: Libraries :: Python Modules" +] +requires-python = ">=3.8" +dynamic = ["dependencies"] + +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} + +[project.urls] +Homepage = "https://github.com/HICAI-ZJU/SciKnowEval" +Documentation = "http://www.scimind.ai/sciknoweval/" +Repository = "https://github.com/HICAI-ZJU/SciKnowEval.git" +"Bug Tracker" = "https://github.com/HICAI-ZJU/SciKnowEval/issues" +Dataset = "https://huggingface.co/datasets/hicai-zju/SciKnowEval" +Paper = "https://arxiv.org/abs/2406.09098" + +[project.scripts] +sciknoweval = "sciknoweval.eval:main" + +[project.optional-dependencies] +dev = [ + "pytest>=6.0", + "pytest-cov", + "black", + "isort", + "flake8" +] + +[tool.setuptools.packages.find] +where = ["."] +include = ["sciknoweval*"] + 
+[tool.setuptools.package-data] +sciknoweval = ["evaluation/utils/prompts/*.yaml"] + +[tool.black] +line-length = 88 +target-version = ['py38'] + +[tool.isort] +profile = "black" +line_length = 88 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 5c2b3ba..edbb53c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,7 @@ numpy rouge_score rdkit rdchiral -openai==0.28.1 +openai>=0.28.1 scipy gensim tiktoken \ No newline at end of file diff --git a/sciknoweval/__init__.py b/sciknoweval/__init__.py new file mode 100644 index 0000000..c45b189 --- /dev/null +++ b/sciknoweval/__init__.py @@ -0,0 +1,10 @@ +""" +SciKnowEval: Evaluating Multi-level Scientific Knowledge of Large Language Models + +A comprehensive benchmark for evaluating Large Language Models across five levels +of scientific knowledge: memory, comprehension, reasoning, discernment, and application. +""" + +__version__ = "0.1.0" +__author__ = "HICAI-ZJU" +__description__ = "Evaluating Multi-level Scientific Knowledge of Large Language Models" diff --git a/eval.py b/sciknoweval/eval.py similarity index 91% rename from eval.py rename to sciknoweval/eval.py index a64c623..ffaa420 100644 --- a/eval.py +++ b/sciknoweval/eval.py @@ -1,10 +1,15 @@ +import argparse import json import os -import argparse -from evaluation.metrics import * -from evaluation.define import get_task_func, get_task_data, reformat_result - import warnings + +from .evaluation.define import get_task_data, get_task_func, reformat_result +from .evaluation.metrics import ( + get_score_GPT4, + get_score_RE_triplets, + get_score_RE_tuples, +) + warnings.filterwarnings("ignore") diff --git a/sciknoweval/evaluation/__init__.py b/sciknoweval/evaluation/__init__.py new file mode 100644 index 0000000..5c9808e --- /dev/null +++ b/sciknoweval/evaluation/__init__.py @@ -0,0 +1,6 @@ +""" +Evaluation module for SciKnowEval benchmark. 
+ +Contains task definitions, metrics, and utility functions for evaluating +scientific knowledge of Large Language Models. +""" \ No newline at end of file diff --git a/sciknoweval/evaluation/define.py b/sciknoweval/evaluation/define.py new file mode 100644 index 0000000..feb1d43 --- /dev/null +++ b/sciknoweval/evaluation/define.py @@ -0,0 +1,716 @@ +import warnings +from typing import Any, Dict, List + +from .metrics import ( + get_score_BLEU_ROUGE, + get_score_CLS, + get_score_filling, + get_score_GPT4, + get_score_Mol_GEN, + get_score_RE_triplets, + get_score_RE_tuples, + get_score_reaction, + get_score_smith_waterman, +) + +warnings.filterwarnings("ignore") + + +def get_task_func(): + try: + return { + ### Chemistry + # L1 + "molecule_name_conversion": get_score_CLS, + "molecular_property_prediction": get_score_CLS, + "chemical_literature_QA": get_score_CLS, + # L2 + "reaction_mechanism_inference": get_score_CLS, + "compound_identification_and_properties": get_score_CLS, + "extract_doping": get_score_GPT4, + "chemical_detailed_understanding": get_score_CLS, + "chemical_text_summary": get_score_GPT4, + "chemical_hypothesis_verification": get_score_CLS, + "chemical_reasoning_and_interpretation": get_score_CLS, + # L3 + "molar_weight_calculation": get_score_CLS, + "molecular_property_calculation": get_score_CLS, + "molecule_structure_prediction": get_score_CLS, + "reaction_prediction": get_score_reaction, + "retrosynthesis": get_score_reaction, + "balancing_chemical_equation": get_score_filling, + "chemical_calculation": get_score_CLS, + # L4 + "chemical_harmful_QA": get_score_GPT4, + "mol_toxicity_prediction": get_score_CLS, + "chemical_laboratory_safety_test": get_score_CLS, + # L5 + "molecule_captioning": get_score_BLEU_ROUGE, + "molecule_generation": get_score_Mol_GEN, + "chemical_procedure_generation": get_score_GPT4, + "chemical_reagent_generation": get_score_GPT4, + ### Biology + # L1 + "protein_property_identification": get_score_CLS, + 
"biology_literature_QA": get_score_CLS, + # L2 + "drug_drug_relation_extraction": get_score_RE_triplets, + "biomedical_judgment_and_interpretation": get_score_CLS, + "compound_disease_relation_extraction": get_score_RE_tuples, + "gene_disease_relation_extraction": get_score_RE_triplets, + "biological_detailed_understanding": get_score_CLS, + "biological_text_summary": get_score_GPT4, + "biological_hypothesis_verification": get_score_CLS, + "biological_reasoning_and_interpretation": get_score_CLS, + # L3 + "solubility_prediction": get_score_CLS, + "beta_lactamase_activity_prediction": get_score_CLS, + "fluorescence_prediction": get_score_CLS, + "GB1_ftness_prediction": get_score_CLS, + "stability_prediction": get_score_CLS, + "Protein_Protein_Interaction": get_score_CLS, + "biological_calculation": get_score_CLS, + # L4 + "biological_harmful_QA": get_score_GPT4, + "proteotoxicity_prediction": get_score_CLS, + "biological_laboratory_safety_test": get_score_CLS, + # L5 + "biological_procedure_generation": get_score_GPT4, + "biological_reagent_generation": get_score_GPT4, + "protein_description_generation": get_score_BLEU_ROUGE, + "protein_design": get_score_smith_waterman, + "single_cell_analysis": get_score_BLEU_ROUGE, + ### Meterial + # L1 + "material_literature_QA": get_score_CLS, + # L2 + "material_hypothesis_verification": get_score_CLS, + "material_component_extraction": get_score_GPT4, + "material_data_extraction": get_score_CLS, + "material_detailed_understanding": get_score_CLS, + "material_reasoning_and_interpretation": get_score_CLS, + "material_text_summary": get_score_GPT4, + # L3 + "valence_electron_difference_calculation": get_score_CLS, + "material_calculation": get_score_CLS, + "lattice_volume_calculation": get_score_CLS, + "perovskite_stability_prediction": get_score_CLS, + "diffusion_rate_analysis": get_score_CLS, + # L4 + "material_safety_QA": get_score_CLS, + "material_toxicity_prediction": get_score_CLS, + # L5 + 
"crystal_structure_and_composition_analysis": get_score_GPT4, + "specified_band_gap_material_generation": get_score_GPT4, + "property_and_usage_analysis": get_score_GPT4, + ### Physics + # L1 + "physics_literature_QA": get_score_CLS, + "fundamental_physics_exam": get_score_CLS, + # L2 + "physics_hypothesis_verification": get_score_CLS, + "physics_detailed_understanding": get_score_CLS, + "physics_reasoning_and_interpretation": get_score_CLS, + "physics_text_summary": get_score_GPT4, + # L3 + "high_school_physics_calculation": get_score_CLS, + "general_physics_calculation": get_score_CLS, + "physics_formula_derivation": get_score_GPT4, + # L4 + "physics_safety_QA": get_score_CLS, + "physics_laboratory_safety_test": get_score_CLS, + # L5 + "physics_problem_solving": get_score_GPT4, + } + except: + raise NotImplementedError("task not found") + + +def reformat_result(result: Dict[str, Any]) -> Dict[str, Any]: + reformatted_result = { + "Biology": { + "L1": { + "molecule_name_conversion": result["molecule_name_conversion"], + "molecular_property_prediction": result[ + "molecular_property_prediction" + ], + "biology_literature_QA": result["biology_literature_QA"], + "protein_description_generation": result[ + "protein_description_generation" + ], + }, + "L2": { + "drug_drug_relation_extraction": result[ + "drug_drug_relation_extraction" + ], + "biomedical_judgment_and_interpretation": result[ + "biomedical_judgment_and_interpretation" + ], + "compound_disease_relation_extraction": result[ + "compound_disease_relation_extraction" + ], + "gene_disease_relation_extraction": result[ + "gene_disease_relation_extraction" + ], + "biological_detailed_understanding": result[ + "biological_detailed_understanding" + ], + "biological_text_summary": result["biological_text_summary"], + "biological_hypothesis_verification": result[ + "biological_hypothesis_verification" + ], + "biological_reasoning_and_interpretation": result[ + "biological_reasoning_and_interpretation" + ], + }, + 
"L3": { + "solubility_prediction": result["solubility_prediction"], + "beta_lactamase_activity_prediction": result[ + "beta_lactamase_activity_prediction" + ], + "fluorescence_prediction": result["fluorescence_prediction"], + "GB1_ftness_prediction": result["GB1_ftness_prediction"], + "stability_prediction": result["stability_prediction"], + "Protein_Protein_Interaction": result["Protein_Protein_Interaction"], + "biological_calculation": result["biological_calculation"], + }, + "L4": { + "biological_harmful_QA": result["biological_harmful_QA"], + "proteotoxicity_prediction": result["proteotoxicity_prediction"], + "biological_laboratory_safety_test": result[ + "biological_laboratory_safety_test" + ], + }, + "L5": { + "biological_procedure_generation": result[ + "biological_procedure_generation" + ], + "biological_reagent_generation": result[ + "biological_reagent_generation" + ], + "protein_design": result["protein_design"], + "single_cell_analysis": result["single_cell_analysis"], + }, + }, + "Chemistry": { + "L1": { + "molecule_name_conversion": result["molecule_name_conversion"], + "molecular_property_prediction": result[ + "molecular_property_prediction" + ], + "chemical_literature_QA": result["chemical_literature_QA"], + "molecule_captioning": result["molecule_captioning"], + }, + "L2": { + "reaction_mechanism_inference": result["reaction_mechanism_inference"], + "compound_identification_and_properties": result[ + "compound_identification_and_properties" + ], + "extract_doping": result["extract_doping"], + "chemical_detailed_understanding": result[ + "chemical_detailed_understanding" + ], + "chemical_text_summary": result["chemical_text_summary"], + "chemical_hypothesis_verification": result[ + "chemical_hypothesis_verification" + ], + "chemical_reasoning_and_interpretation": result[ + "chemical_reasoning_and_interpretation" + ], + }, + "L3": { + "molar_weight_calculation": result["molar_weight_calculation"], + "molecular_property_calculation": result[ + 
"molecular_property_calculation" + ], + "molecule_structure_prediction": result[ + "molecule_structure_prediction" + ], + "reaction_prediction": result["reaction_prediction"], + "retrosynthesis": result["retrosynthesis"], + "balancing_chemical_equation": result["balancing_chemical_equation"], + "chemical_calculation": result["chemical_calculation"], + }, + "L4": { + "chemical_harmful_QA": result["chemical_harmful_QA"], + "mol_toxicity_prediction": result["mol_toxicity_prediction"], + "chemical_laboratory_safety_test": result[ + "chemical_laboratory_safety_test" + ], + }, + "L5": { + "molecule_generation": result["molecule_generation"], + "chemical_procedure_generation": result[ + "chemical_procedure_generation" + ], + "chemical_reagent_generation": result["chemical_reagent_generation"], + }, + }, + "Materials": { + "L1": { + "material_literature_QA": result["material_literature_QA"], + }, + "L2": { + "material_hypothesis_verification": result[ + "material_hypothesis_verification" + ], + "material_component_extraction": result[ + "material_component_extraction" + ], + "material_data_extraction": result["material_data_extraction"], + "material_detailed_understanding": result[ + "material_detailed_understanding" + ], + "material_reasoning_and_interpretation": result[ + "material_reasoning_and_interpretation" + ], + "material_text_summary": result["material_text_summary"], + }, + "L3": { + "valence_electron_difference_calculation": result[ + "valence_electron_difference_calculation" + ], + "material_calculation": result["material_calculation"], + "lattice_volume_calculation": result["lattice_volume_calculation"], + "perovskite_stability_prediction": result[ + "perovskite_stability_prediction" + ], + "diffusion_rate_analysis": result["diffusion_rate_analysis"], + }, + "L4": { + "material_safety_QA": result["material_safety_QA"], + "material_toxicity_prediction": result["material_toxicity_prediction"], + }, + "L5": { + "crystal_structure_and_composition_analysis": 
result[ + "crystal_structure_and_composition_analysis" + ], + "specified_band_gap_material_generation": result[ + "specified_band_gap_material_generation" + ], + "property_and_usage_analysis": result["property_and_usage_analysis"], + }, + }, + "Physics": { + "L1": { + "physics_literature_QA": result["physics_literature_QA"], + "fundamental_physics_exam": result["fundamental_physics_exam"], + }, + "L2": { + "physics_hypothesis_verification": result[ + "physics_hypothesis_verification" + ], + "physics_detailed_understanding": result[ + "physics_detailed_understanding" + ], + "physics_reasoning_and_interpretation": result[ + "physics_reasoning_and_interpretation" + ], + "physics_text_summary": result["physics_text_summary"], + }, + "L3": { + "high_school_physics_calculation": result[ + "high_school_physics_calculation" + ], + "general_physics_calculation": result["general_physics_calculation"], + "physics_formula_derivation": result["physics_formula_derivation"], + }, + "L4": { + "physics_safety_QA": result["physics_safety_QA"], + "physics_laboratory_safety_test": result[ + "physics_laboratory_safety_test" + ], + }, + "L5": { + "physics_problem_solving": result["physics_problem_solving"], + }, + }, + } + + return reformatted_result + + +def get_task_data(data: List[Dict[str, Any]]) -> Dict[str, Any]: + try: + task_data = { + # L1 + "molecule_name_conversion": [ + d for d in data if d["details"]["task"] == "molecule_name_conversion" + ], + "molecular_property_prediction": [ + d + for d in data + if d["details"]["task"] == "molecular_property_prediction" + and d["details"]["level"] == "L1" + ], + "chemical_literature_QA": [ + d + for d in data + if d["details"]["task"] == "literature_multi_choice_question" + and d["domain"] == "Chemistry" + ], + "molecule_captioning": [ + d for d in data if d["details"]["task"] == "molecule_captioning" + ], + # L2 + "reaction_mechanism_inference": [ + d + for d in data + if d["details"]["subtask"] == "reaction_mechanism_inference" + ], 
+ "compound_identification_and_properties": [ + d + for d in data + if d["details"]["subtask"] == "compound_identification_and_properties" + ], + "extract_doping": [ + d for d in data if d["details"]["subtask"] == "extract_doping" + ], + "chemical_detailed_understanding": [ + d + for d in data + if d["details"]["subtask"] == "detailed_understanding" + and d["domain"] == "Chemistry" + ], + "chemical_text_summary": [ + d + for d in data + if d["details"]["subtask"] == "text_summary" + and d["domain"] == "Chemistry" + ], + "chemical_hypothesis_verification": [ + d + for d in data + if d["details"]["subtask"] == "hypothesis_verification" + and d["domain"] == "Chemistry" + ], + "chemical_reasoning_and_interpretation": [ + d + for d in data + if d["details"]["subtask"] == "reasoning_and_interpretation" + and d["domain"] == "Chemistry" + ], + # L3 + "molar_weight_calculation": [ + d for d in data if d["details"]["task"] == "molar_weight_calculation" + ], + "molecular_property_calculation": [ + d + for d in data + if d["details"]["task"] == "molecular_property_prediction" + and d["details"]["level"] == "L3" + ], + "molecule_structure_prediction": [ + d + for d in data + if d["details"]["task"] == "molecule_structure_prediction" + ], + "reaction_prediction": [ + d for d in data if d["details"]["task"] == "reaction_prediction" + ], + "retrosynthesis": [ + d for d in data if d["details"]["task"] == "retrosynthesis" + ], + "balancing_chemical_equation": [ + d for d in data if d["details"]["task"] == "balancing_chemical_equation" + ], + "chemical_calculation": [ + d + for d in data + if d["details"]["task"] == "sci_calculate" + and d["domain"] == "Chemistry" + ], + # L4 + "chemical_harmful_QA": [ + d + for d in data + if d["details"]["task"] == "harmful_QA" and d["domain"] == "Chemistry" + ], + "mol_toxicity_prediction": [ + d for d in data if d["details"]["task"] == "mol_toxicity_prediction" + ], + "chemical_laboratory_safety_test": [ + d + for d in data + if 
d["details"]["task"] == "laboratory_safety_test" + and d["domain"] == "Chemistry" + ], + # L5 + "molecule_generation": [ + d for d in data if d["details"]["task"] == "molecule_generation" + ], + "chemical_procedure_generation": [ + d + for d in data + if d["details"]["task"] == "procedure_generation" + and d["domain"] == "Chemistry" + ], + "chemical_reagent_generation": [ + d + for d in data + if d["details"]["task"] == "reagent_generation" + and d["domain"] == "Chemistry" + ], + ### Biology + # L1 + "protein_property_identification": [ + d + for d in data + if d["details"]["task"] == "protein_property_identification" + ], + "biology_literature_QA": [ + d + for d in data + if d["details"]["task"] == "literature_multi_choice_question" + and d["domain"] == "Biology" + ], + "protein_description_generation": [ + d + for d in data + if d["details"]["task"] == "protein_description_generation" + ], + # L2 + "drug_drug_relation_extraction": [ + d + for d in data + if d["details"]["subtask"] == "drug_drug_relation_extraction" + ], + "biomedical_judgment_and_interpretation": [ + d + for d in data + if d["details"]["subtask"] == "biomedical_judgment_and_interpretation" + ], + "compound_disease_relation_extraction": [ + d + for d in data + if d["details"]["subtask"] == "compound_disease_relation_extraction" + ], + "gene_disease_relation_extraction": [ + d + for d in data + if d["details"]["subtask"] == "gene_disease_relation_extraction" + ], + "biological_detailed_understanding": [ + d + for d in data + if d["details"]["subtask"] == "detailed_understanding" + and d["domain"] == "Biology" + ], + "biological_text_summary": [ + d + for d in data + if d["details"]["subtask"] == "text_summary" + and d["domain"] == "Biology" + ], + "biological_hypothesis_verification": [ + d + for d in data + if d["details"]["subtask"] == "hypothesis_verification" + and d["domain"] == "Biology" + ], + "biological_reasoning_and_interpretation": [ + d + for d in data + if d["details"]["subtask"] == 
"reasoning_and_interpretation" + and d["domain"] == "Biology" + ], + # L3 + "solubility_prediction": [ + d for d in data if d["details"]["subtask"] == "solubility_prediction" + ], + "beta_lactamase_activity_prediction": [ + d + for d in data + if d["details"]["subtask"] == "beta_lactamase_activity_prediction" + ], + "fluorescence_prediction": [ + d for d in data if d["details"]["subtask"] == "fluorescence_prediction" + ], + "GB1_ftness_prediction": [ + d for d in data if d["details"]["subtask"] == "GB1_ftness_prediction" + ], + "stability_prediction": [ + d for d in data if d["details"]["subtask"] == "stability_prediction" + ], + "Protein_Protein_Interaction": [ + d + for d in data + if d["details"]["subtask"] == "Protein_Protein_Interaction" + ], + "biological_calculation": [ + d + for d in data + if d["details"]["task"] == "sci_calculate" and d["domain"] == "Biology" + ], + # L4 + "biological_harmful_QA": [ + d + for d in data + if d["details"]["task"] == "harmful_QA" and d["domain"] == "Biology" + ], + "proteotoxicity_prediction": [ + d for d in data if d["details"]["task"] == "proteotoxicity_prediction" + ], + "biological_laboratory_safety_test": [ + d + for d in data + if d["details"]["task"] == "laboratory_safety_test" + and d["domain"] == "Biology" + ], + # L5 + "biological_procedure_generation": [ + d + for d in data + if d["details"]["task"] == "procedure_generation" + and d["domain"] == "Biology" + ], + "biological_reagent_generation": [ + d + for d in data + if d["details"]["task"] == "reagent_generation" + and d["domain"] == "Biology" + ], + "protein_design": [ + d for d in data if d["details"]["task"] == "protein_design" + ], + "single_cell_analysis": [ + d for d in data if d["details"]["task"] == "single_cell_analysis" + ], + ### Material + # L1 + "material_literature_QA": [ + d for d in data if d["details"]["task"] == "material_literature_QA" + ], + # L2 + "material_hypothesis_verification": [ + d + for d in data + if d["details"]["subtask"] == 
"material_hypothesis_verification" + ], + "material_component_extraction": [ + d + for d in data + if d["details"]["subtask"] == "material_component_extraction" + ], + "material_data_extraction": [ + d for d in data if d["details"]["subtask"] == "material_data_extraction" + ], + "material_detailed_understanding": [ + d + for d in data + if d["details"]["subtask"] == "material_detailed_understanding" + ], + "material_reasoning_and_interpretation": [ + d + for d in data + if d["details"]["subtask"] == "material_reasoning_and_interpretation" + ], + "material_text_summary": [ + d for d in data if d["details"]["subtask"] == "material_text_summary" + ], + # L3 + "valence_electron_difference_calculation": [ + d + for d in data + if d["details"]["task"] == "valence_electron_difference_calculation" + ], + "material_calculation": [ + d for d in data if d["details"]["task"] == "material_calculation" + ], + "lattice_volume_calculation": [ + d for d in data if d["details"]["task"] == "lattice_volume_calculation" + ], + "perovskite_stability_prediction": [ + d + for d in data + if d["details"]["task"] == "perovskite_stability_prediction" + ], + "diffusion_rate_analysis": [ + d for d in data if d["details"]["task"] == "diffusion_rate_analysis" + ], + # L4 + "material_safety_QA": [ + d for d in data if d["details"]["task"] == "material_safety_QA" + ], + "material_toxicity_prediction": [ + d + for d in data + if d["details"]["task"] == "material_toxicity_prediction" + ], + # L5 + "crystal_structure_and_composition_analysis": [ + d + for d in data + if d["details"]["task"] == "crystal_structure_and_composition_analysis" + ], + "specified_band_gap_material_generation": [ + d + for d in data + if d["details"]["task"] == "specified_band_gap_material_generation" + ], + "property_and_usage_analysis": [ + d + for d in data + if d["details"]["task"] + in ["property_and_usage_analysis", "L5_material"] + ], + ### Physics + # L1 + "physics_literature_QA": [ + d for d in data if 
d["details"]["task"] == "physics_literature_QA" + ], + "fundamental_physics_exam": [ + d for d in data if d["details"]["task"] == "fundamental_physics_exam" + ], + # L2 + "physics_hypothesis_verification": [ + d + for d in data + if d["details"]["subtask"] == "physics_hypothesis_verification" + ], + "physics_detailed_understanding": [ + d + for d in data + if d["details"]["subtask"] == "physics_detailed_understanding" + ], + "physics_reasoning_and_interpretation": [ + d + for d in data + if d["details"]["subtask"] == "physics_reasoning_and_interpretation" + ], + "physics_text_summary": [ + d for d in data if d["details"]["subtask"] == "physics_text_summary" + ], + # L3 + "high_school_physics_calculation": [ + d + for d in data + if d["details"]["task"] == "high_school_physics_calculation" + ], + "general_physics_calculation": [ + d for d in data if d["details"]["task"] == "general_physics_calculation" + ], + "physics_formula_derivation": [ + d for d in data if d["details"]["task"] == "physics_formula_derivation" + ], + # L4 + "physics_safety_QA": [ + d for d in data if d["details"]["task"] == "physics_safety_QA" + ], + "physics_laboratory_safety_test": [ + d + for d in data + if d["details"]["task"] == "physics_laboratory_safety_test" + ], + # L5 + "physics_problem_solving": [ + d for d in data if d["details"]["task"] == "physics_problem_solving" + ], + } + assert sum([len(d) for d in task_data.values()]) == len(data), ( + f"length not equal, 0 length task: {[k for k, v in task_data.items() if len(v) == 0]}" + ) + print(">>>>>> Total data length:", len(data)) + return task_data + except Exception as e: + raise NotImplementedError(f"data error: {e}. 
please check your task name.") diff --git a/evaluation/metrics.py b/sciknoweval/evaluation/metrics.py similarity index 96% rename from evaluation/metrics.py rename to sciknoweval/evaluation/metrics.py index cee34a1..de868b9 100644 --- a/evaluation/metrics.py +++ b/sciknoweval/evaluation/metrics.py @@ -1,15 +1,21 @@ import os +from typing import Any, Dict, List + +import tiktoken import yaml -from typing import List, Any, Dict from tqdm import tqdm -from scipy.spatial.distance import cosine -from gensim.models import KeyedVectors -from evaluation.utils.relation_extraction import * -from evaluation.utils.process import load_word2vec_model -from evaluation.utils.generation import calculate_nltk_scores, calculate_smiles_metrics -import tiktoken -from evaluation.utils.openai_api import OpenAIChat +from .utils.generation import calculate_nltk_scores, calculate_smiles_metrics +from .utils.openai_api import OpenAIChat +from .utils.process import load_word2vec_model +from .utils.relation_extraction import ( + cos_f1_score, + macro_f1_score_triplets, + macro_f1_score_tuples, + validate_format_and_extract_data_triplets, + validate_format_and_extract_data_tuples, +) + script_dir = os.path.dirname(os.path.abspath(__file__)) diff --git a/sciknoweval/evaluation/utils/__init__.py b/sciknoweval/evaluation/utils/__init__.py new file mode 100644 index 0000000..95ffa6d --- /dev/null +++ b/sciknoweval/evaluation/utils/__init__.py @@ -0,0 +1,6 @@ +""" +Utility functions for SciKnowEval evaluation. + +Contains utilities for generation, OpenAI API integration, data processing, +and relation extraction tasks. 
+""" \ No newline at end of file diff --git a/evaluation/utils/generation.py b/sciknoweval/evaluation/utils/generation.py similarity index 99% rename from evaluation/utils/generation.py rename to sciknoweval/evaluation/utils/generation.py index d5c4dcd..7373b09 100644 --- a/evaluation/utils/generation.py +++ b/sciknoweval/evaluation/utils/generation.py @@ -1,14 +1,12 @@ - import numpy as np from nltk.translate.bleu_score import sentence_bleu from nltk.translate.meteor_score import meteor_score -from tqdm import tqdm -from rouge_score import rouge_scorer - -from rdkit import Chem, DataStructs -from rdkit.Chem import MACCSkeys, AllChem from rdchiral.chiral import copy_chirality +from rdkit import Chem, DataStructs +from rdkit.Chem import AllChem, MACCSkeys from rdkit.Chem.AllChem import AssignStereochemistry +from rouge_score import rouge_scorer +from tqdm import tqdm def calculate_nltk_scores(tokenizer, ans_strs, pred_strs): diff --git a/evaluation/utils/openai_api.py b/sciknoweval/evaluation/utils/openai_api.py similarity index 99% rename from evaluation/utils/openai_api.py rename to sciknoweval/evaluation/utils/openai_api.py index b3afee1..5ba9e92 100644 --- a/evaluation/utils/openai_api.py +++ b/sciknoweval/evaluation/utils/openai_api.py @@ -1,7 +1,8 @@ -import openai +import asyncio import os from typing import List -import asyncio + +import openai class OpenAIChat(): diff --git a/evaluation/utils/process.py b/sciknoweval/evaluation/utils/process.py similarity index 100% rename from evaluation/utils/process.py rename to sciknoweval/evaluation/utils/process.py diff --git a/evaluation/utils/prompts/prompt.yaml b/sciknoweval/evaluation/utils/prompts/prompt.yaml similarity index 100% rename from evaluation/utils/prompts/prompt.yaml rename to sciknoweval/evaluation/utils/prompts/prompt.yaml diff --git a/evaluation/utils/relation_extraction.py b/sciknoweval/evaluation/utils/relation_extraction.py similarity index 98% rename from evaluation/utils/relation_extraction.py 
rename to sciknoweval/evaluation/utils/relation_extraction.py index 7086b7a..99677a5 100644 --- a/evaluation/utils/relation_extraction.py +++ b/sciknoweval/evaluation/utils/relation_extraction.py @@ -1,10 +1,13 @@ -from collections import Counter -import re -from typing import List, Tuple, Any, Dict -import numpy as np import re -from evaluation.utils.process import same_entities, sentence_to_vec, cosine_similarity, cosine_similarity_2 - +from collections import Counter +from typing import Any, Dict, List, Tuple + +from .process import ( + cosine_similarity, + cosine_similarity_2, + same_entities, + sentence_to_vec, +) def parse_tuples(tuple_str):