diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..d811561
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,165 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*.pyc
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a Python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other information into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# IDEs
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+
+# OS generated files
+.DS_Store
+.DS_Store?
+._*
+.Spotlight-V100
+.Trashes
+ehthumbs.db
+Thumbs.db
+
+# Project specific
+outputs.json
+*.bin
+*.gz
+GoogleNews-vectors-negative300.bin
+GoogleNews-vectors-negative300.bin.gz
+
+# Temporary files
+*.tmp
+*.temp
+
+# Model files and data
+models/
+data/
+*.model
+*.pkl
+*.pickle
+
+# Results and outputs
+results/
+outputs/
+logs/
+*.log
+
+# API keys
+.env
+.env.local
+.env.*.local
+api_keys.txt
+config.json
+
+# AI settings
+.claude/
diff --git a/README.md b/README.md
index dd1bd43..a9977e7 100644
--- a/README.md
+++ b/README.md
@@ -75,18 +75,31 @@ The <b>Sci</b>entific <b>Know</b>ledge <b>Eval</b>uation (<b>SciKnowEval</b>) be
 <h2 id="3">🏹 QuickStart</h2>
 <h3 id="3.1">⬇️ Step 1: Installation</h3>
 
-To evaluate LLMs on SciKnowEval, first clone the repository:
+**Option 1: pip install from GitHub (Recommended)**
+```bash
+pip install git+https://github.com/HICAI-ZJU/SciKnowEval.git
+```
+
+**Option 2: Install from Source**
 ```bash
 git clone https://github.com/HICAI-ZJU/SciKnowEval.git
 cd SciKnowEval
+pip install .
 ```
-Next, set up a conda environment to manage the dependencies:
+
+**Option 3: Development Installation**
 ```bash
-conda create -n sciknoweval python=3.10.9
-conda activate sciknoweval
+git clone https://github.com/HICAI-ZJU/SciKnowEval.git
+cd SciKnowEval
+pip install -e .
 ```
-Then, install the required dependencies:
+
+**Option 4: Manual Setup (Legacy)**
 ```bash
+git clone https://github.com/HICAI-ZJU/SciKnowEval.git
+cd SciKnowEval
+conda create -n sciknoweval python=3.10.9
+conda activate sciknoweval
 pip install -r requirements.txt
 ```
 
@@ -152,7 +165,31 @@ By following these guidelines, you can effectively use the SciKnowEval benchmark
 
 <h3 id="3.4">🚀 Step 4: Evaluate</h3>
 
-You can run `eval.py` to evaluate your model:
+**Option 1: Using the Command Line Interface (Recommended)**
+
+After installing SciKnowEval, you can use the `sciknoweval` command:
+
+```bash
+export OPENAI_API_KEY="YOUR_API_KEY"
+sciknoweval \
+  --data_path "your/model/predictions.json" \
+  --word2vec_model_path "path/to/GoogleNews-vectors-negative300.bin" \
+  --gen_evaluator "gpt-4o" \
+  --output_path "path/to/your/output.json"
+```
+
+**Option 2: Using Python Module**
+
+```bash
+export OPENAI_API_KEY="YOUR_API_KEY"
+python -m sciknoweval.eval \
+  --data_path "your/model/predictions.json" \
+  --word2vec_model_path "path/to/GoogleNews-vectors-negative300.bin" \
+  --gen_evaluator "gpt-4o" \
+  --output_path "path/to/your/output.json"
+```
+
+**Option 3: Direct Script Execution (Legacy)**
 
 ```bash
 data_path="your/model/predictions.json"
@@ -161,7 +198,7 @@ gen_evaluator="gpt-4o" # the correct model name in OpenAI
 output_path="path/to/your/output.json"
 
 export OPENAI_API_KEY="YOUR_API_KEY"
-python eval.py \
+python -m sciknoweval.eval \
   --data_path $data_path \
   --word2vec_model_path $word2vec_model_path \
   --gen_evaluator $gen_evaluator \
diff --git a/evaluation/define.py b/evaluation/define.py
deleted file mode 100644
index bc3c7e9..0000000
--- a/evaluation/define.py
+++ /dev/null
@@ -1,362 +0,0 @@
-import json
-import os
-import time
-from evaluation.metrics import *
-from typing import List, Tuple, Any, Dict
-import warnings
-from tqdm import tqdm
-warnings.filterwarnings("ignore")
-
-
-def get_task_func():
-    try:
-        return {
-            ### Chemistry
-            # L1
-            'molecule_name_conversion': get_score_CLS,
-            'molecular_property_prediction': get_score_CLS,
-            'chemical_literature_QA': get_score_CLS,
-            # L2
-            'reaction_mechanism_inference': get_score_CLS,
-            'compound_identification_and_properties': get_score_CLS,
-            'extract_doping': get_score_GPT4,
-            'chemical_detailed_understanding': get_score_CLS,
-            'chemical_text_summary': get_score_GPT4,
-            'chemical_hypothesis_verification': get_score_CLS,
-            'chemical_reasoning_and_interpretation': get_score_CLS,
-            # L3
-            'molar_weight_calculation': get_score_CLS,
-            'molecular_property_calculation': get_score_CLS,
-            'molecule_structure_prediction': get_score_CLS,
-            'reaction_prediction': get_score_reaction,
-            'retrosynthesis': get_score_reaction,
-            'balancing_chemical_equation': get_score_filling,
-            'chemical_calculation': get_score_CLS,
-            # L4
-            'chemical_harmful_QA': get_score_GPT4,
-            'mol_toxicity_prediction': get_score_CLS,
-            'chemical_laboratory_safety_test': get_score_CLS,
-            # L5
-            'molecule_captioning': get_score_BLEU_ROUGE,
-            'molecule_generation': get_score_Mol_GEN,
-            'chemical_procedure_generation': get_score_GPT4,
-            'chemical_reagent_generation': get_score_GPT4,
-            ### Biology
-            # L1
-            'protein_property_identification': get_score_CLS,
-            'biology_literature_QA': get_score_CLS,
-            # L2
-            'drug_drug_relation_extraction': get_score_RE_triplets,
-            'biomedical_judgment_and_interpretation': get_score_CLS,
-            'compound_disease_relation_extraction': get_score_RE_tuples,
-            'gene_disease_relation_extraction': get_score_RE_triplets,
-            'biological_detailed_understanding': get_score_CLS,
-            'biological_text_summary': get_score_GPT4,
-            'biological_hypothesis_verification': get_score_CLS,
-            'biological_reasoning_and_interpretation': get_score_CLS,
-            # L3
-            'solubility_prediction': get_score_CLS,
-            'beta_lactamase_activity_prediction': get_score_CLS,
-            'fluorescence_prediction': get_score_CLS,
-            'GB1_ftness_prediction': get_score_CLS,
-            'stability_prediction': get_score_CLS,
-            'Protein_Protein_Interaction': get_score_CLS,
-            'biological_calculation': get_score_CLS,
-            # L4
-            'biological_harmful_QA': get_score_GPT4,
-            'proteotoxicity_prediction': get_score_CLS,
-            'biological_laboratory_safety_test': get_score_CLS,
-            # L5
-            'biological_procedure_generation': get_score_GPT4,
-            'biological_reagent_generation': get_score_GPT4,
-            'protein_description_generation': get_score_BLEU_ROUGE,
-            'protein_design': get_score_smith_waterman,
-            'single_cell_analysis': get_score_BLEU_ROUGE,
-            ### Meterial
-            # L1
-            'material_literature_QA': get_score_CLS,
-            # L2
-            'material_hypothesis_verification': get_score_CLS,
-            'material_component_extraction': get_score_GPT4,
-            'material_data_extraction': get_score_CLS,
-            'material_detailed_understanding': get_score_CLS,
-            'material_reasoning_and_interpretation': get_score_CLS,
-            'material_text_summary': get_score_GPT4,
-            # L3
-            'valence_electron_difference_calculation': get_score_CLS,
-            'material_calculation': get_score_CLS,
-            'lattice_volume_calculation': get_score_CLS,
-            'perovskite_stability_prediction': get_score_CLS,
-            'diffusion_rate_analysis': get_score_CLS,
-            # L4
-            'material_safety_QA': get_score_CLS,
-            'material_toxicity_prediction': get_score_CLS,
-            # L5
-            'crystal_structure_and_composition_analysis': get_score_GPT4,
-            'specified_band_gap_material_generation': get_score_GPT4,
-            'property_and_usage_analysis': get_score_GPT4,
-            ### Physics
-            # L1
-            'physics_literature_QA': get_score_CLS,
-            'fundamental_physics_exam': get_score_CLS,
-            # L2
-            'physics_hypothesis_verification': get_score_CLS,
-            'physics_detailed_understanding': get_score_CLS,
-            'physics_reasoning_and_interpretation': get_score_CLS,
-            'physics_text_summary': get_score_GPT4,
-            # L3
-            'high_school_physics_calculation': get_score_CLS,
-            'general_physics_calculation': get_score_CLS,
-            'physics_formula_derivation': get_score_GPT4,
-            # L4
-            'physics_safety_QA': get_score_CLS,
-            'physics_laboratory_safety_test': get_score_CLS,
-            # L5
-            'physics_problem_solving': get_score_GPT4,
-        }
-    except:
-        raise NotImplementedError("task not found")
-
-def reformat_result(result: Dict[str, Any]) -> Dict[str, Any]:
-    reformatted_result = {
-        'Biology': {
-            'L1': {
-                'molecule_name_conversion': result['molecule_name_conversion'],
-                'molecular_property_prediction': result['molecular_property_prediction'],
-                'biology_literature_QA': result['biology_literature_QA'],
-                'protein_description_generation': result['protein_description_generation'],
-            },
-            'L2': {
-                'drug_drug_relation_extraction': result['drug_drug_relation_extraction'],
-                'biomedical_judgment_and_interpretation': result['biomedical_judgment_and_interpretation'],
-                'compound_disease_relation_extraction': result['compound_disease_relation_extraction'],
-                'gene_disease_relation_extraction': result['gene_disease_relation_extraction'],
-                'biological_detailed_understanding': result['biological_detailed_understanding'],
-                'biological_text_summary': result['biological_text_summary'],
-                'biological_hypothesis_verification': result['biological_hypothesis_verification'],
-                'biological_reasoning_and_interpretation': result['biological_reasoning_and_interpretation'],
-            },
-            'L3': {
-                'solubility_prediction': result['solubility_prediction'],
-                'beta_lactamase_activity_prediction': result['beta_lactamase_activity_prediction'],
-                'fluorescence_prediction': result['fluorescence_prediction'],
-                'GB1_ftness_prediction': result['GB1_ftness_prediction'],
-                'stability_prediction': result['stability_prediction'],
-                'Protein_Protein_Interaction': result['Protein_Protein_Interaction'],
-                'biological_calculation': result['biological_calculation'],
-            },
-            'L4': {
-                'biological_harmful_QA': result['biological_harmful_QA'],
-                'proteotoxicity_prediction': result['proteotoxicity_prediction'],
-                'biological_laboratory_safety_test': result['biological_laboratory_safety_test'],
-            },
-            'L5': {
-                'biological_procedure_generation': result['biological_procedure_generation'],
-                'biological_reagent_generation': result['biological_reagent_generation'],
-                'protein_design': result['protein_design'],
-                'single_cell_analysis': result['single_cell_analysis'],
-            },
-        },
-        'Chemistry': {
-            'L1': {
-                'molecule_name_conversion': result['molecule_name_conversion'],
-                'molecular_property_prediction': result['molecular_property_prediction'],
-                'chemical_literature_QA': result['chemical_literature_QA'],
-                'molecule_captioning': result['molecule_captioning'],
-            },
-            'L2': {
-                'reaction_mechanism_inference': result['reaction_mechanism_inference'],
-                'compound_identification_and_properties': result['compound_identification_and_properties'],
-                'extract_doping': result['extract_doping'],
-                'chemical_detailed_understanding': result['chemical_detailed_understanding'],
-                'chemical_text_summary': result['chemical_text_summary'],
-                'chemical_hypothesis_verification': result['chemical_hypothesis_verification'],
-                'chemical_reasoning_and_interpretation': result['chemical_reasoning_and_interpretation'],
-            },
-            'L3': {
-                'molar_weight_calculation': result['molar_weight_calculation'],
-                'molecular_property_calculation': result['molecular_property_calculation'],
-                'molecule_structure_prediction': result['molecule_structure_prediction'],
-                'reaction_prediction': result['reaction_prediction'],
-                'retrosynthesis': result['retrosynthesis'],
-                'balancing_chemical_equation': result['balancing_chemical_equation'],
-                'chemical_calculation': result['chemical_calculation'],
-            },
-            'L4': {
-                'chemical_harmful_QA': result['chemical_harmful_QA'],
-                'mol_toxicity_prediction': result['mol_toxicity_prediction'],
-                'chemical_laboratory_safety_test': result['chemical_laboratory_safety_test'],
-            },
-            'L5': {
-                'molecule_generation': result['molecule_generation'],
-                'chemical_procedure_generation': result['chemical_procedure_generation'],
-                'chemical_reagent_generation': result['chemical_reagent_generation'],
-            },
-        },
-        'Materials': {
-            'L1': {
-                'material_literature_QA': result['material_literature_QA'],
-            },
-            'L2': {
-                'material_hypothesis_verification': result['material_hypothesis_verification'],
-                'material_component_extraction': result['material_component_extraction'],
-                'material_data_extraction': result['material_data_extraction'],
-                'material_detailed_understanding': result['material_detailed_understanding'],
-                'material_reasoning_and_interpretation': result['material_reasoning_and_interpretation'],
-                'material_text_summary': result['material_text_summary'],
-            },
-            'L3': {
-                'valence_electron_difference_calculation': result['valence_electron_difference_calculation'],
-                'material_calculation': result['material_calculation'],
-                'lattice_volume_calculation': result['lattice_volume_calculation'],
-                'perovskite_stability_prediction': result['perovskite_stability_prediction'],
-                'diffusion_rate_analysis': result['diffusion_rate_analysis'],
-            },
-            'L4': {
-                'material_safety_QA': result['material_safety_QA'],
-                'material_toxicity_prediction': result['material_toxicity_prediction'],
-            },
-            'L5': {
-                'crystal_structure_and_composition_analysis': result['crystal_structure_and_composition_analysis'],
-                'specified_band_gap_material_generation': result['specified_band_gap_material_generation'],
-                'property_and_usage_analysis': result['property_and_usage_analysis'],
-            },
-        },
-        'Physics': {
-            'L1': {
-                'physics_literature_QA': result['physics_literature_QA'],
-                'fundamental_physics_exam': result['fundamental_physics_exam'],
-            },
-            'L2': {
-                'physics_hypothesis_verification': result['physics_hypothesis_verification'],
-                'physics_detailed_understanding': result['physics_detailed_understanding'],
-                'physics_reasoning_and_interpretation': result['physics_reasoning_and_interpretation'],
-                'physics_text_summary': result['physics_text_summary'],
-            },
-            'L3': {
-                'high_school_physics_calculation': result['high_school_physics_calculation'],
-                'general_physics_calculation': result['general_physics_calculation'],
-                'physics_formula_derivation': result['physics_formula_derivation'],
-            },
-            'L4': {
-                'physics_safety_QA': result['physics_safety_QA'],
-                'physics_laboratory_safety_test': result['physics_laboratory_safety_test'],
-            },
-            'L5': {
-                'physics_problem_solving': result['physics_problem_solving'],
-            },
-        },
-    }
-
-    return reformatted_result
-
-def get_task_data(data: List[Dict[str, Any]]) -> Dict[str, Any]:
-    try:
-        task_data = {
-            # L1
-            'molecule_name_conversion': [d for d in data if d['details']['task'] == 'molecule_name_conversion'],
-            'molecular_property_prediction': [d for d in data if d['details']['task'] == 'molecular_property_prediction' and d['details']['level'] == 'L1'],
-            'chemical_literature_QA': [d for d in data if d['details']['task'] == 'literature_multi_choice_question' and d['domain'] == 'Chemistry'],
-            'molecule_captioning': [d for d in data if d['details']['task'] == 'molecule_captioning'],
-            # L2
-            'reaction_mechanism_inference': [d for d in data if d['details']['subtask'] == 'reaction_mechanism_inference'],
-            'compound_identification_and_properties': [d for d in data if d['details']['subtask'] == 'compound_identification_and_properties'],
-            'extract_doping': [d for d in data if d['details']['subtask'] == 'extract_doping'],
-            'chemical_detailed_understanding': [d for d in data if d['details']['subtask'] == 'detailed_understanding' and d['domain'] == 'Chemistry'],
-            'chemical_text_summary': [d for d in data if d['details']['subtask'] == 'text_summary' and d['domain'] == 'Chemistry'],
-            'chemical_hypothesis_verification': [d for d in data if d['details']['subtask'] == 'hypothesis_verification' and d['domain'] == 'Chemistry'],
-            'chemical_reasoning_and_interpretation': [d for d in data if d['details']['subtask'] == 'reasoning_and_interpretation' and d['domain'] == 'Chemistry'],
-            # L3
-            'molar_weight_calculation': [d for d in data if d['details']['task'] == 'molar_weight_calculation'],
-            'molecular_property_calculation': [d for d in data if d['details']['task'] == 'molecular_property_prediction' and d['details']['level'] == 'L3'],
-            'molecule_structure_prediction': [d for d in data if d['details']['task'] == 'molecule_structure_prediction'],
-            'reaction_prediction': [d for d in data if d['details']['task'] == 'reaction_prediction'],
-            'retrosynthesis': [d for d in data if d['details']['task'] == 'retrosynthesis'],
-            'balancing_chemical_equation': [d for d in data if d['details']['task'] == 'balancing_chemical_equation'],
-            'chemical_calculation': [d for d in data if d['details']['task'] == 'sci_calculate' and d['domain'] == 'Chemistry'],
-            # L4
-            'chemical_harmful_QA': [d for d in data if d['details']['task'] == 'harmful_QA' and d['domain'] == 'Chemistry'],
-            'mol_toxicity_prediction': [d for d in data if d['details']['task'] == 'mol_toxicity_prediction'],
-            'chemical_laboratory_safety_test': [d for d in data if d['details']['task'] == 'laboratory_safety_test' and d['domain'] == 'Chemistry'],
-            # L5
-            'molecule_generation': [d for d in data if d['details']['task'] == 'molecule_generation'],
-            'chemical_procedure_generation': [d for d in data if d['details']['task'] == 'procedure_generation' and d['domain'] == 'Chemistry'],
-            'chemical_reagent_generation': [d for d in data if d['details']['task'] == 'reagent_generation' and d['domain'] == 'Chemistry'],
-            ### Biology
-            # L1
-            'protein_property_identification': [d for d in data if d['details']['task'] == 'protein_property_identification'],
-            'biology_literature_QA': [d for d in data if d['details']['task'] == 'literature_multi_choice_question' and d['domain'] == 'Biology'],
-            'protein_description_generation': [d for d in data if d['details']['task'] == 'protein_description_generation'],
-            # L2
-            'drug_drug_relation_extraction': [d for d in data if d['details']['subtask'] == 'drug_drug_relation_extraction'],
-            'biomedical_judgment_and_interpretation': [d for d in data if d['details']['subtask'] == 'biomedical_judgment_and_interpretation'],
-            'compound_disease_relation_extraction': [d for d in data if d['details']['subtask'] == 'compound_disease_relation_extraction'],
-            'gene_disease_relation_extraction': [d for d in data if d['details']['subtask'] == 'gene_disease_relation_extraction'],
-            'biological_detailed_understanding': [d for d in data if d['details']['subtask'] == 'detailed_understanding' and d['domain'] == 'Biology'],
-            'biological_text_summary': [d for d in data if d['details']['subtask'] == 'text_summary' and d['domain'] == 'Biology'],
-            'biological_hypothesis_verification': [d for d in data if d['details']['subtask'] == 'hypothesis_verification' and d['domain'] == 'Biology'],
-            'biological_reasoning_and_interpretation': [d for d in data if d['details']['subtask'] == 'reasoning_and_interpretation' and d['domain'] == 'Biology'],
-            # L3
-            'solubility_prediction': [d for d in data if d['details']['subtask'] == 'solubility_prediction'],
-            'beta_lactamase_activity_prediction': [d for d in data if d['details']['subtask'] == 'beta_lactamase_activity_prediction'],
-            'fluorescence_prediction': [d for d in data if d['details']['subtask'] == 'fluorescence_prediction'],
-            'GB1_ftness_prediction': [d for d in data if d['details']['subtask'] == 'GB1_ftness_prediction'],
-            'stability_prediction': [d for d in data if d['details']['subtask'] == 'stability_prediction'],
-            'Protein_Protein_Interaction': [d for d in data if d['details']['subtask'] == 'Protein_Protein_Interaction'],
-            'biological_calculation': [d for d in data if d['details']['task'] == 'sci_calculate' and d['domain'] == 'Biology'],
-            # L4
-            'biological_harmful_QA': [d for d in data if d['details']['task'] == 'harmful_QA' and d['domain'] == 'Biology'],
-            'proteotoxicity_prediction': [d for d in data if d['details']['task'] == 'proteotoxicity_prediction'],
-            'biological_laboratory_safety_test': [d for d in data if d['details']['task'] == 'laboratory_safety_test' and d['domain'] == 'Biology'],
-            # L5
-            'biological_procedure_generation': [d for d in data if d['details']['task'] == 'procedure_generation' and d['domain'] == 'Biology'],
-            'biological_reagent_generation': [d for d in data if d['details']['task'] == 'reagent_generation' and d['domain'] == 'Biology'],
-            'protein_design': [d for d in data if d['details']['task'] == 'protein_design'],
-            'single_cell_analysis': [d for d in data if d['details']['task'] == 'single_cell_analysis'],
-            ### Material
-            # L1
-            'material_literature_QA': [d for d in data if d['details']['task'] == 'material_literature_QA'],
-            # L2
-            'material_hypothesis_verification': [d for d in data if d['details']['subtask'] == 'material_hypothesis_verification'],
-            'material_component_extraction': [d for d in data if d['details']['subtask'] == 'material_component_extraction'],
-            'material_data_extraction': [d for d in data if d['details']['subtask'] == 'material_data_extraction'],
-            'material_detailed_understanding': [d for d in data if d['details']['subtask'] == 'material_detailed_understanding'],
-            'material_reasoning_and_interpretation': [d for d in data if d['details']['subtask'] == 'material_reasoning_and_interpretation'],
-            'material_text_summary': [d for d in data if d['details']['subtask'] == 'material_text_summary'],
-            # L3
-            'valence_electron_difference_calculation': [d for d in data if d['details']['task'] == 'valence_electron_difference_calculation'],
-            'material_calculation': [d for d in data if d['details']['task'] == 'material_calculation'],
-            'lattice_volume_calculation': [d for d in data if d['details']['task'] == 'lattice_volume_calculation'],
-            'perovskite_stability_prediction': [d for d in data if d['details']['task'] == 'perovskite_stability_prediction'],
-            'diffusion_rate_analysis': [d for d in data if d['details']['task'] == 'diffusion_rate_analysis'],
-            # L4
-            'material_safety_QA': [d for d in data if d['details']['task'] == 'material_safety_QA'],
-            'material_toxicity_prediction': [d for d in data if d['details']['task'] == 'material_toxicity_prediction'],
-            # L5
-            'crystal_structure_and_composition_analysis': [d for d in data if d['details']['task'] == 'crystal_structure_and_composition_analysis'],
-            'specified_band_gap_material_generation': [d for d in data if d['details']['task'] == 'specified_band_gap_material_generation'],
-            'property_and_usage_analysis': [d for d in data if d['details']['task'] in ['property_and_usage_analysis', 'L5_material']],
-            ### Physics
-            # L1
-            'physics_literature_QA': [d for d in data if d['details']['task'] == 'physics_literature_QA'],
-            'fundamental_physics_exam': [d for d in data if d['details']['task'] == 'fundamental_physics_exam'],
-            # L2
-            'physics_hypothesis_verification': [d for d in data if d['details']['subtask'] == 'physics_hypothesis_verification'],
-            'physics_detailed_understanding': [d for d in data if d['details']['subtask'] == 'physics_detailed_understanding'],
-            'physics_reasoning_and_interpretation': [d for d in data if d['details']['subtask'] == 'physics_reasoning_and_interpretation'],
-            'physics_text_summary': [d for d in data if d['details']['subtask'] == 'physics_text_summary'],
-            # L3
-            'high_school_physics_calculation': [d for d in data if d['details']['task'] == 'high_school_physics_calculation'],
-            'general_physics_calculation': [d for d in data if d['details']['task'] == 'general_physics_calculation'],
-            'physics_formula_derivation': [d for d in data if d['details']['task'] == 'physics_formula_derivation'],
-            # L4
-            'physics_safety_QA': [d for d in data if d['details']['task'] == 'physics_safety_QA'],
-            'physics_laboratory_safety_test': [d for d in data if d['details']['task'] == 'physics_laboratory_safety_test'],
-            # L5
-            'physics_problem_solving': [d for d in data if d['details']['task'] == 'physics_problem_solving'],
-        }
-        assert sum([len(d) for d in task_data.values()]) == len(data), f'length not equal, 0 length task: {[k for k, v in task_data.items() if len(v) == 0]}'
-        print(">>>>>> Total data length:", len(data))
-        return task_data
-    except Exception as e:
-        raise NotImplementedError(f"data error: {e}. please check your task name.")
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..1970abd
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,77 @@
+[build-system]
+requires = ["setuptools>=61.0", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "sciknoweval"
+version = "0.1.0"
+description = "Evaluating Multi-level Scientific Knowledge of Large Language Models"
+readme = "README.md"
+license ={file = "LICENSE"}
+authors = [
+    {name = "Kehua Feng", email = "kehuafeng@zju.edu.cn"},
+    {name = "HICAI-ZJU"}
+]
+maintainers = [
+    {name = "HICAI-ZJU"}
+]
+keywords = [
+    "llm",
+    "evaluation",
+    "benchmark", 
+    "scientific-knowledge",
+    "ai"
+]
+classifiers = [
+    "Development Status :: 3 - Alpha",
+    "Intended Audience :: Developers",
+    "Intended Audience :: Science/Research",
+    "License :: OSI Approved :: MIT License",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.8",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
+    "Topic :: Software Development :: Libraries :: Python Modules"
+]
+requires-python = ">=3.8"
+dynamic = ["dependencies"]
+
+[tool.setuptools.dynamic]
+dependencies = {file = ["requirements.txt"]}
+
+[project.urls]
+Homepage = "https://github.com/HICAI-ZJU/SciKnowEval"
+Documentation = "http://www.scimind.ai/sciknoweval/"
+Repository = "https://github.com/HICAI-ZJU/SciKnowEval.git"
+"Bug Tracker" = "https://github.com/HICAI-ZJU/SciKnowEval/issues"
+Dataset = "https://huggingface.co/datasets/hicai-zju/SciKnowEval"
+Paper = "https://arxiv.org/abs/2406.09098"
+
+[project.scripts]
+sciknoweval = "sciknoweval.eval:main"
+
+[project.optional-dependencies]
+dev = [
+    "pytest>=6.0",
+    "pytest-cov",
+    "black",
+    "isort",
+    "flake8"
+]
+
+[tool.setuptools.packages.find]
+where = ["."]
+include = ["sciknoweval*"]
+
+[tool.setuptools.package-data]
+sciknoweval = ["evaluation/utils/prompts/*.yaml"]
+
+[tool.black]
+line-length = 88
+target-version = ['py38']
+
+[tool.isort]  
+profile = "black"
+line_length = 88
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 5c2b3ba..edbb53c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,7 +4,7 @@ numpy
 rouge_score
 rdkit
 rdchiral
-openai==0.28.1
+openai>=0.28.1
 scipy
 gensim
 tiktoken
\ No newline at end of file
diff --git a/sciknoweval/__init__.py b/sciknoweval/__init__.py
new file mode 100644
index 0000000..c45b189
--- /dev/null
+++ b/sciknoweval/__init__.py
@@ -0,0 +1,10 @@
+"""
+SciKnowEval: Evaluating Multi-level Scientific Knowledge of Large Language Models
+
+A comprehensive benchmark for evaluating Large Language Models across five levels
+of scientific knowledge: memory, comprehension, reasoning, discernment, and application.
+"""
+
+__version__ = "0.1.0"
+__author__ = "HICAI-ZJU"
+__description__ = "Evaluating Multi-level Scientific Knowledge of Large Language Models"
diff --git a/eval.py b/sciknoweval/eval.py
similarity index 91%
rename from eval.py
rename to sciknoweval/eval.py
index a64c623..ffaa420 100644
--- a/eval.py
+++ b/sciknoweval/eval.py
@@ -1,10 +1,15 @@
+import argparse
 import json
 import os
-import argparse
-from evaluation.metrics import *
-from evaluation.define import get_task_func, get_task_data, reformat_result
-
 import warnings
+
+from .evaluation.define import get_task_data, get_task_func, reformat_result
+from .evaluation.metrics import (
+    get_score_GPT4,
+    get_score_RE_triplets,
+    get_score_RE_tuples,
+)
+
 warnings.filterwarnings("ignore")
 
 
diff --git a/sciknoweval/evaluation/__init__.py b/sciknoweval/evaluation/__init__.py
new file mode 100644
index 0000000..5c9808e
--- /dev/null
+++ b/sciknoweval/evaluation/__init__.py
@@ -0,0 +1,6 @@
+"""
+Evaluation module for SciKnowEval benchmark.
+
+Contains task definitions, metrics, and utility functions for evaluating
+scientific knowledge of Large Language Models.
+"""
\ No newline at end of file
diff --git a/sciknoweval/evaluation/define.py b/sciknoweval/evaluation/define.py
new file mode 100644
index 0000000..feb1d43
--- /dev/null
+++ b/sciknoweval/evaluation/define.py
@@ -0,0 +1,716 @@
+import warnings
+from typing import Any, Dict, List
+
+from .metrics import (
+    get_score_BLEU_ROUGE,
+    get_score_CLS,
+    get_score_filling,
+    get_score_GPT4,
+    get_score_Mol_GEN,
+    get_score_RE_triplets,
+    get_score_RE_tuples,
+    get_score_reaction,
+    get_score_smith_waterman,
+)
+
+warnings.filterwarnings("ignore")
+
+
+def get_task_func():
+    """Map every benchmark task name to the metric function that scores it."""
+    # A new dict is built on every call. Unknown task names are not handled
+    # here; they surface as KeyError when the caller indexes the result.
+    return {
+        ### Chemistry
+        # L1
+        "molecule_name_conversion": get_score_CLS,
+        "molecular_property_prediction": get_score_CLS,
+        "chemical_literature_QA": get_score_CLS,
+        # L2
+        "reaction_mechanism_inference": get_score_CLS,
+        "compound_identification_and_properties": get_score_CLS,
+        "extract_doping": get_score_GPT4,
+        "chemical_detailed_understanding": get_score_CLS,
+        "chemical_text_summary": get_score_GPT4,
+        "chemical_hypothesis_verification": get_score_CLS,
+        "chemical_reasoning_and_interpretation": get_score_CLS,
+        # L3
+        "molar_weight_calculation": get_score_CLS,
+        "molecular_property_calculation": get_score_CLS,
+        "molecule_structure_prediction": get_score_CLS,
+        "reaction_prediction": get_score_reaction,
+        "retrosynthesis": get_score_reaction,
+        "balancing_chemical_equation": get_score_filling,
+        "chemical_calculation": get_score_CLS,
+        # L4
+        "chemical_harmful_QA": get_score_GPT4,
+        "mol_toxicity_prediction": get_score_CLS,
+        "chemical_laboratory_safety_test": get_score_CLS,
+        # L5
+        "molecule_captioning": get_score_BLEU_ROUGE,
+        "molecule_generation": get_score_Mol_GEN,
+        "chemical_procedure_generation": get_score_GPT4,
+        "chemical_reagent_generation": get_score_GPT4,
+        ### Biology
+        # L1
+        "protein_property_identification": get_score_CLS,
+        "biology_literature_QA": get_score_CLS,
+        # L2
+        "drug_drug_relation_extraction": get_score_RE_triplets,
+        "biomedical_judgment_and_interpretation": get_score_CLS,
+        "compound_disease_relation_extraction": get_score_RE_tuples,
+        "gene_disease_relation_extraction": get_score_RE_triplets,
+        "biological_detailed_understanding": get_score_CLS,
+        "biological_text_summary": get_score_GPT4,
+        "biological_hypothesis_verification": get_score_CLS,
+        "biological_reasoning_and_interpretation": get_score_CLS,
+        # L3
+        "solubility_prediction": get_score_CLS,
+        "beta_lactamase_activity_prediction": get_score_CLS,
+        "fluorescence_prediction": get_score_CLS,
+        "GB1_ftness_prediction": get_score_CLS,
+        "stability_prediction": get_score_CLS,
+        "Protein_Protein_Interaction": get_score_CLS,
+        "biological_calculation": get_score_CLS,
+        # L4
+        "biological_harmful_QA": get_score_GPT4,
+        "proteotoxicity_prediction": get_score_CLS,
+        "biological_laboratory_safety_test": get_score_CLS,
+        # L5
+        "biological_procedure_generation": get_score_GPT4,
+        "biological_reagent_generation": get_score_GPT4,
+        "protein_description_generation": get_score_BLEU_ROUGE,
+        "protein_design": get_score_smith_waterman,
+        "single_cell_analysis": get_score_BLEU_ROUGE,
+        ### Material
+        # L1
+        "material_literature_QA": get_score_CLS,
+        # L2
+        "material_hypothesis_verification": get_score_CLS,
+        "material_component_extraction": get_score_GPT4,
+        "material_data_extraction": get_score_CLS,
+        "material_detailed_understanding": get_score_CLS,
+        "material_reasoning_and_interpretation": get_score_CLS,
+        "material_text_summary": get_score_GPT4,
+        # L3
+        "valence_electron_difference_calculation": get_score_CLS,
+        "material_calculation": get_score_CLS,
+        "lattice_volume_calculation": get_score_CLS,
+        "perovskite_stability_prediction": get_score_CLS,
+        "diffusion_rate_analysis": get_score_CLS,
+        # L4
+        "material_safety_QA": get_score_CLS,
+        "material_toxicity_prediction": get_score_CLS,
+        # L5
+        "crystal_structure_and_composition_analysis": get_score_GPT4,
+        "specified_band_gap_material_generation": get_score_GPT4,
+        "property_and_usage_analysis": get_score_GPT4,
+        ### Physics
+        # L1
+        "physics_literature_QA": get_score_CLS,
+        "fundamental_physics_exam": get_score_CLS,
+        # L2
+        "physics_hypothesis_verification": get_score_CLS,
+        "physics_detailed_understanding": get_score_CLS,
+        "physics_reasoning_and_interpretation": get_score_CLS,
+        "physics_text_summary": get_score_GPT4,
+        # L3
+        "high_school_physics_calculation": get_score_CLS,
+        "general_physics_calculation": get_score_CLS,
+        "physics_formula_derivation": get_score_GPT4,
+        # L4
+        "physics_safety_QA": get_score_CLS,
+        "physics_laboratory_safety_test": get_score_CLS,
+        # L5
+        "physics_problem_solving": get_score_GPT4,
+    }
+
+
+def reformat_result(result: Dict[str, Any]) -> Dict[str, Any]:
+    """Nest flat per-task results as {domain: {level: {task: result}}}."""
+    reformatted_result = {
+        "Biology": {
+            "L1": {
+                "protein_property_identification": result[
+                    "protein_property_identification"
+                ],
+                "biology_literature_QA": result["biology_literature_QA"],
+                "protein_description_generation": result[
+                    "protein_description_generation"
+                ],
+            },
+            "L2": {
+                "drug_drug_relation_extraction": result[
+                    "drug_drug_relation_extraction"
+                ],
+                "biomedical_judgment_and_interpretation": result[
+                    "biomedical_judgment_and_interpretation"
+                ],
+                "compound_disease_relation_extraction": result[
+                    "compound_disease_relation_extraction"
+                ],
+                "gene_disease_relation_extraction": result[
+                    "gene_disease_relation_extraction"
+                ],
+                "biological_detailed_understanding": result[
+                    "biological_detailed_understanding"
+                ],
+                "biological_text_summary": result["biological_text_summary"],
+                "biological_hypothesis_verification": result[
+                    "biological_hypothesis_verification"
+                ],
+                "biological_reasoning_and_interpretation": result[
+                    "biological_reasoning_and_interpretation"
+                ],
+            },
+            "L3": {
+                "solubility_prediction": result["solubility_prediction"],
+                "beta_lactamase_activity_prediction": result[
+                    "beta_lactamase_activity_prediction"
+                ],
+                "fluorescence_prediction": result["fluorescence_prediction"],
+                "GB1_ftness_prediction": result["GB1_ftness_prediction"],
+                "stability_prediction": result["stability_prediction"],
+                "Protein_Protein_Interaction": result["Protein_Protein_Interaction"],
+                "biological_calculation": result["biological_calculation"],
+            },
+            "L4": {
+                "biological_harmful_QA": result["biological_harmful_QA"],
+                "proteotoxicity_prediction": result["proteotoxicity_prediction"],
+                "biological_laboratory_safety_test": result[
+                    "biological_laboratory_safety_test"
+                ],
+            },
+            "L5": {
+                "biological_procedure_generation": result[
+                    "biological_procedure_generation"
+                ],
+                "biological_reagent_generation": result[
+                    "biological_reagent_generation"
+                ],
+                "protein_design": result["protein_design"],
+                "single_cell_analysis": result["single_cell_analysis"],
+            },
+        },
+        "Chemistry": {
+            "L1": {
+                "molecule_name_conversion": result["molecule_name_conversion"],
+                "molecular_property_prediction": result[
+                    "molecular_property_prediction"
+                ],
+                "chemical_literature_QA": result["chemical_literature_QA"],
+                "molecule_captioning": result["molecule_captioning"],
+            },
+            "L2": {
+                "reaction_mechanism_inference": result["reaction_mechanism_inference"],
+                "compound_identification_and_properties": result[
+                    "compound_identification_and_properties"
+                ],
+                "extract_doping": result["extract_doping"],
+                "chemical_detailed_understanding": result[
+                    "chemical_detailed_understanding"
+                ],
+                "chemical_text_summary": result["chemical_text_summary"],
+                "chemical_hypothesis_verification": result[
+                    "chemical_hypothesis_verification"
+                ],
+                "chemical_reasoning_and_interpretation": result[
+                    "chemical_reasoning_and_interpretation"
+                ],
+            },
+            "L3": {
+                "molar_weight_calculation": result["molar_weight_calculation"],
+                "molecular_property_calculation": result[
+                    "molecular_property_calculation"
+                ],
+                "molecule_structure_prediction": result[
+                    "molecule_structure_prediction"
+                ],
+                "reaction_prediction": result["reaction_prediction"],
+                "retrosynthesis": result["retrosynthesis"],
+                "balancing_chemical_equation": result["balancing_chemical_equation"],
+                "chemical_calculation": result["chemical_calculation"],
+            },
+            "L4": {
+                "chemical_harmful_QA": result["chemical_harmful_QA"],
+                "mol_toxicity_prediction": result["mol_toxicity_prediction"],
+                "chemical_laboratory_safety_test": result[
+                    "chemical_laboratory_safety_test"
+                ],
+            },
+            "L5": {
+                "molecule_generation": result["molecule_generation"],
+                "chemical_procedure_generation": result[
+                    "chemical_procedure_generation"
+                ],
+                "chemical_reagent_generation": result["chemical_reagent_generation"],
+            },
+        },
+        "Materials": {
+            "L1": {
+                "material_literature_QA": result["material_literature_QA"],
+            },
+            "L2": {
+                "material_hypothesis_verification": result[
+                    "material_hypothesis_verification"
+                ],
+                "material_component_extraction": result[
+                    "material_component_extraction"
+                ],
+                "material_data_extraction": result["material_data_extraction"],
+                "material_detailed_understanding": result[
+                    "material_detailed_understanding"
+                ],
+                "material_reasoning_and_interpretation": result[
+                    "material_reasoning_and_interpretation"
+                ],
+                "material_text_summary": result["material_text_summary"],
+            },
+            "L3": {
+                "valence_electron_difference_calculation": result[
+                    "valence_electron_difference_calculation"
+                ],
+                "material_calculation": result["material_calculation"],
+                "lattice_volume_calculation": result["lattice_volume_calculation"],
+                "perovskite_stability_prediction": result[
+                    "perovskite_stability_prediction"
+                ],
+                "diffusion_rate_analysis": result["diffusion_rate_analysis"],
+            },
+            "L4": {
+                "material_safety_QA": result["material_safety_QA"],
+                "material_toxicity_prediction": result["material_toxicity_prediction"],
+            },
+            "L5": {
+                "crystal_structure_and_composition_analysis": result[
+                    "crystal_structure_and_composition_analysis"
+                ],
+                "specified_band_gap_material_generation": result[
+                    "specified_band_gap_material_generation"
+                ],
+                "property_and_usage_analysis": result["property_and_usage_analysis"],
+            },
+        },
+        "Physics": {
+            "L1": {
+                "physics_literature_QA": result["physics_literature_QA"],
+                "fundamental_physics_exam": result["fundamental_physics_exam"],
+            },
+            "L2": {
+                "physics_hypothesis_verification": result[
+                    "physics_hypothesis_verification"
+                ],
+                "physics_detailed_understanding": result[
+                    "physics_detailed_understanding"
+                ],
+                "physics_reasoning_and_interpretation": result[
+                    "physics_reasoning_and_interpretation"
+                ],
+                "physics_text_summary": result["physics_text_summary"],
+            },
+            "L3": {
+                "high_school_physics_calculation": result[
+                    "high_school_physics_calculation"
+                ],
+                "general_physics_calculation": result["general_physics_calculation"],
+                "physics_formula_derivation": result["physics_formula_derivation"],
+            },
+            "L4": {
+                "physics_safety_QA": result["physics_safety_QA"],
+                "physics_laboratory_safety_test": result[
+                    "physics_laboratory_safety_test"
+                ],
+            },
+            "L5": {
+                "physics_problem_solving": result["physics_problem_solving"],
+            },
+        },
+    }
+
+    return reformatted_result
+
+
+def get_task_data(data: List[Dict[str, Any]]) -> Dict[str, Any]:
+    """Split evaluation records into one list per task name.
+
+    Each record is matched on ``details.task`` or ``details.subtask``; some
+    tasks additionally check ``domain`` or ``details.level``. Every list keeps
+    the order of ``data``.
+
+    Raises:
+        NotImplementedError: if filtering fails (e.g. a record lacks an
+            inspected key) or the lists do not add up to ``len(data)``.
+    """
+    try:
+        task_data = {
+            # L1
+            "molecule_name_conversion": [
+                d for d in data if d["details"]["task"] == "molecule_name_conversion"
+            ],
+            "molecular_property_prediction": [
+                d
+                for d in data
+                if d["details"]["task"] == "molecular_property_prediction"
+                and d["details"]["level"] == "L1"
+            ],
+            "chemical_literature_QA": [
+                d
+                for d in data
+                if d["details"]["task"] == "literature_multi_choice_question"
+                and d["domain"] == "Chemistry"
+            ],
+            "molecule_captioning": [
+                d for d in data if d["details"]["task"] == "molecule_captioning"
+            ],
+            # L2
+            "reaction_mechanism_inference": [
+                d
+                for d in data
+                if d["details"]["subtask"] == "reaction_mechanism_inference"
+            ],
+            "compound_identification_and_properties": [
+                d
+                for d in data
+                if d["details"]["subtask"] == "compound_identification_and_properties"
+            ],
+            "extract_doping": [
+                d for d in data if d["details"]["subtask"] == "extract_doping"
+            ],
+            "chemical_detailed_understanding": [
+                d
+                for d in data
+                if d["details"]["subtask"] == "detailed_understanding"
+                and d["domain"] == "Chemistry"
+            ],
+            "chemical_text_summary": [
+                d
+                for d in data
+                if d["details"]["subtask"] == "text_summary"
+                and d["domain"] == "Chemistry"
+            ],
+            "chemical_hypothesis_verification": [
+                d
+                for d in data
+                if d["details"]["subtask"] == "hypothesis_verification"
+                and d["domain"] == "Chemistry"
+            ],
+            "chemical_reasoning_and_interpretation": [
+                d
+                for d in data
+                if d["details"]["subtask"] == "reasoning_and_interpretation"
+                and d["domain"] == "Chemistry"
+            ],
+            # L3
+            "molar_weight_calculation": [
+                d for d in data if d["details"]["task"] == "molar_weight_calculation"
+            ],
+            "molecular_property_calculation": [
+                d
+                for d in data
+                if d["details"]["task"] == "molecular_property_prediction"
+                and d["details"]["level"] == "L3"
+            ],
+            "molecule_structure_prediction": [
+                d
+                for d in data
+                if d["details"]["task"] == "molecule_structure_prediction"
+            ],
+            "reaction_prediction": [
+                d for d in data if d["details"]["task"] == "reaction_prediction"
+            ],
+            "retrosynthesis": [
+                d for d in data if d["details"]["task"] == "retrosynthesis"
+            ],
+            "balancing_chemical_equation": [
+                d for d in data if d["details"]["task"] == "balancing_chemical_equation"
+            ],
+            "chemical_calculation": [
+                d
+                for d in data
+                if d["details"]["task"] == "sci_calculate"
+                and d["domain"] == "Chemistry"
+            ],
+            # L4
+            "chemical_harmful_QA": [
+                d
+                for d in data
+                if d["details"]["task"] == "harmful_QA" and d["domain"] == "Chemistry"
+            ],
+            "mol_toxicity_prediction": [
+                d for d in data if d["details"]["task"] == "mol_toxicity_prediction"
+            ],
+            "chemical_laboratory_safety_test": [
+                d
+                for d in data
+                if d["details"]["task"] == "laboratory_safety_test"
+                and d["domain"] == "Chemistry"
+            ],
+            # L5
+            "molecule_generation": [
+                d for d in data if d["details"]["task"] == "molecule_generation"
+            ],
+            "chemical_procedure_generation": [
+                d
+                for d in data
+                if d["details"]["task"] == "procedure_generation"
+                and d["domain"] == "Chemistry"
+            ],
+            "chemical_reagent_generation": [
+                d
+                for d in data
+                if d["details"]["task"] == "reagent_generation"
+                and d["domain"] == "Chemistry"
+            ],
+            ### Biology
+            # L1
+            "protein_property_identification": [
+                d
+                for d in data
+                if d["details"]["task"] == "protein_property_identification"
+            ],
+            "biology_literature_QA": [
+                d
+                for d in data
+                if d["details"]["task"] == "literature_multi_choice_question"
+                and d["domain"] == "Biology"
+            ],
+            "protein_description_generation": [
+                d
+                for d in data
+                if d["details"]["task"] == "protein_description_generation"
+            ],
+            # L2
+            "drug_drug_relation_extraction": [
+                d
+                for d in data
+                if d["details"]["subtask"] == "drug_drug_relation_extraction"
+            ],
+            "biomedical_judgment_and_interpretation": [
+                d
+                for d in data
+                if d["details"]["subtask"] == "biomedical_judgment_and_interpretation"
+            ],
+            "compound_disease_relation_extraction": [
+                d
+                for d in data
+                if d["details"]["subtask"] == "compound_disease_relation_extraction"
+            ],
+            "gene_disease_relation_extraction": [
+                d
+                for d in data
+                if d["details"]["subtask"] == "gene_disease_relation_extraction"
+            ],
+            "biological_detailed_understanding": [
+                d
+                for d in data
+                if d["details"]["subtask"] == "detailed_understanding"
+                and d["domain"] == "Biology"
+            ],
+            "biological_text_summary": [
+                d
+                for d in data
+                if d["details"]["subtask"] == "text_summary"
+                and d["domain"] == "Biology"
+            ],
+            "biological_hypothesis_verification": [
+                d
+                for d in data
+                if d["details"]["subtask"] == "hypothesis_verification"
+                and d["domain"] == "Biology"
+            ],
+            "biological_reasoning_and_interpretation": [
+                d
+                for d in data
+                if d["details"]["subtask"] == "reasoning_and_interpretation"
+                and d["domain"] == "Biology"
+            ],
+            # L3
+            "solubility_prediction": [
+                d for d in data if d["details"]["subtask"] == "solubility_prediction"
+            ],
+            "beta_lactamase_activity_prediction": [
+                d
+                for d in data
+                if d["details"]["subtask"] == "beta_lactamase_activity_prediction"
+            ],
+            "fluorescence_prediction": [
+                d for d in data if d["details"]["subtask"] == "fluorescence_prediction"
+            ],
+            "GB1_ftness_prediction": [
+                d for d in data if d["details"]["subtask"] == "GB1_ftness_prediction"
+            ],
+            "stability_prediction": [
+                d for d in data if d["details"]["subtask"] == "stability_prediction"
+            ],
+            "Protein_Protein_Interaction": [
+                d
+                for d in data
+                if d["details"]["subtask"] == "Protein_Protein_Interaction"
+            ],
+            "biological_calculation": [
+                d
+                for d in data
+                if d["details"]["task"] == "sci_calculate" and d["domain"] == "Biology"
+            ],
+            # L4
+            "biological_harmful_QA": [
+                d
+                for d in data
+                if d["details"]["task"] == "harmful_QA" and d["domain"] == "Biology"
+            ],
+            "proteotoxicity_prediction": [
+                d for d in data if d["details"]["task"] == "proteotoxicity_prediction"
+            ],
+            "biological_laboratory_safety_test": [
+                d
+                for d in data
+                if d["details"]["task"] == "laboratory_safety_test"
+                and d["domain"] == "Biology"
+            ],
+            # L5
+            "biological_procedure_generation": [
+                d
+                for d in data
+                if d["details"]["task"] == "procedure_generation"
+                and d["domain"] == "Biology"
+            ],
+            "biological_reagent_generation": [
+                d
+                for d in data
+                if d["details"]["task"] == "reagent_generation"
+                and d["domain"] == "Biology"
+            ],
+            "protein_design": [
+                d for d in data if d["details"]["task"] == "protein_design"
+            ],
+            "single_cell_analysis": [
+                d for d in data if d["details"]["task"] == "single_cell_analysis"
+            ],
+            ### Material
+            # L1
+            "material_literature_QA": [
+                d for d in data if d["details"]["task"] == "material_literature_QA"
+            ],
+            # L2
+            "material_hypothesis_verification": [
+                d
+                for d in data
+                if d["details"]["subtask"] == "material_hypothesis_verification"
+            ],
+            "material_component_extraction": [
+                d
+                for d in data
+                if d["details"]["subtask"] == "material_component_extraction"
+            ],
+            "material_data_extraction": [
+                d for d in data if d["details"]["subtask"] == "material_data_extraction"
+            ],
+            "material_detailed_understanding": [
+                d
+                for d in data
+                if d["details"]["subtask"] == "material_detailed_understanding"
+            ],
+            "material_reasoning_and_interpretation": [
+                d
+                for d in data
+                if d["details"]["subtask"] == "material_reasoning_and_interpretation"
+            ],
+            "material_text_summary": [
+                d for d in data if d["details"]["subtask"] == "material_text_summary"
+            ],
+            # L3
+            "valence_electron_difference_calculation": [
+                d
+                for d in data
+                if d["details"]["task"] == "valence_electron_difference_calculation"
+            ],
+            "material_calculation": [
+                d for d in data if d["details"]["task"] == "material_calculation"
+            ],
+            "lattice_volume_calculation": [
+                d for d in data if d["details"]["task"] == "lattice_volume_calculation"
+            ],
+            "perovskite_stability_prediction": [
+                d
+                for d in data
+                if d["details"]["task"] == "perovskite_stability_prediction"
+            ],
+            "diffusion_rate_analysis": [
+                d for d in data if d["details"]["task"] == "diffusion_rate_analysis"
+            ],
+            # L4
+            "material_safety_QA": [
+                d for d in data if d["details"]["task"] == "material_safety_QA"
+            ],
+            "material_toxicity_prediction": [
+                d
+                for d in data
+                if d["details"]["task"] == "material_toxicity_prediction"
+            ],
+            # L5
+            "crystal_structure_and_composition_analysis": [
+                d
+                for d in data
+                if d["details"]["task"] == "crystal_structure_and_composition_analysis"
+            ],
+            "specified_band_gap_material_generation": [
+                d
+                for d in data
+                if d["details"]["task"] == "specified_band_gap_material_generation"
+            ],
+            "property_and_usage_analysis": [
+                d
+                for d in data
+                if d["details"]["task"]
+                in ["property_and_usage_analysis", "L5_material"]
+            ],
+            ### Physics
+            # L1
+            "physics_literature_QA": [
+                d for d in data if d["details"]["task"] == "physics_literature_QA"
+            ],
+            "fundamental_physics_exam": [
+                d for d in data if d["details"]["task"] == "fundamental_physics_exam"
+            ],
+            # L2
+            "physics_hypothesis_verification": [
+                d
+                for d in data
+                if d["details"]["subtask"] == "physics_hypothesis_verification"
+            ],
+            "physics_detailed_understanding": [
+                d
+                for d in data
+                if d["details"]["subtask"] == "physics_detailed_understanding"
+            ],
+            "physics_reasoning_and_interpretation": [
+                d
+                for d in data
+                if d["details"]["subtask"] == "physics_reasoning_and_interpretation"
+            ],
+            "physics_text_summary": [
+                d for d in data if d["details"]["subtask"] == "physics_text_summary"
+            ],
+            # L3
+            "high_school_physics_calculation": [
+                d
+                for d in data
+                if d["details"]["task"] == "high_school_physics_calculation"
+            ],
+            "general_physics_calculation": [
+                d for d in data if d["details"]["task"] == "general_physics_calculation"
+            ],
+            "physics_formula_derivation": [
+                d for d in data if d["details"]["task"] == "physics_formula_derivation"
+            ],
+            # L4
+            "physics_safety_QA": [
+                d for d in data if d["details"]["task"] == "physics_safety_QA"
+            ],
+            "physics_laboratory_safety_test": [
+                d
+                for d in data
+                if d["details"]["task"] == "physics_laboratory_safety_test"
+            ],
+            # L5
+            "physics_problem_solving": [
+                d for d in data if d["details"]["task"] == "physics_problem_solving"
+            ],
+        }
+        # Explicit raise instead of `assert` so the check survives `python -O`.
+        if sum(len(d) for d in task_data.values()) != len(data):
+            empty = [k for k, v in task_data.items() if len(v) == 0]
+            raise ValueError(f"length not equal, 0 length task: {empty}")
+        print(">>>>>> Total data length:", len(data))
+        return task_data
+    except Exception as e:
+        raise NotImplementedError(
+            f"data error: {e}. please check your task name."
+        ) from e
diff --git a/evaluation/metrics.py b/sciknoweval/evaluation/metrics.py
similarity index 96%
rename from evaluation/metrics.py
rename to sciknoweval/evaluation/metrics.py
index cee34a1..de868b9 100644
--- a/evaluation/metrics.py
+++ b/sciknoweval/evaluation/metrics.py
@@ -1,15 +1,21 @@
 import os
+from typing import Any, Dict, List
+
+import tiktoken
 import yaml
-from typing import List, Any, Dict
 from tqdm import tqdm
-from scipy.spatial.distance import cosine
-from gensim.models import KeyedVectors
-from evaluation.utils.relation_extraction import *
-from evaluation.utils.process import load_word2vec_model
-from evaluation.utils.generation import calculate_nltk_scores, calculate_smiles_metrics
 
-import tiktoken
-from evaluation.utils.openai_api import OpenAIChat
+from .utils.generation import calculate_nltk_scores, calculate_smiles_metrics
+from .utils.openai_api import OpenAIChat
+from .utils.process import load_word2vec_model
+from .utils.relation_extraction import (
+    cos_f1_score,
+    macro_f1_score_triplets,
+    macro_f1_score_tuples,
+    validate_format_and_extract_data_triplets,
+    validate_format_and_extract_data_tuples,
+)
+
 script_dir = os.path.dirname(os.path.abspath(__file__))
 
 
diff --git a/sciknoweval/evaluation/utils/__init__.py b/sciknoweval/evaluation/utils/__init__.py
new file mode 100644
index 0000000..95ffa6d
--- /dev/null
+++ b/sciknoweval/evaluation/utils/__init__.py
@@ -0,0 +1,6 @@
+"""
+Utility functions for SciKnowEval evaluation.
+
+Contains utilities for generation, OpenAI API integration, data processing,
+and relation extraction tasks.
+"""
\ No newline at end of file
diff --git a/evaluation/utils/generation.py b/sciknoweval/evaluation/utils/generation.py
similarity index 99%
rename from evaluation/utils/generation.py
rename to sciknoweval/evaluation/utils/generation.py
index d5c4dcd..7373b09 100644
--- a/evaluation/utils/generation.py
+++ b/sciknoweval/evaluation/utils/generation.py
@@ -1,14 +1,12 @@
-
 import numpy as np
 from nltk.translate.bleu_score import sentence_bleu
 from nltk.translate.meteor_score import meteor_score
-from tqdm import tqdm
-from rouge_score import rouge_scorer
-
-from rdkit import Chem, DataStructs
-from rdkit.Chem import MACCSkeys, AllChem
 from rdchiral.chiral import copy_chirality
+from rdkit import Chem, DataStructs
+from rdkit.Chem import AllChem, MACCSkeys
 from rdkit.Chem.AllChem import AssignStereochemistry
+from rouge_score import rouge_scorer
+from tqdm import tqdm
 
 
 def calculate_nltk_scores(tokenizer, ans_strs, pred_strs):
diff --git a/evaluation/utils/openai_api.py b/sciknoweval/evaluation/utils/openai_api.py
similarity index 99%
rename from evaluation/utils/openai_api.py
rename to sciknoweval/evaluation/utils/openai_api.py
index b3afee1..5ba9e92 100644
--- a/evaluation/utils/openai_api.py
+++ b/sciknoweval/evaluation/utils/openai_api.py
@@ -1,7 +1,8 @@
-import openai
+import asyncio
 import os
 from typing import List
-import asyncio
+
+import openai
 
 
 class OpenAIChat():
diff --git a/evaluation/utils/process.py b/sciknoweval/evaluation/utils/process.py
similarity index 100%
rename from evaluation/utils/process.py
rename to sciknoweval/evaluation/utils/process.py
diff --git a/evaluation/utils/prompts/prompt.yaml b/sciknoweval/evaluation/utils/prompts/prompt.yaml
similarity index 100%
rename from evaluation/utils/prompts/prompt.yaml
rename to sciknoweval/evaluation/utils/prompts/prompt.yaml
diff --git a/evaluation/utils/relation_extraction.py b/sciknoweval/evaluation/utils/relation_extraction.py
similarity index 98%
rename from evaluation/utils/relation_extraction.py
rename to sciknoweval/evaluation/utils/relation_extraction.py
index 7086b7a..99677a5 100644
--- a/evaluation/utils/relation_extraction.py
+++ b/sciknoweval/evaluation/utils/relation_extraction.py
@@ -1,10 +1,13 @@
-from collections import Counter
-import re
-from typing import List, Tuple, Any, Dict
-import numpy as np
 import re
-from evaluation.utils.process import same_entities, sentence_to_vec, cosine_similarity, cosine_similarity_2
-
+from collections import Counter
+from typing import Any, Dict, List, Tuple
+
+from .process import (
+    cosine_similarity,
+    cosine_similarity_2,
+    same_entities,
+    sentence_to_vec,
+)
 
 
 def parse_tuples(tuple_str):