pytest fixtures for test data loading (#39)

janosh · web-flow · commit f2fa786cbaad · 2022-04-09T17:34:11.000+01:00
diff --git a/examples/cgcnn-example.py b/examples/cgcnn-example.py
@@ -48,7 +48,7 @@ def main(  # noqa: C901
     weight_decay=1e-6,
     batch_size=128,
     workers=0,
-    device=torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu"),
+    device="cuda" if torch.cuda.is_available() else "cpu",
     **kwargs,
 ):
 
diff --git a/examples/roost-example.py b/examples/roost-example.py
@@ -42,7 +42,7 @@ def main(  # noqa: C901
     weight_decay=1e-6,
     batch_size=128,
     workers=0,
-    device=torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu"),
+    device="cuda" if torch.cuda.is_available() else "cpu",
     **kwargs,
 ):
     if not len(targets) == len(tasks) == len(losses):
diff --git a/examples/wren-example.py b/examples/wren-example.py
@@ -44,7 +44,7 @@ def main(  # noqa: C901
     weight_decay=1e-6,
     batch_size=128,
     workers=0,
-    device=torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu"),
+    device="cuda" if torch.cuda.is_available() else "cpu",
     **kwargs,
 ):
 
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -0,0 +1,41 @@
+import os
+
+import pytest
+import torch
+from matminer.datasets import load_dataset
+
+from aviary.cgcnn.utils import get_cgcnn_input
+from aviary.wren.utils import get_aflow_label_spglib
+
+torch.manual_seed(0)  # ensure reproducible results (applies to all tests)
+
+
+@pytest.fixture(scope="session")
+def df_matbench_phonons():
+    """Matbench phonons dataframe with model inputs, IDs and a binary target added."""
+    data = load_dataset("matbench_phonons")
+    structures = data.structure
+
+    # one get_cgcnn_input result per row, spread over the two new columns
+    data[["lattice", "sites"]] = list(map(get_cgcnn_input, structures))
+    data["material_id"] = [f"mb_phdos_{i}" for i in range(data.shape[0])]
+    data["composition"] = [s.composition.formula.replace(" ", "") for s in structures]
+    data["phdos_clf"] = [int(peak > 450) for peak in data["last phdos peak"]]
+    return data
+
+
+@pytest.fixture(scope="session")
+def df_matbench_phonons_wyckoff(df_matbench_phonons):
+    """Return a copy of the phonons dataframe with an added "wyckoff" label column.
+
+    Aflow labels are expensive, hence the separate fixture. The shared session-scoped
+    input is left unmodified so tests see the same columns regardless of run order.
+    """
+    structures = df_matbench_phonons.structure
+    labels = [get_aflow_label_spglib(x) for x in structures]
+    return df_matbench_phonons.assign(wyckoff=labels)
+
+
+@pytest.fixture(scope="session")
+def tests_dir():  # absolute path of the directory containing this conftest.py
+    return os.path.split(os.path.abspath(__file__))[0]
diff --git a/tests/test_cgcnn_classification.py b/tests/test_cgcnn_classification.py
@@ -2,22 +2,15 @@
 
 import numpy as np
 import torch
-from matminer.utils.io import load_dataframe_from_json
 from sklearn.metrics import accuracy_score, roc_auc_score
 from sklearn.model_selection import train_test_split as split
 
 from aviary.cgcnn.data import CrystalGraphData, collate_batch
 from aviary.cgcnn.model import CrystalGraphConvNet
-from aviary.cgcnn.utils import get_cgcnn_input
 from aviary.utils import results_multitask, train_ensemble
 
-torch.manual_seed(0)  # ensure reproducible results
 
-
-def test_cgcnn_clf():
-    data_path = os.path.join(
-        os.path.dirname(os.path.abspath(__file__)), "data/matbench_phonons.json.gz"
-    )
+def test_cgcnn_clf(df_matbench_phonons):
     elem_emb = "cgcnn92"
     targets = ["phdos_clf"]
     tasks = ["classification"]
@@ -44,26 +37,14 @@ def test_cgcnn_clf():
     weight_decay = 1e-6
     batch_size = 128
     workers = 0
-    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+    device = "cuda" if torch.cuda.is_available() else "cpu"
 
     task_dict = dict(zip(targets, tasks))
     loss_dict = dict(zip(targets, losses))
 
-    assert os.path.exists(data_path), f"{data_path} does not exist!"
-
-    df = load_dataframe_from_json(data_path)
-    df["lattice"] = [None] * len(df)
-    df["sites"] = [None] * len(df)
-    df[["lattice", "sites"]] = df.apply(
-        lambda x: get_cgcnn_input(x.structure), axis=1, result_type="expand"
-    )
-    df["material_id"] = [f"mb_phdos_{i}" for i in range(len(df))]
-    df["composition"] = df.structure.apply(
-        lambda x: x.composition.formula.replace(" ", "")
+    dataset = CrystalGraphData(
+        df=df_matbench_phonons, elem_emb=elem_emb, task_dict=task_dict
     )
-    df["phdos_clf"] = np.where((df["last phdos peak"] > 450), 1, 0)
-
-    dataset = CrystalGraphData(df=df, elem_emb=elem_emb, task_dict=task_dict)
     n_targets = dataset.n_targets
     elem_emb_len = dataset.elem_emb_len
     nbr_fea_len = dataset.nbr_fea_dim
@@ -166,7 +147,3 @@ def test_cgcnn_clf():
 
     assert ens_acc > 0.85
     assert ens_roc_auc > 0.9
-
-
-if __name__ == "__main__":
-    test_cgcnn_clf()
diff --git a/tests/test_cgcnn_regression.py b/tests/test_cgcnn_regression.py
@@ -2,22 +2,15 @@
 
 import numpy as np
 import torch
-from matminer.utils.io import load_dataframe_from_json
 from sklearn.metrics import r2_score
 from sklearn.model_selection import train_test_split as split
 
 from aviary.cgcnn.data import CrystalGraphData, collate_batch
 from aviary.cgcnn.model import CrystalGraphConvNet
-from aviary.cgcnn.utils import get_cgcnn_input
 from aviary.utils import results_multitask, train_ensemble
 
-torch.manual_seed(0)  # ensure reproducible results
 
-
-def test_cgcnn_regression():
-    data_path = os.path.join(
-        os.path.dirname(os.path.abspath(__file__)), "data/matbench_phonons.json.gz"
-    )
+def test_cgcnn_regression(df_matbench_phonons):
     elem_emb = "cgcnn92"
     targets = ["last phdos peak"]
     tasks = ["regression"]
@@ -44,25 +37,14 @@ def test_cgcnn_regression():
     weight_decay = 1e-6
     batch_size = 128
     workers = 0
-    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+    device = "cuda" if torch.cuda.is_available() else "cpu"
 
     task_dict = dict(zip(targets, tasks))
     loss_dict = dict(zip(targets, losses))
 
-    assert os.path.exists(data_path), f"{data_path} does not exist!"
-
-    df = load_dataframe_from_json(data_path)
-    df["lattice"] = [None] * len(df)
-    df["sites"] = [None] * len(df)
-    df[["lattice", "sites"]] = df.apply(
-        lambda x: get_cgcnn_input(x.structure), axis=1, result_type="expand"
-    )
-    df["material_id"] = [f"mb_phdos_{i}" for i in range(len(df))]
-    df["composition"] = df.structure.apply(
-        lambda x: x.composition.formula.replace(" ", "")
+    dataset = CrystalGraphData(
+        df=df_matbench_phonons, elem_emb=elem_emb, task_dict=task_dict
     )
-
-    dataset = CrystalGraphData(df=df, elem_emb=elem_emb, task_dict=task_dict)
     n_targets = dataset.n_targets
     elem_emb_len = dataset.elem_emb_len
     nbr_fea_len = dataset.nbr_fea_dim
@@ -164,7 +146,3 @@ def test_cgcnn_regression():
     assert r2 > 0.7
     assert mae < 150
     assert rmse < 300
-
-
-if __name__ == "__main__":
-    test_cgcnn_regression()
diff --git a/tests/test_roost_classification.py b/tests/test_roost_classification.py
@@ -2,21 +2,15 @@
 
 import numpy as np
 import torch
-from matminer.utils.io import load_dataframe_from_json
 from sklearn.metrics import accuracy_score, roc_auc_score
 from sklearn.model_selection import train_test_split as split
 
 from aviary.roost.data import CompositionData, collate_batch
 from aviary.roost.model import Roost
 from aviary.utils import results_multitask, train_ensemble
 
-torch.manual_seed(0)  # ensure reproducible results
 
-
-def test_roost_clf():
-    data_path = os.path.join(
-        os.path.dirname(os.path.abspath(__file__)), "data/matbench_phonons.json.gz"
-    )
+def test_roost_clf(df_matbench_phonons):
     elem_emb = "matscholar200"
     targets = ["phdos_clf"]
     tasks = ["classification"]
@@ -41,21 +35,14 @@ def test_roost_clf():
     weight_decay = 1e-6
     batch_size = 128
     workers = 0
-    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+    device = "cuda" if torch.cuda.is_available() else "cpu"
 
     task_dict = dict(zip(targets, tasks))
     loss_dict = dict(zip(targets, losses))
 
-    assert os.path.exists(data_path), f"{data_path} does not exist!"
-
-    df = load_dataframe_from_json(data_path)
-    df["material_id"] = [f"mb_phdos_{i}" for i in range(len(df))]
-    df["composition"] = df.structure.apply(
-        lambda x: x.composition.formula.replace(" ", "")
+    dataset = CompositionData(
+        df=df_matbench_phonons, elem_emb=elem_emb, task_dict=task_dict
     )
-    df["phdos_clf"] = np.where((df["last phdos peak"] > 450), 1, 0)
-
-    dataset = CompositionData(df=df, elem_emb=elem_emb, task_dict=task_dict)
     n_targets = dataset.n_targets
     elem_emb_len = dataset.elem_emb_len
 
@@ -162,7 +149,3 @@ def test_roost_clf():
 
     assert ens_acc > 0.9
     assert ens_roc_auc > 0.9
-
-
-if __name__ == "__main__":
-    test_roost_clf()
diff --git a/tests/test_roost_regression.py b/tests/test_roost_regression.py
@@ -2,21 +2,15 @@
 
 import numpy as np
 import torch
-from matminer.utils.io import load_dataframe_from_json
 from sklearn.metrics import r2_score
 from sklearn.model_selection import train_test_split as split
 
 from aviary.roost.data import CompositionData, collate_batch
 from aviary.roost.model import Roost
 from aviary.utils import results_multitask, train_ensemble
 
-torch.manual_seed(0)  # ensure reproducible results
 
-
-def test_roost_regression():
-    data_path = os.path.join(
-        os.path.dirname(os.path.abspath(__file__)), "data/matbench_phonons.json.gz"
-    )
+def test_roost_regression(df_matbench_phonons):
     elem_emb = "matscholar200"
     targets = ["last phdos peak"]
     tasks = ["regression"]
@@ -41,20 +35,14 @@ def test_roost_regression():
     weight_decay = 1e-6
     batch_size = 128
     workers = 0
-    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+    device = "cuda" if torch.cuda.is_available() else "cpu"
 
     task_dict = dict(zip(targets, tasks))
     loss_dict = dict(zip(targets, losses))
 
-    assert os.path.exists(data_path), f"{data_path} does not exist!"
-
-    df = load_dataframe_from_json(data_path)
-    df["material_id"] = [f"mb_phdos_{i}" for i in range(len(df))]
-    df["composition"] = df.structure.apply(
-        lambda x: x.composition.formula.replace(" ", "")
+    dataset = CompositionData(
+        df=df_matbench_phonons, elem_emb=elem_emb, task_dict=task_dict
     )
-
-    dataset = CompositionData(df=df, elem_emb=elem_emb, task_dict=task_dict)
     n_targets = dataset.n_targets
     elem_emb_len = dataset.elem_emb_len
 
@@ -160,7 +148,3 @@ def test_roost_regression():
     assert r2 > 0.7
     assert mae < 150
     assert rmse < 300
-
-
-if __name__ == "__main__":
-    test_roost_regression()
diff --git a/tests/test_wren_classification.py b/tests/test_wren_classification.py
@@ -2,22 +2,15 @@
 
 import numpy as np
 import torch
-from matminer.utils.io import load_dataframe_from_json
 from sklearn.metrics import accuracy_score, roc_auc_score
 from sklearn.model_selection import train_test_split as split
 
 from aviary.utils import results_multitask, train_ensemble
 from aviary.wren.data import WyckoffData, collate_batch
 from aviary.wren.model import Wren
-from aviary.wren.utils import get_aflow_label_spglib
 
-torch.manual_seed(0)  # ensure reproducible results
 
-
-def test_wren_clf():
-    data_path = os.path.join(
-        os.path.dirname(os.path.abspath(__file__)), "data/matbench_phonons.json.gz"
-    )
+def test_wren_clf(df_matbench_phonons_wyckoff):
     elem_emb = "matscholar200"
     sym_emb = "bra-alg-off"
     targets = ["phdos_clf"]
@@ -44,23 +37,16 @@ def test_wren_clf():
     weight_decay = 1e-6
     batch_size = 128
     workers = 0
-    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+    device = "cuda" if torch.cuda.is_available() else "cpu"
 
     task_dict = dict(zip(targets, tasks))
     loss_dict = dict(zip(targets, losses))
 
-    assert os.path.exists(data_path), f"{data_path} does not exist!"
-
-    df = load_dataframe_from_json(data_path)
-    df["wyckoff"] = df.structure.apply(get_aflow_label_spglib)
-    df["material_id"] = [f"mb_phdos_{i}" for i in range(len(df))]
-    df["composition"] = df.structure.apply(
-        lambda x: x.composition.formula.replace(" ", "")
-    )
-    df["phdos_clf"] = np.where((df["last phdos peak"] > 450), 1, 0)
-
     dataset = WyckoffData(
-        df=df, elem_emb=elem_emb, sym_emb=sym_emb, task_dict=task_dict
+        df=df_matbench_phonons_wyckoff,
+        elem_emb=elem_emb,
+        sym_emb=sym_emb,
+        task_dict=task_dict,
     )
     n_targets = dataset.n_targets
     elem_emb_len = dataset.elem_emb_len
@@ -171,7 +157,3 @@ def test_wren_clf():
 
     assert ens_acc > 0.85
     assert ens_roc_auc > 0.9
-
-
-if __name__ == "__main__":
-    test_wren_clf()
diff --git a/tests/test_wren_regression.py b/tests/test_wren_regression.py
diff --git a/tests/test_wyckoff_ops.py b/tests/test_wyckoff_ops.py