add total std calculation and kwarg warn_target_mismatch to make_ensemble_predictions()

janosh · janosh · commit 4b48f59725ac · 2022-06-24T17:07:25.000+01:00
diff --git a/aviary/wrenformer/utils.py b/aviary/wrenformer/utils.py
@@ -102,6 +102,7 @@ def make_ensemble_predictions(
     model_class: type[BaseModelClass] = Wrenformer,
     device: str = None,
     print_metrics: bool = True,
+    warn_target_mismatch: bool = False,
 ) -> pd.DataFrame | tuple[pd.DataFrame, pd.DataFrame]:
     """Make predictions using an ensemble of Wrenformer models.
 
@@ -117,11 +118,15 @@ def make_ensemble_predictions(
             else "cpu".
         print_metrics (bool, optional): Whether to print performance metrics. Defaults to True
             if target_col is not None.
+        warn_target_mismatch (bool, optional): Whether to warn if target_col != target_name from
+            model checkpoint. Defaults to False.
 
     Returns:
         pd.DataFrame: Input dataframe with added columns for model and ensemble predictions. If
             target_col is not None, returns a 2nd dataframe containing model and ensemble metrics.
     """
+    # TODO: Add support for predicting all tasks a multi-task model was trained on. Currently only
+    # handles single targets.
     device = device or ("cuda" if torch.cuda.is_available() else "cpu")
 
     data_loader = df_to_in_mem_dataloader(
@@ -138,11 +143,12 @@ def make_ensemble_predictions(
         checkpoint = torch.load(checkpoint_path, map_location=device)
 
         model_params = checkpoint["model_params"]
-        target_name, task_type = next(model_params["task_dict"].items())
+        target_name, task_type = list(model_params["task_dict"].items())[0]
         assert task_type in ("regression", "classification"), f"invalid {task_type = }"
-        if target_name != target_col:
+        if target_name != target_col and warn_target_mismatch:
             print(
-                f"Warning: {target_col = } does not match {target_name = } in checkpoint."
+                f"Warning: {target_col = } does not match {target_name = } in checkpoint. "
+                "If this is intentional, pass warn_target_mismatch=False to disable this warning."
             )
         model = model_class(**model_params)
         model.to(device)
@@ -155,15 +161,22 @@ def make_ensemble_predictions(
         if model.robust:
             predictions, aleat_log_std = predictions.chunk(2, dim=1)
             aleat_std = aleat_log_std.exp().cpu().numpy().squeeze()
-            df[f"aleat_std_{idx}"] = aleat_std.tolist()
+            df[f"aleatoric_std_{idx}"] = aleat_std.tolist()
 
         predictions = predictions.cpu().numpy().squeeze()
         pred_col = f"{target_col}_pred_{idx}" if target_col else f"pred_{idx}"
         df[pred_col] = predictions.tolist()
 
     df_preds = df.filter(regex=r"_pred_\d")
     df[f"{target_col}_pred_ens"] = ensemble_preds = df_preds.mean(axis=1)
-    df[f"{target_col}_ens_epistemic_std"] = df_preds.std(axis=1)
+    df[f"{target_col}_epistemic_std_ens"] = epistemic_std = df_preds.std(axis=1)
+
+    if df.columns.str.startswith("aleatoric_std_").sum() > 0:
+        aleatoric_std = df.filter(regex=r"aleatoric_std_\d").mean(axis=1)
+        df[f"{target_col}_aleatoric_std_ens"] = aleatoric_std
+        df[f"{target_col}_total_std_ens"] = (
+            epistemic_std**2 + aleatoric_std**2
+        ) ** 0.5
 
     if target_col and print_metrics:
         targets = df[target_col]
@@ -175,12 +188,12 @@ def make_ensemble_predictions(
             index=df_preds.columns,
         )
 
-        print("Single model performance:")
-        print(all_model_metrics.describe().loc[["mean", "std"]])
+        print("\nSingle model performance:")
+        print(all_model_metrics.describe().round(4).loc[["mean", "std"]])
 
         ensemble_metrics = get_metrics(targets, ensemble_preds, task_type)
 
-        print("Ensemble performance:")
+        print("\nEnsemble performance:")
         for key, val in ensemble_metrics.items():
             print(f"{key:<8} {val:.3}")
         return df, all_model_metrics
diff --git a/examples/mp_wbm/use_trained_wrenformer_ensemble.py b/examples/mp_wbm/use_trained_wrenformer_ensemble.py
@@ -16,6 +16,7 @@
 the MP+WBM dataset and makes predictions on the test set, then prints ensemble metrics.
 """
 
+
 data_path = f"{ROOT}/datasets/2022-06-09-mp+wbm.json.gz"
 target_col = "e_form"
 test_size = 0.05
@@ -34,6 +35,10 @@
 
     runs = wandb_api.runs("aviary/mp-wbm", filters={"tags": {"$in": ["ensemble-id-2"]}})
 
+    print(
+        f"Loading checkpoints for the following run IDs:\n{', '.join(run.id for run in runs)}\n"
+    )
+
     checkpoint_paths: list[str] = []
     for run in runs:
         run_path = "/".join(run.path)
@@ -59,4 +64,18 @@
     checkpoint_paths, df=test_df, target_col=target_col
 )
 
-test_df.to_csv(f"{ROOT}/examples/mp_wbm/ensemble-test-predictions.csv")
+test_df.to_csv(f"{ROOT}/examples/mp_wbm/ensemble-predictions.csv")
+
+
+# print output:
+# Predicting with 10 model checkpoints(s)
+#
+# Single model performance:
+#          MAE    RMSE      R2
+# mean  0.0369  0.1218  0.9864
+# std   0.0005  0.0014  0.0003
+#
+# Ensemble performance:
+# MAE      0.0308
+# RMSE     0.118
+# R2       0.987