Skip to content

Commit eccb344

Browse files
author
sborms
committed
improve usage metric arg
1 parent 390fc1c commit eccb344

7 files changed

Lines changed: 120 additions & 43 deletions

File tree

cobra/evaluation/evaluator.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -88,13 +88,13 @@ def fit(self, y_true: np.ndarray, y_pred: np.ndarray):
8888
"""
8989
fpr, tpr, thresholds = roc_curve(y_true=y_true, y_score=y_pred)
9090

91-
# if probability_cutoff is not set, take the optimal cut off
91+
# if probability_cutoff is not set, take the optimal cut-off
9292
if not self.probability_cutoff:
9393
self.probability_cutoff = (ClassificationEvaluator.
9494
_compute_optimal_cutoff(fpr, tpr,
9595
thresholds))
9696

97-
# Transform probabilities to binary array using cut off:
97+
# Transform probabilities to binary array using cut-off
9898
y_pred_b = np.array([0 if pred <= self.probability_cutoff else 1
9999
for pred in y_pred])
100100

cobra/model_building/forward_selection.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -77,8 +77,8 @@ def get_model_from_step(self, step: int):
7777

7878
def compute_model_performances(self, data: pd.DataFrame,
7979
target_column_name: str,
80-
splits: list = ["train", "selection", "validation"],
81-
metric: Optional[Callable] = None,
80+
splits: list=["train", "selection", "validation"],
81+
metric: Optional[Callable]=None,
8282
) -> pd.DataFrame:
8383
"""Compute for each model the performance for different sets (e.g.
8484
train-selection-validation) and return them along with a list of
@@ -170,7 +170,6 @@ def fit(self, train_data: pd.DataFrame, target_column_name: str,
170170
"""
171171

172172
assert "split" in train_data.columns, "The train_data input df does not include a split column."
173-
print(train_data["split"].unique())
174173
assert len(set(["train", "selection"]).difference(set(train_data["split"].unique()))) == 0, \
175174
"The train_data input df does not include a 'train' and 'selection' split."
176175

cobra/model_building/models.py

Lines changed: 33 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,18 @@
11

2-
# third party imports
32
from typing import Callable, Optional
43

4+
# third party imports
55
import numpy as np
66
import pandas as pd
77
from scipy import stats
88
from sklearn.metrics import roc_auc_score, mean_squared_error
99
from numpy import sqrt
1010
from sklearn.linear_model import LogisticRegression, LinearRegression
11+
from sklearn.metrics import roc_curve
1112

1213
# custom imports
1314
import cobra.utils as utils
15+
from cobra.evaluation import ClassificationEvaluator
1416

1517
class LogisticRegressionModel:
1618
"""Wrapper around the LogisticRegression class, with additional methods
@@ -148,8 +150,8 @@ def score_model(self, X: pd.DataFrame) -> np.ndarray:
148150
def evaluate(self, X: pd.DataFrame, y: pd.Series,
149151
split: str=None,
150152
metric: Optional[Callable]=None) -> float:
151-
"""Evaluate the model on a given data set (X, y). The optional split
152-
parameter is to indicate that the data set belongs to
153+
"""Evaluate the model on a given dataset (X, y). The optional split
154+
parameter is to indicate that the dataset belongs to
153155
(train, selection, validation), so that the computation on these sets
154156
can be cached!
155157
@@ -164,7 +166,7 @@ def evaluate(self, X: pd.DataFrame, y: pd.Series,
164166
metric: Callable (function), optional
165167
Function that computes an evaluation metric to evaluate the model's
166168
performance, instead of the default metric (AUC).
167-
The function should require y_true and y_pred arguments.
169+
The function should accept y_true and y_pred arguments, where y_pred is the predicted probabilities binarized to 0/1 at the optimal cut-off.
168170
Metric functions from sklearn can be used, for example, see
169171
https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics.
170172
@@ -173,20 +175,25 @@ def evaluate(self, X: pd.DataFrame, y: pd.Series,
173175
float
174176
The performance score of the model (AUC by default).
175177
"""
178+
if metric is not None: # decouple from _eval_metrics_by_split attribute
179+
y_pred = self.score_model(X)
176180

177-
if (split is None) or (split not in self._eval_metrics_by_split):
181+
fpr, tpr, thresholds = roc_curve(y_true=y, y_score=y_pred)
182+
cutoff = (ClassificationEvaluator._compute_optimal_cutoff(fpr, tpr, thresholds))
183+
y_pred_b = np.array([0 if pred <= cutoff else 1 for pred in y_pred])
178184

179-
y_pred = self.score_model(X)
185+
performance = metric(y_true=y, y_pred=y_pred_b)
180186

181-
if metric is None:
187+
return performance
188+
else:
189+
if (split is None) or (split not in self._eval_metrics_by_split):
190+
y_pred = self.score_model(X)
182191
performance = roc_auc_score(y_true=y, y_score=y_pred)
183-
else:
184-
performance = metric(y_true=y, y_pred=y_pred)
185192

186-
if split is None:
187-
return performance
188-
else:
189-
self._eval_metrics_by_split[split] = performance
193+
if split is None:
194+
return performance
195+
else:
196+
self._eval_metrics_by_split[split] = performance
190197

191198
return self._eval_metrics_by_split[split]
192199

@@ -371,8 +378,8 @@ def score_model(self, X: pd.DataFrame) -> np.ndarray:
371378
def evaluate(self, X: pd.DataFrame, y: pd.Series,
372379
split: str=None,
373380
metric: Optional[Callable]=None) -> float:
374-
"""Evaluate the model on a given data set (X, y). The optional split
375-
parameter is to indicate that the data set belongs to
381+
"""Evaluate the model on a given dataset (X, y). The optional split
382+
parameter is to indicate that the dataset belongs to
376383
(train, selection, validation), so that the computation on these sets
377384
can be cached!
378385
@@ -396,19 +403,20 @@ def evaluate(self, X: pd.DataFrame, y: pd.Series,
396403
float
397404
The performance score of the model (RMSE by default).
398405
"""
399-
400-
if (split is None) or (split not in self._eval_metrics_by_split):
401-
406+
if metric is not None: # decouple from _eval_metrics_by_split attribute
402407
y_pred = self.score_model(X)
403-
if metric is None:
408+
performance = metric(y_true=y, y_pred=y_pred)
409+
410+
return performance
411+
else:
412+
if (split is None) or (split not in self._eval_metrics_by_split):
413+
y_pred = self.score_model(X)
404414
performance = sqrt(mean_squared_error(y_true=y, y_pred=y_pred))
405-
else:
406-
performance = metric(y_true=y, y_pred=y_pred)
407415

408-
if split is None:
409-
return performance
410-
else:
411-
self._eval_metrics_by_split[split] = performance
416+
if split is None:
417+
return performance
418+
else:
419+
self._eval_metrics_by_split[split] = performance
412420

413421
return self._eval_metrics_by_split[split]
414422

cobra/preprocessing/preprocessor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11

2-
# std lib imports
2+
# standard lib imports
33
import inspect
44
import time
55
import math

tests/model_building/test_forward_selection.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ def test_compute_model_performances(self, mocker, model_type):
6161
mock_model_num_pred(3, model_type=model_type)
6262
]
6363

64-
def mock_evaluate(self, X, y, split): # on AUC scale, but gives the same for RMSE as it is a mock
64+
def mock_evaluate(self, X, y, split, metric): # on AUC scale, but gives the same for RMSE as it is a mock
6565
if split == "train":
6666
return 0.612
6767
else:

tutorials/tutorial_Cobra_linear_regression.ipynb

Lines changed: 30 additions & 5 deletions
Large diffs are not rendered by default.

tutorials/tutorial_Cobra_logistic_regression.ipynb

Lines changed: 51 additions & 6 deletions
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)