Skip to content

Commit c59da30

Browse files
author
Sam Borms
authored
Merge pull request #109 from PythonPredictions/general-fixes-bis
General fixes bis
2 parents 3b71e4b + fca06da commit c59da30

18 files changed

Lines changed: 1445 additions & 808 deletions

cobra/evaluation/evaluator.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ class ClassificationEvaluator():
4040
cumulative_gains : tuple
4141
Data for plotting cumulative gains curve.
4242
evaluation_metrics : dict
43-
Map containing various scalar evaluation metrics (precision, recall, accuracy, AUC, F1, etc.)
43+
Map containing various scalar evaluation metrics (precision, recall, accuracy, AUC, F1, etc.).
4444
lift_at : float
4545
Parameter to determine at which top level percentage the lift of the
4646
model should be computed.
@@ -191,7 +191,7 @@ def plot_roc_curve(self, path: str=None, dim: tuple=(12, 8)):
191191
ax.set_xlabel("False Positive Rate", fontsize=15)
192192
ax.set_ylabel("True Positive Rate", fontsize=15)
193193
ax.legend(loc="lower right")
194-
ax.set_title("ROC Curve", fontsize=20)
194+
ax.set_title("ROC curve", fontsize=20)
195195

196196
if path:
197197
plt.savefig(path, format="png", dpi=300, bbox_inches="tight")
@@ -274,7 +274,7 @@ def plot_cumulative_response_curve(self, path: str=None, dim: tuple=(12, 8)):
274274
ax.grid(False)
275275

276276
# Description
277-
ax.set_title("Cumulative response", fontsize=20)
277+
ax.set_title("Cumulative Response curve", fontsize=20)
278278

279279
if path is not None:
280280
plt.savefig(path, format="png", dpi=300, bbox_inches="tight")
@@ -323,7 +323,7 @@ def plot_lift_curve(self, path: str=None, dim: tuple=(12, 8)):
323323
ax.grid(False)
324324

325325
# Description
326-
ax.set_title("Cumulative Lift", fontsize=20)
326+
ax.set_title("Cumulative Lift curve", fontsize=20)
327327

328328
if path is not None:
329329
plt.savefig(path, format="png", dpi=300, bbox_inches="tight")
@@ -350,7 +350,7 @@ def plot_cumulative_gains(self, path: str=None, dim: tuple=(12, 8)):
350350
ax.plot(ax.get_xlim(), ax.get_ylim(), linewidth=3,
351351
ls="--", color="darkorange", label="random selection")
352352

353-
ax.set_title("Cumulative Gains", fontsize=20)
353+
ax.set_title("Cumulative Gains curve", fontsize=20)
354354

355355
# Format axes
356356
ax.set_xlim([0, 100])
@@ -681,7 +681,7 @@ def plot_predictions(self, path: str=None, dim: tuple=(12, 8)):
681681
ax.set_xlabel("Index", fontsize=15)
682682
ax.set_ylabel("Value", fontsize=15)
683683
ax.legend(loc="best")
684-
ax.set_title("Prediction Plot", fontsize=20)
684+
ax.set_title("Predictions vs. Actuals", fontsize=20)
685685

686686
if path:
687687
plt.savefig(path, format="png", dpi=300, bbox_inches="tight")
@@ -722,7 +722,7 @@ def plot_qq(self, path: str=None, dim: tuple=(12, 8)):
722722
ax.set_yticks(range(int(np.floor(min(y))), int(np.ceil(max(y[x < float("inf")])))+1, 1))
723723

724724
ax.legend(loc="best")
725-
ax.set_title("Q-Q Plot", fontsize=20)
725+
ax.set_title("Q-Q plot", fontsize=20)
726726

727727
if path:
728728
plt.savefig(path, format="png", dpi=300, bbox_inches="tight")

cobra/evaluation/pigs_tables.py

Lines changed: 16 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -180,21 +180,19 @@ def plot_incidence(pig_tables: pd.DataFrame,
180180
ax.yaxis.set_major_formatter(
181181
FuncFormatter(lambda y, _: '{:.1%}'.format(y)))
182182
elif model_type == "regression":
183-
# If both the difference between the highest avg target of all bins
184-
# versus the global avg target AND the difference between the
185-
# lowest avg target versus the global avg target are both smaller
186-
# than 25% of the global avg target itself, we increase the y
187-
# axis range, to avoid that the minor avg target differences are
188-
# spread out over the configure figure height, suggesting
189-
# incorrectly that there are big differences in avg target across
190-
# the bins and versus the global avg target.
183+
# If the difference between the highest avg. target of all bins
184+
# versus the global avg. target AND the difference between the
185+
# lowest avg. target versus the global avg. target are both smaller
186+
# than 25% of the global avg. target itself, we increase the
187+
# y-axis range, so that the minor avg. target differences are not
188+
# spread out over the configured figure height, suggesting
189+
# incorrectly that there are big differences in avg. target across
190+
# the bins and versus the global avg. target.
191191
# (Motivation for the AND above: if on one end there IS enough
192192
# difference, the effect that we discuss here does not occur.)
193-
global_avg_target = max(df_plot['global_avg_target']) # series of same number, for every bin.
194-
if (np.abs((max(df_plot['avg_target']) - global_avg_target))
195-
/ global_avg_target < 0.25) \
196-
and (np.abs((min(df_plot['avg_target']) - global_avg_target))
197-
/ global_avg_target < 0.25):
193+
global_avg_target = max(df_plot['global_avg_target']) # series of same number, for every bin.
194+
if ((np.abs((max(df_plot['avg_target']) - global_avg_target)) / global_avg_target < 0.25)
195+
and (np.abs((min(df_plot['avg_target']) - global_avg_target)) / global_avg_target < 0.25)):
198196
ax.set_ylim(global_avg_target * 0.75,
199197
global_avg_target * 1.25)
200198

@@ -234,9 +232,9 @@ def plot_incidence(pig_tables: pd.DataFrame,
234232
title = "Incidence plot - " + variable
235233
else:
236234
title = "Mean target plot - " + variable
237-
fig.suptitle(title, fontsize=22, y=1.02)
235+
fig.suptitle(title, fontsize=22)
238236
ax.legend(frameon=False, bbox_to_anchor=(0., 1.01, 1., .102),
239-
loc=3, ncol=3, mode="expand", borderaxespad=0.,
237+
loc=3, ncol=1, mode="expand", borderaxespad=0.,
240238
prop={"size": 14})
241239

242240
# Set order of layers
@@ -245,5 +243,8 @@ def plot_incidence(pig_tables: pd.DataFrame,
245243

246244
del df_plot
247245

246+
plt.tight_layout()
247+
plt.margins(0.01)
248+
248249
# Show
249250
plt.show()

cobra/model_building/forward_selection.py

Lines changed: 39 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ class ForwardFeatureSelection:
2929
more or less with the maximum number of steps in the forward feature
3030
selection.
3131
pos_only : bool
32-
Whether or not the model coefficients should all be positive.
32+
Whether or not the model coefficients should all be positive (no sign flips).
3333
self._fitted_models : list
3434
List of fitted models.
3535
"""
@@ -76,8 +76,7 @@ def get_model_from_step(self, step: int):
7676

7777
def compute_model_performances(self, data: pd.DataFrame,
7878
target_column_name: str,
79-
splits: list = ["train", "selection",
80-
"validation"]
79+
splits: list=["train", "selection", "validation"]
8180
) -> pd.DataFrame:
8281
"""Compute for each model the performance for different sets (e.g.
8382
train-selection-validation) and return them along with a list of
@@ -111,7 +110,7 @@ def compute_model_performances(self, data: pd.DataFrame,
111110
"last_added_predictor": list(last_added_predictor)[0]
112111
}
113112

114-
# Evaluate model on each data set split,
113+
# Evaluate model on each dataset split,
115114
# e.g. train-selection-validation
116115
tmp.update({
117116
f"{split}_performance": model.evaluate(
@@ -139,7 +138,11 @@ def fit(self, train_data: pd.DataFrame, target_column_name: str,
139138
Parameters
140139
----------
141140
train_data : pd.DataFrame
142-
Data on which to fit the model.
141+
Data on which to fit the model. Should include a "train"
142+
and "selection" split for correct model selection! The
143+
"train" split is used to train a model, the "selection"
144+
split is used to evaluate which model to include in the
145+
actual forward feature selection.
143146
target_column_name : str
144147
Name of the target column.
145148
predictors : list
@@ -155,6 +158,12 @@ def fit(self, train_data: pd.DataFrame, target_column_name: str,
155158
In case the number of forced predictors is larger than the maximum
156159
number of allowed predictors in the model.
157160
"""
161+
162+
assert "split" in train_data.columns, "The train_data input df does not include a split column."
163+
print(train_data["split"].unique())
164+
assert len(set(["train", "selection"]).difference(set(train_data["split"].unique()))) == 0, \
165+
"The train_data input df does not include a 'train' and 'selection' split."
166+
158167
# remove excluded predictors from predictor lists
159168
filtered_predictors = [var for var in predictors
160169
if (var not in excluded_predictors and
@@ -163,13 +172,13 @@ def fit(self, train_data: pd.DataFrame, target_column_name: str,
163172
# checks on predictor lists and self.max_predictors attr
164173
if len(forced_predictors) > self.max_predictors:
165174
raise ValueError("Size of forced_predictors cannot be bigger than "
166-
"max_predictors")
175+
"max_predictors.")
167176
elif len(forced_predictors) == self.max_predictors:
168177
log.info("Size of forced_predictors equals max_predictors "
169178
"only one model will be trained...")
170179
# train model with all forced_predictors (only)
171180
(self._fitted_models
172-
.append(self._train_model(train_data,
181+
.append(self._train_model(train_data[train_data["split"] == "train"],
173182
target_column_name,
174183
forced_predictors)))
175184
else:
@@ -178,12 +187,14 @@ def fit(self, train_data: pd.DataFrame, target_column_name: str,
178187
filtered_predictors,
179188
forced_predictors)
180189

181-
def _forward_selection(self, train_data: pd.DataFrame,
182-
target_column_name: str, predictors: list,
190+
def _forward_selection(self,
191+
train_data: pd.DataFrame,
192+
target_column_name: str,
193+
predictors: list,
183194
forced_predictors: list = []) -> list:
184195
"""Perform the forward feature selection algorithm to compute a list
185196
of models (with increasing performance). The length of the list,
186-
i.e. the number of models is bounded by the max_predictors class
197+
i.e. the number of models, is bounded by the max_predictors class
187198
attribute.
188199
189200
Parameters
@@ -208,10 +219,11 @@ def _forward_selection(self, train_data: pd.DataFrame,
208219

209220
max_steps = 1 + min(self.max_predictors,
210221
len(predictors) + len(forced_predictors))
222+
211223
for step in tqdm(range(1, max_steps), desc="Sequentially adding best "
212224
"predictor..."):
213225
if step <= len(forced_predictors):
214-
# first, we go through forced predictors
226+
# first, we go through the forced predictors
215227
candidate_predictors = [var for var in forced_predictors
216228
if var not in current_predictors]
217229
else:
@@ -230,13 +242,19 @@ def _forward_selection(self, train_data: pd.DataFrame,
230242
.union(set(model.predictors)))
231243

232244
fitted_models.append(model)
245+
# else:
246+
# # If model returns None for the first time,
247+
# # one can in theory stop the feature selection process
248+
# # but we let it run so that tqdm finishes cleanly
249+
# break
233250

234251
if not fitted_models:
235-
log.error("No models found in forward selection")
252+
log.error("No models found in forward selection.")
236253

237254
return fitted_models
238255

239-
def _find_next_best_model(self, train_data: pd.DataFrame,
256+
def _find_next_best_model(self,
257+
train_data: pd.DataFrame,
240258
target_column_name: str,
241259
candidate_predictors: list,
242260
current_predictors: list):
@@ -272,15 +290,19 @@ def _find_next_best_model(self, train_data: pd.DataFrame,
272290
"for the given model_type specified as "
273291
"ForwardFeatureSelection argument.")
274292

293+
fit_data = train_data[train_data["split"] == "train"] # data to fit the models with
294+
sel_data = train_data[train_data["split"] == "selection"] # data to compare the models with
295+
275296
for pred in candidate_predictors:
276297
# Train a model with an additional predictor
277-
model = self._train_model(train_data, target_column_name,
298+
model = self._train_model(fit_data, target_column_name,
278299
(current_predictors + [pred]))
300+
279301
# Evaluate the model
280302
performance = (model
281-
.evaluate(train_data[current_predictors + [pred]],
282-
train_data[target_column_name],
283-
split="train"))
303+
.evaluate(sel_data[current_predictors + [pred]],
304+
sel_data[target_column_name],
305+
split="selection"))
284306

285307
if self.pos_only and (not (model.get_coef() >= 0).all()):
286308
continue

cobra/model_building/models.py

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -83,12 +83,12 @@ def deserialize(self, model_dict: dict):
8383
self._eval_metrics_by_split = model_dict["_eval_metrics_by_split"]
8484

8585
def get_coef(self) -> np.array:
86-
"""Returns the model coefficients
86+
"""Returns the model coefficients.
8787
8888
Returns
8989
-------
9090
np.array
91-
array of model coefficients
91+
Array of model coefficients.
9292
"""
9393
return self.logit.coef_[0]
9494

@@ -157,7 +157,7 @@ def evaluate(self, X: pd.DataFrame, y: pd.Series,
157157
y : pd.Series
158158
Dataset containing the target of each observation.
159159
split : str, optional
160-
Split of the dataset (e.g. train-selection-validation).
160+
Split name of the dataset (e.g. "train", "selection", or "validation").
161161
162162
Returns
163163
-------
@@ -249,12 +249,12 @@ def __init__(self):
249249
self._eval_metrics_by_split = {}
250250

251251
def serialize(self) -> dict:
252-
"""Serialize model as JSON
252+
"""Serialize model as JSON.
253253
254254
Returns
255255
-------
256256
dict
257-
dictionary containing the serialized JSON
257+
Dictionary containing the serialized JSON.
258258
"""
259259
serialized_model = {
260260
"meta": "linear-regression",
@@ -272,7 +272,7 @@ def serialize(self) -> dict:
272272
return serialized_model
273273

274274
def deserialize(self, model_dict: dict):
275-
"""Deserialize a model previously stored as JSON
275+
"""Deserialize a model previously stored as JSON.
276276
277277
Parameters
278278
----------
@@ -282,7 +282,7 @@ def deserialize(self, model_dict: dict):
282282
Raises
283283
------
284284
ValueError
285-
In case JSON file is no valid serialized model
285+
In case the JSON file is not a valid serialized model.
286286
"""
287287

288288
if not self._is_valid_dict(model_dict):
@@ -296,37 +296,37 @@ def deserialize(self, model_dict: dict):
296296
self._eval_metrics_by_split = model_dict["_eval_metrics_by_split"]
297297

298298
def get_coef(self) -> np.array:
299-
"""Returns the model coefficients
299+
"""Returns the model coefficients.
300300
301301
Returns
302302
-------
303303
np.array
304-
array of model coefficients
304+
Array of model coefficients.
305305
"""
306-
return self.linear.coef_[0]
306+
return self.linear.coef_
307307

308308
def get_intercept(self) -> float:
309-
"""Returns the intercept of the model
309+
"""Returns the intercept of the model.
310310
311311
Returns
312312
-------
313313
float
314-
intercept of the model
314+
Intercept of the model.
315315
"""
316316
return self.linear.intercept_[0]
317317

318318
def get_coef_by_predictor(self) -> dict:
319-
"""Returns a dictionary mapping predictor (key) to coefficient (value)
319+
"""Returns a dictionary mapping predictor (key) to coefficient (value).
320320
321321
Returns
322322
-------
323323
dict
324-
map ``{predictor: coefficient}``
324+
A map ``{predictor: coefficient}``.
325325
"""
326-
return dict(zip(self.predictors, self.linear.coef_[0]))
326+
return dict(zip(self.predictors, self.linear.coef_))
327327

328328
def fit(self, X_train: pd.DataFrame, y_train: pd.Series):
329-
"""Fit the model
329+
"""Fit the model.
330330
331331
Parameters
332332
----------
@@ -370,7 +370,7 @@ def evaluate(self, X: pd.DataFrame, y: pd.Series,
370370
y : pd.Series
371371
Dataset containing the target of each observation.
372372
split : str, optional
373-
Split of the dataset (e.g. train-selection-validation).
373+
Split name of the dataset (e.g. "train", "selection", or "validation").
374374
375375
Returns
376376
-------

0 commit comments

Comments
 (0)