Skip to content

Commit 4e4aeea

Browse files
author
Sander Vanden Hautte
committed
Merge branch 'develop' into fix/discretizer_inf
# Conflicts: # cobra/preprocessing/kbins_discretizer.py
2 parents 9eb1afd + 2b5a107 commit 4e4aeea

17 files changed

Lines changed: 353 additions & 165 deletions
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
# Runs CI when pushing to develop branch
2+
# runs pylint and pytest
3+
4+
name: CI_develop_action
5+
6+
on:
7+
push:
8+
branches: [ develop ]
9+
pull_request:
10+
branches: [ develop ]
11+
12+
jobs:
13+
build:
14+
15+
runs-on: ubuntu-latest
16+
17+
steps:
18+
- uses: actions/checkout@v2
19+
20+
- name: Set up Python 3.8
21+
uses: actions/setup-python@v2
22+
with:
23+
python-version: 3.8
24+
25+
- name: Install dependencies
26+
run: |
27+
python -m pip install --upgrade pip
28+
python -m pip install -r requirements.txt
29+
python -m pip install pylint pytest pytest-mock pytest-cov
30+
31+
- name: Test with pytest
32+
run: |
33+
pytest --cov=cobra tests/
34+
35+
# until we refactor accordingly
36+
#- name: Lint check with pylint
37+
# run: |
38+
# pylint cobra

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
#Ignoired directories in root folder
1+
#Ignored directories in root folder
22

33

44
# Byte-compiled / optimized / DLL files
@@ -109,3 +109,4 @@ ENV/
109109
# Other ignore files
110110
*.pptx
111111
*.ppt
112+
.idea/

README.rst

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,22 @@
1+
2+
3+
.. image:: https://img.shields.io/pypi/v/pythonpredictions-cobra.svg
4+
:target: https://pypi.org/project/pythonpredictions-cobra/
5+
.. image:: https://img.shields.io/pypi/dm/pythonpredictions-cobra.svg
6+
:target: https://pypistats.org/packages/pythonpredictions-cobra
7+
.. image:: https://github.com/PythonPredictions/cobra/actions/workflows/development_CI.yaml/badge.svg?branch=develop
8+
:target: https://github.com/PythonPredictions/cobra/actions/workflows/development_CI.yaml
9+
10+
------------------------------------------------------------------------------------------------------------------------------------
11+
112
=====
213
cobra
314
=====
415

5-
**cobra** is a Python package to build predictive models using linear/logistic regression with a focus on performance and interpretation. It consists of several modules for data preprocessing, feature selection and model evaluation. The underlying methodology was developed at Python Predictions in the course of hundreds of business-related prediction challenges. It has been tweaked, tested and optimized over the years based on feedback from clients, our team, and academic researchers.
16+
.. image:: material/logo.png
17+
:width: 300
618

19+
**cobra** is a Python package to build predictive models using linear/logistic regression with a focus on performance and interpretation. It consists of several modules for data preprocessing, feature selection and model evaluation. The underlying methodology was developed at Python Predictions in the course of hundreds of business-related prediction challenges. It has been tweaked, tested and optimized over the years based on feedback from clients, our team, and academic researchers.
720

821
Main Features
922
=============
@@ -70,3 +83,8 @@ Documentation
7083

7184
- HTML documentation of the `individual modules <https://pythonpredictions.github.io/cobra.io/docstring/modules.html>`_
7285
- A step-by-step `tutorial <https://pythonpredictions.github.io/cobra.io/tutorial.html>`_
86+
87+
Outreach
88+
-------------
89+
90+
- Check out the Data Science Leuven Meetup `talk <https://www.youtube.com/watch?v=w7ceZZqMEaA&feature=youtu.be>`_ by one of the core developers (second presentation)

cobra/evaluation/evaluator.py

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -35,15 +35,20 @@ class Evaluator():
3535
probability_cutoff : float
3636
probability cut off to convert probability scores to a binary score
3737
roc_curve : dict
38-
map containing true-positive-rate, false-positve-rate at various
38+
map containing true-positive-rate, false-positive-rate at various
3939
thresholds (also incl.)
40+
n_bins : int, optional
41+
number of bins used to calculate the lift curve
42+
(by default 10, so deciles)
4043
"""
4144

4245
def __init__(self, probability_cutoff: float=None,
43-
lift_at: float=0.05):
46+
lift_at: float=0.05,
47+
n_bins: int = 10):
4448

4549
self.lift_at = lift_at
4650
self.probability_cutoff = probability_cutoff
51+
self.n_bins = n_bins
4752

4853
# Placeholder to store fitted output
4954
self.scalar_metrics = None
@@ -85,7 +90,7 @@ def fit(self, y_true: np.ndarray, y_pred: np.ndarray):
8590

8691
self.roc_curve = {"fpr": fpr, "tpr": tpr, "thresholds": thresholds}
8792
self.confusion_matrix = confusion_matrix(y_true, y_pred_b)
88-
self.lift_curve = Evaluator._compute_lift_per_decile(y_true, y_pred)
93+
self.lift_curve = Evaluator._compute_lift_per_bin(y_true, y_pred, self.n_bins)
8994
self.cumulative_gains = Evaluator._compute_cumulative_gains(y_true,
9095
y_pred)
9196

@@ -199,8 +204,7 @@ def plot_confusion_matrix(self, path: str=None, dim: tuple=(12, 8),
199204

200205
plt.show()
201206

202-
def plot_cumulative_response_curve(self, path: str=None,
203-
dim: tuple=(12, 8)):
207+
def plot_cumulative_response_curve(self, path: str=None, dim: tuple=(12, 8)):
204208
"""Plot cumulative response curve
205209
206210
Parameters
@@ -430,17 +434,21 @@ def _compute_cumulative_gains(y_true: np.ndarray,
430434
return percentages, gains
431435

432436
@staticmethod
433-
def _compute_lift_per_decile(y_true: np.ndarray,
434-
y_pred: np.ndarray) -> tuple:
435-
"""Compute lift of the model per decile, returns x-labels, lifts and
436-
the target incidence to create cummulative response curves
437+
def _compute_lift_per_bin(y_true: np.ndarray,
438+
y_pred: np.ndarray,
439+
n_bins: int = 10) -> tuple:
440+
"""Compute lift of the model for a given number of bins, returns x-labels,
441+
lifts and the target incidence to create cumulative response curves
437442
438443
Parameters
439444
----------
440445
y_true : np.ndarray
441446
True binary target data labels
442447
y_pred : np.ndarray
443448
Target scores of the model
449+
n_bins : int, optional
450+
number of bins used to calculate the lift curve
451+
(by default 10, so deciles)
444452
445453
Returns
446454
-------
@@ -451,7 +459,7 @@ def _compute_lift_per_decile(y_true: np.ndarray,
451459
lifts = [Evaluator._compute_lift(y_true=y_true,
452460
y_pred=y_pred,
453461
lift_at=perc_lift)
454-
for perc_lift in np.arange(0.1, 1.1, 0.1)]
462+
for perc_lift in np.linspace(1/n_bins, 1, num=n_bins, endpoint=True)]
455463

456464
x_labels = [len(lifts)-x for x in np.arange(0, len(lifts), 1)]
457465

cobra/evaluation/plotting_utils.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,17 +14,17 @@ def plot_univariate_predictor_quality(df_auc: pd.DataFrame,
1414
Parameters
1515
----------
1616
df_auc : pd.DataFrame
17-
Contains for each variable the train auc and selection auc allong with
17+
Contains for each variable the train auc and selection auc along with
1818
a boolean indicating whether or not it is selected based on the
1919
criteria
2020
dim : tuple, optional
21-
tuple with width and lentgh of the plot
21+
tuple with width and length of the plot
2222
path : str, optional
2323
path to store the figure
2424
"""
2525

2626
df = (df_auc[df_auc["preselection"]]
27-
.sort_values(by='AUC train', ascending=False))
27+
.sort_values(by='AUC selection', ascending=False))
2828

2929
df = pd.melt(df, id_vars=["predictor"],
3030
value_vars=["AUC train", "AUC selection"],
@@ -60,7 +60,7 @@ def plot_correlation_matrix(df_corr: pd.DataFrame,
6060
df_corr : pd.DataFrame
6161
Correlation matrix
6262
dim : tuple, optional
63-
tuple with width and lentgh of the plot
63+
tuple with width and length of the plot
6464
path : str, optional
6565
path to store the figure
6666
"""
@@ -89,7 +89,7 @@ def plot_performance_curves(model_performance: pd.DataFrame,
8989
contains train-selection-validation performance for each model trained
9090
in the forward feature selection
9191
dim : tuple, optional
92-
tuple with width and lentgh of the plot
92+
tuple with width and length of the plot
9393
path : str, optional
9494
path to store the figure
9595
"""
@@ -141,7 +141,7 @@ def plot_variable_importance(df_variable_importance: pd.DataFrame,
141141
title : str, optional
142142
Title of the plot
143143
dim : tuple, optional
144-
tuple with width and lentgh of the plot
144+
tuple with width and length of the plot
145145
path : str, optional
146146
path to store the figure
147147
"""

cobra/model_building/forward_selection.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
import logging
2-
log = logging.getLogger(__name__)
32

43
import pandas as pd
4+
from tqdm.auto import tqdm
55

66
from cobra.model_building import LogisticRegressionModel as MLModel
77

8+
log = logging.getLogger(__name__)
9+
810

911
class ForwardFeatureSelection:
1012

@@ -159,7 +161,7 @@ def fit(self, train_data: pd.DataFrame, target_column_name: str,
159161
def _forward_selection(self, train_data: pd.DataFrame,
160162
target_column_name: str, predictors: list,
161163
forced_predictors: list=[]) -> list:
162-
"""Perform the forward feature selection algoritm to compute a list
164+
"""Perform the forward feature selection algorithm to compute a list
163165
of models (with increasing performance?). The length of the list,
164166
i.e. the number of models is bounded by the max_predictors class
165167
attribute.
@@ -186,7 +188,8 @@ def _forward_selection(self, train_data: pd.DataFrame,
186188

187189
max_steps = 1 + min(self.max_predictors,
188190
len(predictors) + len(forced_predictors))
189-
for step in range(1, max_steps):
191+
for step in tqdm(range(1, max_steps), desc="Sequentially adding best "
192+
"predictor..."):
190193
if step <= len(forced_predictors):
191194
# first, we go through forced predictors
192195
candidate_predictors = [var for var in forced_predictors

cobra/preprocessing/categorical_data_processor.py

Lines changed: 31 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -17,18 +17,18 @@
1717
# standard lib imports
1818
import re
1919
from typing import Optional
20-
2120
import logging
22-
log = logging.getLogger(__name__)
2321

2422
# third party imports
2523
import numpy as np
2624
import pandas as pd
2725
from scipy import stats
28-
26+
from tqdm.auto import tqdm
2927
from sklearn.base import BaseEstimator
3028
from sklearn.exceptions import NotFittedError
3129

30+
log = logging.getLogger(__name__)
31+
3232

3333
class CategoricalDataProcessor(BaseEstimator):
3434
"""
@@ -58,12 +58,12 @@ class CategoricalDataProcessor(BaseEstimator):
5858
"category_size_threshold", "p_value_threshold",
5959
"scale_contingency_table", "forced_categories"]
6060

61-
def __init__(self, regroup: bool=True, regroup_name: str="Other",
62-
keep_missing: bool=True,
63-
category_size_threshold: int=5,
64-
p_value_threshold: float=0.001,
65-
scale_contingency_table: bool=True,
66-
forced_categories: dict={}):
61+
def __init__(self, regroup: bool = True, regroup_name: str = "Other",
62+
keep_missing: bool = True,
63+
category_size_threshold: int = 5,
64+
p_value_threshold: float = 0.001,
65+
scale_contingency_table: bool = True,
66+
forced_categories: dict = {}):
6767

6868
self.regroup = regroup
6969
self.regroup_name = regroup_name
@@ -149,7 +149,8 @@ def fit(self, data: pd.DataFrame, column_names: list,
149149
log.info("regroup was set to False, so no fitting is required")
150150
return None
151151

152-
for column_name in column_names:
152+
for column_name in tqdm(column_names, desc="Fitting category "
153+
"regrouping..."):
153154

154155
if column_name not in data.columns:
155156
log.warning("DataFrame has no column '{}', so it will be "
@@ -310,7 +311,8 @@ def _transform_column(self, data: pd.DataFrame,
310311
data.loc[:, column_name_clean] = (CategoricalDataProcessor
311312
._replace_categories(
312313
data[column_name_clean],
313-
categories))
314+
categories,
315+
self.regroup_name))
314316

315317
# change data to categorical
316318
data.loc[:, column_name_clean] = (data[column_name_clean]
@@ -371,7 +373,7 @@ def _get_small_categories(predictor_series: pd.Series,
371373

372374
@staticmethod
373375
def _replace_missings(data: pd.DataFrame,
374-
column_names: Optional[list]=None) -> pd.DataFrame:
376+
column_names: Optional[list] = None) -> pd.DataFrame:
375377
"""Replace missing values (incl empty strings)
376378
377379
Parameters
@@ -403,23 +405,25 @@ def _replace_missings(data: pd.DataFrame,
403405
@staticmethod
404406
def _compute_p_value(X: pd.Series, y: pd.Series, category: str,
405407
scale_contingency_table: bool) -> float:
406-
"""Summary
408+
"""Calculates p-value in contingency table (chi-square test) in
409+
order to evaluate whether category of interest is significantly
410+
different from the rest of the categories, given the target variable.
407411
408412
Parameters
409413
----------
410414
X : pd.Series
411-
Description
415+
Variables data.
412416
y : pd.Series
413-
Description
417+
Target data.
414418
category : str
415-
Description
419+
Category for which we carry out the test
416420
scale_contingency_table : bool
417-
Description
421+
Whether we scale contingency table with incidence rate
418422
419423
Returns
420424
-------
421425
float
422-
Description
426+
p-value of chi-square test
423427
"""
424428
df = pd.concat([X, y], axis=1)
425429
df["other_categories"] = np.where(X == category, 0, 1)
@@ -439,20 +443,24 @@ def _compute_p_value(X: pd.Series, y: pd.Series, category: str,
439443
return stats.chi2_contingency(contigency_table, correction=False)[1]
440444

441445
@staticmethod
442-
def _replace_categories(data: pd.Series, categories: set) -> pd.Series:
446+
def _replace_categories(data: pd.Series, categories: set,
447+
replace_with: str) -> pd.Series:
443448
"""Replace categories not in the set with ``replace_with`` and transform the remaining
444449
categories to strings to avoid type errors later on in the pipeline
445450
446451
Parameters
447452
----------
448453
data : pd.Series
449-
Description
454+
Dataset which contains the variable to be replaced
450455
categories : set
451-
Description
456+
Cleaned categories.
457+
replace_with : str
458+
String to be used as replacement for category.
452459
453460
Returns
454461
-------
455462
pd.Series
456-
Description
463+
Series with replaced categories
457464
"""
458-
return data.apply(lambda x: str(x) if x in categories else "Other")
465+
return data.apply(
466+
lambda x: str(x) if x in categories else replace_with)

0 commit comments

Comments
 (0)