PythonPredictions
diff --git a/‎.github/workflows/development_CI.yaml‎
Lines changed: 38 additions & 0 deletions b/‎.github/workflows/development_CI.yaml‎
Lines changed: 38 additions & 0 deletions
diff --git a/‎.gitignore‎
Lines changed: 2 additions & 1 deletion b/‎.gitignore‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎README.rst‎
Lines changed: 19 additions & 1 deletion b/‎README.rst‎
Lines changed: 19 additions & 1 deletion
diff --git a/‎cobra/evaluation/evaluator.py‎
Lines changed: 18 additions & 10 deletions b/‎cobra/evaluation/evaluator.py‎
Lines changed: 18 additions & 10 deletions
diff --git a/‎cobra/evaluation/plotting_utils.py‎
Lines changed: 6 additions & 6 deletions b/‎cobra/evaluation/plotting_utils.py‎
Lines changed: 6 additions & 6 deletions
diff --git a/‎cobra/model_building/forward_selection.py‎
Lines changed: 6 additions & 3 deletions b/‎cobra/model_building/forward_selection.py‎
Lines changed: 6 additions & 3 deletions
diff --git a/‎cobra/preprocessing/categorical_data_processor.py‎
Lines changed: 31 additions & 23 deletions b/‎cobra/preprocessing/categorical_data_processor.py‎
Lines changed: 31 additions & 23 deletions
@@ -0,0 +1,38 @@
+# Runs CI on pushes and pull requests targeting the develop branch
+# runs pytest (the pylint step below is disabled until the code is refactored)
+
+name: CI_develop_action
+
+on:
+  push:
+    branches: [ develop ]
+  pull_request:
+    branches: [ develop ]
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v2
+
+    - name: Set up Python 3.8
+      uses: actions/setup-python@v2
+      with:
+        python-version: 3.8
+
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        python -m pip install -r requirements.txt
+        python -m pip install pylint pytest pytest-mock pytest-cov
+
+    - name: Test with pytest
+      run: |
+        pytest --cov=cobra tests/
+        
+    # until we refactor accordingly
+    #- name: Lint check with pylint
+    #  run: |
+    #    pylint cobra
@@ -1,4 +1,4 @@
-#Ignoired directories in root folder
+#Ignored directories in root folder
 
 
 # Byte-compiled / optimized / DLL files
@@ -109,3 +109,4 @@ ENV/
 # Other ignore files
 *.pptx
 *.ppt
+.idea/
@@ -1,9 +1,22 @@
+
+
+.. image:: https://img.shields.io/pypi/v/pythonpredictions-cobra.svg
+    :target: https://pypi.org/project/pythonpredictions-cobra/
+.. image:: https://img.shields.io/pypi/dm/pythonpredictions-cobra.svg
+    :target: https://pypistats.org/packages/pythonpredictions-cobra
+.. image:: https://github.com/PythonPredictions/cobra/actions/workflows/development_CI.yaml/badge.svg?branch=develop
+    :target: https://github.com/PythonPredictions/cobra/actions/workflows/development_CI.yaml
+
+------------------------------------------------------------------------------------------------------------------------------------ 
+
 =====
 cobra
 =====
 
-**cobra** is a Python package to build predictive models using linear/logistic regression with a focus on performance and interpretation. It consists of several modules for data preprocessing, feature selection and model evaluation. The underlying methodology was developed at Python Predictions in the course of hundreds of business-related prediction challenges. It has been tweaked, tested and optimized over the years based on feedback from clients, our team, and academic researchers.
+.. image:: material/logo.png
+    :width: 300
 
+**cobra** is a Python package to build predictive models using linear/logistic regression with a focus on performance and interpretation. It consists of several modules for data preprocessing, feature selection and model evaluation. The underlying methodology was developed at Python Predictions in the course of hundreds of business-related prediction challenges. It has been tweaked, tested and optimized over the years based on feedback from clients, our team, and academic researchers.
 
 Main Features
 =============
@@ -70,3 +83,8 @@ Documentation
 
 - HTML documentation of the `individual modules <https://pythonpredictions.github.io/cobra.io/docstring/modules.html>`_
 - A step-by-step `tutorial <https://pythonpredictions.github.io/cobra.io/tutorial.html>`_
+
+Outreach
+-------------
+
+- Check out the Data Science Leuven Meetup `talk <https://www.youtube.com/watch?v=w7ceZZqMEaA&feature=youtu.be>`_ by one of the core developers (second presentation)
@@ -35,15 +35,20 @@ class Evaluator():
     probability_cutoff : float
         probability cut off to convert probability scores to a binary score
     roc_curve : dict
-        map containing true-positive-rate, false-positve-rate at various
+        map containing true-positive-rate, false-positive-rate at various
         thresholds (also incl.)
+    n_bins : int, optional
+        number of bins used to compute the lift curve
+        (by default 10, so deciles)
     """
 
     def __init__(self, probability_cutoff: float=None,
-                 lift_at: float=0.05):
+                 lift_at: float=0.05,
+                 n_bins: int = 10):
 
         self.lift_at = lift_at
         self.probability_cutoff = probability_cutoff
+        self.n_bins = n_bins
 
         # Placeholder to store fitted output
         self.scalar_metrics = None
@@ -85,7 +90,7 @@ def fit(self, y_true: np.ndarray, y_pred: np.ndarray):
 
         self.roc_curve = {"fpr": fpr, "tpr": tpr, "thresholds": thresholds}
         self.confusion_matrix = confusion_matrix(y_true, y_pred_b)
-        self.lift_curve = Evaluator._compute_lift_per_decile(y_true, y_pred)
+        self.lift_curve = Evaluator._compute_lift_per_bin(y_true, y_pred, self.n_bins)
         self.cumulative_gains = Evaluator._compute_cumulative_gains(y_true,
                                                                     y_pred)
 
@@ -199,8 +204,7 @@ def plot_confusion_matrix(self, path: str=None, dim: tuple=(12, 8),
 
         plt.show()
 
-    def plot_cumulative_response_curve(self, path: str=None,
-                                       dim: tuple=(12, 8)):
+    def plot_cumulative_response_curve(self, path: str=None, dim: tuple=(12, 8)):
         """Plot cumulative response curve
 
         Parameters
@@ -430,17 +434,21 @@ def _compute_cumulative_gains(y_true: np.ndarray,
         return percentages, gains
 
     @staticmethod
-    def _compute_lift_per_decile(y_true: np.ndarray,
-                                 y_pred: np.ndarray) -> tuple:
-        """Compute lift of the model per decile, returns x-labels, lifts and
-        the target incidence to create cummulative response curves
+    def _compute_lift_per_bin(y_true: np.ndarray,
+                              y_pred: np.ndarray,
+                              n_bins: int = 10) -> tuple:
+        """Compute lift of the model for a given number of bins, returns x-labels,
+        lifts and the target incidence to create cumulative response curves
 
         Parameters
         ----------
         y_true : np.ndarray
             True binary target data labels
         y_pred : np.ndarray
             Target scores of the model
+        n_bins : int, optional
+            number of bins used to compute the lift curve
+            (by default 10, so deciles)
 
         Returns
         -------
@@ -451,7 +459,7 @@ def _compute_lift_per_decile(y_true: np.ndarray,
         lifts = [Evaluator._compute_lift(y_true=y_true,
                                          y_pred=y_pred,
                                          lift_at=perc_lift)
-                 for perc_lift in np.arange(0.1, 1.1, 0.1)]
+                 for perc_lift in np.linspace(1/n_bins, 1, num=n_bins, endpoint=True)]
 
         x_labels = [len(lifts)-x for x in np.arange(0, len(lifts), 1)]
 
 
@@ -14,17 +14,17 @@ def plot_univariate_predictor_quality(df_auc: pd.DataFrame,
     Parameters
     ----------
     df_auc : pd.DatFrame
-        Contains for each variable the train auc and selection auc allong with
+        Contains for each variable the train auc and selection auc along with
         a boolean indicating whether or not it is selected based on the
         criteria
     dim : tuple, optional
-        tuple with width and lentgh of the plot
+        tuple with width and length of the plot
     path : str, optional
         path to store the figure
     """
 
     df = (df_auc[df_auc["preselection"]]
-          .sort_values(by='AUC train', ascending=False))
+          .sort_values(by='AUC selection', ascending=False))
 
     df = pd.melt(df, id_vars=["predictor"],
                  value_vars=["AUC train", "AUC selection"],
@@ -60,7 +60,7 @@ def plot_correlation_matrix(df_corr: pd.DataFrame,
     df_corr : pd.DataFrame
         Correlation matrix
     dim : tuple, optional
-        tuple with width and lentgh of the plot
+        tuple with width and length of the plot
     path : str, optional
         path to store the figure
     """
@@ -89,7 +89,7 @@ def plot_performance_curves(model_performance: pd.DataFrame,
         contains train-selection-validation performance for each model trained
         in the forward feature selection
     dim : tuple, optional
-        tuple with width and lentgh of the plot
+        tuple with width and length of the plot
     path : str, optional
         path to store the figure
     """
@@ -141,7 +141,7 @@ def plot_variable_importance(df_variable_importance: pd.DataFrame,
     title : str, optional
         Title of the plot
     dim : tuple, optional
-        tuple with width and lentgh of the plot
+        tuple with width and length of the plot
     path : str, optional
         path to store the figure
     """
 
@@ -1,10 +1,12 @@
 import logging
-log = logging.getLogger(__name__)
 
 import pandas as pd
+from tqdm.auto import tqdm
 
 from cobra.model_building import LogisticRegressionModel as MLModel
 
+log = logging.getLogger(__name__)
+
 
 class ForwardFeatureSelection:
 
@@ -159,7 +161,7 @@ def fit(self, train_data: pd.DataFrame, target_column_name: str,
     def _forward_selection(self, train_data: pd.DataFrame,
                            target_column_name: str, predictors: list,
                            forced_predictors: list=[]) -> list:
-        """Perform the forward feature selection algoritm to compute a list
+        """Perform the forward feature selection algorithm to compute a list
         of models (with increasing performance?). The length of the list,
         i.e. the number of models is bounded by the max_predictors class
         attribute.
@@ -186,7 +188,8 @@ def _forward_selection(self, train_data: pd.DataFrame,
 
         max_steps = 1 + min(self.max_predictors,
                             len(predictors) + len(forced_predictors))
-        for step in range(1, max_steps):
+        for step in tqdm(range(1, max_steps), desc="Sequentially adding best "
+                                                   "predictor..."):
             if step <= len(forced_predictors):
                 # first, we go through forced predictors
                 candidate_predictors = [var for var in forced_predictors
 
@@ -17,18 +17,18 @@
 # standard lib imports
 import re
 from typing import Optional
-
 import logging
-log = logging.getLogger(__name__)
 
 # third party imports
 import numpy as np
 import pandas as pd
 from scipy import stats
-
+from tqdm.auto import tqdm
 from sklearn.base import BaseEstimator
 from sklearn.exceptions import NotFittedError
 
+log = logging.getLogger(__name__)
+
 
 class CategoricalDataProcessor(BaseEstimator):
     """
@@ -58,12 +58,12 @@ class CategoricalDataProcessor(BaseEstimator):
                   "category_size_threshold", "p_value_threshold",
                   "scale_contingency_table", "forced_categories"]
 
-    def __init__(self, regroup: bool=True, regroup_name: str="Other",
-                 keep_missing: bool=True,
-                 category_size_threshold: int=5,
-                 p_value_threshold: float=0.001,
-                 scale_contingency_table: bool=True,
-                 forced_categories: dict={}):
+    def __init__(self, regroup: bool = True, regroup_name: str = "Other",
+                 keep_missing: bool = True,
+                 category_size_threshold: int = 5,
+                 p_value_threshold: float = 0.001,
+                 scale_contingency_table: bool = True,
+                 forced_categories: dict = {}):
 
         self.regroup = regroup
         self.regroup_name = regroup_name
@@ -149,7 +149,8 @@ def fit(self, data: pd.DataFrame, column_names: list,
             log.info("regroup was set to False, so no fitting is required")
             return None
 
-        for column_name in column_names:
+        for column_name in tqdm(column_names, desc="Fitting category "
+                                                   "regrouping..."):
 
             if column_name not in data.columns:
                 log.warning("DataFrame has no column '{}', so it will be "
@@ -310,7 +311,8 @@ def _transform_column(self, data: pd.DataFrame,
             data.loc[:, column_name_clean] = (CategoricalDataProcessor
                                               ._replace_categories(
                                                   data[column_name_clean],
-                                                  categories))
+                                                  categories,
+                                                  self.regroup_name))
 
         # change data to categorical
         data.loc[:, column_name_clean] = (data[column_name_clean]
@@ -371,7 +373,7 @@ def _get_small_categories(predictor_series: pd.Series,
 
     @staticmethod
     def _replace_missings(data: pd.DataFrame,
-                          column_names: Optional[list]=None) -> pd.DataFrame:
+                          column_names: Optional[list] = None) -> pd.DataFrame:
         """Replace missing values (incl empty strings)
 
         Parameters
@@ -403,23 +405,25 @@ def _replace_missings(data: pd.DataFrame,
     @staticmethod
     def _compute_p_value(X: pd.Series, y: pd.Series, category: str,
                          scale_contingency_table: bool) -> float:
-        """Summary
+        """Calculates p-value in contingency table (chi-square test) in
+        order to evaluate whether category of interest is significantly
+        different from the rest of the categories, given the target variable.
 
         Parameters
         ----------
         X : pd.Series
-            Description
+            Data of the variable that contains the category of interest.
         y : pd.Series
-            Description
+            Target data.
         category : str
-            Description
+            Category for which we carry out the test
         scale_contingency_table : bool
-            Description
+            Whether we scale contingency table with incidence rate
 
         Returns
         -------
         float
-            Description
+            p-value of chi-square test
         """
         df = pd.concat([X, y], axis=1)
         df["other_categories"] = np.where(X == category, 0, 1)
@@ -439,20 +443,24 @@ def _compute_p_value(X: pd.Series, y: pd.Series, category: str,
         return stats.chi2_contingency(contigency_table, correction=False)[1]
 
     @staticmethod
-    def _replace_categories(data: pd.Series, categories: set) -> pd.Series:
+    def _replace_categories(data: pd.Series, categories: set,
+                            replace_with: str) -> pd.Series:
         """replace categories in set with "Other" and transform the remaining
         categories to strings to avoid type errors later on in the pipeline
 
         Parameters
         ----------
         data : pd.Series
-            Description
+            Dataset which contains the variable to be replaced
         categories : set
-            Description
+            Cleaned categories.
+        replace_with : str
+            String used as replacement for every value not in ``categories``.
 
         Returns
         -------
         pd.Series
-            Description
+            Series with replaced categories
         """
-        return data.apply(lambda x: str(x) if x in categories else "Other")
+        return data.apply(
+            lambda x: str(x) if x in categories else replace_with)