DataResponsibly · KeYang0923 · Jun 22, 2020 · Jun 22, 2020 · Jul 8, 2020 · Jul 8, 2020
diff --git a/fp/utils.py b/fp/utils.py
@@ -0,0 +1,43 @@
+import pandas as pd
+
+def filter_optimal_results_skyline_order(_df, _order_list):
+    """
+    Return the best row of ``_df`` under a lexicographic ordering of metrics.
+
+    The 'selection_rate' column is replaced by its distance from 1
+    (``abs(1 - selection_rate)``) before ranking. The rows are then sorted by the
+    columns in ``_order_list`` (first entry has the highest priority) and the
+    top row is returned. The caller's dataframe is left untouched.
+
+    :param _df: pandas dataframe, one row per candidate result. Must contain a
+                'selection_rate' column and every column named in ``_order_list``.
+    :param _order_list: list of str, metric column names in decreasing priority.
+    :return: numpy array, the values of the top-ranked row. Its 'selection_rate'
+             entry holds the transformed distance, not the raw rate.
+    :raises KeyError: if 'selection_rate' or a column of ``_order_list`` is missing.
+    :raises IndexError: if ``_df`` has no rows.
+    """
+    # Metrics sorted in descending order. Every other metric (error counts and
+    # rates, the transformed 'selection_rate', ...) is sorted ascending.
+    # NOTE(review): 'negative_predictive_value' and 'num_pred_negatives' are not
+    # listed here and are therefore ranked lower-is-better - confirm this is intended.
+    higher_is_better = {'num_true_positives', 'num_true_negatives', 'num_generalized_true_positives',
+                        'num_generalized_true_negatives', 'true_positive_rate', 'true_negative_rate',
+                        'generalized_true_positive_rate', 'generalized_true_negative_rate',
+                        'positive_predictive_value', 'accuracy', 'num_pred_positives'}
+
+    # Work on a copy: assigning into ``_df`` would silently rewrite the caller's
+    # data and make a second call on the same frame rank on different values.
+    ranked = _df.copy()
+    ranked['selection_rate'] = abs(1 - ranked['selection_rate'])
+
+    ascending = [item not in higher_is_better for item in _order_list]
+    ranked = ranked.sort_values(_order_list, ascending=ascending)
+
+    return ranked.values[0]
+
+
+def filter_optimal_results_skyline_formula(_df, _formula):
+    """
+    Return the row of ``_df`` with the highest weighted sum of min-max normalized metrics.
+
+    Each column named in ``_formula`` is rescaled to [0, 1] with min-max
+    normalization, multiplied by its weight, and the weighted columns are summed
+    per row. The row with the largest sum is returned with exactly the columns
+    of ``_df``.
+
+    :param _df: pandas dataframe, one row per candidate result.
+    :param _formula: dict, maps a column name of ``_df`` to its numeric weight.
+                     Use a negative weight for a metric that should be minimized.
+    :return: numpy array, the values of the best-scoring row of ``_df``.
+    :raises KeyError: if a key of ``_formula`` is not a column of ``_df``.
+    :raises IndexError: if ``_df`` has no rows.
+    """
+    weighted = pd.DataFrame(index=_df.index)
+    for key, weight in _formula.items():
+        column = _df[key]
+        # A constant column gives 0/0 = NaN here; the row-wise sum below skips
+        # NaN, so such a column simply contributes nothing to the score.
+        weighted[key] = ((column - column.min()) / (column.max() - column.min())) * weight
+
+    # Rank by position instead of concatenating helper columns onto the data:
+    # filtering helpers back out by a 'norm' name prefix would also drop real
+    # input columns that happen to start with 'norm'.
+    score = pd.Series(weighted.sum(axis=1).values)
+    best_first = score.sort_values(ascending=False).index
+
+    return _df.iloc[list(best_first[:1])].values[0]
diff --git a/pipeline/fairprep.py b/pipeline/fairprep.py
diff --git a/pipeline/model/classifiers.py b/pipeline/model/classifiers.py
@@ -0,0 +1,87 @@
+"""
+    Classes of supervised binary classifiers.
+"""
+
+import pandas as pd
+from sklearn.linear_model import SGDClassifier
+from sklearn.tree import DecisionTreeClassifier
+from pipeline.model.inprocessor import Model
+
+
+class SK_LogisticRegression(Model):
+    """Linear classifier trained with sklearn's ``SGDClassifier`` using default hyper-parameters."""
+
+    def __init__(self, df, target_col, loss_func="log", instance_weights=None, seed=0):
+        """
+        :param df: pandas dataframe, stores the data to fit the classifier.
+        :param target_col: str, the name of the target variable in above data.
+        :param loss_func: str, the name of the loss function used in linear model. Same as the loss parameter in sklearn.linear_model.SGDClassifier.
+                         The possible options are 'hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron', or a regression loss: 'squared_loss', 'huber', 'epsilon_insensitive', or 'squared_epsilon_insensitive'.
+                         NOTE(review): the fitted model is later asked for predict_proba, which SGDClassifier does not offer for every loss - confirm the chosen loss supports it.
+        :param instance_weights: list of float, each number represents the weight of the sample in above data.
+                                 None or an empty list means no weights are supplied.
+        :param seed: integer, the seed for random state.
+        """
+        # None replaces the former shared mutable [] default; the parent class
+        # still receives a list, exactly as before.
+        weights = [] if instance_weights is None else instance_weights
+
+        cur_step = SGDClassifier(loss=loss_func, random_state=seed)
+        super().__init__("@".join(["SK_LogisticRegression", target_col]), cur_step, df, target_col, instance_weights=weights)
+
+
+class SK_DecisionTree(Model):
+    """Decision tree classifier (sklearn ``DecisionTreeClassifier``) with default hyper-parameters."""
+
+    def __init__(self, df, target_col, instance_weights=None, seed=0):
+        """
+        :param df: pandas dataframe, stores the data to fit the classifier.
+        :param target_col: str, the name of the target variable in above data.
+        :param instance_weights: list of float, each number represents the weight of the sample in above data.
+                                 None or an empty list means no weights are supplied.
+        :param seed: integer, the seed for random state.
+        """
+        # None replaces the former shared mutable [] default; the parent class
+        # still receives a list, exactly as before.
+        weights = [] if instance_weights is None else instance_weights
+
+        cur_step = DecisionTreeClassifier(random_state=seed)
+        super().__init__("@".join(["SK_DecisionTree", target_col]), cur_step, df, target_col, instance_weights=weights)
+
+
+class OPT_LogisticRegression(Model):
+    """SGD-trained linear classifier whose penalty and alpha are selected by grid search."""
+
+    def __init__(self, df, target_col, loss_func="log", max_iter=1000, instance_weights=None, seed=0):
+        """
+        :param df: pandas dataframe, stores the data to fit the classifier.
+        :param target_col: str, the name of the target variable in above data.
+        :param loss_func: str, the name of the loss function used in linear model. Same as the loss parameter in sklearn.linear_model.SGDClassifier.
+                         The possible options are 'hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron', or a regression loss: 'squared_loss', 'huber', 'epsilon_insensitive', or 'squared_epsilon_insensitive'.
+        :param max_iter: integer, max number of iterations of the model.
+        :param instance_weights: list of float, each number represents the weight of the sample in above data.
+                                 None or an empty list means no weights are supplied.
+        :param seed: integer, random seed.
+        """
+        # None replaces the former shared mutable [] default; the parent class
+        # still receives a list, exactly as before.
+        weights = [] if instance_weights is None else instance_weights
+
+        # Update below parameters according to the loss function used.
+        # Keys carry the 'learner__' prefix because the parent class wraps the
+        # estimator in a Pipeline step named 'learner' before grid searching.
+        param_grid = {
+            'learner__loss': [loss_func],
+            'learner__penalty': ['l2', 'l1', 'elasticnet'],
+            'learner__alpha': [0.00005, 0.0001, 0.005, 0.001]
+        }
+        cur_step = SGDClassifier(max_iter=max_iter, random_state=seed)
+        super().__init__("@".join(["OPT_LogisticRegression", target_col]), cur_step, df, target_col, instance_weights=weights, hyper_tune=True, param_grid=param_grid)
+
+class OPT_DecisionTree(Model):
+    """Decision tree classifier whose split, depth, leaf-size and criterion settings are selected by grid search."""
+
+    def __init__(self, df, target_col, instance_weights=None, seed=0):
+        """
+        :param df: pandas dataframe, stores the data to fit the classifier.
+        :param target_col: str, the name of the target variable in above data.
+        :param instance_weights: list of float, each number represents the weight of the sample in above data.
+                                 None or an empty list means no weights are supplied.
+        :param seed: integer, random seed.
+        """
+        # None replaces the former shared mutable [] default; the parent class
+        # still receives a list, exactly as before.
+        weights = [] if instance_weights is None else instance_weights
+
+        # Keys carry the 'learner__' prefix because the parent class wraps the
+        # estimator in a Pipeline step named 'learner' before grid searching.
+        param_grid = {
+            'learner__min_samples_split': range(20, 500, 10),
+            'learner__max_depth': range(15, 30, 2),
+            'learner__min_samples_leaf': [3, 4, 5, 10],
+            "learner__criterion": ["gini", "entropy"]
+        }
+
+        cur_step = DecisionTreeClassifier(random_state=seed)
+        super().__init__("@".join(["OPT_DecisionTree", target_col]), cur_step, df, target_col, instance_weights=weights, hyper_tune=True, param_grid=param_grid)
+
+
+if __name__ == '__main__':
+    # Manual smoke run: fit one classifier on the sample data, score the same
+    # data with it and write the result next to the input file.
+    # Paths are relative to the current working directory.
+    data = pd.read_csv("../../data/adult_pre_reweigh.csv")
+
+    # Swap in one of the commented alternatives to try another classifier.
+    cur_o = SK_LogisticRegression(data, "income-per-year")
+    # cur_o = SK_DecisionTree(data, "income-per-year")
+    # cur_o = OPT_LogisticRegression(data, "income-per-year")
+    # cur_o = OPT_DecisionTree(data, "income-per-year")
+
+    after_data = cur_o.apply(data)
+    output_path = "../../data/adult_" + cur_o.get_name() + ".csv"
+    after_data.to_csv(output_path, index=False)
+
+    print(cur_o.get_name())
diff --git a/pipeline/model/fair_classifiers.py b/pipeline/model/fair_classifiers.py
@@ -0,0 +1,69 @@
+"""
+    Classes of fair supervised binary classifiers.
+"""
+
+import pandas as pd
+from aif360.algorithms.inprocessing import AdversarialDebiasing
+from aif360.algorithms.inprocessing import MetaFairClassifier
+from aif360.algorithms.inprocessing import PrejudiceRemover
+from pipeline.model.inprocessor import Model
+import warnings
+# Import-time side effect: installs an "ignore" filter with no category or
+# module restriction, so every warning is silenced from this point on.
+warnings.filterwarnings("ignore")
+
+class AIF_AdversarialDebiasing(Model):
+    """Fair classifier step wrapping AIF360's ``AdversarialDebiasing`` in-processor."""
+
+    def __init__(self, df, target_col, sensitive_att, seed=0):
+        """
+        :param df: pandas dataframe, stores the data to fit the fair classifier.
+        :param target_col: str, the name of the target variable in above data.
+        :param sensitive_att: str, the name of a sensitive attribute in above data.
+                              Value 0 marks the unprivileged (protected) group, value 1 the privileged group.
+        :param seed: integer, random seed.
+        """
+        # imported here so tensorflow is only loaded when this class is instantiated
+        import tensorflow as tf
+
+        session = tf.Session()
+        protected_groups = [{sensitive_att: 0}]
+        favored_groups = [{sensitive_att: 1}]
+        debiaser = AdversarialDebiasing(
+            unprivileged_groups=protected_groups,
+            privileged_groups=favored_groups,
+            scope_name='debiased_classifier',
+            debias=True,
+            sess=session,
+            seed=seed,
+        )
+
+        step_name = "@".join(("AIF_AdversarialDebiasing", sensitive_att))
+        super().__init__(step_name, debiaser, df, target_col, sensitive_att=sensitive_att, fair_aware=True)
+
+
+class AIF_MetaFairClassifier(Model):
+    """Fair classifier step wrapping AIF360's ``MetaFairClassifier`` in-processor."""
+
+    def __init__(self, df, target_col, sensitive_att, fairness_penalty=0.8, fair_metric="sr"):
+        """
+        :param df: pandas dataframe, stores the data to fit the fair classifier.
+        :param target_col: str, the name of the target variable in above data.
+        :param sensitive_att: str, the name of a sensitive attribute in above data.
+        :param fairness_penalty: float in [0,1], fairness penalty parameter. Default is 0.8.
+                                 Passed as ``tau`` to aif360.algorithms.inprocessing.MetaFairClassifier.
+        :param fair_metric: str, fairness metric used in this method. Value from ["fdr" (false discovery rate ratio), "sr" (statistical rate/disparate impact)].
+                            Passed as ``type`` to aif360.algorithms.inprocessing.MetaFairClassifier.
+        """
+        meta_classifier = MetaFairClassifier(
+            tau=fairness_penalty,
+            sensitive_attr=sensitive_att,
+            type=fair_metric,
+        )
+
+        step_name = "@".join(("AIF_MetaFairClassifier", sensitive_att))
+        super().__init__(step_name, meta_classifier, df, target_col, sensitive_att=sensitive_att, fair_aware=True)
+
+class AIF_PrejudiceRemover(Model):
+    """Fair classifier step wrapping AIF360's ``PrejudiceRemover`` in-processor."""
+
+    def __init__(self, df, target_col, sensitive_att, fairness_penalty=1.0):
+        """
+        :param df: pandas dataframe, stores the data to fit the fair classifier.
+        :param target_col: str, the name of the target variable in above data.
+        :param sensitive_att: str, the name of a sensitive attribute in above data.
+        :param fairness_penalty: float in [0,1], fairness penalty parameter. Default is 1.
+                                 Passed as ``eta`` to aif360.algorithms.inprocessing.PrejudiceRemover.
+        """
+        # TODO: fix the bug that cannot import lib of 'getoutput'
+        remover = PrejudiceRemover(
+            eta=fairness_penalty,
+            sensitive_attr=sensitive_att,
+            class_attr=target_col,
+        )
+
+        step_name = "@".join(("AIF_PrejudiceRemover", sensitive_att))
+        super().__init__(step_name, remover, df, target_col, sensitive_att=sensitive_att, fair_aware=True)
+
+
+if __name__ == '__main__':
+    # Manual smoke run: fit one fair classifier on the sample data, predict on
+    # the same data and write the result next to the input file.
+    # Paths are relative to the current working directory.
+    data = pd.read_csv("../../data/adult_pre_reweigh.csv")
+
+    # Swap in one of the commented alternatives to try another fair classifier.
+    cur_o = AIF_AdversarialDebiasing(data, "income-per-year", "sex")
+    # cur_o = AIF_MetaFairClassifier(data, "income-per-year", "sex")
+    # cur_o = AIF_PrejudiceRemover(data, "income-per-year", "sex")
+
+    after_data = cur_o.apply(data)
+    output_path = "../../data/adult_" + cur_o.get_name() + ".csv"
+    after_data.to_csv(output_path, index=False)
+
+    print(cur_o.get_name())
diff --git a/pipeline/model/inprocessor.py b/pipeline/model/inprocessor.py
@@ -0,0 +1,74 @@
+"""
+    Super class for all the supported classifier classes including fair-classifiers.
+"""
+import numpy as np
+from aif360.datasets import BinaryLabelDataset
+from sklearn.model_selection import GridSearchCV
+from sklearn.pipeline import Pipeline
+from pipeline.step import Step
+
+class Model(Step):
+    """
+    Pipeline step that fits a classifier when constructed and adds its predictions in ``apply``.
+
+    Regular (sklearn-style) estimators are fitted on numpy arrays and later
+    produce a probability score per row. Fair-aware (AIF360) estimators are
+    fitted on a ``BinaryLabelDataset`` and later produce predicted labels.
+    """
+
+    def __init__(self, step_name, step, df, target_col, instance_weights=None, hyper_tune=False, param_grid=None, sensitive_att=None, fair_aware=False, target_positive=1):
+        """
+        :param step_name: str, name of the current input step.
+        :param step: object of the initialized class.
+        :param df: pandas dataframe, stores the data.
+        :param target_col: str, the name of the target attribute.
+        :param instance_weights: list of float in [0,1], each float represents the weight of the sample in above data.
+                                 None or an empty list gives every sample weight 1. Ignored when fair_aware is True.
+        :param hyper_tune: boolean, whether to tune the hyper-parameter. Default is False.
+        :param param_grid: dict, stores the search range of the hyper-parameter. When hyper_tune is True, this must be provided.
+                           Keys must use the 'learner__' prefix because the estimator is wrapped in a Pipeline step named 'learner'.
+        :param sensitive_att: str, the name of a sensitive attribute.
+        :param fair_aware: boolean, whether the model is fair-aware. Default is False.
+        :param target_positive: integer, 0 or 1, represents the positive value of the target attribute. Default is 1.
+        :raises ValueError: if hyper_tune is True and param_grid is empty or None.
+        """
+
+        super().__init__(step_name=step_name, df=df, sensitive_att=sensitive_att, target_col=target_col)
+        # assume the data set has been encoded to numerical values
+        if fair_aware:  # fair classifiers
+            # initialize a binary label dataset from AIF 360
+            aif_df = BinaryLabelDataset(df=df, label_names=[target_col], protected_attribute_names=[sensitive_att])
+            fitted_step = step.fit(aif_df)
+            input_score = False
+        else:  # regular classifiers
+            # None replaces the former shared mutable [] default
+            if instance_weights is None or len(instance_weights) == 0:
+                instance_weights = [1] * df.shape[0]
+
+            features = np.array(df.drop(columns=[target_col]))
+            labels = np.array(df[target_col])
+
+            if hyper_tune:  # grid search for best hyper parameters
+                if not param_grid:
+                    # carry the explanation in the exception instead of printing it
+                    raise ValueError("Need to specify the search range of the hyper parameters - 'param_grid' is empty!")
+
+                search = GridSearchCV(Pipeline([('learner', step)]), param_grid, scoring='roc_auc', cv=5, verbose=1, n_jobs=-1)
+                # Do not pass ``groups`` positionally: it is keyword-only in newer
+                # scikit-learn releases and already defaults to None.
+                fitted_step = search.fit(features, labels, learner__sample_weight=instance_weights)
+            else:
+                fitted_step = step.fit(features, labels, sample_weight=instance_weights)
+            input_score = True
+
+        self.input_score = input_score
+        self.step = fitted_step
+        self.target_positive = target_positive
+
+
+    def apply(self, df):
+        """
+        Add the fitted model's prediction for every row of ``df``.
+
+        :param df: pandas dataframe, stores the data to predict on. Must contain the target column.
+        :return: pandas dataframe converted through an AIF360 BinaryLabelDataset, with the prediction
+                 stored in the ``self.pred_target_col`` column: the predicted probability of the
+                 ``target_positive`` class for regular models, the predicted label for fair models.
+        """
+
+        if self.input_score:  # for regular model, generate score prediction
+            aif_pred_df = BinaryLabelDataset(df=df, label_names=[self.target_col], protected_attribute_names=[])
+            after_df, _ = aif_pred_df.convert_to_dataframe(de_dummy_code=True, sep='=', set_category=True)
+
+            # pick the probability column that belongs to the positive class
+            favorable_class_idx = list(self.step.classes_).index(self.target_positive)
+            probabilities = self.step.predict_proba(np.array(df.drop(columns=[self.target_col])))
+            after_df[self.pred_target_col] = [row[favorable_class_idx] for row in probabilities]
+
+        else:  # for fair model, generate label prediction
+            aif_pred_df = BinaryLabelDataset(df=df, label_names=[self.target_col],
+                                             protected_attribute_names=[self.sensitive_att])
+
+            after_aif_df = self.step.predict(aif_pred_df)
+            after_df, _ = after_aif_df.convert_to_dataframe(de_dummy_code=True, sep='=', set_category=True)
+            after_df[self.pred_target_col] = after_aif_df.labels
+
+        return after_df
diff --git a/pipeline/postprocess/fair_postprocessors.py b/pipeline/postprocess/fair_postprocessors.py
@@ -0,0 +1,66 @@
+"""
+    Classes for post-process data and model outcomes
+"""
+
+import pandas as pd
+from aif360.algorithms.postprocessing import CalibratedEqOddsPostprocessing
+from aif360.algorithms.postprocessing import EqOddsPostprocessing
+from aif360.algorithms.postprocessing import RejectOptionClassification
+from pipeline.postprocess.postprocessor import Postprocessor
+
+class AIF_EqOddsPostprocessing(Postprocessor):
+    """Post-processing step wrapping AIF360's ``EqOddsPostprocessing``."""
+
+    def __init__(self, df, target_col, sensitive_att, threshold=0.5, seed=0):
+        """
+        :param df: pandas dataframe, stores the data to fit the postprocessor.
+        :param target_col: str, the name of the target variable in above data. Assume 1 represents the favorable class.
+        :param sensitive_att: str, the name of a sensitive attribute in above data.
+                              Value 0 marks the unprivileged (protected) group, value 1 the privileged group.
+        :param threshold: float in [0, 1], the classification threshold to generate the predicted class label.
+        :param seed: integer, the seed for random state.
+        """
+        protected_groups = [{sensitive_att: 0}]
+        favored_groups = [{sensitive_att: 1}]
+        eq_odds = EqOddsPostprocessing(protected_groups, favored_groups, seed)
+
+        step_name = "@".join(("AIF_EqOddsPostprocessing", sensitive_att))
+        # input_score=False: NOTE(review) presumably this step consumes labels rather than scores - confirm in Postprocessor.
+        super().__init__(step_name, eq_odds, df, sensitive_att, target_col, input_score=False, clf_threshold=threshold)
+
+
+class AIF_CalibratedEqOddsPostprocessing(Postprocessor):
+    """Post-processing step wrapping AIF360's ``CalibratedEqOddsPostprocessing``."""
+
+    def __init__(self, df, target_col, sensitive_att, threshold=0.5, seed=0, cost_constraint='weighted'):
+        """
+        :param df: pandas dataframe, stores the data to fit the postprocessor.
+        :param target_col: str, the name of the target variable in above data. Assume 1 represents the favorable class.
+        :param sensitive_att: str, the name of a sensitive attribute in above data.
+                              Value 0 marks the unprivileged (protected) group, value 1 the privileged group.
+        :param threshold: float in [0, 1], the classification threshold to generate the predicted class label.
+        :param seed: integer, the seed for random state.
+        :param cost_constraint: str, the fairness constraints format, value from [fpr, fnr, weighted].
+                                The same parameter as in aif360.algorithms.postprocessing.CalibratedEqOddsPostprocessing.
+        """
+        protected_groups = [{sensitive_att: 0}]
+        favored_groups = [{sensitive_att: 1}]
+        calibrated = CalibratedEqOddsPostprocessing(
+            protected_groups,
+            favored_groups,
+            cost_constraint=cost_constraint,
+            seed=seed,
+        )
+
+        step_name = "@".join(("AIF_CalibratedEqOddsPostprocessing", sensitive_att))
+        super().__init__(step_name, calibrated, df, sensitive_att, target_col, input_score=True, clf_threshold=threshold)
+
+class AIF_RejectOptionPostprocessing(Postprocessor):
+    """Post-processing step wrapping AIF360's ``RejectOptionClassification``."""
+
+    def __init__(self, df, target_col, sensitive_att, threshold=0.5):
+        """
+        :param df: pandas dataframe, stores the data to fit the postprocessor.
+        :param target_col: str, the name of the target variable in above data. Assume 1 represents the favorable class.
+        :param sensitive_att: str, the name of a sensitive attribute in above data.
+                              Value 0 marks the unprivileged (protected) group, value 1 the privileged group.
+        :param threshold: float in [0, 1], the classification threshold to generate the predicted class label.
+        """
+        # TODO: fix the bug that reject option doesn't return results
+        protected_groups = [{sensitive_att: 0}]
+        favored_groups = [{sensitive_att: 1}]
+        reject_option = RejectOptionClassification(protected_groups, favored_groups)
+
+        # The step name uses the AIF360 class name ("...Classification"), not this
+        # wrapper's name ("...Postprocessing"); kept as is because the name is part of the output.
+        step_name = "@".join(("AIF_RejectOptionClassification", sensitive_att))
+        super().__init__(step_name, reject_option, df, sensitive_att, target_col, input_score=True, clf_threshold=threshold)
+
+
+
+if __name__ == '__main__':
+    # Manual smoke run: fit one postprocessor on the sample data, apply it to
+    # the same data and write the result next to the input file.
+    # Paths are relative to the current working directory.
+    data = pd.read_csv("../../data/adult_post.csv")
+
+    # Swap in one of the commented alternatives to try another postprocessor.
+    # cur_o = AIF_RejectOptionPostprocessing(data, "income-per-year", "sex")
+    cur_o = AIF_EqOddsPostprocessing(data, "income-per-year", "sex")
+    # cur_o = AIF_CalibratedEqOddsPostprocessing(data, "income-per-year", "sex")
+
+    after_data = cur_o.apply(data)
+    output_path = "../../data/adult_" + cur_o.get_name() + ".csv"
+    after_data.to_csv(output_path, index=False)
+
+    print(cur_o.get_name())