Skip to content
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 43 additions & 0 deletions fp/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import pandas as pd

def filter_optimal_results_skyline_order(_df, _order_list):
    """
    Return the best row of a results table under a lexicographic (skyline) ordering.

    The rows are sorted by the metrics in ``_order_list`` (first entry has the highest priority)
    and the top row is returned. Metrics listed in ``higher_is_better`` are sorted descending;
    every other column (e.g. selection_rate, false positive/negative counts and rates,
    error_rate) is treated as lower-is-better and sorted ascending.

    Before sorting, ``selection_rate`` is replaced by ``abs(1 - selection_rate)``, and the
    returned row carries that transformed value.

    :param _df: pandas dataframe, one row per candidate; must contain a 'selection_rate' column
                and every column named in _order_list.
    :param _order_list: list of str, column names to sort by, in priority order.
    :return: numpy array, the values of the best row (same column order as _df).
    :raises KeyError: if 'selection_rate' or a column in _order_list is missing.
    :raises IndexError: if _df has no rows.
    """
    # NOTE(review): raised in code review -- "why is this code in this PR? This belongs to the
    # skyline PR, right?"; the author replied "Fixed by deleting it. This file should be deleted
    # for the refactored code."
    higher_is_better = {
        'num_true_positives', 'num_true_negatives', 'num_generalized_true_positives',
        'num_generalized_true_negatives', 'true_positive_rate', 'true_negative_rate',
        'generalized_true_positive_rate', 'generalized_true_negative_rate',
        'positive_predictive_value', 'accuracy', 'num_pred_positives',
    }

    # Work on a copy: writing the transformed selection_rate into the caller's frame would
    # flip the column back and forth on repeated calls.
    ranked = _df.copy()
    ranked['selection_rate'] = abs(1 - ranked['selection_rate'])

    # ascending=True for lower-is-better metrics, False for higher-is-better ones.
    ascending = [item not in higher_is_better for item in _order_list]
    ranked = ranked.sort_values(_order_list, ascending=ascending)

    return ranked.values[0]


def filter_optimal_results_skyline_formula(_df, _formula):
    """
    Return the best row of a results table under a weighted-sum (skyline formula) score.

    Each metric named in ``_formula`` is min-max normalised over the rows, multiplied by its
    weight, and the weighted values are summed per row. The row with the highest score wins.
    A metric that is constant over all rows normalises to NaN (0/0) and is skipped by the sum,
    so it contributes nothing to the score.

    :param _df: pandas dataframe, one row per candidate; must contain every key of _formula.
    :param _formula: dict mapping column name -> numeric weight (use a negative weight for
                     metrics where lower is better).
    :return: numpy array, the values of the best row restricted to the original columns of _df.
    :raises KeyError: if a key of _formula is not a column of _df.
    :raises IndexError: if _df has no rows.
    """
    normalized = pd.DataFrame()
    for key in _formula:
        column = _df[key]
        normalized["norm_" + key] = (column - column.min()) / (column.max() - column.min())

    norm_cols = ["norm_" + key for key in _formula]
    weights = list(_formula.values())

    # Weighted sum of the normalised metrics; this is the value the rows are ranked by.
    normalized['norm_avg'] = normalized[norm_cols].multiply(weights).sum(axis=1)

    ranked = pd.concat([_df, normalized], axis=1)
    ranked = ranked.sort_values(by='norm_avg', ascending=False)

    # The original columns come first in the concatenation, so select them by position.
    # Filtering on a "norm" name prefix would also drop genuine input columns such as
    # "normalized_x" and would fail on non-string column labels.
    return ranked.iloc[:, :_df.shape[1]].values[0]
506 changes: 506 additions & 0 deletions pipeline/fairprep.py

Large diffs are not rendered by default.

87 changes: 87 additions & 0 deletions pipeline/model/classifiers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
"""
Classes of supervised binary classifiers.
"""

import pandas as pd
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from pipeline.model.inprocessor import Model


class SK_LogisticRegression(Model):
    """Linear classifier backed by sklearn's SGDClassifier (logistic loss by default)."""

    def __init__(self, df, target_col, loss_func="log", instance_weights=None, seed=0):
        """
        Build an SGDClassifier and hand it to Model, which fits it on df.

        :param df: pandas dataframe, stores the data to fit the classifier.
        :param target_col: str, the name of the target variable in above data.
        :param loss_func: str, the name of the loss function used in linear model. Same as the loss parameter in sklearn.linear_model.SGDClassifier.
                          The possible options are 'hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron', or a regression loss: 'squared_loss', 'huber', 'epsilon_insensitive', or 'squared_epsilon_insensitive'.
        :param instance_weights: list of float or None, each number represents the weight of the sample in above data.
                                 None or an empty list leaves the weighting to Model.
        :param seed: integer, the seed for random state.
        """
        # NOTE(review): recent scikit-learn releases renamed the 'log' loss to 'log_loss' --
        # confirm the default against the pinned sklearn version.
        cur_step = SGDClassifier(loss=loss_func, random_state=seed)
        super().__init__(
            "@".join(["SK_LogisticRegression", target_col]),
            cur_step,
            df,
            target_col,
            # None replaces the former mutable [] default; Model still receives a list.
            instance_weights=[] if instance_weights is None else instance_weights,
        )


class SK_DecisionTree(Model):
    """Decision-tree classifier backed by sklearn's DecisionTreeClassifier with default settings."""

    def __init__(self, df, target_col, instance_weights=None, seed=0):
        """
        Build a DecisionTreeClassifier and hand it to Model, which fits it on df.

        :param df: pandas dataframe, stores the data to fit the classifier.
        :param target_col: str, the name of the target variable in above data.
        :param instance_weights: list of float or None, each number represents the weight of the sample in above data.
                                 None or an empty list leaves the weighting to Model.
        :param seed: integer, the seed for random state.
        """
        cur_step = DecisionTreeClassifier(random_state=seed)
        super().__init__(
            "@".join(["SK_DecisionTree", target_col]),
            cur_step,
            df,
            target_col,
            # None replaces the former mutable [] default; Model still receives a list.
            instance_weights=[] if instance_weights is None else instance_weights,
        )


class OPT_LogisticRegression(Model):
    """SGDClassifier whose penalty and alpha are chosen by Model's grid search."""

    def __init__(self, df, target_col, loss_func="log", max_iter=1000, instance_weights=None, seed=0):
        """
        Build an SGDClassifier plus a hyper-parameter grid and hand both to Model.

        :param df: pandas dataframe, stores the data to fit the classifier.
        :param target_col: str, the name of the target variable in above data.
        :param loss_func: str, the name of the loss function used in linear model. Same as the loss parameter in sklearn.linear_model.SGDClassifier.
                          The possible options are 'hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron', or a regression loss: 'squared_loss', 'huber', 'epsilon_insensitive', or 'squared_epsilon_insensitive'.
        :param max_iter: integer, max number of iterations of the model.
        :param instance_weights: list of float or None, each number represents the weight of the sample in above data.
                                 None or an empty list leaves the weighting to Model.
        :param seed: integer, random seed.
        """
        # Update below parameters according to the loss function used.
        # The 'learner__' prefix addresses the step named 'learner' in the Pipeline that Model builds.
        # The loss is not set on the estimator; it is supplied through the grid as a single choice.
        param_grid = {
            'learner__loss': [loss_func],
            'learner__penalty': ['l2', 'l1', 'elasticnet'],
            'learner__alpha': [0.00005, 0.0001, 0.005, 0.001]
        }
        cur_step = SGDClassifier(max_iter=max_iter, random_state=seed)
        super().__init__(
            "@".join(["OPT_LogisticRegression", target_col]),
            cur_step,
            df,
            target_col,
            # None replaces the former mutable [] default; Model still receives a list.
            instance_weights=[] if instance_weights is None else instance_weights,
            hyper_tune=True,
            param_grid=param_grid,
        )

class OPT_DecisionTree(Model):
    """DecisionTreeClassifier whose structural hyper-parameters are chosen by Model's grid search."""

    def __init__(self, df, target_col, instance_weights=None, seed=0):
        """
        Build a DecisionTreeClassifier plus a hyper-parameter grid and hand both to Model.

        :param df: pandas dataframe, stores the data to fit the classifier.
        :param target_col: str, the name of the target variable in above data.
        :param instance_weights: list of float or None, each number represents the weight of the sample in above data.
                                 None or an empty list leaves the weighting to Model.
        :param seed: integer, random seed.
        """
        # The 'learner__' prefix addresses the step named 'learner' in the Pipeline that Model builds.
        param_grid = {
            'learner__min_samples_split': range(20, 500, 10),
            'learner__max_depth': range(15, 30, 2),
            'learner__min_samples_leaf': [3, 4, 5, 10],
            "learner__criterion": ["gini", "entropy"]
        }

        cur_step = DecisionTreeClassifier(random_state=seed)
        super().__init__(
            "@".join(["OPT_DecisionTree", target_col]),
            cur_step,
            df,
            target_col,
            # None replaces the former mutable [] default; Model still receives a list.
            instance_weights=[] if instance_weights is None else instance_weights,
            hyper_tune=True,
            param_grid=param_grid,
        )


if __name__ == '__main__':
    # Manual run: fit one classifier on the reweighed adult data, score the same data,
    # and write the result next to the input (paths are relative to the working directory).
    adult_df = pd.read_csv("../../data/adult_pre_reweigh.csv")

    model_step = SK_LogisticRegression(adult_df, "income-per-year")
    # Other classifiers that can be swapped in:
    # model_step = SK_DecisionTree(adult_df, "income-per-year")
    # model_step = OPT_LogisticRegression(adult_df, "income-per-year")
    # model_step = OPT_DecisionTree(adult_df, "income-per-year")

    scored_df = model_step.apply(adult_df)
    out_path = "../../data/adult_" + model_step.get_name() + ".csv"
    scored_df.to_csv(out_path, index=False)

    print(model_step.get_name())
69 changes: 69 additions & 0 deletions pipeline/model/fair_classifiers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
"""
Classes of fair supervised binary classifiers.
"""

import pandas as pd
from aif360.algorithms.inprocessing import AdversarialDebiasing
from aif360.algorithms.inprocessing import MetaFairClassifier
from aif360.algorithms.inprocessing import PrejudiceRemover
from pipeline.model.inprocessor import Model
import warnings
# Ignore every warning raised after this module is imported.
# NOTE(review): the filter is process-wide, so it also hides warnings from unrelated code -- confirm this is intended.
warnings.filterwarnings("ignore")

class AIF_AdversarialDebiasing(Model):
    """Fair classifier wrapping aif360's AdversarialDebiasing in-processor."""

    def __init__(self, df, target_col, sensitive_att, seed=0):
        """
        Create a TensorFlow session, build the debiasing model and hand it to Model as a fair-aware step.

        :param df: pandas dataframe, stores the data to fit the fair classifier.
        :param target_col: str, the name of the target variable in above data.
        :param sensitive_att: str, the name of a sensitive attribute in above data. Value 0 is used as the
                              unprivileged (protected) group and value 1 as the privileged group.
        :param seed: integer, random seed.
        """
        # Imported here rather than at module level, so tensorflow is only loaded when this class is used.
        import tensorflow as tf

        # NOTE(review): the session is handed to the model and never closed here -- presumably it
        # must outlive the fitted step; confirm.
        session = tf.Session()
        protected_group = [{sensitive_att: 0}]
        favoured_group = [{sensitive_att: 1}]
        debiaser = AdversarialDebiasing(
            unprivileged_groups=protected_group,
            privileged_groups=favoured_group,
            scope_name='debiased_classifier',
            debias=True,
            sess=session,
            seed=seed,
        )
        step_name = "@".join(["AIF_AdversarialDebiasing", sensitive_att])
        super().__init__(step_name, debiaser, df, target_col, sensitive_att=sensitive_att, fair_aware=True)


class AIF_MetaFairClassifier(Model):
    """Fair classifier wrapping aif360's MetaFairClassifier in-processor."""

    def __init__(self, df, target_col, sensitive_att, fairness_penalty=0.8, fair_metric="sr"):
        """
        Build the meta fair classifier and hand it to Model as a fair-aware step.

        :param df: pandas dataframe, stores the data to fit the fair classifier.
        :param target_col: str, the name of the target variable in above data.
        :param sensitive_att: str, the name of a sensitive attribute in above data.
        :param fairness_penalty: float in [0,1], fairness penalty parameter. default is 0.8. Passed as tau to
                                 aif360.algorithms.inprocessing.MetaFairClassifier.
        :param fair_metric: str, fairness metric used in this method. Value from ["fdr" (false discovery rate ratio), "sr" (statistical rate/disparate impact)].
                            Passed as type to aif360.algorithms.inprocessing.MetaFairClassifier.
        """
        meta_clf = MetaFairClassifier(
            tau=fairness_penalty,
            sensitive_attr=sensitive_att,
            type=fair_metric,
        )
        step_name = "@".join(["AIF_MetaFairClassifier", sensitive_att])
        super().__init__(step_name, meta_clf, df, target_col, sensitive_att=sensitive_att, fair_aware=True)

class AIF_PrejudiceRemover(Model):
    """Fair classifier wrapping aif360's PrejudiceRemover in-processor."""

    def __init__(self, df, target_col, sensitive_att, fairness_penalty=1.0):
        """
        Build the prejudice remover and hand it to Model as a fair-aware step.

        :param df: pandas dataframe, stores the data to fit the fair classifier.
        :param target_col: str, the name of the target variable in above data.
        :param sensitive_att: str, the name of a sensitive attribute in above data.
        :param fairness_penalty: float in [0,1], fairness penalty parameter. default is 1. Passed as eta to
                                 aif360.algorithms.inprocessing.PrejudiceRemover.
        """
        # TODO: fix the bug that cannot import lib of 'getoutput'
        remover = PrejudiceRemover(
            eta=fairness_penalty,
            sensitive_attr=sensitive_att,
            class_attr=target_col,
        )
        step_name = "@".join(["AIF_PrejudiceRemover", sensitive_att])
        super().__init__(step_name, remover, df, target_col, sensitive_att=sensitive_att, fair_aware=True)


if __name__ == '__main__':
    # Manual run: fit one fair classifier on the reweighed adult data, predict on the same data,
    # and write the result next to the input (paths are relative to the working directory).
    adult_df = pd.read_csv("../../data/adult_pre_reweigh.csv")

    fair_step = AIF_AdversarialDebiasing(adult_df, "income-per-year", "sex")
    # Other fair classifiers that can be swapped in:
    # fair_step = AIF_MetaFairClassifier(adult_df, "income-per-year", "sex")
    # fair_step = AIF_PrejudiceRemover(adult_df, "income-per-year", "sex")

    predicted_df = fair_step.apply(adult_df)
    out_path = "../../data/adult_" + fair_step.get_name() + ".csv"
    predicted_df.to_csv(out_path, index=False)

    print(fair_step.get_name())
74 changes: 74 additions & 0 deletions pipeline/model/inprocessor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
"""
Super class for all the supported classifier classes including fair-classifiers.
"""
import numpy as np
from aif360.datasets import BinaryLabelDataset
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from pipeline.step import Step

class Model(Step):
    """
    Pipeline step that fits a classifier when constructed and adds its predictions to a dataframe.

    Regular (scikit-learn style) classifiers are fitted on numpy arrays and later produce a score
    (probability of the positive class); fair-aware classifiers are fitted on an AIF360
    BinaryLabelDataset and later produce predicted labels.
    """

    def __init__(self, step_name, step, df, target_col, instance_weights=None, hyper_tune=False, param_grid=None, sensitive_att=None, fair_aware=False, target_positive=1):
        """
        :param step_name: str, name of the current input step.
        :param step: object of the initialized class.
        :param df: pandas dataframe, stores the data.
        :param target_col: str, the name of the target attribute.
        :param instance_weights: list of float in [0,1] or None, each float represents the weight of the sample in above data.
                                 None or an empty list gives every sample weight 1. Only used by regular classifiers.
        :param hyper_tune: boolean, whether to tune the hyper-parameter. Default is False.
        :param param_grid: dict or None, stores the search range of the hyper-parameter. When hyper_tune is True, this must be provided.
        :param sensitive_att: str, the name of a sensitive attribute.
        :param fair_aware: boolean, whether the model is fair-aware. Default is False.
        :param target_positive: integer, 0 or 1, represents the positive value of the target attribute. Default is 1.
        :raises ValueError: if hyper_tune is True and param_grid is None or empty.
        """

        super().__init__(step_name=step_name, df=df, sensitive_att=sensitive_att, target_col=target_col)
        # assume the data set has been encoded to numerical values
        if fair_aware:  # fair classifiers
            # initialize a binary label dataset from AIF 360
            aif_df = BinaryLabelDataset(df=df, label_names=[target_col], protected_attribute_names=[sensitive_att])
            fitted_step = step.fit(aif_df)
            input_score = False
        else:  # regular classifiers
            if instance_weights is None or len(instance_weights) == 0:
                instance_weights = [1] * df.shape[0]
            if hyper_tune:  # grid search for best hyper parameters
                if not param_grid:
                    raise ValueError("Need to specify the search range of the hyper parameters - 'param_grid' is empty!")

                # The single pipeline step is named 'learner', so grid keys and fit parameters
                # are addressed with the 'learner__' prefix.
                search = GridSearchCV(Pipeline([('learner', step)]), param_grid, scoring='roc_auc', cv=5, verbose=1, n_jobs=-1)
                # `groups` is left at its default instead of passing None positionally:
                # newer scikit-learn versions do not accept a third positional argument here.
                fitted_step = search.fit(np.array(df.drop(columns=[target_col])), np.array(df[target_col]), **{'learner__sample_weight': instance_weights})
            else:
                fitted_step = step.fit(np.array(df.drop(columns=[target_col])), np.array(df[target_col]), sample_weight=instance_weights)
            input_score = True

        self.input_score = input_score
        self.step = fitted_step
        self.target_positive = target_positive

    def apply(self, df):
        """
        Predict with the fitted step and return the data with the prediction added.

        :param df: pandas dataframe, stores the data to predict on; must contain the target column.
        :return: pandas dataframe obtained from AIF360's convert_to_dataframe with one extra column,
                 self.pred_target_col: the predicted probability of the positive class for regular
                 models, or the predicted label for fair-aware models.
        """
        # NOTE(review): target_col, sensitive_att and pred_target_col are not set in this class --
        # presumably Step.__init__ provides them; confirm.

        if self.input_score:  # for regular model, generate score prediction
            aif_pred_df = BinaryLabelDataset(df=df, label_names=[self.target_col], protected_attribute_names=[])
            after_df, _ = aif_pred_df.convert_to_dataframe(de_dummy_code=True, sep='=', set_category=True)

            # Column of predict_proba that corresponds to the positive class.
            favorable_class_idx = list(self.step.classes_).index(self.target_positive)
            after_df[self.pred_target_col] = [x[favorable_class_idx] for x in self.step.predict_proba(np.array(df.drop(columns=[self.target_col])))]

        else:  # for fair model, generate label prediction
            aif_pred_df = BinaryLabelDataset(df=df, label_names=[self.target_col],
                                             protected_attribute_names=[self.sensitive_att])

            after_aif_df = self.step.predict(aif_pred_df)
            after_df, _ = after_aif_df.convert_to_dataframe(de_dummy_code=True, sep='=', set_category=True)
            after_df[self.pred_target_col] = after_aif_df.labels

        return after_df
66 changes: 66 additions & 0 deletions pipeline/postprocess/fair_postprocessors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
"""
Classes for post-process data and model outcomes
"""

import pandas as pd
from aif360.algorithms.postprocessing import CalibratedEqOddsPostprocessing
from aif360.algorithms.postprocessing import EqOddsPostprocessing
from aif360.algorithms.postprocessing import RejectOptionClassification
from pipeline.postprocess.postprocessor import Postprocessor

class AIF_EqOddsPostprocessing(Postprocessor):
    """Post-processor wrapping aif360's EqOddsPostprocessing."""

    def __init__(self, df, target_col, sensitive_att, threshold=0.5, seed=0):
        """
        Build the equalized-odds post-processor and hand it to Postprocessor with input_score=False.

        :param df: pandas dataframe, stores the data to fit the postprocessor.
        :param target_col: str, the name of the target variable in above data. Assume 1 represents the favorable class.
        :param sensitive_att: str, the name of a sensitive attribute in above data. Value 0 is used as the
                              unprivileged (protected) group and value 1 as the privileged group.
        :param threshold: float in [0, 1], the classification threshold to generate the predicted class label.
        :param seed: integer, the seed for random state.
        """
        protected_group = [{sensitive_att: 0}]
        favoured_group = [{sensitive_att: 1}]
        eq_odds = EqOddsPostprocessing(protected_group, favoured_group, seed)

        step_name = "@".join(["AIF_EqOddsPostprocessing", sensitive_att])
        super().__init__(step_name, eq_odds, df, sensitive_att, target_col, input_score=False, clf_threshold=threshold)


class AIF_CalibratedEqOddsPostprocessing(Postprocessor):
    """Post-processor wrapping aif360's CalibratedEqOddsPostprocessing."""

    def __init__(self, df, target_col, sensitive_att, threshold=0.5, seed=0, cost_constraint='weighted'):
        """
        Build the calibrated equalized-odds post-processor and hand it to Postprocessor with input_score=True.

        :param df: pandas dataframe, stores the data to fit the postprocessor.
        :param target_col: str, the name of the target variable in above data. Assume 1 represents the favorable class.
        :param sensitive_att: str, the name of a sensitive attribute in above data. Value 0 is used as the
                              unprivileged (protected) group and value 1 as the privileged group.
        :param threshold: float in [0, 1], the classification threshold to generate the predicted class label.
        :param seed: integer, the seed for random state.
        :param cost_constraint: str, the fairness constraints format, value from [fpr, fnr, weighted].
                                The same parameter as in aif360.algorithms.postprocessing.CalibratedEqOddsPostprocessing.
        """
        protected_group = [{sensitive_att: 0}]
        favoured_group = [{sensitive_att: 1}]
        calibrated = CalibratedEqOddsPostprocessing(
            protected_group,
            favoured_group,
            cost_constraint=cost_constraint,
            seed=seed,
        )

        step_name = "@".join(["AIF_CalibratedEqOddsPostprocessing", sensitive_att])
        super().__init__(step_name, calibrated, df, sensitive_att, target_col, input_score=True, clf_threshold=threshold)

class AIF_RejectOptionPostprocessing(Postprocessor):
    """Post-processor wrapping aif360's RejectOptionClassification."""

    def __init__(self, df, target_col, sensitive_att, threshold=0.5):
        """
        Build the reject-option post-processor and hand it to Postprocessor with input_score=True.

        :param df: pandas dataframe, stores the data to fit the postprocessor.
        :param target_col: str, the name of the target variable in above data. Assume 1 represents the favorable class.
        :param sensitive_att: str, the name of a sensitive attribute in above data. Value 0 is used as the
                              unprivileged (protected) group and value 1 as the privileged group.
        :param threshold: float in [0, 1], the classification threshold to generate the predicted class label.
        """
        # TODO: fix the bug that reject option doesn't return results
        protected_group = [{sensitive_att: 0}]
        favoured_group = [{sensitive_att: 1}]
        reject_option = RejectOptionClassification(protected_group, favoured_group)

        # NOTE(review): the step name uses "AIF_RejectOptionClassification", not this class's name --
        # confirm whether that is intentional before relying on get_name().
        step_name = "@".join(["AIF_RejectOptionClassification", sensitive_att])
        super().__init__(step_name, reject_option, df, sensitive_att, target_col, input_score=True, clf_threshold=threshold)



if __name__ == '__main__':
    # Manual run: fit one post-processor on the adult post-processing data, apply it to the same
    # data, and write the result next to the input (paths are relative to the working directory).
    adult_df = pd.read_csv("../../data/adult_post.csv")

    # Other post-processors that can be swapped in:
    # post_step = AIF_RejectOptionPostprocessing(adult_df, "income-per-year", "sex")
    # post_step = AIF_CalibratedEqOddsPostprocessing(adult_df, "income-per-year", "sex")
    post_step = AIF_EqOddsPostprocessing(adult_df, "income-per-year", "sex")

    adjusted_df = post_step.apply(adult_df)
    out_path = "../../data/adult_" + post_step.get_name() + ".csv"
    adjusted_df.to_csv(out_path, index=False)

    print(post_step.get_name())
Loading