-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathreal.py
More file actions
144 lines (129 loc) · 6.29 KB
/
real.py
File metadata and controls
144 lines (129 loc) · 6.29 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
from itertools import combinations_with_replacement
import numpy as np
import pandas as pd
import click
import ast
from lib.greedy_deter import compute_gd
from lib.greedy_rand import compute_gr
from lib.max_cover import compute_maxcov
from lib.min_cost import compute_mincos
from lib.iterative import compute_iter
from lib.thres import compute_thres
class PythonLiteralOption(click.Option):
def type_cast_value(self, ctx, value):
try:
return ast.literal_eval(value)
except:
raise click.BadParameter(value)
@click.command()
@click.option('--data', required=True)
@click.option('--gamma', type=float, required=True)
@click.option('--alpha',type=float ,required=True)
@click.option('--output', required=True, help="output")
@click.option('--algo', required=True, help="algorithm to execute")
@click.option('--k', type=int, default=2)
@click.option('--seed', type=int, default=1, help="random number for seed.")
@click.option('--leaking', cls=PythonLiteralOption, default='None', help="probabilities of sampling some other individual")
@click.option('--max_iter', type=int, default=100, help="max iterations for the iterative algorithm")
@click.option('--cost_method', type=str, help="method of setting the cost function")
@click.option('--njobs', default=1, help="number of parallel threads")
@click.option('--noisy_pyx', default=0.0, type=float, help="relative variance of noise for predictions")
@click.option('--noisy_cost',default=0.0, type=float, help="relative variance of noise for costs")
@click.option('--confounding',default=0.0, type=float, help="level of confounding")
def experiment(data, output, seed, gamma, alpha, k, leaking, algo, cost_method, njobs, max_iter, noisy_pyx, noisy_cost, confounding):
"""
Executes one of the algorithms on real data.
Parameters
----------
data : string
data directory prefix (e.g., data/processed/fico)
output : string
output directory prefix (e.g., outputs/exec1_)
seed : int
random seed for reproducibility
gamma : float
gamma parameter value
alpha : float
alpha parameter value
k : int
maximum number of explanations
leaking : list of floats
leakage probabilities
algo : string
algorithm to be used
cost_method : string
method of setting the cost function (euclidean or max percentile shift)
njobs : int
number of parallel threads to be used
max_iter : int
max iterations for the iterative algorithm
noisy_pyx : float
relative variance of noise for predictions
noisy_cost : float
relative variance of noise for costs
confounding : float
level of confounding
"""
# Read outcomes
u_temp = pd.read_csv(data+'_pyx.csv', index_col=0, names=["ID", "Probability"], header=0, dtype={'Probability': float})
if noisy_pyx==0.0:
u = u_temp.copy()
else:
np.random.seed(seed)
u = u_temp.copy()
pyx_std = u_temp['Probability'].std()
u['Probability'] = u_temp['Probability'] + np.random.normal(0, noisy_pyx*pyx_std, len(u_temp['Probability']))
u['Probability'] = u['Probability'].apply(lambda x : max(0,min(x,1)))
u.sort_values(by=["Probability"],inplace=True, ascending=False)
indexing=u.index.values.flatten().tolist()
u=u.values.flatten()-gamma
u_temp=u_temp.reindex(indexing)
u_temp=u_temp.values.flatten()-gamma
# Read costs
cost_temp=pd.read_csv(data+'_cost_' + str(cost_method) + '.csv', index_col=0, header=0)
cost_temp.columns = cost_temp.columns.astype(int)
if noisy_cost==0.0:
cost = cost_temp.copy()
else:
np.random.seed(seed)
cost = cost_temp.copy()
cost_std = np.std([x for x in cost.to_numpy().flatten() if x<1.01])
cost = cost + np.random.normal(0, noisy_cost*cost_std, cost.shape)
cost = cost.applymap(lambda x : max(0,x))
for i in range(len(cost.index)):
for j in range(len(cost.columns)):
if i==j:
cost.loc[i,j] = 0
cost=cost[indexing]
c = cost.reindex(indexing).to_numpy()
c = c*alpha # scaling
cost_temp=cost_temp[indexing]
c_temp = cost_temp.reindex(indexing).to_numpy()
c_temp = c_temp*alpha # scaling
# Read population
px_df = pd.read_csv(data+'_px.csv', index_col=0, header=0)
px = px_df.reindex(indexing).to_numpy().flatten()
# Compute
if noisy_cost==0.0 and noisy_pyx==0.0:
if algo=="gd": # Greedy deterministic
compute_gd(output=output, C=c, U=u, Px=px, k=k, seed=seed, alpha=alpha, indexing=indexing, njobs=njobs)
elif algo=="gr": # Greedy randomized
compute_gr(output=output, C=c, U=u, Px=px, k=k, seed=seed, leaking=leaking, alpha=alpha, indexing=indexing, njobs=njobs)
elif algo=="maxcov": # Maximum coverage
compute_maxcov(output=output, C=c, U=u, Px=px, k=k, seed=seed, alpha=alpha, indexing=indexing, njobs=njobs)
elif algo=="mincos": # Minimum cost
compute_mincos(output=output, C=c, U=u, Px=px, k=k, seed=seed, alpha=alpha, indexing=indexing, njobs=njobs)
elif algo=="it": # Iterative
compute_iter(output=output, C=c, U=u, Px=px, seed=seed, alpha=alpha, indexing=indexing, max_iter=max_iter, verbose=False, njobs=njobs, split_components=False)
elif algo=="itc": # Iterative (with components)
compute_iter(output=output, C=c, U=u, Px=px, seed=seed, alpha=alpha, indexing=indexing, max_iter=max_iter, verbose=False, njobs=njobs, split_components=True, confounding=confounding)
elif algo=="th": # Threshold
compute_thres(output=output, C=c, U=u, Px=px, seed=seed, alpha=alpha, indexing=indexing, njobs=njobs, confounding=confounding)
else:
if algo=="itc": # Iterative (with components)
compute_iter(output=output, C=c, U=u, Px=px, seed=seed, alpha=alpha, indexing=indexing, max_iter=max_iter, verbose=False, njobs=njobs, split_components=True, U_real=u_temp, C_real=c_temp)
elif algo=="th": # Threshold
compute_thres(output=output, C=c, U=u, Px=px, seed=seed, alpha=alpha, indexing=indexing, njobs=njobs, U_real=u_temp, C_real=c_temp)
# TODO: Make sure to give cost_method as argument when calling real in counterfactuals' scripts
if __name__ == '__main__':
experiment()