-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdatacost.py
More file actions
186 lines (138 loc) · 6.69 KB
/
datacost.py
File metadata and controls
186 lines (138 loc) · 6.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
"""datacost.py : Cost-Sensitive Data Measures
This module can be used to calculate various cost-sensitive measurements.
It is mainly intended for use in machine learning algorithms.
"""
import math
def cost_labelling_positive(num_positive, num_negative, cost_matrix):
    """Calculate the cost of labelling every data point as positive.

    The result is ``num_positive * TP + num_negative * FP``: every positive
    point is charged the 'TP' cost and every negative point the 'FP' cost.

    Args:
        num_positive (int): The number of positive data points.
        num_negative (int): The number of negative data points.
        cost_matrix (dict): Every cost. e.g., {'TP':1, 'TN':0, 'FP':1, 'FN':5}

    Returns:
        (num): The cost of labelling every data point as positive.

    Raises:
        KeyError: If the passed cost_matrix is missing a cost.
    """
    # No manual argument-count check: Python itself raises TypeError before
    # the body runs when the call does not match the signature.
    # All four costs are required, even though only two are used here.
    if any(k not in cost_matrix for k in ('TP', 'TN', 'FP', 'FN')):
        raise KeyError('A cost is missing from the passed cost matrix.')
    return num_positive * cost_matrix['TP'] + num_negative * cost_matrix['FP']
def cost_labelling_negative(num_positive, num_negative, cost_matrix):
    """Calculate the cost of labelling every data point as negative.

    The result is ``num_negative * TN + num_positive * FN``: every negative
    point is charged the 'TN' cost and every positive point the 'FN' cost.

    Args:
        num_positive (int): The number of positive data points.
        num_negative (int): The number of negative data points.
        cost_matrix (dict): Every cost. e.g., {'TP':1, 'TN':0, 'FP':1, 'FN':5}

    Returns:
        (num): The cost of labelling every data point as negative.

    Raises:
        KeyError: If the passed cost_matrix is missing a cost.
    """
    # No manual argument-count check: Python itself raises TypeError before
    # the body runs when the call does not match the signature.
    # All four costs are required, even though only two are used here.
    if any(k not in cost_matrix for k in ('TP', 'TN', 'FP', 'FN')):
        raise KeyError('A cost is missing from the passed cost matrix.')
    return num_negative * cost_matrix['TN'] + num_positive * cost_matrix['FN']
def expected_cost(num_positive, num_negative, cost_matrix):
    """Calculate the expected cost for a set of data points.

    The expected cost is ``2 * c_p * c_n / (c_p + c_n)``, where ``c_p`` is the
    cost of labelling every point as positive and ``c_n`` is the cost of
    labelling every point as negative.

    Args:
        num_positive (int): The number of positive data points.
        num_negative (int): The number of negative data points.
        cost_matrix (dict): Every cost. e.g., {'TP':1, 'TN':0, 'FP':1, 'FN':5}

    Returns:
        (num): The expected cost for the given data points. When both
            all-one-label costs are zero (e.g. there are no data points)
            the expected cost is 0.0.

    Raises:
        KeyError: If the passed cost_matrix is missing a cost.
    """
    # No manual argument-count check: Python itself raises TypeError before
    # the body runs when the call does not match the signature.
    if any(k not in cost_matrix for k in ('TP', 'TN', 'FP', 'FN')):
        raise KeyError('A cost is missing from the passed cost matrix.')
    c_p = cost_labelling_positive(num_positive, num_negative, cost_matrix)
    c_n = cost_labelling_negative(num_positive, num_negative, cost_matrix)
    # With both costs at zero the formula would divide 0 by 0; nothing can be
    # mislabelled at a cost, so report zero instead of raising.
    if c_p == 0 and c_n == 0:
        return 0.0
    return (2 * c_p * c_n) / (c_p + c_n)
def expected_cost_after_split(class_supports, cost_matrix):
    """Calculate the expected cost for a set of splitted data points.

    The result is the sum of ``expected_cost`` over every split.

    Args:
        class_supports (list<dict>): The class supports for each split where
            the i'th split corresponds to the (i-1)'th element of the list.
            For example, the second split could be
            {'positive' : 2, 'negative' : 6}. Any iterable of such dicts is
            accepted; it is consumed once.
        cost_matrix (dict): Every cost. e.g., {'TP':1, 'TN':0, 'FP':1, 'FN':5}

    Returns:
        (num): The expected cost for the set of splitted data points
            (0 when there are no splits).

    Raises:
        KeyError: If the passed cost_matrix is missing a cost or the passed
            class supports is missing either of the keys: 'positive' or
            'negative'.
    """
    # No manual argument-count check: Python itself raises TypeError before
    # the body runs when the call does not match the signature.
    if any(k not in cost_matrix for k in ('TP', 'TN', 'FP', 'FN')):
        raise KeyError('A cost is missing from the passed cost matrix.')

    # Materialise once: the splits are walked twice below (validate, then
    # sum), which would silently yield 0 for a one-shot iterator.
    splits = list(class_supports)

    # Validate every split up front so a malformed entry is reported before
    # any cost is computed.
    for supports in splits:
        if any(k not in supports for k in ('positive', 'negative')):
            raise KeyError('Class supports missing either positive or negative key.')

    return sum(
        expected_cost(supports['positive'], supports['negative'], cost_matrix)
        for supports in splits
    )
def expected_cost_per_record(num_positive, num_negative, cost_matrix):
    """Calculate the expected cost per data point.

    This is ``expected_cost`` for the data points divided by the total number
    of data points (``num_positive + num_negative``).

    Args:
        num_positive (int): The number of positive data points.
        num_negative (int): The number of negative data points.
        cost_matrix (dict): Every cost. e.g., {'TP':1, 'TN':0, 'FP':1, 'FN':5}

    Returns:
        (num): The expected cost per data point, or 0.0 when there are no
            data points.

    Raises:
        KeyError: If the passed cost_matrix is missing a cost.
    """
    # No manual argument-count check: Python itself raises TypeError before
    # the body runs when the call does not match the signature.
    if any(k not in cost_matrix for k in ('TP', 'TN', 'FP', 'FN')):
        raise KeyError('A cost is missing from the passed cost matrix.')
    num_data_points = num_positive + num_negative
    # With no records there is nothing to average over; return zero rather
    # than dividing by zero.
    if num_data_points == 0:
        return 0.0
    expected_cost_all = expected_cost(num_positive, num_negative, cost_matrix)
    return expected_cost_all / num_data_points
def total_cost(num_positive, num_negative, cost_matrix):
    """Calculate the total cost of the set of data points.

    The total cost is the cheaper of the two all-one-label options: labelling
    every point as positive or labelling every point as negative.

    Args:
        num_positive (int): The number of positive data points.
        num_negative (int): The number of negative data points.
        cost_matrix (dict): Every cost. e.g., {'TP':1, 'TN':0, 'FP':1, 'FN':5}

    Returns:
        (num): The total cost of the set of data points.

    Raises:
        KeyError: If the passed cost_matrix is missing a cost.
    """
    # No manual argument-count check: Python itself raises TypeError before
    # the body runs when the call does not match the signature.
    if any(k not in cost_matrix for k in ('TP', 'TN', 'FP', 'FN')):
        raise KeyError('A cost is missing from the passed cost matrix.')
    # The total cost is actually equal to the lowest cost out of 1) the cost
    # of labelling as positive, and 2) the cost of labelling as negative.
    return min(
        cost_labelling_positive(num_positive, num_negative, cost_matrix),
        cost_labelling_negative(num_positive, num_negative, cost_matrix),
    )