-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathGeneticOptimizer.py
More file actions
191 lines (160 loc) · 9.53 KB
/
GeneticOptimizer.py
File metadata and controls
191 lines (160 loc) · 9.53 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
'''
GeneticOptimizer
Author: Nicholas Klepp
The genetic optimizer is an implementation of a binary logistic regression document
classifier the weights of which are learned via genetic programming.
Given a set of training documents, X, and the corresponding training labels, y, the
binary classification logistic regression model
y(X;Theta) = exp(-Theta*X) / ( 1 + exp(-Theta *X))
is learned by via a simple genetic programming algorithm with respect to parameter Theta.
The predictions for an unlabeled set of documents, z, is given by
y_hat(z;Theta) = floor(exp(-Theta*z) / ( 1 + exp(-Theta *z) ) + .5)
In other words, the prediction y_hat(z;Theta) is 0 if the logistic regression function is less
.5 and 1 if the logistic regression function is greater than or equal to .5.
The genetic programming:
A initial population of possible Theta parameters, P, of size n is drawn from a random
distribution. For some predetermined number of iterations (generations) g, the prediction
function y_hat(X;Theta) is calculated for each p in P and the population evolves. Each p in
P is judged for its fitness for prediction, the fitness function being the average number of
documents for which p generates an accurate prediction. The top s percent of performers according to
the fitness criteria are selected, and the remaining members of the population are culled. The
remaining members of the population are then randomly "mated" to produce the next generation
of P, whereby two p_i and p_j combine to form p', the average of the two weight vectors. The
offspring then undergo random mutation. If Theta=[Theta_0, Theta_1, ... , Theta_n] is the vector
of weights used in the logistic classifier, then P describes a set of n_p many possible weights
for each parameter Theta_i. To mutate the population, for each Theta_i we calculate the mean, mu,
and variance, sigma^2, for w_i over the entire population. Then, for each p=[w_p0, w_p1, ... , w_pn]
in P, with probability m each w_pi is reassigned a new value pulled from a standard normal distribution
centered at mu with standard deviation sigma^2. The next iteration of evolution then begins.
Required Input:
input training documents, X
input training labels, y
Optional Input:
unlabeled documents, z
population size, n_p [DEFAULT: 100]
mutation rate, m [DEFAULT: .01]
survival rate, s [DEFAULT: .3]
# of generations, g [DEFAULT: 100]
'''
import argparse
import numpy as np
def _load_X(path):
# Load the data.
mat = np.loadtxt(path, dtype = int)
max_doc_id = mat[:, 0].max()
max_word_id = mat[:, 1].max()
X = np.zeros(shape = (max_doc_id, max_word_id))
for (docid, wordid, count) in mat:
X[docid - 1, wordid - 1] = count
return X
def _load_Z(path):
mat = np.loadtxt(path,dtype = int)
max_doc_id = mat[:,0].max()
max_word_id = mat[:,1].max()
Z = np.zeros(shape = (max_doc_id,max_word_id))
Z = np.zeros(shape=(max_doc_id,X.shape[1]))
for (docid,wordid,count) in mat:
Z[docid-1,wordid-1]=count
return Z[:,:X.shape[1]]
def _load_train(data, labels):
# Load the labels.
y = np.loadtxt(labels, dtype = int)
X = _load_X(data)
return [X, y]
if __name__ == "__main__":
parser = argparse.ArgumentParser(description = "Assignment 2",
epilog = "CSCI 4360/6360 Data Science II: Fall 2017",
add_help = "How to use",
prog = "python assignment2.py -x <train-data> -y <train-label> <optional args>")
parser.add_argument("-x", "--traindata", default = 100, required = True,
help = "The training data for learning the model.")
parser.add_argument("-y", "--trainlabel", default = 100, required = True,
help = "The training labels for learning the model.")
parser.add_argument("-z", "--testdata",
help = "Unlabeled data for generating predictions.")
parser.add_argument("-n", "--population", default = 100, type = int,
help = "Population size [DEFAULT: 100].")
parser.add_argument("-s", "--survival", default = 0.3, type = float,
help = "Per-generation survival rate [DEFAULT: 0.3].")
parser.add_argument("-m", "--mutation", default = 0.01, type = float,
help = "Point mutation rate [DEFAULT: 0.01].")
parser.add_argument("-g", "--generations", default = 100, type = int,
help = "Number of generations to run [DEFAULT: 100].")
parser.add_argument("-r", "--random", default = -1, type = int,
help = "Random seed for debugging [DEFAULT: -1].")
args = vars(parser.parse_args())
# Do we set a random seed?
if args['random'] > -1:
np.random.seed(args['random'])
# Read in the training data.
X, y = _load_train(args["traindata"], args["trainlabel"])
DEBUG = False
#declare useful variables
n=args['population'] #the size of the population
s=args['survival'] #the survival rate
m=args['mutation'] #the mutation rate
g=args['generations'] #the number of generations to run the algorithm for
d=int(X.shape[0]) #the number of documents in the training set
v=int(X.shape[1]) #the size of the vocabulary
if DEBUG : print("args:",args)
test_data = args['testdata']
#initialize a random population.
pop = np.random.normal(loc=0,scale = 1,size = (n,v)) #an initial population drawn from a normal distribution in an nXv matrix
if DEBUG : print("pop:",pop.shape)
def copy_doc(docs): #stack n many mXv arrays on top of each other for one nXmXv array
if DEBUG: print("copy_doc")
docs = np.array([docs]*n)
return np.transpose(docs,[1,0,2])
def combine_cubes(population,doc_copy): #return an nXm matrix where M[i,j] is p_i DOT d_j
if DEBUG: print("combining cubes")
return np.sum(population*doc_copy,axis=2).T
def h(z): #the sigmoid function
if DEBUG: print("h(z)")
return 1.0 / (1.0 + np.exp(-1.0*z))
def predict(hz): #the prediction rule
if DEBUG: print("predict")
return np.where(hz>1/2,1,0)
def make_predictions(combo_cube): #return an nXm matrix where M[i,j] is the prediction from p_i for document d_j
if DEBUG: print("make_predictions")
return predict(h(combo_cube))
def cull_flock(population, preds,y): #assess the fitness of the population.
if DEBUG: print("cull_flock")
accurates = np.array(preds==y) #a boolean array of where each prediction was correct
accuracies = np.where(accurates,1,0) #a binary array (1=>correct,0=>incorrect)
if DEBUG: print("accuracies:",accuracies.shape)
accuracies = np.sum(accuracies,axis=1) #sum of correct predictions for each population member
accuracies = accuracies/accuracies.shape[0] #the average correctness for each population member
survivors = population[np.argsort(-1*accuracies)] #sorting the population by their average accuracy descending
return survivors[:int(np.floor(s*survivors.shape[0]))]
def mate(survivors): #create a new population of size=n from survivors of size sn
if DEBUG: print("mate")
mate1=survivors[np.random.randint(survivors.shape[0],size=n)]
mate2=survivors[np.random.randint(survivors.shape[0],size=n)]
return (mate1+mate2)/2
def mutate(offspring): #mutate the offspring
if DEBUG: print("mutate:",m)
pop_mean = np.mean(pop,axis=0) #mean of each w_i over the entire population
pop_var = np.var(pop,axis=0) #variance of each w_i over the entire population
pop_sd = np.sqrt(pop_var) #the sd ...
mutations = np.random.normal(loc=pop_mean,scale=pop_sd,size=pop.shape) #an entirely random population pulled from a normal dist with above params
coin = np.random.binomial(1,m,pop.shape) #a population of weighted coin flips (mostly 0s)
return coin*mutations + (1-coin)*offspring
for i in range(0,g): #run the generations
print("generation",i)
doc_copy = copy_doc(X)
combo_cube = combine_cubes(pop,doc_copy)
preds = make_predictions(combo_cube)
survivors = cull_flock(pop,preds,y)
offspring = mate(survivors)
mutants = mutate(offspring)
pop = mutants
doc_copy = copy_doc(X) #one final generation
combo_cube = combine_cubes(pop,doc_copy)
preds = make_predictions(combo_cube)
survivors = cull_flock(pop,preds,y)
highlander = survivors[0] #the most fit of the final generation...THERE CAN BE ONLY ONE!
if not test_data == None:
Z = _load_Z(test_data) #the test data
predictions = predict(h(np.sum(highlander*Z,axis=1))) #make predictions using the Highlander
for p in predictions:
print(p)