d-Simplexed/src/3d/sampler.py at master · HY-UDBMS/d-Simplexed · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
"""
    Sampling utils and implementations used in Delaunay Triangulation model.
"""
from pyDOE import *
from delaunaymodel import DelaunayModel
import random
import math

# number of decimal places to preserve in sample
sample_precision = 3

# scale from [old_min, old_max] --> [new_min, new_max]
def scale_to(old_min, old_max, new_min, new_max, value):
	return new_min + ((new_max - new_min)/(old_max - old_min))*(value - old_min)

# https://stackoverflow.com/questions/2130016/splitting-a-list-into-n-parts-of-approximately-equal-length
def chunkIt(seq, num):
    avg = len(seq) / float(num)
    out = []
    last = 0.0

    while last < len(seq):
        out.append(seq[int(last):int(last + avg)])
        last += avg

    return out

# quicker way to LHS sample from a discrete set of features
# is easy to impl but has limitation that we can only return len(f1) samples
# TODO: optimize to handle when sizeof f1 != sizeof f2 (scaling needed)
def seed_sample(features, samples):
	f1,f2 = features

	# sizeof f1 must equal sizeof f2
	assert len(f1) == len(f2)

	lhs_samples = discrete_lhs([], features, samples)
	return lhs_samples

# taken_points = points in points_space already used in model as array of [[f1,f2]runtime]
# feature_space = f1=[a, b, c, ..., n], f2=[a, b, c, d, ..., n] -- total range of features
# samples = number of samples to take
def discrete_lhs(taken_points, feature_space, samples):
	f1,f2 = feature_space

	# sizeof f1 must equal sizeof f2
	assert len(f1) == len(f2)

	print "discrete_lhs samples to take: " + str(samples)

	# divide feature_space into buckets
	f1_buckets = chunkIt(f1, samples)
	f2_buckets = chunkIt(f2, samples)

	print "f1_buckets " + str(f1_buckets)
	print "f2_buckets " + str(f2_buckets)

	random.shuffle(f1_buckets)
	random.shuffle(f2_buckets)

	lhs_sample_buckets = []
	while len(f1_buckets) > 0:
		lhs_sample_buckets.append([f1_buckets.pop(), f2_buckets.pop()])

	# each lhs_sample_buckets[i] is something like [[a,b,c][d,e,f]], which represents a square-ish range of the domain that we need to pick a sample from
	# generate the possible points, then return the first one that is not in taken_points
	print str(lhs_sample_buckets)

	lhs_samples = []
	for bucket_pair in lhs_sample_buckets:
		f1_buckets_i,f2_buckets_i = bucket_pair
		print "f1_buckets_i" + str(f1_buckets_i)
		print "f2_buckets_i" + str(f2_buckets_i)

		candidates = []
		for f1_vals in f1_buckets_i:
			for f2_vals in f2_buckets_i:
				candidates.append([f1_vals, f2_vals])

		for candidate in candidates:
			f1,f2 = candidate
			# check if point is already taken (in model)
			def is_taken(points, point):
				for taken_point in points:
					if taken_point[0] == point:
						print "point " + str(point) + " is taken!"
						return True
				return False

			if not is_taken(taken_points, candidate):
				lhs_samples.append(candidate)
				break

	print "discrete_lhs final samples: " + str(lhs_samples)

	return lhs_samples

# model_points = points to create model with
# sample = [f1,f2] -- sample to calculate utility for
def utility(model_points, sample):
	# construct DT model with model_points
	# calculate utility using sample
	# utility in 3d = average (distance from current point w/ predicted runtime to the other 3 points that construct its prediction plane)
	print "constructing model for utility of sample " + str(sample)

	dt = DelaunayModel(model_points)
	dt.construct_model()

	est_runtime = dt.predict(sample)
	sample_hyperplane = dt.hyperplane_for(sample)

	print "util: find euclid dist between " + str(sample_hyperplane) + " --> " + str(sample)

	x_0, y_0, z_0 = sample[0], sample[1], est_runtime
	sum_dist = 0.0
	for x,y,z in sample_hyperplane:
		# euclidean dist between [x_0, y_0, z_0] and [x_1, y_1, z_1]: sqrt((x_1 - x_0)^2+(y_1 - y_0)^2+(z_1 - z_0)^2)
		sum_dist += (math.sqrt((x-x_0)**2 + (y-y_0)**2 + (z-z_0)**2))

	# utiliy = avg of distances to 3 points
	utility = sum_dist / 3.0
	print "util: util for " + str(sample) + " is " + str(utility)

	return utility

# return complete list of gridded samples.  Grab from 0 the pre-computed gridded values.
# feature_space = possible f1 and f2 values, eg [[1,2,3...n][5,10,15,...n]]
# step_sizes = [100, 50, 25, 5, 1] = step sizes to take thru the dataset, in order, e.g. first take passes 100 steps long, then go by 50, 25 etc
def get_gridding_samples(feature_space, step_sizes):
	#def grid(features, step_size):
    result =[]
    for i in range(len(step_sizes)):
        x1_1 = []
        x1_1.append(feature_space[0][0])
        k = 1
        while (feature_space[0][0]+k*step_sizes[i]*2<=feature_space[0][-1]):
         x1_1.append(feature_space[0][0+k*step_sizes[i]])
         k=k+1
        # print x1_1

        x2_2 = []
        x2_2.append(feature_space[1][0])
        k = 1
        while (feature_space[1][0]+k*step_sizes[i]<=feature_space[1][-1]):
         x2_2.append(feature_space[1][0+k*step_sizes[i]])
         k=k+1
        # print x2_2
        # print (len(x1_1))
        for j in range(len(x1_1)):
            for k in range(len(x2_2)):
                # print "x1, x2:" +str( [x1_1[j],x2_2[k]])
                if [x1_1[j],x2_2[k]] not in result:
                    result.append([x1_1[j],x2_2[k]])
    return result

# current_model_points = points in current delaunay model
# available_points = points that can be added to the model
# feature_space = possible f1 and f2 values, eg [[1,2,3...n][5,10,15,...n]]
def next_random_sample(current_model_points, available_points, feature_space):
	print "next_random_sample: sizeof current_model_points + available_points = " + str(len(current_model_points) + len(available_points))

	random.shuffle(available_points)
	return available_points.pop()

# current_model_points = points in current delaunay model
# available_points = points that can be added to the model
# feature_space = possible f1 and f2 values, eg [[1,2,3...n][5,10,15,...n]]
def next_adaptive_sample(current_model_points, available_points, feature_space):
	print "next_adaptive_sample: sizeof current_model_points + available_points = " + str(len(current_model_points) + len(available_points))
	# after some threshold, you can't really LHS sample anymore because there are too many "holes" in the mesh
	# better to do it randomly after some threshold

	# re-lhs n samples
	# for each sample, predict runtime with same model

	next_samples = []

	# for md model, the LHS samples we get from lhs_sample_v2 is 21
	lhs_sample_size = len(feature_space[0])
	if len(available_points) < lhs_sample_size:
		# not enough points to get a full LHS sample, so we'll consider the rest at once (edge case)
		print "Not enough points remain to LHS"
		next_samples = [[f1,f2] for [[f1,f2],rt] in available_points]
	else:
		print "discrete_lhs current_model_points" + str(current_model_points)
		print "discrete_lhs feature_space" + str(feature_space)
		next_samples = discrete_lhs(current_model_points, feature_space, lhs_sample_size)
		# discrete_lhs returns [f1,f2] values, need to attach actual runtime

	utils = {}
	for i,hist_point in enumerate(next_samples):
		cfg = hist_point
		#runtime = hist_point[1]
		#print str(i) + " checking " + str(cfg) + " --> " +str(runtime)
		utils[i] = utility(current_model_points, cfg)

	# if utils are empty, it means we couldn't get enough samples for a full discrete_lhs, so in that case just return the rest randomized (it should be very few points relatively anyway)
	if len(utils) == 0:
		random.shuffle(available_points)
		return available_points.pop()

	print "determined utils: " + str(utils)
	# sort highest util
	highest_util_idx = 0
	highest_util_val = utils[0]
	for key,val in utils.items():
		if val > highest_util_val:
			highest_util_val = val
			highest_util_idx = key

	print "LARGEST utility ({}) belongs to {}".format(highest_util_val, available_points[highest_util_idx])
	popped = available_points.pop(highest_util_idx)
	return popped