-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathSuperLogLogEstimator.py
More file actions
54 lines (44 loc) · 1.35 KB
/
SuperLogLogEstimator.py
File metadata and controls
54 lines (44 loc) · 1.35 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# Dawsin Blanchard, Sam Braga, Brian Couture, and Ethan Trott
# COS226 University of Maine
import math
#returns an estimate of the cardinality of hashes using k bits for the buckets
def SuperLogLog(hashes, k):
#number of buckets
m = 2**k
#initialize buckets to 0
buckets = [0] * m
#loop through all hashes
for i in range(0, len(hashes)):
#get bucket{j} of the hash (first k bits)
j = str(hashes[i])[:k]
j = int(j, 2)
#get remaining bits (remaining bits after bucket)
data = hashes[i][k:]
#get rank of the first 1 in the remaining bits (number of trailing zeros)
rank = 1
for c in reversed(data):
if c == "0":
rank += 1
else:
break
#if rank is higher than the previous bucket value set it to the new rank
buckets[j] = max(buckets[j], rank)
#calculate lower and upper cutoffs
#a cut off of ~0.7 is optimal
cutoff = 0.7
lower = math.floor(cutoff * m)
upper = m - lower
#apply cutoff
buckets.sort()
for i in range(upper):
del buckets[-1]
#get bucket average
average = sum(buckets) / lower
#calculate estimate
estimate = m * 2 ** average
#the theoretical calculation of the bias factor for SLL is unknown
#however, rough empircal tuning gives us a coefficient of around 0.764
BIAS = 0.764
#correct for bias
estimate = BIAS * estimate
return estimate