-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathfd_discover.py
More file actions
124 lines (106 loc) · 3.16 KB
/
fd_discover.py
File metadata and controls
124 lines (106 loc) · 3.16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# coding: utf-8
from __future__ import print_function
import numpy as np
import csv
import time
from datetime import datetime
from collections import defaultdict
t1 = datetime.now()
partitions = {}
def getFrozenSetFromOne(x):
return frozenset([x,])
def merge_partition(ps1, ps2):
s = []
iRow2p = {}
for i, p1 in enumerate(ps1):
for iRow in p1:
iRow2p[iRow] = i
for p2 in ps2:
tmp = defaultdict(list)
for iRow in p2:
tmp[iRow2p[iRow]].append(iRow)
s += tmp.values()
return s
def get_partition(attributes):
if attributes in partitions:
return partitions[attributes]
if len(attributes) == 0:
partitions[attributes] = []
elif len(attributes) == 1:
iAttr = tuple(attributes)[0]
d = defaultdict(list)
for index, row in enumerate(table):
d[row[iAttr]].append(index)
partitions[attributes] = d.values()
else:
attr_tuple = tuple(attributes)
ps1 = get_partition(frozenset(attr_tuple[0:-1]))
ps2 = get_partition(frozenset(attr_tuple[0:-2] + attr_tuple[-1:]))
partitions[attributes] = merge_partition(ps1, ps2)
return partitions[attributes]
def isValid(X, E):
'''
test if X\{E} -> E is valid
X is a set of number, E is a number
'''
return len(get_partition(X - {E})) == len(get_partition(X))
def compute_dependencies(L): # L is a set of tuple of number
L_new = L.copy()
for X in L:
Xs = frozenset(X)
RHS[X] = R
for E in Xs:
RHS[X] = RHS[X] & RHS[Xs - {E}]
for E in RHS[X] & Xs:
if isValid(Xs, E):
fds.append((Xs - {E}, E))
RHS[X] -= getFrozenSetFromOne(E)
RHS[X] = RHS[X] & Xs
if len(RHS[X]) == 0:
L_new.remove(X)
return L_new
def generate_next_level(L):
# list comprehension?
Ln = set([])
for l1 in L:
for l2 in L:
if l1 != l2 and len(l1 - l2) == 1:
Ln.add(l1 | l2)
return Ln
def mycmp(fd1, fd2):
left1 = sorted(list(fd1[0]))
left2 = sorted(list(fd2[0]))
if left1 < left2 or (fd1[0] == fd2[0] and fd1[1] < fd2[1]):
return -1
elif left1 > left2 or (fd1[0] == fd2[0] and fd1[1] > fd2[1]):
return 1
else:
return 0
def output_fd(fds):
with open(output_filename, 'w') as f:
fds_sorted = sorted(fds, cmp=mycmp)
for fd in fds_sorted:
str = ''
for l in sorted(list(fd[0])):
str += '%d ' % (l+1)
str += '-> %d\n' % (fd[1]+1)
f.write(str)
# get data
input_filename = 'data.txt'
output_filename = 'outputp.txt'
with open(input_filename, 'rb') as f:
reader = csv.reader(f)
table = map(tuple, reader)
maxL = len(table[0])
R = frozenset(range(maxL))
RHS = {frozenset([]): R}
fds = []
def main():
L = frozenset(map(getFrozenSetFromOne, R))
L = compute_dependencies(L)
for i in range(1, maxL):
L = compute_dependencies(generate_next_level(L))
print('level: %d, time used: %s' %(i+1, datetime.now() - t1))
output_fd(fds)
print('time used: %s' %(datetime.now() - t1))
main()