-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfind_dup.py
More file actions
executable file
·38 lines (31 loc) · 1.07 KB
/
find_dup.py
File metadata and controls
executable file
·38 lines (31 loc) · 1.07 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import sys
import pandas as pd
import lib.segment as segment
import lib.dupe as dupe
# Find likely duplicate records in a CSV file.
#
# Usage: find_dup.py DATA_FILE SEG_METHOD COMP_METHOD
#
# The records are split into segments by lib.segment, every ordered pair of
# records inside a segment is scored by lib.dupe, and the pairs whose score
# exceeds DUPLICATE_THRESHOLD are written to output/results.csv.

# Pairs scoring strictly above this value are reported as possible duplicates.
DUPLICATE_THRESHOLD = 0.8

# Fixed column order for the results file, so the header is written even when
# no duplicates are found.
RESULT_COLUMNS = ["key_x", "key_y", "score"]

if len(sys.argv) < 4:
    sys.exit("usage: %s DATA_FILE SEG_METHOD COMP_METHOD" % sys.argv[0])

data_file = sys.argv[1]
seg_method = sys.argv[2]
comp_method = sys.argv[3]

data = pd.read_csv(data_file)
# NOTE(review): the loop below reads data["segment"], so assign_segments
# presumably adds that column to `data` in place and returns the segment
# labels -- confirm against lib.segment.
segments = segment.assign_segments(seg_method, data)

# Single-argument print(...) calls behave the same on Python 2 and Python 3.
print("Using %s for segmentation of data set." % (seg_method,))
print("Using %s for comparing records." % (comp_method,))

comparisons = 0  # number of ordered pairs scored
duplicates = 0   # number of ordered pairs scoring above the threshold
results = []

for s in segments:
    # The double space reproduces the output of the original print statement.
    print("comparing segment  %s" % (s,))

    # Filter once per segment instead of once per outer row; the selection
    # does not change while the segment is being compared.
    segment_rows = data.loc[data["segment"] == s]

    # iterrows() yields (index, row) tuples; they are handed to dupe.compare
    # unchanged. Each unordered pair is visited twice, as (x, y) and (y, x).
    for x in segment_rows.iterrows():
        for y in segment_rows.iterrows():
            # Never compare a record with itself (or with a row sharing its key).
            if x[1]["key"] == y[1]["key"]:
                continue

            comparisons += 1
            score = dupe.compare(comp_method, x, y)
            if score > DUPLICATE_THRESHOLD:
                duplicates += 1
                results.append(
                    {"key_x": x[1]["key"], "key_y": y[1]["key"], "score": score}
                )

results_df = pd.DataFrame(results, columns=RESULT_COLUMNS)
# The "output" directory must already exist; to_csv does not create it.
results_df.to_csv("output/results.csv")

print("Compared %s records." % (comparisons,))
print("Found %s possible duplicates." % (duplicates,))