-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathKLD.py
More file actions
66 lines (65 loc) · 2.6 KB
/
KLD.py
File metadata and controls
66 lines (65 loc) · 2.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
from nltk import FreqDist
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import entropy
from scipy.stats import ks_2samp
import json
base_dir = "POStags/"
flst = ["cd.POSrst", "imdb.POSrst", "sst.POSrst",
"elec.POSrst", "movie.POSrst", "video.POSrst"
]
CD={"pos":3574,"neg":4732,"obj":68352}
Video={"pos":1700,"neg":2303,"obj":29946}
Movie={"pos":3831,"neg":5312,"obj":77807}
Elec={"pos":2544,"neg":3309,"obj":51224}
IMDB={"pos":3588,"neg":4797,"obj":62233}
SST={"pos":452,"neg":456,"obj":5311}
sw_dic={"CD":CD,"Video":Video,"Movie":Movie,"Elec":Elec,"IMDB":IMDB,"SST":SST}
fd_dic = {}
tags_dic = {}
def loadFD():
for fname in flst:
with open(base_dir + fname) as f:
for line in f:
line = line.strip()
tag = line.split("\t")[0]
count = int(line.split("\t")[1])
if fname not in fd_dic:
fd = {}
fd[tag] = count
fd_dic[fname] = fd
else:
fd_dic[fname][tag] = count
if tag not in tags_dic:
tags_dic[tag] = 1
else:
tags_dic[tag] += 1
def sort_dic(odic):
return {k:odic[k]for k in sorted(odic.iterkeys())}
loadFD()
POS = [k for k in tags_dic.keys() if tags_dic[k] == 6 and len(k) > 0]
for fl in flst:
fd_dic[fl] = {k:v for k, v in fd_dic[fl].iteritems() if k in POS}
pos_array = np.zeros((len(flst), len(flst)))
# fd_dic=sorted(fd_dic.iteritems(), key=lambda (k,v): (v,k))
print ",".join(fd_dic.keys())
for dk, dv in fd_dic.iteritems():
dv = sort_dic(dv)
for ik, iv in fd_dic.iteritems():
# if "imdb" in dk or "sst" in dk:
# print dk,ik,entropy(dv.values(),iv.values())
iv = sort_dic(iv)
kl = entropy(dv.values(), iv.values())
pos_array[fd_dic.keys().index(dk)][fd_dic.keys().index(ik)] = kl # [0]
print dk, ",", json.dumps(pos_array[fd_dic.keys().index(dk)].tolist()).replace("],", "\n").replace("[", "").replace("]", "").replace(" ", "")
pos_sarray = np.zeros((6, 6))
print ",".join(sw_dic.keys())
for dk, dv in sw_dic.iteritems():
dv = sort_dic(dv)
for ik, iv in sw_dic.iteritems():
# if "imdb" in dk or "sst" in dk:
# print dk,ik,entropy(dv.values(),iv.values())
iv = sort_dic(iv)
kl = entropy(dv.values(), iv.values())
pos_sarray[sw_dic.keys().index(dk)][sw_dic.keys().index(ik)] = kl # [0]
print dk, ",", json.dumps(pos_sarray[sw_dic.keys().index(dk)].tolist()).replace("],", "\n").replace("[", "").replace("]", "").replace(" ", "")