counteract/nlp1.py at master · CounteractIO/counteract · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
# nlp test
import numpy
from sklearn import svm
from sklearn import datasets


# Returns, for a source file: the list of its sentences, a dictionary mapping each word to its number of
# occurrences, and a sorted list of the distinct words (which fixes the order of the feature-vector components).
def sentencify(sourcefile):
    """Split a text file into lower-cased sentences and count its words.

    The file is read line by line; newline characters are dropped and each
    line is lower-cased and split on ".". The first fragment of every line
    after the first is appended to the last sentence collected so far, so a
    sentence may continue across a line break.

    Args:
        sourcefile: path of the text file to read.

    Returns:
        A tuple ``(sentences, words, word_list)`` where ``sentences`` is the
        list of non-empty sentences, ``words`` maps each space-separated
        token to its number of occurrences, and ``word_list`` is the sorted
        list of distinct tokens.
    """
    sentences = []
    # The context manager guarantees the file is closed, even on error.
    with open(sourcefile, 'r') as source:
        for line in source:
            fragments = line.replace('\n', '').lower().split(".")
            if sentences:
                # NOTE(review): the newline is removed without inserting a
                # space, so the last word of one line and the first word of
                # the next are fused when a sentence spans a line break.
                sentences[-1] += fragments[0]
                sentences.extend(fragments[1:])
            else:
                sentences = fragments

    # Words are counted before empty sentences are discarded, so the empty
    # token '' (from empty sentences and from consecutive/leading spaces) is
    # part of the vocabulary.
    words = {}
    for sentence in sentences:
        for token in sentence.split(" "):
            words[token] = words.get(token, 0) + 1
    word_list = sorted(words)

    sentences = [sentence for sentence in sentences if sentence != '']
    return sentences, words, word_list


# simple helper method that counts the number of occurrences of element in list_
def count(list_, element):
    """Return how many items of ``list_`` are equal to ``element``.

    Args:
        list_: the sequence to search (it is not modified).
        element: the value to look for.

    Returns:
        The number of matching items as an int (0 when there are none).
    """
    # The built-in does a single linear pass; no copy of the list is needed.
    return list_.count(element)


# represents each sentence as a vector related to its relative frequency of words in the word_list
def vectorize(sentence, words, word_list):
    """Turn a sentence into a feature vector ordered like ``word_list``.

    For every word in ``word_list`` the component is the word's share of the
    sentence's space-separated tokens, divided by the word's count in
    ``words`` (words missing from ``words`` are divided by 1).

    Args:
        sentence: the sentence text; tokens are obtained with ``split(" ")``.
        words: dictionary mapping a word to its overall occurrence count.
        word_list: the words that define the vector components, in order.

    Returns:
        A list of floats with one entry per item of ``word_list``.
    """
    # Tokenize once and build a single frequency table instead of re-splitting
    # and re-counting the sentence for every vocabulary word.
    tokens = sentence.split(" ")
    total = len(tokens)  # always >= 1: split() returns at least one item
    occurrences = {}
    for token in tokens:
        occurrences[token] = occurrences.get(token, 0) + 1

    vector = []
    for word in word_list:
        share = float(occurrences.get(word, 0)) / total
        vector.append(share / words.get(word, 1))
    return vector


# Build the training vocabulary and one feature vector per training sentence.
sentences, words, word_list = sentencify('posts.txt')
sentence_d = {}
for sentence in sentences:
    sentence_d[sentence] = vectorize(sentence, words, word_list)
tags = {}  # maps a training sentence to its rating label

# NOTE(review): the messages below describe an interactive rating prompt, but
# the ratings are actually read from classify.txt; the text is kept unchanged.
print("A series of sentences will be presented to you.")

print(
    "On a scale from 0-100, indicate the 'positivity' of the statement, where 100 represents an extremely positive "
    "statement.")

print(
    "You will have to hit enter to submit each time. You will also have to hit enter an additional time when all "
    "sentences have been prompted.")

# Load the ratings. Assumes classify.txt holds one rating per line, in the same
# order as the training sentences -- TODO confirm against the data file.
# Each sentence is paired with its own line; assigning every sentence the last
# line of the file would leave the classifier with a single class to learn.
with open('classify.txt') as f:
    ratings = [line.strip() for line in f]

l_vectors, l_tags = [], []

# zip() stops at the shorter input, so only sentences that have a rating are
# used for training.
for sentence, rating in zip(sentences, ratings):
    tags[sentence] = rating
    l_vectors.append(sentence_d[sentence])
    l_tags.append(rating)

l_tags, l_vectors = numpy.asarray(l_tags), numpy.asarray(l_vectors)
print(str(len(l_vectors)) + " vectors in training")

# Fit a support vector classifier on the training vectors and their labels.
train = svm.SVC()
train.fit(l_vectors, l_tags)

# Extract the sentences of the test document; its own word statistics are
# discarded.
test_sent, _, _1 = sentencify('test.txt')

test_d = {}

# The dictionary and word ordering from the training data are reused so that
# test vectors have the same components as the training vectors.
for sentence in test_sent:
    test_d[sentence] = vectorize(sentence, words, word_list)

l, lname = [], []

for x in test_d:
    lname.append(x)
    l.append(test_d[x])
x = train.predict(numpy.asarray(l))  # predicted rating label per test sentence

# Print each test sentence next to its predicted rating.
print("PREDICTED POSITIVITY LEVELS\n for")
for name, predicted in zip(lname, x):
    print(name + " : " + str(predicted))