StockPredictionWithSentiment/sentiment_analysis.py at master · ankitesh97/StockPredictionWithSentiment · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81


from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk import sent_tokenize, word_tokenize, pos_tag
import pandas as pd

# Module-level WordNet lemmatizer, created once and used by swn_polarity().
lemmatizer = WordNetLemmatizer()


def penn_to_wn(tag):
    """Map a Penn Treebank POS tag onto the corresponding WordNet POS constant.

    Tags are matched on their leading letter: ``J`` -> adjective,
    ``N`` -> noun, ``R`` -> adverb, ``V`` -> verb.

    Args:
        tag: The part-of-speech tag string to translate.

    Returns:
        ``wn.ADJ``, ``wn.NOUN``, ``wn.ADV`` or ``wn.VERB`` for a matching
        tag, otherwise ``None``.
    """
    prefix_to_attr = (
        ('J', 'ADJ'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
        ('V', 'VERB'),
    )
    for prefix, attr_name in prefix_to_attr:
        if tag.startswith(prefix):
            # Look the constant up on ``wn`` only once a prefix matches.
            return getattr(wn, attr_name)
    return None


def clean_text(text):
    """Normalise raw text before it is tokenised.

    Byte strings are decoded as UTF-8; the HTML line-break marker
    ``<br />`` is then replaced by a single space.

    Args:
        text: The raw text, either a UTF-8 encoded byte string or an
            already-decoded string.

    Returns:
        The decoded text with every ``<br />`` replaced by a space.

    Raises:
        UnicodeDecodeError: If ``text`` is a byte string that is not valid
            UTF-8.
    """
    # Decode first, and only when handed bytes: an already-decoded string has
    # no ``decode`` method on Python 3, and calling ``replace`` with str
    # arguments on undecoded bytes raises TypeError there.
    if isinstance(text, bytes):
        text = text.decode("utf-8")

    return text.replace("<br />", " ")


def swn_polarity(text):
    """Accumulate SentiWordNet positive and negative scores for ``text``.

    The text is cleaned with ``clean_text``, split into sentences, tokenised
    and POS-tagged. Only tokens tagged as nouns, adjectives or adverbs are
    scored. Each such token is lemmatised and the first WordNet synset found
    for the lemma is looked up in SentiWordNet; its score is
    ``pos_score() - neg_score()``.

    Args:
        text: The raw text to score.

    Returns:
        A ``(positive, negative)`` tuple: ``positive`` is the sum of all
        non-negative word scores and ``negative`` is the sum of all negative
        word scores (so it is zero or below).
    """
    positive_total = 0
    negative_total = 0

    for sentence in sent_tokenize(clean_text(text)):
        for token, penn_tag in pos_tag(word_tokenize(sentence)):
            wordnet_pos = penn_to_wn(penn_tag)
            # Verbs and untranslatable tags are not scored.
            if wordnet_pos not in (wn.NOUN, wn.ADJ, wn.ADV):
                continue

            base_form = lemmatizer.lemmatize(token, pos=wordnet_pos)
            if not base_form:
                continue

            candidates = wn.synsets(base_form, pos=wordnet_pos)
            if not candidates:
                continue

            # Only the first synset returned for the lemma is consulted.
            senti = swn.senti_synset(candidates[0].name())
            score = senti.pos_score() - senti.neg_score()

            if score < 0:
                negative_total += score
            else:
                positive_total += score

    return positive_total, negative_total

def readTweets(path='data/tweets.csv'):
    """Load the tweets CSV and return its rows as plain Python lists.

    Args:
        path: Location of the CSV file. Defaults to ``'data/tweets.csv'``.

    Returns:
        A list with one entry per CSV data row; each entry is a list of that
        row's column values in file order. The first line of the file is
        treated as the header and is not included.
    """
    # ``pd.from_csv`` is not a pandas function, and ``list(DataFrame)`` yields
    # column labels rather than rows, so read with ``read_csv`` and convert
    # the values explicitly.
    # NOTE(review): every CSV column is kept as data (no index column), so
    # that row[0], row[1], row[2] address the first three fields - confirm
    # this matches the layout of tweets.csv.
    return pd.read_csv(path).values.tolist()


def main():
    """Score every tweet and write the results to a CSV file.

    Reads the tweets via ``readTweets``, computes the positive and negative
    polarity of the third field of each record with ``swn_polarity``, and
    saves rows of ``[field 0, field 1, positive, negative]`` to
    ``data/generated_scores.csv``.
    """
    def build_row(record):
        # Polarity is computed from the third field of the record.
        positive, negative = swn_polarity(record[2])
        return [record[0], record[1], positive, negative]

    records = readTweets()
    scored_rows = [build_row(record) for record in records]
    pd.DataFrame(scored_rows).to_csv('data/generated_scores.csv')


# Run the scoring pipeline.
# NOTE(review): this call is unguarded, so it also executes whenever the
# module is imported - confirm that is intended.
main()