nlp.py
import re
import pandas as pd
import plotly.graph_objects as go
from ckip_transformers.nlp import CkipWordSegmenter, CkipPosTagger, CkipNerChunker
from plotly.subplots import make_subplots
from sklearn.feature_extraction.text import TfidfVectorizer


def get_news(company_id, dir_):
    """Load a company's news CSV, drop duplicate links, and sort by date (newest first)."""
    df = pd.read_csv(dir_ + company_id + '_News.csv')
    df = df.drop_duplicates(subset=['link'])
    df = df.sort_values(by=['date'], ascending=False)
    # Strip the outlet names out of the headlines so they don't pollute later term counts.
    source = df['source'].tolist()
    df['title'] = df['title'].replace(source, "", regex=True)
    df = df.reset_index(drop=True)
    return df
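
# Usage sketch for get_news. The company ID and directory below are hypothetical;
# the CSV is expected to provide 'link', 'date', 'source', and 'title' columns,
# as the function's logic assumes.
#   df_news = get_news('2330', './data/')
#   print(df_news[['date', 'title']].head())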


class Tokenizer:
    """Wraps the CKIP transformers pipeline: word segmentation, POS tagging, and NER."""

    def __init__(self, level=3):
        # `level` selects the model size in older ckip-transformers releases
        # (3 = the largest); note that newer releases select models via a
        # `model=` argument instead, so check the installed version.
        self.ws_driver = CkipWordSegmenter(level=level)
        self.pos_driver = CkipPosTagger(level=level)
        self.ner_driver = CkipNerChunker(level=level)
        self.stopwords = self.get_stopwords()
    @staticmethod
    def get_stopwords():
        """Read one stopword per line from ./stopwords.txt and append a few news-source names."""
        stopwords = []
        with open('./stopwords.txt', 'r', encoding='UTF-8') as file:
            for line in file:
                stopwords.append(line.strip())
        # Also filter out the outlet names themselves (Apple Daily / Apple News).
        stopwords.append('蘋果日報')
        stopwords.append('蘋果新聞網')
        return stopwords
    @staticmethod
    def to_list(content):
        """Split a ', '-separated token string back into a list; return None for non-strings (e.g. NaN)."""
        if isinstance(content, str):
            return content.split(', ')
        return None
    def clean(self, content, remove_digit=True):
        """Drop tokens that contain digits (optional), are shorter than two characters, or are stopwords."""
        clean_content = []
        for word in content:
            if remove_digit and re.search(r'\d+', word):
                continue
            if (len(word.strip()) < 2) or (word in self.stopwords):
                continue
            clean_content.append(word)
        return clean_content
    def tokenize(self, content):
        """Segment a comma-separated text into words and rejoin them as a ', '-separated string."""
        if isinstance(content, str):
            sentence_list = content.split(",")
            word_sentence_list = self.ws_driver(sentence_list)
            # Flatten the per-sentence word lists into a single list before joining.
            return ", ".join(sum(word_sentence_list, []))
        return None
    def tokenize_ner(self, content):
        """Run NER over a comma-separated text; return unique (tag, word) pairs, or None for non-strings."""
        if isinstance(content, str):
            sentence_list = content.split(",")
            entity_sentence_list = self.ner_driver(sentence_list)
            # Collect entities from every sentence into a set to deduplicate them.
            entity_sentence_set = set()
            for entity in entity_sentence_list:
                entity_sentence_set.update(entity)
            entity_list = []
            for entity in entity_sentence_set:
                entity_list.append((entity.ner, entity.word))
            return entity_list
        return None
    @staticmethod
    def get_word_from_ner_dict(ner_dict):
        """From the (tag, word) pairs produced by tokenize_ner, keep only organization and person names."""
        if ner_dict:
            return [word for tag, word in ner_dict if tag in ('ORG', 'PERSON')]
        return []
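
# Usage sketch for Tokenizer (assumes ./stopwords.txt exists; models are
# downloaded on first use). The 'content' column is an assumption; substitute
# whichever text column your scraper produced.
#   tokenizer = Tokenizer(level=3)
#   df['token'] = df['content'].apply(tokenizer.tokenize)
#   df['ner'] = df['content'].apply(tokenizer.tokenize_ner)
#   df['names'] = df['ner'].apply(Tokenizer.get_word_from_ner_dict)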


def get_tfidf(document, dataframe, max_features=100, max_df=0.5, norm='l1'):
    """Compute per-document TF and TF-IDF matrices plus a corpus-level summary table."""
    # Pure term frequencies: with use_idf=False, TfidfVectorizer acts as a normalised counter.
    tf_model = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b", max_features=max_features,
                               max_df=max_df, smooth_idf=False, use_idf=False, norm=norm)
    tf = tf_model.fit_transform(document)
    # get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
    df_tf = pd.DataFrame(tf.toarray(), columns=tf_model.get_feature_names_out(), index=dataframe['date'])
    df_sum_tf = pd.DataFrame(df_tf.sum(), columns=['TF'])
    # Same vocabulary settings, now with IDF weighting enabled.
    tfidf_model = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b", max_features=max_features,
                                  max_df=max_df, smooth_idf=False, use_idf=True, norm=norm)
    tfidf = tfidf_model.fit_transform(document)
    df_tfidf = pd.DataFrame(tfidf.toarray(), columns=tfidf_model.get_feature_names_out(), index=dataframe['date'])
    df_sum_tfidf = pd.DataFrame(df_tfidf.sum(), columns=['TF-IDF'])
    df_sum_tfidf = pd.concat([df_sum_tf, df_sum_tfidf], axis=1)
    # Back out an approximate per-term IDF as the ratio of the two column sums.
    df_sum_tfidf['IDF'] = df_sum_tfidf['TF-IDF'] / df_sum_tfidf['TF']
    df_sum_tfidf = df_sum_tfidf[['TF', 'IDF', 'TF-IDF']]
    df_sum_tfidf = df_sum_tfidf.sort_values(by='TF-IDF', ascending=False)
    return df_tf, df_tfidf, df_sum_tfidf
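
# Usage sketch for get_tfidf. `df` and its 'token' column are assumptions that
# follow the Tokenizer pipeline above; the dataframe must carry a 'date' column.
#   df_tf, df_tfidf, df_sum = get_tfidf(df['token'].fillna(''), df)
#   print(df_sum.head(10))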


def plot_freq(df_sum_tfidf, max_features=20):
    """Plot the top terms as bars: TF and TF-IDF in the top panel, IDF in the narrow panel below."""
    df = df_sum_tfidf[:max_features]
    fig = make_subplots(rows=2, cols=1,
                        shared_xaxes=True,
                        row_heights=[0.8, 0.2],
                        vertical_spacing=0.1)
    fig.add_trace(go.Bar(x=df.index, y=df['TF'], name='TF'), row=1, col=1)
    fig.add_trace(go.Bar(x=df.index, y=df['IDF'], name='IDF'), row=2, col=1)
    fig.add_trace(go.Bar(x=df.index, y=df['TF-IDF'], name='TF-IDF'), row=1, col=1)
    # Clip each y-axis to just below/above the data range so small differences stay visible.
    fig.update_layout(yaxis1=dict(range=[df['TF'].min() * 0.9, df['TF'].max() * 1.05]))
    fig.update_layout(yaxis2=dict(range=[df['IDF'].min() * 0.9, df['IDF'].max() * 1.05]))
    fig.update_layout(legend=dict(orientation="h",
                                  yanchor="bottom",
                                  y=1.02,
                                  xanchor="right",
                                  x=1),
                      margin=dict(l=20, r=50, t=50, b=50),
                      height=450, showlegend=False, hovermode='x unified')
    fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')
    return fig
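

# Minimal end-to-end sketch, guarded so it only runs when this file is executed
# directly. The company ID and data directory are hypothetical; adjust them to
# your own layout. Here the segmented titles stand in for full article text.
if __name__ == '__main__':
    news = get_news('2330', './data/')  # hypothetical ID and path
    tokenizer = Tokenizer(level=3)
    news['token'] = news['title'].apply(tokenizer.tokenize)
    _, _, summary = get_tfidf(news['token'].fillna(''), news)
    plot_freq(summary).show()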