forked from ArkinDharawat/JournalTopicModel
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathload_papers.py
More file actions
56 lines (44 loc) · 1.78 KB
/
load_papers.py
File metadata and controls
56 lines (44 loc) · 1.78 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import pandas as pd
import logging
import os
import mysql.connector
import yaml
from SQLQueries.SQLStrQuery import SQLStrQuery
from TopicModel.TopicExtractor import TopicModel
from TopicModel.TextProcessor import remove_non_ascii
# Module-level setup: logger, SQL string builder, topic model, database
# configuration, chunked CSV reader, and the MySQL connection/cursor that the
# rest of the script uses.
logger = logging.getLogger("load-journal-sql")

SQLStrObj = SQLStrQuery(10)  # TODO: Change to global var

# Both data paths are resolved relative to the current user's home directory.
home_dir = os.path.expanduser('~')
TopicModelobj = TopicModel(os.path.join(home_dir, "../project/data/"))

# Read the YAML config from the current working directory.
config_path = os.path.join(os.getcwd(), "config.yml")
with open(config_path, 'r') as config_file:
    config = yaml.safe_load(config_file)

# Read the CSV lazily, `chunksize` rows at a time; header=None means no row
# is interpreted as column names, so columns are addressed by position.
chunksize = 1000
articles_csv = os.path.join(home_dir, "../project/data/AllArticles.csv")
df_full = pd.read_csv(articles_csv, chunksize=chunksize, header=None)

# The parsed YAML mapping is passed straight through as connect() keywords.
cnx = mysql.connector.connect(**config)
logger.info("Connected to SQL")
cursor = cnx.cursor()
logger.info("Created Cursor")

insert_paper_query = SQLStrObj.insert_paper()
# Load loop: read article rows from the chunked CSV reader, derive topics for
# each one, and insert the paper and its topics through the open cursor.

# Upper bound on how many CSV chunks are loaded per run (each chunk holds up
# to `chunksize` rows). Raise it to load more of the file.
MAX_CHUNKS = 1

try:
    for chunk_number, chunk in enumerate(df_full):
        if chunk_number >= MAX_CHUNKS:
            break

        # Keep positional columns 1, 2, 10 and 13 and give them field names.
        df_chunked = chunk.iloc[:, [1, 2, 10, 13]]
        df_chunked.columns = ["title", "author", "abstract", "journal_id"]

        # The DataFrame index label is used as the paper id. It is named
        # `paper_id` so the builtin `id` is not shadowed.
        for paper_id, row in df_chunked.iterrows():
            author, journal_id = row.author, row.journal_id
            title = remove_non_ascii(row.title)
            abstract = remove_non_ascii(row.abstract)
            topics = TopicModelobj.get_topics(title=title, abstract=abstract)
            try:
                insert_topic_query = SQLStrObj.insert_topic(paper_id, topics)
                cursor.execute(insert_paper_query, (paper_id, author, journal_id, title, abstract))
                cursor.execute(insert_topic_query)
            except Exception as e:
                # Best effort: a row that fails is skipped and the load
                # continues. Logged at WARNING so skipped rows are visible
                # under the default logging configuration.
                logger.warning("Failed at %s with exception %s", paper_id, e)

        # Commit once per chunk. MySQL Connector/Python does not autocommit
        # by default, so without this the inserts are discarded when the
        # connection is closed.
        cnx.commit()
finally:
    # Always release the cursor and connection, even if the loop raised.
    try:
        cursor.close()
    finally:
        cnx.close()
        logger.info("Connection Closed")