# Source: Research-Project-Supervisor-Predictor/Instructor Predictor.py (master branch), YunusStahlschmidt/Research-Project-Supervisor-Predictor on GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
from tkinter import *
import docclass
import urllib2
from bs4 import BeautifulSoup


class PI_Estimator(Frame):
    """Main frame of the PI estimator tool.

    Holds two URL entries (people page and research page), a "Fetch" button,
    a scrollable listbox of project titles and a label that shows the
    predicted PI for the selected project. All scraping, training and
    prediction work is delegated to a ``Predictor`` instance stored in
    ``self.my_predictor``.
    """

    def __init__(self, parent):  # Initializing the UI
        """Create the frame inside *parent*, build its widgets and create the predictor."""
        # Pass the parent on; without it the frame silently attaches to the
        # implicit default root and the argument is ignored.
        Frame.__init__(self, parent)
        self.my_predictor = Predictor()  # creating predictor object
        self.initUI()

    def initUI(self):  # Creates the UI
        """Build and lay out all widgets as children of this frame."""
        # Widget construction and the geometry call are kept on separate
        # statements: pack()/grid() return None, so chaining them would store
        # None in the attribute instead of the widget.
        self.label_title = Label(self, text="PI Estimator Tool for SEHIR CS Projects", font=("", "20", "bold"), fg="white", bg="teal")
        self.label_title.pack(fill=X)

        # URL inputs, pre-filled with the default pages.
        self.entry_url_people = Entry(self, width=100, justify=CENTER)
        self.entry_url_people.pack(pady=(20, 10))
        self.entry_url_people.insert(0, "http://cs.sehir.edu.tr/en/people/")
        self.entry_url_research = Entry(self, width=100, justify=CENTER)
        self.entry_url_research.pack(pady=(0, 10))
        self.entry_url_research.insert(0, "http://cs.sehir.edu.tr/en/research/")

        self.button_fetch = Button(self, text="Fetch", width=10, command=self.fetch_data)
        self.button_fetch.pack(padx=10, pady=10)

        # Two-column area: project list on the left, prediction on the right.
        self.frame_main = Frame(self)
        self.frame_main.pack()
        # NOTE: the attribute name keeps its original spelling so existing references still work.
        self.label_projcets = Label(self.frame_main, text="Projects", font=("", "10", "bold"))
        self.label_projcets.grid(row=0, column=0)
        self.frame_listbox = Frame(self.frame_main)
        self.frame_listbox.grid(row=1, column=0)
        self.scrollbar_proteins = Scrollbar(self.frame_listbox)
        self.scrollbar_proteins.pack(side=RIGHT, fill=Y)
        self.listbox = Listbox(self.frame_listbox, yscrollcommand=self.scrollbar_proteins.set, height=10, width=90)
        self.listbox.bind("<<ListboxSelect>>", self.on_select)
        self.listbox.pack(side=LEFT)
        self.scrollbar_proteins.configure(command=self.listbox.yview)

        self.label_prediction_title = Label(self.frame_main, text="Prediction", font=("", "10", "bold"))
        self.label_prediction_title.grid(row=0, column=1)
        self.label_prediction_placeholder = Label(self.frame_main, width=15, height=1, text="", font=("", "15", "bold"))
        self.label_prediction_placeholder.grid(row=1, column=1, padx=(20, 0))
        self.pack(fill=BOTH)

    def fetch_data(self):  # calls methods from predictor class to fetch the necessary data from the provided links and trains the classifier
        """Fetch publications and projects, train the classifier and list the project titles.

        The listbox is emptied first, so pressing "Fetch" repeatedly does not
        append the same titles again.
        """
        self.my_predictor.fetch_publications()  # gets the publications
        self.my_predictor.fetch_projects()  # gets the research projects
        self.my_predictor.train_classifier()  # creates and trains the naive bayes classifier
        self.listbox.delete(0, END)  # drop entries left over from a previous fetch
        for title in sorted(self.my_predictor.projects):  # alphabetical order
            self.listbox.insert(END, title)

    def on_select(self, event):  # calls the method from predictor class to make a PI prediction for selected project
        """Handle ``<<ListboxSelect>>`` by predicting the PI of the selected project."""
        # The event is also delivered when the selection is cleared (for
        # example when another widget takes over the selection); there is
        # nothing to predict in that case.
        if not self.listbox.curselection():
            return
        self.my_predictor.predict_PI()


class Predictor:  # Predictor class for collecting data, training and creating classifier and make prediction
    """Collects faculty and project data, trains a naive Bayes classifier and predicts PIs.

    The URLs to scrape and the widgets to update are read from the
    module-level ``app`` object (the ``PI_Estimator`` instance), so the
    methods that touch ``app`` only work once that global exists.
    """

    def __init__(self):
        self.classifier = None  # set by train_classifier()
        self.faculty_members = {}  # "First Last" name -> FacultyMember
        self.projects = {}  # project title -> ResearchProject

    def _fetch_soup(self, url):
        """Download *url* and return the page parsed with BeautifulSoup's 'html.parser'."""
        page = urllib2.urlopen(url)
        try:
            doc = page.read()
        finally:
            page.close()  # release the connection even when read() fails
        return BeautifulSoup(doc, 'html.parser')

    def _clean_publication(self, raw_text):
        """Return the text of one publication list item without its prefix and citation suffix."""
        # The first four characters of the stripped text are dropped
        # (presumably a numbering prefix - verify against the page markup).
        text = raw_text.strip()[4:].lstrip('\n')
        # The slice lengths below are longer than the matched markers, so they
        # also remove a few characters in front of them (presumably the
        # citation count and layout whitespace - TODO confirm on a live page).
        if text.endswith("[1\n  Citation]"):
            text = text[:-19]
        elif text.endswith("\n  \n  Citations]"):
            text = text[:-23].rstrip('\n')
        return text

    def fetch_members(self):  # collects the links to the members profile pages from the first link
        """Return the absolute profile-page URLs found on the people page.

        The page URL is taken from ``app.entry_url_people``.
        """
        soup = self._fetch_soup(app.entry_url_people.get())
        hrefs = []
        for member in soup.find_all(class_="member"):
            for tag in member.find_all('a'):
                hrefs.append(tag.get('href'))
        # Only every third anchor is used (presumably each member block
        # carries three links and the first one is the profile - verify).
        # Anchors without an href are skipped instead of raising TypeError.
        return ["http://cs.sehir.edu.tr" + href for href in hrefs[::3] if href]

    def fetch_publications(self):  # goes to each members profile page and collects all the necessary data
        """Scrape every member's profile page and store a FacultyMember per member.

        Members are keyed by the first and last word of the first ``h3`` on
        their page. Pages without a usable heading are skipped. A name that
        is already stored is left untouched (``setdefault``).
        """
        for member_url in self.fetch_members():
            soup = self._fetch_soup(member_url)
            headings = soup.find_all('h3')
            name_parts = headings[0].text.split() if headings else []
            if not name_parts:
                continue  # no name to key the member on
            name = name_parts[0] + " " + name_parts[-1]
            publications = []
            for pane in soup.find_all(class_="tab-pane active pubs"):
                for tag in pane.find_all("li"):
                    publications.append(self._clean_publication(tag.text))
            current_fac_member = FacultyMember(name, member_url, publications)  # creating a faculty member object for adding to database
            self.faculty_members.setdefault(name, current_fac_member)  # adding each member to the database

    def fetch_projects(self):  # goes to the projects page from the second link and gets all the wanted info
        """Scrape the research page and store a ResearchProject per usable entry.

        The page URL is taken from ``app.entry_url_research``. An entry is
        kept only when its PI is a known faculty member, so
        ``fetch_publications`` has to run first. Entries that lack a title,
        a PI link or a summary are skipped rather than aborting the fetch.
        """
        soup = self._fetch_soup(app.entry_url_research.get())
        for item in soup.find_all(class_="list-group-item"):
            titles = item.find_all("h4")
            paragraphs = item.find_all("p")
            summary_tag = item.find(class_="gap")
            # The PI name is read from the link inside the third paragraph.
            pi_link = paragraphs[2].find("a") if len(paragraphs) > 2 else None
            if not titles or pi_link is None or summary_tag is None:
                continue  # malformed or non-project list item
            PI = pi_link.text.strip()
            if PI not in self.faculty_members:
                continue  # no training data for this PI
            title = titles[0].text.strip()
            current_r_pro = ResearchProject(title, summary_tag.text, PI)  # creates a research project object for adding into database
            self.projects.setdefault(title, current_r_pro)  # adding each project to the database

    def train_classifier(self):  # creates and trains a naive bayes classifier object
        """Create a fresh ``docclass.naivebayes`` classifier and train it on all publications.

        Every publication string is used as one training item with the
        member's name as its category.
        """
        self.classifier = docclass.naivebayes(docclass.getwords)
        for member in self.faculty_members:  # training the naive bayes classifier object with the publications of each member
            for publication in self.faculty_members[member].publications:
                self.classifier.train(publication, member)

    def predict_PI(self):  # makes a PI prediction for the selected project
        """Classify the project selected in ``app.listbox`` and show the result.

        The project title plus its summary is classified. The prediction
        label shows the result on a green background when it equals the
        project's real PI and on a red background otherwise. Does nothing
        when no list entry is selected.
        """
        selected = app.listbox.curselection()  # gets the selection
        if not selected:
            return  # selection was cleared; indexing the listbox would fail
        selection = app.listbox.get(selected[0])
        project = self.projects[selection]
        data = selection + " " + project.summary
        prediction = self.classifier.classify(data)  # calls a method from classifier class from docclass.py to make a prediction
        # tkinter skips options whose value is None, which would leave the
        # previous text on the label; show an empty string instead
        # (whether classify() can return None depends on docclass - verify).
        shown_text = "" if prediction is None else prediction
        # configures the background of the label based on the correctness of the prediction
        colour = "green" if prediction == project.PI_name else "red"
        app.label_prediction_placeholder.configure(text=shown_text, bg=colour)


class FacultyMember:  # class for a faculty member
    """Plain record for one faculty member: name, profile page URL and publications."""

    def __init__(self, name, profile_url, publications):
        # The arguments are stored as given; `publications` is kept by reference, not copied.
        self.name, self.profile_url, self.publications = name, profile_url, publications


class ResearchProject:  # class for a research project
    """Plain record for one research project: title, summary text and the PI's name."""

    def __init__(self, title, summary, PI_name):
        # The arguments are stored as given, one attribute per parameter.
        self.title, self.summary, self.PI_name = title, summary, PI_name


# Start the GUI only when this file is run as a script, so importing the
# module does not open a window and block in mainloop().
if __name__ == "__main__":
    root = Tk()
    root.geometry("1050x500")
    root.title("PI Estimator Tool for SEHIR CS Projects")
    # `app` has to remain a module-level name: the Predictor methods read the
    # URL entries and update the listbox/label through this global.
    app = PI_Estimator(root)
    root.mainloop()