-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
90 lines (76 loc) · 3.5 KB
/
main.py
File metadata and controls
90 lines (76 loc) · 3.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# This script is developed by https://github.com/CodeProcessor
import csv
import json
import re

import requests
from bs4 import BeautifulSoup
class ProfSearch:
    """Scrape author listing pages for professors whose interests match.

    Settings come from ``config.json`` in the current working directory. The
    keys read by this class are ``starting_url``, ``my_interests``,
    ``search_page_depth`` and ``save_filename``. Profile and next-page links
    are resolved against ``https://scholar.google.com``.
    """

    # Seconds to wait for a response, so a stalled connection cannot hang
    # the whole search.
    REQUEST_TIMEOUT = 30
    # Text shown and saved when a profile page yields no h-index.
    MISSING_H_INDEX = "N/A"

    def __init__(self):
        """Load the search settings from ``config.json``."""
        with open("config.json", "r", encoding="utf-8") as fp:
            self.config = json.load(fp)

    def _fetch_soup(self, url):
        """Download *url* and return its content parsed with ``lxml``.

        Raises:
            requests.RequestException: On connection problems, timeouts or
                an HTTP error status.
        """
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        # An error page would otherwise be parsed as if it were a result page.
        response.raise_for_status()
        return BeautifulSoup(response.content, 'lxml')

    def get_h_index(self, profile):
        """Return the h-index shown on a profile page.

        Args:
            profile: URL of the profile page to download.

        Returns:
            The text of the cell that follows the "h-index" label inside the
            table with id ``gsc_rsb_st``, or ``None`` when the table or that
            row is not present.

        Raises:
            requests.RequestException: If the page cannot be downloaded.
        """
        soup = self._fetch_soup(profile)
        table = soup.find('table', attrs={'id': 'gsc_rsb_st'})
        if table is None:
            return None
        for row in table.find_all('tr'):
            cells = [td.text for td in row.find_all('td')]
            # Require a second cell so a label-only row cannot raise IndexError.
            if len(cells) > 1 and cells[0] == "h-index":
                return cells[1]
        return None

    @staticmethod
    def _matching_interests(wanted, listed):
        """Return the entries of *wanted* whose lower-cased form is in *listed*."""
        return [interest for interest in wanted if interest.lower() in listed]

    @staticmethod
    def _next_url_from_onclick(onclick):
        """Build the absolute next-page URL from a button's ``onclick`` text.

        Only the text after the last "/" is used. Two-digit hexadecimal
        escapes in it (a backslash, "x" and two hex digits) are replaced by
        the character they stand for.
        """
        # The path is cut from a larger string, so a quote or semicolon that
        # closed that string would otherwise stay attached to the URL.
        tail = onclick.split("/")[-1].rstrip("'\";")
        decoded = re.sub(
            r"\\x([0-9a-fA-F]{2})",
            lambda match: chr(int(match.group(1), 16)),
            tail,
        )
        return f"https://scholar.google.com/{decoded}"

    def _next_page_url(self, soup):
        """Return the URL behind the "Next" button, or ``None`` if there is none."""
        button = soup.find('button', attrs={'aria-label': 'Next'})
        if button is None:
            return None
        onclick = button.attrs.get("onclick")
        if not onclick:
            return None
        return self._next_url_from_onclick(onclick)

    def _parse_prof(self, prof):
        """Extract name, lower-cased interests and profile URL from one card.

        Returns ``None`` when the card has no image or no link with an
        ``href``, because those are the elements the fields are read from.
        """
        image, link = prof.img, prof.a
        if image is None or link is None or not link.get('href'):
            return None
        interest_tags = prof.find_all('a', attrs={'class': 'gs_ai_one_int'})
        return {
            "name": image.get('alt', ''),
            # get_text() also works for tags with nested markup, where
            # ``.string`` would be None.
            "interests": [tag.get_text().strip().lower() for tag in interest_tags],
            "profile": f"https://scholar.google.com{link['href']}",
        }

    def _save_results(self, prof_list):
        """Print the collected professors and append them to the save file.

        Rows are appended to ``config["save_filename"]`` as CSV with the
        columns index, name, h-index, interests (joined by " | ") and
        profile URL.
        """
        print(f" {'PROF NAME':<25} | {'h-index':<10} |Interests")
        with open(self.config["save_filename"], 'a', encoding="utf-8", newline="") as fp:
            # csv.writer quotes fields that contain commas or quotes.
            writer = csv.writer(fp, lineterminator="\n")
            for i, p in enumerate(prof_list):
                # None cannot be formatted with a width spec, so substitute text.
                if p['h-index'] is None:
                    h_index = self.MISSING_H_INDEX
                else:
                    h_index = str(p['h-index'])
                print(
                    f"{i:2d} | {p['name']:<25} | {h_index:<10} | {','.join(p['interests']):<50} | {p['profile']}")
                writer.writerow(
                    [i, p['name'], h_index, ' | '.join(p['interests']), p['profile']]
                )

    def main(self):
        """Walk the listing pages, collect matching professors and save them.

        Starts at ``config["starting_url"]`` and follows the "Next" button
        from page to page. Every card whose interests include one of
        ``config["my_interests"]`` (compared in lower case) is recorded
        together with its h-index. The loop ends when there is no next page,
        when the page counter exceeds ``config["search_page_depth"]``, on
        Ctrl+C, or when a request fails. Whatever was collected up to that
        point is still printed and saved.
        """
        page_url = self.config["starting_url"]
        my_interests = self.config["my_interests"]
        prof_list = []
        depth = 0
        no_of_profs = 0
        try:
            while True:
                depth += 1
                soup = self._fetch_soup(page_url)
                for prof in soup.find_all('div', attrs={'class': 'gs_ai gs_scl gs_ai_chpr'}):
                    no_of_profs += 1
                    parsed = self._parse_prof(prof)
                    if parsed is None:
                        continue
                    matched = self._matching_interests(my_interests, parsed["interests"])
                    if not matched:
                        continue
                    prof_name = parsed["name"]
                    print(
                        f"prof {prof_name} Found with interest fields : {matched}")
                    h_index = self.get_h_index(parsed["profile"])
                    print(f"H-index: {h_index}")
                    prof_list.append(
                        {
                            "name": prof_name,
                            "interests": parsed["interests"],
                            "h-index": h_index,
                            "profile": parsed["profile"],
                        }
                    )
                page_url = self._next_page_url(soup)
                if page_url is None:
                    break
                print(f"Page {depth} total professors searched: {no_of_profs}")
                # NOTE(review): with ">" the loop fetches search_page_depth + 1
                # pages before stopping; confirm whether that is intended.
                if depth > self.config["search_page_depth"]:
                    break
        except KeyboardInterrupt:
            # Ctrl+C ends the search early; results so far are saved below.
            pass
        except requests.RequestException as err:
            # Keep what was collected instead of losing it to a network error.
            print(f"Search stopped because a request failed: {err}")
        self._save_results(prof_list)
# Script entry point: build the searcher (which loads config.json) and run it.
if __name__ == "__main__":
    prof_object = ProfSearch()
    ProfSearch.main(prof_object)