-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathuk_scraping.py
More file actions
229 lines (200 loc) · 7.54 KB
/
uk_scraping.py
File metadata and controls
229 lines (200 loc) · 7.54 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
import os
import time
import json
import datetime
import requests
from pathlib import Path
from typing import Dict, Optional, List, Tuple
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
from htmldate import find_date
from web_scraping.get_information import extract_article_info
GUARDIAN_API_KEY = os.environ.get("GUARDIAN_API_KEY")
# copied here from mex_scraping_functions.py for now to add in incognito options
# move to helper misc_helper_functions.py?
def selenium_driver_helper(main_url: str):
"""
Create driver object using chrome driver and selenium.
:param main_url: string with link to main website
:return driver: selenium driver object
"""
# path to the chromedriver executable
chromedriver = "C:/Users/robal/Downloads/chromedriver_win32chromedriver/chromedriver.exe"
os.environ["webdriver.chrome.driver"] = chromedriver
options = webdriver.ChromeOptions()
options.add_argument("--incognito")
driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
driver.get(main_url)
return driver
# move to helper misc_helper_functions.py?
def call_api_helper(params: Dict[str, str], url: str):
'''
Simple requests helper functon
input:
params: dict with parameters for api call
url: string of base url for requests
returns: json requests output
'''
r = requests.get(url, params)
return json.loads(r.content)
def times_scraper_helper(driver):
'''
BeautifulSoup helper function
input:
driver: webdriver object
returns: list of article links (or empty list if no articles are found)
'''
soup = BeautifulSoup(driver.page_source, 'html.parser')
rows = soup.find_all("a", class_=None)
if rows:
return ['https://www.thetimes.co.uk' + x.get("href") for x in rows]
else:
return []
def sun_scraper_helper(driver):
'''
BeautifulSoup helper function
input:
driver: webdriver object
returns: list of article links (or empty list if no articles are found)
'''
soup = BeautifulSoup(driver.page_source, 'html.parser')
rows = soup.find_all("a", class_="teaser-anchor teaser-anchor--search")
if rows:
return [x.get("href") for x in rows]
else:
return []
def click_selenium_helper(driver, xpath: str):
'''
Selenium helper function to click link
inputs:
driver: Selenium webdriver object
xpath: string, the xpath of the button to click
returns:
True if it successfully clicks the button, False otherwise
'''
try:
driver.find_elements_by_xpath(xpath)[0].click()
return True
except:
return False
def get_guardian_articles_by_term(search_term: str):
'''
Call guardian articles api for a specific search term
input:
search_term: string, the term to be searched in the api
returns: DataFrame of the search results
'''
url='https://content.guardianapis.com/search'
params = {'api-key': GUARDIAN_API_KEY, 'from-date':'2020-01-01',
'show-fields': 'body', 'page-size': 199, 'q': search_term,
'order-by': 'relevance'}
articles = []
res = call_api_helper(params, url)
articles += res['response']['results']
pages = res['response']['pages']
stopping = min(pages, 6)
for page in range(2, stopping+1):
params['page'] = page
r = call_api_helper(params, url)
if 'results' in r['response']:
results = r['response']['results']
else:
results = []
articles += results
df = pd.DataFrame(articles)
df['search_term'] = search_term
return df
def scrape_the_times(keyword: str):
'''
Scrape links of articles from The Times newspaper based on a keyword.
input:
keyword: string to be searched
returns: list of strings of article links
'''
all_links = []
url = 'https://www.thetimes.co.uk/search?source=nav-desktop&filter=past_year&q=' + keyword
driver = selenium_driver_helper(url)
# handle cookie consent
time.sleep(2)
driver.switch_to_frame('sp_message_iframe_479077')
driver.find_elements_by_xpath("/html/body/div/div[2]/div[3]/button[2]")[0].click()
driver.switch_to.parent_frame()
article_links = times_scraper_helper(driver)
all_links.extend(article_links)
time.sleep(5)
xpath_first_page = '/html/body/section/div/div[3]/ul/li/a'
xpath_other_pages = '/html/body/section/div/div[3]/ul/li[2]/a'
if click_selenium_helper(driver, xpath_first_page):
all_links.extend(times_scraper_helper(driver))
time.sleep(3)
page = 2
while click_selenium_helper(driver, xpath_other_pages) and page < 50:
all_links.extend(times_scraper_helper(driver))
page += 1
time.sleep(3)
return all_links
def scrape_the_sun(keyword: str):
'''
Scrape links of articles from The Times based on a keyword.
input:
keyword: string to be searched
returns: list of strings of article links
'''
all_links = []
url = 'https://www.thesun.co.uk/?s=' + keyword
driver = selenium_driver_helper(url)
article_links = sun_scraper_helper(driver)
all_links.extend(article_links)
time.sleep(2)
for page in range(2, 51):
url = 'https://www.thesun.co.uk/page/{}/?s='.format(page) + keyword
if click_selenium_helper(driver, '//a[@href="'+url+'"]'):
all_links.extend(sun_scraper_helper(driver))
time.sleep(2)
if page%3==0:
time.sleep(4)
return all_links
def get_article_info_df(links: List[str]):
'''
Takes a list of article urls and returns a dataframe containing article info for all links
inputs:
links: list of strings, the article links
returns: pandas DataFrame containing article info for each link
'''
list_of_dictionaries = []
for link in set(links):
list_of_dictionaries.append(extract_article_info(link))
return pd.DataFrame(list_of_dictionaries)
if __name__ == '__main__':
root = Path.cwd()
data_dir = root/"data"
data_dir.mkdir(exist_ok=True)
keywords = ["rape", "gang-rape", '"gender-based violence"', '"child abuse"', '"forced marriage"',
'"forced abortion"', '"sexual assault"', '"domestic violence"', '"sexual abuse"', '"woman murder"',
'"honor killing"', '"woman killed over honor"']
# get guardian articles
mode, header = ('w', True)
for keyword in keywords:
guardian_df = get_guardian_articles_by_term(keyword)
guardian_df.to_csv(data_dir/'guardian_scraped.csv', mode=mode, header=header, index=False)
mode, header = ('a', False)
# get the times articles
mode, header = ('w', True)
for keyword in keywords:
times_links = scrape_the_times(keyword)
times_df = get_article_info_df(times_links)
times_df['search_term'] = keyword
times_df.to_csv(data_dir/'times_scraped.csv', mode=mode, header=header, index=False)
mode, header = ('a', False)
# get the sun articles
mode, header = ('w', True)
for keyword in keywords:
sun_links = scrape_the_sun(keyword)
sun_df = get_article_info_df(sun_links)
sun_df['search_term'] = keyword
sun_df.to_csv(data_dir/'sun_scraped.csv', mode=mode, header=header, index=False)
mode, header = ('a', False)