-
Notifications
You must be signed in to change notification settings - Fork 2.5k
Expand file tree
/
Copy pathmultiprocess.py
More file actions
38 lines (33 loc) · 1.21 KB
/
Copy pathmultiprocess.py
File metadata and controls
38 lines (33 loc) · 1.21 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import random
import pandas as pd
import numpy as np
from multiprocessing import Process, Queue
import os
import time
import Thread
# Matches internal article links (/wiki/...) and rejects any path containing
# a ':' (for example /wiki/File:... or /wiki/Category:...).
_ARTICLE_HREF = re.compile('^(/wiki/)((?!:).)*$')


def getLinks(bsObj, queue):
    """Return the article links on a page that have not been visited yet.

    Args:
        bsObj: Parsed page; must support ``find``/``find_all`` the way a
            BeautifulSoup document does.
        queue: Queue whose single item is the shared list of visited
            article paths. The list is taken out, read, and put straight
            back, so the queue is left as it was found.

    Returns:
        A list of the ``<a>`` tags inside ``div#bodyContent`` whose ``href``
        is an article path not present in the visited list. Empty when the
        page has no ``bodyContent`` div.
    """
    print('Getting links in {}'.format(os.getpid()))
    body = bsObj.find('div', {'id': 'bodyContent'})
    if body is None:
        return []
    links = body.find_all('a', href=_ARTICLE_HREF)

    # Take the visited list exactly once and return it immediately; calling
    # queue.get() per link would drain the queue and block every process.
    visited = queue.get()
    queue.put(visited)

    # The visited list stores paths, so compare each tag's href, not the tag.
    seen = set(visited)
    return [link for link in links if link.attrs['href'] not in seen]


def scrape_article(path, queue):
    """Random-walk Wikipedia starting at ``path``, recording visited pages.

    Each step adds the current path to the shared visited list, downloads
    and parses the article, then jumps to a randomly chosen unvisited
    article link. The walk ends when a page offers no unvisited links.

    Args:
        path: Article path such as ``/wiki/Kevin_Bacon``.
        queue: Queue whose single item is the list of visited paths shared
            by all worker processes. Holding the list (between ``get`` and
            ``put``) gives this process exclusive access to it.
    """
    # Iterate rather than recurse so a long walk cannot hit the recursion
    # limit.
    while True:
        visited = queue.get()
        try:
            visited.append(path)
            print("Process {} list is now: {}".format(os.getpid(), visited))
        finally:
            # Always hand the list back, otherwise the other workers would
            # block forever on queue.get().
            queue.put(visited)

        # Read and parse inside the context manager so the HTTP response is
        # closed before the politeness delay below.
        with urlopen('http://en.wikipedia.org{}'.format(path)) as html:
            bsObj = BeautifulSoup(html, 'html.parser')
        time.sleep(5)

        title = bsObj.find('h1').get_text()
        print('Scraping {} in process {}'.format(title, os.getpid()))

        links = getLinks(bsObj, queue)
        if not links:
            return
        path = random.choice(links).attrs['href']
        print(path)


# Guarded so that platforms which start children by re-importing this module
# (spawn) do not launch further processes from inside each child.
if __name__ == '__main__':
    processes = []
    queue = Queue()
    # Seed the queue with the (empty) visited list; the workers expect to
    # find exactly one list in it.
    queue.put([])
    processes.append(Process(target=scrape_article, args=('/wiki/Kevin_Bacon', queue,)))
    processes.append(Process(target=scrape_article, args=('/wiki/Monty_Python', queue,)))
    for p in processes:
        p.start()
    for p in processes:
        p.join()