1+ """
2+ The base mixin used to implement the different scrapers.
3+
4+ TODO : Write a complete and working doc
5+ TODO : Write tests
6+ TODO : Let the code follow PEP 8 Coding style
7+ """
8+
9+ import requests
10+ import shortuuid as shuuid
11+ from bs4 import BeautifulSoup
12+ import time
13+
# Placeholder text for a job posting that comes without a description.
# NOTE(review): not referenced in the visible part of this file; presumably
# used by the concrete scrapers when filling 'job_description' -- confirm.
NO_DESCRIPTION = 'No description provided by the site'
15+
16+
class PyToMeBaseScrapperMixin:
    """
    Base class for the site-specific job scrapers.

    A concrete scraper inherits from this class and implements
    ``get_datablocks()`` and ``get_block_content()``; ``launch()`` then
    drives one complete scraping run and returns the collected jobs.

    default url : https://pythonjobs.github.io/
    """

    # URL scraped when the constructor is not given one.
    default_url = 'https://pythonjobs.github.io/'
    # Seconds to wait for the remote site; without a timeout ``requests``
    # may block forever on an unresponsive server. Subclasses may override.
    request_timeout = 30

    # Class-level fallbacks kept for backward compatibility only. Every
    # instance receives its own lists in ``__init__`` / ``launch`` so that
    # scrapers never share (and accumulate into) the same list objects.
    base_url = ''
    job_block = []
    job_data_set = []

    def __init__(self, base_url=None, *args, **kwargs):
        """
        Set up the scraper.

        base_url -- page to scrape; ``None`` (the default) selects
                    ``default_url``.
        Extra positional / keyword arguments are accepted and ignored.
        """
        self.base_url = self.default_url if base_url is None else base_url
        self.job_block = []
        self.job_data_set = []

    def get_datablocks(self, page_content):
        """
        Retrieve all the blocks from the targeted page which represent job
        info. The result should be of the form
        [
            'html content here',
            'html content here',
            ......
        ]  # to be parsed one by one with get_block_content()

        page_content -- the parsed page handed over by ``launch()``.

        Must be overridden; the base implementation always raises
        NotImplementedError.
        """
        raise NotImplementedError('This method has not being implemented, the current class should be use only by inheritance')

    def get_block_content(self):
        """
        Retrieve the content of one block of data. Data should be shaped
        as follows
        {
            'job_title': '',
            'job_description': '',
            'job_date': '',
            'job_location': '',
            'job_details_link': '',
            'job_compagny': '',
            'job_site': '',  # where the job has been scraped
            'job_hash': ''   # built with the shortuuid lib by combining
                             # the job_title + job_compagny + job_date str
        }

        Must be overridden; the base implementation always raises
        NotImplementedError.
        """
        raise NotImplementedError('This method has not being implemented, the current class should be use only by inheritance')

    def _log_result(self):
        """Logging hook; intentionally a no-op in the base class."""
        pass

    def _fetch_page(self):
        """Download ``base_url`` and return it parsed with 'html.parser'."""
        page_response = requests.get(self.base_url, timeout=self.request_timeout)
        return BeautifulSoup(page_response.content, 'html.parser')

    def launch(self):
        """
        Run one complete scraping pass and return the resulting dataset.

        The page is fetched again on every call. Per-run state is reset
        here directly: re-invoking ``__init__`` (as was done before) fails
        because of its required argument and would also throw away the URL
        the scraper was configured with.
        """
        self.job_block = []
        self.job_data_set = []

        page_content = self._fetch_page()
        self.get_datablocks(page_content)
        self.get_block_content()
        return self.get_job_dataset()

    def get_job_dataset(self):
        """
        Just return the current dataset of jobs, whether it has already
        been fetched or not.
        """
        return self.job_data_set
0 commit comments