-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathfonction_scraping_total.py
More file actions
59 lines (36 loc) · 1.65 KB
/
Copy pathfonction_scraping_total.py
File metadata and controls
59 lines (36 loc) · 1.65 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 25 17:30:48 2020
@author: Victor HENRIO
"""
import fonction_scraping_accueil as scrap
import fonction_traitement as trait
import definition_tab as dftab
from bs4 import BeautifulSoup
from requests import get
def scraping_total(nb_years, nb_pages, timeout=30):
    """Scrape IMDb advanced-search pages and save the result as a CSV file.

    For every year produced by ``scrap.years_loop(nb_years)`` and every page
    offset produced by ``scrap.nb_page(nb_pages)``, the IMDb search page for
    titles released during that year (sorted by number of votes, descending)
    is downloaded and its movie containers are handed to
    ``scrap.extraction_data``. The accumulated attributes are converted with
    ``dftab.creation_dataframe``, cleaned with
    ``trait.clean_dataframe_scrapping`` and written to
    ``./Data_csv/Nouveau_scraping.csv``.

    Args:
        nb_years: Forwarded to ``scrap.years_loop``; presumably the number of
            years to scrape -- confirm against that helper.
        nb_pages: Forwarded to ``scrap.nb_page``; presumably the number of
            result pages per year -- confirm against that helper.
        timeout: Seconds ``requests`` waits on the server for each request
            before raising a timeout error. Without it a single stalled
            connection would block the whole scrape forever.

    Returns:
        The cleaned table returned by ``trait.clean_dataframe_scrapping``
        (the same object that is written to the CSV file).
    """
    # Accumulator that scrap.extraction_data fills page after page.
    mv_attributs = dftab.instanciation_tablist()

    # Parameters
    years_url = scrap.years_loop(nb_years)
    pages = scrap.nb_page(nb_pages)
    # Ask for English content so the scraped text is in a single language.
    headers = {"Accept-Language": "en-US, en;q=0.5"}

    # SCRAPING:
    # For every year in the interval
    for year_url in years_url:

        # For every page in the interval
        for page in pages:

            # Build the search URL: one calendar year, most-voted titles first,
            # starting at result offset `page`.
            url = (
                'https://www.imdb.com/search/title/?release_date={year!s}-01-01,'
                '{year!s}-12-31&sort=num_votes,desc&start={start!s}&ref_=adv_nxt'
            ).format(year=year_url, start=page)
            print(url)

            # Make the GET request; the timeout keeps a dead connection from
            # hanging the run.
            response = get(url, headers=headers, timeout=timeout)

            # An error page holds no movie containers; report it instead of
            # silently parsing it as if it were a result page.
            if not response.ok:
                print('Skipping {0}: HTTP status {1}'.format(url, response.status_code))
                continue

            # Parse the content of the request with BeautifulSoup
            page_html = BeautifulSoup(response.text, 'html.parser')

            # Select all the movie containers of this page (the original
            # author notes 50 per page).
            mv_containers = page_html.find_all('div', class_='lister-item mode-advanced')

            # Take the information from the containers
            mv_attributs = scrap.extraction_data(mv_containers, mv_attributs)

    movie_ratings = dftab.creation_dataframe(mv_attributs)
    print(movie_ratings.info())

    # NOTE(review): 4..9 are opaque positional arguments of the cleaning
    # helper, presumably column positions -- confirm in fonction_traitement.
    movie_ratings = trait.clean_dataframe_scrapping(movie_ratings, 4, 5, 6, 7, 8, 9)

    # The ./Data_csv directory is expected to exist already.
    movie_ratings.to_csv("./Data_csv/Nouveau_scraping.csv")

    return movie_ratings