-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathfonction_traitement.py
More file actions
148 lines (114 loc) · 5.21 KB
/
fonction_traitement.py
File metadata and controls
148 lines (114 loc) · 5.21 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
# -*- coding: utf-8 -*-
"""
Created on Tue Jun 16 11:45:39 2020
@author: Victor HENRIO
"""
import math
import pandas as pd
import actors_labelisation as act
import pandas as pd
import statistics
import API as api
from sklearn.preprocessing import LabelEncoder, LabelBinarizer, OrdinalEncoder, OneHotEncoder
def delete_raws_nan(movie_ratings):
    '''
    Drop every row whose ``imdb_ratings`` value is missing (NaN).

    Rows are selected positionally with a boolean mask, so a row that shares
    its index label with a NaN row is kept, and the input dataframe itself
    is left unmodified.

    :param dataframe movie_ratings: movie data with an ``imdb_ratings`` column
    :return dataframe movie_ratings: only the rows that have a rating
    :rtype: dataframe
    :raises ValueError: if a rating cannot be parsed as a number
    '''
    # Numeric strings are accepted, as float() accepted them before.
    ratings = pd.to_numeric(movie_ratings['imdb_ratings'])
    # A NumPy mask skips index alignment, which matters when labels repeat.
    has_rating = ratings.notna().to_numpy()
    return movie_ratings[has_rating]
def replace_metascore(movie_ratings):
    '''
    Fill each missing ``metascore`` with that same row's ``imdb_ratings`` * 10.

    The ``metascore`` column of the given dataframe is reassigned in place and
    the same dataframe is returned. Existing metascore values are untouched.

    :param dataframe movie_ratings: movie data with ``metascore`` and
        ``imdb_ratings`` columns
    :return dataframe movie_ratings: the same dataframe, metascore filled
    :rtype: dataframe
    :raises ValueError: if a metascore cannot be parsed as a number
    '''
    metascore = movie_ratings['metascore']
    # Work with NumPy arrays so the fill is strictly row-by-row (positional),
    # independent of the index labels, which may be strings or duplicated.
    missing = pd.to_numeric(metascore).isna().to_numpy()
    fallback = (movie_ratings['imdb_ratings'] * 10).to_numpy()
    movie_ratings['metascore'] = metascore.mask(missing, fallback)
    return movie_ratings
def clean_dataframe_scrapping(movie_ratings,genres1,genres2,genres3,stars1,stars2,stars3):
    '''
    Clean a scraped movie dataframe and run the labelisation step on it.

    Steps, in order:
        drop the year, rank and category columns
        use the movie column as index
        pass the frame through api.API_search_director (director lookup)
        convert runtime, budget and gross to numeric
        fill missing metascore values (replace_metascore)
        fill missing win / nom values with 0 (add_0_win_nom)
        apply act.imputation_previous_value, then drop rows still holding NaN
        apply act.labelisation

    Missing runtime / budget / gross values are not mean-imputed here, so
    rows lacking them are removed by the dropna step.

    :param dataframe movie_ratings: raw scraped movie data
    :param genres1, genres2, genres3, stars1, stars2, stars3: forwarded
        unchanged to act.labelisation
    :return dataframe movie_ratings: cleaned and labelised movie data
    :rtype: dataframe
    '''
    frame = movie_ratings.drop(columns=["year", "rank", "category"])
    frame = frame.set_index('movie')

    # Retrieve the director through the API.
    frame = api.API_search_director(frame)

    for numeric_column in ('runtime', 'budget', 'gross'):
        frame[numeric_column] = pd.to_numeric(frame[numeric_column])

    frame = replace_metascore(frame)
    frame = add_0_win_nom(frame)
    frame = act.imputation_previous_value(frame).dropna()
    return act.labelisation(frame, genres1, genres2, genres3, stars1, stars2, stars3)
def clean_dataframe(movie_ratings,genres1,genres2,genres3,stars1,stars2,stars3):
    '''
    Clean a stored movie dataframe and run the labelisation step on it.

    Steps, in order:
        drop the mv_page, year, "Unnamed: 0", rank and category columns
        use the movie column as index
        convert runtime, budget and gross to numeric and fill their missing
        values with the column mean
        fill missing metascore values (replace_metascore)
        fill missing win / nom values with 0 (add_0_win_nom)
        apply act.imputation_previous_value, then drop rows still holding NaN
        apply act.labelisation

    :param dataframe movie_ratings: raw movie data
    :param genres1, genres2, genres3, stars1, stars2, stars3: forwarded
        unchanged to act.labelisation
    :return dataframe movie_ratings: cleaned and labelised movie data
    :rtype: dataframe
    '''
    unused_columns = ["mv_page", "year", "Unnamed: 0", "rank", "category"]
    frame = movie_ratings.drop(columns=unused_columns).set_index('movie')

    for numeric_column in ('runtime', 'budget', 'gross'):
        converted = pd.to_numeric(frame[numeric_column])
        # Mean imputation keeps rows that only lack this numeric value.
        frame[numeric_column] = converted.fillna(converted.mean())

    frame = replace_metascore(frame)
    frame = add_0_win_nom(frame)
    frame = act.imputation_previous_value(frame).dropna()
    return act.labelisation(frame, genres1, genres2, genres3, stars1, stars2, stars3)
def add_0_win_nom(movie_ratings):
    '''
    Replace missing (NaN) values in the ``win`` and ``nom`` columns with 0.

    Each column is reassigned as a whole on the given dataframe rather than
    written cell by cell through chained indexing, which pandas does not
    guarantee to propagate to the dataframe.

    :param dataframe movie_ratings: movie data with ``win`` and ``nom`` columns
    :return dataframe movie_ratings: the same dataframe, with gaps set to 0
    :rtype: dataframe
    :raises ValueError: if a value cannot be parsed as a number
    '''
    for column in ('win', 'nom'):
        values = movie_ratings[column]
        # Positional NumPy mask: independent of the index labels.
        missing = pd.to_numeric(values).isna().to_numpy()
        movie_ratings[column] = values.mask(missing, 0)
    return movie_ratings