-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathLinearRegression.py
More file actions
95 lines (62 loc) · 3.16 KB
/
LinearRegression.py
File metadata and controls
95 lines (62 loc) · 3.16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import pandas as pd
# Data exploration
# Load the raw table, then keep only the columns the rest of the script uses.
teams = pd.read_csv("teams.csv")
kept_columns = ["team", "country", "year", "athletes", "age", "prev_medals", "medals"]
teams = teams.loc[:, kept_columns]
print(teams)

# Correlation of each candidate feature with the medal count. Only these four
# columns are passed to .corr(); team and country (presumably text) are left
# out, and so is year.
numeric_columns = ["athletes", "age", "prev_medals", "medals"]
teams_numeric = teams.loc[:, numeric_columns]
medal_correlations = teams_numeric.corr().loc[:, "medals"]
print(medal_correlations)

# Plotting snippets kept for interactive (Jupyter) sessions; they are disabled
# when running as a script.
import seaborn as sns
# Regression line over the scatter; ci=None hides the confidence interval.
#sns.lmplot(x = "athletes", y = "medals", data = teams, fit_reg = True, ci = None)
#sns.lmplot(x="age", y="medals", data = teams, fit_reg = True, ci =None)
# Histogram of the column being predicted, to see how balanced it is.
#teams.plot.hist(y= "medals")
# Data cleaning
# Show every row that has at least one missing value, then drop those rows.
has_missing = teams.isnull().any(axis=1)
print(teams[has_missing])
teams = teams.dropna()
print(teams)

# Split by year: rows before 2012 train the model, 2012 onward are held out.
# .copy() makes each subset an independent frame.
season = teams["year"]
train = teams.loc[season < 2012].copy()
test = teams.loc[season >= 2012].copy()
# The original author notes this is about an 80/20 train/test split; the
# actual ratio depends on the contents of teams.csv.
print(train.shape, test.shape, sep="\n")
# Training the model
# The fit is scored with mean absolute error later in the script.
import sklearn
from sklearn.linear_model import LinearRegression

predictors = ["athletes", "prev_medals"]  # feature columns
target = "medals"  # column being predicted

reg = LinearRegression()
# Fit on the training rows. The label column is taken from `target` so the
# feature/label definitions above are the single source of truth.
# fit() returns the estimator, so this prints its repr.
print(reg.fit(train[predictors], train[target]))

# Raw model output: an unrounded numpy array that can contain negative values.
predictions = reg.predict(test[predictors])
test["predictions"] = predictions
#print(predictions)

# A team can only win a non-negative whole number of medals, so floor the
# predictions at zero and round to the nearest whole number.
test["predictions"] = test["predictions"].clip(lower=0).round()
# Evaluation
import numpy as np
from sklearn.metrics import mean_absolute_error

# Mean absolute error between actual and predicted medal counts on the test rows.
error = mean_absolute_error(test["medals"], test["predictions"])
print(error)  # the original author recorded 3.299 for this value
# Summary statistics of the medal column, for comparison with the error above
# (the original author notes the error is far below the standard deviation).
print(teams.describe().loc[:, "medals"])

# Inspect predictions for individual teams (the original author notes that IND
# enters fewer athletes).
for team_code in ("USA", "IND"):
    print(test.loc[test["team"] == team_code])

# Absolute difference between actual and predicted medals for every test row.
errors = test["medals"].sub(test["predictions"]).abs()
print(errors)

# Average that error per team, and each team's average actual medal count.
errors_by_team = errors.groupby(test["team"]).mean()
print(errors_by_team)
medals_by_team = test.groupby("team")["medals"].mean()
print(medals_by_team)

# Clean results
# Error relative to the team's average medal count. Teams averaging zero medals
# divide by zero, which yields NaN (0/0) or infinity (x/0).
error_ratio = errors_by_team / medals_by_team
print(error_ratio)
error_ratio = error_ratio.dropna()  # drop the NaN ratios; infinities remain
print(error_ratio)
error_ratio = error_ratio.loc[np.isfinite(error_ratio)]  # drop the infinite ratios
print(error_ratio)
# NOTE(review): nothing in this script displays or saves the figure, so the
# histogram is presumably only visible in an interactive session.
error_ratio.plot.hist()
error_ratio = error_ratio.sort_values()
print(error_ratio)
#Possible next steps:
#Add in more predictors
#Try different ML models (neural network/computer vision)
#Build a specific athlete-level model