-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathGapMinderMulti.py
More file actions
53 lines (40 loc) · 1.67 KB
/
GapMinderMulti.py
File metadata and controls
53 lines (40 loc) · 1.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
##This code tests multiple regression with the GapMinder data set
##import modules
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
##Read data
data = pd.read_csv('C:\\Users\\Vadim Katsemba\\Documents\\gapminder.csv')
##Build the analysis data frame: one float column per response/explanatory
##variable. Cells holding a single space are turned into NaN before the cast
##to float so that they can be dropped below.
##np.nan is used rather than np.NaN: the upper-case alias was removed in NumPy 2.0.
dataf = pd.DataFrame({
    new_name: data[source_name].replace(' ', np.nan).astype(float)
    for new_name, source_name in (
        ('Income', 'incomeperperson'),
        ('EmployRate', 'employrate'),
        ('LifeExpect', 'lifeexpectancy'),
        ('ResidElectric', 'relectricperperson'),
        ('Urban', 'urbanrate'),
    )
})
##Keep only the rows where every variable is present, then summarize the data frame.
##The summary is printed explicitly; a bare describe() call shows nothing in a script.
dataf = dataf.dropna()
print(dataf.describe())
##Center the means of the explanatory variables (Income, the response, is left uncentered)
COLS = ['EmployRate','LifeExpect','ResidElectric','Urban']
##Subtract each column's own mean from it; the Series of means is aligned to
##the data frame columns by label, so all four columns are centered at once.
dataf[COLS] = dataf[COLS] - dataf[COLS].mean()
##Print the summary so the centered means are visible when run as a script
print(dataf.describe())
##Set up the multiple regression of Income on the four explanatory variables
ols_model = smf.ols('Income ~ EmployRate + LifeExpect + ResidElectric + Urban', data=dataf)
##Fit the model and report the results
multlm = ols_model.fit()
print(multlm.summary())
##Generate the QQ plot of the model residuals
qq_plot = sm.qqplot(multlm.resid, line='r')
##Generate the standardized residual plot.
##A fresh figure is opened first: sm.qqplot leaves its own figure current, so
##plotting straight away would draw the residuals on top of the QQ plot and
##overwrite its axis labels.
plt.figure()
stdres = pd.DataFrame(multlm.resid_pearson)
plt.plot(stdres, 'o', ls='None')
##Reference line at zero
plt.axhline(y=0, color='r')
plt.ylabel('Standardized Residual')
plt.xlabel('Observation Number')
##Generate the regression plots, one figure per explanatory variable
for c in COLS:
    regress = sm.graphics.plot_regress_exog(multlm, c)
##Generate the influence plot
influence = sm.graphics.influence_plot(multlm, size=2)
##Display every figure created above. plt.show() is used instead of
##Figure.show() because the latter does not run a GUI event loop, so the
##windows may close as soon as the script finishes.
plt.show()