# -*- coding: utf-8 -*-
"""
Created on Sat Feb 13 12:28:07 2016
@author: Bernard
"""
import pandas
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import seaborn
# Load the Gapminder extract for Ghana from the working directory.
data = pandas.read_csv('gapminder_ghana_updated.csv')

# Coerce both analysis columns to numeric; entries that cannot be parsed
# become NaN. Series.convert_objects(convert_numeric=True) was deprecated and
# later removed from pandas, so pandas.to_numeric(errors='coerce') is used.
data['incomeperperson'] = pandas.to_numeric(
    data['incomeperperson'], errors='coerce'
)
data['lifeexpectancy'] = pandas.to_numeric(
    data['lifeexpectancy'], errors='coerce'
)

# Listwise deletion of missing values: keep only rows where both variables
# are present. The explicit copy makes dataSub an independent frame, so
# columns added to it later never touch (or warn about) `data`.
dataSub = data[['incomeperperson', 'lifeexpectancy']].dropna().copy()
# Scatterplot of life expectancy against income per person, drawn with
# seaborn.regplot on the cleaned subset.
scat1 = seaborn.regplot(
    x="incomeperperson",
    y="lifeexpectancy",
    scatter=True,
    data=dataSub,
)
plt.xlabel('Income Per Person')
plt.ylabel('Life Expectancy')
# The title is one logical string; adjacent literals are concatenated so the
# source line stays short without breaking the literal.
plt.title(
    'Scatterplot for the Association Between Income '
    'Per Person and Life Expectancy of the People Of Ghana'
)
# Prints the repr of the object returned by regplot (not the figure itself).
print(scat1)
# Center the quantitative explanatory variable for regression analysis by
# subtracting its sample mean from every observation.
dataSub['incomeperperson_c'] = (
    dataSub['incomeperperson'] - dataSub['incomeperperson'].mean()
)

print("Describe the centered quantitative Explanatory variable")
ds0 = dataSub["incomeperperson_c"].describe()
print(ds0)

# Mean of the centered variable; it should be zero up to floating-point
# rounding. Grouping by the continuous variable itself (as was done before)
# yields one group per distinct value, not the statistic the label names.
print("Mean for centered quantitative explanatory "
      "variable: incomeperperson_c")
ds1 = dataSub['incomeperperson_c'].mean()
print(ds1)

# Standard deviation of the centered variable (centering does not change it).
print("Standard deviation for centered quantitative "
      "explanatory variable:incomeperperson_c")
sd1 = dataSub['incomeperperson_c'].std()
print(sd1)

# Mean of the original, uncentered variable, for comparison.
print("Mean for quantitative explanatory variable: "
      "incomeperperson")
ds2 = dataSub['incomeperperson'].mean()
print(ds2)

print("Checking values in incomeperperson_c")
print(dataSub["incomeperperson_c"])

# Value counts of the centered variable. sort=False leaves the counts
# unsorted by frequency; dropna=False also counts missing values.
print("Counts for incomeperperson_c")
inc_c_Count = dataSub["incomeperperson_c"].value_counts(
    sort=False, dropna=False
)
print(inc_c_Count)
# Fit an ordinary least squares model with life expectancy as the response
# and the centered income variable as the single explanatory variable, then
# print the fitted model's summary table.
print("OLS regression model for the association "
      "between Income Per Person and Life Expectancy of the People of Ghana")
reg1 = smf.ols(
    'lifeexpectancy ~ incomeperperson_c',
    data=dataSub,
).fit()
print(reg1.summary())