吴裕雄--天生自然 python数据分析：医疗费数据分析

import numpy as np 
import pandas as pd 
import os
import matplotlib.pyplot as pl
import seaborn as sns
import warnings
# Silence library warnings so they do not clutter the output.
warnings.filterwarnings('ignore')
# Load the insurance data set. A raw string keeps the Windows path
# backslashes literal instead of relying on unrecognised escape sequences.
data = pd.read_csv(r'F:\kaggleDataSet\MedicalCostPersonal\insurance.csv')

# Preview the first rows (only displayed when run as a notebook cell).
data.head()

# Number of missing values in each column.
data.isnull().sum()

from sklearn.preprocessing import LabelEncoder
# Replace the three text columns with integer codes, reusing one encoder.
# The encoder is fitted on the distinct labels of a column and then applied
# to the whole column; after this block it remains fitted on `region`.
# NOTE(review): LabelEncoder numbers labels in sorted order, so presumably
# female=0 / male=1 and no=0 / yes=1 -- confirm against the CSV values.
le = LabelEncoder()
# sex
data['sex'] = le.fit(data['sex'].drop_duplicates()).transform(data['sex'])
# smoker or not
data['smoker'] = le.fit(data['smoker'].drop_duplicates()).transform(data['smoker'])
# region
data['region'] = le.fit(data['region'].drop_duplicates()).transform(data['region'])

# Pairwise correlation matrix of all (now numeric) columns, computed once.
corr = data.corr()
# Correlation of each column with the charges, sorted ascending
# (only displayed when run as a notebook cell).
corr['charges'].sort_values()

# Heat map of the full correlation matrix.
f, ax = pl.subplots(figsize=(10, 8))
# The mask is all False, i.e. no cell is hidden. The builtin `bool` is used
# because the `np.bool` alias no longer exists in current NumPy releases.
sns.heatmap(corr, mask=np.zeros_like(corr, dtype=bool), cmap=sns.diverging_palette(240,10,as_cmap=True),square=True, ax=ax)

from bokeh.io import output_notebook, show
from bokeh.plotting import figure
# Send bokeh output to the notebook.
output_notebook()
import scipy.special
from bokeh.layouts import gridplot
from bokeh.plotting import figure, show, output_file
# Histogram of the charges: np.histogram supplies the bin counts and edges,
# and each bin is drawn as a quad from 0 up to its count.
p = figure(title="Distribution of charges",tools="save",background_fill_color="#E8DDCB")
hist, edges = np.histogram(data.charges)
p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],fill_color="#036564", line_color="#033649")
p.xaxis.axis_label = 'x'
p.yaxis.axis_label = 'Pr(x)'
# gridplot expects a list of plots; a bare figure is not iterable and fails
# once `children` is a regular positional parameter.
show(gridplot([p], ncols = 2, plot_width=400, plot_height=400, toolbar_location=None))

# Side-by-side distributions of charges, split by the encoded smoker flag.
f= pl.figure(figsize=(12,5))

# Left panel: rows with smoker == 1.
ax=f.add_subplot(121)
sns.distplot(data[(data.smoker == 1)]["charges"],color='c',ax=ax)
ax.set_title('Distribution of charges for smokers')

# Right panel: rows with smoker == 0.
ax=f.add_subplot(122)
sns.distplot(data[(data.smoker == 0)]['charges'],color='b',ax=ax)
ax.set_title('Distribution of charges for non-smokers')

# Number of rows per smoker flag, with one bar per sex code.
sns.catplot(x="smoker", kind="count",hue = 'sex', palette="pink", data=data)

# Violin plot of charges per sex code, split by smoker flag.
sns.catplot(x="sex", y="charges", hue="smoker",kind="violin", data=data, palette = 'magma')

# Horizontal box plots of charges per smoker flag, one figure per sex.
# LabelEncoder assigns codes in sorted label order, so 'female' -> 0 and
# 'male' -> 1; the filters below follow that mapping so that each title
# describes the rows actually plotted.
# NOTE(review): assumes the CSV labels are exactly 'female'/'male' -- confirm.
pl.figure(figsize=(12,5))
pl.title("Box plot for charges of women")
sns.boxplot(y="smoker", x="charges", data =  data[(data.sex == 0)] , orient="h", palette = 'magma')

pl.figure(figsize=(12,5))
pl.title("Box plot for charges of men")
sns.boxplot(y="smoker", x="charges", data =  data[(data.sex == 1)] , orient="h", palette = 'rainbow')

# Distribution of the age column.
pl.figure(figsize=(12,5))
pl.title("Distribution of age")
ax = sns.distplot(data["age"], color = 'g')

# Smoker counts per sex code, restricted to rows with age == 18.
sns.catplot(x="smoker", kind="count",hue = 'sex', palette="rainbow", data=data[(data.age == 18)])
pl.title("The number of smokers and non-smokers (18 years old)")

# Joint KDE of age and charges for rows with smoker == 0, with the raw
# points overlaid as white '+' markers.
g = sns.jointplot(x="age", y="charges", data = data[(data.smoker == 0)],kind="kde", color="m")
g.plot_joint(pl.scatter, c="w", s=30, linewidth=1, marker="+")
g.ax_joint.collections[0].set_alpha(0)
g.set_axis_labels("$X$", "$Y$")
# Title the joint plot's own figure; `ax` still refers to an earlier,
# unrelated figure, so setting the title there would mislabel that plot.
g.ax_joint.figure.suptitle('Distribution of charges and age for non-smokers', y=1.02)

# Same plot for rows with smoker == 1.
g = sns.jointplot(x="age", y="charges", data = data[(data.smoker == 1)],kind="kde", color="c")
g.plot_joint(pl.scatter, c="w", s=30, linewidth=1, marker="+")
g.ax_joint.collections[0].set_alpha(0)
g.set_axis_labels("$X$", "$Y$")
g.ax_joint.figure.suptitle('Distribution of charges and age for smokers', y=1.02)

# Non-smokers (smoker == 0): age on x, charges on y, pink markers.
p = figure(plot_width=500, plot_height=450)
p.circle(
    x=data.loc[data.smoker == 0, 'age'],
    y=data.loc[data.smoker == 0, 'charges'],
    size=7,
    line_color="navy",
    fill_color="pink",
    fill_alpha=0.9,
)

show(p)

# Smokers (smoker == 1): same plot with red markers.
p = figure(plot_width=500, plot_height=450)
p.circle(
    x=data.loc[data.smoker == 1, 'age'],
    y=data.loc[data.smoker == 1, 'charges'],
    size=7,
    line_color="navy",
    fill_color="red",
    fill_alpha=0.9,
)
show(p)

# Linear fit of charges against age, one regression line per smoker flag.
# `height` is the current name of the former `size` argument; the file
# already relies on catplot, which ships with the same seaborn release.
sns.lmplot(x="age", y="charges", hue="smoker", data=data, palette = 'inferno_r', height = 7)
# Title the figure lmplot just created rather than the stale `ax`, which
# belongs to an earlier plot.
pl.title('Smokers and non-smokers')

# Distribution of the bmi column.
pl.figure(figsize=(12,5))
pl.title("Distribution of bmi")
ax = sns.distplot(data["bmi"], color = 'm')

# Charges for rows with bmi >= 30.
pl.figure(figsize=(12,5))
pl.title("Distribution of charges for patients with BMI greater than 30")
ax = sns.distplot(data[(data.bmi >= 30)]['charges'], color = 'm')

# Charges for rows with bmi < 30.
pl.figure(figsize=(12,5))
pl.title("Distribution of charges for patients with BMI less than 30")
ax = sns.distplot(data[(data.bmi < 30)]['charges'], color = 'b')

# Joint KDE of bmi and charges over all rows, with the raw points overlaid
# as white '+' markers.
g = sns.jointplot(x="bmi", y="charges", data = data,kind="kde", color="r")
g.plot_joint(pl.scatter, c="w", s=30, linewidth=1, marker="+")
g.ax_joint.collections[0].set_alpha(0)
g.set_axis_labels("$X$", "$Y$")
# Title the joint plot's own figure; `ax` refers to an earlier figure, so
# setting the title there would overwrite that plot's title instead.
g.ax_joint.figure.suptitle('Distribution of bmi and charges', y=1.02)

# Scatter of charges against bmi, coloured by smoker flag.
pl.figure(figsize=(10,6))
ax = sns.scatterplot(x='bmi',y='charges',data=data,palette='magma',hue='smoker')
ax.set_title('Scatter plot of charges and bmi')

# Linear fit of charges against bmi per smoker flag. `height` is the
# current name of the former `size` argument.
sns.lmplot(x="bmi", y="charges", hue="smoker", data=data, palette = 'magma', height = 8)

# Number of rows for each value of the children column. `height` is the
# current name of the former `size` argument.
sns.catplot(x="children", kind="count", palette="ch:.25", data=data, height = 6)

# Smoker counts per sex code among rows with at least one child.
sns.catplot(x="smoker", kind="count", palette="rainbow",hue = "sex",
            data=data[(data.children > 0)], height = 6)
# Title the figure catplot just created rather than the stale `ax`, which
# belongs to an earlier plot.
pl.title('Smokers and non-smokers who have childrens')

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.ensemble import RandomForestRegressor

# Features: every column except the target; target: charges.
x = data.drop(['charges'], axis = 1)
y = data.charges

# Hold out a test split (fixed seed for reproducibility) and fit a plain
# linear regression on the training part.
x_train,x_test,y_train,y_test = train_test_split(x,y, random_state = 0)
lr = LinearRegression().fit(x_train,y_train)

y_train_pred = lr.predict(x_train)
y_test_pred = lr.predict(x_test)

# Score of the linear model on the held-out data.
print(lr.score(x_test,y_test))

# Second model: drop the region column as well and expand the remaining
# features to all degree-2 polynomial terms.
X = data.drop(['charges','region'], axis = 1)
Y = data.charges

quad = PolynomialFeatures (degree = 2)
x_quad = quad.fit_transform(X)

# Same seed as the linear model's split.
X_train,X_test,Y_train,Y_test = train_test_split(x_quad,Y, random_state = 0)

plr = LinearRegression().fit(X_train,Y_train)

Y_train_pred = plr.predict(X_train)
Y_test_pred = plr.predict(X_test)

# Score of the polynomial model on the held-out data.
print(plr.score(X_test,Y_test))

# Random forest on the same train/test split as the linear model.
# The criterion is left at its default (mean squared error): the explicit
# 'mse' spelling is rejected by current scikit-learn releases, while the
# default selects the same criterion under either naming.
forest = RandomForestRegressor(n_estimators = 100,random_state = 1,n_jobs = -1)
forest.fit(x_train,y_train)
forest_train_pred = forest.predict(x_train)
forest_test_pred = forest.predict(x_test)

# Report error and fit quality on both the training and the test split.
print('MSE train data: %.3f, MSE test data: %.3f' % (
mean_squared_error(y_train,forest_train_pred),
mean_squared_error(y_test,forest_test_pred)))
print('R2 train data: %.3f, R2 test data: %.3f' % (
r2_score(y_train,forest_train_pred),
r2_score(y_test,forest_test_pred)))

# Residual plot for the random forest: prediction on x, prediction minus
# true value on y, for the training and the test split.
pl.figure(figsize=(10,6))

pl.scatter(forest_train_pred,forest_train_pred - y_train,c = 'black', marker = 'o', s = 35, alpha = 0.5,label = 'Train data')
pl.scatter(forest_test_pred,forest_test_pred - y_test,c = 'c', marker = 'o', s = 35, alpha = 0.7,label = 'Test data')
pl.xlabel('Predicted values')
# The y values are prediction errors, i.e. residuals.
pl.ylabel('Residuals')
pl.legend(loc = 'upper left')
# Zero-error reference line across the plotted prediction range.
pl.hlines(y = 0, xmin = 0, xmax = 60000, lw = 2, color = 'red')
pl.show()

发表于 2019-07-24 17:10 吴裕雄阅读(605) 评论(0) 编辑收藏举报