• 吴裕雄--天生自然 python数据分析:医疗费数据分析


    import numpy as np 
    import pandas as pd 
    import os
    import matplotlib.pyplot as pl
    import seaborn as sns
    import warnings
    # Suppress every warning category for the rest of the session.
    warnings.filterwarnings('ignore')
    # Raw string: the Windows path contains backslash sequences (\k, \M, \i)
    # that are not valid escapes and must reach pandas exactly as written.
    data = pd.read_csv(r'F:\kaggleDataSet\MedicalCostPersonal\insurance.csv')
    data.head()

    # Number of missing values in each column.
    data.isnull().sum()

    from sklearn.preprocessing import LabelEncoder
    # Replace the sex, smoker and region columns with integer codes.
    # A single encoder is re-fitted for each column in turn, so after the
    # loop it only holds the classes of the last column (region).
    le = LabelEncoder()
    for column in ('sex', 'smoker', 'region'):
        le.fit(data[column].drop_duplicates())
        data[column] = le.transform(data[column])
    # Correlation of every column with charges, sorted ascending.
    data.corr()['charges'].sort_values(ascending=True)

    # Heatmap of the pairwise correlations between the columns of data.
    f, ax = pl.subplots(figsize=(10, 8))
    corr = data.corr()
    # The mask is all False, so no cell is hidden. The builtin bool is used
    # as dtype because the np.bool alias was removed in NumPy 1.24.
    sns.heatmap(corr, mask=np.zeros_like(corr, dtype=bool),
                cmap=sns.diverging_palette(240, 10, as_cmap=True),
                square=True, ax=ax)

    from bokeh.io import output_notebook, show
    from bokeh.plotting import figure
    output_notebook()
    import scipy.special
    from bokeh.layouts import gridplot
    from bokeh.plotting import figure, show, output_file
    # Histogram of charges drawn as bokeh quads, one quad per np.histogram bin.
    p = figure(title="Distribution of charges", tools="save", background_fill_color="#E8DDCB")
    hist, edges = np.histogram(data.charges)
    p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:], fill_color="#036564", line_color="#033649")
    p.xaxis.axis_label = 'x'
    # NOTE(review): hist holds raw bin counts (density is not requested), so
    # the 'Pr(x)' label looks inaccurate - confirm the intended axis meaning.
    p.yaxis.axis_label = 'Pr(x)'
    # gridplot expects its children as a list of plots; wrapping the single
    # figure in a list keeps the call valid when children is a positional
    # list argument, and a list is also accepted where a bare figure was.
    show(gridplot([p], ncols=2, plot_width=400, plot_height=400, toolbar_location=None))

    # Two side-by-side distribution plots of charges on one figure:
    # rows with smoker == 1 on the left, rows with smoker == 0 on the right.
    f = pl.figure(figsize=(12, 5))
    panels = (
        (121, 1, 'c', 'Distribution of charges for smokers'),
        (122, 0, 'b', 'Distribution of charges for non-smokers'),
    )
    for position, smoker_code, colour, heading in panels:
        ax = f.add_subplot(position)
        sns.distplot(data.loc[data.smoker == smoker_code, 'charges'], color=colour, ax=ax)
        ax.set_title(heading)

    # Row counts per smoker code, with one bar colour per sex code.
    sns.catplot(
        data=data,
        x="smoker",
        hue='sex',
        kind="count",
        palette="pink",
    )

    # Violin plots of charges per sex code, split by smoker code.
    sns.catplot(
        data=data,
        x="sex",
        y="charges",
        hue="smoker",
        kind="violin",
        palette='magma',
    )

    # LabelEncoder numbers its classes in sorted order, so with the labels
    # 'female' and 'male' the encoded sex column is 0 for women and 1 for men.
    # NOTE(review): assumes the CSV uses exactly those two labels - confirm.
    pl.figure(figsize=(12,5))
    pl.title("Box plot for charges of women")
    sns.boxplot(y="smoker", x="charges", data=data[(data.sex == 0)], orient="h", palette='magma')

    pl.figure(figsize=(12,5))
    pl.title("Box plot for charges of men")
    sns.boxplot(y="smoker", x="charges", data=data[(data.sex == 1)], orient="h", palette='rainbow')

    # Distribution plot of the age column on its own figure.
    pl.figure(figsize=(12, 5))
    pl.title("Distribution of age")
    ax = sns.distplot(data.age, color='g')

    # Smoker counts per sex code, restricted to rows where age equals 18.
    eighteen_year_olds = data[data.age == 18]
    sns.catplot(data=eighteen_year_olds, x="smoker", hue='sex', kind="count", palette="rainbow")
    pl.title("The number of smokers and non-smokers (18 years old)")

    # Joint KDE plot of age against charges for rows with smoker == 0,
    # with the individual points overlaid as white '+' markers.
    g = sns.jointplot(x="age", y="charges", data=data[(data.smoker == 0)], kind="kde", color="m")
    g.plot_joint(pl.scatter, c="w", s=30, linewidth=1, marker="+")
    # Make the first collection drawn on the joint axes fully transparent.
    g.ax_joint.collections[0].set_alpha(0)
    g.set_axis_labels("$X$", "$Y$")
    # Title this grid through its own top marginal axes: the module-level
    # name ax still points at an axes from an earlier figure at this point.
    g.ax_marg_x.set_title('Distribution of charges and age for non-smokers')

    # Joint KDE plot of age against charges for rows with smoker == 1,
    # with the individual points overlaid as white '+' markers.
    g = sns.jointplot(x="age", y="charges", data=data[(data.smoker == 1)], kind="kde", color="c")
    g.plot_joint(pl.scatter, c="w", s=30, linewidth=1, marker="+")
    # Make the first collection drawn on the joint axes fully transparent.
    g.ax_joint.collections[0].set_alpha(0)
    g.set_axis_labels("$X$", "$Y$")
    # Title this grid through its own top marginal axes: the module-level
    # name ax still points at an axes from an earlier figure at this point.
    g.ax_marg_x.set_title('Distribution of charges and age for smokers')

    # One bokeh scatter of age against charges per smoker code:
    # pink markers for smoker == 0 first, then red markers for smoker == 1.
    for smoker_code, fill in ((0, "pink"), (1, "red")):
        subset = data[data.smoker == smoker_code]
        p = figure(plot_width=500, plot_height=450)
        p.circle(x=subset.age, y=subset.charges, size=7, line_color="navy", fill_color=fill, fill_alpha=0.9)
        show(p)

    # Regression of charges on age, one fitted line per smoker code.
    # height is the current name of the keyword seaborn used to call size.
    sns.lmplot(x="age", y="charges", hue="smoker", data=data, palette='inferno_r', height=7)
    # Title the axes lmplot just created (the current axes); the module-level
    # name ax still refers to an axes from an earlier figure.
    pl.title('Smokers and non-smokers')

    # Three distribution plots, each on its own figure: bmi itself, then
    # charges for rows with bmi >= 30 and charges for rows with bmi < 30.
    bmi_panels = (
        ("Distribution of bmi", data["bmi"], 'm'),
        ("Distribution of charges for patients with BMI greater than 30",
         data.loc[data.bmi >= 30, 'charges'], 'm'),
        ("Distribution of charges for patients with BMI less than 30",
         data.loc[data.bmi < 30, 'charges'], 'b'),
    )
    for heading, values, colour in bmi_panels:
        pl.figure(figsize=(12, 5))
        pl.title(heading)
        ax = sns.distplot(values, color=colour)

    # Joint KDE plot of bmi against charges over all rows, with the
    # individual points overlaid as white '+' markers.
    g = sns.jointplot(x="bmi", y="charges", data=data, kind="kde", color="r")
    g.plot_joint(pl.scatter, c="w", s=30, linewidth=1, marker="+")
    # Make the first collection drawn on the joint axes fully transparent.
    g.ax_joint.collections[0].set_alpha(0)
    g.set_axis_labels("$X$", "$Y$")
    # Title this grid through its own top marginal axes: the module-level
    # name ax still points at an axes from an earlier figure at this point.
    g.ax_marg_x.set_title('Distribution of bmi and charges')

    # Scatter of charges against bmi, coloured by smoker code.
    pl.figure(figsize=(10, 6))
    ax = sns.scatterplot(
        data=data,
        x='bmi',
        y='charges',
        hue='smoker',
        palette='magma',
    )
    ax.set_title('Scatter plot of charges and bmi')
    
    # Regression of charges on bmi, one fitted line per smoker code.
    # height is the current name of the keyword seaborn used to call size.
    sns.lmplot(x="bmi", y="charges", hue="smoker", data=data, palette='magma', height=8)

    # Row counts per value of the children column.
    sns.catplot(x="children", kind="count", palette="ch:.25", data=data, height=6)

    # Smoker counts per sex code, restricted to rows with children > 0.
    # height is the current name of the keyword seaborn used to call size.
    sns.catplot(x="smoker", kind="count", palette="rainbow", hue="sex",
                data=data[(data.children > 0)], height=6)
    # Title the axes catplot just created (the current axes); the module-level
    # name ax refers to the earlier bmi scatter plot and would be overwritten.
    pl.title('Smokers and non-smokers who have childrens')

    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.metrics import r2_score,mean_squared_error
    from sklearn.ensemble import RandomForestRegressor
    # Target is the charges column; features are every other column.
    y = data.charges
    x = data.drop('charges', axis=1)

    # Fixed random_state keeps the train/test split reproducible.
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)
    lr = LinearRegression()
    lr.fit(x_train, y_train)

    y_train_pred = lr.predict(x_train)
    y_test_pred = lr.predict(x_test)

    # Score of the fitted linear model on the held-out split.
    print(lr.score(x_test, y_test))

    # Same target, but region is dropped from the features before they are
    # expanded to all degree-2 polynomial terms.
    Y = data.charges
    X = data.drop(['charges', 'region'], axis=1)

    quad = PolynomialFeatures(degree=2)
    x_quad = quad.fit_transform(X)

    # Fixed random_state keeps the train/test split reproducible.
    X_train, X_test, Y_train, Y_test = train_test_split(x_quad, Y, random_state=0)

    plr = LinearRegression()
    plr.fit(X_train, Y_train)

    Y_train_pred = plr.predict(X_train)
    Y_test_pred = plr.predict(X_test)

    # Score of the polynomial model on the held-out split.
    print(plr.score(X_test, Y_test))

    # Random forest on the same split as the plain linear model.
    # The criterion argument is left at its default (mean squared error):
    # the literal 'mse' is rejected by newer scikit-learn releases, while the
    # default selects the same criterion under every release.
    forest = RandomForestRegressor(n_estimators=100, random_state=1, n_jobs=-1)
    forest.fit(x_train, y_train)
    forest_train_pred = forest.predict(x_train)
    forest_test_pred = forest.predict(x_test)

    # Report error and R2 on both splits so over-fitting is visible.
    print('MSE train data: %.3f, MSE test data: %.3f' % (
        mean_squared_error(y_train, forest_train_pred),
        mean_squared_error(y_test, forest_test_pred)))
    print('R2 train data: %.3f, R2 test data: %.3f' % (
        r2_score(y_train, forest_train_pred),
        r2_score(y_test, forest_test_pred)))

    # Prediction-minus-actual differences of the random forest, plotted
    # against the predicted value for the train and the test split.
    train_residuals = forest_train_pred - y_train
    test_residuals = forest_test_pred - y_test

    pl.figure(figsize=(10, 6))
    pl.scatter(forest_train_pred, train_residuals,
               c='black', marker='o', s=35, alpha=0.5, label='Train data')
    pl.scatter(forest_test_pred, test_residuals,
               c='c', marker='o', s=35, alpha=0.7, label='Test data')
    pl.xlabel('Predicted values')
    pl.ylabel('Tailings')
    pl.legend(loc='upper left')
    # Zero line: points on it are predicted exactly.
    pl.hlines(y=0, xmin=0, xmax=60000, lw=2, color='red')
    pl.show()

  • 相关阅读:
    触发器
    变量
    Python 3.6 抓取微博m站数据
    Linux cp/rm/mv 强制覆盖
    Oracle的CLOB大数据字段类型
    4、NameNode启动过程详解
    2、HDFS交互式Shell
    1、HDFS 架构、启动过程
    11、 Hadoop 2.x各个服务组件如何配置在那台服务器运行并测试
    10、Hadoop组件启动方式和SSH无密码登陆
  • 原文地址:https://www.cnblogs.com/tszr/p/11239352.html
Copyright © 2020-2023  润新知