• python 泰坦尼克存活问题分析


    先导入基本的模块

    #导入模块
    import numpy as np
    import pandas as pd
    from scipy import stats
    import seaborn as sns
    import matplotlib.pyplot as plt
    %matplotlib inline
    plt.rc("font",family="SimHei",size="12")  #解决中文无法显示的问题

    导入数据

    #导入CSV数据
    train=pd.read_csv('F://python//titanic_data.csv')

    字段含义:

    PassengerId:乘客编号

    Survived:乘客是否存活

    Pclass:乘客所在的船舱等级

    Name:乘客姓名

    Sex:乘客性别

    Age:乘客年龄

    SibSp:乘客的兄弟姐妹和配偶数量

    Parch:乘客的父母与子女数量

    Ticket:票的编号

    Fare:票价

    Cabin:座位号

    Embarked:乘客登船码头


    查查数据分布

    #使用info,describe 
    train.info()
    train.describe()

    查看缺失值

    #null
    train.isnull().sum()

    区别类别变量和连续变量

    #连续变量和类别变量分开
    num_features=train.select_dtypes(include=[np.number])
    categ_features=train.select_dtypes(include=[np.object])
    num_features=num_features.columns
    categ_features=categ_features.columns

    类别变量分析(['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'])主要分析性别和乘客登船码头

    #类别变量的每个类别频数可视化
    def count_plot(x,  **kwargs):
        sns.countplot(x=x)
        x=plt.xticks(rotation=90)
    
    f = pd.melt(train,  value_vars=['Sex','Embarked'])
    g = sns.FacetGrid(f, col="variable",  col_wrap=2, sharex=False, sharey=False, size=5)
    g = g.map(count_plot, "value")

    查看这二者和存活的关系如何

    #画图
    pd.crosstab(train["Sex"],train["Survived"]).plot(kind="bar")
    pd.crosstab(train['Embarked'],train["Survived"]).plot(kind="bar")

    处理连续变量(['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare'])

    #每个数字特征得分布可视化
    f = pd.melt(train, value_vars=num_features)
    g = sns.FacetGrid(f, col="variable",  col_wrap=2, sharex=False, sharey=False)
    g = g.map(sns.distplot, "value")

    存活画图

    #使用pandas的画图
    train["Survived"].value_counts().plot(kind="bar")

    乘客所在的船舱等级

    #乘客所在的船舱等级画图
    train['Pclass'].value_counts().plot(kind="bar")
    
    #和存活做二维表画图
    pd.crosstab(train['Pclass'],train["Survived"]).plot(kind="bar")

     年龄处理

    #画条形图
    train["Age"].hist()
    
    #按照五岁一个区间去分
    cut=[i*5 for i in range(1,20)]
    train.age=pd.cut(train.Age,cut)
    
    a=pd.crosstab(train.age,train.Survived)
    a.plot(kind="bar")
    a['rate']=a[1]/(a[1]+a[0])
    a['rate'].plot(kind="bar")

    效果并不明显,再分一次

    #重新分区间
    cut=[0,5,15,20,30,40,50,60,100]
    train.age=pd.cut(train.Age,cut)
    a=pd.crosstab(train.age,train.Survived)
    a.plot(kind="bar")
    a['rate']=a[1]/(a[1]+a[0])
    a['rate'].plot(kind="bar")

     

    SibSp:乘客的兄弟姐妹和配偶数量

    Parch:乘客的父母与子女数量

     pd.crosstab(train["SibSp"],train["Survived"]).plot(kind="bar")
    pd.crosstab(train["Parch"],train["Survived"]).plot(kind="bar")

     票价

    #使用箱型图查看二者分布如何
    fig,ay = plt.subplots()
    Fare1 = train.Fare[train.Survived == 1]
    Fare0 = train.Fare[train.Survived == 0]
    plt.boxplot((Fare1,Fare0),labels=('Survived','Not Survived'))
    ay.set_ylim([-10,150])
    ay.set_title("Boxplot of Fare")

    train["Fare"].hist()

    变量之间相关性分析

    #corr
    corr=train.corr()
    corr["Survived"].sort_values()

    特征构造

    #缺失值处理
    train['Age'].mean()  #29.69911764705882
    train['Age'].fillna(30,inplace=True)
    
    #删除
    train['Cabin'].value_counts()
    train=train.drop(['Cabin'],axis=1)
    #众数填充
    train['Embarked'].value_counts()
    train['Embarked'].fillna('S',inplace=True)
    
    
    #年龄分箱
    train.age=pd.cut(train.Age,[0,5,15,20,35,50,60,100])
    pd.crosstab(train.age,train.Survived).plot.bar()
    
    train.Parch[(train.Parch>0) & (train.Parch<=2)]=1
    train.Parch[train.Parch>2]=2
    train.SibSp[(train.SibSp>0) & (train.SibSp<=2)]=1
    train.SibSp[train.SibSp>2]=2
    
    
    
    dummy_Pclass = pd.get_dummies(train.Pclass, prefix='Pclass')
    dummy_Sex = pd.get_dummies(train.Sex, prefix='Sex')
    dummy_Embarked = pd.get_dummies(train.Embarked, prefix='Embarked')
    dummy_Parch = pd.get_dummies(train.Parch, prefix='Parch')
    dummy_SibSp = pd.get_dummies(train.SibSp, prefix='SibSp')
    dummy_Age = pd.get_dummies(train.age, prefix='Age')
    
    
    train_1=dummy_Pclass.join(dummy_Sex).join(dummy_Embarked).join(dummy_Parch).join(dummy_SibSp).join(dummy_Age)
    train_1['Fare']=train['Fare']

    模型构造

    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import confusion_matrix, roc_curve,roc_auc_score,classification_report 
    from sklearn.model_selection import train_test_split
    
    train_x=train_1
    train_y=train['Survived']
    x_train,x_test,y_train,y_test=train_test_split(train_x,train_y,test_size=0.3,random_state=0)
    
    
    #逻辑回归
    clf = LogisticRegression()
    clf.fit(x_train,y_train)
    
    #用测试集进行检验
    clf.predict(x_test)
    
    #混淆矩阵
    confusion_matrix(y_test,clf.predict(x_test))  
    #array([[144,  24],[ 28,  72]], dtype=int64)
    
    #roc
    roc_auc_score(y_test,clf.predict_proba(x_test)[:,1])   #0.8578869047619048
    
    #画图
    fpr,tpr,thresholds = roc_curve(y_test,clf.predict_proba(x_test)[:,1])
    plt.plot(fpr,tpr)
    
    
    #分类报告
    print(classification_report(y_test,clf.predict(x_test)))

     

  • 相关阅读:
    Python_Crawler_Foundation1-2_MYSQL_Regular Expression
    Linux_Best Practice_01_Ubuntu_set prox_set Repositories
    python_Note_Preview_01
    Python_Note_Day 10_Coroutine
    Python_Note_Day 9_threading.Thread_multiprocessing.Process_Queue
    Linux_学习_Day4_user/group/permission
    Linux_学习_Day3_bash
    Linux_学习_Day2~3
    Python_Note_Day 8_socket
    Python_Note_Day 7_Advanced Class
  • 原文地址:https://www.cnblogs.com/cgmcoding/p/13254897.html
Copyright © 2020-2023  润新知