• kaggle_Titanic


    # -*- coding: utf-8 -*-
    """
    Created on Mon Oct  9 14:05:41 2017
    
    @author: lenovo
    """
    
    import numpy as np
    import pandas as pd
    
    # Load the training and test sets and stack them so that feature
    # engineering is applied to both in one pass.  The 'id' column tags each
    # row's origin so the two sets can be separated again before modelling.
    data_train = pd.read_csv('./input/train.csv')
    data_train['id'] = 'train'
    data_test = pd.read_csv('./input/test.csv')
    data_test['id'] = 'test'
    # ignore_index=True gives the combined frame a unique 0..n-1 index.
    # Without it the row labels of both files are kept and therefore repeat,
    # which makes any label-aligned operation on `data` ambiguous.
    data = pd.concat((data_train,data_test),axis=0,ignore_index=True)
    # Report the number of missing values in every column.
    for column,missing in data.isnull().sum().items():
        print(column,missing)
    
    # Impute fares.  The reference value is the mean fare per passenger class,
    # computed from strictly positive fares only.  'Fare' is selected before
    # aggregating so that mean() is never applied to the text columns
    # (pandas >= 2.0 raises TypeError there instead of dropping them).
    fare_mean = data[data['Fare']>0].groupby('Pclass')['Fare'].mean()
    # Both missing fares and fares recorded as 0 are replaced by the mean of
    # their class.  The mask is computed once, before any value is filled in.
    fare_invalid = data['Fare'].isnull() | (data['Fare']==0)
    for pclass,class_mean in fare_mean.items():
        data.loc[fare_invalid & (data['Pclass']==pclass),'Fare'] = class_mean
    # Impute missing ages with a random-forest regressor: it is trained on the
    # rows whose age is known, using Fare, Parch, SibSp and Pclass as
    # predictors, and then predicts the age of the remaining rows.
    from sklearn.ensemble import RandomForestRegressor
    data_for_age = data[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
    has_age = data_for_age['Age'].notnull()
    known_rows = data_for_age[has_age].values
    unknown_rows = data_for_age[~has_age].values
    # Column 0 is the target (Age); the remaining columns are the predictors.
    y,x = known_rows[:,0],known_rows[:,1:]
    x_test = unknown_rows[:,1:]
    rf = RandomForestRegressor(n_estimators=200,max_depth=5).fit(x,y)
    y_pred = rf.predict(x_test)
    data.loc[~has_age,'Age'] = y_pred
    # Encode sex as 0/1 (the author notes this column has no missing values).
    data['Sex'] = data['Sex'].map({'female':0,'male':1})
    # One-hot encode SibSp, Pclass and Parch.
    SibSp = pd.get_dummies(data['SibSp'],prefix='SibSp')
    Pclass = pd.get_dummies(data['Pclass'],prefix='Pclass')
    Parch = pd.get_dummies(data['Parch'],prefix='Parch')
    # Fill missing ports of embarkation with 'S' (the mode, per the author),
    # then one-hot encode.  The filled column is assigned back to the frame:
    # fillna(inplace=True) on a selected column is not guaranteed to update
    # `data` itself.
    data['Embarked'] = data['Embarked'].fillna('S')
    Embarked = pd.get_dummies(data['Embarked'],prefix='Embarked')
    # Missing cabins get the placeholder 'u0'.  A single .loc assignment is
    # required: data[mask]['Cabin'] = ... writes to a temporary copy and
    # leaves `data` unchanged.
    data.loc[data['Cabin'].isnull(),'Cabin'] = 'u0'
    Cabin = pd.get_dummies(data['Cabin'],prefix='Cabin')
    # Assemble the final feature table: remove the raw columns that were
    # one-hot encoded or are not used as features, then append the indicator
    # columns side by side.
    unused_columns = ['SibSp','Pclass','Parch','Embarked','Cabin','Name','Ticket','PassengerId']
    data.drop(unused_columns,axis=1,inplace=True)
    encoded_frames = [SibSp,Pclass,Parch,Embarked]
    data_all = pd.concat([data]+encoded_frames,axis=1)
    
    # Modelling: split the combined table back into the labelled training rows
    # and the test rows using the 'id' tag.  drop() returns new frames here
    # instead of modifying the filtered selections in place, which avoids
    # pandas' SettingWithCopyWarning.
    data_train = data_all[data_all['id']=='train'].drop('id',axis=1)
    data_test = data_all[data_all['id']=='test'].drop(['Survived','id'],axis=1)
    # Feature matrix and target used to fit the classifiers.
    x = data_train.drop('Survived',axis=1).values
    y = data_train['Survived']
    
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
    from sklearn import metrics
    
    # Keep 70% of the labelled rows for training (fixed seed) and hold the
    # rest out for evaluation.  The scaler is fitted on the training part only
    # and then applied to both parts.
    sc = StandardScaler()
    x_train,x_test,y_train,y_test = train_test_split(
        x,y,train_size=0.7,random_state=0)
    x_train_std = sc.fit(x_train).transform(x_train)
    x_test_std = sc.transform(x_test)
    
    # Logistic regression with an L2 penalty, scored on the hold-out split.
    from sklearn.linear_model import LogisticRegression
    lr = LogisticRegression(penalty='l2').fit(x_train_std,y_train)
    y_pred_lr = lr.predict(x_test_std)
    lr_accuracy = metrics.accuracy_score(y_test,y_pred_lr)
    print('Logistic Regression:',lr_accuracy)
    # Output recorded by the author: Logistic Regression: 0.809701492537
    
    # Decision tree; max_depth is chosen from 1..9 by 5-fold grid search.
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.model_selection import GridSearchCV
    dt = DecisionTreeClassifier()
    depth_grid = {'max_depth':range(1,10)}
    model_dt = GridSearchCV(dt,param_grid=depth_grid,cv=5)
    model_dt.fit(x_train_std,y_train)
    y_pred_dt = model_dt.predict(x_test_std)
    dt_accuracy = metrics.accuracy_score(y_test,y_pred_dt)
    print('Decision Tree:',dt_accuracy)
    # Output recorded by the author: Decision Tree: 0.813432835821
    
    # Random forest, scored on the hold-out split.
    from sklearn.ensemble import RandomForestClassifier
    rf = RandomForestClassifier(max_depth=4,n_estimators=200)
    rf.fit(x_train_std,y_train)
    y_pred_rf = rf.predict(x_test_std)
    # Predictions for the test-set rows.  The scaled matrix is built here
    # because `data_test_xgb` is only created further down in the script, so
    # referencing it at this point raised NameError.
    data_test_std = sc.transform(data_test)
    y_pred_rf1 = rf.predict(data_test_std)
    print('RandomForest:',metrics.accuracy_score(y_test,y_pred_rf))
    # Output recorded by the author: RandomForest: 0.817164179104
    
    
    # SVM with an RBF kernel; C and gamma are chosen by 5-fold grid search.
    from sklearn.svm import SVC
    svc_grid = {'C':np.arange(5,10)/10,'gamma':range(10,101,10)}
    svc = SVC(kernel='rbf',decision_function_shape='ovo')
    model_svc = GridSearchCV(svc,param_grid=svc_grid,cv=5)
    model_svc.fit(x_train_std,y_train)
    y_pred_svc = model_svc.predict(x_test_std)

    svc_accuracy = metrics.accuracy_score(y_test,y_pred_svc)
    print('SVM:',svc_accuracy)
    # Output recorded by the author: SVM: 0.723880597015
    
    # XGBoost: wrap the standardised hold-out split in DMatrix objects.
    import xgboost as xgb
    xgb_train = xgb.DMatrix(x_train_std,label=y_train)
    xgb_test = xgb.DMatrix(x_test_std,label=y_test)
    # The learning task is selected with the 'objective' key.  The previous
    # key 'object' is not an XGBoost parameter, so the binary:logistic setting
    # was never applied.
    param = {'max_depth':4,'eta':0.3,'silent':1,'objective':'binary:logistic'}
    # Data sets whose metric is reported after every boosting round.
    watchlist = [(xgb_train,'train'),(xgb_test,'test')]
    def error_rate(y_hat,y):
        """Return ('error', share of predictions that disagree with the labels).

        y_hat holds the predicted scores, which are thresholded at 0.5.
        y must provide get_label() returning the true labels, as an
        xgboost DMatrix does.
        """
        predicted_positive = y_hat>0.5
        n_wrong = sum(y.get_label()!=predicted_positive)
        return 'error',float(n_wrong)/len(y_hat)
    # Train for four boosting rounds, reporting the custom error metric on
    # both watchlist sets, then score the hold-out split with a 0.5 threshold.
    bst = xgb.train(param,xgb_train,num_boost_round=4,evals=watchlist,feval=error_rate)
    y_pred_xgb = bst.predict(xgb_test)
    xgb_hits = y_test == (y_pred_xgb>0.5)
    print('xgb:',np.average(xgb_hits))
    # Output recorded by the author: XGB: 0.832089552239
    
    
    # Predict the test-set rows with the XGBoost model.
    data_test_xgb = sc.transform(data_test)
    xgb_test = xgb.DMatrix(data_test_xgb)
    # Threshold once, with the same rule used for evaluation (score > 0.5
    # means survived), and store integer labels.  The two separate in-place
    # comparisons left a score of exactly 0.5 unchanged and kept the labels
    # as floats.
    y_pred_xgb1 = (bst.predict(xgb_test)>0.5).astype(int)
    
    # Write one prediction file per model.  'predictions.csv' is read once and
    # used as the template for both outputs; each output gets its own copy so
    # the two models' labels never share a frame.
    submission_template = pd.read_csv('predictions.csv')
    for predictions,out_path in ((y_pred_xgb1,'xgb.csv'),(y_pred_rf1,'rf.csv')):
        test = submission_template.copy()
        # Labels are stored as integers rather than floats.
        test['Survived'] = np.asarray(predictions).astype(int)
        # index=False keeps pandas' row index out of the file, so it contains
        # only the template's columns plus 'Survived'.
        test.to_csv(out_path,index=False)
  • 相关阅读:
    day10 Java学习(开发工具.Eclipse)
    day9 Java学习(面向对象Package关键字)
    day8 Java学习(面向对象.多态&接口)
    day7 Java学习(面向对象.继承)
    day5 Java学习(面向对象)
    day4 Java学习(数组)
    day3 Java学习(循环结构)
    day2 Java学习(位运算符)
    2019-05-26 java学习日记
    2019-05-25 java学习日记
  • 原文地址:https://www.cnblogs.com/jiegege/p/7641838.html
Copyright © 2020-2023  润新知