# -*- coding: utf-8 -*-
"""
Created on Mon Oct 9 14:05:41 2017

@author: lenovo
"""
import numpy as np
import pandas as pd

# Load the data and merge the train and test sets so that feature processing
# is applied to both identically. The 'id' column tags where each row came
# from so the two sets can be separated again afterwards.
data_train = pd.read_csv('./input/train.csv')
data_train['id'] = 'train'
data_test = pd.read_csv('./input/test.csv')
data_test['id'] = 'test'
# ignore_index gives every row a unique label; without it the train and test
# labels overlap, which makes label-aligned operations further down fragile.
data = pd.concat((data_train, data_test), axis=0, ignore_index=True)

# Report the number of missing values in every column.
for column in data.columns:
    print(column, data[column].isnull().sum())

# Fill Fare: mean of the strictly positive fares per passenger class.
# Selecting 'Fare' before aggregating keeps the mean away from the string
# columns (Name, Ticket, ...), which cannot be averaged.
fare_mean = data[data['Fare'] > 0].groupby('Pclass')['Fare'].mean()
# Replace both missing fares and fares recorded as 0 with the class mean.
for pclass, class_fare in fare_mean.items():
    needs_fare = (data['Fare'].isnull() | (data['Fare'] == 0)) & (data['Pclass'] == pclass)
    data.loc[needs_fare, 'Fare'] = class_fare

# Fill missing Age values by predicting them with a random forest trained on
# the rows where Age is known. Column 0 is the target, the rest are features.
data_for_age = data[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
age_exist = data_for_age[data_for_age['Age'].notnull()]
age_null = data_for_age[data_for_age['Age'].isnull()]
y = age_exist.values[:, 0]
x = age_exist.values[:, 1:]
x_test = age_null.values[:, 1:]
from sklearn.ensemble import RandomForestRegressor
# Fixed seed so the imputed ages (and everything trained on them) are
# reproducible from run to run.
rf = RandomForestRegressor(n_estimators=200, max_depth=5, random_state=0)
rf.fit(x, y)
# Only predict when there is something to fill; predicting on an empty
# matrix would raise.
if len(x_test) > 0:
    y_pred = rf.predict(x_test)
    data.loc[data['Age'].isnull(), 'Age'] = y_pred

# Sex has no missing values; encode it directly as 0/1.
data['Sex'] = data['Sex'].map({'female': 0, 'male': 1})

# One-hot encode SibSp, Pclass and Parch. dtype=int keeps the indicator
# columns numeric whatever the installed pandas uses as its default.
SibSp = pd.get_dummies(data['SibSp'], prefix='SibSp', dtype=int)
Pclass = pd.get_dummies(data['Pclass'], prefix='Pclass', dtype=int)
Parch = pd.get_dummies(data['Parch'], prefix='Parch', dtype=int)

# Fill missing Embarked with 'S' (treated as the mode by the original author).
# Assigning the result back guarantees the fill lands in `data`.
data['Embarked'] = data['Embarked'].fillna('S')
Embarked = pd.get_dummies(data['Embarked'], prefix='Embarked', dtype=int)

# Treat a missing Cabin as its own category 'u0'. A single .loc assignment is
# required here: chained indexing would write to a temporary copy.
data.loc[data['Cabin'].isnull(), 'Cabin'] = 'u0'
# NOTE: the Cabin indicators are computed but not merged into data_all below.
Cabin = pd.get_dummies(data['Cabin'], prefix='Cabin', dtype=int)

# Merge everything: drop the raw columns that were encoded or are unused and
# append the indicator columns.
data = data.drop(['SibSp', 'Pclass', 'Parch', 'Embarked', 'Cabin', 'Name', 'Ticket', 'PassengerId'], axis=1)
data_all = pd.concat([data, SibSp, Pclass, Parch, Embarked], axis=1)

# Split back into train and test for modelling. Both results are independent
# frames (not views of data_all), so later column drops on them are safe.
data_train = data_all[data_all['id'] == 'train'].drop('id', axis=1)
data_test = data_all[data_all['id'] == 'test'].copy()
# Remove the (empty) label column and the origin tag from the test rows.
# Rebinding instead of dropping in place avoids mutating a filtered slice.
data_test = data_test.drop(['Survived', 'id'], axis=1)
# Feature matrix and labels for the labelled rows. The explicit float cast
# keeps the matrix numeric even if the indicator columns are boolean.
x = data_train.drop('Survived', axis=1).values.astype(float)
y = data_train.loc[:, 'Survived']

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

# Hold out 30% of the labelled rows for validation.
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.7, random_state=0)
# Fit the scaler on the training split only and reuse it everywhere else.
sc = StandardScaler()
x_train_std = sc.fit_transform(x_train)
x_test_std = sc.transform(x_test)

# Logistic regression
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(penalty='l2')
lr.fit(x_train_std, y_train)
y_pred_lr = lr.predict(x_test_std)
print('Logistic Regression:', metrics.accuracy_score(y_test, y_pred_lr))
# Score recorded by the original author: Logistic Regression: 0.809701492537

# Decision tree, max_depth chosen by 5-fold cross-validation
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
dt = DecisionTreeClassifier(random_state=0)  # seeded for reproducible splits
model_dt = GridSearchCV(dt, param_grid={'max_depth': range(1, 10)}, cv=5)
model_dt.fit(x_train_std, y_train)
y_pred_dt = model_dt.predict(x_test_std)
print('Decision Tree:', metrics.accuracy_score(y_test, y_pred_dt))
# Score recorded by the original author: Decision Tree: 0.813432835821

# Random forest
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(max_depth=4, n_estimators=200, random_state=0)
rf.fit(x_train_std, y_train)
y_pred_rf = rf.predict(x_test_std)
# Predictions for the unlabeled test rows. The rows are standardized here
# with the fitted scaler, because nothing defined earlier in the file holds
# the standardized test matrix at this point.
data_test_std = sc.transform(data_test.values.astype(float))
y_pred_rf1 = rf.predict(data_test_std)
print('RandomForest:', metrics.accuracy_score(y_test, y_pred_rf))
# Score recorded by the original author: RandomForest: 0.817164179104

# SVM with an RBF kernel, C and gamma chosen by 5-fold cross-validation
from sklearn.svm import SVC
svc = SVC(kernel='rbf', decision_function_shape='ovo')
model_svc = GridSearchCV(svc, param_grid={'C': np.arange(5, 10) / 10, 'gamma': range(10, 101, 10)}, cv=5)
model_svc.fit(x_train_std, y_train)
y_pred_svc = model_svc.predict(x_test_std)
print('SVM:', metrics.accuracy_score(y_test, y_pred_svc))
# Score recorded by the original author: SVM: 0.723880597015

# xgboost
import xgboost as xgb
xgb_train = xgb.DMatrix(x_train_std, label=y_train)
xgb_test = xgb.DMatrix(x_test_std, label=y_test)
# The learning task is selected with the 'objective' key; the previous key
# 'object' is not an xgboost parameter, so the logistic objective was never
# applied.
# NOTE(review): 'silent' is kept from the original; newer xgboost releases
# appear to use 'verbosity' instead - confirm against the installed version.
param = {'max_depth': 4, 'eta': 0.3, 'silent': 1, 'objective': 'binary:logistic'}
# Evaluation sets reported during boosting.
watchlist = [(xgb_train, 'train'), (xgb_test, 'test')]
def error_rate(y_hat, y):
    """Return the misclassification rate as an xgboost evaluation metric.

    y_hat: array of predicted scores; a score above 0.5 counts as class 1.
    y: object exposing get_label() (the DMatrix being evaluated).

    Returns the tuple ('error', rate), where rate is the fraction of rows
    whose thresholded prediction differs from the label.
    """
    mismatches = y.get_label() != (y_hat > 0.5)
    return 'error', float(np.mean(mismatches))


bst = xgb.train(param, xgb_train, evals=watchlist, num_boost_round=4, feval=error_rate)
y_pred_xgb = bst.predict(xgb_test)
print('xgb:', np.average(y_test == (y_pred_xgb > 0.5)))
# Score recorded by the original author: XGB: 0.832089552239

# Use the xgboost model to predict the unlabeled test rows. The scaler was
# fitted on a plain array, so it is given a plain float array here as well.
data_test_xgb = sc.transform(data_test.values.astype(float))
xgb_test = xgb.DMatrix(data_test_xgb)
# Threshold in a single step: every score becomes an integer 0 or 1, including
# a score of exactly 0.5 (class 0, consistent with the `> 0.5` rule above).
y_pred_xgb1 = (bst.predict(xgb_test) > 0.5).astype(int)

# Write the predictions to file.
# NOTE(review): 'predictions.csv' is presumably a submission template with one
# row per test passenger - confirm.
test = pd.read_csv('predictions.csv')
test['Survived'] = y_pred_xgb1
# index=False keeps the pandas row index out of the output file.
test.to_csv('xgb.csv', index=False)

test = pd.read_csv('predictions.csv')
# Cast to int so both output files use the same label format.
test['Survived'] = np.asarray(y_pred_rf1).astype(int)
test.to_csv('rf.csv', index=False)