# coding: utf-8
"""Titanic survival baseline (exported from a Jupyter notebook).

Reads the train/test CSV files, keeps the numeric columns plus ``Sex``,
``Embarked`` and ``Cabin``, encodes those three as integers, fills the
remaining missing values with the per-column training median, fits an
``XGBClassifier`` with default parameters and writes the predictions as a
``PassengerId``/``Survived`` CSV.
"""

# In[19]:

# 0.78468
# NOTE(review): presumably the score obtained by this submission -- confirm.

# In[20]:

import numpy as np
import pandas as pd
import warnings

# Installed before the scikit-learn / xgboost imports below, as in the
# original cell order, so that warnings raised while importing them are
# filtered too.
warnings.filterwarnings('ignore')

from sklearn import preprocessing
from xgboost import XGBClassifier

try:
    # ``sklearn.preprocessing.Imputer`` was removed from scikit-learn;
    # ``SimpleImputer`` is its replacement.
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    # Old scikit-learn releases that predate ``sklearn.impute``.
    from sklearn.preprocessing import Imputer

# In[21]:

# NOTE(review): machine-specific absolute paths -- adjust for your environment.
# Linux alternatives used by the author:
#   r'/home/adminn/桌面/train.csv', r'/home/adminn/桌面/test.csv',
#   r'/home/adminn/桌面/out.csv'
train_path = r'C:\Users\cbattle\Desktop\train.csv'
test_path = r'C:\Users\cbattle\Desktop\test.csv'
out_path = r'C:\Users\cbattle\Desktop\out.csv'

train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

print('train:', train.shape)
print('test:', test.shape)

# Handy while exploring:
# train.info()
# test.info()
# print(train.head())
# Attribute columns:
# print([col for col in train])
# print([col for col in test])

# Planned strategy per column:
#   PassengerId  drop
#   Pclass       one-hot (left as a plain number below; see the Pclass note)
#   Name         drop
#   Sex          0/1
#   Age          numeric
#   SibSp        numeric
#   Parch        numeric
#   Ticket       drop
#   Fare         numeric
#   Cabin        0/1
#   Embarked     fill blanks with 'S', then one-hot (label-encoded below)

# In[22]:

X = train.drop(['Survived', 'PassengerId', 'Name'], axis=1)
y = train['Survived']
Xtest = test.drop(['PassengerId', 'Name'], axis=1)
# print('X:', X.shape)
# print('y:', y.shape)
# print('Xtest:', Xtest.shape)

# In[23]:

# Keep every non-object column plus the three text columns that are encoded
# by hand further down; any other object-typed column is dropped here.
key = [
    col for col in X
    if X[col].dtype != 'object' or col in ('Sex', 'Embarked', 'Cabin')
]
# Explicit copies: the column assignments below must modify these frames
# themselves rather than a view/temporary of the frames they were cut from.
X = X[key].copy()
Xtest = Xtest[key].copy()
# print(key)


def showNullNum(a, b):
    """Print the per-column count of missing values of ``a``, then of ``b``.

    Both arguments only need an ``isnull()`` method whose result supports
    ``sum()``. The two reports are separated by a blank line and followed
    by a dashed separator line. Returns ``None``.
    """
    print(a.isnull().sum())
    print()
    print(b.isnull().sum())
    print('------------------------------------')


showNullNum(X, Xtest)

# Earlier alternatives, superseded by the median imputation further down:
# Xtest['Fare'] = Xtest['Fare'].fillna(Xtest['Fare'].median())
# X = X.dropna(axis=0)  # would also require dropping the same rows from y

# -----------------------------------------------------------------------------
# Pclass: ticket class, 1 = 1st, 2 = 2nd, 3 = 3rd.
# One-hot variant (disabled; Pclass is used as a plain number):
# for i in X['Pclass'].unique():
#     X['Pclass_' + str(i)] = (X['Pclass'] == i).astype(int)
#     Xtest['Pclass_' + str(i)] = (Xtest['Pclass'] == i).astype(int)
# X = X.drop(['Pclass'], axis=1)
# Xtest = Xtest.drop(['Pclass'], axis=1)

# -----------------------------------------------------------------------------
# Sex: 1 for 'female', 0 for anything else.
X['Sex'] = X['Sex'].apply(lambda i: 1 if i == 'female' else 0)
Xtest['Sex'] = Xtest['Sex'].apply(lambda i: 1 if i == 'female' else 0)

# -----------------------------------------------------------------------------
# Embarked, option 1 (active): label encoding.
# Missing ports are filled with 'S' first; the int cast after ``map`` would
# fail on a leftover NaN. ``fillna`` replaces the former chained assignment
# ``X['Embarked'][mask] = 'S'``, which is not guaranteed to write into ``X``.
# The same fill is applied to the test frame; it changes nothing when that
# column has no missing values.
embarked_codes = {'S': 0, 'C': 1, 'Q': 2}
X['Embarked'] = X['Embarked'].fillna('S').map(embarked_codes).astype(int)
Xtest['Embarked'] = Xtest['Embarked'].fillna('S').map(embarked_codes).astype(int)

# Same idea with sklearn.preprocessing.LabelEncoder (disabled):
# le = preprocessing.LabelEncoder()
# X['Embarked'] = le.fit_transform(X['Embarked'].fillna('S'))
# Xtest['Embarked'] = le.transform(Xtest['Embarked'])
# print(X.head())
# print(Xtest.head())

# Embarked, option 2 (disabled): one-hot encoding.
# for i in X['Embarked'].unique():
#     print(i, 'sum:', sum(X['Embarked'] == i))
# X['Embarked'] = X['Embarked'].fillna('S')  # 'S' noted as most frequent
# for i in X['Embarked'].unique():
#     X['Embarked_type_' + i] = (X['Embarked'] == i).astype(int)
#     Xtest['Embarked_type_' + i] = (Xtest['Embarked'] == i).astype(int)
# X = X.drop(['Embarked'], axis=1)
# Xtest = Xtest.drop(['Embarked'], axis=1)
# print(X.head(10))

# -----------------------------------------------------------------------------
# Cabin: 1 when a cabin string is present, 0 otherwise (missing values are
# not strings, so they become 0).
# print(X.head(5))
Xtest['Cabin'] = Xtest['Cabin'].apply(lambda i: 1 if isinstance(i, str) else 0)
X['Cabin'] = X['Cabin'].apply(lambda i: 1 if isinstance(i, str) else 0)
# print(X.head(5))

# -----------------------------------------------------------------------------
# Age and Fare: replace the remaining NaNs with the column median.
# The imputer is fitted on the training frame only and the same medians are
# reused for the test frame. Both X and Xtest are NumPy arrays from here on.
ip = Imputer(strategy='median')
X = ip.fit_transform(X)
Xtest = ip.transform(Xtest)

# Sanity check: both counts should be 0 after imputation.
print(np.isnan(X).sum(), np.isnan(Xtest).sum())

# In[24]:

xgb = XGBClassifier()
xgb.fit(X, y)
ans = xgb.predict(Xtest)

# Other estimators whose imports were left disabled:
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import ExtraTreesClassifier
# from sklearn.svm import LinearSVC

# In[25]:

out = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': ans})
out.to_csv(out_path, index=False)
print('ok')

# In[26]:

# Scratch cell: minimal LabelEncoder demonstration.
# Note that it rebinds ``ans``; the submission file has already been written
# at this point, so the output is unaffected.
le = preprocessing.LabelEncoder()
le.fit(['a', 'b', 'c'])
ans = le.transform(['a', 'a', 'c'])
print(ans)