• kaggle Titanic


    # coding: utf-8
    
    # In[19]:
    
    # 0.78468
    
    
    # In[20]:
    
    
    import numpy as np
    import pandas as pd
    import warnings
    warnings.filterwarnings('ignore')
    from sklearn import preprocessing
    
    
    # In[21]:
    
    
    # Absolute Windows locations of the Kaggle Titanic CSV files. The literals are
    # raw strings, so the backslashes stay literal and the '\t' in '\train.csv' /
    # '\test.csv' is NOT interpreted as a TAB character.
    # The trailing comments keep the alternative Linux locations.
    train_path = r'C:\Users\cbattle\Desktop\train.csv' # r'/home/adminn/桌面/train.csv'
    test_path = r'C:\Users\cbattle\Desktop\test.csv' # r'/home/adminn/桌面/test.csv'
    out_path = r'C:\Users\cbattle\Desktop\out.csv' # r'/home/adminn/桌面/out.csv'

    # Read the training and test sets into DataFrames.
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)

    # Show (rows, columns) of each frame as a quick sanity check.
    print('train:',train.shape)
    print('test:',test.shape)
    # train.info()
    # test.info()
    # print(train.head())
    
    # Feature columns
    # print([col for col in train])
    # print([col for col in test])
    
    # Strategy (planned handling of each column)
    # ['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']
    #     drop          onehot   drop    0/1    num    num       num      drop     num      0/1    fill NaN with S, onehot
    
    
    
    # In[22]:
    
    
    # Separate the label from the features. The identifier and name columns are
    # removed from both feature frames; the label column only exists in `train`.
    y = train['Survived']
    X = train.drop(columns=['Survived','PassengerId','Name'])
    Xtest = test.drop(columns=['PassengerId','Name'])
    # print('X:',X.shape)
    # print('y:',y.shape)
    # print('Xtest:',Xtest.shape)
    
    
    # In[23]:
    
    
    # Columns kept as model features: every numeric column
    # (['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']) plus the three text columns
    # that are turned into numbers further down.
    # Numeric columns are detected with is_numeric_dtype instead of comparing the
    # dtype against 'object', so text columns are still excluded when pandas
    # stores them with a dedicated string dtype rather than object.
    encoded_later = ('Sex', 'Embarked', 'Cabin')
    key = [col for col in X
           if pd.api.types.is_numeric_dtype(X[col]) or col in encoded_later]
    # Apply the same column selection, in the same order, to both frames.
    X = X[key]
    Xtest = Xtest[key]
    # print(key)
    
    def showNullNum(a,b):
        """Print the per-column null counts of ``a``, a blank line, the per-column
        null counts of ``b``, and finally a dashed separator line.

        Both arguments only need to support ``.isnull().sum()``.
        """
        for position, frame in enumerate((a, b)):
            if position:
                # blank line between the two reports
                print()
            print(frame.isnull().sum())
        print('------------------------------------')
    
    # Report the missing-value counts of the train and test features.
    showNullNum(a=X, b=Xtest)
    
    # Xtest['Fare'][Xtest['Fare'].isnull()] = Xtest['Fare'].median() # replace nan with median
    # X = X.dropna(axis=0) # drop X and y in the same row
    
    #-------------------------------------------------------------------------------
    # Pclass    Ticket class
    # 1 = 1st, 2 = 2nd, 3 = 3rd  onehot
    # for i in X['Pclass'].unique():
    #     X['Pclass_'+str(i)] = (X['Pclass']==i).astype(int)
    #     Xtest['Pclass_'+str(i)] = (Xtest['Pclass']==i).astype(int)
    
    # X = X.drop(['Pclass'],axis=1)
    # Xtest = Xtest.drop(['Pclass'],axis=1)
    
    #-------------------------------------------------------------------------------
    # Sex
    def _female_flag(sex):
        """Return 1 when the value equals 'female', otherwise 0."""
        if sex == 'female':
            return 1
        return 0

    # Binary-encode Sex in both frames with the same rule.
    X['Sex'] = X['Sex'].apply(_female_flag)
    Xtest['Sex'] = Xtest['Sex'].apply(_female_flag)
    
    #-------------------------------------------------------------------------------
    # Embarked
    
    # 1 label encoding
    # Fill missing ports with 'S', then label-encode S/C/Q as 0/1/2.
    # The fill is a single .loc assignment rather than the chained form
    # frame['Embarked'][mask] = 'S': the chained form can write to a temporary
    # object, leaving the NaN in place so that .astype(int) fails afterwards.
    # The same fill is applied to the test frame so a missing value there does
    # not break the integer cast either.
    embarked_codes = {'S':0,'C':1,'Q':2}
    for frame in (X, Xtest):
        frame.loc[frame['Embarked'].isnull(), 'Embarked'] = 'S'
        frame['Embarked'] = frame['Embarked'].map(embarked_codes).astype(int)
    # or use sklearn.preprocessing.LabelEncoder
    
    
    
    
    # print(X.head())
    # print(Xtest.head())
    
    # X['Embarked'][X['Embarked'].isnull()] = 'S'
    # from sklearn import preprocessing
    # le = preprocessing.LabelEncoder()
    # X['Embarked'] = le.fit_transform(X['Embarked'])
    # Xtest['Embarked'] = le.transform(Xtest['Embarked'])
    
    # print(X.head())
    # print(Xtest.head())
    
    
    
    
    # 2 onehot
    # for i in X['Embarked'].unique():
    #     print(i, 'sum:', sum(X['Embarked']==i))
    
    # X['Embarked'][X['Embarked'].isnull()] = 'S' # most_frequent
    # for i in X['Embarked'].unique():
    #     X['Embarked_type_'+i] = (X['Embarked']==i).astype(int)
    #     Xtest['Embarked_type_'+i] = (Xtest['Embarked']==i).astype(int)
        
    # X = X.drop(['Embarked'],axis=1)
    # Xtest = Xtest.drop(['Embarked'],axis=1)
    # print(X.head(10))
    
    #-------------------------------------------------------------------------------
    # Cabin
    # has a cabin or not
    # print(X.head(5))
    def _has_cabin(value):
        """Return 1 when the cabin entry is a string, 0 otherwise (e.g. NaN)."""
        if isinstance(value, str):
            return 1
        return 0

    # Replace the cabin text by a has-a-cabin indicator in both frames.
    X['Cabin'] = X['Cabin'].apply(_has_cabin)
    Xtest['Cabin'] = Xtest['Cabin'].apply(_has_cabin)
    # print(X.head(5))
    
    
    #-------------------------------------------------------------------------------
    # age and fare
    # use median to replace nan 
    # sklearn.preprocessing.Imputer was removed from scikit-learn; SimpleImputer in
    # sklearn.impute is its replacement. Fall back to the old class only when the
    # new module is not available (old scikit-learn releases).
    try:
        from sklearn.impute import SimpleImputer as Imputer
    except ImportError:
        from sklearn.preprocessing import Imputer
    ip = Imputer(strategy='median')
    # Medians are learned from the training features only and reused for the test
    # features. Both results are NumPy arrays, no longer DataFrames.
    X = ip.fit_transform(X)
    Xtest = ip.transform(Xtest)
    # Remaining NaN counts in train and test after imputation.
    print(np.isnan(X).sum(),np.isnan(Xtest).sum())
    
    
    
    # In[24]:
    
    
    from xgboost import XGBClassifier
    # Fit a classifier with the library's default settings on the training
    # features and labels, then predict a label for every test row.
    xgb = XGBClassifier().fit(X, y)
    ans = xgb.predict(Xtest)
    
    # from sklearn.tree import DecisionTreeClassifier
    # from sklearn.ensemble import ExtraTreesClassifier
    # from sklearn.svm import LinearSVC
    
    
    
    # In[25]:
    
    
    # Pair each test PassengerId with its predicted label and write the result
    # to out_path as CSV without the index column.
    submission_columns = {'PassengerId':test['PassengerId'],'Survived':ans}
    out = pd.DataFrame(submission_columns)
    out.to_csv(out_path, index=False)
    print('ok')
    
    
    # In[26]:
    
    
    # Scratch demo of LabelEncoder: fit on three labels, then encode a sample list.
    # The encoded sample is stored in its own variable so that `ans`, which holds
    # the model predictions, is not overwritten; otherwise re-running the
    # submission cell after this one would write the demo codes instead.
    from sklearn import preprocessing
    le = preprocessing.LabelEncoder()
    le.fit(['a','b','c'])
    demo_codes = le.transform(['a','a','c'])
    print(demo_codes)
  • 相关阅读:
    「学习记录」《数值分析》第三章计算实习题(Python语言)
    Set原理
    字符串流stringReader
    Collection List接口
    io
    Dubbo 服务容错Hystrix
    Duboo 与springboot整合
    读取配置文件
    springboot 端口号
    springboot 多环境选择
  • 原文地址:https://www.cnblogs.com/cbattle/p/8919752.html
Copyright © 2020-2023  润新知