• 泰坦尼克生存率预测-基于随机森林实现


    泰坦尼克是一个经典的机器学习数据集。本文先对数据做预处理并构造新特征,再用 SelectKBest 对特征进行筛选,挑出与生存结果相关性较强的特征,最后用随机森林等模型进行预测。

    import pandas as pd
    pd.set_option('display.max_columns', None)   # show every column when printing
    # pd.set_option('display.max_rows', None)  # show every row when printing
    titanic = pd.read_csv('titanic_train.csv')
    # print(titanic.describe())

    # describe() shows that Age has missing values. Age is used as a feature
    # below, so the gaps are filled with the column MEDIAN (not the mean).
    titanic['Age'] = titanic['Age'].fillna(titanic['Age'].median())
    print(titanic.describe())

    # String-valued columns have to be converted to numeric codes.
    print(titanic['Sex'].unique())

    # Encode Sex as 0/1. map() produces a numeric column, whereas masked
    # .loc assignment would leave an object column holding ints.
    # NOTE: any label other than 'male'/'female' becomes NaN.
    titanic['Sex'] = titanic['Sex'].map({'male': 0, 'female': 1})

    # Fill missing embarkation ports with 'S' (presumably the most frequent
    # port -- confirm against the data), then encode the three ports as 0/1/2.
    print(titanic['Embarked'].unique())
    titanic['Embarked'] = titanic['Embarked'].fillna('S')

    titanic['Embarked'] = titanic['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})
    
    # Build machine-learning models to make the prediction.
    # Simplest baseline: linear regression (kept below as commented-out code).
    from sklearn.linear_model import LinearRegression
    # # Cross-validation
    from sklearn.model_selection import KFold
    from sklearn.model_selection import train_test_split
    #
    # # Step 1: choose the feature columns
    # predictors = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare','Embarked']
    #
    # # Build the model
    # alg = LinearRegression()
    # kf = KFold(n_splits=3, shuffle=False)
    #
    # predictions = []
    #
    # for train, test in kf.split(titanic):
    #     train_preditors = (titanic[predictors].iloc[train, :])
    #     train_target = titanic['Survived'].iloc[train]
    #
    #     # Fit the model on the training fold
    #     alg.fit(train_preditors, train_target)
    #
    #     # Predict on the held-out fold
    #     test_predictions = alg.predict(titanic[predictors].iloc[test, :])
    #     predictions.append(test_predictions)
    
    # Prediction accuracy
    # import numpy as np
    #
    # predictions = np.concatenate(predictions, axis=0)
    #
    # # Map predictions to outcome
    # predictions[predictions > 0.5] = 1
    # predictions[predictions <= 0.5] = 0
    # NOTE(review): the formula below sums the predicted VALUES at positions
    # where prediction == label, so correctly predicted 0s contribute nothing;
    # it is not the fraction of correct predictions.
    # accuracy = sum(predictions[predictions == titanic['Survived']]) / len(predictions)
    # print(accuracy)
    
    
    print('-------------------------------')
    # Logistic regression (kept below as commented-out code)
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import cross_val_score
    # # Initialise the model
    # alg = LogisticRegression()
    #
    # # Cross-validation
    # scores = cross_val_score(alg, titanic[predictors], titanic['Survived'], cv=3)
    # print(scores.mean())
    
    # 下面用随机森林分类器
    from sklearn.model_selection import cross_val_score
    from sklearn.ensemble import RandomForestClassifier
    
    # Feature columns used by both random-forest baselines below.
    predictors = [
        'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked',
    ]

    # First attempt: a small forest with fully grown trees.
    alg = RandomForestClassifier(
        random_state=1,
        n_estimators=10,
        min_samples_split=2,
        min_samples_leaf=1,
    )
    # 3-fold cross-validated score, averaged over the folds.
    scores = cross_val_score(
        alg, titanic[predictors], titanic['Survived'], cv=3
    )
    print(scores.mean())


    print('---------------------------------')
    # Second attempt: more trees and stricter split/leaf sizes.
    # This is the step that takes the most time to run.
    alg = RandomForestClassifier(
        random_state=1,
        n_estimators=50,
        min_samples_split=4,
        min_samples_leaf=2,
    )
    scores = cross_val_score(
        alg, titanic[predictors], titanic['Survived'], cv=3
    )
    print(scores.mean())
    
    
    # Accuracy has stopped improving, so derive some additional features.
    # FamilySize: siblings/spouses plus parents/children aboard.
    titanic['FamilySize'] = titanic['SibSp'].add(titanic['Parch'])

    # NameLength: number of characters in the passenger's name.
    titanic['NameLength'] = titanic['Name'].apply(len)
    
    import re
    
    # 获取名字的头衔
    def get_title(name):
        """Return the honorific (e.g. 'Mr', 'Countess') found in *name*.

        The title is the first run of ASCII letters that is preceded by a
        space and immediately followed by a literal period. An empty string
        is returned when no such token exists.
        """
        # The period must be escaped: an unescaped '.' matches any character,
        # which makes surnames containing a space ("Vander Planke, Mrs. ...")
        # return the surname fragment instead of the title.
        title_search = re.search(r' ([A-Za-z]+)\.', name)
        if title_search:
            return title_search.group(1)
        return ''
    
    # Extract the honorific from every passenger name and show its frequency.
    titles = titanic['Name'].apply(get_title)
    print(titles.value_counts())

    # Integer code for each known title.
    title_mapping = {
        'Mr': 1,
        'Miss': 2,
        'Mrs': 3,
        'Master': 4,
        'Dr': 5,
        'Rev': 6,
        'Major': 7,
        'Mlle': 8,
        'Col': 9,
        'Ms': 10,
        'Countess': 11,
        'Mme': 12,
        'Lady': 13,
        'Sir': 14,
        'Capt': 15,
        'Jonkheer': 16,
        'Don': 17,
    }
    # Translate titles to their codes in one pass. Titles missing from the
    # mapping get code 0 so the column stays purely numeric; leftover strings
    # would break the numeric models that consume 'Title' later.
    titles = titles.map(title_mapping).fillna(0).astype(int)
    print(titles.value_counts())
    titanic['Title'] = titles
    # 添加额外的特征,进行测试
    import numpy as np
    from sklearn.feature_selection import SelectKBest, f_classif
    import matplotlib.pyplot as plt
    
    # Candidate features: the original columns plus the engineered ones.
    predictors = [
        'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked',
        'FamilySize', 'Title', 'NameLength',
    ]

    # Score every candidate against the target with a univariate F-test.
    selector = SelectKBest(f_classif, k=5)
    selector.fit(titanic[predictors], titanic['Survived'])

    # Turn the p-values into scores: a smaller p-value gives a taller bar.
    scores = np.negative(np.log10(selector.pvalues_))

    # Bar chart with one bar per feature, labelled vertically.
    bar_positions = range(len(predictors))
    plt.bar(bar_positions, scores)
    plt.xticks(bar_positions, predictors, rotation='vertical')
    plt.show()
    
    # 选择最有价值的特征
    print('---------------')
    # Re-evaluate the random forest on a reduced feature set.
    predictors = ['Pclass', 'Sex', 'Fare', 'Title']
    alg = RandomForestClassifier(
        random_state=1,
        n_estimators=50,
        min_samples_split=8,
        min_samples_leaf=4,
    )
    scores = cross_val_score(
        alg, titanic[predictors], titanic['Survived'], cv=3
    )
    print(scores.mean())
    
    
    # 使用多个分类器,达到最优的结果
    from sklearn.ensemble import GradientBoostingClassifier
    import numpy as np
    
    # 通过集成两个分类器,达到最优的结果
    # Ensemble: each entry pairs a classifier with the feature columns it uses.
    algorithms = [
        [
            GradientBoostingClassifier(random_state=1, n_estimators=25, max_depth=3),
            ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'FamilySize', 'Title'],
        ],
        [
            RandomForestClassifier(random_state=1, n_estimators=50, min_samples_split=8, min_samples_leaf=4),
            ['Pclass', 'Sex', 'Fare', 'FamilySize', 'Title', 'Age', 'Embarked'],
        ],
    ]

    # shuffle=False keeps the folds in row order, so concatenating the
    # per-fold predictions lines up with titanic['Survived'].
    kf = KFold(n_splits=3, shuffle=False)

    predictions = []

    for train, test in kf.split(titanic):
        train_target = titanic['Survived'].iloc[train]
        full_test_predictions = []
        for alg, predictors in algorithms:
            # Cast once so training and test rows share the same float dtype.
            features = titanic[predictors].astype(float)
            alg.fit(features.iloc[train, :], train_target)

            # Column 1 of predict_proba is the probability of class 1.
            test_predictions = alg.predict_proba(features.iloc[test, :])[:, 1]
            full_test_predictions.append(test_predictions)

        # Simple ensemble: average the probabilities of all models.
        test_predictions = np.mean(full_test_predictions, axis=0)

        # Threshold in a single step: probability > 0.5 means class 1.
        # (Two successive masked assignments with swapped labels would
        # overwrite each other and leave every prediction at 0.)
        test_predictions = np.where(test_predictions > 0.5, 1, 0)
        predictions.append(test_predictions)

    # Put all the predictions together into one array
    predictions = np.concatenate(predictions, axis=0)

    # Accuracy is the fraction of rows whose prediction equals the label.
    accuracy = (predictions == titanic['Survived'].to_numpy()).mean()
    print('-----------------1')
    print(accuracy)
    

  • 相关阅读:
    sublime there are no packages for installation
    linux 安装php扩展mbstring
    生成器表达式和列表推导式
    send()和next()
    迭代器生成器
    装饰器
    函数随笔
    Django进阶
    数据结构与算法入门
    MySQL必会
  • 原文地址:https://www.cnblogs.com/jly1/p/13074234.html
Copyright © 2020-2023  润新知