• 模型融合


    记个笔记,进一步理解了模型融合,开心,整理一下模型融合方式:stacking、blending和voting

    直接上代码(来自网络大佬的分享),理论后续补。

    • blending

      '''创建训练的数据集'''
      data, target = make_blobs(n_samples=50000, centers=2, random_state=0, cluster_std=0.60)
      
      '''模型融合中使用到的各个单模型'''
      clfs = [RandomForestClassifier(n_estimators=5, n_jobs=-1, criterion='gini'),
            RandomForestClassifier(n_estimators=5, n_jobs=-1, criterion='entropy'),
            ExtraTreesClassifier(n_estimators=5, n_jobs=-1, criterion='gini'),
            ExtraTreesClassifier(n_estimators=5, n_jobs=-1, criterion='entropy'),
            GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=5)]
      
      '''切分一部分数据作为测试集'''
      X, X_predict, y, y_predict = train_test_split(data, target, test_size=0.33, random_state=2017)
      
      
      '''切分训练数据集为d1,d2两部分'''
      X_d1, X_d2, y_d1, y_d2 = train_test_split(X, y, test_size=0.5, random_state=2017)
      dataset_d1 = np.zeros((X_d2.shape[0], len(clfs)))
      dataset_d2 = np.zeros((X_predict.shape[0], len(clfs)))
      
      for j, clf in enumerate(clfs):
          '''依次训练各个单模型'''
          # print(j, clf)
          '''使用第1个部分作为预测,第2部分来训练模型,获得其预测的输出作为第2部分的新特征。'''
          # X_train, y_train, X_test, y_test = X[train], y[train], X[test], y[test]
          clf.fit(X_d1, y_d1)
          y_submission = clf.predict_proba(X_d2)[:, 1]
          dataset_d1[:, j] = y_submission
          '''对于测试集,直接用这k个模型的预测值作为新的特征。'''
          dataset_d2[:, j] = clf.predict_proba(X_predict)[:, 1]
          print("val auc Score: %f" % roc_auc_score(y_predict, dataset_d2[:, j]))
          
      '''融合使用的模型'''
      # clf = LogisticRegression()
      clf = GradientBoostingClassifier(learning_rate=0.02, subsample=0.5, max_depth=6, n_estimators=30)
      clf.fit(dataset_d1, y_d2)
      y_submission = clf.predict_proba(dataset_d2)[:, 1]
      
      print("Linear stretch of predictions to [0,1]")
      y_submission = (y_submission - y_submission.min()) / (y_submission.max() - y_submission.min())
      print("blend result")
      print("val auc Score: %f" % (roc_auc_score(y_predict, y_submission)))
      

      结果:

    • stacking

      '''创建训练的数据集'''
      data1, target1 = make_blobs(n_samples=50000, centers=2, random_state=0, cluster_std=0.60)
      
      '''模型融合中使用到的各个单模型'''
      clfs = [RandomForestClassifier(n_estimators=5, n_jobs=-1, criterion='gini'),
            RandomForestClassifier(n_estimators=5, n_jobs=-1, criterion='entropy'),
            ExtraTreesClassifier(n_estimators=5, n_jobs=-1, criterion='gini'),
            ExtraTreesClassifier(n_estimators=5, n_jobs=-1, criterion='entropy'),
            GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=5)]
      
      '''切分一部分数据作为测试集'''
      X, X_predict, y, y_predict = train_test_split(data1, target1, test_size=0.33, random_state=2017)
      
      
      dataset_blend_train = np.zeros((X.shape[0], len(clfs)))
      dataset_blend_test = np.zeros((X_predict.shape[0], len(clfs)))
      
      '''5折stacking'''
      n_folds = 5
      skf = list(StratifiedKFold(n_splits=n_folds).split(X,y))
      for j, clf in enumerate(clfs):
          '''依次训练各个单模型'''
          # print(j, clf)
          dataset_blend_test_j = np.zeros((X_predict.shape[0], len(skf)))
          for i, (train, test) in enumerate(skf):
              '''使用第i个部分作为预测,剩余的部分来训练模型,获得其预测的输出作为第i部分的新特征。'''
              # print("Fold", i)
              X_train, y_train, X_test, y_test = X[train], y[train], X[test], y[test]
              clf.fit(X_train, y_train)
              y_submission = clf.predict_proba(X_test)[:, 1]
              dataset_blend_train[test, j] = y_submission
              dataset_blend_test_j[:, i] = clf.predict_proba(X_predict)[:, 1]
          '''对于测试集,直接用这k个模型的预测值均值作为新的特征。'''
          dataset_blend_test[:, j] = dataset_blend_test_j.mean(1)
          print("val auc Score: %f" % roc_auc_score(y_predict, dataset_blend_test[:, j]))
      # clf = LogisticRegression()
      clf = GradientBoostingClassifier(learning_rate=0.02, subsample=0.5, max_depth=6, n_estimators=30)
      clf.fit(dataset_blend_train, y)
      y_submission = clf.predict_proba(dataset_blend_test)[:, 1]
      
      print("Linear stretch of predictions to [0,1]")
      y_submission = (y_submission - y_submission.min()) / (y_submission.max() - y_submission.min())
      print("blend result")
      print("val auc Score: %f" % (roc_auc_score(y_predict, y_submission)))
      

      结果:

    • voting(加和投票)

      X, X_predict, y, y_predict = train_test_split(data, target, test_size=0.33, random_state=2017)
      
      # 训练模型
      r1 = RandomForestClassifier(n_estimators=5, n_jobs=-1, criterion='gini')
      r1.fit(X, y)
      
      r2 = RandomForestClassifier(n_estimators=5, n_jobs=-1, criterion='entropy')
      r2.fit(X, y)
      
      e1 = ExtraTreesClassifier(n_estimators=5, n_jobs=-1, criterion='gini')
      e1.fit(X, y)
      
      e2 = ExtraTreesClassifier(n_estimators=5, n_jobs=-1, criterion='entropy')
      e2.fit(X, y)
      
      g = GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=5)
      g.fit(X, y)
      
      # 预测标签
      y_r1 = r1.predict_proba(X_predict)
      y_r2 = r2.predict_proba(X_predict)
      y_e1 = e1.predict_proba(X_predict)
      y_e2 = e2.predict_proba(X_predict)
      y_g = g.predict_proba(X_predict)
      
      # 转为one-hot标签形式
      y_r1 = np.rint(y_r1)
      y_r2 = np.rint(y_r2)
      y_e1 = np.rint(y_e1)
      y_e2 = np.rint(y_e2)
      y_g = np.rint(y_g)
      
      # 加和投票
      y_en = y_r1 + y_r2 + y_e1 + y_e2 + y_g
      y_pre_en = y_en.argmax(axis=1)
      
      
      print(classification_report(y_predict, y_pre_en, digits=4))
      

      结果:

  • 相关阅读:
    django xadmin后台集成DjangoUeditor
    vm虚拟机下的ubuntu16.04配置静态ip(NAT方式)
    关于mysql里的concat
    xml转换为数组格式
    关于json的中文转码问题
    判断是否是微信浏览器
    数组和xml的互相转换的封装函数
    关于微信登录的接口开发
    curl的封装类
    php里关于文件下载的方法(两种)
  • 原文地址:https://www.cnblogs.com/harbin-ho/p/14808516.html
Copyright © 2020-2023  润新知