[Ensemble Learning] A LightGBM usage example


    GitHub link

#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 31 21:19:09 2018

@author: hello4720
"""
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn import metrics
from sklearn.model_selection import train_test_split

### Load data
print("Loading data")
dataset1 = pd.read_csv('G:/ML/ML_match/IJCAI/data3.22/3.22ICJAI/data/7_train_data1.csv')
dataset2 = pd.read_csv('G:/ML/ML_match/IJCAI/data3.22/3.22ICJAI/data/7_train_data2.csv')
dataset3 = pd.read_csv('G:/ML/ML_match/IJCAI/data3.22/3.22ICJAI/data/7_train_data3.csv')
dataset4 = pd.read_csv('G:/ML/ML_match/IJCAI/data3.22/3.22ICJAI/data/7_train_data4.csv')
dataset5 = pd.read_csv('G:/ML/ML_match/IJCAI/data3.22/3.22ICJAI/data/7_train_data5.csv')

dataset1.drop_duplicates(inplace=True)
dataset2.drop_duplicates(inplace=True)
dataset3.drop_duplicates(inplace=True)
dataset4.drop_duplicates(inplace=True)
dataset5.drop_duplicates(inplace=True)

### Merge data
print("Merging data")
trains = pd.concat([dataset1, dataset2], axis=0)
trains = pd.concat([trains, dataset3], axis=0)
trains = pd.concat([trains, dataset4], axis=0)

online_test = dataset5

### Split data
print("Splitting data")
train_xy, offline_test = train_test_split(trains, test_size=0.2, random_state=21)
train, val = train_test_split(train_xy, test_size=0.2, random_state=21)

print("Training set")
y = train.is_trade                                                     # training-set labels
X = train.drop(['instance_id', 'is_trade'], axis=1)                    # training-set feature matrix

print("Validation set")
val_y = val.is_trade                                                   # validation-set labels
val_X = val.drop(['instance_id', 'is_trade'], axis=1)                  # validation-set feature matrix

print("Test sets")
offline_test_X = offline_test.drop(['instance_id', 'is_trade'], axis=1)  # offline-test feature matrix
online_test_X = online_test.drop(['instance_id'], axis=1)                # online-test feature matrix

### Convert to LightGBM datasets
lgb_train = lgb.Dataset(X, y, free_raw_data=False)
lgb_eval = lgb.Dataset(val_X, val_y, reference=lgb_train, free_raw_data=False)

### Train
print('Setting parameters')
params = {
            # note: the original also set the alias 'boosting': 'dart', which conflicts
            # with 'boosting_type'; only one of the two aliases should be specified
            'boosting_type': 'gbdt',
            'objective': 'binary',
            'metric': 'binary_logloss',

            'learning_rate': 0.01,
            'num_leaves': 25,
            'max_depth': 3,

            'max_bin': 10,
            'min_data_in_leaf': 8,

            'feature_fraction': 0.6,
            'bagging_fraction': 1,
            'bagging_freq': 0,

            'lambda_l1': 0,
            'lambda_l2': 0,
            'min_split_gain': 0
}

print("Start training")
gbm = lgb.train(params,                     # parameter dict
                lgb_train,                  # training set
                num_boost_round=2000,       # maximum number of boosting rounds
                valid_sets=lgb_eval,        # validation set
                early_stopping_rounds=30)   # early-stopping patience

### Offline prediction
print("Offline prediction")
preds_offline = gbm.predict(offline_test_X, num_iteration=gbm.best_iteration)  # predicted probabilities
offline = offline_test[['instance_id', 'is_trade']].copy()   # .copy() avoids SettingWithCopyWarning
offline['preds'] = preds_offline
offline.is_trade = offline['is_trade'].astype(np.float64)
print('log_loss', metrics.log_loss(offline.is_trade, offline.preds))

### Online prediction
print("Online prediction")
preds_online = gbm.predict(online_test_X, num_iteration=gbm.best_iteration)    # predicted probabilities
online = online_test[['instance_id']].copy()
online['preds'] = preds_online
online.rename(columns={'preds': 'predicted_score'}, inplace=True)
online.to_csv("./data/20180405.txt", index=None, sep=' ')

### Save the model
from sklearn.externals import joblib   # on newer scikit-learn versions, use `import joblib` instead
joblib.dump(gbm, 'gbm.pkl')

### Feature importance (for feature selection)
df = pd.DataFrame(X.columns.tolist(), columns=['feature'])
df['importance'] = list(gbm.feature_importance())
df = df.sort_values(by='importance', ascending=False)
df.to_csv("./data/feature_score_20180405.csv", index=None, encoding='gbk')
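
Since the script persists the trained booster with joblib, a minimal sketch of loading it back and scoring new data might look like the following. It reuses gbm.pkl and 7_train_data5.csv from the script above purely for illustration; assume the same library versions that produced the pickle.

# Minimal sketch: reload the saved Booster and score a feature matrix.
import pandas as pd
from sklearn.externals import joblib   # or `import joblib` on newer scikit-learn

gbm = joblib.load('gbm.pkl')            # restore the trained LightGBM Booster

new_data = pd.read_csv('G:/ML/ML_match/IJCAI/data3.22/3.22ICJAI/data/7_train_data5.csv')
new_X = new_data.drop(['instance_id'], axis=1)   # same feature columns as during training

preds = gbm.predict(new_X, num_iteration=gbm.best_iteration)  # predicted probabilities
print(preds[:10])

Passing num_iteration=gbm.best_iteration reproduces the early-stopped model rather than the full 2000 boosting rounds.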
Original article: https://www.cnblogs.com/wanglei5205/p/8654041.html