• LightGBM baseline model steps


    ### Basic tools

    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    import seaborn as sns
    import os
    import warnings
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import roc_auc_score,roc_curve
    from statsmodels.stats.outliers_influence import variance_inflation_factor
    from sklearn.linear_model import LogisticRegressionCV
    from sklearn.preprocessing import LabelEncoder
    from sklearn.impute import SimpleImputer  # Imputer was removed from sklearn; SimpleImputer replaces it
    from sklearn.metrics import precision_score, recall_score, f1_score,accuracy_score
    warnings.filterwarnings("ignore")
    
    os.chdir("C:/Users/my/Desktop/记录/网约车/流失模型")
    driver = pd.read_excel('样本.xlsx')
    
    y_test = driver["target"]
    x_test = driver.drop(['a.driver_id','target'],axis = 1)  # note: both names are overwritten by the train/test split further below
    
    # Feature engineering
    # Step 1: fill missing values, encode categorical variables
    str_encoder = LabelEncoder()  # encode the categorical column
    str_encoder.fit(driver["a.contract_company"])
    driver["a.contract_company"] = str_encoder.transform(driver["a.contract_company"])
    
    # Step 2: preliminary feature screening. Drop columns where too few rows carry a
    # positive value (the original comment said 1%, the code uses 5%), and columns
    # dominated by a single value. A runnable version follows this block.
    # ValueLess = []
    # for i in x_train.columns:
    #     ValuePct = driver[driver[i] > 0][i].count() / driver[i].count()
    #     if ValuePct < 0.05:  # fewer than 5% of rows have a positive value
    #         ValueLess.append(i)
    #         print(ValueLess, ValuePct)
    #
    # SameValue = []
    # for i in x_train.columns:
    #     SameValuePct = driver[i].value_counts().max() / driver[i].count()
    #     if SameValuePct > 0.95:  # the original compared < 0.05, which inverts the intended near-constant check
    #         SameValue.append(i)
    #         print(SameValue, SameValuePct)

    # driver = driver.drop(ValueLess, axis=1)
    # driver = driver.drop(SameValue, axis=1)
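
    For reference, the two commented-out screens above can be written as one runnable helper. This is only a sketch under the thresholds used above (5% positive share, 95% dominant-value share), meant to be run on the raw feature frame before encoding.

    def screen_features(df, pos_thresh=0.05, dominant_thresh=0.95):
        """Return columns that are nearly empty or nearly constant."""
        drop_cols = []
        for col in df.columns:
            pos_pct = (df[col] > 0).mean()                        # share of rows with a positive value
            top_pct = df[col].value_counts(normalize=True).max()  # share of the most frequent value
            if pos_pct < pos_thresh or top_pct > dominant_thresh:
                drop_cols.append(col)
        return drop_cols

    # drop_cols = screen_features(driver.drop(['a.driver_id', 'target'], axis=1))
    # driver = driver.drop(drop_cols, axis=1)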
    
    select_col = ['vehicle_level','max_days','min_days','min_score','tendcy']
    
    # switch to the training sample used for the modeling below
    os.chdir("C:/Users/my/Desktop/模型/第四版/")
    driver = pd.read_excel('8.8训练样本.xlsx')
    
    
    y = driver["target"]
    x = driver.drop(['a.driver_id','target'],axis = 1)
    x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.3,random_state = 1)  # split into train and test sets
    
    # Missing-value imputation (the original comment mislabeled this step as categorical encoding)
    from sklearn.impute import SimpleImputer
    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    imp.fit(x_train)
    # wrap the transformed arrays back into DataFrames so column names survive
    # for the plots and importance tables below
    x_train = pd.DataFrame(imp.transform(x_train), columns=x.columns, index=x_train.index)
    x_test = pd.DataFrame(imp.transform(x_test), columns=x.columns, index=x_test.index)
    
    # Step 2: variable analysis
    fig = plt.figure()
    fig.set(alpha=0.2)
    # make Chinese labels render correctly
    plt.rcParams["font.sans-serif"] = ["SimHei"]
    plt.rcParams["axes.unicode_minus"] = False

    # inspect variable distributions; call plt.show() once after both panels are drawn,
    # otherwise the first show() closes the figure before the second subplot is added
    plt.subplot2grid((2, 2), (0, 0))
    x_train['max_days'].plot(kind="kde", grid=True)
    plt.title('max_days')
    plt.subplot2grid((2, 2), (0, 1))
    x_train['rest_rate'].plot(kind="kde", grid=True)
    plt.title('rest_rate')
    plt.show()

    Distribution plot
    %matplotlib inline
    import seaborn as sns
    sns.set(color_codes=True)
    sns.distplot(x_train['max_days'], kde=True, bins=20, rug=True)  # the original plotted df_train.Age, a leftover from an unrelated example

    # Pearson correlation of each numeric feature with the target
    driver.corr()['target'].sort_values(ascending=True)
    # Multicollinearity check: variance inflation factors over the selected columns
    vif_x = x[select_col].values
    vif_list = [variance_inflation_factor(vif_x, i) for i in range(vif_x.shape[1])]  # the original iterated range(x.shape[1]), overrunning select_col
    print(max(vif_list))
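
    If the maximum VIF comes out high, a common follow-up is to drop the worst offender and recompute until everything sits under a cutoff (10 is conventional). A minimal sketch over the same select_col list:

    cols = list(select_col)
    while len(cols) > 1:
        mat = x[cols].values
        vifs = [variance_inflation_factor(mat, i) for i in range(len(cols))]
        worst = int(np.argmax(vifs))
        if vifs[worst] < 10:  # conventional VIF cutoff
            break
        print('dropping', cols[worst], 'VIF =', vifs[worst])
        cols.pop(worst)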
    
    
    # Convenience evaluation helper. KS = max|TPR - FPR| over all thresholds, so it
    # (like AUC) must be computed on predicted probabilities, not hard labels.
    def model_evaluate(model, x, y):
        y_prob = model.predict_proba(x)[:, 1]
        fpr, tpr, _ = roc_curve(y, y_prob)
        ks = abs(fpr - tpr).max()
        auc = roc_auc_score(y, y_prob)
        print('ks:', ks)
        print('auc:', auc)
    
    
    # Step 3: modeling
    # GBDT baseline for comparison
    from sklearn.ensemble import GradientBoostingClassifier
    gb = GradientBoostingClassifier(learning_rate=0.05,subsample=0.6,min_samples_split= 90,n_estimators = 50,min_samples_leaf = 10
                                    ,max_depth=15,max_features=15,random_state=10)
    gb_model =gb.fit(x_train,y_train)
    model_evaluate(gb_model,x_train,y_train)
    model_evaluate(gb_model,x_test,y_test)
    
    
    
    
    import xgboost as xgb
    # initialize the model
    xgb_classifier = xgb.XGBClassifier(n_estimators=20, max_depth=4, learning_rate=0.1, subsample=0.7, colsample_bytree=0.7)
    # fit the model
    xgb_classifier.fit(x_train, y_train)
    model_evaluate(xgb_classifier, x_train, y_train)
    model_evaluate(xgb_classifier, x_test, y_test)

    # KS/AUC by hand: use predicted probabilities rather than predict()'s hard labels
    xgb_y_train_prob = xgb_classifier.predict_proba(x_train)[:, 1]
    fpr_xgb_train, tpr_xgb_train, _ = roc_curve(y_train, xgb_y_train_prob)
    xgb_train_ks = abs(fpr_xgb_train - tpr_xgb_train).max()
    xgb_train_auc = roc_auc_score(y_train, xgb_y_train_prob)
    print("train_ks:", xgb_train_ks)
    print("train_auc:", xgb_train_auc)
    
    
    
    import lightgbm as lgb
    # note: the original bound the estimator to the name `lgb`, shadowing the module; it
    # also passed num_iterations=800, which is just an alias of n_estimators, so that
    # duplicate is dropped here
    lgb_clf = lgb.LGBMClassifier(
            boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
            max_depth=2, n_estimators=800, objective='binary',
            subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
            learning_rate=0.05, min_child_weight=50, random_state=None, n_jobs=-1)

    lgb_clf.fit(x_train, y_train)
    model_evaluate(lgb_clf, x_train, y_train)
    model_evaluate(lgb_clf, x_test, y_test)
    
    
    
    # Random search over hyperparameters
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.model_selection import RandomizedSearchCV
    from scipy.stats import randint

    gb = GradientBoostingClassifier(learning_rate=0.02, subsample=0.6, min_samples_split=70, n_estimators=200,
                                    min_samples_leaf=40, max_depth=4, max_features='sqrt', random_state=10)

    gbParams = {'loss': ['deviance', 'exponential'],
                'n_estimators': randint(10, 500),
                'max_depth': randint(1, 20),
                'subsample': [0.5, 0.6, 0.7, 0.8],
                'min_samples_split': range(10, 101, 10),
                'min_samples_leaf': range(5, 51, 5),
                'learning_rate': [0.2, 0.1, 0.05, 0.02, 0.01],
                'max_features': randint(1, 20)}
    # fit_params was removed from RandomizedSearchCV; cv=None uses the default cross-validation
    randomizedSearchGB = RandomizedSearchCV(estimator=gb, param_distributions=gbParams, n_iter=10,
                                            scoring='roc_auc', cv=None, verbose=2).fit(x_train, y_train)
    print(randomizedSearchGB.best_params_, randomizedSearchGB.best_score_)
    bestGb = randomizedSearchGB.best_estimator_.fit(x_train, y_train)
    model_evaluate(bestGb, x_train, y_train)
    
    
    # Quick look at threshold-based metrics (these need hard class labels, not probabilities)
    from sklearn.metrics import precision_score, recall_score, f1_score
    xgb_y_train_pred = xgb_classifier.predict(x_train)
    print('Precision: %.3f' % precision_score(y_true=y_train, y_pred=xgb_y_train_pred))
    print('Recall: %.3f' % recall_score(y_true=y_train, y_pred=xgb_y_train_pred))
    print('F1: %.3f' % f1_score(y_true=y_train, y_pred=xgb_y_train_pred))
    
    
    # Feature importance: tree models expose per-variable importances
    gb_importance = pd.DataFrame({'cols': x_train.columns, 'gb': gb_model.feature_importances_}).sort_values('gb', ascending=False)
    gb_importance
    
    
    import pickle
    # persist the fitted model
    folderOfData = "C:/Users/my/Desktop/模型/"
    with open(folderOfData + 'bestGb.pkl', 'wb') as saveModel:
        pickle.dump(bestGb, saveModel)

    # load the model back
    with open(folderOfData + 'bestGb.pkl', 'rb') as modelFile:
        gb = pickle.load(modelFile)
    # score new data with the reloaded model
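
    A minimal scoring sketch for the reloaded model. The file name is a hypothetical placeholder, and it assumes the new sample has the same feature columns as training and that the fitted imputer imp from above is still in scope:

    new = pd.read_excel('new_sample.xlsx')  # hypothetical file name
    new_x = new.drop(['a.driver_id'], axis=1)
    new_x = pd.DataFrame(imp.transform(new_x), columns=new_x.columns)  # reuse the imputer fitted on x_train
    new['prob'] = gb.predict_proba(new_x)[:, 1]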
    
    
    # Convert probability to a credit-style score:
    # score = basePoint - PDO/ln(2) * ln(p/(1-p)), so each doubling of the odds shifts the score by PDO points
    def Prob2Score(prob, basePoint, PDO):
        y = np.log(prob/(1-prob))
        return (basePoint + PDO/np.log(2)*(-y))

    basePoint = 300
    PDO = 100
    xgb_y_pred = xgb_classifier.predict_proba(x_test)[:, 1]  # the original referenced xgb_y_pred without defining it
    prob = pd.DataFrame({'prob': xgb_y_pred, 'y_test': y_test})
    prob['score'] = prob['prob'].map(lambda p: Prob2Score(p, basePoint, PDO))
    plt.style.use('seaborn')
    plt.hist(prob['score'], 100)
    plt.xlabel('score')
    plt.ylabel('freq')
    plt.title('distribution')
    plt.show()
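
    As a quick sanity check on the score, a band-level summary shows whether the observed positive rate falls as the score rises; a sketch using 10 equal-frequency bands (an arbitrary choice):

    prob['band'] = pd.qcut(prob['score'], 10, duplicates='drop')   # 10 equal-frequency score bands
    print(prob.groupby('band')['y_test'].agg(['count', 'mean']))   # 'mean' is the observed positive rate per band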
    
    
    
    # Extract rules with a shallow decision tree
    from sklearn import tree
    dtree = tree.DecisionTreeClassifier(max_depth=4, min_samples_leaf=7, min_samples_split=18)
    dtree = dtree.fit(x, y)

    import pydotplus
    from io import StringIO  # sklearn.externals.six was removed from sklearn
    os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/'
    dot_data = StringIO()
    tree.export_graphviz(dtree, out_file=dot_data, feature_names=x.columns,
                         class_names=['0', '1'], filled=True, rounded=True,
                         special_characters=True)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    graph.write_pdf("loss.pdf")
    print('Visible tree plot saved as pdf.')
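
    If Graphviz is not installed, sklearn's built-in export_text (available since sklearn 0.21) prints the same rules as plain indented text:

    from sklearn.tree import export_text
    print(export_text(dtree, feature_names=list(x.columns)))  # plain-text rules, no Graphviz dependency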

    ### Bayesian optimization of the model

    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    import seaborn as sns
    import warnings
    from sklearn.model_selection import train_test_split,cross_val_score
    import lightgbm as lgb
    from bayes_opt import BayesianOptimization
    from sklearn.metrics import roc_auc_score,roc_curve
    
    import os
    os.chdir('C:/Users/my/Desktop/')
    data = pd.read_excel('训练数据.xlsx',sheet_name = 'Sheet1')
    print(data.columns)
    
    
    y = data["label"]
    x = data.drop(['passenger_id','label'],axis = 1)
    x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.3,random_state = 1)  # split into train and test sets
    print(x_train.shape)
    print(y_train.shape)
    
    
    import xgboost as xgb

    # fit the model (bound to xgb_clf so the module name xgb is not shadowed)
    xgb_clf = xgb.XGBClassifier(base_score=0.5, booster='gbtree', learning_rate=0.1,
            max_depth=3, min_child_weight=1, n_estimators=100, n_jobs=1,
            objective='binary:logistic', random_state=0, reg_alpha=0,
            reg_lambda=1, scale_pos_weight=1, subsample=0.7)
    xgb_clf.fit(x_train, y_train)

    # KS/AUC need predicted probabilities, not hard labels
    xgb_y_train_prob = xgb_clf.predict_proba(x_train)[:, 1]
    fpr_xgb_train, tpr_xgb_train, _ = roc_curve(y_train, xgb_y_train_prob)
    xgb_train_ks = abs(fpr_xgb_train - tpr_xgb_train).max()
    xgb_train_auc = roc_auc_score(y_train, xgb_y_train_prob)
    print("train_ks:", xgb_train_ks)
    print("train_auc:", xgb_train_auc)
    
    
    from bayes_opt import BayesianOptimization
    import lightgbm as lgb
    def GBM_evaluate(min_child_samples, learning_rate, n_estimators, min_child_weight, num_leaves,
                     colsample_bytree, max_depth, subsample, reg_alpha, reg_lambda):
        """Custom evaluation function: mean 5-fold cross-validated AUC.
        BayesianOptimization maximizes this value, and since it proposes floats,
        the integer-valued parameters are cast inside."""
        val = cross_val_score(
            lgb.LGBMClassifier(objective='binary', metric='auc', random_state=2018,
                    learning_rate=float(learning_rate),
                    n_estimators=int(n_estimators),
                    max_depth=int(max_depth),
                    num_leaves=int(num_leaves),
                    min_child_samples=int(min_child_samples),
                    subsample=float(subsample),
                    colsample_bytree=float(colsample_bytree),
                    reg_alpha=reg_alpha,
                    reg_lambda=reg_lambda,
                    min_child_weight=min_child_weight,
                    class_weight='balanced'),
            x_train, y_train, scoring='roc_auc', cv=5).mean()
        return val
    
    # search space for the tuned hyperparameters
    adj_params = {'min_child_weight': (3, 20),
                  'colsample_bytree': (0.4, 1),
                  'n_estimators':(100,300),
                  'learning_rate':(0.05,0.2),
                  'max_depth': (5, 15),
                  'num_leaves':(10, 50),
                  'subsample': (0.5, 1),
                  'reg_lambda': (0.1, 1),
                  'reg_alpha': (0.1, 1),
                  'min_child_samples': (10, 30)}
    # run the Bayesian optimization
    num_iter = 25
    init_points = 5
    
    bayes = BayesianOptimization(GBM_evaluate,adj_params)
    bayes.maximize(init_points=init_points, n_iter=num_iter)
    params = bayes.max
    print(params)
    # {'target': 0.7452465518984774, 'params': {'colsample_bytree': 0.863774165376339,
    #                                           'learning_rate': 0.05000062849693596,
    #                                           'max_depth': 6.20154732653672,
    #                                           'min_child_samples': 29.985852121149026,
    #                                           'min_child_weight': 6.810125687159286,
    #                                           'n_estimators': 170.32415049570488,
    #                                           'num_leaves': 10.403716972233827,
    #                                           'reg_alpha': 0.999999999999874,
    #                                           'reg_lambda': 0.10000005514579893,
    #                                           'subsample': 0.7261106692459622}}
    
    #{'target': 0.752230340011879, 'params': {'colsample_bytree': 0.6766116352832452,
    # 'learning_rate': 0.08410079723412914, 'max_depth': 6.009908969461344, 'min_child_samples': 10.45373385991692,
    # 'min_child_weight': 5.299569525386938, 'n_estimators': 100.33382248028828, 'num_leaves': 10.861841362739199,
    # 'reg_alpha': 0.7515529745843912, 'reg_lambda': 0.9773103767283371, 'subsample': 0.6742906352043163}}
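
    Note that bayes.max['params'] stores every value as a float, so the integer-valued parameters must be cast before refitting; a small helper sketch (the hand-rounded values below do the same thing by eye):

    int_params = {'n_estimators', 'max_depth', 'num_leaves', 'min_child_samples'}
    best_params = {k: int(round(v)) if k in int_params else round(v, 2)
                   for k, v in bayes.max['params'].items()}
    print(best_params)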
    
    
    # note: the original also passed num_iterations=800, an alias of n_estimators that
    # would silently override n_estimators=100, so it is dropped here
    lgbm = lgb.LGBMClassifier(boosting_type='gbdt', objective='binary',
                      colsample_bytree=0.67, learning_rate=0.08,
                      max_depth=6, min_child_samples=10, min_child_weight=5.3,
                      n_estimators=100, num_leaves=10, reg_alpha=0.75, subsample_freq=1,
                      reg_lambda=0.9, subsample=0.67, random_state=None, n_jobs=-1,
                      class_weight='balanced')
    lgbm.fit(x_train, y_train, eval_set=[(x_test, y_test)])

    from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, roc_curve
    y_train_prob = lgbm.predict_proba(x_train)[:, 1]  # probabilities for KS/AUC
    y_pred = lgbm.predict(x_train)                    # hard labels for precision/recall
    fpr_lgb_train, tpr_lgb_train, _ = roc_curve(y_train, y_train_prob)
    lgb_train_ks = abs(fpr_lgb_train - tpr_lgb_train).max()
    lgb_train_auc = roc_auc_score(y_train, y_train_prob)
    print("train_ks:", lgb_train_ks)
    print("train_auc:", lgb_train_auc)
    print('precision:', precision_score(y_train, y_pred))
    print('recall:', recall_score(y_train, y_pred))
    
    
    y_test_prob = lgbm.predict_proba(x_test)[:, 1]  # probabilities for KS/AUC
    y_pred_test = lgbm.predict(x_test)              # hard labels for precision/recall
    fpr_lgb_test, tpr_lgb_test, _ = roc_curve(y_test, y_test_prob)
    lgb_test_ks = abs(fpr_lgb_test - tpr_lgb_test).max()
    lgb_test_auc = roc_auc_score(y_test, y_test_prob)
    print("test_ks:", lgb_test_ks)    # the original mislabeled these as train_ks/train_auc
    print("test_auc:", lgb_test_auc)
    print('precision:', precision_score(y_test, y_pred_test))
    print('recall:', recall_score(y_test, y_pred_test))
    
    
    
    # Convert probability to score (same transform as in the first section)
    def Prob2Score(prob, basePoint, PDO):
        y = np.log(prob/(1-prob))
        return (basePoint + PDO/np.log(2)*(-y))

    y_pred_test = lgbm.predict_proba(x_test)[:, 1]
    basePoint = 300
    PDO = 100
    prob = pd.DataFrame({'prob': y_pred_test, 'y_test': y_test})
    prob['score'] = prob['prob'].map(lambda p: Prob2Score(p, basePoint, PDO))
    plt.style.use('seaborn')
    plt.hist(prob['score'], 100)
    plt.xlabel('score')
    plt.ylabel('freq')
    plt.title('distribution')
    plt.show()
    # Tune n_estimators over a search window, scoring each value by validation KS
    # minus an overfitting penalty proportional to the train/validation KS gap
    min_value = 40
    max_value = 60

    def lgb_test(train_x, train_y, test_x, test_y, n_estimators):
        clf = lgb.LGBMClassifier(boosting_type='gbdt',
                                 objective='binary',
                                 metric='auc',
                                 learning_rate=0.1,
                                 n_estimators=n_estimators,
                                 max_depth=5,
                                 num_leaves=20,
                                 max_bin=45,
                                 min_data_in_leaf=6,
                                 bagging_fraction=0.6,
                                 bagging_freq=0,
                                 feature_fraction=0.8)
        clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (test_x, test_y)], eval_metric='auc')
        return clf, clf.best_score_['valid_1']['auc']

    # track the best value across the whole loop (the original re-initialized these
    # inside the loop body, so earlier iterations could never win); the undefined
    # train_x/val_x names in the original are mapped onto the x_train/x_test split
    best_omd = -1
    best_value = -1
    best_ks = []
    for value in range(min_value, max_value + 1):
        lgb_model, lgb_auc = lgb_test(x_train, y_train, x_test, y_test, value)

        y_pred = lgb_model.predict_proba(x_train)[:, 1]
        fpr_lgb_train, tpr_lgb_train, _ = roc_curve(y_train, y_pred)
        train_ks = abs(fpr_lgb_train - tpr_lgb_train).max()

        y_pred = lgb_model.predict_proba(x_test)[:, 1]
        fpr_lgb, tpr_lgb, _ = roc_curve(y_test, y_pred)
        val_ks = abs(fpr_lgb - tpr_lgb).max()

        omd = val_ks + 0.8 * (val_ks - train_ks)  # reward validation KS, penalize the train/val gap
        if omd > best_omd:
            best_omd = omd
            best_value = value
            best_ks = [train_ks, val_ks]
    print('best_value:', best_value)
    print('best_ks:', best_ks)