from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin
from sklearn.preprocessing import OneHotEncoder
import numpy as np


class TreeLeaf(BaseEstimator, ClassifierMixin, RegressorMixin):
    """
    Combine tree models with other models: the leaf indices produced by the
    tree models are one-hot encoded and fed as features into the meta models.
    """
    def __init__(self, treeModel=[], metaModel=[], n_estimators=[], goal="regression"):
        self.treeModel = treeModel
        self.metaModel = metaModel
        self.n_estimators = n_estimators  # kept for interface compatibility; not used internally
        self.goal = goal

    def _leaf_indices(self, model, X):
        # Most sklearn tree ensembles expose apply(); wrappers that do not
        # (e.g. LightGBM's sklearn API) can return leaf indices via predict(pred_leaf=True).
        if hasattr(model, "apply"):
            leaf = np.asarray(model.apply(X))
        else:
            leaf = np.asarray(model.predict(X, pred_leaf=True))
        if leaf.ndim > 2:  # e.g. GradientBoostingClassifier.apply() is 3-D for multiclass
            leaf = leaf.reshape(leaf.shape[0], -1)
        return leaf

    def fit(self, X, y):
        self.best_treemodel = []   # fitted tree models
        self.best_metamodel = []   # fitted meta models
        self.leaf_list = []        # leaf indices produced by each tree model
        for model in self.treeModel:
            model_param = model.fit(X, y)          # fit() returns the fitted estimator itself
            self.best_treemodel.append(model_param)
            self.leaf_list.append(self._leaf_indices(model_param, X))
        # concatenate the leaf indices of all tree models
        leaf_matrix = np.concatenate(self.leaf_list, axis=1)
        # one-hot encode the leaf indices; ignore leaves unseen at fit time
        self.one_hot_encoder = OneHotEncoder(handle_unknown="ignore")
        x_one_hot = self.one_hot_encoder.fit_transform(leaf_matrix)
        # fit the meta models on the encoded leaf features
        for model in self.metaModel:
            model_param = model.fit(x_one_hot, y)
            self.best_metamodel.append(model_param)
        return self

    def predict(self, X):
        leaf_list_pred = [self._leaf_indices(model, X) for model in self.best_treemodel]
        leaf_matrix_pred = np.concatenate(leaf_list_pred, axis=1)
        x_one_hot_pred = self.one_hot_encoder.transform(leaf_matrix_pred)
        y_pred_list = [model.predict(x_one_hot_pred) for model in self.best_metamodel]
        if self.goal == "regression":
            # average the predictions of the meta models
            return np.mean(y_pred_list, axis=0)
        elif self.goal == "classification":
            # majority vote across the meta models
            y_pred = np.zeros(X.shape[0])
            for i, line in enumerate(np.array(y_pred_list).T):
                y_pred[i] = np.argmax(np.bincount(line.astype(int)))
            return y_pred


################## demo ####################
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from lightgbm import LGBMClassifier

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
treeModel_1 = RandomForestClassifier(n_estimators=20)
treeModel_2 = LGBMClassifier(n_estimators=30)
# treeModel_2 = GradientBoostingClassifier(n_estimators=30)
metaModel_1 = LogisticRegression()
metaModel_2 = SVC()
tl = TreeLeaf(treeModel=[treeModel_1, treeModel_2],
              metaModel=[metaModel_1, metaModel_2],
              n_estimators=[20, 30],
              goal="classification")
tl.fit(X_train, y_train)
y_pred = tl.predict(X_test)
print(accuracy_score(y_test, y_pred))
The code above implements a model-fusion strategy in which the leaf indices produced by several tree models are fed as features into one or more classifiers or regressors, which makes it reasonably extensible and adaptable. The test case included above, based on random forest and LightGBM, is provided for reference; a regression-style usage sketch follows below. How well this fusion strategy works varies from problem to problem, and the key is still how well the feature engineering is done; methods of this kind also tend to overfit the training set to some degree.
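Since the class also advertises a regression mode, here is a minimal sketch of using TreeLeaf with goal="regression". The specific choices here are illustrative assumptions, not part of the original post: RandomForestRegressor and GradientBoostingRegressor as tree models, LinearRegression and Ridge as meta models, and sklearn's diabetes dataset as the data.

# Minimal regression-mode sketch for the TreeLeaf class defined above.
# Tree models, meta models and the dataset are illustrative choices only.
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

X, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

tl_reg = TreeLeaf(treeModel=[RandomForestRegressor(n_estimators=20),
                             GradientBoostingRegressor(n_estimators=30)],
                  metaModel=[LinearRegression(), Ridge(alpha=1.0)],
                  n_estimators=[20, 30],
                  goal="regression")   # meta-model predictions are averaged
tl_reg.fit(X_train, y_train)
print(mean_squared_error(y_test, tl_reg.predict(X_test)))

Both meta models accept the sparse one-hot leaf matrix directly, so no densification step is needed in this setting.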
Comments and suggestions are welcome. If this post helped you, please give me a follow to keep me motivated, thanks!