Stacking是堆叠的意思,把多个模型堆叠到一起。
它通过一个元模型把多个基模型的输出组合到一起:元模型训练时用的特征就是各个基模型输出的结果(预测值),标签是训练集的y。
预测时先用所有的基模型各预测一遍,把得到的结果作为特征交给元模型,由元模型输出最终的结果。
from sklearn import datasets
from sklearn.model_selection import train_test_split

# Load the iris dataset as a feature matrix X and a label vector y.
X, y = datasets.load_iris(return_X_y=True)

# Hold out 20% of the samples for testing.
# random_state pins the shuffle so the same split is produced on every run;
# stratify=y keeps the class proportions equal in the train and test parts,
# which matters because the test set is small.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# stacking
# Step 1: train three base classifiers on their own and print each one's
# test accuracy, so the individual models can be compared with the ensemble.
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics

# Seed the estimators that use randomness so repeated runs agree.
dtc_classifier = DecisionTreeClassifier(random_state=0)
# A larger max_iter gives the solver room to converge on unscaled features
# (the default cap can end with a ConvergenceWarning).
svc_classifier = LinearSVC(random_state=0, max_iter=10000)
nb = GaussianNB()

# Fit and score every base model; accuracies are printed in the order
# decision tree, linear SVC, naive Bayes.
# For reference, one earlier unseeded run printed 0.8666666666666667, 1.0 and
# 0.9666666666666667 respectively; the exact values depend on the split.
for base_model in (dtc_classifier, svc_classifier, nb):
    base_model.fit(X_train, y_train)
    print(metrics.accuracy_score(y_test, base_model.predict(X_test)))
from sklearn.ensemble import StackingClassifier

# Step 2: combine the base classifiers under a logistic-regression meta-model.
# The estimators passed in do NOT have to be fitted beforehand: with the
# default cv setting, StackingClassifier.fit() clones each one and refits the
# clone (only cv="prefit" reuses already-fitted estimators).
base_estimators = [
    ('dtc_classifier', dtc_classifier),
    ('svc_classifier', svc_classifier),
    ('nb', nb),
]
meta_model = LogisticRegression()
sc = StackingClassifier(estimators=base_estimators, final_estimator=meta_model)

# The stacking model itself must be fitted: the meta-model is trained on the
# base estimators' predictions for the training data.
sc.fit(X_train, y_train)

stacked_accuracy = metrics.accuracy_score(y_test, sc.predict(X_test))
print(stacked_accuracy)
# Output recorded from one run: 1.0