# Voting: simple voting ensemble (简单投票)
# Load the prepared train / test tables and drop the columns that are not used
# as model inputs. The two date-like columns are removed from both tables; the
# `id` column is removed from the training table only.
train_for_model = pd.read_csv('train_for_model.csv').drop(
    columns=['issueDate', 'earliesCreditLine', 'id']
)
testa_for_model = pd.read_csv('testa_for_model.csv').drop(
    columns=['issueDate', 'earliesCreditLine']
)

# Separate the target column (`isDefault`) from the feature matrix.
y = train_for_model['isDefault']
train_for_model_x = train_for_model.drop(columns=['isDefault'])
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score,roc_curve
import time
from sklearn.model_selection import GridSearchCV
def nan_to_null(x):
    """Return -1 when *x* is a missing value, otherwise return *x* unchanged.

    Used to replace missing entries with a -1 sentinel before modelling. The
    original note on this helper says that leaving NaN in place caused an
    error downstream (presumably an estimator's input validation rejecting
    NaN -- TODO confirm which step raised).

    Args:
        x: A single scalar cell value (number, string, None, NaN, ...).

    Returns:
        -1 if *x* is NaN / None / another pandas missing marker, else *x*.
    """
    # pd.isna recognises None, NaN, NaT and pd.NA and returns False for
    # ordinary strings, whereas np.isnan raises TypeError on non-numeric input.
    if pd.isna(x):
        return -1
    return x
# Replace missing `employmentLength` values with the -1 sentinel in both
# tables. Each frame is updated by name instead of through a module-level
# `for data in [...]` loop, because that loop left a stray global (`data`)
# behind; the vectorized fillna also avoids a per-row Python call.
train_for_model_x['employmentLength'] = train_for_model_x['employmentLength'].fillna(-1)
testa_for_model['employmentLength'] = testa_for_model['employmentLength'].fillna(-1)

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly rather than wrapped in print() (which echoed "None").
testa_for_model.info()
train_for_model_x.info()

# Hold out 10% of the training rows for validation; the seed is fixed so the
# split is reproducible.
x_train, x_valid, y_train, y_valid = train_test_split(
    train_for_model_x, y, test_size=0.1, random_state=10
)
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import roc_auc_score
# Base learners combined by the voting ensemble.
clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=150,
    max_depth=50,
    min_child_weight=2,
    subsample=0.7,
    objective='binary:logistic',
)

# Soft voting averages the base learners' predicted probabilities. It is
# required here because a hard-voting VotingClassifier exposes no
# predict_proba, and ROC AUC needs a continuous score.
vclf = VotingClassifier(
    estimators=[('lr', clf1), ('rf', clf2), ('xgb', clf3)],
    voting='soft',
)
vclf = vclf.fit(x_train, y_train)

# Score with the probability of the greater class label (column 1 of
# predict_proba, since classes_ is sorted). Passing hard 0/1 predictions to
# roc_auc_score collapses the ROC curve to a single operating point and
# misreports the ranking quality.
print(roc_auc_score(y_valid, vclf.predict_proba(x_valid)[:, 1]))