# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
import missingno as msno
import pandas_profiling
# Settings
pd.set_option("display.max_columns", 100)

# Load the data
data_init = pd.read_csv("训练数据.csv", encoding="gbk")
data_filer = data_init.copy()  # working copy; the original notes use data_filer without showing this assignment

# Preprocessing
# 1. Inspect the distinct values of a single column
data_filer["BufferCounter"].unique()
# 2. List all columns
list(data_filer.columns)

# Type conversion
data_filer[['RamUsage', 'CpuUsage', 'VideoTotleTraffic']] = data_filer[
    ['RamUsage', 'CpuUsage', 'VideoTotleTraffic']].apply(pd.to_numeric, errors="ignore")

# Drop rows with a missing Latitude
data_filer = data_filer.dropna(subset=["Latitude"])

# Missing-value handling
msno.matrix(data_filer)
RX = np.mean(data_filer["RX"])
data_filer["RX"].fillna(RX, inplace=True)

# Relationships between attributes
data_filer.corr()

# One-hot encoding (X is the feature frame selected from data_filer later in these notes)
X_City = pd.get_dummies(X["City"])
X = pd.concat([X, X_City], axis=1)
X = X.drop(["City"], axis=1)

# Datetime handling
data_filer["VideoTestTime"] = pd.to_datetime(data_filer["VideoTestTime"])
X["year"] = X["VideoTestTime"].apply(lambda x: x.year)
X["month"] = X["VideoTestTime"].apply(lambda x: x.month)
X["day"] = X["VideoTestTime"].apply(lambda x: x.day)
X["hour"] = X["VideoTestTime"].apply(lambda x: x.hour)
X["minute"] = X["VideoTestTime"].apply(lambda x: x.minute)
X_data = X.drop(["VideoTestTime"], axis=1)

# Handling individual columns
# Sorting
data2 = data_base[data_base["p_date"] == data_base["dateBefore"]].sort_values(by="enodebid")

import re

def f(x):
    """Extract the leading letters of a phone-type string."""
    try:
        m = re.search("[a-zA-Z]+s", x)
        if m:
            return m.group()
        m2 = re.search("[a-zA-Z]+", x)
        if m2:
            return m2.group()
        return "other2"
    except Exception:
        return "other"

data_pho["PhoneTypenew"] = data_pho["PhoneType"].apply(f)

# Age column handling
# Fill missing Age values with the mean age per name title; also demonstrates iterating over a Series
Age_Pre = data[["Age", "NameTitle"]].groupby("NameTitle")["Age"].mean()
type(Age_Pre)
for index, value in Age_Pre.items():
    data.loc[(data.Age.isnull()) & (data.NameTitle == index), "Age"] = Age_Pre[index]

# Mapping Age into buckets
dataset.loc[dataset['Age'] <= 16, 'Age'] = 0
dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
dataset.loc[dataset['Age'] > 64, 'Age'] = 4

# Mapping Sex to integers
sex = {"male": 0, "female": 1}
dataset["Sex"] = dataset["Sex"].map(sex)
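pandas_profiling is imported at the top but never used in these notes. A minimal sketch of how it could cover the same exploratory step, assuming the classic pandas-profiling API:

# Assumed usage, not shown in the original notes: generate an HTML profiling report for the cleaned frame
profile = pandas_profiling.ProfileReport(data_filer)
profile.to_file("data_filer_report.html")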
# Feature engineering
players['birth_date'] = pd.to_datetime(players.birthday, format='%d.%m.%Y')
players['age_years'] = ((pd.to_datetime("2013-01-01") - players['birth_date']).dt.days) / 365.25
players['age_years']
# Handle discrete category values: create higher-level groupings
position_types = players.position.unique()
position_types
# array(['Center Back', 'Attacking Midfielder', 'Right Midfielder',
#        'Center Midfielder', 'Goalkeeper', 'Defensive Midfielder',
#        'Left Fullback', nan, 'Left Midfielder', 'Right Fullback',
#        'Center Forward', 'Left Winger', 'Right Winger'], dtype=object)

defense = ['Center Back', 'Defensive Midfielder', 'Left Fullback', 'Right Fullback']
midfield = ['Right Midfielder', 'Center Midfielder', 'Left Midfielder']
forward = ['Attacking Midfielder', 'Left Winger', 'Right Winger', 'Center Forward']
keeper = 'Goalkeeper'

# Modify the dataframe -- add the aggregated position categorical position_agg
players.loc[players['position'].isin(defense), 'position_agg'] = "Defense"
players.loc[players['position'].isin(midfield), 'position_agg'] = "Midfield"
players.loc[players['position'].isin(forward), 'position_agg'] = "Forward"
players.loc[players['position'].eq(keeper), 'position_agg'] = "Keeper"
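The raw position column contains a nan, so players with a missing position get no position_agg value. A quick sanity check (not part of the original notes) shows the bucket sizes and any unmapped rows:

# Count players per aggregated position; NaN counts the unmapped rows
players['position_agg'].value_counts(dropna=False)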
X = data_filer[['RamUsage', 'CpuUsage', 'Longitude', 'Latitude', 'City', 'Source',
                'NetType', 'APN/SSID', 'RX', 'L_SINR', 'LteRsrq', 'CI',
                'VideoAvgSpeed', 'VideoPeakSpeed', 'VideoTestTime', 'VideoTotleTraffic']]
y = data_filer["BufferCounter"]

# Visualization
data_pho["PhoneType"].value_counts()[0:20].plot(kind="bar")
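The modelling cells below use X_train/X_test/y_train/y_test, and the stacking section uses xtrain/xtest/ytrain/ytest, but the notes never show how these sets are produced. A minimal sketch, assuming a simple hold-out split of the engineered features X_data against the target y:

from sklearn.model_selection import train_test_split

# Assumed hold-out split; the original notes do not show this step.
# random_state matches the SEED (666) used for the models below.
X_train, X_test, y_train, y_test = train_test_split(
    X_data, y, test_size=0.25, random_state=666)

# The stacking section later refers to the same sets with lowercase names.
xtrain, xtest, ytrain, ytest = X_train, X_test, y_train, y_test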
# Ensemble multiple models below
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.kernel_approximation import Nystroem
from sklearn.kernel_approximation import RBFSampler
from sklearn.pipeline import make_pipeline
Ensemble algorithms
from sklearn.metrics import roc_auc_score  # needed by score_models below

SEED = 666

def get_models():
    """Generate a library of base learners."""
    nb = GaussianNB()
    svc = SVC(C=100, probability=True)
    knn = KNeighborsClassifier(n_neighbors=3)
    lr = LogisticRegression(C=100, random_state=SEED)
    nn = MLPClassifier((80, 10), early_stopping=False, random_state=SEED)
    gb = GradientBoostingClassifier(n_estimators=100, random_state=SEED)
    rf = RandomForestClassifier(n_estimators=10, max_features=3, random_state=SEED)

    models = {'svm': svc,
              'knn': knn,
              'naive bayes': nb,
              'mlp-nn': nn,
              'random forest': rf,
              'gbm': gb,
              'logistic': lr,
              }
    return models


def train_predict(model_list):
    """Fit models in list on training set and return preds"""
    P = np.zeros((y_test.shape[0], len(model_list)))
    P = pd.DataFrame(P)

    print("Fitting models.")
    cols = list()
    for i, (name, m) in enumerate(model_list.items()):
        print("%s..." % name, end=" ", flush=False)
        m.fit(X_train, y_train)
        P.iloc[:, i] = m.predict_proba(X_test)[:, 1]
        cols.append(name)
        print("done")

    P.columns = cols
    print("Done.\n")
    return P


def score_models(P, y):
    """Score model in prediction DF"""
    print("Scoring models.")
    for m in P.columns:
        score = roc_auc_score(y, P.loc[:, m])
        print("%-26s: %.3f" % (m, score))
    print("Done.\n")
# Fit each of the classifiers defined above
models = get_models()
P = train_predict(models)
score_models(P, y_test)
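Averaging helps most when the base learners make errors that are not strongly correlated. A small diagnostic (not in the original notes), reusing the seaborn import from the top, visualizes how correlated the predicted probabilities are:

# Correlation between base-learner probability predictions;
# lower correlation leaves more room for the ensemble to improve on any single model.
sns.heatmap(P.corr(), annot=True, fmt=".2f", cmap="coolwarm")
plt.title("Correlation of base learner predictions")
plt.show()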
# Plot the ROC curves
from sklearn.metrics import roc_curve

def plot_roc_curve(ytest, P_base_learners, P_ensemble, labels, ens_label):
    """Plot the roc curve for base learners and ensemble."""
    plt.figure(figsize=(10, 8))
    plt.plot([0, 1], [0, 1], 'k--')

    cm = [plt.cm.rainbow(i)
          for i in np.linspace(0, 1.0, P_base_learners.shape[1] + 1)]

    for i in range(P_base_learners.shape[1]):
        p = P_base_learners[:, i]
        fpr, tpr, _ = roc_curve(ytest, p)
        plt.plot(fpr, tpr, label=labels[i], c=cm[i + 1])

    fpr, tpr, _ = roc_curve(ytest, P_ensemble)
    plt.plot(fpr, tpr, label=ens_label, c=cm[0])
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')
    plt.legend(frameon=False)
    plt.show()
plot_roc_curve(y_test, P.values, P.mean(axis=1), list(P.columns), "ensemble")
# Drop the single worst base learner
include = [c for c in P.columns if c not in ["mlp-nn"]]
print("Truncated ensemble ROC-AUC score: %.3f" % roc_auc_score(y_test, P.loc[:, include].mean(axis=1)))
# Visualize each model's predicted class shares
p = P.apply(lambda x: 1*(x >= 0.5).value_counts(normalize=True))
# Note: the "DEM"/"REP" labels and the 0.25 reference line appear to be carried over
# from a different two-class example; relabel them for the actual target.
p.index = ["DEM", "REP"]
p.loc["REP", :].sort_values().plot(kind="bar")
plt.axhline(0.25, color="k", linewidth=0.5)
plt.text(0., 0.23, "True share republicans")
plt.show()
Stacking model
1. Define the base learners
base_learners = get_models()
2. Define the weighting model (the second-layer meta learner)
meta_learner = GradientBoostingClassifier(
    n_estimators=1000,
    loss="exponential",
    max_features=4,
    max_depth=3,
    subsample=0.5,
    learning_rate=0.005,
    random_state=SEED
)
3. Split the base-learner training data in two, with one half reserved for the second layer
xtrain_base, xpred_base, ytrain_base, ypred_base = train_test_split(
    xtrain, ytrain, test_size=0.5, random_state=SEED)
4. Train the base learners
def train_base_learners(base_learners, inp, out, verbose=True):
    """Train all base learners in the library."""
    if verbose:
        print("Fitting models.")
    for i, (name, m) in enumerate(base_learners.items()):
        if verbose:
            print("%s..." % name, end=" ", flush=False)
        m.fit(inp, out)
        if verbose:
            print("done")
train_base_learners(base_learners, xtrain_base, ytrain_base)
5. Prepare the training data for the second-stage (meta) classifier
def predict_base_learners(pred_base_learners, inp, verbose=True):
    """Generate a prediction matrix."""
    P = np.zeros((inp.shape[0], len(pred_base_learners)))

    if verbose:
        print("Generating base learner predictions.")
    for i, (name, m) in enumerate(pred_base_learners.items()):
        if verbose:
            print("%s..." % name, end=" ", flush=False)
        p = m.predict_proba(inp)
        # With two classes, need only predictions for one class
        P[:, i] = p[:, 1]
        if verbose:
            print("done")

    return P
P_base = predict_base_learners(base_learners, xpred_base)
6. Train the second stage and produce the final classification results
meta_learner.fit(P_base, ypred_base)
def ensemble_predict(base_learners, meta_learner, inp, verbose=True):
    """Generate predictions from the ensemble."""
    P_pred = predict_base_learners(base_learners, inp, verbose=verbose)
    return P_pred, meta_learner.predict_proba(P_pred)[:, 1]
P_pred, p = ensemble_predict(base_learners, meta_learner, xtest)
print("\nEnsemble ROC-AUC score: %.3f" % roc_auc_score(ytest, p))
The steps above sacrifice half of the training data to fit the meta learner, so next we use cross-validation instead.
from sklearn.base import clone

def stacking(base_learners, meta_learner, X, y, generator):
    """Simple training routine for stacking."""

    # Train final base learners for test time
    print("Fitting final base learners...", end="")
    train_base_learners(base_learners, X, y, verbose=False)
    print("done")

    # Generate predictions for training meta learners
    # Outer loop:
    print("Generating cross-validated predictions...")
    cv_preds, cv_y = [], []
    for i, (train_idx, test_idx) in enumerate(generator.split(X)):

        fold_xtrain, fold_ytrain = X[train_idx, :], y[train_idx]
        fold_xtest, fold_ytest = X[test_idx, :], y[test_idx]

        # Inner loop: step 4 and 5
        fold_base_learners = {name: clone(model)
                              for name, model in base_learners.items()}
        train_base_learners(
            fold_base_learners, fold_xtrain, fold_ytrain, verbose=False)

        fold_P_base = predict_base_learners(
            fold_base_learners, fold_xtest, verbose=False)

        cv_preds.append(fold_P_base)
        cv_y.append(fold_ytest)
        print("Fold %i done" % (i + 1))

    print("CV-predictions done")

    # Be careful to get rows in the right order
    cv_preds = np.vstack(cv_preds)
    cv_y = np.hstack(cv_y)

    # Train meta learner
    print("Fitting meta learner...", end="")
    meta_learner.fit(cv_preds, cv_y)
    print("done")

    return base_learners, meta_learner
from sklearn.model_selection import KFold

# Train with stacking
cv_base_learners, cv_meta_learner = stacking(
    get_models(), clone(meta_learner), xtrain.values, ytrain.values, KFold(2))

P_pred, p = ensemble_predict(cv_base_learners, cv_meta_learner, xtest, verbose=False)
print("\nEnsemble ROC-AUC score: %.3f" % roc_auc_score(ytest, p))
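To see where the stacked ensemble sits relative to its base learners, the plot_roc_curve helper defined earlier can be reused on the predictions just produced. This call is not in the original notes; it assumes the label order matches the dict returned by get_models():

# ROC curves of the CV-trained base learners plus the stacked ensemble on the test set
plot_roc_curve(ytest, P_pred, p, list(cv_base_learners.keys()), "stacked ensemble")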
A parallel implementation for better efficiency:
from mlens.ensemble import SuperLearner

# Instantiate the ensemble with 10 folds
sl = SuperLearner(
    folds=10,
    random_state=SEED,
    verbose=2,
    backend="multiprocessing"
)

# Add the base learners and the meta learner
sl.add(list(base_learners.values()), proba=True)
sl.add_meta(meta_learner, proba=True)

# Train the ensemble
sl.fit(xtrain, ytrain)

# Predict the test set
p_sl = sl.predict_proba(xtest)

print("\nSuper Learner ROC-AUC score: %.3f" % roc_auc_score(ytest, p_sl[:, 1]))