直接给代码:
"""Tree-based ensemble and kernel SVM demos on scikit-learn toy datasets.

Each function is a self-contained experiment that prints its results (two of
them also draw a matplotlib figure).  Running the file as a script executes the
SVM feature-scaling experiment.

NOTE(review): the original listing carried a GBK source-encoding declaration.
It is left out here on the assumption that the file is stored as UTF-8; put it
back as the very first line if the file really is GBK-encoded.
"""
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import export_graphviz
import pandas as pd
import graphviz
import mglearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_moons
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from pylab import *  # noqa: F401,F403 - kept for backward compatibility
# Explicit bindings for the two names the code below actually relies on, so
# they no longer depend on the star import above.
import numpy as np
import matplotlib.pyplot as plt


def 决策树(dot_file=None) -> None:
    """Fit an unpruned decision tree on the breast-cancer data and report it.

    Prints the training matrix, the shape of the training labels, the
    train/test accuracy and the per-feature importances.

    Args:
        dot_file: Optional path.  When given, the fitted tree is exported in
            Graphviz DOT format to that path (class names "malignant" /
            "benign", coloured nodes, impurity hidden).  The default ``None``
            skips the export, which matches the original behaviour where the
            export code was disabled.
    """
    cancer = load_breast_cancer()
    X_train, X_test, y_train, y_test = train_test_split(
        cancer.data, cancer.target, stratify=cancer.target, random_state=42)
    tree = DecisionTreeClassifier(random_state=0)
    print(X_train)
    print(y_train.shape)
    tree.fit(X_train, y_train)
    print("Accuracy on training set: {:.3f}".format(tree.score(X_train, y_train)))
    print("Accuracy on test set: {:.3f}".format(tree.score(X_test, y_test)))

    if dot_file is not None:
        # The resulting file can be rendered with graphviz, e.g.
        # graphviz.Source(open(dot_file).read()) inside a notebook.
        export_graphviz(
            tree,
            out_file=dot_file,
            class_names=["malignant", "benign"],
            feature_names=cancer.feature_names,
            impurity=False,
            filled=True,
        )

    print("特征的重要: {}".format(tree.feature_importances_))


def 随机森林() -> None:
    """Fit a five-tree random forest on two-moons data and plot its trees.

    Prints the test predictions, the true test labels and the test accuracy,
    then draws a 2x3 figure: one panel per individual tree and a final panel
    with the decision boundary of the whole forest.
    """
    X, y = make_moons(n_samples=100, noise=0.25, random_state=3)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, random_state=42)

    # A forest made of five trees.
    forest = RandomForestClassifier(n_estimators=5, random_state=2)
    forest.fit(X_train, y_train)
    y_pred = forest.predict(X_test)
    print(y_pred)
    print(y_test)
    print(np.mean(y_test == y_pred))

    _, axes = plt.subplots(2, 3, figsize=(20, 10))
    # zip() stops after the five estimators, leaving the sixth panel free for
    # the combined forest below.
    for i, (ax, tree) in enumerate(zip(axes.ravel(), forest.estimators_)):
        ax.set_title("Tree {}".format(i))
        mglearn.plots.plot_tree_partition(X_train, y_train, tree, ax=ax)

    mglearn.plots.plot_2d_separator(
        forest, X_train, fill=True, ax=axes[-1, -1], alpha=.4)
    axes[-1, -1].set_title("Random Forest")
    # No ax is passed here; presumably this draws on the current axes (the
    # last subplot) - verify against mglearn.discrete_scatter.
    mglearn.discrete_scatter(X_train[:, 0], X_train[:, 1], y_train)
    # Display the figure, as SVM向量机简易 does for its own plot.
    plt.show()


def 梯度提升树() -> None:
    """Fit gradient-boosted trees on the breast-cancer data.

    Uses a lowered learning rate (0.01) and prints the test predictions
    followed by the test accuracy.
    """
    cancer = load_breast_cancer()
    X_train, X_test, y_train, y_test = train_test_split(
        cancer.data, cancer.target, random_state=0)
    # Alternative that was tried in the original listing: limit the tree depth
    # instead, i.e. GradientBoostingClassifier(random_state=0, max_depth=1).
    gbrt = GradientBoostingClassifier(random_state=0, learning_rate=0.01)
    gbrt.fit(X_train, y_train)
    y_pred = gbrt.predict(X_test)
    print(y_pred)
    print(np.mean(y_pred == y_test))


def SVM向量机简易() -> None:
    """Fit an RBF-kernel SVM on mglearn's handcrafted 2-D data and plot it.

    Draws the decision boundary, the data points and the support vectors,
    then shows the figure.
    """
    X, y = mglearn.tools.make_handcrafted_dataset()
    # gamma controls the width of the Gaussian kernel: it determines how large
    # a distance still counts as two points being "close".
    # C is the regularisation parameter, similar to the one used in linear
    # models: it limits the importance of each point (more precisely, each
    # point's dual_coef_).
    svm = SVC(kernel='rbf', C=10, gamma=0.1).fit(X, y)
    mglearn.plots.plot_2d_separator(svm, X, eps=.5)
    mglearn.discrete_scatter(X[:, 0], X[:, 1], y)

    # Plot the support vectors; they are labelled by the sign of their dual
    # coefficients.
    sv = svm.support_vectors_
    sv_labels = svm.dual_coef_.ravel() > 0
    mglearn.discrete_scatter(sv[:, 0], sv[:, 1], sv_labels, s=15, markeredgewidth=3)
    plt.xlabel("Feature 0")
    plt.ylabel("Feature 1")
    plt.show()


def 预处理向量机数据() -> None:
    """Compare an SVC on raw versus min-max scaled breast-cancer features.

    Prints the test accuracy on the raw features, the per-feature minimum and
    maximum of the scaled training data, and finally the test accuracy after
    scaling.
    """
    cancer = load_breast_cancer()
    X_train, X_test, y_train, y_test = train_test_split(
        cancer.data, cancer.target, random_state=0)

    # Baseline: default SVC on the unscaled features.
    svc = SVC()
    svc.fit(X_train, y_train)
    y_pred = svc.predict(X_test)
    print(np.mean(y_pred == y_test))

    # Preprocessing - scaling.  Minimum and range are computed per feature on
    # the training set only.
    min_on_training = X_train.min(axis=0)
    range_on_training = (X_train - min_on_training).max(axis=0)

    X_train_scaled = (X_train - min_on_training) / range_on_training
    # Transform the test set with the training-set statistics, not its own.
    X_test_scaled = (X_test - min_on_training) / range_on_training
    print("Minimum for each feature {}".format(X_train_scaled.min(axis=0)))
    print("Maximum for each feature {}".format(X_train_scaled.max(axis=0)))

    # Refit on the scaled data and evaluate on the scaled test set.
    svc = SVC()
    svc.fit(X_train_scaled, y_train)
    y_pred = svc.predict(X_test_scaled)
    print(np.mean(y_pred == y_test))


if __name__ == '__main__':
    预处理向量机数据()