Revisiting sklearn classifiers


    I've been reading through the sklearn documentation over the past few days and noticed that it ships quite a few classifiers; here are some brief notes.

    These classifiers fall roughly into two groups: 1) single classifiers, and 2) ensemble classifiers.

    I. Single classifiers

    The following example compares the performance of several single classifiers.

    from sklearn.model_selection import cross_val_score  # sklearn.cross_validation was removed in 0.20
    from sklearn.datasets import make_blobs
    
    # the classifiers to compare
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.svm import SVC
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.ensemble import ExtraTreesClassifier
    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.ensemble import GradientBoostingClassifier
    
    from sklearn.naive_bayes import GaussianNB
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
    from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
    
    
    classifiers = {
        'KN': KNeighborsClassifier(3),
        'SVC_linear': SVC(kernel="linear", C=0.025),
        'SVC_rbf': SVC(gamma=2, C=1),
        'DT': DecisionTreeClassifier(max_depth=5),
        'RF': RandomForestClassifier(n_estimators=10, max_depth=5, max_features=1),  # clf.feature_importances_
        'ET': ExtraTreesClassifier(n_estimators=10, max_depth=None),  # clf.feature_importances_
        'AB': AdaBoostClassifier(n_estimators=100),
        'GB': GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0),  # clf.feature_importances_
        'GNB': GaussianNB(),
        'LD': LinearDiscriminantAnalysis(),
        'QD': QuadraticDiscriminantAnalysis()}
    
    
    X, y = make_blobs(n_samples=10000, n_features=10, centers=100, random_state=0)
    
    
    for name, clf in classifiers.items():
        scores = cross_val_score(clf, X, y)
        print(name, '\t-->', scores.mean())

    (Figure in the original post: a comparison of the classifiers' cross-validation scores.)

    II. Ensemble classifiers

    There are four of these: Bagging, Voting, GridSearch, and Pipeline. Strictly speaking only the first two are ensemble methods; GridSearch is hyperparameter search and the last one, Pipeline, is really a pipelining technique, but all four are meta-estimators that wrap other classifiers.

    1. Bagging

    from sklearn.ensemble import BaggingClassifier
    from sklearn.neighbors import KNeighborsClassifier
    
    # each base KNN is fit on 50% of the samples and 50% of the features
    base_clf = KNeighborsClassifier()
    bg_clf = BaggingClassifier(base_clf, max_samples=0.5, max_features=0.5)
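
    The snippet above only constructs the estimator. Below is a minimal usage sketch; the make_blobs parameters are arbitrary, chosen just for illustration.

    from sklearn.datasets import make_blobs
    from sklearn.ensemble import BaggingClassifier
    from sklearn.model_selection import cross_val_score
    from sklearn.neighbors import KNeighborsClassifier
    
    # toy data (parameters are illustrative only)
    X, y = make_blobs(n_samples=1000, n_features=10, centers=10, random_state=0)
    
    bg_clf = BaggingClassifier(KNeighborsClassifier(), max_samples=0.5, max_features=0.5)
    print('Bagging(KNN) -->', cross_val_score(bg_clf, X, y).mean())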

    2. Voting

    from sklearn import datasets
    from sklearn.model_selection import cross_val_score  # replaces the removed sklearn.cross_validation
    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import GaussianNB
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.ensemble import VotingClassifier
    
    iris = datasets.load_iris()
    X, y = iris.data[:, 1:3], iris.target
    
    clf1 = LogisticRegression(random_state=1)
    clf2 = RandomForestClassifier(random_state=1)
    clf3 = GaussianNB()
    
    # majority vote, with the votes of LogisticRegression and GaussianNB counted twice
    eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard', weights=[2, 1, 2])
    
    for clf, label in zip([clf1, clf2, clf3, eclf], ['Logistic Regression', 'Random Forest', 'naive Bayes', 'Ensemble']):
        scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy')
        print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

    3. GridSearch

    from time import time
    from scipy.stats import randint as sp_randint
    
    from sklearn.datasets import load_digits
    from sklearn.ensemble import RandomForestClassifier
    # sklearn.grid_search was removed in 0.20; both searches now live in model_selection
    from sklearn.model_selection import GridSearchCV
    from sklearn.model_selection import RandomizedSearchCV
    
    # generate data
    digits = load_digits()
    X, y = digits.data, digits.target
    
    # base estimator
    meta_clf = RandomForestClassifier(n_estimators=20)
    
    # =================================================================
    # parameter distributions (min_samples_split must be >= 2)
    param_dist = {"max_depth": [3, None],
                  "max_features": sp_randint(1, 11),
                  "min_samples_split": sp_randint(2, 11),
                  "min_samples_leaf": sp_randint(1, 11),
                  "bootstrap": [True, False],
                  "criterion": ["gini", "entropy"]}
    
    # run RandomizedSearch: samples n_iter random parameter combinations
    n_iter_search = 20
    rs_clf = RandomizedSearchCV(meta_clf, param_distributions=param_dist,
                                n_iter=n_iter_search)
    
    start = time()
    rs_clf.fit(X, y)
    print("RandomizedSearchCV took %.2f seconds for %d candidate"
          " parameter settings." % ((time() - start), n_iter_search))
    print(rs_clf.cv_results_)  # grid_scores_ was renamed to cv_results_
    
    # =================================================================
    # parameter grid: every combination is tried
    param_grid = {"max_depth": [3, None],
                  "max_features": [1, 3, 10],
                  "min_samples_split": [2, 3, 10],
                  "min_samples_leaf": [1, 3, 10],
                  "bootstrap": [True, False],
                  "criterion": ["gini", "entropy"]}
    
    # run GridSearch
    gs_clf = GridSearchCV(meta_clf, param_grid=param_grid)
    start = time()
    gs_clf.fit(X, y)
    
    print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
          % (time() - start, len(gs_clf.cv_results_['params'])))
    print(gs_clf.cv_results_)
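
    cv_results_ is a fairly large dict; in practice the attributes below are what one usually inspects after a search. A short follow-up, continuing from the example above:

    # the winning configuration (both search objects expose the same attributes)
    print(gs_clf.best_score_)       # best mean cross-validated score
    print(gs_clf.best_params_)      # the parameter setting that achieved it
    print(gs_clf.best_estimator_)   # a copy of meta_clf refit on the full data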

    4. Pipeline

    First example

    from sklearn import svm
    from sklearn.datasets import make_classification  # samples_generator is a removed private module
    from sklearn.feature_selection import SelectKBest
    from sklearn.feature_selection import f_regression
    from sklearn.pipeline import Pipeline
    
    # generate data
    X, y = make_classification(n_informative=5, n_redundant=0, random_state=42)
    
    # define the Pipeline: univariate feature selection first, then SVM
    anova_filter = SelectKBest(f_regression, k=5)
    clf = svm.SVC(kernel='linear')
    pipe = Pipeline([('anova', anova_filter), ('svc', clf)])
    
    # set anova's k=10 and svc's C=0.1 (step name and parameter name are joined by a double underscore "__")
    pipe.set_params(anova__k=10, svc__C=.1)
    pipe.fit(X, y)
    
    prediction = pipe.predict(X)
    
    print(pipe.score(X, y))
    
    # boolean mask of the features selected by anova_filter
    s = pipe.named_steps['anova'].get_support()
    print(s)
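
    get_support() returns a boolean mask; passing indices=True yields the selected column numbers directly, and named_steps likewise exposes the fitted SVC. A small follow-up, continuing from the example above:

    # column indices of the 10 features kept by SelectKBest
    idx = pipe.named_steps['anova'].get_support(indices=True)
    print(idx)
    
    # the fitted linear SVC inside the pipeline: one weight per selected feature
    print(pipe.named_steps['svc'].coef_.shape)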

    Second example

    import numpy as np
    
    from sklearn import linear_model, decomposition, datasets
    from sklearn.pipeline import Pipeline
    from sklearn.model_selection import GridSearchCV  # replaces the removed sklearn.grid_search
    
    
    digits = datasets.load_digits()
    X_digits = digits.data
    y_digits = digits.target
    
    # define the pipeline: dimensionality reduction (PCA) first, then logistic regression
    pca = decomposition.PCA()
    logistic = linear_model.LogisticRegression()
    pipe = Pipeline(steps=[('pca', pca), ('logistic', logistic)])
    
    # the pipeline itself becomes the estimator passed to the grid search
    n_components = [20, 40, 64]
    Cs = np.logspace(-4, 4, 3)
    estimator = GridSearchCV(pipe, dict(pca__n_components=n_components, logistic__C=Cs))
    
    estimator.fit(X_digits, y_digits)
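
    Because the grid is defined over the pipeline's step parameters (again with the "__" syntax), the search tunes the PCA dimensionality and the classifier's regularization jointly. The best combination can then be read off as usual:

    # best PCA dimensionality and regularization strength found by the search
    print(estimator.best_params_)
    print(estimator.best_score_)
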
    Original post: https://www.cnblogs.com/hhh5460/p/5132203.html