• scikit-learn机器学习(四)使用决策树做分类,并画出决策树,随机森林对比


    数据来自 UCI 的皮马印第安人糖尿病数据集(Pima Indians Diabetes)

    载入数据

    # -*- coding: utf-8 -*-
    import pandas as pd
    import matplotlib
    matplotlib.rcParams['font.sans-serif']=[u'simHei']
    matplotlib.rcParams['axes.unicode_minus']=False
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import classification_report
    from sklearn.pipeline import Pipeline
    from sklearn.model_selection import GridSearchCV
    
    from sklearn.datasets import load_breast_cancer
    
    # Load the diabetes CSV: the first 8 columns are the features and the
    # column at index 8 is the label.
    data_set = pd.read_csv('pima-indians-diabetes.csv')
    data = data_set.values

    X = data[:, :8]
    y = data[:, 8]
    # Fixed seed so every run trains and evaluates on the same rows; without it
    # the scores change from run to run.
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=11)

    建立决策树,网格搜索微调模型

    # In[1] Tune the decision tree with a grid search
    pipeline = Pipeline([
        ('clf', DecisionTreeClassifier(criterion='entropy')),
    ])
    parameters = {
        'clf__max_depth': (3, 5, 10, 15, 20, 25, 30, 35, 40),
        'clf__min_samples_split': (2, 3),
        'clf__min_samples_leaf': (1, 2, 3),
    }
    # GridSearchCV walks through every parameter combination and uses
    # cross-validation to pick the best-scoring one.
    # verbose must be non-negative: current scikit-learn rejects negative
    # values, and 0 keeps the search silent.
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=0, scoring='f1')
    grid_search.fit(X_train, y_train)

    # Report the best cross-validated score and the winning parameters
    best_parameters = grid_search.best_estimator_.get_params()
    print("最好的F1值为:", grid_search.best_score_)
    print('最好的参数为:')
    for param_name in sorted(parameters):
        # The leading tab indents each parameter under the heading printed above
        print('\t%s: %r' % (param_name, best_parameters[param_name]))

    # In[2] Predict on the held-out split and print the evaluation report
    predictions = grid_search.predict(X_test)
    print(classification_report(y_test, predictions))
    最好的F1值为: 0.5573515325670498
    最好的参数为:
    tclf__max_depth: 5
    tclf__min_samples_leaf: 1
    tclf__min_samples_split: 2

    评价模型

    # In[2] Score the tuned model on the held-out test split
    predictions = grid_search.predict(X_test)
    print(classification_report(y_true=y_test, y_pred=predictions))
                  precision    recall  f1-score   support
    
             0.0       0.74      0.89      0.81       124
             1.0       0.67      0.43      0.52        68

    画出决策树

    # In[3] Render the decision tree
    from sklearn import tree
    # Feature labels for the plot: every column name except the last one (the label)
    feature_name = data_set.columns.values.tolist()[:-1]
    # NOTE(review): min_samples_leaf=5 differs from the grid-search result printed
    # earlier (min_samples_leaf=1) -- confirm which value is intended.
    DT = tree.DecisionTreeClassifier(criterion='entropy', max_depth=5,
                                     min_samples_split=2, min_samples_leaf=5)
    DT.fit(X_train, y_train)

    # export_graphviz maps class_names to the classes in ascending order
    # (0.0 first, then 1.0), so the name for class 0 must come first.
    # Assumes label 1 marks a diabetic patient, as in the standard Pima
    # data set -- TODO confirm against the CSV.
    class_names = ["无病", "有糖尿病"]

    # Option 1 (alternative, kept for reference): write a PDF through pydotplus.
    # io.StringIO replaces sklearn.externals.six.StringIO, which has been removed
    # from scikit-learn.
    # import pydotplus
    # from io import StringIO
    # dot_data = StringIO()
    # tree.export_graphviz(DT, out_file=dot_data, feature_names=feature_name,
    #                      class_names=class_names, filled=True, rounded=True,
    #                      special_characters=True)
    # graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    # graph.write_pdf("Tree.pdf")
    # print('Visible tree plot saved as pdf.')

    # Option 2: let the graphviz package render the DOT source
    import graphviz
    # export_graphviz needs a fitted estimator, so this has to run after DT.fit()
    dot_data = tree.export_graphviz(DT, out_file=None, feature_names=feature_name,
                                    class_names=class_names)
    graph = graphviz.Source(dot_data)
    # Saves the DOT source as "tree2" and the rendered file next to it
    # (tree2.pdf with graphviz's default output format)
    graph.render("tree2")

    随机森林

    # -*- coding: utf-8 -*-
    import pandas as pd
    import matplotlib
    matplotlib.rcParams['font.sans-serif']=[u'simHei']
    matplotlib.rcParams['axes.unicode_minus']=False
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import classification_report
    from sklearn.pipeline import Pipeline
    from sklearn.model_selection import GridSearchCV
    from sklearn.ensemble import RandomForestClassifier
    
    from sklearn.datasets import load_breast_cancer
    
    # Load the diabetes CSV: the first 8 columns are the features and the
    # column at index 8 is the label.
    data_set = pd.read_csv('pima-indians-diabetes.csv')
    data = data_set.values

    X = data[:, :8]
    y = data[:, 8]
    # Seed the split as well as the forest; seeding only the model still leaves
    # the train/test rows, and therefore the scores, different on every run.
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=11)

    # Fit a 10-tree random forest and report its metrics on the held-out split
    RF = RandomForestClassifier(n_estimators=10, random_state=11)
    RF.fit(X_train, y_train)
    predictions = RF.predict(X_test)
    print(classification_report(y_test, predictions))
                  precision    recall  f1-score   support
    
             0.0       0.82      0.91      0.86       126
             1.0       0.78      0.61      0.68        66
    
       micro avg       0.81      0.81      0.81       192
       macro avg       0.80      0.76      0.77       192
    weighted avg       0.80      0.81      0.80       192
  • 相关阅读:
    Windows Messenger 5.1 [Download from Microsoft]
    Building Web Parts for SPS读书笔记(2)
    Resources for KM & SharePoint Portal
    Microsoft.SharePoint.Menu Web Part [Free]
    忽略PNG透明区域的事件(AS/Flash)
    android webView 使用方法
    Android开发之Android开发规范(初)
    Android 下使用 JSON 实现 HTTP 请求
    html5 canvas 简单画板实现代码
    设定麦克风的声音品质
  • 原文地址:https://www.cnblogs.com/caiyishuai/p/11192156.html
Copyright © 2020-2023  润新知