• Python_sklearn机器学习库学习笔记(四)decision_tree(决策树)


    # 决策树

    import pandas as pd
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.cross_validation import train_test_split
    from sklearn.metrics import classification_report
    from sklearn.pipeline import Pipeline
    from sklearn.grid_search import GridSearchCV
    import zipfile
    #压缩节省空间
    z=zipfile.ZipFile('ad-dataset.zip')
    # df=pd.read_csv(z.open(z.namelist()[0]),header=None,low_memory=False)
    # df = pd.read_csv(z.open(z.namelist()[0]), header=None, low_memory=False)
    df=pd.read_csv('.\tree_data\ad.data',header=None)
    explanatory_variable_columns=set(df.columns.values)
    response_variable_column=df[len(df.columns.values)-1]
    #最后一列是代表的标签类型
    explanatory_variable_columns.remove(len(df.columns)-1)
    y=[1 if e =='ad.' else 0 for e in response_variable_column]
    X=df.loc[:,list(explanatory_variable_columns)]
    #匹配?字符,并把值转化为-1
    X.replace(to_replace=' *?', value=-1, regex=True, inplace=True)
    X_train,X_test,y_train,y_test=train_test_split(X,y)
    #用信息增益启发式算法建立决策树
    pipeline=Pipeline([('clf',DecisionTreeClassifier(criterion='entropy'))])
    parameters = {
    'clf__max_depth': (150, 155, 160),
    'clf__min_samples_split': (1, 2, 3),
    'clf__min_samples_leaf': (1, 2, 3)
    }
    #f1查全率和查准率的调和平均
    grid_search=GridSearchCV(pipeline,parameters,n_jobs=-1,
                             verbose=1,scoring='f1')
    grid_search.fit(X_train,y_train)
    print '最佳效果:%0.3f'%grid_search.best_score_
    print '最优参数'
    best_parameters=grid_search.best_estimator_.get_params()
    best_parameters

    输出结果:

    Fitting 3 folds for each of 27 candidates, totalling 81 fits
    
     
    [Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   21.0s
    [Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   34.7s finished
    
     
    最佳效果:0.888
    最优参数
    
    Out[123]:
    {'clf': DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=160,
                 max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
                 min_samples_split=3, min_weight_fraction_leaf=0.0,
                 presort=False, random_state=None, splitter='best'),
     'clf__class_weight': None,
     'clf__criterion': 'entropy',
     'clf__max_depth': 160,
     'clf__max_features': None,
     'clf__max_leaf_nodes': None,
     'clf__min_samples_leaf': 1,
     'clf__min_samples_split': 3,
     'clf__min_weight_fraction_leaf': 0.0,
     'clf__presort': False,
     'clf__random_state': None,
     'clf__splitter': 'best',
     'steps': [('clf',
       DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=160,
                   max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
                   min_samples_split=3, min_weight_fraction_leaf=0.0,
                   presort=False, random_state=None, splitter='best'))]}
    for param_name in sorted(parameters.keys()):
        print ('	%s:%r'%(param_name,best_parameters[param_name]))
    predictions=grid_search.predict(X_test)
    print classification_report(y_test,predictions)

    输出结果:

    clf__max_depth:150
    clf__min_samples_leaf:1
    clf__min_samples_split:1
                 precision    recall  f1-score   support

              0       0.97      0.99      0.98       703
              1       0.91      0.84      0.87       117

    avg / total       0.96      0.96      0.96       820

    df.head()

    输出结果;

     0123456789...1549155015511552155315541555155615571558
    0 125 125 1.0 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 ad.
    1 57 468 8.2105 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 ad.
    2 33 230 6.9696 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 ad.
    3 60 468 7.8 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 ad.
    4 60 468 7.8 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 ad.

     # 决策树集成

    #coding:utf-8
    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.cross_validation import train_test_split
    from sklearn.metrics import classification_report
    from sklearn.pipeline import Pipeline
    from sklearn.grid_search import GridSearchCV
    
    df=pd.read_csv('.\tree_data\ad.data',header=None,low_memory=False)
    explanatory_variable_columns=set(df.columns.values)
    response_variable_column=df[len(df.columns.values)-1]
    df.head()
     0123456789...1549155015511552155315541555155615571558
    0 125 125 1.0 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 ad.
    1 57 468 8.2105 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 ad.
    2 33 230 6.9696 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 ad.
    3 60 468 7.8 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 ad.
    4 60 468 7.8 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 ad.
    #The last column describes the targets(去掉最后一列)
    explanatory_variable_columns.remove(len(df.columns.values)-1)
    y=[1 if e=='ad.' else 0 for e in response_variable_column]
    X=df.loc[:,list(explanatory_variable_columns)]
    #置换有?的为-1
    X.replace(to_replace=' *?', value=-1, regex=True, inplace=True)
    X_train,X_test,y_train,y_test=train_test_split(X,y)
    pipeline=Pipeline([('clf',RandomForestClassifier(criterion='entropy'))])
    parameters = {
    'clf__n_estimators': (5, 10, 20, 50),
    'clf__max_depth': (50, 150, 250),
    'clf__min_samples_split': (1, 2, 3),
    'clf__min_samples_leaf': (1, 2, 3)
    }
    grid_search = GridSearchCV(pipeline,parameters,n_jobs=-1,verbose=1,scoring='f1')
    grid_search.fit(X_train,y_train)
    print(u'最佳效果:%0.3f'%grid_search.best_score_)
    print u'最优的参数:'
    best_parameters=grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print('	%s:%r'%(param_name,best_parameters[param_name]))

    输出结果:

    最佳效果:0.929 最优的参数: clf__max_depth:250 clf__min_samples_leaf:1 clf__min_samples_split:3 clf__n_estimators:50
    predictions=grid_search.predict(X_test)
    print classification_report(y_test,predictions)

    输出结果:

         precision    recall  f1-score   support

              0       0.98      1.00      0.99       705
              1       0.97      0.90      0.93       115

    avg / total       0.98      0.98      0.98       820

  • 相关阅读:
    Java学习日记Ⅰ
    docker 安装redis
    maven 打包 把第三方包也打进去
    wiki 配置数据源 编码要是utf8 不能是utf8mb4
    SCFT用公钥登录
    配置tomcat重启脚本
    tomcat
    centos7 搭建rabbitmq服务 3.7.15
    安装openoffice
    tomcat 日期切分
  • 原文地址:https://www.cnblogs.com/wuchuanying/p/6251216.html
Copyright © 2020-2023  润新知