• Training Boosted Trees with tf.estimator


    1, Download the dataset

    !pip install -q tf-nightly  # Requires tf 1.13
    from __future__ import absolute_import, division, print_function

    import numpy as np
    import pandas as pd
    import tensorflow as tf

    tf.enable_eager_execution()

    tf.logging.set_verbosity(tf.logging.ERROR)
    tf.set_random_seed(123)

    # Load dataset.
    dftrain = pd.read_csv('https://storage.googleapis.com/tfbt/titanic_train.csv')
    dfeval = pd.read_csv('https://storage.googleapis.com/tfbt/titanic_eval.csv')
    y_train = dftrain.pop('survived')
    y_eval = dfeval.pop('survived')

    2, A series of data checks

    These two distribution plots are particularly worth a look:

    dftrain.sex.value_counts().plot(kind='barh');
    (dftrain['class']
      .value_counts()
      .plot(kind='barh'));
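
    A few more quick checks in the same spirit (a sketch, using the dftrain / y_train objects loaded in step 1):

    dftrain.age.hist(bins=20);        # age distribution
    dftrain.describe()                # summary statistics for the numeric columns
    # Survival rate by sex (concatenate the popped label back for the groupby).
    pd.concat([dftrain, y_train], axis=1) \
      .groupby('sex').survived.mean() \
      .plot(kind='barh', title='% survive');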

    3, Create feature columns and input functions

    3.1, One-hot encoding, normalization, and bucketization
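
    The one-hot encoding is implemented in 3.2 below. Normalization and bucketization are not used in the rest of this walkthrough, but here is a minimal sketch of what they look like as feature columns (assuming the dftrain DataFrame from step 1; the bucket boundaries are illustrative only):

    fc = tf.feature_column

    # Normalization: rescale a numeric column with a normalizer_fn.
    age_mean, age_std = dftrain['age'].mean(), dftrain['age'].std()
    age_normalized = fc.numeric_column(
        'age', normalizer_fn=lambda x: (x - age_mean) / age_std)

    # Bucketization: split a numeric column into categorical buckets.
    age_buckets = fc.bucketized_column(
        fc.numeric_column('age'), boundaries=[18, 25, 30, 35, 40, 50, 65])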

    3.2, Numeric and categorical columns

    fc = tf.feature_column
    CATEGORICAL_COLUMNS = ['sex', 'n_siblings_spouses', 'parch', 'class', 'deck',
                           'embark_town', 'alone']
    NUMERIC_COLUMNS = ['age', 'fare']
     
    def one_hot_cat_column(feature_name, vocab):
      return fc.indicator_column(
          fc.categorical_column_with_vocabulary_list(feature_name,
                                                     vocab))
    feature_columns = []
    for feature_name in CATEGORICAL_COLUMNS:
      # Need to one-hot encode categorical features.
      vocabulary = dftrain[feature_name].unique()
      print(feature_name, vocabulary)     # inspect each column's vocabulary
      feature_columns.append(one_hot_cat_column(feature_name, vocabulary))
      print(feature_columns)              # inspect the accumulated feature columns
     
    for feature_name in NUMERIC_COLUMNS:
      feature_columns.append(fc.numeric_column(feature_name,
                                               dtype=tf.float32))

    3.3, View all of the feature column transformations (using the first example)

    # Take the first training example to inspect the transformations.
    example = dftrain.head(1)
    fc.input_layer(dict(example), feature_columns).numpy()
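
    To see how a single column is transformed, apply input_layer to just that feature column (a sketch, reusing the one_hot_cat_column helper and the example row defined above):

    class_fc = one_hot_cat_column('class', dftrain['class'].unique())
    print('Feature value:   ', example['class'].iloc[0])
    print('One-hot encoded: ', fc.input_layer(dict(example), [class_fc]).numpy())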

    3.4, Create the input functions

    # Use entire batch since this is such a small dataset.
    NUM_EXAMPLES = len(y_train)

    def make_input_fn(X, y, n_epochs=None, shuffle=True):
      def input_fn():
        dataset = tf.data.Dataset.from_tensor_slices((dict(X), y))
        if shuffle:
          dataset = dataset.shuffle(NUM_EXAMPLES)
        # For training, cycle through the dataset as many times as needed (n_epochs=None).
        dataset = dataset.repeat(n_epochs)  
        # In memory training doesn't use batching.
        dataset = dataset.batch(NUM_EXAMPLES)
        return dataset
      return input_fn

    # Training and evaluation input functions.
    train_input_fn = make_input_fn(dftrain, y_train)
    eval_input_fn = make_input_fn(dfeval, y_eval, shuffle=False, n_epochs=1)
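
    As a quick sanity check (a sketch), eager execution lets us iterate the dataset returned by an input function directly and inspect one batch:

    for features, labels in train_input_fn().take(1):
      print('feature keys:', list(features.keys()))
      print('batch size  :', labels.shape[0])  # the whole dataset, i.e. NUM_EXAMPLES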

    4, Train and evaluate the model

    4.1, First train a linear classifier (logistic regression model) as a binary-classification baseline

    linear_est = tf.estimator.LinearClassifier(feature_columns)

    # Train model.
    linear_est.train(train_input_fn, max_steps=100)

    # Evaluation.
    results = linear_est.evaluate(eval_input_fn)
    print('Accuracy : ', results['accuracy'])
    print('Dummy model: ', results['accuracy_baseline'])
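
    evaluate() returns more metrics than just accuracy (AUC, precision, recall, average loss, ...); for example, they can all be viewed as a small table:

    print(pd.Series(results).to_frame())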

    4.2, Train a Boosted Trees model

    # Since the data fits into memory, use the entire dataset per layer; it will be faster.
    # The input_fn above defines one batch as the entire dataset, so use one batch per layer.
    n_batches = 1
    est = tf.estimator.BoostedTreesClassifier(feature_columns,
                                              n_batches_per_layer=n_batches)

    # Alternatively, specify the number of trees explicitly:
    # est = tf.estimator.BoostedTreesClassifier(feature_columns,
    #                                           n_batches_per_layer=n_batches,
    #                                           n_trees=300)

    # The model will stop training once the specified number of trees is built, not
    # based on the number of steps.
    est.train(train_input_fn, max_steps=100)

    # Eval.
    results = est.evaluate(eval_input_fn)
    print('Accuracy : ', results['accuracy'])
    print('Dummy model: ', results['accuracy_baseline'])
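
    BoostedTreesClassifier also exposes the usual gradient-boosting hyperparameters; here is a sketch of the commonly tuned ones (the values below are illustrative, not tuned for this dataset):

    est_tuned = tf.estimator.BoostedTreesClassifier(
        feature_columns,
        n_batches_per_layer=n_batches,
        n_trees=100,         # total number of trees to build
        max_depth=6,         # depth of each individual tree
        learning_rate=0.1)   # shrinkage applied to each tree's contribution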

    4.3, In-memory training

    def make_inmemory_train_input_fn(X, y):
      def input_fn():
        return dict(X), y
      return input_fn


    train_input_fn = make_inmemory_train_input_fn(dftrain, y_train)
    eval_input_fn = make_input_fn(dfeval, y_eval, shuffle=False, n_epochs=1)
    # The in-memory helper takes an input_fn that returns the raw features and labels
    # directly (see make_inmemory_train_input_fn above) and trains on the whole
    # dataset at once, without a tf.data pipeline.
    est = tf.contrib.estimator.boosted_trees_classifier_train_in_memory(
        train_input_fn,
        feature_columns)
    print(est.evaluate(eval_input_fn)['accuracy'])

    5, Prediction

    pred_dicts = list(est.predict(eval_input_fn))
    probs = pd.Series([pred['probabilities'][1] for pred in pred_dicts])

    probs.plot(kind='hist', bins=20, title='predicted probabilities');
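
    To turn these probabilities into hard class labels (a sketch using a 0.5 threshold):

    pred_labels = (probs > 0.5).astype(int)
    print('manual accuracy:', (pred_labels.values == y_eval.values).mean())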

    6, Plot the ROC curve

    from sklearn.metrics import roc_curve
    from matplotlib import pyplot as plt

    fpr, tpr, _ = roc_curve(y_eval, probs)
    plt.plot(fpr, tpr)
    plt.title('ROC curve')
    plt.xlabel('false positive rate')
    plt.ylabel('true positive rate')
    plt.xlim(0,)
    plt.ylim(0,);
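
    The area under this curve (AUC) summarizes it in a single number, for example:

    from sklearn.metrics import roc_auc_score
    print('AUC:', roc_auc_score(y_eval, probs))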

     
  • Original article: https://www.cnblogs.com/augustone/p/10513045.html