• tensorflow在文本处理中的使用——Word2Vec预测


    代码来源于:tensorflow机器学习实战指南(曾益强 译,2017年9月)——第七章:自然语言处理

    代码地址:https://github.com/nfmcclure/tensorflow-cookbook

    数据:http://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.tar.gz

    问题:加载和使用预训练的嵌套,并使用这些单词嵌套进行情感分析,通过训练线性逻辑回归模型来预测电影的好坏

    步骤如下:

    • 必要包
    • 声明模型参数
    • 读取并转换文本数据集,划分训练集和测试集
    • 构建图
    • 训练

    step1:必要包

    import tensorflow as tf
    import matplotlib.pyplot as plt
    import numpy as np
    import random
    import os
    import pickle
    import string
    import requests
    import collections
    import io
    import tarfile
    import urllib.request
    import text_helpers
    from nltk.corpus import stopwords
    from tensorflow.python.framework import ops
    ops.reset_default_graph()
    
    os.chdir(os.path.dirname(os.path.realpath(__file__)))
    
    # Start a graph session
    sess = tf.Session()

    step2:声明模型参数

    # Declare model parameters
    embedding_size = 200
    vocabulary_size = 2000
    batch_size = 100
    max_words = 100
    
    # Declare stop words
    stops = stopwords.words('english'

    step3:读取并转换本文数据集,划分训练集和测试集

    参考:tensorflow在文本处理中的使用——辅助函数

    # Load Data
    print('Loading Data')
    data_folder_name = 'temp'
    texts, target = text_helpers.load_movie_data(data_folder_name)
    
    # Normalize text
    print('Normalizing Text Data')
    texts = text_helpers.normalize_text(texts, stops)
    
    # Texts must contain at least 3 words
    target = [target[ix] for ix, x in enumerate(texts) if len(x.split()) > 2]
    texts = [x for x in texts if len(x.split()) > 2]
    
    # Split up data set into train/test
    train_indices = np.random.choice(len(target), round(0.8*len(target)), replace=False)
    test_indices = np.array(list(set(range(len(target))) - set(train_indices)))
    texts_train = [x for ix, x in enumerate(texts) if ix in train_indices]
    texts_test = [x for ix, x in enumerate(texts) if ix in test_indices]
    target_train = np.array([x for ix, x in enumerate(target) if ix in train_indices])
    target_test = np.array([x for ix, x in enumerate(target) if ix in test_indices])
    
    # Load dictionary and embedding matrix加载CBOW嵌套中保存的单词字典
    dict_file = os.path.join(data_folder_name, 'movie_vocab.pkl')
    word_dictionary = pickle.load(open(dict_file, 'rb'))
    
    # Convert texts to lists of indices根据单词字典将加载的句子转化为数值型numpy数组
    text_data_train = np.array(text_helpers.text_to_numbers(texts_train, word_dictionary))
    text_data_test = np.array(text_helpers.text_to_numbers(texts_test, word_dictionary))
    
    # Pad/crop movie reviews to specific length电影影评长度不一,不满100维的用0凑满,超过100维的取前100维
    text_data_train = np.array([x[0:max_words] for x in [y+[0]*max_words for y in text_data_train]])
    text_data_test = np.array([x[0:max_words] for x in [y+[0]*max_words for y in text_data_test]])

    step4:构建图

    print('Creating Model')
    # Define Embeddings:创建嵌套变量,用于之后加载CBOW训练好的嵌套向量
    embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
    
    # Define model:
    # Create variables for logistic regression变量
    A = tf.Variable(tf.random_normal(shape=[embedding_size,1]))
    b = tf.Variable(tf.random_normal(shape=[1,1]))
    
    # Initialize placeholders数据占位符
    x_data = tf.placeholder(shape=[None, max_words], dtype=tf.int32)
    y_target = tf.placeholder(shape=[None, 1], dtype=tf.float32)
    
    # Lookup embeddings vectors
    embed = tf.nn.embedding_lookup(embeddings, x_data)
    # Take average of all word embeddings in documents计算句子中所有单词的平均嵌套
    embed_avg = tf.reduce_mean(embed, 1)
    
    # Declare logistic model (sigmoid in loss function)
    model_output = tf.add(tf.matmul(embed_avg, A), b)
    
    # Declare loss function (Cross Entropy loss)
    loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(model_output, y_target))
    
    # Actual Prediction
    prediction = tf.round(tf.sigmoid(model_output))
    predictions_correct = tf.cast(tf.equal(prediction, y_target), tf.float32)
    accuracy = tf.reduce_mean(predictions_correct)
    
    # Declare optimizer
    my_opt = tf.train.AdagradOptimizer(0.005)
    train_step = my_opt.minimize(loss)

    step5:训练

    # Intitialize Variables
    init = tf.initialize_all_variables()
    sess.run(init)
    
    # Load model embeddings加载CBOW训练好的嵌套矩阵
    model_checkpoint_path = os.path.join(data_folder_name,'cbow_movie_embeddings.ckpt')
    saver = tf.train.Saver({"embeddings": embeddings})
    saver.restore(sess, model_checkpoint_path)
    
    
    # Start Logistic Regression
    print('Starting Model Training')
    train_loss = []
    test_loss = []
    train_acc = []
    test_acc = []
    i_data = []
    for i in range(10000):
        rand_index = np.random.choice(text_data_train.shape[0], size=batch_size)
        rand_x = text_data_train[rand_index]
        rand_y = np.transpose([target_train[rand_index]])
        sess.run(train_step, feed_dict={x_data: rand_x, y_target: rand_y})
        
        # Only record loss and accuracy every 100 generations
        if (i+1)%100==0:
            i_data.append(i+1)
            train_loss_temp = sess.run(loss, feed_dict={x_data: rand_x, y_target: rand_y})
            train_loss.append(train_loss_temp)
            
            test_loss_temp = sess.run(loss, feed_dict={x_data: text_data_test, y_target: np.transpose([target_test])})
            test_loss.append(test_loss_temp)
            
            train_acc_temp = sess.run(accuracy, feed_dict={x_data: rand_x, y_target: rand_y})
            train_acc.append(train_acc_temp)
        
            test_acc_temp = sess.run(accuracy, feed_dict={x_data: text_data_test, y_target: np.transpose([target_test])})
            test_acc.append(test_acc_temp)
        if (i+1)%500==0:
            acc_and_loss = [i+1, train_loss_temp, test_loss_temp, train_acc_temp, test_acc_temp]
            acc_and_loss = [np.round(x,2) for x in acc_and_loss]
            print('Generation # {}. Train Loss (Test Loss): {:.2f} ({:.2f}). Train Acc (Test Acc): {:.2f} ({:.2f})'.format(*acc_and_loss))

    可视化结果展示:

    # Plot loss over time
    plt.plot(i_data, train_loss, 'k-', label='Train Loss')
    plt.plot(i_data, test_loss, 'r--', label='Test Loss', linewidth=4)
    plt.title('Cross Entropy Loss per Generation')
    plt.xlabel('Generation')
    plt.ylabel('Cross Entropy Loss')
    plt.legend(loc='upper right')
    plt.show()
    
    # Plot train and test accuracy
    plt.plot(i_data, train_acc, 'k-', label='Train Set Accuracy')
    plt.plot(i_data, test_acc, 'r--', label='Test Set Accuracy', linewidth=4)
    plt.title('Train and Test Accuracy')
    plt.xlabel('Generation')
    plt.ylabel('Accuracy')
    plt.legend(loc='lower right')
    plt.show()

  • 相关阅读:
    C++ std::stack 基本用法
    linux6 安装 ntopng
    linux 6安装 redis2.6
    Linux6搭建Tomcat服务器
    EXSI6.5忘记root密码
    python3笔记--数字
    python3笔记--运算符
    python3基本数据类型
    python3笔记
    centos6.X升级python3.X方法
  • 原文地址:https://www.cnblogs.com/helloworld0604/p/9009871.html
Copyright © 2020-2023  润新知