TensorFlow 基于 CSV 数据集实现多元线性回归并预测

#coding:utf8
import tensorflow as tf
from sklearn import linear_model
from sklearn import preprocessing
import numpy as np


def read_data(file_queue):
    '''
    Read one CSV record from the file queue and return (features, label).

    The CSV is handled with TensorFlow's own reader/decoder: a TextLineReader
    yields one line per execution and decode_csv splits it into one tensor
    per column.
    :param file_queue: queue passed to ``TextLineReader.read``
    :return: features,label -- features stacks latitude, longitude, price,
        the encoded buildingTypeName, the encoded tradeTypeName and
        expectedDealPrice; label is the daysOnMarket column
    '''

    def encode(column, names):
        # Map each known name to its position in `names` (as a float);
        # any other value falls through to the default branch, -1.0.
        branches = {
            tf.equal(column, tf.constant(name)): (lambda code=float(index): tf.constant(code))
            for index, name in enumerate(names)
        }
        return tf.case(branches, lambda: tf.constant(-1.00), exclusive=True)

    # The first line of the file is a header and is skipped.
    line_reader = tf.TextLineReader(skip_header_lines=1)
    _, line = line_reader.read(file_queue)

    # Default value per column, used when a field is empty in the data.
    column_defaults = [
        [''], [''], [''], [''],        # province, city, address, postCode
        [0.], [0.], [0.], [0.],        # longitude, latitude, price, buildingTypeId
        [''], [0], [''],               # buildingTypeName, tradeTypeId, tradeTypeName
        [0.], [''], [''], [0],         # expectedDealPrice, listingDate, delislingDate, daysOnMarket
    ]
    # Each execution parses one line into a list of 15 column tensors;
    # columns that are not used as features are discarded.
    (_, _, _, _,
     longitude, latitude, price, _,
     building_type_name, _, trade_type_name,
     expected_deal_price, _, _, days_on_market) = tf.decode_csv(line, column_defaults)

    # Encode the non-numeric buildingTypeName column.
    building_type_code = encode(building_type_name, [
        'Residential',
        'Condo',
        'Mobile Home',
        'No Building',
        'Row / Townhouse',
        'Duplex',
        'Manufactured Home',
        'Commercial',
        'Other',
    ])
    # Encode the tradeTypeName column: Sale or Lease.
    trade_type_code = encode(trade_type_name, ['Sale', 'Lease'])

    feature_columns = [
        latitude,
        longitude,
        price,
        building_type_code,
        trade_type_code,
        expected_deal_price,
    ]
    return tf.stack(feature_columns), days_on_market


def create_pipeline(filename,batch_size,num_epochs=None):
    '''
    Build the input pipeline that yields one batch of examples and labels.

    Batching uses tf.train.batch, i.e. records are taken in order rather
    than at random (tf.train.shuffle_batch would be the random variant).
    :param filename: path of the CSV file to read
    :param batch_size: number of samples in each returned batch
    :param num_epochs: forwarded to tf.train.string_input_producer
    :return:example_batch,label_batch
    '''
    filename_queue = tf.train.string_input_producer([filename], num_epochs=num_epochs)
    # One (features, label) pair per dequeue.
    features, label = read_data(filename_queue)
    # Queue length: a fixed 1000 slots on top of one batch.
    queue_capacity = 1000 + batch_size
    # Group consecutive records into batches, preserving file order.
    features_batch, label_batch = tf.train.batch(
        [features, label],
        batch_size=batch_size,
        capacity=queue_capacity,
    )
    return features_batch, label_batch


def train(batch_size, feature_num,learn_rate,filename):
    '''
    Fit a linear model with mini-batch gradient descent and return w and b.

    Runs 100 steps. Each step pulls one batch from the CSV pipeline,
    standardizes the features of that batch with a StandardScaler fitted on
    the batch itself, and applies one gradient-descent update.
    :param batch_size: number of samples per batch
    :param feature_num: number of features per sample
    :param learn_rate: learning rate of the gradient-descent optimizer
    :param filename: name of the CSV file
    :return: w,b -- the learned weights and the bias, both flattened to
        1-D numpy arrays
    '''

    # Placeholders that are fed with the batches fetched from the pipeline.
    x_data = tf.placeholder(tf.float32, [batch_size, feature_num])
    y_data = tf.placeholder(tf.float32, [batch_size])
    # Model parameters w and b.
    w = tf.Variable(tf.random_uniform((feature_num, 1), -1.0, 1.0))
    b = tf.Variable(tf.random_uniform((1, 1), -1.0, 1.0))
    # Predicted y, shape [batch_size, 1].
    y = tf.add(tf.matmul(x_data, w), b)
    # The labels are fed as [batch_size]; subtracting them from the
    # [batch_size, 1] prediction directly would broadcast to
    # [batch_size, batch_size] and compare every prediction with every
    # label. Reshape the labels to a column so the residual is per sample.
    residual = y - tf.reshape(y_data, [-1, 1])
    # Loss: half of the mean squared error.
    loss = tf.reduce_mean(tf.square(residual)) / 2
    # Plain gradient descent.
    optimizer = tf.train.GradientDescentOptimizer(learn_rate)
    # Named train_op so it does not shadow this function's own name.
    train_op = optimizer.minimize(loss)
    # Tensors that yield one batch of samples and labels per run.
    example_batch, daysOnMarket_batch = create_pipeline(filename, batch_size)
    # Initialize both global and local variables.
    init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
    print('.........................>>>>开始会话')
    # The context manager closes the session automatically.
    with tf.Session() as sess:
        sess.run(init_op)
        # Start the threads that fill the input queues.
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord)
        try:
            for step in range(100):
                # Fetch the actual sample and label values.
                example, label = sess.run([example_batch, daysOnMarket_batch])
                print('第%d批数据'%(step))
                print(example, label)
                print('.......这一批数据的直接参数')
                # Closed-form fit on the raw batch, printed for comparison only.
                reg = linear_model.LinearRegression()
                reg.fit(example, label)
                print("Coefficients of sklearn: W=%s, b=%f" % (reg.coef_, reg.intercept_))
                # Standardize the features of this batch.
                # NOTE: the scaler is refitted on every batch, so the learned
                # w and b apply to standardized inputs, not to raw features.
                scaler = preprocessing.StandardScaler().fit(example)
                print(scaler.mean_, scaler.scale_)
                x_data_standard = scaler.transform(example)

                sess.run(train_op, feed_dict={x_data: x_data_standard, y_data: label})
                # Report w and b every ten steps; variables need no feed.
                if step % 10 == 0:
                    print('当前w值和b值')
                    current_w, current_b = sess.run([w, b])
                    print(current_w, current_b)
            print('。。。。。。。》》》训练后得到w和b')
            theta = sess.run(w).flatten()
            intercept = sess.run(b).flatten()
            print('W:%s' % theta)
            print('b:%f' % intercept)
        finally:
            # Always stop and join the queue threads, even if a step failed.
            coord.request_stop()
            coord.join(threads)
    return theta, intercept

def predict(data, theta,intercept, feature_num):
    '''
    Predict the label (daysOnMarket) for a single sample.

    Computes data x theta + intercept in a TensorFlow session, prints the
    result and returns it.
    :param data: the sample to predict, fed as a [1, feature_num] array
    :param theta: trained weights, fed as a [feature_num, 1] array
    :param intercept: trained bias, fed as a [1, 1] array
    :param feature_num: number of features (independent variables)
    :return: result -- the value of the prediction tensor
    '''
    weights_in = tf.placeholder(tf.float32, [feature_num, 1])
    bias_in = tf.placeholder(tf.float32, [1, 1])
    sample_in = tf.placeholder(tf.float32, [1, feature_num])

    prediction = tf.add(tf.matmul(sample_in, weights_in), bias_in)

    init = tf.global_variables_initializer()
    feeds = {sample_in: data, weights_in: theta, bias_in: intercept}
    with tf.Session() as sess:
        sess.run(init)
        result = sess.run(prediction, feed_dict=feeds)
    print(result)
    return result


def data_type_conversion(data,theta,intercept,feature_num):
    '''
    Convert the inputs to float32 and reshape them for matrix multiplication.

    The conversion and the reshape are chained so that the returned arrays
    are the converted ones; converting into a throwaway variable and then
    reshaping the original would leave the dtype unchanged.
    :param data: one sample with feature_num values (array-like)
    :param theta: feature_num weights (array-like)
    :param intercept: a single bias value (array-like)
    :param feature_num: number of features
    :return: data as a (1, feature_num) array, theta as a (feature_num, 1)
        array and intercept as a (1, 1) array, all float32
    '''
    data_real = np.asarray(data).astype(np.float32).reshape(1, feature_num)
    theta_real = np.asarray(theta).astype(np.float32).reshape(feature_num, 1)
    intercept_real = np.asarray(intercept).astype(np.float32).reshape(1, 1)
    return data_real, theta_real, intercept_real




if __name__ == '__main__':
    # Coordinates and prices may contain decimals, so parse them as floats.
    input_longitude = float(input('请输入经度'))
    input_latitude = float(input('请输入纬度'))
    input_price = float(input('请输入价格'))
    # The categorical inputs are entered as numeric codes. The codes listed
    # in the prompt match the encoding applied to the training data
    # (Residential 0 ... Other 8, Sale 0, Lease 1).
    input_buildingtype = float(input('请输入房源类型编码：只有9种类型：Residential:0 ，Condo:1 ，Mobile Home:2 ，No Building:3 ，Row / Townhouse:4 ，Duplex:5 ，Manufactured Home:6 ，Commercial:7 ，Other:8'))
    input_tradetype = float(input('请输入交易形式：只有两种Sale:0，Lease:1'))
    input_expected_deal_price = float(input('请输入期望的交易价格'))

    # Same feature order as the training features: latitude, longitude,
    # price, building type, trade type, expected deal price.
    data = np.array([
        input_latitude,
        input_longitude,
        input_price,
        input_buildingtype,
        input_tradetype,
        input_expected_deal_price,
    ])
    theta, intercept = train(10, 6, 0.3, 'house_info.csv')
    data_real, theta_real, intercept_real = data_type_conversion(data, theta, intercept,6)
    daysOnmarket = predict(data_real, theta_real, intercept_real, 6)
    # The prediction is a single-element array; extract the scalar explicitly.
    print('预测的天数:%d'%int(np.asarray(daysOnmarket).item()))
相关阅读:
Swift3 ——S3 API中间件兼容性测试
 解决 Python.h：没有那个文件或目录错误的方法
 Swift云存储特性研究
 解决updateaptxapi占用资源过高的问题
 dll开发及调用
 git批量备份
 UDP端口扫描
 将markdown文件转换为pdf
指定ssh key访问git
CentOS6.2调整home分区大小
原文地址：https://www.cnblogs.com/bluesl/p/9215749.html