#coding:utf8
import tensorflow as tf
from sklearn import linear_model
from sklearn import preprocessing
import numpy as np

# Category names listed in code order: the position of a name is its numeric
# code. Shared by the CSV encoder and the interactive prompt so the two
# cannot drift apart.
BUILDING_TYPE_NAMES = (
    'Residential',
    'Condo',
    'Mobile Home',
    'No Building',
    'Row / Townhouse',
    'Duplex',
    'Manufactured Home',
    'Commercial',
    'Other',
)
TRADE_TYPE_NAMES = ('Sale', 'Lease')


def _encode_category(value, names):
    """Map a scalar string tensor to the float index of its name in *names*.

    :param value: scalar string tensor holding the category name
    :param names: sequence of known names; the index is the emitted code
    :return: scalar float tensor, -1.0 when the name is not in *names*
    """
    branches = [
        # Bind the code as a default argument; a plain closure would make
        # every branch return the last code (late binding).
        (tf.equal(value, tf.constant(name)),
         lambda code=float(code): tf.constant(code))
        for code, name in enumerate(names)
    ]
    return tf.case(branches, default=lambda: tf.constant(-1.00),
                   exclusive=True)


def _category_code(text, names):
    """Convert user input, given as a category name or a numeric code.

    :param text: raw text typed by the user
    :param names: sequence of known names; the index is the code
    :return: the code as a float
    :raises ValueError: if *text* is neither a known name nor a number
    """
    cleaned = text.strip()
    if cleaned in names:
        return float(names.index(cleaned))
    return float(cleaned)


def read_data(file_queue):
    '''
    Build the ops that read one CSV record and turn it into features/label.

    The CSV file is parsed with TensorFlow's own reader and decoder.
    :param file_queue: filename queue to read from
    :return: features (1-D float tensor of 6 values, ordered latitude,
             longitude, price, building type code, trade type code,
             expected deal price), label (daysOnMarket)
    '''
    # The first line of the file is a header and must be skipped.
    reader = tf.TextLineReader(skip_header_lines=1)
    key, value = reader.read(file_queue)
    # Default value (and therefore dtype) for every column when it is empty.
    record_defaults = [[''], [''], [''], [''], [0.], [0.], [0.], [0.], [''],
                       [0], [''], [0.], [''], [''], [0]]
    # Each execution reads one line; decode_csv parses it into one tensor
    # per column.
    (province, city, address, postCode, longitude, latitude, price,
     buildingTypeId, buildingTypeName, tradeTypeId, tradeTypeName,
     expectedDealPrice, listingDate, delislingDate,
     daysOnMarket) = tf.decode_csv(value, record_defaults)
    # Encode the non-numeric columns as float codes (-1.0 for unknown names).
    preprocess_buildingTypeName_op = _encode_category(
        buildingTypeName, BUILDING_TYPE_NAMES)
    preprocess_tradeTypeName_op = _encode_category(
        tradeTypeName, TRADE_TYPE_NAMES)
    features = tf.stack([latitude, longitude, price,
                         preprocess_buildingTypeName_op,
                         preprocess_tradeTypeName_op, expectedDealPrice])
    return features, daysOnMarket


def create_pipeline(filename, batch_size, num_epochs=None):
    '''
    Build the ops that yield one batch of examples and labels.

    Uses tf.train.batch, i.e. records are taken in order; use
    tf.train.shuffle_batch for random sampling instead.
    :param filename: path of the CSV file
    :param batch_size: number of samples per batch
    :param num_epochs: passed to string_input_producer
    :return: example_batch, label_batch
    '''
    file_queue = tf.train.string_input_producer([filename],
                                                num_epochs=num_epochs)
    example, dayOnMarket = read_data(file_queue)
    # Queue capacity: a fixed margin of 1000 records plus one batch.
    min_after_dequeue = 1000
    capacity = min_after_dequeue + batch_size
    # Fetch each batch in file order.
    example_batch, daysOnMarket_batch = tf.train.batch(
        [example, dayOnMarket], batch_size=batch_size, capacity=capacity)
    return example_batch, daysOnMarket_batch


def train(batch_size, feature_num, learn_rate, filename):
    '''
    Train a linear model with gradient descent and return w and b.

    NOTE(review): every batch is standardized with its own StandardScaler
    before the update step, so the returned parameters live in standardized
    feature space, while predict() is fed raw features. Confirm which scale
    is intended before relying on the predictions.
    :param batch_size: number of samples per batch
    :param feature_num: number of features
    :param learn_rate: learning rate
    :param filename: name of the CSV file
    :return: w (flattened, feature_num values), b (flattened, 1 value)
    '''
    # Placeholders that are fed with the batches fetched from the pipeline.
    x_data = tf.placeholder(tf.float32, [batch_size, feature_num])
    y_data = tf.placeholder(tf.float32, [batch_size])
    # Model parameters w and b.
    w = tf.Variable(tf.random_uniform((feature_num, 1), -1.0, 1.0))
    b = tf.Variable(tf.random_uniform((1, 1), -1.0, 1.0))
    # Predicted y, shape [batch_size, 1].
    y = tf.add(tf.matmul(x_data, w), b)
    # Reshape the labels to a column: subtracting a [batch_size] vector from
    # a [batch_size, 1] matrix would broadcast to [batch_size, batch_size]
    # and compare every prediction with every label.
    loss = tf.reduce_mean(tf.square(y - tf.reshape(y_data, [-1, 1]))) / 2
    # Plain gradient descent.
    optimizer = tf.train.GradientDescentOptimizer(learn_rate)
    train_op = optimizer.minimize(loss)
    # Ops producing the samples and labels.
    example_batch, daysOnMarket_batch = create_pipeline(filename, batch_size)
    # Initialise global and local variables.
    init_op = tf.group(tf.global_variables_initializer(),
                       tf.local_variables_initializer())
    print('.........................>>>>开始会话')
    # The context manager closes the session automatically.
    with tf.Session() as sess:
        sess.run(init_op)
        # Start the threads that fill the input queues.
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord)
        try:
            for step in range(100):
                # Fetch the actual samples and labels.
                example, label = sess.run([example_batch, daysOnMarket_batch])
                print('第%d批数据' % (step))
                print(example, label)
                print('.......这一批数据的直接参数')
                # Closed-form fit on this batch, printed for comparison only.
                reg = linear_model.LinearRegression()
                reg.fit(example, label)
                print("Coefficients of sklearn: W=%s, b=%f"
                      % (reg.coef_, reg.intercept_))
                # Standardize the batch before the gradient step.
                scaler = preprocessing.StandardScaler().fit(example)
                print(scaler.mean_, scaler.scale_)
                x_data_standard = scaler.transform(example)
                sess.run(train_op,
                         feed_dict={x_data: x_data_standard, y_data: label})
                # Report w and b every ten steps.
                if step % 10 == 0:
                    print('当前w值和b值')
                    current_w, current_b = sess.run([w, b])
                    print(current_w, current_b)
            print('。。。。。。。》》》训练后得到w和b')
            theta = sess.run(w).flatten()
            intercept = sess.run(b).flatten()
            print('W:%s' % theta)
            # Index explicitly: formatting a 1-element array with %f relies
            # on deprecated implicit array-to-scalar conversion.
            print('b:%f' % intercept[0])
        finally:
            # Always stop the reader threads, even if a step failed.
            coord.request_stop()
            coord.join(threads)
    return theta, intercept


def predict(data, theta, intercept, feature_num):
    '''
    Predict the label (daysOnMarket) for one sample.

    :param data: sample to predict, shape (1, feature_num)
    :param theta: trained weights, shape (feature_num, 1)
    :param intercept: trained intercept, shape (1, 1)
    :param feature_num: number of features
    :return: result, the prediction with shape (1, 1)
    '''
    theta1 = tf.placeholder(tf.float32, [feature_num, 1])
    intercept1 = tf.placeholder(tf.float32, [1, 1])
    x_data = tf.placeholder(tf.float32, [1, feature_num])
    y = tf.add(tf.matmul(x_data, theta1), intercept1)
    # Only placeholders are evaluated, so no variable initialisation is run.
    with tf.Session() as sess:
        result = sess.run(y, feed_dict={x_data: data, theta1: theta,
                                        intercept1: intercept})
    print(result)
    return result


def data_type_conversion(data, theta, intercept, feature_num):
    '''
    Cast the inputs to float32 and reshape them for predict().

    :param data: feature values of one sample (feature_num values)
    :param theta: trained weights (feature_num values)
    :param intercept: trained intercept (1 value)
    :param feature_num: number of features
    :return: data as (1, feature_num), theta as (feature_num, 1),
             intercept as (1, 1), all float32
    '''
    # Cast and reshape in one chain so the float32 cast is actually applied
    # to the arrays that are returned.
    data_real = np.asarray(data, dtype=np.float32).reshape(1, feature_num)
    theta_real = np.asarray(theta, dtype=np.float32).reshape(feature_num, 1)
    intercept_real = np.asarray(intercept, dtype=np.float32).reshape(1, 1)
    return data_real, theta_real, intercept_real


if __name__ == '__main__':
    # Coordinates and prices may be fractional, so parse them as floats.
    input_longitude = float(input('请输入经度'))
    input_latitude = float(input('请输入纬度'))
    input_price = float(input('请输入价格'))
    # Accepts the type name or its code; the codes match the CSV encoding.
    input_buildingtype = _category_code(
        input('请输入房源类型名称:只有9种类型:Residential:0 ,Condo:1 ,Mobile Home:2 ,No Building:3 ,Row / Townhouse:4 ,Duplex:5 ,Manufactured Home:6 ,Commercial:7 ,Other:8'),
        BUILDING_TYPE_NAMES)
    input_tradetype = _category_code(
        input('请输入交易形式:只有两种Sale:0,Lease:1'), TRADE_TYPE_NAMES)
    input_expected_deal_price = float(input('请输入期望的交易价格'))
    # Same feature order as read_data(): latitude comes before longitude.
    data = np.array([input_latitude, input_longitude, input_price,
                     input_buildingtype, input_tradetype,
                     input_expected_deal_price])
    theta, intercept = train(10, 6, 0.3, 'house_info.csv')
    data_real, theta_real, intercept_real = data_type_conversion(
        data, theta, intercept, 6)
    # NOTE(review): the model was trained on per-batch standardized
    # features, but data_real is raw; see the note on train().
    daysOnmarket = predict(data_real, theta_real, intercept_real, 6)
    print('预测的天数:%d' % int(daysOnmarket[0][0]))