看了网上的一些用tf实现的FM,很多都没有考虑FM实际使用中数据样本稀疏的问题。
我在实现的时候使用 embedding_lookup_sparse来解决这个问题。
对于二阶部分,由于 embedding_lookup_sparse 无法直接计算"和的平方"与"平方的和"这两项,我参考了 embedding_lookup_sparse 中 sum 和 mean 两种 combiner 的实现,自己实现了这两项的计算。不过数据输入部分还有待改进,改用 tf.data.Dataset 会更好。
代码如下:
import random

import numpy as np
import tensorflow as tf
from sklearn import metrics
from tensorflow.python.framework import dtypes
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import math_ops


class Args():
    """Static hyper-parameters and paths read by FMmodel.__init__."""

    feature_size = 925
    field_size = 15
    embedding_size = 20
    epoch = 3
    batch_size = 2000
    learning_rate = 0.001
    l2_reg_rate = 0.001
    checkpoint_dir = "./model"
    is_training = True


class FMmodel():
    """Factorization Machine over sparse inputs (TensorFlow 1.x graph mode).

    The first-order and second-order terms are both computed from sparse
    (id, value) pairs, so no dense [batch, feature_size] matrix is ever
    materialised.
    """

    def __init__(self):
        self.feature_sizes = Args.feature_size
        self.field_size = Args.field_size
        self.embedding_size = Args.embedding_size
        self.l2_reg_rate = Args.l2_reg_rate
        self.epoch = Args.epoch
        self.learning_rate = Args.learning_rate
        self.weight = {}
        self.model_path = Args.checkpoint_dir
        self.batch_size = Args.batch_size

    def build_model(self, is_warm_up=False):
        """Build the graph, open a session and initialise or restore weights.

        Args:
            is_warm_up: when True the variables are restored from
                ``self.model_path``; otherwise they are freshly initialised.
        """
        self.x1_index = tf.sparse_placeholder(tf.int64, name="x1_index")
        self.x1_value = tf.sparse_placeholder(tf.float32, name="x1_value")
        self.labels = tf.placeholder(tf.float32, name="labels", shape=[None, 1])

        init_randomW = tf.random_normal_initializer(
            mean=0.0, stddev=0.05, seed=None, dtype=tf.float32)
        init_randomV = tf.random_normal_initializer(
            mean=0.0, stddev=0.00001, seed=None, dtype=tf.float32)

        # Latent factor vectors, one row per feature id.
        self.weight["feature_weight"] = tf.get_variable(
            shape=[self.feature_sizes, self.embedding_size],
            name='feature_weight',
            initializer=init_randomV)
        # First-order (linear) coefficients.
        self.weight["feature_first"] = tf.get_variable(
            shape=[self.feature_sizes, 1],
            initializer=init_randomW,
            name='feature_first')
        self.weight["bais"] = tf.get_variable(
            shape=[1, 1],
            initializer=tf.constant_initializer(0.0),
            name="bais")

        # Linear part: per-sample sum of x_i * w_i, one column.
        self.line_part1 = tf.nn.embedding_lookup_sparse(
            self.weight["feature_first"],
            sp_ids=self.x1_index,
            sp_weights=self.x1_value,
            combiner='sum')
        self.line_part1_shape = tf.shape(self.line_part1)

        # "Square of the sum": (sum_i v_i * x_i)^2, element-wise per factor.
        # The earlier graph stored the un-squared sum here, which is not the
        # FM pairwise term; checkpoints trained with that graph will produce
        # different scores once restored into this one.
        embedding_weighted_sum = tf.nn.embedding_lookup_sparse(
            self.weight["feature_weight"],
            sp_ids=self.x1_index,
            sp_weights=self.x1_value,
            combiner='sum')
        self.embedding_part1_sum_square = tf.square(embedding_weighted_sum)

        # "Sum of the squares": sum_i (v_i * x_i)^2. embedding_lookup_sparse
        # has no combiner for this, so it is rebuilt from unique/gather/
        # segment_sum.
        ids_1 = self.x1_index.values
        self.ids1, self.idx1 = tf.unique(ids_1)
        self.weight_1 = self.x1_value.values
        self.weight_1 = tf.reshape(self.weight_1, [-1, 1])
        if self.weight_1.dtype != dtypes.float32:
            self.weight_1 = math_ops.cast(self.weight_1, dtypes.float32)
        # One embedding row per distinct id, then expanded back to one row
        # per (sample, feature) entry.
        self.embedding_1 = tf.nn.embedding_lookup(
            self.weight["feature_weight"], ids=self.ids1)
        self.new_embedding_1 = tf.gather(self.embedding_1, self.idx1)
        self.embedding_weight_part1 = tf.multiply(
            self.weight_1, self.new_embedding_1)
        self.embedding_weight_part1_square = tf.square(
            self.embedding_weight_part1)
        # The first column of the sparse indices is the sample (row) number.
        self.segment_ids_1 = self.x1_index.indices[:, 0]
        if self.segment_ids_1.dtype != dtypes.int32:
            self.segment_ids_1 = math_ops.cast(self.segment_ids_1, dtypes.int32)
        self.embeddings_square_sum1 = tf.math.segment_sum(
            self.embedding_weight_part1_square, self.segment_ids_1)
        self.ess1_shape = tf.shape(self.embeddings_square_sum1)

        # Second-order term: 0.5 * sum_f (square_of_sum - sum_of_squares).
        self.y1_v = 0.5 * tf.reduce_sum(
            tf.subtract(self.embedding_part1_sum_square,
                        self.embeddings_square_sum1),
            1)
        self.y1_v = tf.reshape(self.y1_v, [-1, 1])

        self.y1 = tf.add(tf.add(self.line_part1, self.y1_v),
                         self.weight["bais"])
        self.o1 = tf.sigmoid(self.y1)

        self.loss = tf.losses.log_loss(labels=self.labels, predictions=self.o1)
        self.error = tf.reduce_mean(self.loss)
        # Pass the configured rate explicitly so Args.learning_rate is honoured.
        self.opt = tf.train.AdamOptimizer(
            learning_rate=self.learning_rate).minimize(self.error)

        self.session = tf.Session()
        self.init = tf.group(tf.global_variables_initializer())
        if is_warm_up:
            self.saver = tf.train.Saver(tf.global_variables())
            self.saver.restore(self.session, self.model_path)
        else:
            self.session.run(self.init)

    def predict(self, file_name):
        """Score every sample in ``file_name`` and write ./data/result.txt.

        Each record is written as ``<label>,<probability>`` followed by a
        single space.
        """
        result_list = []
        for x1_index, x1_value, true_labels in self.load_data(
                file_name, is_train=False):
            (probabilities,) = self.session.run([self.o1], feed_dict={
                self.x1_value: x1_value,
                self.x1_index: x1_index
            })
            for label_row, prob_row in zip(true_labels, probabilities):
                result_list.append((label_row[0], prob_row))
        print(len(result_list))
        # NOTE(review): records are separated by a single space, exactly as in
        # the code this was restored from; confirm whether "\n" was intended.
        with open("./data/result.txt", 'w') as file1:
            file1.write("".join(
                str(tp[0]) + "," + str(tp[1][0]) + " " for tp in result_list))

    def save(self, sess, path):
        """Save all variables of ``sess`` to ``path``."""
        saver = tf.train.Saver()
        saver.save(sess, save_path=path)

    def restore(self, sess, path):
        """Restore all variables of ``sess`` from ``path``."""
        saver = tf.train.Saver()
        saver.restore(sess, save_path=path)

    def train(self, train_data_file):
        """Run the training loop, print per-batch AUC, save, close the session."""
        for x1_index, x1_value, true_labels in self.load_data(train_data_file):
            if len(true_labels) < 2:
                continue
            my_o1, myerror, _ = self.session.run(
                [self.o1, self.error, self.opt],
                feed_dict={
                    self.x1_index: x1_index,
                    self.x1_value: x1_value,
                    self.labels: true_labels
                })
            # roc_auc_score raises ValueError when the batch holds a single
            # class (e.g. all positives), so only report AUC when it is defined.
            if true_labels.min() != true_labels.max():
                print(metrics.roc_auc_score(true_labels, my_o1))
        self.save(self.session, self.model_path)
        self.session.close()

    @staticmethod
    def _parse_line(line):
        """Parse ``label uid mid k:v k:v ... [# comment]``.

        Returns:
            (qid, features, label) where qid is the uid token, features maps
            the key strings to float values and label is a float.

        Raises:
            ValueError: if the line has fewer than the three leading tokens.
        """
        tokens = line.split("#")[0].split()
        # tokens[2] (mid) is consumed positionally, so three tokens are needed.
        if len(tokens) < 3:
            raise ValueError("Ill-formatted line: {}".format(line))
        label = float(tokens[0])
        qid = tokens[1]
        kv_pairs = [kv.split(":") for kv in tokens[3:]]
        features = {k: float(v) for (k, v) in kv_pairs}
        return qid, features, label

    @staticmethod
    def _encode_line(sample):
        """Serialise a (qid, features, label) tuple as ``label qid:<qid> k:v ...``."""
        qid, features, label = sample[0], sample[1], sample[2]
        features_arr = [str(key) + ":" + str(value)
                        for key, value in features.items()]
        return str(label) + " " + "qid:" + str(qid) + " " + " ".join(features_arr)

    @staticmethod
    def _gen_train_data(file_name):
        """Rewrite ``file_name`` with each run of equal qids sorted by label (desc).

        Currently not called by load_data. Returns the new file's name.
        """
        new_file_name = file_name + "_train_data"

        def write_group(filew, group):
            # NOTE(review): records are separated by a single space, exactly
            # as in the code this was restored from; confirm whether "\n" was
            # intended.
            for sample in sorted(group, key=lambda x: x[2], reverse=True):
                filew.write(FMmodel._encode_line(sample) + " ")

        with open(file_name, 'r') as filer, open(new_file_name, 'w') as filew:
            group = []
            now_qid = None
            for l in filer:
                qid, features, label = FMmodel._parse_line(l)
                if now_qid is not None and now_qid != qid:
                    write_group(filew, group)
                    group = []
                now_qid = qid
                group.append((qid, features, label))
            # Flush the final group, which previously was never written.
            if group:
                write_group(filew, group)
        return new_file_name

    def _gen_sparse_tensor(self, sample_list):
        """Turn a list of (features, label) pairs into one feedable batch.

        Returns:
            (x1_index, x1_value, labels): two SparseTensorValue objects that
            share the same indices (ids and weights respectively) and a
            float32 array of labels with one column.
        """
        indices = []
        id_values = []
        weight_values = []
        label_list = []
        for sample_index, sample in enumerate(sample_list):
            x1_feature = sample[0]
            label_list.append([float(sample[1])])
            for position, (key, value) in enumerate(x1_feature.items()):
                indices.append([sample_index, position])
                id_values.append(int(key))
                weight_values.append(float(value))
        dense_shape = [len(sample_list), self.feature_sizes]
        x1_index = tf.SparseTensorValue(
            indices=indices, values=id_values, dense_shape=dense_shape)
        x1_value = tf.SparseTensorValue(
            indices=indices, values=weight_values, dense_shape=dense_shape)
        return x1_index, x1_value, np.asarray(label_list, dtype=np.float32)

    def _iter_batches(self, sample_list, skip_all_negative):
        """Yield batches of at most ``self.batch_size`` samples from sample_list."""
        for start in range(0, len(sample_list), self.batch_size):
            sub_list = sample_list[start:start + self.batch_size]
            x1_index, x1_value, labels = self._gen_sparse_tensor(sub_list)
            # Training skips batches without any positive label.
            if skip_all_negative and labels.sum() < 1:
                continue
            yield (x1_index, x1_value, labels)

    def load_data(self, file_name, epoch=3, is_train=True):
        """Generate ``(x1_index, x1_value, labels)`` batches.

        Args:
            file_name: file to read when ``is_train`` is False.
            epoch: number of passes over the training file.
            is_train: when True, samples are read from the fixed path
                ./data/new_final_train_data.txt, shuffled in windows of
                ``10 * batch_size`` and all-negative batches are skipped;
                when False, ``file_name`` is read in order.

        Samples left over at the end of the input are emitted as a final,
        possibly smaller, batch instead of being dropped.
        """
        if is_train:
            # NOTE(review): the training path is fixed and ``file_name`` is
            # not used in this branch; confirm whether that is intended.
            new_file_name = "./data/new_final_train_data.txt"
            print("process data")
            window = self.batch_size * 10
            sample_list = []
            for _ in range(epoch):
                with open(new_file_name, 'r') as filer:
                    for l in filer:
                        qid, features, label = self._parse_line(l)
                        if len(sample_list) >= window:
                            random.shuffle(sample_list)
                            for batch in self._iter_batches(sample_list, True):
                                yield batch
                            sample_list = []
                        sample_list.append((features, label))
            # Flush whatever is still buffered after the last epoch.
            if sample_list:
                random.shuffle(sample_list)
                for batch in self._iter_batches(sample_list, True):
                    yield batch
        else:
            sample_list = []
            with open(file_name, 'r') as filer:
                for l in filer:
                    qid, features, label = self._parse_line(l)
                    if len(sample_list) >= self.batch_size:
                        for batch in self._iter_batches(sample_list, False):
                            yield batch
                        sample_list = []
                    sample_list.append((features, label))
            # Flush the final partial batch so every test sample is scored.
            if sample_list:
                for batch in self._iter_batches(sample_list, False):
                    yield batch


if __name__ == "__main__":
    fm = FMmodel()
    # Restores ./model and scores the test file. To train from scratch, call
    # fm.build_model() and then fm.train("./data/new_final_train_data.txt").
    fm.build_model(is_warm_up=True)
    fm.predict("./data/test.data")