import io  # explicit-encoding open for the u.item-style file
import os

from surprise import KNNBaseline
from surprise import Dataset
from surprise import get_dataset_dir
from surprise import Reader
from surprise import dump


def read_item_names(item_file_path, split_flag=' '):
    """Read a MovieLens-100k style item file and build id/name mappings.

    Each line is expected to look like ``<raw_id><split_flag><name>...``.

    Args:
        item_file_path: path to the item file.
        split_flag: field separator used in the file (default a single space).

    Returns:
        (rid_to_name, name_to_rid): two dicts converting raw item ids to
        item names and item names back to raw ids.
    """
    rid_to_name = {}
    name_to_rid = {}
    with io.open(item_file_path, 'r', encoding='utf8') as f:
        for raw_line in f:
            fields = raw_line.split(split_flag)
            name = fields[1].strip()
            rid_to_name[fields[0]] = name
            name_to_rid[name] = fields[0]
    return rid_to_name, name_to_rid


# Where the trained model is persisted / reloaded from.
save_path = os.path.expanduser(r'~/dump_file')


def train_data(user_item_score_path, split_flag=' ', user_based=False):
    """Train a KNNBaseline model on a ratings file and dump it to save_path.

    Args:
        user_item_score_path: path to a 'user item rating timestamp' file.
        split_flag: field separator of the ratings file.
        user_based: True for user-user similarities, False for item-item.
    """
    file_path = os.path.expanduser(user_item_score_path)
    reader = Reader(line_format='user item rating timestamp', sep=split_flag)
    data = Dataset.load_from_file(file_path, reader=reader)

    # Train on the full dataset so every item gets an inner id.
    trainset = data.build_full_trainset()
    sim_options = {'name': 'pearson_baseline', 'user_based': user_based}
    algo = KNNBaseline(sim_options=sim_options)
    algo.fit(trainset)

    # Persist the fitted model for later neighbor queries.
    dump.dump(save_path, algo=algo)


def get_neighbors(item_name, item_file_path, kk=10):
    """Return a generator over the names of the kk nearest neighbor items.

    Args:
        item_name: display name of the query item (whitespace is stripped).
        item_file_path: item file used to map names <-> raw ids.
        kk: number of neighbors to retrieve.

    Raises:
        KeyError: if item_name is not present in the item file.
    """
    _, algo = dump.load(save_path)  # reload the model trained by train_data

    rid_to_name, name_to_rid = read_item_names(item_file_path)

    # name -> raw id -> inner id used by the trained model.
    raw_id = name_to_rid[item_name.strip()]
    inner_id = algo.trainset.to_inner_iid(raw_id)

    neighbor_inner_ids = algo.get_neighbors(inner_id, k=kk)

    # inner ids -> raw ids -> human-readable names (lazy generators).
    neighbor_raw_ids = (algo.trainset.to_raw_iid(iid)
                        for iid in neighbor_inner_ids)
    return (rid_to_name[rid] for rid in neighbor_raw_ids)


if __name__ == '__main__':
    # NOTE(review): these Windows paths appear mangled (backslashes lost in
    # the original file) — confirm the real locations before running.
    u_i_path = r'C:UsersFELIXDesktopsurprise库源码分析uuu.txt'
    train_data(u_i_path)
    i_path = r'C:UsersFELIXDesktopsurprise库源码分析uitems.txt'
    nei_items = get_neighbors('uitems685', i_path, kk=10)
    for nei in nei_items:
        print(nei)
如果没有数据的话，可以用下面的脚本随机生成测试数据（If no dataset is available, the following script generates random test data）：
# Generate synthetic test data: 1000 users each rate a random subset of
# 5000 items, scores are integers in [0, max_score).
import random


def generate_ratings(path='uu.txt', n_users=1000, n_items=5000,
                     max_score=10, rounds=4):
    """Append random '(user item score )' records to *path*.

    Each user gets a random number (0-99) of ratings per round.  The file
    is opened once (the original reopened it per record, which is very
    slow) and each record ends with a newline so that line-based readers
    work.

    Args:
        path: output ratings file (opened in append mode, like the original).
        n_users: number of distinct user ids.
        n_items: number of distinct item ids.
        max_score: exclusive upper bound for the integer score.
        rounds: how many passes over the user population to make.
    """
    with open(path, 'a', encoding='utf8') as f:
        for _ in range(rounds):
            for user in range(n_users):
                n_ratings = int(random.random() * 100)
                for _ in range(n_ratings):
                    item = int(random.random() * n_items)
                    score = int(random.random() * max_score)
                    # Trailing space leaves an empty 4th field — presumably
                    # a dummy timestamp for surprise's Reader; TODO confirm
                    # against the Reader's sep/line_format.
                    f.write('{} {} {} \n'.format(user, item, score))


def shuffle_ratings(src='uu.txt', dst='uuu.txt'):
    """Shuffle the lines of *src* and append them to *dst*.

    random.shuffle works in place and returns None (the original bound its
    None result to an unused variable).
    """
    with open(src, 'r', encoding='utf8') as f:
        records = f.readlines()
    random.shuffle(records)
    with open(dst, 'a', encoding='utf8') as f:
        f.writelines(records)


def generate_items(path='uitems.txt', n_items=5000):
    """Write '<id> uitems<id>' lines, one per item, to *path*."""
    with open(path, 'w', encoding='utf8') as f:
        for i in range(n_items):
            f.write('{} uitems{}\n'.format(i, i))


if __name__ == '__main__':
    # Original script defaults: 1000 users, 5000 items, 4 rounds.
    generate_ratings()
    shuffle_ratings()
    generate_items()