• 基于评分的商品top-N推荐系统


    import io  # needed because of weird encoding of u.item file
    import os
    from surprise import KNNBaseline
    from surprise import Dataset
    from surprise import get_dataset_dir
    from surprise import Reader
    from surprise import dump
    
    def read_item_names(item_file_path, split_flag='\t'):
        """Read a MovieLens-style u.item file and build two lookup maps.

        Each line is expected to look like ``raw_id<split_flag>name``.
        Returns a pair of dicts: raw id -> item name, and item name -> raw id.
        """
        rid_to_name = {}
        name_to_rid = {}
        with io.open(item_file_path, 'r', encoding='utf8') as handle:
            for raw_line in handle:
                fields = raw_line.split(split_flag)
                raw_id, name = fields[0], fields[1].strip()
                rid_to_name[raw_id] = name
                name_to_rid[name] = raw_id

        return rid_to_name, name_to_rid
    
    # Location where the fitted model is persisted (shared with get_neighbors).
    save_path = os.path.expanduser(r'~/dump_file')

    def train_data(user_item_score_path, split_flag='\t', user_based=False):
        """Fit a KNNBaseline model on a ratings file and dump it to ``save_path``.

        The ratings file must have ``user item rating timestamp`` columns
        separated by ``split_flag``. With ``user_based=False`` similarities
        are computed between items rather than users.
        """
        data_file = os.path.expanduser(user_item_score_path)
        reader = Reader(line_format='user item rating timestamp', sep=split_flag)
        ratings = Dataset.load_from_file(data_file, reader=reader)

        # Train on the whole dataset; pearson-baseline similarities are
        # computed between items (or users, per the flag) during fit.
        full_trainset = ratings.build_full_trainset()
        algo = KNNBaseline(
            sim_options={'name': 'pearson_baseline', 'user_based': user_based})
        algo.fit(full_trainset)

        # Persist the fitted model for later neighbor queries.
        dump.dump(save_path, algo=algo)
        
    
    def get_neighbors(item_name, item_file_path, kk=10):
        """Yield the names of the ``kk`` items most similar to ``item_name``.

        Reloads the model dumped by ``train_data`` from ``save_path``,
        resolves the item name to its inner id, queries the k nearest
        neighbors, and maps the results back to readable item names.
        Returns a generator of names.
        """
        _, algo = dump.load(save_path)  # reload the persisted model
        rid_to_name, name_to_rid = read_item_names(item_file_path)

        # name -> raw id -> inner id used by the trained algorithm
        raw_id = name_to_rid[item_name.strip()]
        inner_id = algo.trainset.to_inner_iid(raw_id)

        neighbor_inner_ids = algo.get_neighbors(inner_id, k=kk)

        # Map inner ids back to raw ids, then raw ids to item names.
        return (rid_to_name[algo.trainset.to_raw_iid(nid)]
                for nid in neighbor_inner_ids)
    
    # Driver: train on the ratings file, then print the 10 nearest
    # neighbors of the item named 'uitems685'.
    # NOTE(review): these raw path strings look garbled -- backslashes appear
    # to have been lost in transcription (presumably r'C:\Users\FELIX\...');
    # confirm against the original source before running.
    u_i_path=r'C:UsersFELIXDesktopsurprise库源码分析uuu.txt'
    train_data(u_i_path)
    i_path=r'C:UsersFELIXDesktopsurprise库源码分析uitems.txt'
    nei_items=get_neighbors('uitems685',i_path,kk=10) 
    for nei in nei_items:
        print(nei)

    如果没有数据的话,可以随机生成测试数据:

    # Generate synthetic test data: 1000 users each rate a random number of
    # the 5000 products; scores are integers in 0-9 (int(random()*10)).
    import random

    # Ratings file: "user\titem\tscore\t\n" (timestamp column left empty).
    # Open the file once instead of re-opening it for every single line,
    # and reconstruct the line terminator as an explicit '\n'.
    with open('uu.txt', 'a', encoding='utf8') as f:
        for _ in range(4):
            for user in range(1000):
                n_ratings = int(random.random() * 100)
                for _ in range(n_ratings):
                    item = int(random.random() * 5000)
                    score = int(random.random() * 10)
                    f.write('{}\t{}\t{}\t\n'.format(user, item, score))

    # Shuffle the rating lines into uuu.txt.
    # random.shuffle works in place and returns None, so its result is
    # never assigned.
    with open('uu.txt', 'r', encoding='utf8') as f:
        lines = f.readlines()
    random.shuffle(lines)
    with open('uuu.txt', 'a', encoding='utf8') as f2:
        f2.writelines(lines)

    # Item file: "id\tuitems<id>\n" for 5000 products.
    with open('uitems.txt', 'w', encoding='utf8') as f:
        f.writelines('{}\tuitems{}\n'.format(i, i) for i in range(5000))
  • 相关阅读:
    Linux系统服务
    Linux进程管理
    Linux压缩打包
    Linux输入输出
    Linux权限管理
    Linux用户管理
    Linux文件管理
    Linux-Shell
    Centos7 安装jdk1.8
    Python数据分析之路
  • 原文地址:https://www.cnblogs.com/felixwang2/p/9415578.html
Copyright © 2020-2023  润新知