• 个性化召回算法实践(三)——PersonalRank算法


    将用户行为表示为二分图模型。假设给用户\(u\)进行个性化推荐,要计算所有节点相对于用户\(u\)的相关度,则PersonalRank从用户\(u\)对应的节点开始游走,每到一个节点都以\(1-d\)的概率停止游走并从\(u\)重新开始,或者以\(d\)的概率继续游走(下文代码中的参数alpha即此处的\(d\)),从当前节点指向的节点中按照均匀分布随机选择一个节点往下游走。这样经过很多轮游走之后,每个顶点被访问到的概率也会收敛趋于稳定,这个时候我们就可以用概率来进行排名了。
    在执行算法之前,我们需要初始化每个节点的初始概率值。如果我们对用户\(u\)进行推荐,则令\(u\)对应的节点的初始访问概率为1,其他节点的初始访问概率为0,然后再使用迭代公式计算。

    \[ PR(i) = (1-d)\,r_i + d \sum_{j \in in(i)} \frac{PR(j)}{|out(j)|}, \qquad r_i = \begin{cases} 1, & i = u \\ 0, & i \neq u \end{cases} \]

    一般有两种算法实现,一种是矩阵化实现,一种是非矩阵化实现。

    非矩阵化实现

    根据userID与itemID建立二分图。在代码中,self.G代表全局有向图,为区分userID与itemID分别加了不同的前缀。另外,user-item对保存在图中,方向是相互的。接下来,就在图中根据概率进行转移。

    其中G = dict(item_user,**user_item)的含义是将两个dict拼接成一个dict

    import pandas as pd
    import time
    
    class PersonalRank:
        """PersonalRank (random walk with restart) on a user-item bipartite graph.

        Node names are the raw ids prefixed with ``'user_'`` or ``'item_'`` so
        that a user id and an item id with the same value stay distinct.
        ``self.G`` maps every node name to a dict of its neighbours.
        """

        def __init__(self, X, Y):
            """Build the graph from parallel sequences of user ids and item ids.

            Args:
                X: iterable of user ids; the k-th user interacted with the k-th item.
                Y: iterable of item ids, same length as X.
            """
            X, Y = ['user_' + str(x) for x in X], ['item_' + str(y) for y in Y]
            self.G = self.get_graph(X, Y)

        def get_graph(self, X, Y):
            """Build the adjacency dict of the bipartite graph.

            Every (user, item) pair is stored in both directions with weight 1.

            Args:
                X: sequence of user node names.
                Y: sequence of item node names, same length as X.
            Returns:
                dict mapping each node name to ``{neighbour_name: 1, ...}``;
                item nodes come first, then user nodes.
            Raises:
                ValueError: if X and Y differ in length.
            """
            if len(X) != len(Y):
                raise ValueError(
                    'X and Y must have the same length, got %d and %d' % (len(X), len(Y)))

            item_user = dict()
            user_item = dict()
            # One pass over the pairs fills both directions of every edge.
            for user, item in zip(X, Y):
                item_user.setdefault(item, {})[user] = 1
                user_item.setdefault(user, {})[item] = 1
            # Merge the two adjacency maps into a single dict (items first, then users).
            G = dict(item_user, **user_item)
            return G

        def recommend(self, alpha, userID, max_depth, K=10):
            """Rank all graph nodes by their PersonalRank score relative to one user.

            The ranking covers every node of the graph: no filtering of user nodes
            or of items the user already interacted with is applied.  The top K
            entries and the elapsed iteration time are printed.

            Args:
                alpha: probability of continuing the walk; 1 - alpha is the
                    probability of restarting from the user node.
                userID: raw user id (without the 'user_' prefix).
                max_depth: number of propagation iterations.
                K: number of top-ranked nodes to report.
            Returns:
                list of (node_name, score) tuples sorted by descending score, at
                most K long; an empty list if the user is not in the graph.
            """
            userID = 'user_' + str(userID)
            # An unknown user has no start node; without this guard the restart
            # step below would fail with a KeyError.
            if userID not in self.G:
                return []

            rank = {x: 0 for x in self.G}
            rank[userID] = 1
            # Start iterating.
            begin = time.time()
            for _ in range(max_depth):
                tmp = {x: 0 for x in self.G}
                # i is a node, ri the set of nodes its out-edges point to.
                for i, ri in self.G.items():
                    # All edge weights are 1, so each neighbour receives an equal
                    # 1/len(ri) share of the probability mass that keeps walking.
                    share = alpha * rank[i] / (1.0 * len(ri))
                    for j in ri:
                        tmp[j] += share
                # Restart mass always flows back to the target user.
                tmp[userID] += (1 - alpha)
                rank = tmp
            end = time.time()
            print('use_time', end - begin)
            lst = sorted(rank.items(), key=lambda x: x[1], reverse=True)[:K]
            for ele in lst:
                print("%s:%.3f, \t" % (ele[0], ele[1]))
            return lst
    
    if __name__ == '__main__':
        # MovieLens-1M data files; only the ratings table is used below, the
        # users/movies tables can be loaded the same way if needed.
        moviesPath = '../data/ml-1m/movies.dat'
        ratingsPath = '../data/ml-1m/ratings.dat'
        usersPath = '../data/ml-1m/users.dat'

        # '::' is a multi-character separator, which only pandas' Python parser
        # supports; naming the engine explicitly avoids the ParserWarning fallback.
        ratingsDF = pd.read_csv(ratingsPath, index_col=None, sep='::', header=None,
                                names=['user_id', 'movie_id', 'rating', 'timestamp'],
                                engine='python')
        # Keep the demo small: build the graph from the first 1000 ratings only.
        X = ratingsDF['user_id'][:1000]
        Y = ratingsDF['movie_id'][:1000]
        # Print the 10 nodes ranked closest to user 1.
        PersonalRank(X, Y).recommend(alpha=0.8, userID=1, max_depth=50, K=10)
    

    矩阵化实现

    \[ r = (1-\alpha)\, r_o + \alpha M^T r \]

    其中,\(r\)是\(m+n\)行,1列的矩阵,每一行代表该顶点对固定顶点的PR值;\(r_o\)是\(m+n\)行,1列的矩阵,负责选取某一个顶点作为固定顶点,其数值只有1行为1,其余为0。\(M\)是\(m+n\)行,\(m+n\)列的矩阵,是转移矩阵,其值\(M_{ij}=\frac{1}{|out(i)|}\)(当\(j \in out(i)\)时),否则为0,即为顶点的出度倒数,若没有连接边则为0。上式可转换为:

    \[ r = (E-\alpha M^T)^{-1}(1-\alpha)\, r_o \]

    其中,\((E-\alpha M^T)^{-1}\)可以看做所有顶点的推荐结果:每一列对应一个固定顶点,该列给出所有顶点相对于这个固定顶点的PR值(相差常数因子\(1-\alpha\),不影响排序)。

    #-*-coding:utf-8-*-
    """
    author:jamest
    date:20190310
    PersonalRank function with Matrix
    """
    import pandas as pd
    import numpy as np
    import time
    import operator
    from scipy.sparse import coo_matrix
    from scipy.sparse.linalg import gmres
    
    
    class PersonalRank:
        """PersonalRank on a user-item bipartite graph, solved as a sparse linear system.

        Node names are the raw ids prefixed with ``'user_'`` or ``'item_'`` so
        that a user id and an item id with the same value stay distinct.
        ``self.G`` maps every node name to a dict of its neighbours.
        """

        def __init__(self, X, Y):
            """Build the graph from parallel sequences of user ids and item ids.

            Args:
                X: iterable of user ids; the k-th user interacted with the k-th item.
                Y: iterable of item ids, same length as X.
            """
            X, Y = ['user_' + str(x) for x in X], ['item_' + str(y) for y in Y]
            self.G = self.get_graph(X, Y)

        def get_graph(self, X, Y):
            """Build the adjacency dict of the bipartite graph.

            Every (user, item) pair is stored in both directions with weight 1.

            Args:
                X: sequence of user node names.
                Y: sequence of item node names, same length as X.
            Returns:
                dict mapping each node name to ``{neighbour_name: 1, ...}``;
                item nodes come first, then user nodes.
            Raises:
                ValueError: if X and Y differ in length.
            """
            if len(X) != len(Y):
                raise ValueError(
                    'X and Y must have the same length, got %d and %d' % (len(X), len(Y)))

            item_user = dict()
            user_item = dict()
            # One pass over the pairs fills both directions of every edge.
            for user, item in zip(X, Y):
                item_user.setdefault(item, {})[user] = 1
                user_item.setdefault(user, {})[item] = 1
            # Merge the two adjacency maps into a single dict (items first, then users).
            G = dict(item_user, **user_item)
            return G

        def graph_to_m(self):
            """Convert the graph into a sparse transition matrix.

            Returns:
                m: coo_matrix M with M[i, j] = 1 / out_degree(i) when j is a
                    neighbour of i, so every row sums to 1.
                vertex: list of all node names; position = matrix index.
                address_dict: dict mapping node name -> matrix index.
            """
            graph = self.G
            vertex = list(graph.keys())
            address_dict = {node: index for index, node in enumerate(vertex)}
            total_len = len(vertex)
            row = []
            col = []
            data = []
            for element_i, neighbours in graph.items():
                # Keep the exact reciprocal: rounding it would stop rows from
                # summing to 1 and would zero out the weights of high-degree nodes.
                weight = 1.0 / len(neighbours)
                row_index = address_dict[element_i]
                for element_j in neighbours:
                    row.append(row_index)
                    col.append(address_dict[element_j])
                    data.append(weight)
            row = np.array(row, dtype=int)
            col = np.array(col, dtype=int)
            data = np.array(data, dtype=float)
            m = coo_matrix((data, (row, col)), shape=(total_len, total_len))
            return m, vertex, address_dict

        def mat_all_point(self, m_mat, vertex, alpha):
            """Compute E - alpha * M^T, the coefficient matrix of the linear system.

            Args:
                m_mat: sparse transition matrix as returned by graph_to_m.
                vertex: list of all node names (only its length is used).
                alpha: probability of continuing the random walk.
            Returns:
                a sparse matrix equal to identity - alpha * m_mat.T
            """
            total_len = len(vertex)
            diagonal = np.arange(total_len)
            eye_t = coo_matrix((np.ones(total_len), (diagonal, diagonal)),
                               shape=(total_len, total_len))
            return eye_t.tocsr() - alpha * m_mat.tocsr().transpose()

        def recommend_use_matrix(self, alpha, userID, K=10, use_matrix=True):
            """Recommend unseen items for a user by solving (E - alpha*M^T) r = r_0.

            r_0 is the indicator vector of the user node.  The (1 - alpha) factor
            of the PersonalRank formula is not applied, so scores are proportional
            to the PR values; the ranking is unaffected.

            Args:
                alpha: probability of continuing the random walk.
                userID: raw user id (without the 'user_' prefix).
                K: maximum number of items to return.
                use_matrix: unused; kept so existing callers keep working.
            Returns:
                dict mapping item node name -> score rounded to 3 decimals, in
                descending score order, restricted to items the user has not
                interacted with; an empty list if the user is not in the graph.
            """
            import warnings  # local import: only needed for the convergence notice

            m, vertex, address_dict = self.graph_to_m()
            userID = 'user_' + str(userID)
            if userID not in address_dict:
                return []

            mat_all = self.mat_all_point(m, vertex, alpha)
            r_zero = np.zeros(len(vertex))
            r_zero[address_dict[userID]] = 1.0
            # Newer SciPy names the relative tolerance `rtol` and no longer
            # accepts `tol`; older releases only know `tol`.
            try:
                res, info = gmres(mat_all, r_zero, rtol=1e-8)
            except TypeError:
                res, info = gmres(mat_all, r_zero, tol=1e-8)
            if info != 0:
                warnings.warn('gmres did not converge (info=%s); scores may be inaccurate' % info)

            seen = self.G[userID]
            score_dict = {}
            for index, point in enumerate(vertex):
                # Only item nodes are candidates; user nodes (including the
                # target user itself) must not be recommended.
                if not point.startswith('item_'):
                    continue
                # Skip items the user already interacted with.
                if point in seen:
                    continue
                score_dict[point] = round(float(res[index]), 3)

            recom_dict = {}
            ranked = sorted(score_dict.items(), key=operator.itemgetter(1), reverse=True)
            for point, score in ranked[:K]:
                recom_dict[point] = score
            return recom_dict
    
    
    
    
    if __name__ == '__main__':
        # MovieLens-1M data files; only the ratings table is used below, the
        # users/movies tables can be loaded the same way if needed.
        moviesPath = '../data/ml-1m/movies.dat'
        ratingsPath = '../data/ml-1m/ratings.dat'
        usersPath = '../data/ml-1m/users.dat'

        # '::' is a multi-character separator, which only pandas' Python parser
        # supports; naming the engine explicitly avoids the ParserWarning fallback.
        ratingsDF = pd.read_csv(ratingsPath, index_col=None, sep='::', header=None,
                                names=['user_id', 'movie_id', 'rating', 'timestamp'],
                                engine='python')
        # Keep the demo small: build the graph from the first 1000 ratings only.
        X = ratingsDF['user_id'][:1000]
        Y = ratingsDF['movie_id'][:1000]
        rank = PersonalRank(X, Y).recommend_use_matrix(alpha=0.8, userID=1, K=10)
        print('PersonalRank result', rank)
    
    

    参考:
    推荐系统概述(一)
    Github

  • 相关阅读:
    [PY3]——heap模块 和 堆排序
    [PY3]——求TopN/BtmN 和 排序问题的解决
    [转载+补充][PY3]——环境配置(2)——windows下安装pycharm并连接Linux的python环境
    [转载+补充]windows下SVN客户端的安装
    [Visual studio code 常见问题解决] ——中文乱码、
    Smrty模版总结(转)
    cms内容模型标签
    phpcms图文总结(转)
    phpcms总结(转)
    PHP总结
  • 原文地址:https://www.cnblogs.com/hellojamest/p/11763033.html
Copyright © 2020-2023  润新知