• simrank python实现


    1、数据

    pc,hp.com
    pc,hp.com
    pc,hp.com
    pc,hp.com
    pc,hp.com
    pc,hp.com
    pc,hp.com
    pc,hp.com
    pc,hp.com
    pc,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,hp.com
    camera,bestbuy.com
    camera,bestbuy.com
    camera,bestbuy.com
    camera,bestbuy.com
    camera,bestbuy.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,hp.com
    digital camera,bestbuy.com
    digital camera,bestbuy.com
    digital camera,bestbuy.com
    digital camera,bestbuy.com
    digital camera,bestbuy.com
    digital camera,bestbuy.com
    digital camera,bestbuy.com
    tv,bestbuy.com
    tv,bestbuy.com
    tv,bestbuy.com
    tv,bestbuy.com
    tv,bestbuy.com
    tv,bestbuy.com
    tv,bestbuy.com
    tv,bestbuy.com
    tv,bestbuy.com
    tv,bestbuy.com
    tv,bestbuy.com
    tv,bestbuy.com
    tv,bestbuy.com
    tv,bestbuy.com
    tv,bestbuy.com
    flower,teleflora.com
    flower,teleflora.com
    flower,teleflora.com
    flower,teleflora.com
    flower,teleflora.com
    flower,teleflora.com
    flower,teleflora.com
    flower,teleflora.com
    flower,teleflora.com
    flower,teleflora.com
    flower,teleflora.com
    flower,teleflora.com
    flower,teleflora.com
    flower,teleflora.com
    flower,teleflora.com
    flower,teleflora.com
    flower,orchids.com
    flower,orchids.com
    flower,orchids.com
    flower,orchids.com
    flower,orchids.com
    flower,orchids.com
    flower,orchids.com
    flower,orchids.com
    flower,orchids.com
    flower,orchids.com
    flower,orchids.com
    flower,orchids.com
    flower,orchids.com
    flower,orchids.com
    flower,orchids.com
    View Code

    2、simrank 的python实现

    import numpy as np 
    from numpy import matrix
    
    # Load the click log: one "query,ad" record per line.
    with open('sample1 (1).txt','r') as log_fp:
        # Skip blank lines (e.g. a trailing empty line); otherwise the
        # split below yields a 1-tuple and log[1] raises IndexError.
        logs = [line.strip() for line in log_fp if line.strip()]
    logs_tuple = [tuple(log.split(",")) for log in logs]
    
    # Sort the distinct labels so that matrix rows/columns have the same
    # order on every run; a bare set's iteration order depends on string
    # hash randomisation and changes between interpreter runs.
    queries = sorted({log[0] for log in logs_tuple})
    # sample data -> ['camera', 'digital camera', 'flower', 'pc', 'tv']
    ads = sorted({log[1] for log in logs_tuple})
    # sample data -> ['bestbuy.com', 'hp.com', 'orchids.com', 'teleflora.com']
    
    # Click-count matrix: len(queries) rows x len(ads) columns, all zeros
    # (5 x 4 for the sample data).
    graph = np.matrix(np.zeros([len(queries),len(ads)]))
    
    # Label -> position maps, built once so each log line is an O(1) lookup
    # instead of a linear list.index scan.
    _query_pos = {name: pos for pos, name in enumerate(queries)}
    _ad_pos = {name: pos for pos, name in enumerate(ads)}
    
    # graph[i, j] counts how many log lines pair queries[i] with ads[j].
    for log in logs_tuple:
        query = log[0]
        ad = log[1]
        graph[_query_pos[query], _ad_pos[ad]] += 1
    print(graph)
    
    # Similarity matrices start as identity: every node is only similar
    # to itself before simrank() is run.
    query_sim = matrix(np.identity(len(queries)))
    print(query_sim)
    ad_sim = matrix(np.identity(len(ads)))
    print(ad_sim)
    
    def get_ads_num(query):
        """Return the row of ``graph`` holding the click counts for ``query``.

        The row is selected by the position of ``query`` in ``queries``;
        ``list.index`` raises ``ValueError`` if the query is not present.
        """
        return graph[queries.index(query)]
    
    def get_queries_num(ad):
        """Return the click counts of ``ad`` across all queries.

        The counts are the ``ad`` column of ``graph``, returned as a row of
        the transposed matrix. ``list.index`` raises ``ValueError`` if the
        ad is not present in ``ads``.
        """
        column = ads.index(ad)
        return graph.T[column]
    
    def get_ads(query):
        """Return the ads with a positive click count for ``query``.

        The result follows the order of ``ads``.
        """
        counts = get_ads_num(query).tolist()[0]
        # counts[k] is the click count for ads[k], so pair them directly.
        return [name for name, clicks in zip(ads, counts) if clicks > 0]
    
    def get_queries(ad):
        """Return the queries with a positive click count for ``ad``.

        The result follows the order of ``queries``.
        """
        counts = get_queries_num(ad).tolist()[0]
        # counts[k] is the click count for queries[k], so pair them directly.
        return [name for name, clicks in zip(queries, counts) if clicks > 0]
    
    
    def query_simrank(q1,q2,c):
        """Compute one SimRank update for the query pair ``(q1, q2)``.

        Returns 1 when both queries are the same. Otherwise returns
        ``c / (clicks(q1) * clicks(q2))`` multiplied by the sum of the
        current ``ad_sim`` entries over every pair of ads clicked from
        ``q1`` and ``q2``, where ``clicks(q)`` is the total click count of
        ``q``.

        NOTE(review): the normaliser uses total click counts while the sum
        counts each ad pair once; textbook SimRank divides by the number of
        neighbours instead. Confirm this weighting is intended.
        """
        if q1 == q2 :
            return 1
        scale = c/(get_ads_num(q1).sum() *get_ads_num(q2).sum())
        # Resolve each clicked ad to its ad_sim index once.
        left = [ads.index(name) for name in get_ads(q1)]
        right = [ads.index(name) for name in get_ads(q2)]
        total = sum(ad_sim[i,j] for i in left for j in right)
        return scale*total
    
    
    def ad_simrank(a1,a2,c):
        """Compute one SimRank update for the ad pair ``(a1, a2)``.

        Returns 1 when both ads are the same. Otherwise returns
        ``c / (clicks(a1) * clicks(a2))`` multiplied by the sum of the
        current ``query_sim`` entries over every pair of queries that
        clicked ``a1`` and ``a2``, where ``clicks(a)`` is the total click
        count of ``a``.

        NOTE(review): the normaliser uses total click counts while the sum
        counts each query pair once; textbook SimRank divides by the number
        of neighbours instead. Confirm this weighting is intended.
        """
        if a1 == a2 :
            return 1
        scale = c/(get_queries_num(a1).sum()*get_queries_num(a2).sum())
        # Resolve each clicking query to its query_sim index once.
        left = [queries.index(name) for name in get_queries(a1)]
        right = [queries.index(name) for name in get_queries(a2)]
        total = sum(query_sim[i,j] for i in left for j in right)
        return scale*total
    
    
    def simrank(c=0.8,times = 1):
        """Run ``times`` SimRank passes, updating ``query_sim`` and ``ad_sim``.

        ``c`` is the factor passed through to ``query_simrank`` and
        ``ad_simrank``. Within a pass both new matrices are computed from
        the matrices of the previous pass; the globals are rebound only
        after both have been filled in.
        """
        global query_sim,ad_sim
    
        for _ in range(times):
            next_query_sim = matrix(np.identity(len(queries)))
            for i, qi in enumerate(queries):
                for j, qj in enumerate(queries):
                    next_query_sim[i,j] = query_simrank(qi,qj,c)
    
            next_ad_sim = matrix(np.identity(len(ads)))
            for i, ai in enumerate(ads):
                for j, aj in enumerate(ads):
                    next_ad_sim[i,j] = ad_simrank(ai,aj,c)
    
            # Publish both results together so neither update above saw
            # a half-updated state.
            query_sim, ad_sim = next_query_sim, next_ad_sim
    
    
    if __name__ == '__main__':
        # Show the row/column labels, run one default SimRank pass, then
        # show the resulting similarity matrices.
        for labels in (queries, ads):
            print(labels)
        simrank()
        # The globals are read only after simrank() has rebound them.
        for similarity in (query_sim, ad_sim):
            print(similarity)

    [[15.  0.  0.  0.]
     [ 0.  0. 10.  0.]
     [ 5.  0. 20.  0.]
     [ 7.  0. 30.  0.]
     [ 0. 16.  0. 15.]]
    [[1. 0. 0. 0. 0.]
     [0. 1. 0. 0. 0.]
     [0. 0. 1. 0. 0.]
     [0. 0. 0. 1. 0.]
     [0. 0. 0. 0. 1.]]
    [[1. 0. 0. 0.]
     [0. 1. 0. 0.]
     [0. 0. 1. 0.]
     [0. 0. 0. 1.]]
    ['tv', 'pc', 'camera', 'digital camera', 'flower']
    ['bestbuy.com', 'teleflora.com', 'hp.com', 'orchids.com']
    [[1.         0.         0.00213333 0.00144144 0.        ]
     [0.         1.         0.0032     0.00216216 0.        ]
     [0.00213333 0.0032     1.         0.00172973 0.        ]
     [0.00144144 0.00216216 0.00172973 1.         0.        ]
     [0.         0.         0.         0.         1.        ]]
    [[1.00000000e+00 0.00000000e+00 9.87654321e-04 0.00000000e+00]
     [0.00000000e+00 1.00000000e+00 0.00000000e+00 3.33333333e-03]
     [9.87654321e-04 0.00000000e+00 1.00000000e+00 0.00000000e+00]
     [0.00000000e+00 3.33333333e-03 0.00000000e+00 1.00000000e+00]]
  • 相关阅读:
    MySQL中的字符串函数
    用google map实现周边搜索功能
    用 wait-notify 写一段代码来解决生产者-消费者问题
    equals和hashcode为什么要一起重写
    Java知多少(107)几个重要的java数据库访问类和接口
    Java知多少(106)程序与数据库连接
    Java知多少(105)套接字(Socket)
    Java知多少(104)网络编程之统一资源定位符URL
    Java知多少(103)网络编程之IP地址和InetAddress类
    Java知多少(102)多媒体基础
  • 原文地址:https://www.cnblogs.com/spp666/p/11821700.html
Copyright © 2020-2023  润新知