• 简单的user-based协同过滤算法示例代码


    # Sample rating data set: user name -> {movie title: rating}.
    # Each user has rated only some of the titles, so any two users may share
    # anything from several titles down to none at all.
    users = {
        "小明": {
            "中国合伙人": 5.0, "太平轮": 3.0, "荒野猎人": 4.5, "老炮儿": 5.0,
            "我的少女时代": 3.0, "肖洛特烦恼": 4.5, "火星救援": 5.0,
        },
        "小红": {
            "小时代4": 4.0, "荒野猎人": 3.0, "我的少女时代": 5.0,
            "肖洛特烦恼": 5.0, "火星救援": 3.0, "后会无期": 3.0,
        },
        "小阳": {
            "小时代4": 2.0, "中国合伙人": 5.0, "我的少女时代": 3.0,
            "老炮儿": 5.0, "肖洛特烦恼": 4.5, "速度与激情7": 5.0,
        },
        "小四": {
            "小时代4": 5.0, "中国合伙人": 3.0, "我的少女时代": 4.0, "匆匆那年": 4.0,
            "速度与激情7": 3.5, "火星救援": 3.5, "后会无期": 4.5,
        },
        "六爷": {
            "小时代4": 2.0, "中国合伙人": 4.0, "荒野猎人": 4.5,
            "老炮儿": 5.0, "我的少女时代": 2.0,
        },
        "小李": {
            "荒野猎人": 5.0, "盗梦空间": 5.0, "我的少女时代": 3.0, "速度与激情7": 5.0,
            "蚁人": 4.5, "老炮儿": 4.0, "后会无期": 3.5,
        },
        "隔壁老王": {
            "荒野猎人": 5.0, "中国合伙人": 4.0, "我的少女时代": 1.0,
            "Phoenix": 5.0, "甄嬛传": 4.0, "The Strokes": 5.0,
        },
        "邻村小芳": {
            "小时代4": 4.0, "我的少女时代": 4.5, "匆匆那年": 4.5,
            "甄嬛传": 2.5, "The Strokes": 3.0,
        },
    }
     #定义几种距离计算函数
     #更高效的方式是把得分向量化之后使用scipy.spatial.distance中定义的距离函数
     3 
     from math import sqrt
     def euclidean_dis(rating1, rating2):
         """Return the Euclidean distance between two rating dicts.

         Only titles that appear in both dicts contribute to the result.

         Args:
             rating1: Mapping of title -> numeric rating,
                 e.g. {'小时代4': 1.0, '疯狂动物城': 5.0}.
             rating2: Mapping of the same form.

         Returns:
             The square root of the summed squared rating differences over the
             shared titles, or -1 when the two dicts share no title.
         """
         shared = [key for key in rating1 if key in rating2]
         if not shared:
             # Sentinel for "no title in common", so no distance is defined.
             return -1
         # `**` is exponentiation; `^` would be bitwise XOR, which raises
         # TypeError on floats and gives wrong values on ints.
         squared_sum = sum((rating1[key] - rating2[key]) ** 2 for key in shared)
         return sqrt(squared_sum)
    20 
    21 
    def manhattan_dis(rating1, rating2):
        """Return the Manhattan (L1) distance between two rating dicts.

        Only titles that appear in both dicts contribute to the result.

        Args:
            rating1: Mapping of title -> numeric rating,
                e.g. {'小时代4': 1.0, '疯狂动物城': 5.0}.
            rating2: Mapping of the same form.

        Returns:
            The sum of absolute rating differences over the shared titles, or
            -1 when the two dicts share no title.
        """
        shared = [key for key in rating1 if key in rating2]
        if not shared:
            # Sentinel for "no title in common", so no distance is defined.
            return -1
        return sum(abs(rating1[key] - rating2[key]) for key in shared)
    37 
    def cos_dis(rating1, rating2):
        """Return the cosine distance (1 - cosine similarity) of two rating dicts.

        The dot product runs over the titles rated in both dicts, while each
        norm is taken over all ratings of the respective dict.

        Args:
            rating1: Mapping of title -> numeric rating,
                e.g. {'小时代4': 1.0, '疯狂动物城': 5.0}.
            rating2: Mapping of the same form.

        Returns:
            ``1 - dot / (|rating1| * |rating2|)``, or -1 when the dicts share
            no title or when one of them has zero norm (the cosine is
            undefined in that case).
        """
        shared = [key for key in rating1 if key in rating2]
        if not shared:
            # Sentinel for "no title in common".
            return -1

        # `**` squares the score; `^` would be bitwise XOR and fails on floats.
        norm_sq_1 = sum(score ** 2 for score in rating1.values())
        norm_sq_2 = sum(score ** 2 for score in rating2.values())
        denominator = sqrt(norm_sq_1 * norm_sq_2)
        if denominator == 0:
            # An all-zero rating vector has no direction; avoid dividing by 0.
            return -1

        dot_product = sum(rating1[key] * rating2[key] for key in shared)
        return 1 - dot_product / denominator
    61 
    def pearson_dis(rating1, rating2):
        """Return the Pearson correlation coefficient of two rating dicts.

        Only titles that appear in both dicts are used. Despite the function
        name, the value is a correlation: larger means the two users rate more
        alike, so it is a similarity rather than a distance.

        Args:
            rating1: Mapping of title -> numeric rating,
                e.g. {'小时代4': 1.0, '疯狂动物城': 5.0}.
            rating2: Mapping of the same form.

        Returns:
            The correlation over the shared titles, or 0 when there is no
            shared title or when either user's shared ratings have zero
            variance (the coefficient is undefined in those cases).
        """
        pairs = [(rating1[key], rating2[key]) for key in rating1 if key in rating2]
        n = len(pairs)
        if n == 0:
            # No overlap: every "/ n" below would divide by zero.
            return 0

        sum_x = sum(x for x, _ in pairs)
        sum_y = sum(y for _, y in pairs)
        sum_xy = sum(x * y for x, y in pairs)
        sum_x2 = sum(x ** 2 for x, _ in pairs)
        sum_y2 = sum(y ** 2 for _, y in pairs)

        # These terms are non-negative mathematically, but float rounding can
        # leave them a hair below zero, which sqrt() rejects; clamp at 0.
        spread_x = max(sum_x2 - sum_x ** 2 / n, 0)
        spread_y = max(sum_y2 - sum_y ** 2 / n, 0)
        denominator = sqrt(spread_x) * sqrt(spread_y)
        if denominator == 0:
            return 0
        return (sum_xy - (sum_x * sum_y) / n) / denominator
     # Nearest-neighbour lookup
     def computeNearestNeighbor(username, users):
         """Rank every other user by distance to `username`, nearest first.

         Args:
             username: Key in `users` of the user to compare against.
             users: Mapping of user name -> {title: rating}.

         Returns:
             A list of ``(distance, user)`` tuples sorted in ascending order,
             so index 0 holds the closest user. The list is empty when
             `users` contains nobody besides `username`.
         """
         target = users[username]
         # pearson_dis returns a correlation (larger = more alike). Sorting it
         # ascending would rank the least similar user first, so convert it to
         # a distance before sorting. To use a real distance such as
         # manhattan_dis instead, drop the "1 -".
         distances = [
             (1 - pearson_dis(ratings, target), user)
             for user, ratings in users.items()
             if user != username
         ]
         # Smaller distance means closer; ties fall back to the user name.
         distances.sort()
         return distances
    13 
    # Recommendation
    def recommend(username, users):
        """Print movies for `username` taken from their nearest neighbour.

        Finds the closest user via computeNearestNeighbor and prints every
        title that neighbour rated but `username` has not, one
        ``title rating`` pair per line, highest rating first.

        Args:
            username: Key in `users` of the user to recommend for.
            users: Mapping of user name -> {title: rating}.

        Returns:
            None. The recommendations are printed, not returned.
        """
        neighbours = computeNearestNeighbor(username, users)
        if not neighbours:
            # Nobody else to compare with, hence nothing to recommend.
            return
        nearest = neighbours[0][1]

        seen = users[username]
        # Titles the neighbour rated that the target user has not rated yet.
        candidates = [
            (title, rating)
            for title, rating in users[nearest].items()
            if title not in seen
        ]
        # sorted() is stable: equally rated titles keep the neighbour's order.
        for title, rating in sorted(candidates, key=lambda item: item[1], reverse=True):
            print(title, rating)
    # Quick check: print recommendations for one user.

    recommend('小阳', users)
    4     后会无期 4.5
    5     匆匆那年 4.0
    6     火星救援 3.5
     #简单的矩阵分解进行打分和推荐
     #要用到numpy模块
     3 import numpy
     4 
     # Hand-written matrix factorisation.
     # Ready-made packages for factorising large matrices also exist,
     # e.g. libmf and svdfeature.
     def matrix_factorization(R, P, Q, K, steps=5000, alpha=0.0002, beta=0.02):
         """Approximate R by the product P . Q^T using gradient descent.

         Only cells of R that are greater than 0 are fitted; all other cells
         are skipped (treated as missing).

         Args:
             R: 2-D rating array, indexed R[i][j].
             P: NumPy array indexed P[i][k] for row i of R and factor k;
                 used as the starting point and updated in place.
             Q: NumPy array indexed Q[j][k] for column j of R and factor k;
                 used as the starting point and updated in place (the
                 function works on its transpose).
             K: Number of latent factors.
             steps: Maximum number of passes over the observed cells.
             alpha: Learning rate.
             beta: Regularisation weight.

         Returns:
             The tuple ``(P, Q)`` after fitting, Q again indexed Q[j][k].
         """
         Q = Q.T
         # R never changes, so collect the observed cells once, not per step.
         observed = [
             (i, j)
             for i in range(len(R))
             for j in range(len(R[i]))
             if R[i][j] > 0
         ]
         # range() instead of xrange(): xrange does not exist on Python 3.
         for _ in range(steps):
             for i, j in observed:
                 eij = R[i][j] - numpy.dot(P[i, :], Q[:, j])
                 for k in range(K):
                     # Q's update deliberately sees the P value just written.
                     P[i][k] = P[i][k] + alpha * (2 * eij * Q[k][j] - beta * P[i][k])
                     Q[k][j] = Q[k][j] + alpha * (2 * eij * P[i][k] - beta * Q[k][j])

             # Regularised squared error over the observed cells.
             e = 0
             for i, j in observed:
                 e = e + pow(R[i][j] - numpy.dot(P[i, :], Q[:, j]), 2)
                 for k in range(K):
                     e = e + (beta / 2) * (pow(P[i][k], 2) + pow(Q[k][j], 2))
             if e < 0.001:
                 break
         return P, Q.T
     # Build a small rating matrix and score it with matrix factorisation.
     # Cells equal to 0 are skipped by matrix_factorization (it only fits
     # cells greater than 0).
     R = numpy.array([
         [5, 3, 0, 1],
         [4, 0, 3, 1],
         [1, 1, 0, 5],
         [1, 0, 0, 4],
         [0, 1, 5, 4],
     ])

     N, M = R.shape  # number of rows and columns of R
     K = 2           # number of latent factors

     # Random starting factors; not seeded, so results vary from run to run.
     P = numpy.random.rand(N, K)
     Q = numpy.random.rand(M, K)

     nP, nQ = matrix_factorization(R, P, Q, K)
     # Reconstructed matrix, including values for the cells that were 0 in R.
     nR = numpy.dot(nP, nQ.T)
    1 nP
    array([[ 0.38345373,  2.181972  ],
           [ 0.32334816,  1.56283276],
           [ 1.99170613,  0.16400981],
           [ 1.59666903,  0.14124969],
           [ 1.64308192,  1.07125805]])
    nQ
    array([[ 0.38946426,  2.29198167],
           [ 0.19720283,  1.18916254],
           [ 1.71589715,  1.76060186],
           [ 2.48314488,  0.03019937]])
    1 nR
    array([[ 5.15038133,  2.67033753,  4.49955112,  1.01806534],
           [ 3.70791658,  1.92222735,  3.30635845,  0.85011689],
           [ 1.15160585,  0.58780442,  3.70631887,  4.95064787],
           [ 0.94558722,  0.48283649,  2.98840431,  3.96902618],
           [ 3.0952255 ,  1.59792036,  4.70541851,  4.11236178]])
    1 R
    array([[5, 3, 0, 1],
           [4, 0, 3, 1],
           [1, 1, 0, 5],
           [1, 0, 0, 4],
           [0, 1, 5, 4]])


  • 相关阅读:
    HDU 1335 Basically Speaking(进制转换)
    2016年团体程序设计天梯赛-决赛 L2-3. 互评成绩(25)
    2016年团体程序设计天梯赛-决赛 L1-8. Left-pad(20)
    2016年团体程序设计天梯赛-决赛 L1-7. 到底是不是太胖了(10)
    2016年团体程序设计天梯赛-决赛 L1-6. 一帮一(15)
    2016年团体程序设计天梯赛-决赛 L1-5. 是不是太胖了(5)
    2016年团体程序设计天梯赛-决赛 L1-3. 出租(20)
    2016年团体程序设计天梯赛-决赛 L1-2. I Love GPLT(5)
    2016年团体程序设计天梯赛-决赛 L1-1. 正整数A+B(15)
    Codeforces Round #321 (Div. 2) C. Kefa and Park dfs
  • 原文地址:https://www.cnblogs.com/luozeng/p/8519414.html
Copyright © 2020-2023  润新知