#构造一份打分数据集
1 users = {"小明": {"中国合伙人": 5.0, "太平轮": 3.0, "荒野猎人": 4.5, "老炮儿": 5.0, "我的少女时代": 3.0, "肖洛特烦恼": 4.5, "火星救援": 5.0},
2 "小红":{"小时代4": 4.0, "荒野猎人": 3.0, "我的少女时代": 5.0, "肖洛特烦恼": 5.0, "火星救援": 3.0, "后会无期": 3.0},
3 "小阳": {"小时代4": 2.0, "中国合伙人": 5.0, "我的少女时代": 3.0, "老炮儿": 5.0, "肖洛特烦恼": 4.5, "速度与激情7": 5.0},
4 "小四": {"小时代4": 5.0, "中国合伙人": 3.0, "我的少女时代": 4.0, "匆匆那年": 4.0, "速度与激情7": 3.5, "火星救援": 3.5, "后会无期": 4.5},
5 "六爷": {"小时代4": 2.0, "中国合伙人": 4.0, "荒野猎人": 4.5, "老炮儿": 5.0, "我的少女时代": 2.0},
6 "小李": {"荒野猎人": 5.0, "盗梦空间": 5.0, "我的少女时代": 3.0, "速度与激情7": 5.0, "蚁人": 4.5, "老炮儿": 4.0, "后会无期": 3.5},
7 "隔壁老王": {"荒野猎人": 5.0, "中国合伙人": 4.0, "我的少女时代": 1.0, "Phoenix": 5.0, "甄嬛传": 4.0, "The Strokes": 5.0},
8 "邻村小芳": {"小时代4": 4.0, "我的少女时代": 4.5, "匆匆那年": 4.5, "甄嬛传": 2.5, "The Strokes": 3.0}
9 }
1 #定义几种距离计算函数
2 #更高效的方式为把得分向量化之后使用scipy中定义的distance方法
3
4 from math import sqrt
def euclidean_dis(rating1, rating2):
    """Return the Euclidean distance between two users' ratings.

    Only items that appear in both dicts contribute to the distance.

    Args:
        rating1: dict mapping an item name to a numeric score,
            e.g. ``{'Movie A': 1.0, 'Movie B': 5.0}``.
        rating2: dict in the same format.

    Returns:
        The square root of the summed squared score differences over the
        shared items, or -1 when the two dicts share no item.
    """
    # `**` is exponentiation; `^` is bitwise XOR and raises TypeError on floats.
    squared_diffs = [
        (rating1[key] - rating2[key]) ** 2
        for key in rating1
        if key in rating2
    ]
    # -1 is the sentinel for "no item rated by both users".
    if not squared_diffs:
        return -1
    return sqrt(sum(squared_diffs))
20
21
def manhattan_dis(rating1, rating2):
    """Return the Manhattan distance between two users' ratings.

    Both arguments are dicts mapping an item name to a numeric score,
    e.g. ``{'Movie A': 1.0, 'Movie B': 5.0}``. Only items present in both
    dicts are compared.

    Returns:
        The sum of absolute score differences over the shared items, or
        -1 when the two dicts share no item.
    """
    shared_items = [item for item in rating1 if item in rating2]
    # -1 is the sentinel for "no item rated by both users".
    if not shared_items:
        return -1
    return sum(abs(rating1[item] - rating2[item]) for item in shared_items)
37
def cos_dis(rating1, rating2):
    """Return the cosine distance (1 - cosine similarity) of two rating dicts.

    Each dict maps an item name to a numeric score, e.g.
    ``{'Movie A': 1.0, 'Movie B': 5.0}``. The dot product runs over the
    items present in both dicts, while each norm runs over all of that
    user's scores (an unrated item therefore counts as 0).

    Returns:
        ``1 - dot / (|rating1| * |rating2|)``, or -1 when the dicts share
        no item or when either norm is zero (the cosine is undefined).
    """
    # `**` is exponentiation; `^` is bitwise XOR and raises TypeError on floats.
    norm_sq_1 = sum(score ** 2 for score in rating1.values())
    norm_sq_2 = sum(score ** 2 for score in rating2.values())

    shared_items = [key for key in rating1 if key in rating2]
    # -1 is the sentinel for "no item rated by both users".
    if not shared_items:
        return -1

    denominator = sqrt(norm_sq_1 * norm_sq_2)
    # An all-zero rating vector has no direction; avoid dividing by zero.
    if denominator == 0:
        return -1

    dot_product = sum(rating1[key] * rating2[key] for key in shared_items)
    return 1 - dot_product / denominator
61
def pearson_dis(rating1, rating2):
    """Return the Pearson correlation coefficient over the co-rated items.

    Despite the ``_dis`` suffix, the value is a correlation, not a
    distance: a larger value means the two users rate more alike.

    Args:
        rating1: dict mapping an item name to a numeric score,
            e.g. ``{'Movie A': 1.0, 'Movie B': 5.0}``.
        rating2: dict in the same format.

    Returns:
        The correlation of the two users' scores on the items both have
        rated. Returns 0 when they share no item, or when the scores of
        either user on the shared items have zero variance.
    """
    pairs = [(rating1[key], rating2[key]) for key in rating1 if key in rating2]
    n = len(pairs)
    # Without shared items the formulas below would divide by zero.
    if n == 0:
        return 0

    sum_x = sum(x for x, _ in pairs)
    sum_y = sum(y for _, y in pairs)
    sum_xy = sum(x * y for x, y in pairs)
    sum_x2 = sum(x ** 2 for x, _ in pairs)
    sum_y2 = sum(y ** 2 for _, y in pairs)

    # Floating-point rounding can push a zero variance term slightly below
    # zero, which would make sqrt raise ValueError; clamp at zero.
    var_x = max(sum_x2 - sum_x ** 2 / n, 0.0)
    var_y = max(sum_y2 - sum_y ** 2 / n, 0.0)
    denominator = sqrt(var_x) * sqrt(var_y)
    if denominator == 0:
        return 0
    return (sum_xy - (sum_x * sum_y) / n) / denominator
1 #查找最近邻
def computeNearestNeighbor(username, users):
    """Rank every other user by Pearson distance to ``username``.

    Args:
        username: key of the target user in ``users``.
        users: dict mapping a user name to that user's rating dict.

    Returns:
        A list of ``(distance, user)`` tuples sorted ascending, so the
        nearest neighbour comes first. ``distance`` is
        ``1 - pearson_dis(...)``. The list is empty when ``users`` holds
        no user other than ``username``.
    """
    target_ratings = users[username]
    distances = []
    for user, ratings in users.items():
        if user == username:
            continue
        # pearson_dis returns a correlation (higher = more similar), so it
        # is converted to a distance before the ascending sort; sorting the
        # raw correlation would rank the least similar user first.
        # A true distance such as manhattan_dis can be used here directly,
        # without the `1 -` conversion.
        distance = 1 - pearson_dis(ratings, target_ratings)
        distances.append((distance, user))
    # Smaller distance = closer neighbour.
    distances.sort()
    return distances
13
14 #推荐
def recommend(username, users):
    """Print the movies the nearest neighbour rated that ``username`` has not.

    Args:
        username: key of the target user in ``users``.
        users: dict mapping a user name to that user's rating dict.

    Prints one ``<title> <score>`` line per movie, ordered by the
    neighbour's score from highest to lowest. Prints nothing when there is
    no other user to borrow ratings from.
    """
    neighbors = computeNearestNeighbor(username, users)
    # With no other user there is no neighbour to index into.
    if not neighbors:
        return
    nearest = neighbors[0][1]

    neighbor_ratings = users[nearest]
    user_ratings = users[username]
    # Keep what the neighbour has rated but the target user has not.
    recommendations = [
        (title, score)
        for title, score in neighbor_ratings.items()
        if title not in user_ratings
    ]
    results = sorted(recommendations, key=lambda pair: pair[1], reverse=True)
    for title, score in results:
        print(title, score)
# Quick check: print the recommendations for user '小阳'.

recommend('小阳', users)
4 后会无期 4.5
5 匆匆那年 4.0
6 火星救援 3.5
# Simple matrix factorization for rating prediction and recommendation
2 #要用到numpy模块
3 import numpy
4
5 #手写矩阵分解
6 #现在有很多很方便对高维矩阵做分解的package,比如libmf, svdfeature等
def matrix_factorization(R, P, Q, K, steps=5000, alpha=0.0002, beta=0.02):
    """Factorise ``R`` as approximately ``P . Q^T`` by gradient descent.

    Args:
        R: N x M rating matrix; entries that are not > 0 are treated as
            missing and skipped.
        P: N x K initial user-factor matrix.
        Q: M x K initial item-factor matrix.
        K: number of latent factors to update (normally the number of
            columns of ``P`` and ``Q``).
        steps: maximum number of passes over the observed entries.
        alpha: learning rate.
        beta: regularisation weight.

    Returns:
        A ``(P, Q)`` tuple of the fitted N x K and M x K factor matrices.
        The arrays passed in are copied and left unmodified.
    """
    R = numpy.asarray(R)
    # Work on float copies so the caller's arrays are not mutated.
    P = numpy.array(P, dtype=float)
    Q = numpy.array(Q, dtype=float).T  # K x M from here on

    observed_mask = R > 0
    # Row-major (i, j) list of the rated cells, computed once up front.
    observed = list(zip(*numpy.nonzero(observed_mask)))
    # How many rated cells each row / column has; used to weight the
    # regularisation term of the error.
    row_counts = observed_mask.sum(axis=1)
    col_counts = observed_mask.sum(axis=0)

    for _ in range(steps):
        for i, j in observed:
            eij = R[i, j] - numpy.dot(P[i, :], Q[:, j])
            p_row = P[i, :K]
            q_col = Q[:K, j]
            # As in the per-factor loop this replaces, Q's update uses the
            # already-updated P values.
            p_new = p_row + alpha * (2 * eij * q_col - beta * p_row)
            Q[:K, j] = q_col + alpha * (2 * eij * p_new - beta * q_col)
            P[i, :K] = p_new

        # Regularised squared error over the rated cells.
        residual = (R - numpy.dot(P, Q))[observed_mask]
        e = numpy.sum(residual ** 2) + (beta / 2) * (
            numpy.dot(row_counts, (P[:, :K] ** 2).sum(axis=1))
            + numpy.dot(col_counts, (Q[:K, :] ** 2).sum(axis=0))
        )
        if e < 0.001:
            break
    return P, Q.T
# Build a small rating matrix and fit it with matrix factorization.
# Zero entries are skipped by matrix_factorization as missing ratings.

R = numpy.array([
    [5, 3, 0, 1],
    [4, 0, 3, 1],
    [1, 1, 0, 5],
    [1, 0, 0, 4],
    [0, 1, 5, 4],
])

# N rows and M columns of R; K latent factors per row/column.
N, M = R.shape
K = 2

# Random starting factors: P is N x K, Q is M x K.
P = numpy.random.rand(N, K)
Q = numpy.random.rand(M, K)

nP, nQ = matrix_factorization(R, P, Q, K)
# Reconstructed rating matrix from the fitted factors.
nR = numpy.dot(nP, nQ.T)
array([[ 0.38345373, 2.181972 ],
[ 0.32334816, 1.56283276],
[ 1.99170613, 0.16400981],
[ 1.59666903, 0.14124969],
[ 1.64308192, 1.07125805]])
array([[ 0.38946426, 2.29198167],
[ 0.19720283, 1.18916254],
[ 1.71589715, 1.76060186],
[ 2.48314488, 0.03019937]])
array([[ 5.15038133, 2.67033753, 4.49955112, 1.01806534],
[ 3.70791658, 1.92222735, 3.30635845, 0.85011689],
[ 1.15160585, 0.58780442, 3.70631887, 4.95064787],
[ 0.94558722, 0.48283649, 2.98840431, 3.96902618],
[ 3.0952255 , 1.59792036, 4.70541851, 4.11236178]])
array([[5, 3, 0, 1],
[4, 0, 3, 1],
[1, 1, 0, 5],
[1, 0, 0, 4],
[0, 1, 5, 4]])