协同过滤
具体的在推荐系统实践那本书里已经说的差不多了。协同过滤算法
这里主要是做一个算法(UserCF/ItemCF)的复习和说明。补充了那本书的代码。
基本操作
实现步骤:
1、收集用户偏好
2、找到相似的用户或物品
3、计算并推荐
用户评分
1、将不同的行为分组
2、对不同行为进行加权
- 减噪
- 归一化
相似度计算
1、同现相似度:带惩罚权重,减小热门物品和很多物品相似的可能
$W_{A,B} = \frac{|N(A) \bigcap N(B)|}{\sqrt{|N(A)| \, |N(B)|}}$
2、欧几里得距离:距离越小,相似度越大。
$d(x,y) = \sqrt{\sum (x_i - y_i)^2}$
$sim(x,y) = \frac{1}{1 + d(x,y)}$
3、皮尔逊相关系数:衡量两个定距变量之间线性相关的紧密程度,取值范围为[-1,1]
$p(x,y) = \frac{\sum x_iy_i - n\,\overline{x}\,\overline{y}}{(n-1)s_xs_y} = \frac{n\sum x_iy_i - \sum x_i \sum y_i}{\sqrt{n\sum x_i^{2} - (\sum x_i)^2} \sqrt{n\sum y_i^{2} - (\sum y_i)^2}}$
推荐计算
1、UserCF
基于用户对物品的偏好找到相邻用户,然后将相邻用户喜欢的物品推荐给当前用户。
计算时,一个用户对所有物品的偏好作为一个向量来计算相似度。
找到K个相邻用户,根据相似度权重以及物品偏好,预测推荐列表。
2、ItemCF
计算邻居时采用物品本身,基于用户对物品的偏好找到相似物品。
将所有用户对某个物品的偏好作为一个向量来计算物品之间的相似度。
根据用户历史偏好推荐相似物品,生成推荐列表。
CODE
# Neighbourhood-based collaborative filtering (UserCF / ItemCF).
#
# Every function takes `train` as a nested mapping. The code reads it as
# {user: {item: preference}}; for implicit feedback the preference is 1.
import math
from operator import itemgetter


def UserSimilarity(train):
    """Return the user-user similarity matrix with a popularity penalty.

    Each item shared by two users contributes 1 / log(1 + |N(i)|), where
    N(i) is the set of users who interacted with item i, so popular items
    count for less. The accumulated score is normalised by
    sqrt(|N(u)| * |N(v)|), the item counts of the two users.

    Args:
        train: mapping {user: {item: preference}}.

    Returns:
        Nested dict W with W[u][v] = similarity of u and v. Users that share
        no item with anyone do not appear as keys.
    """
    # Inverted index: item -> set of users who interacted with it.
    item_users = dict()
    for u, items in train.items():
        for i in items:
            item_users.setdefault(i, set()).add(u)

    # C[u][v]: penalised co-occurrence score; N[u]: number of items of u.
    C = dict()
    N = dict()
    for i, users in item_users.items():
        # `users` is never empty here, so log(1 + len) >= log(2) > 0.
        penalty = 1 / math.log(1 + len(users))
        for u in users:
            N[u] = N.get(u, 0) + 1
            for v in users:
                if u == v:
                    continue
                row = C.setdefault(u, dict())
                row[v] = row.get(v, 0.0) + penalty

    # Normalise the co-occurrence scores into the final similarity matrix.
    W = dict()
    for u, related_users in C.items():
        W[u] = {
            v: cuv / math.sqrt(N[u] * N[v])
            for v, cuv in related_users.items()
        }
    return W


def Recommend(user, train, W, K):
    """UserCF: score unseen items from the K users most similar to `user`.

    Args:
        user: target user; must be a key of `train`.
        train: mapping {user: {item: preference}} (preference rvi is 1 for
            implicit feedback).
        W: user similarity matrix as returned by UserSimilarity.
        K: number of nearest neighbours to use.

    Returns:
        Dict {item: score} over items the target user has not interacted
        with; empty if the user has no neighbours in W.
    """
    rank = dict()
    interacted_items = train[user]
    neighbours = sorted(
        W.get(user, {}).items(), key=itemgetter(1), reverse=True
    )[:K]
    for v, wuv in neighbours:
        # Walk the neighbour's items, not the target user's own.
        for i, rvi in train[v].items():
            if i in interacted_items:
                continue
            rank[i] = rank.get(i, 0.0) + wuv * rvi
    return rank


def ItemSimilarity(train):
    """Return the item-item similarity matrix with an activity penalty.

    Each user who interacted with both items contributes
    1 / log(1 + |N(u)|), where N(u) is that user's item set, so very active
    users count for less. The accumulated score is normalised by
    sqrt(|N(i)| * |N(j)|), the user counts of the two items.

    Args:
        train: mapping {user: {item: preference}}.

    Returns:
        Nested dict W with W[i][j] = similarity of items i and j. Items that
        never co-occur with another item do not appear as keys.
    """
    # C[i][j]: penalised co-occurrence score; N[i]: number of users of i.
    C = dict()
    N = dict()
    for u, items in train.items():
        for i in items:
            N[i] = N.get(i, 0) + 1
        if len(items) < 2:
            # No item pairs to count; also avoids 1 / log(1) for empty sets.
            continue
        penalty = 1 / math.log(1 + len(items))
        for i in items:
            row = C.setdefault(i, dict())
            for j in items:
                if i == j:
                    continue
                row[j] = row.get(j, 0.0) + penalty

    # Normalise the co-occurrence scores into the final similarity matrix.
    W = dict()
    for i, related_items in C.items():
        W[i] = {
            j: cij / math.sqrt(N[i] * N[j])
            for j, cij in related_items.items()
        }
    return W


def Recommendation(train, user_id, W, K):
    """ItemCF: score unseen items similar to the user's history.

    For every item i in the user's history, the K items most similar to i
    are taken and each unseen one gains preference(i) * similarity(i, j).

    Args:
        train: mapping {user: {item: preference}}.
        user_id: target user; must be a key of `train`.
        W: item similarity matrix as returned by ItemSimilarity.
        K: number of most-similar items considered per history item.

    Returns:
        Dict {item: score} over items not in the user's history.
    """
    rank = dict()
    ru = train[user_id]
    for i, pi in ru.items():
        similar = sorted(
            W.get(i, {}).items(), key=itemgetter(1), reverse=True
        )[:K]
        for j, wj in similar:
            if j in ru:
                continue
            rank[j] = rank.get(j, 0.0) + pi * wj
    return rank


class _RankedItem:
    """Score of one candidate item plus the contributions that produced it."""

    def __init__(self):
        self.weight = 0.0     # total accumulated score
        self.reason = dict()  # history item -> its contribution to weight


def RecommendationWithReason(train, user_id, W, K):
    """ItemCF with explanations.

    Same scoring as Recommendation, but each result also records which
    history items contributed to it. The original snippet defined this under
    the name `Recommendation` a second time, which shadowed the plain
    version; it has its own name here so both stay callable.

    Args:
        train: mapping {user: {item: preference}}.
        user_id: target user; must be a key of `train`.
        W: item similarity matrix as returned by ItemSimilarity.
        K: number of most-similar items considered per history item.

    Returns:
        Dict {item: entry} where entry.weight is the total score and
        entry.reason maps each contributing history item to its share.
    """
    rank = dict()
    ru = train[user_id]
    for i, pi in ru.items():
        similar = sorted(
            W.get(i, {}).items(), key=itemgetter(1), reverse=True
        )[:K]
        for j, wj in similar:
            if j in ru:
                continue
            entry = rank.get(j)
            if entry is None:
                entry = rank[j] = _RankedItem()
            entry.weight += pi * wj
            entry.reason[i] = pi * wj
    return rank
评分预测算法
1、全局平均值:训练集中所有评分记录的评分平均值
2、用户评分平均值:用户u在训练集中所有评分的平均值
3、物品评分平均值:物品i在训练集中接受的所有评分的平均值
4、用户分类对物品的平均值:同类用户对同类物品评分的平均值预测用户对物品的评分
CODE
# Rating prediction by cluster averages.
#
# Records are objects with the attributes .user, .item, .vote and .test;
# PredictAll additionally writes .predict. Only records with test == 0 are
# used for fitting (presumably the training split -- confirm with the loader).
#
# NOTE(review): `basic` is a helper module that is not defined in this
# snippet. Presumably basic.AddToDict(d, k, v) adds v to d[k] and
# basic.AddToMat(m, r, c, v) adds v to m[r][c], creating missing entries --
# confirm against its source.
from operator import itemgetter


class Cluster:
    """Base clustering: every id belongs to the single group 0."""

    def __init__(self, records):
        # id -> group number; filled in by subclasses.
        self.group = dict()

    def GetGroup(self, i):
        """Return the group of `i` (always 0 for the base class)."""
        return 0


class IdCluster(Cluster):
    """Identity clustering: every id is its own group."""

    def __init__(self, records):
        Cluster.__init__(self, records)

    def GetGroup(self, i):
        """Return `i` itself as its group."""
        return i


class UserActivityCluster(Cluster):
    """Splits users into 5 rank buckets by number of training records."""

    def __init__(self, records):
        Cluster.__init__(self, records)
        activity = dict()
        for r in records:
            if r.test != 0:
                continue
            basic.AddToDict(activity, r.user, 1)
        # Rank users from least to most active; bucket = floor(5 * rank / n).
        ordered = sorted(activity.items(), key=itemgetter(1), reverse=False)
        for k, (user, n) in enumerate(ordered):
            self.group[user] = int((k * 5) / (1.0 * len(activity)))

    def GetGroup(self, uid):
        """Return the bucket of user `uid`, or -1 if the user is unknown."""
        return self.group.get(uid, -1)


class ItemPopularityCluster(Cluster):
    """Splits items into 5 rank buckets by number of training records."""

    def __init__(self, records):
        Cluster.__init__(self, records)
        popularity = dict()
        for r in records:
            if r.test != 0:
                continue
            basic.AddToDict(popularity, r.item, 1)
        # Rank items from least to most popular; bucket = floor(5 * rank / n).
        ordered = sorted(popularity.items(), key=itemgetter(1), reverse=False)
        for k, (item, n) in enumerate(ordered):
            self.group[item] = int((k * 5) / (1.0 * len(popularity)))

    def GetGroup(self, item):
        """Return the bucket of `item`, or -1 if the item is unknown."""
        return self.group.get(item, -1)


class UserVoteCluster(Cluster):
    """Groups users by their average training vote: group = int(2 * mean)."""

    def __init__(self, records):
        Cluster.__init__(self, records)
        vote = dict()
        count = dict()
        for r in records:
            if r.test != 0:
                continue
            basic.AddToDict(vote, r.user, r.vote)
            basic.AddToDict(count, r.user, 1)
        for user, v in vote.items():
            ave = v / (count[user] * 1.0)
            self.group[user] = int(ave * 2)

    def GetGroup(self, uid):
        """Return the group of user `uid`, or -1 if the user is unknown."""
        return self.group.get(uid, -1)


class ItemVoteCluster(Cluster):
    """Groups items by their average training vote: group = int(2 * mean)."""

    def __init__(self, records):
        Cluster.__init__(self, records)
        vote = dict()
        count = dict()
        for r in records:
            if r.test != 0:
                continue
            basic.AddToDict(vote, r.item, r.vote)
            basic.AddToDict(count, r.item, 1)
        for item, v in vote.items():
            ave = v / (count[item] * 1.0)
            # Keyed by the item; the original wrote self.group[user] here,
            # which referenced an undefined name.
            self.group[item] = int(ave * 2)

    def GetGroup(self, item):
        """Return the group of `item`, or -1 if the item is unknown."""
        return self.group.get(item, -1)


def PredictAll(records, user_cluster, item_cluster):
    """Set r.predict on every record to its (user group, item group) average.

    The vote sum and count per group pair are collected from records with
    test == 0, then every record (training and test) gets the average of its
    pair written to r.predict.

    Args:
        records: iterable of record objects (.user, .item, .vote, .test).
        user_cluster: object with GetGroup(user) -> group id.
        item_cluster: object with GetGroup(item) -> group id.

    Returns:
        None; the records are updated in place.
    """
    total = dict()
    count = dict()
    for r in records:
        if r.test != 0:
            continue
        gu = user_cluster.GetGroup(r.user)
        gi = item_cluster.GetGroup(r.item)
        basic.AddToMat(total, gu, gi, r.vote)
        basic.AddToMat(count, gu, gi, 1)
    # The loop variable must be `r`: the original iterated with `i` and kept
    # reusing the last training record.
    for r in records:
        gu = user_cluster.GetGroup(r.user)
        gi = item_cluster.GetGroup(r.item)
        # NOTE(review): a group pair never seen with test == 0 raises
        # KeyError here, as in the original; decide on a fallback if needed.
        # The "+ 1.0" in the denominator is kept from the original; it pulls
        # the average toward 0 (presumably smoothing for small groups).
        average = total[gu][gi] / (1.0 * count[gu][gi] + 1.0)
        r.predict = average