一、构造LightFM Dataset
import pandas as pd

# 1. User-item ratings: each row is read as (user_id, item_id, rating).
# NOTE(review): header=None makes pandas treat the first line of the file as
# data; if the CSV ships with a header row it becomes a bogus record -- confirm.
cols = ["user_id", "item_id", "rating"]
user_item_df = pd.read_csv(
    "BX-Book-Ratings.csv",
    encoding="latin-1",
    delimiter=";",
    header=None,
    names=cols,
)

# 2. Item features: no column names are supplied, so the columns are
# addressed by their integer position (0, 1, 2, ...).
item_feature_df = pd.read_csv(
    "BX-Books.csv",
    encoding="latin-1",
    delimiter=";",
    header=None,
    # Skip rows whose column count is wrong (and warn about them). This is
    # the replacement for error_bad_lines=False, which pandas 2.0 removed.
    on_bad_lines="warn",
)
1.fit()方法
from lightfm.data import Dataset

dataset = Dataset()
# 1. fit() takes (list of user ids, list of item ids) and builds the
# dictionaries that map raw ids to internal indices.
dataset.fit(
    list(user_item_df["user_id"].values),
    list(user_item_df["item_id"].values),
)
# The user and item mapping dictionaries can now be inspected.
dataset._user_id_mapping
dataset._item_id_mapping
其中Dataset实例化的时候有两个默认为True的参数
user_identity_features: bool, optional. Create a unique feature for every user in addition to other features. If true (default), a latent vector will be allocated for every user. This is a reasonable default for most applications, but should be set to false if there is very little data for every user.
item_identity_features: bool, optional. Create a unique feature for every item in addition to other features. If true (default), a latent vector will be allocated for every item. This is a reasonable default for most applications, but should be set to false if there is very little data for every item.
def __init__(self, user_identity_features=True, item_identity_features=True):
    """Record the identity-feature flags and start with empty mappings.

    Args:
        user_identity_features: Stored as ``_user_identity_features``.
            Defaults to True.
        item_identity_features: Stored as ``_item_identity_features``.
            Defaults to True.
    """
    self._user_identity_features = user_identity_features
    self._item_identity_features = item_identity_features

    # All four mappings start out as separate empty dictionaries.
    self._user_id_mapping = {}
    self._item_id_mapping = {}
    self._user_feature_mapping = {}
    self._item_feature_mapping = {}
在为True的情况下,会在fit()方法调用时,同时生成特征-内部索引映射字典,此时这个字典的值和物品-内部映射字典一样。
# Inspect the feature -> internal-index mappings of the dataset.
dataset._item_feature_mapping # dict length is 340556, i.e. there are 340556 items
dataset._user_feature_mapping
2. fit_partial()方法,生成特征映射字典
使用额外的物品特征,调用fit_partial()方法,传入参数为物品id列表,物品特征列表,这里只使用作者作为特征。
问:为何又传入了一次物品id列表?答:防止有物品在特征矩阵中出现,但没有在交互矩阵中出现。
# Register the item ids from the feature table (column 0) together with the
# values of column 2, which are used as the item features.
dataset.fit_partial(
    items=list(item_feature_df[0].values),
    item_features=list(item_feature_df[2].values),
)
dataset._item_feature_mapping  # dict length is 443789, i.e. items have 443789-340556 features
3.生成交互矩阵
build_interactions()可以传入(user,item),此时默认weight=1;也可以传入(user,item,weight)。
两种形式分别对应隐式评分和显式评分。
# Only (user, item) pairs are passed, so no explicit weight is supplied.
# The two id columns are zipped directly instead of going through iterrows(),
# which would build a full Series for every row just to read two fields.
(interactions, weights) = dataset.build_interactions(
    zip(user_item_df["user_id"], user_item_df["item_id"])
)
4.创建特征矩阵
# Each entry is (item id, [feature]); column 0 holds the item id and column 2
# the single feature value. The columns are zipped directly rather than
# iterating with iterrows(), which creates a Series per row.
item_features = dataset.build_item_features(
    (item_id, [feature])
    for item_id, feature in zip(item_feature_df[0], item_feature_df[2])
)
print(repr(item_features))  # item_features has shape (item_nums, feature_nums)
二、训练模型
from lightfm import LightFM
from lightfm.evaluation import auc_score, precision_at_k

# Train a model with the BPR loss on the interactions and item features.
model = LightFM(loss='bpr')
model.fit(interactions, item_features=item_features)

# Both metrics are evaluated on the same interactions used for training,
# then averaged with .mean().
train_precision = precision_at_k(
    model,
    interactions,
    k=10,
    item_features=item_features,
).mean()
print('Precision: train %.2f' % train_precision)

train_auc = auc_score(
    model,
    interactions,
    item_features=item_features,
).mean()
print('AUC: train %.2f.' % train_auc)