• 【笔记3】用pandas实现矩阵数据格式的推荐算法 (基于用户的协同)


    原书作者使用字典dict实现推荐算法,并且惊叹于18行代码实现了向量的余弦夹角公式。

    我用pandas实现相同的公式只要3行。

    特别说明:本篇笔记是针对矩阵数据,下篇笔记是针对条目数据。

    '''
    User-based collaborative filtering recommendation.
    
    Works on matrix-shaped (user x item) rating data.
    '''
    
    import pandas as pd
    from io import StringIO
    import json
    
    # Data format 1: CSV matrix (users x items); suitable for small data volumes.
    csv_txt = '''"user","Blues Traveler","Broken Bells","Deadmau5","Norah Jones","Phoenix","Slightly Stoopid","The Strokes","Vampire Weekend"
    "Angelica",3.5,2.0,,4.5,5.0,1.5,2.5,2.0
    "Bill",2.0,3.5,4.0,,2.0,3.5,,3.0
    "Chan",5.0,1.0,1.0,3.0,5,1.0,,
    "Dan",3.0,4.0,4.5,,3.0,4.5,4.0,2.0
    "Hailey",,4.0,1.0,4.0,,,4.0,1.0
    "Jordyn",,4.5,4.0,5.0,5.0,4.5,4.0,4.0
    "Sam",5.0,2.0,,3.0,5.0,4.0,5.0,
    "Veronica",3.0,,,5.0,4.0,2.5,3.0,'''
    
    
    # Data format 2: JSON data (user -> item -> rating).
    json_txt = '''{"Angelica": {"Blues Traveler": 3.5, "Broken Bells": 2.0,
                          "Norah Jones": 4.5, "Phoenix": 5.0,
                          "Slightly Stoopid": 1.5,
                          "The Strokes": 2.5, "Vampire Weekend": 2.0},
             
             "Bill":{"Blues Traveler": 2.0, "Broken Bells": 3.5,
                     "Deadmau5": 4.0, "Phoenix": 2.0,
                     "Slightly Stoopid": 3.5, "Vampire Weekend": 3.0},
             
             "Chan": {"Blues Traveler": 5.0, "Broken Bells": 1.0,
                      "Deadmau5": 1.0, "Norah Jones": 3.0, "Phoenix": 5,
                      "Slightly Stoopid": 1.0},
             
             "Dan": {"Blues Traveler": 3.0, "Broken Bells": 4.0,
                     "Deadmau5": 4.5, "Phoenix": 3.0,
                     "Slightly Stoopid": 4.5, "The Strokes": 4.0,
                     "Vampire Weekend": 2.0},
             
             "Hailey": {"Broken Bells": 4.0, "Deadmau5": 1.0,
                        "Norah Jones": 4.0, "The Strokes": 4.0,
                        "Vampire Weekend": 1.0},
             
             "Jordyn":  {"Broken Bells": 4.5, "Deadmau5": 4.0,
                         "Norah Jones": 5.0, "Phoenix": 5.0,
                         "Slightly Stoopid": 4.5, "The Strokes": 4.0,
                         "Vampire Weekend": 4.0},
             
             "Sam": {"Blues Traveler": 5.0, "Broken Bells": 2.0,
                     "Norah Jones": 3.0, "Phoenix": 5.0,
                     "Slightly Stoopid": 4.0, "The Strokes": 5.0},
             
             "Veronica": {"Blues Traveler": 3.0, "Norah Jones": 5.0,
                          "Phoenix": 4.0, "Slightly Stoopid": 2.5,
                          "The Strokes": 3.0}
    }'''
    
    
    # Module-level rating matrix; assigned by load_csv_txt() or load_json_txt()
    # and read by the metric and recommendation functions below.
    df = None
    
    # Option 1: load the CSV data
    def load_csv_txt():
        """Parse ``csv_txt`` into the module-level ``df``, indexed by the "user" column."""
        global df
        csv_buffer = StringIO(csv_txt)
        df = pd.read_csv(csv_buffer, header=0, index_col="user")
    
    # Option 2: load the JSON data (read the JSON into a matrix)
    def load_json_txt():
        """Parse ``json_txt`` into the module-level ``df``.

        ``orient='index'`` makes the outer JSON keys (user names) the row
        index and the inner keys (item names) the columns.
        """
        global df
        # Passing a literal JSON string to read_json is deprecated in recent
        # pandas releases; wrap it in a file-like object instead.
        df = pd.read_json(StringIO(json_txt), orient='index')
        
        
    # Test: load the data (CSV by default; the JSON loader is the alternative)
    load_csv_txt()
    #load_json_txt()
    
    
    
    def build_xy(user_name1, user_name2):
        """Return both users' ratings restricted to the items they have both rated.

        user_name1, user_name2: row labels in the module-level ``df``.

        Returns: a pair of pd.Series (one per user) sharing the same item index;
        both are empty when the users have no rated item in common.
        """
        # DataFrame.ix was removed in pandas 1.0; .loc is the label-based lookup.
        ratings1 = df.loc[user_name1]
        ratings2 = df.loc[user_name2]
        rated_by_both = ratings1.notnull() & ratings2.notnull()
        return ratings1[rated_by_both], ratings2[rated_by_both]
    
    
    # Manhattan distance
    def manhattan(user_name1, user_name2):
        """Sum of absolute rating differences over the items both users rated."""
        ratings_a, ratings_b = build_xy(user_name1, user_name2)
        absolute_gaps = abs(ratings_a - ratings_b)
        return sum(absolute_gaps)
        
    # Euclidean distance
    def euclidean(user_name1, user_name2):
        """Square root of the summed squared rating differences over the items both users rated."""
        ratings_a, ratings_b = build_xy(user_name1, user_name2)
        gaps = ratings_a - ratings_b
        squared_total = sum(gaps**2)
        return squared_total**0.5
        
    # Minkowski distance
    def minkowski(user_name1, user_name2, r):
        """Minkowski distance of order ``r`` over the items both users rated."""
        ratings_a, ratings_b = build_xy(user_name1, user_name2)
        powered_gaps = abs(ratings_a - ratings_b)**r
        return sum(powered_gaps)**(1/r)
        
    # Pearson correlation coefficient
    def pearson(user_name1, user_name2):
        """Pearson correlation of the two users' ratings over the items both rated.

        Returns 0 when the denominator is zero, i.e. when the users share no
        rated item or when either user's shared ratings have no variance.
        """
        x, y = build_xy(user_name1, user_name2)
        # Deviations from each user's mean over the shared items
        dev_x = x - x.mean()
        dev_y = y - y.mean()
        denominator = (sum(dev_x**2) * sum(dev_y**2))**0.5
        # Check before dividing: with no shared items the built-in sum() yields
        # the int 0, and dividing it by 0.0 would raise ZeroDivisionError.
        if denominator == 0:
            return 0
        return sum(dev_x * dev_y) / denominator
        
    
    # Cosine similarity (copes with sparse data; widely used in text mining)
    def cosine(user_name1, user_name2):
        """Cosine of the angle between the two users' rating vectors over shared items.

        Returns 0 when the denominator is zero, e.g. when the users share no
        rated item.
        """
        x, y = build_xy(user_name1, user_name2)
        denominator = (sum(x*x) * sum(y*y))**0.5
        # Check before dividing: with no shared items the built-in sum() yields
        # the int 0, and dividing it by 0.0 would raise ZeroDivisionError.
        if denominator == 0:
            return 0
        return sum(x*y) / denominator
    
    # Dispatch table: metric name -> function comparing two users by name
    # (minkowski additionally takes the order r as a third argument).
    metric_funcs = {
        'manhattan': manhattan,
        'euclidean': euclidean,
        'minkowski': minkowski,
        'pearson': pearson,
        'cosine': cosine
    }
    
    # Example: Manhattan distance between Angelica and Bill
    print(manhattan("Angelica","Bill"))
    
    # Find the nearest neighbours
    def computeNearestNeighbor(user_name, metric='pearson', k=3, r=2):
        '''
        user_name: user whose neighbours are wanted (excluded from the result)
        metric: name of the metric, one of the keys of metric_funcs
        k:      number of neighbours to return
        r:      order of the Minkowski distance (only used by 'minkowski')
        
        Returns: pd.Series whose index holds the neighbour names and whose
        values hold the metric value. Distances ('manhattan', 'euclidean',
        'minkowski') keep the k smallest values; similarities ('pearson',
        'cosine') keep the k largest.
        
        Raises: ValueError for an unknown metric name (previously the function
        silently returned None).
        '''
        distance_metrics = ('manhattan', 'euclidean', 'minkowski')
        similarity_metrics = ('pearson', 'cosine')
        if metric not in distance_metrics + similarity_metrics:
            raise ValueError('unknown metric: %r' % (metric,))
        
        # Each candidate name is passed as the first argument of the metric
        # function; the remaining positional arguments come from extra_args.
        extra_args = (user_name, r) if metric == 'minkowski' else (user_name,)
        candidates = df.drop(user_name).index.to_series()
        scores = candidates.apply(metric_funcs[metric], args=extra_args)
        
        if metric in similarity_metrics:
            return scores.nlargest(k)
        return scores.nsmallest(k)
        
    # Example: Hailey's nearest neighbours by Pearson correlation
    print(computeNearestNeighbor('Hailey', metric='pearson'))
    
    # Recommend for the given user (returns: pd.Series)
    def recommend(user_name):
        """Recommend items rated by the single nearest neighbour but not by the user.

        Returns: pd.Series whose index holds item names and whose values hold
        the neighbour's ratings, sorted in ascending order.
        """
        # Name of the closest user (default metric and k of computeNearestNeighbor)
        nearest_username = computeNearestNeighbor(user_name).index[0]
        
        # DataFrame.ix was removed in pandas 1.0; .loc is the label-based lookup.
        own_ratings = df.loc[user_name]
        neighbor_ratings = df.loc[nearest_username]
        
        # Items the neighbour has rated but the user has not
        candidates = own_ratings.isnull() & neighbor_ratings.notnull()
        return neighbor_ratings[candidates].sort_values()
    
    
    # Recommend for Hailey
    print(recommend('Hailey'))
    
    
    
    
    # Recommend for the given user
    def recommend2(user_name, metric='pearson', k=3, n=5, r=2):
        '''
        user_name: user to recommend for
        metric: name of the metric, one of the keys of metric_funcs
        k:      number of nearest neighbours that contribute
        r:      order of the Minkowski distance (only used by 'minkowski')
        n:      number of items to recommend
        
        Returns: pd.Series whose index holds item names and whose values hold
        the weighted ratings (the n largest).
        '''
        # The k nearest neighbours under the requested metric. The original
        # hard-coded metric='pearson' here, so the argument was ignored.
        nearest_neighbors = computeNearestNeighbor(user_name, metric=metric, k=k, r=r)
        
        # Turn the metric values into weights
        if metric in ['manhattan', 'euclidean', 'minkowski']:
            # Smaller distance means more similar, so use a decreasing function.
            # 1 / (1 + d) rather than 1 / d keeps the weight finite at d == 0.
            nearest_neighbors = 1 / (1 + nearest_neighbors)
        # For 'pearson' and 'cosine' a larger value already means more similar.
        
        weights = nearest_neighbors / nearest_neighbors.sum()
        
        # DataFrame.ix was removed in pandas 1.0; .loc is the label-based lookup.
        unrated_by_user = df.loc[user_name].isnull()
        
        # For each neighbour: ratings of items the neighbour has rated but the
        # user has not, multiplied by that neighbour's weight
        neighbors_rate_with_weight = []
        for neighbor_name in weights.index:
            neighbor_ratings = df.loc[neighbor_name]
            candidates = unrated_by_user & neighbor_ratings.notnull()
            neighbors_rate_with_weight.append(neighbor_ratings[candidates] * weights[neighbor_name])
        
        # Put the weighted ratings side by side, sum them per item, keep the top n
        return pd.concat(neighbors_rate_with_weight, axis=1).sum(axis=1, skipna=True).nlargest(n)
        
    
    # Recommend for Hailey: Manhattan distance, 3 neighbours, top 5 items
    print(recommend2('Hailey', metric='manhattan', k=3, n=5))
    
    # Recommend for Hailey: Euclidean distance, 3 neighbours, top 5 items
    print(recommend2('Hailey', metric='euclidean', k=3, n=5, r=2))
    
    # Recommend for Hailey: Pearson correlation, single nearest neighbour, top 5 items
    print(recommend2('Hailey', metric='pearson', k=1, n=5))
    
    
  • 相关阅读:
    NFC性价比高频读卡器首选方案:FM17550
    关于ESP8266和ESP8285的对比
    有没有比NRF51822更好的智能穿戴蓝牙方案
    zigbee CC2530首选方案模组:TZU06A1
    PAN3501兼容AS3933-BTST
    NRF51822和NRF52832的主要区别
    集成模拟温度传感器低成本2.4G CC2500RGPR 中文手册
    USB2.0主机控制器 UPD720114 简单详解
    存储器HK1225-7EQ 使用说明书资料
    爬虫 + 数据
  • 原文地址:https://www.cnblogs.com/hhh5460/p/6121839.html
Copyright © 2020-2023  润新知