• Apriori算法在购物篮分析中的运用


      购物篮分析是一个很经典的数据挖掘案例,其中运用了Apriori算法。下面以从网上下载的某超市某月份的交易数据为例,利用Apriori算法进行关联分析。示例使用Python + MongoDB实现。

      处理过程:1. 数据建模(将Excel中的数据写入MongoDB数据库);2. 从数据库中读取数据进行分析。

      Excel文件:http://download.csdn.net/detail/artscrafts/6805689

      案例配置文件 setting.py

    # Settings imported (as ``setting``) by load_basket.py and analysis_basket.py.

    # Excel workbook that is read by the loader script.
    data_source = 'supermarket.xls'

    # Address of the MongoDB server.
    host, port = 'localhost', 27017

    # Database name, followed by the two collection names used inside it.
    db_name = 'shopping_basket'
    items_name, record_name = 'goods_items', 'transaction_record'

      读取Excel数据到MongoDB中 load_basket.py

     1 from xlrd import open_workbook
     2 from pymongo import MongoClient
     3 import setting
     4 
     # Module-level handles shared by the loader functions in this script.
     # The workbook is opened at import time with a UTF-8 encoding override.
     wb = open_workbook(filename=setting.data_source, encoding_override='utf-8')
     client = MongoClient(host=setting.host, port=setting.port)
     db = client[setting.db_name]
     # Item names in column order: appended by load_items(), indexed by load_record().
     items = []
     9 
    # read xls
    def read_one_line(workbook, sheet_index=0, row_index=0, start_col_index=0):
        """Yield the cell values of one worksheet row, left to right.

        Args:
            workbook: object whose ``sheets()`` returns a sequence of sheets;
                each sheet exposes ``nrows``, ``ncols`` and ``cell(row, col)``.
            sheet_index: position of the sheet inside ``workbook.sheets()``.
            row_index: zero-based row to read.
            start_col_index: zero-based first column to read. A value outside
                ``0..ncols`` is replaced by ``ncols``, so nothing is yielded.

        Yields:
            ``cell(row_index, col).value`` for every column from
            ``start_col_index`` up to the last column of the sheet.

        Raises:
            IndexError: if ``row_index`` is not a valid row of the sheet.
                Because this is a generator, the error surfaces when the
                result is first iterated, not when the function is called.
        """
        # Honour sheet_index instead of always reading the first sheet.
        sheet = workbook.sheets()[sheet_index]
        max_row = sheet.nrows
        max_col = sheet.ncols
        # Column 0 is a valid start (it is the default); only values that are
        # really out of range fall back to "past the last column".
        if not 0 <= start_col_index <= max_col:
            start_col_index = max_col
        if row_index < 0 or row_index >= max_row:
            raise IndexError('row_index %r is outside 0..%d' % (row_index, max_row - 1))
        # range() rather than xrange() so the code runs on Python 2 and 3.
        for col_index in range(start_col_index, max_col):
            yield sheet.cell(row_index, col_index).value
    20 
    # read xls
    def readlines(workbook, sheet_index=0, start_row_index=0, end_row_index=None, start_col_index=0, end_col_index=None):
        """Yield the rows of a worksheet as lists of cell values.

        Args:
            workbook: object whose ``sheets()`` returns a sequence of sheets;
                each sheet exposes ``nrows``, ``ncols`` and ``cell(row, col)``.
            sheet_index: position of the sheet inside ``workbook.sheets()``.
            start_row_index: zero-based first row to read.
            end_row_index: row to stop before; ``None`` means the sheet's
                ``nrows``.
            start_col_index: zero-based first column to read.
            end_col_index: column to stop before; ``None`` means the sheet's
                ``ncols``.

        Yields:
            One list per row holding ``cell(row, col).value`` for each
            selected column.
        """
        sheet = workbook.sheets()[sheet_index]
        # Compare against None explicitly: an end index of 0 is a legitimate
        # (empty) range and must not be mistaken for "not given".
        if end_row_index is None:
            end_row_index = sheet.nrows
        if end_col_index is None:
            end_col_index = sheet.ncols
        # range() rather than xrange() so the code runs on Python 2 and 3.
        for row_index in range(start_row_index, end_row_index):
            yield [sheet.cell(row_index, col_index).value
                   for col_index in range(start_col_index, end_col_index)]
    30 
    # from xls to mongodb
    def load_items(batch_size=100):
        """Copy the item names from the workbook into the items collection.

        Reads row 1 of ``wb`` starting at column 1, stores every name as a
        ``{'id': n, 'name': name}`` document (ids start at 1) in the collection
        named by ``setting.items_name``, and appends the name to the
        module-level ``items`` list.

        Args:
            batch_size: number of documents sent per insert call.
        """
        collection = db[setting.items_name]
        batch = []
        names = read_one_line(wb, row_index=1, start_col_index=1)
        for item_id, name in enumerate(names, 1):
            batch.append({'id': item_id, 'name': name})
            items.append(name)
            if len(batch) >= batch_size:
                # NOTE(review): Collection.insert is the legacy PyMongo bulk
                # call; newer driver releases expect insert_many instead.
                collection.insert(batch)
                batch = []
        # Flush the final partial batch. Without this, every document after
        # the last full batch (or all of them, for short inputs) is lost.
        if batch:
            collection.insert(batch)
    44 
    # from xls to mongodb
    def load_record(batch_size=100):
        """Copy the transaction rows from the workbook into the record collection.

        Reads ``wb`` from row 2 and column 1 onwards. Each row becomes a
        ``{'id': n, 'items': [...]}`` document (ids start at 1) in the
        collection named by ``setting.record_name``. ``items`` holds the entry
        of the module-level ``items`` list for every column whose cell equals
        ``'T'``, so ``load_items()`` must have filled that list first.

        Args:
            batch_size: number of documents sent per insert call.
        """
        collection = db[setting.record_name]
        batch = []
        rows = readlines(wb, start_row_index=2, start_col_index=1)
        for record_id, row in enumerate(rows, 1):
            bought = [items[col] for col, flag in enumerate(row) if flag == 'T']
            batch.append({'id': record_id, 'items': bought})
            if len(batch) >= batch_size:
                # NOTE(review): Collection.insert is the legacy PyMongo bulk
                # call; newer driver releases expect insert_many instead.
                collection.insert(batch)
                batch = []
        # Flush the final partial batch. Without this, every record after
        # the last full batch (or all of them, for short inputs) is lost.
        if batch:
            collection.insert(batch)
    57 
    58 
    def main():
        """Load the items and then the transaction records into MongoDB.

        The client connection is closed even when one of the loaders raises.
        The closing banner is printed only after a successful run.
        """
        # Parenthesised single-argument print works on Python 2 and Python 3.
        print('........start loading........')
        try:
            # Order matters: load_record() indexes the list load_items() fills.
            load_items()
            load_record()
        finally:
            client.close()
        print('.........end loading.........')


    if __name__ == '__main__':
        main()

      进行数据分析 analysis_basket.py

     1 #Apriori
     2 from pymongo import MongoClient
     3 import setting
     4 
     # Module-level MongoDB handles used by the analysis functions in this script.
     client = MongoClient(host=setting.host, port=setting.port)
     db = client[setting.db_name]
     # One ``items`` value per stored record; appended by filldata(), read by getsupp().
     data = []
     8 
     9 #from mongodb to items
    def filldata():
        """Append the ``items`` field of every stored record to the module-level ``data``."""
        records = db[setting.record_name].find()
        data.extend(record['items'] for record in records)
    15 
    def connect(items):
        """Join frequent k-itemsets into candidate (k+1)-itemsets with their support.

        Two itemsets are joined when they agree on everything but their last
        element. The candidate is the first itemset followed by the last
        element of the second.

        Args:
            items: dict keyed by itemset tuples of equal length. Assumes each
                tuple is itself in sorted order, as the tuples built by this
                function are - TODO confirm for other callers.

        Returns:
            dict mapping every candidate tuple to ``getsupp(candidate)``.
        """
        result = {}
        # The early break below is only correct when itemsets sharing a prefix
        # sit next to each other, so join over sorted keys instead of raw dict
        # order. sorted() also returns a list, which (unlike dict.keys() on
        # Python 3) can be indexed.
        keys = sorted(items)
        length = len(keys)
        for i in range(length):
            left = keys[i]
            prefix = left[:-1]
            for j in range(i + 1, length):
                right = keys[j]
                if right[:-1] != prefix:
                    # Sorted order: no later key can share this prefix.
                    break
                candidate = left + (right[-1],)
                result[candidate] = getsupp(candidate)
        return result
    30 
    31 
    def pruning(items, minsupp):
        """Return a new dict with the entries of ``items`` whose value is >= ``minsupp``.

        Args:
            items: dict mapping itemsets to their support.
            minsupp: minimum support an itemset needs to be kept (inclusive).

        Returns:
            A new dict; ``items`` itself is left untouched.
        """
        # Iterate key/value pairs once instead of keys() plus a second lookup.
        return {key: supp for key, supp in items.items() if supp >= minsupp}
    38 
    def contain(par, sub):
        """Return True when every element of ``sub`` is also in ``par``.

        Args:
            par: container tested with ``in``.
            sub: iterable of elements to look for.

        Returns:
            True if no element of ``sub`` is missing from ``par`` (and
            therefore True for an empty ``sub``), otherwise False.
        """
        return all(v in par for v in sub)
    44 
    45 
    def getsupp(item):
        """Count the rows of the module-level ``data`` that contain every element of ``item``."""
        return sum(1 for row in data if contain(row, item))
    52 
    def apriori(data, minsupp, k):
        """Collect the frequent itemsets of ``data``, level by level.

        Args:
            data: iterable of rows, each an iterable of hashable items. Used
                here for the 1-itemset counts.
            minsupp: minimum support an itemset needs to be kept (inclusive).
            k: exclusive upper bound on the itemset size (levels ``2..k-1``
                are tried after level 1).

        Returns:
            dict whose ``'k=1'`` entry maps 1-tuples to their counts and whose
            ``'K=n'`` entries (n >= 2) map n-tuples to their support. A level
            with no frequent itemset, and every level after it, is absent.

        NOTE(review): supports for n >= 2 are produced by ``connect()``, which
        is not handed this ``data`` argument - confirm callers always pass the
        same rows that ``getsupp()`` reads.
        """
        candidate_set = {}
        for row in data:
            for i in row:
                key = (i,)
                candidate_set[key] = candidate_set.get(key, 0) + 1
        frequently_set = pruning(candidate_set, minsupp)
        # The key spelling ('k=1' here, 'K=n' below) is kept exactly as it was,
        # because callers may look levels up by these strings.
        result = {'k=1': frequently_set}
        for n in range(2, k):
            # Fewer than two itemsets cannot be joined into a larger one.
            if len(frequently_set) < 2:
                break
            frequently_set = pruning(connect(frequently_set), minsupp)
            if not frequently_set:
                break
            # Record the level even when it holds a single itemset. Returning
            # before this line would silently drop a valid frequent itemset.
            result['K=' + str(n)] = frequently_set
        return result
    69 
    def main(minsupp=30, k=8):
        """Load the stored transactions, run Apriori and return its result.

        Args:
            minsupp: minimum support passed to ``apriori()``.
            k: size bound passed to ``apriori()``.

        Returns:
            The dict produced by ``apriori(data, minsupp, k)``.
        """
        try:
            filldata()
        finally:
            # The database is only needed while reading; always release it.
            client.close()
        # Hand the result back instead of discarding it in an unused local.
        return apriori(data, minsupp, k)


    if __name__ == '__main__':
        # Level keys look like 'k=1' / 'K=2'; order them by the number after '='.
        for level, itemsets in sorted(main().items(),
                                      key=lambda pair: int(pair[0].split('=')[1])):
            print('%s: %s' % (level, itemsets))

      

      

  • 相关阅读:
    C语言 · 报时助手
    C语言 · 完美的代价
    C语言 · 十六进制转八进制
    C语言 · 十六进制转十进制
    C语言 · 芯片测试
    C语言 · 素数求和
    C语言 · 五次方数
    Lodop多分出空白页的可能(情况1)
    C-Lodop提示“网页还没下载完毕,请稍等一下再操作.”
    Lodop简答问答大全
  • 原文地址:https://www.cnblogs.com/ArtsCrafts/p/shopping_basket.html
Copyright © 2020-2023  润新知