• 机器学习(十):Apriori算法


    四、代码实现(python)

    以下代码来自Peter Harrington《Machine Learning in Action》。
    代码如下(保存为apriori.py):

    # -*- coding: utf-8 -*-
    from numpy import *
    
    def loadDataSet():
        """Return a small hard-coded transaction database for demos.

        Returns:
            A new list of four transactions, each a list of integer item ids.
        """
        transactions = ((1, 3, 4), (2, 3, 5), (1, 2, 3, 5), (2, 5))
        return [list(items) for items in transactions]
    
    def createC1(dataSet):
        """Build C1, the candidate 1-itemsets, from a transaction database.

        Args:
            dataSet: iterable of transactions, each an iterable of hashable,
                mutually sortable items.

        Returns:
            A list with one single-element frozenset per distinct item, in
            ascending item order.
        """
        # Collect distinct items in a set instead of scanning a list for
        # every item (the list scan was quadratic in the number of items).
        items = set()
        for transaction in dataSet:
            items.update(transaction)
        # Return a real list, not map(...): on Python 3 a map object is a
        # one-shot iterator and would be exhausted after the first pass over
        # the candidates.
        return [frozenset([item]) for item in sorted(items)]
    
    def scanD(D, Ck, minSupport):
        """Count candidate itemsets in the data and keep the frequent ones.

        Args:
            D: iterable of transactions, each a set of items.
            Ck: iterable of candidate itemsets (frozensets, so they can be
                used as dictionary keys).
            minSupport: minimum fraction of transactions that must contain a
                candidate for it to be reported as frequent.

        Returns:
            A tuple (retList, supportData). retList is the list of candidates
            whose support is >= minSupport. supportData maps every candidate
            that occurs in at least one transaction to its support; candidates
            that never occur are absent from it.
        """
        # Materialize both inputs: Ck is re-iterated for every transaction and
        # D is measured with len(), neither of which works on a one-shot
        # iterator such as a Python 3 map object.
        D = list(D)
        Ck = list(Ck)
        ssCnt = {}
        for tid in D:
            for can in Ck:
                if can.issubset(tid):
                    # dict.get replaces dict.has_key, which Python 3 removed.
                    ssCnt[can] = ssCnt.get(can, 0) + 1
        numItems = float(len(D))
        retList = []       # frequent itemsets (support >= minSupport)
        supportData = {}   # support of every counted candidate
        for key in ssCnt:
            support = ssCnt[key] / numItems
            supportData[key] = support
            if support >= minSupport:
                retList.append(key)
        # The previous implementation inserted each hit at the front of the
        # list; reversing once yields the same order without O(n) inserts.
        retList.reverse()
        return retList, supportData
    
    def aprioriGen(Lk, k):
        """Generate candidate k-itemsets by joining (k-1)-itemsets.

        Two itemsets are merged when their first k-2 items, taken in sorted
        order, are identical.

        Args:
            Lk: list of frozensets, expected to be the frequent
                (k-1)-itemsets.
            k: size of the candidate itemsets to produce.

        Returns:
            A list of frozensets, one per joined pair.
        """
        # Sort each itemset BEFORE slicing. Slicing list(itemset) first and
        # sorting afterwards compared arbitrary elements, because set
        # iteration order is unspecified; that could join the wrong pairs or
        # emit the same candidate more than once.
        prefixes = [sorted(itemset)[:k - 2] for itemset in Lk]
        retList = []
        lenLk = len(Lk)
        for i in range(lenLk):
            for j in range(i + 1, lenLk):
                if prefixes[i] == prefixes[j]:
                    retList.append(Lk[i] | Lk[j])
        return retList
    
    def apriori(dataSet, minSupport = 0.5):
        """Find all frequent itemsets in a transaction database.

        Args:
            dataSet: iterable of transactions, each an iterable of items.
            minSupport: minimum support threshold (default 0.5).

        Returns:
            A tuple (L, supportData). L[i] is the list of frequent
            (i+1)-itemsets; the loop stops after appending the first empty
            level, so the last element of L is an empty list whenever any
            frequent 1-itemset exists. supportData maps each counted
            candidate itemset to its support.
        """
        # Materialize the candidates and the transactions as lists: both are
        # scanned once per level, and a Python 3 map object would be
        # exhausted after the first scan (and has no len()).
        C1 = list(createC1(dataSet))
        D = [set(transaction) for transaction in dataSet]
        L1, supportData = scanD(D, C1, minSupport)
        L = [L1]
        k = 2
        # L[k-2] holds the frequent (k-1)-itemsets; stop once a level is empty.
        while L[k - 2]:
            Ck = aprioriGen(L[k - 2], k)            # candidate k-itemsets
            Lk, supK = scanD(D, Ck, minSupport)     # frequent k-itemsets
            supportData.update(supK)
            L.append(Lk)
            k += 1
        return L, supportData
    
    def generateRules(L, supportData, minConf=0.7):
        """Derive association rules from frequent itemsets.

        Args:
            L: list of lists of frequent itemsets, where L[i] holds the
                itemsets of size i+1 (the layout returned by apriori).
            supportData: dict mapping itemsets to their support.
            minConf: minimum confidence a rule must reach (default 0.7).

        Returns:
            A list of (antecedent, consequent, confidence) tuples.
        """
        bigRuleList = []
        # Start at index 1: single-item sets cannot be split into a rule.
        for i in range(1, len(L)):
            for freqSet in L[i]:
                H1 = [frozenset([item]) for item in freqSet]
                # Always evaluate the single-item consequents first. Handing
                # 3+-item sets straight to rulesFromConseq skipped them,
                # because that function begins by merging the consequents it
                # receives into larger ones.
                H1 = calcConf(freqSet, H1, supportData, bigRuleList, minConf)
                # Only sets with 3 or more items (i > 1) can have larger
                # consequents, and merging needs at least two survivors.
                if i > 1 and len(H1) > 1:
                    rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf)
        return bigRuleList
    
    def calcConf(freqSet, H, supportData, brl, minConf=0.7):
        """Evaluate candidate consequents of one frequent itemset.

        For each consequent in H the rule (freqSet - conseq) --> conseq is
        scored with confidence support(freqSet) / support(freqSet - conseq).
        Rules reaching minConf are printed and appended to brl.

        Args:
            freqSet: the frequent itemset (frozenset) the rules come from.
            H: iterable of candidate consequents (frozensets).
            supportData: dict mapping itemsets to support; must contain
                freqSet and every freqSet - conseq.
            brl: list that receives (antecedent, consequent, confidence)
                tuples; mutated in place.
            minConf: minimum confidence threshold (default 0.7).

        Returns:
            The list of consequents whose rule met minConf.

        Raises:
            KeyError: if a required itemset is missing from supportData.
        """
        prunedH = []
        for conseq in H:
            antecedent = freqSet - conseq
            conf = supportData[freqSet] / supportData[antecedent]
            if conf >= minConf:
                # One pre-formatted string prints the same text as the old
                # Python 2 print statement and is also valid on Python 3.
                print('%s --> %s conf: %s' % (antecedent, conseq, conf))
                brl.append((antecedent, conseq, conf))
                prunedH.append(conseq)
        return prunedH
    
    def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7):
        """Grow rule consequents for one frequent itemset, level by level.

        The consequents in H (all of the same size m) are merged into
        candidates of size m+1, which are scored with calcConf; the
        survivors are merged again recursively while more than one remains
        and freqSet is large enough to leave a non-empty antecedent.

        Args:
            freqSet: the frequent itemset (frozenset) the rules come from.
            H: list of consequents (frozensets) of equal size.
            supportData: dict mapping itemsets to support.
            brl: list that receives the accepted rules; mutated in place.
            minConf: minimum confidence threshold (default 0.7).

        Returns:
            None. Accepted rules are appended to brl.
        """
        # Nothing to merge; this also avoids an IndexError on H[0] below.
        if not H:
            return
        m = len(H[0])
        # A consequent of size m+1 needs at least one item left over for the
        # antecedent.
        if len(freqSet) > (m + 1):
            Hmp1 = aprioriGen(H, m + 1)                                # merge consequents pairwise
            Hmp1 = calcConf(freqSet, Hmp1, supportData, brl, minConf)  # keep the confident ones
            if len(Hmp1) > 1:
                rulesFromConseq(freqSet, Hmp1, supportData, brl, minConf)
    

     

  • 相关阅读:
    PyCharm 的使用(二)
    redis数据库
    mysql大全
    Python 模块详解及import本质
    logging模块
    redis详细配置
    千万 PV,百万PV什么意思?
    elasticsearch集群添加节点
    elasticsearch集群安全重启节点
    记一次redis-cluster的切换
  • 原文地址:https://www.cnblogs.com/pengfeiz/p/11393027.html
Copyright © 2020-2023  润新知