• oracle plsql 实现apriori算法


    对apriori关联关系算法研究了一段时间,网上能搜到的例子,大部分是python写的,数据集长得像下面这样:

    [[I1,I2,I5],[I2,I4],[I2,I3],[I1,I2,I4],[I1,I3],[I2,I3],[I1,I3],[I1,I2,I3,I5],[I1,I2,I3]]

    而实际的交易数据,一般存储到关系型数据库中,数据是按下面的样子保存:

    TRAN_SEQ_NO,ITEM
    1,I1
    1,I2
    1,I5
    2,I2
    2,I4
    .
    .8,I5
    9,I1
    9,I2
    9,I3

    而且python的程序,写了好多循环,效率不高。

    根据小票数据在数据库中存储的特点,并且apriori算法也不是特别复杂,因此想用plsql实现一下。

    plsql实现的aprioir算法,对原算法做了裁剪,只计算2项集和两个商品之间的关联关系,3项集以上的忽略不计。

    表结构创建

    1.小票表(交易事物表)

    create table CMX_APRIORI_TRANSACTION
    (
      tran_seq_no NUMBER(20), --交易号
      item        VARCHAR2(25)  --商品编码
    )

    2.频繁项集C1

    create table CMX_APRIORI_L1
    (
      item    VARCHAR2(25),  --商品编码
      support NUMBER(9,6),          --支持度
      cnt     NUMBER(8)              --交易次数
    )

    2.关联关系L2(结果表)

    create table CMX_APRIORI_L2
    (
      item_a   VARCHAR2(25),    --前件商品编码
      item_b   VARCHAR2(25),    --后件商品编码
      cnt      NUMBER(8),                 --交易次数
      support  NUMBER(9,6),             --支持度
      conf_a_b NUMBER(9,6),  --置信度
      lift_a_b NUMBER(9,6)      --提升度
    )

     说明:

    小票表插入的数据,必须提前进行处理。同一张小票商品要去重。

    完整plsql代码:

    CREATE OR REPLACE PACKAGE CMX_APRIORI_SQL IS
    
      /*-----------------------------------------------------------------------
      * PROCEDURE NAME      : CMX_APRIORI_SQL
      * COMMENTS            : 商品关联关系计算
      * CODED BY            : ONELANG 2019-1-27
      * CHANGED HISTORY     :
      -----------------------------------------------------------------------*/
      FUNCTION TEST(O_ERROR_MESSAGE IN OUT VARCHAR2)   RETURN BOOLEAN;
    END CMX_APRIORI_SQL;
    /
    CREATE OR REPLACE PACKAGE BODY CMX_APRIORI_SQL IS
    
     /*-----------------------------------------------------------------------
      * PROCEDURE NAME      : INIT_TEST_DATA
      * COMMENTS            : 初始化测试数据
      * CODED BY            : ONELANG 2019-1-27
      * CHANGED HISTORY     :
      -----------------------------------------------------------------------*/
      
      FUNCTION INIT_TEST_DATA(O_ERROR_MESSAGE IN OUT VARCHAR2) RETURN BOOLEAN IS
        L_PROGRAM VARCHAR2(100) := 'CMX_APRIORI_SQL.INIT_TEST_DATA';
      BEGIN
      
        EXECUTE IMMEDIATE 'TRUNCATE TABLE CMX_APRIORI_TRANSACTION';
        
        INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
        VALUES (1,'I1');
        INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
        VALUES (1,'I2');
        INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
        VALUES (1,'I5');
        INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
        VALUES (2,'I2');
        INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
        VALUES (2,'I4');
        INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
        VALUES (3,'I2');
        INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
        VALUES (3,'I3');
        INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
        VALUES (4,'I1');
        INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
        VALUES (4,'I2');
        INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
        VALUES (4,'I4');
        INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
        VALUES (5,'I1');
        INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
        VALUES (5,'I3');
        INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
        VALUES (6,'I2');
        INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
        VALUES (6,'I3');
        INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
        VALUES (7,'I1');
        INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
        VALUES (7,'I3');
        INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
        VALUES (8,'I1');
        INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
        VALUES (8,'I2');
        INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
        VALUES (8,'I3');
        INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
        VALUES (8,'I5');
        INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
        VALUES (9,'I1');
        INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
        VALUES (9,'I2');
        INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
        VALUES (9,'I3');
        COMMIT;
        RETURN TRUE;
      
      EXCEPTION
        WHEN OTHERS THEN
          O_ERROR_MESSAGE := SQLERRM||TO_CHAR(SQLCODE);
          RETURN FALSE;
      END;
    
     /*-----------------------------------------------------------------------
      * PROCEDURE NAME      : GET_TEST_L1
      * COMMENTS            : 计算频繁项集L1
      * CODED BY            : ONELANG 2019-1-27
      * CHANGED HISTORY     :
      -----------------------------------------------------------------------*/
      
    FUNCTION GET_TEST_L1(O_ERROR_MESSAGE IN OUT VARCHAR2,I_SUPPORT IN NUMBER) RETURN BOOLEAN IS
        L_PROGRAM VARCHAR2(100) := 'CMX_APRIORI_SQL.GET_TEST_L1';
        L_TOTAL NUMBER(8);
      BEGIN
        
        EXECUTE IMMEDIATE 'TRUNCATE TABLE CMX_APRIORI_L1';
        
        SELECT COUNT(DISTINCT TRAN_SEQ_NO) INTO L_TOTAL
        FROM CMX_APRIORI_TRANSACTION ;
                 
        INSERT INTO CMX_APRIORI_L1
        SELECT ITEM,ROUND(COUNT(1) / L_TOTAL,6) SUPPORT,COUNT(1) CNT
          FROM CMX_APRIORI_TRANSACTION A
        GROUP BY ITEM
        HAVING COUNT(1) / L_TOTAL >= I_SUPPORT ;
        
        COMMIT;
          
        RETURN TRUE;
      
      EXCEPTION
        WHEN OTHERS THEN
          O_ERROR_MESSAGE := SQLERRM||TO_CHAR(SQLCODE);
        
          RETURN FALSE;
      END;
      
      /*-----------------------------------------------------------------------
      * PROCEDURE NAME      : GET_L2
      * COMMENTS            : 获取关联关系L2
      * CODED BY            : 
      * CHANGED HISTORY     :
      -----------------------------------------------------------------------*/
      FUNCTION GET_TEST_L2(O_ERROR_MESSAGE IN OUT VARCHAR2,I_SUPPORT IN NUMBER) RETURN BOOLEAN IS
        L_PROGRAM VARCHAR2(100) := 'CMX_APRIORI_SQL.GET_L2';
        L_TOTAL NUMBER(8);
      BEGIN
        
        EXECUTE IMMEDIATE 'TRUNCATE TABLE CMX_APRIORI_L2';
        
        SELECT COUNT(DISTINCT TRAN_SEQ_NO) INTO L_TOTAL
        FROM CMX_APRIORI_TRANSACTION ;
        
        --2项集支持度
        INSERT INTO CMX_APRIORI_L2(ITEM_A,ITEM_B,CNT,SUPPORT)
        SELECT ITEM_A,ITEM_B,COUNT(DISTINCT TRAN_SEQ_NO) CNT, ROUND(COUNT(DISTINCT TRAN_SEQ_NO) / L_TOTAL,6) SUPPORT
          FROM (SELECT A.TRAN_SEQ_NO,
                       A.ITEM ITEM_A,
                       B.ITEM ITEM_B
                FROM CMX_APRIORI_TRANSACTION A,
                     CMX_APRIORI_TRANSACTION B
               WHERE A.ITEM IN (SELECT ITEM FROM CMX_APRIORI_L1)
                 AND B.ITEM IN (SELECT ITEM FROM CMX_APRIORI_L1)
                 AND A.TRAN_SEQ_NO = B.TRAN_SEQ_NO
                 AND A.ITEM > B.ITEM             
              )
        GROUP BY ITEM_A,ITEM_B
        HAVING  COUNT(DISTINCT TRAN_SEQ_NO) / L_TOTAL >= I_SUPPORT;
        
        --置信度
        UPDATE CMX_APRIORI_L2 L2
           SET CONF_A_B = (SELECT L2.SUPPORT /  L1.SUPPORT FROM CMX_APRIORI_L1 L1 WHERE L2.ITEM_A = L1.ITEM);
        
        --提升度
        UPDATE CMX_APRIORI_L2 L2
           SET LIFT_A_B = (SELECT L2.CONF_A_B /  L1.SUPPORT FROM CMX_APRIORI_L1 L1 WHERE L2.ITEM_B = L1.ITEM);
        
        COMMIT;
          
        RETURN TRUE;
      
      EXCEPTION
        WHEN OTHERS THEN
          O_ERROR_MESSAGE := SQLERRM||TO_CHAR(SQLCODE);
        
          RETURN FALSE;
      END;  
      
      FUNCTION TEST(O_ERROR_MESSAGE IN OUT VARCHAR2) RETURN BOOLEAN IS
        L_PROGRAM VARCHAR2(100) := 'CMX_APRIORI_SQL.TEST';
        L_MIN_SUPP NUMBER(9,5);
      BEGIN
        
        L_MIN_SUPP := 0.1;
        
        IF INIT_TEST_DATA(O_ERROR_MESSAGE) = FALSE THEN
          RETURN FALSE;
        END IF;
        
        IF GET_TEST_L1(O_ERROR_MESSAGE,L_MIN_SUPP) = FALSE THEN
          RETURN FALSE;
        END IF;
        
        IF GET_TEST_L2(O_ERROR_MESSAGE,L_MIN_SUPP) = FALSE THEN
          RETURN FALSE;
        END IF;
      
        RETURN TRUE;
      
      EXCEPTION
        WHEN OTHERS THEN
          O_ERROR_MESSAGE := SQLERRM||TO_CHAR(SQLCODE);
          RETURN FALSE;
      END;
      
    END CMX_APRIORI_SQL;
    /

    运行:

    declare  
      result boolean;
    begin
      -- Call the function
      result := cmx_apriori_sql.test(o_error_message => :o_error_message);
      -- Convert false/true/null to 0/1/null 
      :result := sys.diutil.bool_to_int(result);
    end;

    运行结果:

    select * from cmx_apriori_l2
    
     ITEM_A  ITEM_B    CNT    SUPPORT    CONF_A_B    LIFT_A_B
    1    I2    I1    4    0.444444    0.571428    0.857142
    2    I4    I2    2    0.222222    1.000000    1.285714
    3    I5    I3    1    0.111111    0.500000    0.750000
    4    I3    I2    4    0.444444    0.666666    0.857142
    5    I5    I1    2    0.222222    1.000000    1.499999
    6    I3    I1    4    0.444444    0.666666    0.999999
    7    I4    I1    1    0.111111    0.500000    0.750000
    8    I5    I2    2    0.222222    1.000000    1.285714

    用一家门店,一年的销售数据计算一下就会发现,尿布->啤酒根本没有关联关系。意外+惊喜。

    技术不行 业务不行 管理不行
  • 相关阅读:
    Confluence未授权模板注入/代码执行(CVE-2019-3396)
    Python实现批量处理扫描特定目录
    windows10 缺失 msvcp140.dll 解决办法
    nessus 故障处理
    python 处理json数据
    python 实现两个文本文件内容去重
    python3 实现多域名批量访问特定目录(一)
    python 简单的实现文件内容去重
    python 实现爬取网站下所有URL
    php强大的filter过滤用户输入
  • 原文地址:https://www.cnblogs.com/onelang/p/10345988.html
Copyright © 2020-2023  润新知