对apriori关联关系算法研究了一段时间,网上能搜到的例子,大部分是python写的,数据集长得像下面这样:
[[I1,I2,I5],[I2,I4],[I2,I3],[I1,I2,I4],[I1,I3],[I2,I3],[I1,I3],[I1,I2,I3,I5],[I1,I2,I3]]
而实际的交易数据,一般存储到关系型数据库中,数据是按下面的样子保存:
TRAN_SEQ_NO,ITEM 1,I1 1,I2 1,I5 2,I2 2,I4 . .8,I5 9,I1 9,I2 9,I3
而且python的程序,写了好多循环,效率不高。
根据小票数据在数据库中存储的特点,并且apriori算法也不是特别复杂,因此想用plsql实现一下。
plsql实现的aprioir算法,对原算法做了裁剪,只计算2项集和两个商品之间的关联关系,3项集以上的忽略不计。
表结构创建
1.小票表(交易事物表)
create table CMX_APRIORI_TRANSACTION
(
tran_seq_no NUMBER(20), --交易号
item VARCHAR2(25) --商品编码
)
2.频繁项集C1
create table CMX_APRIORI_L1
(
item VARCHAR2(25), --商品编码
support NUMBER(9,6), --支持度
cnt NUMBER(8) --交易次数
)
2.关联关系L2(结果表)
create table CMX_APRIORI_L2
(
item_a VARCHAR2(25), --前件商品编码
item_b VARCHAR2(25), --后件商品编码
cnt NUMBER(8), --交易次数
support NUMBER(9,6), --支持度
conf_a_b NUMBER(9,6), --置信度
lift_a_b NUMBER(9,6) --提升度
)
说明:
小票表插入的数据,必须提前进行处理。同一张小票商品要去重。
完整plsql代码:
CREATE OR REPLACE PACKAGE CMX_APRIORI_SQL IS /*----------------------------------------------------------------------- * PROCEDURE NAME : CMX_APRIORI_SQL * COMMENTS : 商品关联关系计算 * CODED BY : ONELANG 2019-1-27 * CHANGED HISTORY : -----------------------------------------------------------------------*/ FUNCTION TEST(O_ERROR_MESSAGE IN OUT VARCHAR2) RETURN BOOLEAN; END CMX_APRIORI_SQL; / CREATE OR REPLACE PACKAGE BODY CMX_APRIORI_SQL IS /*----------------------------------------------------------------------- * PROCEDURE NAME : INIT_TEST_DATA * COMMENTS : 初始化测试数据 * CODED BY : ONELANG 2019-1-27 * CHANGED HISTORY : -----------------------------------------------------------------------*/ FUNCTION INIT_TEST_DATA(O_ERROR_MESSAGE IN OUT VARCHAR2) RETURN BOOLEAN IS L_PROGRAM VARCHAR2(100) := 'CMX_APRIORI_SQL.INIT_TEST_DATA'; BEGIN EXECUTE IMMEDIATE 'TRUNCATE TABLE CMX_APRIORI_TRANSACTION'; INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM) VALUES (1,'I1'); INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM) VALUES (1,'I2'); INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM) VALUES (1,'I5'); INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM) VALUES (2,'I2'); INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM) VALUES (2,'I4'); INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM) VALUES (3,'I2'); INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM) VALUES (3,'I3'); INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM) VALUES (4,'I1'); INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM) VALUES (4,'I2'); INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM) VALUES (4,'I4'); INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM) VALUES (5,'I1'); INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM) VALUES (5,'I3'); INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM) VALUES (6,'I2'); INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM) VALUES (6,'I3'); INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM) VALUES (7,'I1'); INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM) VALUES (7,'I3'); INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM) VALUES (8,'I1'); INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM) VALUES (8,'I2'); INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM) VALUES (8,'I3'); INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM) VALUES (8,'I5'); INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM) VALUES (9,'I1'); INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM) VALUES (9,'I2'); INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM) VALUES (9,'I3'); COMMIT; RETURN TRUE; EXCEPTION WHEN OTHERS THEN O_ERROR_MESSAGE := SQLERRM||TO_CHAR(SQLCODE); RETURN FALSE; END; /*----------------------------------------------------------------------- * PROCEDURE NAME : GET_TEST_L1 * COMMENTS : 计算频繁项集L1 * CODED BY : ONELANG 2019-1-27 * CHANGED HISTORY : -----------------------------------------------------------------------*/ FUNCTION GET_TEST_L1(O_ERROR_MESSAGE IN OUT VARCHAR2,I_SUPPORT IN NUMBER) RETURN BOOLEAN IS L_PROGRAM VARCHAR2(100) := 'CMX_APRIORI_SQL.GET_TEST_L1'; L_TOTAL NUMBER(8); BEGIN EXECUTE IMMEDIATE 'TRUNCATE TABLE CMX_APRIORI_L1'; SELECT COUNT(DISTINCT TRAN_SEQ_NO) INTO L_TOTAL FROM CMX_APRIORI_TRANSACTION ; INSERT INTO CMX_APRIORI_L1 SELECT ITEM,ROUND(COUNT(1) / L_TOTAL,6) SUPPORT,COUNT(1) CNT FROM CMX_APRIORI_TRANSACTION A GROUP BY ITEM HAVING COUNT(1) / L_TOTAL >= I_SUPPORT ; COMMIT; RETURN TRUE; EXCEPTION WHEN OTHERS THEN O_ERROR_MESSAGE := SQLERRM||TO_CHAR(SQLCODE); RETURN FALSE; END; /*----------------------------------------------------------------------- * PROCEDURE NAME : GET_L2 * COMMENTS : 获取关联关系L2 * CODED BY : * CHANGED HISTORY : -----------------------------------------------------------------------*/ FUNCTION GET_TEST_L2(O_ERROR_MESSAGE IN OUT VARCHAR2,I_SUPPORT IN NUMBER) RETURN BOOLEAN IS L_PROGRAM VARCHAR2(100) := 'CMX_APRIORI_SQL.GET_L2'; L_TOTAL NUMBER(8); BEGIN EXECUTE IMMEDIATE 'TRUNCATE TABLE CMX_APRIORI_L2'; SELECT COUNT(DISTINCT TRAN_SEQ_NO) INTO L_TOTAL FROM CMX_APRIORI_TRANSACTION ; --2项集支持度 INSERT INTO CMX_APRIORI_L2(ITEM_A,ITEM_B,CNT,SUPPORT) SELECT ITEM_A,ITEM_B,COUNT(DISTINCT TRAN_SEQ_NO) CNT, ROUND(COUNT(DISTINCT TRAN_SEQ_NO) / L_TOTAL,6) SUPPORT FROM (SELECT A.TRAN_SEQ_NO, A.ITEM ITEM_A, B.ITEM ITEM_B FROM CMX_APRIORI_TRANSACTION A, CMX_APRIORI_TRANSACTION B WHERE A.ITEM IN (SELECT ITEM FROM CMX_APRIORI_L1) AND B.ITEM IN (SELECT ITEM FROM CMX_APRIORI_L1) AND A.TRAN_SEQ_NO = B.TRAN_SEQ_NO AND A.ITEM > B.ITEM ) GROUP BY ITEM_A,ITEM_B HAVING COUNT(DISTINCT TRAN_SEQ_NO) / L_TOTAL >= I_SUPPORT; --置信度 UPDATE CMX_APRIORI_L2 L2 SET CONF_A_B = (SELECT L2.SUPPORT / L1.SUPPORT FROM CMX_APRIORI_L1 L1 WHERE L2.ITEM_A = L1.ITEM); --提升度 UPDATE CMX_APRIORI_L2 L2 SET LIFT_A_B = (SELECT L2.CONF_A_B / L1.SUPPORT FROM CMX_APRIORI_L1 L1 WHERE L2.ITEM_B = L1.ITEM); COMMIT; RETURN TRUE; EXCEPTION WHEN OTHERS THEN O_ERROR_MESSAGE := SQLERRM||TO_CHAR(SQLCODE); RETURN FALSE; END; FUNCTION TEST(O_ERROR_MESSAGE IN OUT VARCHAR2) RETURN BOOLEAN IS L_PROGRAM VARCHAR2(100) := 'CMX_APRIORI_SQL.TEST'; L_MIN_SUPP NUMBER(9,5); BEGIN L_MIN_SUPP := 0.1; IF INIT_TEST_DATA(O_ERROR_MESSAGE) = FALSE THEN RETURN FALSE; END IF; IF GET_TEST_L1(O_ERROR_MESSAGE,L_MIN_SUPP) = FALSE THEN RETURN FALSE; END IF; IF GET_TEST_L2(O_ERROR_MESSAGE,L_MIN_SUPP) = FALSE THEN RETURN FALSE; END IF; RETURN TRUE; EXCEPTION WHEN OTHERS THEN O_ERROR_MESSAGE := SQLERRM||TO_CHAR(SQLCODE); RETURN FALSE; END; END CMX_APRIORI_SQL; /
运行:
declare result boolean; begin -- Call the function result := cmx_apriori_sql.test(o_error_message => :o_error_message); -- Convert false/true/null to 0/1/null :result := sys.diutil.bool_to_int(result); end;
运行结果:
select * from cmx_apriori_l2 ITEM_A ITEM_B CNT SUPPORT CONF_A_B LIFT_A_B 1 I2 I1 4 0.444444 0.571428 0.857142 2 I4 I2 2 0.222222 1.000000 1.285714 3 I5 I3 1 0.111111 0.500000 0.750000 4 I3 I2 4 0.444444 0.666666 0.857142 5 I5 I1 2 0.222222 1.000000 1.499999 6 I3 I1 4 0.444444 0.666666 0.999999 7 I4 I1 1 0.111111 0.500000 0.750000 8 I5 I2 2 0.222222 1.000000 1.285714
用一家门店,一年的销售数据计算一下就会发现,尿布->啤酒根本没有关联关系。意外+惊喜。