• 数据挖掘之关联分析算法Apriori总结


    项目中有时需要对数据进行关联分析,例如分析一家小商店中顾客的购买习惯。

      1 package com.data.algorithm;
      2 
      3 import com.google.common.base.Splitter;
      4 import com.google.common.collect.Lists;
      5 import com.google.common.collect.Maps;
      6 import org.slf4j.Logger;
      7 import org.slf4j.LoggerFactory;
      8 
      9 import java.io.BufferedReader;
     10 import java.io.FileInputStream;
     11 import java.io.IOException;
     12 import java.io.InputStreamReader;
     13 import java.util.*;
     14 
     15 /**
     16  * *********************************************************
     17  * <p/>
     18  * Author:     XiJun.Gong
     19  * Date:       2017-01-20 15:06
     20  * Version:    default 1.0.0
     21  * Class description:
     22  * <p/>
     23  * *********************************************************
     24  */
     25 
     26 class EOC {
     27 
     28     private static final Logger logger = LoggerFactory.getLogger(EOC.class);
     29     private Map<String, Integer> fmap;  //forward map
     30     private Map<Integer, String> bmap;  //backward map
     31     private List<Map<String, Integer>> elements = null;
     32 
     33     private Integer maxDimension;
     34 
     35     public EOC(final String pathFile, String separatSeq) {
     36 
     37         BufferedReader bufferedReader = null;
     38         try {
     39             this.fmap = Maps.newHashMap();
     40             this.bmap = Maps.newHashMap();
     41             this.elements = Lists.newArrayList();
     42             maxDimension = 0;
     43             bufferedReader = new BufferedReader(
     44                     new InputStreamReader(
     45                             new FileInputStream(pathFile), "UTF-8"));
     46             String _line = null;
     47             Integer keyValue = null, mapIndex = 0;
     48             while ((_line = bufferedReader.readLine()) != null) {
     49                 Map<String, Integer> lineMap = Maps.newHashMap();
     50                 if (_line.trim().length() > 1) {
     51                     if (separatSeq.trim().length() < 1) {
     52                         separatSeq = ",";
     53                     }
     54                     for (String word : Splitter.on(separatSeq).split(_line)) {
     55                         word = word.trim();
     56                         if (null == (keyValue = fmap.get(word))) {
     57                             keyValue = mapIndex++;
     58                         }
     59                         fmap.put(word, keyValue);
     60                         bmap.put(keyValue, word);
     61                         lineMap.put(word, keyValue);
     62                     }
     63                     if (maxDimension < lineMap.size())
     64                         maxDimension = lineMap.size();
     65                     elements.add(lineMap);
     66                 }
     67             }
     68         } catch (Exception e) {
     69             logger.error("读取文件出错 , 错误原因:{}", e);
     70         } finally {
     71             if (bufferedReader != null) {
     72                 try {
     73                     bufferedReader.close();
     74                 } catch (IOException e) {
     75                     logger.error("bufferedReader , 错误原因:{}", e);
     76                 }
     77             }
     78         }
     79     }
     80 
     81     public Integer getMaxDimension() {
     82         return maxDimension;
     83     }
     84 
     85     public float getRateOfSet(Collection<Integer> elementChild) {
     86         float rateCnt = 0f;
     87         int allSize = 1;
     88         for (Map<String, Integer> eMap : elements) {
     89             boolean flag = true;
     90             for (Integer element : elementChild) {
     91                 if (null == eMap.get(bmap.get(element))) {
     92                     flag = false;
     93                     break;
     94                 }
     95             }
     96             if (flag) rateCnt += 1;
     97         }
     98         return rateCnt / ((allSize = elements.size()) > 1 ? (float) allSize : 1.0f);
     99     }
    100 
    101     public Set<Integer> getElements() {
    102 
    103         return new HashSet<Integer>(fmap.values());
    104     }
    105 
    106     public Integer queryByKey(String key) {
    107         return fmap.get(key);
    108     }
    109 
    110     public String queryByValue(Integer value) {
    111         return bmap.get(value);
    112     }
    113 }
    114 
    115 public class Apriori {
    116     private static final Logger logger = LoggerFactory.getLogger(Apriori.class);
    117     private EOC eoc = null;
    118     private Integer maxDimension;
    119     private final float exp = 1e-4f;
    120 
    121     public Apriori(final String pathFile, String separatSeq, Integer maxDimension) {
    122         this(pathFile, separatSeq);
    123         this.maxDimension = maxDimension;
    124     }
    125 
    126     public Apriori(final String pathFile, String separatSeq) {
    127         this.eoc = new EOC(pathFile, separatSeq);
    128         this.maxDimension = this.eoc.getMaxDimension();
    129     }
    130 
    131     public void work(float confidenceLevel) {
    132         List<Set<Integer>> listElement = null;
    133         ArrayList<Set<Integer>> middleWareElement = null;
    134         Map<Set<Integer>, Float> maps = null;
    135         listElement = Lists.newArrayList();
    136         for (Integer element : this.eoc.getElements()) {
    137             Set<Integer> set = new HashSet<Integer>();
    138             set.add(element);
    139             listElement.add(set);
    140         }
    141         maps = Maps.newHashMap();
    142         middleWareElement = Lists.newArrayList();
    143         for (int i = 1; i < this.maxDimension; i++) {
    144             for (Set<Integer> tmpSet : listElement) {
    145                 float rate = eoc.getRateOfSet(tmpSet);
    146                 if (confidenceLevel - exp <= rate)
    147                     maps.put(tmpSet, rate);
    148             }
    149             System.out.println("+++++++++++第 " + i + " 维度关联数据+++++++++++");
    150             output(maps);
    151             listElement.clear();
    152             middleWareElement.addAll(maps.keySet());
    153             maps.clear();
    154             for (int j = 0; j < middleWareElement.size(); j++) {
    155                 Set<Integer> tmpSet = middleWareElement.get(j);
    156                 for (int k = j + 1; k < middleWareElement.size(); k++) {
    157                     Set<Integer> setChild = middleWareElement.get(k);
    158                     for (Integer label : setChild) {
    159                         if (!tmpSet.contains(label)) {
    160                             Set<Integer> newElement = new HashSet<Integer>(tmpSet);
    161                             newElement.add(label);
    162                             if (!listElement.contains(newElement)) {
    163                                 listElement.add(newElement);
    164                                 break;
    165                             }
    166                         }
    167                     }
    168                 }
    169             }
    170             middleWareElement.clear();
    171         }
    172     }
    173 
    174     public void output(Map<Set<Integer>, Float> maps) {
    175         for (Map.Entry<Set<Integer>, Float> iter : maps.entrySet()) {
    176             for (Integer integer : iter.getKey()) {
    177                 System.out.print(eoc.queryByValue(integer) + " ");
    178             }
    179             System.out.println(iter.getValue()*100+"%");
    180         }
    181     }
    182 }
    View Code

      

     1 package com.data.algorithm;
     2 
     3 
     4 /**
     5  * *********************************************************
     6  * <p/>
     7  * Author:     XiJun.Gong
     8  * Date:       2017-01-17 17:57
     9  * Version:    default 1.0.0
    10  * Class description:
    11  * <p/>
    12  * *********************************************************
    13  */
    14 public class Main {
    15     public static void main(String args[]) {
    16         Apriori apriori = new Apriori("/home/com/src/main/java/com/qunar/data/algorithm/demo.data", ",");
    17         apriori.work(0.5f);
    18     }
    19 }
     1 +++++++++++第 1 维度关联数据+++++++++++
     2 苹果 50.0%
     3 西红柿 75.0%
     4 香蕉 75.0%
     5 矿泉水 75.0%
     6 +++++++++++第 2 维度关联数据+++++++++++
     7 苹果 西红柿 50.0%
     8 西红柿 香蕉 50.0%
     9 西红柿 矿泉水 50.0%
    10 香蕉 矿泉水 75.0%
    11 +++++++++++第 3 维度关联数据+++++++++++
    12 西红柿 香蕉 矿泉水 50.0%
  • 相关阅读:
    asm volatile ("B .")
    最大熵学习笔记(一)预备知识
    12、Cocos2dx 3.0游戏开发找小三之3.0中的生命周期分析
    Android中通过反射来设置Toast的显示时间
    Linux Centos7 Apache 訪问 You don't have permission to access / on this server.
    校园双选会,你都懂么
    关于虚继承和析构函数的一个奇怪的问题
    Codeforces Round #252 (Div. 2)B. Valera and Fruits
    P3809 【模版】后缀排序
    752. [BJOI2006] 狼抓兔子
  • 原文地址:https://www.cnblogs.com/gongxijun/p/6327000.html
Copyright © 2020-2023  润新知