• 文本小票的一种无监督聚类方法


    基于ostu的无监督文本聚类 对于区分不同店铺的小票效果良好 同店铺小票不同类别区分效果一般,但是对于离群点定位(小样本类别很精准),借鉴了TF/IDF的思想,还有词处理时的去停词,词频因素的考虑优化,分类的阈值计算为每次基于相似度集合的前后背景最大分割点,不断二分类(其实也可以随机选取几个小票作为聚类的种子,类似kmeans那种,但是这样聚类最后会有部分偏差,只是局部最优解,而且当时觉得还是迭代二分干净明了吧)
    ostu为图像二值化处理时的一种算法(类间最大方差),图像专业,前年第一次尝试做聚类时的一个想法和实现,前后设计调优用了三天,不足地方还请见谅。
    转载请标明原址,谢谢

    
    
    public class ClusterHelper {
        private static Logger log = Logger
                .getLogger(ClusterHelper.class);
    
        private String txt = "";
        private String filekey = "";
        private int classify = 0;
    
        public int getClassify() {
            return classify;
        }
    
        public void setClassify(int classify) {
            this.classify = classify;
        }
    
        public void setClusterHelper(String txt, String filekey, int classify) {
            this.txt = txt;
            this.filekey = filekey;
            this.classify = classify;
        }
    
        public String getTxt() {
            return txt;
        }
    
        public void setTxt(String txt) {
            this.txt = txt;
        }
    
        public String getFilekey() {
            return filekey;
        }
    
        public void setFilekey(String filekey) {
            this.filekey = filekey;
        }
    
        /**
         * 得到存有所有cell的bitmap:bf
         * 
         * @param alltxt
         * @return
         */
        public static SimpleBloomFilterHelper allBloomFilter(
                ArrayList<String> alltxt) {
            Set<String> txtcell = new TreeSet<String>();
            String txt = "";
            SimpleBloomFilterHelper bf = new SimpleBloomFilterHelper();
            for (int i = 0; i < alltxt.size(); i++) {
                txt = alltxt.get(i);
                txtcell.clear();
                txtcell = txtToCell(txt);
                for (String s : txtcell) {
                    if (!bf.contains(s))
                        bf.add(s);
                }
            }
            return bf;
    
        }
    
        public static ArrayList<ClusterHelper>cluster(Set<String> filekeys) {
            ArrayList<ClusterHelper>allcluster=new ArrayList<ClusterHelper>();
            for(String i:filekeys){
                ClusterHelper cluster=new ClusterHelper();
                cluster.setFilekey(i);
                cluster.setClassify(0);
                cluster.setTxt( TemplateCompare.filekeyToString(i));
            }
            String txt = "";
            String ctxt = "";
            int k = 1;
            int[][] similar = new int[allcluster.size()][allcluster.size()];
            for (int i = 0; i < allcluster.size(); i++) {
                if (allcluster.get(i).classify == 0) {
                    txt = allcluster.get(i).getTxt();
                    SimpleBloomFilterHelper bf = initBf(txt);
                    for (int j = 0; j < allcluster.size(); j++) {
                        if (j == i) {
                            similar[i][j] = 1000000;
                        } else if (i != 0 && allcluster.get(j).classify != 0) {
                            similar[i][j] = 1000000;
                        } else {
                            ctxt = allcluster.get(j).getTxt();
                            similar[i][j] = similar(similar[i][j], ctxt, bf);
                        }
                    }
    
                    log.info("similar");
                    for (int t : similar[i]) {
                        log.info(t);
                    }
    
                    ArrayList<Integer> s = new ArrayList<Integer>();
                    for (int q = 0; q < similar[i].length; q++) {
                        if (similar[i][q] < 1000000) {
                            s.add(similar[i][q]);
                        }
                    }
                    s.add(0);
                    if (s.size() == 1) {
                        return allcluster;
                    }
                    int[] ss = listtoint(s);
                    int[] sort = ss.clone();
                    Arrays.sort(sort);
                    double T = ostu(sort, ss);
                    log.info("T:" + T);
                    allcluster = classifyByT(T, similar[i], allcluster, k);
                    allcluster.get(i).setClassify(k);
                    k++;
                    int n = 0;
                    log.info(i + "-classify:");
                    for (ClusterHelper h : allcluster) {
                        log.info(n + ":" + h.classify);
                        n++;
                    }
                }
            }
            int n = 0;
    
            for (int i = 0; i < allcluster.size(); i++) {
                if (allcluster.get(i).classify == 0) {
                    allcluster = clusterString(allcluster);
                } else {
                    n++;
                }
                if (n == allcluster.size()) {
                    return allcluster;
                }
            }
            return allcluster;
    
        }
    
        /**
         * 基于ostu的无监督文本聚类 对于区分不同店铺的小票效果良好 同店铺小票不同类别区分效果一般,
         * 但是对于离群点定位(小样本类别很精准)
         * 
         * @param alltxt
         */
        public static ArrayList<ClusterHelper> clusterString(
                ArrayList<ClusterHelper> allcluster) {
            String txt = "";
            String ctxt = "";
            int k = 1;
            int[][] similar = new int[allcluster.size()][allcluster.size()];
            for (int i = 0; i < allcluster.size(); i++) {
                if (allcluster.get(i).classify == 0) {
                    txt = allcluster.get(i).getTxt();
                    SimpleBloomFilterHelper bf = initBf(txt);
                    for (int j = 0; j < allcluster.size(); j++) {
                        if (j == i) {
                            similar[i][j] = 1000000;
                        } else if (i != 0 && allcluster.get(j).classify != 0) {
                            similar[i][j] = 1000000;
                        } else {
                            ctxt = allcluster.get(j).getTxt();
                            similar[i][j] = similar(similar[i][j], ctxt, bf);
                        }
                    }
    
                    log.info("similar");
                    for (int t : similar[i]) {
                        log.info(t);
                    }
    
                    ArrayList<Integer> s = new ArrayList<Integer>();
                    for (int q = 0; q < similar[i].length; q++) {
                        if (similar[i][q] < 1000000) {
                            s.add(similar[i][q]);
                        }
                    }
                    s.add(0);
                    if (s.size() == 1) {
                        return allcluster;
                    }
                    int[] ss = listtoint(s);
                    int[] sort = ss.clone();
                    Arrays.sort(sort);
                    double T = ostu(sort, ss);
                    log.info("T:" + T);
                    allcluster = classifyByT(T, similar[i], allcluster, k);
                    allcluster.get(i).setClassify(k);
                    k++;
                    int n = 0;
                    log.info(i + "-classify:");
                    for (ClusterHelper h : allcluster) {
                        log.info(n + ":" + h.classify);
                        n++;
                    }
                }
            }
            int n = 0;
    
            for (int i = 0; i < allcluster.size(); i++) {
                if (allcluster.get(i).classify == 0) {
                    allcluster = clusterString(allcluster);
                } else {
                    n++;
                }
                if (n == allcluster.size()) {
                    return allcluster;
                }
            }
            return allcluster;
    
        }
    
        public static ArrayList<ClusterHelper> classifyByT(double T, int[] similar,
                ArrayList<ClusterHelper> allcluster, int k) {
            for (int i = 0; i < similar.length; i++) {
                if (similar[i] >= T && similar[i] < 1000000) {
                    allcluster.get(i).classify = k;
                }
            }
    
            return allcluster;
        }
    
        /**
         * 初始化bf
         * 
         * @param txt
         * @param bf
         */
        public static SimpleBloomFilterHelper initBf(String txt) {
            SimpleBloomFilterHelper bf = new SimpleBloomFilterHelper();
            Set<String> txtcell = txtToCell(txt);
            for (String s : txtcell) {
                bf.add(s);
            }
            return bf;
        }
    
        /**
         * 计算相似度similar
         * 
         * @param similar
         * @param ctxt
         * @param bf
         * @return
         */
        public static int similar(int similar, String ctxt,
                SimpleBloomFilterHelper bf) {
            Set<String> cxtcell = txtToCell(ctxt);
            for (String c : cxtcell) {
                if (bf.contains(c)) {
                    similar++;
                }
            }
            return similar;
        }
    
        /**
         * 大津法阈值分割
         * 
         * @param sort
         * @param ss
         * @return
         */
        public static int ostu(int[] sort, int[] ss) {
            // 设当前景与背景的分割阈值为t时,前景点占图像比例为w0,均值为u0,背景点占图像比例为w1,均值为u1。
            // 则整个图像的均值为u = w0*u0+w1*u1。建立目标函数g(t)=w0*(u0-u)^2+w1*(u1-u)^2,
            int T = 0;
            double gt = 0;
            double g = 0;
            int mc1 = 0;
            int mc2 = 0;
            int m = mean(ss);
            ArrayList<Integer> c1 = new ArrayList<Integer>();
            ArrayList<Integer> c2 = new ArrayList<Integer>();
            for (int i = sort[0]; i < sort[sort.length - 1]; i++) {
                for (int j = 0; j < ss.length; j++) {
                    if (ss[j] <= i) {
                        c1.add(ss[j]);
                    } else {
                        c2.add(ss[j]);
                    }
                }
                int[] sc1 = listtoint(c1);
                int[] sc2 = listtoint(c2);
                mc1 = mean(sc1);
                mc2 = mean(sc2);
                g = (sc1.length * (Math.pow(mc1 - m, 2)) + sc2.length
                        * (Math.pow(mc2 - m, 2)))
                        / ss.length;
                if (g > gt) {
                    T = i;
                    gt = g;
                }
            }
            return T;
        }
    
        public static int[] listtoint(ArrayList<Integer> c) {
            int[] s = new int[c.size()];
            for (int i = 0; i < c.size(); i++) {
                s[i] = (int) c.get(i);
            }
            return s;
        }
    
        public static int mean(int[] s) {
            int mean = 0;
            int sum = 0;
            for (int i = 0; i < s.length; i++) {
                sum = sum + s[i];
            }
            mean = sum / s.length;
            return mean;
        }
    
        /**
         * 将txt分解为最小单元集合 以“ ”和":"为分割标记 且不含有“0123456789.-*”
         * 
         * @param txt
         * @return
         */
        public static Set<String> txtToCell(String txt) {
            Set<String> txtcell = new TreeSet<String>();
            ArrayList<Integer> num = new ArrayList<Integer>();
            int k = 0;
            int begin = 0;
            String cell = "";
            String cc = "";
            txt = txt.replaceAll("
    ", " ");
            txt = txt.replaceAll("1", " ");
            txt = txt.replaceAll("2", " ");
            txt = txt.replaceAll("3", " ");
            txt = txt.replaceAll("4", " ");
            txt = txt.replaceAll("5", " ");
            txt = txt.replaceAll("6", " ");
            txt = txt.replaceAll("7", " ");
            txt = txt.replaceAll("8", " ");
            txt = txt.replaceAll("9", " ");
            txt = txt.replaceAll("0", " ");
            txt = txt.replaceAll("\.", " ");
            // txt = txt.replaceAll("F", " ");
            // txt = txt.replaceAll("=", " ");
            // txt = txt.replaceAll("--", " ");
            txt = txt.replaceAll(":", " ");
            if (!txt.substring(0, 1).equals(" ")) {
                num.add(0);
            }
    
            for (int j = 0; j < txt.length(); j++) {
                cc = txt.substring(j, j + 1);
                if (cc.equals(" ")) {
                    if (j != begin + 1) {
                        begin = j;
                        num.add(begin);
                        k = 0;
                    } else {
                        k++;
                        begin = begin + k;
                    }
                }
            }
            for (int i = 0; i < num.size() - 1; i++) {
                cell = txt.substring(num.get(i), num.get(i + 1));
                cell = cell.replaceAll(" +", "");
                if (!cell.equals("")) {
                    txtcell.add(cell);
                }
            }
    
            return txtcell;
        }
    
        /**
         * 判断cell中是否含有“0123456789.-*”
         * 
         * @param cell
         * @return
         */
        public static boolean numberjudge(String cell) {
            if (cell.indexOf("0") == -1 && cell.indexOf("1") == -1
                    && cell.indexOf("2") == -1 && cell.indexOf("3") == -1
                    && cell.indexOf("4") == -1 && cell.indexOf("5") == -1
                    && cell.indexOf("6") == -1 && cell.indexOf("7") == -1
                    && cell.indexOf("8") == -1 && cell.indexOf("9") == -1
                    && cell.indexOf(".") == -1 && cell.indexOf("-") == -1
                    && cell.indexOf("*") == -1) {
                return true;
            } else
                return false;
        }
    
        public static String txtread(String fileName) throws IOException {
            String result = "";
            BufferedReader br = new BufferedReader(new FileReader(fileName));
            String line = "";
            StringBuffer buffer = new StringBuffer();
            while ((line = br.readLine()) != null) {
                buffer.append(line + "
    ");
            }
            result = buffer.toString();
            br.close();
    
            return result;
        }
    
        public static void testtxtToCell() {
            try {
                String txt = txtread("/Users/zhangdebin/Documents/txt/0.txt");
                log.info(txt);
                Set<String> allcell = txtToCell(txt);
                log.info("allcell:");
                for (String s : allcell) {
                    log.info(s);
                }
            } catch (IOException e) {
    
                e.printStackTrace();
            }
        }
    
        public static void main(String[] args) throws IOException {
            String filepath = "/Users/Documents/3000testtxt";
            String filepath2 = "/Users/Documents/imgtxt";
            String filepath3 = "/Users/Documents/testtail";
            File file = new File(filepath2);
            int len = file.list().length - 1;
            ArrayList<String> alltxt = new ArrayList<String>();
            for (int i = 0; i < len; i++) {
                String readfile = file + "/" + i + ".txt";
                try {
                    alltxt.add(txtread(readfile));
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
    
            ArrayList<ClusterHelper> cluster = new ArrayList<ClusterHelper>();
            for (int i = 0; i < alltxt.size(); i++) {
                ClusterHelper c = new ClusterHelper();
                c.setTxt(alltxt.get(i));
                c.setClassify(0);
                c.setFilekey("123");
                cluster.add(c);
            }
            long start = System.currentTimeMillis();
            cluster = clusterString(cluster);
            log.info("cluster.size:" + cluster.size());
            long end = System.currentTimeMillis();
            log.info("time:" + (end - start));
            Set<Integer> cc = new TreeSet<Integer>();
            int a[] = new int[100];
            for (int i = 0; i < cluster.size(); i++) {
                cc.add(cluster.get(i).classify);
                a[cluster.get(i).classify]++;
            }
            log.info("cc:");
            for (int i : cc) {
                log.info(i + "-" + a[i]);
            }
    
        }
    
    }
    
  • 相关阅读:
    【leetcode】对称二叉树
    【leetcode】判断回文数
    053686
    053685
    053684
    053683
    053682
    053681
    053680
    053477
  • 原文地址:https://www.cnblogs.com/zhangdebin/p/5567902.html
Copyright © 2020-2023  润新知