基于ostu的无监督文本聚类 对于区分不同店铺的小票效果良好 同店铺小票不同类别区分效果一般,但是对于离群点定位(小样本类别很精准),借鉴了TF/IDF的思想,还有词处理时的去停词,词频因素的考虑优化,分类的阈值计算为每次基于相似度集合的前后背景最大分割点,不断二分类(其实也可以随机选取几个小票作为聚类的种子,类似kmeans那种,但是这样聚类最后会有部分偏差,只是局部最优解,而且当时觉得还是迭代二分干净明了吧)
ostu为图像二值化处理时的一种算法(类间最大方差),图像专业,前年第一次尝试做聚类时的一个想法和实现,前后设计调优用了三天,不足地方还请见谅。
转载请标明原址,谢谢
public class ClusterHelper {
private static Logger log = Logger
.getLogger(ClusterHelper.class);
private String txt = "";
private String filekey = "";
private int classify = 0;
public int getClassify() {
return classify;
}
public void setClassify(int classify) {
this.classify = classify;
}
public void setClusterHelper(String txt, String filekey, int classify) {
this.txt = txt;
this.filekey = filekey;
this.classify = classify;
}
public String getTxt() {
return txt;
}
public void setTxt(String txt) {
this.txt = txt;
}
public String getFilekey() {
return filekey;
}
public void setFilekey(String filekey) {
this.filekey = filekey;
}
/**
* 得到存有所有cell的bitmap:bf
*
* @param alltxt
* @return
*/
public static SimpleBloomFilterHelper allBloomFilter(
ArrayList<String> alltxt) {
Set<String> txtcell = new TreeSet<String>();
String txt = "";
SimpleBloomFilterHelper bf = new SimpleBloomFilterHelper();
for (int i = 0; i < alltxt.size(); i++) {
txt = alltxt.get(i);
txtcell.clear();
txtcell = txtToCell(txt);
for (String s : txtcell) {
if (!bf.contains(s))
bf.add(s);
}
}
return bf;
}
public static ArrayList<ClusterHelper>cluster(Set<String> filekeys) {
ArrayList<ClusterHelper>allcluster=new ArrayList<ClusterHelper>();
for(String i:filekeys){
ClusterHelper cluster=new ClusterHelper();
cluster.setFilekey(i);
cluster.setClassify(0);
cluster.setTxt( TemplateCompare.filekeyToString(i));
}
String txt = "";
String ctxt = "";
int k = 1;
int[][] similar = new int[allcluster.size()][allcluster.size()];
for (int i = 0; i < allcluster.size(); i++) {
if (allcluster.get(i).classify == 0) {
txt = allcluster.get(i).getTxt();
SimpleBloomFilterHelper bf = initBf(txt);
for (int j = 0; j < allcluster.size(); j++) {
if (j == i) {
similar[i][j] = 1000000;
} else if (i != 0 && allcluster.get(j).classify != 0) {
similar[i][j] = 1000000;
} else {
ctxt = allcluster.get(j).getTxt();
similar[i][j] = similar(similar[i][j], ctxt, bf);
}
}
log.info("similar");
for (int t : similar[i]) {
log.info(t);
}
ArrayList<Integer> s = new ArrayList<Integer>();
for (int q = 0; q < similar[i].length; q++) {
if (similar[i][q] < 1000000) {
s.add(similar[i][q]);
}
}
s.add(0);
if (s.size() == 1) {
return allcluster;
}
int[] ss = listtoint(s);
int[] sort = ss.clone();
Arrays.sort(sort);
double T = ostu(sort, ss);
log.info("T:" + T);
allcluster = classifyByT(T, similar[i], allcluster, k);
allcluster.get(i).setClassify(k);
k++;
int n = 0;
log.info(i + "-classify:");
for (ClusterHelper h : allcluster) {
log.info(n + ":" + h.classify);
n++;
}
}
}
int n = 0;
for (int i = 0; i < allcluster.size(); i++) {
if (allcluster.get(i).classify == 0) {
allcluster = clusterString(allcluster);
} else {
n++;
}
if (n == allcluster.size()) {
return allcluster;
}
}
return allcluster;
}
/**
* 基于ostu的无监督文本聚类 对于区分不同店铺的小票效果良好 同店铺小票不同类别区分效果一般,
* 但是对于离群点定位(小样本类别很精准)
*
* @param alltxt
*/
public static ArrayList<ClusterHelper> clusterString(
ArrayList<ClusterHelper> allcluster) {
String txt = "";
String ctxt = "";
int k = 1;
int[][] similar = new int[allcluster.size()][allcluster.size()];
for (int i = 0; i < allcluster.size(); i++) {
if (allcluster.get(i).classify == 0) {
txt = allcluster.get(i).getTxt();
SimpleBloomFilterHelper bf = initBf(txt);
for (int j = 0; j < allcluster.size(); j++) {
if (j == i) {
similar[i][j] = 1000000;
} else if (i != 0 && allcluster.get(j).classify != 0) {
similar[i][j] = 1000000;
} else {
ctxt = allcluster.get(j).getTxt();
similar[i][j] = similar(similar[i][j], ctxt, bf);
}
}
log.info("similar");
for (int t : similar[i]) {
log.info(t);
}
ArrayList<Integer> s = new ArrayList<Integer>();
for (int q = 0; q < similar[i].length; q++) {
if (similar[i][q] < 1000000) {
s.add(similar[i][q]);
}
}
s.add(0);
if (s.size() == 1) {
return allcluster;
}
int[] ss = listtoint(s);
int[] sort = ss.clone();
Arrays.sort(sort);
double T = ostu(sort, ss);
log.info("T:" + T);
allcluster = classifyByT(T, similar[i], allcluster, k);
allcluster.get(i).setClassify(k);
k++;
int n = 0;
log.info(i + "-classify:");
for (ClusterHelper h : allcluster) {
log.info(n + ":" + h.classify);
n++;
}
}
}
int n = 0;
for (int i = 0; i < allcluster.size(); i++) {
if (allcluster.get(i).classify == 0) {
allcluster = clusterString(allcluster);
} else {
n++;
}
if (n == allcluster.size()) {
return allcluster;
}
}
return allcluster;
}
public static ArrayList<ClusterHelper> classifyByT(double T, int[] similar,
ArrayList<ClusterHelper> allcluster, int k) {
for (int i = 0; i < similar.length; i++) {
if (similar[i] >= T && similar[i] < 1000000) {
allcluster.get(i).classify = k;
}
}
return allcluster;
}
/**
* 初始化bf
*
* @param txt
* @param bf
*/
public static SimpleBloomFilterHelper initBf(String txt) {
SimpleBloomFilterHelper bf = new SimpleBloomFilterHelper();
Set<String> txtcell = txtToCell(txt);
for (String s : txtcell) {
bf.add(s);
}
return bf;
}
/**
* 计算相似度similar
*
* @param similar
* @param ctxt
* @param bf
* @return
*/
public static int similar(int similar, String ctxt,
SimpleBloomFilterHelper bf) {
Set<String> cxtcell = txtToCell(ctxt);
for (String c : cxtcell) {
if (bf.contains(c)) {
similar++;
}
}
return similar;
}
/**
* 大津法阈值分割
*
* @param sort
* @param ss
* @return
*/
public static int ostu(int[] sort, int[] ss) {
// 设当前景与背景的分割阈值为t时,前景点占图像比例为w0,均值为u0,背景点占图像比例为w1,均值为u1。
// 则整个图像的均值为u = w0*u0+w1*u1。建立目标函数g(t)=w0*(u0-u)^2+w1*(u1-u)^2,
int T = 0;
double gt = 0;
double g = 0;
int mc1 = 0;
int mc2 = 0;
int m = mean(ss);
ArrayList<Integer> c1 = new ArrayList<Integer>();
ArrayList<Integer> c2 = new ArrayList<Integer>();
for (int i = sort[0]; i < sort[sort.length - 1]; i++) {
for (int j = 0; j < ss.length; j++) {
if (ss[j] <= i) {
c1.add(ss[j]);
} else {
c2.add(ss[j]);
}
}
int[] sc1 = listtoint(c1);
int[] sc2 = listtoint(c2);
mc1 = mean(sc1);
mc2 = mean(sc2);
g = (sc1.length * (Math.pow(mc1 - m, 2)) + sc2.length
* (Math.pow(mc2 - m, 2)))
/ ss.length;
if (g > gt) {
T = i;
gt = g;
}
}
return T;
}
public static int[] listtoint(ArrayList<Integer> c) {
int[] s = new int[c.size()];
for (int i = 0; i < c.size(); i++) {
s[i] = (int) c.get(i);
}
return s;
}
public static int mean(int[] s) {
int mean = 0;
int sum = 0;
for (int i = 0; i < s.length; i++) {
sum = sum + s[i];
}
mean = sum / s.length;
return mean;
}
/**
* 将txt分解为最小单元集合 以“ ”和":"为分割标记 且不含有“0123456789.-*”
*
* @param txt
* @return
*/
public static Set<String> txtToCell(String txt) {
Set<String> txtcell = new TreeSet<String>();
ArrayList<Integer> num = new ArrayList<Integer>();
int k = 0;
int begin = 0;
String cell = "";
String cc = "";
txt = txt.replaceAll("
", " ");
txt = txt.replaceAll("1", " ");
txt = txt.replaceAll("2", " ");
txt = txt.replaceAll("3", " ");
txt = txt.replaceAll("4", " ");
txt = txt.replaceAll("5", " ");
txt = txt.replaceAll("6", " ");
txt = txt.replaceAll("7", " ");
txt = txt.replaceAll("8", " ");
txt = txt.replaceAll("9", " ");
txt = txt.replaceAll("0", " ");
txt = txt.replaceAll("\.", " ");
// txt = txt.replaceAll("F", " ");
// txt = txt.replaceAll("=", " ");
// txt = txt.replaceAll("--", " ");
txt = txt.replaceAll(":", " ");
if (!txt.substring(0, 1).equals(" ")) {
num.add(0);
}
for (int j = 0; j < txt.length(); j++) {
cc = txt.substring(j, j + 1);
if (cc.equals(" ")) {
if (j != begin + 1) {
begin = j;
num.add(begin);
k = 0;
} else {
k++;
begin = begin + k;
}
}
}
for (int i = 0; i < num.size() - 1; i++) {
cell = txt.substring(num.get(i), num.get(i + 1));
cell = cell.replaceAll(" +", "");
if (!cell.equals("")) {
txtcell.add(cell);
}
}
return txtcell;
}
/**
* 判断cell中是否含有“0123456789.-*”
*
* @param cell
* @return
*/
public static boolean numberjudge(String cell) {
if (cell.indexOf("0") == -1 && cell.indexOf("1") == -1
&& cell.indexOf("2") == -1 && cell.indexOf("3") == -1
&& cell.indexOf("4") == -1 && cell.indexOf("5") == -1
&& cell.indexOf("6") == -1 && cell.indexOf("7") == -1
&& cell.indexOf("8") == -1 && cell.indexOf("9") == -1
&& cell.indexOf(".") == -1 && cell.indexOf("-") == -1
&& cell.indexOf("*") == -1) {
return true;
} else
return false;
}
public static String txtread(String fileName) throws IOException {
String result = "";
BufferedReader br = new BufferedReader(new FileReader(fileName));
String line = "";
StringBuffer buffer = new StringBuffer();
while ((line = br.readLine()) != null) {
buffer.append(line + "
");
}
result = buffer.toString();
br.close();
return result;
}
public static void testtxtToCell() {
try {
String txt = txtread("/Users/zhangdebin/Documents/txt/0.txt");
log.info(txt);
Set<String> allcell = txtToCell(txt);
log.info("allcell:");
for (String s : allcell) {
log.info(s);
}
} catch (IOException e) {
e.printStackTrace();
}
}
public static void main(String[] args) throws IOException {
String filepath = "/Users/Documents/3000testtxt";
String filepath2 = "/Users/Documents/imgtxt";
String filepath3 = "/Users/Documents/testtail";
File file = new File(filepath2);
int len = file.list().length - 1;
ArrayList<String> alltxt = new ArrayList<String>();
for (int i = 0; i < len; i++) {
String readfile = file + "/" + i + ".txt";
try {
alltxt.add(txtread(readfile));
} catch (IOException e) {
e.printStackTrace();
}
}
ArrayList<ClusterHelper> cluster = new ArrayList<ClusterHelper>();
for (int i = 0; i < alltxt.size(); i++) {
ClusterHelper c = new ClusterHelper();
c.setTxt(alltxt.get(i));
c.setClassify(0);
c.setFilekey("123");
cluster.add(c);
}
long start = System.currentTimeMillis();
cluster = clusterString(cluster);
log.info("cluster.size:" + cluster.size());
long end = System.currentTimeMillis();
log.info("time:" + (end - start));
Set<Integer> cc = new TreeSet<Integer>();
int a[] = new int[100];
for (int i = 0; i < cluster.size(); i++) {
cc.add(cluster.get(i).classify);
a[cluster.get(i).classify]++;
}
log.info("cc:");
for (int i : cc) {
log.info(i + "-" + a[i]);
}
}
}