With preprocessing and feature extraction done, the next step is to cluster the texts with a clustering algorithm. The choice of distance function matters a great deal here: for text mining, the distance function that works best is cosine distance. Weka 3.6.10, however, does not ship with a cosine distance, so we have to implement one ourselves.
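For reference, the cosine distance used throughout this post is simply one minus the cosine similarity of the two term-weight vectors. A minimal dense-vector sketch in plain Java (independent of Weka; the method name is only for illustration):

// dist(a, b) = 1 - (a . b) / (||a|| * ||b||), assuming neither vector is all zeros
static double cosineDistance(double[] a, double[] b) {
    double dot = 0, normA = 0, normB = 0;
    for (int i = 0; i < a.length; i++) {
        dot   += a[i] * b[i];   // accumulate the dot product
        normA += a[i] * a[i];   // and the two squared norms
        normB += b[i] * b[i];
    }
    return 1 - dot / (Math.sqrt(normA) * Math.sqrt(normB));
}

Because TF-IDF weights are non-negative, the result always falls in [0, 1]. The Weka implementation below performs the same computation, but over sparse instances.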
We can create a text-mining project in Eclipse, add weka.jar to the build path, and then implement a class that computes cosine distance. The class extends Weka's Euclidean distance class, so it can be plugged in wherever Weka expects a distance function. The code is as follows:
package cn.csdn.test;

import java.util.Enumeration;

import weka.core.Attribute;
import weka.core.EuclideanDistance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;
import weka.core.neighboursearch.PerformanceStats;

// One could implement DistanceFunction (plus OptionHandler, Serializable, RevisionHandler)
// directly, but extending EuclideanDistance lets SimpleKMeans accept this class as a
// drop-in distance function; all distance computations are overridden below.
public class CosineDistance extends EuclideanDistance {

    public Instances m_Data = null;
    public String version = "1.0";

    @Override
    public double distance(Instance arg0, Instance arg1) {
        return distance(arg0, arg1, Double.POSITIVE_INFINITY, null);
    }

    @Override
    public double distance(Instance arg0, Instance arg1, PerformanceStats arg2) {
        return distance(arg0, arg1, Double.POSITIVE_INFINITY, arg2);
    }

    @Override
    public double distance(Instance arg0, Instance arg1, double arg2) {
        return distance(arg0, arg1, arg2, null);
    }

    @Override
    public double distance(Instance first, Instance second, double cutOffValue,
            PerformanceStats stats) {
        double distance = 0;
        int firstI, secondI;
        int firstNumValues = first.numValues();
        int secondNumValues = second.numValues();
        int numAttributes = m_Data.numAttributes();
        int classIndex = m_Data.classIndex();
        double normA = 0;
        double normB = 0;

        // Walk the (sparse) value lists of both instances in parallel,
        // accumulating the dot product and the two vector norms.
        for (int p1 = 0, p2 = 0; p1 < firstNumValues || p2 < secondNumValues;) {
            if (p1 >= firstNumValues)
                firstI = numAttributes;
            else
                firstI = first.index(p1);

            if (p2 >= secondNumValues)
                secondI = numAttributes;
            else
                secondI = second.index(p2);

            // Skip the class attribute; it must not contribute to the distance.
            if (firstI == classIndex) {
                p1++;
                continue;
            }
            if (secondI == classIndex) {
                p2++;
                continue;
            }

            double diff;
            if (firstI == secondI) {
                diff = difference(firstI, first.valueSparse(p1), second.valueSparse(p2));
                normA += Math.pow(first.valueSparse(p1), 2);
                normB += Math.pow(second.valueSparse(p2), 2);
                p1++;
                p2++;
            } else if (firstI > secondI) {
                diff = difference(secondI, 0, second.valueSparse(p2));
                normB += Math.pow(second.valueSparse(p2), 2);
                p2++;
            } else {
                diff = difference(firstI, first.valueSparse(p1), 0);
                normA += Math.pow(first.valueSparse(p1), 2);
                p1++;
            }
            if (stats != null)
                stats.incrCoordCount();

            distance = updateDistance(distance, diff);
            if (distance > cutOffValue)
                return Double.POSITIVE_INFINITY;
        }

        // Normalise the accumulated dot product and convert similarity to distance.
        distance = distance / Math.sqrt(normA) / Math.sqrt(normB);
        distance = 1 - distance;
        if (distance < 0 || distance > 1)
            System.err.println("unknown: " + distance);
        return distance;
    }

    public double updateDistance(double currDist, double diff) {
        // For cosine distance we simply sum the per-attribute products.
        return currDist + diff;
    }

    public double difference(int index, double val1, double val2) {
        switch (m_Data.attribute(index).type()) {
        case Attribute.NOMINAL:
            return Double.NaN;
        case Attribute.NUMERIC:
            return val1 * val2;
        }
        return Double.NaN;
    }

    @Override
    public String getAttributeIndices() {
        return null;
    }

    @Override
    public Instances getInstances() {
        return m_Data;
    }

    @Override
    public boolean getInvertSelection() {
        return false;
    }

    @Override
    public void postProcessDistances(double[] arg0) {
        // nothing to do
    }

    @Override
    public void setAttributeIndices(String arg0) {
        // nothing to do
    }

    @Override
    public void setInstances(Instances arg0) {
        m_Data = arg0;
    }

    @Override
    public void setInvertSelection(boolean arg0) {
        // do nothing
    }

    @Override
    public void update(Instance arg0) {
        // do nothing
    }

    @Override
    public String[] getOptions() {
        return null;
    }

    @Override
    public Enumeration listOptions() {
        return null;
    }

    @Override
    public void setOptions(String[] arg0) throws Exception {
        // no options
    }

    @Override
    public String getRevision() {
        return "Cosine Distance function written by Tom, version " + version;
    }

    // A quick self-test: load a CSV file and print two pairwise distances.
    public static void main(String[] args) throws Exception {
        String src = "sample.csv";
        DataSource source = new DataSource(src);
        Instances data = source.getDataSet();
        CosineDistance cd = new CosineDistance();
        cd.setInstances(data);
        System.out.println(cd.distance(data.instance(0), data.instance(1)));
        System.out.println(cd.distance(data.instance(1), data.instance(2)));
    }
}
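A few remarks on the implementation: the four distance(...) overloads all funnel into the last one, which walks the sparse value lists of both instances in parallel, accumulating the dot product (via difference() and updateDistance()) together with the two vector norms, and it skips the class attribute so that it never influences the distance. Only numeric attributes contribute; nominal attributes yield NaN. Also note that setInstances() must be called before any distance is computed, because the dataset header (m_Data) is needed to look up attribute types; the main() method above does this explicitly, and SimpleKMeans does it for us when buildClusterer() is called in the clustering code below.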
Next, create a MyTextCluster.java file that uses this class as the distance function and performs K-means clustering. The code is as follows:
package cn.csdn.test;

import java.io.File;

import weka.clusterers.ClusterEvaluation;
import weka.clusterers.SimpleKMeans;
import weka.core.Instances;
import weka.core.converters.ArffLoader;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.StringToWordVector;

public class MyTextCluster {

    public static String arffpath = "E://2.arff"; // configure the path of the ARFF file here

    public static void main(String[] args) throws Exception {
        // Load the raw documents from the ARFF file.
        ArffLoader loader = new ArffLoader();
        loader.setFile(new File(arffpath));
        Instances dataRaw = loader.getDataSet();

        // Turn the string attribute into a TF-IDF weighted word vector.
        StringToWordVector filter = new StringToWordVector();
        filter.setInputFormat(dataRaw);
        filter.setWordsToKeep(1000);
        filter.setIDFTransform(true);
        filter.setOutputWordCounts(true);
        Instances dataFiltered = Filter.useFilter(dataRaw, filter);

        // K-means using the cosine distance implemented above.
        SimpleKMeans skm = new SimpleKMeans();
        skm.setDisplayStdDevs(false);
        //skm.setDistanceFunction(new EuclideanDistance());
        skm.setDistanceFunction(new CosineDistance());
        skm.setMaxIterations(500);
        skm.setDontReplaceMissingValues(true);
        skm.setNumClusters(3);
        skm.setPreserveInstancesOrder(false);
        skm.setSeed(100);
        skm.buildClusterer(dataFiltered);

        // Print the clustering summary and each instance's cluster assignment.
        ClusterEvaluation eval = new ClusterEvaluation();
        eval.setClusterer(skm);
        eval.evaluateClusterer(dataFiltered);
        //System.out.println("# of clusters: " + eval.getNumClusters());
        String a = eval.clusterResultsToString();
        System.out.println(a);
        for (int i = 0; i < dataFiltered.numInstances(); i++) {
            System.out.println("Instance" + String.valueOf(i) + " is in cluster "
                    + skm.clusterInstance(dataFiltered.instance(i)));
        }
    }
}

The output of a run is as follows:
kMeans
======
Number of iterations: 2
Within cluster sum of squared errors: 2.6483113613228255
Cluster centroids:
Cluster#
Attribute Full Data 0 1 2
(7) (1) (2) (4)
=============================================================
# 1.9459 0 0 3.4053
+ 13.3434 0 0 23.3509
- 3.557 0.6729 0.3365 5.8883
-- 0.3579 1.2528 0 0.3132
0 5.6761 21.2654 0 4.6168
01 0.7159 3.7583 0 0.3132
1 2.4783 0 0 4.337
10 0.8473 0 0 1.4828
11 0.4797 0 0 0.8394
12 1.2104 1.6946 0 1.6946
13 0.4842 0.8473 0 0.6355
14 0.3631 0 0 0.6355
15 0.3579 1.2528 0 0.3132
16 0.3579 0 0 0.6264
17 0.3579 0 0 0.6264
19 0.556 0 0 0.973
2 1.9187 0.5596 0 3.2178
20 0.3579 0 0 0.6264
2004 0.8948 0 0.6264 1.2528
2005 3.0579 0 0 5.3513
2006 3.4376 11.1923 0 3.2178
21 0.3631 0 0 0.6355
22 0.3579 0 0 0.6264
23 0.3579 1.2528 0 0.3132
24 0.5369 0 0 0.9396
25 0.3579 0 0 0.6264
26 0.7159 0 0 1.2528
27 0.7159 3.7583 0 0.3132
28 1.0393 5.0365 0 0.5596
29 0.3198 0.5596 0 0.4197
3 0.6249 0.3365 0 1.0094
30 0.5287 0.6729 0.3365 0.5888
31 0.4797 1.1192 0 0.5596
36 0.556 0 0 0.973
37 0.3579 0 0 0.6264
3rd 0.3579 0 0 0.6264
4 0.9593 0 0.2798 1.5389
40 0.3579 0 0 0.6264
49 0.7159 0 0 1.2528
5 1.0894 0 0 1.9064
55 0.556 0 0 0.973
6 1.6107 0 0 2.8187
7 0.7263 0 0 1.2709
7-inch 0.556 0 0 0.973
7/26/06 16.9572 0 0 29.6751
7/27/06 4.7258 0 0 8.2701
8 0.9683 0 0 1.6946
9 0.9683 0 0 1.6946
@ 4.8321 0 0 8.4562
A 0.721 0.6729 0.5047 0.8412
AM 0.5369 0 0 0.9396
API 2.2239 15.5673 0 0
About 0.3845 0.3365 0 0.5888
Actually 0.834 0 0 1.4594
Add 0.556 0 0 0.973
All 0.8948 0 0 1.566
Allah 0.556 0 0 0.973
Also 0.4842 0.8473 0 0.6355
Amazon 0.556 3.8918 0 0
America 0.556 0 0 0.973
An 0.4842 1.6946 0.4236 0.2118
And 0.6729 0 0.1682 1.0935
Annoucements 0.556 0 0 0.973
Antihexe 0.556 0 0 0.973
Anyway 0.3579 0 0 0.6264
April 0.3579 0 0 0.6264
Arbogast 0.834 5.8377 0 0
Archives 0.3631 0.8473 0 0.4236
As 0.3631 0 0 0.6355
Atom 0.5369 2.5055 0 0.3132
August 0.9683 2.5419 0 1.0591
Baker 0.556 3.8918 0 0
Bands 0.556 0 0 0.973
Be 0.3579 0 0.6264 0.3132
Because 0.5369 0 0 0.9396
Best 0.834 0 0 1.4594
Bill 0.556 3.8918 0 0
Blog 0.8473 0 0 1.4828
Blogroll 0.3579 1.2528 0 0.3132
Boondoggle 0.834 0 0 1.4594
Brainiac 1.9459 0 0 3.4053
(the remaining attributes are omitted)
Clustered Instances
0 1 ( 14%)
1 2 ( 29%)
2 4 ( 57%)
Instance0 is in cluster 2
Instance1 is in cluster 1
Instance2 is in cluster 0
Instance3 is in cluster 2
Instance4 is in cluster 1
Instance5 is in cluster 2
Instance6 is in cluster 2