• 文本聚类——Kmeans


    上两篇文章分别用朴素贝叶斯算法和KNN算法对newsgroup文本进行了分类测试,本文使用Kmeans算法对文本进行聚类。


    1、文本预处理

    文本预处理在前面两篇文章中已经介绍,此处(略)。


    2、文本向量化

    package com.datamine.kmeans;
    
    import java.io.*;
    import java.util.*;
    import java.util.Map.Entry;
    
    /**
     * Builds the feature dictionary and the TF-IDF vectors for a corpus of
     * pre-processed documents. Every document is a text file holding one token
     * per line; blank lines are ignored everywhere.
     *
     * @author Administrator
     */
    public class ComputeWordsVector {

        /** File that {@link #computeTFMultiIDF(String)} dumps all vectors to, for manual inspection. */
        private static final String SAMPLE_MAP_DUMP_FILE =
                "E:/DataMiningSample/KmeansClusterResult/allTestSampleMap.txt";

        /** A word must occur more than this many times in the corpus to be kept as a feature. */
        private static final double MIN_TERM_COUNT = 100;

        /**
         * Computes the TF-IDF vector of every document in a directory.
         * TF is the word count divided by the number of words in the document,
         * IDF is {@code ln(documentCount / documentFrequency)}.
         * As a side effect the result is also written to {@code SAMPLE_MAP_DUMP_FILE}.
         *
         * @param testSampleDir directory holding the filtered clustering samples
         * @return map of file name to (feature word to TF-IDF weight)
         * @throws IOException if the directory or a sample cannot be read, or the dump file cannot be written
         */
        public Map<String,Map<String,Double>> computeTFMultiIDF(String testSampleDir) throws IOException{

            Map<String, Map<String, Double>> allTestSampleMap = new TreeMap<String, Map<String, Double>>();
            Map<String, Double> idfPerWordMap = computeIDF(testSampleDir);

            File[] samples = listFilesOrFail(testSampleDir);
            System.out.println("the total number of test files is " + samples.length);
            for (File sample : samples) {
                // A fresh map per document, so no defensive copy is needed afterwards.
                Map<String, Double> tfPerDocMap = new TreeMap<String, Double>();
                double wordSumPerDoc = 0.0;

                BufferedReader reader = new BufferedReader(new FileReader(sample));
                try {
                    String word;
                    while ((word = reader.readLine()) != null) {
                        if (word.isEmpty()) {
                            continue;
                        }
                        wordSumPerDoc++;
                        increment(tfPerDocMap, word);
                    }
                } finally {
                    reader.close();
                }

                // Replace raw counts with TF * IDF. setValue() updates in place without
                // touching the map structure while it is being iterated.
                for (Map.Entry<String, Double> me : tfPerDocMap.entrySet()) {
                    double idf = Math.log(samples.length / idfPerWordMap.get(me.getKey()));
                    me.setValue((me.getValue() / wordSumPerDoc) * idf);
                }
                allTestSampleMap.put(sample.getName(), tfPerDocMap);
            }
            printTestSampleMap(allTestSampleMap);
            return allTestSampleMap;
        }

        /**
         * Writes every document vector to {@code SAMPLE_MAP_DUMP_FILE}, one document
         * per line in the form {@code name word weight word weight ...}. Debug aid only.
         *
         * @param allTestSampleMap map of file name to TF-IDF vector
         * @throws IOException if the dump file cannot be created or written
         */
        private void printTestSampleMap(
                Map<String, Map<String, Double>> allTestSampleMap) throws IOException {
            FileWriter outPutFileWriter = new FileWriter(new File(SAMPLE_MAP_DUMP_FILE));
            try {
                for (Map.Entry<String, Map<String, Double>> me : allTestSampleMap.entrySet()) {
                    outPutFileWriter.append(me.getKey() + " ");
                    for (Map.Entry<String, Double> vme : me.getValue().entrySet()) {
                        outPutFileWriter.append(vme.getKey() + " " + vme.getValue() + " ");
                    }
                    outPutFileWriter.append("\n");
                    outPutFileWriter.flush();
                }
            } finally {
                outPutFileWriter.close();
            }
        }

        /**
         * Counts how often every word occurs below a directory (recursively) and
         * returns the words that occur more than {@code MIN_TERM_COUNT} times.
         *
         * @param strDir  root directory of the pre-processed newsgroup documents
         * @param wordMap accumulator of word to occurrence count; it is updated in
         *                place and may already contain counts from earlier calls
         * @return sorted dictionary holding only the sufficiently frequent words
         * @throws IOException if a directory cannot be listed or a file cannot be read
         */
        public SortedMap<String, Double> countWords(String strDir,
                Map<String, Double> wordMap) throws IOException {

            accumulateCounts(new File(strDir), wordMap);

            // Frequency-based feature selection, applied once after the whole tree was counted.
            SortedMap<String, Double> newWordMap = new TreeMap<String, Double>();
            for (Map.Entry<String, Double> me : wordMap.entrySet()) {
                if (me.getValue() > MIN_TERM_COUNT) {
                    newWordMap.put(me.getKey(), me.getValue());
                }
            }
            return newWordMap;
        }

        /** Adds the word occurrences of every file below {@code dir} to {@code wordMap}. */
        private void accumulateCounts(File dir, Map<String, Double> wordMap) throws IOException {
            for (File entry : listFilesOrFail(dir.getPath())) {
                if (entry.isDirectory()) {
                    accumulateCounts(entry, wordMap);
                    continue;
                }
                BufferedReader reader = new BufferedReader(new FileReader(entry));
                try {
                    String word;
                    while ((word = reader.readLine()) != null) {
                        // Blank lines are not words and must not enter the map.
                        if (!word.isEmpty()) {
                            increment(wordMap, word);
                        }
                    }
                } finally {
                    reader.close();
                }
            }
        }

        /**
         * Computes the document frequency of every word: in how many documents of
         * the directory the word occurs at least once.
         *
         * @param testSampleDir directory holding the clustering samples
         * @return map of word to number of documents containing it
         * @throws IOException if the directory cannot be listed or a file cannot be read
         */
        public Map<String,Double> computeIDF(String testSampleDir) throws IOException{

            Map<String, Double> idfPerWordMap = new TreeMap<String, Double>();
            for (File sample : listFilesOrFail(testSampleDir)) {
                // Words already counted for the current document.
                Set<String> alreadyCountWord = new HashSet<String>();
                BufferedReader reader = new BufferedReader(new FileReader(sample));
                try {
                    String word;
                    while ((word = reader.readLine()) != null) {
                        // add() returns false when the word was seen before in this document.
                        if (!word.isEmpty() && alreadyCountWord.add(word)) {
                            increment(idfPerWordMap, word);
                        }
                    }
                } finally {
                    reader.close();
                }
            }
            return idfPerWordMap;
        }

        /**
         * Creates the clustering sample set: every document found in a category
         * sub-directory of {@code srcDir} is copied to
         * {@code desDir/<category>_<file>} with all non-feature words removed.
         * Plain files directly inside {@code srcDir} belong to no category and are skipped.
         *
         * @param srcDir directory of pre-processed documents, one sub-directory per category
         * @param desDir existing target directory (with or without trailing separator)
         * @return the feature words in sorted order
         * @throws IOException if a directory cannot be listed or a file cannot be read or written
         */
        public String[] createTestSamples(String srcDir, String desDir) throws IOException {

            SortedMap<String, Double> wordMap = countWords(srcDir, new TreeMap<String, Double>());
            System.out.println("special words map sizes:" + wordMap.size());

            for (File category : listFilesOrFail(srcDir)) {
                if (!category.isDirectory()) {
                    continue;
                }
                for (File sample : listFilesOrFail(category.getPath())) {
                    File target = new File(desDir, category.getName() + "_" + sample.getName());
                    copyFeatureWords(sample, target, wordMap);
                }
            }

            // The key set of a SortedMap is already in dictionary order.
            return wordMap.keySet().toArray(new String[0]);
        }

        /** Copies the lines of {@code source} that are dictionary words to {@code target}. */
        private void copyFeatureWords(File source, File target,
                Map<String, Double> dictionary) throws IOException {
            BufferedReader reader = new BufferedReader(new FileReader(source));
            try {
                FileWriter writer = new FileWriter(target);
                try {
                    String word;
                    while ((word = reader.readLine()) != null) {
                        if (dictionary.containsKey(word)) {
                            writer.append(word + "\n");
                        }
                    }
                    writer.flush();
                } finally {
                    writer.close();
                }
            } finally {
                reader.close();
            }
        }

        /** Adds one to the counter stored for {@code word}, starting at 1.0 for a new word. */
        private static void increment(Map<String, Double> counts, String word) {
            Double current = counts.get(word);
            counts.put(word, current == null ? 1.0 : current + 1.0);
        }

        /** Lists a directory, failing with a descriptive IOException instead of a later NullPointerException. */
        private static File[] listFilesOrFail(String dir) throws IOException {
            File[] entries = new File(dir).listFiles();
            if (entries == null) {
                throw new IOException("Not a readable directory: " + dir);
            }
            return entries;
        }
    }
    

    3、Kmeans算法

    Kmeans算法是很经典的聚类算法,算法主要过程如下:先选K个(或者随机选择)初始聚类点作为初始中心点,然后计算其他所有点到K个聚类中心点的距离,将点分到最近的聚类中。聚类完后,再次计算各个类的中心点;中心点发生变化,于是更新中心点,然后再计算其他点到中心点的距离,重新聚类;中心点又发生变化,如此迭代下去。


    初始点选取策略:随机选、均匀抽样、最大最小法等……

    距离的度量方法:1-余弦相似度,2-向量内积

    算法停止条件:计算准则函数及设置最大迭代次数

    空聚类的处理:注意空聚类导致的程序bug


    package com.datamine.kmeans;
    
    import java.io.BufferedReader;
    import java.io.FileNotFoundException;
    import java.io.FileReader;
    import java.io.FileWriter;
    import java.io.IOException;
    import java.util.*;
    
    /**
     * kmeans聚类算法的实现类,将newsgroup文档集聚成10类、20类、30类
     * 算法结束条件:当每一个点近期的聚类中心点就是它所属的聚类中心点时。算法结束
     * @author Administrator
     *
     */
    public class KmeansCluster {
    
    	/**
    	 * kmeans算法主过程
    	 * @param allTestSampleMap 聚类算法測试样本map(已经向量化) <文件名称,<特征词,TF-IDF值>>
    	 * @param k 聚类的数量
    	 * @return 聚类结果 <文件名称,聚类完毕后所属的类别号>
    	 */
    	private Map<String, Integer> doProcess(
    			Map<String, Map<String, Double>> allTestSampleMap, int k) {
    		
    		//0、首先获取allTestSampleMap全部文件名称顺序组成的数组
    		String[] testSampleNames = new String[allTestSampleMap.size()];
    		int count =0,tsLength = allTestSampleMap.size();
    		Set<Map.Entry<String, Map<String,Double>>> allTestSampleMapSet = allTestSampleMap.entrySet();
    		for(Iterator<Map.Entry<String, Map<String,Double>>> it = allTestSampleMapSet.iterator();it.hasNext();){
    			Map.Entry<String, Map<String,Double>> me = it.next();
    			testSampleNames[count++] = me.getKey();
    		}
    		
    		//1、初始点的选择算法是随机选择或者是均匀分开选择。这里採用后者
    		Map<Integer,Map<String,Double>> meansMap = getInitPoint(allTestSampleMap,k);
    		double [][] distance = new double[tsLength][k]; //distance[i][k]记录点i到聚类中心k的距离
    		
    		//2、初始化k个聚类
    		int[] assignMeans = new int[tsLength]; //记录全部点属于的聚类序号,初始化全部为0
    		Map<Integer,Vector<Integer>> clusterMember = new TreeMap<Integer, Vector<Integer>>();//记录每一个聚类的成员点序号
    		Vector<Integer> mem = new Vector<Integer>();
    		int iterNum = 0; //迭代次数
    		
    		while(true){
    			System.out.println("Iteration No." + (iterNum++) + "-------------------------");
    			//3、计算每一个点和每一个聚类中心的距离
    			for(int i = 0;i < tsLength;i++){
    				for(int j = 0;j<k;j++)
    					distance[i][j] = getDistance(allTestSampleMap.get(testSampleNames[i]),meansMap.get(j));
    			}
    			
    			//4、找出每一个点近期的聚类中心
    			int [] nearestMeans = new int[tsLength];
    			for(int i = 0;i < tsLength;i++){
    				nearestMeans[i] = findNearestMeans(distance,i);
    			}
    			
    			//5、推断当前全部点属于的聚类序号是否已经全部是其离的近期的聚类,假设是或者达到最大的迭代次数。那么结束算法
    			int okCount = 0;
    			for(int i= 0;i<tsLength;i++){
    				if(nearestMeans[i] == assignMeans[i])
    					okCount ++;
    			}
    			System.out.println("okCount = " + okCount);
    			if(okCount == tsLength || iterNum >= 10)
    				break;
    			
    			//6、假设前面条件不满足,那么须要又一次聚类再次进行一次迭代,须要改动每一个聚类的成员和每一个点属于的聚类信息
    			clusterMember.clear();
    			for(int i = 0;i < tsLength;i++){
    				assignMeans[i] = nearestMeans[i];
    				if(clusterMember.containsKey(nearestMeans[i])){
    					clusterMember.get(nearestMeans[i]).add(i);
    				}
    				else{
    					mem.clear();
    					mem.add(i);
    					Vector<Integer> tempMem = new Vector<Integer>();
    					tempMem.addAll(mem);
    					clusterMember.put(nearestMeans[i], tempMem);
    				}
    			}
    			
    			//7、又一次计算每一个聚类的中心点
    			for(int i = 0;i<k;i++){
    				
    				if(!clusterMember.containsKey(i)) //注意kmeans可能产生空聚类
    					continue;
    				
    				Map<String,Double> newMean = computeNewMean(clusterMember.get(i),allTestSampleMap,testSampleNames);
    				Map<String,Double> tempMean = new TreeMap<String,Double>();
    				tempMean.putAll(newMean);
    				meansMap.put(i, tempMean);
    			}
    		
    		}
    		
    		//8、形成聚类结果而且返回
     		Map<String,Integer> resMap = new TreeMap<String,Integer>();
    		for(int i = 0;i<tsLength;i++){
    			resMap.put(testSampleNames[i], assignMeans[i]);
    		}
    		
    		return resMap;
    	}
    	
    	/**
    	 * 计算当前聚类的新中心,採用向量平均
    	 * @param clusterM 该点到全部聚类中心的距离
    	 * @param allTestSampleMap 全部測试例子 <文件名称,向量>
    	 * @param testSampleNames 全部測试例子名构成的数组
    	 * @return 新的聚类中心向量
    	 */
    	private Map<String, Double> computeNewMean(Vector<Integer> clusterM,
    			Map<String, Map<String, Double>> allTestSampleMap,
    			String[] testSampleNames) {
    		
    		double memberNum = (double)clusterM.size();
    		Map<String,Double> newMeanMap = new TreeMap<String,Double>();
    		Map<String,Double> currentMemMap = new TreeMap<String, Double>();
    		
    		for(Iterator<Integer> it = clusterM.iterator();it.hasNext();){
    			int me = it.next();
    			currentMemMap = allTestSampleMap.get(testSampleNames[me]);
    			Set<Map.Entry<String, Double>> currentMemMapSet = currentMemMap.entrySet();
    			for(Iterator<Map.Entry<String, Double>> jt = currentMemMapSet.iterator();jt.hasNext();){
    				Map.Entry<String, Double> ne = jt.next();
    				if(newMeanMap.containsKey(ne.getKey()))
    					newMeanMap.put(ne.getKey(), newMeanMap.get(ne.getKey())+ne.getValue());
    				else
    					newMeanMap.put(ne.getKey(), ne.getValue());
    			}
    		}
    		
    		Set<Map.Entry<String, Double>> newMeanMapSet = newMeanMap.entrySet();
    		for(Iterator<Map.Entry<String, Double>> it = newMeanMapSet.iterator();it.hasNext();){
    			Map.Entry<String, Double> me = it.next();
    			newMeanMap.put(me.getKey(), newMeanMap.get(me.getKey()) / memberNum);
    		}
    		
    		return newMeanMap;
    	}
    
    	/**
    	 * 找出距离当前点近期的聚类中心
    	 * @param distance 点到全部聚类中心的距离
    	 * @param m 点(文本号)
    	 * @return 近期聚类中心的序号j
    	 */
    	private int findNearestMeans(double[][] distance, int m) {
    		
    		double minDist = 10;
    		int j = 0;
    		for(int i = 0;i<distance[m].length;i++){
    			if(distance[m][i] < minDist){
    				minDist = distance[m][i];
    				j = i;
    			}
    		}
    		return j;
    	}
    
    	/**
    	 * 计算两个点的距离
    	 * @param map1 点1的向量map
    	 * @param map2 点2的向量map
    	 * @return 两个点的欧式距离
    	 */
    	private double getDistance(Map<String, Double> map1, Map<String, Double> map2) {
    
    		return 1 - computeSim(map1,map2);
    	}
    
    	/**计算两个文本的类似度
    	 * @param testWordTFMap 文本1的<单词,词频>向量
    	 * @param trainWordTFMap 文本2<单词,词频>向量
    	 * @return Double 向量之间的类似度 以向量夹角余弦计算(加上凝视部分代码就可以)或者向量内积计算(不加凝视部分,效果相当而速度更快)
    	 * @throws IOException 
    	 */
    	private double computeSim(Map<String, Double> testWordTFMap,
    			Map<String, Double> trainWordTFMap) {
    		// TODO Auto-generated method stub
    		double mul = 0;//, testAbs = 0, trainAbs = 0;
    		Set<Map.Entry<String, Double>> testWordTFMapSet = testWordTFMap.entrySet();
    		for(Iterator<Map.Entry<String, Double>> it = testWordTFMapSet.iterator(); it.hasNext();){
    			Map.Entry<String, Double> me = it.next();
    			if(trainWordTFMap.containsKey(me.getKey())){
    				mul += me.getValue()*trainWordTFMap.get(me.getKey());
    			}
    			//testAbs += me.getValue() * me.getValue();
    		}
    		//testAbs = Math.sqrt(testAbs);
    		
    		/*Set<Map.Entry<String, Double>> trainWordTFMapSet = trainWordTFMap.entrySet();
    		for(Iterator<Map.Entry<String, Double>> it = trainWordTFMapSet.iterator(); it.hasNext();){
    			Map.Entry<String, Double> me = it.next();
    			trainAbs += me.getValue()*me.getValue();
    		}
    		trainAbs = Math.sqrt(trainAbs);*/
    		return mul ;/// (testAbs * trainAbs);
    	}
    
    	/**
    	 * 获取kmeans算法迭代的初始点
    	 * @param allTestSampleMap <文件名称,<特征词。TF-IDF值>>
    	 * @param k 聚类的数量
    	 * @return  meansMap k个聚类的中心点向量
    	 */
    	private Map<Integer, Map<String, Double>> getInitPoint(
    			Map<String, Map<String, Double>> allTestSampleMap, int k) {
    		
    		int count = 0, i = 0;
    		//保存k个聚类的中心向量
    		Map<Integer,Map<String,Double>> meansMap = new TreeMap<Integer, Map<String,Double>>();
    		System.out.println("本次聚类的初始点相应的文件为:");
    		Set<Map.Entry<String, Map<String,Double>>> allTestSampleMapSet = allTestSampleMap.entrySet();
    		for(Iterator<Map.Entry<String, Map<String,Double>>> it = allTestSampleMapSet.iterator();it.hasNext();){
    			Map.Entry<String, Map<String,Double>> me = it.next();
    			if(count == i*allTestSampleMapSet.size() / k){
    				meansMap.put(i, me.getValue());
    				System.out.println(me.getKey());
    				i++;
    			}
    			count++ ;
    		}
    		
    		return meansMap;
    	}
    
    	/**
    	 * 输出聚类结果到文件里
    	 * @param kmeansClusterResult 聚类结果
    	 * @param kmeansClusterResultFile 输出聚类结果到文件里
    	 * @throws IOException 
    	 */
    	private void printClusterResult(Map<String, Integer> kmeansClusterResult,
    			String kmeansClusterResultFile) throws IOException {
    
    		FileWriter resultWriter = new FileWriter(kmeansClusterResultFile);
    		Set<Map.Entry<String, Integer>> kmeansClusterResultSet = kmeansClusterResult.entrySet();
    		for(Iterator<Map.Entry<String, Integer>> it = kmeansClusterResultSet.iterator();it.hasNext();){
    			Map.Entry<String, Integer> me = it.next();
    			resultWriter.append(me.getKey()+" "+me.getValue()+"
    ");
    		}
    		resultWriter.flush();
    		resultWriter.close();
    	}
    	
    	/**
    	 * 评估函数依据聚类结果文件统计熵 和 混淆矩阵
    	 * @param kmeansClusterResultFile 聚类结果文件
    	 * @param k 聚类数目
    	 * @return 聚类结果的熵值
    	 * @throws IOException 
    	 */
    	private double evaluateClusterResult(String kmeansClusterResultFile, int k) throws IOException {
    
    		Map<String,String> rightCate = new TreeMap<String, String>();
    		Map<String,String> resultCate = new TreeMap<String, String>();
    		FileReader crReader = new FileReader(kmeansClusterResultFile);
    		BufferedReader crBR  = new BufferedReader(crReader);
    		String[] s;
    		String line;
    		while((line = crBR.readLine()) != null){
    			s = line.split(" ");
    			resultCate.put(s[0], s[1]);
    			rightCate.put(s[0], s[0].split("_")[0]);
    		}
    		crBR.close();
    		return computeEntropyAndConfuMatrix(rightCate,resultCate,k);//返回熵
    	}
    	
    	/**
    	 * 计算混淆矩阵并输出,返回熵
    	 * @param rightCate 正确的类目相应map
    	 * @param resultCate 聚类结果相应map
    	 * @param k 聚类的数目
    	 * @return 返回聚类熵
    	 */
    	private double computeEntropyAndConfuMatrix(Map<String, String> rightCate,
    			Map<String, String> resultCate, int k) {
    		
    		//k行20列,[i,j]表示聚类i中属于类目j的文件数
    		int[][] confusionMatrix = new int[k][20];
    		
    		//首先求出类目相应的数组索引
    		SortedSet<String> cateNames = new TreeSet<String>();
    		Set<Map.Entry<String, String>> rightCateSet = rightCate.entrySet();
    		for(Iterator<Map.Entry<String, String>> it = rightCateSet.iterator();it.hasNext();){
    			Map.Entry<String, String> me = it.next();
    			cateNames.add(me.getValue());
    		}
    		
    		String[] cateNamesArray = cateNames.toArray(new String[0]);
    		Map<String,Integer> cateNamesToIndex = new TreeMap<String, Integer>();
    		for(int i =0;i < cateNamesArray.length ;i++){
    			cateNamesToIndex.put(cateNamesArray[i], i);
    		}
    		
    		for(Iterator<Map.Entry<String, String>> it = rightCateSet.iterator();it.hasNext();){
    			Map.Entry<String, String> me = it.next();
    			confusionMatrix[Integer.parseInt(resultCate.get(me.getKey()))][cateNamesToIndex.get(me.getValue())]++;
    		}
    		
    		//输出混淆矩阵
    		double [] clusterSum = new double[k]; //记录每一个聚类的文件数
    		double [] everyClusterEntropy = new double[k]; //记录每一个聚类的熵
    		double clusterEntropy = 0;
    		
    		System.out.print("      ");
    		
    		for(int i=0;i<20;i++){
    			System.out.printf("%-6d",i);
    		}
    		
    		System.out.println();
    		
    		for(int i =0;i<k;i++){
    			System.out.printf("%-6d",i);
    			for(int j = 0;j<20;j++){
    				clusterSum[i] += confusionMatrix[i][j];
    				System.out.printf("%-6d",confusionMatrix[i][j]);
    			}
    			System.out.println();
    		}
    		System.out.println();
    		
    		//计算熵值
    		for(int i = 0;i<k;i++){
    			if(clusterSum[i] != 0){
    				for(int j = 0;j< 20 ;j++){
    					double p = (double)confusionMatrix[i][j]/clusterSum[i];
    					if(p!=0)
    						everyClusterEntropy[i] += -p * Math.log(p); 
    				}
    				clusterEntropy += clusterSum[i]/(double)rightCate.size() * everyClusterEntropy[i];  
    			}
    		}
    		return clusterEntropy;
    	}
    
    	public void KmeansClusterMain(String testSampleDir) throws IOException {
    		
    		//首先计算文档TF-IDF向量,保存为Map<String,Map<String,Double>> 即为Map<文件名称,Map<特征词,TF-IDF值>>
    		ComputeWordsVector computV = new ComputeWordsVector();
    		
    		//int k[] = {10,20,30}; 三组分类
    		int k[] = {20};
    		
    		Map<String,Map<String,Double>> allTestSampleMap = computV.computeTFMultiIDF(testSampleDir);
    		
    		for(int i =0;i<k.length;i++){
    			System.out.println("開始聚类。聚成"+k[i]+"类");
    			String KmeansClusterResultFile = "E:\DataMiningSample\KmeansClusterResult\";
    			Map<String,Integer> KmeansClusterResult = new TreeMap<String, Integer>();
    			KmeansClusterResult = doProcess(allTestSampleMap,k[i]);
    			KmeansClusterResultFile += k[i];
    			printClusterResult(KmeansClusterResult,KmeansClusterResultFile);
    			System.out.println("The Entropy for this Cluster is " + evaluateClusterResult(KmeansClusterResultFile,k[i]));
    		}
    		
    	}
    	
    	
    	public static void main(String[] args) throws IOException {
    		
    		KmeansCluster test = new KmeansCluster();
    		
    		String KmeansClusterResultFile = "E:\DataMiningSample\KmeansClusterResult\20";
    		System.out.println("The Entropy for this Cluster is " + test.evaluateClusterResult(KmeansClusterResultFile,20));
    	}
    
    
    	
    }
    

    4、程序入口

    package com.datamine.kmeans;
    
    import java.io.IOException;
    import java.text.SimpleDateFormat;
    import java.util.Date;
    
    public class ClusterMain {
    
    	/**
    	 * Kmeans 聚类主程序入口
    	 * @param args
    	 * @throws IOException 
    	 */
    	public static void main(String[] args) throws IOException {
    		
    		//数据预处理 在分类算法中已经实现 这里(略)
    		
    		ComputeWordsVector computeV = new ComputeWordsVector();
    		
    		KmeansCluster kmeansCluster = new KmeansCluster();
    		
    		String srcDir = "E:\DataMiningSample\processedSample\";
    		String desDir = "E:\DataMiningSample\clusterTestSample\";
    		
    		SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    		String beginTime = sdf.format(new Date());
    		System.out.println("程序開始运行时间:"+beginTime);
    		
    		String[] terms = computeV.createTestSamples(srcDir,desDir);
    		kmeansCluster.KmeansClusterMain(desDir);
    		
    		String endTime = sdf.format(new Date());
    		System.out.println("程序结束运行时间:"+endTime);
    		
    	}
    	
    	
    }
    

    5、聚类结果

    程序開始运行时间:2016-03-14 17:02:38
    special words map sizes:3832
    the total number of test files is 18828
    開始聚类,聚成20类
    本次聚类的初始点相应的文件为:
    alt.atheism_49960
    comp.graphics_38307
    comp.os.ms-windows.misc_10112
    comp.sys.ibm.pc.hardware_58990
    comp.sys.mac.hardware_50449
    comp.windows.x_66402
    comp.windows.x_68299
    misc.forsale_76828
    rec.autos_103685
    rec.motorcycles_105046
    rec.sport.baseball_104941
    rec.sport.hockey_54126
    sci.crypt_15819
    sci.electronics_54016
    sci.med_59222
    sci.space_61185
    soc.religion.christian_20966
    talk.politics.guns_54517
    talk.politics.mideast_76331
    talk.politics.misc_178699
    Iteration No.0-------------------------
    okCount = 512
    Iteration No.1-------------------------
    okCount = 10372
    Iteration No.2-------------------------
    okCount = 15295
    Iteration No.3-------------------------
    okCount = 17033
    Iteration No.4-------------------------
    okCount = 17643
    Iteration No.5-------------------------
    okCount = 18052
    Iteration No.6-------------------------
    okCount = 18282
    Iteration No.7-------------------------
    okCount = 18404
    Iteration No.8-------------------------
    okCount = 18500
    Iteration No.9-------------------------
    okCount = 18627
          0     1     2     3     4     5     6     7     8     9     10    11    12    13    14    15    16    17    18    19    
    0     482   0     3     3     1     1     0     5     2     1     0     0     2     27    11    53    4     6     15    176   
    1     4     601   69    8     14    127   7     5     5     8     0     14    31    16    34    2     2     2     1     5     
    2     1     64    661   96    18    257   26    9     3     0     0     13    25    13    6     2     3     2     6     2     
    3     0     56    78    575   213   15    119   15    6     2     1     4     131   2     4     2     6     0     2     1     
    4     1     25    13    151   563   11    50    3     3     1     2     14    125   4     8     1     0     3     0     0     
    5     2     28    78    25    37    348   13    2     0     0     2     5     38    5     6     2     1     1     2     8     
    6     20    80    24    21    23    166   38    45    45    26    10    37    87    34    27    22    15    8     35    12    
    7     4     20    6     24    45    6     629   28    20    14    0     3     87    10    4     1     8     0     13    0     
    8     0     2     1     10    8     4     25    781   40    1     1     0     70    5     10    2     8     4     2     3     
    9     4     2     11    0     1     1     11    34    831   1     0     1     7     7     0     1     1     1     8     0     
    10    10    7     6     2     4     1     7     7     4     633   4     5     11    18    9     5     13    8     10    3     
    11    1     0     1     9     4     1     20    1     3     286   961   0     17    8     4     2     2     0     5     3     
    12    3     14    0     6     1     2     2     0     1     1     0     858   51    1     1     2     16    8     69    4     
    13    3     15    4     7     7     17    5     12    8     5     2     5     46    13    793   6     5     2     30    5     
    14    2     4     0     1     0     2     4     6     3     4     4     2     14    746   3     1     2     3     55    11    
    15    30    43    29    39    15    18    12    13    7     3     4     13    195   38    36    5     6     18    5     11    
    16    195   1     0     2     0     1     1     0     4     1     4     1     4     16    6     846   3     6     16    274   
    17    8     2     0     2     4     2     1     5     7     0     0     10    30    12    5     28    363   9     289   23    
    18    19    1     0     0     2     0     0     6     0     1     1     3     1     3     2     9     8     843   48    18    
    19    10    8     1     1     1     0     2     13    2     6     3     3     9     12    18    5     444   16    164   69    
    
    The Entropy for this Cluster is 1.2444339205006887
    程序结束运行时间:2016-03-14 17:08:24




  • 相关阅读:
    mysql source 乱码
    php5.6.11 openssl安装
    python threading模块的Lock和RLock区别
    python多线程一些知识点梳理
    多核处理器中进程和线程是如何一起工作的?
    IO是否会一直占用CPU?(转)
    Python TypeError: __init__() got multiple values for argument 'master'(转)
    Jquery中.bind()、.live()、.delegate()和.on()之间的区别详解(转)
    浏览器环境下JavaScript脚本加载与执行探析之代码执行顺序(转)
    为什么有的网页右击没有出现审查元素
  • 原文地址:https://www.cnblogs.com/brucemengbm/p/7229347.html
Copyright © 2020-2023  润新知