• 改用MyAnalyzer的KMeans聚类算法


    <strong><span style="font-size:18px;">/***
     * @author YangXin
     * @info 改用MyAnalyzer的KMeans聚类算法
     */
    package unitTen;
    import java.io.File;
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.SequenceFile;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.mahout.clustering.Cluster;
    import org.apache.mahout.clustering.kmeans.KMeansDriver;
    import org.apache.mahout.clustering.kmeans.RandomSeedGenerator;
    import org.apache.mahout.common.HadoopUtil;
    import org.apache.mahout.common.distance.CosineDistanceMeasure;
    import org.apache.mahout.vectorizer.DictionaryVectorizer;
    import org.apache.mahout.vectorizer.DocumentProcessor;
    import org.apache.mahout.vectorizer.tfidf.TFIDFConverter;
    public class NewsKMeansClustering {

    	/**
    	 * Tokenizes a Reuters corpus (already converted to SequenceFiles) with the
    	 * project-local {@code MyAnalyzer}, builds TF and TF-IDF vectors via Mahout,
    	 * then clusters the TF-IDF vectors with k-means (cosine distance, k = 20).
    	 *
    	 * Side effects: deletes and recreates the local "newsClusters" output
    	 * directory; launches Hadoop jobs against the default FileSystem.
    	 *
    	 * @param args unused
    	 * @throws Exception propagated from Hadoop/Mahout job execution
    	 */
    	public static void main(String[] args) throws Exception {

    	    // --- Vectorization parameters ---
    	    int minSupport = 5;         // drop terms occurring fewer than 5 times
    	    int minDf = 5;              // drop terms appearing in fewer than 5 docs
    	    int maxDFPercent = 99;      // drop terms appearing in more than 99% of docs
    	    int maxNGramSize = 1;       // unigrams only
    	    int minLLRValue = 50;       // log-likelihood ratio threshold (n-gram collocation)
    	    int reduceTasks = 1;
    	    int chunkSize = 200;        // dictionary chunk size (MB)
    	    int norm = -1;              // -1 = do not normalize vectors
    	    boolean sequentialAccessOutput = true;

    	    String inputDir = "reuters-seqfiles";
    	    Configuration conf = new Configuration();
    	    FileSystem fs = FileSystem.get(conf);

    	    String outputDir = "newsClusters";
    	    // Start from a clean output directory so repeated runs don't fail.
    	    HadoopUtil.delete(conf, new Path(outputDir));

    	    // Step 1: tokenize documents with MyAnalyzer.
    	    Path tokenizedPath = new Path(outputDir,
    	        DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER);
    	    MyAnalyzer analyzer = new MyAnalyzer();
    	    DocumentProcessor.tokenizeDocuments(new Path(inputDir), analyzer
    	        .getClass().asSubclass(Analyzer.class), tokenizedPath, conf);

    	    // Step 2: term-frequency vectors, then TF-IDF weighting.
    	    DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath,
    	      new Path(outputDir), conf, minSupport, maxNGramSize, minLLRValue, 2, true, reduceTasks,
    	      chunkSize, sequentialAccessOutput, false);
    	    TFIDFConverter.processTfIdf(
    	      new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER),
    	      new Path(outputDir), conf, chunkSize, minDf,
    	      maxDFPercent, norm, true, sequentialAccessOutput, false, reduceTasks);

    	    Path vectorsFolder = new Path(outputDir, "tfidf-vectors");
    	    Path centroids = new Path(outputDir, "centroids");
    	    Path clusterOutput = new Path(outputDir, "clusters");

    	    // Step 3: random initial centroids (k = 20), then k-means with
    	    // cosine distance, convergence delta 0.01, max 20 iterations.
    	    RandomSeedGenerator.buildRandom(conf, vectorsFolder, centroids, 20,
    	      new CosineDistanceMeasure());
    	    KMeansDriver.run(conf, vectorsFolder, centroids, clusterOutput,
    	      new CosineDistanceMeasure(), 0.01, 20, true, false);

    	    // FIX: the reader was previously opened and never closed (resource leak);
    	    // try-with-resources guarantees it is released.
    	    try (SequenceFile.Reader reader = new SequenceFile.Reader(fs,
    	        new Path(clusterOutput, Cluster.CLUSTERED_POINTS_DIR
    	                                + "/part-m-00000"), conf)) {
    	        // NOTE(review): the original code opened this reader but never read
    	        // from it — the iteration over clustered points was presumably
    	        // truncated when the example was published. Add reading logic here
    	        // if cluster membership output is needed.
    	    }
    	  }
    }
    </span></strong>

  • 相关阅读:
    Server requested plaintext password but ‘client plaintext auth’ is disabled
    LDAP目录树中常见的关键字
    LINUX下让一个用户添加进多个组中
    CentOS5.6下samba+ldap+smbldaptools的安装
    解决configure: error: Cannot find pam headers. Please check if your system is ready for pam module development
    解决configure: error: C++ compiler cannot create executables问题
    IE中拖动DOM元素的例子
    请确保此代码文件中定义的类与“inherits”属性匹配,并且该类扩展的基类(例如Page 或UserControl)是正确的。
    DOM中的高级事件处理
    好导网(推荐)
  • 原文地址:https://www.cnblogs.com/clnchanpin/p/7238208.html
Copyright © 2020-2023  润新知