• mahout学习之二——mahout0.9kmeans聚类实例


    最近学习《Mahout实战》,但是书中的代码使用的是 mahout 0.5 版本,很多地方在 mahout 0.9 版本中已经改头换面了。经过调试并阅读 mahout 0.9 API,对书中的代码做了相应修改。

    运行结果如图:


    修改代码如下:


    package cn.kelaile.hadooptest;


    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.SequenceFile;
    import org.apache.hadoop.io.Text;
    import org.apache.mahout.clustering.Cluster;
    import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable;
    import org.apache.mahout.clustering.kmeans.KMeansDriver;
    import org.apache.mahout.clustering.kmeans.Kluster;
    import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
    import org.apache.mahout.math.RandomAccessSparseVector;
    import org.apache.mahout.math.Vector;
    import org.apache.mahout.math.VectorWritable;

    import java.io.File;
    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.List;

    /**
     * Minimal k-means clustering example against the Mahout 0.9 API.
     *
     * <p>The program writes a small set of 2-D points to a sequence file, seeds
     * {@code k} initial clusters from the first {@code k} points, runs
     * {@link KMeansDriver} and prints the cluster each point was assigned to.
     * All paths are relative ({@code clustering/...}) and are resolved by the
     * {@link FileSystem} obtained from the default {@link Configuration}.
     */
    public class TestKMeansClusteringBy9 {

        /** Sample input: nine 2-D points forming two visually separate groups. */
        public static final double[][] points = {
                {1, 1}, {2, 1}, {1, 2},
                {2, 2}, {3, 3}, {8, 8},
                {9, 8}, {8, 9}, {9, 9}};

        /**
         * Writes the given vectors to a sequence file keyed by a running record number.
         *
         * @param points   vectors to write, in iteration order
         * @param fileName path of the sequence file to create
         * @param fs       file system the file is created on
         * @param conf     Hadoop configuration passed to the writer
         * @throws IOException if the file cannot be created or written
         */
        public static void writePointsToFile(List<Vector> points,
                                             String fileName,
                                             FileSystem fs,
                                             Configuration conf) throws IOException {
            Path path = new Path(fileName);
            SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf,
                    path, LongWritable.class, VectorWritable.class);
            try {
                long recNum = 0;
                // A single VectorWritable is reused as the value holder for every record.
                VectorWritable vec = new VectorWritable();
                for (Vector point : points) {
                    vec.set(point);
                    writer.append(new LongWritable(recNum++), vec);
                }
            } finally {
                // Close even when append() fails so the underlying stream is not leaked.
                writer.close();
            }
        }

        /**
         * Converts raw coordinate rows into Mahout vectors.
         *
         * @param raw one row of coordinates per point
         * @return a new list with one {@link RandomAccessSparseVector} per row, in row order
         */
        public static List<Vector> getPoints(double[][] raw) {
            List<Vector> points = new ArrayList<Vector>(raw.length);
            for (double[] fr : raw) {
                Vector vec = new RandomAccessSparseVector(fr.length);
                vec.assign(fr);
                points.add(vec);
            }
            return points;
        }

        /**
         * Runs the example end to end: write points, seed clusters, run k-means,
         * print the resulting point-to-cluster assignments.
         *
         * @param args ignored
         * @throws Exception if any file-system operation or the k-means run fails
         */
        public static void main(String[] args) throws Exception {

            int k = 2;

            List<Vector> vectors = getPoints(points);

            // mkdirs() also creates the missing parents ("clustering", "clustering/testdata");
            // the previous mkdir() calls failed silently when "clustering" did not exist yet.
            // The result is deliberately not enforced: a directory that is really unusable
            // surfaces as an IOException from the write below.
            File testData = new File("clustering/testdata/points");
            if (!testData.exists()) {
                testData.mkdirs();
            }

            Configuration conf = new Configuration();
            FileSystem fs = FileSystem.get(conf);
            writePointsToFile(vectors, "clustering/testdata/points/file1", fs, conf);

            // Seed the initial clusters with the first k input points.
            Path path = new Path("clustering/testdata/clusters/part-00000");
            SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, path, Text.class, Kluster.class);
            try {
                for (int i = 0; i < k; i++) {
                    Vector vec = vectors.get(i);
                    Kluster cluster = new Kluster(vec, i, new EuclideanDistanceMeasure());
                    writer.append(new Text(cluster.getIdentifier()), cluster);
                }
            } finally {
                writer.close();
            }

            // Argument names follow the Mahout 0.9 KMeansDriver.run signature.
            KMeansDriver.run(conf,
                    new Path("clustering/testdata/points"),   // input
                    new Path("clustering/testdata/clusters"), // clustersIn
                    new Path("clustering/output"),            // output
                    0.001,                                    // convergenceDelta
                    10,                                       // maxIterations
                    true,                                     // runClustering
                    0,                                        // clusterClassificationThreshold
                    true);                                    // runSequential

            SequenceFile.Reader reader = new SequenceFile.Reader(fs,
                    new Path("clustering/output/" + Cluster.CLUSTERED_POINTS_DIR + "/part-m-0"), conf);
            try {
                IntWritable key = new IntWritable();
                WeightedPropertyVectorWritable value = new WeightedPropertyVectorWritable();
                while (reader.next(key, value)) {
                    System.out.println(value.toString() + " belongs to cluster " + key.toString());
                }
            } finally {
                reader.close();
            }
        }

    }

  • 相关阅读:
    Memory Limit Exceeded
    浙江省程序设计竞赛2019
    hdu3974 Assign the task
    TCP面向字节流和UDP面向报文的区别
    django-admin和django-admin.py的区别
    利用 pip 安装 Python 程序包到个人用户文件夹下
    PyCharm中目录directory与包package的区别及相关import详解
    分布式表示(Distributed Representation)
    Nginx
    32.最长有效括号
  • 原文地址:https://www.cnblogs.com/zhangdebin/p/5567913.html
Copyright © 2020-2023  润新知