1. BSP
通过继承 org.apache.hama.bsp.BSP 类,创建自己的BSP类。
public abstract void bsp(BSPPeer<K1, V1, K2, V2, M extends Writable> peer) throws IOException, SyncException, InterruptedException{}
// Build a configuration and a job object tied to the user's BSP class.
HamaConfiguration conf = new HamaConfiguration();
BSPJob job = new BSPJob(conf, MyBSP.class);
job.setJobName("My BSP program");
job.setBspClass(MyBSP.class);
// NOTE(review): NullInputFormat is presumably meant for jobs without input - confirm.
job.setInputFormat(NullInputFormat.class);
job.setOutputKeyClass(Text.class);
...
// NOTE(review): presumably submits the job and blocks until it finishes; the
// meaning of the boolean argument is not visible here - confirm.
job.waitForCompletion(true);
// Input: read records from a sequence file.
job.setInputPath(new Path("/tmp/sequence.dat"));
job.setInputFormat(org.apache.hama.bsp.SequenceFileInputFormat.class);
// or, register a single input path through the input format:
SequenceFileInputFormat.addInputPath(job, new Path("/tmp/sequence.dat"));
// or, register several comma-separated input paths at once:
SequenceFileInputFormat.addInputPaths(job, "/tmp/seq1.dat,/tmp/seq2.dat,/tmp/seq3.dat");

// Output: Text keys and IntWritable values written as text under /tmp/result.
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
job.setOutputFormat(TextOutputFormat.class);
FileOutputFormat.setOutputPath(job, new Path("/tmp/result"));
@Override public final void bsp( BSPPeer<LongWritable, Text, Text, LongWritable, Text> peer) throws IOException, InterruptedException, SyncException { // this method reads the next key value record from file KeyValuePair<LongWritable, Text> pair = peer.readNext(); // the following lines do the same: LongWritable key = new LongWritable(); Text value = new Text(); peer.readNext(key, value); // write peer.write(value, key); }
// Run five passes over the input, reopening it after each pass.
for (int i = 0; i < 5; i++) {
  LongWritable key = new LongWritable();
  Text value = new Text();
  while (peer.readNext(key, value)) {
    // read everything
  }
  // Reopen the input so that it can be read again in the next pass.
  peer.reopenInput();
}
| 方法 | 描述 |
| --- | --- |
| send(String peerName, BSPMessage msg) | 向另外一个peer发送消息 |
| getCurrentMessage() | 返回接收到的消息 |
| getNumCurrentMessages() | 返回接收到的消息数 |
| sync() | 障栅同步 |
| getPeerName() | 返回peer的名称 |
| getAllPeerNames() | 返回所有peer的名称 |
| getSuperstepCount() | 返回超步数 |
/**
 * Sends a greeting to every peer of the job (as listed by getAllPeerNames())
 * and then enters the barrier synchronization.
 */
@Override
public void bsp(
    BSPPeer<NullWritable, NullWritable, Text, DoubleWritable, Text> peer)
    throws IOException, SyncException, InterruptedException {
  for (String peerName : peer.getAllPeerNames()) {
    // Text has no (String, long) constructor, so the timestamp is folded
    // into the message string itself.
    peer.send(peerName,
        new Text("Hello from " + peer.getPeerName() + " at " + System.currentTimeMillis()));
  }
  peer.sync();
}
当所有进程都进入同步状态后,就会进入下一个超步。需要注意的是,调用 sync() 方法并不意味着 BSP Job 的结束。如前所述,所有的通信方法都非常灵活。例如,可以在一个 for 循环中反复执行 sync(),这样就可以对迭代顺序进行控制。
@Override public void bsp( BSPPeer<NullWritable, NullWritable, Text, DoubleWritable, Text> peer) throws IOException, SyncException, InterruptedException { for (int i = 0; i < 100; i++) { // send some messages peer.sync(); } }
private static Path TMP_OUTPUT = new Path("/tmp/pi-" + System.currentTimeMillis()); public static class MyEstimator extends BSP<NullWritable, NullWritable, Text, DoubleWritable, DoubleWritable> { public static final Log LOG = LogFactory.getLog(MyEstimator.class); private String masterTask; private static final int iterations = 10000; @Override public void bsp( BSPPeer<NullWritable, NullWritable, Text, DoubleWritable, DoubleWritable> peer) throws IOException, SyncException, InterruptedException { int in = 0; for (int i = 0; i < iterations; i++) { double x = 2.0 * Math.random() - 1.0, y = 2.0 * Math.random() - 1.0; if ((Math.sqrt(x * x + y * y) < 1.0)) { in++; } } double data = 4.0 * in / iterations; peer.send(masterTask, new DoubleWritable(data)); peer.sync(); } @Override public void setup( BSPPeer<NullWritable, NullWritable, Text, DoubleWritable, DoubleWritable> peer) throws IOException { // Choose one as a master this.masterTask = peer.getPeerName(peer.getNumPeers() / 2); } @Override public void cleanup( BSPPeer<NullWritable, NullWritable, Text, DoubleWritable, DoubleWritable> peer) throws IOException { if (peer.getPeerName().equals(masterTask)) { double pi = 0.0; int numPeers = peer.getNumCurrentMessages(); DoubleWritable received; while ((received = peer.getCurrentMessage()) != null) { pi += received.get(); } pi = pi / numPeers; peer.write(new Text("Estimated value of PI is"), new DoubleWritable(pi)); } } } static void printOutput(HamaConfiguration conf) throws IOException { FileSystem fs = FileSystem.get(conf); FileStatus[] files = fs.listStatus(TMP_OUTPUT); for (int i = 0; i < files.length; i++) { if (files[i].getLen() > 0) { FSDataInputStream in = fs.open(files[i].getPath()); IOUtils.copyBytes(in, System.out, conf, false); in.close(); break; } } fs.delete(TMP_OUTPUT, true); } public static void main(String[] args) throws InterruptedException, IOException, ClassNotFoundException { // BSP job configuration HamaConfiguration conf = new HamaConfiguration(); 
BSPJob bsp = new BSPJob(conf, PiEstimator.class); // Set the job name bsp.setJobName("Pi Estimation Example"); bsp.setBspClass(MyEstimator.class); bsp.setInputFormat(NullInputFormat.class); bsp.setOutputKeyClass(Text.class); bsp.setOutputValueClass(DoubleWritable.class); bsp.setOutputFormat(TextOutputFormat.class); FileOutputFormat.setOutputPath(bsp, TMP_OUTPUT); BSPJobClient jobClient = new BSPJobClient(conf); ClusterStatus cluster = jobClient.getClusterStatus(true); if (args.length > 0) { bsp.setNumBspTask(Integer.parseInt(args[0])); } else { // Set to maximum bsp.setNumBspTask(cluster.getMaxTasks()); } long startTime = System.currentTimeMillis(); if (bsp.waitForCompletion(true)) { printOutput(conf); System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); } }
2. Graph
Hama 提供了 Graph 包,支持以顶点为中心的图计算,只需较少的代码就可以实现 Google Pregel 风格的应用。
Vertex API
实现一个Hama Graph应用包括对预定义的Vertex类进行子类化,模板参数涉及3种类型,顶点、边和消息(vertices, edges, and messages):
public abstract class Vertex<V extends Writable, E extends Writable, M extends Writable> implements VertexInterface<V, E, M> { public void compute(Iterator<M> messages) throws IOException; .. }
VertexReader API
通过继承 org.apache.hama.graph.VertexInputReader 类,根据自己的文件格式创建自己的 VertexReader,示例如下:
public static class PagerankTextReader extends VertexInputReader<LongWritable, Text, Text, NullWritable, DoubleWritable> { /** * 输入文件的格式 * The text file essentially should look like: <br/> * VERTEX_ID\t(n-tab separated VERTEX_IDs)<br/> * E.G:<br/> * 1\t2\t3\t4<br/> * 2\t3\t1<br/> * etc. */ @Override /*** * 解析节点,如hadoop类似,以行为一个单位进行输入。以制表符作为分割符, * 将每一行分割为String类型的数组,最后转化为vertex类的一个实例 */ public boolean parseVertex(LongWritable key, Text value, Vertex<Text, NullWritable, DoubleWritable> vertex) throws Exception { String[] split = value.toString().split("\t"); for (int i = 0; i < split.length; i++) { if (i == 0) { vertex.setVertexID(new Text(split[i])); } else { vertex .addEdge(new Edge<Text, NullWritable>(new Text(split[i]), null)); } } return true; } }
/**
 * PageRank vertex: Text vertex IDs, no edge values, DoubleWritable ranks
 * and messages.
 */
public static class PageRankVertex extends
    Vertex<Text, NullWritable, DoubleWritable> {

  /**
   * In superstep 0 the value is initialised to 1 / numVertices. In every
   * later superstep the incoming messages are summed and the value becomes
   * (1 - 0.85) / numVertices + 0.85 * sum. While the superstep count is
   * below the maximum iteration, the current value divided by the number of
   * out-edges is sent to the neighbors.
   */
  @Override
  public void compute(Iterator<DoubleWritable> messages) throws IOException {
    final double damping = 0.85;

    if (getSuperstepCount() == 0) {
      setValue(new DoubleWritable(1.0 / (double) getNumVertices()));
    } else if (getSuperstepCount() >= 1) {
      double received = 0;
      while (messages.hasNext()) {
        received += messages.next().get();
      }
      double base = (1 - damping) / (double) getNumVertices();
      setValue(new DoubleWritable(base + (damping * received)));
    }

    if (getSuperstepCount() < getMaxIteration()) {
      int outDegree = getOutEdges().size();
      double share = getValue().get() / outDegree;
      sendMessageToNeighbors(new DoubleWritable(share));
    }
  }
}