• hadoop实现共同出现的单词(Word co-occurrence)


    共同出现的单词(Word co-occurrence)是指在同一行(句子)中相邻的两个单词。每一对相邻的单词就是一个 Co-Occurrence 对;单词对不区分先后顺序和大小写,例如 “I Love” 与 “love i” 都计为 i:love。

    Sample Input:

    a b cc, c d d c
    I Love U.
    dd ee f g s sa dew ad da
    So shaken as we are, so wan with care.
    Find we a time for frighted peace to pant.
    And breathe short-winded accents of new broil.
    To be commenced in strands afar remote.
    I Love U U love i.
    i i i i

    Sample Output:

    a:b 1
    a:time 1
    a:we 1
    accents:of 1
    accents:short-winded 1
    ad:da 1
    ad:dew 1
    afar:remote 1
    afar:strands 1
    and:breathe 1
    are:so 1
    are:we 1
    as:shaken 1
    as:we 1
    b:cc 1
    be:commenced 1
    be:to 1
    breathe:short-winded 1
    broil:new 1
    c:cc 1
    c:d 2
    care:with 1
    commenced:in 1
    d:d 1
    dd:ee 1
    dew:sa 1
    ee:f 1
    f:g 1
    find:we 1
    for:frighted 1
    for:time 1
    frighted:peace 1
    g:s 1
    i:i 3
    i:love 3
    in:strands 1
    love:u 3
    new:of 1
    pant:to 1
    peace:to 1
    s:sa 1
    shaken:so 1
    so:wan 1
    u:u 1
    wan:with 1

    Code:

    import java.io.DataInput;
    import java.io.DataOutput;
    import java.io.IOException;
    import java.util.Locale;
    import java.util.StringTokenizer;
    import java.util.regex.Pattern;
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.RawComparator;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.io.WritableComparable;
    import org.apache.hadoop.io.WritableComparator;
    import org.apache.hadoop.io.WritableUtils;
    import org.apache.hadoop.mapred.Reporter;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Partitioner;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.util.GenericOptionsParser;
    
    public class CoOccurrence {
    
    
      /**
       * An unordered pair of words used as the MapReduce key.
       *
       * <p>The two words are stored in canonical order (the smaller one, by
       * {@link String#compareTo}, first), so {@code (a, b)} and {@code (b, a)}
       * produce equal keys. The serialized form is simply the two {@link Text}
       * values written back to back.
       */
      public static class TextPair implements WritableComparable<TextPair> {
        private Text first;
        private Text second;

        /** Creates an empty pair, ready to be filled in by {@link #readFields}. */
        public TextPair() {
          set(new Text(), new Text());
        }

        /** Creates a pair from two strings; the order of the arguments does not matter. */
        public TextPair(String left, String right) {
          set(new Text(left), new Text(right));
        }

        /**
         * Creates a pair from two {@link Text} values; the order of the arguments
         * does not matter. The given objects are stored by reference, not copied.
         */
        public TextPair(Text left, Text right) {
          set(left, right);
        }

        /**
         * Stores the two words in canonical (sorted) order.
         *
         * @param left  one word of the pair (kept by reference)
         * @param right the other word of the pair (kept by reference)
         */
        public void set(Text left, Text right) {
          String l = left.toString();
          String r = right.toString();
          if (l.compareTo(r) <= 0) {
            this.first = left;
            this.second = right;
          } else {
            this.first = right;
            this.second = left;
          }
        }

        /** Returns the smaller word of the pair. */
        public Text getFirst() {
          return first;
        }

        /** Returns the larger word of the pair. */
        public Text getSecond() {
          return second;
        }

        /** Reads both words, in the order {@link #write} emitted them. */
        @Override
        public void readFields(DataInput in) throws IOException {
          first.readFields(in);
          second.readFields(in);
        }

        /** Writes the first word followed by the second word. */
        @Override
        public void write(DataOutput out) throws IOException {
          first.write(out);
          second.write(out);
        }

        /**
         * Combines the hashes of both words. The multiplier is an arbitrary odd
         * constant used only to mix the two values; any constant works as long as
         * equal pairs keep producing equal hashes.
         */
        @Override
        public int hashCode() {
          return first.hashCode() * 163 + second.hashCode();
        }

        @Override
        public boolean equals(Object o) {
          if (o instanceof TextPair) {
            TextPair tp = (TextPair) o;
            return first.equals(tp.first) && second.equals(tp.second);
          }
          return false;
        }

        /** Formats the pair as {@code first:second}. */
        @Override
        public String toString() {
          return first + ":" + second;
        }

        /** Orders pairs by the first word, then by the second word. */
        @Override
        public int compareTo(TextPair tp) {
          int cmp = first.compareTo(tp.first);
          if (cmp != 0) {
            return cmp;
          }
          return second.compareTo(tp.second);
        }

        /** A comparator that compares serialized TextPair records without deserializing them. */
        public static class Comparator extends WritableComparator {
          private static final Text.Comparator TEXT_COMPARATOR = new Text.Comparator();

          public Comparator() {
            super(TextPair.class);
          }

          /**
           * Compares two serialized pairs: first word, then second word.
           *
           * @param b1 buffer holding the first record
           * @param s1 offset of the first record in {@code b1}
           * @param l1 total length of the first record
           * @param b2 buffer holding the second record
           * @param s2 offset of the second record in {@code b2}
           * @param l2 total length of the second record
           */
          @Override
          public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
            try {
              // Serialized size of the first word = size of its vint length prefix + payload length.
              int firstl1 = WritableUtils.decodeVIntSize(b1[s1]) + readVInt(b1, s1);
              int firstl2 = WritableUtils.decodeVIntSize(b2[s2]) + readVInt(b2, s2);
              int cmp = TEXT_COMPARATOR.compare(b1, s1, firstl1, b2, s2, firstl2);
              if (cmp != 0) {
                return cmp;
              }
              // The remainder of each record is its second word; the length of the
              // second record's remainder must be derived from l2, not l1.
              return TEXT_COMPARATOR.compare(b1, s1 + firstl1, l1 - firstl1,
                                             b2, s2 + firstl2, l2 - firstl2);
            } catch (IOException e) {
              throw new IllegalArgumentException(e);
            }
          }
        }//End of Comparator

        static { // register this comparator
          WritableComparator.define(TextPair.class, new Comparator());
        }

        /**
         * Compares only the first word of each serialized pair, so that when used as a
         * grouping comparator reduce is called once for each value of the first word.
         */
        public static class FirstComparator extends WritableComparator {
          private static final Text.Comparator TEXT_COMPARATOR = new Text.Comparator();

          public FirstComparator() {
            super(TextPair.class);
          }

          @Override
          public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
            try {
              // Serialized size of the first word = size of its vint length prefix + payload length.
              int firstl1 = WritableUtils.decodeVIntSize(b1[s1]) + readVInt(b1, s1);
              int firstl2 = WritableUtils.decodeVIntSize(b2[s2]) + readVInt(b2, s2);
              return TEXT_COMPARATOR.compare(b1, s1, firstl1, b2, s2, firstl2);
            } catch (IOException e) {
              throw new IllegalArgumentException(e);
            }
          }
        }//End of FirstComparator
      }//End of TextPair
      
      //Partition based on the first part of the pair.
      /**
       * Partitions keys by the first word of the pair only, so every pair that
       * shares a first word is routed to the same reducer.
       */
      public static class FirstPartitioner extends Partitioner<TextPair,IntWritable>{
        /**
         * @param key           the word pair being routed
         * @param value         the count attached to the pair (ignored)
         * @param numPartitions number of reduce partitions
         * @return a partition index in {@code [0, numPartitions)}
         */
        @Override
        public int getPartition(TextPair key, IntWritable value, int numPartitions) {
          // Hash the first word itself. Masking off the sign bit keeps the result
          // non-negative even for Integer.MIN_VALUE, where Math.abs would not.
          return (key.getFirst().hashCode() & Integer.MAX_VALUE) % numPartitions;
        }
      }//End of FirstPartitioner
    
      /**
       * Emits {@code (pair, 1)} for every two adjacent words on an input line.
       *
       * <p>Every character other than ASCII letters, digits, {@code -} and {@code '}
       * is treated as a separator, and words are lower-cased before pairing.
       */
      public static class MyMapper extends Mapper<LongWritable, Text, TextPair, IntWritable> {
        private static final IntWritable ONE = new IntWritable(1);
        // Compiled once instead of on every record.
        private static final Pattern NON_WORD = Pattern.compile("[^a-zA-Z0-9-']");

        // Per-instance scratch objects, reused across records to avoid allocations.
        private final Text word0 = new Text();
        private final Text word1 = new Text();

        /**
         * @param inKey   offset of the line in the input (unused)
         * @param inValue one line of input text
         * @param context sink for the emitted {@code (TextPair, 1)} records
         */
        @Override
        public void map(LongWritable inKey, Text inValue, Context context)throws IOException, InterruptedException {
          // Locale.ROOT keeps lower-casing independent of the JVM's default locale.
          String line = NON_WORD.matcher(inValue.toString()).replaceAll(" ").toLowerCase(Locale.ROOT);

          // After the replacement the only separator left is the space. StringTokenizer
          // never yields empty tokens, so leading separators cannot create a pair
          // containing an empty word.
          StringTokenizer tokens = new StringTokenizer(line, " ");
          if (!tokens.hasMoreTokens()) {
            return;
          }
          String previous = tokens.nextToken();
          while (tokens.hasMoreTokens()) {
            String current = tokens.nextToken();
            word0.set(previous);
            word1.set(current);
            context.write(new TextPair(word0, word1), ONE);
            previous = current;
          }
        }
      }//End of MapClass
      /**
       * Adds up all counts received for a word pair and emits the pair with its total.
       * Input and output types are identical, so the same class also serves as the combiner.
       */
      public static class MyReducer extends Reducer<TextPair, IntWritable, TextPair, IntWritable> {
        // Output value holder, reused for every key.
        private final IntWritable result = new IntWritable();

        /**
         * @param inKey    the word pair
         * @param inValues partial counts for that pair
         * @param context  sink for the {@code (pair, total)} record
         */
        @Override
        public void reduce(TextPair inKey, Iterable<IntWritable> inValues, Context context)
            throws IOException, InterruptedException {
          int total = 0;
          for (IntWritable partial : inValues) {
            total = total + partial.get();
          }
          result.set(total);
          context.write(inKey, result);
        }
      }//End of MyReducer
      
      /**
       * Configures and runs the co-occurrence job.
       *
       * @param args generic Hadoop options followed by {@code <in> <out>} paths
       * @throws Exception if the job cannot be set up or submitted
       */
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Strip generic Hadoop options (-D, -fs, ...) so only the job's own arguments remain.
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length < 2) {
          System.err.println("Usage: CoOccurrence <in> <out>");
          System.exit(2);
        }
        Job job = new Job(conf, "Co-Occurrence");
        job.setJarByClass(CoOccurrence.class);

        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(TextPair.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Summing counts is associative, so the reducer doubles as the combiner.
        job.setCombinerClass(MyReducer.class);

        // FirstPartitioner and a first-word grouping comparator are intentionally not
        // installed: counts are per pair, so keys must be grouped on the whole pair.

        // The reduce output is (TextPair, IntWritable).
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(TextPair.class);
        job.setOutputValueClass(IntWritable.class);

        // Read the paths from the arguments left after generic-option parsing, not from raw args.
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
      }//End of main
    }//End of CoOccurrence
  • 相关阅读:
    收藏的博客
    MVC 之 System.Web.Optimization找不到引用
    SQL Server 之 附加数据库出现“ 拒绝访问 ”
    Android Studio 之 环境搭建
    PD 之 连接数据库并导出数据及生成PDM文件
    Windows 之 可以Ping通服务器但无法使用服务器连接的共享打印机
    JQuery 之 在数据加载完成后才自动执行函数
    JavaScript 之 动态加载JS代码或JS文件
    JQuery 之 动态加载JS或JS文件
    JavaScript 之 解码类似eval(function(p,a,c,k,e,d){}))的JavaScript代码
  • 原文地址:https://www.cnblogs.com/aukle/p/3215028.html
Copyright © 2020-2023  润新知