• A MapReduce implementation of the classic "people who viewed this item also viewed" feature


    Reposted from: http://blog.csdn.net/u011750989/article/details/12004065

    Input:

    date    cookie id    item id
    xx      xx           xx

    Output:

    item id    item id list (sorted by relevance, comma-separated)
    xx         xx

    For example:

    id1    id3,id0,id4,id2
    id2    id0,id5

    The whole computation takes four steps.

    1. Extract the date, cookie id, and item id from the raw logs, computed per day. The output format is:

    item id-0    item id-1
    xx           xx

    One optimization here: item id-0 is always smaller than item id-1, which halves the storage; the pairs are transposed back during the final aggregation. The reducer also does local sorting and deduplication.

    2. Aggregate the previous step's output, still per day:

    item id-0    item id-1    association value (the number of users who viewed both items)
    xx           xx           xx

    3. Aggregate the most recent three months of data, applying time decay so that the older a record is, the less it contributes to the association value; output the association value for every item pair, including the transposed pairs (a worked sketch of the decay factor follows this list).

    4. Pivot rows to columns to produce the final recommendation output, with each item's list sorted by association value.
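
    To make step 3 concrete, here is a minimal standalone sketch of the decay factor, using the same formula that appears in the third MR below (1 / (1 + (day - 1) / 10)): yesterday's data counts in full, while a 90-day-old record contributes roughly a tenth.

    // Minimal sketch of the time-decay weighting from step 3.
    public class DecayDemo {
        public static void main(String[] args) {
            for (int day : new int[] {1, 10, 30, 90}) {
                float factor = 1 / (1 + (float) (day - 1) / 10);
                System.out.printf("day=%d factor=%.3f%n", day, factor);
                // day=1 -> 1.000, day=10 -> 0.526, day=30 -> 0.256, day=90 -> 0.101
            }
        }
    }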

    The first MR

    import java.io.IOException;
    import java.util.ArrayList;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.io.WritableComparable;
    import org.apache.hadoop.io.WritableComparator;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Partitioner;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.util.GenericOptionsParser;
    import org.apache.log4j.Logger;

    /*
     * Input: raw log data, may contain duplicates
     *   date  cookie  house id
     *
     * Output:
     *   date  houseid1  houseid2   // houseid1 is always smaller than houseid2;
     *                              // records are grouped by (date, cookie)
     */
    public class HouseMergeAndSplit {

        // Partition on (date, cookie) so that all records of one user on one
        // day reach the same reducer.
        public static class Partitioner1 extends Partitioner<TextPair, Text> {
            @Override
            public int getPartition(TextPair key, Text value, int numParititon) {
                return Math.abs((new Text(key.getFirst().toString()
                        + key.getSecond().toString())).hashCode() * 127) % numParititon;
            }
        }

        // Grouping comparator: compare only (first, second) = (date, cookie),
        // so one reduce() call receives all house ids of one user on one day,
        // already sorted by the full key.
        public static class Comp1 extends WritableComparator {
            public Comp1() {
                super(TextPair.class, true);
            }

            @SuppressWarnings("unchecked")
            public int compare(WritableComparable a, WritableComparable b) {
                TextPair t1 = (TextPair) a;
                TextPair t2 = (TextPair) b;
                int comp = t1.getFirst().compareTo(t2.getFirst());
                if (comp != 0)
                    return comp;
                return t1.getSecond().compareTo(t2.getSecond());
            }
        }

        public static class TokenizerMapper
                extends Mapper<LongWritable, Text, TextPair, Text> {

            Text val = new Text("test"); // placeholder value; only the key carries data

            public void map(LongWritable key, Text value, Context context)
                    throws IOException, InterruptedException {
                // Fields separated by the \001 control character (Hive's default delimiter).
                String s[] = value.toString().split("\001");
                TextPair tp = new TextPair(s[0], s[1], s[4] + s[3]); // thedate, cookie, city+houseid
                context.write(tp, val);
            }
        }

        public static class IntSumReducer
                extends Reducer<TextPair, Text, Text, Text> {

            private static String comparedColumn[] = new String[3];
            ArrayList<String> houselist = new ArrayList<String>();
            private static Text keyv = new Text();
            private static Text valuev = new Text();
            static Logger logger = Logger.getLogger(HouseMergeAndSplit.class.getName());

            public void reduce(TextPair key, Iterable<Text> values, Context context)
                    throws IOException, InterruptedException {

                houselist.clear();
                String thedate = key.getFirst().toString();
                String cookie = key.getSecond().toString();

                for (int i = 0; i < 3; i++)
                    comparedColumn[i] = "";

                // (first, second) is the grouping key; a new reduce() call starts
                // whenever it changes. While iterating, Hadoop updates the key
                // object in place, so key.getThree() reflects the current record.
                for (Text val : values) {
                    if (thedate.equals(comparedColumn[0]) && cookie.equals(comparedColumn[1])
                            && !key.getThree().toString().equals(comparedColumn[2])) {
                        // same (date, cookie), new house id: record it (dedup)
                        houselist.add(key.getThree().toString());
                        comparedColumn[0] = key.getFirst().toString();
                        comparedColumn[1] = key.getSecond().toString();
                        comparedColumn[2] = key.getThree().toString();
                    }

                    if (!thedate.equals(comparedColumn[0]) || !cookie.equals(comparedColumn[1])) {
                        // first record of this (date, cookie) group
                        houselist.add(key.getThree().toString());
                        comparedColumn[0] = key.getFirst().toString();
                        comparedColumn[1] = key.getSecond().toString();
                        comparedColumn[2] = key.getThree().toString();
                    }
                }

                keyv.set(comparedColumn[0]); // the date

                // Emit every pair of houses this user viewed on this day.
                // Because houselist is already sorted, houseid1 < houseid2 holds.
                // A single space separates the pair so downstream jobs can split on " ".
                for (int i = 0; i < houselist.size() - 1; i++) {
                    for (int j = i + 1; j < houselist.size(); j++) {
                        valuev.set(houselist.get(i) + " " + houselist.get(j)); // associated houses
                        context.write(keyv, valuev);
                    }
                }
            }
        }

        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();
            String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
            if (otherArgs.length != 2) {
                System.err.println("Usage: HouseMergeAndSplit <in> <out>");
                System.exit(2);
            }

            FileSystem fstm = FileSystem.get(conf);
            Path outDir = new Path(otherArgs[1]);
            fstm.delete(outDir, true); // remove any previous output

            conf.set("mapred.textoutputformat.separator", " "); // separator between key and value in the reduce output
            Job job = new Job(conf, "HouseMergeAndSplit");
            job.setNumReduceTasks(4);
            job.setJarByClass(HouseMergeAndSplit.class);
            job.setMapperClass(TokenizerMapper.class);

            job.setMapOutputKeyClass(TextPair.class);
            job.setMapOutputValueClass(Text.class);
            // set the partitioner
            job.setPartitionerClass(Partitioner1.class);
            // group by (date, cookie) after partitioning
            job.setGroupingComparatorClass(Comp1.class);
            // set the reducer and its output types
            job.setReducerClass(IntSumReducer.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);
            FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
            FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        }
    }
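
    To see what this job emits, here is a small local sketch (plain Java, no Hadoop; the ids and date are made up for illustration) that mimics the reducer's nested-loop pair generation for one (date, cookie) group:

    import java.util.Arrays;
    import java.util.List;

    // Local sketch: given the deduplicated, sorted house ids one cookie viewed
    // on one day, emit every pair with the smaller id first -- exactly what
    // IntSumReducer's nested loop does.
    public class PairDemo {
        public static void main(String[] args) {
            List<String> houselist = Arrays.asList("id1", "id3", "id7");
            for (int i = 0; i < houselist.size() - 1; i++) {
                for (int j = i + 1; j < houselist.size(); j++) {
                    System.out.println("2013-09-01 " + houselist.get(i) + " " + houselist.get(j));
                }
            }
            // prints:
            // 2013-09-01 id1 id3
            // 2013-09-01 id1 id7
            // 2013-09-01 id3 id7
        }
    }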

    TextPair

    import java.io.DataInput;
    import java.io.DataOutput;
    import java.io.IOException;

    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.io.WritableComparable;

    // Composite key of three Text fields: (date, cookie, city+houseid).
    public class TextPair implements WritableComparable<TextPair> {
        private Text first;
        private Text second;
        private Text three;

        public TextPair() {
            set(new Text(), new Text(), new Text());
        }

        public TextPair(String first, String second, String three) {
            set(new Text(first), new Text(second), new Text(three));
        }

        public TextPair(Text first, Text second, Text three) {
            set(first, second, three);
        }

        public void set(Text first, Text second, Text three) {
            this.first = first;
            this.second = second;
            this.three = three;
        }

        public Text getFirst() {
            return first;
        }

        public Text getSecond() {
            return second;
        }

        public Text getThree() {
            return three;
        }

        public void write(DataOutput out) throws IOException {
            first.write(out);
            second.write(out);
            three.write(out);
        }

        public void readFields(DataInput in) throws IOException {
            first.readFields(in);
            second.readFields(in);
            three.readFields(in);
        }

        // Sort by first, then second, then three.
        public int compareTo(TextPair tp) {
            int cmp = first.compareTo(tp.first);
            if (cmp != 0) {
                return cmp;
            }
            cmp = second.compareTo(tp.second);
            if (cmp != 0) {
                return cmp;
            }
            return three.compareTo(tp.three);
        }
    }
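
    Note that TextPair does not override hashCode() or equals(). The job above works because it sets its own Partitioner1, but if the class were ever used with Hadoop's default HashPartitioner, overrides along the following lines would be needed. This is a sketch, not part of the original post:

    // Sketch: conventional overrides for a Writable key; not required by the
    // jobs above, which supply a custom partitioner.
    @Override
    public int hashCode() {
        return first.hashCode() * 163 + second.hashCode() * 31 + three.hashCode();
    }

    @Override
    public boolean equals(Object o) {
        if (o instanceof TextPair) {
            TextPair tp = (TextPair) o;
            return first.equals(tp.first) && second.equals(tp.second) && three.equals(tp.three);
        }
        return false;
    }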


    TextPairSecond

    import java.io.DataInput;
    import java.io.DataOutput;
    import java.io.IOException;

    import org.apache.hadoop.io.FloatWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.io.WritableComparable;

    // Composite key of (house id, association weight), used by the fourth MR
    // to sort each house's associated houses by weight.
    public class TextPairSecond implements WritableComparable<TextPairSecond> {
        private Text first;
        private FloatWritable second;

        public TextPairSecond() {
            set(new Text(), new FloatWritable());
        }

        public TextPairSecond(String first, float second) {
            set(new Text(first), new FloatWritable(second));
        }

        public TextPairSecond(Text first, FloatWritable second) {
            set(first, second);
        }

        public void set(Text first, FloatWritable second) {
            this.first = first;
            this.second = second;
        }

        public Text getFirst() {
            return first;
        }

        public FloatWritable getSecond() {
            return second;
        }

        public void write(DataOutput out) throws IOException {
            first.write(out);
            second.write(out);
        }

        public void readFields(DataInput in) throws IOException {
            first.readFields(in);
            second.readFields(in);
        }

        // Sort by house id, then weight (the fourth MR negates this to get
        // descending weight).
        public int compareTo(TextPairSecond tp) {
            int cmp = first.compareTo(tp.first);
            if (cmp != 0) {
                return cmp;
            }
            return second.compareTo(tp.second);
        }
    }

    The second MR

    import java.io.IOException;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.util.GenericOptionsParser;

    /*
     * Count how often each pair of houses is viewed together.
     * Input:
     *   date  house1  house2
     *
     * Output:
     *   date  house1  house2  co-occurrence count
     */
    public class HouseCount {

        public static class TokenizerMapper
                extends Mapper<LongWritable, Text, Text, IntWritable> {

            IntWritable iw = new IntWritable(1);

            public void map(LongWritable key, Text value, Context context)
                    throws IOException, InterruptedException {
                // The whole line "date house1 house2" is the key; emit a 1 per occurrence.
                context.write(value, iw);
            }
        }

        public static class IntSumReducer
                extends Reducer<Text, IntWritable, Text, IntWritable> {

            IntWritable result = new IntWritable();

            public void reduce(Text key, Iterable<IntWritable> values, Context context)
                    throws IOException, InterruptedException {
                int sum = 0;
                for (IntWritable iw : values) {
                    sum += iw.get();
                }
                result.set(sum);
                context.write(key, result);
            }
        }

        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();
            String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
            if (otherArgs.length != 2) {
                System.err.println("Usage: HouseCount <in> <out>");
                System.exit(2);
            }

            FileSystem fstm = FileSystem.get(conf);
            Path outDir = new Path(otherArgs[1]);
            fstm.delete(outDir, true);

            conf.set("mapred.textoutputformat.separator", " "); // separator between key and value in the reduce output
            Job job = new Job(conf, "HouseCount");
            job.setNumReduceTasks(2);
            job.setJarByClass(HouseCount.class);
            job.setMapperClass(TokenizerMapper.class);

            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(IntWritable.class);

            // set the reducer and its output types
            job.setReducerClass(IntSumReducer.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
            FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        }
    }
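
    Since this reduce is a plain associative sum, one small optimization the original post does not apply would be to reuse IntSumReducer as a combiner, so counts are partially summed map-side and shuffle traffic shrinks:

    // Optional: sum counts map-side before the shuffle. Safe here because
    // addition is associative and the combiner's input and output types
    // match the mapper's output types.
    job.setCombinerClass(IntSumReducer.class);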


    The third MR

    import java.io.IOException;
    import java.text.ParseException;
    import java.text.SimpleDateFormat;
    import java.util.Calendar;
    import java.util.Date;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.FloatWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.util.GenericOptionsParser;

    /*
     * Aggregate the last three months of co-occurrence counts with a decay
     * factor, and also emit each pair "a b" reversed once as "b a".
     * Input:
     *   date  house1  house2  co-occurrence count
     *
     * Output:
     *   house1  house2  co-occurrence count (decayed; the factor differs per day)
     */
    public class HouseCountHz {

        public static class HouseCountHzMapper
                extends Mapper<LongWritable, Text, Text, FloatWritable> {

            Text keyv = new Text();
            FloatWritable valuev = new FloatWritable();

            public void map(LongWritable key, Text value, Context context)
                    throws IOException, InterruptedException {

                String[] s = value.toString().split(" ");
                keyv.set(s[1] + " " + s[2]); // house1, house2

                Calendar date1 = Calendar.getInstance(); // now
                Calendar d2 = Calendar.getInstance();

                Date b = null;
                SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
                try {
                    b = sdf.parse(s[0]);
                } catch (ParseException e) {
                    e.printStackTrace();
                }
                d2.setTime(b);
                long n = date1.getTimeInMillis();
                long birth = d2.getTimeInMillis();
                long sss = n - birth;
                int day = (int) (sss / (3600 * 24 * 1000)); // days between the record's date and today
                float factor = 1 / (1 + (float) (day - 1) / 10); // decay factor: 1.0 at day == 1, smaller for older records
                valuev.set(Float.parseFloat(s[3]) * factor);

                context.write(keyv, valuev);
            }
        }

        public static class HouseCountHzReducer
                extends Reducer<Text, FloatWritable, Text, FloatWritable> {

            FloatWritable result = new FloatWritable();
            Text keyreverse = new Text();

            public void reduce(Text key, Iterable<FloatWritable> values, Context context)
                    throws IOException, InterruptedException {

                float sum = 0;
                for (FloatWritable iw : values) {
                    sum += iw.get();
                }
                result.set(sum);
                // Emit the pair in both directions so the next job can build a
                // recommendation list for each house. A single space separates
                // the ids so the next job can split on " ".
                String[] keys = key.toString().split(" ");
                keyreverse.set(keys[1] + " " + keys[0]);
                context.write(key, result);
                context.write(keyreverse, result);
            }
        }

        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();
            String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
            if (otherArgs.length != 2) {
                System.err.println("Usage: HouseCountHz <in> <out>");
                System.exit(2);
            }

            FileSystem fstm = FileSystem.get(conf);
            Path outDir = new Path(otherArgs[1]);
            fstm.delete(outDir, true);

            conf.set("mapred.textoutputformat.separator", " "); // separator between key and value in the reduce output
            Job job = new Job(conf, "HouseCountHz");
            job.setNumReduceTasks(2);
            job.setJarByClass(HouseCountHz.class);
            job.setMapperClass(HouseCountHzMapper.class);

            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(FloatWritable.class);

            // set the reducer and its output types
            job.setReducerClass(HouseCountHzReducer.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(FloatWritable.class);
            FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
            FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        }
    }
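
    One detail worth noting: HouseCountHzMapper builds a SimpleDateFormat and reads the clock for every record. A hedged refactor sketch, moving the per-task constants into setup() so map() reuses them (behavior is otherwise unchanged):

    // Sketch: initialize per-task state once instead of once per record.
    private SimpleDateFormat sdf;
    private long now;

    @Override
    protected void setup(Context context) {
        sdf = new SimpleDateFormat("yyyy-MM-dd");
        now = System.currentTimeMillis(); // fixed "today" for the whole task
    }
    // map() would then parse s[0] with the shared sdf and compute the day
    // difference against now, instead of creating Calendars each call.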


    The fourth MR

    import java.io.IOException;
    import java.util.Iterator;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.FloatWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.io.WritableComparable;
    import org.apache.hadoop.io.WritableComparator;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Partitioner;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.util.GenericOptionsParser;

    /*
     * Input:
     *   house1  house2  co-occurrence count
     *
     * Output:
     *   house1  house2,house3,house4  (sorted by count, descending)
     */
    public class HouseRowToCol {

        // Partition on the house id only, so that all weights for one house
        // reach the same reducer; partitioning on (house id, weight) would
        // scatter one house's records across reducers and split its list.
        public static class Partitioner1 extends Partitioner<TextPairSecond, Text> {
            @Override
            public int getPartition(TextPairSecond key, Text value, int numParititon) {
                return Math.abs(key.getFirst().hashCode() * 127) % numParititon;
            }
        }

        // Grouping comparator: group only on the house id, so each reduce()
        // call sees all of one house's associated houses.
        public static class Comp1 extends WritableComparator {
            public Comp1() {
                super(TextPairSecond.class, true);
            }

            @SuppressWarnings("unchecked")
            public int compare(WritableComparable a, WritableComparable b) {
                TextPairSecond t1 = (TextPairSecond) a;
                TextPairSecond t2 = (TextPairSecond) b;
                return t1.getFirst().compareTo(t2.getFirst());
            }
        }

        // Sort comparator: house id ascending, then weight descending (note
        // the negation), so each group's values arrive best-first.
        public static class KeyComp extends WritableComparator {
            public KeyComp() {
                super(TextPairSecond.class, true);
            }

            @SuppressWarnings("unchecked")
            public int compare(WritableComparable a, WritableComparable b) {
                TextPairSecond t1 = (TextPairSecond) a;
                TextPairSecond t2 = (TextPairSecond) b;
                int comp = t1.getFirst().compareTo(t2.getFirst());
                if (comp != 0)
                    return comp;
                return -t1.getSecond().compareTo(t2.getSecond());
            }
        }

        public static class HouseRowToColMapper
                extends Mapper<LongWritable, Text, TextPairSecond, Text> {

            Text houseid1 = new Text();
            Text houseid2 = new Text();
            FloatWritable weight = new FloatWritable();

            public void map(LongWritable key, Text value, Context context)
                    throws IOException, InterruptedException {

                String s[] = value.toString().split(" ");

                weight.set(Float.parseFloat(s[2]));
                houseid1.set(s[0]);
                houseid2.set(s[1]);
                TextPairSecond tp = new TextPairSecond(houseid1, weight);
                context.write(tp, houseid2);
            }
        }

        public static class HouseRowToColReducer
                extends Reducer<TextPairSecond, Text, Text, Text> {

            Text valuev = new Text();

            public void reduce(TextPairSecond key, Iterable<Text> values, Context context)
                    throws IOException, InterruptedException {
                Text keyv = key.getFirst();
                // Values arrive sorted by weight descending; join them into a
                // comma-separated recommendation list.
                Iterator<Text> it = values.iterator();
                StringBuilder sb = new StringBuilder(it.next().toString());
                while (it.hasNext()) {
                    sb.append(",").append(it.next().toString());
                }
                valuev.set(sb.toString());
                context.write(keyv, valuev);
            }
        }

        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();
            String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
            if (otherArgs.length != 2) {
                System.err.println("Usage: HouseRowToCol <in> <out>");
                System.exit(2);
            }

            FileSystem fstm = FileSystem.get(conf);
            Path outDir = new Path(otherArgs[1]);
            fstm.delete(outDir, true);

            conf.set("mapred.textoutputformat.separator", " "); // separator between key and value in the reduce output
            Job job = new Job(conf, "HouseRowToCol");
            job.setNumReduceTasks(4);
            job.setJarByClass(HouseRowToCol.class);
            job.setMapperClass(HouseRowToColMapper.class);

            job.setMapOutputKeyClass(TextPairSecond.class);
            job.setMapOutputValueClass(Text.class);
            // set the partitioner
            job.setPartitionerClass(Partitioner1.class);
            // group by house id after partitioning
            job.setGroupingComparatorClass(Comp1.class);
            job.setSortComparatorClass(KeyComp.class);
            // set the reducer and its output types
            job.setReducerClass(HouseRowToColReducer.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);
            FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
            FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        }
    }
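
    The four jobs form a pipeline, each reading the previous job's output directory. A minimal sketch of running them in order from the shell; the jar name and HDFS paths here are made up for illustration:

    hadoop jar recommend.jar HouseMergeAndSplit /logs/raw      /tmp/step1
    hadoop jar recommend.jar HouseCount         /tmp/step1     /tmp/step2
    hadoop jar recommend.jar HouseCountHz       /tmp/step2     /tmp/step3
    hadoop jar recommend.jar HouseRowToCol      /tmp/step3     /output/recommend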