I: Preparation
1. Dimensions of the statistics
guid
tracktime
province
2. Key and value design
key: date + province_guid
value: NullWritable
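A minimal sketch of how this composite key could be assembled (the date, province, and guid values below are made up for illustration; the real program extracts them from the log line):

public class KeyDesignSketch {
    public static void main(String[] args) {
        String date = "2015-01-01";   // hypothetical date parsed from tracktime
        String provinceId = "12";     // hypothetical province id
        String guid = "abc-123";      // hypothetical user guid
        // key: date + province + guid; the value carries no payload, so NullWritable is enough
        String mapOutputKey = date + " " + provinceId + "_" + guid;
        System.out.println(mapOutputKey); // -> "2015-01-01 12_abc-123"
    }
}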
3. Case analysis
No matter how many times a given user from a given province visits the site on a given day, it is counted as a single visit.
UV: the total number of distinct users who visited the pages → deduplicate users by their user ID (guid).
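To make the dedup semantics concrete, here is a small sketch (with made-up records) that computes UV with a HashSet; the MapReduce job achieves the same effect at scale through shuffle grouping:

import java.util.HashSet;
import java.util.Set;

public class UvSketch {
    public static void main(String[] args) {
        // hypothetical "date provinceId_guid" records; user u1 visits twice on the same day
        String[] records = {
            "2015-01-01 12_u1",
            "2015-01-01 12_u1", // repeat visit by the same user: must not be counted again
            "2015-01-01 12_u2"
        };
        Set<String> unique = new HashSet<String>();
        for (String r : records) {
            unique.add(r); // the set keeps exactly one copy per (date, province, guid)
        }
        System.out.println("UV = " + unique.size()); // prints: UV = 2
    }
}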
II: Program
1. The map program
2. The reduce program
3. Results
4. Key points
1) How deduplication works
The key takes the form date+province_guid.
When records share the same guid (and the same date and province), their keys are identical; the shuffle's group step gathers identical keys together, and the value is NullWritable, so no value data is carried at all.
So by the time the data reaches the reducer, it has already been deduplicated: each distinct key arrives exactly once.
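A local simulation of the group step (using a TreeMap to stand in for the shuffle; the keys are the same hypothetical ones as above) shows why duplicates disappear before reduce() runs:

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

public class GroupSketch {
    public static void main(String[] args) {
        // duplicate map outputs for the same key, as they look before the shuffle
        String[] mapOutputKeys = {"2015-01-01 12_u1", "2015-01-01 12_u1", "2015-01-01 12_u2"};
        Map<String, List<Object>> groups = new TreeMap<String, List<Object>>();
        for (String k : mapOutputKeys) {
            if (!groups.containsKey(k)) {
                groups.put(k, new ArrayList<Object>());
            }
            groups.get(k).add(null); // stand-in for NullWritable: the value is never read
        }
        // one reduce() invocation per distinct key, so duplicates are already gone
        for (String k : groups.keySet()) {
            System.out.println("reduce(" + k + ")");
        }
    }
}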
2) NullWritable.get()
Returns the shared NullWritable singleton. The class has a private constructor, so get() is the only way to obtain an instance; no new object is allocated per call.
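A quick check of the singleton behavior (standard Hadoop API, not part of the job itself):

import org.apache.hadoop.io.NullWritable;

public class NullWritableDemo {
    public static void main(String[] args) {
        NullWritable a = NullWritable.get();
        NullWritable b = NullWritable.get();
        System.out.println(a == b); // true: get() always returns the same shared instance
    }
}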
5. The complete program
package com.senior.network;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class WebUvCount extends Configured implements Tool {

    // Mapper: emit one "date provinceId_guid" key per valid log line;
    // the value carries no payload, so NullWritable is used
    public static class WebUvCountMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

        private Text mapOutputKey = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String lineValue = value.toString();
            String[] strs = lineValue.split(" ");
            // skip malformed lines and track them with a custom counter
            if (strs.length < 30) {
                context.getCounter("webUvMapper_counter", "length_LT_30").increment(1L);
                return;
            }
            String guidValue = strs[5];               // field 5: guid
            if (StringUtils.isEmpty(guidValue)) {
                return;
            }
            String trackTimeValue = strs[17];         // field 17: tracktime
            if (StringUtils.isEmpty(trackTimeValue)) {
                return;
            }
            String dateValue = trackTimeValue.substring(0, 10); // yyyy-MM-dd portion
            String provinceIdValue = strs[23];        // field 23: province id

            // keep the record only if the province id is a valid integer
            try {
                Integer.parseInt(provinceIdValue);
            } catch (Exception e) {
                return;
            }

            mapOutputKey.set(dateValue + " " + provinceIdValue + "_" + guidValue);
            context.write(mapOutputKey, NullWritable.get());
        }
    }

    // Reducer: each distinct "date provinceId_guid" key arrives exactly once, so every
    // reduce() call represents one unique visitor; per-(date, province) totals are
    // accumulated in a map and written out in cleanup()
    public static class WebUvCountReducer extends Reducer<Text, NullWritable, Text, IntWritable> {

        private Text outputKey = new Text();
        private IntWritable outputValue = new IntWritable();
        private Map<String, Integer> dateMap;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            dateMap = new HashMap<String, Integer>();
        }

        @Override
        protected void reduce(Text key, Iterable<NullWritable> values, Context context)
                throws IOException, InterruptedException {
            // the part before "_" is "date provinceId"
            String dateProvince = key.toString().split("_")[0];
            if (dateMap.containsKey(dateProvince)) {
                dateMap.put(dateProvince, dateMap.get(dateProvince) + 1);
            } else {
                dateMap.put(dateProvince, 1);
            }
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            for (String dateProvince : dateMap.keySet()) {
                outputKey.set(dateProvince);
                outputValue.set(dateMap.get(dateProvince));
                context.write(outputKey, outputValue);
            }
        }
    }

    // Driver
    public int run(String[] args) throws Exception {
        Configuration conf = this.getConf();
        Job job = Job.getInstance(conf, this.getClass().getSimpleName());
        job.setJarByClass(WebUvCount.class);

        // input
        Path inPath = new Path(args[0]);
        FileInputFormat.addInputPath(job, inPath);

        // output
        Path outPath = new Path(args[1]);
        FileOutputFormat.setOutputPath(job, outPath);

        // map
        job.setMapperClass(WebUvCountMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);

        // shuffle: defaults are used

        // reduce
        job.setReducerClass(WebUvCountReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // submit
        boolean isSuccess = job.waitForCompletion(true);
        return isSuccess ? 0 : 1;
    }

    // main
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // compress intermediate map output with Snappy
        conf.set("mapreduce.map.output.compress", "true");
        conf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec");
        // hard-coded HDFS paths; these override anything passed on the command line
        args = new String[]{
            "hdfs://linux-hadoop01.ibeifeng.com:8020/user/beifeng/mapreduce/wordcount/inputWebData",
            "hdfs://linux-hadoop01.ibeifeng.com:8020/user/beifeng/mapreduce/wordcount/outputWebData6"
        };
        int status = ToolRunner.run(new WebUvCount(), args);
        System.exit(status);
    }
}
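Assuming the class is packaged into a jar (the jar name here is hypothetical), the job can be submitted with hadoop jar webuvcount.jar com.senior.network.WebUvCount; note that main() overwrites args with hard-coded HDFS paths, so any paths passed on the command line are ignored. Each output line is a "date provinceId" pair, a tab, and that day's UV for that province.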