• MapReduce清洗日志数据统计PV量


      1 package mapreduce.webpv;
      2 
      3 import java.io.IOException;
      4 import org.apache.commons.lang.StringUtils;
      5 import org.apache.hadoop.conf.Configuration;
      6 import org.apache.hadoop.conf.Configured;
      7 import org.apache.hadoop.fs.Path;
      8 import org.apache.hadoop.io.IntWritable;
      9 import org.apache.hadoop.io.LongWritable;
     10 import org.apache.hadoop.io.Text;
     11 import org.apache.hadoop.mapreduce.Job;
     12 import org.apache.hadoop.mapreduce.Mapper;
     13 import org.apache.hadoop.mapreduce.Reducer;
     14 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
     15 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
     16 import org.apache.hadoop.util.Tool;
     17 import org.apache.hadoop.util.ToolRunner;
     18 
     19 public class WebPvMapReduce extends Configured implements Tool {
     20 
     21     // step 1: Mapper
     22     public static class WebPvMapper extends
     23             Mapper<LongWritable, Text, IntWritable, IntWritable> {
     24         private IntWritable mapOutputKey = new IntWritable();
     25         private IntWritable mapOutputValue = new IntWritable(1);
     26 
     27         @Override
     28         public void map(LongWritable key, Text value, Context context)
     29                 throws IOException, InterruptedException {
     30 
     31             // line value
     32             String lineValue = value.toString();
     33 
     34             // spilt
     35             String[] values = lineValue.split("	");
     36 
     37             // url
     38             String urlValue = values[1];
     39 
     40             if (StringUtils.isBlank(urlValue)) {
     41                 // conuter
     42                 context.getCounter("WEBPVMAPPER_CUUNTERS", "URL_BLANK")
     43                         .increment(1L);
     44                 return;
     45             }
     46 
     47             if (30 > values.length) {
     48 
     49                 // conuter
     50                 context.getCounter("WEBPVMAPPER_CUUNTERS", "LENGTH_LT_30")
     51                         .increment(1L);
     52 
     53                 return;
     54             }
     55 
     56             // province id
     57             String provinceIdValue = values[23];
     58 
     59             if (StringUtils.isBlank(provinceIdValue)) {
     60                 // conuter
     61                 context.getCounter("WEBPVMAPPER_CUUNTERS", "PROVINCEID_BLANK")
     62                         .increment(1L);
     63                 return;
     64             }
     65 
     66             Integer provinceId = Integer.MAX_VALUE;
     67             try {
     68                 provinceId = Integer.valueOf(provinceIdValue);
     69             } catch (Exception e) {
     70                 // conuter
     71                 context.getCounter("WEBPVMAPPER_CUUNTERS",
     72                         "PROVINCEID_NOT_NUMBER").increment(1L);
     73                 return;
     74             }
     75 
     76             // map outpu key
     77             mapOutputKey.set(provinceId);
     78 
     79             context.write(mapOutputKey, mapOutputValue);
     80         }
     81     }
     82 
     83     // step 2: Reducer
     84     public static class WebPvReducer extends
     85             Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {
     86         private IntWritable outputValue = new IntWritable();
     87 
     88         @Override
     89         protected void reduce(IntWritable key, Iterable<IntWritable> values,
     90                 Context context) throws IOException, InterruptedException {
     91             // temp sum
     92             int sum = 0;
     93 
     94             // iterator
     95             for (IntWritable value : values) {
     96                 sum += value.get();
     97             }
     98 
     99             // set output
    100             outputValue.set(sum);
    101 
    102             context.write(key, outputValue);
    103         }
    104     }
    105 
    106     // step 3: Driver
    107     public int run(String[] args) throws Exception {
    108 
    109         Configuration configuration = this.getConf();
    110 
    111         Job job = Job.getInstance(configuration, this.getClass()
    112                 .getSimpleName());
    113         job.setJarByClass(WebPvMapReduce.class);
    114 
    115         // set job
    116         // input
    117         Path inpath = new Path(args[0]);
    118         FileInputFormat.addInputPath(job, inpath);
    119 
    120         // output
    121         Path outPath = new Path(args[1]);
    122         FileOutputFormat.setOutputPath(job, outPath);
    123 
    124         // Mapper
    125         job.setMapperClass(WebPvMapper.class);
    126         job.setMapOutputKeyClass(IntWritable.class);
    127         job.setMapOutputValueClass(IntWritable.class);
    128 
    129         // Reducer
    130         job.setReducerClass(WebPvReducer.class);
    131         job.setOutputKeyClass(IntWritable.class);
    132         job.setOutputValueClass(IntWritable.class);
    133 
    134         // submit job -> YARN
    135         boolean isSuccess = job.waitForCompletion(true);
    136         return isSuccess ? 0 : 1;
    137     }
    138 
    139     public static void main(String[] args) throws Exception {
    140 
    141         Configuration configuration = new Configuration();
    142 
    143         args = new String[] {
    144                 "hdfs://beifeng01:8020//user/beifeng01/mapreduce/input/testdata/2015082818",
    145                 "hdfs://beifeng01:8020//user/beifeng01/mapreduce/output1" };
    146 
    147         int status = ToolRunner.run(configuration, new WebPvMapReduce(), args);
    148 
    149         // exit program
    150         System.exit(status);
    151     }
    152 }

     查看结果

     1 $ bin/hdfs dfs -text /user/beifeng01/mapreduce/output1/pa*
     2 1       3527
     3 2       1672
     4 3       511
     5 4       325
     6 5       776
     7 6       661
     8 7       95
     9 8       80
    10 9       183
    11 10      93
    12 11      135
    13 12      289
    14 13      264
    15 14      374
    16 15      163
    17 16      419
    18 17      306
    19 18      272
    20 19      226
    21 20      2861
    22 21      124
    23 22      38
    24 23      96
    25 24      100
    26 25      20
    27 26      157
    28 27      49
    29 28      21
    30 29      85
    31 30      42
    32 32      173
  • 相关阅读:
    洛谷 P1567 统计天数【最长上升子序列/断则归一】
    洛谷 P3742 umi的函数【构造】
    洛谷 P1036 选数【背包型DFS/选or不选】
    nyoj zb的生日【背包型DFS/选or不选】
    POJ 3628 Bookshelf 2【背包型DFS/选or不选】
    【AHOI2013复仇】从一道题来看DFS及其优化的一般步骤和数组分层问题【转】
    洛谷 P1217 [USACO1.5]回文质数 Prime Palindromes【取回文数/数论/字符串】
    洛谷 P1004 方格取数 【多线程DP/四维DP/】
    Codeforces Round #449 (Div. 2) B. Chtholly's request【偶数位回文数】
    Codeforces Round #449 (Div. 2) A. Scarborough Fair【多次区间修改字符串】
  • 原文地址:https://www.cnblogs.com/perfectdata/p/10103171.html
Copyright © 2020-2023  润新知