• 学习日志---9


    日志分析---从10000条数据中统计各个浏览器占比,数据格式如下

    183.162.52.7 - - [10/Nov/2016:00:01:02 +0800] "POST /api3/getadv HTTP/1.1" 200 813 "www.neusoft.com" "-" cid=0&timestamp=1478707261865&uid=2871142&marking=androidbanner&secrect=a6e8e14701ffe9f6063934780d9e2e6d&token=f51e97d1cb1a9caac669ea8acc162b96 "neuedu/5.0.0 (Android 5.1.1; Xiaomi Redmi 3 Build/LMY47V),Network 2G/3G" "-" 10.100.134.244:80 200 0.027 0.027
    10.100.0.1 - - [10/Nov/2016:00:01:02 +0800] "HEAD / HTTP/1.1" 301 0 "117.121.101.40" "-" - "curl/7.19.7 (x86_64-redhat-linux-gnu) libcurl/7.19.7 NSS/3.16.2.3 Basic ECC zlib/1.2.3 libidn/1.18 libssh2/1.4.2" "-" - - - 0.000
    117.35.88.11 - - [10/Nov/2016:00:01:02 +0800] "GET /article/ajaxcourserecommends?id=124 HTTP/1.1" 200 2345 "www.neusoft.com" "http://www.neusoft.com/code/1852" - "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36" "-" 10.100.136.65:80 200 0.616 0.616
    182.106.215.93 - - [10/Nov/2016:00:01:02 +0800] "POST /socket.io/1/ HTTP/1.1" 200 94 "chat.neuedu.com" "-" - "android-websockets-2.0" "-" 10.100.15.239:80 200 0.004 0.004
    10.100.0.1 - - [10/Nov/2016:00:01:02 +0800] "HEAD / HTTP/1.1" 301 0 "117.121.101.40" "-" - "curl/7.19.7 (x86_64-redhat-linux-gnu) libcurl/7.19.7 NSS/3.16.2.3 Basic ECC zlib/1.2.3 libidn/1.18 libssh2/1.4.2" "-" - - - 0.000
    183.162.52.7 - - [10/Nov/2016:00:01:02 +0800] "POST /api3/userdynamic HTTP/1.1" 200 19501 "www.neusoft.com" "-" cid=0&timestamp=1478707261847&uid=2871142&touid=2871142&page=1&secrect=a6e8e14701ffe9f6063934780d9e2e6d&token=3837a5bf27ea718fe18bda6c53fbbc14 "neuedu/5.0.0 (Android 5.1.1; Xiaomi Redmi 3 Build/LMY47V),Network 2G/3G" "-" 10.100.136.65:80 200 0.195 0.195
    10.100.0.1 - - [10/Nov/2016:00:01:02 +0800] "HEAD / HTTP/1.1" 301 0 "117.121.101.40" "-" - "curl/7.19.7 (x86_64-redhat-linux-gnu) libcurl/7.19.7 NSS/3.16.2.3 Basic ECC zlib/1.2.3 libidn/1.18 libssh2/1.4.2" "-" - - - 0.000
    114.248.161.26 - - [10/Nov/2016:00:01:02 +0800] "POST /api3/getcourseintro HTTP/1.1" 200 2510 "www.neusoft.com" "-" cid=283&secrect=86b720f312c2b25da3b20e59e7c89780&timestamp=1478707261951&token=4c144b3f4314178b9527d1e91ecc0fac&uid=3372975 "neuedu/5.0.2 (iPhone; iOS 8.4.1; Scale/2.00)" "-" 10.100.136.65:80 200 0.007 0.008
    120.52.94.105 - - [10/Nov/2016:00:01:02 +0800] "POST /api3/getmediainfo_ver2 HTTP/1.1" 200 633 "www.neusoft.com" "-" cid=608&secrect=e25994750eb2bbc7ade1a36708b999a5&timestamp=1478707261945&token=9bbdba949aec02735e59e0868b538e19&uid=4203162 "neuedu/5.0.2 (iPhone; iOS 10.0.1; Scale/3.00)" "-" 10.100.136.65:80 200 0.049 0.049
    10.100.0.1 - - [10/Nov/2016:00:01:02 +0800] "HEAD / HTTP/1.1" 301 0 "117.121.101.40" "-" - "curl/7.19.7 (x86_64-redhat-linux-gnu) libcurl/7.19.7 NSS/3.16.2.3 Basic ECC zlib/1.2.3 libidn/1.18 libssh2/1.4.2" "-" - - - 0.000
    112.10.136.45 - - [10/Nov/2016:00:01:02 +0800] "POST /socket.io/1/ HTTP/1.1" 200 94 "chat.neuedu.com" "-" - "android-websockets-2.0" "-" 10.100.15.239:80 200 0.006 0.006
    211.162.33.31 - - [10/Nov/2016:00:01:02 +0800] "GET /u/card HTTP/1.1" 200 331 "www.neusoft.com" "http://www.neusoft.com/code/2053" - "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36" "-" 10.100.136.65:80 200 0.371 0.371
    116.22.196.70 - - [10/Nov/2016:00:01:02 +0800] "POST /course/ajaxmediauser HTTP/1.1" 200 54 "www.neusoft.com" "http://www.neusoft.com/code/3500" mid=3500&time=60 "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0" "-" 10.100.134.244:80 200 0.026 0.026

    难点:

      1.从每一行中找到描述浏览器信息的字符串;

      2.从这字符串中解析出浏览器;

    解决:

      1.多观察可以发现,每一行中第七个双引号(")与第八个双引号之间的字符串就是浏览器信息(User-Agent),可以先通过以下方法定位第七个双引号的位置,再截取其后的内容

     1  /**
     2   * 获取指定字符串中指定标识符出现的索引位置
     3   **/
     4 private int getCharacterPosition(String value, String operator, int index) {
     5         Matcher slashMatcher = Pattern.compile(operator).matcher(value);
     6         int mIdex = 0;
     7         while (slashMatcher.find()) {
     8             mIdex++;
     9 
    10             if (mIdex == index) {
    11                 break;
    12             }
    13         }
    14         return slashMatcher.start();
    15     }

      2.使用GitHub上现成的工具UserAgentParser,使用方法如下

    1 userAgentParser = new UserAgentParser();
    2 String userAgentString = "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.1.13) Gecko/20100914 Firefox/3.5.13 (.NET CLR 3.5.30729)";
    3 String browser = userAgentParser.browser(userAgentString);

    全部代码如下:

    TravelMapper.java

     1 package travel;
     2 
     3 import java.io.IOException;
     4 import java.util.regex.Matcher;
     5 import java.util.regex.Pattern;
     6 
     7 import org.apache.hadoop.io.LongWritable;
     8 import org.apache.hadoop.io.Text;
     9 import org.apache.hadoop.mapreduce.Mapper;
    10 
    11 import com.kumkee.userAgent.UserAgent;
    12 import com.kumkee.userAgent.UserAgentParser;
    13 
    14 public class TravelMapper extends Mapper<LongWritable, Text, Text, Text> {
    15 
    16     @Override
    17     protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context)
    18             throws IOException, InterruptedException {
    19         // TODO 自动生成的方法存根
    20         String line=value.toString();
    21         UserAgentParser userAgentParser= new UserAgentParser();
    22         String source = line.substring(getCharacterPosition(line, """, 7) + 1);
    23         UserAgent agent = userAgentParser.parse(source);
    24         String browser = agent.getBrowser();
    25 
    26         // 通过上下文把map的处理结果输出
    27         context.write(new Text(browser), new Text("1"));
    28     }
    29 
    30     private int getCharacterPosition(String value, String operator, int index) {
    31         Matcher slashMatcher = Pattern.compile(operator).matcher(value);
    32         int mIdex = 0;
    33         while (slashMatcher.find()) {
    34             mIdex++;
    35 
    36             if (mIdex == index) {
    37                 break;
    38             }
    39         }
    40         return slashMatcher.start();
    41     }
    42     
    43 }

    TravelReducer.java

     1 package travel;
     2 
     3 import java.io.IOException;
     4 
     5 import org.apache.hadoop.io.Text;
     6 import org.apache.hadoop.mapreduce.Reducer;
     7 
     8 public class TravelReducer extends Reducer<Text, Text, Text, Text> {
     9 
    10     @Override
    11     protected void reduce(Text arg0, Iterable<Text> arg1,
    12             Reducer<Text, Text, Text, Text>.Context arg2) throws IOException, InterruptedException {
    13         // TODO 自动生成的方法存根
    14 
    15         int count=0;
    16         for (Text text : arg1) {
    17             count++;
    18             
    19         }
    20         double percent=(double)count/100;
    21         arg2.write(arg0, new Text(percent+"%"));
    22     }
    23     
    24 
    25 }

    MyJob.java

     1 package travel;
     2 
     3 import org.apache.hadoop.conf.Configuration;
     4 import org.apache.hadoop.conf.Configured;
     5 import org.apache.hadoop.fs.Path;
     6 import org.apache.hadoop.io.Text;
     7 import org.apache.hadoop.mapreduce.Job;
     8 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
     9 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    10 import org.apache.hadoop.util.Tool;
    11 import org.apache.hadoop.util.ToolRunner;
    12 
    13 
    14 
    15 public class MyJob extends Configured implements Tool{
    16     
    17     public static void main(String[] args) throws Exception {
    18         System.setProperty("hadoop.home.dir", "E:\hadoop");
    19         MyJob myJob=new MyJob();
    20         ToolRunner.run(myJob, null);
    21     }
    22     public int run(String[] args) throws Exception {
    23         // TODO Auto-generated method stub
    24         Configuration conf=new Configuration();
    25         conf.set("fs.default.name", "hdfs://192.168.137.11:9000");
    26         Job job=Job.getInstance(conf);
    27         job.setJarByClass(MyJob.class);
    28         job.setMapperClass(TravelMapper.class);
    29         job.setReducerClass(TravelReducer.class);
    30         job.setOutputKeyClass(Text.class);
    31         job.setOutputValueClass(Text.class);
    32         job.setMapOutputKeyClass(Text.class);
    33         job.setMapOutputValueClass(Text.class);
    34         FileInputFormat.addInputPath(job, new Path("/hadoop/test.log"));
    35         FileOutputFormat.setOutputPath(job, new Path("/hadoop/TravelResult"));
    36         job.waitForCompletion(true);
    37         
    38         return 0;
    39     }
    40 
    41 }

    结果:

  • 相关阅读:
    [每天进步一点 流水账]第4周
    单指令流多数据流( SIMD)
    [每天进步一点 流水账]第2周
    写时复制技术(COW)
    ECMAScript 运算符乘性运算符
    ECMAScript 运算符Boolean 运算符
    ECMAScript 基础保留字
    ECMAScript 基础关键字
    ECMAScript 运算符一元运算符
    ECMAScript 基础原始类型
  • 原文地址:https://www.cnblogs.com/yifengyifeng/p/9333832.html
Copyright © 2020-2023  润新知