• 059 A Script-Driven Log Analysis Case Study (Excluding Data Collection)


    1. Log sample

    27.19.74.143 - - [30/May/2013:17:38:20 +0800] "GET /static/image/common/faq.gif HTTP/1.1" 200 1127
    110.52.250.126 - - [30/May/2013:17:38:20 +0800] "GET /data/cache/style_1_widthauto.css?y7a HTTP/1.1" 200 1292
    27.19.74.143 - - [30/May/2013:17:38:20 +0800] "GET /static/image/common/hot_1.gif HTTP/1.1" 200 680
    27.19.74.143 - - [30/May/2013:17:38:20 +0800] "GET /static/image/common/hot_2.gif HTTP/1.1" 200 682
    27.19.74.143 - - [30/May/2013:17:38:20 +0800] "GET /static/image/filetype/common.gif HTTP/1.1" 200 90

    2. Requirements

      -> Count page views (PV)
      -> Count registrations
      -> Count distinct IPs
      -> Compute the bounce rate, which first requires counting bounced visitors (see the sketch after this list)
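
      The bounce rate itself is never materialized by the script in section 4; the script only stores the bounce count (jump_tb) and the distinct-IP count (ip_tb). A minimal sketch of how the rate could be derived afterwards from those two tables, assuming the table and column names created later in the script (this query is an illustration, not part of the original pipeline):

    # hypothetical follow-up query: bounce rate = bounced visitors / distinct IPs
    $HIVE_HOME/bin/hive --database log_case -e \
      "select j.jp / i.ip as bounce_rate from jump_tb j join ip_tb i on 1=1"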

    3. Analysis

      -> Data collection: a shell script uploads the logs to HDFS on a schedule (see the crontab sketch after this list)
      -> Data cleaning: filter out unwanted requests and reformat the fields
      -> Data analysis: Hive partitioned tables
      -> Data export: Sqoop
      -> Frameworks used: shell scripts, HDFS, MapReduce, Hive, Sqoop, MySQL
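
      The collection step is meant to run once a day against yesterday's log. A minimal scheduling sketch, assuming the script in section 4 is saved as /opt/datas/logs/log_daily.sh (the script path, log path, and run time are assumptions, not from the source):

    # entry added via `crontab -e`: run the pipeline every day at 00:30
    30 0 * * * /bin/bash /opt/datas/logs/log_daily.sh >> /opt/datas/logs/cron.log 2>&1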

    4. Script

      This single script uploads yesterday's log to HDFS, cleans it with MapReduce, creates the Hive tables, and runs the analysis.

    #!/bin/bash

    #get yesterday's date
    yesterday=`date -d "-1 day" +"%Y_%m_%d"`

    #define the HADOOP_HOME and HIVE_HOME
    HADOOP_HOME=/opt/modules/hadoop-2.5.0
    HIVE_HOME=/opt/modules/hive-0.13.1-bin
    LOG_DIR=/opt/datas/logs
    FILE=access_$yesterday.log
    HDFS_DIR=/log/source/$yesterday

    JAR_PATH=$LOG_DIR/logclean.jar
    ENTRANCE=org.apache.hadoop.log.project.LogClean
    OUTPUT_DIR=/log/clean/date=$yesterday

    HIVE_DB=log_case
    HIVE_TB=use_tb

    SQOOP_HOME=/opt/cdh-5.3.6/sqoop-1.4.5-cdh5.3.6

    ########################################
    #  load the data into hdfs             #
    ########################################

    #show yesterday's date
    echo "[**yesterday is $yesterday**]"

    #recreate the hdfs paths
    $HADOOP_HOME/bin/hdfs dfs -rm -r $HDFS_DIR >/dev/null 2>&1
    $HADOOP_HOME/bin/hdfs dfs -rm -r $OUTPUT_DIR >/dev/null 2>&1
    $HADOOP_HOME/bin/hdfs dfs -mkdir $HDFS_DIR

    #put the data to hdfs
    $HADOOP_HOME/bin/hdfs dfs -put $LOG_DIR/$FILE $HDFS_DIR
    echo "[**the file $FILE is put to $HDFS_DIR**]"

    ########################################
    #  clean the source data               #
    ########################################

    $HADOOP_HOME/bin/yarn jar $JAR_PATH $ENTRANCE $HDFS_DIR $OUTPUT_DIR
    echo "[**the file $FILE is cleaned**]"

    ########################################
    #  load the cleaned data into hive     #
    ########################################

    $HIVE_HOME/bin/hive -e "create database if not exists $HIVE_DB"
    $HIVE_HOME/bin/hive -e "create external table if not exists $HIVE_DB.$HIVE_TB(ip string,time string,url string) partitioned by (date string) row format delimited fields terminated by '\t' location '/log/clean'"
    $HIVE_HOME/bin/hive --database $HIVE_DB -e "alter table $HIVE_TB drop partition (date='$yesterday')"
    $HIVE_HOME/bin/hive --database $HIVE_DB -e "alter table $HIVE_TB add partition (date='$yesterday')"
    echo "[**add a partition $yesterday to $HIVE_DB.$HIVE_TB**]"

    ########################################
    #  analyze the data using hive         #
    ########################################

    ##PV
    echo "-------------------------pv start---------------------------------------"
    $HIVE_HOME/bin/hive --database $HIVE_DB -e "CREATE TABLE if not exists pv_tb(pv string) row format delimited fields terminated by '\t'"
    $HIVE_HOME/bin/hive --database $HIVE_DB -e "insert overwrite table pv_tb SELECT COUNT(1) FROM $HIVE_TB WHERE date='$yesterday'"
    echo "-------------------------pv finished------------------------------------"

    ##register
    echo "-------------------------rg start---------------------------------------"
    $HIVE_HOME/bin/hive --database $HIVE_DB -e "create table if not exists register_tb(rg string) row format delimited fields terminated by '\t'"
    $HIVE_HOME/bin/hive --database $HIVE_DB -e "insert overwrite table register_tb select count(1) from $HIVE_TB where date='$yesterday' and instr(url,'member.php?mod=register')>0"
    echo "-------------------------rg finished------------------------------------"

    ##ip
    echo "-------------------------ip start---------------------------------------"
    $HIVE_HOME/bin/hive --database $HIVE_DB -e "CREATE TABLE if not exists ip_tb(ip string) row format delimited fields terminated by '\t'"
    $HIVE_HOME/bin/hive --database $HIVE_DB -e "insert overwrite table ip_tb select count(distinct ip) from $HIVE_TB where date='$yesterday'"
    echo "-------------------------ip finished------------------------------------"

    ##jump
    echo "-------------------------jp start---------------------------------------"
    $HIVE_HOME/bin/hive --database $HIVE_DB -e "CREATE TABLE if not exists jump_tb(jp string) row format delimited fields terminated by '\t'"
    $HIVE_HOME/bin/hive --database $HIVE_DB -e "insert overwrite table jump_tb select count(1) from (select count(ip) ip_single from $HIVE_TB where date='$yesterday' group by ip having ip_single=1) jump"
    echo "-------------------------jp finished------------------------------------"

    ##result
    echo "**************************create the result table************************"
    $HIVE_HOME/bin/hive --database $HIVE_DB -e "create table if not exists result(day string,pv string,register string,ip string,jump string) row format delimited fields terminated by '\t'"
    $HIVE_HOME/bin/hive --database $HIVE_DB -e "insert overwrite table result select '$yesterday',a.pv,b.rg,c.ip,d.jp from pv_tb a join register_tb b on 1=1 join ip_tb c on 1=1 join jump_tb d on 1=1"

    ##export to mysql
    $SQOOP_HOME/bin/sqoop --options-file /opt/datas/logs/sqoop.file
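
      The export step reads all of its arguments from /opt/datas/logs/sqoop.file instead of the command line; the source never shows that file. A minimal sketch of what a Sqoop export options file could contain for the result table (the MySQL host, credentials, and export directory are assumptions; in an options file each option and its value sit on their own lines, and the first line may name the tool):

    # contents of a hypothetical /opt/datas/logs/sqoop.file
    # each option and its value on its own line; the first line names the sqoop tool
    export
    --connect
    jdbc:mysql://localhost:3306/log_case
    --username
    root
    --password
    123456
    --table
    result
    --export-dir
    /user/hive/warehouse/log_case.db/result
    --input-fields-terminated-by
    \t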

    5. Source code of the log-cleaning jar (logclean.jar)

    package org.apache.hadoop.log.project;

    import java.net.URI;
    import java.text.ParseException;
    import java.text.SimpleDateFormat;
    import java.util.Date;
    import java.util.Locale;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.conf.Configured;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.util.Tool;
    import org.apache.hadoop.util.ToolRunner;


    public class LogClean extends Configured implements Tool {

        public static void main(String[] args) {
            Configuration conf = new Configuration();
            try {
                int res = ToolRunner.run(conf, new LogClean(), args);
                System.exit(res);
            } catch (Exception e) {
                e.printStackTrace();
            }
        }

        public int run(String[] args) throws Exception {
            Configuration conf = new Configuration();
            Job job = Job.getInstance(conf, "logclean");
            // make the job runnable from a packaged jar
            job.setJarByClass(LogClean.class);
            FileInputFormat.setInputPaths(job, args[0]);
            job.setMapperClass(MyMapper.class);
            job.setMapOutputKeyClass(LongWritable.class);
            job.setMapOutputValueClass(Text.class);
            job.setReducerClass(MyReducer.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(NullWritable.class);
            FileOutputFormat.setOutputPath(job, new Path(args[1]));
            // delete the output directory if it already exists
            FileSystem fs = FileSystem.get(new URI(args[0]), getConf());
            Path outPath = new Path(args[1]);
            if (fs.exists(outPath)) {
                fs.delete(outPath, true);
            }

            boolean success = job.waitForCompletion(true);
            if (success) {
                System.out.println("Clean process success!");
            } else {
                System.out.println("Clean process failed!");
            }
            return success ? 0 : 1;
        }

        static class MyMapper extends
                Mapper<LongWritable, Text, LongWritable, Text> {
            LogParser logParser = new LogParser();
            Text outputValue = new Text();

            protected void map(
                    LongWritable key,
                    Text value,
                    org.apache.hadoop.mapreduce.Mapper<LongWritable, Text, LongWritable, Text>.Context context)
                    throws java.io.IOException, InterruptedException {
                final String[] parsed = logParser.parse(value.toString());

                // step 1: drop requests for static resources
                if (parsed[2].startsWith("GET /static/")
                        || parsed[2].startsWith("GET /uc_server")) {
                    return;
                }
                // step 2: strip the leading "GET /" or "POST /" prefix
                if (parsed[2].startsWith("GET /")) {
                    parsed[2] = parsed[2].substring("GET /".length());
                } else if (parsed[2].startsWith("POST /")) {
                    parsed[2] = parsed[2].substring("POST /".length());
                }
                // step 3: strip the trailing " HTTP/1.1"
                if (parsed[2].endsWith(" HTTP/1.1")) {
                    parsed[2] = parsed[2].substring(0, parsed[2].length()
                            - " HTTP/1.1".length());
                }
                // step 4: emit only the first three fields (ip, time, url), tab-separated
                outputValue.set(parsed[0] + "\t" + parsed[1] + "\t" + parsed[2]);
                context.write(key, outputValue);
            }
        }

        static class MyReducer extends
                Reducer<LongWritable, Text, Text, NullWritable> {
            protected void reduce(
                    LongWritable k2,
                    java.lang.Iterable<Text> v2s,
                    org.apache.hadoop.mapreduce.Reducer<LongWritable, Text, Text, NullWritable>.Context context)
                    throws java.io.IOException, InterruptedException {
                // pass every cleaned record through, dropping the byte-offset key
                for (Text v2 : v2s) {
                    context.write(v2, NullWritable.get());
                }
            }
        }

        /*
         * Log line parser
         */
        static class LogParser {
            public static final SimpleDateFormat FORMAT = new SimpleDateFormat(
                    "d/MMM/yyyy:HH:mm:ss", Locale.ENGLISH);
            public static final SimpleDateFormat dateformat1 = new SimpleDateFormat(
                    "yyyyMMddHHmmss");

            public static void main(String[] args) throws ParseException {
                final String S1 = "27.19.74.143 - - [30/May/2013:17:38:20 +0800] \"GET /static/image/common/faq.gif HTTP/1.1\" 200 1127";
                LogParser parser = new LogParser();
                final String[] array = parser.parse(S1);
                System.out.println("Sample line: " + S1);
                System.out.format(
                        "Parsed result: ip=%s, time=%s, url=%s, status=%s, traffic=%s",
                        array[0], array[1], array[2], array[3], array[4]);
            }

            /**
             * Parse the English-locale timestamp string.
             * 
             * @param string
             * @return
             */
            private Date parseDateFormat(String string) {
                Date parse = null;
                try {
                    parse = FORMAT.parse(string);
                } catch (ParseException e) {
                    e.printStackTrace();
                }
                return parse;
            }

            /**
             * Parse one log line.
             * 
             * @param line
             * @return an array of 5 elements: ip, time, url, status, traffic
             */
            public String[] parse(String line) {
                String ip = parseIP(line);
                String time = parseTime(line);
                String url = parseURL(line);
                String status = parseStatus(line);
                String traffic = parseTraffic(line);

                return new String[] { ip, time, url, status, traffic };
            }

            // bytes sent: the second token after the closing quote of the request
            private String parseTraffic(String line) {
                final String trim = line.substring(line.lastIndexOf("\"") + 1)
                        .trim();
                String traffic = trim.split(" ")[1];
                return traffic;
            }

            // HTTP status code: the first token after the closing quote of the request
            private String parseStatus(String line) {
                final String trim = line.substring(line.lastIndexOf("\"") + 1)
                        .trim();
                String status = trim.split(" ")[0];
                return status;
            }

            // request line: everything between the first and last double quote
            private String parseURL(String line) {
                final int first = line.indexOf("\"");
                final int last = line.lastIndexOf("\"");
                String url = line.substring(first + 1, last);
                return url;
            }

            // timestamp: the bracketed value, reformatted as yyyyMMddHHmmss
            private String parseTime(String line) {
                final int first = line.indexOf("[");
                final int last = line.indexOf("+0800]");
                String time = line.substring(first + 1, last).trim();
                Date date = parseDateFormat(time);
                return dateformat1.format(date);
            }

            // client IP: everything before the "- -" separator
            private String parseIP(String line) {
                String ip = line.split("- -")[0].trim();
                return ip;
            }
        }
    }
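
      The pipeline in section 4 expects this class packaged as /opt/datas/logs/logclean.jar. A minimal sketch of building the jar and running the clean job by hand, assuming LogClean.java sits in the current directory and Hadoop is installed as above (the sample date and paths are assumptions):

    # compile against the Hadoop classpath and package the job
    mkdir -p classes
    javac -cp "$($HADOOP_HOME/bin/hadoop classpath)" -d classes LogClean.java
    jar -cf logclean.jar -C classes .

    # run the cleaner against one day's raw log and inspect the result
    $HADOOP_HOME/bin/yarn jar logclean.jar org.apache.hadoop.log.project.LogClean \
        /log/source/2013_05_30 /log/clean/date=2013_05_30
    $HADOOP_HOME/bin/hdfs dfs -cat /log/clean/date=2013_05_30/part-r-00000 | head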
  • Original article: https://www.cnblogs.com/juncaoit/p/8904240.html