I. Project Requirements
- The "logs" in the log-processing approach discussed here refer only to Web logs. There is no precise definition; they may include, but are not limited to, the user access logs produced by front-end web servers such as Apache, Lighttpd, Nginx and Tomcat, as well as the logs written by web applications themselves.
II. Requirements Analysis: KPI Design
PV (PageView): total page views
IP: unique visiting IP addresses per page
Time: hourly PV statistics (see the sketch after this list)
Source: visits by referring domain
Browser: visits by client browser/device
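Each of these KPIs maps onto the same MapReduce key/value pattern. As a quick illustration, here is one possible mapper for the hourly-PV (Time) KPI. This is only a sketch of my own, not code from the original project: it reuses the Kpi entity and KpiUtil parser defined later in this article, and its output would feed the same summing reducer used for the Browser KPI in section 7.

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

import org.hmahout.kpi.entity.Kpi;
import org.hmahout.kpi.util.KpiUtil;

// Illustrative mapper for the hourly-PV (Time) KPI: key each record by its
// day and hour and emit a count of 1.
public class KpiTimeMapper extends MapReduceBase
        implements Mapper<Object, Text, Text, IntWritable> {

    public void map(Object key, Text value,
            OutputCollector<Text, IntWritable> out, Reporter reporter)
            throws IOException {
        Kpi kpi = KpiUtil.transformLineKpi(value.toString());
        if (kpi != null && kpi.getTime_local() != null) {
            // time_local looks like "18/Sep/2013:06:49:57"; keep day and hour
            String hour = kpi.getTime_local().substring(0, 14); // "18/Sep/2013:06"
            out.collect(new Text(hour), new IntWritable(1));
        }
    }
}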
Below, I focus on the Browser statistic.
III. Analysis Process
1. A sample nginx log record
222.68.172.190 - - [18/Sep/2013:06:49:57 +0000] "GET /images/my.jpg HTTP/1.1" 200 19939
"http://www.angularjs.cn/A00n"
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36"
2. Breaking down the record above
remote_addr: the client IP address, 222.68.172.190
remote_user: the client user name, -
time_local: the access time and time zone, [18/Sep/2013:06:49:57 +0000]
request: the requested URL and HTTP protocol, "GET /images/my.jpg HTTP/1.1"
status: the response status code (200 on success), 200
body_bytes_sent: the size of the response body sent to the client, 19939
http_referer: the page from which the request was linked, "http://www.angularjs.cn/A00n"
http_user_agent: information about the client browser, "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36"
3. Parsing the record in Java (splitting on spaces)

String line = "222.68.172.190 - - [18/Sep/2013:06:49:57 +0000] \"GET /images/my.jpg HTTP/1.1\" 200 19939 \"http://www.angularjs.cn/A00n\" \"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36\"";
String[] elementList = line.split(" ");
for (int i = 0; i < elementList.length; i++) {
    System.out.println(i + " : " + elementList[i]);
}
Test output (excerpt):

3 : [18/Sep/2013:06:49:57
10 : "http://www.angularjs.cn/A00n"
15 : AppleWebKit/537.36
19 : Chrome/29.0.1547.66
4. The Kpi entity class:

package org.hmahout.kpi.entity;

// One parsed nginx log record.
public class Kpi {
    private String remote_addr;      // client IP address
    private String remote_user;      // client user name
    private String time_local;       // access time and time zone
    private String request;          // requested URL
    private String status;           // HTTP status code
    private String body_bytes_sent;  // size of the response body
    private String http_referer;     // referring page
    private String http_user_agent;  // client browser information
    private String method;           // HTTP method (GET, POST, ...)
    private String http_version;     // HTTP protocol version

    public String getMethod() {
        return method;
    }
    public void setMethod(String method) {
        this.method = method;
    }
    public String getHttp_version() {
        return http_version;
    }
    public void setHttp_version(String http_version) {
        this.http_version = http_version;
    }
    public String getRemote_addr() {
        return remote_addr;
    }
    public void setRemote_addr(String remote_addr) {
        this.remote_addr = remote_addr;
    }
    public String getRemote_user() {
        return remote_user;
    }
    public void setRemote_user(String remote_user) {
        this.remote_user = remote_user;
    }
    public String getTime_local() {
        return time_local;
    }
    public void setTime_local(String time_local) {
        this.time_local = time_local;
    }
    public String getRequest() {
        return request;
    }
    public void setRequest(String request) {
        this.request = request;
    }
    public String getStatus() {
        return status;
    }
    public void setStatus(String status) {
        this.status = status;
    }
    public String getBody_bytes_sent() {
        return body_bytes_sent;
    }
    public void setBody_bytes_sent(String body_bytes_sent) {
        this.body_bytes_sent = body_bytes_sent;
    }
    public String getHttp_referer() {
        return http_referer;
    }
    public void setHttp_referer(String http_referer) {
        this.http_referer = http_referer;
    }
    public String getHttp_user_agent() {
        return http_user_agent;
    }
    public void setHttp_user_agent(String http_user_agent) {
        this.http_user_agent = http_user_agent;
    }

    @Override
    public String toString() {
        return "Kpi [remote_addr=" + remote_addr + ", remote_user="
                + remote_user + ", time_local=" + time_local + ", request="
                + request + ", status=" + status + ", body_bytes_sent="
                + body_bytes_sent + ", http_referer=" + http_referer
                + ", http_user_agent=" + http_user_agent + ", method=" + method
                + ", http_version=" + http_version + "]";
    }
}
5. The Kpi utility class

package org.hmahout.kpi.util;

import org.hmahout.kpi.entity.Kpi;

public class KpiUtil {

    // Turn one raw nginx log line into a Kpi object by splitting on spaces.
    public static Kpi transformLineKpi(String line) {
        String[] elementList = line.split(" ");
        Kpi kpi = new Kpi();
        kpi.setRemote_addr(elementList[0]);
        kpi.setRemote_user(elementList[1]);
        kpi.setTime_local(elementList[3].substring(1));  // drop the leading '['
        kpi.setMethod(elementList[5].substring(1));      // drop the leading '"'
        kpi.setRequest(elementList[6]);
        kpi.setHttp_version(elementList[7]);
        kpi.setStatus(elementList[8]);
        kpi.setBody_bytes_sent(elementList[9]);
        kpi.setHttp_referer(elementList[10]);
        kpi.setHttp_user_agent(elementList[11] + " " + elementList[12]);
        return kpi;
    }
}
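Splitting on spaces is simple but lossy: quoted fields that contain spaces are scattered across tokens, which is why only elementList[11] and elementList[12] of the user agent survive above. A possible alternative, sketched here purely for illustration (it is not part of the original code), is a single regular expression over the whole line; it assumes every record follows the exact combined format shown in step 1 and reuses the Kpi entity from step 4.

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.hmahout.kpi.entity.Kpi;

// Alternative regex-based parser for the nginx combined log format.
public class KpiRegexParser {

    // Groups: 1 remote_addr, 2 ident, 3 remote_user, 4 time_local, 5 method,
    // 6 request, 7 http_version, 8 status, 9 body_bytes_sent,
    // 10 http_referer, 11 http_user_agent
    private static final Pattern LOG_PATTERN = Pattern.compile(
        "^(\\S+) (\\S+) (\\S+) \\[([^\\]]+)\\] \"(\\S+) (\\S+) (\\S+)\" " +
        "(\\d{3}) (\\S+) \"([^\"]*)\" \"([^\"]*)\"");

    public static Kpi parse(String line) {
        Matcher m = LOG_PATTERN.matcher(line);
        if (!m.find()) {
            return null;  // malformed record
        }
        Kpi kpi = new Kpi();
        kpi.setRemote_addr(m.group(1));
        kpi.setRemote_user(m.group(3));
        kpi.setTime_local(m.group(4));   // includes the time zone
        kpi.setMethod(m.group(5));
        kpi.setRequest(m.group(6));
        kpi.setHttp_version(m.group(7));
        kpi.setStatus(m.group(8));
        kpi.setBody_bytes_sent(m.group(9));
        kpi.setHttp_referer(m.group(10));
        kpi.setHttp_user_agent(m.group(11));  // full user agent, not truncated
        return kpi;
    }
}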
6. Algorithm model: a parallel algorithm
Browser: visits per client browser
– Map: {key: $http_user_agent, value: 1}
– Reduce: {key: $http_user_agent, value: sum}
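For example, if three records carry the user agents Chrome/29.0, Chrome/29.0 and Firefox/23.0, the map phase emits (Chrome, 1), (Chrome, 1) and (Firefox, 1); after the shuffle, the reduce phase sums the values per key and outputs (Chrome, 2) and (Firefox, 1). (The code below keys on the browser family parsed by uasparser rather than on the raw $http_user_agent string.)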
7. The MapReduce analysis code

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

import org.hmahout.kpi.entity.Kpi;
import org.hmahout.kpi.util.KpiUtil;

import cz.mallat.uasparser.UASparser;
import cz.mallat.uasparser.UserAgentInfo;

public class KpiBrowserSimpleV {

    // Mapper: parse each log line, extract the browser name, emit (browser, 1).
    public static class KpiBrowserSimpleMapper extends MapReduceBase
            implements Mapper<Object, Text, Text, IntWritable> {

        UASparser parser = null;

        public void map(Object key, Text value,
                OutputCollector<Text, IntWritable> out, Reporter reporter)
                throws IOException {
            Kpi kpi = KpiUtil.transformLineKpi(value.toString());
            if (kpi != null && kpi.getHttp_user_agent() != null) {
                if (parser == null) {
                    parser = new UASparser();  // initialise on first use
                }
                UserAgentInfo info =
                        parser.parseBrowserOnly(kpi.getHttp_user_agent());
                if ("unknown".equals(info.getUaName())) {
                    out.collect(new Text(info.getUaName()), new IntWritable(1));
                } else {
                    out.collect(new Text(info.getUaFamily()), new IntWritable(1));
                }
            }
        }
    }

    // Reducer (also used as combiner): sum the counts per browser.
    public static class KpiBrowserSimpleReducer extends MapReduceBase implements
            Reducer<Text, IntWritable, Text, IntWritable> {

        public void reduce(Text key, Iterator<IntWritable> value,
                OutputCollector<Text, IntWritable> out, Reporter reporter)
                throws IOException {
            IntWritable sum = new IntWritable(0);
            while (value.hasNext()) {
                sum.set(sum.get() + value.next().get());
            }
            out.collect(key, sum);
        }
    }

    // Job driver.
    public static void main(String[] args) throws IOException {
        String input = "hdfs://127.0.0.1:9000/user/tianbx/log_kpi/input";
        String output = "hdfs://127.0.0.1:9000/user/tianbx/log_kpi/browerSimpleV";

        JobConf conf = new JobConf(KpiBrowserSimpleV.class);
        conf.setJobName("KpiBrowserSimpleV");
        String url = "classpath:";
        conf.addResource(url + "/hadoop/core-site.xml");
        conf.addResource(url + "/hadoop/hdfs-site.xml");
        conf.addResource(url + "/hadoop/mapred-site.xml");

        conf.setMapOutputKeyClass(Text.class);
        conf.setMapOutputValueClass(IntWritable.class);

        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);

        conf.setMapperClass(KpiBrowserSimpleMapper.class);
        conf.setCombinerClass(KpiBrowserSimpleReducer.class);
        conf.setReducerClass(KpiBrowserSimpleReducer.class);

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        FileInputFormat.setInputPaths(conf, new Path(input));
        FileOutputFormat.setOutputPath(conf, new Path(output));

        JobClient.runJob(conf);
    }
}
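A note on the design: the reducer doubles as the combiner, which is safe here because the per-key aggregation is a plain sum (associative and commutative), so partial sums on the map side do not change the final result. The job also uses the older org.apache.hadoop.mapred API (JobConf/JobClient) rather than the newer org.apache.hadoop.mapreduce API.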
8. Contents of the output file log_kpi/browerSimpleV
AOL Explorer 1
Android Webkit 123
Chrome 4867
CoolNovo 23
Firefox 1700
Google App Engine 5
IE 1521
Jakarta Commons-HttpClient 3
Maxthon 27
Mobile Safari 273
Mozilla 130
Openwave Mobile Browser 2
Opera 2
Pale Moon 1
Python-urllib 4
Safari 246
Sogou Explorer 157
unknown 4685
9. Producing the chart in R
library(ggplot2)
data <- read.table(file="borwer.txt", header=FALSE, sep=",")
names(data) <- c("borwer", "num")
# Bar chart of request counts per browser
ggplot(data, aes(x=borwer, y=num)) + geom_col()
Problems still to solve
1. Exclude crawlers and programmatic clicks to counter cheating. One approach: have the page detect whether the mouse actually moves.
2. How do we exclude image requests from the page-view count?
3. How do we exclude fake clicks from the page-view count?
4. Which search engine did the visit come from? (see the sketch below)
5. Which search keyword led to the visit? (also covered by the sketch below)
6. Which location did the visit come from?
7. Which browser was used for the visit? (this is the Browser KPI computed above)
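For questions 4 and 5, one possible starting point is the http_referer field already captured above: its host names the search engine, and its query string usually carries the keyword. The following is only a sketch under that assumption; the class, its methods and the keyword parameter names are mine, not part of the original code.

import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URLDecoder;

// Hypothetical helper for questions 4 and 5: guess the search engine and
// keyword from the http_referer field.
public class RefererUtil {

    // The space-split parser leaves the surrounding quotes on the field.
    private static String strip(String referer) {
        return referer.replaceAll("^\"|\"$", "");
    }

    // Question 4: the referer host identifies the search engine, e.g. "www.baidu.com".
    public static String searchEngineHost(String referer) {
        try {
            return new URI(strip(referer)).getHost();
        } catch (URISyntaxException e) {
            return null;
        }
    }

    // Question 5: the keyword usually sits in the query string. The parameter
    // names checked here ("q" for Google/Bing, "wd"/"word" for Baidu) are
    // assumptions.
    public static String searchKeyword(String referer) {
        try {
            String query = new URI(strip(referer)).getRawQuery();
            if (query == null) {
                return null;
            }
            for (String pair : query.split("&")) {
                String[] kv = pair.split("=", 2);
                if (kv.length == 2 && (kv[0].equals("q")
                        || kv[0].equals("wd") || kv[0].equals("word"))) {
                    return URLDecoder.decode(kv[1], "UTF-8");
                }
            }
        } catch (URISyntaxException | UnsupportedEncodingException e) {
            // unparsable referer: treat as "no keyword"
        }
        return null;
    }
}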