• Big Data Learning: Log Analysis


    Two large log files are stored on HDFS.
    Login log format: user,ip,time,oper (oper is an enum: 1 = login, 2 = logout).
    Visit log format: ip,time,url. Assume the login log's login/logout records are
    complete, and that within any one login session the IP is used by a single user.
    Using MapReduce, find the top 10 urls with the most distinct users in the visit log.
    
    Hints:
    1. Getting the top 10 takes two stages: first join the two logs and count the
       distinct users per url; then compute the top 10 from those counts.
    2. To join two large tables, feed both files as inputs to one job.
    3. The join key is the ip field, so partition by ip.
    4. Compute the top 10.
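
    The implementation below follows these hints as a chain of four MapReduce jobs:
    LoginlogFormatMP normalizes the login log into per-session lines, JoinWithIpMp
    joins those sessions with the visit log on ip, CountUVMP counts the distinct
    users per url, and Top10MP sorts the counts and emits the topN.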

     Sample data:

    login.log

    tom,192.168.1.11,2017-11-20 10:00,1
    tom,192.168.1.11,2017-11-20 11:00,2
    sua,192.168.1.12,2017-11-20 10:01,1
    sua,192.168.1.12,2017-11-20 10:30,2
    lala,192.168.1.11,2017-11-20 11:01,1
    lala,192.168.1.11,2017-11-20 11:30,2
    tom,192.168.1.14,2017-11-20 11:01,1
    tom,192.168.1.14,2017-11-20 11:40,2
    jer,192.168.1.15,2017-11-20 10:00,1
    jer,192.168.1.15,2017-11-20 10:40,2
    sua,192.168.1.16,2017-11-20 11:00,1
    sua,192.168.1.16,2017-11-20 12:00,2

    visit.log

    192.168.1.11,2017-11-20 10:02,url1
    192.168.1.11,2017-11-20 10:04,url1
    192.168.1.14,2017-11-20 11:02,url1
    192.168.1.12,2017-11-20 10:02,url1
    192.168.1.11,2017-11-20 11:02,url1
    192.168.1.15,2017-11-20 10:02,url1
    192.168.1.16,2017-11-20 11:02,url1
    192.168.1.11,2017-11-20 10:03,url2
    192.168.1.14,2017-11-20 11:03,url2
    192.168.1.12,2017-11-20 10:03,url2
    192.168.1.15,2017-11-20 10:03,url2
    192.168.1.16,2017-11-20 11:03,url2
    192.168.1.12,2017-11-20 10:15,url3
    192.168.1.15,2017-11-20 10:16,url3
    192.168.1.16,2017-11-20 11:02,url3
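
    Working the sample data through by hand: url1 is visited by tom, sua, lala and
    jer (UV 4); url2 by tom, sua and jer (UV 3); url3 by sua and jer (UV 2). The
    final ranking should therefore be url1, url2, url3.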

    Implementation:

    pom.xml

    <?xml version="1.0" encoding="UTF-8"?>
    <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
             xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
    
      <modelVersion>4.0.0</modelVersion>
    
      <groupId>com.cyf</groupId>
      <artifactId>TwoLog</artifactId>
      <packaging>jar</packaging>
      <version>1.0-SNAPSHOT</version>
    
      <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
      </properties>
      <dependencies>
        <dependency>
          <groupId>org.apache.hadoop</groupId>
          <artifactId>hadoop-common</artifactId>
          <version>2.6.4</version>
        </dependency>
        <dependency>
          <groupId>org.apache.hadoop</groupId>
          <artifactId>hadoop-hdfs</artifactId>
          <version>2.6.4</version>
        </dependency>
        <dependency>
          <groupId>org.apache.hadoop</groupId>
          <artifactId>hadoop-client</artifactId>
          <version>2.6.4</version>
        </dependency>
        <dependency>
          <groupId>org.apache.hadoop</groupId>
          <artifactId>hadoop-mapreduce-client-core</artifactId>
          <version>2.6.4</version>
        </dependency>
      </dependencies>
    
      <build>
        <plugins>
          <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-jar-plugin</artifactId>
            <version>2.4</version>
            <configuration>
              <archive>
                <manifest>
                  <addClasspath>true</addClasspath>
                  <classpathPrefix>lib/</classpathPrefix>
                  <mainClass>com.cyf.LoginlogFormatMP</mainClass>
                </manifest>
              </archive>
            </configuration>
          </plugin>
        </plugins>
      </build>
    
    </project>
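
    Assuming a standard Maven setup, build the jar with mvn clean package, which
    produces target/TwoLog-1.0-SNAPSHOT.jar.
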
    IpJoinBean.java
    package log;
    
    import org.apache.hadoop.io.WritableComparable;
    
    import java.io.DataInput;
    import java.io.DataOutput;
    import java.io.IOException;
    
    /**
     * Created by mac on 17/11/24.
     * Writable value type used to join the login log and the visit log by IP.
     */
    public class IpJoinBean implements WritableComparable<IpJoinBean>{
         private  String user = "";
         private String ip = "";
         private String loginTime = "";
         private String logoutTime = "";
         private String type = "";
         private String url = "";
         private String visitTime = "";
    
        public String getUser() {
            return user;
        }
    
        public void setUser(String user) {
            this.user = user;
        }
        public String getIp() {
            return ip;
        }
    
        public void setIp(String ip) {
            this.ip = ip;
        }
    
        public String getLoginTime() {
            return loginTime;
        }
    
        public void setLoginTime(String loginTime) {
            this.loginTime = loginTime;
        }
    
        public String getLogoutTime() {
            return logoutTime;
        }
    
        public void setLogoutTime(String logoutTime) {
            this.logoutTime = logoutTime;
        }
    
        public String getType() {
            return type;
        }
    
        public void setType(String type) {
            this.type = type;
        }
    
        public String getUrl() {
            return url;
        }
    
        public void setUrl(String url) {
            this.url = url;
        }
    
        public String getVisitTime() {
            return visitTime;
        }
    
        public void setVisitTime(String visitTime) {
            this.visitTime = visitTime;
        }
    
    
    
        public void write(DataOutput dataOutput) throws IOException {
            dataOutput.writeUTF(type);
            dataOutput.writeUTF(ip);
            dataOutput.writeUTF(user);
            dataOutput.writeUTF(url);
            dataOutput.writeUTF(loginTime);
            dataOutput.writeUTF(logoutTime);
            dataOutput.writeUTF(visitTime);
    
        }
    
        public void readFields(DataInput dataInput) throws IOException {
            this.type = dataInput.readUTF();
            this.ip = dataInput.readUTF();
            this.user = dataInput.readUTF();
            this.url = dataInput.readUTF();
            this.loginTime = dataInput.readUTF();
            this.logoutTime = dataInput.readUTF();
            this.visitTime = dataInput.readUTF();
    
        }
        // True when the visit's timestamp falls inside [loginTime, logoutTime].
        // Comparing the strings is safe because "yyyy-MM-dd HH:mm" is fixed-width
        // and zero-padded, so lexicographic order matches chronological order.
        public boolean visitMatchLogin(IpJoinBean login){
            return this.visitTime.compareTo(login.loginTime) >= 0
                    && this.visitTime.compareTo(login.logoutTime) <= 0;
        }
    
        public int compareTo(IpJoinBean o) {
            // Order by IP; equal IPs must return 0 to honor the compareTo contract.
            return this.ip.compareTo(o.ip);
        }
    
        @Override
        public String toString() {
            return "ip:"+ip+" user:"+user+" url:"+url+" loginTime:"+loginTime+" logouttime:"+logoutTime+" visitTime:" +visitTime;
        }
        public IpJoinBean(String ip,String user,String url,String loginTime,String logoutTime,String visitTime){
            this.ip = ip;
            this.user = user;
            this.url = url;
            this.loginTime = loginTime;
            this.logoutTime = logoutTime;
            this.visitTime = visitTime;
        }
        public IpJoinBean(){
            super();
        }
    }
    ReversBean.java
    package log;
    
    import org.apache.hadoop.io.WritableComparable;
    
    import java.io.DataInput;
    import java.io.DataOutput;
    import java.io.IOException;
    
    /**
     * Created by mac on 17/11/27.
     * Map output key holding a UV count; sorts in descending numeric order.
     */
    public class ReversBean implements WritableComparable<ReversBean>{
        private String count;
    
        public int compareTo(ReversBean o) {
            // Descending numeric order; raw string comparison would sort "9" above "10".
            return Integer.compare(Integer.parseInt(o.count), Integer.parseInt(this.count));
        }
    
        public void write(DataOutput dataOutput) throws IOException {
            dataOutput.writeUTF(count);
        }
    
        public void readFields(DataInput dataInput) throws IOException {
            this.count=dataInput.readUTF();
        }
    
        public ReversBean(String count) {
            this.count = count;
        }
    
        public ReversBean() {
        }
    
        public String getCount() {
            return count;
        }
    
        public void setCount(String count) {
            this.count = count;
        }
    }
    AllToOneGroupingComparator.java
    package log;
    
    import org.apache.hadoop.io.WritableComparable;
    import org.apache.hadoop.io.WritableComparator;
    
    /**
     * Created by mac on 17/11/25.
     * Declares every pair of keys equal, so all records of a partition fall into a
     * single reduce group while still arriving in ReversBean's descending order.
     */
    public class AllToOneGroupingComparator extends WritableComparator {
        protected  AllToOneGroupingComparator(){
            super(ReversBean.class,true);
        }
        @Override
        public int compare(WritableComparable a, WritableComparable b) {
            return 0;
        }
    }
    LoginlogFormatMP.java
    package com.cyf;
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    
    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.Collections;
    import java.util.List;
    
    /**
     * Created by mac on 17/11/24.
     * Step 1: reshape the login log into user_ip_loginTime_logoutTime session lines.
     */
    public class LoginlogFormatMP {
        public static class readFilesMapper extends Mapper<LongWritable,Text,Text,Text>{
            Text outKey = new Text();
            Text outValue = new Text();
            @Override
            protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    
                String[] line = value.toString().split(",");
                String user = line[0];
                String ip = line[1];
                String time = line[2];
                String opr = line[3];
                outKey.set(user+"_"+ip);
                outValue.set(time+"_"+opr);
                context.write(outKey,outValue);
            }
        }
    
        public static class timeConcatReducer extends Reducer<Text,Text,Text,NullWritable> {
        List<String> loginTimes = new ArrayList<String>();
        List<String> logoutTimes = new ArrayList<String>();
            @Override
            protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
                for(Text value :values){
                    String time = value.toString().split("_")[0];
                    String opr = value.toString().split("_")[1];
                    if(opr.equals("1")){
                        loginTimes.add(time);
                    }else if(opr.equals("2")){
                        logoutTimes.add(time);
                    }
                }
                Collections.sort(loginTimes);
                Collections.sort(logoutTimes);
                for(int i = 0;i<loginTimes.size();i++){
                    //Assumes login/logout records are complete, so the i-th sorted
                    //login time pairs with the i-th sorted logout time for this user_ip.
                    context.write(new Text(key+"_"+loginTimes.get(i)+"_"+logoutTimes.get(i)),NullWritable.get());
                }
                loginTimes.clear();
                logoutTimes.clear();
            }
    
        }
    
        public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
            Configuration conf = new Configuration();
            Job job = Job.getInstance(conf);
            job.setJarByClass(LoginlogFormatMP.class);
            job.setMapperClass(readFilesMapper.class);
            job.setReducerClass(timeConcatReducer.class);
            job.setInputFormatClass(TextInputFormat.class);
            job.setNumReduceTasks(1);
    
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
            FileInputFormat.addInputPath(job,new Path("/logs/input/login.log"));
    
            Path outPath = new Path("/logs/output/out");
            FileOutputFormat.setOutputPath(job,outPath);
    
            job.waitForCompletion(true);
        }
    
    }
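
    For the sample login.log, the first job's output in /logs/output/out should
    contain one session per line, sorted by the user_ip key:

    jer_192.168.1.15_2017-11-20 10:00_2017-11-20 10:40
    lala_192.168.1.11_2017-11-20 11:01_2017-11-20 11:30
    sua_192.168.1.12_2017-11-20 10:01_2017-11-20 10:30
    sua_192.168.1.16_2017-11-20 11:00_2017-11-20 12:00
    tom_192.168.1.11_2017-11-20 10:00_2017-11-20 11:00
    tom_192.168.1.14_2017-11-20 11:01_2017-11-20 11:40
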
    JoinWithIpMp.java
    package com.cyf;
    
    
    import log.IpJoinBean;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.input.FileSplit;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    
    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.List;
    
    
    /**
     * Created by mac on 17/11/25.
     * Step 2: join the normalized login log with the visit log on IP.
     */
    public class JoinWithIpMp {
        public static class readFilesMapper extends Mapper<LongWritable,Text,Text,IpJoinBean> {
            private String fileName;
    
            private IpJoinBean outPutValue = new IpJoinBean();
    
            @Override
            protected void setup(Context context) throws IOException, InterruptedException {
                // Each split comes from exactly one file; its name tells this task which log it reads.
                FileSplit input = (FileSplit) context.getInputSplit();
                fileName = input.getPath().getName();
            }
            @Override
            protected void map(LongWritable key,Text value,Context context) throws IOException, InterruptedException {
                //Key by IP so records with the same IP from both logs land in the same
                //partition and reduce group, where the join happens. Each map task reads
                //a split from a single file, so the reused bean never mixes both log types.
                if(fileName.startsWith("visit")){
    
                    String[] parms = value.toString().split(",");
                    String ip = parms[0];
                    String time = parms[1];
                    String url = parms[2];
                    outPutValue.setIp(ip);
                    outPutValue.setVisitTime(time);
                    outPutValue.setUrl(url);
                    outPutValue.setType("visit");
    
                }else{
                    String[] parms = value.toString().split("_");
                    String user = parms[0];
                    String ip = parms[1];
                    String loginTime = parms[2];
                    String logoutTime = parms[3];
                    outPutValue.setUser(user);
                    outPutValue.setIp(ip);
                    outPutValue.setType("login");
                    outPutValue.setLoginTime(loginTime);
                    outPutValue.setLogoutTime(logoutTime);
    
                }
    
                context.write(new Text(outPutValue.getIp()),outPutValue);
            }
        }
        public static class countClicksReducer extends Reducer<Text,IpJoinBean,Text,NullWritable> {
            private List<IpJoinBean> loginData = new ArrayList<IpJoinBean>();
            private List<IpJoinBean> visitData = new ArrayList<IpJoinBean>();
            @Override
            protected void reduce(Text key,Iterable<IpJoinBean> values,Context context) throws IOException,InterruptedException{
    
                for(IpJoinBean value:values){
                    if(value.getType().equals("visit")){
                        visitData.add(new IpJoinBean(value.getIp(),value.getUser(),value.getUrl(),value.getLoginTime(),value.getLogoutTime(),value.getVisitTime()));
                    }else if(value.getType().equals("login")){
                        loginData.add(new IpJoinBean(value.getIp(),value.getUser(),value.getUrl(),value.getLoginTime(),value.getLogoutTime(),value.getVisitTime()));
                    }
                }
                //Match each visit to the login session that contains its timestamp,
                //recovering which user was behind the IP at that moment.
                for(IpJoinBean visit:visitData){
                    for(IpJoinBean login:loginData){
                        if(visit.visitMatchLogin(login)){
                            context.write(new Text(visit.getIp()+"_"+login.getUser()+"_"+visit.getUrl()),NullWritable.get());
                        }
                    }
                }
                loginData.clear();
                visitData.clear();
    
            }
    
        }
        public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
            Configuration conf = new Configuration();
            Job job = Job.getInstance(conf);
            job.setJarByClass(JoinWithIpMp.class);
            job.setMapperClass(readFilesMapper.class);
            job.setReducerClass(countClicksReducer.class);
            //job.setInputFormatClass(TextInputFormat.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(IpJoinBean.class);
            job.setNumReduceTasks(1);
    
            Path[] inPaths = new Path[2];
            inPaths[0] = new Path("/logs/output/out/");
            inPaths[1] = new Path("/logs/input/visit.log");
            FileInputFormat.setInputPaths(job,inPaths);
    
            Path outPath = new Path("/logs/joinOut");
            FileOutputFormat.setOutputPath(job,outPath);
    
            job.waitForCompletion(true);
    
        }
    
    }
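
    Because the map output key is the ip itself, the default HashPartitioner
    already partitions by ip as hint 3 requires; a custom partitioner would only
    be needed if the key carried more than the ip. On the sample data this job
    writes one ip_user_url line to /logs/joinOut per matched visit, e.g.
    192.168.1.11_tom_url1 for the 10:02 visit and 192.168.1.11_lala_url1 for
    the 11:02 one.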
    CountUVMP.java
    package com.cyf;
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    
    import java.io.IOException;
    import java.util.*;
    
    /**
     * Created by mac on 17/11/25.
     * Step 3: count the distinct users (UV) per url.
     */
    public class CountUVMP {
        public static class readFilesMapper extends Mapper<LongWritable,Text,Text,Text> {
            @Override
            protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
                String[] line = value.toString().split("_");
    
                String user = line[1];
                String url = line[2];
                context.write(new Text(url),new Text(user));
    
            }
        }
    
        public static class timeConcatReducer extends Reducer<Text,Text,Text,NullWritable> {
    
            @Override
            protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
                // Collect the distinct users for this url; a Set de-duplicates in O(1) per record.
                Set<String> users = new HashSet<String>();
                for(Text value : values){
                    users.add(value.toString());
                }
                context.write(new Text(key+"_"+users.size()),NullWritable.get());
            }
    
        }
    
        public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
            Configuration conf = new Configuration();
            Job job = Job.getInstance(conf);
            job.setJarByClass(CountUVMP.class);
            job.setMapperClass(readFilesMapper.class);
            job.setReducerClass(timeConcatReducer.class);
            job.setInputFormatClass(TextInputFormat.class);
            job.setNumReduceTasks(1);
    
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
            FileInputFormat.addInputPath(job,new Path("/logs/joinOut"));
    
            Path outPath = new Path("/logs/uvResult");
            FileOutputFormat.setOutputPath(job,outPath);
    
            job.waitForCompletion(true);
        }
    }
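
    On the sample data, /logs/uvResult should contain:

    url1_4
    url2_3
    url3_2
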
    Top10MP.java
    package com.cyf;
    
    import java.io.IOException;
    
    
    import log.AllToOneGroupingComparator;
    import log.ReversBean;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.*;
    import org.apache.hadoop.mapreduce.Job;
    
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    
    /**
     * Created by mac on 17/11/25.
     * Step 4: emit the topN urls by UV.
     */
    
    public class Top10MP {
        public static class TopOneMap extends Mapper<LongWritable, Text, ReversBean, Text> {

            @Override
            protected void map(LongWritable key, Text value,Context context)
                    throws IOException, InterruptedException {
                //Key by the url's UV count so the shuffle delivers records in descending UV order.
                String[] values = value.toString().split("_");
                String count = values[1];
                context.write(new ReversBean(count),value);
            }
        }
        public static class TopReduce extends Reducer<ReversBean, Text, Text,IntWritable> {
            int count = 0;
            int top = 0;
    
            @Override
            protected void setup(Context context) throws IOException, InterruptedException {
                Configuration conf = context.getConfiguration();
                top = Integer.parseInt(conf.get("top"));
            }
            //All keys fall into a single reduce group (AllToOneGroupingComparator) and
            //arrive sorted by UV count descending, so the first `top` values are the answer.
            @Override
            protected void reduce(ReversBean key, Iterable<Text> values,Context context)
                    throws IOException, InterruptedException {
                for (Text value : values ){
                    if(count >= top){
                        return;
                    }
                    count++;
                    String[] countresults = value.toString().split("_");
                    String url = countresults[0];
                    String uv = countresults[1];
                    context.write(new Text(url),new IntWritable(Integer.parseInt(uv)));
                }
            }
        }
    
        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();
            conf.set("top", "2");
    
            /*FileSystem fileSystem = FileSystem.get(conf);
            Path dr = new Path("/Users/mac/Test/uvTop10");
            if(fileSystem.exists(dr)){
                fileSystem.delete(dr);
            }*/
    
            Job job = Job.getInstance(conf);
            job.setJarByClass(Top10MP.class);
    
            job.setMapperClass(TopOneMap.class);
            job.setReducerClass(TopReduce.class);
    
            job.setMapOutputKeyClass(ReversBean.class);
            job.setMapOutputValueClass(Text.class);
    
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
    
            job.setGroupingComparatorClass(AllToOneGroupingComparator.class);
    
            FileInputFormat.setInputPaths(job,new Path("/logs/uvResult/"));
            FileOutputFormat.setOutputPath(job, new Path("/logs/uvTop10"));
    
        //Choose by data volume: for very large inputs run two rounds, with many
        //reducers computing per-partition topN first and a single reducer merging
        //them into the global topN.
            job.setNumReduceTasks(1);
    
            boolean b = job.waitForCompletion(true);
            System.exit(b?0:1);
        }
    }
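
    With top set to 2, the final output in /logs/uvTop10 on the sample data should
    be url1 with 4 and url2 with 3, written as tab-separated url/count pairs.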


    To package each step, set the corresponding mainClass in pom.xml:
    <mainClass>com.cyf.LoginlogFormatMP</mainClass>
    <mainClass>com.cyf.JoinWithIpMp</mainClass>
    <mainClass>com.cyf.CountUVMP</mainClass>
    <mainClass>com.cyf.Top10MP</mainClass>

    Run the jobs in order (the main class is passed on the command line, so the
    same jar works for every step):
    hadoop jar TwoLog-1.0-SNAPSHOT.jar com.cyf.LoginlogFormatMP
    hadoop jar TwoLog-1.0-SNAPSHOT.jar com.cyf.JoinWithIpMp
    hadoop jar TwoLog-1.0-SNAPSHOT.jar com.cyf.CountUVMP
    hadoop jar TwoLog-1.0-SNAPSHOT.jar com.cyf.Top10MP

  • Original post: https://www.cnblogs.com/feifeicui/p/10247352.html