• Hadoop小程序数据筛选


      1、有一批路由日志,需要提取MAC地址和时间,删除其他内容。

      日志内容格式如下:

    Apr 15 10:04:42 hostapd: wlan0: STA 14:7D:C5:9E:84
    Apr 15 10:04:43 hostapd: wlan0: STA 14:7D:C5:9E:85
    Apr 15 10:04:44 hostapd: wlan0: STA 14:7D:C5:9E:86
    Apr 15 10:04:45 hostapd: wlan0: STA 14:7D:C5:9E:87
    Apr 15 10:04:46 hostapd: wlan0: STA 14:7D:C5:9E:88
    Apr 15 10:04:47 hostapd: wlan0: STA 14:7D:C5:9E:89
    Apr 15 10:04:48 hostapd: wlan0: STA 14:7D:C5:9E:14
    Apr 15 10:04:49 hostapd: wlan0: STA 14:7D:C5:9E:24
    Apr 15 10:04:52 hostapd: wlan0: STA 14:7D:C5:9E:34
    Apr 15 10:04:32 hostapd: wlan0: STA 14:7D:C5:9E:44
    Apr 15 10:04:22 hostapd: wlan0: STA 14:7D:C5:9E:54

      筛选后的内容格式为:

    Apr 15 10:04:42 14:7D:C5:9E:84
    Apr 15 10:04:43 14:7D:C5:9E:85
    Apr 15 10:04:44 14:7D:C5:9E:86
    Apr 15 10:04:45 14:7D:C5:9E:87
    Apr 15 10:04:46 14:7D:C5:9E:88
    Apr 15 10:04:47 14:7D:C5:9E:89
    Apr 15 10:04:48 14:7D:C5:9E:14
    Apr 15 10:04:49 14:7D:C5:9E:24
    Apr 15 10:04:52 14:7D:C5:9E:34
    Apr 15 10:04:32 14:7D:C5:9E:44
    Apr 15 10:04:22 14:7D:C5:9E:54

      

      2、算法思路

      源文件——》Mapper(分割原始数据、输出所需数据、处理异常数据)——》输出到HDFS

      3、编写程序

    import java.io.IOException;
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.conf.Configured;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
    import org.apache.hadoop.util.Tool;
    import org.apache.hadoop.util.ToolRunner;
    
    
    public class App_1 extends Configured implements Tool{
        enum Counter{
            LINESKIP,//记录出错的行
        }
        /**
         *Mapper<LongWritable,Text,NullWritable,Text>
         *LongWritable,Text 是输入数据的key和value  如:路由日志的每一行的首字符的偏移量作为key,整一行的内容作为value
         *NullWritable,Text    是输出数据的key和value
         *
         */
        public static class RouterMapper extends Mapper<LongWritable,Text,NullWritable,Text>{
        
            //map(LongWritable key,Text value,Context context)
            //LongWritable key,Text value,和RouterMapper类的输入数据的key、value对应
            //Context 上下文环境
            public void map(LongWritable key,Text value,Context context)throws IOException,InterruptedException{
                String line = value.toString();
                try{
                    
                    String[] lineSplit = line.split(" ");//分割原始数据
                    String month = lineSplit[0];//获取月份
                    String day = lineSplit[1];//获取日期
                    String time = lineSplit[2];//获取时间
                    String mac = lineSplit[6];//获取网卡地址
                    
                    //转换成hadoop能读取的输出格式,要和RouterMapper类的输出数据格式一致
                    Text out = new Text(month+" "+day+" "+time+" "+mac);
                    
                    //输出
                    context.write(NullWritable.get(), out);
                    
                }catch(ArrayIndexOutOfBoundsException e) {
                    //对异常数据进行处理,出现异常,令计数器+1
                    context.getCounter(Counter.LINESKIP).increment(1);
                    return;
                }
            }
        }
        
        @Override
        public int run(String[] arg0) throws Exception {
            Configuration conf = getConf();
            Job job = new Job(conf,"App_1");//指定任务名称
            job.setJarByClass(App_1.class);//指定Class
            FileInputFormat.addInputPath(job, new Path(arg0[0]));//输入路径
            FileOutputFormat.setOutputPath(job, new Path(arg0[1]));//输出路径
            
            job.setMapperClass(RouterMapper.class);//调用RouterMapper类作为Mapper的任务代码
            job.setOutputFormatClass(TextOutputFormat.class);
            
            job.setOutputKeyClass(NullWritable.class);//指定输出的key格式,要和RouterMapper的输出数据格式一致
            job.setOutputValueClass(Text.class);//指定输出的value格式,要和RouterMapper的输出数据格式一致
            job.waitForCompletion(true);
            return job.isSuccessful()?0:1;
        }
        
        //测试用的main方法
        //main方法运行的时候需要指定输入路径和输出路径
        public static void main(String[] args) throws Exception{
            int res = ToolRunner.run(new Configuration(), new App_1(), args);
            System.exit(res);
            
        }
    }

      要分析的路由日志文件已上传到HDFS的hdfs://h1:9000/user/coder/in目录中,h1是我的namenode主机名。 运行前配置参数,注意输出路径的存放目录必须不存在。

      4、运行结束后,可在eclipse中直接查看结果

      

  • 相关阅读:
    关于sql json数据的处理
    时间函数strtotime的强大
    /usr/bin/install: cannot create regular file `/usr/local/jpeg6/include/jconfig.h'
    linux安装php7.2.7
    关于sql时间方面的处理
    关于centos防火墙的一些问题
    linux 安装ssl 失败原因
    linux安装php7.2.7
    拾取坐标和反查询接口api
    【转】通过点击获取地址等信息、可以传值
  • 原文地址:https://www.cnblogs.com/luxh/p/2994522.html
Copyright © 2020-2023  润新知