• MapReduce - Binary Input


    Hadoop MapReduce is not restricted to text; it can also process data in binary formats.
    1. The SequenceFileInputFormat class
    Hadoop's sequence file format stores sequences of binary key/value pairs. Sequence files are splittable (they contain sync points, so a reader can synchronize with a record boundary from any point in the file, such as the start of a split), which makes them well suited as a MapReduce input format; they also support compression and can store arbitrary types using a pluggable serialization framework.
    To use sequence file data as MapReduce input, use SequenceFileInputFormat. The key and value types are determined by the sequence file itself, so you only need to make sure your map input types match. Both complete examples at the end of this section use this format.
    Although the name does not suggest it, SequenceFileInputFormat can read MapFiles (sorted SequenceFiles) as well as SequenceFiles. If it encounters a directory while processing sequence files, SequenceFileInputFormat assumes it is reading a MapFile and uses its data file.

    2. The SequenceFileAsTextInputFormat class
    SequenceFileAsTextInputFormat is a variant of SequenceFileInputFormat that converts the sequence file's keys and values to Text objects. The conversion is performed by calling toString() on the keys and values. This format makes sequence files a suitable input type for Streaming.
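    A minimal sketch of a map-only job that uses this format to dump a sequence file as plain text (the class name SequenceFileDump and the argument handling are illustrative, not part of the original post):

    package com.zhen.mapreduce.sequenceToText;

    import java.io.IOException;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.input.SequenceFileAsTextInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

    // Sketch: dump any sequence file as plain text, whatever its stored types.
    public class SequenceFileDump {

        // Key and value both arrive as Text, produced by calling toString()
        // on the objects stored in the sequence file.
        static class IdentityMapper extends Mapper<Text, Text, Text, Text> {
            @Override
            protected void map(Text key, Text value, Context context)
                    throws IOException, InterruptedException {
                context.write(key, value);
            }
        }

        public static void main(String[] args) throws Exception {
            Job job = Job.getInstance(new Configuration());
            job.setJarByClass(SequenceFileDump.class);
            job.setInputFormatClass(SequenceFileAsTextInputFormat.class);
            job.setMapperClass(IdentityMapper.class);
            job.setNumReduceTasks(0); // map-only: just write the records out as text
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);
            FileInputFormat.addInputPath(job, new Path(args[0]));
            FileOutputFormat.setOutputPath(job, new Path(args[1]));
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        }
    }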

    3. The SequenceFileAsBinaryInputFormat class
    SequenceFileAsBinaryInputFormat is a variant of SequenceFileInputFormat that retrieves the sequence file's keys and values as opaque binary objects. They are wrapped as BytesWritable objects, so the application is free to interpret the underlying byte arrays however it likes. Combined with SequenceFile.Writer's appendRaw() method or SequenceFileAsBinaryOutputFormat, this provides a way to use arbitrary binary data types in MapReduce.
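    A hedged sketch of the mapper side (the class name RawBytesMapper and what is done with the bytes are illustrative; the driver would select the format with job.setInputFormatClass(SequenceFileAsBinaryInputFormat.class)):

    package com.zhen.mapreduce.sequenceToText;

    import java.io.IOException;

    import org.apache.hadoop.io.BytesWritable;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;

    // With SequenceFileAsBinaryInputFormat the mapper sees the raw stored
    // bytes; the application decides how to interpret them. This sketch
    // simply emits each record's payload length.
    public class RawBytesMapper extends Mapper<BytesWritable, BytesWritable, Text, IntWritable> {
        @Override
        protected void map(BytesWritable key, BytesWritable value, Context context)
                throws IOException, InterruptedException {
            // getBytes() returns the backing array, which can be longer than the
            // record itself, so always pair it with getLength().
            context.write(new Text("payload-bytes"), new IntWritable(value.getLength()));
        }
    }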

    Examples

    Writing a data file as a SequenceFile

    package com.zhen.mapreduce.sequenceToText;
    
    import java.io.IOException;
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.SequenceFile.CompressionType;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.io.VLongWritable;
    import org.apache.hadoop.io.compress.DefaultCodec;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
    
    /**
     * @author FengZhen
     * @date 2018-08-18
     * Writes its output as a SequenceFile.
     */
    public class TextToSequence {
    
        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();
            Job job = Job.getInstance(conf);
            job.setJarByClass(TextToSequence.class);
    
            job.setMapperClass(WCMapper.class);
            job.setReducerClass(WCReducer.class);
    
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(VLongWritable.class);
    
            // Write the output as a SequenceFile.
            job.setOutputFormatClass(SequenceFileOutputFormat.class);
    
            /**
             * Configure the SequenceFile output. Several combinations are
             * possible; pick one of the modes below and comment out the rest.
             */
    
            // Option 1: no compression
            SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.NONE);
    
            // Option 2: record compression, with a codec of your choice
            // (DefaultCodec, GzipCodec, ...)
            // SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.RECORD);
            // SequenceFileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
    
            // Option 3: block compression, with a codec of your choice
            // (DefaultCodec, GzipCodec, ...)
            // SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);
            // SequenceFileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
    
            FileInputFormat.addInputPaths(job, "hdfs://fz/user/hdfs/MapReduce/data/squenceFile/origin");
            SequenceFileOutputFormat.setOutputPath(job, new Path("hdfs://fz/user/hdfs/MapReduce/data/squenceFile/textToSequence/output"));
    
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        }
    
        // map: split each input line and emit (token, 1)
        public static class WCMapper extends Mapper<LongWritable, Text, Text, VLongWritable> {
            @Override
            public void map(LongWritable key, Text value, Context context)
                    throws IOException, InterruptedException {
                // split("") breaks the line into individual characters,
                // so this job counts character occurrences
                String[] split = value.toString().split("");
                for (String s : split) {
                    context.write(new Text(s), new VLongWritable(1L));
                }
            }
        }
    
        // reduce: sum the counts for each key
        public static class WCReducer extends Reducer<Text, VLongWritable, Text, VLongWritable> {
            @Override
            protected void reduce(Text key, Iterable<VLongWritable> values, Context context)
                    throws IOException, InterruptedException {
                long sum = 0;
                for (VLongWritable vl : values) {
                    sum += vl.get();
                }
                context.write(key, new VLongWritable(sum));
            }
        }
    }
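    Run against plain text under .../squenceFile/origin, this job writes its counts as part-r-* SequenceFiles under .../textToSequence/output, which is exactly the input path the next job reads from.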
    

    Reading the SequenceFile and saving it as text

    package com.zhen.mapreduce.sequenceToText;
    
    import java.io.IOException;
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.conf.Configured;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.io.VLongWritable;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
    import org.apache.hadoop.util.Tool;
    import org.apache.hadoop.util.ToolRunner;
    
    /**
     * @author FengZhen
     * @date 2018-08-18
     * Reads its input from a SequenceFile.
     */
    public class SequenceToText extends Configured implements Tool {
    
        // The map input key/value types mirror what the previous job wrote:
        // Text keys and VLongWritable counts.
        static class SequenceToTextMapper extends Mapper<Text, VLongWritable, Text, VLongWritable> {
            @Override
            protected void map(Text key, VLongWritable value, Context context)
                    throws IOException, InterruptedException {
                System.out.println(value.toString()); // debug: log each count read
                context.write(key, value);
            }
        }
    
        static class SequenceToTextReducer extends Reducer<Text, VLongWritable, Text, VLongWritable> {
            @Override
            protected void reduce(Text key, Iterable<VLongWritable> values, Context context)
                    throws IOException, InterruptedException {
                long sum = 0;
                // iterate the values once and sum them directly via get()
                for (VLongWritable vl : values) {
                    sum += vl.get();
                }
                context.write(key, new VLongWritable(sum));
            }
        }
    
        public int run(String[] args) throws Exception {
            // Use the configuration ToolRunner has already populated.
            Configuration conf = getConf();
            Job job = Job.getInstance(conf);
            job.setJobName("SequenceToText");
            job.setJarByClass(SequenceToText.class);
    
            job.setInputFormatClass(SequenceFileInputFormat.class);
            job.setOutputFormatClass(TextOutputFormat.class);
    
            job.setMapperClass(SequenceToTextMapper.class);
            job.setReducerClass(SequenceToTextReducer.class);
    
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(VLongWritable.class);
    
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(VLongWritable.class);
    
            SequenceFileInputFormat.setInputPaths(job, new Path(args[0]));
            TextOutputFormat.setOutputPath(job, new Path(args[1]));
    
            return job.waitForCompletion(true) ? 0 : 1;
        }
    
        public static void main(String[] args) throws Exception {
            String[] params = new String[]{
                "hdfs://fz/user/hdfs/MapReduce/data/squenceFile/textToSequence/output",
                "hdfs://fz/user/hdfs/MapReduce/data/squenceFile/sequenceToText/output"};
            int exitCode = ToolRunner.run(new SequenceToText(), params);
            System.out.println(exitCode);
            System.exit(exitCode);
        }
    }
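    Unlike the first driver, this one goes through Tool and ToolRunner: generic Hadoop options (for example -D key=value) are parsed for you and arrive via getConf(), which is why run() reads the configuration from there instead of constructing a fresh Configuration.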
    

  • Original post: https://www.cnblogs.com/EnzoDin/p/9520702.html