• A MapReduce template


    import java.io.IOException;
    import java.text.DateFormat;
    import java.text.SimpleDateFormat;
    import java.util.Date;
    
    
    
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.conf.Configured;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.*;
    import org.apache.hadoop.mapreduce.*;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
    import org.apache.hadoop.util.Tool;
    import org.apache.hadoop.util.ToolRunner;
     
    /**  
     * The map phase extracts the key; the framework automatically groups all values with the same key into one iterator, and the reduce phase then processes each key together with its merged iterator of values.
     */  
    public class Template extends Configured implements Tool {	
    	
    	/**  
    	 * Counter
    	 * Used to count various kinds of malformed records
    	 */  
    	enum Counter 
    	{
    		LINESKIP,	// lines that failed to parse
    	}
    	
    	/**  
    	 * MAP task
    	 */  
    	public static class Map extends Mapper<LongWritable, Text, Text, Text>	// input key (its concrete type depends on job.setInputFormatClass), input value, output key, output value
    	{
    		@Override
    		public void map ( LongWritable key, Text value, Context context ) throws IOException, InterruptedException 
    		{
    			String line = value.toString();				// read one line of source data
    			
    			try
    			{
    				// parse the record: two fields separated by a space
    				String [] lineSplit = line.split(" ");
    				String anum = lineSplit[0];
    				String bnum = lineSplit[1];
    				
    				context.write( new Text(bnum), new Text(anum) );	// emit with key and value swapped
    			}
    			catch ( java.lang.ArrayIndexOutOfBoundsException e )
    			{
    				context.getCounter(Counter.LINESKIP).increment(1);	// malformed line: bump the counter and skip it
    				return;
    			}
    		}
    	}
    
    
    	/**  
    	 * REDUCE task
    	 */ 
    	public static class Reduce extends Reducer<Text, Text, Text, Text> 
    	{
    		@Override
    		public void reduce ( Text key, Iterable<Text> values, Context context ) throws IOException, InterruptedException
    		{
    			StringBuilder out = new StringBuilder();
    			
    			for ( Text value : values )
    			{
    				out.append( value.toString() ).append( "|" );	// concatenate every value for this key, separated by "|"
    			}
    			
    			context.write( key, new Text( out.toString() ) );
    		}
    	}
    
    
    	@Override
    	public int run(String[] args) throws Exception 
    	{
    		Configuration conf = getConf();
    
    
    		Job job = new Job(conf, "Template");							// job name
    		job.setJarByClass(Template.class);								// the class that contains this job
    		
    		FileInputFormat.addInputPath( job, new Path(args[0]) );			// input path
    		FileOutputFormat.setOutputPath( job, new Path(args[1]) );		// output path
    		
    		job.setMapperClass( Map.class );								// use the Map class above as the map task
    		job.setReducerClass ( Reduce.class );							// use the Reduce class above; without this line the default (identity) reducer runs
    		job.setOutputFormatClass( TextOutputFormat.class );
    		job.setOutputKeyClass( Text.class );							// output key type
    		job.setOutputValueClass( Text.class );							// output value type
    		
    		job.waitForCompletion(true);
    		
    		// print a summary of the finished job
    		System.out.println( "Job name: " + job.getJobName() );
    		System.out.println( "Job successful: " + ( job.isSuccessful() ? "yes" : "no" ) );
    		System.out.println( "Input lines: " + job.getCounters().findCounter("org.apache.hadoop.mapred.Task$Counter", "MAP_INPUT_RECORDS").getValue() );
    		System.out.println( "Output lines: " + job.getCounters().findCounter("org.apache.hadoop.mapred.Task$Counter", "MAP_OUTPUT_RECORDS").getValue() );
    		System.out.println( "Skipped lines: " + job.getCounters().findCounter(Counter.LINESKIP).getValue() );
    		
    		return job.isSuccessful() ? 0 : 1;
    	}
    	
    	/**  
    	 * Print the usage instructions
    	 * Set up and launch the MapReduce job
    	 */  
    	public static void main(String[] args) throws Exception 
    	{
    		// check that the argument count is correct;
    		// if run without arguments, print the usage message
    		if ( args.length != 2 )
    		{
    			System.err.println("");
    			System.err.println("Usage: Template < input path > < output path > ");
    			System.err.println("Example: hadoop jar ~/Template.jar hdfs://localhost:9000/home/james/Template hdfs://localhost:9000/home/james/output");
    			System.err.println("Counter:");
    			System.err.println(" "+"LINESKIP"+" "+"Lines which are too short");
    			System.exit(-1);
    		}
    		
    		// record the start time
    		DateFormat formatter = new SimpleDateFormat( "yyyy-MM-dd HH:mm:ss" );
    		Date start = new Date();
    		
    		// run the job
    		int res = ToolRunner.run(new Configuration(), new Template(), args);
    		
    		// print the elapsed time
    		Date end = new Date();
    		float time = (float) (( end.getTime() - start.getTime() ) / 60000.0);
    		System.out.println( "Job started: " + formatter.format(start) );
    		System.out.println( "Job finished: " + formatter.format(end) );
    		System.out.println( "Elapsed time: " + String.valueOf( time ) + " minutes" );
    		
    		System.exit(res);
    	}
    }


  • Original post: https://www.cnblogs.com/jzssuanfa/p/6788857.html