MapReduce多文件输出的两种方法

MapReduce多文件输出的两种方法

MapReduce多文件输出的两种方法

package duogemap;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.NullWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapred.*;

import org.apache.hadoop.mapred.lib.MultipleOutputs;

import org.apache.hadoop.util.GenericOptionsParser;

public class OldMulOutput {

public static class MapClass extends MapReduceBase implements Mapper<LongWritable, Text, NullWritable, Text>{

private MultipleOutputs mos;

private OutputCollector<NullWritable, Text> collector;

public void Configured(JobConf conf){

mos=new MultipleOutputs(conf);

}

public void map(LongWritable key, Text value, OutputCollector<NullWritable, Text> output,Reporter reporter)

throws IOException{

String[] arr=value.toString().split(",", -1);

String chrono=arr[1]+","+arr[2];

String geo=arr[4]+","+arr[5];

collector=mos.getCollector("chrono", reporter);

collector.collect(NullWritable.get(),new Text(chrono));

collector=mos.getCollector("geo", reporter);

collector.collect(NullWritable.get(),new Text(geo));

}

public void close() throws IOException{

mos.close();

}

public static void main(String[] args) throws IOException {

Configuration conf=new Configuration();

String[] remainingArgs=new GenericOptionsParser(conf, args).getRemainingArgs();

if (remainingArgs.length !=2) {

System.err.println("Error!");

System.exit(1);

}

JobConf job=new JobConf(conf,OldMulOutput.class);

Path in=new Path(remainingArgs[0]);

Path out=new Path(remainingArgs[1]);

FileInputFormat.setInputPaths(job, in);

FileOutputFormat.setOutputPath(job, out);

job.setJobName("Multifile");

job.setMapperClass(MapClass.class);

job.setInputFormat(TextInputFormat.class);

job.setOutputKeyClass(NullWritable.class);

job.setOutputValueClass(Text.class);

job.setNumReduceTasks(0);

MultipleOutputs.addNamedOutput(job, "chrono", TextOutputFormat.class, NullWritable.class, Text.class);

MultipleOutputs.addNamedOutput(job, "geo", TextOutputFormat.class, NullWritable.class, Text.class);

JobClient.runJob(job);

}

}

}

package duogemap;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.NullWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.Mapper;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;

import org.apache.hadoop.util.GenericOptionsParser;

import duogemap.OldMulOutput.MapClass;

/**
 * Map-only job written against the new {@code org.apache.hadoop.mapreduce} API that routes each
 * input line to an output path derived from the line itself, using {@link MultipleOutputs}.
 *
 * <p>Usage: two arguments remain after generic-option parsing: the input path and the output path.
 */
public class MulOutput {

    /** Mapper that writes every line unchanged under a base output path computed from field 4. */
    public static class MapClass extends Mapper<LongWritable, Text, NullWritable, Text> {

        /** Index of the comma-separated field the output path is derived from. */
        private static final int COUNTRY_FIELD = 4;

        private MultipleOutputs<NullWritable, Text> mos;

        /**
         * Creates the {@link MultipleOutputs} helper bound to this task's context.
         *
         * @param context the task context supplied by the framework
         */
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            super.setup(context);
            mos = new MultipleOutputs<NullWritable, Text>(context);
        }

        /**
         * Writes the line, with a null key, under the base output path returned by
         * {@link #generateFileName(Text)}. Lines for which no path can be derived are
         * counted and skipped.
         *
         * @param key     input key supplied by the input format (unused)
         * @param value   one input line
         * @param context the task context, used here only for the skip counter
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String baseOutputPath = generateFileName(value);
            if (baseOutputPath == null) {
                // Blank or short line: skip it rather than fail the task with an
                // index-out-of-bounds exception.
                context.getCounter("MulOutput", "MALFORMED_RECORDS").increment(1);
                return;
            }
            mos.write(NullWritable.get(), value, baseOutputPath);
        }

        /**
         * Derives the base output path from characters 1 and 2 of field 4, followed by
         * {@code "/"}. Character 0 is skipped (presumably an opening quote; confirm against the
         * input data).
         *
         * @param value one input line
         * @return the base output path, or {@code null} when the line has fewer than five
         *         fields or field 4 is shorter than three characters
         */
        private String generateFileName(Text value) {
            // limit -1 keeps trailing empty fields so indexes stay stable
            String[] split = value.toString().split(",", -1);
            if (split.length <= COUNTRY_FIELD || split[COUNTRY_FIELD].length() < 3) {
                return null;
            }
            String country = split[COUNTRY_FIELD].substring(1, 3);
            return country + "/";
        }

        /**
         * Closes the {@link MultipleOutputs} helper and the writers it opened.
         *
         * @param context the task context supplied by the framework
         */
        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            super.cleanup(context);
            mos.close();
        }

        /**
         * Original location of the driver; delegates to {@link MulOutput#main(String[])}
         * so code that invoked it here keeps working.
         *
         * @param args command-line arguments
         */
        public static void main(String[] args)
                throws IOException, ClassNotFoundException, InterruptedException {
            MulOutput.main(args);
        }
    }

    /**
     * Configures and runs the job, exiting with status 0 on success and 1 on failure.
     * Declared on the outer class so that {@code MulOutput} itself can be used as the
     * program's main class.
     *
     * @param args generic Hadoop options followed by the input path and the output path
     */
    public static void main(String[] args)
            throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();

        // Parse generic options BEFORE creating the Job: Job.getInstance copies the
        // Configuration, so options applied to conf afterwards would not reach the job.
        String[] remainingArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (remainingArgs.length != 2) {
            System.err.println("Error!");
            System.exit(1);
        }

        Job job = Job.getInstance(conf, "Muloutput");
        // Tell Hadoop which jar contains the mapper so task nodes can load it.
        job.setJarByClass(MulOutput.class);

        Path in = new Path(remainingArgs[0]);
        Path out = new Path(remainingArgs[1]);
        FileInputFormat.setInputPaths(job, in);
        FileOutputFormat.setOutputPath(job, out);

        job.setMapperClass(MapClass.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);
        job.setNumReduceTasks(0); // map-only job

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
相关阅读:
[BZOJ1015] [JSOI2008]星球大战starwar
[BZOJ2321,LuoguP1861]星(之)器
 Google Search Operators
Python blockchain
CCAE词频表（转）
python小技巧（转）
Python著名的lib和开发框架（均为转载）
Yarn取代job/task tracker
hadoop 2.73‘s four xml
HDFS NN,SNN,BN和HA
原文地址：https://www.cnblogs.com/liquan-anran/p/6253087.html