• Reduce-side join sample


    The sample files are the same as in the sample join analysis example.

    The previous example used a map-side join; this time we use a reduce-side join.

    Write a different mapper for each source type to handle its own file; every mapper outputs studentno as the key, and the value is the remaining fields plus a tag naming the source type.

    Then use MultipleInputs to register a different mapper for each input path, as sketched below.
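
    A minimal sketch of that MultipleInputs route (hypothetical throughout: the score directory and the two mapper class names are assumed, and the code further below deliberately takes a different approach):

    // requires org.apache.hadoop.mapreduce.lib.input.MultipleInputs
    // and org.apache.hadoop.mapreduce.lib.input.TextInputFormat
    MultipleInputs.addInputPath(job, new Path("hdfs://namenode:9000/user/hadoop/student/"),
            TextInputFormat.class, StudentFileMapper.class); // hypothetical mapper
    MultipleInputs.addInputPath(job, new Path("hdfs://namenode:9000/user/hadoop/score/"),
            TextInputFormat.class, ScoreFileMapper.class);   // hypothetical mapper
    // each path gets its own mapper, so no single job.setMapperClass(...) call is needed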

    On the reduce side, the student record and the exam scores that share the same studentno land in the same reduce call, and the values carry the type tags, so we pull the two kinds of records apart and take their Cartesian product.
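
    For a concrete picture (all rows here are hypothetical): if the student file holds "001,Tom" and the score file holds "001,90" and "001,85", the reducer for key 001 receives the tagged values student,Tom / score,90 / score,85 and writes out (key and value are tab-separated by the default TextOutputFormat):

    001    Tom,90
    001    Tom,85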

    In the sample code below I did not use MultipleInputs; instead I tweaked TextInputFormat so that it returns the file name together with the current line.

    In the mapper I use the file name to tell the two files apart and emit each record with its own type tag.

    There is still plenty to optimize in the code below (for one, the reducer buffers every value for a key in memory, which will not scale to hot keys); I will update it later.

    package myexamples;
    
    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.List;
    
    import org.apache.commons.logging.Log;
    import org.apache.commons.logging.LogFactory;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FSDataInputStream;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.io.compress.CompressionCodec;
    import org.apache.hadoop.io.compress.CompressionCodecFactory;
    import org.apache.hadoop.mapreduce.InputSplit;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.JobContext;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.RecordReader;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.TaskAttemptContext;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.input.FileSplit;
    import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.util.GenericOptionsParser;
    import org.apache.hadoop.util.LineReader;
    
    public class reducejoin {
    
        public static class MyTextInputFormat extends FileInputFormat<Text, Text> {
    
            @Override
            public MyLineRecordReader createRecordReader(InputSplit split,
                    TaskAttemptContext context) {
                return new MyLineRecordReader();
            }
    
            @Override
            protected boolean isSplitable(JobContext context, Path file) {
                CompressionCodec codec = new CompressionCodecFactory(
                        context.getConfiguration()).getCodec(file);
                return codec == null;
            }
    
        }
    
        public static class MyLineRecordReader extends RecordReader<Text, Text> {
            private static final Log LOG = LogFactory
                    .getLog(MyLineRecordReader.class);
    
            private CompressionCodecFactory compressionCodecs = null;
            private long start;
            private long pos;
            private long end;
            private LineReader in;
            private int maxLineLength;
            private Text key = null;
            private Text value = null;
    
            Text filename = null;
    
            public void initialize(InputSplit genericSplit,
                    TaskAttemptContext context) throws IOException {
                FileSplit split = (FileSplit) genericSplit;
                Configuration job = context.getConfiguration();
                this.maxLineLength = job.getInt(
                        "mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
                start = split.getStart();
                end = start + split.getLength();
                final Path file = split.getPath();
                // remember the file name; it becomes the key of every record
                filename = new Text(file.getName());
                key = new Text(filename);
                compressionCodecs = new CompressionCodecFactory(job);
                final CompressionCodec codec = compressionCodecs.getCodec(file);
    
                // open the file and seek to the start of the split
                FileSystem fs = file.getFileSystem(job);
                FSDataInputStream fileIn = fs.open(split.getPath());
                boolean skipFirstLine = false;
                if (codec != null) {
                    in = new LineReader(codec.createInputStream(fileIn), job);
                    end = Long.MAX_VALUE;
                } else {
                    if (start != 0) {
                        skipFirstLine = true;
                        --start;
                        fileIn.seek(start);
                    }
                    in = new LineReader(fileIn, job);
                }
                if (skipFirstLine) { // skip first line and re-establish "start".
                    start += in.readLine(new Text(), 0,
                            (int) Math.min((long) Integer.MAX_VALUE, end - start));
                }
                this.pos = start;
            }
    
            public boolean nextKeyValue() throws IOException {
                if (key == null) {
                    // the key is cleared once the split is exhausted; restore
                    // it to the file name saved in initialize()
                    key = new Text(filename);
                }
    
                if (value == null) {
                    value = new Text();
                }
                int newSize = 0;
                while (pos < end) {
                    newSize = in.readLine(value, maxLineLength, Math.max(
                            (int) Math.min(Integer.MAX_VALUE, end - pos),
                            maxLineLength));
                    if (newSize == 0) {
                        break;
                    }
                    pos += newSize;
                    if (newSize < maxLineLength) {
                        break;
                    }
    
                    // line too long. try again
                    LOG.info("Skipped line of size " + newSize + " at pos "
                            + (pos - newSize));
                }
                if (newSize == 0) {
                    key = null;
                    value = null;
                    return false;
                } else {
                    return true;
                }
            }
    
            @Override
            public Text getCurrentKey() {
                return key;
            }
    
            @Override
            public Text getCurrentValue() {
                return value;
            }
    
            /**
             * Get the progress within the split
             */
            public float getProgress() {
                if (start == end) {
                    return 0.0f;
                } else {
                    return Math.min(1.0f, (pos - start) / (float) (end - start));
                }
            }
    
            public synchronized void close() throws IOException {
                if (in != null) {
                    in.close();
                }
            }
        }
    
        // The input key is the source file name (supplied by MyLineRecordReader);
        // the value is one line of that file.
        public static class studentMapper extends Mapper<Text, Text, Text, Text> {
            public void map(Text key, Text value, Context context)
                    throws IOException, InterruptedException {
                Text newvalue = null;
                // everything from the first comma on (the comma is kept, so the
                // tag and the payload come out comma-separated)
                String strv = value.toString().substring(
                        value.toString().indexOf(","));
                if (key.toString().contains("student")) // student file
                    newvalue = new Text("student" + strv);
                else // score file
                    newvalue = new Text("score" + strv);
                // studentno, everything before the first comma, is the join key
                Text newkey = new Text(value.toString().substring(0,
                        value.toString().indexOf(",")));
                context.write(newkey, newvalue);
            }
        }
    
        public static class studentReducer extends Reducer<Text, Text, Text, Text> {
            public void reduce(Text key, Iterable<Text> values, Context context)
                    throws IOException, InterruptedException {
                List<String> students = new ArrayList<String>();
                List<String> scores = new ArrayList<String>();
                // pull the tagged values apart into the two source lists;
                // "student," is 8 characters long and "score," is 6
                for (Text value : values)
                    if (value.toString().startsWith("student"))
                        students.add(value.toString().substring(8));
                    else
                        scores.add(value.toString().substring(6));
                // the Cartesian product of the two lists is the joined output
                for (String student : students)
                    for (String score : scores)
                        context.write(key, new Text(student + "," + score));
            }
        }
    
        public static void main(String[] args) throws Exception {
            // hard-coded test paths; they override whatever was passed on the command line
            args = "hdfs://namenode:9000/user/hadoop/student/ hdfs://namenode:9000/user/hadoop/reducejoinout"
                    .split(" ");
    
            Configuration conf = new Configuration();
            String[] otherArgs = new GenericOptionsParser(conf, args)
                    .getRemainingArgs();
            if (otherArgs.length != 2) {
            System.err.println("Usage: reducejoin <in> <out>");
                System.exit(2);
            }
    
        // author's helper: delete the output folder if it already exists
        myUtils.myUtils.DeleteFolder(conf, otherArgs[1]);
        // shrink the map-side sort buffer (in MB) for this small test
        conf.set("io.sort.mb", "10");
            Job job = new Job(conf, "reduce join");
            job.setInputFormatClass(MyTextInputFormat.class);
            // job.setOutputFormatClass(SequenceFileOutputFormat.class);
    
            job.setJarByClass(reducejoin.class);
            job.setMapperClass(studentMapper.class);
            job.setReducerClass(studentReducer.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);
            FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
            FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        }
    }
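
    Assuming the class is packaged into a jar (the jar name here is hypothetical), the job is launched the usual way; note that main() overwrites the real command-line arguments with the hard-coded paths:

    hadoop jar myexamples.jar myexamples.reducejoin \
        hdfs://namenode:9000/user/hadoop/student/ \
        hdfs://namenode:9000/user/hadoop/reducejoinout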
  • Original post: https://www.cnblogs.com/huaxiaoyao/p/4305895.html