• Hadoop 学习笔记之倒排索引


    开发工具:Eclipse

    目标:对下面的文档 phone_numbers 进行倒排索引,即以每行第二列的号码为键,汇总第一列中与之对应的所有号码:

    13599999999 10086
    13899999999 120
    13944444444 13800138000
    13722222222 13800138000
    18800000000 120
    13722222222 10086
    18944444444 10086

    代码:

     1 import java.io.IOException;
     2 import org.apache.hadoop.conf.Configured;
     3 import org.apache.hadoop.conf.Configuration;
     4 import org.apache.hadoop.fs.Path;
     5 import org.apache.hadoop.util.Tool;
     6 import org.apache.hadoop.util.ToolRunner;
     7 import org.apache.hadoop.io.*;
     8 import org.apache.hadoop.mapreduce.*;
     9 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    10 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    11 import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
    12 
    13 public class Test_1 extends Configured implements Tool 
    14 {
    15     enum Counter
    16     {
    17         LINESKIP, // error lines
    18     }
    19     
    20     public static class Map extends Mapper<LongWritable, Text, Text, Text>
    21     {
    22         public void map(LongWritable key, Text value, Context context)throws IOException, InterruptedException
    23         {
    24             String line = value.toString(); // read original data
    25             
    26             try
    27             {
    28                 // process data
    29                 String[] lineSplit = line.split(" ");
    30                 String anum = lineSplit[0];
    31                 String bnum = lineSplit[1];
    32                 
    33                 context.write(new Text(bnum), new Text(anum)); // map output
    34             }
    35             catch(java.lang.ArrayIndexOutOfBoundsException e)
    36             {
    37                 context.getCounter(Counter.LINESKIP).increment(1);
    38                 return;
    39             }
    40             
    41         }
    42     }
    43     public static class Reduce extends Reducer<Text, Text, Text, Text>
    44     {
    45         public void reduce(Text key, Iterable<Text>values, Context context)throws IOException, InterruptedException
    46         {
    47             String valueString;
    48             String out = "";
    49             
    50             for (Text value : values)
    51             {
    52                 valueString = value.toString();
    53                 out += valueString + "|";
    54             }
    55             
    56             context.write(key, new Text(out));    // reduce output
    57         }
    58     }
    59     public int run(String[] args)throws Exception
    60     {
    61         Configuration conf = getConf();
    62         
    63         Job job = new Job(conf, "Test_1");  // task name
    64         job.setJarByClass(Test_1.class);    // specified task
    65         
    66         FileInputFormat.addInputPath(job, new Path(args[0]));  // input path
    67         FileOutputFormat.setOutputPath(job, new Path(args[1])); // output path
    68         
    69         job.setMapperClass(Map.class);
    70         job.setReducerClass(Reduce.class);
    71         job.setOutputFormatClass(TextOutputFormat.class);
    72         job.setOutputKeyClass(Text.class);
    73         job.setOutputValueClass(Text.class);
    74         
    75         job.waitForCompletion(true);
    76         
    77         return job.isSuccessful() ? 0 : 1;
    78     }
    79     
    80     public static void main(String[] args)throws Exception
    81     {
    82         int res = ToolRunner.run(new Configuration(), new Test_1(), args);
    83         System.exit(res);
    84     }
    85 }

    运行结果:

     

  • 相关阅读:
    LOJ 6192 城市网络(树上倍增)
    SDOI2010代码拍卖会 (计数类DP)
    失控的未来交通工具 (LOJ 508,带权并查集,数论)
    线段树维护区间前k小
    POJ 1966 Cable TV Network (最大流最小割)
    网络流学习笔记
    最大子矩阵和
    POJ 1723 Soldiers (中位数)
    最大子矩阵求法详解
    CH0805 防线 (二分值域,前缀和,特殊性质)
  • 原文地址:https://www.cnblogs.com/90zeng/p/hadoop_sort.html
Copyright © 2020-2023  润新知