Here is a practical problem I wanted to solve with Hadoop.
There is a student score table containing student names and scores, in the following format:
zs 89 zs 100 ls 98 ls 100 zs 20 ww 89 ww 67 ls 30 ww 20
Each student has several subjects, each with its own score.
The task is to compute the average score for every student.
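With the sample data above and integer division (which the code below uses), the expected per-student averages are:

zs  (89 + 100 + 20) / 3 = 69
ls  (98 + 100 + 30) / 3 = 76
ww  (89 + 67 + 20) / 3 = 58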
Next, upload this student.txt to the Hadoop file system (HDFS):
./bin/hadoop fs -put ~/file/student.txt input/student.txt
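To double-check the upload, the file can be listed and printed back from HDFS:

./bin/hadoop fs -ls input
./bin/hadoop fs -cat input/student.txt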
The code is as follows:
package com.picc.test;

import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/***
 * AvgScore computes each student's average score.
 * The Tool interface is implemented so that ToolRunner can hand the job an
 * initialized Hadoop Configuration instance.
 */
public class AvgScore implements Tool {

    public static final Logger log = LoggerFactory.getLogger(AvgScore.class);
    Configuration configuration;

    // Written against the Hadoop 0.20.2 (new) MapReduce API.
    public static class MyMap extends Mapper<Object, Text, Text, IntWritable> {
        @Override
        protected void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            String stuInfo = value.toString(); // convert the input text to a String
            System.out.println("studentInfo:" + stuInfo);
            log.info("MapStudentInfo:" + stuInfo);
            // First split the input into lines
            StringTokenizer tokenizerArticle = new StringTokenizer(stuInfo, "\n");
            // Then handle each line separately
            while (tokenizerArticle.hasMoreTokens()) {
                // Each line is split on whitespace
                StringTokenizer tokenizer = new StringTokenizer(tokenizerArticle.nextToken());
                String name = tokenizer.nextToken();  // student name
                String score = tokenizer.nextToken(); // student score
                Text stu = new Text(name);
                int intscore = Integer.parseInt(score);
                log.info("MapStu:" + stu.toString() + " " + intscore);
                context.write(stu, new IntWritable(intscore)); // emit name -> score
            }
        }
    }

    public static class MyReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            int count = 0;
            Iterator<IntWritable> iterator = values.iterator();
            while (iterator.hasNext()) {
                sum += iterator.next().get(); // total score
                count++;                      // number of subjects
            }
            int avg = sum / count; // integer division, matching the IntWritable output
            context.write(key, new IntWritable(avg)); // emit name -> average
        }
    }

    public int run(String[] args) throws Exception {
        Job job = new Job(getConf());
        job.setJarByClass(AvgScore.class);
        job.setJobName("avgscore");
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        job.setMapperClass(MyMap.class);
        // Note: the reducer is not reused as a combiner, because an average of
        // partial averages is not, in general, the overall average.
        job.setReducerClass(MyReduce.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));   // input path
        FileOutputFormat.setOutputPath(job, new Path(args[1])); // output path
        boolean success = job.waitForCompletion(true);
        return success ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        // The input and output paths are passed as program arguments (e.g. from Eclipse).
        int ret = ToolRunner.run(new AvgScore(), args);
        System.exit(ret);
    }

    @Override
    public Configuration getConf() {
        return configuration;
    }

    @Override
    public void setConf(Configuration conf) {
        // Keep the configuration handed in by ToolRunner instead of replacing it.
        this.configuration = conf;
    }
}

When I set the input and output parameters in Eclipse and ran it there, it threw an exception, so I exported the code above as avgscore.jar.
I placed avgscore.jar in the hadoop-0.20.2/ directory.
Then I ran the command: ./bin/hadoop jar avgscore.jar com.picc.test.AvgScore input/student.txt out1
Result (screenshot):
The computed averages are correct.
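The output can also be read straight from HDFS; with TextOutputFormat and a single reducer the result usually lands in a part file such as part-r-00000 (the exact file name can vary by version):

./bin/hadoop fs -cat out1/part-r-00000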
The following is a further analysis of the algorithm above:
package com.picc.test;

import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class AvgScore implements Tool {

    public static final Logger log = LoggerFactory.getLogger(AvgScore.class);
    Configuration configuration;

    public static class MyMap extends Mapper<Object, Text, Text, IntWritable> {
        Configuration config = HBaseConfiguration.create(); // HBase client configuration
        // Counter used to build unique row keys; note that this is only reliable
        // when a single mapper processes the whole input.
        private static IntWritable linenum = new IntWritable(1);

        @Override
        protected void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            String stuInfo = value.toString();
            System.out.println("studentInfo:" + stuInfo);
            log.info("MapStudentInfo:" + stuInfo);
            StringTokenizer tokenizerArticle = new StringTokenizer(stuInfo, "\n");
            // The table was created beforehand in the HBase shell: create 'stu','name','score'.
            // Opening it once per map() call (rather than once per record inside the loop)
            // keeps the original behaviour while avoiding a new HTable for every line.
            HTable table = new HTable(config, "stu");
            while (tokenizerArticle.hasMoreTokens()) {
                StringTokenizer tokenizer = new StringTokenizer(tokenizerArticle.nextToken());
                String name = tokenizer.nextToken();  // student name, e.g. zs
                String score = tokenizer.nextToken(); // student score, e.g. 90
                Text stu = new Text(name);
                int intscore = Integer.parseInt(score);
                log.info("MapStu:" + stu.toString() + " " + intscore);
                context.write(stu, new IntWritable(intscore));

                // Also write the raw record into HBase as two rows, name<N> and score<N>.
                byte[] row1 = Bytes.toBytes("name" + linenum);
                Put p1 = new Put(row1);
                byte[] databytes = Bytes.toBytes("name");
                p1.add(databytes, Bytes.toBytes("1"), Bytes.toBytes(name));
                table.put(p1); // equivalent to: put 'stu','name1','name:1','zs'
                table.flushCommits();

                byte[] row2 = Bytes.toBytes("score" + linenum);
                Put p2 = new Put(row2);
                byte[] databytes2 = Bytes.toBytes("score");
                p2.add(databytes2, Bytes.toBytes("1"), Bytes.toBytes(score));
                table.put(p2); // equivalent to: put 'stu','score1','score:1','90'
                table.flushCommits();

                linenum = new IntWritable(linenum.get() + 1); // advance the row-key counter
            }
        }
    }

    public static class MyReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            int count = 0;
            Iterator<IntWritable> iterator = values.iterator();
            while (iterator.hasNext()) {
                sum += iterator.next().get();
                count++;
            }
            int avg = sum / count;
            context.write(key, new IntWritable(avg));
        }
    }

    public int run(String[] args) throws Exception {
        Job job = new Job(getConf());
        job.setJarByClass(AvgScore.class);
        job.setJobName("avgscore");
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        job.setMapperClass(MyMap.class);
        // As in the first version, no combiner is set: averaging partial averages
        // would give a wrong result.
        job.setReducerClass(MyReduce.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        boolean success = job.waitForCompletion(true);
        return success ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int ret = ToolRunner.run(new AvgScore(), args);
        System.exit(ret);
    }

    @Override
    public Configuration getConf() {
        return configuration;
    }

    @Override
    public void setConf(Configuration conf) {
        this.configuration = conf; // keep the configuration provided by ToolRunner
    }
}
This code is the result of debugging and extending the previous version: every record handled in the map step is also written into the HBase database. When a MapReduce job works with HBase data like this, the HBase jars need to be copied into Hadoop's lib directory. The table setup is shown below.
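For reference, the 'stu' table that the mapper writes to is assumed to have been created beforehand in the HBase shell (this matches the comment in the code), and the inserted rows can be checked after the job finishes:

create 'stu','name','score'
scan 'stu'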
The processing result is shown in the screenshot (HBase view):
Note that in HBase, row keys must be unique; if two puts use the same row key, the later one overwrites the earlier value. That is why the keys have to be kept unique (here the linenum counter is appended to "name" and "score" for that purpose).
name1 and score1 belong to one logical record: together those two rows hold one student's name and score. Unlike a relational database, HBase stores values by column (family:qualifier), so the data model takes a shift in thinking.
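As a minimal sketch of what reading one student back looks like with the same old HBase client API the mapper uses (the ReadStu class is hypothetical and only for illustration; the name1/score1 row keys and the "1" qualifier follow from the code above):

package com.picc.test;

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.util.Bytes;

// Hypothetical helper, not part of the original job: reads back the first
// student written by the mapper (row keys name1 / score1, qualifier "1").
public class ReadStu {
    public static void main(String[] args) throws IOException {
        Configuration config = HBaseConfiguration.create();
        HTable table = new HTable(config, "stu");

        Result nameRow = table.get(new Get(Bytes.toBytes("name1")));
        String name = Bytes.toString(nameRow.getValue(Bytes.toBytes("name"), Bytes.toBytes("1")));

        Result scoreRow = table.get(new Get(Bytes.toBytes("score1")));
        String score = Bytes.toString(scoreRow.getValue(Bytes.toBytes("score"), Bytes.toBytes("1")));

        System.out.println(name + " " + score); // expected: zs 89, given the sample input
    }
}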