mapreduce实现"浏览该商品的人大多数还浏览了"经典应用

mapreduce实现"浏览该商品的人大多数还浏览了"经典应用
转自：http://blog.csdn.net/u011750989/article/details/12004065

输入:

日期    ...cookie id.        ...商品id..

xx            xx                        xx

输出:

商品id        商品id列表(按优先级排序,用逗号分隔)

xx                   xx

比如:

id1              id3,id0,id4,id2

id2             id0,id5

整个计算过程分为4步

1、提取原始日志日期,cookie id,商品id信息，按天计算,最后输出数据格式

商品id-0 商品id-1

xx           xx

这一步做了一次优化：保证商品id-0一定比商品id-1小，这样每对商品只需存储一次，从而减少存储；到最后汇总数据时再转置（把 a b 再输出为 b a）即可

reduce做局部排序及排重

2、基于上次的结果做汇总,按天计算

商品id-0 商品id-1 关联值(关联值即同时访问这两个商品的用户数)

xx            xx                xx

3、汇总最近三个月数据,同时考虑时间衰减,时间越久关联值的贡献越低,最后输出两两商品的关联值（包括转置后)

4、行列转换，生成最后要的推荐结果数据,按关联值排序生成

第一个MR
[java] view plain copy
1. import java.io.IOException;
2. import java.util.ArrayList;
3. import org.apache.hadoop.conf.Configuration;
4. import org.apache.hadoop.fs.FileSystem;
5. import org.apache.hadoop.fs.Path;
6. import org.apache.hadoop.io.LongWritable;
7. import org.apache.hadoop.io.Text;
8. import org.apache.hadoop.io.WritableComparable;
9. import org.apache.hadoop.io.WritableComparator;
10. import org.apache.hadoop.mapreduce.Job;
11. import org.apache.hadoop.mapreduce.Mapper;
12. import org.apache.hadoop.mapreduce.Partitioner;
13. import org.apache.hadoop.mapreduce.Reducer;
14. import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
15. import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
16. import org.apache.hadoop.util.GenericOptionsParser;
17. import org.apache.log4j.Logger;
20. /*
21. * 输入:原始数据,会有重复
22. *日期 cookie 楼盘id
23. *
24. * 输出:
25. * 日期楼盘id1 楼盘id2 //楼盘id1一定小于楼盘id2 ,按日期 cookie进行分组
26. *
27. */
29. public class HouseMergeAndSplit {
31. public static class Partitioner1 extends Partitioner<TextPair, Text> {
32. @Override
33. public int getPartition(TextPair key, Text value, int numParititon) {
34. return Math.abs((new Text(key.getFirst().toString()+key.getSecond().toString())).hashCode() * 127) % numParititon;
36. }
37. }
38. public static class Comp1 extends WritableComparator {
39. public Comp1() {
40. super(TextPair.class, true);
41. }
42. @SuppressWarnings("unchecked")
43. public int compare(WritableComparable a, WritableComparable b) {
44. TextPair t1 = (TextPair) a;
45. TextPair t2 = (TextPair) b;
46. int comp= t1.getFirst().compareTo(t2.getFirst());
47. if (comp!=0)
48. return comp;
49. return t1.getSecond().compareTo(t2.getSecond());
50. }
51. }
52. public static class TokenizerMapper
53. extends Mapper<LongWritable, Text, TextPair, Text>{
54. Text val=new Text("test");
55. public void map(LongWritable key, Text value, Context context
56. ) throws IOException, InterruptedException {
57. String s[]=value.toString().split("01");
58. TextPair tp=new TextPair(s[0],s[1],s[4]+s[3]); //thedate cookie city+houseid
59. context.write(tp, val);
60. }
61. }
63. public static class IntSumReducer
64. extends Reducer<TextPair,Text,Text,Text> {
65. private static String comparedColumn[] = new String[3];
66. ArrayList<String> houselist= new ArrayList<String>();
67. private static Text keyv = new Text();
69. private static Text valuev = new Text();
70. static Logger logger = Logger.getLogger(HouseMergeAndSplit.class.getName());
72. public void reduce(TextPair key, Iterable<Text> values,
73. Context context
74. ) throws IOException, InterruptedException {
76. houselist.clear();
77. String thedate=key.getFirst().toString();
78. String cookie=key.getSecond().toString();
80. for (int i=0;i<3;i++)
81. comparedColumn[i]="";
83. //first+second为分组键,每次不同重新调用reduce函数
84. for (Text val:values)
85. {
87. if (thedate.equals(comparedColumn[0]) && cookie.equals(comparedColumn[1])&& !key.getThree().toString().equals(comparedColumn[2]))
88. {
89. // context.write(new Text(key.getFirst()+" "+key.getSecond().toString()), new Text(key.getThree().toString()+" first"+ " "+comparedColumn[0]+" "+comparedColumn[1]+" "+comparedColumn[2]));
90. houselist.add(key.getThree().toString());
92. comparedColumn[0]=key.getFirst().toString();
93. comparedColumn[1]=key.getSecond().toString();
94. comparedColumn[2]=key.getThree().toString();
96. }
98. if (!thedate.equals(comparedColumn[0])||!cookie.equals(comparedColumn[1]))
100. {
102. // context.write(new Text(key.getFirst()+" "+key.getSecond().toString()), new Text(key.getThree().toString()+" second"+ " "+comparedColumn[0]+" "+comparedColumn[1]+" "+comparedColumn[2]));
103. houselist.add(key.getThree().toString());
104. comparedColumn[0]=key.getFirst().toString();
105. comparedColumn[1]=key.getSecond().toString();
106. comparedColumn[2]=key.getThree().toString();
108. }
112. }
116. keyv.set(comparedColumn[0]); //日期
117. //valuev.set(houselist.toString());
118. //logger.info(houselist.toString());
119. //context.write(keyv,valuev);
122. for (int i=0;i<houselist.size()-1;i++)
123. {
124. for (int j=i+1;j<houselist.size();j++)
125. { valuev.set(houselist.get(i)+" "+houselist.get(j)); //关联的楼盘
126. context.write(keyv,valuev);
127. }
128. }
130. }
131. }
133. public static void main(String[] args) throws Exception {
134. Configuration conf = new Configuration();
135. String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
136. if (otherArgs.length != 2) {
137. System.err.println("Usage: wordcount <in> <out>");
138. System.exit(2);
139. }
141. FileSystem fstm = FileSystem.get(conf);
142. Path outDir = new Path(otherArgs[1]);
143. fstm.delete(outDir, true);
145. conf.set("mapred.textoutputformat.separator", " "); //reduce输出时key value中间的分隔符
146. Job job = new Job(conf, "HouseMergeAndSplit");
147. job.setNumReduceTasks(4);
148. job.setJarByClass(HouseMergeAndSplit.class);
149. job.setMapperClass(TokenizerMapper.class);
151. job.setMapOutputKeyClass(TextPair.class);
152. job.setMapOutputValueClass(Text.class);
153. // 设置partition
154. job.setPartitionerClass(Partitioner1.class);
155. // 在分区之后按照指定的条件分组
156. job.setGroupingComparatorClass(Comp1.class);
157. // 设置reduce
158. // 设置reduce的输出
159. job.setReducerClass(IntSumReducer.class);
160. job.setOutputKeyClass(Text.class);
161. job.setOutputValueClass(Text.class);
162. //job.setNumReduceTasks(18);
163. FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
164. FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
165. System.exit(job.waitForCompletion(true) ? 0 : 1);
166. }
167. }
TextPair
[java] view plain copy
1. import java.io.DataInput;
2. import java.io.DataOutput;
3. import java.io.IOException;
5. import org.apache.hadoop.io.Text;
6. import org.apache.hadoop.io.WritableComparable;
8. public class TextPair implements WritableComparable<TextPair> {
9. private Text first;
10. private Text second;
11. private Text three;
12. public TextPair() {
13. set(new Text(), new Text(),new Text());
14. }
15. public TextPair(String first, String second,String three) {
16. set(new Text(first), new Text(second),new Text(three));
17. }
18. public TextPair(Text first, Text second,Text Three) {
19. set(first, second,three);
20. }
21. public void set(Text first, Text second,Text three) {
22. this.first = first;
23. this.second = second;
24. this.three=three;
25. }
26. public Text getFirst() {
27. return first;
28. }
29. public Text getSecond() {
30. return second;
31. }
32. public Text getThree() {
33. return three;
34. }
35. public void write(DataOutput out) throws IOException {
36. first.write(out);
37. second.write(out);
38. three.write(out);
39. }
40. public void readFields(DataInput in) throws IOException {
41. first.readFields(in);
42. second.readFields(in);
43. three.readFields(in);
44. }
45. public int compareTo(TextPair tp) {
46. int cmp = first.compareTo(tp.first);
47. if (cmp != 0) {
48. return cmp;
49. }
50. cmp= second.compareTo(tp.second);
51. if (cmp != 0) {
52. return cmp;
53. }
54. return three.compareTo(tp.three);
55. }
56. }
TextPairSecond
[java] view plain copy
1. import java.io.DataInput;
2. import java.io.DataOutput;
3. import java.io.IOException;
5. import org.apache.hadoop.io.FloatWritable;
6. import org.apache.hadoop.io.Text;
7. import org.apache.hadoop.io.WritableComparable;
9. public class TextPairSecond implements WritableComparable<TextPairSecond> {
10. private Text first;
11. private FloatWritable second;
12. public TextPairSecond() {
13. set(new Text(), new FloatWritable());
14. }
15. public TextPairSecond(String first, float second) {
16. set(new Text(first), new FloatWritable(second));
17. }
18. public TextPairSecond(Text first, FloatWritable second) {
19. set(first, second);
20. }
21. public void set(Text first, FloatWritable second) {
22. this.first = first;
23. this.second = second;
24. }
25. public Text getFirst() {
26. return first;
27. }
28. public FloatWritable getSecond() {
29. return second;
30. }
31. public void write(DataOutput out) throws IOException {
32. first.write(out);
33. second.write(out);
34. }
35. public void readFields(DataInput in) throws IOException {
36. first.readFields(in);
37. second.readFields(in);
38. }
39. public int compareTo(TextPairSecond tp) {
40. int cmp = first.compareTo(tp.first);
41. if (cmp != 0) {
42. return cmp;
43. }
44. return second.compareTo(tp.second);
45. }
47. }
第二个MR
[java] view plain copy
1. import java.io.IOException;
2. import java.text.SimpleDateFormat;
3. import java.util.ArrayList;
4. import java.util.Date;
6. import org.apache.hadoop.conf.Configuration;
7. import org.apache.hadoop.fs.FileSystem;
8. import org.apache.hadoop.fs.Path;
9. import org.apache.hadoop.io.IntWritable;
10. import org.apache.hadoop.io.LongWritable;
11. import org.apache.hadoop.io.NullWritable;
12. import org.apache.hadoop.io.Text;
13. import org.apache.hadoop.io.WritableComparable;
14. import org.apache.hadoop.io.WritableComparator;
15. import org.apache.hadoop.mapred.OutputCollector;
16. import org.apache.hadoop.mapreduce.Job;
17. import org.apache.hadoop.mapreduce.Mapper;
18. import org.apache.hadoop.mapreduce.Partitioner;
19. import org.apache.hadoop.mapreduce.Reducer;
21. import org.apache.hadoop.mapreduce.Mapper.Context;
22. import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
23. import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
24. import org.apache.hadoop.util.GenericOptionsParser;
25. import org.apache.log4j.Logger;
28. /*
29. * 统计楼盘之间共同出现的次数
30. * 输入:
31. * 日期楼盘1 楼盘2
32. *
33. * 输出：
34. * 日期楼盘1 楼盘2 共同出现的次数
35. *
36. */
38. public class HouseCount {
41. public static class TokenizerMapper
42. extends Mapper<LongWritable, Text, Text, IntWritable>{
45. IntWritable iw=new IntWritable(1);
46. public void map(LongWritable key, Text value, Context context
47. ) throws IOException, InterruptedException {
50. context.write(value, iw);
51. }
52. }
54. public static class IntSumReducer
55. extends Reducer<Text,IntWritable,Text,IntWritable> {
57. IntWritable result=new IntWritable();
58. public void reduce(Text key, Iterable<IntWritable> values,
59. Context context
60. ) throws IOException, InterruptedException {
62. int sum=0;
63. for (IntWritable iw:values)
64. {
65. sum+=iw.get();
66. }
67. result.set(sum);
68. context.write(key, result) ;
70. }
71. }
73. public static void main(String[] args) throws Exception {
74. Configuration conf = new Configuration();
75. String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
76. if (otherArgs.length != 2) {
77. System.err.println("Usage: wordcount <in> <out>");
78. System.exit(2);
79. }
81. FileSystem fstm = FileSystem.get(conf);
82. Path outDir = new Path(otherArgs[1]);
83. fstm.delete(outDir, true);
85. conf.set("mapred.textoutputformat.separator", " "); //reduce输出时key value中间的分隔符
86. Job job = new Job(conf, "HouseCount");
87. job.setNumReduceTasks(2);
88. job.setJarByClass(HouseCount.class);
89. job.setMapperClass(TokenizerMapper.class);
91. job.setMapOutputKeyClass(Text.class);
92. job.setMapOutputValueClass(IntWritable.class);
94. // 设置reduce
95. // 设置reduce的输出
96. job.setReducerClass(IntSumReducer.class);
97. job.setOutputKeyClass(Text.class);
98. job.setOutputValueClass(IntWritable.class);
99. //job.setNumReduceTasks(18);
100. FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
101. FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
102. System.exit(job.waitForCompletion(true) ? 0 : 1);
103. }
104. }
第三个MR
[java] view plain copy
1. import java.io.IOException;
2. import java.text.ParseException;
3. import java.text.SimpleDateFormat;
4. import java.util.ArrayList;
5. import java.util.Calendar;
6. import java.util.Date;
8. import org.apache.hadoop.conf.Configuration;
9. import org.apache.hadoop.fs.FileSystem;
10. import org.apache.hadoop.fs.Path;
11. import org.apache.hadoop.io.FloatWritable;
12. import org.apache.hadoop.io.IntWritable;
13. import org.apache.hadoop.io.LongWritable;
14. import org.apache.hadoop.io.NullWritable;
15. import org.apache.hadoop.io.Text;
16. import org.apache.hadoop.io.WritableComparable;
17. import org.apache.hadoop.io.WritableComparator;
18. import org.apache.hadoop.mapred.OutputCollector;
19. import org.apache.hadoop.mapreduce.Job;
20. import org.apache.hadoop.mapreduce.Mapper;
21. import org.apache.hadoop.mapreduce.Partitioner;
22. import org.apache.hadoop.mapreduce.Reducer;
24. import org.apache.hadoop.mapreduce.Mapper.Context;
25. import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
26. import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
27. import org.apache.hadoop.util.GenericOptionsParser;
28. import org.apache.log4j.Logger;
31. /*
32. * 汇总近三个月统计楼盘之间共同出现的次数,考虑衰减系数，并最后a b 转成 b a输出一次
33. * 输入:
34. * 日期楼盘1 楼盘2 共同出现的次数
35. *
36. * 输出
37. * 楼盘1 楼盘2 共同出现的次数（考虑了衰减系数，每天的衰减系数不一样)
38. *
39. */
41. public class HouseCountHz {
44. public static class HouseCountHzMapper
45. extends Mapper<LongWritable, Text, Text, FloatWritable>{
47. Text keyv=new Text();
49. FloatWritable valuev=new FloatWritable();
50. public void map(LongWritable key, Text value, Context context
51. ) throws IOException, InterruptedException {
53. String[] s=value.toString().split(" ");
54. keyv.set(s[1]+" "+s[2]);//楼盘1,楼盘2
55. Calendar date1=Calendar.getInstance();
56. Calendar d2=Calendar.getInstance();
58. Date b = null;
59. SimpleDateFormat sdf=new SimpleDateFormat("yyyy-MM-dd");
60. try {
61. b=sdf.parse(s[0]);
62. } catch (ParseException e) {
63. e.printStackTrace();
64. }
65. d2.setTime(b);
66. long n=date1.getTimeInMillis();
67. long birth=d2.getTimeInMillis();
68. long sss=n-birth;
69. int day=(int)((sss)/(3600*24*1000)); //该条记录的日期与当前日期的日期差
70. float factor=1/(1+(float)(day-1)/10); //衰减系数
71. valuev.set(Float.parseFloat(s[3])*factor);
73. context.write(keyv, valuev);
74. }
75. }
77. public static class HouseCountHzReducer
78. extends Reducer<Text,FloatWritable,Text,FloatWritable> {
80. FloatWritable result=new FloatWritable();
81. Text keyreverse=new Text();
82. public void reduce(Text key, Iterable<FloatWritable> values,
83. Context context
84. ) throws IOException, InterruptedException {
86. float sum=0;
87. for (FloatWritable iw:values)
88. {
89. sum+=iw.get();
90. }
91. result.set(sum);
92. String[] keys=key.toString().split(" ");
93. keyreverse.set(keys[1]+" "+keys[0]);
94. context.write(key, result) ;
95. context.write(keyreverse, result) ;
97. }
98. }
100. public static void main(String[] args) throws Exception {
101. Configuration conf = new Configuration();
102. String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
103. if (otherArgs.length != 2) {
104. System.err.println("Usage: wordcount <in> <out>");
105. System.exit(2);
106. }
108. FileSystem fstm = FileSystem.get(conf);
109. Path outDir = new Path(otherArgs[1]);
110. fstm.delete(outDir, true);
112. conf.set("mapred.textoutputformat.separator", " "); //reduce输出时key value中间的分隔符
113. Job job = new Job(conf, "HouseCountHz");
114. job.setNumReduceTasks(2);
115. job.setJarByClass(HouseCountHz.class);
116. job.setMapperClass(HouseCountHzMapper.class);
118. job.setMapOutputKeyClass(Text.class);
119. job.setMapOutputValueClass(FloatWritable.class);
121. // 设置reduce
122. // 设置reduce的输出
123. job.setReducerClass(HouseCountHzReducer.class);
124. job.setOutputKeyClass(Text.class);
125. job.setOutputValueClass(FloatWritable.class);
126. //job.setNumReduceTasks(18);
127. FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
128. FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
129. System.exit(job.waitForCompletion(true) ? 0 : 1);
130. }
131. }
第四个MR
[java] view plain copy
相关阅读:
线程池的优雅关闭实践
 InheritableThreadLocal原理解析
 线程池踩坑
 两个线程通讯（生产-卖面包问题）
谈谈redis的热key问题如何解决
 中国软件杯选题A1数据智能分析报告系统
 《程序员的思维修炼》读后感
 《算法导论》读后感
 《重构》读后感
 《代码整洁之道》读后感
原文地址：https://www.cnblogs.com/cxzdy/p/5103802.html