• 0008.MapReduce Basics



    05-06-Analysis: Computing Each Department's Total Salary

    (figure: emp.png, the emp table)

    emp.csv
    
    7369,SMITH,CLERK,7902,1980/12/17,800,0,20
    7499,ALLEN,SALESMAN,7698,1981/2/20,1600,300,30
    7521,WARD,SALESMAN,7698,1981/2/22,1250,500,30
    7566,JONES,MANAGER,7839,1981/4/2,2975,0,20
    7654,MARTIN,SALESMAN,7698,1981/9/28,1250,1400,30
    7698,BLAKE,MANAGER,7839,1981/5/1,2850,0,30
    7782,CLARK,MANAGER,7839,1981/6/9,2450,0,10
    7788,SCOTT,ANALYST,7566,1987/4/19,3000,0,20
    7839,KING,PRESIDENT,-1,1981/11/17,5000,0,10
    7844,TURNER,SALESMAN,7698,1981/9/8,1500,0,30
    7876,ADAMS,CLERK,7788,1987/5/23,1100,0,20
    7900,JAMES,CLERK,7698,1981/12/3,950,0,30
    7902,FORD,ANALYST,7566,1981/12/3,3000,0,20
    7934,MILLER,CLERK,7782,1982/1/23,1300,0,10
    
    
    (figure: 员工表.png, the employee table)

    Goal: compute the total salary of each department.

    Analyze the data flow, following the same processing pattern as WordCount.
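
    Concretely, the data moves through the usual four key/value stages (types taken from the code in the next section):

        (k1, v1) = (byte offset, one line of emp.csv)             <-- map input
                |   map: split the line, emit dept no and salary
        (k2, v2) = (IntWritable deptno, IntWritable sal)          <-- map output
                |   shuffle: group the salaries by deptno
        (k3, v3) = (IntWritable deptno, Iterable<IntWritable>)    <-- reduce input
                |   reduce: sum the salaries
        (k4, v4) = (IntWritable deptno, IntWritable total)        <-- reduce output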


    05-07-Developing the Program to Compute Each Department's Total Salary

    The program:
    
    import java.io.IOException;
    
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;
    
    public class SalaryTotalMapper extends Mapper<LongWritable, Text, IntWritable, IntWritable> {
    
    	@Override
    	protected void map(LongWritable key1, Text value1, Context context)
    			throws IOException, InterruptedException {
    		// input record: 7654,MARTIN,SALESMAN,7698,1981/9/28,1250,1400,30
    		String data = value1.toString();
    		
    		//split the line into fields
    		String[] words = data.split(",");
    		
    		//emit: k2 = department number, v2 = the employee's salary
    		context.write(new IntWritable(Integer.parseInt(words[7])), 
    				      new IntWritable(Integer.parseInt(words[5])));
    	}
    }
    
    
    import java.io.IOException;
    
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.mapreduce.Reducer;
    
    public class SalaryTotalReducer extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable>{
    
    	@Override
    	protected void reduce(IntWritable k3, Iterable<IntWritable> v3, Context context)
    			throws IOException, InterruptedException {
    		// sum the values in v3
    		int total = 0;
    		for(IntWritable v:v3){
    			total = total + v.get();
    		}
    		
    		//emit: k4 = department number, v4 = the department's total salary
    		context.write(k3, new IntWritable(total));
    	}
    
    }
    
    
    
    import java.io.IOException;
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    
    public class SalaryTotalMain {
    
    	public static void main(String[] args) throws Exception {
    		//1. Create the job and set its entry point
    		Job job = Job.getInstance(new Configuration());
    		job.setJarByClass(SalaryTotalMain.class);
    		
    		//2. Set the mapper and the mapper's output types
    		job.setMapperClass(SalaryTotalMapper.class);
    		job.setMapOutputKeyClass(IntWritable.class);
    		job.setMapOutputValueClass(IntWritable.class);
    		
    		//3. Set the reducer and the reducer's output types
    		job.setReducerClass(SalaryTotalReducer.class);
    		job.setOutputKeyClass(IntWritable.class);
    		job.setOutputValueClass(IntWritable.class);
    		
    		//4. Set the job's input and output paths
    		FileInputFormat.setInputPaths(job, new Path(args[0]));
    		FileOutputFormat.setOutputPath(job, new Path(args[1]));
    		
    		//5. Run the job
    		job.waitForCompletion(true);
    	}
    }
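
    One practical note: FileOutputFormat requires that the output directory not exist yet, otherwise the job fails at submission. A small helper sketch for reruns (the OutputUtil class name is ours, not part of the original code; call it before waitForCompletion):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    public class OutputUtil {

    	// delete the output directory if it already exists,
    	// so the job can be rerun without "Output directory ... already exists"
    	public static void clean(Configuration conf, String dir) throws Exception {
    		Path out = new Path(dir);
    		FileSystem fs = FileSystem.get(conf);
    		if (fs.exists(out)) {
    			fs.delete(out, true); // true = delete recursively
    		}
    	}
    }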
    
    [root@bigdata111 temp]# ls *.csv
    dept.csv  emp.csv
    [root@bigdata111 temp]# hdfs dfs -mkdir /scott
    [root@bigdata111 temp]# hdfs dfs -put *.csv /scott
    [root@bigdata111 temp]# hdfs dfs -ls /scott
    Found 2 items
    -rw-r--r--   1 root supergroup         84 2018-09-10 20:38 /scott/dept.csv
    -rw-r--r--   1 root supergroup        629 2018-09-10 20:38 /scott/emp.csv
    [root@bigdata111 temp]# hadoop jar s1.jar /scott/emp.csv /output/0910/s1

    [root@bigdata111 temp]# hdfs dfs -ls /output/0910/s1
    Found 2 items
    -rw-r--r--   1 root supergroup          0 2018-09-10 20:41 /output/0910/s1/_SUCCESS
    -rw-r--r--   1 root supergroup         25 2018-09-10 20:41 /output/0910/s1/part-r-00000
    [root@bigdata111 temp]# hdfs dfs -cat /output/0910/s1/part-r-00000
    10	8750
    20	10875
    30	9400
    [root@bigdata111 temp]#
    

    05-08-Java Serialization

    Java serialization example:
    import java.io.Serializable;
    
    public class Student implements Serializable {
    
    	private static final long serialVersionUID = 1L;
    
    	private int stuID;
    	private String stuName;
    	
    	public int getStuID() {
    		return stuID;
    	}
    	public void setStuID(int stuID) {
    		this.stuID = stuID;
    	}
    	public String getStuName() {
    		return stuName;
    	}
    	public void setStuName(String stuName) {
    		this.stuName = stuName;
    	}
    	
    }
    
    import java.io.FileNotFoundException;
    import java.io.FileOutputStream;
    import java.io.ObjectOutputStream;
    import java.io.OutputStream;
    
    public class TestMain {
    
    	public static void main(String[] args) throws Exception {
    		// create a Student object
    		Student s = new Student();
    		s.setStuID(1);
    		s.setStuName("Tom");
    		
    		//save the object to a file -----> serialization
    		OutputStream out = new FileOutputStream("d:\\temp\\student.ooo");
    		ObjectOutputStream oos = new ObjectOutputStream(out);
    		
    		oos.writeObject(s);
    		
    		oos.close();
    		out.close();
    	}
    }
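
    Reading the object back (deserialization) is the mirror image, using ObjectInputStream. A minimal sketch, assuming the d:\temp\student.ooo file written above (the TestRead class name is just for illustration):

    import java.io.FileInputStream;
    import java.io.ObjectInputStream;

    public class TestRead {

    	public static void main(String[] args) throws Exception {
    		//read the object back from the file -----> deserialization
    		ObjectInputStream ois = new ObjectInputStream(
    				new FileInputStream("d:\\temp\\student.ooo"));
    		Student s = (Student) ois.readObject();
    		ois.close();
    		
    		System.out.println(s.getStuID() + "\t" + s.getStuName()); // 1	Tom
    	}
    }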
    

    05-09-MapReduce Serialization

    Read the employee data, build an Emp object for each record, and write the objects directly to HDFS.
    
    import java.io.DataInput;
    import java.io.DataOutput;
    import java.io.IOException;
    
    import org.apache.hadoop.io.Writable;
    
    //represents one employee
    //input record: 7654,MARTIN,SALESMAN,7698,1981/9/28,1250,1400,30
    public class Emp implements Writable{
    	
    	private int empno;       //employee number
    	private String ename;    //employee name
    	private String job;      //job title
    	private int mgr;         //manager's employee number
    	private String hiredate; //hire date
    	private int sal;         //monthly salary
    	private int comm;        //commission
    	private int deptno;      //department number
    	
    		
    	@Override
    	public String toString() {
    		return "Emp [empno=" + empno + ", ename=" + ename + ", sal=" + sal + ", deptno=" + deptno + "]";
    	}
    
    	@Override
    	public void readFields(DataInput input) throws IOException {
    		//deserialization: read the fields from the input stream,
    		//in exactly the same order write() wrote them
    		this.empno = input.readInt();
    		this.ename = input.readUTF();
    		this.job = input.readUTF();
    		this.mgr = input.readInt();
    		this.hiredate = input.readUTF();
    		this.sal = input.readInt();
    		this.comm = input.readInt();
    		this.deptno = input.readInt();
    	}
    	
    	@Override
    	public void write(DataOutput output) throws IOException {
    		// serialization: write the fields to the output stream
    		output.writeInt(this.empno);
    		output.writeUTF(this.ename);
    		output.writeUTF(this.job);
    		output.writeInt(this.mgr);
    		output.writeUTF(this.hiredate);
    		output.writeInt(this.sal);
    		output.writeInt(this.comm);
    		output.writeInt(this.deptno);
    	}
    	
    	
    	public int getEmpno() {
    		return empno;
    	}
    	public void setEmpno(int empno) {
    		this.empno = empno;
    	}
    	public String getEname() {
    		return ename;
    	}
    	public void setEname(String ename) {
    		this.ename = ename;
    	}
    	public String getJob() {
    		return job;
    	}
    	public void setJob(String job) {
    		this.job = job;
    	}
    	public int getMgr() {
    		return mgr;
    	}
    	public void setMgr(int mgr) {
    		this.mgr = mgr;
    	}
    	public String getHiredate() {
    		return hiredate;
    	}
    	public void setHiredate(String hiredate) {
    		this.hiredate = hiredate;
    	}
    	public int getSal() {
    		return sal;
    	}
    	public void setSal(int sal) {
    		this.sal = sal;
    	}
    	public int getComm() {
    		return comm;
    	}
    	public void setComm(int comm) {
    		this.comm = comm;
    	}
    	public int getDeptno() {
    		return deptno;
    	}
    	public void setDeptno(int deptno) {
    		this.deptno = deptno;
    	}
    }
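
    Because write() and readFields() must touch the fields in exactly the same order, it is worth verifying the round trip locally. A quick sketch (the EmpRoundTrip class is ours, for illustration only):

    import java.io.ByteArrayInputStream;
    import java.io.ByteArrayOutputStream;
    import java.io.DataInputStream;
    import java.io.DataOutputStream;

    public class EmpRoundTrip {

    	public static void main(String[] args) throws Exception {
    		Emp in = new Emp();
    		in.setEmpno(7654);
    		in.setEname("MARTIN");
    		in.setJob("SALESMAN");
    		in.setMgr(7698);
    		in.setHiredate("1981/9/28");
    		in.setSal(1250);
    		in.setComm(1400);
    		in.setDeptno(30);
    		
    		// serialize the object into a byte array
    		ByteArrayOutputStream bos = new ByteArrayOutputStream();
    		in.write(new DataOutputStream(bos));
    		
    		// deserialize it back into a fresh object
    		Emp out = new Emp();
    		out.readFields(new DataInputStream(new ByteArrayInputStream(bos.toByteArray())));
    		
    		// prints: Emp [empno=7654, ename=MARTIN, sal=1250, deptno=30]
    		System.out.println(out);
    	}
    }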
    
    
    
    import java.io.IOException;
    
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;
    //                                                             k2: employee number    v2: Emp object
    public class EmpInfoMapper extends Mapper<LongWritable, Text, IntWritable, Emp> {
    
    	@Override
    	protected void map(LongWritable key1, Text value1, Context context)
    			throws IOException, InterruptedException {
    		// input record: 7654,MARTIN,SALESMAN,7698,1981/9/28,1250,1400,30
    		String data = value1.toString();
    		
    		//split the line into fields
    		String[] words = data.split(",");
    		
    		//build the Emp object
    		Emp emp = new Emp();
    		emp.setEmpno(Integer.parseInt(words[0]));
    		emp.setEname(words[1]);
    		emp.setJob(words[2]);
    		emp.setMgr(Integer.parseInt(words[3]));
    		emp.setHiredate(words[4]);
    		emp.setSal(Integer.parseInt(words[5]));
    		emp.setComm(Integer.parseInt(words[6]));
    		emp.setDeptno(Integer.parseInt(words[7]));
    		
    		//emit: k2 = employee number, v2 = the Emp object
    		context.write(new IntWritable(emp.getEmpno()), emp);		
    	}
    }
    
    
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    
    public class EmpInfoMain {
    
    	public static void main(String[] args) throws Exception {
    		Job job = Job.getInstance(new Configuration());
    		job.setJarByClass(EmpInfoMain.class);
    		
    		job.setMapperClass(EmpInfoMapper.class);
    		job.setMapOutputKeyClass(IntWritable.class);
    		job.setMapOutputValueClass(Emp.class);  // the map output value is the Emp object itself
    		
    		job.setOutputKeyClass(IntWritable.class);
    		job.setOutputValueClass(Emp.class);
    		
    		FileInputFormat.setInputPaths(job, new Path(args[0]));
    		FileOutputFormat.setOutputPath(job, new Path(args[1]));
    
    		job.waitForCompletion(true);
    	}
    
    }
    

    05-10-Computing Department Salary Totals with Serialization

    Rewrite the "department salary total" example using MapReduce serialization.
    
    The Emp class (with its write()/readFields() methods) is identical to the one defined in 05-09 above and is reused unchanged.
    
    
    
    import java.io.IOException;
    
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;
    
    //                                                                k2: department number    v2: Emp object
    public class SalaryTotalMapper extends Mapper<LongWritable, Text, IntWritable, Emp> {
    
    	@Override
    	protected void map(LongWritable key1, Text value1, Context context)
    			throws IOException, InterruptedException {
    		// input record: 7654,MARTIN,SALESMAN,7698,1981/9/28,1250,1400,30
    		String data = value1.toString();
    		
    		//split the line into fields
    		String[] words = data.split(",");
    		
    		//build the Emp object
    		Emp emp = new Emp();
    		emp.setEmpno(Integer.parseInt(words[0]));
    		emp.setEname(words[1]);
    		emp.setJob(words[2]);
    		emp.setMgr(Integer.parseInt(words[3]));
    		emp.setHiredate(words[4]);
    		emp.setSal(Integer.parseInt(words[5]));
    		emp.setComm(Integer.parseInt(words[6]));
    		emp.setDeptno(Integer.parseInt(words[7]));
    		
    		//emit: k2 = department number, v2 = the Emp object
    		context.write(new IntWritable(emp.getDeptno()), emp);	
    	}
    }
    
    
    
    import java.io.IOException;
    
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.mapreduce.Reducer;
    
    public class SalaryTotalReducer extends Reducer<IntWritable, Emp, IntWritable, IntWritable> {
    
    	@Override
    	protected void reduce(IntWritable k3, Iterable<Emp> v3, Context context) throws IOException, InterruptedException {
    		int total = 0;
    		
    		//pull out each employee's salary and sum them
    		for(Emp e:v3){
    			total = total + e.getSal();
    		}
    		
    		context.write(k3, new IntWritable(total));
    	}
    }
    
    
    For reference, the output of the EmpInfoMain job from 05-09 (each Emp object written via its toString()):

    [root@bigdata111 temp]# hdfs dfs -ls /output/0910/s2
    Found 2 items
    -rw-r--r--   1 root supergroup          0 2018-09-10 21:35 /output/0910/s2/_SUCCESS
    -rw-r--r--   1 root supergroup        782 2018-09-10 21:35 /output/0910/s2/part-r-00000
    [root@bigdata111 temp]# hdfs dfs -cat /output/0910/s2/part-r-00000
    7369	Emp [empno=7369, ename=SMITH, sal=800, deptno=20]
    7499	Emp [empno=7499, ename=ALLEN, sal=1600, deptno=30]
    7521	Emp [empno=7521, ename=WARD, sal=1250, deptno=30]
    7566	Emp [empno=7566, ename=JONES, sal=2975, deptno=20]
    7654	Emp [empno=7654, ename=MARTIN, sal=1250, deptno=30]
    7698	Emp [empno=7698, ename=BLAKE, sal=2850, deptno=30]
    7782	Emp [empno=7782, ename=CLARK, sal=2450, deptno=10]
    7788	Emp [empno=7788, ename=SCOTT, sal=3000, deptno=20]
    7839	Emp [empno=7839, ename=KING, sal=5000, deptno=10]
    7844	Emp [empno=7844, ename=TURNER, sal=1500, deptno=30]
    7876	Emp [empno=7876, ename=ADAMS, sal=1100, deptno=20]
    7900	Emp [empno=7900, ename=JAMES, sal=950, deptno=30]
    7902	Emp [empno=7902, ename=FORD, sal=3000, deptno=20]
    7934	Emp [empno=7934, ename=MILLER, sal=1300, deptno=10]
    
    
    public class SalaryTotalMain {
    
    	public static void main(String[] args) throws Exception {
    		//1. Create the job and set its entry point
    		Job job = Job.getInstance(new Configuration());
    		job.setJarByClass(SalaryTotalMain.class);
    		
    		//2. Set the mapper and the mapper's output types
    		job.setMapperClass(SalaryTotalMapper.class);
    		job.setMapOutputKeyClass(IntWritable.class);
    		job.setMapOutputValueClass(Emp.class);
    		
    		//3. Set the reducer and the reducer's output types
    		job.setReducerClass(SalaryTotalReducer.class);
    		job.setOutputKeyClass(IntWritable.class);
    		job.setOutputValueClass(IntWritable.class);
    		
    		//4. Set the job's input and output paths
    		FileInputFormat.setInputPaths(job, new Path(args[0]));
    		FileOutputFormat.setOutputPath(job, new Path(args[1]));
    		
    		//5. Run the job
    		job.waitForCompletion(true);
    	}
    }
    
    [root@bigdata111 temp]# hdfs dfs -ls /output/0910/s3
    Found 2 items
    -rw-r--r--   1 root supergroup          0 2018-09-10 21:50 /output/0910/s3/_SUCCESS
    -rw-r--r--   1 root supergroup         25 2018-09-10 21:50 /output/0910/s3/part-r-00000
    [root@bigdata111 temp]# hdfs dfs -cat /output/0910/s3/part-r-00000
    10	8750
    20	10875
    30	9400
    

    05-11-Sorting Basic Data Types

    
    import org.apache.hadoop.io.IntWritable;
    
    //a custom comparison rule for numbers: sort in descending order
    public class MyNumberComparator extends IntWritable.Comparator {
    
    	@Override
    	public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
    		// negate the default ascending comparison to get descending order
    		return -super.compare(b1, s1, l1, b2, s2, l2);
    	}
    
    }
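
    Note that WritableComparator works on the serialized bytes of the keys, so the comparator can be exercised without running a job. A small sketch (the ComparatorDemo class is ours, for illustration only):

    import java.io.ByteArrayOutputStream;
    import java.io.DataOutputStream;

    import org.apache.hadoop.io.IntWritable;

    public class ComparatorDemo {

    	// serialize an IntWritable the same way the framework does
    	private static byte[] toBytes(IntWritable w) throws Exception {
    		ByteArrayOutputStream bos = new ByteArrayOutputStream();
    		w.write(new DataOutputStream(bos));
    		return bos.toByteArray();
    	}

    	public static void main(String[] args) throws Exception {
    		byte[] a = toBytes(new IntWritable(10));
    		byte[] b = toBytes(new IntWritable(30));
    		
    		MyNumberComparator cmp = new MyNumberComparator();
    		// positive result: 10 now sorts after 30, i.e. descending order
    		System.out.println(cmp.compare(a, 0, a.length, b, 0, b.length));
    	}
    }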
    
    
    public class SalaryTotalMain {
    
    	public static void main(String[] args) throws Exception {
    		//1. Create the job and set its entry point
    		Job job = Job.getInstance(new Configuration());
    		job.setJarByClass(SalaryTotalMain.class);
    		
    		//2. Set the mapper and the mapper's output types
    		job.setMapperClass(SalaryTotalMapper.class);
    		job.setMapOutputKeyClass(IntWritable.class);
    		job.setMapOutputValueClass(Emp.class);
    		
    		// register the custom sort comparator
    		job.setSortComparatorClass(MyNumberComparator.class);
    		
    		
    		//3. Set the reducer and the reducer's output types
    		job.setReducerClass(SalaryTotalReducer.class);
    		job.setOutputKeyClass(IntWritable.class);
    		job.setOutputValueClass(IntWritable.class);
    		
    		//4. Set the job's input and output paths
    		FileInputFormat.setInputPaths(job, new Path(args[0]));
    		FileOutputFormat.setOutputPath(job, new Path(args[1]));
    		
    		//5. Run the job
    		job.waitForCompletion(true);
    	}
    }
    
    [root@bigdata111 temp]# hdfs dfs -cat /output/0910/s4/part-r-00000
    30	9400
    20	10875
    10	8750
    [root@bigdata111 temp]#
    
• Original post: https://www.cnblogs.com/RoyalGuardsTomCat/p/13835009.html