1、mapper类
package com.cr.wordcount;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class WorcountMapper extends Mapper<LongWritable,Text,Text,IntWritable>{
//无参构造
public WorcountMapper(){
System.out.println("-->new WorcountMapper");
}
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
Text keyOut = new Text();
IntWritable valueout = new IntWritable();
String[] arr = value.toString().split(" ");
for(String s : arr){
keyOut.set(s);
valueout.set(1);
context.write(keyOut,valueout);
//每调用一次该mapper文件,就记录一次
context.getCounter("m组",Utils.getInfo(this,"map类")).increment(1);
}
}
}
2、reducer类
package com.cr.wordcount;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class WordcountReducer extends Reducer<Text,IntWritable,Text,IntWritable> {
//无参构造
public WordcountReducer(){
System.out.println("-->new WordcountReducer");
}
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int count = 0;
for(IntWritable iw : values){
count += iw.get();
}
//获取当前线程
String tno = Thread.currentThread().getName();
System.out.println("线程==>"+ tno + "===> reducer ===> " + key.toString() + "===>" + count);
//进入一次reducer就加1次
// context.getCounter("==reduce===","===WordcountReducer.reduce===").increment(1);
context.getCounter("r组",Utils.getInfo(this,"reducer类")).increment(1);
context.write(key,new IntWritable(count));
}
}
3、计数器主函数
package com.cr.wordcount;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.examples.WordMean;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class WordcountApp {
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
//单例作业
Configuration conf = new Configuration();
// conf.set("fs.defaultFS","file:///");
Job job = Job.getInstance(conf);
//设置job的各种属性
job.setJobName("wordcountAPP"); //设置job名称
job.setJarByClass(WordcountApp.class); //设置搜索类
job.setInputFormatClass(TextInputFormat.class);
//设置输入路径
FileInputFormat.addInputPath(job,new Path((args[0])));
//设置输出路径
FileOutputFormat.setOutputPath(job,new Path(args[1]));
// //设置分区
// job.setPartitionerClass(MyPartitioner.class);
// //设置合成类s
job.setCombinerClass(WordcountReducer.class);
job.setMapperClass(WorcountMapper.class); //设置mapper类
job.setReducerClass(WordcountReducer.class); //设置reduecer类
job.setNumReduceTasks(3); //设置reduce个数
job.setMapOutputKeyClass(Text.class); //设置之map输出key
job.setMapOutputValueClass(IntWritable.class); //设置map输出value
job.setOutputKeyClass(Text.class); //设置mapreduce 输出key
job.setOutputValueClass(IntWritable.class); //设置mapreduce输出value
job.waitForCompletion(true);
}
}
4、工具类获取进程
package com.cr.wordcount;
import java.lang.management.ManagementFactory;
import java.net.InetAddress;
import java.net.UnknownHostException;
public class Utils {
public static String getInfo(Object o,String msg){
return "主机名:" + getHostname() + "=>" + "进程id" + getPID() + "=>" + "线程ID" + getTID() + "=>" + "类对象信息" + getObjInfo(o) + "=>" + "运行类" + msg;
}
/**
* 获取主机名
* @return
*/
public static String getHostname(){
try {
return InetAddress.getLocalHost().getHostName();
} catch (UnknownHostException e) {
e.printStackTrace();
}
return null;
}
/**
* 获取当前程序所在进程id
* @return
*/
public static int getPID(){
String info = ManagementFactory.getRuntimeMXBean().getName();
return Integer.parseInt(info.substring(0,info.indexOf("@")));
}
/**
* 获取当前程序所在进程id
* @return
*/
public static String getTID(){
return Thread.currentThread().getName();
}
/**
* 获取对象的简单类名和哈希值
* @param o
* @return
*/
public static String getObjInfo(Object o){
String sname = o.getClass().getSimpleName();
return sname + "@" + o.hashCode();
}
}
5、添加core-site.xml和yarn-site.xml
6、打包,运行在Hadoop集群上
待计数的文件有三个
drwxr-xr-x - xiaoqiu supergroup 0 2018-01-05 09:10 /user
drwxr-xr-x - xiaoqiu supergroup 0 2018-01-05 10:44 /user/xiaoqiu
drwxr-xr-x - xiaoqiu supergroup 0 2018-01-05 12:06 /user/xiaoqiu/data
-rw-r--r-- 2 xiaoqiu supergroup 12 2018-01-05 10:45 /user/xiaoqiu/data/wc.txt
-rw-r--r-- 2 xiaoqiu supergroup 18 2018-01-05 12:06 /user/xiaoqiu/data/wc1.txt
-rw-r--r-- 2 xiaoqiu supergroup 16 2018-01-05 12:06 /user/xiaoqiu/data/wc2.txt
运行jar包
[xiaoqiu@s150 /home/xiaoqiu]$ hadoop jar wordcounter.jar com.cr.wordcount.WordcountApp hdfs://s150/user/xiaoqiu/data hdfs://s150/user/xiaoqiu/data/out
7、运行结果
File System Counters
FILE: Number of bytes read=124
FILE: Number of bytes written=732401
FILE: Number of read operations=0
FILE: Number of large read operations=0
FILE: Number of write operations=0
HDFS: Number of bytes read=351
HDFS: Number of bytes written=44
HDFS: Number of read operations=18
HDFS: Number of large read operations=0
HDFS: Number of write operations=6
Job Counters
Killed map tasks=2
Launched map tasks=4
Launched reduce tasks=3
Data-local map tasks=4
Total time spent by all maps in occupied slots (ms)=79906
Total time spent by all reduces in occupied slots (ms)=74886
Total time spent by all map tasks (ms)=79906
Total time spent by all reduce tasks (ms)=74886
Total vcore-milliseconds taken by all map tasks=79906
Total vcore-milliseconds taken by all reduce tasks=74886
Total megabyte-milliseconds taken by all map tasks=81823744
Total megabyte-milliseconds taken by all reduce tasks=76683264
Map-Reduce Framework
Map input records=3
Map output records=10
Map output bytes=86
Map output materialized bytes=160
Input split bytes=305
Combine input records=10
Combine output records=10
Reduce input groups=7
Reduce shuffle bytes=160
Reduce input records=10
Reduce output records=7
Spilled Records=20
Shuffled Maps =9
Failed Shuffles=0
Merged Map outputs=9
GC time elapsed (ms)=646
CPU time spent (ms)=5080
Physical memory (bytes) snapshot=899522560
Virtual memory (bytes) snapshot=12507885568
Total committed heap usage (bytes)=416440320
Shuffle Errors
BAD_ID=0
CONNECTION=0
IO_ERROR=0
WRONG_LENGTH=0
WRONG_MAP=0
WRONG_REDUCE=0
m组
主机名:s150=>进程id6421=>线程IDmain=>类对象信息WorcountMapper@1910896157=>运=4
主机名:s151=>进程id3857=>线程IDmain=>类对象信息WorcountMapper@479734028=>运行=2
主机名:s152=>进程id3080=>线程IDmain=>类对象信息WorcountMapper@479734028=>运行=4
File Input Format Counters
Bytes Read=46
File Output Format Counters
Bytes Written=44
r组
主机名:s150=>进程id6421=>线程IDmain=>类对象信息WordcountReducer@612693043=>=2
主机名:s150=>进程id6421=>线程IDmain=>类对象信息WordcountReducer@878991463=>=2
主机名:s151=>进程id3816=>线程IDmain=>类对象信息WordcountReducer@275056979=>=3
主机名:s151=>进程id3857=>线程IDmain=>类对象信息WordcountReducer@905847077=>=1
主机名:s151=>进程id3857=>线程IDmain=>类对象信息WordcountReducer@908384914=>=1
主机名:s152=>进程id3080=>线程IDmain=>类对象信息WordcountReducer@156199931=>=1
主机名:s152=>进程id3080=>线程IDmain=>类对象信息WordcountReducer@905847077=>=1
主机名:s152=>进程id3080=>线程IDmain=>类对象信息WordcountReducer@908384914=>=2
主机名:s152=>进程id3128=>线程IDmain=>类对象信息WordcountReducer@275056979=>=2
主机名:s152=>进程id3168=>线程IDmain=>类对象信息WordcountReducer@275056979=>=2