两个类,一个HDFS文件操作类,一个是wordcount 词数统计类,都是从网上看来的。上代码:
package mapreduce; import java.io.IOException; import java.util.ArrayList; import java.util.List; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.BlockLocation; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hdfs.DistributedFileSystem; import org.apache.hadoop.hdfs.protocol.DatanodeInfo; import org.apache.hadoop.io.IOUtils; /** * file operation on HDFS * @author liuxingjiaofu * */ public class HDFS_File { //read the file from HDFS public void ReadFile(Configuration conf, String FileName){ try{ FileSystem hdfs = FileSystem.get(conf); FSDataInputStream dis = hdfs.open(new Path(FileName)); IOUtils.copyBytes(dis, System.out, 4096, false); dis.close(); }catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } //copy the file from HDFS to local public void GetFile(Configuration conf, String srcFile, String dstFile){ try { FileSystem hdfs = FileSystem.get(conf); Path srcPath = new Path(srcFile); Path dstPath = new Path(dstFile); hdfs.copyToLocalFile(true,srcPath, dstPath); }catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } //copy the local file to HDFS public void PutFile(Configuration conf, String srcFile, String dstFile){ try { FileSystem hdfs = FileSystem.get(conf); Path srcPath = new Path(srcFile); Path dstPath = new Path(dstFile); hdfs.copyFromLocalFile(srcPath, dstPath); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } //create the new file public FSDataOutputStream CreateFile(Configuration conf, String FileName){ try { FileSystem hdfs = FileSystem.get(conf); Path path = new Path(FileName); FSDataOutputStream outputStream = hdfs.create(path); return outputStream; } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } return null; } 
//rename the file name public boolean ReNameFile(Configuration conf, String srcName, String dstName){ try { Configuration config = new Configuration(); FileSystem hdfs = FileSystem.get(config); Path fromPath = new Path(srcName); Path toPath = new Path(dstName); boolean isRenamed = hdfs.rename(fromPath, toPath); return isRenamed; }catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } return false; } //delete the file // tyep = true, delete the directory // type = false, delete the file public boolean DelFile(Configuration conf, String FileName, boolean type){ try { FileSystem hdfs = FileSystem.get(conf); Path path = new Path(FileName); boolean isDeleted = hdfs.delete(path, type); return isDeleted; }catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } return false; } //Get HDFS file last modification time public long GetFileModTime(Configuration conf, String FileName){ try{ FileSystem hdfs = FileSystem.get(conf); Path path = new Path(FileName); FileStatus fileStatus = hdfs.getFileStatus(path); long modificationTime = fileStatus.getModificationTime(); return modificationTime; }catch(IOException e){ e.printStackTrace(); } return 0; } //check if a file exists in HDFS public boolean CheckFileExist(Configuration conf, String FileName){ try{ FileSystem hdfs = FileSystem.get(conf); Path path = new Path(FileName); boolean isExists = hdfs.exists(path); return isExists; }catch(IOException e){ e.printStackTrace(); } return false; } //Get the locations of a file in the HDFS cluster public List<String []> GetFileBolckHost(Configuration conf, String FileName){ try{ List<String []> list = new ArrayList<String []>(); FileSystem hdfs = FileSystem.get(conf); Path path = new Path(FileName); FileStatus fileStatus = hdfs.getFileStatus(path); BlockLocation[] blkLocations = hdfs.getFileBlockLocations(fileStatus, 0, fileStatus.getLen()); int blkCount = blkLocations.length; for (int i=0; i < blkCount; i++) { String[] hosts = 
blkLocations[i].getHosts(); list.add(hosts); } return list; }catch(IOException e){ e.printStackTrace(); } return null; } //Get a list of all the nodes host names in the HDFS cluster // have no authorization to do this operation public String[] GetAllNodeName(Configuration conf){ try{ FileSystem fs = FileSystem.get(conf); DistributedFileSystem hdfs = (DistributedFileSystem) fs; DatanodeInfo[] dataNodeStats = hdfs.getDataNodeStats(); String[] names = new String[dataNodeStats.length]; for (int i = 0; i < dataNodeStats.length; i++) { names[i] = dataNodeStats[i].getHostName(); } return names; }catch(IOException e){ System.out.println("error!!!!"); e.printStackTrace(); } return null; } }
wordcount.java:
package mapreduce; import java.io.IOException; import java.util.StringTokenizer; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.Reducer; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; public class mywordcount { public static class wordcountMapper extends Mapper<LongWritable, Text, Text, IntWritable>{ private final static IntWritable one = new IntWritable(1); private Text word = new Text(); public void map(LongWritable key, Text value, Context context)throws IOException, InterruptedException{ String line = value.toString(); StringTokenizer itr = new StringTokenizer(line); while(itr.hasMoreElements()){ word.set(itr.nextToken()); context.write(word, one); } } } public static class wordcountReducer extends Reducer<Text, IntWritable, Text, IntWritable>{ public void reduce(Text key, Iterable<IntWritable>values, Context context)throws IOException, InterruptedException{ int sum = 0; for (IntWritable str : values){ sum += str.get(); } context.write(key, new IntWritable(sum)); } } /** * 2 args, the file you want to count words from and the directory you want to save the result * @param args /home/hadooper/testmp/testtext /home/hadooper/testmp/testresult * @throws Exception */ public static void main(String args[])throws Exception{ //首先定义两个临时文件夹,这里可以使用随机函数+文件名,这样重名的几率就很小。 String dstFile = "temp_src"; String srcFile = "temp_dst"; //这里生成文件操作对象。 HDFS_File file = new HDFS_File(); Configuration conf = new Configuration(); // must!!! 
config the fs.default.name be the same to the value in core-site.xml conf.set("fs.default.name","hdfs://node1"); conf.set("mapred.job.tracker","node1:54311"); //从本地上传文件到HDFS,可以是文件也可以是目录 file.PutFile(conf, args[0], dstFile); System.out.println("up ok"); Job job = new Job(conf, "mywordcount"); job.setJarByClass(mywordcount.class); job.setInputFormatClass(TextInputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); job.setMapperClass(wordcountMapper.class); job.setReducerClass(wordcountReducer.class); job.setCombinerClass(wordcountReducer.class); //注意这里的输入输出都应该是在HDFS下的文件或目录 FileInputFormat.setInputPaths(job, new Path(dstFile)); FileOutputFormat.setOutputPath(job, new Path(srcFile)); //开始运行 job.waitForCompletion(true); //从HDFS取回文件保存至本地 file.GetFile(conf, srcFile, args[1]); System.out.println("down the result ok!"); //删除临时文件或目录 file.DelFile(conf, dstFile, true); file.DelFile(conf, srcFile, true); System.out.println("delete file on hdfs ok!"); } }
期间,遇到几个错误:
1.HDFS版本问题--Call to node1/172.*.*.*:8020 failed on local exception: java.io.EOFException
main() {……
Configuration conf = new Configuration();
conf.set("fs.default.name","hdfs://node1");//与conf/core-site里的值对应,必须
HDFS_File file = new HDFS_File();
//print all the node name
String[] host_name = file.GetAllNodeName(conf);
……}
/**
 * Lists the host names of all data nodes in the HDFS cluster.
 *
 * @param conf configuration identifying the file system; it must resolve to
 *             a DistributedFileSystem
 * @return the data node host names, or null if the file system is not HDFS
 *         or the request failed; callers must check for null before use
 */
public String[] GetAllNodeName(Configuration conf){
try{
FileSystem fs = FileSystem.get(conf);
// Data node statistics exist only on HDFS. Without this check a local
// or other file system would cause a ClassCastException.
if (!(fs instanceof DistributedFileSystem)) {
System.err.println("GetAllNodeName needs an HDFS file system but got "
+ fs.getClass().getName() + "; check fs.default.name");
return null;
}
DistributedFileSystem hdfs = (DistributedFileSystem) fs;
DatanodeInfo[] dataNodeStats = hdfs.getDataNodeStats();
String[] names = new String[dataNodeStats.length];
for (int i = 0; i < dataNodeStats.length; i++) {
names[i] = dataNodeStats[i].getHostName();
}
return names;
}catch(IOException e){
System.out.println("eeeeeeeeeeeeeeeeeeeerror!!!!");
e.printStackTrace();
}
return null;
}
异常:
eeeeeeeeeeeeeeeeeeeerror!!!!
java.io.IOException: Call to node1/172.10.39.250:8020 failed on local exception: java.io.EOFException
at org.apache.hadoop.ipc.Client.wrapException(Client.java:775)
at org.apache.hadoop.ipc.Client.call(Client.java:743)
at org.apache.hadoop.ipc.RPC$Invoker.invoke(RPC.java:220)
at $Proxy0.getProtocolVersion(Unknown Source)
at org.apache.hadoop.ipc.RPC.getProxy(RPC.java:359)
at org.apache.hadoop.hdfs.DFSClient.createRPCNamenode(DFSClient.java:112)
at org.apache.hadoop.hdfs.DFSClient.<init>(DFSClient.java:213)
at org.apache.hadoop.hdfs.DFSClient.<init>(DFSClient.java:176)
at org.apache.hadoop.hdfs.DistributedFileSystem.initialize(DistributedFileSystem.java:82)
at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:1378)
at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:66)
at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:1390)
at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:196)
at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:95)
at mapreduce.HDFS_File.GetAllNodeName(HDFS_File.java:151)
at mapreduce.File_Operation.main(File_Operation.java:15)
Caused by: java.io.EOFException
at java.io.DataInputStream.readInt(DataInputStream.java:392)
at org.apache.hadoop.ipc.Client$Connection.receiveResponse(Client.java:501)
at org.apache.hadoop.ipc.Client$Connection.run(Client.java:446)
Exception in thread "main" java.lang.NullPointerException
at mapreduce.File_Operation.main(File_Operation.java:16)
原因:版本问题,确保java中的jar包跟hadoop集群的jar包是相同版本的
2.HDFS权限问题
org.apache.hadoop.security.AccessControlException: org.apache.hadoop.security.AccessControlException: Permission denied: user=hadooper, access=WRITE, inode="/user":root:supergroup:drwxr-xr-x
解决方案(二选一):
(1)在 conf/hdfs-site.xml 中添加如下配置项,关闭HDFS的权限检查:
<property>
<name>dfs.permissions</name>
<value>false</value>
</property>
(2)放开要写入的HDFS目录的权限,命令如下:$ hadoop fs -chmod 777 /user/
我用的是第2种方案
3.HDFS本地库加载警告
2011-12-20 17:00:32 org.apache.hadoop.util.NativeCodeLoader <clinit>
警告: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
在Hadoop的配置文件core-site.xml中可以设置是否使用本地库:
<property>
<name>hadoop.native.lib</name>
<value>true</value>
<description>Should native hadoop libraries, if present, be used.</description>
</property>
Hadoop默认的配置为启用本地库。
另外,可以在环境变量中设置使用本地库的位置:
export JAVA_LIBRARY_PATH=/path/to/hadoop-native-libs
有的时候也会发现Hadoop自带的本地库无法使用,这种情况下就需要自己去编译本地库了。在$HADOOP_HOME目录下,使用如下命令即可:
ant compile-native
编译完成后,可以在$HADOOP_HOME/build/native目录下找到相应的文件,然后指定文件的路径或者移动编译好的文件到默认目录下即可
我试了下,自带的本地库是64位的,而我的电脑是32位的,手头又没有源代码,编译不了。那只好一段段程序地试,找出是哪段代码触发了这个警告,我这里是:
try {
FileSystem hdfs = FileSystem.get(conf);
Path srcPath = new Path(srcFile);
Path dstPath = new Path(dstFile);
hdfs.copyToLocalFile(true,srcPath, dstPath);//定位到此句
}catch (IOException e) {
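到了此步,便只能如此了。为什么呢,Java不是跨平台的吗?(其实这只是一条警告:本地库是与平台相关的可选实现,加载失败时Hadoop会自动改用内置的Java实现,正如警告信息中"using builtin-java classes where applicable"所说,不影响功能。)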
4.MR-jar包缺失
ClassNotFoundException: org.codehaus.jackson.map.JsonMappingException
NoClassDefFoundError: org/apache/commons/httpclient/HttpMethod
添加jar包到java工程中
jackson-core-asl-1.5.2.jar
jackson-mapper-asl-1.5.2.jar
commons-httpclient-3.0.1.jar
我是不习惯将所有Jar包都加到工程里,觉得这样很容易便加多了,浪费时空。
完成第一次mapreduce,不错!
5.远程的JOB挂掉了,居然还能运行成功,发现是mapred.job.tracker属性没设,默认在local下运行,其值在namenode的mapred-site.xml中看
conf.set("mapred.job.tracker","node1:54311");
配置完了,运行可以初始化,但是找不到mapper类:
信息: Task Id : attempt_201112221123_0010_m_000000_0, Status : FAILED
java.lang.RuntimeException: java.lang.ClassNotFoundException: mapreduce.mywordcount$wordcountMapper
at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:996)
at org.apache.hadoop.mapreduce.JobContext.getMapperClass(JobContext.java:212)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:611)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:325)
at org.apache.hadoop.mapred.Child$4.run(Child.java:270)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:396)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1127)
at org.apache.hadoop.mapred.Child.main(Child.java:264)
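将程序打成jar包放到hadoop集群的jobtracker上运行是正常的,结果也正确;但是直接在客户端运行却报上述错误,暂时还没解决。(可能的原因:在客户端/IDE中直接运行时,类是从class目录而不是jar包中加载的,job.setJarByClass()找不到可以提交的jar,任务节点上自然也就没有mapper类;可以尝试先把工程打成jar包,再通过 conf.set("mapred.jar", "jar包的本地路径") 指定要提交的jar。)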
总结
1.远程操作HDFS文件以及远程提交MR任务,必须配置的两项(其他暂时还没发现):
conf.set("fs.default.name","hdfs://node1");//与conf/core-site.xml里的值对应,必须
conf.set("mapred.job.tracker","node1:54311");//mapred-site.xml
2.耐心分析问题,解决问题