For setting up the Hadoop cluster, see: https://www.cnblogs.com/asker009/p/9126354.html
1. Create a Maven project and add the dependency:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.home</groupId>
    <artifactId>FileSystemCat</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
        <java.version>1.8</java.version>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>3.1.0</version>
        </dependency>
    </dependencies>
</project>
2. Implement the basic cat, copy, and file-status code. The code runs as-is in a Windows IDE (see the previous post on debugging Hadoop on Windows), and it can also be packaged as a jar and executed on a remote Hadoop cluster.
If you are not familiar with Hadoop's port assignments, you can simply disable the firewall on the Hadoop cluster in a test environment.
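For reference, packaging and launching on the cluster might look like the following. The jar name comes from the artifactId in the pom above, the main class is assumed to be declared in the jar's manifest, and the paths are placeholders; the two hadoop jar invocations mirror the example commands in the code comments below.

mvn clean package
# fileCopyWithProgress: copy a local file up to HDFS
hadoop jar FileSystemCat.jar input.txt /demo/input.txt
# filePattern: glob with an exclusion regex
hadoop jar FileSystemCat.jar /demo/wc* ^.*/demo/wc3.*$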
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.io.IOUtils;

import java.io.*;
import java.net.URI;
import java.time.Instant;
import java.time.LocalDateTime;
import java.time.ZoneId;

/**
 * @Author: xu.dm
 * @Date: 2019/1/31 14:39
 * @Description:
 */
public class FileSystemCat {
    private static String HDFSUri = "hdfs://bigdata-senior01.home.com:9000";

    public static void main(String[] args) throws Exception {
        long startTime = System.currentTimeMillis();

        // cat a text file
        // fileCat(args);

        // copy a file
        fileCopyWithProgress(args);

        // file status
        // fileStatus(args);

        // file status pattern
        // filePattern(args);

        long endTime = System.currentTimeMillis();
        long timeSpan = endTime - startTime;
        System.out.println("Elapsed time: " + timeSpan + " ms");
    }

    private static FileSystem getFileSystem() {
        Configuration conf = new Configuration();
        // the file system handle
        FileSystem fs = null;
        String hdfsUri = HDFSUri;
        if (StringUtils.isBlank(hdfsUri)) {
            // return the default file system; when running inside the Hadoop cluster,
            // this is the direct way to obtain it
            try {
                fs = FileSystem.get(conf);
            } catch (IOException e) {
                e.printStackTrace();
            }
        } else {
            // return the specified file system; when testing locally,
            // the file system has to be obtained this way
            try {
                URI uri = new URI(hdfsUri.trim());
                fs = FileSystem.get(uri, conf);
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        return fs;
    }

    // print the content of a text file on Hadoop
    private static void fileCat(String[] args) throws Exception {
        String uri = args[0];
        Configuration conf = new Configuration();
        // conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
        conf.set("fs.defaultFS", "hdfs://bigdata-senior01.home.com:9000");
        FileSystem fs = FileSystem.get(URI.create(uri), conf);
        // InputStream in = null;
        // FSDataInputStream extends java.io.DataInputStream and supports random access
        FSDataInputStream in = null;
        try {
            in = fs.open(new Path(uri));
            IOUtils.copyBytes(in, System.out, 4096, false);
            // print the file a second time to demonstrate seek()
            in.seek(0);
            IOUtils.copyBytes(in, System.out, 4096, false);
        } finally {
            IOUtils.closeStream(in);
        }
    }

    // copy a local file to the Hadoop file system
    // the datanode data-transfer port (9866) must be reachable
    private static void fileCopyWithProgress(String[] args) throws Exception {
        String localSrc = args[0];
        String dst = args[1];
        // when submitting from Windows, the Hadoop user name must be set;
        // presumably the same applies when running as a different user on Linux
        System.setProperty("HADOOP_USER_NAME", "hadoop");
        Configuration conf = new Configuration();
        // conf.set("fs.DefaultFs", "hdfs://bigdata-senior01.home.com:9000");
        // since two file systems are involved here, if dst is not a full path
        // (i.e. lacks the hdfs:// scheme), this form still resolves to the local
        // file system; when only one file system is involved, a short dst path is fine
        // FileSystem fs = FileSystem.get(URI.create(dst), conf);
        FileSystem fs = FileSystem.get(URI.create(HDFSUri), conf);
        // fs.copyFromLocalFile(new Path(localSrc), new Path(dst));
        // fs.close();
        // System.out.println("copyFromLocalFile...done");

        InputStream in = new BufferedInputStream(new FileInputStream(localSrc));
        // the Progressable callback prints a dot each time a packet is flushed
        FSDataOutputStream out = fs.create(new Path(dst), () -> System.out.print("."));
        IOUtils.copyBytes(in, out, 4096, true);
    }

    // look up files: recursively list the attributes of a given directory or file
    private static void fileStatus(String[] args) throws IOException {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://bigdata-senior01.home.com:9000");
        Path file = new Path(args[0]);
        FileSystem fs = null;
        try {
            // fs = FileSystem.get(URI.create(args[0]), conf);
            fs = FileSystem.get(conf);
            // fs = getFileSystem();

            // for a single file or directory
            // FileStatus fileStatus = fs.getFileStatus(file);

            // for all files and directories under a single file or directory
            FileStatus[] fileStatuses = fs.listStatus(file);

            // FileUtil wraps many file utilities, e.g. converting
            // between FileStatus[] and Path[]
            // Path[] files = FileUtil.stat2Paths(fileStatuses);

            for (FileStatus fileStatus : fileStatuses) {
                System.out.println("-------------->");
                System.out.println("isDirectory: " + fileStatus.isDirectory());
                System.out.println("path: " + fileStatus.getPath().toString());
                System.out.println("length: " + fileStatus.getLen());
                System.out.println("accessTime: " + LocalDateTime.ofInstant(Instant.ofEpochMilli(fileStatus.getAccessTime()), ZoneId.systemDefault()));
                System.out.println("permission: " + fileStatus.getPermission().toString());
                // recurse into subdirectories
                if (fileStatus.isDirectory()) {
                    FileSystemCat.fileStatus(new String[]{fileStatus.getPath().toString()});
                }
            }
        } finally {
            if (fs != null) fs.close();
        }
    }

    // look up files by glob pattern; this cannot recurse by itself and should be the
    // outermost call, handing each match over to the recursive fileStatus
    private static void filePattern(String[] args) throws IOException {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://bigdata-senior01.home.com:9000");
        FileSystem fs = null;
        try {
            fs = FileSystem.get(conf);
            // if more than one argument is given, the second is used as an exclusion pattern
            // e.g. hadoop jar FileSystemCat.jar /demo ^.*/demo[1-3]$
            //      excludes /demo1, /demo2, /demo3
            // e.g. hadoop jar FileSystemCat.jar /demo/wc* ^.*/demo/wc3.*$
            //      excludes all files under /demo whose names start with wc3
            FileStatus[] fileStatuses = null;
            if (args.length > 1) {
                System.out.println("Exclusion pattern: " + args[1]);
                fileStatuses = fs.globStatus(new Path(args[0]), new RegexExcludePathFilter(args[1]));
            } else {
                fileStatuses = fs.globStatus(new Path(args[0]));
            }
            for (FileStatus fileStatus : fileStatuses) {
                System.out.println("-------------->");
                System.out.println("isDirectory: " + fileStatus.isDirectory());
                System.out.println("path: " + fileStatus.getPath().toString());
                System.out.println("length: " + fileStatus.getLen());
                System.out.println("modificationTime: " + LocalDateTime.ofInstant(Instant.ofEpochMilli(fileStatus.getModificationTime()), ZoneId.systemDefault()));
                System.out.println("permission: " + fileStatus.getPermission().toString());
                if (fileStatus.isDirectory()) {
                    FileSystemCat.fileStatus(new String[]{fileStatus.getPath().toString()});
                }
            }
        } finally {
            if (fs != null) fs.close();
        }
    }
}
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

/**
 * @Author: xu.dm
 * @Date: 2019/2/1 13:38
 * @Description:
 */
public class RegexExcludePathFilter implements PathFilter {
    private final String regex;

    public RegexExcludePathFilter(String regex) {
        this.regex = regex;
    }

    @Override
    public boolean accept(Path path) {
        return !path.toString().matches(this.regex);
    }
}
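Note that accept() is deliberately inverted: any path whose string form matches the regex is excluded, and everything else passes through. A minimal standalone sketch of how filePattern wires it into globStatus, reusing the second example command from the comments above:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GlobDemo {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://bigdata-senior01.home.com:9000");
        try (FileSystem fs = FileSystem.get(conf)) {
            // keep everything under /demo matching wc*, except names starting with wc3
            FileStatus[] matches = fs.globStatus(new Path("/demo/wc*"),
                    new RegexExcludePathFilter("^.*/demo/wc3.*$"));
            for (FileStatus status : matches) {
                System.out.println(status.getPath());
            }
        }
    }
}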