• java使用tika批量识别文件的真实mime类型


    生产环境中,服务器使用JDK1.7。为了限制服务器上允许上传的文件类型,现在想先把已上传文件的真实类型进行汇总。

    idea中新建maven项目,选择quickstart

    pom.xml

    <?xml version="1.0" encoding="UTF-8"?>
    
    <!-- Maven build for the "mimetype" command-line tool: compiles for Java 1.7 and
         produces an executable jar whose entry point is com.h2.MimeTypeMain. -->
    <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
      xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
      <modelVersion>4.0.0</modelVersion>
    
      <groupId>com.h2</groupId>
      <artifactId>mimetype</artifactId>
      <version>1.0</version>
    
      <name>mimetype</name>
    
      <!-- Source and target level are pinned to 1.7 to match the JDK used on the production server. -->
      <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.compiler.source>1.7</maven.compiler.source>
        <maven.compiler.target>1.7</maven.compiler.target>
      </properties>
    
      <dependencies>
    
        <!-- Apache Tika core: supplies the org.apache.tika.Tika facade used for MIME type detection -->
        <dependency>
          <groupId>org.apache.tika</groupId>
          <artifactId>tika-core</artifactId>
          <version>1.18</version>
        </dependency>
      </dependencies>
    
      <build>
        <plugins>
          <!-- NOTE(review): no Scala sources are shown for this project; this plugin is presumably
               left over from a template. Confirm before removing. -->
          <plugin>
            <groupId>org.scala-tools</groupId>
            <artifactId>maven-scala-plugin</artifactId>
            <version>2.15.2</version>
            <executions>
              <execution>
                <goals>
                  <goal>compile</goal>
                  <goal>testCompile</goal>
                </goals>
              </execution>
            </executions>
          </plugin>
    
          <!-- Java compiler level; repeats the maven.compiler.* properties declared above. -->
          <plugin>
            <artifactId>maven-compiler-plugin</artifactId>
            <version>3.6.0</version>
            <configuration>
              <source>1.7</source>
              <target>1.7</target>
            </configuration>
          </plugin>
          <!-- No execution is bound here, so this configuration only applies when the
               assembly plugin is invoked explicitly. -->
          <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-assembly-plugin</artifactId>
            <version>2.3</version>
            <configuration>
              <descriptorRefs>
                <descriptorRef>jar-with-dependencies</descriptorRef>
              </descriptorRefs>
            </configuration>
          </plugin>
    
          <!-- Tests are skipped during the build. -->
          <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-surefire-plugin</artifactId>
            <version>2.19</version>
            <configuration>
              <skip>true</skip>
            </configuration>
          </plugin>
    
          <plugin><!-- Include resource files that live next to the classes in the source directory -->
            <groupId>org.codehaus.mojo</groupId>
            <artifactId>build-helper-maven-plugin</artifactId>
            <version>1.8</version>
            <executions>
              <execution>
                <id>add-resource</id>
                <phase>generate-resources</phase>
                <goals>
                  <goal>add-resource</goal>
                </goals>
                <configuration>
                  <resources>
                    <resource>
                      <directory>src/main/java</directory>
                      <includes>
                        <!-- NOTE(review): this path does not match the com.h2 package used by
                             this project; presumably copied from another build. Confirm. -->
                        <include>com/netmarch/*.txt</include>
                      </includes>
                    </resource>
                  </resources>
                </configuration>
              </execution>
            </executions>
          </plugin>
    
          <plugin><!-- Bundle the third-party jar files into the final jar -->
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-shade-plugin</artifactId>
            <version>2.4.3</version>
            <executions>
              <execution>
                <phase>package</phase>
                <goals>
                  <goal>shade</goal>
                </goals>
                <configuration>
                  <filters>
                    <filter>
                      <artifact>*:*</artifact>
                      <!-- Signature files of the dependency jars are left out of the merged jar;
                           presumably to avoid signature verification failures at startup. -->
                      <excludes>
                        <exclude>META-INF/*.SF</exclude>
                        <exclude>META-INF/*.DSA</exclude>
                        <exclude>META-INF/*.RSA</exclude>
                      </excludes>
                    </filter>
                  </filters>
                  <transformers>
                    <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                      <mainClass>com.h2.MimeTypeMain</mainClass><!-- class that contains the main method -->
                    </transformer>
                  </transformers>
                </configuration>
              </execution>
            </executions>
          </plugin>
        </plugins>
        <defaultGoal>package</defaultGoal>
      </build>
    </project>

    MimeTypeMain.java

    package com.h2;
    
    import org.apache.tika.Tika;
    
    import java.io.File;
    import java.io.FileWriter;
    import java.io.IOException;
    
    public class MimeTypeMain {
    
        Tika tika = new Tika();
        public static void main(String[] args){
            int length = args.length;
            if(length<2)
            {
                System.out.printf("usage: MimeTypeMain dir out.txt");
            }else
            {
                MimeTypeMain main = new MimeTypeMain();
                main.walk(args[0],args[1]);
            }
        }
    
        public void walk(String dir,String out){
            File dirs = new File(dir);
            File outFile = new File(out);
            if(!outFile.exists()){
                try {
                    outFile.createNewFile();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            File[] files = dirs.listFiles();
            for (int i = 0; i < files.length; i++) {
                File file = files[i];
                if(file.isFile())
                {
                    try {
                        String mimeType = tika.detect(file);
                        wirteToFile(out, String.format("mimeType:%s , path:%s
    ", mimeType, file.getAbsolutePath()));
                    }catch (IOException e)
                    {
                        e.printStackTrace();;
                    }
                }else{
                String path = file.getAbsolutePath();
                walk(path,out);
                }
            }
        }
    
        void wirteToFile(String outFile,String content) throws IOException {
            try(FileWriter fileWritter = new FileWriter(outFile,true)) {
                fileWritter.write(content);
            }
        }
    }

    mvn package 生成可执行文件

     本地测试样本

     运行程序

    java -jar mimetype-1.0.jar d:\样本 d:\1.txt

    结果如下:

    mimeType:application/msword , path:D:\样本\1.企业技术开发项目设计书(下进风机柜) V161015.doc
    mimeType:application/vnd.ms-excel , path:D:\样本\12月份利润表1.xls
    mimeType:application/vnd.ms-excel , path:D:\样本\12月份资产负债表1.xls
    mimeType:application/zip , path:D:\样本\18年度公司财务报表.zip
    mimeType:image/jpeg , path:D:\样本\1b125ae7ef59b854685cc8d6af8645c7.jpg
    mimeType:application/x-rar-compressed , path:D:\样本\2018财务报表.rar
    mimeType:image/tiff , path:D:\样本\5.4-专利受理通知书-一种下托盘摩擦焊工装夹具.tif
    mimeType:image/jpeg , path:D:\样本\6E9D2271-1CB1-45AE-858D-4502F5EB2096.jpeg
    mimeType:application/pdf , path:D:\样本\ASR手册.2019-10-12.pdf
    mimeType:image/png , path:D:\样本\default_av_boy_v3.png
    mimeType:image/png , path:D:\样本\default_av_girl_v3.png
    mimeType:image/jpeg , path:D:\样本\ECF4A384BD56535EFB3335C39F778023.png
    mimeType:image/jpeg , path:D:\样本\F48AF50EFD9316C865A832888DA8AEF1.png
    mimeType:image/bmp , path:D:\样本\三证合一副本.bmp
    mimeType:application/pdf , path:D:\样本\喇叭盖智能点胶组装设备研发项目设计书
    mimeType:application/pdf , path:D:\样本\喇叭盖智能点胶组装设备项目研发情况表
    mimeType:application/pdf , path:D:\样本\喇叭盖智能点胶组装设备项目研发立项决议
    mimeType:application/pdf , path:D:\样本\塑料旋转开关项目情况表
    mimeType:application/x-7z-compressed , path:D:\样本\新建文件夹 (3).7z
    mimeType:application/x-tika-msoffice , path:D:\样本\江苏省企业研发项目情况表 1
    mimeType:application/pdf , path:D:\样本\汽车安全系统组件智能装配线研发立项决议
    mimeType:application/pdf , path:D:\样本\汽车安全系统组件智能装配线研发项目情况表
    mimeType:application/pdf , path:D:\样本\汽车安全系统组件智能装配线研发项目设计书
    mimeType:image/jpeg , path:D:\样本\项目情况3-1
    mimeType:image/jpeg , path:D:\样本\项目情况3-2
    mimeType:application/pdf , path:D:\样本\项目情况表
    mimeType:application/pdf , path:D:\样本\项目立项决议
    mimeType:application/pdf , path:D:\样本\项目设计书

     一些常见的mime type类型

    image/bmp

    image/x-bitmap

    image/x-pixmap

    image/jpg
    image/png

    image/jpeg
    application/vnd.openxmlformats-officedocument.spreadsheetml.sheet xlsx
    application/x-rar-compressed
    application/vnd.openxmlformats-officedocument.wordprocessingml.document docx
    application/zip zip
    image/tiff tiff
    image/bmp bmp
    application/rtf rtf
    application/x-tika-ooxml xls
    application/x-bplist pdf
    application/pdf
    application/vnd.ms-word.document.macroenabled.12 docm
    image/gif
    application/vnd.openxmlformats-officedocument.presentationml.presentation pptx
    application/x-tika-msoffice pdf
    application/msword
    application/x-7z-compressed 7z
    application/vnd.ms-xpsdocument xps

    项目中使用的代码

    // Tika facade used to detect the real content type from the uploaded bytes
    // rather than trusting the file extension or the client-supplied type.
    final Tika tika = new Tika();

    // Whitelist of MIME types the server accepts. The trailing comments record the
    // file extension the author observed each type for.
    // "image/png" is listed because that is the type Tika reports for PNG files;
    // the legacy "image/x-png" entry alone would reject every real PNG upload.
    Set<String> allowMimeType =
            ImmutableSet.of("image/pjpeg", "application/pdf", "application/msword", "image/jpeg",
                    "image/png", "image/x-png", "image/tiff", "application/vnd.ms-excel", "application/zip",
                    "image/bmp", "image/x-bitmap", "image/x-pixmap", "image/jpg",
                    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" /*xlsx*/,
                    "application/x-rar-compressed", "application/rtf", "application/x-tika-ooxml" /*xls*/,
                    "application/x-bplist" /*pdf*/,
                    "application/vnd.ms-word.document.macroenabled.12" /*docm*/, "application/x-tika-msoffice" /*pdf*/,
                    "application/vnd.openxmlformats-officedocument.presentationml.presentation" /*pptx*/,
                    "application/x-7z-compressed", "application/vnd.ms-xpsdocument" /*xps*/);

    byte[] byteInfoFile = prjInfoFile.getBytes();

    // An empty upload is treated as "no attachment" and is not type-checked.
    if (byteInfoFile.length > 0) {

        // Detect on the same bytes that were just length-checked, so the guarded
        // upload is the one that gets validated.
        // NOTE(review): any other upload (e.g. prjDesignFile) needs its own check.
        String mimeType = tika.detect(byteInfoFile);

        log.info("文件类型:{}", mimeType);

        if (!allowMimeType.contains(mimeType)) {
            return "error:服务器暂不接受此类型的附件";
        }
    }
        
  • 相关阅读:
    scrapy Request方法
    from lxml import etree报错
    python文件管道 下载图集
    scrapy基本爬虫,采集多页
    python操作excel xlwt (转)
    matplotlib 设置标题 xy标题等
    matplotlib 饼状图
    acwing 600. 仰视奶牛
    LeetCode 684. 冗余连接
    LeetCode 200. 岛屿数量
  • 原文地址:https://www.cnblogs.com/passedbylove/p/12732280.html
Copyright © 2020-2023  润新知