• Word,Excel,pdf,txt等文件上传并提取内容


    近期项目需求:1.要用到各种文件上传,下载。

    2.并对文件进行搜索。

    3.只要文件里包含搜索的内容,就全部显示出来。



    今天正好有时间整理一下,方便以后阅读,也为需要用到的朋友提供微薄之力。首先在实现文件上传时,使用的是struts2自带的文件上传功能,通过流的方式将文件保存,下载的时候再通过流的方式写出即可。这个实现起来不是很难,主要是对各种文件内容的提取比较麻烦,比如word、excel、pdf等文件,不能使用普通的BufferedReader、BufferedWriter等流的方式读写提取,因为这些文件的格式不是普通的文本,它们有自定义的格式,必须使用专门提供的jar包进行操作。Word、Excel使用的是Apache提供的poi进行操作,当然在操作的过程中要注意一些使用方法,比如Word、Excel有不同的版本,操作的方式也不同,这里会出现很多问题。在上一篇中我整理了一些自己在操作过程中遇到的问题,并提供了解决方法,还提供了本人操作这些文件的源代码,下载即可使用。

    以下是我在操作过程中用到的具体信息:


    struts.xml配置:

    <action name="upload" class="lucenesAction" method="upload">
    <!-- The types of files accepted for upload can be restricted here (currently disabled)

    <interceptor-ref name="fileUpload">
    Maximum size of a single uploaded file
    <param name="maximumSize">500000</param>
    Allowed file extensions (NOTE(review): ".jsp" looks unintended for an upload whitelist; confirm before enabling)
    <param name="allowedExtensions">.jsp</param>
    Allowed file content types
    <param name="allowedTypes">image/pjpeg,image/gif,text/xml,text/plain,application/msword,application/vnd.ms-excel</param>
    </interceptor-ref>
    <interceptor-ref name="defaultStack"/> -->
    <result name="input">/demo/lucenes/upload_fail.jsp</result>
    <result name="success">/demo/lucenes/upload_ok.jsp</result>
    </action>



    Action中用到的操作文件信息:

    // Stream used when serving a file download

    public InputStream tInputStream;


    // File upload: the three required fields.
    private File data;
    // Name of the uploaded file
    private String dataFileName;
    // Content type of the uploaded file
    private String dataContentType;

    // setters/getters omitted.............


    Word操作类:

    import java.io.File;
    import java.io.FileInputStream;
    import java.io.FileOutputStream;
    import java.io.IOException;
    import org.apache.poi.POIXMLDocument;
    import org.apache.poi.POIXMLTextExtractor;
    import org.apache.poi.hwpf.HWPFDocument;
    import org.apache.poi.hwpf.extractor.WordExtractor;
    import org.apache.poi.hwpf.usermodel.CharacterProperties;
    import org.apache.poi.hwpf.usermodel.HWPFList;
    import org.apache.poi.hwpf.usermodel.ParagraphProperties;
    import org.apache.poi.hwpf.usermodel.Range;
    import org.apache.poi.openxml4j.opc.OPCPackage;
    import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
    import org.junit.Test;


    /**
     * 提取word内容
     * 
     * @author wangshouhai
     * @Version 2014-4-17:下午12:07:04
     */
    public class ReadWord {


    public static void main(String[] args) {


    File file = new File("C:\Users\Administrator\Desktop\測试文档.docx");
    // readWord2003(file);


    readWord2007(file);

    }


    /**
    * 支持word-2003

    * @param args
    */
    private static void readWord2003(File file) {
    try {
    FileInputStream fis = new FileInputStream(file);
    // 创建WordExtractor对象
    WordExtractor wordExtractor = new WordExtractor(fis);
    // 取得全部文本内容
    String text = wordExtractor.getText();
    System.out.println("readWord2003--------------->"+text);
    } catch (Exception e) {
    e.printStackTrace();
    }
    }


    @Test
    // 支持word-2003
    public static void readWordExtractor(File file) {
    try {
    FileInputStream fis = new FileInputStream(file);
    // 创建WordExtractor对象
    WordExtractor wordExtractor = new WordExtractor(fis);
    // 通过getParagraphText()提取每一个段落
    String[] paragraph = wordExtractor.getParagraphText();
    System.out.println("该Word文件共同拥有" + paragraph.length + "段。");


    for (int i = 0; i < paragraph.length; i++) {
    System.out.println("readWordExtractor--------------->"+paragraph[i]);
    }
    } catch (Exception e) {
    e.printStackTrace();
    }
    }


    /**
    * word 2007解决方式

    * @param args
    */
    @Test
    public static void readWord2007(File file) {
    try {
    // word 2007,读取word中字符
    OPCPackage opcPackage = POIXMLDocument.openPackage("D:\apache-tomcat-6.0.18\webapps\GOVWBWeb\upload\user\2014\04\18\08\193b299f-e8fc-4a32-a7ba-f951beeec1d9");
    POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage);
    String text2007 = extractor.getText();
    System.out.println("readWord2007--------------->"+text2007);
    } catch (Exception e) {
    e.printStackTrace();
    }
    }

    }



    Excel操作类:

    import java.io.File;
    import java.io.FileInputStream;
    import org.apache.poi.hssf.usermodel.HSSFCell;
    import org.apache.poi.hssf.usermodel.HSSFRow;
    import org.apache.poi.ss.usermodel.Sheet;
    import org.apache.poi.ss.usermodel.Workbook;
    import org.apache.poi.ss.usermodel.WorkbookFactory;
    import org.apache.poi.xssf.usermodel.XSSFCell;
    import org.apache.poi.xssf.usermodel.XSSFRow;
    import org.apache.poi.xssf.usermodel.XSSFSheet;
    import org.apache.poi.xssf.usermodel.XSSFWorkbook;


    /**
     * 读取excel内容
     * @author wangshouhai
     * @Version 2014-4-18:下午12:56:23
     */
    public class ReadExcel {

    // 文件上传
    public static void readExcel2007(File file) {


    try {
    // 创建工作区,读取上传文件
    XSSFWorkbook wb= new XSSFWorkbook(new FileInputStream(file));
    XSSFSheet sheet =wb.getSheetAt(0); 
    int rows = sheet.getPhysicalNumberOfRows();// 获取全部的行
    if (rows > 0) {
    for (int i = 1; i < rows; i++) {
    XSSFRow row=sheet.getRow(i);
    if (row == null) {
    continue;
    }
    try {
    XSSFCell idCell = row.getCell(0);
    if (idCell != null) {
    double id = idCell.getNumericCellValue();
    //int id = Integer.parseInt(idCell.getRichStringCellValue().toString());
    System.out.println("id----------->"+id);
    }


    // 账号
    XSSFCell accountsCell = row.getCell(1);
    String accounts = null;
    if (accountsCell != null) {
    accounts = accountsCell.getRichStringCellValue().toString();
    System.out.println("accounts----------->"+accounts);
    }


    // password
    XSSFCell passwordCell = row.getCell(2);
    if (passwordCell != null) {
    String password = passwordCell.getRichStringCellValue().toString();
    System.out.println("password----------->"+password);
    }


    // 姓名
    XSSFCell nameCell = row.getCell(3);
    if (nameCell != null) {
    String name = nameCell.getRichStringCellValue().toString();
    System.out.println("name----------->"+name);
    }


    // 性别
    XSSFCell sexCell = row.getCell(4);
    if (sexCell != null) {
    double sex = idCell.getNumericCellValue();
    //String sex = sexCell.getRichStringCellValue().toString();
    //int sexs = Integer.parseInt(sex);
    System.out.println("sex----------->"+sex);
    }


    // 邮箱
    XSSFCell emailCell = row.getCell(5);
    if (emailCell != null) {
    String email = emailCell.getRichStringCellValue().toString();
    System.out.println("email----------->"+email);
    }


    // 手机
    XSSFCell phoneCell = row.getCell(6);
    if (phoneCell != null) {
    String phone = phoneCell.getRichStringCellValue().toString();
    System.out.println("phone----------->"+phone);
    }


    } catch (Exception e) {
    throw new RuntimeException(e);
    }
    }
    }
    } catch (Exception e) {
    throw new RuntimeException(e);
    }
    }



    public static void readExcel2003(File file) {


    try {
    // 创建工作区,读取上传文件
    Workbook wb = WorkbookFactory.create(new FileInputStream(file));
    Sheet sheet = wb.getSheetAt(0);
    int rows = sheet.getPhysicalNumberOfRows();// 获取全部的行


    if (rows > 0) {
    for (int i = 1; i < rows; i++) {
    //获取每一行
    HSSFRow row = (HSSFRow) sheet.getRow(i);
    if (row == null) {
    continue;
    }
    try {
    //获取列数開始
    HSSFCell idCell = row.getCell(0);
    if (idCell != null) {
    double id = idCell.getNumericCellValue();
    // int id =
    // Integer.parseInt(idCell.getRichStringCellValue().toString());
    System.out.print("id: "+id+",");
    }


    // 账号
    HSSFCell accountsCell = row.getCell(1);
    String accounts = null;
    if (accountsCell != null) {
    accounts = accountsCell.getRichStringCellValue().toString();
    System.out.print("accounts: "+accounts+",");
    }


    // password
    HSSFCell passwordCell = row.getCell(2);
    if (passwordCell != null) {
    String password = passwordCell.getRichStringCellValue().toString();
    System.out.print("password: "+password+",");
    }


    // 姓名
    HSSFCell nameCell = row.getCell(3);
    if (nameCell != null) {
    String name = nameCell.getRichStringCellValue().toString();
    System.out.print("name: "+name+",");
    }


    // 性别
    HSSFCell sexCell = row.getCell(4);
    if (sexCell != null) {
    double sex = idCell.getNumericCellValue();
    // String sex =
    // sexCell.getRichStringCellValue().toString();
    // int sexs = Integer.parseInt(sex);
    System.out.print("sex: "+sex+",");
    }


    // 邮箱
    HSSFCell emailCell = row.getCell(5);
    if (emailCell != null) {
    String email = emailCell.getRichStringCellValue().toString();
    System.out.print("email: "+email+",");
    }


    // 手机
    HSSFCell phoneCell = row.getCell(6);
    if (phoneCell != null) {
    String phone = phoneCell.getRichStringCellValue().toString();
    System.out.println("phone: "+phone);
    }
    } catch (Exception e) {
    throw new RuntimeException(e);
    }
    }
    }
    } catch (Exception e) {
    throw new RuntimeException(e);
    }
    }

    /**
    * 读取Excel2007
    * @param args
    */

    public static void main(String[] args) {
    File file = new File("D:\apache-tomcat-6.0.18\webapps\GOVWBWeb\upload\user\2014\04\18\11\adcc6bc6-bd5e-43e9-9a53-3ba879dfa62d.xlsx");
    //readExcel2007(file);
    readExcel2003(new File("C:\Users\Administrator\Desktop\export.xls"));
    }
    }



    Pdf操作类:

    import java.io.FileInputStream;
    import org.apache.pdfbox.cos.COSDocument;
    import org.apache.pdfbox.pdfparser.PDFParser;
    import org.apache.pdfbox.pdmodel.PDDocument;
    import org.apache.pdfbox.util.PDFTextStripper;


    /**
     * 提取pdf中的内容
     * @author wangshouhai
     * @Version 2014-4-18:下午12:47:27
     */
    public class ReadPdf {
    public String readFdf(String file) {
    try {
    PDFParser parser = new PDFParser(new FileInputStream(file));
    parser.parse();
    COSDocument doc=parser.getDocument();
    PDFTextStripper stripper = new PDFTextStripper();
    String docText = stripper.getText(new PDDocument(doc));
    docText= convertorSymbol(docText); 
    return docText;
    } catch (Exception e) {
    throw new RuntimeException(e);
    }
    }




    /**
    * 处理特殊字符

    * @param sub
    * @param docText
    */
    public static String convertorSymbol(String docText) {
    StringBuilder sub = new StringBuilder();
    char[] ch = docText.toCharArray();
    for (int i = 0; i < ch.length; i++) {
    char buf = ch[i];
    if (9 == buf || 10 == buf || 13 == buf || 32 <= buf && !Character.isISOControl(buf)) {
    sub.append(buf);
    }
    }
    return sub.toString().replaceAll("\s*", "");
    }

    public static void main(String args[]) {
    String text =new ReadPdf().readFdf("D:\html2.pdf");
    System.out.println("ReadPdf---------->"+text);
    }
    }


    文本操作类:



    import java.io.BufferedReader;
    import java.io.File;
    import java.io.FileInputStream;
    import java.io.IOException;
    import java.io.InputStreamReader;


    public class ReadText {

        /**
         * Loads a UTF-8 text file and returns its content as a single string.
         *
         * <p>The file is consumed line by line; the lines are joined without their
         * line terminators and the result is passed through
         * {@link #convertorSymbol(String)}.
         *
         * @param file the text file to read
         * @return the filtered content of the file
         * @throws RuntimeException wrapping any failure while opening, reading or closing the file
         */
        public static String readText(File file) {
            StringBuilder content = new StringBuilder();
            BufferedReader reader = null;
            try {
                FileInputStream rawStream = new FileInputStream(file);
                reader = new BufferedReader(new InputStreamReader(rawStream, "utf-8"));
                for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                    content.append(line);
                }
                String joined = content.toString();
                return convertorSymbol(joined);
            } catch (Exception e) {
                throw new RuntimeException(e);
            } finally {
                if (reader != null) {
                    try {
                        reader.close();
                    } catch (IOException e) {
                        throw new RuntimeException(e);
                    }
                }
            }
        }

        /**
         * Filters control characters out of the given text.
         *
         * <p>Tab, line feed and carriage return are kept; any other character is kept
         * only when it is at or above the space character and is not an ISO control
         * character.
         *
         * @param docText the text to filter
         * @return the text without the rejected characters
         */
        public static String convertorSymbol(String docText) {
            StringBuilder kept = new StringBuilder();
            for (char current : docText.toCharArray()) {
                boolean layoutChar = current == '\t' || current == '\n' || current == '\r';
                boolean visibleChar = current >= ' ' && !Character.isISOControl(current);
                if (layoutChar || visibleChar) {
                    kept.append(current);
                }
            }
            return kept.toString();
        }

        public static void main(String[] args) {
            String text = readText(new File("C:/Users/Administrator/Desktop/异常信息列表.txt"));
            System.out.println("text-->"+text);
        }
    }


  • 相关阅读:
    bzoj 4237 稻草人
    bzoj 4537 最小公倍数
    POJ 2763 Housewife Wind(树链剖分)(线段树单点修改)
    HDU 3966 Aragorn's Story(树链剖分)(线段树区间修改)
    spoj 913 Query on a tree II (倍增lca)
    spoj 375 Query on a tree (树链剖分)
    hiho一下第133周 2-SAT·hihoCoder音乐节(2-SAT)(强连通)
    hiho一下第131周 后缀自动机二·重复旋律8(循环相似子串)
    hiho一下第130周 后缀自动机二·重复旋律7
    hiho一下第129周 后缀自动机二·重复旋律6
  • 原文地址:https://www.cnblogs.com/zfyouxi/p/4298214.html
Copyright © 2020-2023  润新知