• 基于Lucene的文件检索Demo


    通过Lucene实现了简单的文件检索功能的Demo。这个Demo支持基于文件内容的检索,支持中文分词和高亮显示。

    下面简单地介绍一下核心的类

    1)索引相关的类

             1.FileIndexBuilder ---建立索引

    package uap.pub.bap.fs.search.indexer;
    
    import java.io.File;
    import java.io.FileNotFoundException;
    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.List;
    
    import org.apache.commons.lang.ArrayUtils;
    import org.apache.commons.lang.StringUtils;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.index.IndexWriterConfig.OpenMode;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.util.Version;
    import org.wltea.analyzer.lucene.IKAnalyzer;
    
    import uap.pub.bap.fs.search.util.FileSearchUtils;
    
    /**
     * 文件索引生成器
     * 
     * @author chenfeic
     * 
     */
    public class FileIndexBuilder {
    
        /**
         * 需要索引的文件列表
         */
        private List<File> fileList = new ArrayList<File>();
    
        private IndexWriter writer;
    
        /**
         * 
         * @param fileDir
         *            文件位置
         * @param indexDir
         *            索引位置
         */
        public void generateIndexer(String fileDir, String indexDir) {
            if (StringUtils.isEmpty(indexDir) || StringUtils.isEmpty(fileDir)) {
                System.out.println("文件和索引路径都不能为空");
                throw new RuntimeException("文件和索引路径都不能为空");
            }
            Directory d = null;
            try {
                // 初始化IndexWriter
                d = FSDirectory.open(new File(indexDir));
                initWriter(indexDir, d);
                // 创建索引文档
                initIndex(fileDir);
                System.out.println("索引创建成功!");
            } catch (Exception e) {
                System.out.println("创建索引失败");
                System.out.println(e);
            } finally {
                FileSearchUtils.closeIndexWriter(writer);
                FileSearchUtils.closeDirectory(d);
            }
        }
    
        /**
         * 初始化 Lucene Index Writer 步骤1: Directory创建索引存放的位置 步骤2:创建分析器Analyzer
         * 步骤3:创建IndexWriterConfig,使用分析器Analyzer 步骤4:创建IndexWriter
         * 
         * @param indexDir
         * @param directory
         * @throws IOException
         */
        private void initWriter(String indexDir, Directory directory)
                throws IOException {
            Analyzer analyzer = new IKAnalyzer();
            IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_46,
                    analyzer);
            conf.setOpenMode(OpenMode.CREATE_OR_APPEND);
            writer = new IndexWriter(directory, conf);
        }
    
        /**
         * 初始化索引文档
         * 
         * @param fileDir
         *            文档目录
         * @return
         */
        private int initIndex(String fileDir) {
            getAllSubFile(new File(fileDir));
            TextFileFilter fileter = new TextFileFilter();
            for (File file : fileList) {
                if (fileter.accept(file)) {
                    try {
                        DocumentBuilder db = new DocumentBuilder(file);
                        Document doc = db.createDocument();
                        writer.addDocument(doc);
                    } catch (FileNotFoundException e) {
                        System.out.println("创建索引失败,文件不存在:" + e.getMessage());
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
            }
            return writer.numDocs();
    
        }
    
        private void getAllSubFile(File file) {
            File[] listFiles = file.listFiles();
            if (ArrayUtils.isEmpty(listFiles)) {
                return;
            }
            for (File subfile : listFiles) {
                if (subfile.isDirectory()) {
                    getAllSubFile(subfile);
                } else {
                    fileList.add(subfile);
                }
            }
        }
    
        public static void main(String[] args) {
            String fileDir = "E:\lucene\data";
            String indexDir = "E:\lucene\index";
            FileIndexBuilder indexer = new FileIndexBuilder();
            indexer.generateIndexer(fileDir, indexDir);
        }
    
    }

    2. DocumentBuilder  --索引内容生成器

    package uap.pub.bap.fs.search.indexer;
    
    import java.io.File;
    import java.io.IOException;
    
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.document.StringField;
    import org.apache.lucene.document.TextField;
    
    import uap.pub.bap.fs.search.IFileSearchConst;
    import uap.pub.bap.fs.search.util.FileSearchUtils;
    
    /**
     * Builds the Lucene {@link Document} for a single file. The converter used
     * to extract the file content is chosen from the file name extension.
     *
     * @author chenfeic
     */
    public class DocumentBuilder {

        private File file = null;

        private IContextConverter contextConverter;

        /**
         * @param file
         *            the file to index; may be null, in which case
         *            {@link #createDocument()} returns null
         */
        public DocumentBuilder(File file) {
            this.file = file;
            initConverter();
        }

        /**
         * Selects the content converter matching the file type: Word, Excel and
         * PDF have dedicated converters, everything else is read as plain text.
         */
        private void initConverter() {
            if (file == null) {
                // createDocument() rejects a null file, so no converter is needed.
                return;
            }
            String fileType = FileSearchUtils.getFileType(file.getName());
            if ("docx".equalsIgnoreCase(fileType)
                    || "doc".equalsIgnoreCase(fileType)) {
                // 1. word
                contextConverter = new WordContextConverter();
            } else if ("xlsx".equalsIgnoreCase(fileType)
                    || "xls".equalsIgnoreCase(fileType)) {
                // 2. excel
                contextConverter = new ExcelContextConverter();
            } else if ("pdf".equalsIgnoreCase(fileType)) {
                // 3. pdf
                contextConverter = new PdfContextConverter();
            } else {
                // 4. txt (log) and any other type
                contextConverter = new TextContextConverter();
            }
        }

        /**
         * Creates the document holding the content, name and path of the file.
         *
         * @return the document, or null when the file is null or does not exist
         */
        public Document createDocument() {
            if (file == null || !file.exists()) {
                return null;
            }
            String content = contextConverter.context2String(file);
            if (content == null) {
                // A converter may yield no text; index the file by name and path
                // only instead of passing a null field value to Lucene.
                content = "";
            }
            Document doc = new Document();
            doc.add(new TextField(IFileSearchConst.CONTENT_TYPE, content,
                    Field.Store.YES));
            doc.add(new StringField(IFileSearchConst.FILENAM_TYPE, file.getName(),
                    Field.Store.YES));
            doc.add(new StringField(IFileSearchConst.PATH_TYPE, resolvePath(),
                    Field.Store.YES));
            return doc;
        }

        /**
         * Returns the canonical path of the file, falling back to its absolute
         * path when the canonical form cannot be resolved, so the document always
         * carries a path field.
         */
        private String resolvePath() {
            try {
                return file.getCanonicalPath();
            } catch (IOException e) {
                e.printStackTrace();
                return file.getAbsolutePath();
            }
        }
    }

    3: IContextConverter--文件内容转换器接口,将文件内容转换为字符串

    package uap.pub.bap.fs.search.indexer;
    
    import java.io.File;
    
    /**
     * Strategy for turning the content of a file into plain text.
     *
     * @author chenfeic
     */
    public interface IContextConverter {

        /**
         * Extracts the content of the given file as a string.
         *
         * @param file
         *            the file whose content is read
         * @return the content of the file as text
         */
        String context2String(File file);

    }

    4:AbstractContextConverter--这个类主要利用第三方开源包cpdetector获取文件编码格式

    package uap.pub.bap.fs.search.indexer;
    
    import info.monitorenter.cpdetector.io.ASCIIDetector;
    import info.monitorenter.cpdetector.io.CodepageDetectorProxy;
    import info.monitorenter.cpdetector.io.JChardetFacade;
    import info.monitorenter.cpdetector.io.ParsingDetector;
    import info.monitorenter.cpdetector.io.UnicodeDetector;
    
    import java.io.File;
    
    public abstract class AbstractContextConverter implements IContextConverter {

        /**
         * Detector proxy configured once for all converters.
         * {@code CodepageDetectorProxy.getInstance()} hands out a shared instance,
         * so the detectors are registered a single time here instead of being
         * added again on every {@link #getFileEncode(String)} call.
         */
        private static final CodepageDetectorProxy DETECTOR = createDetector();

        /**
         * Registers the detection strategies on the shared proxy. The proxy
         * delegates to them and returns the first non-null result. Detection is
         * statistical and is not guaranteed to be correct. Requires antlr.jar,
         * chardet.jar and cpdetector.jar on the classpath.
         */
        private static CodepageDetectorProxy createDetector() {
            CodepageDetectorProxy detector = CodepageDetectorProxy.getInstance();
            // ParsingDetector inspects HTML/XML style content; "false" disables
            // verbose output of the detection process.
            detector.add(new ParsingDetector(false));
            // JChardetFacade wraps Mozilla's JChardet and covers most files
            // (uses antlr.jar and chardet.jar).
            detector.add(JChardetFacade.getInstance());
            // ASCIIDetector detects ASCII encoded content.
            detector.add(ASCIIDetector.getInstance());
            // UnicodeDetector detects the Unicode family of encodings.
            detector.add(UnicodeDetector.getInstance());
            return detector;
        }

        /**
         * Detects the character encoding of a file with the third-party
         * cpdetector library.
         *
         * @param path
         *            path of the file whose encoding is detected
         * @return the name of the detected charset, or null when detection failed
         * @author huanglei
         * @version 2012-7-12 14:05
         */
        protected String getFileEncode(String path) {
            java.nio.charset.Charset charset = null;
            try {
                charset = DETECTOR.detectCodepage(new File(path).toURI().toURL());
            } catch (Exception ex) {
                ex.printStackTrace();
            }
            return charset == null ? null : charset.name();
        }

    }

    4.TextContextConverter 

    package uap.pub.bap.fs.search.indexer;
    
    import java.io.BufferedReader;
    import java.io.File;
    import java.io.FileInputStream;
    import java.io.InputStream;
    import java.io.InputStreamReader;
    
    import uap.pub.bap.fs.search.util.FileSearchUtils;
    
    /**
     * txt、配置文件、log等文本的Document生成器
     * 
     * @author chenfeic
     * 
     */
    public class TextContextConverter extends AbstractContextConverter {
    
        @Override
        public String context2String(File file) {
            StringBuilder sb = new StringBuilder();
            BufferedReader reader = null;
            InputStream in = null;
            try {
                String encoding = getFileEncode(file.getCanonicalPath());
                in = new FileInputStream(file);
                if (encoding != null && !"".equals(encoding.trim())) {
                    reader = new BufferedReader(new InputStreamReader(in, encoding));
                } else {
                    reader = new BufferedReader(new InputStreamReader(in));
                }
                // 将输入流写入输出流
                String line = "";
                while ((line = reader.readLine()) != null) {
                    sb.append(line + "
    ");
                }
            } catch (Exception e) {
                e.printStackTrace();
            } finally {
                FileSearchUtils.closeInputStream(in);
                FileSearchUtils.closeReader(reader);
            }
            return sb.toString();
        }
    
    }

    下面的两个类主要是读取excel、word等office 办公软件的内容,用到的第三方插件为poi

    5.WordContextConverter  

    package uap.pub.bap.fs.search.indexer;
    
    import java.io.File;
    import java.io.FileInputStream;
    import java.io.FileNotFoundException;
    import java.io.IOException;
    import java.io.InputStream;
    
    import org.apache.poi.POIXMLDocument;
    import org.apache.poi.hwpf.extractor.WordExtractor;
    import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
    import org.apache.poi.openxml4j.opc.OPCPackage;
    import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
    import org.apache.xmlbeans.XmlException;
    
    import uap.pub.bap.fs.search.util.FileSearchUtils;
    
    /**
     * Converter for Word documents ({@code .doc} and {@code .docx}).
     *
     * @author chenfeic
     */
    public class WordContextConverter extends AbstractContextConverter {

        /**
         * Extracts the text of a Word document.
         *
         * @param file
         *            the Word file to read
         * @return the extracted text, or the empty string when extraction fails
         */
        @Override
        public String context2String(File file) {
            String text = isWord2003(file) ? readWord2003(file) : readWord2007(file);
            // Never hand null to callers: they pass the result on as a field
            // value, and the plain-text converter also returns a string on failure.
            return text == null ? "" : text;
        }

        /**
         * Tells whether the file is a Word 97(-2003) document, judged by its
         * {@code doc} extension.
         *
         * @param file
         *            the file to check
         * @return true when the file type is "doc" (case-insensitive)
         */
        private boolean isWord2003(File file) {
            String fileType = FileSearchUtils.getFileType(file.getName());
            return "doc".equalsIgnoreCase(fileType);
        }

        /**
         * Reads the text of a Word 97(-2003) file.
         *
         * @param file
         *            the {@code .doc} file
         * @return the extracted text, or null when the file cannot be read
         */
        private String readWord2003(File file) {
            InputStream inputStream = null;
            String context = null;
            try {
                inputStream = new FileInputStream(file);
                WordExtractor extractor = new WordExtractor(inputStream);
                context = extractor.getText();
            } catch (IOException e) {
                // Also covers FileNotFoundException.
                e.printStackTrace();
            } finally {
                FileSearchUtils.closeInputStream(inputStream);
            }
            return context;
        }

        /**
         * Reads the body text of a {@code .docx} file.
         *
         * @param file
         *            the {@code .docx} file
         * @return the extracted text, or null when the file cannot be read
         */
        private String readWord2007(File file) {
            String text = null;
            OPCPackage openPackage = null;
            try {
                // Open the package and build the .docx text extractor on it
                openPackage = POIXMLDocument.openPackage(file.getCanonicalPath());
                XWPFWordExtractor docx = new XWPFWordExtractor(openPackage);
                // Extract the body text
                text = docx.getText();
            } catch (IOException e) {
                e.printStackTrace();
            } catch (XmlException e) {
                e.printStackTrace();
            } catch (OpenXML4JException e) {
                e.printStackTrace();
            } finally {
                if (openPackage != null) {
                    try {
                        openPackage.close();
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
            }
            return text;
        }

    }

    6.ExcelContextConverter 

    package uap.pub.bap.fs.search.indexer;
    
    import java.io.File;
    import java.io.FileInputStream;
    import java.io.FileNotFoundException;
    import java.io.IOException;
    import java.io.InputStream;
    
    import org.apache.poi.hssf.extractor.ExcelExtractor;
    import org.apache.poi.hssf.usermodel.HSSFWorkbook;
    import org.apache.poi.xssf.usermodel.XSSFCell;
    import org.apache.poi.xssf.usermodel.XSSFRow;
    import org.apache.poi.xssf.usermodel.XSSFSheet;
    import org.apache.poi.xssf.usermodel.XSSFWorkbook;
    
    import uap.pub.bap.fs.search.util.FileSearchUtils;
    
    /**
     * Converter for Excel workbooks ({@code .xls} and {@code .xlsx}).
     *
     * @author chenfeic
     */
    public class ExcelContextConverter extends AbstractContextConverter {

        /**
         * Extracts the text of an Excel workbook.
         *
         * @param file
         *            the Excel file to read
         * @return the extracted text; empty when nothing could be read
         */
        @Override
        public String context2String(File file) {
            if (isExcel2003(file)) {
                return readExcel2003(file);
            } else {
                return readExcel2007(file);
            }
        }

        /**
         * Tells whether the file is an Excel 97(-2003) workbook, judged by its
         * {@code xls} extension.
         *
         * @param file
         *            the file to check
         * @return true when the file type is "xls" (case-insensitive)
         */
        private boolean isExcel2003(File file) {
            String fileType = FileSearchUtils.getFileType(file.getName());
            return "xls".equalsIgnoreCase(fileType);
        }

        /**
         * Reads the text of an {@code .xls} workbook. Formulas are emitted instead
         * of their results and sheet names are left out.
         *
         * @param file
         *            the {@code .xls} file
         * @return the extracted text, or the empty string when the file cannot be
         *         read
         */
        public String readExcel2003(File file) {
            InputStream inputStream = null;
            String content = null;
            try {
                inputStream = new FileInputStream(file);
                HSSFWorkbook wb = new HSSFWorkbook(inputStream);
                ExcelExtractor extractor = new ExcelExtractor(wb);
                extractor.setFormulasNotResults(true);
                extractor.setIncludeSheetNames(false);
                content = extractor.getText();
            } catch (IOException e) {
                // Also covers FileNotFoundException.
                e.printStackTrace();
            } finally {
                FileSearchUtils.closeInputStream(inputStream);
            }
            // Return a string on failure as well, like readExcel2007 does.
            return content == null ? "" : content;
        }

        /**
         * Reads the text of an {@code .xlsx} workbook. Cells are separated by a
         * tab and rows by a newline so that the values of neighbouring cells do
         * not run together into a single token.
         *
         * @param file
         *            the {@code .xlsx} file
         * @return the extracted text; whatever was read before a failure
         *         (possibly the empty string) when reading fails
         */
        public String readExcel2007(File file) {
            StringBuilder content = new StringBuilder();
            InputStream inputStream = null;
            try {
                inputStream = new FileInputStream(file);
                XSSFWorkbook xwb = new XSSFWorkbook(inputStream);
                // Walk all sheets
                for (int numSheet = 0; numSheet < xwb.getNumberOfSheets(); numSheet++) {
                    XSSFSheet xSheet = xwb.getSheetAt(numSheet);
                    if (xSheet == null) {
                        continue;
                    }
                    // Walk all rows; getLastRowNum() is the index of the last row
                    for (int rowNum = 0; rowNum <= xSheet.getLastRowNum(); rowNum++) {
                        XSSFRow xRow = xSheet.getRow(rowNum);
                        if (xRow == null) {
                            continue;
                        }
                        // Walk all cells; getLastCellNum() is the last cell index
                        // PLUS ONE (or -1 for a row without cells), hence "<".
                        for (int cellNum = 0; cellNum < xRow.getLastCellNum(); cellNum++) {
                            XSSFCell xCell = xRow.getCell(cellNum);
                            if (xCell == null) {
                                continue;
                            }
                            content.append(cellText(xCell)).append('\t');
                        }
                        content.append('\n');
                    }
                }
            } catch (IOException e) {
                // Also covers FileNotFoundException.
                e.printStackTrace();
            } finally {
                FileSearchUtils.closeInputStream(inputStream);
            }

            return content.toString();
        }

        /**
         * Renders one cell as text. Only string cells are read through
         * {@code getStringCellValue()}; formula cells yield their formula, in
         * line with {@link #readExcel2003(File)}, and the remaining types (blank,
         * error) use the cell's own string form.
         */
        private static String cellText(XSSFCell cell) {
            int type = cell.getCellType();
            if (type == XSSFCell.CELL_TYPE_BOOLEAN) {
                return String.valueOf(cell.getBooleanCellValue());
            }
            if (type == XSSFCell.CELL_TYPE_NUMERIC) {
                return String.valueOf(cell.getNumericCellValue());
            }
            if (type == XSSFCell.CELL_TYPE_STRING) {
                return cell.getStringCellValue();
            }
            if (type == XSSFCell.CELL_TYPE_FORMULA) {
                return cell.getCellFormula();
            }
            return cell.toString();
        }
    }

    2)检索相关的类

    FileSearchServiceImpl --查询关键字

    package uap.pub.bap.fs.search.service;
    
    import java.io.File;
    import java.io.IOException;
    import java.io.StringReader;
    import java.util.ArrayList;
    import java.util.List;
    
    import org.apache.commons.lang.StringUtils;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.index.DirectoryReader;
    import org.apache.lucene.index.IndexNotFoundException;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.queryparser.classic.ParseException;
    import org.apache.lucene.queryparser.classic.QueryParser;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.ScoreDoc;
    import org.apache.lucene.search.TopDocs;
    import org.apache.lucene.search.highlight.Highlighter;
    import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
    import org.apache.lucene.search.highlight.QueryScorer;
    import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.util.Version;
    import org.wltea.analyzer.lucene.IKAnalyzer;
    
    import uap.pub.bap.fs.search.IFileSearchConst;
    import uap.pub.bap.fs.search.SearchResult;
    import uap.pub.bap.fs.search.util.FileSearchUtils;
    
    public class FileSearchServiceImpl implements IFileSerachService {
    
        private int count = 0;
    

       @Override

    public List<SearchResult> search(String type, String key) {
    //type:查询的类型---标题,文件内容等 //key:查询关键字 List
    <SearchResult> results = new ArrayList<SearchResult>(); if (StringUtils.isEmpty(key)) { return results; } // TODO chenfeic String indexDir = "E:\lucene\index"; IndexReader reader = null; Directory directory = null; try { directory = FSDirectory.open(new File(indexDir)); reader = DirectoryReader.open(directory); IndexSearcher search = new IndexSearcher(reader); // 使用QueryParser查询分析器构造Query对象 Analyzer analyzer = new IKAnalyzer(); QueryParser qp = new QueryParser(Version.LUCENE_46, type, analyzer); qp.setDefaultOperator(QueryParser.AND_OPERATOR); Query query = qp.parse(key); // 一个简单的指针容器,指向前N个排名的搜索结果 TopDocs hits = search.search(query, null, 100); count = hits.totalHits; for (ScoreDoc soreDoc : hits.scoreDocs) { Document doc = search.doc(soreDoc.doc); String summary = toHighlighter(query, doc, IFileSearchConst.CONTENT_TYPE, analyzer); String title = doc.get(IFileSearchConst.FILENAM_TYPE); String path = doc.get(IFileSearchConst.PATH_TYPE); SearchResult result = new SearchResult(); result.setPath(path); result.setTitle(title); if (!StringUtils.isEmpty(summary)) { result.setSummary(summary); } results.add(result); } } catch (IndexNotFoundException e1) { System.out.println("无查询结果,没有此词条的索引"); }catch (IOException e) { System.out.println("无查询结果!"); e.printStackTrace(); } catch (ParseException e) { e.printStackTrace(); } finally { FileSearchUtils.closeIndexReader(reader); FileSearchUtils.closeDirectory(directory); } return results; } /** * 高亮显示 * * @param query * @param doc * @param field * @return */ private String toHighlighter(Query query, Document doc, String field, Analyzer analyzer) { try { SimpleHTMLFormatter simpleHtmlFormatter = new SimpleHTMLFormatter( "<font color="red">", "</font>"); Highlighter highlighter = new Highlighter(simpleHtmlFormatter, new QueryScorer(query)); // highlighter.setTextFragmenter(new // SimpleFragmenter(20));//显示20个字符,默认是100个 TokenStream tokenStream = analyzer.tokenStream(field, new StringReader(doc.get(field))); String highlighterStr = 
highlighter.getBestFragment(tokenStream, doc.get(field)); return highlighterStr == null ? doc.get(field) : highlighterStr; } catch (IOException e) { e.printStackTrace(); } catch (InvalidTokenOffsetsException e) { e.printStackTrace(); } return null; } @Override public int getCount() { return this.count; } }

     上述基本上就是此Demo的核心类,其他的一些工具类和jsp、servlet处理类就没多写了

      基本上用的都是第三方的开源工具,比如支持中文分词的IK_Analyzer(版本IK Analyzer 2012FF_hf1)(注:开始的时候想用paoding作为中文的分词器,用了之后发现paoding不支持Lucene4.0版本,估计3.X就已经不支持了,原因是坑爹的Lucene总是变化它的实现和结构,把有些方法改成了final,而paoding中重写了这些方法,导致编译出错)。为了支持基于文件内容的检索,需要对文件内容进行索引并保存,所以用到了poi来处理ms office文件;对于txt等文本,读取时通过cpdetector检测文件的编码格式。代码都可以参照上面。代码有些是自己写的,有些是参照网上其他同仁的,在此一并谢过

    相关jar包列表为 

  • 相关阅读:
    python并行编程学习之绪论
    flask学习之解决Internal Server Error问题的方式之一
    mysql ERROR 1045 (28000): Access denied for user 'ODBC'@'localhost' (using password: NO)错误解决办法
    python之numpy矩阵库的使用(续)
    python常用序列list、tuples及矩阵库numpy的使用
    计算机网络学习之概述篇
    C++数据结构学习之顺序表
    python-networkx学习(1)
    Html&CSS学习笔记03---CSS介绍、CSS语法、CSS与HTML的结合、CSS选择器、CSS常用样式
    Html&CSS学习笔记02---HTML标签的介绍
  • 原文地址:https://www.cnblogs.com/chenfei0801/p/3488242.html
Copyright © 2020-2023  润新知