• Lucene基础(三)-- 中文分词及高亮显示


    Lucene分词器及高亮

    分词器

    在 Lucene 中我们按照分词方式对文档建立索引,不同的分词器索引的效果不太一样。之前的例子使用的都是标准分词器,对英文的效果很好,但中文分词效果就不怎么样:它会按单个汉字直接切分,没有词语的概念。

    使用分词的地方只需要把Analyzer实例化成我们第三方的分词器即可

    中文分词有很多,这里使用IKAnalyzer 为例, 
    下载地址 https://git.oschina.net/wltea/IK-Analyzer-2012FF ,下载后里面有一篇使用教程。

    高亮

    导入 lucene-highlighter-xxx.jar 后,即可对查询出来的结果实现高亮显示

    // 关键字高亮显示的html标签,需要导入lucene-highlighter-xxx.jar

      SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<span style='color:red'>", "</span>");

      Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(query));

      for (int i = 0; i < hits.length; i++) {

        Document doc = isearcher.doc(hits[i].doc);

        // 内容增加高亮显示

        TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(doc.get("content")));

        String content = highlighter.getBestFragment(tokenStream, doc.get("content")); System.out.println(content);

      }

    Lucene中文分词器

    package lucene_demo04;

    import java.io.IOException;
    import java.io.StringReader;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.document.TextField;
    import org.apache.lucene.index.CorruptIndexException;
    import org.apache.lucene.index.DirectoryReader;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.index.IndexWriterConfig.OpenMode;
    import org.apache.lucene.queryparser.classic.ParseException;
    import org.apache.lucene.queryparser.classic.QueryParser;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.QueryWrapperFilter;
    import org.apache.lucene.search.ScoreDoc;
    import org.apache.lucene.search.highlight.Highlighter;
    import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
    import org.apache.lucene.search.highlight.QueryScorer;
    import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.RAMDirectory;
    import org.apache.lucene.util.Version;
    import org.wltea.analyzer.lucene.IKAnalyzer;

    /**
     * Chinese word segmentation with IKAnalyzer, plus highlighting of matched
     * keywords in search results via lucene-highlighter.
     *
     * <p>Fixes over the original: the analyzer is now actually cached (the old
     * getAnalyzer never assigned the field), search methods guard against a
     * null searcher, and the duplicated highlight loop is a shared helper.
     *
     * @author YipFun
     */
    public class LuceneDemo04
    {
      private static final Version version = Version.LUCENE_4_9;
      private Directory directory = null;
      private DirectoryReader ireader = null;
      private IndexWriter iwriter = null;
      private IKAnalyzer analyzer;

      // Sample documents; each array entry becomes one indexed document.
      private String[] content = { "你好,我是中共人", "中华人民共和国", "中国人民从此站起来了", "Lucene是一个不错的全文检索的工具", "全文检索中文分词" };

      /**
       * Creates the demo backed by an in-memory (RAM) index.
       */
      public LuceneDemo04()
      {
        directory = new RAMDirectory();
      }

      /**
       * Lazily creates and caches a single IKAnalyzer instance.
       * Bug fix: the original returned {@code new IKAnalyzer()} without ever
       * assigning the field, so the cache was dead code and every call built a
       * fresh analyzer.
       */
      private IKAnalyzer getAnalyzer()
      {
        if (analyzer == null)
        {
          analyzer = new IKAnalyzer();
        }
        return analyzer;
      }

      /**
       * Indexes every entry of {@code content} as a stored "content" text field.
       * The writer is closed in the finally block so the index is committed even
       * if an earlier add fails.
       */
      public void createIndex()
      {
        try
        {
          IndexWriterConfig iwConfig = new IndexWriterConfig(version, getAnalyzer());
          iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
          iwriter = new IndexWriter(directory, iwConfig);
          for (String text : content)
          {
            Document doc = new Document();
            doc.add(new TextField("content", text, Field.Store.YES));
            iwriter.addDocument(doc);
          }
        } catch (IOException e)
        {
          e.printStackTrace();
        } finally
        {
          try
          {
            if (iwriter != null)
            {
              iwriter.close();
            }
          } catch (IOException e)
          {
            e.printStackTrace();
          }
        }
      }

      /**
       * Returns a searcher over the current state of the index, reopening the
       * shared reader when the index has changed since it was last opened.
       *
       * @return a fresh IndexSearcher, or {@code null} when the index cannot be read
       */
      public IndexSearcher getSearcher()
      {
        try
        {
          if (ireader == null)
          {
            ireader = DirectoryReader.open(directory);
          } else
          {
            // openIfChanged returns null when the existing reader is still current.
            DirectoryReader tr = DirectoryReader.openIfChanged(ireader);
            if (tr != null)
            {
              ireader.close();
              ireader = tr;
            }
          }
          return new IndexSearcher(ireader);
        } catch (CorruptIndexException e)
        {
          e.printStackTrace();
        } catch (IOException e)
        {
          e.printStackTrace();
        }
        return null;
      }

      /**
       * Searches {@code field} for {@code keyword} and prints each hit's
       * "content" value with the matched terms wrapped in a red HTML span.
       *
       * @param field   indexed field to query
       * @param keyword query string, parsed with the IK analyzer
       * @param num     maximum number of hits to retrieve
       * @throws InvalidTokenOffsetsException if the highlighter sees invalid token offsets
       */
      public void searchByTerm(String field, String keyword, int num) throws InvalidTokenOffsetsException
      {
        IndexSearcher isearcher = getSearcher();
        if (isearcher == null)
        {
          return; // index unreadable; getSearcher already logged the cause
        }
        Analyzer analyzer = getAnalyzer();
        QueryParser qp = new QueryParser(version, field, analyzer);
        // Combine parsed terms with OR (NOTE(review): OR is already the
        // QueryParser default, so this line is likely a no-op — kept for clarity).
        qp.setDefaultOperator(QueryParser.OR_OPERATOR);
        try
        {
          Query query = qp.parse(keyword);
          ScoreDoc[] hits = isearcher.search(query, null, num).scoreDocs;
          printHighlighted(isearcher, analyzer, query, hits);
        } catch (IOException e)
        {
          e.printStackTrace();
        } catch (ParseException e)
        {
          e.printStackTrace();
        }
      }

      /**
       * Like {@link #searchByTerm}, but restricts the hits with a
       * QueryWrapperFilter built from a second query ("全文检索").
       *
       * @param field   indexed field to query
       * @param keyword query string, parsed with the IK analyzer
       * @param num     maximum number of hits to retrieve
       * @throws InvalidTokenOffsetsException if the highlighter sees invalid token offsets
       */
      public void searchByTermFilter(String field, String keyword, int num) throws InvalidTokenOffsetsException
      {
        IndexSearcher isearcher = getSearcher();
        if (isearcher == null)
        {
          return; // index unreadable; getSearcher already logged the cause
        }
        Analyzer analyzer = getAnalyzer();
        QueryParser qp = new QueryParser(version, field, analyzer);
        qp.setDefaultOperator(QueryParser.OR_OPERATOR);
        try
        {
          Query query = qp.parse(keyword);
          Query q2 = qp.parse("全文检索");
          QueryWrapperFilter filter = new QueryWrapperFilter(q2);
          ScoreDoc[] hits = isearcher.search(query, filter, num).scoreDocs;
          printHighlighted(isearcher, analyzer, query, hits);
        } catch (IOException e)
        {
          e.printStackTrace();
        } catch (ParseException e)
        {
          e.printStackTrace();
        }
      }

      /**
       * Prints each hit's "content" field with query matches wrapped in an HTML
       * span (requires lucene-highlighter-xxx.jar). Shared by both search methods.
       */
      private void printHighlighted(IndexSearcher isearcher, Analyzer analyzer, Query query, ScoreDoc[] hits)
          throws IOException, InvalidTokenOffsetsException
      {
        SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<span style='color:red'>", "</span>");
        Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(query));
        for (int i = 0; i < hits.length; i++)
        {
          Document doc = isearcher.doc(hits[i].doc);
          TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(doc.get("content")));
          String content = highlighter.getBestFragment(tokenStream, doc.get("content"));
          System.out.println(content);
        }
      }

      public static void main(String[] args) throws InvalidTokenOffsetsException
      {
        System.out.println("start");
        LuceneDemo04 ld = new LuceneDemo04();
        ld.createIndex();
        long start = System.currentTimeMillis();
        ld.searchByTerm("content", "人民", 500);
        System.out.println("end search use " + (System.currentTimeMillis() - start) + "ms");
      }

    }

    运行结果:

    start 加载扩展词典:ext.dic

    加载扩展停止词典:stopword.dic

    中华<span style='color:red'>人民</span>共和国

    中国<span style='color:red'>人民</span>从此站起来了

    end search use 129ms

  • 相关阅读:
    Yum安装Lamp环境
    Cacti系统监控安装
    源码安装Memcache
    Lamp源码编译+SVN安装
    分页数据列表写法
    文件单位转换函数
    Session写入到Memcache,Redis和数据库中
    [LeetCode#30]Substring with Concatenation of All Words
    快速创建php server
    Git skills in reseting files
  • 原文地址:https://www.cnblogs.com/downey/p/4890783.html
Copyright © 2020-2023  润新知