【Lucene】三个高亮显示模块的简单示例-Highlighter

【Lucene】三个高亮显示模块的简单示例-Highlighter
Lucene 针对高亮显示功能提供了两种实现方式，分别是 Highlighter 和 FastVectorHighlighter。

这里的三个示例都使用 Highlighter。

示例代码：
1. package com.tan.code;
3. import java.io.File;
4. import java.io.IOException;
5. import java.io.StringReader;
7. import org.apache.lucene.analysis.TokenStream;
8. import org.apache.lucene.analysis.core.SimpleAnalyzer;
9. import org.apache.lucene.document.Document;
10. import org.apache.lucene.index.DirectoryReader;
11. import org.apache.lucene.index.IndexReader;
12. import org.apache.lucene.index.Term;
13. import org.apache.lucene.queryparser.classic.ParseException;
14. import org.apache.lucene.queryparser.classic.QueryParser;
15. import org.apache.lucene.search.IndexSearcher;
16. import org.apache.lucene.search.Query;
17. import org.apache.lucene.search.ScoreDoc;
18. import org.apache.lucene.search.TermQuery;
19. import org.apache.lucene.search.TopDocs;
20. import org.apache.lucene.search.highlight.Highlighter;
21. import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
22. import org.apache.lucene.search.highlight.QueryScorer;
23. import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
24. import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
25. import org.apache.lucene.search.highlight.TokenSources;
26. import org.apache.lucene.store.Directory;
27. import org.apache.lucene.store.SimpleFSDirectory;
28. import org.apache.lucene.util.Version;
29. import org.wltea.analyzer.lucene.IKAnalyzer;
31. public class HighlighterTest {
33. // 高亮處理文本（以下内容纯属虚构）
34. private String text = "China has lots of people,most of them are very poor.China is very big.China become strong now,but the poor people is also poor than other controry";
36. // 原文高亮
37. public void highlighter() throws IOException, InvalidTokenOffsetsException {
39. TermQuery termQuery = new TermQuery(new Term("field", "china"));
40. TokenStream tokenStream = new SimpleAnalyzer(Version.LUCENE_43)
41. .tokenStream("field", new StringReader(text));
43. QueryScorer queryScorer = new QueryScorer(termQuery);
44. Highlighter highlighter = new Highlighter(queryScorer);
45. highlighter.setTextFragmenter(new SimpleSpanFragmenter(queryScorer));
46. System.out.println(highlighter.getBestFragment(tokenStream, text));
47. }
49. // 使用CSS進行高亮顯示處理
50. public void highlighter_CSS(String searchText) throws ParseException,
51. IOException, InvalidTokenOffsetsException {
53. // 創建查詢
54. QueryParser queryParser = new QueryParser(Version.LUCENE_43, "field",
55. new SimpleAnalyzer(Version.LUCENE_43));
56. Query query = queryParser.parse(searchText);
58. // 自定义标注高亮文本标签
59. SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter(
60. "", "");
61. // 语汇单元化
62. TokenStream tokenStream = new SimpleAnalyzer(Version.LUCENE_43)
63. .tokenStream("field", new StringReader(text));
65. // 創建QueryScoer
66. QueryScorer queryScorer = new QueryScorer(query, "field");
68. Highlighter highlighter = new Highlighter(htmlFormatter, queryScorer);
69. highlighter.setTextFragmenter(new SimpleSpanFragmenter(queryScorer));
71. System.out.println(highlighter.getBestFragments(tokenStream, text, 4,
72. "..."));
73. }
75. // 高亮顯示搜索結果
76. public void highlighter_SR(String field, String searchText)
77. throws IOException, ParseException, InvalidTokenOffsetsException {
79. //本次示例为了简便直接使用之前实验建立的索引
80. Directory directory = new SimpleFSDirectory(new File("E://MyIndex"));
81. IndexReader reader = DirectoryReader.open(directory);// 读取目录
82. IndexSearcher search = new IndexSearcher(reader);// 初始化查询组件
83. QueryParser parser = new QueryParser(Version.LUCENE_43, field,
84. new IKAnalyzer(true));
86. Query query = parser.parse(searchText);
88. TopDocs td = search.search(query, 10000);// 获取匹配上元素的一个docid
89. ScoreDoc[] sd = td.scoreDocs;// 加载所有的Documnet文档
91. System.out.println("本次命中数据:" + sd.length);
92. QueryScorer scorer = new QueryScorer(query, "content");
94. Highlighter highlighter = new Highlighter(scorer);
95. highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer));
97. for (ScoreDoc scoreDoc : sd) {
98. Document document = search.doc(scoreDoc.doc);
99. String content = document.get("content");
100. TokenStream tokenStream = TokenSources.getAnyTokenStream(
101. search.getIndexReader(), scoreDoc.doc, "content", document,
102. new IKAnalyzer(true));
103. System.out.println(highlighter
104. .getBestFragment(tokenStream, content));
105. }
106. }
107. }
测试代码：
1. @Test
2. public void test() throws IOException, InvalidTokenOffsetsException,
3. ParseException {
4. // fail("Not yet implemented");
5. HighlighterTest highlighterTest = new HighlighterTest();
6. highlighterTest.highlighter();
7. highlighterTest.highlighter_CSS("china");
8. highlighterTest.highlighter_CSS("poor");
9. highlighterTest.highlighter_SR("content", "床前明月光");
10. }
测试结果：
1. China has lots of people,most of them are very poor.China is very big.China become strong now,but the poor people is also poor than other controry
2. China has lots of people,most of them are very poor.China is very big.China become strong now,but the poor people is also poor than other controry
3. China has lots of people,most of them are very poor.China is very big.China become strong now,but the poor people is also poor than other controry
4. 本次命中数据:1
5. 床前明月光，疑是地上霜
相关阅读:
加密web.config
SQL FOR XML
SQL语句中拆分字段
 Units specified don't exist SHSUCDX can't install
SQLSERVER与C#中数据类型的对应关系
 使用 FOR XML PATH 產生 XML 格式時，遇到 NULL 該如何處理?
T_SQL的 FOR XML PATH 用法
 T-SQL with关键字
 Sqlserver获取行号
 win10以太网没有有效的ip配置
原文地址：https://www.cnblogs.com/dingjiaoyang/p/6115292.html