(转)lucene.net和（pangu）盘古分词搜索引擎的简单实现

(转)lucene.net和（pangu）盘古分词搜索引擎的简单实现
本文转载自：http://blog.csdn.net/dongdongleng/article/details/6058416

这篇文章是介绍lucene.net和盘古分词的简单的实现调用。建立索引，搜索，盘古分词的基类如下：
[c-sharp] view plain copy
1. using System;
2. using System.Data;
3. using System.Configuration;
4. using System.Linq;
5. using System.Web;
6. using System.Web.Security;
7. using System.Web.UI;
8. using System.Web.UI.HtmlControls;
9. using System.Web.UI.WebControls;
10. using System.Web.UI.WebControls.WebParts;
11. using System.Xml.Linq;
12. using Lucene.Net.Index;
13. using Lucene.Net.Documents;
14. using Lucene.Net.QueryParsers;
15. using Lucene.Net.Analysis;
16. using Lucene.Net.Search;
17. using Lucene.Net.Analysis.Standard;
18. using PanGu;
19. using Lucene.Net.Analysis.PanGu;
20. using StringBulider;
21. using System.Text;
22. using System.Collections;
23. using System.Collections.Generic;
/// <summary>
/// Builds a Lucene.Net full-text index over the N_Newinfo table and searches it,
/// using the PanGu segmenter for Chinese word breaking and hit highlighting.
/// </summary>
public class DataSearch
{
    /// <summary>Maximum number of characters of N_Content stored per indexed document.</summary>
    private const int MaxContentLength = 200;

    public DataSearch()
    {
    }

    A_DB_Conn db = null;
    string index = System.Web.HttpContext.Current.Server.MapPath("~/Gongxin/DataIndex"); // index directory
    string wordPath = System.Web.HttpContext.Current.Server.MapPath("~/Dictionaries"); // dictionary directory

    // Total number of hits found by the last search.
    private int _count;

    /// <summary>Total number of hits found by the last call to <see cref="Search"/>.</summary>
    public int Count
    {
        get { return _count; }
        set { _count = value; }
    }

    // Segmented search keywords.
    private string _keyword;

    /// <summary>
    /// Comma-separated list of the words the last keyword string was segmented into.
    /// </summary>
    public string KeyWord
    {
        get { return _keyword; }
        set { _keyword = value; }
    }

    // Elapsed search time.
    private double _time;

    /// <summary>Elapsed time of the last call to <see cref="Search"/>, in seconds.</summary>
    public double Time
    {
        get { return _time; }
        set { _time = value; }
    }

    /// <summary>
    /// Loads the rows to be indexed from the N_Newinfo table.
    /// </summary>
    /// <returns>The first table of the query result, or null when the query fails.</returns>
    public DataTable GetDS()
    {
        // NOTE(review): the query is pinned to N_id=12, which looks like a debugging
        // leftover (only one record would ever be indexed) - confirm before relying on it.
        string sql = "select N_id,N_title,N_datetime,N_Content from N_Newinfo where N_ifkping=1 and N_id=12 order by N_datetime desc";
        db = new A_DB_Conn();
        try
        {
            return db.GetDataSet(sql).Tables[0];
        }
        catch (Exception)
        {
            // Best effort: a failed query is reported to the caller as null.
            return null;
        }
        finally
        {
            db.close();
            db.Dispose();
        }
    }

    /// <summary>
    /// Adds one Lucene document per data row to the index.
    /// </summary>
    /// <param name="writer">Open index writer that receives the documents.</param>
    /// <param name="dataTable">Rows with N_id, N_title, N_datetime and N_Content columns; null adds nothing.</param>
    public void AddDocument(IndexWriter writer, DataTable dataTable)
    {
        if (writer == null)
        {
            throw new ArgumentNullException("writer");
        }
        if (dataTable == null)
        {
            return;
        }

        foreach (DataRow row in dataTable.Rows)
        {
            // Strip paragraph tags first, then truncate against the actual remaining
            // length so that short content can never make Substring throw.
            string content = StringCheck.SqlQueryDecode(row["N_Content"].ToString())
                .Replace("<p>", "")
                .Replace("</p>", "");
            if (content.Length > MaxContentLength)
            {
                content = content.Substring(0, MaxContentLength);
            }

            Document doc = new Document();
            doc.Add(new Field("ID", row["N_id"].ToString(), Field.Store.YES, Field.Index.TOKENIZED));
            doc.Add(new Field("N_title", StringCheck.SqlQueryDecode(row["N_title"].ToString()), Field.Store.YES, Field.Index.TOKENIZED));
            doc.Add(new Field("N_datetime", row["N_datetime"].ToString(), Field.Store.YES, Field.Index.TOKENIZED));
            doc.Add(new Field("N_Content", content, Field.Store.YES, Field.Index.TOKENIZED));
            writer.AddDocument(doc);
        }
    }

    /// <summary>
    /// Rebuilds the index from scratch from the rows returned by <see cref="GetDS"/>.
    /// </summary>
    /// <exception cref="InvalidOperationException">The source data could not be loaded.</exception>
    public void Index()
    {
        // Load the data before opening the writer: the writer is created with
        // create=true, which discards the existing index immediately.
        DataTable dataTable = GetDS();
        if (dataTable == null)
        {
            throw new InvalidOperationException("Index source data could not be loaded; the existing index was left untouched.");
        }

        PanGu.Segment.Init();
        IndexWriter writer = new IndexWriter(index, new PanGuAnalyzer(), true);
        try
        {
            AddDocument(writer, dataTable);
            writer.Optimize();
        }
        finally
        {
            // Always release the index write lock, even when indexing fails.
            writer.Close();
        }
    }

    /// <summary>
    /// Segments the search text into words and builds a boosted query string from them.
    /// Also stores the segmented words, comma-separated, in <see cref="KeyWord"/>.
    /// </summary>
    /// <param name="keyWords">Text to search for.</param>
    /// <param name="ktTokenizer">PanGu tokenizer used for segmentation.</param>
    /// <returns>
    /// Space-separated "word^boost" terms, or an empty string when no words were produced.
    /// </returns>
    public string GetKeyWordSplitBySpace(string keyWords, PanGuTokenizer ktTokenizer)
    {
        StringBuilder queryText = new StringBuilder();
        List<string> terms = new List<string>();

        if (!string.IsNullOrEmpty(keyWords))
        {
            ICollection<WordInfo> words = ktTokenizer.SegmentToWordInfos(keyWords);
            foreach (WordInfo word in words)
            {
                if (word == null)
                {
                    continue;
                }
                terms.Add(word.Word);
                // The trailing space separates the terms; without it they fuse into one token.
                queryText.AppendFormat("{0}^{1} ", word.Word, (int)Math.Pow(3, word.Rank));
            }
        }

        // Replace (rather than append to) the previous value so repeated calls do not accumulate.
        KeyWord = string.Join(",", terms.ToArray());
        return queryText.ToString().Trim();
    }

    /// <summary>
    /// Searches the N_title field and renders one page of hits as HTML.
    /// Updates <see cref="Count"/>, <see cref="KeyWord"/> and <see cref="Time"/>.
    /// </summary>
    /// <param name="keyWord">Search text.</param>
    /// <param name="pageNumber">Zero-based index of the first hit to render.</param>
    /// <param name="pageSize">Number of hits per page.</param>
    /// <returns>HTML markup for the requested hits; empty when there is nothing to show.</returns>
    public string Search(string keyWord, int pageNumber, int pageSize)
    {
        // Time the whole operation, including the index query itself.
        System.Diagnostics.Stopwatch timer = System.Diagnostics.Stopwatch.StartNew();
        StringBuilder builder = new StringBuilder();
        Count = 0;

        string word = GetKeyWordSplitBySpace(keyWord, new PanGuTokenizer());
        if (word.Length == 0)
        {
            // Nothing to search for; do not hand an empty string to the query parser.
            Time = timer.Elapsed.TotalSeconds;
            return builder.ToString();
        }

        IndexSearcher search = new IndexSearcher(index);
        try
        {
            PanGuAnalyzer analyzer = new PanGuAnalyzer(true);
            // Fields to search.
            MultiFieldQueryParser parser = new MultiFieldQueryParser(new string[] { "N_title" }, analyzer);
            Query query = parser.Parse(word);
            Hits hits = search.Search(query);
            Count = hits.Length();

            int first = Math.Max(pageNumber, 0);
            int last = Math.Min(Count, first + pageSize); // exclusive end of the page

            // The formatter wraps matched words in the given HTML; the highlighter
            // needs a PanGu Segment to locate them. Both are reused for every hit.
            PanGu.HighLight.SimpleHTMLFormatter simpleHTMLFormatter = new PanGu.HighLight.SimpleHTMLFormatter("<font color='red'>", "</font>");
            PanGu.HighLight.Highlighter highter = new PanGu.HighLight.Highlighter(simpleHTMLFormatter, new Segment());
            // Maximum number of characters per highlighted fragment.
            highter.FragmentSize = 200;

            for (int i = first; i < last; i++)
            {
                Document doc = hits.Doc(i);
                string title = highter.GetBestFragment(keyWord, doc.Get("N_title"));

                builder.Append("<div class='sea_h_list'>");
                if (!string.IsNullOrEmpty(title))
                {
                    builder.Append("<h2><a href=\"D_NewBody.aspx?gxjy=" + UrlEncrpt.Encrypt(doc.Get("ID")) + "\" target='_blank'>" + title + "</a></h2>");
                }
                else
                {
                    builder.Append("<h2>" + doc.Get("N_title") + "</h2>");
                }
                builder.Append(doc.Get("N_Content"));
                DateTime time = Convert.ToDateTime(doc.Get("N_datetime"));
                builder.Append("<div>" + time.ToString("yyyy-MM-dd") + "</div>");
                builder.Append("</div>");
            }
        }
        finally
        {
            // Close the searcher even when parsing or rendering fails.
            search.Close();
        }

        Time = timer.Elapsed.TotalSeconds;
        return builder.ToString();
    }
}
前台页面可以放置一个 Button 控件，在它的 Click 事件中调用 Index() 方法来生成索引。

前台生成索引事件如下。
[c-sharp:nogutter] view plain copy
/// <summary>
/// Click handler that rebuilds the full-text index and then shows a confirmation alert.
/// </summary>
/// <param name="sender">The button that raised the event.</param>
/// <param name="e">Event data (unused).</param>
protected void Button1_Click(object sender, EventArgs e)
{
    // DataSearch is the indexing/search class listed in this article.
    // NOTE(review): the original listing instantiated "NewsSearch", which the article
    // never defines; adjust if the page's "search" field really is of another type.
    search = new DataSearch();
    search.Index();
    // Only reached when Index() completed without throwing.
    Response.Write("<Script>alert('索引生成成功！')</Script>");
}
索引生成成功后，就可以进行搜索了，本人实现搜索的功能时调用了AspNetPager分页控件，具体的搜索的方法如下：
[c-sharp:nogutter] view plain copy
/// <summary>
/// Runs a search for the current pager page and stores the rendered hits, hit count,
/// elapsed time and segmented keywords in the page fields used by the markup.
/// </summary>
/// <param name="keyWord">Search text entered by the user.</param>
public void GetInfoList(string keyWord)
{
    const string noResultsHtml = "<div class='rig01_center red'>暂无搜索信息。</div>";
    try
    {
        // An empty keyword is a normal "nothing found" case, not an error page.
        if (string.IsNullOrEmpty(keyWord) || keyWord.Trim().Length == 0)
        {
            InfoList = noResultsHtml;
            this.AspNetPager1.RecordCount = 0;
            count = "0";
            time = "0";
            strkeyWord = keyWord;
            return;
        }

        // DataSearch is the indexing/search class listed in this article.
        // NOTE(review): the original listing instantiated "NewsSearch", which the article
        // never defines; adjust if the page's "search" field really is of another type.
        search = new DataSearch();
        Stopwatch sw = new Stopwatch();
        sw.Start();
        // The pager is 1-based; Search expects the zero-based index of the first hit.
        int firstHit = (this.AspNetPager1.CurrentPageIndex - 1) * this.AspNetPager1.PageSize;
        InfoList = search.Search(keyWord, firstHit, this.AspNetPager1.PageSize);
        sw.Stop();
        time = (sw.ElapsedMilliseconds / 1000.000).ToString(); // elapsed search time in seconds
        if (InfoList.Trim().Length == 0)
        {
            InfoList = noResultsHtml;
        }
        this.AspNetPager1.RecordCount = search.Count;
        count = search.Count.ToString(); // number of hits
        strkeyWord = search.KeyWord; // segmented keywords shown to the user
    }
    catch (Exception)
    {
        // Any failure (missing index, unparsable query, ...) sends the user to the error page.
        Response.Redirect("404.html");
    }
}
其中 Stopwatch 用于计算搜索所用的时间，并将结果赋值给全局变量 time；全局变量 count 是用来存储搜索到的条数的。

搜索出来数据后，前台页面用<%=count%><%=time%> <%=InfoList%>

调用即可。

前台显示效果：

本文只是简单的实现了lucene.net 和盘古分词,仅供大家学习，有不足的地方还请见谅，有疑问随时联系，谢谢！
相关阅读:
linux基础——文件的压缩解压缩以及vim编辑
 linux基础——关于chmod用户权限和文件的相关操作
 linux基础的基础命令操作
 操作系统和网络基础之网络协议
 计算机基础
 【python小记】访问mysql数据库
 Qt去掉treeview项的焦点虚线
 嵌入式qt显示中文和隐藏鼠标
 【vim小记】自动保存配置
 重回ubutntu12.04小记（装完ubuntu做的几件事）
原文地址：https://www.cnblogs.com/wpcnblog/p/2394061.html

(转)lucene.net和（pangu）盘古分词 搜索引擎的简单实现

(转)lucene.net和（pangu）盘古分词搜索引擎的简单实现