本文转载自:http://blog.csdn.net/dongdongleng/article/details/6058416
这篇文章介绍 Lucene.Net 与盘古分词的简单调用实现。建立索引、搜索以及盘古分词的基类如下:
- using System;
- using System.Data;
- using System.Configuration;
- using System.Linq;
- using System.Web;
- using System.Web.Security;
- using System.Web.UI;
- using System.Web.UI.HtmlControls;
- using System.Web.UI.WebControls;
- using System.Web.UI.WebControls.WebParts;
- using System.Xml.Linq;
- using Lucene.Net.Index;
- using Lucene.Net.Documents;
- using Lucene.Net.QueryParsers;
- using Lucene.Net.Analysis;
- using Lucene.Net.Search;
- using Lucene.Net.Analysis.Standard;
- using PanGu;
- using Lucene.Net.Analysis.PanGu;
- using StringBulider;
- using System.Text;
- using System.Collections;
- using System.Collections.Generic;
/// <summary>
/// Search logic for tutoring/news material: builds a Lucene.Net index from the
/// N_Newinfo table and queries it, using the PanGu analyzer for word segmentation.
/// </summary>
public class DataSearch
{
    /// <summary>Maximum number of characters of article content stored per document.</summary>
    private const int MaxContentLength = 200;

    public DataSearch()
    {
    }

    A_DB_Conn db = null;
    string index = System.Web.HttpContext.Current.Server.MapPath("~/Gongxin/DataIndex"); // index directory
    string wordPath = System.Web.HttpContext.Current.Server.MapPath("~/Dictionaries");   // dictionary directory

    /// <summary>Total number of hits found by the most recent <see cref="Search"/> call.</summary>
    public int Count { get; set; }

    /// <summary>Comma-separated list of the words the most recent keyword was segmented into.</summary>
    public string KeyWord { get; set; }

    /// <summary>Duration of the most recent <see cref="Search"/> call, in seconds.</summary>
    public double Time { get; set; }

    /// <summary>
    /// Loads the rows to be indexed from the database.
    /// </summary>
    /// <returns>The first table of the query result, or <c>null</c> when the query fails.</returns>
    public DataTable GetDS()
    {
        const string sql = "select N_id,N_title,N_datetime,N_Content from N_Newinfo where N_ifkping=1 and N_id=12 order by N_datetime desc";

        db = new A_DB_Conn();
        try
        {
            return db.GetDataSet(sql).Tables[0];
        }
        catch (Exception ex)
        {
            // Callers treat null as "no data available"; keep that contract but
            // leave a trace so the failure is not completely invisible.
            System.Diagnostics.Trace.TraceError("DataSearch.GetDS failed: " + ex);
            return null;
        }
        finally
        {
            db.close();
            db.Dispose();
        }
    }

    /// <summary>
    /// Adds one Lucene document per row of <paramref name="dataTable"/> to the index.
    /// </summary>
    /// <param name="writer">Open index writer that receives the documents.</param>
    /// <param name="dataTable">Rows with the columns N_id, N_title, N_datetime and N_Content.</param>
    /// <exception cref="ArgumentNullException">Either argument is <c>null</c>.</exception>
    public void AddDocument(IndexWriter writer, DataTable dataTable)
    {
        if (writer == null)
        {
            throw new ArgumentNullException("writer");
        }
        if (dataTable == null)
        {
            throw new ArgumentNullException("dataTable");
        }

        foreach (DataRow row in dataTable.Rows)
        {
            string title = StringCheck.SqlQueryDecode(row["N_title"].ToString());

            // Strip paragraph tags first, then truncate based on the stripped length.
            // Checking the length before stripping (as the code used to) made
            // Substring throw whenever the stripped text was shorter than the cut-off.
            // NOTE(review): truncation can still cut through other HTML markup.
            string content = StringCheck.SqlQueryDecode(row["N_Content"].ToString())
                .Replace("<p>", "")
                .Replace("</p>", "");
            if (content.Length > MaxContentLength)
            {
                content = content.Substring(0, MaxContentLength);
            }

            Document doc = new Document();
            doc.Add(new Field("ID", row["N_id"].ToString(), Field.Store.YES, Field.Index.TOKENIZED));
            doc.Add(new Field("N_title", title, Field.Store.YES, Field.Index.TOKENIZED));
            doc.Add(new Field("N_datetime", row["N_datetime"].ToString(), Field.Store.YES, Field.Index.TOKENIZED));
            doc.Add(new Field("N_Content", content, Field.Store.YES, Field.Index.TOKENIZED));
            writer.AddDocument(doc);
        }
    }

    /// <summary>
    /// Rebuilds the index from scratch using the rows returned by <see cref="GetDS"/>.
    /// </summary>
    /// <exception cref="InvalidOperationException">The source rows could not be loaded.</exception>
    public void Index()
    {
        // Load the data before opening the writer: the writer is opened with
        // create = true, which discards the existing index.
        DataTable dataTable = GetDS();
        if (dataTable == null)
        {
            throw new InvalidOperationException("Unable to load the records to index; the existing index was left untouched.");
        }

        PanGu.Segment.Init();
        PanGuAnalyzer analyzer = new PanGuAnalyzer();
        IndexWriter writer = new IndexWriter(index, analyzer, true);
        try
        {
            AddDocument(writer, dataTable);
            writer.Optimize();
        }
        finally
        {
            // Always close the writer, even when indexing fails part-way.
            writer.Close();
        }
    }

    /// <summary>
    /// Segments the search text into words and builds a boosted Lucene query string.
    /// Also stores the segmented words, comma separated, in <see cref="KeyWord"/>.
    /// </summary>
    /// <param name="keyWords">Text to search for.</param>
    /// <param name="ktTokenizer">Tokenizer used for the segmentation.</param>
    /// <returns>
    /// Space-separated terms of the form <c>word^boost</c> where boost is 3 raised to the
    /// word's rank; an empty string when there is nothing to search for.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="ktTokenizer"/> is <c>null</c>.</exception>
    public string GetKeyWordSplitBySpace(string keyWords, PanGuTokenizer ktTokenizer)
    {
        if (ktTokenizer == null)
        {
            throw new ArgumentNullException("ktTokenizer");
        }

        // Reset first so repeated searches on one instance do not accumulate words.
        KeyWord = string.Empty;
        if (string.IsNullOrEmpty(keyWords))
        {
            return string.Empty;
        }

        StringBuilder query = new StringBuilder();
        List<string> terms = new List<string>();
        ICollection<WordInfo> words = ktTokenizer.SegmentToWordInfos(keyWords);
        foreach (WordInfo word in words)
        {
            if (word == null || string.IsNullOrEmpty(word.Word))
            {
                continue;
            }

            terms.Add(word.Word);
            // The trailing space separates the terms; without it they fuse into a
            // single malformed token. Escape query-syntax characters because the
            // text comes straight from the user.
            query.AppendFormat("{0}^{1} ", QueryParser.Escape(word.Word), (int)Math.Pow(3, word.Rank));
        }

        KeyWord = string.Join(",", terms.ToArray());
        return query.ToString().Trim();
    }

    /// <summary>
    /// Searches the N_title field and renders one page of hits as HTML.
    /// Updates <see cref="Count"/>, <see cref="KeyWord"/> and <see cref="Time"/>.
    /// </summary>
    /// <param name="keyWord">Search text entered by the user.</param>
    /// <param name="pageNumber">Zero-based index of the first hit to render.</param>
    /// <param name="pageSize">Maximum number of hits to render.</param>
    /// <returns>HTML fragment for the requested hits; empty when nothing matches.</returns>
    public string Search(string keyWord, int pageNumber, int pageSize)
    {
        Count = 0;
        Time = 0;

        string word = GetKeyWordSplitBySpace(keyWord, new PanGuTokenizer());
        if (word.Length == 0)
        {
            // Nothing to parse; an empty query string is not a valid query.
            return string.Empty;
        }

        // Time the query as well as the rendering.
        System.Diagnostics.Stopwatch timer = System.Diagnostics.Stopwatch.StartNew();
        StringBuilder builder = new StringBuilder();
        IndexSearcher search = new IndexSearcher(index);
        try
        {
            PanGuAnalyzer analyzer = new PanGuAnalyzer(true);
            // Fields to search.
            MultiFieldQueryParser parser = new MultiFieldQueryParser(new string[] { "N_title" }, analyzer);
            Query query = parser.Parse(word);
            Hits hits = search.Search(query);
            Count = hits.Length();

            // Clamp the requested window to the available hits.
            int first = Math.Max(pageNumber, 0);
            int last = (int)Math.Min((long)Count, (long)first + Math.Max(pageSize, 0));

            // The formatter wraps matched words in the given HTML; one highlighter
            // serves every hit on the page.
            PanGu.HighLight.SimpleHTMLFormatter formatter = new PanGu.HighLight.SimpleHTMLFormatter("<font color='red'>", "</font>");
            PanGu.HighLight.Highlighter highlighter = new PanGu.HighLight.Highlighter(formatter, new Segment());
            // Number of characters per summary fragment.
            highlighter.FragmentSize = 200;

            for (int i = first; i < last; i++)
            {
                Document doc = hits.Doc(i);
                string storedTitle = doc.Get("N_title");
                string title = highlighter.GetBestFragment(keyWord, storedTitle);

                builder.Append("<div class='sea_h_list'>");
                if (!string.IsNullOrEmpty(title))
                {
                    builder.Append("<h2><a href=\"D_NewBody.aspx?gxjy=")
                           .Append(UrlEncrpt.Encrypt(doc.Get("ID")))
                           .Append("\" target='_blank'>")
                           .Append(title)
                           .Append("</a></h2>");
                }
                else
                {
                    builder.Append("<h2>").Append(storedTitle).Append("</h2>");
                }
                builder.Append(doc.Get("N_Content"));
                builder.Append("<div>").Append(FormatDate(doc.Get("N_datetime"))).Append("</div>");
                builder.Append("</div>");
            }
        }
        finally
        {
            // Release the searcher even when parsing or rendering fails.
            search.Close();
            timer.Stop();
            Time = timer.ElapsedMilliseconds / 1000.000;
        }

        return builder.ToString();
    }

    /// <summary>
    /// Formats a stored date string as yyyy-MM-dd; returns the stored text unchanged
    /// when it cannot be parsed, so one bad record does not fail the whole page.
    /// </summary>
    private static string FormatDate(string stored)
    {
        DateTime parsed;
        if (DateTime.TryParse(stored, out parsed))
        {
            return parsed.ToString("yyyy-MM-dd");
        }
        return stored ?? string.Empty;
    }
}
前台可以放置一个 Button,在它的点击事件中调用 Index() 方法来生成索引。(注意:下面的示例代码实例化的是 NewsSearch,而上文给出的类名是 DataSearch,使用时请保持类名一致。)
前台生成索引的事件处理代码如下:
/// <summary>
/// Click handler that rebuilds the search index and then writes a script
/// to the response that shows a success alert.
/// </summary>
protected void Button1_Click(object sender, EventArgs e)
{
    // Keep the new instance in the page-level member as well as using it locally.
    NewsSearch indexer = new NewsSearch();
    search = indexer;
    indexer.Index();

    // Only reached when Index() returned without throwing.
    const string successScript = "<Script>alert('索引生成成功!')</Script>";
    Response.Write(successScript);
}
索引生成成功后就可以进行搜索了。本人在实现搜索功能时使用了 AspNetPager 分页控件,具体的搜索方法如下:
/// <summary>
/// Runs the search for the pager's current page and stores the rendered HTML,
/// the elapsed time, the hit count and the segmented keyword in the page members
/// read by the markup. Redirects to 404.html when the search fails.
/// </summary>
/// <param name="keyWord">Search text entered by the user.</param>
public void GetInfoList(string keyWord)
{
    try
    {
        search = new NewsSearch();

        int pageSize = this.AspNetPager1.PageSize;
        // The pager is 1-based; Search expects the zero-based index of the first hit.
        int firstHit = (this.AspNetPager1.CurrentPageIndex - 1) * pageSize;

        Stopwatch sw = Stopwatch.StartNew();
        InfoList = search.Search(keyWord, firstHit, pageSize);
        sw.Stop();
        time = (sw.ElapsedMilliseconds / 1000.000).ToString(); // elapsed search time in seconds

        if (InfoList == null || InfoList.Trim().Length == 0)
        {
            InfoList = "<div class='rig01_center red'>暂无搜索信息。</div>";
        }

        this.AspNetPager1.RecordCount = search.Count;
        count = search.Count.ToString(); // number of hits
        strkeyWord = search.KeyWord;     // segmented keyword shown to the user
    }
    catch (Exception ex)
    {
        // Record why the search failed before sending the user to the error page.
        // Fully qualified because "Trace" inside a Page refers to Page.Trace.
        System.Diagnostics.Trace.TraceError("GetInfoList failed for keyword '{0}': {1}", keyWord, ex);
        Response.Redirect("404.html");
    }
}
其中 Stopwatch 用于计算搜索所用的时间,并将结果赋值给全局变量 time;全局变量 count 是用来存储搜索到的条数的。
搜索出数据后,前台页面用 <%=count%>、<%=time%>、<%=InfoList%> 输出即可。
前台显示效果:
本文只是简单地实现了 Lucene.Net 与盘古分词的调用,仅供大家学习;如有不足之处还请见谅,有疑问可以随时联系,谢谢!