大多数情况下,我们的搜索一般用的是sql的模糊搜索,但是这个模糊搜索,总是不够精确,而且总达不到我们的要求,于是乎,偶专门上网找了一些资料,研究了一下,现在比较流行的Lucene.net,感觉还蛮不错的,搜索效果也蛮好的,再配合盘古分词,感觉超酷.
嗯,我就动手尝试了一下类似百度的多模块搜索,感觉蛮好玩的.
网上一般都只做了一个搜索,借鉴他们的代码,我在这里专门设计怎么做多个模块搜索,做完可以考虑,怎么把这些内容整合搜索!
网上介绍lucene.net也蛮多了,这里就说一下,我做lucene.net的心得,lucene.net一般做出一个搜索比较容易,但是涉及到多个索引比较麻烦,这里我就专门看了一些资料写了一个多模块的搜索,正在看怎么把多种索引组合起来,能够综合性搜索,主要实现了2个模块,新闻和工艺知识,
进入正题:
做站内搜索主要涉及以下几个技术:
多线程技术,
Lucene.net,
盘古分词.
Lucene.net实际上就是把数据建立一个索引库保存起来,然后以后就像翻书一样搜索.
盘古分词就是专门为了把一段话分词,比如李明去吃饭.盘古分词就会拆分出关键词,李明 吃饭,这样,就可以到索引库去查找这两个词.
多线程为了保证让系统自动去索引我们写入的文章或者工艺知识等内容,每次做增删改系统就自动去更新索引库.
这里主要涉及以下几个dll,
其中还包括盘古分词的高亮显示和盘古分词的配置文件.
主要代码:
IndexJobItem类,
这个类定义了Lucene.net索引任务中单个子任务的信息.
1: using System;
2: using System.Data;
3: using System.Configuration;
4: using System.Linq;
5: using System.Web;
6: using System.Web.Security;
7: using System.Web.UI;
8: using System.Web.UI.HtmlControls;
9: using System.Web.UI.WebControls;
10: using System.Web.UI.WebControls.WebParts;
11: using System.Xml.Linq;
12:
/// <summary>
/// Describes one pending index operation: add (re-index) or delete the
/// document identified by <see cref="Id"/>.
/// </summary>
public class IndexJobItem
{
    /// <summary>
    /// Kind of index operation a job performs.
    /// </summary>
    public enum JobType
    {
        Delete, Add
    }

    /// <summary>
    /// Whether the document is to be removed from or added to the index.
    /// </summary>
    public JobType ItemType { get; set; }

    /// <summary>
    /// Secondary identifier that takes part in equality and in <see cref="ToString"/>.
    /// NOTE(review): nothing in this file assigns it, so it is normally 0 — confirm whether it is still needed.
    /// </summary>
    public long ThreadId { get; set; }

    /// <summary>
    /// Primary key of the record to index; written to the index as the "number" field.
    /// </summary>
    public int Id { get; set; }

    /// <summary>
    /// Two jobs are equal when they perform the same operation on the same record.
    /// <see cref="Id"/> is part of the comparison so that jobs for different
    /// records are never mistaken for each other (e.g. by <c>List.Remove</c>).
    /// </summary>
    public override bool Equals(object obj)
    {
        IndexJobItem item = obj as IndexJobItem;
        if (item == null)
        {
            return false;
        }
        return this.ItemType == item.ItemType
            && this.ThreadId == item.ThreadId
            && this.Id == item.Id;
    }

    /// <summary>
    /// Hash code built from the same members as <see cref="Equals"/>, so equal
    /// jobs always produce equal hash codes.
    /// </summary>
    public override int GetHashCode()
    {
        unchecked
        {
            int hash = 17;
            hash = hash * 31 + ItemType.GetHashCode();
            hash = hash * 31 + ThreadId.GetHashCode();
            hash = hash * 31 + Id;
            return hash;
        }
    }

    /// <summary>
    /// Returns "<c>ItemType:ThreadId</c>", e.g. "Add:0".
    /// </summary>
    public override string ToString()
    {
        return ItemType + ":" + ThreadId;
    }

    /// <summary>
    /// Creates an empty job; callers set the properties afterwards.
    /// </summary>
    public IndexJobItem()
    {
    }
}
IndexManager类
这个类用一个专门的线程来执行索引操作
1: using System;
2: using System.Data;
3: using System.Configuration;
4: using System.Linq;
5: using System.Web;
6: using System.Web.Security;
7: using System.Web.UI;
8: using System.Web.UI.HtmlControls;
9: using System.Web.UI.WebControls;
10: using System.Web.UI.WebControls.WebParts;
11: using System.Xml.Linq;
12: using log4net;
13: using System.Web.Hosting;
14: using Lucene.Net.Store;
15: using Lucene.Net.Index;
16: using System.IO;
17: using Lucene.Net.Analysis.PanGu;
18: using System.Net;
19: using czcraft.BLL;
20: using mshtml;
21: using czcraft;
22: using System.Text;
23: using Lucene.Net.Documents;
24: using System.Text.RegularExpressions;
25: using Quartz.Collection;
26: using System.Collections.Generic;
27: using System.Threading;
28:
/// <summary>
/// Singleton that funnels every write to the Lucene.Net index directories through
/// one background thread. Callers queue work with <c>AddJob</c> / <c>RemoveJob</c>;
/// the thread started by <c>Start</c> periodically drains the queue and updates
/// the index. Obtain the instance through <c>Instance</c> or <c>GetInstance</c>.
/// </summary>
public class IndexManager
{
    // Single shared instance; the constructor is private so no other instance can exist.
    public readonly static IndexManager Instance = new IndexManager();

    /// <summary>
    /// Set by Stop() to make the scan loop exit. Volatile because it is written
    /// by the caller's thread and read by the scan thread.
    /// </summary>
    private volatile bool IsStopped;

    /// <summary>
    /// Content categories; each one is indexed into its own sub-directory.
    /// </summary>
    public enum JobSearchType
    {
        Product, News, Knowledge
    }

    /// <summary>
    /// Category applied to jobs queued from now on. The value is captured when a
    /// job is queued, so changing it later does not affect jobs already waiting.
    /// </summary>
    public JobSearchType jobSearchType { get; set; }

    /// <summary>
    /// A queued job together with the category that was selected when it was queued.
    /// </summary>
    private class PendingJob
    {
        public readonly IndexJobItem Item;
        public readonly JobSearchType SearchType;

        public PendingJob(IndexJobItem item, JobSearchType searchType)
        {
            Item = item;
            SearchType = searchType;
        }
    }

    // Guards jobs: the list is filled by request threads and drained by the scan thread.
    private readonly object jobsLock = new object();

    /// <summary>
    /// Jobs waiting to be indexed.
    /// </summary>
    private readonly List<PendingJob> jobs = new List<PendingJob>();

    private static ILog log = LogManager.GetLogger(typeof(IndexManager));

    /// <summary>
    /// Private so that all index writes go through the single instance.
    /// </summary>
    private IndexManager()
    {
    }

    /// <summary>
    /// Starts the background scan thread.
    /// </summary>
    public void Start()
    {
        IsStopped = false;
        Thread thread = new Thread(ScanThread);
        // Background thread: it must not keep the process alive on shutdown.
        thread.IsBackground = true;
        thread.Start();
    }

    /// <summary>
    /// Asks the scan thread to exit after its current iteration.
    /// </summary>
    public void Stop()
    {
        IsStopped = true;
    }

    /// <summary>
    /// Scan loop: waits for jobs to accumulate, then indexes them one category at a time.
    /// </summary>
    private void ScanThread()
    {
        while (!IsStopped)
        {
            // Wait 5 seconds so that several jobs can be handled in one writer session.
            Thread.Sleep(5000);

            List<PendingJob> batch = TakePendingJobs();
            if (batch.Count <= 0)
            {
                log.Debug("没有任务,继续线程等待");
                Thread.Sleep(10 * 1000);
                continue;
            }

            // Group by category (keeping queue order inside each group) because
            // every category has its own index directory.
            Dictionary<JobSearchType, List<PendingJob>> byType = new Dictionary<JobSearchType, List<PendingJob>>();
            foreach (PendingJob pending in batch)
            {
                List<PendingJob> group;
                if (!byType.TryGetValue(pending.SearchType, out group))
                {
                    group = new List<PendingJob>();
                    byType.Add(pending.SearchType, group);
                }
                group.Add(pending);
            }

            foreach (KeyValuePair<JobSearchType, List<PendingJob>> entry in byType)
            {
                try
                {
                    IndexBatch(entry.Key, entry.Value);
                }
                catch (Exception ex)
                {
                    // An exception escaping this thread would end all future indexing.
                    log.Error("索引" + entry.Key + "失败", ex);
                }
            }
            log.Debug("全部索引完毕");
        }
    }

    /// <summary>
    /// Atomically removes and returns every queued job.
    /// </summary>
    private List<PendingJob> TakePendingJobs()
    {
        lock (jobsLock)
        {
            List<PendingJob> batch = new List<PendingJob>(jobs);
            jobs.Clear();
            return batch;
        }
    }

    /// <summary>
    /// Opens the index directory of one category, applies the jobs and closes it again.
    /// The writer is opened and closed per batch because closing it is what commits the changes.
    /// </summary>
    /// <param name="searchType">Category whose index is updated.</param>
    /// <param name="batch">Jobs of that category, in queue order.</param>
    private void IndexBatch(JobSearchType searchType, List<PendingJob> batch)
    {
        string indexPath = System.IO.Path.Combine(HostingEnvironment.ApplicationPhysicalPath, ConfigurationManager.AppSettings["path"] + @"\" + searchType.ToString());
        FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NativeFSLockFactory());
        try
        {
            bool isUpdate = IndexReader.IndexExists(directory);
            log.Debug("索引库是否存在:" + isUpdate);
            // A lock left behind by an abnormal exit must be released before writing.
            if (isUpdate && IndexWriter.IsLocked(directory))
            {
                log.Debug("开始解锁索引库");
                IndexWriter.Unlock(directory);
                log.Debug("解锁库完成");
            }

            IndexWriter writer = new IndexWriter(directory, new PanGuAnalyzer(), !isUpdate, Lucene.Net.Index.IndexWriter.MaxFieldLength.UNLIMITED);
            try
            {
                ProcessJob(writer, searchType, batch);
            }
            finally
            {
                // Always close, otherwise the write lock stays on the directory.
                writer.Close();
            }
        }
        finally
        {
            directory.Close();
        }
    }

    /// <summary>
    /// Applies each job to the open writer. A failing job is logged and skipped so
    /// that it cannot block the rest of the batch.
    /// </summary>
    /// <param name="writer">Open writer for the category's index.</param>
    /// <param name="searchType">Category the jobs belong to.</param>
    /// <param name="batch">Jobs to apply.</param>
    private void ProcessJob(IndexWriter writer, JobSearchType searchType, List<PendingJob> batch)
    {
        foreach (PendingJob pending in batch)
        {
            IndexJobItem job = pending.Item;
            try
            {
                // Delete any existing entry first so a re-index never creates duplicates;
                // for a Delete job this is the whole operation.
                writer.DeleteDocuments(new Term("number", job.Id.ToString()));
                if (job.ItemType != IndexJobItem.JobType.Add)
                {
                    continue;
                }

                Document document;
                string typeName;
                switch (searchType)
                {
                    case JobSearchType.Knowledge:
                        document = AddDocumentBycraftknowledge(job);
                        typeName = "工艺知识";
                        break;
                    case JobSearchType.News:
                        document = AddDocumentByNews(job);
                        typeName = "新闻";
                        break;
                    case JobSearchType.Product:
                        document = AddDocumentByProduct(job);
                        typeName = "商品";
                        break;
                    default:
                        log.Debug("未设置JobSearchType属性,无法索引");
                        continue;
                }

                if (document == null)
                {
                    // Record no longer exists, or the category has no indexer yet.
                    log.Debug(typeName + ":" + job.Id + "无可索引内容,已跳过");
                    continue;
                }

                writer.AddDocument(document);
                log.Debug("索引" + typeName + ":" + job.Id + "完成!");
            }
            catch (Exception ex)
            {
                log.Error("索引任务" + job.Id + "失败", ex);
            }
        }
    }

    /// <summary>
    /// Builds the index document for a product. Not implemented yet.
    /// </summary>
    /// <param name="job">Job identifying the product.</param>
    /// <returns>Always null.</returns>
    public Document AddDocumentByProduct(IndexJobItem job)
    {
        return null;
    }

    /// <summary>
    /// Builds the index document for a news article.
    /// </summary>
    /// <param name="job">Job identifying the article.</param>
    /// <returns>The document, or null when the article no longer exists.</returns>
    public Document AddDocumentByNews(IndexJobItem job)
    {
        var news = new newsBLL().Get(job.Id);
        // The record may have been deleted between queuing and indexing.
        if (news == null)
        {
            return null;
        }
        return BuildArticleDocument(job.Id, news.Title, news.Content, news.ArticleHtmlUrl);
    }

    /// <summary>
    /// Builds the index document for a craft-knowledge article.
    /// </summary>
    /// <param name="job">Job identifying the article.</param>
    /// <returns>The document, or null when the article no longer exists.</returns>
    public Document AddDocumentBycraftknowledge(IndexJobItem job)
    {
        var craftknowledge = new craftknowledgeBLL().Get(job.Id);
        // The record may have been deleted between queuing and indexing.
        if (craftknowledge == null)
        {
            return null;
        }
        return BuildArticleDocument(job.Id, craftknowledge.Title, craftknowledge.Content, craftknowledge.ArticleHtmlUrl);
    }

    /// <summary>
    /// Creates the Lucene document shared by all article-like categories.
    /// </summary>
    private static Document BuildArticleDocument(int id, string title, string htmlContent, string articleHtmlUrl)
    {
        // Strip markup so only the text is analysed.
        string body = Common.Tools.HtmlToTxt(htmlContent);
        Document document = new Document();
        // Stored verbatim, not analysed: used as keys / links.
        document.Add(new Field("number", id.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
        document.Add(new Field("ArticleHtmlUrl", articleHtmlUrl, Field.Store.YES, Field.Index.NOT_ANALYZED));
        // Analysed fields; body keeps positions and offsets in its term vector.
        document.Add(new Field("title", title, Field.Store.YES, Field.Index.ANALYZED));
        document.Add(new Field("body", body, Field.Store.YES, Field.Index.ANALYZED, Lucene.Net.Documents.Field.TermVector.WITH_POSITIONS_OFFSETS));
        return document;
    }

    /// <summary>
    /// Queues a record for (re-)indexing under the currently selected category.
    /// </summary>
    /// <param name="Id">Primary key of the record.</param>
    public void AddJob(int Id)
    {
        Enqueue(Id, IndexJobItem.JobType.Add);
        log.Debug(Id + "加入到任务列表中");
    }

    /// <summary>
    /// Queues a record for removal from the index of the currently selected category.
    /// </summary>
    /// <param name="Id">Primary key of the record.</param>
    public void RemoveJob(int Id)
    {
        Enqueue(Id, IndexJobItem.JobType.Delete);
        log.Debug(Id + "加入删除任务列表");
    }

    /// <summary>
    /// Adds a job to the queue, remembering the category selected right now.
    /// </summary>
    private void Enqueue(int id, IndexJobItem.JobType type)
    {
        IndexJobItem job = new IndexJobItem();
        job.Id = id;
        job.ItemType = type;
        lock (jobsLock)
        {
            jobs.Add(new PendingJob(job, jobSearchType));
        }
    }

    /// <summary>
    /// Selects the category for subsequently queued jobs and returns the singleton.
    /// </summary>
    /// <param name="jobType">Category to select.</param>
    /// <returns>The shared instance.</returns>
    public static IndexManager GetInstance(JobSearchType jobType)
    {
        Instance.jobSearchType = jobType;
        return Instance;
    }
}
接下来在全局配置文件中开启线程
在web.config中配置索引目录
在这里一个很重要的问题就是,我的商品,工艺知识,和新闻是根据一个枚举来判断到底是给哪个进行索引,
索引的目录也是动态的,根据枚举判断的
索引的目录
搜索BLL
1: using System;
2: using System.Collections.Generic;
3: using System.Linq;
4: using System.Web;
5: using Lucene.Net.Store;
6: using System.IO;
7: using Lucene.Net.Index;
8: using Lucene.Net.Analysis.PanGu;
9: using System.Net;
10: using Lucene.Net.Documents;
11: using log4net;
12: using Lucene.Net.Search;
13: using System.Text;
14: using mshtml;
15: using PanGu;
16: using System.Xml.Linq;
17: using System.Text.RegularExpressions;
18: using czcraft.BLL;
19: using czcraft.Model;
20: using System.Collections;
21: using System.Web.Hosting;
22: using System.Configuration;
23:
namespace czcraft.BLL
{
    /// <summary>
    /// Full-text search over the per-category Lucene.Net indexes.
    /// </summary>
    public partial class SearchBLL
    {
        private ILog logger = LogManager.GetLogger(typeof(SearchBLL));

        /// <summary>
        /// Searches the "body" field of one category's index for the given keywords.
        /// </summary>
        /// <param name="kw">Keywords entered by the user.</param>
        /// <param name="startIndex">Zero-based offset of the first hit to return (not a page number).</param>
        /// <param name="pageSize">Maximum number of hits to return.</param>
        /// <param name="totalCount">Receives the total number of hits.</param>
        /// <param name="Type">Category to search; its name selects the index sub-directory.</param>
        /// <returns>The requested page of results; empty when there is no keyword or no index yet.</returns>
        public IEnumerable<SearchResult> Search(string kw, int startIndex, int pageSize, out int totalCount, SearchSum.searchType Type)
        {
            totalCount = 0;
            List<SearchResult> listResult = new List<SearchResult>();
            if (string.IsNullOrEmpty(kw))
            {
                return listResult;
            }

            string indexPath = System.IO.Path.Combine(HostingEnvironment.ApplicationPhysicalPath, ConfigurationManager.AppSettings["path"] + @"\" + Type.ToString());
            // Read-only access, so no lock factory is needed.
            FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NoLockFactory());
            IndexReader reader = null;
            IndexSearcher searcher = null;
            try
            {
                // Nothing has been indexed for this category yet.
                if (!IndexReader.IndexExists(directory))
                {
                    return listResult;
                }

                reader = IndexReader.Open(directory, true);
                searcher = new IndexSearcher(reader);

                // One phrase term per word produced by the word splitter.
                PhraseQuery query = new PhraseQuery();
                foreach (string word in CommonHelper.SplitWord(kw))
                {
                    query.Add(new Term("body", word));
                }
                // Allow the words to be up to 50 positions apart.
                query.SetSlop(50);

                // At most the 1000 best hits are collected.
                TopScoreDocCollector collector = TopScoreDocCollector.create(1000, true);
                searcher.Search(query, null, collector);
                totalCount = collector.GetTotalHits();

                ScoreDoc[] docs = collector.TopDocs(startIndex, pageSize).scoreDocs;
                for (int i = 0; i < docs.Length; i++)
                {
                    // Hits only carry Lucene's internal document id; load the stored fields on demand.
                    Document doc = searcher.Doc(docs[i].doc);
                    SearchResult result = new SearchResult();
                    result.Number = doc.Get("number");
                    result.Title = doc.Get("title");
                    result.BodyPreview = Preview(doc.Get("body"), kw);
                    result.ArticleHtmlUrl = doc.Get("ArticleHtmlUrl");
                    listResult.Add(result);
                }
                return listResult;
            }
            finally
            {
                // Release file handles even when the search throws.
                if (searcher != null)
                {
                    searcher.Close();
                }
                if (reader != null)
                {
                    reader.Close();
                }
                directory.Close();
            }
        }

        /// <summary>
        /// Produces a short excerpt of the body with the keywords wrapped in red font tags.
        /// </summary>
        /// <param name="body">Plain-text article body.</param>
        /// <param name="keyword">Keywords to highlight.</param>
        /// <returns>The best-matching fragment as returned by the PanGu highlighter.</returns>
        private static string Preview(string body, string keyword)
        {
            // Formatter supplies the prefix and suffix placed around each highlighted word.
            PanGu.HighLight.SimpleHTMLFormatter simpleHTMLFormatter =
                new PanGu.HighLight.SimpleHTMLFormatter("<font color=\"red\">", "</font>");
            // Highlighter needs the formatter and a PanGu Segment to split words.
            PanGu.HighLight.Highlighter highlighter =
                new PanGu.HighLight.Highlighter(simpleHTMLFormatter, new Segment());
            // Number of characters per excerpt fragment.
            highlighter.FragmentSize = 100;
            return highlighter.GetBestFragment(keyword, body);
        }
    }
}
搜索也根据搜索类别枚举动态判断搜索类别!
前端页面设计:
<%@ Page Language="C#" MasterPageFile="~/Top_Down.master" AutoEventWireup="true"
    CodeFile="SearchKnowledge.aspx.cs" Inherits="Search_SearchKnowledge" Title="找找看-工艺知识" %>

<asp:Content ID="Content1" ContentPlaceHolderID="head" runat="Server">
    <link href="../css/baidu.css" rel="stylesheet" type="text/css" />
    <link href="../css/other.css" rel="stylesheet" type="text/css" />
    <link href="../css/ui-lightness/jquery-ui-1.8.2.custom.css" rel="stylesheet" type="text/css" />
    <link href="../css/Pager.css" rel="stylesheet" type="text/css" />
    <link href="../css/Search.css" rel="stylesheet" type="text/css" />
    <script src="../Admin/scripts/jquery-1.7.1.min.js" type="text/javascript"></script>

    <script src="../js/jquery-ui-1.8.2.custom.min.js" type="text/javascript"></script>

    <script type="text/javascript">

        // Keyword suggestions: picking one fills the box and submits the search form.
        $(function () {
            $("#kw").autocomplete(
            { source: "Data/SearchSuggestion.ashx",
                select: function (event, ui) { $("#kw").val(ui.item.value); $("#form1").submit(); }
            });
        });
    </script>

</asp:Content>
<asp:Content ID="Content2" ContentPlaceHolderID="ContentPlaceHolder1" runat="Server">
    <div class="content">
        <div class="left_side">
            <div class="logo_bottom">
            </div>
        </div>
        <div class="gjss_load">
            <h4>
                找找看</h4>
            <span>当前位置:<a href="#">首页</a> &gt; <a href="#">找找看</a></span>
        </div>
        <div class="gjss">
            <div class="gjss_top">
            </div>
            <div class="gjss_c">
                <table width="804">
                    <tr>
                        <td colspan="7" align="center">
                            <label id="lbNews" style="margin-left:260px" class="tab"><a href="SearchNews.aspx">新闻</a></label> <label id="lbKnowledge" style="margin-left:50px" class="tab"><a href="SearchKnowledge.aspx">工艺知识</a></label><label id="lbProduct" style="margin-left:50px" class="tab"><a href="#">商品</a></label>

                        </td>
                    </tr>
                    <tr>
                        <td class="style1">
                            <div id="m" align="center">
                                <div id="fm">
                                    <form name="form1">
                                    <span class="s_ipt_wr" style="float: left">
                                        <%-- The keyword comes straight from the request: attribute-encode it and use a
                                             double-quoted attribute so it cannot break out of the value. --%>
                                        <input id="kw" class="s_ipt" name="kw" maxlength="100" value="<%= HttpUtility.HtmlAttributeEncode(Request["kw"]) %>" />
                                    </span><span class="s_btn_wr">
                                        <input id="su" class="s_btn" onmouseout="this.className='s_btn'" onmousedown="this.className='s_btn s_btn_h'"
                                            value="找找看" type="submit" /></span></form>
                                </div>
                            </div>
                        </td>
                    </tr>
                    <tr>
                        <td colspan="7" align="center" class="style1">
                            <div style="text-align: center">
                                <ul id="hotwordsUL" class="hotWords">
                                    <%-- Hot words are keywords previously typed by users: URL-encode them in the
                                         link and HTML-encode them in the link text. --%>
                                    <asp:Repeater ID="repeaterHotWords" runat="server">
                                        <ItemTemplate>
                                            <li><a href='SearchKnowledge.aspx?kw=<%# Server.UrlEncode(Convert.ToString(Eval("KeyWord"))) %>'>
                                                <%# Server.HtmlEncode(Convert.ToString(Eval("KeyWord"))) %>
                                            </a></li>
                                        </ItemTemplate>
                                    </asp:Repeater>
                                </ul>
                            </div>
                        </td>
                    </tr>
                    <tr>
                        <td colspan="7" align="center">
                            <br />
                            <ul id="ulResult" class="hotWords">
                                <asp:Repeater EnableViewState="false" ID="repeaterResult" runat="server">
                                    <ItemTemplate>
                                        <li><span><%--<a href='../CraftKnowledge/ViewCraftKnowledge.aspx?KnowledgeId=<%#Eval("Number") %>'>--%>
                                            <a href='<%#Eval("ArticleHtmlUrl") %>'>
                                                <%#Eval("Title") %></a></span>
                                            <br />
                                            <%-- BodyPreview intentionally contains the highlighter's font tags, so it is not encoded here. --%>
                                            <span> <%#Eval("BodyPreview")%></span>
                                        </li>
                                    </ItemTemplate>
                                </asp:Repeater>
                            </ul>
                            <br />
                            <div class="pager">
                                <%=PageHtml%>
                            </div>

                        </td>
                    </tr>
                </table>
            </div>
        </div>
    </div>
</asp:Content>
后台代码:
1: using System;
2: using System.Collections;
3: using System.Configuration;
4: using System.Data;
5: using System.Linq;
6: using System.Web;
7: using System.Web.Security;
8: using System.Web.UI;
9: using System.Web.UI.HtmlControls;
10: using System.Web.UI.WebControls;
11: using System.Web.UI.WebControls.WebParts;
12: using System.Xml.Linq;
13: using czcraft;
14: using czcraft.BLL;
15: using Common;
16: using czcraft.Model;
17: using System.Collections.Generic;
18:
/// <summary>
/// Code-behind of the craft-knowledge search page: shows hot keywords, records the
/// search and binds one page of results.
/// </summary>
public partial class Search_SearchKnowledge : System.Web.UI.Page
{
    // Number of results per page; used for both the pager and the search call.
    private const int ResultsPerPage = 10;

    /// <summary>
    /// Rendered HTML of the pager, written into the page by the markup.
    /// </summary>
    public string PageHtml { get; private set; }

    /// <summary>
    /// Loads the hot keywords and, when a valid keyword was submitted, logs it and runs the search.
    /// </summary>
    protected void Page_Load(object sender, EventArgs e)
    {
        // Hot keywords are shown whether or not a search was submitted.
        repeaterHotWords.DataSource = new SearchInfoBLL().GetHotWords(SearchSum.searchType.Knowledge);
        repeaterHotWords.DataBind();

        // No keyword means this is the first visit; an invalid keyword is ignored as well.
        string kw = Request["kw"];
        if (!Tools.IsValidInput(ref kw, true) || string.IsNullOrEmpty(kw))
        {
            return;
        }

        // Record the search.
        // NOTE(review): this also runs when the user pages through results, so one search
        // may be logged several times — confirm that is intended for the hot-word ranking.
        SearchInfo kwLog = new SearchInfo();
        kwLog.KeyWord = kw;
        kwLog.DateTime = DateTime.Now;
        kwLog.Ip = Request.UserHostAddress;
        // Store the enum's numeric value; an explicit cast states that intent, GetHashCode() did not.
        kwLog.SearchType = ((int)SearchSum.searchType.Knowledge).ToString();
        new SearchInfoBLL().AddNew(kwLog);

        var pager = new Common.RupengPager();
        pager.UrlFormat = "SearchKnowledge.aspx?pagenum={n}&kw=" + Server.UrlEncode(kw);
        pager.PageSize = ResultsPerPage;
        // Reads the requested page number from the query string.
        pager.TryParseCurrentPageIndex(Request["pagenum"]);
        int startRowIndex = (pager.CurrentPageIndex - 1) * pager.PageSize;

        int totalCount;
        IEnumerable<SearchResult> result = new SearchBLL().Search(kw, startRowIndex, pager.PageSize, out totalCount, SearchSum.searchType.Knowledge);
        pager.TotalCount = totalCount;
        PageHtml = pager.Render();

        repeaterResult.DataSource = result;
        repeaterResult.DataBind();
    }
}
我们还可以再做做当前热点,这里就不详细粘贴代码了
效果图:
这里复制了好多重复数据,不是程序问题………………
哈哈,一个站内搜索给网站增添了不少亮点,
这个搜索框当然是copy的百度的,哈哈