• 初探站内搜索(中)


    接上篇

    首先我们配置Global.asax,在Application_Start中添加如下代码:初始化日志,并建立定时索引任务。

    // Application start-up: wire up log4net, initialise the Pan Gu word
    // segmenter, and schedule a Quartz.NET job (Index_Job) that rebuilds
    // the Lucene.Net search index once per hour.
    1 protected void Application_Start(object sender, EventArgs e)
    2 {
    3 log4net.Config.XmlConfigurator.Configure();//initialise log4net (settings live in web.config)
    4   Segment.Init(HttpContext.Current.Server.MapPath("~/PanGu.xml"));//initialise the Pan Gu segmenter from its XML config file
    5  
    6 ISchedulerFactory sf = new StdSchedulerFactory(); //create the scheduler factory
    7 IScheduler sched = sf.GetScheduler(); //the scheduler that will execute jobs
    8 JobDetail job = new JobDetail("job1", "group1", typeof(Index_Job));//Index_Job is the class implementing IJob
    9 DateTime ts = TriggerUtils.GetNextGivenSecondDate(null, 1); //first run starts 1 second from now
    10 TimeSpan interval = TimeSpan.FromHours (1);//repeat once every hour; the interval can be changed here
    11 Trigger trigger = new SimpleTrigger("trigger1", "group1", "job1", "group1", ts, null,
    12 SimpleTrigger.RepeatIndefinitely, interval);//trigger: fire indefinitely at the given interval
    // Register the job, attach the trigger, then start the scheduler.
    13 sched.AddJob(job, true);
    14 sched.ScheduleJob(trigger);
    15 sched.Start(); //once started, the scheduled job begins firing
    16
    17 }

    然后我们进入Index_Job类中写它的定时任务代码:注意要实现IJob接口,并把计划任务的执行代码写到Execute方法里面。

    索引完成后,索引库所需的文件都会被lucene.net放到Index文件夹下。

    /// <summary>
    /// Quartz.NET job that (re)builds the Lucene.Net full-text index by
    /// crawling forum topic pages and indexing their title and body text.
    /// </summary>
    1 public class Index_Job:IJob
    2 {
    3 private static ILog log = LogManager.GetLogger(typeof(Index_Job));
    4
    5 #region IJob Members
    6
    // Entry point invoked by the Quartz scheduler on every trigger firing.
    7 public void Execute(JobExecutionContext context)
    8 {
    9 //string indexPath = HttpContext.Current.Server.MapPath("~/Index");
    10
    // NOTE(review): HttpContext.Current is not available on the Quartz worker
    // thread, hence HostingEnvironment.MapPath instead of the line above.
    11 string indexPath = HostingEnvironment.MapPath("~/Index");
    12 log.Debug("开始创建索引,索引目录:" + indexPath);
    13 FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NativeFSLockFactory());
    // true when an index already exists, so we update it instead of recreating.
    14 bool isUpdate = IndexReader.IndexExists(directory);
    15 log.Debug("索引目录存在状态:" + isUpdate);
    16 if (isUpdate)
    17 {
    // A stale write lock (e.g. left behind by a crash) must be released first.
    18 if (IndexWriter.IsLocked(directory))
    19 {
    20 log.Debug("解锁索引库");
    21 IndexWriter.Unlock(directory);
    22 }
    23 }
    24 log.Debug("开始爬文章");
    // create = !isUpdate: build a fresh index only when none exists yet.
    25 IndexWriter writer = new IndexWriter(directory, new PanGuAnalyzer(), !isUpdate, Lucene.Net.Index.IndexWriter.MaxFieldLength.UNLIMITED);
    26 string siteURL = ConfigurationManager.AppSettings["SiteURL"];
    27 //for (int i = 1; i <= GetMaxId(siteURL); i++)
    28
    // NOTE(review): the topic-id range is hard-coded for testing; the commented
    // loop above would crawl every topic up to the newest id from the RSS feed.
    29 for (int i = 900; i <= 1000; i++)
    30 {
    31 log.Debug("开始爬编号为" + i.ToString() + "的帖子");
    32 try
    33 {
    34 WebClient wc = new WebClient();
    35 wc.Encoding = Encoding.UTF8;//assumes the forum pages are served as UTF-8
    36 string url = siteURL + "showtopic-" + i + ".aspx";
    37 //string url = "http://localhost:8081/showtopic-" + i + ".aspx";
    38 string txt = wc.DownloadString(url);
    39 HTMLDocumentClass htmldoc = new HTMLDocumentClass();
    40 htmldoc.designMode = "on"; //design mode stops the parser from executing javascript
    41 htmldoc.IHTMLDocument2_write(txt);
    42 string title = htmldoc.title;
    43 string bodyText = htmldoc.body.innerText;
    44
    45 writer.DeleteDocuments(new Term("url", url));//url acts as the primary key: delete any old copy so re-indexing never duplicates documents
    46
    47 Document document = new Document();
    48 document.Add(new Field("url", url, Field.Store.YES, Field.Index.NOT_ANALYZED));
    49 document.Add(new Field("title", title, Field.Store.YES, Field.Index.NOT_ANALYZED));
    // body is analyzed and stores term positions/offsets, needed for highlighting.
    50 document.Add(new Field("body", bodyText, Field.Store.YES, Field.Index.ANALYZED, Lucene.Net.Documents.Field.TermVector.WITH_POSITIONS_OFFSETS));
    51
    52 writer.AddDocument(document);
    53 log.Debug("爬编号为" + i.ToString() + "的帖子结束");
    54 }
    55 catch (Exception ex)
    56 {
    // Log and continue: one failed page must not abort the whole crawl.
    57 log.Error("爬编号为" + i.ToString() + "的帖子发生异常", ex);
    58 }
    59 }
    60 log.Debug("结束索引,开始关闭Writer和Directory");
    61 writer.Close();
    62 directory.Close();
    63 log.Debug("关闭Writer和Directiory完成");
    64 //ClientScript.RegisterStartupScript(GetType(), "alert", "alert('索引完成')", true);
    65 }
    66
    67 #endregion
    68
    69
    // Reads the site's RSS feed and extracts the newest topic id from the
    // first item's link (pattern "showtopic-<id>").
    70 private static int GetMaxId(string siteURL) //returns the newest topic id
    71 {
    72 WebClient wc = new WebClient();
    73 wc.Encoding = Encoding.UTF8;
    74 string html = wc.DownloadString(siteURL + "tools/rss.aspx");
    75 XDocument doc = XDocument.Parse(html);
    76 string link = doc.Descendants("item").First().Element("link").Value;
    77 System.Text.RegularExpressions.Regex regex = new Regex(@"showtopic-(\d+)");
    78 Match match = regex.Match(link);
    79 string id = match.Groups[1].Value;
    80
    81 return Convert.ToInt32(id);
    82 }
    83
    84
    85
    86 }

    Default.aspx页面前台html代码(表单form中):

    <!-- Search form: submitted via GET so the keyword stays in the query
         string (readable via Request["kw"]) and results are bookmarkable. -->
    1 <form id="form1" action="Default.aspx" method ="get">
    2 <div>
    3
    <!-- Text box echoes the previously searched keyword back to the user. -->
    4 <input type="text" id="txtKw" value="<%=Request["kw"] %>" name ="kw" />
    5 <script type="text/javascript" >
    // jQuery UI autocomplete: fetch suggestions from SearchSuggestion.ashx
    // and submit the form as soon as the user selects one.
    6 $("#txtKw").autocomplete({
    7 source:"SearchSuggestion.ashx" , select:function (e, ui) {
    8 $("#txtKw").val(ui.item.value);
    9 $("#sb").click();
    10 } //fires the Ajax suggestion requests automatically
    11 });
    12 </script>
    13
    14 <input type="submit" id="sb" value="搜索" />
    15
    16 </div>
    <!-- Results list bound in Page_Load; TITLE/BODY already contain
         highlighting <font> markup from the code-behind. -->
    17 <asp:Repeater ID="RepeaterResult" runat="server">
    18 <ItemTemplate>
    19 <a href='<%#Eval("URL")%>' > <%#Eval("TITLE")%> </a>
    20 <br />
    21 <p>
    22 <%#Eval("BODY") %>
    23 </p>
    24 </ItemTemplate>
    25 </asp:Repeater>
    26 </form>

    注意把viewstate禁用了,防止最后生成的客户端html代码中有一大堆viewstate的东东,显得我们不专业 呵呵~(EnableViewState ="false")

    我们知道一旦禁用viewstate,所有跑在服务端的基本控件都不能用了(那些数据绑定和链接控件除外),所以我们回归原始的html,用get方法提交表单。

    因此后台代码我们就在Page_Load方法中实现:表单提交后,通过判断用户输入的关键字Request["kw"]是否为空,来决定是否执行搜索逻辑。后台处理代码如下(其中已有注释,我就不详细解释了):

    /// <summary>
    /// Search page code-behind: runs a Lucene.Net phrase query for the
    /// keyword in Request["kw"], highlights the hits, and binds them to
    /// the RepeaterResult control.
    /// </summary>
    1 public partial class _Default : System.Web.UI.Page
    2 {
    3 protected void Page_Load(object sender, EventArgs e)
    4 {
    // Nothing to do until the form is submitted with a keyword.
    5 if (string.IsNullOrEmpty(Request["kw"]))
    6 {
    7 return;
    8 }
    9 else
    10 {
    11
    12 string kw = Request["kw"];
    13 new SearchLogTableAdapter().Insert(Guid.NewGuid(), DateTime.Now, Request.UserHostAddress, kw); //record the keyword (with timestamp and client IP) in the database
    14
    15 string indexPath = Server.MapPath("~/Index"); //folder holding the index files built beforehand by the Lucene indexing job
    16 FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NativeFSLockFactory());
    17 IndexReader indexReader = IndexReader.Open(directory, true);
    18 IndexSearcher searcher = new IndexSearcher(indexReader);//searcher over the read-only index
    19 PhraseQuery query = new PhraseQuery();//query built from the segmented keyword
    20 foreach (string word in segString(kw)) //Pan Gu word segmentation
    21 {
    22 query.Add(new Term("body", word)); //each segmented word becomes a term of the phrase query
    23
    24 }
    25 query.SetSlop(1000); //terms may be up to 1000 positions apart and still match
    26
    27 TopScoreDocCollector collector = TopScoreDocCollector.create(1000, true);//collector keeps at most the top 1000 documents
    28 searcher.Search(query, null, collector);//run the query; hits accumulate in the collector
    29 TopDocs topDocs = collector.TopDocs();//ranked results
    30 int sum = collector.GetTotalHits(); //total number of hits (not used further here)
    31 List<SearchResult> list = new List<SearchResult>();
    32 foreach (ScoreDoc scoreDoc in topDocs.scoreDocs)
    33 {
    34
    35 int docId = scoreDoc.doc;//internal Lucene document id of the hit
    36 Document document = searcher.Doc(docId);//load the stored document by id
    37 string url = document.Get("url");
    38 string title = document.Get("title");
    39 string body = document.Get("body");
    40
    41 SimpleHTMLFormatter simpleHtmlFormatter = new SimpleHTMLFormatter("<font color='red'>", "</font>"); //wrap matched keywords in red <font> tags; other effects are possible too
    42 Highlighter highlighter = new Highlighter(simpleHtmlFormatter, new Segment());
    43 highlighter.FragmentSize = 200;
    44 body = highlighter.GetBestFragment(kw, body); //best 200-char fragment with the keywords highlighted
    45
    46 SimpleHTMLFormatter simpleHtmlFormatter1 = new SimpleHTMLFormatter("<font style='background-color:red'>", "</font>"); //title matches get a red background
    47 Highlighter highlighter1 = new Highlighter(simpleHtmlFormatter1, new Segment());
    48 highlighter1.FragmentSize = 200;
    49 title = highlighter1.GetBestFragment(title, title);
    50
    51 SearchResult result = new SearchResult()
    52 {
    53 URL = url,
    54 TITLE = title,
    55 BODY = body
    56 };
    57 list.Add(result);
    58
    59 }
    // Release the Lucene resources before binding the results.
    60 searcher.Close();
    61 indexReader.Close();
    62 directory.Close();
    63 RepeaterResult.DataSource = list;
    64 RepeaterResult.DataBind();
    65 }
    66
    67 }
    // Splits the input string into words using the Pan Gu segmenter.
    68 private static string[] segString(string s)
    69 {
    70 Segment segment = new Segment();
    71 return (from wordInfo in segment.DoSegment(s) select wordInfo.Word).ToArray();
    72 }
    73
    74 }
    /// <summary>One search hit; bound to RepeaterResult on the page.
    /// TITLE and BODY may contain highlighting <font> markup.</summary>
    75 public class SearchResult
    76 {
    77 public string URL { get; set; }
    78 public string TITLE { get; set; }
    79 public string BODY { get; set; }
    80 }

    接下去我们实现输入关键词自动补全。用JQueryUI和AJAX请求后台关键词效果。

    未完,待续。。。

  • 相关阅读:
    poj 2728 Desert King(最小比率生成树,迭代法)
    HDU
    hud 2089 不要62 (数位dp)
    食物链(带全并查集)
    docNet基础学完感想
    zoj 1081 (改进的弧长算法)(转)
    zoj 1962 How Many Fibs?(字符串化为数字处理)
    zoj 1109 zoj 1109 Language of FatMouse(字典树)
    iOS开发网络数据之AFNetworking使用
    iOS 使用AFNetworking
  • 原文地址:https://www.cnblogs.com/lys_013/p/1851963.html
Copyright © 2020-2023  润新知