由于公司这段时间比较缺人手,加上游戏厂商的专题活动也较为频繁,不得不做一个新闻采集的小软件,采集别人站点的一些新闻到我们平台上。 自己总结了下,新闻采集主要有几点: 1、通过模拟http请求,请求页面内容 2、通过正则表达式,把页面内容进行过滤,取出想要的部分。 3、把数据给整合成符合我们需要的数据。
模拟http请求,请求页面内容 关于模拟http请求。我这里不详细解释,如果英语好的,可以看官方的文档:HttpWebRequest 我贴下我这里的模拟Http请求的helper
/// <summary>
/// Performs an HTTP GET request and returns the response body as text,
/// decoded with the StreamReader default encoding (UTF-8).
/// </summary>
/// <param name="url">Absolute URL to request.</param>
/// <returns>The response body, or string.Empty when <paramref name="url"/> is null or empty.</returns>
public static string GetHttpRequest(string url)
{
    if (string.IsNullOrEmpty(url)) return string.Empty;

    HttpWebRequest request = WebRequest.Create(url) as HttpWebRequest;
    request.Method = "GET";
    request.ServicePoint.Expect100Continue = false;

    // using blocks guarantee the response and stream are closed exactly once.
    // The original called request.GetResponse() a second time inside finally
    // (re-issuing work just to close it) and dereferenced a null StreamReader
    // when GetResponse() itself threw.
    using (WebResponse response = request.GetResponse())
    using (Stream body = response.GetResponseStream())
    using (StreamReader reader = new StreamReader(body))
    {
        return reader.ReadToEnd();
    }
}

/// <summary>
/// Performs an HTTP GET request and returns the raw response bytes.
/// </summary>
/// <param name="url">Absolute URL to request.</param>
/// <returns>The response body bytes, or null when <paramref name="url"/> is null or empty.</returns>
public static byte[] GetHttpRequestStream(string url)
{
    if (string.IsNullOrEmpty(url)) return null;

    HttpWebRequest request = WebRequest.Create(url) as HttpWebRequest;
    // Cookie container kept from the original so cookies survive redirects.
    request.CookieContainer = new CookieContainer();
    request.Method = "GET";
    request.ServicePoint.Expect100Continue = false;

    using (WebResponse response = request.GetResponse())
    using (Stream body = response.GetResponseStream())
    using (MemoryStream buffer = new MemoryStream())
    {
        // Chunked copy replaces the original one-byte-at-a-time
        // ReadByte()/List<byte> loop.
        byte[] chunk = new byte[4096];
        int read;
        while ((read = body.Read(chunk, 0, chunk.Length)) > 0)
        {
            buffer.Write(chunk, 0, read);
        }
        return buffer.ToArray();
    }
}

/// <summary>
/// Performs an HTTP GET request and returns the response body decoded with
/// the caller-supplied encoding.
/// </summary>
/// <param name="url">Absolute URL to request.</param>
/// <param name="ec">Encoding used to decode the response bytes.</param>
/// <returns>The response body, or string.Empty when <paramref name="url"/> is null or empty.</returns>
public static string GetHttpRequest(string url, Encoding ec)
{
    if (string.IsNullOrEmpty(url)) return string.Empty;

    HttpWebRequest request = WebRequest.Create(url) as HttpWebRequest;
    request.Method = "GET";
    request.ServicePoint.Expect100Continue = false;

    using (WebResponse response = request.GetResponse())
    using (Stream body = response.GetResponseStream())
    using (StreamReader reader = new StreamReader(body, ec))
    {
        return reader.ReadToEnd();
    }
}
我采集的是完美官网的新闻,新闻链接: http://sw.wanmei.com/news/gamenews/list.shtml
请求的代码:
// str holds the raw HTML of the news-list page returned by the GET request.
string str = PostRegister.Tools.GetHttpRequest("http://sw.wanmei.com/news/gamenews/list.shtml");
通过正则表达式,过滤html内容:
// Match every <a ...>...</a>, capturing the href (double-quoted, single-quoted,
// or unquoted) and the inner text. The character classes must be \s / [\s\S]:
// the original wrote [sS] / [^s>], which only matches the literal letters.
string reg = @"<a[^>]*href=(""(?<href>[^""]*)""|'(?<href>[^']*)'|(?<href>[^\s>]*))[^>]*>(?<text>[\s\S]*?)</a>";
MatchCollection mc = Regex.Matches(str, reg);
StringBuilder strTitle = new StringBuilder();
Dictionary<string, string> titleUrlList = new Dictionary<string, string>();
for (int i = 0; i < mc.Count - 1; i++)
{
    string text = mc[i].Groups["text"].Value; // inner text of the current <a>
    string nextAnchor = mc[i + 1].Groups[0].Value;
    // A category anchor containing "新闻" is immediately followed by the article
    // anchor (target=_blank and not one of the hidefocus navigation links).
    if (text.Contains("新闻") && nextAnchor.Contains("_blank") && nextAnchor.Contains("hidefocus") == false)
    {
        string title = mc[i + 1].Groups["text"].Value;
        // Guard: Dictionary.Add throws on a duplicate title, which the original
        // code would crash on if the page repeats an article.
        if (!titleUrlList.ContainsKey(title))
        {
            strTitle.Append("'" + title + "',"); // build the quoted title list for the DB query
            titleUrlList.Add(title, "http://sw.wanmei.com" + mc[i + 1].Groups["href"].Value); // title -> article URL
        }
    }
}
DbClassLibrary.Spiders.CommonSpider commonSpider = new CommonSpider();
string allTitle = "";
if (strTitle.Length > 0)
{
    // Drop the trailing comma from the concatenated 'title','title',... list.
    allTitle = strTitle.ToString(0, strTitle.Length - 1);
    // Query titles not yet stored in our DB; gameID = 1 identifies this game (圣王).
    List<string> allNotExists = commonSpider.GetNotExistsNews(allTitle, 1);
    for (int i = 0; i < allNotExists.Count; i++)
    {
        listBoxTtitle.Items.Add(allNotExists[i]);
        listBoxLink.Items.Add(titleUrlList[allNotExists[i]]);
    }
}
lblResult.Text = "共发现" + listBoxTtitle.Items.Count + "条新数据";
if (listBoxTtitle.Items.Count > 0)
    MessageBox.Show("啦啦啦,发现新数据!共发现" + listBoxTtitle.Items.Count + "条新数据");
else
{
    MessageBox.Show("对不起,暂时没有发现官网有新数据");
}
通过上面的代码,我们过滤出来所有符合条件的新闻的标题和新闻链接。 注:关于正则表达式的使用方法和解释,可参照我的另外一篇文章: http://www.woaic.com/2012/09/159 下面就是请求新闻链接,获取新闻主体内容了,方法也挺简单,贴出来,主要是获取id是article_txt的内容:
/// <summary>
/// Fetches a news detail page and extracts the article body — the contents of
/// the div with id="article_txt" — rewriting site-relative image paths to
/// absolute URLs.
/// </summary>
/// <param name="url">News detail page URL.</param>
/// <returns>The article HTML, or an empty string when no article div is found.</returns>
private string GetHtmlContent(string url)
{
    string str = PostRegister.Tools.GetHttpRequest(url);
    // [\s\S]*? (the original's [sS]*? lacked the backslashes and matched only
    // the letters s/S); the (?!</?div>) lookahead stops the match at nested divs.
    string reg = @"<div id=""article_txt"">((?!</?div>)[\s\S]*?)</div>";
    Match m = Regex.Match(str, reg);
    if (!m.Success) return "";

    // Group 1 is the div's inner content, which the original reconstructed by
    // trimming the wrapper tags off Groups[0] — the opening-tag Replace there
    // used unescaped quotes ("<div id="article_txt">") and did not compile.
    return m.Groups[1].Value
        .Replace("/resources/JPG", "http://sw.wanmei.com/resources/JPG")
        .Replace("/resources/jpg", "http://sw.wanmei.com/resources/jpg")
        .Trim();
}
希望对你能有所帮助,^_^