由于公司这段时间比较缺人手,加上游戏厂商的专题活动也较为频繁,不得不做一个新闻采集的小软件,采集别人站点的一些新闻到我们平台上。 自己总结了下,新闻采集主要有几点: 1、通过模拟http请求,请求页面内容 2、通过正则表达式,把页面内容进行过滤,取出想要的部分。 3、把数据给整合成符合我们需要的数据。
模拟http请求,请求页面内容 关于模拟http请求。我这里不详细解释,如果英语好的,可以看官方的文档:HttpWebRequest 我贴下我这里的模拟Http请求的helper
/// <summary>
/// Performs an HTTP GET request and returns the response body as text,
/// decoded with the StreamReader default encoding (UTF-8).
/// </summary>
/// <param name="url">Absolute URL to request.</param>
/// <returns>The response body, or string.Empty when <paramref name="url"/> is null or empty.</returns>
public static string GetHttpRequest(string url)
{
    if (string.IsNullOrEmpty(url)) return string.Empty;

    HttpWebRequest request = WebRequest.Create(url) as HttpWebRequest;
    request.Method = "GET";
    request.ServicePoint.Expect100Continue = false;

    // using blocks guarantee the response and stream are closed exactly once.
    // The original called request.GetResponse() a second time inside finally
    // (re-issuing work just to close it) and dereferenced a null StreamReader
    // when GetResponse() itself threw.
    using (WebResponse response = request.GetResponse())
    using (Stream body = response.GetResponseStream())
    using (StreamReader reader = new StreamReader(body))
    {
        return reader.ReadToEnd();
    }
}

/// <summary>
/// Performs an HTTP GET request and returns the raw response bytes.
/// </summary>
/// <param name="url">Absolute URL to request.</param>
/// <returns>The response body bytes, or null when <paramref name="url"/> is null or empty.</returns>
public static byte[] GetHttpRequestStream(string url)
{
    if (string.IsNullOrEmpty(url)) return null;

    HttpWebRequest request = WebRequest.Create(url) as HttpWebRequest;
    // Cookie container kept from the original so cookies survive redirects.
    request.CookieContainer = new CookieContainer();
    request.Method = "GET";
    request.ServicePoint.Expect100Continue = false;

    using (WebResponse response = request.GetResponse())
    using (Stream body = response.GetResponseStream())
    using (MemoryStream buffer = new MemoryStream())
    {
        // Chunked copy replaces the original one-byte-at-a-time
        // ReadByte()/List<byte> loop.
        byte[] chunk = new byte[4096];
        int read;
        while ((read = body.Read(chunk, 0, chunk.Length)) > 0)
        {
            buffer.Write(chunk, 0, read);
        }
        return buffer.ToArray();
    }
}

/// <summary>
/// Performs an HTTP GET request and returns the response body decoded with
/// the caller-supplied encoding.
/// </summary>
/// <param name="url">Absolute URL to request.</param>
/// <param name="ec">Encoding used to decode the response bytes.</param>
/// <returns>The response body, or string.Empty when <paramref name="url"/> is null or empty.</returns>
public static string GetHttpRequest(string url, Encoding ec)
{
    if (string.IsNullOrEmpty(url)) return string.Empty;

    HttpWebRequest request = WebRequest.Create(url) as HttpWebRequest;
    request.Method = "GET";
    request.ServicePoint.Expect100Continue = false;

    using (WebResponse response = request.GetResponse())
    using (Stream body = response.GetResponseStream())
    using (StreamReader reader = new StreamReader(body, ec))
    {
        return reader.ReadToEnd();
    }
}
我采集的是完美官网的新闻,新闻链接: http://sw.wanmei.com/news/gamenews/list.shtml
请求的代码:
// str holds the raw HTML of the news-list page returned by the GET request.
string str = PostRegister.Tools.GetHttpRequest("http://sw.wanmei.com/news/gamenews/list.shtml");
通过正则表达式,过滤html内容:
// Match every <a ...>...</a>, capturing the href (double-quoted, single-quoted,
// or unquoted) and the inner text. The character classes must be \s / [\s\S]:
// the original wrote [sS] / [^s>], which only matches the literal letters.
string reg = @"<a[^>]*href=(""(?<href>[^""]*)""|'(?<href>[^']*)'|(?<href>[^\s>]*))[^>]*>(?<text>[\s\S]*?)</a>";
MatchCollection mc = Regex.Matches(str, reg);
StringBuilder strTitle = new StringBuilder();
Dictionary<string, string> titleUrlList = new Dictionary<string, string>();
for (int i = 0; i < mc.Count - 1; i++)
{
    string text = mc[i].Groups["text"].Value; // inner text of the current <a>
    string nextAnchor = mc[i + 1].Groups[0].Value;
    // A category anchor containing "新闻" is immediately followed by the article
    // anchor (target=_blank and not one of the hidefocus navigation links).
    if (text.Contains("新闻") && nextAnchor.Contains("_blank") && nextAnchor.Contains("hidefocus") == false)
    {
        string title = mc[i + 1].Groups["text"].Value;
        // Guard: Dictionary.Add throws on a duplicate title, which the original
        // code would crash on if the page repeats an article.
        if (!titleUrlList.ContainsKey(title))
        {
            strTitle.Append("'" + title + "',"); // build the quoted title list for the DB query
            titleUrlList.Add(title, "http://sw.wanmei.com" + mc[i + 1].Groups["href"].Value); // title -> article URL
        }
    }
}
DbClassLibrary.Spiders.CommonSpider commonSpider = new CommonSpider();
string allTitle = "";
if (strTitle.Length > 0)
{
    // Drop the trailing comma from the concatenated 'title','title',... list.
    allTitle = strTitle.ToString(0, strTitle.Length - 1);
    // Query titles not yet stored in our DB; gameID = 1 identifies this game (圣王).
    List<string> allNotExists = commonSpider.GetNotExistsNews(allTitle, 1);
    for (int i = 0; i < allNotExists.Count; i++)
    {
        listBoxTtitle.Items.Add(allNotExists[i]);
        listBoxLink.Items.Add(titleUrlList[allNotExists[i]]);
    }
}
lblResult.Text = "共发现" + listBoxTtitle.Items.Count + "条新数据";
if (listBoxTtitle.Items.Count > 0)
    MessageBox.Show("啦啦啦,发现新数据!共发现" + listBoxTtitle.Items.Count + "条新数据");
else
{
    MessageBox.Show("对不起,暂时没有发现官网有新数据");
}
通过上面的代码,我们过滤出来所有符合条件的新闻的标题和新闻链接。 注:关于正则表达式的使用方法和解释,可参照我的另外一篇文章: http://www.woaic.com/2012/09/159 下面就是请求新闻链接,获取新闻主体内容了,方法也挺简单,贴出来,主要是获取id是article_txt的内容:
/// <summary>
/// Fetches a news detail page and extracts the article body — the contents of
/// the div with id="article_txt" — rewriting site-relative image paths to
/// absolute URLs.
/// </summary>
/// <param name="url">News detail page URL.</param>
/// <returns>The article HTML, or an empty string when no article div is found.</returns>
private string GetHtmlContent(string url)
{
    string str = PostRegister.Tools.GetHttpRequest(url);
    // [\s\S]*? (the original's [sS]*? lacked the backslashes and matched only
    // the letters s/S); the (?!</?div>) lookahead stops the match at nested divs.
    string reg = @"<div id=""article_txt"">((?!</?div>)[\s\S]*?)</div>";
    Match m = Regex.Match(str, reg);
    if (!m.Success) return "";

    // Group 1 is the div's inner content, which the original reconstructed by
    // trimming the wrapper tags off Groups[0] — the opening-tag Replace there
    // used unescaped quotes ("<div id="article_txt">") and did not compile.
    return m.Groups[1].Value
        .Replace("/resources/JPG", "http://sw.wanmei.com/resources/JPG")
        .Replace("/resources/jpg", "http://sw.wanmei.com/resources/jpg")
        .Trim();
}
希望对你能有所帮助,^_^