• 关于网络爬虫和网站数据采集的一些总结


    由于公司这段时间比较缺人手,而游戏厂商的专题活动又较为频繁,我不得不做一个新闻采集的小软件,把别人站点上的一些新闻采集到我们的平台上。自己总结了一下,新闻采集主要有三步:1、模拟 HTTP 请求,获取页面内容;2、用正则表达式过滤页面内容,取出想要的部分;3、把取出的数据整理成符合我们需要的格式。

    模拟 HTTP 请求,请求页面内容。关于模拟 HTTP 请求,我这里不详细解释,英语好的读者可以直接看官方文档中的 HttpWebRequest。下面贴出我这里用来模拟 HTTP 请求的 helper:

            /// <summary>
            /// Issues an HTTP GET for <paramref name="url"/> and returns the response body as text.
            /// </summary>
            /// <param name="url">Address to request. A null or empty value skips the request.</param>
            /// <returns>
            /// The response body as read by a <see cref="StreamReader"/> with its default encoding,
            /// or <see cref="string.Empty"/> when <paramref name="url"/> is null or empty.
            /// </returns>
            /// <remarks>
            /// Exceptions raised while sending the request or reading the body are not caught here;
            /// they propagate to the caller unchanged.
            /// </remarks>
            public static string GetHttpRequest(string url)
            {
                if (string.IsNullOrEmpty(url))
                {
                    return string.Empty;
                }

                // Direct cast: a non-HTTP scheme fails here with a clear InvalidCastException
                // instead of a NullReferenceException on the next line.
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
                request.Method = "GET";
                request.ServicePoint.Expect100Continue = false;

                // The response is obtained exactly once and everything is disposed by the using
                // blocks, so a failure in GetResponse() surfaces as-is and nothing is leaked.
                using (WebResponse response = request.GetResponse())
                using (Stream body = response.GetResponseStream())
                using (StreamReader reader = new StreamReader(body))
                {
                    return reader.ReadToEnd();
                }
            }
            /// <summary>
            /// Issues an HTTP GET for <paramref name="url"/> and returns the raw response body.
            /// </summary>
            /// <param name="url">Address to request. A null or empty value skips the request.</param>
            /// <returns>
            /// The undecoded bytes of the response body, or <c>null</c> when
            /// <paramref name="url"/> is null or empty.
            /// </returns>
            /// <remarks>
            /// Exceptions raised while sending the request or reading the body are not caught here;
            /// they propagate to the caller unchanged.
            /// </remarks>
            public static byte[] GetHttpRequestStream(string url)
            {
                if (string.IsNullOrEmpty(url))
                {
                    // Kept as null (not an empty array) so existing callers' null checks still work.
                    return null;
                }

                // Direct cast: a non-HTTP scheme fails here with a clear InvalidCastException.
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
                request.CookieContainer = new CookieContainer();
                request.Method = "GET";
                request.ServicePoint.Expect100Continue = false;

                // Copy the body stream directly: no text reader is involved, so no decoding or
                // buffering layer can interfere with the bytes.
                using (WebResponse response = request.GetResponse())
                using (Stream body = response.GetResponseStream())
                using (MemoryStream buffer = new MemoryStream())
                {
                    body.CopyTo(buffer);
                    return buffer.ToArray();
                }
            }
    
            /// <summary>
            /// Issues an HTTP GET for <paramref name="url"/> and returns the response body decoded
            /// with the supplied encoding.
            /// </summary>
            /// <param name="url">Address to request. A null or empty value skips the request.</param>
            /// <param name="ec">Encoding passed to the <see cref="StreamReader"/> that reads the body.</param>
            /// <returns>
            /// The decoded response body, or <see cref="string.Empty"/> when
            /// <paramref name="url"/> is null or empty.
            /// </returns>
            /// <remarks>
            /// Exceptions raised while sending the request or reading the body are not caught here;
            /// they propagate to the caller unchanged.
            /// </remarks>
            public static string GetHttpRequest(string url, Encoding ec)
            {
                if (string.IsNullOrEmpty(url))
                {
                    return string.Empty;
                }

                // Direct cast: a non-HTTP scheme fails here with a clear InvalidCastException
                // instead of a NullReferenceException on the next line.
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
                request.Method = "GET";
                request.ServicePoint.Expect100Continue = false;

                // The response is obtained exactly once and everything is disposed by the using
                // blocks, so a failure in GetResponse() surfaces as-is and nothing is leaked.
                using (WebResponse response = request.GetResponse())
                using (Stream body = response.GetResponseStream())
                using (StreamReader reader = new StreamReader(body, ec))
                {
                    return reader.ReadToEnd();
                }
            }

    我采集的是完美官网的新闻,新闻链接: http://sw.wanmei.com/news/gamenews/list.shtml

    请求的代码:

    // str holds the HTML content returned by the request 
    
    string str = PostRegister.Tools.GetHttpRequest("http://sw.wanmei.com/news/gamenews/list.shtml"); 

    通过正则表达式,过滤html内容:

    // Matches every <a> element. "href" captures the link target (double-quoted, single-quoted or
    // unquoted) and "text" captures everything between the opening and closing tag.
    string reg = @"<a[^>]*href=(""(?<href>[^""]*)""|'(?<href>[^']*)'|(?<href>[^\s>]*))[^>]*>(?<text>[\s\S]*?)</a>";
                MatchCollection mc = Regex.Matches(str, reg);
                List<string> quotedTitles = new List<string>();
                Dictionary<string, string> titleUrlList = new Dictionary<string, string>();

                // An anchor whose text contains "新闻" marks a news entry; the headline itself is the
                // anchor that follows it, provided it opens in a new window and is not a "hidefocus" link.
                for (int i = 0; i < mc.Count - 1; i++)
                {
                    string text = mc[i].Groups["text"].Value;
                    Match next = mc[i + 1];
                    string nextTag = next.Groups[0].Value;

                    if (!text.Contains("新闻") || !nextTag.Contains("_blank") || nextTag.Contains("hidefocus"))
                    {
                        continue;
                    }

                    string title = next.Groups["text"].Value;
                    if (titleUrlList.ContainsKey(title))
                    {
                        // The same headline listed twice would make Dictionary.Add throw; keep the first.
                        continue;
                    }

                    // NOTE(review): the titles are scraped text placed inside single quotes for a
                    // database query built elsewhere. A title containing ' will break that query;
                    // confirm how GetNotExistsNews consumes this string and parameterize it there.
                    quotedTitles.Add("'" + title + "'");
                    titleUrlList.Add(title, "http://sw.wanmei.com" + next.Groups["href"].Value); // headline -> absolute link
                }

                DbClassLibrary.Spiders.CommonSpider commonSpider = new CommonSpider();
                if (quotedTitles.Count > 0)
                {
                    // Comma-separated list of quoted titles, e.g. 'a','b'
                    string allTitle = string.Join(",", quotedTitles.ToArray());
                    List<string> allNotExists = commonSpider.GetNotExistsNews(allTitle, 1); // titles not yet stored; the second argument (1) is the game id
                    for (int i = 0; i < allNotExists.Count; i++)
                    {
                        string link;
                        if (!titleUrlList.TryGetValue(allNotExists[i], out link))
                        {
                            // A title returned in a different form than it was sent has no known link;
                            // skip it so the two list boxes stay aligned instead of throwing.
                            continue;
                        }

                        listBoxTtitle.Items.Add(allNotExists[i]);
                        listBoxLink.Items.Add(link);
                    }
                }

                lblResult.Text = "共发现" + listBoxTtitle.Items.Count + "条新数据";
                if (listBoxTtitle.Items.Count > 0)
                {
                    MessageBox.Show("啦啦啦,发现新数据!共发现" + listBoxTtitle.Items.Count + "条新数据");
                }
                else
                {
                    MessageBox.Show("对不起,暂时没有发现官网有新数据");
                }

    通过上面的代码,我们过滤出了所有符合条件的新闻标题和新闻链接。注:关于正则表达式的使用方法和解释,可参照我的另外一篇文章: http://www.woaic.com/2012/09/159 。下面就是请求新闻链接、获取新闻主体内容了,方法也挺简单,贴出来,主要是获取 id 为 article_txt 的元素的内容:

            /// <summary>
            /// Downloads a news detail page and returns the inner HTML of the div whose id is
            /// <c>article_txt</c>, with site-relative image paths made absolute.
            /// </summary>
            /// <param name="url">Address of the news detail page.</param>
            /// <returns>
            /// The trimmed inner HTML of the first matching div, or an empty string when the page
            /// contains no such div.
            /// </returns>
            private string GetHtmlContent(string url)
            {
                string html = PostRegister.Tools.GetHttpRequest(url);

                // Group 1 is the content between the opening tag and the first closing </div>.
                // The match is lazy, so an article containing nested divs is cut at the first one.
                Match article = Regex.Match(html, @"<div id=""article_txt"">((?!</?div>)[\s\S]*?)</div>");
                if (!article.Success)
                {
                    return "";
                }

                // The scraped HTML is used as plain data, never as a format string, so braces in
                // the article cannot raise a FormatException.
                return article.Groups[1].Value
                    .Replace("/resources/JPG", "http://sw.wanmei.com/resources/JPG")
                    .Replace("/resources/jpg", "http://sw.wanmei.com/resources/jpg")
                    .Trim();
            }

    希望对你能有所帮助,^_^

  • 相关阅读:
    android中textview字数过长解决方法
    Android的EditText无法自动弹出输入法问题 .
    android中dip、dp、px、sp和屏幕密度
    android横竖屏切换 判断activity 是横屏还是竖屏
    设置ListView中图片的大小大方法 Android
    TextView属性详细分析
    ArcGIS API For Silverlight 实例分析
    Visual Studio 2008 里修改数据库表结构报错 解决办法
    未能加载文件或程序集“xxx”或它的某一个依赖项。生成此程序集的运行时比当前加载的运行时新,无法加载此程序集
    SuperMap iClient 6R for Silverlight 产品简介及Beta测试软件下载地址
  • 原文地址:https://www.cnblogs.com/woaic/p/3942923.html
Copyright © 2020-2023  润新知