• C# 多线程网络爬虫


    原文 C#制作多线程处理强化版网络爬虫

    上次帮公司妹子做了一个爬虫,不是很精致,这次公司项目里要用到,于是又做了一番修改,功能添加了网址图片采集、下载,线程处理界面网址图片下载等。

    说说思路:首先获取初始网址的所有内容,在初始网址采集图片,再到初始网址采集链接,把采集到的链接放入队列,继续采集图片,然后继续采集链接,无限循环。

    还是上图片大家看一下:

    处理网页内容抓取跟网页网址爬取都做了改进,下面还是大家来看看代码,有不足之处,还请指出!

    网页内容抓取HtmlCodeRequest,

    网页网址爬取GetHttpLinks,用正则去筛选html中的Links

    图片抓取GetHtmlImageUrlList,用正则去筛选html中的Img

    都写进了一个封装类里面 HttpHelper

     /// <summary>
     /// Downloads the page at <paramref name="Url"/> with an HTTP GET and returns its body as text.
     /// </summary>
     /// <param name="Url">Absolute address of the page to fetch.</param>
     /// <returns>
     /// The response body decoded as UTF-8, or an empty string when <paramref name="Url"/> is
     /// null or empty or when the request fails for any reason (this method never throws).
     /// </returns>
     public static string HtmlCodeRequest(string Url)
     {
         if (string.IsNullOrEmpty(Url))
         {
             return "";
         }
         try
         {
             // The default limit is only copied into ServicePoint objects when they are created,
             // so it has to be raised before the request (and its ServicePoint) exists.
             ServicePointManager.DefaultConnectionLimit = 30;

             HttpWebRequest httprequst = (HttpWebRequest)WebRequest.Create(Url);
             // Keep the connection open so later requests to the same host can reuse it.
             httprequst.KeepAlive = true;
             httprequst.Method = "GET";
             // Header value only: the property adds the "User-Agent:" name itself.
             httprequst.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705)";
             httprequst.Accept = "*/*";
             httprequst.Headers.Add("Accept-Language", "zh-cn,en-us;q=0.5");
             httprequst.ServicePoint.Expect100Continue = false;
             // Give up after 5 seconds.
             httprequst.Timeout = 5000;
             // Follow HTTP redirects (e.g. 302) automatically.
             httprequst.AllowAutoRedirect = true;

             // Dispose the response, stream and reader even when reading fails, so the
             // underlying connection is always released.
             using (HttpWebResponse webRes = (HttpWebResponse)httprequst.GetResponse())
             using (System.IO.Stream stream = webRes.GetResponseStream())
             using (System.IO.StreamReader reader = new System.IO.StreamReader(stream, System.Text.Encoding.UTF8))
             {
                 return reader.ReadToEnd();
             }
         }
         catch (Exception)
         {
             // Deliberate best effort: an unreachable or malformed URL is reported to the
             // caller as "no content" instead of an exception.
             return "";
         }
     }
    /// <summary>
    /// Fetches the page at <paramref name="url"/> and collects the <c>src</c> value of every
    /// <c>&lt;img&gt;</c> tag found in its HTML.
    /// </summary>
    /// <param name="url">Address of the page to scan for images.</param>
    /// <returns>
    /// The <c>src</c> values in document order, exactly as written in the page (duplicates are
    /// kept); an empty list when no HTML was obtained.
    /// </returns>
    public static List<string> GetHtmlImageUrlList(string url)
    {
        List<string> sUrlList = new List<string>();

        string html = HttpHelper.HtmlCodeRequest(url);
        if (string.IsNullOrEmpty(html))
        {
            return sUrlList;
        }

        // Matches an <img ...> tag and captures its src value (quoted or unquoted) in "imgUrl".
        // The whitespace classes must be written as \s: without the backslash the class matches
        // the letter 's' and every captured URL is truncated at its first 's'.
        Regex regImg = new Regex(
            @"<img\b[^<>]*?\bsrc\s*=\s*[""']?\s*(?<imgUrl>[^\s""'<>]*)[^<>]*?/?\s*>",
            RegexOptions.IgnoreCase);

        foreach (Match match in regImg.Matches(html))
        {
            sUrlList.Add(match.Groups["imgUrl"].Value);
        }
        return sUrlList;
    }
     
     
        /// <summary>
        /// Fetches the page at <paramref name="url"/> and extracts the links it contains: first
        /// every absolute http/https URL appearing anywhere in the HTML, then the <c>href</c>
        /// targets of anchor tags.
        /// </summary>
        /// <param name="url">Address of the page to scan for links.</param>
        /// <returns>
        /// The distinct links in order of discovery; an empty list when no HTML was obtained.
        /// </returns>
        public static List<string> GetHttpLinks(string url)
        {
            List<string> links = new List<string>();

            string html = HttpHelper.HtmlCodeRequest(url);
            if (string.IsNullOrEmpty(html))
            {
                return links;
            }

            // Mirrors the contents of "links" for O(1) duplicate checks.
            HashSet<string> seen = new HashSet<string>();

            // Pass 1: absolute http(s) URLs anywhere in the markup.
            const string pattern2 = @"http(s)?://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?";
            foreach (Match url2 in Regex.Matches(html, pattern2, RegexOptions.IgnoreCase))
            {
                string candidate = url2.Value;
                // NOTE(review): a match is skipped when CheckUrlIsLegal returns true, which reads
                // as inverted. StringHelper is not visible here - confirm what the helper reports.
                if (StringHelper.CheckUrlIsLegal(candidate) || !StringHelper.IsPureUrl(candidate) || seen.Contains(candidate))
                {
                    continue;
                }
                seen.Add(candidate);
                links.Add(candidate);
            }

            // Pass 2: href targets of <a> tags, ignoring javascript: and __doPostBack pseudo links.
            const string pattern = @"(?i)<a\s[^>]*?href=(['""]?)(?!javascript|__doPostBack)(?<url>[^'""\s*#<>]+)[^>]*>";
            foreach (Match url1 in Regex.Matches(html, pattern, RegexOptions.IgnoreCase))
            {
                string href1 = url1.Groups["url"].Value;
                // Anything that does not start with a scheme is treated as relative and prefixed
                // with the configured site root. Checking the prefix (rather than "contains")
                // keeps relative paths such as "/go?to=http://..." from being taken as absolute.
                if (!href1.StartsWith("http", StringComparison.OrdinalIgnoreCase))
                {
                    href1 = Global.WebUrl + href1;
                }
                if (!StringHelper.IsPureUrl(href1) || seen.Contains(href1))
                {
                    continue;
                }
                seen.Add(href1);
                links.Add(href1);
            }
            return links;
        }

    这边下载图片有个任务条数限制,限制是200条。如果超过的话线程等待5秒,这里下载图片是异步调用的委托

    /// <summary>
    /// Downloads the image at <paramref name="url"/> and saves it under a freshly generated
    /// GUID file name in the folder given by <c>Global.FloderUrl</c>.
    /// </summary>
    /// <param name="url">
    /// Image address. A value that does not start with "http" is treated as relative and is
    /// prefixed with <c>Global.WebUrl</c>.
    /// </param>
    /// <returns>
    /// The saved file name ("guid.extension") on success; otherwise a message starting with
    /// "错误:" describing the failure. This method never throws.
    /// </returns>
    public string DownLoadimg(string url)
    {
        if (string.IsNullOrEmpty(url))
        {
            return "错误:地址为空";
        }
        try
        {
            if (!url.StartsWith("http", StringComparison.OrdinalIgnoreCase))
            {
                url = Global.WebUrl + url;
            }
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.Timeout = 2000;
            // Header value only: the property adds the "User-Agent:" name itself.
            request.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705)";
            // Follow HTTP redirects (e.g. 302) automatically.
            request.AllowAutoRedirect = true;

            // using blocks release the connection and the file handle even when the copy fails.
            using (WebResponse response = request.GetResponse())
            using (Stream reader = response.GetResponseStream())
            {
                string fileName = Guid.NewGuid().ToString() + "."
                    + GetImageExtension(request.RequestUri, response.ContentType);
                // FileMode.Create truncates, so a file can never keep stale trailing bytes.
                using (FileStream writer = new FileStream(Global.FloderUrl + fileName, FileMode.Create, FileAccess.Write))
                {
                    reader.CopyTo(writer);
                }
                return fileName;
            }
        }
        catch (Exception)
        {
            return "错误:地址" + url;
        }
    }

    // Chooses a file extension (without the dot) for a downloaded image: the extension of the
    // URL path when it has one, else the subtype of an "image/..." content type, else "jpg".
    // Looking only at the path keeps query strings and host names out of the file name.
    private static string GetImageExtension(Uri address, string contentType)
    {
        string fromPath = Path.GetExtension(address.AbsolutePath).TrimStart('.');
        if (IsAlphaNumeric(fromPath))
        {
            return fromPath;
        }

        const string imagePrefix = "image/";
        if (!string.IsNullOrEmpty(contentType)
            && contentType.StartsWith(imagePrefix, StringComparison.OrdinalIgnoreCase))
        {
            string subtype = contentType.Substring(imagePrefix.Length);
            // Drop parameters and suffixes: "svg+xml; charset=utf-8" -> "svg".
            int end = subtype.IndexOfAny(new char[] { ';', '+', ' ' });
            if (end >= 0)
            {
                subtype = subtype.Substring(0, end);
            }
            if (IsAlphaNumeric(subtype))
            {
                return subtype.ToLowerInvariant();
            }
        }
        return "jpg";
    }

    // True when the text is non-empty and consists solely of letters and digits,
    // i.e. it is safe to use as a file extension.
    private static bool IsAlphaNumeric(string text)
    {
        if (string.IsNullOrEmpty(text))
        {
            return false;
        }
        foreach (char c in text)
        {
            if (!char.IsLetterOrDigit(c))
            {
                return false;
            }
        }
        return true;
    }

    话不多说,更多的需要大家自己去改进咯!

  • 相关阅读:
    #include <sys/stat.h>的作用
    如何使用SecureCRT连接vmware下ubuntu
    64位CentOS安装32位开发环境编译Nachos
    Apache安装完服务没有安装的情况
    Java实现八皇后
    动态规划初级练习(二):BadNeighbors
    打造你的办公环境-email篇
    IRC配置for open source community
    static wechat red package tool
    Trafic control 大框图(HTB )
  • 原文地址:https://www.cnblogs.com/arxive/p/5885082.html
Copyright © 2020-2023  润新知