• Implementing a Web Crawler in C#


    HTTP request utility class (features: 1. fetch a page's HTML; 2. download images from the web):

    using System;
    using System.Collections.Generic;
    using System.Drawing;
    using System.IO;
    using System.Linq;
    using System.Net;
    using System.Text;
    using System.Threading.Tasks;
    using System.Windows.Forms;
    
    namespace Utils
    {
        /// <summary>
        /// HTTP request utility class
        /// </summary>
        public class HttpRequestUtil
        {
            /// <summary>
            /// Get the HTML of a page
            /// </summary>
            public static string GetPageHtml(string url)
            {
                // Set up the request
                HttpWebRequest request = WebRequest.Create(url) as HttpWebRequest;
                request.UserAgent = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)";
                // The request is not actually sent to the target page until GetResponse() is called
                using (HttpWebResponse response = request.GetResponse() as HttpWebResponse)
                using (Stream responseStream = response.GetResponseStream())
                using (StreamReader sr = new StreamReader(responseStream, Encoding.UTF8))
                {
                    // Read and return the page (HTML) source
                    return sr.ReadToEnd();
                }
            }
    
            /// <summary>
            /// Download a file (an image) over HTTP
            /// </summary>
            public static void HttpDownloadFile(string url, int minWidth, int minHeight)
            {
                int pos = url.LastIndexOf("/") + 1;
                string fileName = url.Substring(pos);
                string path = Application.StartupPath + "\\download";
                if (!Directory.Exists(path))
                {
                    Directory.CreateDirectory(path);
                }
                string filePathName = path + "\\" + fileName;
                if (File.Exists(filePathName)) return;

                // Set up the request
                HttpWebRequest request = WebRequest.Create(url) as HttpWebRequest;
                request.UserAgent = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)";
                // Bypass proxy auto-detection, which can slow down the first request
                request.Proxy = null;
                // The request is not actually sent to the target page until GetResponse() is called
                using (HttpWebResponse response = request.GetResponse() as HttpWebResponse)
                using (Stream responseStream = response.GetResponseStream())
                using (MemoryStream memoryStream = new MemoryStream())
                {
                    // Buffer the whole response in memory so the image size can be checked first
                    byte[] bArr = new byte[1024];
                    int size = responseStream.Read(bArr, 0, bArr.Length);
                    while (size > 0)
                    {
                        memoryStream.Write(bArr, 0, size);
                        size = responseStream.Read(bArr, 0, bArr.Length);
                    }

                    using (Image tempImage = System.Drawing.Image.FromStream(memoryStream, true))
                    {
                        // Only save images that meet the minimum dimensions
                        if (tempImage.Height >= minHeight && tempImage.Width >= minWidth)
                        {
                            memoryStream.Seek(0, SeekOrigin.Begin);
                            size = memoryStream.Read(bArr, 0, bArr.Length);
                            using (FileStream fs = new FileStream(filePathName, FileMode.Create))
                            {
                                while (size > 0)
                                {
                                    fs.Write(bArr, 0, size);
                                    size = memoryStream.Read(bArr, 0, bArr.Length);
                                }
                            }
                        }
                    }
                }
            }
        }
    }
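
    For reference, a minimal usage sketch of the class above. The example.com URLs and the 300×300 threshold are placeholder values, not from the original post:

    using System;
    using Utils;

    class Demo
    {
        static void Main()
        {
            // Fetch a page's HTML and print its length
            string html = HttpRequestUtil.GetPageHtml("http://example.com/");
            Console.WriteLine(html.Length);

            // Download an image, keeping it only if it is at least 300x300 pixels
            HttpRequestUtil.HttpDownloadFile("http://example.com/sample.jpg", 300, 300);
        }
    }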

    The VisitedHelper class:

    using System;
    using System.Collections.Generic;
    using System.IO;
    using System.Linq;
    using System.Text;
    using System.Threading.Tasks;
    using System.Windows.Forms;
    
    namespace Utils
    {
        /// <summary>
        /// List of URLs that have already been visited
        /// </summary>
        public class VisitedHelper
        {
            private static List<string> m_VisitedList = new List<string>();
            // Lock object: the list is read and written from multiple crawler threads
            private static object m_Lock = new object();

            #region Check whether a URL has been visited
            /// <summary>
            /// Check whether a URL has been visited
            /// </summary>
            public static bool IsVisited(string url)
            {
                lock (m_Lock)
                {
                    return m_VisitedList.Contains(url);
                }
            }
            #endregion

            #region Mark a URL as visited
            /// <summary>
            /// Mark a URL as visited
            /// </summary>
            public static void Add(string url)
            {
                lock (m_Lock)
                {
                    m_VisitedList.Add(url);
                }
            }
            #endregion

        }
    }
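
    Note that even with per-call locking, the crawler calls IsVisited and Add as two separate steps, so two threads can still race on the same URL. A sketch of an atomic test-and-add variant (the VisitedSetHelper and TryAdd names are hypothetical, not from the original post); a HashSet also makes lookups O(1) instead of a linear scan:

    using System.Collections.Generic;

    namespace Utils
    {
        public class VisitedSetHelper
        {
            private static HashSet<string> m_Visited = new HashSet<string>();
            private static object m_Lock = new object();

            /// <summary>
            /// Returns true only for the first caller per URL, so the
            /// check-and-mark step cannot race between crawler threads.
            /// </summary>
            public static bool TryAdd(string url)
            {
                lock (m_Lock)
                {
                    // HashSet<T>.Add returns false if the URL was already present
                    return m_Visited.Add(url);
                }
            }
        }
    }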

    Multi-threaded page-crawling code:

    using System;
    using System.Collections.Generic;
    using System.ComponentModel;
    using System.Data;
    using System.Drawing;
    using System.IO;
    using System.Linq;
    using System.Text;
    using System.Text.RegularExpressions;
    using System.Threading;
    using System.Threading.Tasks;
    using System.Windows.Forms;
    using Utils;
    
    namespace 爬虫
    {
        public partial class Form1 : Form
        {
            private static int m_MinWidth = 300;
            private static int m_MinHeight = 300;
            private static int m_CompletedCount = 0;
    
            public Form1()
            {
                InitializeComponent();
            }
    
            private void button1_Click(object sender, EventArgs e)
            {
                ThreadPool.SetMaxThreads(100, 100);
                int.TryParse(txtMinWidth.Text, out m_MinWidth);
                int.TryParse(txtMinHeight.Text, out m_MinHeight);
                button1.Enabled = false;
                lblMsg.Text = "正在爬取图片…";
                timer1.Start();
                new Thread(new ThreadStart(delegate()
                {
                    Crawling(txtUrl.Text, null);
                })).Start();
            }
    
            /// <summary>
            /// Crawl a page: download qualifying images, then follow its links
            /// </summary>
            private void Crawling(string url, string host)
            {
                if (!VisitedHelper.IsVisited(url))
                {
                    VisitedHelper.Add(url);
    
                    if (host == null)
                    {
                        host = GetHost(url);
                    }
    
                    string pageHtml = HttpRequestUtil.GetPageHtml(url);
                    Regex regA = new Regex(@"<a[\s]+[^<>]*href=(?:""|')([^<>""']+)(?:""|')[^<>]*>[^<>]+</a>", RegexOptions.IgnoreCase);
                    Regex regImg = new Regex(@"<img[\s]+[^<>]*src=(?:""|')([^<>""']+(?:jpg|jpeg|png|gif))(?:""|')[^<>]*>", RegexOptions.IgnoreCase);
    
                    MatchCollection mcImg = regImg.Matches(pageHtml);
                    foreach (Match mImg in mcImg)
                    {
                        string imageUrl = mImg.Groups[1].Value;
                        try
                        {
                            // Read both dimensions from the <img> tag's attributes or inline style
                            int imageWidth = GetImageWidthOrHeight(mImg.Value, true);
                            int imageHeight = GetImageWidthOrHeight(mImg.Value, false);
                            if (imageWidth >= m_MinWidth && imageHeight >= m_MinHeight)
                            {
                                if (imageUrl.IndexOf("javascript") == -1)
                                {
                                    if (imageUrl.IndexOf("http") == 0)
                                    {
                                        HttpRequestUtil.HttpDownloadFile(imageUrl, m_MinWidth, m_MinHeight);
                                    }
                                    else
                                    {
                                        HttpRequestUtil.HttpDownloadFile(host + imageUrl, m_MinWidth, m_MinHeight);
                                    }
                                }
                            }
                        }
                        catch { }
                    }
    
                    // Recursively follow the links on this page
                    MatchCollection mcA = regA.Matches(pageHtml);
                    foreach (Match mA in mcA)
                    {
                        try
                        {
                            string nextUrl = mA.Groups[1].Value;
                            if (nextUrl.IndexOf("javascript") == -1)
                            {
                                if (nextUrl.IndexOf("http") == 0)
                                {
                                    if (GetHost(url) == host)
                                    {
                                        ThreadPool.QueueUserWorkItem(new WaitCallback(delegate(object obj)
                                        {
                                            try
                                            {
                                                Crawling(nextUrl, host);
                                                Interlocked.Increment(ref m_CompletedCount); // counter is shared across threads
                                            }
                                            catch { }
                                        }));
                                    }
                                }
                                else
                                {
                                    if (GetHost(url) == host)
                                    {
                                        ThreadPool.QueueUserWorkItem(new WaitCallback(delegate(object obj)
                                        {
                                            try
                                            {
                                                Crawling(host + nextUrl, host);
                                                Interlocked.Increment(ref m_CompletedCount); // counter is shared across threads
                                            }
                                            catch { }
                                        }));
                                    }
                                }
                            }
                        }
                        catch { }
                    }
                }
            } // end of the Crawling method
    
            /// <summary>
            /// Get the host part of a URL (scheme and authority, with a trailing slash)
            /// </summary>
            private string GetHost(string url)
            {
                Regex regHost = new Regex(@"(?:http|https)://[a-z0-9-.:]+", RegexOptions.IgnoreCase);
                Match mHost = regHost.Match(url);
                return mHost.Value + "/";
            }
    
            // Timer tick: poll the thread pool to estimate whether the crawl has finished
            private void timer1_Tick(object sender, EventArgs e)
            {
                int workerThreads;
                int completionPortThreads;
                ThreadPool.GetAvailableThreads(out workerThreads, out completionPortThreads);
                // Heuristic: if all 100 worker threads are idle again and at least
                // one page has completed, the crawl is assumed to be finished
                if (workerThreads == 100 && m_CompletedCount > 0)
                {
                    lblMsg.Text = "Finished";
                }
                else
                {
                    lblMsg.Text = "Crawling images…";
                }
            }
    
            /// <summary>
            /// Get the image width or height declared in an <img> tag
            /// </summary>
            private int GetImageWidthOrHeight(string imageTagString, bool isWidth)
            {
                string tag = isWidth ? "width" : "height";
                Regex reg = new Regex(string.Format(@"{0}=""([\d.]+)""", tag), RegexOptions.IgnoreCase);
                Match match = reg.Match(imageTagString);
                if (match.Success)
                {
                    return (int)Convert.ToDouble(match.Groups[1].Value);
                }
                else
                {
                    reg = new Regex(string.Format(@"{0}[\s]*:[\s]*([\d.]+)[\s]*px[\s]*;", tag), RegexOptions.IgnoreCase);
                    match = reg.Match(imageTagString);
                    if (match.Success)
                    {
                        return (int)Convert.ToDouble(match.Groups[1].Value);
                    }
                }
                // No size declared in the tag: return MaxValue so the image is
                // downloaded anyway and its real size checked in HttpDownloadFile
                return int.MaxValue;
            }
    
        } // end of the Form1 class
    
        /// <summary>
        /// Delegate for accessing controls across threads
        /// </summary>
        public delegate void InvokeDelegate();
    }
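
    As an aside, HttpWebRequest/WebRequest.Create is the legacy HTTP API; on modern .NET the page-fetch step is usually written with HttpClient. A minimal sketch (the PageFetcher name and user-agent string are assumptions for the example, not from the original post):

    using System.Net.Http;
    using System.Threading.Tasks;

    class PageFetcher
    {
        // HttpClient is designed to be created once and reused across requests
        private static readonly HttpClient s_Client = new HttpClient();

        static PageFetcher()
        {
            s_Client.DefaultRequestHeaders.UserAgent.ParseAdd("Mozilla/5.0 (compatible; crawler)");
        }

        public static Task<string> GetPageHtmlAsync(string url)
        {
            // GetStringAsync throws an HttpRequestException on non-success status codes
            return s_Client.GetStringAsync(url);
        }
    }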

    Screenshot: (image omitted)

• Original post: https://www.cnblogs.com/s0611163/p/5170263.html