• 使用 WebBrowser 获取Ajax动态加载网页信息


    直接上代码(代码较粗糙,可根据需要优化):

    WebBrowser 是 ActiveX 控件,必须在单线程单元(STA)线程中创建;在控制台程序中直接使用时会抛出 ThreadStateException(当前线程不在单线程单元中)。我的解决方法是:在 Main 入口方法上添加 “STAThread” 特性,指定线程模型为单线程单元

    [STAThread]
    static void Main(string[] args)

    using System;
    using System.Collections.Generic;
    using System.Collections.Specialized;
    using System.IO;
    using System.Net;
    using System.Text;
    using System.Text.RegularExpressions;
    using System.Windows.Forms;
    
    namespace CrawlerTest
    {
        /// <summary>
        /// Helpers for downloading pages whose content is filled in by script after
        /// the initial load, and for extracting data from the rendered markup.
        /// </summary>
        /// <remarks>
        /// <see cref="DownloadAjaxHtml(string)"/> hosts a WinForms <see cref="WebBrowser"/>,
        /// so it must be called from a thread in a single-threaded apartment
        /// (for example a <c>Main</c> method marked with <c>[STAThread]</c>).
        /// </remarks>
        public class HttpHelper
        {
            /// <summary>Default minimum time to keep pumping messages so scripts can run.</summary>
            private const double DefaultMinWaitSeconds = 3;

            /// <summary>Default upper bound on the total wait, so a stalled page cannot hang the caller.</summary>
            private const double DefaultMaxWaitSeconds = 30;

            /// <summary>Pause between message-pump iterations, in milliseconds.</summary>
            private const int PollIntervalMilliseconds = 10;

            /// <summary>
            /// Downloads the rendered HTML body of a page, using the default wait limits.
            /// </summary>
            /// <param name="url">Address of the page to load.</param>
            /// <returns>
            /// The URL-decoded outer HTML of the document body, or <c>null</c> when the
            /// page could not be loaded within the time limit or an error occurred.
            /// </returns>
            public static string DownloadAjaxHtml(string url)
            {
                return DownloadAjaxHtml(url, DefaultMinWaitSeconds, DefaultMaxWaitSeconds);
            }

            /// <summary>
            /// Downloads the rendered HTML body of a page with explicit wait limits.
            /// </summary>
            /// <param name="url">Address of the page to load.</param>
            /// <param name="minWaitSeconds">
            /// Minimum time to keep processing browser events before the result is read,
            /// even if the browser already reports the document as complete.
            /// </param>
            /// <param name="maxWaitSeconds">
            /// Maximum total time to wait. When it elapses before the document is
            /// complete, the method gives up and returns <c>null</c>. If this is smaller
            /// than <paramref name="minWaitSeconds"/>, the maximum wins.
            /// </param>
            /// <returns>
            /// The URL-decoded outer HTML of the document body, or <c>null</c> on
            /// timeout, missing body, invalid URL, or any exception (which is logged
            /// to the console rather than propagated).
            /// </returns>
            public static string DownloadAjaxHtml(string url, double minWaitSeconds, double maxWaitSeconds)
            {
                if (string.IsNullOrWhiteSpace(url))
                {
                    Console.WriteLine("DownloadAjaxHtml-Error:url is null or empty");
                    return null;
                }

                string htmlstr = null;
                try
                {
                    // WebBrowser wraps a native control; dispose it so repeated calls do not leak.
                    using (WebBrowser wb = new WebBrowser())
                    {
                        wb.AllowNavigation = true;
                        wb.ScriptErrorsSuppressed = true;
                        wb.Navigate(url);

                        // Stopwatch is monotonic, unlike DateTime.Now which follows clock adjustments.
                        var clock = System.Diagnostics.Stopwatch.StartNew();
                        while (true)
                        {
                            double waited = clock.Elapsed.TotalSeconds;
                            bool settled = waited > minWaitSeconds
                                && wb.ReadyState == WebBrowserReadyState.Complete;
                            if (settled || waited >= maxWaitSeconds)
                            {
                                break;
                            }

                            // The browser only makes progress while window messages are pumped.
                            Application.DoEvents();

                            // Yield briefly instead of spinning a core at 100%.
                            System.Threading.Thread.Sleep(PollIntervalMilliseconds);
                        }

                        if (wb.ReadyState != WebBrowserReadyState.Complete)
                        {
                            Console.WriteLine($"DownloadAjaxHtml-Error:timed out after {maxWaitSeconds} seconds loading {url}");
                        }
                        else if (wb.Document == null || wb.Document.Body == null)
                        {
                            Console.WriteLine($"DownloadAjaxHtml-Error:document has no body for {url}");
                        }
                        else
                        {
                            // NOTE(review): UrlDecode also turns literal '+' into spaces and
                            // expands '%xx' sequences anywhere in the markup; kept because
                            // the original code relies on it - confirm it is really wanted.
                            htmlstr = System.Web.HttpUtility.UrlDecode(wb.Document.Body.OuterHtml);
                        }
                    }
                }
                catch (Exception ex)
                {
                    Console.WriteLine($"DownloadAjaxHtml-Error:{ex.ToString()}");
                }

                return htmlstr;
            }

            /// <summary>
            /// Loads the news page and extracts one entry per <c>li</c> found under
            /// <c>ul</c> elements with class <c>htList</c>.
            /// </summary>
            /// <param name="encoding">Not used by the current implementation; kept for caller compatibility.</param>
            /// <returns>
            /// The extracted entries, each also printed to the console as JSON. The list
            /// is empty (never <c>null</c>) when the page could not be downloaded or
            /// contains no matching nodes.
            /// </returns>
            public static List<NewsHotTitle> GetHotTitle(Encoding encoding)
            {
                var list = new List<NewsHotTitle>();
                var url = "http://www.news.cn/2021homepro/rsznb/";

                string strHtml = HttpHelper.DownloadAjaxHtml(url);
                if (string.IsNullOrEmpty(strHtml))
                {
                    Console.WriteLine($"获取数据失败");
                    return list;
                }

                // Fully qualified: System.Windows.Forms (imported by this file) also declares
                // an HtmlDocument type, which would otherwise be picked up or be ambiguous.
                var doc = new HtmlAgilityPack.HtmlDocument();
                doc.LoadHtml(strHtml);

                // SelectNodes yields null when nothing matches, so both cases are checked.
                var hotlist = doc.DocumentNode.SelectNodes("//ul[@class='htList']//li");
                if (hotlist == null || hotlist.Count == 0)
                {
                    Console.WriteLine($"获取数据失败");
                    return list;
                }

                foreach (var item in hotlist)
                {
                    NewsHotTitle model = new NewsHotTitle();
                    model.Title = HttpHelper.RemoveHtml(item.InnerHtml);
                    model.PublishTime = DateTime.Now;

                    Console.WriteLine($"{model.ToJson()}");

                    // Collect the entry; previously it was built and then discarded.
                    list.Add(model);
                }

                return list;
            }
        }
    }
    

      

  • 相关阅读:
    python易错点2
    python易错点1
    Spring:注解组件注册
    KubeSphere建立多租户系统
    Centos7.7下KubeSphere最小化安装和定制化配置安装
    安装create-react-app
    查看react版本等信息
    react自动生成React组件命令
    sqlserver 随机数
    Error while downloading 'http://java.sun.com/xml/ns/javaee/javaee_web_services_client_1_2.xsd'
  • 原文地址:https://www.cnblogs.com/mlinber/p/15683825.html
Copyright © 2020-2023  润新知