• HtmlAgilityPack组件


    HtmlAgilityPack组件用于解析Html字符串,一个典型的应用场景是用于网页爬虫。

    示例程序

    using Common.Tools;
    using Datebase.Entity;
    using HtmlAgilityPack;
    using Http.Extension;
    using ServiceStack.Orm.Extension.Imples;
    using ServiceStack.Orm.Extension.Interface;
    using ServiceStack.OrmLite;
    using System;
    using System.Collections.Generic;
    using System.Configuration;
    using System.Linq;
    using System.Text;
    using System.Text.RegularExpressions;
    using System.Threading.Tasks;
    
    namespace WebSpider
    {
        /// <summary>
        /// Console crawler: downloads a singer ranking page, follows every singer link found on it,
        /// parses each singer's detail page and inserts the result through <see cref="dbClient"/>.
        /// </summary>
        class Program
        {
            /// <summary>
            /// ORM client used to insert the scraped singers. Built from the "mssql" connection string
            /// with the SQL Server dialect provider.
            /// </summary>
            public static IOrmClient dbClient = new OrmClient(ConfigurationManager.ConnectionStrings["mssql"].ConnectionString, SqlServerDialect.Provider);

            /// <summary>
            /// Matches a date written as "[year]年month月day日" (the year may be absent).
            /// The digit escapes are required: without them the pattern looks for literal 'd' characters.
            /// </summary>
            private static readonly Regex BirthdayPattern = new Regex(@"\d{0,4}年\d{1,2}月\d{1,2}日");

            /// <summary>
            /// The unit characters that separate the numeric parts of a date matched by <see cref="BirthdayPattern"/>.
            /// </summary>
            private static readonly string[] BirthdaySeparators = { "年", "月", "日" };

            /// <summary>
            /// Entry point: starts the crawl tasks, waits for all of them, then waits for a key press.
            /// </summary>
            /// <param name="args">Command-line arguments (unused).</param>
            static void Main(string[] args)
            {
                List<Task> tasks = FetchSinger();
                // Blocks until every crawl task has completed; a faulted task surfaces here as an AggregateException.
                Task.WaitAll(tasks.ToArray());
                Console.WriteLine("歌手信息抓取完毕!");
                Console.ReadLine();
            }

            /// <summary>
            /// Crawler: fetches the "hottest singers" ranking page (top 100 singers) and starts one task
            /// for the top-10 list plus one task per ranking table.
            /// </summary>
            /// <returns>
            /// The started tasks; empty when the ranking page has no HTML or contains none of the expected nodes.
            /// </returns>
            private static List<Task> FetchSinger()
            {
                List<Task> tasks = new List<Task>();
                HttpResult result = HttpCore.Send(new HttpItem()
                {
                    URL = "http://mp3.sogou.com/static_new/topsinger_remen.html",
                    Method = MethodType.GET
                });
                // Same guard SingerDetail applies to its response: nothing to parse without HTML.
                if (string.IsNullOrEmpty(result.Html))
                {
                    return tasks;
                }

                HtmlDocument document = new HtmlDocument();
                document.LoadHtml(result.Html);
                var rootNode = document.DocumentNode;

                // Singers ranked 1 to 10. SelectNodes returns null (not an empty collection) when nothing matches.
                var top10Nodes = rootNode.SelectNodes("//div[@id='right2']/ul[@class='singerlist2']/li/a");
                if (top10Nodes != null)
                {
                    tasks.Add(Task.Factory.StartNew(state => FetchTop10Singers((HtmlNodeCollection)state), top10Nodes));
                }

                // Singers ranked 11 to 100: one task per captured table.
                var tbNodes = rootNode.SelectNodes("//table[@class='indextable']");
                if (tbNodes != null)
                {
                    foreach (var tbNode in tbNodes)
                    {
                        // The node is handed over as task state rather than captured by the lambda.
                        tasks.Add(Task.Factory.StartNew(state => FetchTableSingers((HtmlNode)state), tbNode));
                    }
                }
                return tasks;
            }

            /// <summary>
            /// Processes the anchors of the top-10 list: reads each singer's link and rank and crawls the detail page.
            /// </summary>
            /// <param name="singerNodes">Anchor nodes of the top-10 list.</param>
            private static void FetchTop10Singers(HtmlNodeCollection singerNodes)
            {
                foreach (var hrefNode in singerNodes)
                {
                    // Rank label of the singer; anchors without it are skipped.
                    var noNode = hrefNode.SelectSingleNode("./strong[@class='singertop10']");
                    if (noNode == null)
                    {
                        continue;
                    }

                    // Singer link; skipped when empty, consistent with the table rows.
                    var link = hrefNode.GetAttributeValue("href", "");
                    if (string.IsNullOrEmpty(link))
                    {
                        continue;
                    }

                    int sNo = ParseSerialNumber(noNode.InnerText.Replace("Top", "").Trim());
                    SingerDetail(sNo, link);
                }
            }

            /// <summary>
            /// Processes one ranking table: reads each singer's link and rank and crawls the detail page.
            /// </summary>
            /// <param name="tbNode">A table node with class "indextable".</param>
            private static void FetchTableSingers(HtmlNode tbNode)
            {
                var hrefNodes = tbNode.SelectNodes("./tbody/tr/td/a");
                if (hrefNodes == null)
                {
                    return;
                }

                foreach (var href in hrefNodes)
                {
                    // The rank is read from the node two siblings before the anchor's parent cell.
                    // NOTE(review): presumably the first sibling is a whitespace text node between cells - confirm against the page.
                    var previous = href.ParentNode != null ? href.ParentNode.PreviousSibling : null;
                    var rankNode = previous != null ? previous.PreviousSibling : null;

                    var sNo = -1;
                    if (rankNode != null)
                    {
                        sNo = ParseSerialNumber(rankNode.InnerText.Trim().TrimEnd('.'));
                    }

                    var link = href.GetAttributeValue("href", "");
                    if (!string.IsNullOrEmpty(link))
                    {
                        SingerDetail(sNo, link);
                    }
                }
            }

            /// <summary>
            /// Parses a rank label.
            /// </summary>
            /// <param name="text">Text expected to contain only the rank number.</param>
            /// <returns>The parsed rank, or -1 when the text is not a valid integer.</returns>
            private static int ParseSerialNumber(string text)
            {
                int sNo;
                // int.TryParse writes 0 to its out argument on failure, so the -1 sentinel has to be restored explicitly.
                return int.TryParse(text, out sNo) ? sNo : -1;
            }

            /// <summary>
            /// Downloads a singer's detail page through its link, parses it and inserts the singer into the database.
            /// Nothing is inserted when the response has no HTML.
            /// </summary>
            /// <param name="sNo">Serial number (rank) of the singer.</param>
            /// <param name="link">URL of the singer's detail page.</param>
            private static void SingerDetail(int sNo, string link)
            {
                var linkResult = HttpCore.Send(new HttpItem()
                {
                    URL = link,
                    Method = MethodType.GET
                });
                if (string.IsNullOrEmpty(linkResult.Html))
                {
                    return;
                }

                T_Singer user = new T_Singer();
                user.ID = Utility.GenerateId();
                user.SerialNumber = sNo;
                user.IsApprove = true;
                user.CreateBy = "admin";
                user.CreateDate = DateTime.Now;
                user.ModifyBy = "admin";
                user.ModifyDate = DateTime.Now;

                HtmlDocument linkDoc = new HtmlDocument();
                linkDoc.LoadHtml(linkResult.Html);

                // Name / nickname
                var name = linkDoc.DocumentNode.SelectSingleNode("//div[@class='song_tit']");
                if (name != null)
                {
                    user.RealName = user.NickName = name.InnerText.Trim().Replace("<br>", System.Environment.NewLine);
                }

                // All li elements that carry the personal information
                var lis = linkDoc.DocumentNode.SelectNodes("//ul[@class='song_detail']/li");
                // Nationality
                user.Nationality = Search(lis, "国籍");
                // Birthplace
                user.Birthplace = Search(lis, "出生地");
                // Date of birth; left unassigned when the page has no parseable date.
                DateTime bDay;
                if (TryParseBirthday(Search(lis, "出生日期"), out bDay))
                {
                    user.Birthday = bDay;
                }
                // Constellation
                user.Constellation = Search(lis, "星座");

                // Biography: prefer the long description, fall back to the short one.
                var selfDescNode = linkDoc.GetElementbyId("desc_long");
                selfDescNode = selfDescNode ?? linkDoc.GetElementbyId("desc_short");
                if (selfDescNode != null)
                {
                    user.BriefIntroduction = selfDescNode.InnerText.Replace("<br>", "").Trim();
                }

                // NOTE(review): this runs concurrently on several tasks - confirm that IOrmClient.Insert is thread-safe.
                dbClient.Insert(user);
            }

            /// <summary>
            /// Extracts a date of the form "[year]年month月day日" from the given text.
            /// </summary>
            /// <param name="text">Text that may contain a date.</param>
            /// <param name="birthday">The parsed date when the method returns true; otherwise the default DateTime.</param>
            /// <returns>True when a date was found and parsed successfully.</returns>
            private static bool TryParseBirthday(string text, out DateTime birthday)
            {
                birthday = default(DateTime);
                // Regex.Match never returns null; Success is the only reliable indicator of a match.
                var match = BirthdayPattern.Match(text ?? string.Empty);
                if (!match.Success)
                {
                    return false;
                }

                // "1979年1月18日" -> { "1979", "1", "18" } -> "1979-1-18"
                var parts = match.Value.Split(BirthdaySeparators, StringSplitOptions.RemoveEmptyEntries);
                // The string is machine-built, so it is parsed with the invariant culture rather than the current one.
                return DateTime.TryParse(
                    string.Join("-", parts),
                    System.Globalization.CultureInfo.InvariantCulture,
                    System.Globalization.DateTimeStyles.None,
                    out birthday);
            }

            /// <summary>
            /// Looks through the nodes for the one whose first child text starts with the given key
            /// and returns the text of its span child.
            /// </summary>
            /// <param name="nodes">Candidate nodes; may be null.</param>
            /// <param name="key">Label prefix to look for.</param>
            /// <returns>
            /// The trimmed span text of the first node that matches the key and has a span child;
            /// an empty string when there is none.
            /// </returns>
            private static string Search(HtmlNodeCollection nodes, string key)
            {
                if (nodes == null)
                {
                    return string.Empty;
                }

                foreach (var node in nodes)
                {
                    // An element without children has no label to compare.
                    var label = node.FirstChild;
                    if (label == null || !label.InnerText.Trim().StartsWith(key, StringComparison.Ordinal))
                    {
                        continue;
                    }

                    var spanNode = node.SelectSingleNode("./span");
                    if (spanNode != null)
                    {
                        return spanNode.InnerText.Trim().Replace("<br>", System.Environment.NewLine);
                    }
                }
                return string.Empty;
            }
        }
    }
    View Code
  • 相关阅读:
    SpringBoot与(Security)安全
    SpringBoot任务
    SSM框架整合思路
    数据库连接池 Druid和C3p0
    YAML语法:
    Mybatis使用
    为什么要使用Mybatis-现有持久化技术的对比
    SpringMVC拦截器
    ARC109D
    一类求斯坦纳树大小的问题
  • 原文地址:https://www.cnblogs.com/Jabben_Yi/p/5720431.html
Copyright © 2020-2023  润新知