• HtmlAgilityPack Sample


    通过 HtmlAgilityPack 解析 HTML Table 获取其内部数据,并找出其中的重复值(仅输出重复项,不修改原数据)。

                    // Purpose: read the rows of the HTML table with id "DDetail2" from a saved page,
                    // collect each row's id/content pair, and print every content value that occurs
                    // more than once together with the id of its first occurrence.
                    HtmlAgilityPack.HtmlWeb hw = new HtmlAgilityPack.HtmlWeb();
                    // Load the local file (it was captured earlier with a System.Net.Http.HttpClient POST).
                    HtmlAgilityPack.HtmlDocument doc = hw.Load(dir + "2019-12-03.html");
                    // Root node of the parsed document.
                    HtmlAgilityPack.HtmlNode rootNode = doc.DocumentNode;
                    // Select Table/tbody. The inner double quotes must be escaped, otherwise the
                    // string literal ends early and the snippet does not compile.
                    string xpath = "//*[@id=\"DDetail2\"]/tbody";
                    HtmlAgilityPack.HtmlNode node = rootNode.SelectSingleNode(xpath);
                    // SelectSingleNode yields null when the XPath matches nothing; skip all work in
                    // that case instead of failing with a NullReferenceException.
                    if (node != null)
                    {
                        // Drop the #Text children sitting between the <tr> elements so that ChildNodes
                        // only holds rows. ToArray() snapshots the sequence because Remove() mutates the tree.
                        foreach (var textNode in node.Descendants("#Text").ToArray())
                        {
                            textNode.Remove();
                        }
                    }
                    if (node != null && node.ChildNodes.Count > 1)
                    {
                        List<dailyDetail> li = new List<dailyDetail>();
                        // node.ChildNodes.Count - 1 skips the last row, which is the "new entry" row.
                        for (int i = 0; i < node.ChildNodes.Count - 1; i++)
                        {
                            // Child lookups use XPath relative to the current row.
                            var row = node.ChildNodes[i];
                            var id = row.SelectSingleNode("td[1]/input[2]");
                            var text = row.SelectSingleNode("td[2]/input");
                            // Rows that do not contain both inputs carry no detail record; skip them
                            // rather than dereferencing null.
                            if (id == null || text == null)
                            {
                                continue;
                            }
                            // Attributes["value"] is null-checked so a missing attribute yields a null
                            // field; null contents are filtered out by the query below.
                            li.Add(new dailyDetail() { dailyDetailId = id.Attributes["value"]?.Value, dailyContent = text.Attributes["value"]?.Value });
                        }
                        // Find duplicated contents. Each group keeps its elements in list order, so
                        // g.First() is the first record with that content; no second scan of li is needed.
                        var query = li.Where(dd => dd.dailyContent != null)
                                      .GroupBy(dd => dd.dailyContent)
                                      .Where(g => g.Count() > 1)
                                      .Select(g => new
                                      {
                                          dailyContent = g.Key,
                                          firstId = g.First().dailyDetailId
                                      })
                                      .ToList();

                        foreach (var item in query)
                        {
                            Console.WriteLine($"重复值:{item.dailyContent}");
                            Console.WriteLine($"首个Id:{item.firstId}");
                        }
                    }
    

      

  • 相关阅读:
    python中如何对数据进行各种排序?
    js原型链
    js局部变量,参数
    计算字符串中每个字符出现次数
    推荐几个web中常用js图表插件
    getElementsByTagName("div")和$("div")区别
    Hadoop集群(第6期)JDK和SSH无密码配置
    Hadoop集群(第5期)SecureCRT使用
    Hadoop集群(第4期)VSFTP安装配置
    /etc/vsftpd/vsftpd.conf
  • 原文地址:https://www.cnblogs.com/honk/p/12883675.html
Copyright © 2020-2023  润新知