• HtmlAgilityPack XPath 用法


    <div class="m-repbox"><!--/html/body/div-->
            <div class="m-repbody firstPage"><!--/html/body/div/div-->
    <div class="t1">基本信息</div>
    <div class="g-tt-h3 f-tleft f-mgtop">基本概况信息</div><!--/html/body/div/div[1]/div[2]-->
    <table class="g-tab-bor f-tab-nomargin">
                    <tr>
                        <th class="g-w-4">经济类型</th>
                        <td class="g-w-4 ">股份有限(公司)</td>
                        <th class="g-w-4">组织机构类型</th>
                        <td class="g-w-4 ">企业</td>
                    </tr>
                    <tr>
                        <th>企业规模</th>
                        <td class="">微型企业</td>
                        <th>所属行业</th>
                        <td class="">建材批发</td>
                    </tr>
    </table>
                <div class="g-tt-h3 f-tleft f-mgtop">实际控制人</div><!--/html/body/div/div[1]/div[3]-->
                <table class="g-tab-bor f-tab-nomargin">
                    <tr>
                        <th class="g-w-4">名称</th>
                        <th class="g-w-4">身份标识类型</th>
                        <th class="g-w-4">身份标识号码</th>
                        <th class="g-w-4">更新日期</th>
                    </tr>
                    <tbody class="">
                        <tr>
                            <td>控制人</td>
                            <td class="g-w-4">身份证</td>
                            <td class="g-w-4">*******************</td>
                            <td class="g-w-4">2017-03-01</td>
                        </tr>
                    </tbody>
                    <tbody class="">
                        <tr>
                            <td>控制人二二二二二</td>
                            <td class="g-w-4">组织机构代码</td>
                            <td class="g-w-4">***********</td>
                            <td class="g-w-4">2017-03-01</td>
                        </tr>
                    </tbody>
                </table>
    </div>
    </div>
    NuGet 引入 HtmlAgilityPack 包
    
    
    HtmlDocument htmlDoc;
    
            /// <summary>
            /// Parses the supplied HTML text into a fresh <c>HtmlDocument</c> and
            /// stores it in the <c>htmlDoc</c> field, replacing any previous document.
            /// </summary>
            /// <param name="htmlSource">The HTML markup to parse.</param>
            public void LoadHtml(string htmlSource)
            {
                var document = new HtmlDocument();

                // Publish the new document before parsing so the field is
                // replaced even if parsing throws.
                htmlDoc = document;
                document.LoadHtml(htmlSource);
            }
    
            /// <summary>
            /// Finds the first node selected by <paramref name="xPath"/> whose inner
            /// text matches <paramref name="keyword"/> and reports its 1-based position
            /// within the selected node list.
            /// </summary>
            /// <param name="xPath">XPath expression evaluated against the loaded document.</param>
            /// <param name="keyword">Pattern passed to <c>Regex.IsMatch</c> and tested against each node's inner text.</param>
            /// <returns>
            /// The 1-based position of the first matching node, or <c>int.MinValue</c>
            /// when the XPath selects nothing or no node matches.
            /// </returns>
            public int GetNodeIndexByKeyword(string xPath, string keyword)
            {
                var candidates = htmlDoc.DocumentNode.SelectNodes(xPath);
                if (candidates == null)
                {
                    return int.MinValue;
                }

                // XPath positions are 1-based, so count from 1 while scanning.
                var position = 0;
                foreach (var candidate in candidates)
                {
                    position++;
                    if (Regex.IsMatch(candidate.InnerText, keyword))
                    {
                        return position;
                    }
                }

                return int.MinValue;
            }
    
    /// <summary>
    /// Starting from the <paramref name="divIndex"/>-th node of <paramref name="divPath"/>,
    /// moves to the first <c>table</c> sibling that follows it, then back to the
    /// <c>div</c> sibling immediately preceding that table, and returns the positional
    /// index taken from the last segment of that div's XPath.
    /// </summary>
    /// <param name="divPath">XPath of the candidate div nodes, without a trailing position predicate.</param>
    /// <param name="divIndex">1-based position appended to <paramref name="divPath"/> as <c>[divIndex]</c>.</param>
    /// <returns>
    /// The bracketed index found in the last XPath segment of the located div, or
    /// <c>int.MinValue</c> when no node is found or the index cannot be parsed.
    /// </returns>
    public int GetNodeIndex(string divPath, int divIndex)
    {
        var index = int.MinValue;

        // Example result:
        // "/html/body/div/div[4]/div[2]/following-sibling::table[1]/preceding-sibling::div[1]"
        var tableXPath = string.Format("{0}[{1}]/following-sibling::table[1]/preceding-sibling::div[1]", divPath, divIndex);
        var nodes = htmlDoc.DocumentNode.SelectNodes(tableXPath);
        if (nodes == null)
        {
            return index;
        }

        foreach (var node in nodes)
        {
            // Only the last path segment (e.g. "div[2]") carries the position we want.
            var nodePath = node.XPath;
            var lastSegment = nodePath.Substring(nodePath.LastIndexOf('/') + 1);

            // Take the text between the first '[' and the last ']' of that segment.
            var open = lastSegment.IndexOf('[');
            var close = lastSegment.LastIndexOf(']');
            if (open < 0 || close <= open)
            {
                continue;
            }

            var indexText = lastSegment.Substring(open + 1, close - open - 1);

            // Keep the int.MinValue sentinel when the segment has no numeric index,
            // instead of silently reporting 0.
            if (int.TryParse(indexText, out int parsed))
            {
                index = parsed;
            }
        }

        return index;
    }
    
            // Step 1: find which top-level section div contains the section title.
            var xPath = "/html/body/div/div";
            var keyword = "基本信息";
            var divIndex = GetNodeIndexByKeyword(xPath, keyword);

            // Step 2: inside that section, find the heading div for the sub-table.
            xPath = string.Format("/html/body/div/div[{0}]/div", divIndex);// e.g. "/html/body/div/div[4]/div"
            keyword = "基本概况信息";
            var divIndex2 = GetNodeIndexByKeyword(xPath, keyword);// e.g. 2

            // Step 3: from that heading, go to the next table and back to the div
            // directly before it; GetNodeIndex returns that div's position.
            var precedingSiblingIndex = GetNodeIndex(xPath, divIndex2);

            // The heading sits immediately before its table exactly when both positions
            // agree. Compare against divIndex2 (same sibling list), not the outer divIndex.
            var eq = divIndex2 == precedingSiblingIndex;
  • 相关阅读:
    python 3.x 不再提供raw_print()
    php中fileatim,filectime和filemtime函数的区别
    如何将文本以BLOB类型存入数据库并取出
    tomcat启动不了——Error initializing endpoint——java.net.BindException: Address already in use: JVM_Bind
    hdu 2188(巴什博弈入门 )
    hdu 2187(贪心)
    Sentinel数据处理工具包SNAP Python开发环境搭建
    conda 安装包命令
    使用SSH连接WSL
    win10 安装WSL 出现 WslRegisterDistribution failed with error: 0x8000000d
  • 原文地址:https://www.cnblogs.com/hofmann/p/16643211.html
Copyright © 2020-2023  润新知