• C#Xpath解析HtmlDocument的使用方法与递归取得页面所有标签xpath值(附源码)


    引用:https://www.cnblogs.com/wangchuang/archive/2013/03/11/2953638.html

    在学习HTML Xpath之前呢我们先来下载一下Dll文件
    下载地址:http://htmlagilitypack.codeplex.com/
    大家单击下图所示的位置下载就行了
    <ignore_js_op>xpath1.jpg

    接下来就是在程序中引用一下,
    <ignore_js_op>xpath2.jpg
    然后就可以直接调用了,大家看看代码吧
    普通浏览复制代码

      // HtmlDocument is HtmlAgilityPack's in-memory representation of an HTML page
      HtmlAgilityPack.HtmlDocument hd = new HtmlAgilityPack.HtmlDocument();
      // Parse the raw HTML text held in strhtml
      hd.LoadHtml(strhtml);
      // SelectSingleNode returns null when no node matches the XPath, so check the
      // result before reading OuterHtml instead of risking a NullReferenceException
      HtmlAgilityPack.HtmlNode node = hd.DocumentNode.SelectSingleNode("//*[@id='e_font']");
      string str = node != null ? node.OuterHtml : string.Empty;
    

    这样就可以得到一个标签的Html代码了
    OuterHtml取的是包含标签本身的Html,而InnerHtml取的是包含在这个标签之内的所有Html代码
    这点大家要注意了
    如果大家想获取Html代码的Xpath路径就是这部分

    //*[@id='e_font']
    

    复制代码

    这个其实很简单,只要大家安装一个Firebug就行了,
    看下图片
    <ignore_js_op>xpath3.jpg
    大家只要进入选择模式,然后选择你要的内容,然后右键复制一下就行了。
    然后放在SelectSingleNode()方法里就OK了
    下面我说说几个方法和属性的意思吧。
    方法

    SelectNodes 获取的是一个集合
    SelectSingleNode 获取一个标签
    SetAttributeValue 设置标签的属性值例如:SetAttributeValue("name","xpath-89");这说明把name属性的值修改为xpath-89
    属性

    OuterHtml 是取包含本身的Html
    InnerHtml 取的包含在这个标签之内的所有Html代码了
    XPath 获取相对应的Xpath值
    Attributes 获取一个属性的值,例如:Attributes["name"]
    也可以进行添加属性例如:
    普通浏览复制代码

    // SelectSingleNode returns null when the XPath matches nothing, so guard before touching Attributes
    HtmlAgilityPack.HtmlNode target = hd.DocumentNode.SelectSingleNode(item.Key);
    if (target != null)
    {
        target.Attributes.Add("xpathid", "xpath_1");
    }
    

    下面我写了一个递归获取Html页面所有Xpath值的方法大家看一下吧
    普通浏览复制代码

      // Collected entries: Setxpath appends one ObjXpath per recorded node (Key = the node's XPath, Value = "")
            public List<ObjXpath> XpathList = new List<ObjXpath>();
            public string strhtml = "";// Put the HTML source to analyse here; the article suggests fetching it with the author's HttpHelper class (http://www.sufeinet.com/thread-3-1-1.html)
              // Running counter; its current value becomes the id of the next ObjXpath
              private int Index = 0;
    //开始处理Node
            /// <summary>
            /// Parses the HTML held in strhtml and rebuilds XpathList from scratch.
            /// </summary>
            /// <remarks>
            /// The walk starts at the document node itself. Setxpath records only the
            /// children of the node it is given, so calling it on each top-level node
            /// (as this method used to) never listed the root elements such as /html[1].
            /// </remarks>
            private void SartNode()
            {
                HtmlAgilityPack.HtmlDocument hd = new HtmlAgilityPack.HtmlDocument();
                hd.LoadHtml(strhtml);

                // Reset state so repeated calls do not accumulate entries from earlier runs
                Index = 0;
                XpathList.Clear();

                // Passing the document node makes the top-level elements part of the result
                Setxpath(hd.DocumentNode);
            }
            /// <summary>
            /// Recursively appends an ObjXpath entry to XpathList for every descendant of
            /// <paramref name="node"/> whose XPath does not contain '#'.
            /// </summary>
            /// <param name="node">
            /// Node whose children are recorded. The node itself is not added; a null
            /// node is ignored.
            /// </param>
            private void Setxpath(HtmlNode node)
            {
                if (node == null)
                {
                    return;
                }

                foreach (HtmlNode item in node.ChildNodes)
                {
                    // Read the XPath once and reuse it for the filter and the entry
                    string xpath = item.XPath;

                    // Nodes whose XPath contains '#' are skipped together with their subtree
                    // (presumably HtmlAgilityPack pseudo-nodes such as #text / #comment - TODO confirm)
                    if (xpath.Contains("#"))
                    {
                        continue;
                    }

                    // Leaf and container nodes are recorded identically
                    XpathList.Add(new ObjXpath() { id = Index.ToString(), Key = xpath, Value = "" });
                    Index++;

                    // Only nodes that have children need a deeper walk
                    if (item.ChildNodes.Count > 0)
                    {
                        Setxpath(item);
                    }
                }
            }
      /// <summary>
      /// One entry of the XPath list: a sequence id, the node's XPath and an associated value.
      /// </summary>
      public class ObjXpath
        {
            /// <summary>Sequence number stored as text (Setxpath assigns Index.ToString()).</summary>
            public string id { get; set; }
            /// <summary>XPath of the node (Setxpath assigns the node's XPath property).</summary>
            public string Key { get; set; }
            /// <summary>Associated value; Setxpath always stores an empty string here.</summary>
            public string Value { get; set; }
        }
    

    XpathList 就是获取的所有Xpath值了,大家有兴趣的话可以试试
    我们先来看看效果吧
    <ignore_js_op>xpath4.jpg
    好了下面放出所有代码给大家
    普通浏览复制代码

    using System;
    using System.Collections.Generic;
    using System.ComponentModel;
    using System.Data;
    using System.Drawing;
    using System.Linq;
    using System.Text;
    using System.Windows.Forms;
    using System.Text.RegularExpressions;
    using System.Threading;
    using HtmlAgilityPack;
    using System.IO;
    using System.Runtime.Serialization.Json;
    namespace AutoXpathTools
    {
        /// <summary>
        /// Main window of the XPath tool. It downloads a page, lists the XPath of its
        /// nodes in listBox1 and shows the outer HTML of the node selected by the
        /// XPath typed into txtPath.
        /// </summary>
        public partial class Form1 : Form
        {
            public Form1()
            {
                InitializeComponent();
            }

            #region Private fields
            // Delegate used to marshal a string onto the UI thread (see UIContorol)
            private delegate void SetListBox(string str);
            // Entries collected by the most recent analysis run (Key = XPath, Value = "")
            List<ObjXpath> XpathList = new List<ObjXpath>();
            // Running counter; its current value becomes the id of the next entry
            private int Index = 0;
            // Document queried by button3_Click; loaded on the UI thread in btnGetXpath_Click
            HtmlAgilityPack.HtmlDocument hd = new HtmlAgilityPack.HtmlDocument();
            #endregion

            /// <summary>
            /// Downloads the page at the URL in textBox1, shows its source in txtXml and
            /// starts a background thread that collects the XPath of its nodes.
            /// </summary>
            private void btnGetXpath_Click(object sender, EventArgs e)
            {
                try
                {
                    HttpHelper http = new HttpHelper();
                    HttpItem item = new HttpItem() { URL = textBox1.Text.Trim(), IsToLower = false, Encoding = "gbk" };
                    txtXml.Text = http.GetHtml(item);

                    string html = txtXml.Text;
                    // Ordinal comparison: ToLower() is culture-sensitive and can miss "ERROR" under some cultures
                    bool isError = string.Equals(html.Trim(), "error", StringComparison.OrdinalIgnoreCase);
                    if (string.IsNullOrWhiteSpace(html) || isError)
                    {
                        txtXml.Text = "根据您的的ULR:" + textBox1.Text.Trim() + "无法得到任何内容";
                        return;
                    }

                    // Document used later by button3_Click
                    hd.LoadHtml(html);

                    // SartNode resets XpathList and Index, so drop the previous run's rows as well
                    listBox1.Items.Clear();

                    Thread pingTask = new Thread(new ThreadStart(delegate
                    {
                        SartNode(html);
                    }));
                    // Background thread: must not keep the process alive once the form is closed
                    pingTask.IsBackground = true;
                    pingTask.Start();
                }
                catch (Exception ex)
                {
                    txtXml.Text = ex.Message.Trim();
                }
            }

            /// <summary>
            /// Parses <paramref name="strhtml"/> and rebuilds XpathList from scratch.
            /// Called on the worker thread started by btnGetXpath_Click.
            /// </summary>
            /// <param name="strhtml">HTML source to analyse.</param>
            /// <remarks>
            /// The walk starts at the document node itself. Setxpath records only the
            /// children of the node it is given, so starting from each top-level node
            /// (as this method used to) never listed root elements such as /html[1].
            /// </remarks>
            private void SartNode(string strhtml)
            {
                // Separate local document, so the worker does not use the field instance queried by the UI
                HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
                doc.LoadHtml(strhtml);

                Index = 0;
                XpathList.Clear();

                Setxpath(doc.DocumentNode);
            }

            /// <summary>
            /// Recursively records every descendant of <paramref name="node"/> whose XPath
            /// does not contain '#': adds it to XpathList and reports it to the UI.
            /// </summary>
            /// <param name="node">
            /// Node whose children are recorded. The node itself is not added; a null
            /// node is ignored.
            /// </param>
            private void Setxpath(HtmlNode node)
            {
                if (node == null)
                {
                    return;
                }

                foreach (HtmlNode item in node.ChildNodes)
                {
                    // Read the XPath once and reuse it for the filter, the entry and the UI update
                    string xpath = item.XPath;

                    // Nodes whose XPath contains '#' are skipped together with their subtree
                    // (presumably HtmlAgilityPack pseudo-nodes such as #text / #comment - TODO confirm)
                    if (xpath.Contains("#"))
                    {
                        continue;
                    }

                    // Leaf and container nodes are recorded identically
                    XpathList.Add(new ObjXpath() { id = Index.ToString(), Key = xpath, Value = "" });
                    UIContorol(xpath);
                    Index++;

                    // Only nodes that have children need a deeper walk
                    if (item.ChildNodes.Count > 0)
                    {
                        Setxpath(item);
                    }
                }
            }

            /// <summary>
            /// Appends <paramref name="str"/> to listBox1 and shows it in the status bar.
            /// Safe to call from any thread: calls from a non-UI thread are marshalled
            /// to the UI thread through the SetListBox delegate.
            /// </summary>
            /// <param name="str">Text to display (an XPath).</param>
            private void UIContorol(string str)
            {
                // Nothing to update once the form is gone or before its handle exists
                if (IsDisposed || !IsHandleCreated)
                {
                    return;
                }

                if (InvokeRequired)
                {
                    try
                    {
                        Invoke(new SetListBox(UIContorol), str);
                    }
                    catch (InvalidOperationException)
                    {
                        // The form was closed between the check above and the call
                        // (ObjectDisposedException derives from InvalidOperationException)
                    }
                    return;
                }

                listBox1.Items.Add(str);
                toolStripStatusLabel1.Text = str;
            }

            /// <summary>
            /// Copies the XPath selected in listBox1 into txtPath.
            /// </summary>
            private void listBox1_SelectedValueChanged(object sender, EventArgs e)
            {
                if (listBox1.SelectedItem != null)
                {
                    txtPath.Text = listBox1.SelectedItem.ToString().Trim();
                }
            }

            /// <summary>
            /// Shows the outer HTML of the node addressed by the XPath in txtPath.
            /// An empty path or a path that matches nothing clears txtContents; an
            /// invalid XPath expression shows the parser's error message instead.
            /// </summary>
            private void button3_Click(object sender, EventArgs e)
            {
                string xpath = txtPath.Text.Trim();
                if (xpath.Length == 0)
                {
                    txtContents.Text = string.Empty;
                    return;
                }

                try
                {
                    // SelectSingleNode returns null when no node matches
                    HtmlNode match = hd.DocumentNode.SelectSingleNode(xpath);
                    txtContents.Text = match != null ? match.OuterHtml : string.Empty;
                }
                catch (System.Xml.XPath.XPathException ex)
                {
                    // txtPath is user-editable, so a malformed expression is an expected input error
                    txtContents.Text = ex.Message;
                }
            }

            private void Form1_Load(object sender, EventArgs e)
            {
                // Intentionally empty. The commented-out login sample that used to live here
                // was removed because it embedded an account name, a password and a session cookie.
            }
        }
        /// <summary>
        /// One entry of the XPath list: a sequence id, the node's XPath and an associated value.
        /// </summary>
        public class ObjXpath
        {
            /// <summary>Sequence number stored as text (Setxpath assigns Index.ToString()).</summary>
            public string id { get; set; }
            /// <summary>XPath of the node (Setxpath assigns the node's XPath property).</summary>
            public string Key { get; set; }
            /// <summary>Associated value; Setxpath always stores an empty string here.</summary>
            public string Value { get; set; }
        }
    }
    

    就到这里吧,大家可以下载我的源代码试试手
    打包下载:
    <ignore_js_op> AutoXpathTools.zip (76.32 KB, 下载次数: 0)
    如果你感觉可以的话就给我推荐一下吧。感谢大家

  • 相关阅读:
    0107 Git与路飞短信云
    0106 git与路飞项目配置
    关闭SSH,程序后台运行
    ImportError: libSM.so.6: cannot open shared object file: No such file or directory
    A problem has been detected and windows has been shut down to prevent damage
    卸载Windows,安装纯Linux
    No module named '_tkinter'
    [转载] 图片文档扫描矫正处理(手机扫描仪),OCR识别,图片修改库整合
    [ 完美 ] 解决Python依赖(包)环境
    安装 VMware Tools
  • 原文地址:https://www.cnblogs.com/wdcwy/p/15737723.html
Copyright © 2020-2023  润新知