• 利用 SgmlReader 获取网页源代码,并用 XPath 提取所需内容


    1. 根据 SgmlReader 类获得格式完整(well-formed)的 HTML 代码

       /// <summary>
            /// Downloads the page at <paramref name="uri"/> and converts its HTML into
            /// well-formed, indented XML text by piping it through SgmlReader.
            /// </summary>
            /// <param name="uri">Address of the page to download.</param>
            /// <returns>
            /// The converted markup; <c>null</c> when <paramref name="uri"/> is null or empty;
            /// or the exception message when the download or the conversion fails.
            /// </returns>
            private string GetWellFormedHTML(string uri)
            {
                if (string.IsNullOrEmpty(uri))
                    return null;

                try
                {
                    // Fetch the raw page text, decoding the response as UTF-8.
                    string strWebContent;
                    using (WebClient webclient = new WebClient())
                    {
                        webclient.Encoding = Encoding.UTF8;
                        strWebContent = webclient.DownloadString(uri);
                    }

                    // The using blocks release reader/writer on both the success and the failure path.
                    using (StringWriter sw = new StringWriter())
                    using (SgmlReader reader = new SgmlReader())
                    using (XmlTextWriter writer = new XmlTextWriter(sw))
                    {
                        reader.DocType = "HTML";
                        reader.InputStream = new StringReader(strWebContent);
                        writer.Formatting = System.Xml.Formatting.Indented;

                        // WriteNode already leaves the reader on the node that follows the one
                        // it copied, so only advance manually when a whitespace node is skipped;
                        // calling Read() after every WriteNode would drop that following node.
                        reader.Read();
                        while (!reader.EOF)
                        {
                            if (reader.NodeType == XmlNodeType.Whitespace)
                                reader.Read();
                            else
                                writer.WriteNode(reader, true);
                        }

                        writer.Flush();
                        return sw.ToString();
                    }
                }
                catch (Exception exp)
                {
                    // Best-effort contract kept from the original: report the failure text
                    // to the caller instead of throwing.
                    return exp.Message;
                }
            }
    View Code

    2. 根据 XPath 规则进行查找

      /// <summary>
            /// Loads the given markup as an XML document and collects the content of every
            /// node matched by the XPath expression.
            /// </summary>
            /// <param name="htmlStr">Markup to search; it is parsed as XML.</param>
            /// <param name="xpath">XPath expression selecting the wanted nodes.</param>
            /// <returns>
            /// For each matched node, one line with its inner XML followed by one line
            /// with its text value.
            /// </returns>
            private string GetResult(string htmlStr, string xpath)
            {
                StringBuilder collected = new StringBuilder();
                XPathNavigator root = new XPathDocument(new StringReader(htmlStr)).CreateNavigator();

                foreach (XPathNavigator match in root.Select(xpath))
                {
                    // Inner XML keeps child markup and attributes; Value is the text only.
                    collected.AppendLine(match.InnerXml)
                             .AppendLine(match.Value);
                }

                return collected.ToString();
            }
    View Code

    完!

  • 相关阅读:
    【嵌入式硬件Esp32】Ubuntu18.04 更换阿里云软件源
    【嵌入式硬件Esp32】Ubuntu 1804下ESP32交叉编译环境搭建
    【嵌入式硬件Esp32】Eclipse c++切换回英文方法
    Ant Design使用问题记录
    C#调用python脚本
    C# 6新特性简单总结
    ASP.NET动态网站制作(30)-- WEBService
    ASP.NET动态网站制作(29)-- 正则
    ASP.NET动态网站制作(28)-- 三层框架(2)
    ASP.NET动态网站制作(27)-- 三层框架(1)
  • 原文地址:https://www.cnblogs.com/wwz-wwz/p/7551477.html
Copyright © 2020-2023  润新知