• 通过链接获取Html源码内容


           /// <summary>
            /// Fetches the resource at the given URL and returns its body decoded as UTF-8 text.
            /// </summary>
            /// <param name="url">Absolute URL of the page to fetch.</param>
            /// <returns>The complete response body as a string.</returns>
            /// <remarks>
            /// Exceptions thrown while parsing the URL or performing the request are not caught
            /// here; they propagate to the caller.
            /// </remarks>
            public static string GetContenFrommUrl(string url)
            {
                Uri uri = new Uri(url);
                // WebRequest lives in System.Net.
                WebRequest request = WebRequest.Create(uri);

                // The using blocks release the response, its stream and the reader even when
                // reading fails part-way; the manual Close() calls they replace were skipped
                // whenever an exception was thrown before reaching them.
                using (WebResponse response = request.GetResponse())
                using (Stream responseStream = response.GetResponseStream())
                using (StreamReader reader = new StreamReader(responseStream, System.Text.Encoding.UTF8))
                {
                    return reader.ReadToEnd();
                }
            }
    
            /// <summary>
            /// Extracts the div element whose id is "listLeft" from an HTML string.
            /// </summary>
            /// <param name="strHTML">HTML text to search.</param>
            /// <returns>
            /// The whole matched element, including its opening and closing tags and any nested
            /// divs; an empty string when the input is null/empty or no such div is found.
            /// </returns>
            public static string GetDivFromStr(string strHTML)
            {
                if (string.IsNullOrEmpty(strHTML))
                {
                    return string.Empty;
                }

                // Match and Regex live in System.Text.RegularExpressions.
                // The balancing group "o" counts nested <div> tags so the match ends at the closing
                // tag that belongs to the "listLeft" div. [\s\S] matches any character, newlines
                // included (the previous "[sS]" only matched the letters s and S).
                Match m = Regex.Match(strHTML, @"<div[^>]*?id=""listLeft""[^>]*>((?>(?<o><div[^>]*>)|(?<-o></div>)|(?:(?!</?div)[\s\S]))*)(?(o)(?!))</div>", RegexOptions.IgnoreCase);
                return m.Success ? m.Value : string.Empty;
            }
    
            /// <summary>
            /// Downloads an image and saves it in the local image folder under its original file name.
            /// </summary>
            /// <param name="URL">URL of the image to download.</param>
            /// <returns>The string "2016/12/22/" followed by the image's file name.</returns>
            public static string DowmLoadImage(string URL)
            {
                // NOTE(review): the save directory and the "2016/12/22/" prefix of the returned
                // path are hard-coded and do not refer to the same location - confirm with callers
                // before changing either.
                string saveDirectory = "D:/MyJob/HtmlToData/Images/";
                string fileName = System.IO.Path.GetFileName(URL);

                // WebClient lives in System.Net and is disposable; the using block releases it
                // even when the download throws.
                using (WebClient client = new System.Net.WebClient())
                {
                    // First argument: image URL; second argument: where the file is written.
                    client.DownloadFile(URL, saveDirectory + fileName);
                }

                return "2016/12/22/" + fileName;
            }
    
            /// <summary>
            /// Replaces the separator image (split-e5.gif) in an HTML fragment with a horizontal rule.
            /// </summary>
            /// <param name="Content">HTML text to process.</param>
            /// <returns>
            /// The HTML with every occurrence of the first matching img tag replaced by "&lt;hr /&gt;";
            /// the input unchanged when it is null/empty or contains no such image.
            /// </returns>
            public static string ReplaceImage(string Content)
            {
                const string separatorImageUrl = "http://en.shio.gov.cn/file/images/split-e5.gif";

                if (string.IsNullOrEmpty(Content))
                {
                    return Content;
                }

                // Regex and MatchCollection live in System.Text.RegularExpressions.
                // Captures the src attribute of each img tag in the "imgUrl" group. \s covers spaces,
                // tabs and line breaks; the previous pattern had lost its backslashes, so the capture
                // stopped at the first letter 's' in the URL and the comparison below never succeeded.
                Regex regImg = new Regex(@"<img[^<>]*?src\s*=\s*[""']?\s*(?<imgUrl>[^\s""'<>]*)[^<>]*?/?\s*>", RegexOptions.IgnoreCase);
                MatchCollection matches = regImg.Matches(Content);

                foreach (Match match in matches)
                {
                    if (match.Groups["imgUrl"].Value == separatorImageUrl)
                    {
                        // String.Replace swaps every occurrence of this exact tag text, so the
                        // search can stop at the first hit.
                        Content = Content.Replace(match.Value, "<hr />");
                        break;
                    }
                }
                return Content;
            }
    
            /// <summary>
            /// Replaces the div whose id is "pages" with a horizontal rule.
            /// </summary>
            /// <param name="Content">HTML text in which the replacement is made.</param>
            /// <param name="strHTML">HTML text that is searched for the "pages" div.</param>
            /// <returns>
            /// <paramref name="Content"/> with every occurrence of the matched div replaced by
            /// "&lt;hr /&gt;"; <paramref name="Content"/> unchanged when either argument is
            /// null/empty or no such div is found.
            /// </returns>
            public static string ReplaceDiv(string Content, string strHTML)
            {
                if (string.IsNullOrEmpty(Content) || string.IsNullOrEmpty(strHTML))
                {
                    return Content;
                }

                // Match and Regex live in System.Text.RegularExpressions.
                // The balancing group "o" tracks nested <div> tags; [\s\S] matches any character,
                // newlines included (the previous "[sS]" only matched the letters s and S).
                Match mm = Regex.Match(strHTML, @"<div[^>]*?id=""pages""[^>]*>((?>(?<o><div[^>]*>)|(?<-o></div>)|(?:(?!</?div)[\s\S]))*)(?(o)(?!))</div>", RegexOptions.IgnoreCase);

                // A failed match has an empty Value, and String.Replace throws ArgumentException
                // for an empty search string, so only replace on success.
                if (mm.Success)
                {
                    Content = Content.Replace(mm.Value, "<hr />");
                }
                return Content;
            }
    
            /// <summary>
            /// Finds the first img tag in an HTML string, downloads the image it points to and
            /// returns the path reported by <c>DowmLoadImage</c>.
            /// </summary>
            /// <param name="strHTML">HTML text to search.</param>
            /// <returns>
            /// The value returned by <c>DowmLoadImage</c> for the first image's src; an empty string
            /// when the input is null/empty, no img tag is found, or its src is empty.
            /// </returns>
            public static string GetImageSrc(string strHTML)
            {
                if (string.IsNullOrEmpty(strHTML))
                {
                    return "";
                }

                // Match and Regex live in System.Text.RegularExpressions.
                // Captures the src attribute in the "imgUrl" group. \s covers spaces, tabs and line
                // breaks; the previous pattern had lost its backslashes, which truncated the captured
                // URL at the first letter 's'. IgnoreCase matches the option used by ReplaceImage.
                Match maimage = Regex.Match(strHTML, @"<img[^<>]*?src\s*=\s*[""']?\s*(?<imgUrl>[^\s""'<>]*)[^<>]*?/?\s*>", RegexOptions.IgnoreCase);
                if (!maimage.Success)
                {
                    return "";
                }

                string imageUrl = maimage.Groups["imgUrl"].Value;
                // An empty src cannot be downloaded; treat it the same as "no image".
                if (imageUrl.Length == 0)
                {
                    return "";
                }

                // Download the title image and return its local path.
                return DowmLoadImage(imageUrl);
            }
    
            /// <summary>
            /// Extracts the link text and href of the first &lt;a&gt; tag in an HTML string.
            /// </summary>
            /// <param name="AStr">HTML text to search.</param>
            /// <returns>
            /// A two-element array: index 0 holds the inner content of the tag, index 1 holds the
            /// href value. Both elements are null when the input is null/empty or no link is found.
            /// </returns>
            public static string[] GetHref(string AStr)
            {
                string[] ListStr = new string[2];
                if (string.IsNullOrEmpty(AStr))
                {
                    return ListStr;
                }

                // Match and Regex live in System.Text.RegularExpressions.
                // \1 requires the closing quote to be the same character as the opening one (the
                // previous pattern had degraded to a literal "1"). The content capture is lazy so
                // that input containing several links yields only the first link's content.
                Match ma = Regex.Match(AStr, @"(?is)<a[^>]+?href=(['""])([^'""]*)\1[^>]*>(.+?)</a>");
                if (ma.Success)
                {
                    ListStr[0] = ma.Groups[3].Value; // link text
                    ListStr[1] = ma.Groups[2].Value; // href
                }
                return ListStr;
            }
    
            /// <summary>
            /// Extracts the inner content of the paragraph tag &lt;p class="auxiInfo"&gt;.
            /// </summary>
            /// <param name="PStr">HTML text to search.</param>
            /// <returns>
            /// The text between the opening and closing tags of the matched paragraph; an empty
            /// string when the input is null/empty or no such paragraph is found.
            /// </returns>
            public static string GetTargetPContent(string PStr)
            {
                if (string.IsNullOrEmpty(PStr))
                {
                    return "";
                }

                // Match and Regex live in System.Text.RegularExpressions.
                // The balancing group "o" tracks nested <p> tags; [\s\S] matches any character,
                // newlines included (the previous "[sS]" only matched the letters s and S).
                Match mtime = Regex.Match(PStr, @"<p[^>]*?class=""auxiInfo""[^>]*>((?>(?<o><p[^>]*>)|(?<-o></p>)|(?:(?!</?p)[\s\S]))*)(?(o)(?!))</p>", RegexOptions.IgnoreCase);
                return mtime.Success ? mtime.Groups[1].Value : "";
            }
    
            /// <summary>
            /// Returns the inner content of the first attribute-less &lt;p&gt; tag in an HTML string.
            /// </summary>
            /// <param name="PStr">HTML text to search.</param>
            /// <returns>The text between &lt;p&gt; and the nearest &lt;/p&gt;, or an empty string when there is no match.</returns>
            public static string GetPContent(string PStr)
            {
                // Match and Regex live in System.Text.RegularExpressions.
                // (?is): case-insensitive, and '.' also matches line breaks; the capture is lazy so
                // it stops at the first closing tag.
                Match firstParagraph = Regex.Match(PStr, @"(?is)<p>(.*?)</p>");
                if (!firstParagraph.Success)
                {
                    return "";
                }
                return firstParagraph.Groups[1].Value;
            }
    

      

  • 相关阅读:
    面试题
    面向切面编程 AOP
    matlab提取wind底层数据库操作
    tensorflow(4):神经网络框架总结
    tensorflow(3):神经网络优化(ema,regularization)
    tensorflow(2):神经网络优化(loss,learning_rate)
    tensorflow(1) 基础: 神经网络基本框架
    在anaconda中安装tensorflow
    anaconda利用pip安装module
    python(10): xlsxwriter模块
  • 原文地址:https://www.cnblogs.com/suflowers1700218/p/11528046.html
Copyright © 2020-2023  润新知