/// <summary>
/// Downloads the resource at the given URL and returns its body decoded as UTF-8.
/// </summary>
/// <param name="url">Absolute URL of the page to fetch.</param>
/// <returns>The full response body as a string.</returns>
public static string GetContenFrommUrl(string url)
{
    Uri uri = new Uri(url);
    // WebRequest requires the System.Net namespace.
    WebRequest request = WebRequest.Create(uri);

    // The using blocks release the response, stream and reader even when
    // reading throws part-way through (the manual Close() calls did not).
    // The body is always decoded as UTF-8; pages served in another encoding
    // (for example gb2312) would need a different Encoding here.
    using (WebResponse response = request.GetResponse())
    using (Stream responseStream = response.GetResponseStream())
    using (StreamReader reader = new StreamReader(responseStream, System.Text.Encoding.UTF8))
    {
        return reader.ReadToEnd();
    }
}

/// <summary>
/// Extracts the div whose id is "listLeft", including any nested divs.
/// </summary>
/// <param name="strHTML">HTML text to search.</param>
/// <returns>The complete matched div element, or an empty string when there is no match.</returns>
public static string GetDivFromStr(string strHTML)
{
    // Match and Regex require System.Text.RegularExpressions.
    // Balancing groups (?<o>...) / (?<-o>...) keep nested <div> tags paired, and
    // [\s\S] accepts any character including newlines. The pattern previously read
    // [sS], which only accepted the letters 's' and 'S'.
    Match m = Regex.Match(
        strHTML,
        @"<div[^>]*?id=""listLeft""[^>]*>((?>(?<o><div[^>]*>)|(?<-o></div>)|(?:(?!</?div)[\s\S]))*)(?(o)(?!))</div>",
        RegexOptions.IgnoreCase);

    return m.Success ? m.Value : string.Empty;
}

/// <summary>
/// Downloads an image and stores it in the local image directory.
/// </summary>
/// <param name="URL">URL of the image.</param>
/// <returns>
/// The path recorded for the image: the fixed prefix "2016/12/22/" followed by the file name.
/// Note that this prefix is hard-coded and is not derived from the save directory.
/// </returns>
public static string DowmLoadImage(string URL)
{
    // Hard-coded target directory; it must already exist.
    string saveDirectory = "D:/MyJob/HtmlToData/Images/";
    string fileName = System.IO.Path.GetFileName(URL);

    // WebClient requires System.Net; dispose it so the connection is released.
    using (WebClient client = new System.Net.WebClient())
    {
        client.DownloadFile(URL, saveDirectory + fileName);
    }

    return "2016/12/22/" + fileName;
}

/// <summary>
/// Replaces one specific separator image with a horizontal rule.
/// </summary>
/// <param name="Content">HTML text.</param>
/// <returns>The HTML with the separator image tag replaced by &lt;hr /&gt;.</returns>
public static string ReplaceImage(string Content)
{
    // Captures the src value of each img tag in the group "imgUrl".
    // \s restores the whitespace classes that had degraded to "[s ]", which
    // cut the captured URL short at the first letter 's'.
    Regex regImg = new Regex(
        @"<img[^<>]*?src\s*=\s*[""']?\s*(?<imgUrl>[^\s""'<>]*)[^<>]*?/?\s*>",
        RegexOptions.IgnoreCase);

    foreach (Match match in regImg.Matches(Content))
    {
        if (match.Groups["imgUrl"].Value == "http://en.shio.gov.cn/file/images/split-e5.gif")
        {
            // string.Replace swaps every occurrence of this exact tag text,
            // so the first hit is enough.
            return Content.Replace(match.Value, "<hr />");
        }
    }

    return Content;
}

/// <summary>
/// Replaces the div whose id is "pages" with a horizontal rule.
/// </summary>
/// <param name="Content">HTML text in which the replacement is made.</param>
/// <param name="strHTML">HTML text in which the "pages" div is located.</param>
/// <returns>
/// <paramref name="Content"/> with the located div replaced by &lt;hr /&gt;,
/// or unchanged when no such div is found.
/// </returns>
public static string ReplaceDiv(string Content, string strHTML)
{
    // Same balanced-div pattern as GetDivFromStr, with [\s\S] restored.
    Match mm = Regex.Match(
        strHTML,
        @"<div[^>]*?id=""pages""[^>]*>((?>(?<o><div[^>]*>)|(?<-o></div>)|(?:(?!</?div)[\s\S]))*)(?(o)(?!))</div>",
        RegexOptions.IgnoreCase);

    // A failed match has an empty Value, and string.Replace throws
    // ArgumentException for an empty search string.
    if (!mm.Success)
    {
        return Content;
    }

    return Content.Replace(mm.Value, "<hr />");
}

/// <summary>
/// Finds the first img tag, downloads the image it points to and returns the stored path.
/// </summary>
/// <param name="strHTML">HTML text to search.</param>
/// <returns>
/// The value returned by <see cref="DowmLoadImage"/> for the first image,
/// or an empty string when no usable img src is found.
/// </returns>
public static string GetImageSrc(string strHTML)
{
    // Same img pattern as ReplaceImage, and case-insensitive like it.
    Match imageMatch = Regex.Match(
        strHTML,
        @"<img[^<>]*?src\s*=\s*[""']?\s*(?<imgUrl>[^\s""'<>]*)[^<>]*?/?\s*>",
        RegexOptions.IgnoreCase);

    if (!imageMatch.Success)
    {
        return "";
    }

    // The capture may be empty (for example src=""); there is nothing to download then.
    string imageUrl = imageMatch.Groups["imgUrl"].Value;
    if (string.IsNullOrEmpty(imageUrl))
    {
        return "";
    }

    return DowmLoadImage(imageUrl);
}

/// <summary>
/// Gets the text and the href of the first anchor element.
/// </summary>
/// <param name="AStr">HTML text.</param>
/// <returns>
/// A two-element array: index 0 is the anchor's inner content, index 1 is the href value.
/// Both elements are null when no anchor is found.
/// </returns>
public static string[] GetHref(string AStr)
{
    string[] result = new string[2];

    // \1 must match the same quote character that opened the href value (it had
    // degraded to a literal '1'). The inner content is matched lazily so that it
    // stops at the first closing tag instead of the last one in the input.
    Match ma = Regex.Match(AStr, @"(?is)<a[^>]+?href=(['""])([^'""]*)\1[^>]*>(.+?)</a>");
    if (ma.Success)
    {
        result[0] = ma.Groups[3].Value; // inner content
        result[1] = ma.Groups[2].Value; // hyperlink
    }

    return result;
}

/// <summary>
/// Gets the inner content of the p element whose class is "auxiInfo".
/// </summary>
/// <param name="PStr">HTML text.</param>
/// <returns>The inner content of that p element, or an empty string when there is no match.</returns>
public static string GetTargetPContent(string PStr)
{
    // Balanced-tag pattern for p elements. \b stops "<p" from also matching
    // tags such as <pre> or <param>; [\s\S] accepts any character (restored from [sS]).
    Match mtime = Regex.Match(
        PStr,
        @"<p\b[^>]*?class=""auxiInfo""[^>]*>((?>(?<o><p\b[^>]*>)|(?<-o></p>)|(?:(?!</?p\b)[\s\S]))*)(?(o)(?!))</p>",
        RegexOptions.IgnoreCase);

    return mtime.Success ? mtime.Groups[1].Value : "";
}

/// <summary>
/// Gets the inner content of the first attribute-less p element.
/// </summary>
/// <param name="PStr">HTML text.</param>
/// <returns>The content between the first &lt;p&gt; and the next &lt;/p&gt;, or an empty string when there is no match.</returns>
public static string GetPContent(string PStr)
{
    // (?is): case-insensitive, and '.' also matches newlines.
    Match mp = Regex.Match(PStr, @"(?is)<p>(.*?)</p>");

    return mp.Success ? mp.Groups[1].Value : "";
}