获取网页HTML元素内容方法
①通过 正则表达式 匹配获取
View Code
// Downloads a page and collects every <label for="caller">...</label> element found in it.
// resquestUrl must be set to an absolute http(s) URL before the request is created;
// WebRequest.Create rejects an empty string.
string resquestUrl = string.Empty;
// Matches one <label for="caller"> element, non-greedy and case-insensitive.
// "." does not match newlines here, so a label spanning several lines is not matched.
Regex rxGetInfo = new Regex("<label for=\"caller\">.*?</label>", RegexOptions.IgnoreCase);
// Matches a single HTML tag; not used within this snippet.
// NOTE(review): presumably meant for stripping markup from the matches - confirm against the code that follows.
Regex rxFilter = new Regex("<.*?>");
string returnContent;
// Direct casts fail immediately with InvalidCastException for a non-HTTP URI,
// instead of the delayed NullReferenceException an "as" cast would cause.
HttpWebRequest request = (HttpWebRequest)WebRequest.Create(resquestUrl);
// using guarantees the response and reader are released even when reading throws.
using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
using (StreamReader sr = new StreamReader(response.GetResponseStream()))
{
    returnContent = sr.ReadToEnd();
}
MatchCollection mc = rxGetInfo.Matches(returnContent);
// Downloads the page at resquestUrl and collects every <label for="caller">...</label> element found in it.
// Matches one <label for="caller"> element, non-greedy and case-insensitive.
// "." does not match newlines here, so a label spanning several lines is not matched.
Regex rxGetInfo = new Regex("<label for=\"caller\">.*?</label>", RegexOptions.IgnoreCase);
// Matches a single HTML tag; not used within this snippet.
// NOTE(review): presumably meant for stripping markup from the matches - confirm against the code that follows.
Regex rxFilter = new Regex("<.*?>");
string returnContent;
// Direct casts fail immediately with InvalidCastException for a non-HTTP URI,
// instead of the delayed NullReferenceException an "as" cast would cause.
HttpWebRequest request = (HttpWebRequest)WebRequest.Create(resquestUrl);
// using guarantees the response and reader are released even when reading throws.
using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
using (StreamReader sr = new StreamReader(response.GetResponseStream()))
{
    returnContent = sr.ReadToEnd();
}
MatchCollection mc = rxGetInfo.Matches(returnContent);
②根据元素属性 GetElementById获取
// Illustrative sketch: create an HtmlDocument and look up an element through GetElementById.
// NOTE(review): GetElementById is called with no argument here; the id of the element to
// fetch presumably has to be passed - confirm against the HtmlDocument API in use.
// NOTE(review): if this is System.Windows.Forms.HtmlDocument, instances normally come from
// WebBrowser.Document rather than a constructor - confirm which HtmlDocument type is intended.
HtmlDocument temphtml = new HtmlDocument();
temphtml.GetElementById();
③过滤html标签
View Code
/// <summary>
/// Converts an HTML fragment to plain text: a closing paragraph tag directly followed by an
/// opening one becomes a blank line, line-break tags become newlines, double quotes become
/// two single quotes, and every remaining tag is removed.
/// </summary>
/// <param name="stringToStrip">The HTML content to strip. May be null or empty.</param>
/// <returns>The text with markup removed; a null or empty input is returned unchanged.</returns>
public static string StripHTML(string stringToStrip)
{
    // Nothing to strip; this also avoids the ArgumentNullException Regex.Replace throws for null.
    if (string.IsNullOrEmpty(stringToStrip))
    {
        return stringToStrip;
    }

    // "</p>" followed (optionally after whitespace) by "<p>" marks a paragraph boundary.
    stringToStrip = Regex.Replace(stringToStrip, "</p(?:\\s*)>(?:\\s*)<p(?:\\s*)>", "\n\n", RegexOptions.IgnoreCase | RegexOptions.Compiled);
    // The slash is optional so that a plain <br> also yields a newline; when it was mandatory,
    // <br> fell through to the generic tag removal and the line break was lost.
    stringToStrip = Regex.Replace(stringToStrip, "<br(?:\\s*)/?>", "\n", RegexOptions.IgnoreCase | RegexOptions.Compiled);
    // Replacing a literal character needs no regex.
    stringToStrip = stringToStrip.Replace("\"", "''");
    return StripHtmlXmlTags(stringToStrip);
}

/// <summary>
/// Removes every tag-like span: a "&lt;", one or more characters other than "&gt;", then "&gt;".
/// </summary>
/// <param name="content">The text to remove tags from.</param>
/// <returns>The text with all such spans deleted.</returns>
private static string StripHtmlXmlTags(string content)
{
    return Regex.Replace(content, "<[^>]+>", "", RegexOptions.IgnoreCase | RegexOptions.Compiled);
}
/// <summary>
/// Converts an HTML fragment to plain text: a closing paragraph tag directly followed by an
/// opening one becomes a blank line, line-break tags become newlines, double quotes become
/// two single quotes, and every remaining tag is removed.
/// </summary>
/// <param name="stringToStrip">The HTML content to strip. May be null or empty.</param>
/// <returns>The text with markup removed; a null or empty input is returned unchanged.</returns>
public static string StripHTML(string stringToStrip)
{
    // Nothing to strip; this also avoids the ArgumentNullException Regex.Replace throws for null.
    if (string.IsNullOrEmpty(stringToStrip))
    {
        return stringToStrip;
    }

    // "</p>" followed (optionally after whitespace) by "<p>" marks a paragraph boundary.
    stringToStrip = Regex.Replace(stringToStrip, "</p(?:\\s*)>(?:\\s*)<p(?:\\s*)>", "\n\n", RegexOptions.IgnoreCase | RegexOptions.Compiled);
    // The slash is optional so that a plain <br> also yields a newline; when it was mandatory,
    // <br> fell through to the generic tag removal and the line break was lost.
    stringToStrip = Regex.Replace(stringToStrip, "<br(?:\\s*)/?>", "\n", RegexOptions.IgnoreCase | RegexOptions.Compiled);
    // Replacing a literal character needs no regex.
    stringToStrip = stringToStrip.Replace("\"", "''");
    return StripHtmlXmlTags(stringToStrip);
}

/// <summary>
/// Removes every tag-like span: a "&lt;", one or more characters other than "&gt;", then "&gt;".
/// </summary>
/// <param name="content">The text to remove tags from.</param>
/// <returns>The text with all such spans deleted.</returns>
private static string StripHtmlXmlTags(string content)
{
    return Regex.Replace(content, "<[^>]+>", "", RegexOptions.IgnoreCase | RegexOptions.Compiled);
}