/// <summary>
/// Reduces an HTML fragment to clean article markup.
/// Only &lt;p&gt;, &lt;img&gt; and &lt;strong&gt; tags survive; &lt;p&gt; loses all of its
/// attributes, &lt;img&gt; keeps only its <c>src</c>, and &lt;br&gt; tags and raw line breaks
/// become paragraph boundaries. Empty paragraphs are dropped and every remaining
/// &lt;p&gt; is followed by a single leading space.
/// </summary>
/// <param name="content">The HTML fragment to clean. May be null or empty.</param>
/// <returns>The cleaned markup, or an empty string when <paramref name="content"/> is null or empty.</returns>
public static string ClearHtml(string content)
{
    if (string.IsNullOrEmpty(content))
    {
        return string.Empty;
    }

    const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Singleline;

    // Strip attributes from <p>. The lookahead requires the tag name to end right
    // after "p", so <pre>, <param>, <picture> ... are not rewritten into <p>.
    content = Regex.Replace(content, @"<p(?=[\s/>])[^>]*>", "<p>", options);
    // Normalize closing tags (</P >, </P>) so the literal "</p>" checks below see them.
    content = Regex.Replace(content, @"</p\s*>", "</p>", options);

    // Comments and script/style blocks are removed together with their content;
    // stripping only their tags would leave code or comment text in the article body.
    content = Regex.Replace(content, @"<!--.*?-->", string.Empty, options);
    content = Regex.Replace(content, @"<(script|style)(?=[\s/>])[^>]*>.*?</\1\s*>", string.Empty, options);

    // Remove every other tag in a single pass, keeping p, img, strong and br (br is
    // handled below). Tag names are never spliced into a pattern, so names containing
    // regex metacharacters cannot throw, and a bare '<' in text ("1 < 2") is not a tag.
    content = Regex.Replace(content, @"</?(?!(?:p|img|strong|br)(?=[\s/>]))[a-z!?][^>]*>", string.Empty, options);

    // Rebuild each <img> so that only its src attribute remains.
    content = Regex.Replace(content, @"<img(?=[\s/>])[^>]*>", new MatchEvaluator(NormalizeImageTag), options);

    // <br> tags and raw line breaks both become paragraph boundaries.
    // NOTE(review): the pasted source showed a blank where the line-break characters
    // stood; "\r\n" / "[\r\n]" is assumed here - confirm against the original file.
    content = Regex.Replace(content, @"<br(?=[\s/>])[^>]*>|\r\n?|\n", "</p><p>", options);

    // A boundary inserted at the very start or end leaves a dangling half pair.
    content = content.Trim();
    if (content.StartsWith("</p>", System.StringComparison.Ordinal))
    {
        content = content.Substring(4);
    }
    if (content.EndsWith("<p>", System.StringComparison.Ordinal))
    {
        content = content.Remove(content.Length - 3);
    }

    // Drop whitespace at the start of a paragraph. \s covers Unicode spaces
    // (including U+00A0 and U+3000); the &nbsp; entity is matched explicitly.
    content = Regex.Replace(content, @"<p>(?:\s|&nbsp;)+", "<p>", options);

    // Collapse runs of nested openers / closers into a single tag.
    content = Regex.Replace(content, @"(?:<p>\s*)+<p>", "<p>", options);
    content = Regex.Replace(content, @"(?:</p>\s*)+</p>", "</p>", options);

    // Remove paragraphs that contain nothing but whitespace.
    content = Regex.Replace(content, @"<p>(?:\s|&nbsp;)*</p>", string.Empty, options);

    // Indent the start of every paragraph.
    content = content.Replace("<p>", "<p> ");
    return content;
}

// Rewrites one <img ...> tag to <img src='...' />; a tag without a quoted src is returned unchanged.
private static string NormalizeImageTag(Match image)
{
    // \s before "src" skips attributes such as data-src; the backreference makes the
    // closing quote match the opening one, so src="it's.png" is read in full.
    Match source = Regex.Match(
        image.Value,
        @"\ssrc\s*=\s*(?<q>['""])(?<txt>.*?)\k<q>",
        RegexOptions.IgnoreCase | RegexOptions.Singleline);
    if (!source.Success)
    {
        return image.Value;
    }

    // The rebuilt tag is single-quoted, so an apostrophe inside the URL must be escaped.
    string url = source.Groups["txt"].Value.Replace("'", "&#39;");
    return "<img src='" + url + "' />";
}
剔除了除 p、img、strong 之外的其他标签（br 会被转换为段落分隔），并清除了 p、img 的各种属性，专门用于生成干净的网页正文，可用于信息采集后的内容整理和格式化排版。这是自用代码，算法效率可能不高，但足以满足目前的需求。