本文转载:http://blog.csdn.net/cjh200102/article/details/6824895
//2、提取html的正文 类 using System; using System.Text; namespace HtmlStrip { class MainClass { public static void Main (string[] args) { string str = "<div>abc</div><span>efg</span><br /><script>888</script><!--<PA>WW</PA-->oo"; //System.IO.StreamReader rd=new System.IO.StreamReader ("/home/lx/test.html"); //str=rd.ReadToEnd (); HtmlParser t = new HtmlParser (str); // t.KeepTag (new string[] { "br" }); //设置br标签不过虑 Console.Write (t.Text ()); } } class HtmlParser { private string[] htmlcode; //把html转为数组形式用于分析 private StringBuilder result = new StringBuilder (); //输出的结果 private int seek; //分析文本时候的指针位置 private string[] keepTag; //用于保存要保留的尖括号内容 private bool _inTag; //标记现在的指针是不是在尖括号内 private bool needContent = true; //是否要提取正文 private string tagName; //当前尖括号的名字 private string[] specialTag = new string[] { "script", "style", "!--" }; //特殊的尖括号内容,一般这些标签的正文是不要的 /// <summary> /// 当指针进入尖括号内,就会触发这个属性。这里主要逻辑是提取尖括号里的标签名字 /// </summary> public bool inTag { get { return _inTag; } set { _inTag = value; if (!value) return; bool ok = true; tagName = ""; while (ok) { string word = read (); if (word != " " && word != ">") { tagName += word; } else if (word == " " && tagName.Length > 0) { ok = false; } else if (word == ">") { ok = false; inTag = false; seek -= 1; } } } } /// <summary> /// 初始化类 /// </summary> /// <param name="html"> /// 要分析的html代码 /// </param> public HtmlParser (string html) { htmlcode = new string[html.Length]; for (int i = 0; i < html.Length; i++) { htmlcode[i] = html[i].ToString (); } KeepTag (new string[] { }); } /// <summary> /// 设置要保存那些标签不要被过滤掉 /// </summary> /// <param name="tags"> /// /// </param> public void KeepTag (string[] tags) { keepTag = tags; } /// <summary> /// /// </summary> /// <returns> /// 输出处理后的文本 /// </returns> public string Text () { int startTag = 0; int endTag = 0; while (seek < htmlcode.Length) { string word = read (); if (word.ToLower () == "<") { startTag = seek; inTag = true; } else if (word.ToLower () == ">") { endTag = seek; inTag = false; if (iskeepTag (tagName.Replace ("/", ""))) { for (int i = startTag - 1; i < endTag; i++) { result.Append (htmlcode[i].ToString ()); } } else if (tagName.StartsWith ("!--")) { bool ok = true; while (ok) { if (read () == "-") { if (read () == "-") { if (read () == ">") { ok = false; } else { seek -= 1; } } } } } else { foreach (string str in specialTag) { if (tagName == str) { needContent = false; break; } else needContent = true; } } } else if (!inTag && needContent) { result.Append (word); } } return result.ToString (); } /// <summary> /// 判断是否要保存这个标签 /// </summary> /// <param name="tag"> /// A <see cref="System.String"/> /// </param> /// <returns> /// A <see cref="System.Boolean"/> /// </returns> private bool iskeepTag (string tag) { foreach (string ta in keepTag) { if (tag.ToLower () == ta.ToLower ()) { return true; } } return false; } private string read () { return htmlcode[seek++]; } } }