• 提取HTML的正文类


    本文转载:http://blog.csdn.net/cjh200102/article/details/6824895

    //2、提取HTML正文的类
    using System;
     using System.Text;
     namespace HtmlStrip
     {
         /// <summary>
         /// Console demo for <see cref="HtmlParser"/>.
         /// </summary>
         class MainClass
         {
             /// <summary>
             /// Strips the markup from a hard-coded HTML fragment, keeping only the
             /// br tags, and writes the resulting text to standard output.
             /// </summary>
             /// <param name="args">Command-line arguments; not used.</param>
             public static void Main (string[] args)
             {
                 // To process a file instead, load its contents into the string first, e.g.
                 // with new System.IO.StreamReader ("/home/lx/test.html").ReadToEnd ().
                 const string sampleHtml = "<div>abc</div><span>efg</span><br /><script>888</script><!--<PA>WW</PA-->oo";
                 string[] tagsToKeep = { "br" };

                 HtmlParser parser = new HtmlParser (sampleHtml);
                 parser.KeepTag (tagsToKeep); // br tags are copied to the output instead of being filtered
                 string plainText = parser.Text ();
                 Console.Write (plainText);
             }
         }
         /// <summary>
         /// Extracts the text content of an HTML fragment with a single forward scan.
         /// Tags are dropped unless their name was registered through <see cref="KeepTag"/>,
         /// comments are dropped unless "!--" was registered, and the text that follows an
         /// opening script or style tag is suppressed until the next tag that is neither
         /// kept nor special.
         /// </summary>
         class HtmlParser
         {
             private readonly string htmlcode; // the HTML being analysed
             private readonly StringBuilder result = new StringBuilder ();  // output accumulated by Text ()
             private int seek; // index of the next character to read
             private string[] keepTag;  // names of the tags that are copied to the output verbatim
             private bool _inTag;  // true while the scan position is between '<' and its closing '>'
             private bool needContent = true;  // false while text has to be discarded (after script/style)
             private string tagName = "";  // name of the most recently opened tag, including any '/'
             private readonly string[] specialTag = new string[] { "script", "style", "!--" };  // tags whose text content is normally unwanted

             /// <summary>
             /// Whether the scan position is currently inside a tag. Setting it to
             /// <c>true</c> also reads the tag name that starts at the current position:
             /// leading whitespace is skipped and the name ends at the first whitespace
             /// character (which is consumed), at '>' (which is left unread so that
             /// <see cref="Text"/> can close the tag) or at the end of the input.
             /// </summary>
             public bool inTag {
                 get { return _inTag; }
                 set {
                     _inTag = value;
                     if (!value) {
                         return;
                     }
                     StringBuilder name = new StringBuilder ();
                     // Bounded by the input length so a truncated tag cannot read past the end.
                     while (seek < htmlcode.Length) {
                         char current = htmlcode[seek];
                         if (current == '>') {
                             break;
                         }
                         seek++;
                         if (!char.IsWhiteSpace (current)) {
                             name.Append (current);
                         } else if (name.Length > 0) {
                             break;
                         }
                     }
                     tagName = name.ToString ();
                 }
             }
             /// <summary>
             /// Creates a parser for the given HTML with no kept tags.
             /// </summary>
             /// <param name="html">
             ///  The HTML code to analyse.
             /// </param>
             /// <exception cref="System.ArgumentNullException">
             ///  <paramref name="html"/> is null.
             /// </exception>
             public HtmlParser (string html)
             {
                 if (html == null) {
                     throw new ArgumentNullException ("html");
                 }
                 htmlcode = html;
                 KeepTag (new string[] {  });
             }
             /// <summary>
             /// Sets the tags that must not be filtered out. Names are compared
             /// case-insensitively and without any '/'; the name "!--" keeps comments.
             /// </summary>
             /// <param name="tags">
             ///  Tag names to keep; null is treated as an empty list.
             /// </param>
             public void KeepTag (string[] tags)
             {
                 keepTag = tags ?? new string[0];
             }

             /// <summary>
             /// Scans the HTML from the beginning and builds the filtered text. Every call
             /// restarts the scan, so a change made through <see cref="KeepTag"/> is
             /// reflected by the next call.
             /// </summary>
             /// <returns>
             /// The processed text.
             /// </returns>
             public string Text ()
             {
                 result.Length = 0;
                 seek = 0;
                 _inTag = false;
                 needContent = true;
                 tagName = "";

                 int startTag = 0;
                 while (seek < htmlcode.Length) {
                     char word = htmlcode[seek++];
                     if (word == '<') {
                         startTag = seek - 1;
                         if (skipComment ()) {
                             // Comments are recognised at "<!--" itself, so their body can
                             // never be mistaken for a tag name or for the end of a tag.
                             _inTag = false;
                             if (iskeepTag ("!--")) {
                                 result.Append (htmlcode, startTag, seek - startTag);
                             }
                         } else {
                             inTag = true;
                         }
                     } else if (word == '>' && _inTag) {
                         _inTag = false;
                         if (iskeepTag (tagName.Replace ("/", ""))) {
                             // Copy the whole tag, from '<' up to and including '>'.
                             result.Append (htmlcode, startTag, seek - startTag);
                         } else {
                             // A closing tag such as "/script" is not special, which is
                             // what switches the content back on.
                             needContent = !isSpecialTag (tagName);
                         }
                     } else if (!_inTag && needContent) {
                         // Plain text, including a '>' that does not close any tag.
                         result.Append (word);
                     }
                 }
                 return result.ToString ();
             }
             /// <summary>
             /// Skips a comment when the characters after the '<' that was just read are
             /// "!--". The scan position ends up behind the closing "-->", or at the end
             /// of the input when the comment is not terminated.
             /// </summary>
             /// <returns>
             /// true when a comment was skipped; false when the position was left untouched.
             /// </returns>
             private bool skipComment ()
             {
                 const string opener = "!--";
                 const string closer = "-->";
                 if (string.CompareOrdinal (htmlcode, seek, opener, 0, opener.Length) != 0) {
                     return false;
                 }
                 int end = htmlcode.IndexOf (closer, seek + opener.Length, StringComparison.Ordinal);
                 seek = end < 0 ? htmlcode.Length : end + closer.Length;
                 return true;
             }
             /// <summary>
             /// Tells whether the tag is one of the special tags whose content is dropped.
             /// </summary>
             /// <param name="tag">
             /// The tag name exactly as read, including any '/'.
             /// </param>
             /// <returns>
             /// true when the name matches a special tag, ignoring case.
             /// </returns>
             private bool isSpecialTag (string tag)
             {
                 foreach (string special in specialTag) {
                     if (string.Equals (tag, special, StringComparison.OrdinalIgnoreCase)) {
                         return true;
                     }
                 }
                 return false;
             }
             /// <summary>
             /// Tells whether the tag has to be kept in the output.
             /// </summary>
             /// <param name="tag">
             /// The tag name with any '/' already removed.
             /// </param>
             /// <returns>
             /// true when the name matches one of the kept tags, ignoring case.
             /// </returns>
             private bool iskeepTag (string tag)
             {
                 foreach (string ta in keepTag) {
                     if (string.Equals (tag, ta, StringComparison.OrdinalIgnoreCase)) {
                         return true;
                     }
                 }
                 return false;
             }

         }
     }
    

      

  • 相关阅读:
    linux系统中如何查看日志 (转)
    php 获取随机字符串(原创)
    php Aes 128位算法
    linux 在线实验
    number随时间随机递增每天 不同 php(原创)
    php 判断字符串包含中文(转)
    同步,异步 阻塞,非阻塞, 异步+回调机制 线程队列 事件Event 丶协程
    线程的理论知识 开启线程的两种方式(Thread) 线程和进程之间的对比 线程的其他方法 守护进程 互斥锁 死锁现象,递归锁 信号量
    获取进程以及父进程的pid 验证进程之间的数据隔离 join方法 进程对象的其他属性 僵尸进程与孤儿进程(存在Linux系统中) 守护进程
    进程基础知识 操作系统 操作系统的发展史(多道技术) 进程介绍 python并发编程之:多进程
  • 原文地址:https://www.cnblogs.com/51net/p/3532200.html
Copyright © 2020-2023  润新知