• asp.net如何去掉HTML标记


    /// <summary>
    /// Removes HTML markup from a string and returns the remaining text,
    /// HTML-encoded and trimmed.
    /// </summary>
    /// <remarks>
    /// Steps, in order: script elements are dropped together with their content;
    /// every remaining tag is dropped; line breaks followed by whitespace and
    /// stray comment delimiters are dropped; a fixed set of character entities is
    /// decoded and any other numeric entity is removed; leftover angle brackets
    /// and CRLF pairs are removed; the result is HTML-encoded and trimmed.
    /// </remarks>
    /// <param name="Htmlstring">Source text that contains HTML. Null or empty input yields an empty string.</param>
    /// <returns>The text with markup removed, HTML-encoded and trimmed.</returns>
    public static string NoHTML(string Htmlstring)
    {
        if (string.IsNullOrEmpty(Htmlstring))
        {
            return string.Empty;
        }

        // Remove script elements. Singleline lets '.' match line breaks, so a
        // script that spans several lines is removed together with its body
        // instead of leaving its source code behind as text.
        Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "", RegexOptions.IgnoreCase | RegexOptions.Singleline);

        // Remove every remaining tag, then layout residue and comment delimiters.
        Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"-->", "", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "", RegexOptions.IgnoreCase);

        // Decode the supported character entities.
        Htmlstring = Regex.Replace(Htmlstring, @"&(quot|#34);", "\"", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(amp|#38);", "&", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(lt|#60);", "<", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(gt|#62);", ">", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(nbsp|#160);", "   ", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(iexcl|#161);", "\xa1", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(cent|#162);", "\xa2", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(pound|#163);", "\xa3", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(copy|#169);", "\xa9", RegexOptions.IgnoreCase);

        // Any numeric entity that was not decoded above is dropped.
        Htmlstring = Regex.Replace(Htmlstring, @"&#(\d+);", "", RegexOptions.IgnoreCase);

        // string.Replace returns a new string; the result has to be assigned
        // back, otherwise these clean-up steps have no effect.
        Htmlstring = Htmlstring.Replace("<", "");
        Htmlstring = Htmlstring.Replace(">", "");
        Htmlstring = Htmlstring.Replace("\r\n", "");

        // Static encoder: produces the same output as Server.HtmlEncode but does
        // not require a current HTTP request (HttpContext.Current may be null).
        return HttpUtility.HtmlEncode(Htmlstring).Trim();
    }



    // C# sample: extracting the plain text from HTML source.
    //
    // StripHTML (defined in the class below) removes HTML markup.
    //   strHtml - source text that contains HTML
    //   returns - the text that remains after the markup has been removed

      using   System;   
      
    using   System.Text.RegularExpressions;   
      
    /// <summary>
    /// Console sample that strips HTML markup from a string using regular expressions.
    /// </summary>
    public class StripHTMLTest
    {
        // Matches a script element together with its content.
        private const string ScriptPattern = @"<script[^>]*?>.*?</script>";

        // Patterns applied in order after script removal. Patterns[i] is
        // replaced with Replacements[i]; the two arrays must stay the same length.
        private static readonly string[] Patterns =
        {
            // Opening, closing and self-closing tags, with optional attributes.
            @"<(\/\s*)?!?((\w+:)?\w+)(\w+(\s*=?\s*(([""'])(\\[""'tbnr]|[^\7])*?\7|\w+)|.{0})|\s)*?(\/\s*)?>",
            // A line break followed by whitespace.
            @"([\r\n])[\s]+",
            @"&(quot|#34);",
            @"&(amp|#38);",
            @"&(lt|#60);",
            @"&(gt|#62);",
            @"&(nbsp|#160);",
            @"&(iexcl|#161);",
            @"&(cent|#162);",
            @"&(pound|#163);",
            @"&(copy|#169);",
            // Any numeric entity not decoded above.
            @"&#(\d+);",
            // A comment terminator becomes a line break so that the next
            // pattern, which needs a trailing '\n', can consume the comment.
            @"-->",
            @"<!--.*\n"
        };

        private static readonly string[] Replacements =
        {
            "",
            "",
            "\"",
            "&",
            "<",
            ">",
            "   ",
            "\xa1", // chr(161)
            "\xa2", // chr(162)
            "\xa3", // chr(163)
            "\xa9", // chr(169)
            "",
            "\r\n",
            ""
        };

        /// <summary>
        /// Strips a sample HTML document and writes the remaining text to the console.
        /// </summary>
        public static void Main()
        {
            string s = StripHTML("<HTML><HEAD><TITLE>中国石龙信息平台</TITLE></HEAD><BODY>faddfs龙信息平台</BODY></HTML>");
            Console.WriteLine(s);
        }

        /// <summary>
        /// Removes HTML markup from a string and returns the remaining text.
        /// </summary>
        /// <remarks>
        /// The result is not HTML-encoded. Angle brackets that remain after entity
        /// decoding are removed, so encoded markup in the input cannot come back
        /// as live tags in the output.
        /// </remarks>
        /// <param name="strHtml">Source text that contains HTML. Null or empty input yields an empty string.</param>
        /// <returns>The text with markup removed.</returns>
        public static string StripHTML(string strHtml)
        {
            if (string.IsNullOrEmpty(strHtml))
            {
                return string.Empty;
            }

            // Singleline lets '.' match line breaks, so a script that spans
            // several lines is removed together with its body. It is applied to
            // this pattern only: the comment pattern relies on '.' stopping at '\n'.
            string strOutput = Regex.Replace(strHtml, ScriptPattern, "", RegexOptions.IgnoreCase | RegexOptions.Singleline);

            for (int i = 0; i < Patterns.Length; i++)
            {
                strOutput = Regex.Replace(strOutput, Patterns[i], Replacements[i], RegexOptions.IgnoreCase);
            }

            // string.Replace returns a new string; the result has to be assigned
            // back, otherwise these clean-up steps have no effect.
            strOutput = strOutput.Replace("<", "");
            strOutput = strOutput.Replace(">", "");
            strOutput = strOutput.Replace("\r\n", "");

            return strOutput;
        }
    }


    写一个静态方法   
      
    移除HTML标签   
        
                      
    取出文本中的图片地址
  • 相关阅读:
    hdu 5007 水题 (2014西安网赛A题)
    hdu 1698 线段树(成段替换 区间求和)
    poj 3468 线段树 成段增减 区间求和
    hdu 2795 公告板 (单点最值)
    UVaLive 6833 Miscalculation (表达式计算)
    UVaLive 6832 Bit String Reordering (模拟)
    CodeForces 124C Prime Permutation (数论+贪心)
    SPOJ BALNUM (数位DP)
    CodeForces 628D Magic Numbers (数位DP)
    POJ 3252 Round Numbers (数位DP)
  • 原文地址:https://www.cnblogs.com/goody9807/p/961195.html
Copyright © 2020-2023  润新知