• 用.net正则去除所有的html源码或者提取出图片地址以及存数据库时的符号替换(如富文本编辑器)


    参照了别人的博客~前两个基本照搬qwq

    http://www.cnblogs.com/vingi/articles/2447861.html

    一、去除所有的html源码,只留下文字。(已试验过)

    1、引入命名空间

    using System.Text.RegularExpressions;

    2、编写函数

    /// <summary>
    /// Strips script blocks, tags and comment markers from an HTML fragment, decodes a
    /// small set of common entities, removes any remaining angle brackets and CRLF
    /// pairs, and returns the result HTML-encoded and trimmed.
    /// </summary>
    /// <param name="Htmlstring">HTML markup to clean. Null or empty input yields an empty string.</param>
    /// <returns>The remaining text, HTML-encoded and trimmed.</returns>
    public static string NoHTML(string Htmlstring)
    {
        if (string.IsNullOrEmpty(Htmlstring))
        {
            return string.Empty;
        }

        // Remove script blocks. Singleline lets '.' cross line breaks so that
        // multi-line scripts are removed together with their content.
        Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "", RegexOptions.IgnoreCase | RegexOptions.Singleline);

        // Remove HTML tags.
        Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);
        // Remove a line break together with the whitespace that follows it.
        Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);
        // Remove leftover comment delimiters (an opening marker is dropped up to end of line).
        Htmlstring = Regex.Replace(Htmlstring, @"-->", "", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "", RegexOptions.IgnoreCase);

        // Decode common named / numeric entities.
        Htmlstring = Regex.Replace(Htmlstring, @"&(quot|#34);", "\"", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(lt|#60);", "<", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(gt|#62);", ">", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(nbsp|#160);", "   ", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(iexcl|#161);", "\u00a1", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(cent|#162);", "\u00a2", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(pound|#163);", "\u00a3", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(copy|#169);", "\u00a9", RegexOptions.IgnoreCase);
        // Drop any other numeric entity.
        Htmlstring = Regex.Replace(Htmlstring, @"&#(\d+);", "", RegexOptions.IgnoreCase);
        // Decode '&amp;' last so that text such as "&amp;lt;" is not decoded twice.
        Htmlstring = Regex.Replace(Htmlstring, @"&(amp|#38);", "&", RegexOptions.IgnoreCase);

        // Strings are immutable: the result of Replace must be assigned back.
        Htmlstring = Htmlstring.Replace("<", "");
        Htmlstring = Htmlstring.Replace(">", "");
        Htmlstring = Htmlstring.Replace("\r\n", "");

        // WebUtility.HtmlEncode does not need an active HTTP request, unlike
        // HttpContext.Current.Server.HtmlEncode (HttpContext.Current is null outside one).
        return System.Net.WebUtility.HtmlEncode(Htmlstring).Trim();
    }

    二、把html中图片地址提取出来(未试验)

    1、先把html的标签符号移除

     /// <summary>
     /// Removes every substring that starts with '&lt;' and runs to the next '&gt;'
     /// from the input, leaving the text between tags.
     /// </summary>
     /// <param name="HTMLStr">Markup to strip.</param>
     /// <returns>The input with all tag-shaped spans removed.</returns>
     public static string ParseTags(string HTMLStr)
     {
         const string tagPattern = "<[^>]*>";
         System.Text.RegularExpressions.Regex tagMatcher =
             new System.Text.RegularExpressions.Regex(tagPattern);
         return tagMatcher.Replace(HTMLStr, string.Empty);
     }

    2、再把图片地址取出来

    /// <summary>
    /// Returns the value of the <c>src</c> attribute of the first <c>&lt;img&gt;</c> tag
    /// found in the given markup.
    /// </summary>
    /// <param name="HTMLStr">Markup to search. Null or empty input yields an empty string.</param>
    /// <returns>
    /// The image URL with its original casing and without surrounding quotes,
    /// or an empty string when no <c>&lt;img&gt;</c> tag with a <c>src</c> attribute is found.
    /// </returns>
    public static string GetImgUrl(string HTMLStr)
    {
        if (string.IsNullOrEmpty(HTMLStr))
        {
            return string.Empty;
        }

        // The src value may be double-quoted, single-quoted or unquoted; each
        // alternative captures into the same named group without the quotes.
        // The attribute must be preceded by whitespace so "data-src" is not mistaken for "src".
        const string imgSrcPattern =
            @"<img\s+(?:[^>]*?\s)?src\s*=\s*(?:""(?<url>[^""]*)""|'(?<url>[^']*)'|(?<url>[^\s>]+))[^>]*>";

        // IgnoreCase matches <IMG SRC=...> without lower-casing the input,
        // which would corrupt case-sensitive URLs.
        Match m = Regex.Match(HTMLStr, imgSrcPattern, RegexOptions.IgnoreCase);

        return m.Success ? m.Groups["url"].Value : string.Empty;
    }

    三、把富文本编辑器里的内容存入数据库时需要进行符号替换,不然会出bug

    /// <summary>
    /// Converts the escaped forms <c>&amp;lt;</c>, <c>&amp;gt;</c> and <c>&amp;quot;</c>
    /// in rich-text editor content back to the literal characters
    /// <c>&lt;</c>, <c>&gt;</c> and <c>"</c>.
    /// </summary>
    /// <param name="x">Content obtained from the rich-text editor; null or empty is returned unchanged.</param>
    /// <returns>The content with the three entities replaced by their literal characters.</returns>
    public static string change(string x)
    {
        if (string.IsNullOrEmpty(x))
        {
            return x;
        }

        // Replace the escaped special characters with their literal forms.
        x = x.Replace("&lt;", "<");
        x = x.Replace("&gt;", ">");
        x = x.Replace("&quot;", "\"");

        return x;
    }
  • 相关阅读:
    python 安装与pip安装
    使用通配符来解决数据1和11、12/13/14的问题
    数据库中一行变多行,拆分数据
    15-哈希表 HashTable
    13-自平衡二分搜索树 AVLTree
    12-并查集 UnionFind
    11-字典树 Trie
    10-线段树 Segment Tree
    09-堆 Heap(最大堆)
    08-映射 Map
  • 原文地址:https://www.cnblogs.com/ivan99/p/6658011.html
Copyright © 2020-2023  润新知